mspire 0.8.5 → 0.8.6
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/hash/inverse.rb +15 -0
- data/lib/mspire/error_rate/qvalue.rb +5 -5
- data/lib/mspire/fasta.rb +2 -0
- data/lib/mspire/ident/peptide/db/creator.rb +48 -58
- data/lib/mspire/ident/peptide/db/io.rb +5 -0
- data/lib/mspire/ident/peptide_hit/qvalue.rb +2 -2
- data/lib/mspire/ident/peptide_hit.rb +2 -2
- data/lib/mspire/ident/protein_group.rb +4 -2
- data/lib/mspire/isotope/aa.rb +10 -10
- data/lib/mspire/mzml/instrument_configuration.rb +10 -3
- data/lib/mspire/quant/cmdline.rb +42 -0
- data/lib/mspire/quant/protein_group_comparison.rb +29 -0
- data/lib/mspire/quant/spectral_counts.rb +42 -0
- data/script/fasta_to_peptide_centric_db.rb +5 -0
- data/script/mascot_dat_to_peptide_hit_qvalues.rb +37 -45
- data/script/mass_correct.rb +118 -0
- data/script/minimal_protein_set.rb +345 -0
- data/script/mzml_to_mgf.rb +46 -0
- data/script/peptide_hit_qvalues_to_spectral_counts_table.rb +275 -0
- data/spec/mspire/ident/peptide/db/creator_spec.rb +11 -0
- data/spec/testfiles/mspire/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +157 -157
- metadata +11 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.5
|
1
|
+
0.8.6
|
data/lib/hash/inverse.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
# inverse from Tilo Sloboda (now in facets)
|
4
|
+
|
5
|
+
class Hash
  # Returns a new Hash that maps each value of the receiver back to its key.
  #
  # * A value that is an Array is expanded: each of its elements becomes a
  #   key of the result. Any other value is used as a single key.
  # * When several keys lead to the same value, the result stores an Array
  #   of those keys, most recently visited key first.
  #
  #   {:a => 1, :b => [1, 2]}.inverse  #=> {1 => [:b, :a], 2 => :b}
  #
  # Keys that are themselves Arrays are kept intact when they are collected
  # (flattening the collected list would merge their elements into it).
  def inverse
    inverted = {}
    # result keys whose entry has already been promoted to a list of keys
    shared = {}
    each_pair do |key, value|
      targets = Array === value ? value : [value]
      targets.each do |target|
        if !inverted.key?(target)
          inverted[target] = key
        elsif shared[target]
          inverted[target].unshift(key)
        else
          inverted[target] = [key, inverted[target]]
          shared[target] = true
        end
      end
    end
    inverted
  end
end
|
14
|
+
|
15
|
+
|
@@ -22,8 +22,8 @@ module Mspire
|
|
22
22
|
# Proc.new doesn't do arity checking
|
23
23
|
hit_with_qvalue_pairs = Proc.new do |hits|
|
24
24
|
sorted_best_to_worst = (hits.sort_by(&sorting)).reverse
|
25
|
-
(
|
26
|
-
|
25
|
+
(sorted_target_hits, qvalues) = Mspire::ErrorRate::Qvalue.mixed_target_decoy(sorted_best_to_worst, target_set, opts)
|
26
|
+
sorted_target_hits.zip(qvalues)
|
27
27
|
end
|
28
28
|
|
29
29
|
all_together = target_hits + decoy_hits
|
@@ -49,13 +49,13 @@ module Mspire
|
|
49
49
|
opts = {:monotonic => true}.merge(opts)
|
50
50
|
num_target = 0 ; num_decoy = 0
|
51
51
|
monotonic = opts[:monotonic]
|
52
|
-
|
52
|
+
sorted_target_hits = []
|
53
53
|
qvalues = []
|
54
54
|
best_to_worst.each do |hit|
|
55
55
|
if target_setlike.include?(hit)
|
56
56
|
num_target += 1
|
57
57
|
precision = Mspire::ErrorRate::Decoy.precision(num_target, num_decoy)
|
58
|
-
|
58
|
+
sorted_target_hits << hit
|
59
59
|
qvalues << (1.0 - precision)
|
60
60
|
else
|
61
61
|
num_decoy += 1
|
@@ -72,7 +72,7 @@ module Mspire
|
|
72
72
|
end
|
73
73
|
end.reverse
|
74
74
|
end
|
75
|
-
[
|
75
|
+
[sorted_target_hits, qvalues]
|
76
76
|
end
|
77
77
|
|
78
78
|
|
data/lib/mspire/fasta.rb
CHANGED
@@ -61,6 +61,8 @@ module Mspire
|
|
61
61
|
# takes the header string and returns the uniprot id
|
62
62
|
#
|
63
63
|
# 'sp|Q04917|1433F_HUMAN' #=> 'Q04917'
|
64
|
+
# This can also be found with BioFastaFormat#accession (but it may be much
|
65
|
+
# slower)
|
64
66
|
def self.uniprot_id(header)
|
65
67
|
header[/^[^\|]+\|([^\|]+)\|/, 1]
|
66
68
|
end
|
@@ -43,8 +43,11 @@ class Mspire::Ident::Peptide::Db::Creator
|
|
43
43
|
op.on("--no-expand-x", "don't enumerate aa possibilities", "(removes these peptides)") { opt[:expand_aa] = false }
|
44
44
|
op.on("--no-uniprot", "use entire protid section of fasta header", "for non-uniprot fasta files") { opt[:uniprot] = false }
|
45
45
|
op.on("--trie", "use a trie (for very large uniprot files)", "must have fast_trie gem installed") {|v| opt[:trie] = v }
|
46
|
+
|
46
47
|
op.on("-e", "--enzyme <name>", "enzyme for digestion") {|v| opt[:enzyme] = Mspire::Insilico::Digester.const_get(v.upcase) }
|
48
|
+
op.on("-v", "--verbose", "talk about it") { $VERBOSE = 5 }
|
47
49
|
op.on("--list-enzymes", "lists approved enzymes and exits") do
|
50
|
+
op.on("-v", "--verbose", "talk about it") { $VERBOSE = 5 }
|
48
51
|
puts Mspire::Digester::ENZYMES.keys.join("\n")
|
49
52
|
exit
|
50
53
|
end
|
@@ -95,7 +98,7 @@ class Mspire::Ident::Peptide::Db::Creator
|
|
95
98
|
(pep =~ letters_to_expand_re) ? expand_peptides(pep, EXPAND_AA) : pep
|
96
99
|
end
|
97
100
|
else
|
98
|
-
peptides.
|
101
|
+
peptides.select {|pep| pep !~ letters_to_expand_re }
|
99
102
|
end
|
100
103
|
header = prot.header
|
101
104
|
id = opts[:uniprot] ? Mspire::Fasta.uniprot_id(header) : header.split(/\s+/).first
|
@@ -118,15 +121,25 @@ class Mspire::Ident::Peptide::Db::Creator
|
|
118
121
|
hash_like = hash_like_from_digestion_file(digestion_file, opts[:min_length], opts[:trie])
|
119
122
|
|
120
123
|
base = digestion_file.chomp(File.extname(digestion_file))
|
121
|
-
final_outfile =
|
124
|
+
final_outfile =
|
125
|
+
if opts[:trie]
|
126
|
+
base + ".min_aaseq#{opts[:min_length]}"
|
127
|
+
else
|
128
|
+
base + ".min_aaseq#{opts[:min_length]}" + ".yml"
|
129
|
+
end
|
122
130
|
|
123
131
|
start_time = Time.now
|
124
132
|
print "Writing #{hash_like.size} peptides to #{} ..." if $VERBOSE
|
125
133
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
134
|
+
if opts[:trie]
|
135
|
+
trie = hash_like
|
136
|
+
trie.save(final_outfile)
|
137
|
+
else
|
138
|
+
File.open(final_outfile, 'w') do |out|
|
139
|
+
hash_like.each do |k,v|
|
140
|
+
out.puts( [k, v.join(Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER)].join(Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER) )
|
141
|
+
#out.puts "#{k}#{Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER}#{v}"
|
142
|
+
end
|
130
143
|
end
|
131
144
|
end
|
132
145
|
puts "#{Time.now - start_time} sec" if $VERBOSE
|
@@ -137,71 +150,47 @@ class Mspire::Ident::Peptide::Db::Creator
|
|
137
150
|
File.expand_path(final_outfile)
|
138
151
|
end
|
139
152
|
|
140
|
-
def
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
if val.nil?
|
146
|
-
self.add(key,"")
|
147
|
-
self.get(key)
|
148
|
-
else
|
149
|
-
val
|
150
|
-
end
|
153
|
+
# Lazily requires the optional 'trie' library and returns a new, empty Trie.
#
# Raises LoadError with an installation hint when 'trie' cannot be required.
def get_a_trie
  begin
    require 'trie'
  rescue LoadError
    # LoadError is a ScriptError, not a StandardError, so it has to be named
    # explicitly here; a bare `rescue` would let the original error through
    # and the hint below would never be shown.
    raise LoadError, "must first install fast_trie"
  end
  Trie.new
end
|
154
161
|
|
155
162
|
def hash_like_from_digestion_file(digestion_file, min_length, use_trie=false)
|
156
|
-
cnt = 0
|
157
163
|
if use_trie
|
158
|
-
|
159
|
-
#puts "using trie" if $VERBOSE
|
160
|
-
#trie = hash_like_tree
|
161
|
-
#line_cnt = 0
|
162
|
-
#::IO.foreach(digestion_file) do |line|
|
163
|
-
#line_cnt += 1
|
164
|
-
##puts "LINE COUND"
|
165
|
-
##p line_cnt
|
166
|
-
#(prot, *peps) = line.chomp!.split(/\s+/)
|
167
|
-
##p peps
|
168
|
-
##p peps.class
|
169
|
-
## prot is something like this: "P31946"
|
170
|
-
#puts line
|
171
|
-
#peps.each do |pep|
|
172
|
-
#if pep.size >= min_length
|
173
|
-
#to_set =
|
174
|
-
#if val = trie.get(pep)
|
175
|
-
#val + Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER + prot
|
176
|
-
#else
|
177
|
-
#prot
|
178
|
-
#end
|
179
|
-
#p to_set.size
|
180
|
-
#trie.add(pep, to_set)
|
181
|
-
#end
|
182
|
-
#end
|
183
|
-
#cnt += 1
|
184
|
-
#puts cnt if (cnt % 1000) == 0
|
185
|
-
#end
|
186
|
-
#abort "HERE"
|
187
|
-
#trie
|
188
|
-
else
|
189
|
-
hash = {}
|
164
|
+
trie = get_a_trie
|
190
165
|
::IO.foreach(digestion_file) do |line|
|
191
|
-
|
166
|
+
line.chomp!
|
167
|
+
(prot, *peps) = line.split(/\s+/)
|
192
168
|
# prot is something like this: "P31946"
|
169
|
+
peps.uniq!
|
193
170
|
peps.each do |pep|
|
194
171
|
if pep.size >= min_length
|
195
|
-
if
|
196
|
-
|
172
|
+
if trie.has_key?(pep)
|
173
|
+
ar = trie.get(pep)
|
174
|
+
ar << prot
|
197
175
|
else
|
198
|
-
|
176
|
+
trie.add( pep, [prot] )
|
199
177
|
end
|
200
|
-
hash[pep] = val
|
201
178
|
end
|
202
179
|
end
|
203
|
-
|
204
|
-
|
180
|
+
end
|
181
|
+
trie
|
182
|
+
else
|
183
|
+
hash = Hash.new {|h,k| h[k] = [] }
|
184
|
+
::IO.foreach(digestion_file) do |line|
|
185
|
+
line.chomp!
|
186
|
+
(prot, *peps) = line.split(/\s+/)
|
187
|
+
# prot is something like this: "P31946"
|
188
|
+
peps.uniq!
|
189
|
+
peps.each do |pep|
|
190
|
+
if pep.size >= min_length
|
191
|
+
hash[pep] << prot
|
192
|
+
end
|
193
|
+
end
|
205
194
|
end
|
206
195
|
hash
|
207
196
|
end
|
@@ -215,6 +204,7 @@ class Mspire::Ident::Peptide::Db::Creator
|
|
215
204
|
def create(fasta_file, opts={})
|
216
205
|
opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
|
217
206
|
digestion_file = create_digestion_file(fasta_file, opts)
|
207
|
+
puts "created file of size: #{File.size(digestion_file)}" if $VERBOSE
|
218
208
|
db_from_fasta_digestion_file(digestion_file, opts)
|
219
209
|
end
|
220
210
|
|
@@ -6,6 +6,7 @@ class Mspire::Ident::Peptide::Db::IO
|
|
6
6
|
# behaves like a hash once it is opened.
|
7
7
|
include Enumerable
|
8
8
|
def self.open(filename, &block)
|
9
|
+
#p filename
|
9
10
|
raise ArgumentError unless block
|
10
11
|
File.open(filename) do |io|
|
11
12
|
block.call(self.new(io))
|
@@ -39,6 +40,10 @@ class Mspire::Ident::Peptide::Db::IO
|
|
39
40
|
string.split(Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER)
|
40
41
|
end
|
41
42
|
|
43
|
+
# Looks the key up in @index and returns whatever @index[key] yields, so the
# result is truthy/falsy rather than strictly true/false.
# NOTE(review): @index is defined outside this view; presumably hash-like
# (nil for a missing key) -- confirm.
def key?(key)
|
44
|
+
@index[key]
|
45
|
+
end
|
46
|
+
|
42
47
|
# number of entries
|
43
48
|
def size ; @index.size end
|
44
49
|
alias_method :length, :size
|
@@ -12,7 +12,7 @@ class Mspire::Ident::PeptideHit
|
|
12
12
|
|
13
13
|
class << self
|
14
14
|
|
15
|
-
# writes to the file, adding an extension
|
15
|
+
# writes to the file, adding an extension. returns the filename
|
16
16
|
def to_phq(base, hits, qvalues=[])
|
17
17
|
to_file(base + FILE_EXTENSION, hits, qvalues)
|
18
18
|
end
|
@@ -20,7 +20,7 @@ class Mspire::Ident::PeptideHit
|
|
20
20
|
# writes the peptide hits to a phq.tsv file. qvalues is a parallel array
|
21
21
|
# to hits that can provide qvalues if not inherent to the hits
|
22
22
|
# returns the filename. Expects each hit to implement #search_id, #id,
|
23
|
-
# #aaseq and #charge
|
23
|
+
# #aaseq and #charge. returns the filename
|
24
24
|
def to_file(filename, hits, qvalues=[])
|
25
25
|
File.open(filename,'w') do |out|
|
26
26
|
out.puts HEADER.join(FILE_DELIMITER)
|
@@ -38,8 +38,10 @@ module Mspire
|
|
38
38
|
# note to self: I wrote this in 2011, so I think I know what I'm doing now
|
39
39
|
protein_to_peptides = Hash.new {|h,k| h[k] = Set.new }
|
40
40
|
peptide_hits.each do |peptide_hit|
|
41
|
-
peptide_hit.proteins
|
42
|
-
|
41
|
+
if prots = peptide_hit.proteins
|
42
|
+
prots.each do |protein|
|
43
|
+
protein_to_peptides[protein] << peptide_hit
|
44
|
+
end
|
43
45
|
end
|
44
46
|
end
|
45
47
|
peptides_to_protein_group = Hash.new {|h,k| h[k] = [] }
|
data/lib/mspire/isotope/aa.rb
CHANGED
@@ -8,27 +8,27 @@ module Mspire
|
|
8
8
|
# and OH on the ends)
|
9
9
|
aa_to_el_hash = {
|
10
10
|
'A' => { :c =>3, :h =>5 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
11
|
-
'R' => { :c =>6, :h =>12 , :o =>1 , :n =>4 , :s =>0 , :p =>0, :se =>0 },
|
12
|
-
'N' => { :c =>4, :h =>6 , :o =>2 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
|
13
|
-
'D' => { :c =>4, :h =>5 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
14
11
|
'C' => { :c =>3, :h =>5 , :o =>1 , :n =>1 , :s =>1 , :p =>0, :se =>0 },
|
12
|
+
'D' => { :c =>4, :h =>5 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
15
13
|
'E' => { :c =>5, :h =>7 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
16
|
-
'
|
14
|
+
'F' => { :c =>9, :h =>9 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
17
15
|
'G' => { :c =>2, :h =>3 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
18
|
-
'H' => { :c =>6, :h =>7 , :o =>1 , :n =>3 , :s =>0 , :p =>0, :se =>0 },
|
19
16
|
'I' => { :c =>6, :h =>11 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
20
|
-
'
|
17
|
+
'H' => { :c =>6, :h =>7 , :o =>1 , :n =>3 , :s =>0 , :p =>0, :se =>0 },
|
21
18
|
'K' => { :c =>6, :h =>12 , :o =>1 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
|
19
|
+
'L' => { :c =>6, :h =>11 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
22
20
|
'M' => { :c =>5, :h =>9 , :o =>1 , :n =>1 , :s =>1 , :p =>0, :se =>0 },
|
23
|
-
'
|
21
|
+
'N' => { :c =>4, :h =>6 , :o =>2 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
|
22
|
+
'O' => { :c =>12, :h =>19 , :o =>2 , :n =>3 , :s =>0 , :p =>0, :se =>0 },
|
24
23
|
'P' => { :c =>5, :h =>7 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
24
|
+
'Q' => { :c =>5, :h =>8 , :o =>2 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
|
25
|
+
'R' => { :c =>6, :h =>12 , :o =>1 , :n =>4 , :s =>0 , :p =>0, :se =>0 },
|
25
26
|
'S' => { :c =>3, :h =>5 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
26
27
|
'T' => { :c =>4, :h =>7 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
28
|
+
'U' => { :c =>3, :h =>5 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>1 },
|
29
|
+
'V' => { :c =>5, :h =>9 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
27
30
|
'W' => { :c =>11, :h =>10 , :o =>1 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
|
28
31
|
'Y' => { :c =>9, :h =>9 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
29
|
-
'V' => { :c =>5, :h =>9 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
30
|
-
'U' => { :c =>3, :h =>5 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>1 },
|
31
|
-
'O' => { :c =>12, :h =>19 , :o =>2 , :n =>3 , :s =>0 , :p =>0, :se =>0 }
|
32
32
|
}
|
33
33
|
|
34
34
|
#
|
@@ -35,10 +35,17 @@ module Mspire
|
|
35
35
|
def self.from_xml(xml, link)
|
36
36
|
obj = self.new(xml[:id])
|
37
37
|
next_n = obj.describe_from_xml!(xml, link[:ref_hash])
|
38
|
-
if next_n && next_n.name == 'componentList'
|
38
|
+
if next_n && (next_n.name == 'componentList')
|
39
39
|
obj.components = next_n.children.map do |component_n|
|
40
|
-
|
41
|
-
|
40
|
+
if component_n.is_a?(Nokogiri::XML::Text)
|
41
|
+
# TODO: this is a fix for when there is an empty component list but
|
42
|
+
# Nokogiri returns a text node. Really this needs to be fixed
|
43
|
+
# in our xml writer!
|
44
|
+
nil
|
45
|
+
else
|
46
|
+
Mspire::Mzml.const_get(component_n.name.capitalize).new.describe_self_from_xml!(component_n, link[:ref_hash])
|
47
|
+
end
|
48
|
+
end.compact
|
42
49
|
next_n = next_n.next
|
43
50
|
end
|
44
51
|
if next_n && next_n.name == 'softwareRef'
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'hash/inverse'
|
2
|
+
|
3
|
+
module Mspire ; module Quant ; end ; end

module Mspire::Quant::Cmdline

  # Turns command-line sample arguments into lookup hashes.
  #
  # Each argument takes one of two forms.  The first form is grouped by
  # condition:
  #
  #     condition1=file1,file2,file3... condition2=file4,file5...
  #
  # The second is where each file is its own condition (1 replicate):
  #
  #     file1 file2 file3
  #
  # A condition with a single file yields one sample named after the
  # condition; with several files each sample is named
  # "<condition><replicate_postfix><n>" with n counting from 1.
  #
  # Returns an array of three hashes (only ordered for ruby 1.9), in this
  # order:
  #
  #     1) Samplename to the filename
  #     2) Condition to an array of samplenames
  #     3) Samplename to condition (built with Hash#inverse)
  def self.args_to_hashes(args, replicate_postfix="-rep")
    condition_to_samplenames = {}
    samplename_to_filename = {}
    args.each do |arg|
      (condition, files) =
        if arg.include?('=')
          # split on the first '=' only, so a filename may itself contain '='
          (name, filestring) = arg.split('=', 2)
          [name, filestring.split(',')]
        else
          # NOTE(review): `basename` is not defined in this module; presumably
          # the calling script provides it -- confirm.
          [basename(arg), [arg]]
        end
      samplenames = files.each_with_index.map do |file, i|
        rep_string = (files.size == 1) ? "" : "#{replicate_postfix}#{i+1}"
        samplename = "#{condition}#{rep_string}"
        samplename_to_filename[samplename] = file
        samplename
      end
      condition_to_samplenames[condition] = samplenames
    end
    [samplename_to_filename, condition_to_samplenames, condition_to_samplenames.inverse]
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
|
2
|
+
module Mspire
  module Quant
  end
end

# Mixin holding one protein group together with the values measured for it
# across a list of experiments.
module Mspire::Quant::ProteinGroupComparison

  # a protein group object
  attr_accessor :protein_group

  # an array of experiment names
  attr_accessor :experiments

  # parallel array to experiments with the measured values
  attr_accessor :values

  # Stores the three arguments in the attributes of the same name.
  def initialize(protein_group, experiments, values)
    @protein_group = protein_group
    # must be @experiments (plural) so that the #experiments accessor sees it
    @experiments = experiments
    @values = values
  end
end

# A ProteinGroupComparison; adds no behavior of its own here.
class Mspire::Quant::ProteinGroupComparison::SpectralCounts
  include Mspire::Quant::ProteinGroupComparison
end

# A ProteinGroupComparison; adds no behavior of its own here.
class Mspire::Quant::ProteinGroupComparison::UniqAAzCounts
  include Mspire::Quant::ProteinGroupComparison
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#require 'set'
|
2
|
+
#require 'mspire/ident/protein_group'
|
3
|
+
|
4
|
+
module Mspire
  module Quant
    module SpectralCounts
      # Three tallies for a set of peptide hits: one over every hit
      # (spectral), one over distinct [aaseq, charge] pairs (aaseqcharge)
      # and one over distinct aaseqs (aaseq).
      Counts = Struct.new(:spectral, :aaseqcharge, :aaseq)
      class Counts
        def initialize(*args)
          super(*args)
          # default is zero counts
          self[0] ||= 0.0 ; self[1] ||= 0.0 ; self[2] ||= 0.0
        end
      end

      # Returns a single Counts object for the given peptide hits.
      # peptide_hits must respond to :charge and :aaseq.  If a block is
      # given it is called with each hit and must return that hit's weight
      # (typically 1/#proteins sharing the hit); without a block every hit
      # weighs 1.
      #
      # spectral is the sum of all weights.  aaseqcharge and aaseq sum one
      # weight per distinct [aaseq, charge] pair and per distinct aaseq;
      # when a key repeats, the weight of the last hit seen is the one kept.
      # An empty peptide_hits gives 0.0 for all three.
      def self.counts(peptide_hits, &share_the_pephit)
        uniq_aaseq = {}
        uniq_aaseq_charge = {}
        weights = peptide_hits.map do |hit|
          weight = share_the_pephit ? share_the_pephit.call(hit) : 1
          # later hits overwrite earlier ones under the same key; the weight
          # is expected to be the same whenever the key is the same
          uniq_aaseq_charge[[hit.aaseq, hit.charge]] = weight
          uniq_aaseq[hit.aaseq] = weight
          weight
        end
        # reduce(:+) is nil for an empty array; Counts#initialize turns
        # that into 0.0
        counts_data = [weights, uniq_aaseq_charge.values, uniq_aaseq.values].map do |array|
          array.reduce(:+)
        end
        Counts.new(*counts_data)
      end
    end
  end
end
|
40
|
+
|
41
|
+
|
42
|
+
|
@@ -4,16 +4,7 @@ require 'trollop'
|
|
4
4
|
require 'set'
|
5
5
|
require 'mspire/ident/peptide_hit/qvalue'
|
6
6
|
require 'mspire/error_rate/qvalue'
|
7
|
-
|
8
|
-
begin
|
9
|
-
require 'mascot/dat'
|
10
|
-
rescue LoadError
|
11
|
-
puts "You need the mascot-dat gem for this to work!"
|
12
|
-
puts "AND IT MUST BE THE PRINCELAB GITHUB FORK until changes get incorporated upstream!"
|
13
|
-
puts "> gem install mascot-dat"
|
14
|
-
raise LoadError
|
15
|
-
end
|
16
|
-
raise "need princelab mascot-dat gem!" unless Mascot::DAT::VERSION == "0.3.1.1"
|
7
|
+
require 'mspire/mascot/dat'
|
17
8
|
|
18
9
|
# target-decoy bundle
|
19
10
|
SearchBundle = Struct.new(:target, :decoy) do
|
@@ -28,36 +19,35 @@ end
|
|
28
19
|
|
29
20
|
PSM = Struct.new(:search_id, :id, :aaseq, :charge, :score)
|
30
21
|
|
31
|
-
|
32
|
-
def charge_string_to_charge(st)
|
33
|
-
md = st.match(/(\d)([\+\-])/)
|
34
|
-
i = md[1].to_i
|
35
|
-
i *= -1 if (md[2] == '-')
|
36
|
-
i
|
37
|
-
end
|
38
|
-
|
39
|
-
def read_mascot_dat_hits(dat_file)
|
22
|
+
# Returns the run name recorded in the first "FILE=" line (matched case
# insensitively) of dat_file, or nil when the file has no such line.
#
# The value is stripped of surrounding whitespace (including the "\r" left by
# CRLF line endings), a leading "File Name: " label is removed, and a
# trailing .mgf/.raw/.mzxml/.mzml extension (any case) is removed.
def run_name_from_dat(dat_file)
  IO.foreach(dat_file) do |line|
    next unless line =~ /^FILE=(.*)/i
    name = $1.strip
    name.sub!(/^File Name: /, '')
    # the dot is escaped so that only a real extension is removed
    # (e.g. "sample_raw" keeps its name)
    name.sub!(/\.(mgf|raw|mzxml|mzml)\z/i, '')
    return name
  end
  nil
end
|
34
|
+
|
35
|
+
# Reads the peptide hits of a mascot dat file and returns them as a
# SearchBundle built from two arrays of PSM structs: the first collected with
# each_peptide(true, 1), the second with each_peptide(false, 1).
#
# Every PSM carries the run name from run_name_from_dat, the title and charge
# of the hit's query, the hit's sequence and its ions score.
#
# NOTE(review): the return value relies on Mspire::Mascot::Dat.open returning
# the value of its block -- confirm.
def read_mascot_dat_hits(dat_file)
  filename = run_name_from_dat(dat_file)

  Mspire::Mascot::Dat.open(dat_file) do |dat|
    # NOTE(review): an earlier comment here reported different results when
    # 'map' was chained onto each_peptide and announced collecting by hand
    # instead, yet the code does chain map -- confirm which is intended.
    target_and_decoy = [true, false].map do |target_or_decoy|
      # NOTE(review): presumably true selects target hits and false decoy
      # hits (matching SearchBundle's member order); the meaning of the
      # second argument (1) is not visible here -- confirm.
      dat.each_peptide(target_or_decoy, 1).map do |pephit|
        query = dat.query(pephit.query_num)
        PSM.new(filename, query.title, pephit.seq, query.charge, pephit.ions_score)
      end
    end
    SearchBundle.new(*target_and_decoy)
  end
end
|
62
52
|
|
63
53
|
|
@@ -66,16 +56,18 @@ def putsv(*args)
|
|
66
56
|
$stdout.flush
|
67
57
|
end
|
68
58
|
|
69
|
-
EXT = Mspire::Ident::PeptideHit::Qvalue::FILE_EXTENSION
|
70
59
|
combine_base = "combined"
|
71
60
|
|
61
|
+
EXT = Mspire::Ident::PeptideHit::Qvalue::FILE_EXTENSION
|
62
|
+
|
72
63
|
opts = Trollop::Parser.new do
|
73
|
-
#banner %Q{usage: #{File.basename(__FILE__)} <target>.xml <decoy>.xml ...
|
74
64
|
banner %Q{usage: #{File.basename(__FILE__)} <mascot>.dat ...
|
75
|
-
outputs: <mascot
|
76
|
-
|
77
|
-
|
65
|
+
outputs: <mascot>#{EXT}
|
66
|
+
|
67
|
+
assumes a decoy search was run *with* the initial search
|
68
|
+
phq.tsv?: see schema/peptide_hit_qvalues.phq.tsv
|
78
69
|
}
|
70
|
+
text ""
|
79
71
|
opt :combine, "groups target and decoy hits together from all files, writing to #{combine_base}#{EXT}", :default => false
|
80
72
|
opt :z_together, "do not group by charge state", :default => false
|
81
73
|
opt :verbose, "be verbose", :default => false
|
@@ -100,19 +92,19 @@ to_run = {}
|
|
100
92
|
if opt[:combine]
|
101
93
|
putsv "combining all target hits together and all decoy hits together"
|
102
94
|
bundle = SearchBundle.new.combine(bundles)
|
103
|
-
to_run[combine_base
|
95
|
+
to_run[combine_base] = bundle
|
104
96
|
else
|
105
97
|
files.zip(bundles) do |file, bundle|
|
106
|
-
to_run[file.chomp(File.extname(file))
|
98
|
+
to_run[file.chomp(File.extname(file))] = bundle
|
107
99
|
end
|
108
100
|
end
|
109
101
|
|
110
|
-
to_run.each do |
|
111
|
-
putsv "calculating qvalues for #{
|
112
|
-
|
113
|
-
|
114
|
-
outfile = Mspire::Ident::PeptideHit::Qvalue.
|
102
|
+
to_run.each do |file_base, bundle|
|
103
|
+
putsv "calculating qvalues for #{file_base}"
|
104
|
+
hit_and_qvalue_pairs = Mspire::ErrorRate::Qvalue.target_decoy_qvalues(bundle.target, bundle.decoy, :z_together => opt[:z_together])
|
105
|
+
|
106
|
+
outfile = Mspire::Ident::PeptideHit::Qvalue.to_phq(file_base, *hit_and_qvalue_pairs.transpose)
|
107
|
+
|
115
108
|
putsv "created: #{outfile}"
|
116
109
|
end
|
117
110
|
|
118
|
-
|