mspire 0.8.5 → 0.8.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/hash/inverse.rb +15 -0
- data/lib/mspire/error_rate/qvalue.rb +5 -5
- data/lib/mspire/fasta.rb +2 -0
- data/lib/mspire/ident/peptide/db/creator.rb +48 -58
- data/lib/mspire/ident/peptide/db/io.rb +5 -0
- data/lib/mspire/ident/peptide_hit/qvalue.rb +2 -2
- data/lib/mspire/ident/peptide_hit.rb +2 -2
- data/lib/mspire/ident/protein_group.rb +4 -2
- data/lib/mspire/isotope/aa.rb +10 -10
- data/lib/mspire/mzml/instrument_configuration.rb +10 -3
- data/lib/mspire/quant/cmdline.rb +42 -0
- data/lib/mspire/quant/protein_group_comparison.rb +29 -0
- data/lib/mspire/quant/spectral_counts.rb +42 -0
- data/script/fasta_to_peptide_centric_db.rb +5 -0
- data/script/mascot_dat_to_peptide_hit_qvalues.rb +37 -45
- data/script/mass_correct.rb +118 -0
- data/script/minimal_protein_set.rb +345 -0
- data/script/mzml_to_mgf.rb +46 -0
- data/script/peptide_hit_qvalues_to_spectral_counts_table.rb +275 -0
- data/spec/mspire/ident/peptide/db/creator_spec.rb +11 -0
- data/spec/testfiles/mspire/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +157 -157
- metadata +11 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.
|
1
|
+
0.8.6
|
data/lib/hash/inverse.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
# inverse from Tilo Sloboda (now in facets)
|
4
|
+
|
5
|
+
class Hash
  # Returns a new hash that maps each value of this hash back to the key it
  # was stored under.
  #
  # * A value that is an Array contributes one entry per element.
  # * When several keys lead to the same value, the entry becomes a flat
  #   array of those keys, most recently visited key first.
  # * When only one key leads to a value, the entry is that key itself
  #   (not wrapped in an array).
  def inverse
    each_with_object({}) do |(key, value), inverted|
      targets = value.is_a?(Array) ? value : [value]
      targets.each do |target|
        inverted[target] =
          if inverted.key?(target)
            [key, inverted[target]].flatten
          else
            key
          end
      end
    end
  end
end
|
14
|
+
|
15
|
+
|
@@ -22,8 +22,8 @@ module Mspire
|
|
22
22
|
# Proc.new doesn't do arity checking
|
23
23
|
hit_with_qvalue_pairs = Proc.new do |hits|
|
24
24
|
sorted_best_to_worst = (hits.sort_by(&sorting)).reverse
|
25
|
-
(
|
26
|
-
|
25
|
+
(sorted_target_hits, qvalues) = Mspire::ErrorRate::Qvalue.mixed_target_decoy(sorted_best_to_worst, target_set, opts)
|
26
|
+
sorted_target_hits.zip(qvalues)
|
27
27
|
end
|
28
28
|
|
29
29
|
all_together = target_hits + decoy_hits
|
@@ -49,13 +49,13 @@ module Mspire
|
|
49
49
|
opts = {:monotonic => true}.merge(opts)
|
50
50
|
num_target = 0 ; num_decoy = 0
|
51
51
|
monotonic = opts[:monotonic]
|
52
|
-
|
52
|
+
sorted_target_hits = []
|
53
53
|
qvalues = []
|
54
54
|
best_to_worst.each do |hit|
|
55
55
|
if target_setlike.include?(hit)
|
56
56
|
num_target += 1
|
57
57
|
precision = Mspire::ErrorRate::Decoy.precision(num_target, num_decoy)
|
58
|
-
|
58
|
+
sorted_target_hits << hit
|
59
59
|
qvalues << (1.0 - precision)
|
60
60
|
else
|
61
61
|
num_decoy += 1
|
@@ -72,7 +72,7 @@ module Mspire
|
|
72
72
|
end
|
73
73
|
end.reverse
|
74
74
|
end
|
75
|
-
[
|
75
|
+
[sorted_target_hits, qvalues]
|
76
76
|
end
|
77
77
|
|
78
78
|
|
data/lib/mspire/fasta.rb
CHANGED
@@ -61,6 +61,8 @@ module Mspire
|
|
61
61
|
# takes the header string and returns the uniprot id
|
62
62
|
#
|
63
63
|
# 'sp|Q04917|1433F_HUMAN' #=> 'Q04917'
|
64
|
+
# This can also be found with BioFastaFormat#accession (but it may be much
|
65
|
+
# slower)
|
64
66
|
def self.uniprot_id(header)
  # the id is the text between the first and second '|' of the header;
  # nil when the header does not contain two '|' separated fields
  matched = header.match(/^[^\|]+\|([^\|]+)\|/)
  matched ? matched[1] : nil
end
|
@@ -43,8 +43,11 @@ class Mspire::Ident::Peptide::Db::Creator
|
|
43
43
|
op.on("--no-expand-x", "don't enumerate aa possibilities", "(removes these peptides)") { opt[:expand_aa] = false }
|
44
44
|
op.on("--no-uniprot", "use entire protid section of fasta header", "for non-uniprot fasta files") { opt[:uniprot] = false }
|
45
45
|
op.on("--trie", "use a trie (for very large uniprot files)", "must have fast_trie gem installed") {|v| opt[:trie] = v }
|
46
|
+
|
46
47
|
op.on("-e", "--enzyme <name>", "enzyme for digestion") {|v| opt[:enzyme] = Mspire::Insilico::Digester.const_get(v.upcase) }
|
48
|
+
op.on("-v", "--verbose", "talk about it") { $VERBOSE = 5 }
|
47
49
|
op.on("--list-enzymes", "lists approved enzymes and exits") do
|
50
|
+
op.on("-v", "--verbose", "talk about it") { $VERBOSE = 5 }
|
48
51
|
puts Mspire::Digester::ENZYMES.keys.join("\n")
|
49
52
|
exit
|
50
53
|
end
|
@@ -95,7 +98,7 @@ class Mspire::Ident::Peptide::Db::Creator
|
|
95
98
|
(pep =~ letters_to_expand_re) ? expand_peptides(pep, EXPAND_AA) : pep
|
96
99
|
end
|
97
100
|
else
|
98
|
-
peptides.
|
101
|
+
peptides.select {|pep| pep !~ letters_to_expand_re }
|
99
102
|
end
|
100
103
|
header = prot.header
|
101
104
|
id = opts[:uniprot] ? Mspire::Fasta.uniprot_id(header) : header.split(/\s+/).first
|
@@ -118,15 +121,25 @@ class Mspire::Ident::Peptide::Db::Creator
|
|
118
121
|
hash_like = hash_like_from_digestion_file(digestion_file, opts[:min_length], opts[:trie])
|
119
122
|
|
120
123
|
base = digestion_file.chomp(File.extname(digestion_file))
|
121
|
-
final_outfile =
|
124
|
+
final_outfile =
|
125
|
+
if opts[:trie]
|
126
|
+
base + ".min_aaseq#{opts[:min_length]}"
|
127
|
+
else
|
128
|
+
base + ".min_aaseq#{opts[:min_length]}" + ".yml"
|
129
|
+
end
|
122
130
|
|
123
131
|
start_time = Time.now
|
124
132
|
print "Writing #{hash_like.size} peptides to #{} ..." if $VERBOSE
|
125
133
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
134
|
+
if opts[:trie]
|
135
|
+
trie = hash_like
|
136
|
+
trie.save(final_outfile)
|
137
|
+
else
|
138
|
+
File.open(final_outfile, 'w') do |out|
|
139
|
+
hash_like.each do |k,v|
|
140
|
+
out.puts( [k, v.join(Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER)].join(Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER) )
|
141
|
+
#out.puts "#{k}#{Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER}#{v}"
|
142
|
+
end
|
130
143
|
end
|
131
144
|
end
|
132
145
|
puts "#{Time.now - start_time} sec" if $VERBOSE
|
@@ -137,71 +150,47 @@ class Mspire::Ident::Peptide::Db::Creator
|
|
137
150
|
File.expand_path(final_outfile)
|
138
151
|
end
|
139
152
|
|
140
|
-
def
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
if val.nil?
|
146
|
-
self.add(key,"")
|
147
|
-
self.get(key)
|
148
|
-
else
|
149
|
-
val
|
150
|
-
end
|
153
|
+
# Loads the optional 'trie' library on demand and returns a new, empty Trie.
#
# Raises LoadError with an installation hint when 'trie' cannot be required.
def get_a_trie
  begin
    require 'trie'
  rescue LoadError
    # LoadError descends from ScriptError, not StandardError, so a bare
    # `rescue` would never see it and the hint below would never be raised.
    raise LoadError, "must first install fast_trie"
  end
  Trie.new
end
|
154
161
|
|
155
162
|
def hash_like_from_digestion_file(digestion_file, min_length, use_trie=false)
|
156
|
-
cnt = 0
|
157
163
|
if use_trie
|
158
|
-
|
159
|
-
#puts "using trie" if $VERBOSE
|
160
|
-
#trie = hash_like_tree
|
161
|
-
#line_cnt = 0
|
162
|
-
#::IO.foreach(digestion_file) do |line|
|
163
|
-
#line_cnt += 1
|
164
|
-
##puts "LINE COUND"
|
165
|
-
##p line_cnt
|
166
|
-
#(prot, *peps) = line.chomp!.split(/\s+/)
|
167
|
-
##p peps
|
168
|
-
##p peps.class
|
169
|
-
## prot is something like this: "P31946"
|
170
|
-
#puts line
|
171
|
-
#peps.each do |pep|
|
172
|
-
#if pep.size >= min_length
|
173
|
-
#to_set =
|
174
|
-
#if val = trie.get(pep)
|
175
|
-
#val + Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER + prot
|
176
|
-
#else
|
177
|
-
#prot
|
178
|
-
#end
|
179
|
-
#p to_set.size
|
180
|
-
#trie.add(pep, to_set)
|
181
|
-
#end
|
182
|
-
#end
|
183
|
-
#cnt += 1
|
184
|
-
#puts cnt if (cnt % 1000) == 0
|
185
|
-
#end
|
186
|
-
#abort "HERE"
|
187
|
-
#trie
|
188
|
-
else
|
189
|
-
hash = {}
|
164
|
+
trie = get_a_trie
|
190
165
|
::IO.foreach(digestion_file) do |line|
|
191
|
-
|
166
|
+
line.chomp!
|
167
|
+
(prot, *peps) = line.split(/\s+/)
|
192
168
|
# prot is something like this: "P31946"
|
169
|
+
peps.uniq!
|
193
170
|
peps.each do |pep|
|
194
171
|
if pep.size >= min_length
|
195
|
-
if
|
196
|
-
|
172
|
+
if trie.has_key?(pep)
|
173
|
+
ar = trie.get(pep)
|
174
|
+
ar << prot
|
197
175
|
else
|
198
|
-
|
176
|
+
trie.add( pep, [prot] )
|
199
177
|
end
|
200
|
-
hash[pep] = val
|
201
178
|
end
|
202
179
|
end
|
203
|
-
|
204
|
-
|
180
|
+
end
|
181
|
+
trie
|
182
|
+
else
|
183
|
+
hash = Hash.new {|h,k| h[k] = [] }
|
184
|
+
::IO.foreach(digestion_file) do |line|
|
185
|
+
line.chomp!
|
186
|
+
(prot, *peps) = line.split(/\s+/)
|
187
|
+
# prot is something like this: "P31946"
|
188
|
+
peps.uniq!
|
189
|
+
peps.each do |pep|
|
190
|
+
if pep.size >= min_length
|
191
|
+
hash[pep] << prot
|
192
|
+
end
|
193
|
+
end
|
205
194
|
end
|
206
195
|
hash
|
207
196
|
end
|
@@ -215,6 +204,7 @@ class Mspire::Ident::Peptide::Db::Creator
|
|
215
204
|
# Builds a peptide centric database from a fasta file: writes the digestion
# file, then hands it to db_from_fasta_digestion_file. opts are layered over
# DEFAULT_PEPTIDE_CENTRIC_DB. Returns whatever db_from_fasta_digestion_file
# returns.
def create(fasta_file, opts={})
  merged_opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
  digestion_file = create_digestion_file(fasta_file, merged_opts)
  if $VERBOSE
    puts "created file of size: #{File.size(digestion_file)}"
  end
  db_from_fasta_digestion_file(digestion_file, merged_opts)
end
|
220
210
|
|
@@ -6,6 +6,7 @@ class Mspire::Ident::Peptide::Db::IO
|
|
6
6
|
# behaves like a hash once it is opened.
|
7
7
|
include Enumerable
|
8
8
|
def self.open(filename, &block)
|
9
|
+
#p filename
|
9
10
|
raise ArgumentError unless block
|
10
11
|
File.open(filename) do |io|
|
11
12
|
block.call(self.new(io))
|
@@ -39,6 +40,10 @@ class Mspire::Ident::Peptide::Db::IO
|
|
39
40
|
string.split(Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER)
|
40
41
|
end
|
41
42
|
|
43
|
+
def key?(key)
|
44
|
+
@index[key]
|
45
|
+
end
|
46
|
+
|
42
47
|
# number of entries
|
43
48
|
def size ; @index.size end
|
44
49
|
alias_method :length, :size
|
@@ -12,7 +12,7 @@ class Mspire::Ident::PeptideHit
|
|
12
12
|
|
13
13
|
class << self
|
14
14
|
|
15
|
-
# writes to the file, adding an extension
|
15
|
+
# writes to the file, adding an extension. returns the filename
|
16
16
|
def to_phq(base, hits, qvalues=[])
|
17
17
|
to_file(base + FILE_EXTENSION, hits, qvalues)
|
18
18
|
end
|
@@ -20,7 +20,7 @@ class Mspire::Ident::PeptideHit
|
|
20
20
|
# writes the peptide hits to a phq.tsv file. qvalues is a parallel array
|
21
21
|
# to hits that can provide qvalues if not inherent to the hits
|
22
22
|
# returns the filename. Expects each hit to implement #search_id, #id,
|
23
|
-
# #aaseq and #charge
|
23
|
+
# #aaseq and #charge. returns the filename
|
24
24
|
def to_file(filename, hits, qvalues=[])
|
25
25
|
File.open(filename,'w') do |out|
|
26
26
|
out.puts HEADER.join(FILE_DELIMITER)
|
@@ -38,8 +38,10 @@ module Mspire
|
|
38
38
|
# note to self: I wrote this in 2011, so I think I know what I'm doing now
|
39
39
|
protein_to_peptides = Hash.new {|h,k| h[k] = Set.new }
|
40
40
|
peptide_hits.each do |peptide_hit|
|
41
|
-
peptide_hit.proteins
|
42
|
-
|
41
|
+
if prots = peptide_hit.proteins
|
42
|
+
prots.each do |protein|
|
43
|
+
protein_to_peptides[protein] << peptide_hit
|
44
|
+
end
|
43
45
|
end
|
44
46
|
end
|
45
47
|
peptides_to_protein_group = Hash.new {|h,k| h[k] = [] }
|
data/lib/mspire/isotope/aa.rb
CHANGED
@@ -8,27 +8,27 @@ module Mspire
|
|
8
8
|
# and OH on the ends)
|
9
9
|
aa_to_el_hash = {
|
10
10
|
'A' => { :c =>3, :h =>5 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
11
|
-
'R' => { :c =>6, :h =>12 , :o =>1 , :n =>4 , :s =>0 , :p =>0, :se =>0 },
|
12
|
-
'N' => { :c =>4, :h =>6 , :o =>2 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
|
13
|
-
'D' => { :c =>4, :h =>5 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
14
11
|
'C' => { :c =>3, :h =>5 , :o =>1 , :n =>1 , :s =>1 , :p =>0, :se =>0 },
|
12
|
+
'D' => { :c =>4, :h =>5 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
15
13
|
'E' => { :c =>5, :h =>7 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
16
|
-
'
|
14
|
+
'F' => { :c =>9, :h =>9 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
17
15
|
'G' => { :c =>2, :h =>3 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
18
|
-
'H' => { :c =>6, :h =>7 , :o =>1 , :n =>3 , :s =>0 , :p =>0, :se =>0 },
|
19
16
|
'I' => { :c =>6, :h =>11 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
20
|
-
'
|
17
|
+
'H' => { :c =>6, :h =>7 , :o =>1 , :n =>3 , :s =>0 , :p =>0, :se =>0 },
|
21
18
|
'K' => { :c =>6, :h =>12 , :o =>1 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
|
19
|
+
'L' => { :c =>6, :h =>11 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
22
20
|
'M' => { :c =>5, :h =>9 , :o =>1 , :n =>1 , :s =>1 , :p =>0, :se =>0 },
|
23
|
-
'
|
21
|
+
'N' => { :c =>4, :h =>6 , :o =>2 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
|
22
|
+
'O' => { :c =>12, :h =>19 , :o =>2 , :n =>3 , :s =>0 , :p =>0, :se =>0 },
|
24
23
|
'P' => { :c =>5, :h =>7 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
24
|
+
'Q' => { :c =>5, :h =>8 , :o =>2 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
|
25
|
+
'R' => { :c =>6, :h =>12 , :o =>1 , :n =>4 , :s =>0 , :p =>0, :se =>0 },
|
25
26
|
'S' => { :c =>3, :h =>5 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
26
27
|
'T' => { :c =>4, :h =>7 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
28
|
+
'U' => { :c =>3, :h =>5 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>1 },
|
29
|
+
'V' => { :c =>5, :h =>9 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
27
30
|
'W' => { :c =>11, :h =>10 , :o =>1 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
|
28
31
|
'Y' => { :c =>9, :h =>9 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
29
|
-
'V' => { :c =>5, :h =>9 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
|
30
|
-
'U' => { :c =>3, :h =>5 , :o =>1 , :n =>1 , :s =>0 , :p =>0, :se =>1 },
|
31
|
-
'O' => { :c =>12, :h =>19 , :o =>2 , :n =>3 , :s =>0 , :p =>0, :se =>0 }
|
32
32
|
}
|
33
33
|
|
34
34
|
#
|
@@ -35,10 +35,17 @@ module Mspire
|
|
35
35
|
def self.from_xml(xml, link)
|
36
36
|
obj = self.new(xml[:id])
|
37
37
|
next_n = obj.describe_from_xml!(xml, link[:ref_hash])
|
38
|
-
if next_n && next_n.name == 'componentList'
|
38
|
+
if next_n && (next_n.name == 'componentList')
|
39
39
|
obj.components = next_n.children.map do |component_n|
|
40
|
-
|
41
|
-
|
40
|
+
if component_n.is_a?(Nokogiri::XML::Text)
|
41
|
+
# TODO: this is a fix for when there is an empty component list but
|
42
|
+
# Nokogiri returns a text node. Really this needs to be fixed
|
43
|
+
# in our xml writer!
|
44
|
+
nil
|
45
|
+
else
|
46
|
+
Mspire::Mzml.const_get(component_n.name.capitalize).new.describe_self_from_xml!(component_n, link[:ref_hash])
|
47
|
+
end
|
48
|
+
end.compact
|
42
49
|
next_n = next_n.next
|
43
50
|
end
|
44
51
|
if next_n && next_n.name == 'softwareRef'
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'hash/inverse'
|
2
|
+
|
3
|
+
module Mspire ; module Quant ; end ; end

module Mspire::Quant::Cmdline

  # expects arguments in one of two forms.  The first form is grouped by
  # condition as shown:
  #
  #     condition1=file1,file2,file3... condition2=file4,file5...
  #
  # The second is where each file is its own condition (1 replicate):
  #
  #     file1 file2 file3
  #
  # Sample names are the condition name, followed by
  # "#{replicate_postfix}#{n}" (n counting from 1) when the condition has
  # more than one file.
  #
  # Returns an array of three hashes (only ordered for ruby 1.9), in this
  # order:
  #
  #     1) Samplename to the filename
  #     2) Condition to an array of samplenames
  #     3) Samplename to condition (Hash#inverse of the second hash)
  def self.args_to_hashes(args, replicate_postfix="-rep")
    condition_to_samplenames = {}
    samplename_to_filename = {}
    args.each do |arg|
      (condition, files) =
        if arg.include?('=')
          # split on the first '=' only, so file names that themselves
          # contain '=' are kept intact
          (name, filestring) = arg.split('=', 2)
          [name, filestring.split(',')]
        else
          # NOTE(review): basename is not defined in this module; presumably
          # the calling script supplies it -- confirm before using this
          # form from library code.
          [basename(arg), [arg]]
        end
      sample_to_file_pairs = files.each_with_index.map do |file,i|
        rep_string = (files.size == 1) ? "" : "#{replicate_postfix}#{i+1}"
        ["#{condition}#{rep_string}", file]
      end
      sample_to_file_pairs.each {|name,file| samplename_to_filename[name] = file }
      condition_to_samplenames[condition] = sample_to_file_pairs.map(&:first)
    end
    [samplename_to_filename, condition_to_samplenames, condition_to_samplenames.inverse]
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
|
2
|
+
module Mspire
  module Quant
  end
end

# Mixin holding one protein group together with the values measured for it
# in each experiment.  It supplies the accessors and the constructor for the
# classes that include it.
module Mspire::Quant::ProteinGroupComparison

  # a protein group object
  attr_accessor :protein_group

  # an array of experiment names
  attr_accessor :experiments

  # parallel array to experiments with the measured values
  attr_accessor :values

  # protein_group - the protein group object
  # experiments   - array of experiment names
  # values        - array parallel to experiments with the measured values
  def initialize(protein_group, experiments, values)
    # must be @experiments (plural) so the experiments accessor sees it
    (@protein_group, @experiments, @values) = protein_group, experiments, values
  end
end

class Mspire::Quant::ProteinGroupComparison::SpectralCounts
  include Mspire::Quant::ProteinGroupComparison
end

class Mspire::Quant::ProteinGroupComparison::UniqAAzCounts
  include Mspire::Quant::ProteinGroupComparison
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#require 'set'
|
2
|
+
#require 'mspire/ident/protein_group'
|
3
|
+
|
4
|
+
module Mspire
  module Quant
    module SpectralCounts
      Counts = Struct.new(:spectral, :aaseqcharge, :aaseq)
      class Counts
        # any member that was not supplied (or was nil) becomes 0.0
        def initialize(*args)
          super(*args)
          members.each {|member| self[member] ||= 0.0 }
        end
      end

      # Tallies peptide hits into a Counts object:
      #
      #     spectral    - sum of the weights of all hits
      #     aaseqcharge - sum of one weight per distinct [aaseq, charge]
      #     aaseq       - sum of one weight per distinct aaseq
      #
      # peptide_hits must respond to :charge and :aaseq.  Every hit weighs 1
      # unless a block is given, in which case the block is called with the
      # hit and returns its weight (typically 1/#proteins sharing the hit).
      # When a key is seen again the later weight replaces the earlier one.
      def self.counts(peptide_hits, &share_the_pephit)
        weight_by_aaseq_charge = {}
        weight_by_aaseq = {}
        hit_weights = []
        peptide_hits.each do |hit|
          weight = share_the_pephit ? share_the_pephit.call(hit) : 1
          weight_by_aaseq_charge[[hit.aaseq, hit.charge]] = weight
          weight_by_aaseq[hit.aaseq] = weight
          hit_weights << weight
        end
        Counts.new(
          hit_weights.reduce(:+),
          weight_by_aaseq_charge.values.reduce(:+),
          weight_by_aaseq.values.reduce(:+)
        )
      end
    end
  end
end
|
40
|
+
|
41
|
+
|
42
|
+
|
@@ -4,16 +4,7 @@ require 'trollop'
|
|
4
4
|
require 'set'
|
5
5
|
require 'mspire/ident/peptide_hit/qvalue'
|
6
6
|
require 'mspire/error_rate/qvalue'
|
7
|
-
|
8
|
-
begin
|
9
|
-
require 'mascot/dat'
|
10
|
-
rescue LoadError
|
11
|
-
puts "You need the mascot-dat gem for this to work!"
|
12
|
-
puts "AND IT MUST BE THE PRINCELAB GITHUB FORK until changes get incorporated upstream!"
|
13
|
-
puts "> gem install mascot-dat"
|
14
|
-
raise LoadError
|
15
|
-
end
|
16
|
-
raise "need princelab mascot-dat gem!" unless Mascot::DAT::VERSION == "0.3.1.1"
|
7
|
+
require 'mspire/mascot/dat'
|
17
8
|
|
18
9
|
# target-decoy bundle
|
19
10
|
SearchBundle = Struct.new(:target, :decoy) do
|
@@ -28,36 +19,35 @@ end
|
|
28
19
|
|
29
20
|
PSM = Struct.new(:search_id, :id, :aaseq, :charge, :score)
|
30
21
|
|
31
|
-
|
32
|
-
def charge_string_to_charge(st)
|
33
|
-
md = st.match(/(\d)([\+\-])/)
|
34
|
-
i = md[1].to_i
|
35
|
-
i *= -1 if (md[2] == '-')
|
36
|
-
i
|
37
|
-
end
|
38
|
-
|
39
|
-
def read_mascot_dat_hits(dat_file)
|
22
|
+
# Scans dat_file for the first line starting with FILE= (case insensitive)
# and returns the rest of that line with a leading "File Name: " and a
# trailing .mgf/.raw/.mzxml/.mzml extension removed.
#
# Returns nil when no FILE= line is found.
def run_name_from_dat(dat_file)
  filename = nil
  IO.foreach(dat_file) do |line|
    if line =~ /^FILE=(.*)/i
      filename = $1.dup
      filename.sub!(/^File Name: /,'')
      # the dot is escaped so only a real extension is stripped (an
      # unescaped '.' would also clip names such as "sample_raw")
      filename.sub!(/\.(mgf|raw|mzxml|mzml)$/i,'')
      break
    end
  end
  filename
end
|
34
|
+
|
35
|
+
# Collects the peptide hits of a mascot .dat file into a SearchBundle.
#
# For each of [true, false] (passed to Dat#each_peptide as its first
# argument; presumably true selects target hits and false decoy hits --
# confirm against Mspire::Mascot::Dat) every peptide hit becomes a PSM of
# (run name, query title, sequence, query charge, ions score).
#
# Returns the value of the Mspire::Mascot::Dat.open block, i.e. the
# SearchBundle, assuming Dat.open returns its block's value -- confirm.
def read_mascot_dat_hits(dat_file)
  filename = run_name_from_dat(dat_file)

  Mspire::Mascot::Dat.open(dat_file) do |dat|
    target_and_decoy = [true, false].map do |target_or_decoy|
      dat.each_peptide(target_or_decoy, 1).map do |pephit|
        query = dat.query(pephit.query_num)
        PSM.new(filename, query.title, pephit.seq, query.charge, pephit.ions_score)
      end
    end
    SearchBundle.new(*target_and_decoy)
  end
end
|
62
52
|
|
63
53
|
|
@@ -66,16 +56,18 @@ def putsv(*args)
|
|
66
56
|
$stdout.flush
|
67
57
|
end
|
68
58
|
|
69
|
-
EXT = Mspire::Ident::PeptideHit::Qvalue::FILE_EXTENSION
|
70
59
|
combine_base = "combined"
|
71
60
|
|
61
|
+
EXT = Mspire::Ident::PeptideHit::Qvalue::FILE_EXTENSION
|
62
|
+
|
72
63
|
opts = Trollop::Parser.new do
|
73
|
-
#banner %Q{usage: #{File.basename(__FILE__)} <target>.xml <decoy>.xml ...
|
74
64
|
banner %Q{usage: #{File.basename(__FILE__)} <mascot>.dat ...
|
75
|
-
outputs: <mascot
|
76
|
-
|
77
|
-
|
65
|
+
outputs: <mascot>#{EXT}
|
66
|
+
|
67
|
+
assumes a decoy search was run *with* the initial search
|
68
|
+
phq.tsv?: see schema/peptide_hit_qvalues.phq.tsv
|
78
69
|
}
|
70
|
+
text ""
|
79
71
|
opt :combine, "groups target and decoy hits together from all files, writing to #{combine_base}#{EXT}", :default => false
|
80
72
|
opt :z_together, "do not group by charge state", :default => false
|
81
73
|
opt :verbose, "be verbose", :default => false
|
@@ -100,19 +92,19 @@ to_run = {}
|
|
100
92
|
if opt[:combine]
|
101
93
|
putsv "combining all target hits together and all decoy hits together"
|
102
94
|
bundle = SearchBundle.new.combine(bundles)
|
103
|
-
to_run[combine_base
|
95
|
+
to_run[combine_base] = bundle
|
104
96
|
else
|
105
97
|
files.zip(bundles) do |file, bundle|
|
106
|
-
to_run[file.chomp(File.extname(file))
|
98
|
+
to_run[file.chomp(File.extname(file))] = bundle
|
107
99
|
end
|
108
100
|
end
|
109
101
|
|
110
|
-
to_run.each do |
|
111
|
-
putsv "calculating qvalues for #{
|
112
|
-
|
113
|
-
|
114
|
-
outfile = Mspire::Ident::PeptideHit::Qvalue.
|
102
|
+
to_run.each do |file_base, bundle|
|
103
|
+
putsv "calculating qvalues for #{file_base}"
|
104
|
+
hit_and_qvalue_pairs = Mspire::ErrorRate::Qvalue.target_decoy_qvalues(bundle.target, bundle.decoy, :z_together => opt[:z_together])
|
105
|
+
|
106
|
+
outfile = Mspire::Ident::PeptideHit::Qvalue.to_phq(file_base, *hit_and_qvalue_pairs.transpose)
|
107
|
+
|
115
108
|
putsv "created: #{outfile}"
|
116
109
|
end
|
117
110
|
|
118
|
-
|