ms-error_rate 0.0.9 → 0.0.10

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,157 @@
1
+
2
# A TransmembraneIndex is a hash that takes a fasta reference as key and
# returns a structured hash containing the transmembrane information.
#
# The instance methods call each, key? and [] on self, so the module is meant
# to be mixed into a hash-like object.  The concrete index classes are loaded
# lazily by TransmembraneIndex.new.
module TransmembraneIndex

  # Sniffs the format of a transmembrane prediction file by scanning it line
  # by line and stopping at the first recognised marker.
  # returns :toppred or :phobius (nil if no marker is found)
  def self.filetype(file)
    File.open(file) do |fh|
      while (line = fh.gets)
        case line
        when /SEQENCE/ # (sic) spelling is matched exactly as written
          return :phobius
        when / +0 +0 +i/
          # if they don't have the headers, this will pick it up if they have
          # a single prot without tm or signal peptide.
          # NOTE(review): tolerant of runs of spaces between the columns;
          # confirm against real header-less Phobius output.
          return :phobius
        when /Algorithm specific parameters/
          return :toppred # New text
        when /<parameters>/
          return :toppred # XML
        end
      end
    end
    nil
  end

  # Maps a fasta reference to the key used in this index.
  # needs to be subclassed or written (this default returns nil)
  def reference_to_key(reference)
  end

  # Builds the format-specific index object for +file+.
  # right now accepts toppred.out files
  # Phobius objects can use the fasta object to update their hash for methods
  # like avg_overlap
  # raises ArgumentError if the file type cannot be determined
  def self.new(file)
    case filetype(file)
    when :toppred
      require 'transmembrane/toppred'
      TopPred::Index.new(file)
    when :phobius
      require 'transmembrane/phobius'
      Phobius::Index.new(file)
    else
      raise ArgumentError, "filetype for #{file} not recognized!"
    end
  end

  # returns a hash of key -> num certain transmembrane segments
  # (0 when an entry has no :num_certain_transmembrane_segments value)
  def num_certain_index
    counts = {}
    each do |key, tm|
      counts[key] = tm[:num_certain_transmembrane_segments] || 0
    end
    counts
  end

  # tp = :number or :fraction which is the fraction of the sequence size
  # returns the average number of overlapping amino acids with transmembrane
  # segments, averaged over every occurrence of sequence in the protein
  # returns 0.0 if there is no occurrence (or no transmembrane segment)
  # returns nil if there is no protein by that key
  def avg_overlap(key, sequence, tp=:number)
    return nil unless key?(key) # the protein isn't there (happens on occasion)

    numbers = num_transmem_aa(self[key], sequence)
    return 0.0 if numbers.empty?

    avg_num = numbers.inject(0) { |sum, num| sum + num }.to_f / numbers.size
    tp == :fraction ? avg_num / sequence.size : avg_num
  end

  # returns an array (usually length of 1) of the number of amino acids
  # contained inside transmembrane spanning segments.
  # tmhash[:transmembrane_segments] holds hashes with 1-indexed :start and
  # :stop; the protein sequence is read from tmhash[:aaseq].
  # if there are no transmembrane segments, returns empty array.
  def num_transmem_aa(tmhash, sequence)
    return [] unless tmhash.key?(:transmembrane_segments)

    # convert the 1-indexed segments into 0-indexed inclusive ranges
    ranges = tmhash[:transmembrane_segments].map do |tmseg|
      Range.new(tmseg[:start] - 1, tmseg[:stop] - 1)
    end
    num_overlapping_chars(tmhash[:aaseq], ranges, sequence)
  end

  # returns an array with one entry per (non-overlapping) occurrence of
  # substring in full_sequence: the number of its characters that fall inside
  # any of the given ranges.
  # ranges should be 0 indexed!!!
  # the span includes the 'stop' position i.e., full_sequence[start..stop]
  # returns [] if there are no ranges, no occurrences, or substring is empty
  def num_overlapping_chars(full_sequence, ranges, substring)
    # an empty substring matches everywhere and would never advance the scan
    return [] if ranges.empty? || substring.empty?

    occurrences = []
    pos = 0
    slen = substring.size
    while (i = full_sequence.index(substring, pos))
      occurrences << Range.new(i, i + slen - 1)
      pos = i + slen
    end

    occurrences.map do |hit|
      ranges.inject(0) do |overlap, tm|
        # size of the intersection of two inclusive ranges; this also covers
        # a hit that completely contains the transmembrane segment
        shared = [hit.last, tm.last].min - [hit.first, tm.first].max + 1
        shared > 0 ? overlap + shared : overlap
      end
    end
  end

end
151
+
152
+
153
+ #substring_ranges = full_sequence.enum_for(:scan, substring).map do
154
+ # (ofirst, olast) = $~.offset(0)
155
+ # Range.new(ofirst, olast - 1)
156
+ # end
157
+
@@ -0,0 +1,5 @@
1
+ # structure of a very simple file for holding peptide hit qvalues
2
+ # entries should be separated by a tab!!!
3
+ aaseq charge qvalue
4
+ String Integer Float
5
+ ... ... ...
@@ -0,0 +1,26 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'yaml'
4
+ require 'set'
5
+
6
# Prints a tab-delimited table with one column per cutoff found in the given
# yml file.  Each column holds: the cutoff, the total number of hits at that
# cutoff, then the hits that did not appear at any lower cutoff.
if ARGV.size == 0
  puts "usage: prog summary__<setname>__name_to_gene_id.yml"
  exit
end

file = ARGV.shift

# NOTE(review): presumably a hash of cutoff -> array of gene ids; confirm
# against the script that writes these files.
hash = YAML.load_file(file)

previous_hits = Set.new
columns = hash.sort.map do |fdr, hits|
  new_hits = hits.reject { |hit| previous_hits.include?(hit) }
  previous_hits.merge(new_hits)
  [fdr, hits.size, *new_hits]
end

# Array#zip truncates to the length of its receiver, which dropped rows
# whenever a later column was longer than the first one; walk up to the
# tallest column instead (short columns yield empty cells).
num_rows = columns.map { |column| column.size }.max || 0
num_rows.times do |i|
  puts columns.map { |column| column[i] }.join("\t")
end
26
+
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'orderedhash'
4
+ require 'yaml'
5
+ require 'set'
6
+
7
# For every set and qvalue cutoff in summary.yml, collects the gene symbols
# (matching any glob listed in <gene_ids>.txt) of proteins that have at least
# one minimal hit, and writes one yml file per set.
if ARGV.size != 2
  puts "usage: #{File.basename(__FILE__)} <gene_ids>.txt summary.yml"
  puts "writes a yml file with unique proteins per qvalue cutoff"
  puts "for each set"
  puts "summary__<setname>__<gene_ids>.yml"
  exit
end

(gene_ids, summary) = ARGV

# one glob per line; '#' comment lines and blank lines are ignored
globs = IO.readlines(gene_ids).reject{|v| v[0,1] == '#'}.map{|v| v.chomp }.select {|v| v =~ /\w/ }

hash = YAML.load_file(summary)
protein_info = hash['protein_info']
results = hash['results']
output_hashes = OrderedHash.new
results.each do |result|
  qvalue_cutoff = result['qvalue_cutoff']
  result['sets'].each do |setname, sethash|
    matches = Set.new
    output_hashes[setname] ||= OrderedHash.new
    # a set has no 'proteins' entry when the summary holds no protein data
    (sethash['proteins'] || {}).each do |ipi, info|
      next unless info['num_hits_minimal'].first > 0
      [ipi, *info['indistinguishable']].each do |id|
        # skip proteins without header info or without a gene symbol
        # (File.fnmatch? raises on nil)
        entry = protein_info[id]
        symbol = entry && entry['Gene_Symbol']
        next unless symbol
        matches << symbol if globs.any? { |glob| File.fnmatch?(glob, symbol) }
      end
    end
    output_hashes[setname][qvalue_cutoff] = matches.to_a.sort
  end
end

gene_ids_base = File.basename(gene_ids, '.*')
summary_base = summary.chomp(File.extname(summary))
output_hashes.each do |setname, output|
  output_file = [summary_base, setname, gene_ids_base].join("__") + ".yml"

  File.open(output_file, 'w') {|out| out.print output.to_yaml }
end
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/ruby
2
+
3
# FileUtils.mv is used below; without this require the script dies with
# "uninitialized constant FileUtils" after writing the first .tmp file.
require 'fileutils'

if ARGV.size == 0
  puts "usage: #{File.basename(__FILE__)} <IPI_based>.fasta ..."
  puts "moves any leading \"><.*_>\" to the IPI value"
  puts "for example:"
  puts ">DCY_IPI:IPI0032311.1|STUFF -> >IPI:DCY_IPI0032311.1|STUFF"
  exit
end

# Rewrites each file in place by way of a <file>.tmp sibling.
ARGV.each do |file|
  tmp = file + '.tmp'
  if File.exist?(tmp)
    warn "Skipping #{file} since #{tmp} exists"
    next
  end
  File.open(tmp, 'w') do |out|
    IO.foreach(file) do |line|
      # $1 is the prefix (ending in '_') that follows '>'; move it behind "IPI:"
      if line =~ />([^\:\|]+_)/
        line.sub!("#{$1}IPI:IPI", "IPI:#{$1}IPI")
      end
      out.print line
    end
  end
  FileUtils.mv tmp, file
end
@@ -0,0 +1,366 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'yaml'
4
+ require 'set'
5
+ require 'optparse'
6
+ require 'ms/fasta'
7
+ require 'ms/fasta/ipi'
8
+
9
+ SET_RE = /Set\s+(.*)/i
10
+ QVALUE_EXT = ".qval.yml"
11
+
12
# Parses a sets-compare file: lines matching SET_RE start a new set, and every
# other non-blank line names a file (without extension) belonging to the
# current set.  File names are resolved relative to the directory of +file+
# with +ext+ appended.
# returns [sets_to_paths_hash, sets_order]
# raises ArgumentError if a file is listed before any set line
# raises RuntimeError if a referenced file does not exist
def sets_compare_to_paths(file, ext=QVALUE_EXT)
  dirname = File.dirname(File.expand_path(file))
  sets = {}
  sets_order = []
  current_set = nil
  IO.readlines(file).each do |raw_line|
    line = raw_line.chomp
    next unless line =~ /\w/ # skip blank lines
    if line =~ SET_RE
      current_set = $1.dup
      # a repeated set name keeps adding to the existing set rather than
      # silently discarding the files collected so far
      unless sets.key?(current_set)
        sets[current_set] = []
        sets_order << current_set
      end
    else
      unless current_set
        raise ArgumentError, "#{file}: '#{line}' is listed before any 'Set <name>' line"
      end
      full_path = File.join(dirname, line + ext)
      raise RuntimeError, "file #{full_path} does not exist!!" unless File.exist?(full_path)
      sets[current_set] << full_path
    end
  end
  [sets, sets_order]
end
32
+
33
# returns [minimal_protein_to_uniq_peps_hash, indistinguishable_protein_hash]
# takes a hash of proteins to aaseqs.  Uses a greedy algorithm where
# things are sorted first by the number of uniq amino acid sequences and total
# aa length.  if a block is given, then will yield the prot and the
# peptide_array and sort by the returned value.  The greedy algorithm acts on
# the REVERSE of the sorted proteins: each protein claims the peptides no
# earlier protein has claimed, and is kept only if it claims at least one.
# indistinguishable_protein_hash is keyed on the proteins in the minimal set
# and gives an array of other proteins: those that claimed nothing and whose
# full (sorted) peptide list equals that of the most recently kept protein.
def minimal_protein_set(proteins_to_aaseqs)
  blk_given = block_given?

  sorted_most_to_least = proteins_to_aaseqs.sort_by do |prot, peps|
    if blk_given
      yield(prot, peps)
    else
      [ peps.size, peps.inject(0) {|total, pep| total + pep.size } ]
    end
  end.reverse

  claimed = Set.new
  prot_to_uniq_peps_hash = {}
  same_peptide_hits = {}

  last_peps = nil
  last_uniq_prot = nil
  sorted_most_to_least.each do |prot, peps|
    # sorted so that peptide lists compare equal regardless of their order
    sorted_peps = peps.sort
    # Set#add? returns nil when the peptide was already claimed
    uniq_peps = peps.select {|pep| claimed.add?(pep) }
    if uniq_peps.empty?
      same_peptide_hits[last_uniq_prot] << prot if sorted_peps == last_peps
    else
      prot_to_uniq_peps_hash[prot] = uniq_peps
      same_peptide_hits[prot] = []
      last_peps = sorted_peps
      last_uniq_prot = prot
    end
  end

  [prot_to_uniq_peps_hash, same_peptide_hits]
end
88
+
89
# Converts command line cutoff strings to Floats.
# 'nil' and '-' mean "no threshold" and become nil.
# raises ArgumentError for anything that is not a number (String#to_f would
# silently turn a typo into a cutoff of 0.0)
def cutoffs_to_floats(ar)
  ar.map do |v|
    (v == 'nil' || v == '-') ? nil : Float(v)
  end
end
98
+
99
# returns a hash keyed on protein id that yields an array:
# [#aaseq, #aaseq_and_charge, #total_hits]
# prot_to_peps maps a protein id to its peptide sequences; seq_to_hits maps a
# peptide sequence to its hits (objects responding to sequence and charge).
# Hits are collected in a Set, so hits that are eql? are counted once.
# A protein with no peptides yields [0, 0, 0].
def stats_per_prot(prot_to_peps, seq_to_hits)
  per_protein_hash = {}
  prot_to_peps.each do |prot, uniq_pep_seqs|
    all = Set.new
    aaseqcharges = Set.new
    aaseqs = Set.new

    uniq_pep_seqs.each do |pep_seq|
      all_hits = seq_to_hits[pep_seq]
      all.merge( all_hits )
      all_hits.each do |hit|
        aaseq = hit.sequence
        aaseqs.add( aaseq )
        aaseqcharges.add( "#{aaseq}_#{hit.charge}" )
      end
    end
    # stored once per protein, after all of its peptides have been tallied
    per_protein_hash[prot] = [aaseqs.size, aaseqcharges.size, all.size]
  end
  per_protein_hash
end
122
+
123
+ opt = {
124
+ :cutoffs => [nil],
125
+ :outfile => "summary.yml",
126
+ }
127
+
128
+ opts = OptionParser.new do |op|
129
+ op.banner = "USAGE: #{File.basename(__FILE__)} sets_compare.txt"
130
+ op.separator "OUTPUT: #{opt[:outfile]}"
131
+ op.separator ""
132
+ op.separator "INPUT: "
133
+ op.separator " each <file> referenced in sets_compare.txt should have a"
134
+ op.separator " <file>.qval.yml file"
135
+ op.separator ""
136
+ op.separator "OPTIONS:"
137
+ op.on("-q", "--qvalue <0-1[,...]>", Array, "only take qvalues < given ['-' for no threshold]") {|v| opt[:cutoffs] = cutoffs_to_floats(v)}
138
+ op.separator ""
139
+ op.on("--proteins <fasta>,<pep-db>", Array, "path to fasta and peptide centric DB", "peptide_centric_db is in the format: ", "<PEPTIDE>: <ID>-<ID>-<ID>") {|v| opt[:proteins] = v }
140
+ op.separator "FORMATS:"
141
+ op.on("--output-format", "prints the output yaml scheme and exits") {|v| opt[:output_format] = v }
142
+ op.on("--input-format", "prints sets_compare.txt format and exits") {|v| opt[:input_format] = v }
143
+ end
144
+
145
+ # later on we could implement full isoform resolution like IsoformResolver
146
+ # for now we will generate a report, realizing that some isoforms may not be
147
+ # reported
148
+ # it is implemented by using a pre-made map from sequence to protein groups
149
+ # then, a set of sequences allows one to deduce all the relationships from the
150
+ # protein groups.
151
+
152
+ opts.parse!
153
+
154
+ if opt[:output_format]
155
+ yaml = <<SKEL
156
+ results:
157
+ - qvalue_cutoff: <Float>
158
+ sets:
159
+ <set_name>:
160
+ num_uniq_aaseqs: <Integer>
161
+ num_aaseqs_not_in_pep_db: <Integer>
162
+ num_uniq_aaseqs_charge: <Integer>
163
+ proteins:
164
+ <IPI_ID>:
165
+ num_hits_all:
166
+ - <Integer> # total num aaseqs
167
+ - <Integer> # total num aaseq+charge
168
+ - <Integer> # total num hits
169
+ num_hits_minimal:
170
+ - <Integer> # total num aaseqs
171
+ - <Integer> # total num aaseq+charge
172
+ - <Integer> # total num hits
173
+ indistinguishable:
174
+ - <IPI_ID>
175
+ - <IPI_ID>
176
+ aaseqs:
177
+ - <String>
178
+ - <String>
179
+ sets_order:
180
+ - <String>
181
+ - <String>
182
+ protein_info:
183
+ <IPI_ID>:
184
+ Gene_Symbol: <String>
185
+ IPI: <IPI_ID>
186
+ Tax_Id: <String>
187
+ SWISS-PROT: <String>
188
+ description: <String>
189
+ ENSEMBL: <String>
190
+ SKEL
191
+ print yaml
192
+ exit
193
+ end
194
+
195
+ if opt[:input_format]
196
+ string =<<EXPLANATION
197
+ # the sets_compare.yml format is very simple:
198
+
199
+ Set <some_name_for_set1>
200
+ filename1_no_ext
201
+ filename2_no_ext
202
+ Set <some_name_for_set2>
203
+ filename3_no_ext
204
+ filename4_no_ext
205
+ ...
206
+ EXPLANATION
207
+ puts string
208
+ exit
209
+ end
210
+
211
+ if ARGV.size != 1
212
+ puts opts.to_s
213
+ exit
214
+ end
215
+
216
+
217
+ results = {}
218
+
219
+ protein_info = {}
220
+ results['protein_info'] = protein_info
221
+ results['results'] = []
222
+
223
+ (sets_hash, sets_order) = sets_compare_to_paths(ARGV.shift)
224
+ results['sets_order'] = sets_order
225
+
226
+ if opt[:proteins]
227
+ (fasta, pep_db_file) = opt[:proteins]
228
+
229
+ # a hash indexed on ipi containing all info
230
+ prot_header_hash = {}
231
+
232
+ STDERR.print "Loading information from fasta file..."
233
+ start = Time.now
234
+ prot_sizes_hash = {}
235
+ Ms::Fasta.open(fasta, 'rb', :io_index => []) do |obj|
236
+ obj.each do |entry|
237
+ hash = Ms::Fasta::Ipi.parse(entry.header)
238
+ ipi = hash['IPI']
239
+ prot_header_hash[ipi] = hash
240
+ prot_sizes_hash[ipi] = entry.sequence.size
241
+ end
242
+ end
243
+ STDERR.puts "#{Time.now - start} seconds."
244
+
245
+ STDERR.print "Loading peptide centric DB (this takes about a minute)..."
246
+ start = Time.now
247
+ pep_db = YAML.load_file(pep_db_file)
248
+ STDERR.puts "#{Time.now - start} seconds."
249
+
250
+ end
251
+
252
# Builds one result entry per qvalue cutoff; each entry holds the statistics
# of every set at that cutoff.
opt[:cutoffs].each do |cutoff|

  cutoff_results = {'qvalue_cutoff' => cutoff}
  results_sets_hash = {}
  cutoff_results['sets'] = results_sets_hash
  results['results'] << cutoff_results

  #########################
  # FOR EACH SET:
  #########################
  pep_klass = nil
  sets_hash.each do |set, files|
    set_results = {}
    results_sets_hash[set] = set_results

    # assumes the indices are the same into each data file
    # (the Struct is built from the headers of the first file read)

    # get the complete set of passing hits
    all_passing_hits = []
    files.each do |file|
      qval_data = YAML.load_file(file)

      header_hash = qval_data['headers']
      pep_klass ||= Struct.new(*(header_hash.map {|v| v.to_sym }))
      hits = qval_data['data'].map {|v| pep_klass.new(*v) }

      # a nil cutoff means no threshold
      passing_hits = cutoff ? hits.select {|hit| hit.qvalue <= cutoff } : hits
      # concat rather than push(*ary): no need to splat a potentially huge
      # array into method arguments
      all_passing_hits.concat(passing_hits)
    end

    # create an index from aaseq to hits
    seq_to_hits = Hash.new {|h,k| h[k] = []}
    uniq_seqcharge = Set.new
    all_passing_hits.each do |hit|
      seq_to_hits[hit.sequence] << hit
      uniq_seqcharge.add( hit.sequence + '_' + hit.charge.to_s )
    end

    # determine the number of uniq aaseqs
    uniq_seqs = seq_to_hits.size

    num_uniq_seqcharges = uniq_seqcharge.size

    set_results.merge!( { 'num_peptide_hits' => all_passing_hits.size,
                          'num_uniq_aaseqs' => uniq_seqs,
                          'num_uniq_aaseqs_charge' => num_uniq_seqcharges,
    })

    if opt[:proteins]

      # create an index from proteins to peptides
      prots_to_peps = Hash.new {|h,k| h[k] = [] }
      peptides_not_found = []
      seq_to_hits.keys.each do |seq|
        if pep_db.key?(seq)
          # pep_db values look like "<ID>-<ID>-<ID>"
          pep_db[seq].split('-').each do |prot|
            prots_to_peps[prot] << seq
          end
        else
          peptides_not_found << seq
        end
      end

      # Determine the number of 1) hits, 2) aaseqs, 3) aaseqcharges per protein BEFORE minimization
      stats_per_protein_before = stats_per_prot(prots_to_peps, seq_to_hits)

      # get the minimal protein set
      (prot_to_uniq_peps_hash, indistinguishable_protein_hash) = minimal_protein_set(prots_to_peps) do |prot,peps|
        # ties on peptide count and total length go to the smaller protein;
        # a protein missing from the fasta file is given size 0 instead of
        # crashing on -(nil)
        [ peps.size, peps.inject(0){|m,s| m+s.size}, -(prot_sizes_hash[prot] || 0)]
      end

      # record the header info of every reported protein
      prot_to_uniq_peps_hash.each_key do |prot|
        [prot, *indistinguishable_protein_hash[prot]].each do |id|
          protein_info[id] = prot_header_hash[id]
        end
      end

      stats_per_protein_minimal = stats_per_prot(prot_to_uniq_peps_hash, seq_to_hits)

      # create a hash of data for each protein
      protein_data_hashes_hash = {}
      prot_to_uniq_peps_hash.each do |prot, peps|
        protein_data_hashes_hash[prot] = {
          'aaseqs' => peps,
          # this will be a triplet
          'num_hits_minimal' => stats_per_protein_minimal[prot],
          'indistinguishable' => indistinguishable_protein_hash[prot],
          'num_hits_all' => stats_per_protein_before[prot],
        }
      end

      set_results['proteins'] = protein_data_hashes_hash
      set_results['num_proteins'] = prot_to_uniq_peps_hash.size
      set_results['num_aaseqs_not_in_pep_db'] = peptides_not_found.size
      if peptides_not_found.size > 0
        warn "Did not find in peptide centric db: #{peptides_not_found.join(', ')}"
      end
    end
  end
end
361
+
362
+ File.open(opt[:outfile], 'w') do |out|
363
+ out.print results.to_yaml
364
+ end
365
+
366
+
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/ruby -w
2
+
3
+ require 'set'
4
+ require 'yaml'
5
+ require 'optparse'
6
+
7
# Combines the hits of one or more precision yml files and reports the number
# of peptide hits, unique aaseqs and unique aaseq+charge combinations.
opt = {}
opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} <precision_file>.yml ..."
  op.separator "outputs information collected by combining hits from files:"
  op.separator "---"
  op.separator "filenames: "
  op.separator "- <pathgiven>"
  op.separator "num_unique_aaseqs: <Int>"
  op.separator "num_unique_aaseqs_charge: <Int>"
  op.separator "num_peptide_hits: <Int>"
  op.separator ""
  op.separator "NOTE: if a precision cutoff is given, all hits that have a better"
  op.separator "score than the worst score at the cutoff are included, even if "
  op.separator "the precision for that hit was below the cutoff"
  op.separator "this prevents early, local aberrations in precision from messing"
  op.separator "up the analysis"
  op.separator ""
  op.on("-p", "--precision <0-1>", Float, "precision cutoff") {|v| opt[:cutoff] = v }
  op.on("-f", "--fdr <0-1>", Float, "false discovery rate cutoff (1-precision)") {|v| opt[:cutoff] = 1.0 - v }
end

opts.parse!

if ARGV.size == 0
  puts opts.to_s
  exit
end

unique_sequences = Set.new
unique_ions = Set.new
all_hits = []

ARGV.each do |file|
  hash = YAML.load_file(file)

  headers = hash['headers']
  prec_index = headers.index('precision')
  mowse_index = headers.index('mowse')
  aaseq_index = headers.index('aaseq')
  charge_index = headers.index('charge')

  # The original referenced an undefined `above_cutoff` here (NameError) and
  # never filled all_hits.  The selection below follows the NOTE in the usage
  # text above.
  # NOTE(review): assumes the rows live under the 'data' key (as in the
  # .qval.yml files) and that a higher mowse score is better -- confirm.
  rows = hash['data'] || []
  above_cutoff =
    if opt[:cutoff]
      passing = rows.select {|ar| ar[prec_index] >= opt[:cutoff] }
      if passing.empty?
        []
      else
        # keep every hit scoring at least as well as the worst passing hit,
        # even if its own precision dipped below the cutoff
        worst_score = passing.map {|ar| ar[mowse_index] }.min
        rows.select {|ar| ar[mowse_index] >= worst_score }
      end
    else
      rows
    end

  all_hits.concat(above_cutoff)
  above_cutoff.each do |ar|
    sequence = ar[aaseq_index]
    # to_s: the charge may be loaded from yaml as an Integer
    seq_plus_charge = sequence + '_' + ar[charge_index].to_s
    unique_sequences.add sequence
    unique_ions.add seq_plus_charge
  end
end

prec_k = 'precision cutoff'
fn_k = 'filenames'
uniq_aaseq_k = 'num unique aaseqs'
uniq_ions_k = 'num unique aaseqs+charge'
num_hits_k = 'num peptide hits'

order = [fn_k, prec_k, num_hits_k, uniq_ions_k, uniq_aaseq_k]

results = {}
results[fn_k] = '[' + ARGV.join(", ") + ']'
results[prec_k] = opt[:cutoff]
results[uniq_aaseq_k] = unique_sequences.size
results[uniq_ions_k] = unique_ions.size
results[num_hits_k] = all_hits.size

order.each do |key|
  puts "#{key}: #{results[key]}"
end