ms-error_rate 0.0.9 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,157 @@
1
+
2
# A TransmembraneIndex is a hash that takes a fasta reference as key and
# returns a structured hash containing the transmembrane information.
#
# The instance methods call +each+, +key?+ and +[]+ on self, so the module is
# meant to be mixed into a Hash-like object.
module TransmembraneIndex

  # Scans +file+ line by line for a format marker.
  # returns :toppred or :phobius (nil when no marker is found)
  def self.filetype(file)
    File.open(file) do |fh|
      fh.each_line do |line|
        case line
        when /SEQENCE/
          # NOTE(review): spelling presumably mirrors the Phobius header
          # verbatim -- confirm before "fixing" it.
          return :phobius
        when / 0 0 i/
          # if they don't have the headers, this will pick it up if they have
          # a single prot without tm or signal peptide.
          return :phobius
        when /Algorithm specific parameters/
          return :toppred # New text
        when /<parameters>/
          return :toppred # XML
        end
      end
    end
    nil
  end

  def reference_to_key(reference)
    # needs to be subclassed or written
  end

  # Builds the index matching the detected file type.
  # right now accepts toppred.out files
  # Phobius objects can use the fasta object to update their hash for methods
  # like avg_overlap
  # raises ArgumentError when the file type is not recognized.
  def self.new(file)
    case x = filetype(file)
    when :toppred
      require 'transmembrane/toppred' # loaded lazily, only when needed
      TopPred::Index.new(file)
    when :phobius
      require 'transmembrane/phobius' # loaded lazily, only when needed
      Phobius::Index.new(file)
    else
      raise ArgumentError, "#{x} filetype for #{file} not recognized!"
    end
  end

  # returns a hash of key -> num certain transmembrane segments
  # (0 when the entry has no :num_certain_transmembrane_segments)
  def num_certain_index
    hash = {}
    each do |k, v|
      hash[k] = v[:num_certain_transmembrane_segments] || 0
    end
    hash
  end

  # tp = :number or :fraction which is the fraction of the sequence size
  # returns the average number of overlapping amino acids with transmembrane
  # segments (0.0 when there is nothing to average)
  # returns nil if there is no protein by that key
  def avg_overlap(key, sequence, tp=:number)
    # what to do if the protein isn't there?? which happens on occasion
    return nil unless key?(key)

    numbers = num_transmem_aa(self[key], sequence)
    return 0.0 if numbers.empty?

    avg_num = numbers.inject(0) { |sum, num| sum + num }.to_f / numbers.size
    tp == :fraction ? avg_num / sequence.size : avg_num
  end

  # returns an array (usually length of 1) of the number of amino acids
  # contained inside transmembrane spanning segments.
  # reads the 1-indexed :start/:stop of each entry in
  # tmhash[:transmembrane_segments] and the protein sequence in tmhash[:aaseq]
  # if there are no transmembrane segments, returns empty array.
  def num_transmem_aa(tmhash, sequence)
    return [] unless tmhash.key?(:transmembrane_segments)

    # convert the 1-indexed segments into 0-indexed inclusive ranges
    ranges = tmhash[:transmembrane_segments].map do |tmseg|
      Range.new(tmseg[:start] - 1, tmseg[:stop] - 1)
    end
    num_overlapping_chars(tmhash[:aaseq], ranges, sequence)
  end

  # returns an array with one entry per (non-overlapping, left to right)
  # occurrence of substring in full_sequence: the number of characters of
  # that occurrence that fall inside any of the given ranges.
  # ranges should be 0 indexed!!!
  # the span includes the 'stop' position i.e., full_sequence[start..stop]
  # returns an empty array if there are no ranges or substring is empty.
  def num_overlapping_chars(full_sequence, ranges, substring)
    # an empty substring matches at every position without advancing, so it
    # must be rejected up front
    return [] if ranges.empty? || substring.empty?

    substring_ranges = []
    pos = 0
    slen = substring.size
    while (i = full_sequence.index(substring, pos))
      substring_ranges << Range.new(i, i + slen - 1)
      pos = i + slen
    end

    substring_ranges.map do |sb|
      ranges.inject(0) do |overlap, tm|
        # length of the intersection of two inclusive ranges; this also
        # covers the case where the occurrence completely contains the
        # transmembrane segment
        shared = [sb.last, tm.last].min - [sb.first, tm.first].max + 1
        shared > 0 ? overlap + shared : overlap
      end
    end
  end

end
151
+
152
+
153
+ #substring_ranges = full_sequence.enum_for(:scan, substring).map do
154
+ # (ofirst, olast) = $~.offset(0)
155
+ # Range.new(ofirst, olast - 1)
156
+ # end
157
+
@@ -0,0 +1,5 @@
1
+ # structure of a very simple file for holding peptide hit qvalues
2
+ # entries should be separated by a tab!!!
3
+ aaseq charge qvalue
4
+ String Integer Float
5
+ ... ... ...
@@ -0,0 +1,26 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'yaml'
4
+ require 'set'
5
+
6
# Takes a hash of fdr cutoff -> array of hits and returns one row per cutoff
# (ascending, a nil cutoff sorts last):
#   [fdr, total number of hits, *hits not seen at any smaller cutoff]
def cumulative_new_hits_rows(hits_by_fdr)
  seen = Set.new
  # nil cannot be compared with a Float, so map it to infinity for ordering
  ordered = hits_by_fdr.sort_by { |fdr, _hits| fdr.nil? ? Float::INFINITY : fdr }
  ordered.map do |fdr, hits|
    new_hits = hits.reject { |hit| seen.include?(hit) }
    seen.merge(new_hits)
    [fdr, hits.size, *new_hits]
  end
end

# Transposes rows of unequal length, padding short rows with nil so that no
# entry of a longer row is dropped. Returns [] for no rows.
def transpose_ragged(rows)
  width = rows.map { |row| row.size }.max || 0
  rows.map { |row| row + [nil] * (width - row.size) }.transpose
end

# Prints a tab delimited table with one column per fdr cutoff; argv holds the
# path of the yml file to read. Prints the usage line when argv is empty.
def print_new_hits_table(argv)
  if argv.empty?
    puts "usage: prog summary__<setname>__name_to_gene_id.yml"
    return
  end

  rows = cumulative_new_hits_rows(YAML.load_file(argv.shift))
  transpose_ragged(rows).each do |row|
    puts row.join("\t")
  end
end

print_new_hits_table(ARGV)
26
+
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'orderedhash'
4
+ require 'yaml'
5
+ require 'set'
6
+
7
# Returns the sorted, unique gene symbols matching any of the globs.
#
# proteins     : hash of protein id -> info hash; only proteins whose
#                info['num_hits_minimal'].first is > 0 are considered, together
#                with the ids listed under info['indistinguishable']
# protein_info : hash of protein id -> header hash holding 'Gene_Symbol'
# globs        : array of File.fnmatch? patterns
#
# Ids without a header entry or without a gene symbol are skipped (fnmatch?
# cannot take nil); a nil proteins hash yields [].
def matching_gene_symbols(proteins, protein_info, globs)
  matches = Set.new
  (proteins || {}).each do |ipi, info|
    next unless info['num_hits_minimal'].first > 0

    [ipi, *info['indistinguishable']].each do |id|
      symbol = (protein_info[id] || {})['Gene_Symbol']
      next unless symbol.is_a?(String)

      matches << symbol if globs.any? { |glob| File.fnmatch?(glob, symbol) }
    end
  end
  matches.to_a.sort
end

# argv = [<gene_ids>.txt, summary.yml]
# For every set in the summary writes summary__<setname>__<gene_ids>.yml
# holding qvalue cutoff -> matching gene symbols. Prints the usage when argv
# does not hold exactly two entries.
def write_gene_id_summaries(argv)
  if argv.size != 2
    puts "usage: #{File.basename(__FILE__)} <gene_ids>.txt summary.yml"
    puts "writes a yml file with unique proteins per qvalue cutoff"
    puts "for each set"
    puts "summary__<setname>__<gene_ids>.yml"
    return
  end

  (gene_ids, summary) = argv

  # one glob per line; '#' lines and lines without a word character are ignored
  globs = IO.readlines(gene_ids).reject { |v| v[0, 1] == '#' }.map { |v| v.chomp }.select { |v| v =~ /\w/ }

  hash = YAML.load_file(summary)
  protein_info = hash['protein_info']
  output_hashes = OrderedHash.new
  hash['results'].each do |result|
    qvalue_cutoff = result['qvalue_cutoff']
    result['sets'].each do |setname, sethash|
      output_hashes[setname] ||= OrderedHash.new
      output_hashes[setname][qvalue_cutoff] =
        matching_gene_symbols(sethash['proteins'], protein_info, globs)
    end
  end

  gene_ids_base = File.basename(gene_ids, '.*')
  summary_base = summary.chomp(File.extname(summary))
  output_hashes.each do |setname, output|
    output_file = [summary_base, setname, gene_ids_base].join("__") + ".yml"
    File.open(output_file, 'w') { |out| out.print output.to_yaml }
  end
end

write_gene_id_summaries(ARGV)
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/ruby
2
+
3
# FileUtils.mv is used below to move the rewritten file into place
require 'fileutils'

# Returns line with a leading "<prefix>_" moved behind the "IPI:" tag, e.g.
#   >DCY_IPI:IPI0032311.1|STUFF  ->  >IPI:DCY_IPI0032311.1|STUFF
# Lines without such a prefix are returned unchanged.
def relocate_ipi_prefix(line)
  if line =~ />([^\:\|]+_)/
    prefix = $1
    line.sub("#{prefix}IPI:IPI", "IPI:#{prefix}IPI")
  else
    line
  end
end

# Rewrites each file in place by way of "<file>.tmp". A file is skipped
# (with a warning) when its .tmp file already exists.
def rewrite_ipi_headers(files)
  files.each do |file|
    tmp = file + '.tmp'
    if File.exist?(tmp)
      warn "Skipping #{file} since #{tmp} exists"
      next
    end
    File.open(tmp, 'w') do |out|
      IO.foreach(file) do |line|
        out.print relocate_ipi_prefix(line)
      end
    end
    FileUtils.mv tmp, file
  end
end

if ARGV.size == 0
  puts "usage: #{File.basename(__FILE__)} <IPI_based>.fasta ..."
  puts "moves any leading \"><.*_>\" to the IPI value"
  puts "for example:"
  puts ">DCY_IPI:IPI0032311.1|STUFF -> >IPI:DCY_IPI0032311.1|STUFF"
else
  rewrite_ipi_headers(ARGV)
end
@@ -0,0 +1,366 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'yaml'
4
+ require 'set'
5
+ require 'optparse'
6
+ require 'ms/fasta'
7
+ require 'ms/fasta/ipi'
8
+
9
+ SET_RE = /Set\s+(.*)/i
10
+ QVALUE_EXT = ".qval.yml"
11
+
12
# Parses a sets_compare file: a line matching SET_RE starts a new set, every
# other line (ignoring lines without a word character) names a data file that
# must exist as <dir of file>/<line><ext>.
#
# returns [sets_to_paths_hash, sets_order]
# raises RuntimeError if a referenced file does not exist or if a file entry
# appears before the first set line.
def sets_compare_to_paths(file, ext=QVALUE_EXT)
  dirname = File.dirname(File.expand_path(file))
  sets = {}
  sets_order = []
  current_set = nil

  IO.readlines(file).each do |raw|
    line = raw.chomp
    next unless line =~ /\w/

    if line =~ SET_RE
      current_set = $1.dup
      sets[current_set] = []
      sets_order << current_set
    else
      # without this check an orphan entry dies with NoMethodError on nil
      if current_set.nil?
        raise RuntimeError, "file entry '#{line}' in #{file} appears before any 'Set <name>' line"
      end
      full_path = File.join(dirname, line + ext)
      raise RuntimeError, "file #{full_path} does not exist!!" unless File.exist?(full_path)
      sets[current_set] << full_path
    end
  end

  [sets, sets_order]
end
32
+
33
# returns [minimal_protein_to_uniq_peps_hash, indistinguishable_protein_hash]
# takes a hash of proteins to aaseqs. Uses a greedy algorithm where
# things are sorted first by the number of uniq amino acid sequences and total
# aa length. if a block is given, then will yield the prot and the
# peptide_array and sort by the returned value. The greedy algorithm acts on
# the REVERSE of the sorted proteins. A protein is reported when it brings at
# least one peptide not claimed by an earlier protein.
# indistinguishable_protein_hash is keyed on the reported proteins and gives
# an array of the unreported proteins that have exactly the same peptides.
def minimal_protein_set(proteins_to_aaseqs)
  blk_given = block_given?

  sorted_most_to_least = proteins_to_aaseqs.sort_by do |prot, peps|
    if blk_given
      yield(prot, peps)
    else
      [peps.size, peps.inject(0) { |m, s| m + s.size }]
    end
  end.reverse

  found_seq = Set.new
  prot_to_uniq_peps_hash = {}
  same_peptide_hits = {}
  # sorted full peptide list -> the reported protein owning it. Looking twins
  # up here (instead of comparing with the previously reported protein only)
  # finds them even when an unrelated protein sorts in between.
  reported_by_peps = {}

  sorted_most_to_least.each do |prot, peps|
    sorted_peps = peps.sort # sorted so the comparison ignores peptide order
    # Set#add? is nil for a peptide that was already claimed
    uniq_peps = peps.select { |pep| found_seq.add?(pep) }

    if uniq_peps.size > 0
      prot_to_uniq_peps_hash[prot] = uniq_peps
      same_peptide_hits[prot] = []
      reported_by_peps[sorted_peps] ||= prot
    elsif (twin = reported_by_peps[sorted_peps])
      same_peptide_hits[twin] << prot
    end
  end

  [prot_to_uniq_peps_hash, same_peptide_hits]
end
88
+
89
# Converts an array of cutoff strings into floats (String#to_f).
# 'nil' and '-' stand for "no threshold" and become nil.
def cutoffs_to_floats(ar)
  ar.map do |v|
    (v == 'nil' || v == '-') ? nil : v.to_f
  end
end
98
+
99
# returns a hash keyed on protein id that yields an array:
#   [#aaseq, #aaseq_and_charge, #total_hits]
#
# prot_to_peps : hash of protein id -> array of peptide sequences
# seq_to_hits  : hash of peptide sequence -> array of hits responding to
#                #sequence and #charge
#
# Hits are collected in a Set, so hits that are eql? count once.
# A protein with an empty peptide list yields [0, 0, 0].
def stats_per_prot(prot_to_peps, seq_to_hits)
  per_protein_hash = {}
  prot_to_peps.each do |prot, uniq_pep_seqs|
    all = Set.new
    aaseqcharges = Set.new
    aaseqs = Set.new

    uniq_pep_seqs.each do |pep_seq|
      all_hits = seq_to_hits[pep_seq]
      all.merge(all_hits)
      all_hits.each do |hit|
        aaseq = hit.sequence
        aaseqs.add(aaseq)
        aaseqcharges.add(aaseq + '_' + hit.charge.to_s)
      end
    end

    # stored once per protein, after all of its peptides were tallied
    per_protein_hash[prot] = [aaseqs.size, aaseqcharges.size, all.size]
  end
  per_protein_hash
end
122
+
123
# option defaults: a single nil cutoff means "no qvalue threshold"
opt = {
  :cutoffs => [nil],
  :outfile => "summary.yml",
}

# command line definition; parsed values are stored in opt
opts = OptionParser.new do |op|
  op.banner = "USAGE: #{File.basename(__FILE__)} sets_compare.txt"
  op.separator "OUTPUT: #{opt[:outfile]}"
  op.separator ""
  op.separator "INPUT: "
  op.separator " each <file> referenced in sets_compare.txt should have a"
  op.separator " <file>.qval.yml file"
  op.separator ""
  op.separator "OPTIONS:"
  # the help text says '<=' because hits with a qvalue equal to the cutoff pass
  op.on("-q", "--qvalue <0-1[,...]>", Array, "only take qvalues <= given ['-' for no threshold]") {|v| opt[:cutoffs] = cutoffs_to_floats(v)}
  op.separator ""
  op.on("--proteins <fasta>,<pep-db>", Array, "path to fasta and peptide centric DB", "peptide_centric_db is in the format: ", "<PEPTIDE>: <ID>-<ID>-<ID>") {|v| opt[:proteins] = v }
  op.separator "FORMATS:"
  op.on("--output-format", "prints the output yaml scheme and exits") {|v| opt[:output_format] = v }
  op.on("--input-format", "prints sets_compare.txt format and exits") {|v| opt[:input_format] = v }
end
144
+
145
+ # later on we could implement full isoform resolution like IsoformResolver
146
+ # for now we will generate a report, realizing that some isoforms may not be
147
+ # reported
148
+ # it is implemented by using a pre-made map from sequence to protein groups
149
+ # then, a set of sequences allows one to deduce all the relationships from the
150
+ # protein groups.
151
+
152
+ opts.parse!
153
+
154
+ if opt[:output_format]
155
+ yaml = <<SKEL
156
+ results:
157
+ - qvalue_cutoff: <Float>
158
+ sets:
159
+ <set_name>:
160
+ num_uniq_aaseqs: <Integer>
161
+ num_aaseqs_not_in_pep_db: <Integer>
162
+ num_uniq_aaseqs_charge: <Integer>
163
+ proteins:
164
+ <IPI_ID>:
165
+ num_hits_all:
166
+ - <Integer> # total num aaseqs
167
+ - <Integer> # total num aaseq+charge
168
+ - <Integer> # total num hits
169
+ num_hits_minimal:
170
+ - <Integer> # total num aaseqs
171
+ - <Integer> # total num aaseq+charge
172
+ - <Integer> # total num hits
173
+ indistinguishable:
174
+ - <IPI_ID>
175
+ - <IPI_ID>
176
+ aaseqs:
177
+ - <String>
178
+ - <String>
179
+ sets_order:
180
+ - <String>
181
+ - <String>
182
+ protein_info:
183
+ <IPI_ID>:
184
+ Gene_Symbol: <String>
185
+ IPI: <IPI_ID>
186
+ Tax_Id: <String>
187
+ SWISS-PROT: <String>
188
+ description: <String>
189
+ ENSEMBL: <String>
190
+ SKEL
191
+ print yaml
192
+ exit
193
+ end
194
+
195
+ if opt[:input_format]
196
+ string =<<EXPLANATION
197
+ # the sets_compare.yml format is very simple:
198
+
199
+ Set <some_name_for_set1>
200
+ filename1_no_ext
201
+ filename2_no_ext
202
+ Set <some_name_for_set2>
203
+ filename3_no_ext
204
+ filename4_no_ext
205
+ ...
206
+ EXPLANATION
207
+ puts string
208
+ exit
209
+ end
210
+
211
# exactly one positional argument (the sets_compare file) is expected
if ARGV.size != 1
  puts opts.to_s
  exit
end

# skeleton of the structure that is finally written out as yaml
results = {}

protein_info = {}
results['protein_info'] = protein_info
results['results'] = []

(sets_hash, sets_order) = sets_compare_to_paths(ARGV.shift)
results['sets_order'] = sets_order

if opt[:proteins]
  # both paths are needed below; fail early with a readable message
  unless opt[:proteins].size == 2
    abort "--proteins expects two comma separated paths: <fasta>,<pep-db>"
  end
  (fasta, pep_db_file) = opt[:proteins]

  # a hash indexed on ipi containing all info
  prot_header_hash = {}

  STDERR.print "Loading information from fasta file..."
  start = Time.now
  # ipi -> length of the protein sequence
  prot_sizes_hash = {}
  Ms::Fasta.open(fasta, 'rb', :io_index => []) do |obj|
    obj.each do |entry|
      header_info = Ms::Fasta::Ipi.parse(entry.header)
      ipi = header_info['IPI']
      prot_header_hash[ipi] = header_info
      prot_sizes_hash[ipi] = entry.sequence.size
    end
  end
  STDERR.puts "#{Time.now - start} seconds."

  STDERR.print "Loading peptide centric DB (this takes about a minute)..."
  start = Time.now
  # peptide sequence -> protein ids joined with '-' (see --proteins help)
  pep_db = YAML.load_file(pep_db_file)
  STDERR.puts "#{Time.now - start} seconds."

end
251
+
252
# One result entry per qvalue cutoff (nil = no threshold).
opt[:cutoffs].each do |cutoff|

  cutoff_results = {'qvalue_cutoff' => cutoff}
  results_sets_hash = {}
  cutoff_results['sets'] = results_sets_hash
  results['results'] << cutoff_results

  #########################
  # FOR EACH SET:
  #########################
  # the hit Struct is built from the headers of the first file read and is
  # reused for the others (assumes the indices are the same into each data
  # file)
  pep_klass = nil
  sets_hash.each do |set, files|
    set_results = {}
    results_sets_hash[set] = set_results

    # get the complete set of passing hits
    all_passing_hits = files.inject([]) do |collected, file|
      qval_data = YAML.load_file(file)

      pep_klass ||= Struct.new(*(qval_data['headers'].map {|v| v.to_sym }))
      hits = qval_data['data'].map {|v| pep_klass.new(*v) }

      passing_hits = cutoff ? hits.select {|hit| hit.qvalue <= cutoff } : hits
      # concat instead of push(*ary): no argument splat for large hit lists
      collected.concat(passing_hits)
    end

    # create an index from aaseq to hits
    seq_to_hits = Hash.new {|h,k| h[k] = []}
    uniq_seqcharge = Set.new
    all_passing_hits.each do |hit|
      seq_to_hits[hit.sequence] << hit
      uniq_seqcharge.add( hit.sequence + '_' + hit.charge.to_s )
    end

    set_results.merge!( { 'num_peptide_hits' => all_passing_hits.size,
                          'num_uniq_aaseqs' => seq_to_hits.size,
                          'num_uniq_aaseqs_charge' => uniq_seqcharge.size,
    })

    if opt[:proteins]

      # create an index from proteins to peptides
      prots_to_peps = Hash.new {|h,k| h[k] = [] }
      peptides_not_found = []
      seq_to_hits.keys.each do |seq|
        if pep_db.key?(seq)
          pep_db[seq].split('-').each do |prot_id|
            prots_to_peps[prot_id] << seq
          end
        else
          peptides_not_found << seq
        end
      end

      # Determine the number of 1) hits, 2) aaseqs, 3) aaseqcharges per protein BEFORE minimization
      stats_per_protein_before = stats_per_prot(prots_to_peps, seq_to_hits)

      # get the minimal protein set
      (prot_to_uniq_peps_hash, indistinguishable_protein_hash) = minimal_protein_set(prots_to_peps) do |prot,peps|
        # sort key: number of peptides, summed peptide length, negated protein
        # size; a protein without a recorded size counts as size 0 instead of
        # raising on nil
        [ peps.size, peps.inject(0){|m,s| m+s.size}, -(prot_sizes_hash[prot] || 0)]
      end

      # remember the header info of every reported protein and of the
      # proteins indistinguishable from it
      prot_to_uniq_peps_hash.each_key do |prot|
        [prot, *indistinguishable_protein_hash[prot]].each do |id|
          protein_info[id] = prot_header_hash[id]
        end
      end

      stats_per_protein_minimal = stats_per_prot(prot_to_uniq_peps_hash, seq_to_hits)

      # create a hash of data for each protein
      protein_data_hashes_hash = {}
      prot_to_uniq_peps_hash.each do |prot, peps|
        protein_data_hashes_hash[prot] = {
          'aaseqs' => peps,
          # this will be a triplet
          'num_hits_minimal' => stats_per_protein_minimal[prot],
          'indistinguishable' => indistinguishable_protein_hash[prot],
          'num_hits_all' => stats_per_protein_before[prot],
        }
      end

      set_results['proteins'] = protein_data_hashes_hash
      set_results['num_proteins'] = prot_to_uniq_peps_hash.size
      set_results['num_aaseqs_not_in_pep_db'] = peptides_not_found.size
      if peptides_not_found.size > 0
        warn "Did not find in peptide centric db: #{peptides_not_found.join(', ')}"
      end
    end
  end
end

File.open(opt[:outfile], 'w') do |out|
  out.print results.to_yaml
end
365
+
366
+
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/ruby -w
2
+
3
+ require 'set'
4
+ require 'yaml'
5
+ require 'optparse'
6
+
7
# Selects the rows of data that pass the precision cutoff as described in the
# usage NOTE: every row scoring at least as well as the worst scoring row
# whose precision reaches the cutoff is kept, even if its own precision is
# lower.
#
# data        : array of rows (arrays)
# prec_index  : column holding the precision
# score_index : column holding the score (higher taken as better -- TODO
#               confirm for mowse); when nil, rows are filtered on precision
#               alone
# cutoff      : minimum precision, or nil to keep every row
def hits_above_precision_cutoff(data, prec_index, score_index, cutoff)
  return data if cutoff.nil?

  passing = data.select { |row| row[prec_index] >= cutoff }
  return passing if passing.empty? || score_index.nil?

  worst_score = passing.map { |row| row[score_index] }.min
  data.select { |row| row[score_index] >= worst_score }
end

opt = {}
opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} <precision_file>.yml ..."
  op.separator "outputs information collected by combining hits from files:"
  op.separator "---"
  op.separator "filenames: "
  op.separator "- <pathgiven>"
  op.separator "num_unique_aaseqs: <Int>"
  op.separator "num_unique_aaseqs_charge: <Int>"
  op.separator "num_peptide_hits: <Int>"
  op.separator ""
  op.separator "NOTE: if a precision cutoff is given, all hits that have a better"
  op.separator "score than the worst score at the cutoff are included, even if "
  op.separator "the precision for that hit was below the cutoff"
  op.separator "this prevents early, local aberrations in precision from messing"
  op.separator "up the analysis"
  op.separator ""
  op.on("-p", "--precision <0-1>", Float, "precision cutoff") {|v| opt[:cutoff] = v }
  op.on("-f", "--fdr <0-1>", Float, "false discovery rate cutoff (1-precision)") {|v| opt[:cutoff] = 1.0 - v }
end

opts.parse!

if ARGV.size == 0
  puts opts.to_s
else
  unique_sequences = Set.new
  unique_ions = Set.new
  all_hits = []

  ARGV.each do |file|
    hash = YAML.load_file(file)

    headers = hash['headers']
    prec_index = headers.index('precision')
    mowse_index = headers.index('mowse')
    aaseq_index = headers.index('aaseq')
    charge_index = headers.index('charge')

    # NOTE(review): the rows are assumed to live under 'data', next to
    # 'headers' -- confirm against the precision file format.
    above_cutoff = hits_above_precision_cutoff(hash['data'], prec_index, mowse_index, opt[:cutoff])
    all_hits.concat(above_cutoff)

    above_cutoff.each do |ar|
      sequence = ar[aaseq_index]
      unique_sequences.add sequence
      # to_s: the charge may be loaded from yaml as an Integer
      unique_ions.add(sequence + ar[charge_index].to_s)
    end
  end

  prec_k = 'precision cutoff'
  fn_k = 'filenames'
  uniq_aaseq_k = 'num unique aaseqs'
  uniq_ions_k = 'num unique aaseqs+charge'
  num_hits_k = 'num peptide hits'

  order = [fn_k, prec_k, num_hits_k, uniq_ions_k, uniq_aaseq_k]

  results = {}
  results[fn_k] = '[' + ARGV.join(", ") + ']'
  results[prec_k] = opt[:cutoff]
  results[uniq_aaseq_k] = unique_sequences.size
  results[uniq_ions_k] = unique_ions.size
  results[num_hits_k] = all_hits.size

  order.each do |key|
    puts "#{key}: #{results[key]}"
  end
end