mspire 0.1.7 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. data/Rakefile +41 -14
  2. data/bin/bioworks2excel.rb +1 -1
  3. data/bin/bioworks_to_pepxml.rb +46 -59
  4. data/bin/fasta_shaker.rb +1 -1
  5. data/bin/filter.rb +6 -0
  6. data/bin/find_aa_freq.rb +23 -0
  7. data/bin/id_precision.rb +3 -2
  8. data/bin/mzxml_to_lmat.rb +2 -1
  9. data/bin/pepproph_filter.rb +1 -1
  10. data/bin/precision.rb +1 -1
  11. data/bin/protein_summary.rb +2 -451
  12. data/bin/raw_to_mzXML.rb +55 -0
  13. data/bin/srf_group.rb +26 -0
  14. data/changelog.txt +7 -0
  15. data/lib/align.rb +3 -3
  16. data/lib/fasta.rb +6 -1
  17. data/lib/gi.rb +9 -4
  18. data/lib/roc.rb +2 -0
  19. data/lib/sample_enzyme.rb +2 -1
  20. data/lib/spec/mzxml/parser.rb +2 -43
  21. data/lib/spec/mzxml.rb +65 -2
  22. data/lib/spec_id/aa_freqs.rb +10 -7
  23. data/lib/spec_id/bioworks.rb +67 -87
  24. data/lib/spec_id/filter.rb +794 -0
  25. data/lib/spec_id/precision.rb +29 -36
  26. data/lib/spec_id/proph.rb +5 -3
  27. data/lib/spec_id/protein_summary.rb +459 -0
  28. data/lib/spec_id/sequest.rb +323 -271
  29. data/lib/spec_id/srf.rb +189 -135
  30. data/lib/spec_id.rb +276 -227
  31. data/lib/spec_id_xml.rb +101 -0
  32. data/lib/toppred.rb +18 -0
  33. data/script/degenerate_peptides.rb +47 -0
  34. data/script/filter-peps.rb +5 -1
  35. data/test/tc_align.rb +1 -1
  36. data/test/tc_bioworks.rb +25 -22
  37. data/test/tc_bioworks_to_pepxml.rb +37 -4
  38. data/test/tc_fasta.rb +3 -1
  39. data/test/tc_fasta_shaker.rb +8 -6
  40. data/test/tc_filter.rb +203 -0
  41. data/test/tc_gi.rb +6 -9
  42. data/test/tc_id_precision.rb +31 -0
  43. data/test/tc_mzxml.rb +8 -6
  44. data/test/tc_peptide_parent_times.rb +2 -1
  45. data/test/tc_precision.rb +1 -1
  46. data/test/tc_proph.rb +5 -5
  47. data/test/tc_protein_summary.rb +36 -13
  48. data/test/tc_sequest.rb +78 -33
  49. data/test/tc_spec_id.rb +128 -6
  50. data/test/tc_srf.rb +84 -38
  51. metadata +67 -62
  52. data/bin/fasta_cat.rb +0 -39
  53. data/bin/fasta_cat_mod.rb +0 -59
  54. data/bin/fasta_mod.rb +0 -57
  55. data/bin/filter_spec_id.rb +0 -365
  56. data/bin/raw2mzXML.rb +0 -21
  57. data/script/gen_database_searching.rb +0 -258
@@ -1,455 +1,6 @@
1
1
  #!/usr/bin/ruby -w
2
2
 
3
- require 'axml'
4
- require 'hash_by'
5
- require 'optparse'
6
- require 'ostruct'
7
- require 'spec_id'
8
-
9
- #############################################################
10
- # GLOBALS:
11
- PRECISION_PROGRAM_BASE = 'precision'
12
- DEF_PREFIX = "INV_"
13
- DEF_PERCENT_FP = "5.0"
14
- #############################################################
15
-
16
-
17
- # @TODO: add group probability title (showing all group probabilities) for protein prob
18
-
19
- #class String
20
- # def margin
21
- # self.gsub(/^\s*\|/,'')
22
- # end
23
- #end
24
-
25
-
26
- class Runner
27
- module HTML
28
- def header
29
- %Q{<html>
30
- <head>
31
- #{style}
32
- </head>
33
- <body>
34
- <script type="text/javascript">
35
- <!--
36
- function toggle_vis(id) {
37
- var e = document.getElementById(id);
38
- if(e.style.display == 'none')
39
- e.style.display = 'block';
40
- else
41
- e.style.display = 'none';
42
- }
43
- //-->
44
- </script>
45
- }
46
- end
47
-
48
- def style
49
- '
50
- <style type="text/css">
51
- table {
52
- border-width:1px;
53
- border-color:#DDDDDD;
54
- border-collapse: collapse;
55
- }
56
- td,th {
57
- padding-top: 2px;
58
- padding-bottom: 2px;
59
- padding-left: 5;
60
- padding-right: 5;
61
- }
62
- td.redline {
63
- background-color: #FF0000;
64
- color: #FFFFFF
65
- }
66
- div.file_info, div.software, div.fppr, div.num_proteins{
67
- margin-left: 20px;
68
- margin-top: 20px;
69
- }
70
- div.main {
71
- margin-left: 10px;
72
- margin-right: 10px;
73
- margin-top: 50px;
74
- margin-bottom: 50px;
75
- }
76
- div#error {
77
- margin: 30px;
78
- text-align:center
79
- }
80
- hr {color: sienna}
81
- body { font-size: 8pt; font-family: Arial,Helvetica,Times}
82
- </style>
83
- '
84
- end
85
-
86
- # an anchor and a title
87
- def at(display, title)
88
- "<a title=\"#{title}\">#{display}</a>"
89
- end
90
-
91
- def trailer
92
- %q{
93
- </body>
94
- </html>
95
- }
96
- end
97
-
98
- def tr
99
- "|<tr>
100
- | #{yield}
101
- |</tr>\n".margin
102
- end
103
-
104
- def table
105
- "|<div class=\"main\"><table align=\"center\" border=\"1\" style=\"font-size:100%\" width=\"800px\">
106
- | #{yield}
107
- |</table></div>\n".margin
108
- end
109
-
110
- def tds(arr)
111
- arr.map {|v| "<td>#{v}</td>"}.join
112
- end
113
-
114
- def ths(arr)
115
- str = arr.map {|v| "<th>#{v}</th>"}.join
116
- str << "\n"
117
- end
118
- end
119
-
120
- end
121
-
122
-
123
- class Runner
124
-
125
- include Runner::HTML
126
-
127
- def ref_html(gi, name)
128
- "<a href=\"http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=protein&val=#{gi}\" title=\"#{name}\">#{gi}</a>"
129
- end
130
-
131
- # Takes the -prot.xml filename and grabs the png file (if available)
132
- def error_info(prot_file_name)
133
- img = prot_file_name.gsub('.xml', '.png')
134
- img_bn = File.basename(img)
135
- "<div id=\"error\"><img src=\"#{img_bn}\" alt=\"[ Optional: To view error/sensitivity image, put #{img_bn} in the same directory as #{File.basename(prot_file_name)} ]\"/>\n</div>"
136
- end
137
-
138
- # attempts to get the NCBI gi code
139
- def accession(name)
140
- if (name.include? '|') && (name[0,3] == 'gi|')
141
- name.split('|')[1]
142
- else
143
- name
144
- end
145
- end
146
-
147
- def prefix_to_regex(prefix)
148
- if prefix
149
- /^#{Regexp.escape(prefix)}/
150
- else
151
- nil
152
- end
153
- end
154
-
155
- # given a list of proteins, output a tab delimited textfile with protein
156
- # name and the total number of peptides found
157
- def output_peptide_counts_file(prots, filename)
158
- File.open(filename, "w") do |fh_out|
159
- prots.each do |prot|
160
- fh_out.puts [prot._protein_name, prot._total_number_peptides].join("\t")
161
- end
162
- end
163
- end
164
-
165
- # filters on the false positive regex and sorts by prot probability
166
- def filter_and_sort(uniq_prots, prefix=nil)
167
- prefix_re = prefix_to_regex(prefix)
168
- sorted = uniq_prots.sort_by {|prt| [prt._probability, prt.parent._probability]}.reverse
169
- ## filter on prefix
170
- if prefix
171
- sorted = sorted.reject {|prot| prot.reference =~ prefix_re }
172
- end
173
- sorted
174
- end
175
-
176
- # assumes that these are sorted on probability
177
- # desired_fppr is a float
178
- # returns [number_of_prots, actual_fppr]
179
- def num_prots_above_fppr(prots, desired_fppr)
180
- current_fppr_rate_percent = 0.0
181
- previous_fppr_rate_percent = 0.0
182
- current_sum_one_minus_prob = 0.0
183
- proteins_within_fppr = 0
184
- actual_fppr = nil
185
- already_found = false
186
- prot_cnt = 0
187
- prots.each do |prot|
188
- prot_cnt += 1
189
- # SUM(1-probX)/#prots
190
- current_sum_one_minus_prob += 1.0 - prot._probability.to_f
191
- current_fppr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
192
-
193
- if current_fppr_rate_percent > desired_fppr && !already_found
194
- actual_fppr = previous_fppr_rate_percent
195
- proteins_within_fppr = prot_cnt
196
- already_found = true
197
- end
198
- previous_fppr_rate_percent = current_fppr_rate_percent
199
- end
200
- [proteins_within_fppr, actual_fppr]
201
- end
202
-
203
- #### #readable_previous_fppr_rate_percent = sprintf("%.2f", previous_fppr_rate_percent)
204
-
205
- # returns a string of the table rows
206
- # false_positive_rate (give as a %) is the cutoff mark
207
- # returns the number of proteins at the desired_fppr (if given)
208
- def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, peptide_count_filename=nil)
209
- prot_cnt = 0
210
- uniq_prots.map do |prot|
211
- tr do
212
- prot_cnt += 1
213
- gi = accession(prot._protein_name)
214
- tds([prot_cnt, prot._probability, ref_html(gi, prot._protein_name), prot.annotation.first._protein_description, prot._percent_coverage, peptide_cell(prot_cnt, prot._unique_stripped_peptides.split('+')), prot._total_number_peptides, prot._pct_spectrum_ids])
215
- end
216
- end.join
217
- end
218
-
219
- def print_html_pieces(file, *pieces)
220
- File.open(file, "w") do |out|
221
- pieces.each do |piece|
222
- out.print piece
223
- end
224
- end
225
- end
226
-
227
- def file_info(file)
228
- "<div class=\"file_info\"><h3>Source File Information</h3>File: #{File.expand_path(file)}
229
- <br/>Last Modified: #{File.mtime(file)}
230
- <br/>Size: #{File.size(file)/1000} KB
231
- </div>"
232
- end
233
-
234
- def bioworks_script_info(obj)
235
- version = "3.2??"
236
- if obj.version
237
- version = obj.version
238
- end
239
- script_info{"Bioworks version #{version}"}
240
- end
241
-
242
- def protproph_script_info
243
- begin
244
- where = `which xinteract`
245
- reply = `#{where}`
246
- rescue Exception
247
- reply = ""
248
- end
249
- prophet = "TPP (version unknown)" # put your version here if you can't get it dynamically
250
- if reply =~ /xinteract.*?\((TPP .*)\)/
251
- prophet = $1.dup
252
- end
253
- script_info { "ProteinProphet from: #{prophet}" }
254
- end
255
-
256
- def mspire_version
257
- string = "mspire"
258
- begin
259
- if `gem list --local mspire` =~ /mspire \((.*?)\)/
260
- string << (" v" + $1)
261
- end
262
- rescue Exception
263
- end
264
- string
265
- end
266
-
267
- def script_info
268
- "<div class=\"software\"><h3>Software Information</h3>#{yield}<br/>Ruby package: #{mspire_version}<br/>Command: #{[File.basename(__FILE__), *@orig_argv].join(" ")}</div>"
269
- end
270
-
271
- def proph_output(file, outfn, opt, fppr_output_as_html)
272
- header_anchors = [at('#', 'number'), at('prob','protein probability (for Prophet, higher is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (includes non-contributing peptides). Click number to show/hide'), at('#peps', 'total number of corresponding peptides that contributed to protein probability'), at('%ids', 'fraction of correct dataset peptide identifications corresponding to protein')]
273
- num_cols = header_anchors.size
274
- theaders = ths(header_anchors)
275
-
276
- root = AXML.parse_file(file)
277
- prots = []
278
- ## find the min_prob at a fppr of XX
279
- min_prob_redline = 1.01 # if no fppr is less than what they give, then all are redlined!
280
-
281
- if opt.c
282
- actual_percent_fp = opt.c.to_f
283
- elsif opt.cut_at
284
- actual_percent_fp = opt.cut_at.to_f
285
- else
286
- actual_percent_fp = nil
287
- end
288
- root.protein_group.each do |group|
289
- group.protein.each do |prt|
290
- prots << prt
291
- end
292
- end
293
- uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
294
- filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f)
295
-
296
- ## num proteins above cutoff (if opt.c)
297
- num_prots_html = ''
298
- if opt.c || opt.cut_at
299
- (num_prots, actual_fppr) = num_prots_above_fppr(filtered_sorted_prots, actual_percent_fp)
300
- num_prots_html = num_prots_to_html(actual_percent_fp, actual_fppr, num_prots)
301
- end
302
- if opt.cut_at
303
- filtered_sorted_prots = filtered_sorted_prots[0,num_prots]
304
- end
305
-
306
- output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count
307
-
308
- table_string = table do
309
- tr{theaders} + table_rows(filtered_sorted_prots, opt.f, actual_percent_fp, num_cols, opt.c.to_f, actual_percent_fp, opt.peptide_count)
310
- end
311
- er_info = opt.precision ? error_info(file) : ""
312
- html_pieces = [outfn, header, fppr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer]
313
- print_html_pieces(*html_pieces)
314
- end # proph_output
315
-
316
- # given a list of peptide sequences creates javascript to hide/show them
317
- def peptide_cell(prot_num, peptide_sequences)
318
- "<a href=\"#prot#{prot_num}\" onclick=\"toggle_vis('#{prot_num}');\">#{peptide_sequences.size}</a><div id=\"#{prot_num}\" style=\"display:none;\">#{peptide_sequences.join(', ')}</div>"
319
- end
320
-
321
- def bioworks_output(file, outfn, opt, fppr_output_as_html)
322
- header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
323
- num_cols = header_anchors.size
324
- theaders = ths(header_anchors)
325
- bio_obj = SpecID.new(file)
326
- proteins = bio_obj.prots
327
- protein_num = 0
328
- rows = ""
329
- prefix_re = prefix_to_regex(opt.f)
330
- proteins.each do |prot|
331
- if opt.f && prot.reference =~ prefix_re
332
- next
333
- end
334
- uniq_peps = Hash.new {|h,k| h[k] = true; }
335
- protein_num += 1
336
- prot.peps.each do |pep|
337
- uniq_peps[pep.sequence.split('.')[1]] = true
338
- end
339
- pieces = prot.reference.split(' ')
340
- long_prot_name = pieces.shift
341
- annotation = pieces.join(' ')
342
- accession = prot.accession
343
- if accession == '0' ; accession = long_prot_name end
344
- rows << tr{ tds([protein_num, prot.protein_probability, ref_html(accession, long_prot_name), annotation, prot.coverage, peptide_cell(protein_num, uniq_peps.keys), prot.peps.size]) }
345
- end
346
- table_string = table do
347
- tr{theaders} + rows
348
- end
349
- print_html_pieces(outfn, header, fppr_output_as_html, file_info(file), bioworks_script_info(bio_obj), table_string, trailer)
350
- end # bioworks_output
351
-
352
- def num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins)
353
- actual_cutoff = sprintf("%.3f", actual_cutoff)
354
- desired_cutoff = sprintf("%.3f", desired_cutoff)
355
- "<div class=\"num_proteins\"><h3>False Positive Rate Information</h3>
356
- Desired FPR: #{desired_cutoff} %<br/>
357
- Actual FPR: #{actual_cutoff} %<br/>
358
- Number of Proteins at Actual FPR: #{num_proteins}
359
- </div>"
360
- end
361
-
362
- # transforms the output string of file_as_decoy into html
363
- def file_as_decoy_to_html(string)
364
- lines = string.split("\n")
365
- #puts lines ?? is this supposed to be commented out?
366
- lines = lines.reject do |obj| obj =~ /\*{10}/ end
367
- lines.map! do |line| "#{line}<br/>" end
368
- "<div class=\"fppr\">
369
- <h3>Classification Analysis</h3>
370
- #{lines.join("\n")}
371
- </div>"
372
- end
373
-
374
- # prepends the 'Classification Analysis' html heading to the given string
375
- def prefix_as_decoy_to_html(string)
376
- "<div class=\"fppr\">
377
- <h3>Classification Analysis</h3>
378
- </div>" +
379
- string
380
- end
381
-
382
- def go(argv)
383
- @orig_argv = argv.dup
384
- dup_argv = argv.dup
385
-
386
- opt = OpenStruct.new
387
- opt.f = DEF_PREFIX
388
- opts = OptionParser.new do |op|
389
- op.banner = "usage: #{File.basename(__FILE__)} [options] <file>.xml ..."
390
- op.separator " where file = bioworks -or- <run>-prot (prophet output)"
391
- op.separator " outputs: <file>.summary.html"
392
- op.separator ""
393
- op.on("-f", "--false <prefix>", "ignore proteins with prefix (def: #{DEF_PREFIX})") {|v| opt.f = v }
394
- op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
395
- op.separator(" if --precision then -f is used to specify a file or prefix")
396
- op.separator(" that indicates the false positives.")
397
- op.on("--peptide_count <filename>", "outputs text file with # peptides per protein") {|v| opt.peptide_count = v}
398
- op.separator ""
399
- op.separator "Options for #{PRECISION_PROGRAM_BASE}.rb :"
400
- op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v}
401
- op.separator(" type '#{PRECISION_PROGRAM_BASE}.rb' for details")
402
- op.separator ""
403
- op.separator "Specific to ProteinProphet (with no concatenated DB):"
404
- op.on("-c", "--cutoff percent", "includes FPR summary at given cutoff") {|v| opt.c = v }
405
- op.on("--cut_at percent", "only reports proteins within FPR percent") {|v| opt.cut_at = v }
406
- end
407
-
408
- opts.parse!
409
-
410
- if argv.size < 1
411
- puts opts
412
- exit
413
- end
414
-
415
- fppr_output_as_html = ''
416
- files = argv.to_a
417
- files.each do |file|
418
- outfn = file.gsub(/\.xml$/, '.summary.html')
419
- ## False Positive Rate Calculation:
420
- if opt.precision
421
- opt.o = outfn # won't actually be written over, but used
422
- to_use_argv = create_precision_argv(file, opt)
423
- (out_string, opt) = SpecID::Precision.new.precision(to_use_argv)
424
- fppr_output_as_html = prefix_as_decoy_to_html(out_string)
425
- end
426
-
427
- case SpecID.file_type(file)
428
- when "protproph"
429
- proph_output(file, outfn, opt, fppr_output_as_html)
430
- when "bioworks"
431
- bioworks_output(file, outfn, opt, fppr_output_as_html)
432
- else
433
- abort "filetype for #{file} not recognized!"
434
- end
435
- end
436
-
437
- end # method go
438
-
439
- def create_precision_argv(file, opt)
440
- # include only those options specific
441
- new_argv = [file]
442
- if opt.f ; new_argv << '-f' << opt.f end
443
- if opt.o ; new_argv << '-o' << opt.o end
444
- new_argv
445
- end
446
-
447
- end # Runner
448
-
449
- ##################################################################
450
- # MAIN
451
- ##################################################################
452
-
453
- Runner.new.go(ARGV)
3
+ require 'spec_id/protein_summary'
454
4
 
5
+ ProteinSummary.new.create_from_command_line_args(ARGV)
455
6
 
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/ruby -w
2
+
3
+ require 'optparse'
4
+ require 'spec/mzxml'
5
+ require 'fileutils'
6
+
7
+ progname = File.basename(__FILE__)
8
+
9
+
10
+ opt = {}
11
+ opts = OptionParser.new do |op|
12
+ op.banner = "usage: #{progname} [OPTIONS] <file>.RAW ..."
13
+ op.separator ""
14
+ op.on("-p", "--profile", "uses profile output instead of centroid (default)") {|v| opt[:profile] = v}
15
+ end
16
+
17
+ opts.parse!
18
+
19
+ if ARGV.size == 0
20
+ puts opts
21
+ exit
22
+ end
23
+
24
+ converter = Spec::MzXML.find_mzxml_converter
25
+ if converter
26
+ $stderr.puts "using #{converter} to convert files"
27
+ else
28
+ puts "cannot find [#{Spec::MzXML::Potential_mzxml_converters.join(', ')}] in the paths:"
29
+ puts ENV['PATH'].split(/[:;]/).join(", ")
30
+ abort
31
+ end
32
+
33
+ files = ARGV.to_a
34
+ files.each do |file|
35
+ puts "******************************************"
36
+ puts "Converting: #{file}"
37
+ if converter =~ /readw/
38
+ centroid_or_profile = 'c'
39
+ if opt[:profile]
40
+ centroid_or_profile = 'p'
41
+ end
42
+ outfile = file.sub(/\.RAW$/i, '.mzXML')
43
+ cmd = "#{converter} #{file} #{centroid_or_profile} #{outfile}"
44
+ puts "Performing: '#{cmd}'"
45
+ puts `#{cmd}`
46
+ else
47
+ ## t2x only outputs in cwd!
48
+ Dir.chdir(File.dirname(file)) do |dir|
49
+ puts "Performing: '#{cmd}' in #{dir}"
50
+ puts `#{cmd}`
51
+ system "#{converter} #{File.basename(file)}"
52
+ end
53
+ end
54
+ puts "******************************************"
55
+ end
data/bin/srf_group.rb ADDED
@@ -0,0 +1,26 @@
1
+ #!/usr/bin/ruby
2
+
3
+
4
+ require 'optparse'
5
+ require 'spec_id/srf'
6
+
7
+ $OUTFILE = 'bioworks.srg'
8
+
9
+ opts = OptionParser.new do |op|
10
+ op.banner = "usage: #{File.basename(__FILE__)} <file1>.srf <file2>.srf ..."
11
+ op.separator "outputs: 'bioworks.srg'"
12
+ op.separator ""
13
+ op.separator " A '.srg' file is an ascii text file with a list"
14
+ op.separator " of the srf files (full path names) in that group."
15
+ op.separator ""
16
+ op.on('-o', '--output <filename>', 'a different output name') {|v| $OUTFILE }
17
+ end
18
+
19
+ if ARGV.size == 0
20
+ puts opts
21
+ end
22
+
23
+ obj = SRFGroup.new
24
+ obj.filenames = ARGV.to_a
25
+ obj.to_srg($OUTFILE)
26
+
data/changelog.txt CHANGED
@@ -1,4 +1,6 @@
1
1
 
2
+ ## version 0.1.7
3
+
2
4
  1. A couple of scripts and subroutines were hashing peptides but not on the file
3
5
  basename. This would result in slightly incorrect results (any time there
4
6
  were overlapping scan numbers in multiple datasets, only the top one would be
@@ -31,4 +33,9 @@ Rate' and 'FPR' from the package. It's been suggested that FP/(TP+FP) be
31
33
  called the False Positive Predictive Rate (FPPR). I will probably implement
32
34
  this in a future release.
33
35
 
36
+ ## version 0.2.0
34
37
 
38
+ ** This is a definite code breaker **
39
+ Revamped the way SpecID works (it is now subclassed). Since I want to return
40
+ the specific object that the file specifies, I use 'create' now instead of
41
+ 'new' (which forces one to return *that* class).
data/lib/align.rb CHANGED
@@ -24,12 +24,12 @@ class Align
24
24
  scanindex_by_basename_noext[runindex.basename_noext] = runindex.scans_by_num
25
25
  end
26
26
 
27
- dta_filenames = SpecID::Proph::Pep::Parser.new.dta_filenames_by_seq_charge(pep_proph_xml, "regex")
27
+ dta_filenames = Proph::Pep::Parser.new.dta_filenames_by_seq_charge(pep_proph_xml, "regex")
28
28
 
29
- parser = SpecID::Proph::Prot::Parser.new
29
+ parser = Proph::Prot::Parser.new
30
30
  parser.get_prots_and_peps(prot_xml, prot_prob, pep_init_prob, pep_nsp_prob, "regex")
31
31
  peptides = parser.peps
32
- peptides = SpecID::Proph::Pep.uniq_by_seqcharge(peptides)
32
+ peptides = Proph::Pep.uniq_by_seqcharge(peptides)
33
33
  ## we update each peptide with a list of dtafilenames
34
34
  ## then we update with a parallel list of scans (one for each dtafn...
35
35
  ## unless there are multiple scans associated with each filename
data/lib/fasta.rb CHANGED
@@ -1,5 +1,8 @@
1
1
  require 'sample_enzyme'
2
+ require 'each_index'
2
3
 
4
+
5
+ tmp = $VERBOSE ; $VERBOSE = nil
3
6
  class String
4
7
 
5
8
  def each_index
@@ -21,6 +24,8 @@ class String
21
24
  end
22
25
 
23
26
  end
27
+ $VERBOSE = tmp
28
+
24
29
 
25
30
 
26
31
  class Fasta
@@ -259,9 +264,9 @@ class Fasta
259
264
  end
260
265
 
261
266
  class Fasta::Prot
262
- attr_accessor :header, :aaseq
263
267
  # header given as full line with starting '>' (but no newline chars!).
264
268
  # aaseq also given without any newline chars
269
+ attr_accessor :header, :aaseq
265
270
  def initialize(header=nil, aaseq=nil)
266
271
  @header = header || ''
267
272
  if aaseq
data/lib/gi.rb CHANGED
@@ -40,19 +40,24 @@ class GI
40
40
  BATCH_SIZE = 500
41
41
  # takes an array of gi numbers and returns an array of annotations
42
42
  # This allows use of the batch search mode on NCBI
43
+ # returns nil if a SocketError is raised (e.g., no internet connection)
43
44
  def self.gi2annot(list_of_gi_numbers)
45
+ annots = []
44
46
  loop do
45
47
  batch = list_of_gi_numbers.slice!(0..BATCH_SIZE)
46
48
  if batch.size == 0 then break end
47
49
  string = batch.join(",")
48
50
  url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&retmode=xml&id=#{string}"
49
51
  #puts url
50
- annots = []
51
- open(url) do |handle|
52
- annots = parse_etool_output(handle)
52
+ begin
53
+ open(url) do |handle|
54
+ annots.push( *(parse_etool_output(handle)) )
55
+ end
56
+ rescue SocketError
57
+ return nil
53
58
  end
54
- annots
55
59
  end
60
+ annots
56
61
  end
57
62
 
58
63
  protected
data/lib/roc.rb CHANGED
@@ -90,6 +90,8 @@ end
90
90
  # For calculating precision given lists of hits and decoy hits. The hits are
91
91
  # assumed to have false positives within them that can be estimated from the
92
92
  # number of decoy hits at the same rate
93
+ # NOTE: this class assumes that lower scores are better. Negate your scores
94
+ # if this is not the case.
93
95
  class DecoyROC < ROC
94
96
 
95
97
  # returns the [num_hits, num_tps, precision] as a function of true
data/lib/sample_enzyme.rb CHANGED
@@ -1,6 +1,7 @@
1
+
1
2
  module SpecIDXML; end
2
3
 
3
- require 'spec_id'
4
+ require 'spec_id_xml'
4
5
  require 'strscan'
5
6
 
6
7
  class SampleEnzyme