mspire 0.3.9 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. data/INSTALL +24 -7
  2. data/README +15 -13
  3. data/README.rdoc +18 -0
  4. data/Rakefile +50 -14
  5. data/bin/aafreqs.rb +0 -0
  6. data/bin/bioworks2excel.rb +0 -0
  7. data/bin/bioworks_to_pepxml.rb +2 -1
  8. data/bin/bioworks_to_pepxml_gui.rb +0 -0
  9. data/bin/fasta_shaker.rb +0 -0
  10. data/bin/filter_and_validate.rb +0 -0
  11. data/bin/gi2annot.rb +0 -0
  12. data/bin/id_class_anal.rb +0 -0
  13. data/bin/id_precision.rb +0 -0
  14. data/bin/ms_to_lmat.rb +0 -0
  15. data/bin/pepproph_filter.rb +0 -0
  16. data/bin/protein_summary.rb +0 -0
  17. data/bin/protxml2prots_peps.rb +0 -0
  18. data/bin/raw_to_mzXML.rb +3 -3
  19. data/bin/run_percolator.rb +122 -0
  20. data/bin/sqt_group.rb +0 -0
  21. data/bin/srf_group.rb +0 -0
  22. data/changelog.txt +29 -0
  23. data/lib/ms/gradient_program.rb +0 -1
  24. data/lib/ms/msrun.rb +62 -29
  25. data/lib/ms/parser/mzdata/axml.rb +55 -0
  26. data/lib/ms/parser/mzdata/dom.rb +51 -36
  27. data/lib/ms/parser/mzdata.rb +8 -2
  28. data/lib/ms/parser/mzxml/axml.rb +59 -0
  29. data/lib/ms/parser/mzxml/dom.rb +80 -57
  30. data/lib/ms/parser/mzxml/hpricot.rb +1 -1
  31. data/lib/ms/parser/mzxml/libxml.rb +6 -2
  32. data/lib/ms/parser/mzxml.rb +110 -3
  33. data/lib/ms/parser.rb +4 -4
  34. data/lib/ms/precursor.rb +19 -4
  35. data/lib/ms/scan.rb +7 -7
  36. data/lib/ms/spectrum.rb +249 -58
  37. data/lib/mspire.rb +1 -1
  38. data/lib/spec_id/bioworks.rb +2 -2
  39. data/lib/spec_id/precision/filter/cmdline.rb +8 -1
  40. data/lib/spec_id/precision/prob/cmdline.rb +2 -2
  41. data/lib/spec_id/precision/prob.rb +1 -0
  42. data/lib/spec_id/proph/pep_summary.rb +3 -4
  43. data/lib/spec_id/proph/prot_summary.rb +3 -3
  44. data/lib/spec_id/protein_summary.rb +1 -1
  45. data/lib/spec_id/sequest/pepxml.rb +5 -5
  46. data/lib/spec_id/sqt.rb +4 -4
  47. data/lib/spec_id/srf.rb +49 -8
  48. data/lib/spec_id.rb +5 -0
  49. data/lib/xml_style_parser.rb +16 -2
  50. data/script/compile_and_plot_smriti_final.rb +0 -0
  51. data/script/create_little_pepxml.rb +0 -0
  52. data/script/degenerate_peptides.rb +0 -0
  53. data/script/estimate_fpr_by_cysteine.rb +0 -0
  54. data/script/extract_gradient_programs.rb +1 -1
  55. data/script/find_cysteine_background.rb +0 -0
  56. data/script/genuine_tps_and_probs.rb +0 -0
  57. data/script/get_apex_values_rexml.rb +0 -0
  58. data/script/mascot_fix_pepxml.rb +123 -0
  59. data/script/msvis.rb +0 -0
  60. data/script/mzXML2timeIndex.rb +0 -0
  61. data/script/peps_per_bin.rb +0 -0
  62. data/script/prep_dir.rb +0 -0
  63. data/script/simple_protein_digestion.rb +0 -0
  64. data/script/smriti_final_analysis.rb +0 -0
  65. data/script/sqt_to_meta.rb +0 -0
  66. data/script/top_hit_per_scan.rb +0 -0
  67. data/script/toppred_to_yaml.rb +0 -0
  68. data/script/tpp_installer.rb +0 -0
  69. data/specs/bin/prob_validate_spec.rb +5 -2
  70. data/specs/bin/protein_summary_spec.rb +5 -1
  71. data/specs/ms/msrun_spec.rb +176 -133
  72. data/specs/ms/parser_spec.rb +3 -3
  73. data/specs/ms/spectrum_spec.rb +0 -2
  74. data/specs/spec_id/precision/filter_spec.rb +4 -1
  75. data/specs/spec_id/precision/prob_spec.rb +2 -2
  76. data/specs/spec_id/sequest/pepxml_spec.rb +1 -1
  77. data/specs/spec_id/sqt_spec.rb +5 -5
  78. data/specs/spec_id/srf_spec.rb +56 -93
  79. data/specs/spec_id/srf_spec_helper.rb +121 -284
  80. data/specs/spec_id_spec.rb +3 -0
  81. data/specs/transmem/toppred_spec.rb +1 -0
  82. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +683 -0
  83. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +382 -0
  84. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +683 -0
  85. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +382 -0
  86. data/test_files/opd1_2runs_2mods/data/README.txt +6 -0
  87. metadata +247 -229
data/lib/spec_id/sqt.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require 'spec_id'
2
- require 'array_class'
2
+ require 'arrayclass'
3
3
  require 'set'
4
4
 
5
5
  class SQTGroup
@@ -176,7 +176,7 @@ end
176
176
 
177
177
  # all are cast as expected (total_intensity is a float)
178
178
  # mh = observed mh
179
- SQT::Spectrum = ArrayClass.new(%w[first_scan last_scan charge time_to_process node mh total_intensity lowest_sp num_matched_peptides matches])
179
+ SQT::Spectrum = Arrayclass.new(%w[first_scan last_scan charge time_to_process node mh total_intensity lowest_sp num_matched_peptides matches])
180
180
 
181
181
  # 0=first_scan 1=last_scan 2=charge 3=time_to_process 4=node 5=mh 6=total_intensity 7=lowest_sp 8=num_matched_peptides 9=matches
182
182
 
@@ -262,7 +262,7 @@ class SQT::Spectrum
262
262
  end
263
263
 
264
264
  # SQT format uses only indices 0 - 9
265
- SQT::Match = ArrayClass.new(%w[rxcorr rsp mh deltacn_orig xcorr sp ions_matched ions_total sequence manual_validation_status first_scan last_scan charge deltacn aaseq base_name loci])
265
+ SQT::Match = Arrayclass.new(%w[rxcorr rsp mh deltacn_orig xcorr sp ions_matched ions_total sequence manual_validation_status first_scan last_scan charge deltacn aaseq base_name loci])
266
266
 
267
267
  # 0=rxcorr 1=rsp 2=mh 3=deltacn_orig 4=xcorr 5=sp 6=ions_matched 7=ions_total 8=sequence 9=manual_validation_status 10=first_scan 11=last_scan 12=charge 13=deltacn 14=aaseq 15=base_name 16=loci
268
268
 
@@ -329,7 +329,7 @@ class SQT::Match::Percolator < SQT::Match
329
329
  end
330
330
  end
331
331
 
332
- SQT::Locus = ArrayClass.new(%w[locus description peps])
332
+ SQT::Locus = Arrayclass.new(%w[locus description peps])
333
333
 
334
334
  class SQT::Locus
335
335
  include SpecID::Prot
data/lib/spec_id/srf.rb CHANGED
@@ -1,3 +1,5 @@
1
+ require 'fileutils'
2
+
1
3
  require 'spec_id'
2
4
  require 'spec_id/sequest'
3
5
  require 'fasta'
@@ -45,7 +47,7 @@ class SRFGroup
45
47
  if filenames.is_a?(String) && filenames =~ /\.srg$/
46
48
  srg_filename = filenames.dup
47
49
  @filename = srg_filename
48
- filenames = IO.readlines(filenames).grep(/\w/).map {|v| v.chomp }
50
+ filenames = SRFGroup.srg_to_paths(filenames)
49
51
  filenames.each do |file|
50
52
  if !File.exist? file
51
53
  puts "File: #{file} in #{srg_filename} does not exist!"
@@ -64,6 +66,11 @@ class SRFGroup
64
66
  end
65
67
  end
66
68
 
69
+ # reads a .srg file and returns the path names it lists (lines with no word character are skipped; line endings are chomped)
70
+ def self.srg_to_paths(file)
71
+ IO.readlines(file).grep(/\w/).map {|v| v.chomp }
72
+ end
73
+
67
74
  # if srfs were read in separately, then the proteins will need to be merged
68
75
  # by their reference
69
76
  def merge_different_sets(srfs)
@@ -200,6 +207,23 @@ class SRF
200
207
  sprintf("%.#{decimal_places}f", float)
201
208
  end
202
209
 
210
+ # writes one .dta file per dta_files entry into out_folder; if no out_folder is given, a folder named after base_name is created and used
211
+ def to_dta_files(out_folder=nil)
212
+ outdir =
213
+ if out_folder ; out_folder
214
+ else base_name
215
+ end
216
+
217
+ FileUtils.mkpath(outdir)
218
+ Dir.chdir(outdir) do
219
+ dta_files.zip(index) do |dta,i_ar|
220
+ File.open([base_name, *i_ar].join('.') << '.dta', 'wb') do |out|
221
+ dta.write_dta_file(out)
222
+ end
223
+ end
224
+ end
225
+ end
226
+
203
227
  # the out_filename will be the base_name + .sqt unless 'out_filename' is
204
228
  # defined
205
229
  # :round => round floating point numbers
@@ -389,7 +413,7 @@ class SRF
389
413
  else
390
414
  @params = Sequest::Params.new.parse_handle(fh)
391
415
  # This is very sensitive to the grab_params method in sequest params
392
- fh.read(12) ## gap between last params entry and index
416
+ fh.read(12) ## gap between last params entry and index
393
417
  @index = read_scan_index(fh,@header.num_dta_files)
394
418
  end
395
419
  end
@@ -526,18 +550,26 @@ end
526
550
  class SRF::DTAGen
527
551
 
528
552
  ## not sure if this is correct
553
+ # Float
529
554
  attr_accessor :start_time
530
- # group scan (not sure if this is correct)
555
+ # Float
531
556
  attr_accessor :start_mass
557
+ # Float
532
558
  attr_accessor :end_mass
559
+ # Integer
533
560
  attr_accessor :num_dta_files
561
+ # Integer
534
562
  attr_accessor :group_scan
535
563
  ## not sure if this is correct
564
+ # Integer
536
565
  attr_accessor :min_group_count
566
+ # Integer
537
567
  attr_accessor :min_ion_threshold
538
568
  #attr_accessor :intensity_threshold # can't find yet
539
569
  #attr_accessor :precursor_tolerance # can't find yet
570
+ # Integer
540
571
  attr_accessor :start_scan
572
+ # Integer
541
573
  attr_accessor :end_scan
542
574
 
543
575
  #
@@ -551,7 +583,7 @@ end
551
583
 
552
584
  # total_num_possible_charge_states is not correct under 3.5 (Bioworks 3.3.1)
553
585
  # unknown is, well unknown...
554
- SRF::DTA = ArrayClass.new(%w(mh dta_tic num_peaks charge ms_level unknown total_num_possible_charge_states peaks))
586
+ SRF::DTA = Arrayclass.new(%w(mh dta_tic num_peaks charge ms_level unknown total_num_possible_charge_states peaks))
555
587
 
556
588
  class SRF::DTA
557
589
  # original
@@ -594,9 +626,18 @@ class SRF::DTA
594
626
  self
595
627
  end
596
628
 
629
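+ # writes this dta to the io object in .dta text form: an "mh charge" line, then one line per pair of values unpacked from peaks (all lines end in \r\n)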
630
+ def write_dta_file(io)
631
+ io.print("#{mh} #{charge}\r\n")
632
+ peak_ar = peaks.unpack('e*')
633
+ (0...(peak_ar.size)).step(2) do |i|
634
+ io.print( peak_ar[i,2].join(' '), "\r\n" )
635
+ end
636
+ end
637
+
597
638
  end
598
639
 
599
- SRF::OUT = ArrayClass.new( %w(first_scan last_scan charge num_hits computer date_time hits total_inten lowest_sp num_matched_peptides db_locus_count) )
640
+ SRF::OUT = Arrayclass.new( %w(first_scan last_scan charge num_hits computer date_time hits total_inten lowest_sp num_matched_peptides db_locus_count) )
600
641
  # 0=first_scan, 1=last_scan, 2=charge, 3=num_hits, 4=computer, 5=date_time, 6=hits, 7=total_inten, 8=lowest_sp, 9=num_matched_peptides, 10=db_locus_count
601
642
 
602
643
  class SRF::OUT
@@ -666,7 +707,7 @@ end
666
707
  # the first one listed
667
708
  # srf = the srf object this scan came from
668
709
 
669
- SRF::OUT::Pep = ArrayClass.new(%w( mh deltacn_orig sp xcorr id num_other_loci rsp ions_matched ions_total sequence prots deltamass ppm aaseq base_name first_scan last_scan charge srf deltacn deltacn_orig_updated) )
710
+ SRF::OUT::Pep = Arrayclass.new(%w( mh deltacn_orig sp xcorr id num_other_loci rsp ions_matched ions_total sequence prots deltamass ppm aaseq base_name first_scan last_scan charge srf deltacn deltacn_orig_updated) )
670
711
 
671
712
  # 0=mh 1=deltacn_orig 2=sp 3=xcorr 4=id 5=num_other_loci 6=rsp 7=ions_matched 8=ions_total 9=sequence 10=prots 11=deltamass 12=ppm 13=aaseq 14=base_name 15=first_scan 16=last_scan 17=charge 18=srf 19=deltacn 20=deltacn_orig_updated
672
713
 
@@ -787,7 +828,7 @@ class SRF::OUT::Pep
787
828
 
788
829
  end
789
830
 
790
- SRF::OUT::Prot = ArrayClass.new( %w(reference peps) )
831
+ SRF::OUT::Prot = Arrayclass.new( %w(reference peps) )
791
832
 
792
833
  class SRF::OUT::Prot
793
834
  include SpecID::Prot
@@ -798,7 +839,7 @@ class SRF::OUT::Prot
798
839
  tmp = $VERBOSE ; $VERBOSE = nil
799
840
  def initialize(reference=nil, peps=[])
800
841
  #super(@@arr_size)
801
- super(size)
842
+ super(self.class.size)
802
843
  #@reference = reference
803
844
  #@peps = peps
804
845
  self[0,2] = reference, peps
data/lib/spec_id.rb CHANGED
@@ -5,12 +5,17 @@ require 'roc'
5
5
  require 'sample_enzyme' # for others
6
6
  require 'spec_id/bioworks'
7
7
  require 'spec_id/sequest'
8
+
8
9
  require 'spec_id/proph/prot_summary'
10
+ require 'spec_id/proph/pep_summary'
11
+
9
12
  require 'spec_id_xml'
10
13
  require 'spec_id/sqt'
11
14
  require 'spec_id/mass'
12
15
  require 'fasta'
13
16
 
17
+
18
+
14
19
  module ProteinReferenceable ; end
15
20
 
16
21
  class SampleEnzyme ; end
@@ -82,7 +82,7 @@ module XMLStyleParser
82
82
  end
83
83
 
84
84
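  # seeks a subclass that has the public method 'method' (if special_subclass is given, returns the constant of that name nested in such a subclass instead)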
85
- def self.choose_parser(const, method)
85
+ def self.choose_parser(const, method, special_subclass=nil)
86
86
  ## First update @@parser_precedence to ensure we should get these guys
87
87
  parser_precedence = available_xml_parsers
88
88
 
@@ -95,10 +95,24 @@ module XMLStyleParser
95
95
  available = available_subclasses.select do |subclass|
96
96
  subclass.public_method_defined? method
97
97
  end
98
+ if special_subclass
99
+ available_special_subclasses = []
100
+ available.each do |subclass|
101
+ if subclass.const_defined?(special_subclass)
102
+ available_special_subclasses << subclass.const_get(special_subclass)
103
+ end
104
+ end
105
+ available = available_special_subclasses
106
+ end
98
107
  if available.size > 0
99
108
  available.first
100
109
  else
101
- raise NoMethodError, "No parser of class #{const} can parse :#{method}\n** Is 'axml' (or another xml parser) installed and working? **"
110
+ warning = ""
111
+ if special_subclass
112
+ warning << "** while looking for special subclass: #{special_subclass} **\n"
113
+ end
114
+ warning << "No parser of class #{const} can parse :#{method}\n** Is 'axml' (or another xml parser) installed and working? **"
115
+ raise NoMethodError, warning
102
116
  end
103
117
  end
104
118
 
File without changes
File without changes
File without changes
File without changes
@@ -3,7 +3,7 @@
3
3
  require 'optparse'
4
4
  require 'table'
5
5
 
6
- require 'spec/gradient_program'
6
+ require 'ms/gradient_program'
7
7
 
8
8
  delimiter = "\t"
9
9
  table_format = false
File without changes
File without changes
File without changes
@@ -0,0 +1,123 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'rubygems'
4
+ require 'ms/msrun'
5
+ gem 'axml', '= 0.0.2'
6
+
7
+ # returns an array containing one or two triplets of [cycle_num, exp_num, time]:
8
+ # the lowest [cycle, experiment] pair with the first elution time and, when it
9
+ # differs, the highest [cycle, experiment] pair with the last time (in seconds)
10
+ def get_cycle_exp_time_triplets(string)
11
+ hash = {}
12
+ cycle_index = nil
13
+ ssplit = string.split(', ')
14
+ ssplit.each_with_index do |piece,i|
15
+ if piece =~ /^Cycle\(s\):/
16
+ cycle_index = i
17
+ break
18
+ end
19
+ end
20
+ cycle_info = ssplit[cycle_index..-1].join(", ")
21
+ #Cycle(s): 663, 675 (Experiment 2), 667 (Experiment 4)
22
+ (header, info) = cycle_info.split(': ')
23
+ cycles = []
24
+ cycle_exp_pairs = []
25
+ info.split('), ').each do |a|
26
+ (nums, exp_num) = a.split('(')
27
+ nums = nums.split(', ').map {|v| v.to_i }
28
+ exp_num = exp_num.split(' ').last.sub(/\)$/,'').to_i
29
+ nums.each {|v| cycle_exp_pairs << [v, exp_num] }
30
+ end
31
+
32
+ min = cycle_exp_pairs.min
33
+ max = cycle_exp_pairs.max
34
+
35
+ elution = ssplit.select {|v| v.match(/^Elution:(.*)/) }.first
36
+ times = elution.split(': ').last
37
+ times.sub!(/ min$/,'')
38
+ times = times.split(' to ')
39
+ times.map! do |v|
40
+ (minutes, minute_decimals) = v.split('.')
41
+ seconds = minutes.to_f * 60
42
+ seconds + ( minute_decimals.to_f * 60 / 100 )
43
+ end
44
+
45
+ if max == min
46
+ [[min.first, min.last, times.first]]
47
+ else
48
+ [[min.first, min.last, times.first], [max.first, max.last, times.last]]
49
+ end
50
+ end
51
+
52
+ def get_scan_num(cycle, cycle_time, time_to_scan_num)
53
+ # grossly inefficient, but guaranteed to get right answer!
54
+ below_scan = nil
55
+ time_to_scan_num.each do |scan_time, scan_num|
56
+ if scan_time < cycle_time
57
+ below_scan = scan_num
58
+ else
59
+ break # scan_time > cycle_time
60
+ end
61
+ end
62
+ below_scan
63
+ end
64
+
65
+ #####################################################
66
+ # MAIN:
67
+ #####################################################
68
+
69
+ additional_ext = ".with_scan_nums"
70
+
71
+ if ARGV.size != 2
72
+ puts "usage: #{File.basename(__FILE__)} <file>.pepXML <file>.mzXML"
73
+ puts ""
74
+ puts "uses information from the mzXML file to fix the pepXML file"
75
+ puts "(adds in msms_run_summary: 'base_name' and 'raw_data' attributes;"
76
+ puts " adds scan numbers based on cycle and experiment times)"
77
+ puts ""
78
+ puts "outputs: <file>#{additional_ext}.pepXML"
79
+ exit
80
+ end
81
+
82
+ # get time_to_scan_num for msLevel=1 from the mzXML file
83
+ (pepxml, mzxml) = ARGV
84
+ mzxml_basename = File.basename(mzxml).sub(/\.mzxml$/i, '')
85
+
86
+ ext = File.extname(pepxml)
87
+ output = pepxml.sub(Regexp.new(Regexp.escape(ext)), additional_ext + ext)
88
+
89
+ ms = MS::MSRun.new(mzxml, :lazy => :no_spectra)
90
+ time_to_scan_num = ms.scans.select {|scan| scan.ms_level == 1 }.map do |scan|
91
+ [scan.time, scan.num]
92
+ end
93
+
94
+ # update spectrum queries based on scan number
95
+
96
+ root = AXML.parse_file(pepxml)
97
+ # fix the basename stuff:
98
+ msms_r_summary_n = root.child
99
+ atts = msms_r_summary_n.attrs
100
+ atts['base_name'] = mzxml_basename
101
+ atts['raw_data'] = '.mzXML'
102
+
103
+ root.child.find("child::spectrum_query").each do |sq|
104
+ triplets = get_cycle_exp_time_triplets(sq['spectrum'])
105
+ triplets.map! do |triplet|
106
+ [get_scan_num(triplet[0], triplet[2], time_to_scan_num), *triplet]
107
+ end
108
+ # [scan_num, cycle, exp, time]
109
+ quad = triplets.first
110
+ first_scan_num = (quad[0] + quad[2] - 1)
111
+ sq.attrs['start_scan'] = first_scan_num.to_s
112
+ sq.attrs['end_scan'] =
113
+ if triplets.size > 1
114
+ quad = triplets.last
115
+ (quad[0] + quad[2] - 1).to_s
116
+ else
117
+ first_scan_num.to_s
118
+ end
119
+ end
120
+
121
+ xml_header = '<?xml version="1.0" encoding="UTF-8"?>'
122
+ File.open(output, 'w') {|out| out.puts(xml_header); out.print root.to_s }
123
+
data/script/msvis.rb CHANGED
File without changes
File without changes
File without changes
data/script/prep_dir.rb CHANGED
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
@@ -42,7 +42,7 @@ describe 'filter_and_validate.rb on small bioworks file' do
42
42
 
43
43
  it 'outputs to yaml' do
44
44
  reply = @st_to_yaml.call( @args )
45
- keys = [:probabilities, :params, :pephits_precision, :charges, :aaseqs, :count].map {|v| v.to_s }.sort
45
+ keys = [:probabilities, :params, :pephits, :pephits_precision, :charges, :aaseqs, :count].map {|v| v.to_s }.sort
46
46
  reply.keys.map {|v| v.to_s}.sort.should == keys
47
47
  end
48
48
 
@@ -55,7 +55,10 @@ describe 'filter_and_validate.rb on small bioworks file' do
55
55
  #normal_nsp = @st_to_yaml.call( @args + " --prob nsp" )
56
56
  #normal.should == normal_nsp
57
57
  init = @st_to_yaml.call( @args + " --prob init" )
58
- init.should_not == normal
58
+
59
+ init[:pephits_precision].first[:values].should_not == normal[:pephits_precision].first[:values]
60
+
61
+
59
62
  init[:pephits_precision].first[:values].zip([1.0, 0.95, 0.963333333333333, 0.8025]) do |got,exp|
60
63
  got.should be_close(exp, 0.000000000001)
61
64
  end
@@ -1,10 +1,14 @@
1
1
  require File.expand_path( File.dirname(__FILE__) + '/../spec_helper' )
2
2
 
3
- xdescribe 'protein_summary.rb' do
3
+ describe 'protein_summary.rb' do
4
4
 
5
5
  before(:all) do
6
6
  @progname = 'protein_summary.rb'
7
7
  end
8
8
  it_should_behave_like 'a cmdline program'
9
9
 
10
+ it 'outputs basic protein prophet -prot.xml summary' do
11
+
12
+ end
13
+
10
14
  end