mspire 0.3.9 → 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (87) hide show
  1. data/INSTALL +24 -7
  2. data/README +15 -13
  3. data/README.rdoc +18 -0
  4. data/Rakefile +50 -14
  5. data/bin/aafreqs.rb +0 -0
  6. data/bin/bioworks2excel.rb +0 -0
  7. data/bin/bioworks_to_pepxml.rb +2 -1
  8. data/bin/bioworks_to_pepxml_gui.rb +0 -0
  9. data/bin/fasta_shaker.rb +0 -0
  10. data/bin/filter_and_validate.rb +0 -0
  11. data/bin/gi2annot.rb +0 -0
  12. data/bin/id_class_anal.rb +0 -0
  13. data/bin/id_precision.rb +0 -0
  14. data/bin/ms_to_lmat.rb +0 -0
  15. data/bin/pepproph_filter.rb +0 -0
  16. data/bin/protein_summary.rb +0 -0
  17. data/bin/protxml2prots_peps.rb +0 -0
  18. data/bin/raw_to_mzXML.rb +3 -3
  19. data/bin/run_percolator.rb +122 -0
  20. data/bin/sqt_group.rb +0 -0
  21. data/bin/srf_group.rb +0 -0
  22. data/changelog.txt +29 -0
  23. data/lib/ms/gradient_program.rb +0 -1
  24. data/lib/ms/msrun.rb +62 -29
  25. data/lib/ms/parser/mzdata/axml.rb +55 -0
  26. data/lib/ms/parser/mzdata/dom.rb +51 -36
  27. data/lib/ms/parser/mzdata.rb +8 -2
  28. data/lib/ms/parser/mzxml/axml.rb +59 -0
  29. data/lib/ms/parser/mzxml/dom.rb +80 -57
  30. data/lib/ms/parser/mzxml/hpricot.rb +1 -1
  31. data/lib/ms/parser/mzxml/libxml.rb +6 -2
  32. data/lib/ms/parser/mzxml.rb +110 -3
  33. data/lib/ms/parser.rb +4 -4
  34. data/lib/ms/precursor.rb +19 -4
  35. data/lib/ms/scan.rb +7 -7
  36. data/lib/ms/spectrum.rb +249 -58
  37. data/lib/mspire.rb +1 -1
  38. data/lib/spec_id/bioworks.rb +2 -2
  39. data/lib/spec_id/precision/filter/cmdline.rb +8 -1
  40. data/lib/spec_id/precision/prob/cmdline.rb +2 -2
  41. data/lib/spec_id/precision/prob.rb +1 -0
  42. data/lib/spec_id/proph/pep_summary.rb +3 -4
  43. data/lib/spec_id/proph/prot_summary.rb +3 -3
  44. data/lib/spec_id/protein_summary.rb +1 -1
  45. data/lib/spec_id/sequest/pepxml.rb +5 -5
  46. data/lib/spec_id/sqt.rb +4 -4
  47. data/lib/spec_id/srf.rb +49 -8
  48. data/lib/spec_id.rb +5 -0
  49. data/lib/xml_style_parser.rb +16 -2
  50. data/script/compile_and_plot_smriti_final.rb +0 -0
  51. data/script/create_little_pepxml.rb +0 -0
  52. data/script/degenerate_peptides.rb +0 -0
  53. data/script/estimate_fpr_by_cysteine.rb +0 -0
  54. data/script/extract_gradient_programs.rb +1 -1
  55. data/script/find_cysteine_background.rb +0 -0
  56. data/script/genuine_tps_and_probs.rb +0 -0
  57. data/script/get_apex_values_rexml.rb +0 -0
  58. data/script/mascot_fix_pepxml.rb +123 -0
  59. data/script/msvis.rb +0 -0
  60. data/script/mzXML2timeIndex.rb +0 -0
  61. data/script/peps_per_bin.rb +0 -0
  62. data/script/prep_dir.rb +0 -0
  63. data/script/simple_protein_digestion.rb +0 -0
  64. data/script/smriti_final_analysis.rb +0 -0
  65. data/script/sqt_to_meta.rb +0 -0
  66. data/script/top_hit_per_scan.rb +0 -0
  67. data/script/toppred_to_yaml.rb +0 -0
  68. data/script/tpp_installer.rb +0 -0
  69. data/specs/bin/prob_validate_spec.rb +5 -2
  70. data/specs/bin/protein_summary_spec.rb +5 -1
  71. data/specs/ms/msrun_spec.rb +176 -133
  72. data/specs/ms/parser_spec.rb +3 -3
  73. data/specs/ms/spectrum_spec.rb +0 -2
  74. data/specs/spec_id/precision/filter_spec.rb +4 -1
  75. data/specs/spec_id/precision/prob_spec.rb +2 -2
  76. data/specs/spec_id/sequest/pepxml_spec.rb +1 -1
  77. data/specs/spec_id/sqt_spec.rb +5 -5
  78. data/specs/spec_id/srf_spec.rb +56 -93
  79. data/specs/spec_id/srf_spec_helper.rb +121 -284
  80. data/specs/spec_id_spec.rb +3 -0
  81. data/specs/transmem/toppred_spec.rb +1 -0
  82. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +683 -0
  83. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +382 -0
  84. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +683 -0
  85. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +382 -0
  86. data/test_files/opd1_2runs_2mods/data/README.txt +6 -0
  87. metadata +247 -229
data/lib/spec_id/sqt.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require 'spec_id'
2
- require 'array_class'
2
+ require 'arrayclass'
3
3
  require 'set'
4
4
 
5
5
  class SQTGroup
@@ -176,7 +176,7 @@ end
176
176
 
177
177
  # all are cast as expected (total_intensity is a float)
178
178
  # mh = observed mh
179
- SQT::Spectrum = ArrayClass.new(%w[first_scan last_scan charge time_to_process node mh total_intensity lowest_sp num_matched_peptides matches])
179
+ SQT::Spectrum = Arrayclass.new(%w[first_scan last_scan charge time_to_process node mh total_intensity lowest_sp num_matched_peptides matches])
180
180
 
181
181
  # 0=first_scan 1=last_scan 2=charge 3=time_to_process 4=node 5=mh 6=total_intensity 7=lowest_sp 8=num_matched_peptides 9=matches
182
182
 
@@ -262,7 +262,7 @@ class SQT::Spectrum
262
262
  end
263
263
 
264
264
  # SQT format uses only indices 0 - 9
265
- SQT::Match = ArrayClass.new(%w[rxcorr rsp mh deltacn_orig xcorr sp ions_matched ions_total sequence manual_validation_status first_scan last_scan charge deltacn aaseq base_name loci])
265
+ SQT::Match = Arrayclass.new(%w[rxcorr rsp mh deltacn_orig xcorr sp ions_matched ions_total sequence manual_validation_status first_scan last_scan charge deltacn aaseq base_name loci])
266
266
 
267
267
  # 0=rxcorr 1=rsp 2=mh 3=deltacn_orig 4=xcorr 5=sp 6=ions_matched 7=ions_total 8=sequence 9=manual_validation_status 10=first_scan 11=last_scan 12=charge 13=deltacn 14=aaseq 15=base_name 16=loci
268
268
 
@@ -329,7 +329,7 @@ class SQT::Match::Percolator < SQT::Match
329
329
  end
330
330
  end
331
331
 
332
- SQT::Locus = ArrayClass.new(%w[locus description peps])
332
+ SQT::Locus = Arrayclass.new(%w[locus description peps])
333
333
 
334
334
  class SQT::Locus
335
335
  include SpecID::Prot
data/lib/spec_id/srf.rb CHANGED
@@ -1,3 +1,5 @@
1
+ require 'fileutils'
2
+
1
3
  require 'spec_id'
2
4
  require 'spec_id/sequest'
3
5
  require 'fasta'
@@ -45,7 +47,7 @@ class SRFGroup
45
47
  if filenames.is_a?(String) && filenames =~ /\.srg$/
46
48
  srg_filename = filenames.dup
47
49
  @filename = srg_filename
48
- filenames = IO.readlines(filenames).grep(/\w/).map {|v| v.chomp }
50
+ filenames = SRFGroup.srg_to_paths(filenames)
49
51
  filenames.each do |file|
50
52
  if !File.exist? file
51
53
  puts "File: #{file} in #{srg_filename} does not exist!"
@@ -64,6 +66,11 @@ class SRFGroup
64
66
  end
65
67
  end
66
68
 
69
# reads a srg file and delivers the path names (one per line of the file).
# Lines containing no word characters (blank/whitespace-only) are dropped and
# the trailing line terminator is removed from each remaining line.
#
# File.readlines is used rather than IO.readlines so that a name beginning
# with '|' is treated as a plain path and never run as a command.
def self.srg_to_paths(file)
  File.readlines(file).grep(/\w/).map {|v| v.chomp }
end
73
+
67
74
  # if srfs were read in separately, then the proteins will need to be merged
68
75
  # by their reference
69
76
  def merge_different_sets(srfs)
@@ -200,6 +207,23 @@ class SRF
200
207
  sprintf("%.#{decimal_places}f", float)
201
208
  end
202
209
 
210
# Writes one .dta file per entry of dta_files into out_folder.
# not given an out_folder, will make one with the basename
#
# Each file is named "<base_name>.<index entry joined by '.'>.dta", pairing
# dta_files with index positionally.  The directory is created if needed.
#
# Paths are built with File.join instead of wrapping the loop in Dir.chdir:
# chdir changes the working directory of the whole process, which interferes
# with other threads and with any enclosing chdir block.
def to_dta_files(out_folder=nil)
  outdir = out_folder || base_name

  FileUtils.mkpath(outdir)
  dta_files.zip(index) do |dta,i_ar|
    dta_name = [base_name, *i_ar].join('.') << '.dta'
    File.open(File.join(outdir, dta_name), 'wb') do |out|
      dta.write_dta_file(out)
    end
  end
end
226
+
203
227
  # the out_filename will be the base_name + .sqt unless 'out_filename' is
204
228
  # defined
205
229
  # :round => round floating point numbers
@@ -389,7 +413,7 @@ class SRF
389
413
  else
390
414
  @params = Sequest::Params.new.parse_handle(fh)
391
415
  # This is very sensitive to the grab_params method in sequest params
392
- fh.read(12) ## gap between last params entry and index
416
+ fh.read(12) ## gap between last params entry and index
393
417
  @index = read_scan_index(fh,@header.num_dta_files)
394
418
  end
395
419
  end
@@ -526,18 +550,26 @@ end
526
550
  class SRF::DTAGen
527
551
 
528
552
  ## not sure if this is correct
553
+ # Float
529
554
  attr_accessor :start_time
530
- # group scan (not sure if this is correct)
555
+ # Float
531
556
  attr_accessor :start_mass
557
+ # Float
532
558
  attr_accessor :end_mass
559
+ # Integer
533
560
  attr_accessor :num_dta_files
561
+ # Integer
534
562
  attr_accessor :group_scan
535
563
  ## not sure if this is correct
564
+ # Integer
536
565
  attr_accessor :min_group_count
566
+ # Integer
537
567
  attr_accessor :min_ion_threshold
538
568
  #attr_accessor :intensity_threshold # can't find yet
539
569
  #attr_accessor :precursor_tolerance # can't find yet
570
+ # Integer
540
571
  attr_accessor :start_scan
572
+ # Integer
541
573
  attr_accessor :end_scan
542
574
 
543
575
  #
@@ -551,7 +583,7 @@ end
551
583
 
552
584
  # total_num_possible_charge_states is not correct under 3.5 (Bioworks 3.3.1)
553
585
  # unknown is, well unknown...
554
- SRF::DTA = ArrayClass.new(%w(mh dta_tic num_peaks charge ms_level unknown total_num_possible_charge_states peaks))
586
+ SRF::DTA = Arrayclass.new(%w(mh dta_tic num_peaks charge ms_level unknown total_num_possible_charge_states peaks))
555
587
 
556
588
  class SRF::DTA
557
589
  # original
@@ -594,9 +626,18 @@ class SRF::DTA
594
626
  self
595
627
  end
596
628
 
629
# Emits this spectrum in .dta text form onto io: a header line holding
# "mh charge", then one line per pair of consecutive values decoded from the
# packed peaks string (little-endian single-precision floats).  Every line
# is terminated with "\r\n".
def write_dta_file(io)
  io.print("#{mh} #{charge}\r\n")
  values = peaks.unpack('e*')
  pair_starts = (0...(values.size))
  pair_starts.step(2) do |start|
    pair = values[start,2]
    io.print( pair.join(' '), "\r\n" )
  end
end
637
+
597
638
  end
598
639
 
599
- SRF::OUT = ArrayClass.new( %w(first_scan last_scan charge num_hits computer date_time hits total_inten lowest_sp num_matched_peptides db_locus_count) )
640
+ SRF::OUT = Arrayclass.new( %w(first_scan last_scan charge num_hits computer date_time hits total_inten lowest_sp num_matched_peptides db_locus_count) )
600
641
  # 0=first_scan, 1=last_scan, 2=charge, 3=num_hits, 4=computer, 5=date_time, 6=hits, 7=total_inten, 8=lowest_sp, 9=num_matched_peptides, 10=db_locus_count
601
642
 
602
643
  class SRF::OUT
@@ -666,7 +707,7 @@ end
666
707
  # the first one listed
667
708
  # srf = the srf object this scan came from
668
709
 
669
- SRF::OUT::Pep = ArrayClass.new(%w( mh deltacn_orig sp xcorr id num_other_loci rsp ions_matched ions_total sequence prots deltamass ppm aaseq base_name first_scan last_scan charge srf deltacn deltacn_orig_updated) )
710
+ SRF::OUT::Pep = Arrayclass.new(%w( mh deltacn_orig sp xcorr id num_other_loci rsp ions_matched ions_total sequence prots deltamass ppm aaseq base_name first_scan last_scan charge srf deltacn deltacn_orig_updated) )
670
711
 
671
712
  # 0=mh 1=deltacn_orig 2=sp 3=xcorr 4=id 5=num_other_loci 6=rsp 7=ions_matched 8=ions_total 9=sequence 10=prots 11=deltamass 12=ppm 13=aaseq 14=base_name 15=first_scan 16=last_scan 17=charge 18=srf 19=deltacn 20=deltacn_orig_updated
672
713
 
@@ -787,7 +828,7 @@ class SRF::OUT::Pep
787
828
 
788
829
  end
789
830
 
790
- SRF::OUT::Prot = ArrayClass.new( %w(reference peps) )
831
+ SRF::OUT::Prot = Arrayclass.new( %w(reference peps) )
791
832
 
792
833
  class SRF::OUT::Prot
793
834
  include SpecID::Prot
@@ -798,7 +839,7 @@ class SRF::OUT::Prot
798
839
  tmp = $VERBOSE ; $VERBOSE = nil
799
840
  def initialize(reference=nil, peps=[])
800
841
  #super(@@arr_size)
801
- super(size)
842
+ super(self.class.size)
802
843
  #@reference = reference
803
844
  #@peps = peps
804
845
  self[0,2] = reference, peps
data/lib/spec_id.rb CHANGED
@@ -5,12 +5,17 @@ require 'roc'
5
5
  require 'sample_enzyme' # for others
6
6
  require 'spec_id/bioworks'
7
7
  require 'spec_id/sequest'
8
+
8
9
  require 'spec_id/proph/prot_summary'
10
+ require 'spec_id/proph/pep_summary'
11
+
9
12
  require 'spec_id_xml'
10
13
  require 'spec_id/sqt'
11
14
  require 'spec_id/mass'
12
15
  require 'fasta'
13
16
 
17
+
18
+
14
19
  module ProteinReferenceable ; end
15
20
 
16
21
  class SampleEnzyme ; end
@@ -82,7 +82,7 @@ module XMLStyleParser
82
82
  end
83
83
 
84
84
  # seeks a subclass that has the public_method @method
85
- def self.choose_parser(const, method)
85
+ def self.choose_parser(const, method, special_subclass=nil)
86
86
  ## First update @@parser_precedence to ensure we should get these guys
87
87
  parser_precedence = available_xml_parsers
88
88
 
@@ -95,10 +95,24 @@ module XMLStyleParser
95
95
  available = available_subclasses.select do |subclass|
96
96
  subclass.public_method_defined? method
97
97
  end
98
+ if special_subclass
99
+ available_special_subclasses = []
100
+ available.each do |subclass|
101
+ if subclass.const_defined?(special_subclass)
102
+ available_special_subclasses << subclass.const_get(special_subclass)
103
+ end
104
+ end
105
+ available = available_special_subclasses
106
+ end
98
107
  if available.size > 0
99
108
  available.first
100
109
  else
101
- raise NoMethodError, "No parser of class #{const} can parse :#{method}\n** Is 'axml' (or another xml parser) installed and working? **"
110
+ warning = ""
111
+ if special_subclass
112
+ warning << "** while looking for special subclass: #{special_subclass} **\n"
113
+ end
114
+ warning << "No parser of class #{const} can parse :#{method}\n** Is 'axml' (or another xml parser) installed and working? **"
115
+ raise NoMethodError, warning
102
116
  end
103
117
  end
104
118
 
File without changes
File without changes
File without changes
File without changes
@@ -3,7 +3,7 @@
3
3
  require 'optparse'
4
4
  require 'table'
5
5
 
6
- require 'spec/gradient_program'
6
+ require 'ms/gradient_program'
7
7
 
8
8
  delimiter = "\t"
9
9
  table_format = false
File without changes
File without changes
File without changes
@@ -0,0 +1,123 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'rubygems'
4
+ require 'ms/msrun'
5
+ gem 'axml', '= 0.0.2'
6
+
7
# returns an array containing one or two triplets of
# [cycle_num, experiment_num, time] taken from a spectrum title that holds an
# "Elution: <from> to <to> min" entry and a "Cycle(s): ..." entry, e.g.
#   Cycle(s): 663, 675 (Experiment 2), 667 (Experiment 4)
#
# The first triplet couples the lowest [cycle, experiment] pair with the first
# elution time; the second (present only when lowest and highest differ)
# couples the highest pair with the last elution time.  Times are converted
# from decimal minutes to seconds.
#
# Raises ArgumentError if the title has no 'Cycle(s):' or 'Elution:' entry.
def get_cycle_exp_time_triplets(string)
  ssplit = string.split(', ')

  cycle_index = ssplit.index {|piece| piece =~ /^Cycle\(s\):/ }
  unless cycle_index
    raise ArgumentError, "no 'Cycle(s):' entry in spectrum title: #{string.inspect}"
  end

  # the cycle list itself contains ', ' so glue the trailing pieces back together
  cycle_info = ssplit[cycle_index..-1].join(", ")
  #Cycle(s): 663, 675 (Experiment 2), 667 (Experiment 4)
  info = cycle_info.split(': ')[1]
  cycle_exp_pairs = []
  info.split('), ').each do |a|
    (nums, exp_num) = a.split('(')
    nums = nums.split(', ').map {|v| v.to_i }
    exp_num = exp_num.split(' ').last.sub(/\)$/,'').to_i
    nums.each {|v| cycle_exp_pairs << [v, exp_num] }
  end

  # arrays compare element-wise, so this orders by cycle, then experiment
  min = cycle_exp_pairs.min
  max = cycle_exp_pairs.max

  elution = ssplit.find {|v| v =~ /^Elution:/ }
  unless elution
    raise ArgumentError, "no 'Elution:' entry in spectrum title: #{string.inspect}"
  end

  times = elution.split(': ').last.sub(/ min$/,'').split(' to ').map do |v|
    (minutes, minute_decimals) = v.split('.')
    minute_decimals = minute_decimals.to_s
    # scale by the actual number of decimal digits (not a fixed /100) so that
    # "12.5" is 12.5 minutes; two-digit input is computed exactly as before
    (minutes.to_f * 60) + ( minute_decimals.to_f * 60 / (10 ** minute_decimals.length) )
  end

  if max == min
    [[min.first, min.last, times.first]]
  else
    [[min.first, min.last, times.first], [max.first, max.last, times.last]]
  end
end
51
+
52
# Walks time_to_scan_num (an enumerable of [scan_time, scan_num] pairs) in
# order and returns the scan_num of the last pair seen before the first pair
# whose scan_time is not below cycle_time.  Returns nil when the very first
# pair is already at or past cycle_time (or there are no pairs).
# The cycle argument is accepted but not consulted.
def get_scan_num(cycle, cycle_time, time_to_scan_num)
  # a plain linear walk: slow for many lookups, but simple
  last_below = nil
  time_to_scan_num.each do |scan_time, scan_num|
    break unless scan_time < cycle_time # reached scan_time >= cycle_time
    last_below = scan_num
  end
  last_below
end
64
+
65
+ #####################################################
66
+ # MAIN:
67
+ #####################################################
68
+
69
additional_ext = ".with_scan_nums"

if ARGV.size != 2
  puts "usage: #{File.basename(__FILE__)} <file>.pepXML <file>.mzXML"
  puts ""
  puts "uses information from the mzXML file to fix the pepXML file"
  puts "(adds in msms_run_summary: 'base_name' and 'raw_data' attributes;"
  puts " adds scan numbers based on cycle and experiment times)"
  puts ""
  puts "outputs: <file>#{additional_ext}.pepXML"
  exit
end

# get time_to_scan_num for msLevel=1 from the mzXML file
(pepxml, mzxml) = ARGV
mzxml_basename = File.basename(mzxml).sub(/\.mzxml$/i, '')

# Insert the marker directly in front of the trailing extension.  Slicing the
# extension off the end (instead of substituting its first occurrence) keeps
# paths such as "runs.xml/sample.xml" intact and appends the marker when the
# name has no extension at all.
ext = File.extname(pepxml)
output = pepxml[0, pepxml.size - ext.size] + additional_ext + ext

ms = MS::MSRun.new(mzxml, :lazy => :no_spectra)
time_to_scan_num = ms.scans.select {|scan| scan.ms_level == 1 }.map do |scan|
  [scan.time, scan.num]
end

# update spectrum queries based on scan number

root = AXML.parse_file(pepxml)
# fix the basename stuff:
msms_r_summary_n = root.child
atts = msms_r_summary_n.attrs
atts['base_name'] = mzxml_basename
atts['raw_data'] = '.mzXML'

root.child.find("child::spectrum_query").each do |sq|
  # each entry becomes [scan_num, cycle, exp, time]
  quads = get_cycle_exp_time_triplets(sq['spectrum']).map do |triplet|
    scan_num = get_scan_num(triplet[0], triplet[2], time_to_scan_num)
    unless scan_num
      # fail with the offending spectrum named rather than with nil arithmetic
      raise "no msLevel=1 scan earlier than #{triplet[2]} sec for spectrum: #{sq['spectrum']}"
    end
    [scan_num, *triplet]
  end

  # scan number = preceding msLevel=1 scan + (experiment number - 1)
  first_quad = quads.first
  last_quad = quads.last
  sq.attrs['start_scan'] = (first_quad[0] + first_quad[2] - 1).to_s
  # with a single triplet, first and last are the same entry
  sq.attrs['end_scan'] = (last_quad[0] + last_quad[2] - 1).to_s
end

xml_header = '<?xml version="1.0" encoding="UTF-8"?>'
File.open(output, 'w') {|out| out.puts(xml_header); out.print root.to_s }
123
+
data/script/msvis.rb CHANGED
File without changes
File without changes
File without changes
data/script/prep_dir.rb CHANGED
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
@@ -42,7 +42,7 @@ describe 'filter_and_validate.rb on small bioworks file' do
42
42
 
43
43
  it 'outputs to yaml' do
44
44
  reply = @st_to_yaml.call( @args )
45
- keys = [:probabilities, :params, :pephits_precision, :charges, :aaseqs, :count].map {|v| v.to_s }.sort
45
+ keys = [:probabilities, :params, :pephits, :pephits_precision, :charges, :aaseqs, :count].map {|v| v.to_s }.sort
46
46
  reply.keys.map {|v| v.to_s}.sort.should == keys
47
47
  end
48
48
 
@@ -55,7 +55,10 @@ describe 'filter_and_validate.rb on small bioworks file' do
55
55
  #normal_nsp = @st_to_yaml.call( @args + " --prob nsp" )
56
56
  #normal.should == normal_nsp
57
57
  init = @st_to_yaml.call( @args + " --prob init" )
58
- init.should_not == normal
58
+
59
+ init[:pephits_precision].first[:values].should_not == normal[:pephits_precision].first[:values]
60
+
61
+
59
62
  init[:pephits_precision].first[:values].zip([1.0, 0.95, 0.963333333333333, 0.8025]) do |got,exp|
60
63
  got.should be_close(exp, 0.000000000001)
61
64
  end
@@ -1,10 +1,14 @@
1
1
  require File.expand_path( File.dirname(__FILE__) + '/../spec_helper' )
2
2
 
3
- xdescribe 'protein_summary.rb' do
3
+ describe 'protein_summary.rb' do
4
4
 
5
5
  before(:all) do
6
6
  @progname = 'protein_summary.rb'
7
7
  end
8
8
  it_should_behave_like 'a cmdline program'
9
9
 
10
+ it 'outputs basic protein prophet -prot.xml summary' do
11
+
12
+ end
13
+
10
14
  end