RubyGems - mspire - Versions diffs - 0.3.9 → 0.4.2 - Mend

mspire 0.3.9 → 0.4.2

Files changed (87) hide show

data/INSTALL +24 -7
data/README +15 -13
data/README.rdoc +18 -0
data/Rakefile +50 -14
data/bin/aafreqs.rb +0 -0
data/bin/bioworks2excel.rb +0 -0
data/bin/bioworks_to_pepxml.rb +2 -1
data/bin/bioworks_to_pepxml_gui.rb +0 -0
data/bin/fasta_shaker.rb +0 -0
data/bin/filter_and_validate.rb +0 -0
data/bin/gi2annot.rb +0 -0
data/bin/id_class_anal.rb +0 -0
data/bin/id_precision.rb +0 -0
data/bin/ms_to_lmat.rb +0 -0
data/bin/pepproph_filter.rb +0 -0
data/bin/protein_summary.rb +0 -0
data/bin/protxml2prots_peps.rb +0 -0
data/bin/raw_to_mzXML.rb +3 -3
data/bin/run_percolator.rb +122 -0
data/bin/sqt_group.rb +0 -0
data/bin/srf_group.rb +0 -0
data/changelog.txt +29 -0
data/lib/ms/gradient_program.rb +0 -1
data/lib/ms/msrun.rb +62 -29
data/lib/ms/parser/mzdata/axml.rb +55 -0
data/lib/ms/parser/mzdata/dom.rb +51 -36
data/lib/ms/parser/mzdata.rb +8 -2
data/lib/ms/parser/mzxml/axml.rb +59 -0
data/lib/ms/parser/mzxml/dom.rb +80 -57
data/lib/ms/parser/mzxml/hpricot.rb +1 -1
data/lib/ms/parser/mzxml/libxml.rb +6 -2
data/lib/ms/parser/mzxml.rb +110 -3
data/lib/ms/parser.rb +4 -4
data/lib/ms/precursor.rb +19 -4
data/lib/ms/scan.rb +7 -7
data/lib/ms/spectrum.rb +249 -58
data/lib/mspire.rb +1 -1
data/lib/spec_id/bioworks.rb +2 -2
data/lib/spec_id/precision/filter/cmdline.rb +8 -1
data/lib/spec_id/precision/prob/cmdline.rb +2 -2
data/lib/spec_id/precision/prob.rb +1 -0
data/lib/spec_id/proph/pep_summary.rb +3 -4
data/lib/spec_id/proph/prot_summary.rb +3 -3
data/lib/spec_id/protein_summary.rb +1 -1
data/lib/spec_id/sequest/pepxml.rb +5 -5
data/lib/spec_id/sqt.rb +4 -4
data/lib/spec_id/srf.rb +49 -8
data/lib/spec_id.rb +5 -0
data/lib/xml_style_parser.rb +16 -2
data/script/compile_and_plot_smriti_final.rb +0 -0
data/script/create_little_pepxml.rb +0 -0
data/script/degenerate_peptides.rb +0 -0
data/script/estimate_fpr_by_cysteine.rb +0 -0
data/script/extract_gradient_programs.rb +1 -1
data/script/find_cysteine_background.rb +0 -0
data/script/genuine_tps_and_probs.rb +0 -0
data/script/get_apex_values_rexml.rb +0 -0
data/script/mascot_fix_pepxml.rb +123 -0
data/script/msvis.rb +0 -0
data/script/mzXML2timeIndex.rb +0 -0
data/script/peps_per_bin.rb +0 -0
data/script/prep_dir.rb +0 -0
data/script/simple_protein_digestion.rb +0 -0
data/script/smriti_final_analysis.rb +0 -0
data/script/sqt_to_meta.rb +0 -0
data/script/top_hit_per_scan.rb +0 -0
data/script/toppred_to_yaml.rb +0 -0
data/script/tpp_installer.rb +0 -0
data/specs/bin/prob_validate_spec.rb +5 -2
data/specs/bin/protein_summary_spec.rb +5 -1
data/specs/ms/msrun_spec.rb +176 -133
data/specs/ms/parser_spec.rb +3 -3
data/specs/ms/spectrum_spec.rb +0 -2
data/specs/spec_id/precision/filter_spec.rb +4 -1
data/specs/spec_id/precision/prob_spec.rb +2 -2
data/specs/spec_id/sequest/pepxml_spec.rb +1 -1
data/specs/spec_id/sqt_spec.rb +5 -5
data/specs/spec_id/srf_spec.rb +56 -93
data/specs/spec_id/srf_spec_helper.rb +121 -284
data/specs/spec_id_spec.rb +3 -0
data/specs/transmem/toppred_spec.rb +1 -0
data/test_files/opd1_2runs_2mods/data/020.mzData.xml +683 -0
data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +382 -0
data/test_files/opd1_2runs_2mods/data/040.mzData.xml +683 -0
data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +382 -0
data/test_files/opd1_2runs_2mods/data/README.txt +6 -0
metadata +247 -229

data/lib/spec_id/sqt.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 require 'spec_id'
-require 'array_class'
+require 'arrayclass'
 require 'set'
 class SQTGroup
@@ -176,7 +176,7 @@ end
 # all are cast as expected (total_intensity is a float)
 # mh = observed mh
-SQT::Spectrum = ArrayClass.new(%w[first_scan last_scan charge time_to_process node mh total_intensity lowest_sp num_matched_peptides matches])
+SQT::Spectrum = Arrayclass.new(%w[first_scan last_scan charge time_to_process node mh total_intensity lowest_sp num_matched_peptides matches])
 # 0=first_scan 1=last_scan 2=charge 3=time_to_process 4=node 5=mh 6=total_intensity 7=lowest_sp 8=num_matched_peptides 9=matches
@@ -262,7 +262,7 @@ class SQT::Spectrum
 end
 # SQT format uses only indices 0 - 9
-SQT::Match = ArrayClass.new(%w[rxcorr rsp mh deltacn_orig xcorr sp ions_matched ions_total sequence manual_validation_status first_scan last_scan charge deltacn aaseq base_name loci])
+SQT::Match = Arrayclass.new(%w[rxcorr rsp mh deltacn_orig xcorr sp ions_matched ions_total sequence manual_validation_status first_scan last_scan charge deltacn aaseq base_name loci])
 # 0=rxcorr 1=rsp 2=mh 3=deltacn_orig 4=xcorr 5=sp 6=ions_matched 7=ions_total 8=sequence 9=manual_validation_status 10=first_scan 11=last_scan 12=charge 13=deltacn 14=aaseq 15=base_name 16=loci
@@ -329,7 +329,7 @@ class SQT::Match::Percolator < SQT::Match
   end
 end
-SQT::Locus = ArrayClass.new(%w[locus description peps])
+SQT::Locus = Arrayclass.new(%w[locus description peps])
 class SQT::Locus
   include SpecID::Prot

data/lib/spec_id/srf.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+require 'fileutils'
 require 'spec_id'
 require 'spec_id/sequest'
 require 'fasta'
@@ -45,7 +47,7 @@ class SRFGroup
       if filenames.is_a?(String) && filenames =~ /\.srg$/
         srg_filename = filenames.dup
         @filename = srg_filename
-        filenames = IO.readlines(filenames).grep(/\w/).map {|v| v.chomp }
+        filenames = SRFGroup.srg_to_paths(filenames)
         filenames.each do |file|
           if !File.exist? file
             puts "File: #{file} in #{srg_filename} does not exist!"
@@ -64,6 +66,11 @@ class SRFGroup
     end
   end
+  # reads a srg file and delivers the path names
+  def self.srg_to_paths(file)
+    IO.readlines(file).grep(/\w/).map {|v| v.chomp }
+  end
   # if srfs were read in separately, then the proteins will need to be merged
   # by their reference
   def merge_different_sets(srfs)
@@ -200,6 +207,23 @@ class SRF
     sprintf("%.#{decimal_places}f", float)
   end
+  # not given an out_folder, will make one with the basename
+  def to_dta_files(out_folder=nil)
+    outdir =
+      if out_folder ; out_folder
+      else base_name
+      end
+    FileUtils.mkpath(outdir)
+    Dir.chdir(outdir) do
+      dta_files.zip(index) do |dta,i_ar|
+        File.open([base_name, *i_ar].join('.') << '.dta', 'wb') do |out|
+          dta.write_dta_file(out)
+        end
+      end
+    end
+  end
   # the out_filename will be the base_name + .sqt unless 'out_filename' is
   # defined
   # :round => round floating point numbers
@@ -389,7 +413,7 @@ class SRF
       else
         @params = Sequest::Params.new.parse_handle(fh)
         # This is very sensitive to the grab_params method in sequest params
-        fh.read(12)  ## gap between last params entry and index
+        fh.read(12)  ## gap between last params entry and index
         @index = read_scan_index(fh,@header.num_dta_files)
       end
     end
@@ -526,18 +550,26 @@ end
 class SRF::DTAGen
   ## not sure if this is correct
+  # Float
   attr_accessor :start_time
-  # group scan (not sure if this is correct)
+  # Float
   attr_accessor :start_mass
+  # Float
   attr_accessor :end_mass
+  # Integer
   attr_accessor :num_dta_files
+  # Integer
   attr_accessor :group_scan
   ## not sure if this is correct
+  # Integer
   attr_accessor :min_group_count
+  # Integer
   attr_accessor :min_ion_threshold
   #attr_accessor :intensity_threshold # can't find yet
   #attr_accessor :precursor_tolerance # can't find yet
+  # Integer
   attr_accessor :start_scan
+  # Integer
   attr_accessor :end_scan
   #
@@ -551,7 +583,7 @@ end
 # total_num_possible_charge_states is not correct under 3.5 (Bioworks 3.3.1)
 # unknown is, well unknown...
-SRF::DTA = ArrayClass.new(%w(mh dta_tic num_peaks charge ms_level unknown total_num_possible_charge_states peaks))
+SRF::DTA = Arrayclass.new(%w(mh dta_tic num_peaks charge ms_level unknown total_num_possible_charge_states peaks))
 class SRF::DTA
   # original
@@ -594,9 +626,18 @@ class SRF::DTA
     self
   end
+  # write a class dta file to the io object
+  def write_dta_file(io)
+    io.print("#{mh} #{charge}\r\n")
+    peak_ar = peaks.unpack('e*')
+    (0...(peak_ar.size)).step(2) do |i|
+      io.print( peak_ar[i,2].join(' '), "\r\n" )
+    end
+  end
 end
-SRF::OUT =  ArrayClass.new( %w(first_scan last_scan charge num_hits computer date_time hits total_inten lowest_sp num_matched_peptides db_locus_count) )
+SRF::OUT =  Arrayclass.new( %w(first_scan last_scan charge num_hits computer date_time hits total_inten lowest_sp num_matched_peptides db_locus_count) )
 # 0=first_scan, 1=last_scan, 2=charge, 3=num_hits, 4=computer, 5=date_time, 6=hits, 7=total_inten, 8=lowest_sp, 9=num_matched_peptides, 10=db_locus_count
 class SRF::OUT
@@ -666,7 +707,7 @@ end
 # the first one listed
 # srf = the srf object this scan came from
-SRF::OUT::Pep = ArrayClass.new(%w( mh deltacn_orig sp xcorr id num_other_loci rsp ions_matched ions_total sequence prots deltamass ppm aaseq base_name first_scan last_scan charge srf deltacn deltacn_orig_updated) )
+SRF::OUT::Pep = Arrayclass.new(%w( mh deltacn_orig sp xcorr id num_other_loci rsp ions_matched ions_total sequence prots deltamass ppm aaseq base_name first_scan last_scan charge srf deltacn deltacn_orig_updated) )
 # 0=mh 1=deltacn_orig 2=sp 3=xcorr 4=id 5=num_other_loci 6=rsp 7=ions_matched 8=ions_total 9=sequence 10=prots 11=deltamass 12=ppm 13=aaseq 14=base_name 15=first_scan 16=last_scan 17=charge 18=srf 19=deltacn 20=deltacn_orig_updated
@@ -787,7 +828,7 @@ class SRF::OUT::Pep
  end
-SRF::OUT::Prot = ArrayClass.new( %w(reference peps) )
+SRF::OUT::Prot = Arrayclass.new( %w(reference peps) )
 class SRF::OUT::Prot
   include SpecID::Prot
@@ -798,7 +839,7 @@ class SRF::OUT::Prot
   tmp = $VERBOSE ; $VERBOSE = nil
   def initialize(reference=nil, peps=[])
     #super(@@arr_size)
-    super(size)
+    super(self.class.size)
     #@reference = reference
     #@peps = peps
     self[0,2] = reference, peps

data/lib/spec_id.rb CHANGED Viewed

@@ -5,12 +5,17 @@ require 'roc'
 require 'sample_enzyme'  # for others
 require 'spec_id/bioworks'
 require 'spec_id/sequest'
 require 'spec_id/proph/prot_summary'
+require 'spec_id/proph/pep_summary'
 require 'spec_id_xml'
 require 'spec_id/sqt'
 require 'spec_id/mass'
 require 'fasta'
 module ProteinReferenceable ; end
 class SampleEnzyme ; end

data/lib/xml_style_parser.rb CHANGED Viewed

@@ -82,7 +82,7 @@ module XMLStyleParser
   end
   # seeks a subclass that has the public_method @method
-  def self.choose_parser(const, method)
+  def self.choose_parser(const, method, special_subclass=nil)
     ## First update @@parser_precedence to ensure we should get these guys
     parser_precedence = available_xml_parsers
@@ -95,10 +95,24 @@ module XMLStyleParser
     available = available_subclasses.select do |subclass|
       subclass.public_method_defined? method
     end
+    if special_subclass
+      available_special_subclasses = []
+      available.each do |subclass|
+        if subclass.const_defined?(special_subclass)
+          available_special_subclasses << subclass.const_get(special_subclass)
+        end
+      end
+      available = available_special_subclasses
+    end
     if available.size > 0
       available.first
     else
-      raise NoMethodError, "No parser of class #{const} can parse :#{method}\n** Is 'axml' (or another xml parser) installed and working? **"
+      warning = ""
+      if special_subclass
+        warning << "** while looking for special subclass: #{special_subclass} **\n"
+      end
+      warning << "No parser of class #{const} can parse :#{method}\n** Is 'axml' (or another xml parser) installed and working? **"
+      raise NoMethodError, warning
     end
   end

data/script/compile_and_plot_smriti_final.rb CHANGED Viewed

File without changes

data/script/create_little_pepxml.rb CHANGED Viewed

File without changes

data/script/degenerate_peptides.rb CHANGED Viewed

File without changes

data/script/estimate_fpr_by_cysteine.rb CHANGED Viewed

File without changes

data/script/extract_gradient_programs.rb CHANGED Viewed

@@ -3,7 +3,7 @@
 require 'optparse'
 require 'table'
-require 'spec/gradient_program'
+require 'ms/gradient_program'
 delimiter = "\t"
 table_format = false

data/script/find_cysteine_background.rb CHANGED Viewed

File without changes

data/script/genuine_tps_and_probs.rb CHANGED Viewed

File without changes

data/script/get_apex_values_rexml.rb CHANGED Viewed

File without changes

data/script/mascot_fix_pepxml.rb ADDED Viewed

@@ -0,0 +1,123 @@
+#!/usr/bin/ruby
+require 'rubygems'
+require 'ms/msrun'
+gem 'axml', '= 0.0.2'
+# returns an array containing one or two pairs of [cycle_num, time] that
+# represent the lowest and highest cycle numbers coupled to lowest and highest
+# time (in seconds) and the lowest and highest associated experiment numbers
+def get_cycle_exp_time_triplets(string)
+  hash = {}
+  cycle_index = nil
+  ssplit = string.split(', ')
+  ssplit.each_with_index do |piece,i|
+    if piece =~ /^Cycle\(s\):/
+      cycle_index = i
+      break
+    end
+  end
+  cycle_info = ssplit[cycle_index..-1].join(", ")
+  #Cycle(s): 663, 675 (Experiment 2), 667 (Experiment 4)
+  (header, info) = cycle_info.split(': ')
+  cycles = []
+  cycle_exp_pairs = []
+  info.split('), ').each do |a|
+    (nums, exp_num) = a.split('(')
+    nums = nums.split(', ').map {|v| v.to_i }
+    exp_num = exp_num.split(' ').last.sub(/\)$/,'').to_i
+    nums.each {|v| cycle_exp_pairs << [v, exp_num] }
+  end
+  min = cycle_exp_pairs.min
+  max = cycle_exp_pairs.max
+  elution = ssplit.select {|v| v.match(/^Elution:(.*)/) }.first
+  times = elution.split(': ').last
+  times.sub!(/ min$/,'')
+  times = times.split(' to ')
+  times.map! do |v|
+    (minutes, minute_decimals) = v.split('.')
+    seconds = minutes.to_f * 60
+    seconds + ( minute_decimals.to_f * 60 / 100 )
+  end
+  if max == min
+    [[min.first, min.last, times.first]]
+  else
+    [[min.first, min.last, times.first], [max.first, max.last, times.last]]
+  end
+end
+def get_scan_num(cycle, cycle_time, time_to_scan_num)
+  # grossly inefficient, but guaranteed to get right answer!
+  below_scan = nil
+  time_to_scan_num.each do |scan_time, scan_num|
+    if scan_time < cycle_time
+      below_scan = scan_num
+    else
+      break  # scan_time > cycle_time
+    end
+  end
+  below_scan
+end
+#####################################################
+# MAIN:
+#####################################################
+additional_ext = ".with_scan_nums"
+if ARGV.size != 2
+  puts "usage: #{File.basename(__FILE__)} <file>.pepXML <file>.mzXML"
+  puts ""
+  puts "uses information from the mzXML file to fix the pepXML file"
+  puts "(adds in msms_run_summary: 'base_name' and 'raw_data' attributes;"
+  puts " adds scan numbers based on cycle and experiment times)"
+  puts ""
+  puts "outputs: <file>#{additional_ext}.pepXML"
+  exit
+end
+# get time_to_scan_num for msLevel=1 from the mzXML file
+(pepxml, mzxml) = ARGV
+mzxml_basename = File.basename(mzxml).sub(/\.mzxml$/i, '')
+ext = File.extname(pepxml)
+output = pepxml.sub(Regexp.new(Regexp.escape(ext)), additional_ext + ext)
+ms = MS::MSRun.new(mzxml, :lazy => :no_spectra)
+time_to_scan_num = ms.scans.select {|scan| scan.ms_level == 1 }.map do |scan|
+  [scan.time, scan.num]
+end
+# update spectrum queries based on scan number
+root = AXML.parse_file(pepxml)
+# fix the basename stuff:
+msms_r_summary_n = root.child
+atts = msms_r_summary_n.attrs
+atts['base_name'] = mzxml_basename
+atts['raw_data'] = '.mzXML'
+root.child.find("child::spectrum_query").each do |sq|
+  triplets = get_cycle_exp_time_triplets(sq['spectrum'])
+  triplets.map! do |triplet|
+    [get_scan_num(triplet[0], triplet[2], time_to_scan_num), *triplet]
+  end
+  # [scan_num, cycle, exp, time]
+  quad = triplets.first
+  first_scan_num = (quad[0] + quad[2] - 1)
+  sq.attrs['start_scan'] = first_scan_num.to_s
+  sq.attrs['end_scan'] =
+    if triplets.size > 1
+      quad = triplets.last
+      (quad[0] + quad[2] - 1).to_s
+    else
+      first_scan_num.to_s
+    end
+end
+xml_header = '<?xml version="1.0" encoding="UTF-8"?>'
+File.open(output, 'w') {|out| out.puts(xml_header); out.print root.to_s }

data/script/msvis.rb CHANGED Viewed

File without changes

data/script/mzXML2timeIndex.rb CHANGED Viewed

File without changes

data/script/peps_per_bin.rb CHANGED Viewed

File without changes

data/script/prep_dir.rb CHANGED Viewed

File without changes

data/script/simple_protein_digestion.rb CHANGED Viewed

File without changes

data/script/smriti_final_analysis.rb CHANGED Viewed

File without changes

data/script/sqt_to_meta.rb CHANGED Viewed

File without changes

data/script/top_hit_per_scan.rb CHANGED Viewed

File without changes

data/script/toppred_to_yaml.rb CHANGED Viewed

File without changes

data/script/tpp_installer.rb CHANGED Viewed

File without changes

data/specs/bin/prob_validate_spec.rb CHANGED Viewed

@@ -42,7 +42,7 @@ describe 'filter_and_validate.rb on small bioworks file' do
   it 'outputs to yaml' do
     reply = @st_to_yaml.call( @args )
-    keys = [:probabilities, :params, :pephits_precision, :charges, :aaseqs, :count].map {|v| v.to_s }.sort
+    keys = [:probabilities, :params, :pephits, :pephits_precision, :charges, :aaseqs, :count].map {|v| v.to_s }.sort
     reply.keys.map {|v| v.to_s}.sort.should == keys
   end
@@ -55,7 +55,10 @@ describe 'filter_and_validate.rb on small bioworks file' do
     #normal_nsp = @st_to_yaml.call( @args + " --prob nsp" )
     #normal.should == normal_nsp
     init = @st_to_yaml.call( @args + " --prob init" )
-    init.should_not == normal
+    init[:pephits_precision].first[:values].should_not == normal[:pephits_precision].first[:values]
     init[:pephits_precision].first[:values].zip([1.0, 0.95, 0.963333333333333, 0.8025]) do |got,exp|
       got.should be_close(exp, 0.000000000001)
     end

data/specs/bin/protein_summary_spec.rb CHANGED Viewed

@@ -1,10 +1,14 @@
 require File.expand_path( File.dirname(__FILE__) + '/../spec_helper' )
-xdescribe 'protein_summary.rb' do
+describe 'protein_summary.rb' do
    before(:all) do
     @progname = 'protein_summary.rb'
   end
   it_should_behave_like 'a cmdline program'
+  it 'outputs basic protein prophet -prot.xml summary' do
+  end
 end