RubyGems - mspire - Versions diffs - 0.3.1 → 0.3.9 - Mend

mspire 0.3.1 → 0.3.9

This diff shows the differences between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (62)

data/Rakefile +2 -2
data/bin/bioworks_to_pepxml.rb +15 -3
data/bin/ms_to_lmat.rb +2 -1
data/bin/sqt_group.rb +26 -0
data/changelog.txt +36 -0
data/lib/ms/msrun.rb +3 -1
data/lib/ms/parser/mzdata/dom.rb +14 -14
data/lib/ms/scan.rb +3 -3
data/lib/mspire.rb +1 -1
data/lib/sample_enzyme.rb +39 -0
data/lib/spec_id.rb +18 -0
data/lib/spec_id/aa_freqs.rb +6 -9
data/lib/spec_id/digestor.rb +16 -17
data/lib/spec_id/mass.rb +63 -1
data/lib/spec_id/parser/proph.rb +101 -2
data/lib/spec_id/precision/filter.rb +3 -2
data/lib/spec_id/precision/filter/cmdline.rb +3 -1
data/lib/spec_id/precision/filter/output.rb +1 -0
data/lib/spec_id/precision/prob.rb +88 -21
data/lib/spec_id/precision/prob/cmdline.rb +28 -16
data/lib/spec_id/precision/prob/output.rb +8 -2
data/lib/spec_id/proph/pep_summary.rb +25 -12
data/lib/spec_id/sequest.rb +28 -0
data/lib/spec_id/sequest/pepxml.rb +142 -197
data/lib/spec_id/sqt.rb +349 -0
data/lib/spec_id/srf.rb +33 -23
data/lib/validator.rb +40 -57
data/lib/validator/aa.rb +3 -90
data/lib/validator/aa_est.rb +112 -0
data/lib/validator/cmdline.rb +163 -31
data/lib/validator/decoy.rb +15 -7
data/lib/validator/digestion_based.rb +5 -4
data/lib/validator/q_value.rb +32 -0
data/script/peps_per_bin.rb +67 -0
data/script/sqt_to_meta.rb +24 -0
data/specs/bin/bioworks_to_pepxml_spec.rb +3 -3
data/specs/bin/fasta_shaker_spec.rb +2 -2
data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +7 -10
data/specs/bin/filter_and_validate_spec.rb +25 -6
data/specs/bin/ms_to_lmat_spec.rb +2 -2
data/specs/bin/prob_validate_spec.rb +5 -3
data/specs/sample_enzyme_spec.rb +86 -1
data/specs/spec_helper.rb +11 -9
data/specs/spec_id/bioworks_spec.rb +2 -1
data/specs/spec_id/precision/filter_spec.rb +5 -5
data/specs/spec_id/precision/prob_spec.rb +0 -67
data/specs/spec_id/proph/pep_summary_spec.rb +42 -87
data/specs/spec_id/protein_summary_spec.rb +4 -4
data/specs/spec_id/sequest/pepxml_spec.rb +1 -79
data/specs/spec_id/sequest_spec.rb +38 -0
data/specs/spec_id/sqt_spec.rb +111 -3
data/specs/spec_id_spec.rb +2 -0
data/specs/transmem/phobius_spec.rb +3 -1
data/specs/transmem/toppred_spec.rb +1 -1
data/specs/validator/aa_est_spec.rb +66 -0
data/specs/validator/aa_spec.rb +1 -68
data/specs/validator/background_spec.rb +2 -0
data/specs/validator/bias_spec.rb +3 -27
data/specs/validator/decoy_spec.rb +2 -2
data/specs/validator/transmem_spec.rb +2 -1
data/test_files/small.sqt +87 -0
metadata +312 -293

data/Rakefile CHANGED Viewed

@@ -238,8 +238,8 @@ spec = Gem::Specification.new do |s|
   s.rdoc_options = rdoc_options
   s.extra_rdoc_files = rdoc_extra_includes
   s.executables = FL["bin/*"].map {|file| File.basename(file) }
-  s.add_dependency('libjtp', '~> 0.2.12')
-  s.add_dependency('axml')
+  s.add_dependency('libjtp', '~> 0.2.13')
+  s.add_dependency('axml', '~> 0.0.0')
   s.requirements << '"libxml" is the prefered xml parser right now.  libxml, xmlparser, REXML and regular expressions are used as fallback in some routines.'
   s.requirements << 'some plotting functions will not be available without the "gnuplot" gem (and underlying gnuplot binary)'
   s.requirements << 'the "t2x" binary (in archive) or readw.exe is required to convert .RAW files to mzXML in some applications'

data/bin/bioworks_to_pepxml.rb CHANGED Viewed

@@ -43,14 +43,26 @@ opt_obj = OptionParser.new do |op|
   op.separator "Options:"
   op.on('-h', '--help', "display this and more notes and exit") {|v| opt.help = v }
   op.on('-o', '--outdir path', "output directory     d: '#{DEFAULT_OUTDIR}'") {|v| opt.outdir = v }
+  op.on('--sample_enzyme <type>', "For digested samples run with no enzymatic",
+                                  "search constraint, the enzyme used for",
+                                  "digestion, options: 'Trypsin_KR_P'") {|v|
+    case v
+    when 'Trypsin_KR_P'
+      opt.sample_enzyme = SampleEnzyme.new("trypsin")
+    else
+      raise ArgumentError, "Don't recognize enzyme: #{v}"
+    end
+  }
+  op.on('-a', '--all_hits', "includes all hits, not just top xcorr") {|v| opt.all_hits = v }
+  op.on('--deltacn_orig', "top hit deltacn = 0.0, (no deltacnstar att)") {|v| opt.deltacn_orig = v }
+  op.on('-m', '--mspath path', "path to MS files     d: '#{DEFAULT_MZ_PATH}'") {|v| opt.mspath = v }
+  op.on('--copy_mzxml', "copies mzXML files to outdir path"){|v| opt.copy_mzxml = v }
   op.separator ""
   op.separator "bioworks.xml files may require additional options:"
   op.separator ""
   op.on('-p', '--params file', "sequest params file  d: '#{DEFAULT_PARAMS_FILE}'") {|v| opt.params = v }
   op.on('-d', '--dbpath path', "path to databases    d: '#{DEFAULT_DATABASE_PATH}'") {|v| opt.dbpath = v }
-  op.on('-m', '--mspath path', "path to MS files     d: '#{DEFAULT_MZ_PATH}'") {|v| opt.mspath = v }
-  op.on('--copy_mzxml', "copies mzXML files to outdir path"){|v| opt.copy_mzxml = v }
   op.on('--model <LCQ|Orbi|string>', "MS model      (xml)  d: '#{DEFAULT_MS_MODEL}'") {|v| opt.model = v }
   op.on('--mass_analyzer <string>',  "Mass Analyzer (xml)  d: '#{DEFAULT_MASS_ANALYZER}'") {|v| opt.mass_analyzer = v }
@@ -131,5 +143,5 @@ opt.params ||= DEFAULT_PARAMS_FILE
 opt.mass_analyzer ||= DEFAULT_MASS_ANALYZER
 opt.model ||= DEFAULT_MS_MODEL
-xml_objs = Sequest::PepXML.set_from_bioworks(bioworks_file, {:params => opt.params, :ms_data => opt.mspath, :out_path => opt.outdir, :model => model, :backup_db_path => opt.dbpath, :copy_mzxml => opt.copy_mzxml, :ms_mass_analyzer => opt.mass_analyzer, :print => true})
+xml_objs = Sequest::PepXML.set_from_bioworks(bioworks_file, {:params => opt.params, :ms_data => opt.mspath, :out_path => opt.outdir, :model => model, :backup_db_path => opt.dbpath, :copy_mzxml => opt.copy_mzxml, :ms_mass_analyzer => opt.mass_analyzer, :print => true, :all_hits => opt.all_hits, :deltacn_orig => opt.deltacn_orig, :sample_enzyme => opt.sample_enzyme})

data/bin/ms_to_lmat.rb CHANGED Viewed

@@ -47,7 +47,8 @@ ARGV.each do |file|
   }
   args.merge!(opt)
   lmat = LMat.new.from_times_and_spectra(times, spectra, args)
-  outfile = file.sub(/\.mzXML$/, opt[:newext])
+  ext = File.extname(file)
+  outfile = file.sub(/#{Regexp.escape(ext)}$/, opt[:newext])
   if args[:ascii]
     outfile << "a"
     lmat.print(outfile)

data/bin/sqt_group.rb ADDED Viewed

@@ -0,0 +1,26 @@
+#!/usr/bin/ruby
+require 'optparse'
+require 'spec_id/sqt'
+$OUTFILE = 'bioworks.sqg'
+opts = OptionParser.new do |op|
+  op.banner = "usage: #{File.basename(__FILE__)} <file1>.sqt <file2>.sqt ..."
+  op.separator "outputs: 'bioworks.sqg'"
+  op.separator ""
+  op.separator "    A '.sqg' file is an ascii text file with a list"
+  op.separator "    of the sqt files (full path names) in that group."
+  op.separator ""
+  op.on('-o', '--output <filename>', 'a different output name') {|v| $OUTFILE }
+end
+if ARGV.size == 0
+  puts opts
+  exit
+end
+obj = SQTGroup.new
+obj.filenames = ARGV.to_a
+obj.to_sqg($OUTFILE)

data/changelog.txt CHANGED Viewed

@@ -126,3 +126,39 @@ interfaces and implementations (using ArrayClass)
 ## version 0.3.1
 1. Bug fix in srf filtering (num_hits adjusted)
+## version 0.3.2
+1. Uses sequest peptide_mass_tolerance filter on srf group files by default
+now.
+## version 0.3.3
+1. Worked out minor kinks in prob_precision.rb
+## version 0.3.4
+1. filters >= +3 charged ions now.
+## version 0.3.5
+1. fixed creation of background distribution in validators (hash_by base_name,
+first_scan, charge now)
+## version 0.3.6
+1. split off bad_aa_est from bad_aa
+## version 0.3.7
+1. can deal with No_Enzyme searches now (while still capable of setting
+sample_enzyme)
+## version 0.3.8
+1. can set a decoy to target ratio for decoy validation
+2. added mass calculator in Mass::Calculator
+## version 0.3.9
+1. doesn't clobber mzdata filename in ms_to_lmat.rb conversion

data/lib/ms/msrun.rb CHANGED Viewed

@@ -30,7 +30,9 @@ class MS::MSRun
     myopts = opts.dup ; myopts[:msrun] = self
     if file
       filetype_and_version = MS::Parser.filetype_and_version(file)
-      MS::Parser.new(filetype_and_version, :msrun).parse(file, myopts)
+      parser = MS::Parser.new(filetype_and_version, :msrun)
+      parser.parse(file, myopts)
+      #MS::Parser.new(filetype_and_version, :msrun).parse(file, myopts)
       (@filetype, @version) = filetype_and_version
     end
   end

data/lib/ms/parser/mzdata/dom.rb CHANGED Viewed

@@ -51,23 +51,20 @@ class MS::Parser::MzData::DOM
     # %w(num msLevel retentionTime startMz endMz precursors spectrum)
     root = get_root_node_from_file(file)
-    scan_count = 0
     description = root.find_first('child::description')
     bioworks33 = is_bioworks33?(description)
     spectrum_list = description.next
-    scans =
-    if bioworks33
-      [] #bioworks33 gives incorrect scan numbers!
-    else
-      Array(spectrum_list['count'].to_i)
-    end
+    scans = []
+    # bioworks 33 gives incorrect scan count
+    stated_num_scans = spectrum_list['count'].to_i
     # if I move from node to node, it means I've checked that it's a sequence
     # and that the elements are req'd
     if spectrum_list.child?
       spectrum_n = spectrum_list.child
       loop do
-        scan_count += 1
         scan = MS::Scan.new(9)
         id = spectrum_n["id"].to_i
         id_to_scan_hash[id] = scan
@@ -81,11 +78,9 @@ class MS::Parser::MzData::DOM
         spec_inst_n = spec_settings_n.find_first('child::spectrumInstrument')
         scan[1] = spec_inst_n['msLevel'].to_i
-        if bioworks33
-          scans << scan # we can't trust the scan count!
-        else
-          scans[scan_count] = scan
-        end
+        # we could use a scan_count, but in bioworks 33, we can't trust the
+        # scan count!  So, we just collect them
+        scans << scan
         scan[3] = spec_inst_n['mzRangeStart'].to_f
         scan[4] = spec_inst_n['mzRangeStop'].to_f
@@ -149,7 +144,12 @@ class MS::Parser::MzData::DOM
       MS::MSRun.add_parent_scan(scans, opts[:spectra])
     end
     msrun_obj.scans = scans
-    msrun_obj.scan_count = scan_count
+    msrun_obj.scan_count = scans.size
+    unless bioworks33  # we know the scan count is off here
+      if msrun_obj.scan_count != stated_num_scans
+        warn "num collected scans (#{scans.size}) does not agree with stated num scans (#{stated_num_scans})!"
+      end
+    end
     msrun_obj.start_time = msrun_obj.scans.first.time
     msrun_obj.end_time = msrun_obj.scans.last.time
   end

data/lib/ms/scan.rb CHANGED Viewed

@@ -28,7 +28,7 @@ class MS::Scan
     atts = %w(num ms_level time start_mz end_mz)
     display = atts.map do |att|
       if val = send(att.to_sym)
-        "@#{att}=#{val}"
+        "#{att}=#{val}"
       else
         nil
       end
@@ -38,9 +38,9 @@ class MS::Scan
       if spectrum
         spectrum.mz.size
       else
-        nil
+        'nil'
       end
-    "<MS::Scan:#{__id__} " + display.join(", ") + "@precursors=#{precursors.inspect}" + "@spectrum=size:#{spec_display}" + ">"
+    "<MS::Scan:#{__id__} " + display.join(", ") + " precursors=#{precursors.inspect}" + " spectrum(size)=#{spec_display}" + " >"
   end
   # returns the string (space delimited): "ms_level num time [prec_mz prec_inten]"

data/lib/mspire.rb CHANGED Viewed

@@ -1,4 +1,4 @@
 module Mspire
-  Version = '0.3.1'
+  Version = '0.3.9'
 end

data/lib/sample_enzyme.rb CHANGED Viewed

@@ -23,6 +23,7 @@ class SampleEnzyme
   # For other enzymes, you must set :cut, :no_cut, :name, and :sense
   # will yield the object if you want to set the values that way
   def initialize(name=nil)
+    @num_missed_cleavages_regex = nil
     @sense = nil
     @cut = nil
     @no_cut = nil
@@ -62,6 +63,44 @@ class SampleEnzyme
     self.new.from_pepxml_node(node)
   end
+  # takes an amino acid sequence (e.g., -.PEPTIDK.L)
+  # returns the number of missed cleavages
+  def num_missed_cleavages(aaseq)
+    raise NotImplementedError, 'need to implement for N terminal sense'  if sense == 'N'
+    @num_missed_cleavages_regex =
+      if @num_missed_cleavages_regex ; @num_missed_cleavages_regex
+      else
+        regex_string = "[#{@cut}]"
+        if @no_cut and @no_cut != ''
+          regex_string << "[^#{@no_cut}]"
+        end
+        /#{regex_string}/
+      end
+    arr = aaseq.scan(@num_missed_cleavages_regex)
+    num = arr.size
+    if aaseq[-1,1] =~ @num_missed_cleavages_regex
+      num -= 1
+    end
+    num
+  end
+  # requires full sequence (with heads and tails)
+  def num_tol_term(sequence)
+    raise NotImplementedError, 'need to implement for N terminal sense'  if sense == 'N'
+    no_cut = @no_cut || ''
+    num_tol = 0
+    first, middle, last = SpecID::Pep.split_sequence(sequence)
+    last_of_middle = middle[-1,1]
+    first_of_middle = middle[0,1]
+    if ( @cut.include?(first) && !no_cut.include?(first_of_middle) ) || first == '-'
+      num_tol += 1
+    end
+    if @cut.include?(last_of_middle) && !no_cut.include?(last) || last == '-'
+      num_tol += 1
+    end
+    num_tol
+  end
   # returns all peptides of missed cleavages <= 'missed_cleavages'
   # so 2 missed cleavages will return all no missed cleavage peptides
   # all 1 missed cleavages and all 2 missed cleavages.

data/lib/spec_id.rb CHANGED Viewed

@@ -7,6 +7,7 @@ require 'spec_id/bioworks'
 require 'spec_id/sequest'
 require 'spec_id/proph/prot_summary'
 require 'spec_id_xml'
+require 'spec_id/sqt'
 require 'spec_id/mass'
 require 'fasta'
@@ -71,6 +72,10 @@ module SpecID
       Proph::ProtSummary.new(file)
     when 'pepproph'
       Proph::PepSummary.new(file)
+    when 'sqg'
+      SQTGroup.new(file)
+    when 'sqt'
+      SQTGroup.new([file])
     else
       abort "UNRECOGNIZED file type for #{file}"
     end
@@ -447,6 +452,8 @@ module SpecID
   def self.file_type(file)
     if file =~ /\.srg$/
       return 'srg'
+    elsif file =~ /\.sqg$/
+      return 'sqg'
     end
     if IO.read(file, 7,438) == 'Enzyme:'
       return 'srf'
@@ -461,6 +468,17 @@ module SpecID
       elsif lines =~ /<msms_pipeline_analysis.*<peptideprophet_summary/m
         return 'pepproph'
       end
+      # assumes the header of a sqt file is less than 200 lines ...
+      200.times do
+        line = fh.gets
+        if line
+          lines << line
+        else ; break
+        end
+      end
+      if lines =~ /^H\tDatabase/ and lines =~ /^H\tSQTGenerator/
+        return 'sqt'
+      end
     end
   end

data/lib/spec_id/aa_freqs.rb CHANGED Viewed

@@ -3,30 +3,27 @@ require 'fasta'
 module SpecID ; end
 class SpecID::AAFreqs
-  # a fasta object
-  attr_accessor :fasta
   # hash by capital one-letter amino acid symbols giving the frequency of
   # seeing that amino acid.  Frequencies should add to 1.
   attr_accessor :aafreqs
   # fasta is fasta object!
   def initialize(fasta=nil)
-    @fasta = fasta
-    if @fasta
-      @aafreqs = calculate_frequencies(@fasta)
+    if fasta
+      @aafreqs = calculate_frequencies(fasta.prots)
     end
   end
-  # creates an aafreqs hash based on fasta object
-  def calculate_frequencies(fasta)
+  # takes an enumerable of objects responding to :aaseq and creates an aafreqs hash
+  def calculate_frequencies(objs)
     hash = {}
     total_aas = 0
     ('A'..'Z').each do |x|
       hash[x] = 0
     end
     hash['*'] = 0
-    fasta.prots.each do |prot|
-      aaseq = prot.aaseq
+    objs.each do |obj|
+      aaseq = obj.aaseq
       total_aas += aaseq.size
       aaseq.split('').each do |x|
         hash[x] += 1

data/lib/spec_id/digestor.rb CHANGED Viewed

@@ -100,38 +100,37 @@ class Digestor
   #   The prot_aaseq is used if the mass_hash contains the keys
   #   :add_C_term_protein or :add_N_term_protein
   #
+  #   mass_hash requires the key :h_plus or :h depending on h_plus option.
   #   prot_aaseqs is parallel to pep_aaseqs_ar where each is a group of
   #   peptides matching a protein aaseq
-  #   returns another parallel array of passing proteins
+  #   returns another parallel array of passing peptides per protein
   def limit_sizes(prot_aaseqs, pep_aaseqs_ar, min_mh, max_mh, mass_hash, h_plus=false)
     if mass_hash.key?(:add_C_term_protein) or mass_hash.key?(:add_N_term_protein)
       raise NotImplementedError, "need to add ability to change weights of peptides from the ends of proteins"
     else
       # figure out how much must be added to each peptide
       # include the h2o, the h, and N and C terminal static mods
-      h_key = h_plus ? :h_plus : :h
-      final_add = mass_hash[:h2o] + mass_hash[h_key]
+      h_plus_key = h_plus ? :h_plus : :h
+      extra_add = mass_hash[h_plus_key]
       [:add_N_term_peptide, :add_C_term_peptide].each do |sym|
         if mass_hash.key?(sym)
-          final_add += mass_hash[sym]
+          extra_add += mass_hash[sym]
         end
       end
-      hash_by_aa_string = {}
-      mass_hash.each {|k,v| hash_by_aa_string[k.to_s] = mass_hash[k] }
+      mc = Mass::Calculator.new(mass_hash, extra_add)
+      masses_per_group = pep_aaseqs_ar.map do  |pep_aaseqs|
+        mc.masses(pep_aaseqs)
+      end
-      pep_aaseqs_ar.map do  |pep_aaseqs|
-        pep_aaseqs.select do |aaseq|
-          sum = 0.0
-          aaseq.split('').each do |let|
-            if !hash_by_aa_string.key? let
-              puts 'NOT FOUND'
-              p let
-            end
-            sum += hash_by_aa_string[let]
+      masses_per_group.zip(pep_aaseqs_ar).map do |masses, aaseqs|
+        passing = []
+        aaseqs.zip(masses) do |aaseq, mh_plus|
+          if ( (mh_plus >= min_mh) and (mh_plus <= max_mh) )
+            passing << aaseq
           end
-          mh_plus = sum + final_add
-          ( (mh_plus >= min_mh) and (mh_plus <= max_mh) )
         end
+        passing
       end
     end
   end

data/lib/spec_id/mass.rb CHANGED Viewed

@@ -29,13 +29,13 @@ class Mass
     :U => 150.95364,   # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
     :X => 118.805716,  # the average of the mono masses of the 20 amino acids
     :* => 118.805716, # same as X
+    :Z => (129.04259 + 128.05858) / 2,  # average glutamic acid and glutamine
     # elements etc.
     :h => 1.00783,
     :h_plus => 1.00728,
     :o => 15.9949146,
     :h2o => 18.01056,
   }
   AVG = {
     :A => 71.0788,
@@ -64,6 +64,7 @@ class Mass
     :U => 150.03,   # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
     :X => 118.88603, # the average of the masses of the 20 amino acids
     :* => 118.88603, # same as X
+    :Z => (129.1155+ 128.1307) / 2,  # average glutamic acid and glutamine
     # elements etc.
     :h => 1.00794,
@@ -112,5 +113,66 @@ class Mass
     end
     copy_hash
   end
+  # returns an array of masses parallel to array passed in
+  # If you want the mass with H+, then pass in the mass as h_plus
+  # The mass hash must repond to
+  #   :h2o (water)
+  #   and at least the twenty amino acids (by string or symbol)
+  # The mass hash may respond to :add_N_term_peptide or :add_C_term_peptide
+  # in which case these will be added to the final mass
+  def self.masses(aaseqs, mass_hash=Mass::MONO, h_plus=0.0)
+    final_add = mass_hash[:h2o] + h_plus
+    [:add_N_term_peptide, :add_C_term_peptide].each do |sym|
+      if mass_hash.key?(sym)
+        final_add += mass_hash[sym]
+      end
+    end
+    hash_by_aa_string = {}
+    mass_hash.each {|k,v| hash_by_aa_string[k.to_s] = mass_hash[k] }
+    aaseqs.map do  |pep_aaseqs|
+      sum = 0.0
+      aaseq.split('').each do |let|
+        sum += hash_by_aa_string[let]
+      end
+      mh_plus = sum + final_add
+    end
+  end
+end
+class Mass::Calculator
+  # mass_hash must respond to :h2o or 'h2o'.  This is added to represent the
+  # tails of the peptide.  add_extra is outside of that (e.g., an H+)
+  def initialize(mass_hash, add_extra=0.0)
+    @mass_hash = mass_hash_to_s(mass_hash)
+    @final_add = @mass_hash['h2o'] + add_extra
+  end
+  def mass_hash_to_s(mass_hash)
+    new_hash = {}
+    mass_hash.each do |k,v|
+      new_hash[k.to_s] = v
+    end
+    new_hash
+  end
+  def masses(aaseqs)
+    aaseqs.map do |aaseq|
+      sum = @final_add  # <- add in the initialization
+      aaseq.split('').each do |let|
+        if @mass_hash.key? let
+          sum += @mass_hash[let]
+        else
+          abort "LETTER not found in mass_hash: #{let}"
+        end
+      end
+      sum
+    end
+  end
 end