RubyGems - mspire - Versions diffs - 0.1.7 → 0.2.0 - Mend

mspire 0.1.7 → 0.2.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (57)

data/Rakefile +41 -14
data/bin/bioworks2excel.rb +1 -1
data/bin/bioworks_to_pepxml.rb +46 -59
data/bin/fasta_shaker.rb +1 -1
data/bin/filter.rb +6 -0
data/bin/find_aa_freq.rb +23 -0
data/bin/id_precision.rb +3 -2
data/bin/mzxml_to_lmat.rb +2 -1
data/bin/pepproph_filter.rb +1 -1
data/bin/precision.rb +1 -1
data/bin/protein_summary.rb +2 -451
data/bin/raw_to_mzXML.rb +55 -0
data/bin/srf_group.rb +26 -0
data/changelog.txt +7 -0
data/lib/align.rb +3 -3
data/lib/fasta.rb +6 -1
data/lib/gi.rb +9 -4
data/lib/roc.rb +2 -0
data/lib/sample_enzyme.rb +2 -1
data/lib/spec/mzxml/parser.rb +2 -43
data/lib/spec/mzxml.rb +65 -2
data/lib/spec_id/aa_freqs.rb +10 -7
data/lib/spec_id/bioworks.rb +67 -87
data/lib/spec_id/filter.rb +794 -0
data/lib/spec_id/precision.rb +29 -36
data/lib/spec_id/proph.rb +5 -3
data/lib/spec_id/protein_summary.rb +459 -0
data/lib/spec_id/sequest.rb +323 -271
data/lib/spec_id/srf.rb +189 -135
data/lib/spec_id.rb +276 -227
data/lib/spec_id_xml.rb +101 -0
data/lib/toppred.rb +18 -0
data/script/degenerate_peptides.rb +47 -0
data/script/filter-peps.rb +5 -1
data/test/tc_align.rb +1 -1
data/test/tc_bioworks.rb +25 -22
data/test/tc_bioworks_to_pepxml.rb +37 -4
data/test/tc_fasta.rb +3 -1
data/test/tc_fasta_shaker.rb +8 -6
data/test/tc_filter.rb +203 -0
data/test/tc_gi.rb +6 -9
data/test/tc_id_precision.rb +31 -0
data/test/tc_mzxml.rb +8 -6
data/test/tc_peptide_parent_times.rb +2 -1
data/test/tc_precision.rb +1 -1
data/test/tc_proph.rb +5 -5
data/test/tc_protein_summary.rb +36 -13
data/test/tc_sequest.rb +78 -33
data/test/tc_spec_id.rb +128 -6
data/test/tc_srf.rb +84 -38
metadata +67 -62
data/bin/fasta_cat.rb +0 -39
data/bin/fasta_cat_mod.rb +0 -59
data/bin/fasta_mod.rb +0 -57
data/bin/filter_spec_id.rb +0 -365
data/bin/raw2mzXML.rb +0 -21
data/script/gen_database_searching.rb +0 -258

data/lib/spec/mzxml/parser.rb CHANGED Viewed

@@ -285,47 +285,6 @@ class Spec::MzXML::Parser
     # in progress
   end
-  # first, converts backslash to forward slash in filename.
-  # if .mzXML returns the filename
-  # if .raw or .RAW converts the file to .mZXML and returns mzXML filename
-  # if no recognized extension, looks for .mzXML file, then .RAW file (and
-  # converts)
-  # aborts if file was not able to be converted
-  def file_to_mzxml(file)
-    file.gsub!("\\",'/')
-    old_file = file.dup
-    if file =~ /\.mzXML$/
-      return file
-    elsif file =~ /(\.RAW)|(\.raw)$/
-      old_file = file.dup
-      ## t2x outputs in cwd (so go to the directory of the file!)
-      dir = File.dirname(file)
-      basename = File.basename(file)
-      Dir.chdir(dir) do
-        cmd = "#{Spec::MzXML::MZXML_CONVERTER} #{basename}"
-        puts cmd
-        puts `#{cmd}`
-      end
-      file.sub!(/\.RAW$/, '.mzXML')
-      file.sub!(/\.raw$/, '.mzXML')
-      unless File.exist? file
-        abort "Couldn't convert #{old_file} to #{file}"
-      end
-      return file
-    else
-      if File.exist?( file + '.mzXML' )
-        return file_to_mzxml(file + '.mzXML')
-      elsif File.exist?( file + '.RAW' )
-        return file_to_mzxml(file + '.RAW')
-      elsif File.exist?( file + '.raw' )
-        return file_to_mzxml(file + '.raw')
-      else
-        return nil
-      end
-    end
-  end
   def get_prec_mz_by_scan_for_time_index(file)
     index = Spec::MSRunIndex.new(file)
     prec_mz_by_scan = index.scans_by_num.collect do |scan|
@@ -356,7 +315,7 @@ class Spec::MzXML::Parser
       return get_prec_mz_by_scan_for_time_index(file)
     end
-    file = file_to_mzxml(file)
+    file = Spec::MzXML.file_to_mzxml(file)
     unless parse_type then parse_type = default_parser end
     case parse_type
@@ -386,7 +345,7 @@ class Spec::MzXML::Parser
   #   startMz         start_mz
   #   endMz           end_mz
   def basic_info(mzxml_file)
-    puts "parsing: #{mzxml_file} #{File.exist?(mzxml_file)}"
+    puts "parsing: #{mzxml_file} #{File.exist?(mzxml_file)}" if $VERBOSE
     hash = {}
     scan_count_tmp = []
     (1..5).to_a.each do |n| scan_count_tmp[n] = 0 end

data/lib/spec/mzxml.rb CHANGED Viewed

@@ -4,14 +4,62 @@ require 'base64'
 module Spec; end
 module Spec::MzXML
-  MZXML_CONVERTER = 't2x'
+  Potential_mzxml_converters = %w(readw.exe readw t2x)
   # takes PT2.7500000S and returns it as 2.700000 (no PT or S)
   def strip_time(time)
     return time[2...-1]
   end
+   # first, converts backslash to forward slash in filename.
+  # if .mzXML returns the filename
+  # if .raw or .RAW converts the file to .mZXML and returns mzXML filename
+  # if no recognized extension, looks for .mzXML file, then .RAW file (and
+  # converts)
+  # aborts if file was not able to be converted
+  # returns nil if a file that can be converted or used was not found
+  def self.file_to_mzxml(file)
+    file.gsub!("\\",'/')
+    old_file = file.dup
+    if file =~ /\.mzXML$/
+      return file
+    elsif file =~ /\.RAW$/i
+      old_file = file.dup
+      ## t2x outputs in cwd (so go to the directory of the file!)
+      dir = File.dirname(file)
+      basename = File.basename(file)
+      converter = Spec::MzXML.find_mzxml_converter
+      Dir.chdir(dir) do
+        if converter =~ /readw/
+          cmd = "#{converter} #{basename} c #{basename.sub(/\.RAW$/i, '.mzXML')}"
+        else
+          cmd = "#{converter} #{basename}"
+        end
+        #puts cmd
+        #puts `#{cmd}`
+        reply = `#{cmd}`
+        puts reply if $VERBOSE
+      end
+      file.sub!(/\.RAW$/i, '.mzXML')
+      unless File.exist? file
+        abort "Couldn't convert #{old_file} to #{file}"
+      end
+      return file
+    else
+      if File.exist?( file + '.mzXML' )
+        return file_to_mzxml(file + '.mzXML')
+      elsif File.exist?( file + '.RAW' )
+        return file_to_mzxml(file + '.RAW')
+      elsif File.exist?( file + '.raw' )
+        return file_to_mzxml(file + '.raw')
+      else
+        return nil
+      end
+    end
+  end
   # takes a base64 peaks string and returns an array of [m/z,intens] doublets
   # mzXML as network ordered
@@ -42,5 +90,20 @@ module Spec::MzXML
     b64d.unpack(unpack_code)
   end
+  # Searchs each path element and returns the first one it finds
+  # returns nil if none found
+  def self.find_mzxml_converter
+    ENV['PATH'].split(/[:;]/).each do |path|
+      Dir.chdir(path) do
+        Potential_mzxml_converters.each do |pc|
+          if File.exist? pc
+            return File.join(path, pc)
+          end
+        end
+      end
+    end
+    nil
+  end
 end

data/lib/spec_id/aa_freqs.rb CHANGED Viewed

@@ -1,5 +1,7 @@
 require 'fasta'
+module SpecID ; end
 class SpecID::AAFreqs
   # a fasta object
   attr_accessor :fasta
@@ -77,16 +79,17 @@ class SpecID::AAFreqs
   end
   # pep_objs respond to sequence?
+  # also takes a hash of peptides keyed on :aaseq
   def actual_and_expected_number_containing_cysteines(pep_objs, cyst_freq)
-    @aafreqs ||= {}
-    @aafreqs[:C] = cyst_freq
-    seqs = pep_objs.map do |v|
-      if v.sequence =~ /\.([\w\*]+)\./
-        $1
-      else
-        abort v.sequence.to_s + " could not be matched!"
+    if pep_objs.is_a? Hash
+      seqs = pep_objs.keys
+    else
+      seqs = pep_objs.map do |v|
+        v.aaseq
       end
     end
+    @aafreqs ||= {}
+    @aafreqs[:C] = cyst_freq
     actual_and_expected_number(seqs, :C, 1)
   end

data/lib/spec_id/bioworks.rb CHANGED Viewed

@@ -6,13 +6,17 @@ require 'spec_id'
 require 'zlib'
 require 'hash_by'
 require 'set_from_hash'
+require 'array_class'
 ## have to pre-declare some guys
-class SpecID; end
-class SpecID::Prot; end
+module SpecID; end
+module SpecID::Prot; end
+module SpecID::Pep; end
 module SpecIDXML; end
-class SpecID::Bioworks
+class Bioworks
+  include SpecID
   # Regular expressions
   @@bioworksinfo_re = /<bioworksinfo>(.*)<\/bioworksinfo>/o
   @@modifications_re = /<modifications>(.*)<\/modifications>/o
@@ -21,10 +25,9 @@ class SpecID::Bioworks
   @@origfilepath_re = /<origfilepath>(.*)<\/origfilepath>/o
-  attr_accessor :prots, :version, :global_filename, :origfilename, :origfilepath
+  attr_accessor :peps, :prots, :version, :global_filename, :origfilename, :origfilepath
   # a string of modifications e.g., "(M* +15.99491) (S@ +14.9322) "
   attr_accessor :modifications
-  attr_writer :peps
   def hi_prob_best ; false end
@@ -127,6 +130,7 @@ class SpecID::Bioworks
   # note that each pep will contain its original prot it belongs to, even
   # though the parallel protein actually represents the proteins it belongs
   # to.
+  # assumes that each peptide points to all its proteins in pep.prots
   def _uniq_peps_by_sequence_charge(peps)
     new_arr = []
     prot_arr = []
@@ -134,11 +138,11 @@ class SpecID::Bioworks
     (0...peps.size).each do |i|
       next if index_accounted_for.include?(i)
       new_arr << peps[i]
-      prot_arr.push( [peps[i].prot] )
+      prot_arr.push( peps[i].prots )
       ((i+1)...peps.size).each do |j|
         pep1, pep2 = peps[i], peps[j]
         if pep1.sequence == pep2.sequence && pep1.charge == pep2.charge
-          prot_arr.last.push pep2.prot
+          prot_arr.last.push( *(pep2.prots) )
           index_accounted_for << j
         end
       end
@@ -149,13 +153,14 @@ class SpecID::Bioworks
   def initialize(file=nil)
     @peps = nil
     if file
+      @filename = file
       parse_xml(file)
       #parse_xml_by_xmlparser(file)
     end
   end
   def parse_xml_by_xmlparser(file)
-    parser = SpecID::Bioworks::XMLParser.new
+    parser = Bioworks::XMLParser.new
     File.open(file) do |fh|
       #3.times do fh.gets end  ## TEMPFIX
       parser.parse(fh)
@@ -165,23 +170,6 @@ class SpecID::Bioworks
     @prots = parser.prots
   end
-  # Returns the list of all peptide hits.  A given sequence/charge or scan
-  # may be redundant!
-  def peps
-    if @peps
-      return @peps
-    else
-      @peps = []
-      prots.each do |prot|
-        prot.peps.each do |pep|
-          @peps << pep
-        end
-      end
-      return @peps
-    end
-  end
   # This is highly specific to Bioworks 3.2 xml export.  In other words,
   # unless the newlines, etc. are duplicated, this parser will fail! Not
   # robust, but it is faster than xmlparser (which is based on the speedy
@@ -200,21 +188,23 @@ class SpecID::Bioworks
     end
     @version = get_regex_val(fh, @@bioworksinfo_re)
     @modifications = get_regex_val(fh, @@modifications_re)
-    @prots = get_prots(fh, self)
+    @prots, @peps = get_prots_from_xml_stream(fh)
     fh.close
   end
-  def get_prots(fh, bioworks)
+  ## returns proteins and peptides
+  def get_prots_from_xml_stream(fh)
+    uniq_pephit_hash = {}
     prots = []
     while line = fh.gets
       if line =~ @@protein_re
-        prot =  SpecID::Bioworks::Prot.new
+        prot =  Bioworks::Prot.new
         prot.bioworks = self
-        prot.set_from_xml_stream(fh, bioworks)
+        prot.set_from_xml_stream(fh, uniq_pephit_hash)
         prots << prot
       end
     end
-    prots
+    [prots, uniq_pephit_hash.values]
   end
   # gets the regex and stops (and rewinds if it hits a protein)
@@ -246,7 +236,7 @@ end
 # Implements fast parsing via XMLParser (wrapper around Expat)
 # It is actually slower (about %25 slower) than regular expression parsing
-class SpecID::Bioworks::XMLParser < XMLParser
+class Bioworks::XMLParser < XMLParser
   @@at = '@'
   attr_accessor :prots
@@ -262,18 +252,18 @@ class SpecID::Bioworks::XMLParser < XMLParser
     case name
     when "peptide"
       curr_prot = @current_obj
-      if @current_obj.class == SpecID::Bioworks::Prot
+      if @current_obj.class == Bioworks::Prot
         @current_obj.set_from_xml_hash_xmlparser(@current_hash)
       else
         curr_prot = @current_obj.prot  ## unless previous was a peptide
       end
-      peptide = SpecID::Bioworks::Pep.new
+      peptide = Bioworks::Pep.new
       peptide.prot = curr_prot
       curr_prot.peps << peptide
       @current_obj = peptide
       @current_hash = {}
     when "protein"
-      @current_obj = SpecID::Bioworks::Prot.new
+      @current_obj = Bioworks::Prot.new
       @current_hash = {}
       @prots << @current_obj
     else
@@ -297,13 +287,14 @@ class SpecID::Bioworks::XMLParser < XMLParser
 end
-module SpecID::Bioworks::XML
+module Bioworks::XML
   # The regular expression to grab attributes from the bioworks xml format
   @@att_re = /<([\w]+)>(.*)<\/[\w]+>/o
 end
-class SpecID::Bioworks::Prot < SpecID::Prot
-  include SpecID::Bioworks::XML
+class Bioworks::Prot
+  include SpecID::Prot
+  include Bioworks::XML
   @@end_prot_re = /<\/protein>/o
   @@pep_re = /<peptide>/o
@@ -323,15 +314,32 @@ class SpecID::Bioworks::Prot < SpecID::Prot
     end
   end
-  def set_from_xml_stream(fh, bioworks)
+  def set_from_xml_stream(fh, uniq_pephit_hash)
     hash = {}
+    @peps = []
     while line = fh.gets
       if line =~ @@att_re
         hash[$1] = $2
       elsif line =~ @@pep_re
-        pep = SpecID::Bioworks::Pep.new.set_from_xml_stream(fh, self)
-        pep.prot = self
+        ## Could do a look ahead to grab the file and sequence to check
+        ## uniqueness to increase speed here.
+        pep = Bioworks::Pep.new.set_from_xml_stream(fh)
+        # normal search results files have a global filename
+        # while multi-consensus do not
+        pep[12] ||= bioworks.global_filename
+        ## figure out uniqueness
+        ky = [pep.base_name, pep.first_scan, pep.charge, pep.sequence]
+        if uniq_pephit_hash.key? ky
+          pep = uniq_pephit_hash[ky]
+        else
+          ## insert the new protein
+          pep.prots = []
+          uniq_pephit_hash[ky] = pep
+        end
+        pep.prots << self
         @peps << pep
       elsif line =~ @@end_prot_re
         set_from_xml_hash(hash)
         break
@@ -367,9 +375,12 @@ class SpecID::Bioworks::Prot < SpecID::Prot
   end
 end
+Bioworks::Pep = ArrayClass.new( %w(sequence mass deltamass charge xcorr deltacn sp rsp ions count tic prots base_name first_scan last_scan peptide_probability file _num_prots _first_prot aaseq) )
+# 0=sequence 1=mass 2=deltamass 3=charge 4=xcorr 5=deltacn 6=sp 7=rsp 8=ions 9=count 10=tic 11=prots 12=base_name 13=first_scan 14=last_scan 15=peptide_probability 16=file 17=_num_prots 18=_first_prot 19=aaseq
-class SpecID::Bioworks::Pep < Array
-  include SpecID::Bioworks::XML
+class Bioworks::Pep
+  include SpecID::Pep
+  include Bioworks::XML
   include SpecIDXML
   @@file_split_first_re = /, /o
@@ -380,53 +391,18 @@ class SpecID::Bioworks::Pep < Array
   @@file_mult_scan_re = /(.*), (\d+) - (\d+)/o
   ## NOTE! the mass is really the theoretical MH+!!!!
   ## NOTE! ALL values stored as strings, except peptide_probability!
-  ind_keys = {} ; ind_keys_w_eq = {}; @@ind = {}
-  ind_keys = {:sequence => 0, :mass => 1, :deltamass => 2, :charge => 3, :xcorr => 4, :deltacn => 5, :sp => 6, :rsp => 7, :ions => 8, :count => 9, :tic => 10, :prot => 11, :base_name => 12, :first_scan => 13, :last_scan => 14, :peptide_probability => 15, :file => 16, :_num_prots => 17, :_first_prot => 18 }
-  def sequence ; self[0] end ; def sequence=(oth) ; self[0] = oth end
-  def mass ; self[1] end ; def mass=(oth) ; self[1] = oth end
-  def deltamass ; self[2] end ; def deltamass=(oth) ; self[2] = oth end
-  def charge ; self[3] end ; def charge=(oth) ; self[3] = oth end
-  def xcorr ; self[4] end ; def xcorr=(oth) ; self[4] = oth end
-  def deltacn ; self[5] end ; def deltacn=(oth) ; self[5] = oth end
-  def sp ; self[6] end ; def sp=(oth) ; self[6] = oth end
-  def rsp ; self[7] end ; def rsp=(oth) ; self[7] = oth end
-  def ions ; self[8] end ; def ions=(oth) ; self[8] = oth end
-  def count ; self[9] end ; def count=(oth) ; self[9] = oth end
-  def tic ; self[10] end ; def tic=(oth) ; self[10] = oth end
-  def prot ; self[11] end ; def prot=(oth) ; self[11] = oth end
-  def base_name ; self[12] end ; def base_name=(oth) ; self[12] = oth end
-  def first_scan ; self[13] end ; def first_scan=(oth) ; self[13] = oth end
-  def last_scan ; self[14] end ; def last_scan=(oth) ; self[14] = oth end
-  def peptide_probability ; self[15] end ; def peptide_probability=(oth) ; self[15] = oth end
-  def file ; self[16] end   # we define a writer below
-  def _num_prots ; self[17] end ; def _num_prots=(oth) ; self[17] = oth end
-  def _first_prot ; self[18] end ; def _first_prot=(oth) ; self[18] = oth end
   ## other accessors:
   def probability ; self[15] end
+  def mh ; self[1] end
-  #ind_keys.keys do |k|
-  #  self.module_eval( "def #{k} ; self[#{ind_keys[k]}] end ; def #{k}=(oth) ; self[#{ind_keys[k]} = oth end ", __FILE__, __LINE__ )
-  #end
-  @@arr_size = ind_keys.size
-  ind_keys.each {|k,v| ind_keys_w_eq["#{k}=".to_sym] = v }
-  ind_keys.merge!(ind_keys_w_eq)
-  ind_keys.each {|k,v| @@ind[k] = v ; @@ind["#{k}"] = v}
-  def initialize(args=nil)
-    super(@@arr_size.size)
-    if args
-      if args.is_a? Hash
-        args.each do |k,v|
-          self[@@ind[k]] = v
-        end
-      end
-    end
+  # This is not a true ppm since it should be divided by the actual mh instead
+  # of the theoretical (but it is as close as we can get for this object)
+  def ppm
+    1.0e6 * (self[2].abs/self[1])
+    #1.0e6 * (self.deltamass.abs/self.mh)
   end
   # returns array of values of the attributes given (as symbols)
   def get(*args)
     args.collect do |arg|
@@ -463,25 +439,30 @@ class SpecID::Bioworks::Pep < Array
     [base_name, first_scan, last_scan]
   end
+  tmp_verb = $VERBOSE
+  $VERBOSE = nil
   def file=(arg)
     ## Set these vals by index:
     #puts "AERRG: #{arg}"
     self[16] = arg
     self[12,3] = self.class.extract_file_info(arg)
   end
+  $VERBOSE = tmp_verb
   def inspect
-    "<SpecID::Bioworks::Pep sequence: #{sequence}, mass: #{mass}, deltamass: #{deltamass}, charge: #{charge}, xcorr: #{xcorr}, deltacn: #{deltacn} prot: #{prot} base_name: #{base_name} first_scan: #{first_scan} last_scan: #{last_scan} file: #{file} peptide_probability: #{peptide_probability}>"
+    "<Bioworks::Pep sequence: #{sequence}, mass: #{mass}, deltamass: #{deltamass}, charge: #{charge}, xcorr: #{xcorr}, deltacn: #{deltacn}, prots(count):#{prots.size}, base_name: #{base_name}, first_scan: #{first_scan}, last_scan: #{last_scan}, file: #{file}, peptide_probability: #{peptide_probability}, aaseq:#{aaseq}>"
   end
   def set_from_hash(hash)
     self[0,11] = [hash["sequence"], hash["mass"], hash["deltamass"], hash["charge"], hash["xcorr"], hash["deltacn"], hash["sp"], hash["rsp"], hash["ions"], hash["count"], hash["tic"]]
     self.file = hash["file"]
     self[15] = hash["peptide_probability"].to_f
+    self[19] = SpecID::Pep.sequence_to_aaseq(self[0])  ## aaseq
   end
-  def set_from_xml_stream(fh, prot)
-    self[11] = prot
+  def set_from_xml_stream(fh)
     hash = {}
     while line = fh.gets
       if line =~ @@att_re
@@ -491,7 +472,6 @@ class SpecID::Bioworks::Pep < Array
       elsif line =~ @@end_pep_re
         set_from_hash(hash)
         #puts "SELF[12]: #{self[12]}"
-        unless self[12] then self[12] = prot.bioworks.global_filename end
         #puts "SELF[12]: #{self[12]}"
         break
       else