RubyGems - mspire - Versions diffs - 0.1.7 → 0.2.0 - Mend

mspire 0.1.7 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

data/Rakefile +41 -14
data/bin/bioworks2excel.rb +1 -1
data/bin/bioworks_to_pepxml.rb +46 -59
data/bin/fasta_shaker.rb +1 -1
data/bin/filter.rb +6 -0
data/bin/find_aa_freq.rb +23 -0
data/bin/id_precision.rb +3 -2
data/bin/mzxml_to_lmat.rb +2 -1
data/bin/pepproph_filter.rb +1 -1
data/bin/precision.rb +1 -1
data/bin/protein_summary.rb +2 -451
data/bin/raw_to_mzXML.rb +55 -0
data/bin/srf_group.rb +26 -0
data/changelog.txt +7 -0
data/lib/align.rb +3 -3
data/lib/fasta.rb +6 -1
data/lib/gi.rb +9 -4
data/lib/roc.rb +2 -0
data/lib/sample_enzyme.rb +2 -1
data/lib/spec/mzxml/parser.rb +2 -43
data/lib/spec/mzxml.rb +65 -2
data/lib/spec_id/aa_freqs.rb +10 -7
data/lib/spec_id/bioworks.rb +67 -87
data/lib/spec_id/filter.rb +794 -0
data/lib/spec_id/precision.rb +29 -36
data/lib/spec_id/proph.rb +5 -3
data/lib/spec_id/protein_summary.rb +459 -0
data/lib/spec_id/sequest.rb +323 -271
data/lib/spec_id/srf.rb +189 -135
data/lib/spec_id.rb +276 -227
data/lib/spec_id_xml.rb +101 -0
data/lib/toppred.rb +18 -0
data/script/degenerate_peptides.rb +47 -0
data/script/filter-peps.rb +5 -1
data/test/tc_align.rb +1 -1
data/test/tc_bioworks.rb +25 -22
data/test/tc_bioworks_to_pepxml.rb +37 -4
data/test/tc_fasta.rb +3 -1
data/test/tc_fasta_shaker.rb +8 -6
data/test/tc_filter.rb +203 -0
data/test/tc_gi.rb +6 -9
data/test/tc_id_precision.rb +31 -0
data/test/tc_mzxml.rb +8 -6
data/test/tc_peptide_parent_times.rb +2 -1
data/test/tc_precision.rb +1 -1
data/test/tc_proph.rb +5 -5
data/test/tc_protein_summary.rb +36 -13
data/test/tc_sequest.rb +78 -33
data/test/tc_spec_id.rb +128 -6
data/test/tc_srf.rb +84 -38
metadata +67 -62
data/bin/fasta_cat.rb +0 -39
data/bin/fasta_cat_mod.rb +0 -59
data/bin/fasta_mod.rb +0 -57
data/bin/filter_spec_id.rb +0 -365
data/bin/raw2mzXML.rb +0 -21
data/script/gen_database_searching.rb +0 -258

data/lib/spec_id/srf.rb CHANGED Viewed

@@ -73,11 +73,57 @@ class DTALog
   end
 end
+class SRFGroup
+  include SpecID
+  ## the srf objects themselves
+  attr_accessor :srfs, :filenames
+  ## also inherits :peps and :prots accessor
+  # takes an array of filenames
+  # or a single .srg filename
+  # see from_srg to load a single .srg file
+  def initialize(filenames=nil)
+    @filenames = filenames
+    @peps = []
+    @prots = []
+    @global_ref_hash = {}
+    @srfs = []
+    if filenames
+      if filenames.is_a?(String) && filenames =~ /\.srg$/
+        srg_filename = filenames.dup
+        @filename = srg_filename
+        filenames = IO.readlines(filenames).grep(/\w/).map {|v| v.chomp }
+        filenames.each do |file|
+          if !File.exist? file
+            puts "File: #{file} in #{srg_filename} does not exist!"
+            puts "Please modify #{srg_filename} to point to existing files."
+            abort
+          end
+        end
+      end
+      filenames.each do |file|
+        @srfs << SRF.new(file, @peps, @global_ref_hash)
+      end
+    end
+  end
+  # returns the filename used
+  def to_srg(srg_filename='bioworks.srg')
+    File.open(srg_filename, 'w') do |v|
+      @filenames.each do |srf_file|
+        v.puts File.expand_path(srf_file)
+      end
+    end
+    srg_filename
+  end
+end
 class SRF
   # a string 3.3 or 3.2
   attr_accessor :version
   attr_accessor :header
   attr_accessor :dta_files
   attr_accessor :out_files
@@ -85,6 +131,7 @@ class SRF
   # a parallel array to dta_files and out_files where each entry is:
   # [first_scan, last_scan, charge]
   attr_accessor :index
+  attr_accessor :base_name
   def dta_start_byte
     case @version
@@ -93,26 +140,44 @@ class SRF
     end
   end
-  def initialize(filename=nil)
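+  # peps and global_ref_hash are accumulators shared by the caller:
+  #   peps            - array that receives every peptide hit read from this file
+  #   global_ref_hash - protein reference string => protein object, so a protein
+  #                     already seen is reused rather than created again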
+  def initialize(filename=nil, peps=[], global_ref_hash={})
     @dta_files = []
     @out_files = []
     if filename
-      from_file(filename)
+      from_file(filename, peps, global_ref_hash)
     end
   end
   # returns self
-  def from_file(filename)
+  def from_file(filename, peps, global_ref_hash)
     File.open(filename, "rb") do |fh|
       @header = SRF::Header.new.from_handle(fh)
       @version = @header.version
-      @dta_files = read_dta_files(fh,@header.num_dta_files)
-      @out_files = read_out_files(fh,@header.num_dta_files)
-      @params = SpecID::Sequest::Params.new.parse_handle(fh)
+      @dta_files, measured_mhs = read_dta_files(fh,@header.num_dta_files)
+      @out_files = read_out_files(fh,@header.num_dta_files, global_ref_hash, measured_mhs)
+      @params = Sequest::Params.new.parse_handle(fh)
       fh.read(12)  ## gap between last params entry and index
       @index = read_scan_index(fh,@header.num_dta_files)
     end
+    ### UPDATE SOME THINGS ON SINGLE PASS:
+    @base_name = @header.raw_filename.scan(/[\\\/]([^\\\/]+)\.RAW$/).first.first
+    # give each hit a base_name, first_scan, last_scan
+    @index.each_with_index do |ind,i|
+      mass_measured = @dta_files[i][0]
+      #puts @out_files[i].join(", ")
+      pep_hits = @out_files[i][3]
+      peps.push( *pep_hits )
+      pep_hits.each do |pep_hit|
+        pep_hit[13,3] = @base_name, *ind
+        # add the deltamass
+        pep_hit[10] = pep_hit[0] - mass_measured  # real - measured (deltamass)
+        pep_hit[11] = 1.0e6 * pep_hit[10].abs / mass_measured ## ppm
+        pep_hit[17] = self  ## link with the srf object
+      end
+    end
     self
   end
@@ -143,26 +208,26 @@ class SRF
   # returns [dta_files, measured_mhs]: the array of dta_files and a parallel
   # array holding the mh (dta_file[0]) of each one
   def read_dta_files(fh, num_files)
+    measured_mhs = Array.new(num_files) ## A parallel array to capture the actual mh
     dta_files = Array.new(num_files)
     start = dta_start_byte
     unless fh.pos == start
       fh.pos = start
     end
     header.num_dta_files.times do |i|
-      dta_files[i] = SRF::DTA.new.from_handle(fh)
+      dta_file = SRF::DTA.new.from_handle(fh)
+      measured_mhs[i] = dta_file[0]
+      dta_files[i] = dta_file
     end
-    dta_files
+    [dta_files, measured_mhs]
   end
   # filehandle (fh) must be at the start of the outfiles.  'read_dta_files'
   # will put the fh there.
-  def read_out_files(fh,number_files)
+  def read_out_files(fh,number_files, global_ref_hash, measured_mhs)
     out_files = Array.new(number_files)
     header.num_dta_files.times do |i|
-      #if i == header.num_dta_files - 2
-      #  abort
-      #end
-      out_files[i] = SRF::OUT.new.from_handle(fh)
+      out_files[i] = SRF::OUT.new.from_handle(fh, global_ref_hash)
     end
     out_files
   end
@@ -261,45 +326,17 @@ class SRF::DTAGen
   end
 end
-class SRF::DTA < Array
+SRF::DTA = ArrayClass.new(%w(mh dta_tic num_peaks charge ms_level unknown total_num_possible_charge_states peaks))
+class SRF::DTA
+  Unpack = "EeIvvvv"
   # is this universal?
   First_record_start_byte = 3644
-  ## mucky details.  This should be encapsulated into a class to inherit from, etc.
-  ind_keys = {} ; ind_keys_w_eq = {}; @@ind = {}
-  ind_keys = {:mh => 0, :dta_tic => 1, :num_peaks => 2, :charge => 3, :ms_level => 4, :unknown => 5, :total_num_possible_charge_states => 6, :peaks => 7}
-  @@arr_size = ind_keys.size
-  def mh ; self[0] end ; def mh=(oth) ; self[0] = oth end
-  def dta_tic ; self[1] end ; def dta_tic=(oth) ; self[1] = oth end
-  def num_peaks ; self[2] end ; def num_peaks=(oth) ; self[2] = oth end
-  def charge ; self[3] end ; def charge=(oth) ; self[3] = oth end
-  def ms_level ; self[4] end ; def ms_level=(oth) ; self[4] = oth end
-  def unknown ; self[5] end ; def unknown=(oth) ; self[5] = oth end
-  def total_num_possible_charge_states ; self[6] end ; def total_num_possible_charge_states=(oth) ; self[6] = oth end
+  # note on peaks (self[7])
   # this is a byte array of floats, you can get the peaks out with
   # unpack("e*")
-  def peaks ; self[7] end
-  # this is a byte array of floats, you can get the peaks out with
-  def peaks=(oth) ; self[7] = oth end
-  @@arr_size = ind_keys.size
-  ind_keys.each {|k,v| ind_keys_w_eq["#{k}=".to_sym] = v }
-  ind_keys.merge!(ind_keys_w_eq)
-  ind_keys.each {|k,v| @@ind[k] = v ; @@ind["#{k}"] = v}
-  def initialize(args=nil)
-    super(@@arr_size.size)
-    if args
-      if args.is_a? Hash
-        args.each do |k,v|
-          self[@@ind[k]] = v
-        end
-      end
-    end
-  end
   def inspect
     peaks_st = 'nil'
@@ -310,7 +347,7 @@ class SRF::DTA < Array
   def from_handle(fh)
     st = fh.read(24)
     # get the bulk of the data in single unpack
-    self[0,7] = st.unpack("EeIvvvv")
+    self[0,7] = st.unpack(Unpack)
     # Scan numbers possibly hidden in this next sequence of bytes (I think)
     st2 = fh.read(24)
@@ -323,112 +360,99 @@ class SRF::DTA < Array
 end
+SRF::OUT = ArrayClass.new( %w(num_hits computer date_time hits) )
+# 0=num_hits 1=computer 2=date_time 3=hits
-class SRF::OUT < Array
-  ## mucky details.  This should be encapsulated into a class to inherit from, etc.
-  ind_keys = {} ; ind_keys_w_eq = {}; @@ind = {}
-  ind_keys = {:num_hits => 0, :charge => 1, :computer => 2, :date_time => 3, :first_scan => 4, :last_scan => 5, :filename_noext => 6, :hits => 7}
-  @@arr_size = ind_keys.size
-  def num_hits ; self[0] end ; def num_hits=(oth) ; self[0] = oth end
-  def charge ; self[1] end ; def charge=(oth) ; self[1] = oth end
-  def computer ; self[2] end ; def computer=(oth) ; self[2] = oth end
-  def date_time ; self[3] end ; def date_time=(oth) ; self[3] = oth end
-  def first_scan ; self[4] end ; def first_scan=(oth) ; self[4] = oth end
-  def last_scan ; self[5] end ; def last_scan=(oth) ; self[5] = oth end
-  def filename_noext ; self[6] end ; def filename_noext=(oth) ; self[6] = oth end
-  def hits ; self[7] end ; def hits=(oth) ; self[7] = oth end
-  @@arr_size = ind_keys.size
-  ind_keys.each {|k,v| ind_keys_w_eq["#{k}=".to_sym] = v }
-  ind_keys.merge!(ind_keys_w_eq)
-  ind_keys.each {|k,v| @@ind[k] = v ; @@ind["#{k}"] = v}
-  def initialize(args=nil)
-    super(@@arr_size.size)
-    if args
-      if args.is_a? Hash
-        args.each do |k,v|
-          self[@@ind[k]] = v
-        end
-      end
-    end
-  end
+class SRF::OUT
+  Unpack = '@36vx2Z*@60Z*'
   def inspect
     if first_scan
       ins = "@first_scan=#{first_scan}, @last_scan=#{last_scan}, @filename_noext=#{filename_noext}, "
     end
-    "<SRF::OUT  @num_hits=#{num_hits}, @charge=#{charge}, @computer=#{computer}, @date_time=#{date_time}, #{ins}@hits=#{hits.inspect}>"
+    "<SRF::OUT  @num_hits=#{num_hits}, @computer=#{computer}, @date_time=#{date_time}, #{ins}@hits=#{hits.inspect}>"
   end
-  def from_handle(fh)
+  def from_handle(fh, global_ref_hash)
     ## EMPTY out file is 96 bytes
     ## each hit is 320 bytes
     ## num_hits, computer and date_time (the two bytes after num_hits are skipped):
     st = fh.read(96)
-    self[0,4] = st.unpack("@36vvZ*@60Z*")
+    self[0,3] = st.unpack(Unpack)
     num_hits = self[0]
     ar = Array.new(num_hits)
-    num_hits.times do |i|
-      ar[i] = SRF::OUT::Hit.new.from_handle(fh)
+    if ar.size > 0
+      num_hits.times do |i|
+        ar[i] = SRF::OUT::Pep.new.from_handle(fh, global_ref_hash)
+      end
+      ## The xcorrs are already ordered by best to worst hit
+      ## ADJUST the deltacn's to be meaningful for the top hit:
+      ## (the same as bioworks and prophet)
+      (1...ar.size).each {|i| ar[i-1].deltacn = ar[i].deltacn }
+      ar.last.deltacn = 1.1
     end
-    self[7] = ar
+    self[3] = ar
     self
   end
 end
-class SRF::OUT::Hit < Array
+# deltacn is modified to be that of the next best hit (by xcorr).
+# if there is no next best hit, then it will be 1.1 (like bioworks)
+# mh is the theoretical mass + h
+# prots are created as SRF prot objects with a reference and linked to their
+# peptides (from global hash by reference)
+# ppm = 10^6 * |∆m_accuracy| / mass_measured  [ where ∆m_accuracy = mass_real – mass_measured ]
+# This is calculated for the M+H mass!
+# srf = the srf object this scan came from
+SRF::OUT::Pep = ArrayClass.new(%w( mh deltacn sp xcorr id rsp ions_matched ions_total sequence prots deltamass ppm aaseq base_name first_scan last_scan charge srf) )
+# 0=mh 1=deltacn 2=sp 3=xcorr 4=id 5=rsp 6=ions_matched 7=ions_total 8=sequence 9=prots 10=deltamass 11=ppm 12=aaseq 13=base_name 14=first_scan 15=last_scan 16=charge 17=srf
+class SRF::OUT::Pep
+  include SpecID::Pep
+  Unpack = '@64Ex8ex12eeIx18vvvx8Z*@240Z*'
+  Unpack_four_null_bytes = 'a*'
+  Unpack_Zstar = 'Z*'
   FourNullBytes_as_string = "\0\0\0\0"
   #NewRecordStart = "\0\0" + 0x3a.chr + 0x1a.chr + "\0\0"
   NewRecordStart = 0x01.chr + 0x00.chr
   Sequest_record_start = "[SEQUEST]"
-   ## mucky details.  This should be encapsulated into a class to inherit from, etc.
-  ind_keys = {} ; ind_keys_w_eq = {}; @@ind = {}
-  ind_keys = {:mh => 0, :deltacn => 1, :sp => 2, :xcorr => 3, :id => 4, :rsp => 5, :ions_matched => 6, :ions_total => 7, :peptide => 8, :reference => 9 }
-  @@arr_size = ind_keys.size
-  def mh ; self[0] end ; def mh=(oth) ; self[0] = oth end
-  def deltacn ; self[1] end ; def deltacn=(oth) ; self[1] = oth end
-  def sp ; self[2] end ; def sp=(oth) ; self[2] = oth end
-  def xcorr ; self[3] end ; def xcorr=(oth) ; self[3] = oth end
-  def id ; self[4] end ; def id=(oth) ; self[4] = oth end
-  def rsp ; self[5] end ; def rsp=(oth) ; self[5] = oth end
-  def ions_matched ; self[6] end ; def ions_matched=(oth) ; self[6] = oth end
-  def ions_total ; self[7] end ; def ions_total=(oth) ; self[7] = oth end
-  def peptide ; self[8] end ; def peptide=(oth) ; self[8] = oth end
-  def reference ; self[9] end ; def reference=(oth) ; self[9] = oth end
-  # The number of total proteins sharing this peptide
-  def num_tot_proteins ; self[10] end ; def num_tot_proteins=(oth) ; self[10] = oth end
-  def initialize(args=nil)
-    super(@@arr_size.size)
-    if args
-      if args.is_a? Hash
-        args.each do |k,v|
-          self[@@ind[k]] = v
-        end
-      end
-    end
-  end
+  tmp = $VERBOSE ; $VERBOSE = nil
+  def prots() self[9] end
+  $VERBOSE = tmp
   def inspect
-    "<SRF::OUT::Hit @mh=#{mh}, @deltacn=#{deltacn}, @sp=#{sp}, @xcorr=#{xcorr}, @id=#{id}, @rsp=#{rsp}, @ions_matched=#{ions_matched}, @ions_total=#{ions_total}, @peptide=#{peptide}, @reference=#{reference}, @num_tot_proteins=#{num_tot_proteins}>"
+    st = %w(aaseq sequence mh deltacn sp xcorr id rsp ions_matched ions_total prots deltamass ppm base_name first_scan last_scan charge).map do |v|
+      if v.is_a? Array
+        "##{v}=#{send(v.to_sym).size}"
+      else
+        "@#{v}=#{send(v.to_sym)}"
+      end
+    end
+    st.unshift("<#{self.class}")
+    if srf
+      st.push("@srf(base_name)=#{srf.base_name}")
+    end
+    st.push('>')
+    st.join(' ')
+    #"<SRF::OUT::Pep @mh=#{mh}, @deltacn=#{deltacn}, @sp=#{sp}, @xcorr=#{xcorr}, @id=#{id}, @rsp=#{rsp}, @ions_matched=#{ions_matched}, @ions_total=#{ions_total}, @sequence=#{sequence}, @prots(count)=#{prots.size}, @deltamass=#{deltamass}, @ppm=#{ppm} @aaseq=#{aaseq}, @base_name=#{base_name}, @first_scan=#{first_scan}, @last_scan=#{last_scan}, @charge=#{charge}, @srf(base_name)=#{srf.base_name}>"
   end
   ## There must be a better way to do this.
   ## We are checking that there are no additional protein references only
   ## so that we are in register for the next reading
-  def read_extra_references(fh)
+  def read_extra_references(fh, global_ref_hash)
     $SRF_OUT_HIT_FH_POS = fh.pos
     st = fh.read(4)
     #puts "HHH: " + st.unpack("H*").first
     ## if we see 0000 0000 we are done
-    if st.unpack("a*").first == FourNullBytes_as_string
+    if st.unpack(Unpack_four_null_bytes).first == FourNullBytes_as_string
       fh.pos = $SRF_OUT_HIT_FH_POS
       return nil
     end
@@ -436,20 +460,11 @@ class SRF::OUT::Hit < Array
     ## NOTE: in context of 4 bytes read above!
     st = fh.read(36)
-    #p self
-    #puts "HHHH: " + st.unpack("H*").first
-    #puts st[34,2].unpack("H*").first
     if st[34,2] == NewRecordStart
       fh.pos = $SRF_OUT_HIT_FH_POS
       return nil
     end
-    ##if st.unpack("@22H*").first == NewRecordStart_as_hex
-    #if st[22,6] == NewRecordStart
-    #  fh.pos = $SRF_OUT_HIT_FH_POS
-    #  return nil
-    #end
     # is this the end of the outfiles?
     ## BACK to beginning of this section
     fh.pos = $SRF_OUT_HIT_FH_POS
@@ -459,28 +474,67 @@ class SRF::OUT::Hit < Array
     end
     ## we have extra references
-    self[10] += 1
-    fh.read(79)
-    #p self
+    ## original read was fh.read(79)
+    fh.seek(-1, IO::SEEK_CUR)
+    self[9].push( new_protein(fh.read(80).unpack(Unpack_Zstar).first, self, global_ref_hash ) )
+    #p self.prots
+    #puts self.prots.size
     #$glob ||= 0
     #$glob += 1
-    #if $glob == 100
+    #if $glob == 20
     #  abort
     #end
-    read_extra_references(fh)
+    read_extra_references(fh,global_ref_hash)
   end
+  def new_protein(reference, peptide, global_ref_hash)
+    if global_ref_hash.key? reference
+      global_ref_hash[reference].peps << peptide
+    else
+      global_ref_hash[reference] = SRF::OUT::Prot.new(reference, [peptide])
+    end
+    global_ref_hash[reference]
+  end
-  def from_handle(fh)
+  def from_handle(fh, global_ref_hash)
     ## get the first part of the info
     st = fh.read(320) ## read all the hit data
-    self[0,10] = st.unpack('@64Ex8ex12eeIx18vvvx8Z*@240Z*')
-    self[10] = 1
-    read_extra_references(fh)
+    self[0,10] = st.unpack(Unpack)
+    # we are slicing the reference to 38 chars to be the same length as
+    # duplicate references
+    self[9] = [new_protein(self[9][0,38], self, global_ref_hash)]
+    self[12] = SpecID::Pep.sequence_to_aaseq(self[8])
+    read_extra_references(fh, global_ref_hash)
     self
   end
 end
+SRF::OUT::Prot = ArrayClass.new( %w(reference peps) )
+class SRF::OUT::Prot
+  include SpecID::Prot
+  tmp = $VERBOSE ; $VERBOSE = nil
+  def initialize(reference=nil, peps=[])
+    super(@@arr_size)
+    #@reference = reference
+    #@peps = peps
+    self[0,2] = reference, peps
+  end
+  $VERBOSE = tmp
+  #  "<SRF::OUT::Prot reference=\"#{@reference}\">"
+  def inspect
+    "<SRF::OUT::Prot @reference=#{reference}, @peps(#)=#{peps.size}>"
+  end
+end