RubyGems - mspire - Versions diffs - 0.1.5 → 0.1.7 - Mend

mspire 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

data/Rakefile +5 -2
data/bin/bioworks_to_pepxml.rb +84 -40
data/bin/fasta_shaker.rb +100 -0
data/bin/filter_spec_id.rb +185 -23
data/bin/gi2annot.rb +2 -110
data/bin/id_class_anal.rb +31 -21
data/bin/id_precision.rb +12 -8
data/bin/{false_positive_rate.rb → precision.rb} +1 -1
data/bin/protein_summary.rb +55 -62
data/changelog.txt +34 -0
data/lib/align.rb +0 -1
data/lib/fasta.rb +88 -24
data/lib/gi.rb +114 -0
data/lib/roc.rb +64 -58
data/lib/spec_id/aa_freqs.rb +166 -0
data/lib/spec_id/bioworks.rb +5 -1
data/lib/spec_id/precision.rb +427 -0
data/lib/spec_id/proph.rb +2 -2
data/lib/spec_id/sequest.rb +810 -113
data/lib/spec_id/srf.rb +486 -0
data/lib/spec_id.rb +107 -23
data/release_notes.txt +11 -0
data/script/estimate_fpr_by_cysteine.rb +226 -0
data/script/filter-peps.rb +3 -3
data/script/find_cysteine_background.rb +137 -0
data/script/gen_database_searching.rb +11 -7
data/script/genuine_tps_and_probs.rb +136 -0
data/script/top_hit_per_scan.rb +5 -2
data/test/tc_aa_freqs.rb +59 -0
data/test/tc_bioworks.rb +6 -1
data/test/tc_bioworks_to_pepxml.rb +25 -18
data/test/tc_fasta.rb +81 -3
data/test/tc_fasta_shaker.rb +147 -0
data/test/tc_gi.rb +20 -0
data/test/tc_id_class_anal.rb +9 -12
data/test/tc_id_precision.rb +12 -11
data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
data/test/tc_protein_summary.rb +31 -22
data/test/tc_roc.rb +95 -50
data/test/tc_sequest.rb +212 -145
data/test/tc_spec.rb +10 -5
data/test/tc_spec_id.rb +0 -2
data/test/tc_spec_id_xml.rb +36 -0
data/test/tc_srf.rb +216 -0
metadata +35 -21
data/lib/spec_id/false_positive_rate.rb +0 -476
data/test/tc_gi2annot.rb +0 -12

data/lib/spec_id/srf.rb ADDED Viewed

@@ -0,0 +1,486 @@
+require 'spec_id/sequest'
+# Mixin with helpers for pulling fixed-width, NUL-padded text fields out of a
+# binary stream.
+module BinaryReader
+  # First element of a one-NUL string.  String#[] with an integer index gives
+  # an Integer under ruby 1.8 and a one-character String under 1.9+; because
+  # the constant and st[0] below are produced the same way, the comparison
+  # holds on either.
+  Null_char = "\0"[0]
+  # extracts a string with all empty chars at the end stripped
+  # expects the filehandle to be at the proper location
+  #
+  # fh::    an IO-like object responding to read(length)
+  # bytes:: width of the field in bytes
+  #
+  # Returns '' when the field starts with a NUL byte (an empty declaration).
+  # Raises EOFError when the handle is already at end of file: read(length)
+  # returns nil there, which previously surfaced as a NoMethodError on nil.
+  def get_null_padded_string(fh,bytes)
+    st = fh.read(bytes)
+    raise EOFError, "end of file reached while reading a #{bytes} byte field" if st.nil?
+    # for empty declarations
+    return '' if st[0] == Null_char
+    # drop the trailing padding in place
+    st.rstrip!
+    st
+  end
+end
+# class to extract information from <file>_dta.log files
+class DTALog
+  # Number of leading lines consumed before records are looked for (10 lines,
+  # the "scan range" line, then 3 more).
+  Header_line_count = 14
+  # A line starting with " m" marks the point from which records are parsed.
+  Record_section_re = /^ m/
+  # Captures: first scan, last scan, first dta filename, rest of the line.
+  Scan_line_re = /scan: (\d+) - (\d+), Datafile: (.*?) (.*)/
+  # A second dta filename may follow the first one on the same line.
+  Other_dta_re = /Datafile: (.*?) /
+  # returns an array indexed by the dta file number (starting at 0)
+  # each entry is an array [first_scan, last_scan, dta_filename_noext]
+  # this is now obsolete since I found the scan # index at the end of the srf
+  # files
+  #
+  # file:: path to the dta log file
+  #
+  # Raises EOFError (from readline) if the file has fewer than
+  # Header_line_count lines.
+  def self.dta_and_scans_by_dta_index(file)
+    dta_index = []
+    File.open(file) do |fh|
+      Header_line_count.times { fh.readline }
+      # Skip ahead to the first " m" line; that line may itself hold a record.
+      fh.each do |line|
+        next unless line =~ Record_section_re
+        dta_index.concat(entries_from_line(line))
+        break
+      end
+      # From here on every line is checked for records.
+      fh.each { |line| dta_index.concat(entries_from_line(line)) }
+    end
+    dta_index
+  end
+  # Returns the [first_scan, last_scan, dta_filename_noext] entries found on
+  # one line: none, one, or two when a second "Datafile:" follows the first.
+  def self.entries_from_line(line)
+    record = Scan_line_re.match(line)
+    return [] unless record
+    first_scan = record[1].to_i
+    last_scan = record[2].to_i
+    entries = [[first_scan, last_scan, record[3].sub(/\.dta/,'')]]
+    other = Other_dta_re.match(record[4])
+    entries << [first_scan, last_scan, other[1].sub(/\.dta/,'')] if other
+    entries
+  end
+  private_class_method :entries_from_line
+end
+# Reads an srf file into a header, per-spectrum DTA and OUT records, the
+# search params and the scan index stored after the params.
+class SRF
+  # Bytes consumed per scan index record (only the first three integers of
+  # each record are decoded).
+  Scan_index_record_length = 24
+  # a string 3.3 or 3.2
+  attr_accessor :version
+  # the SRF::Header read from the start of the file
+  attr_accessor :header
+  # array of SRF::DTA objects
+  attr_accessor :dta_files
+  # array of SRF::OUT objects
+  attr_accessor :out_files
+  # the SpecID::Sequest::Params parsed from the file
+  attr_accessor :params
+  # a parallel array to dta_files and out_files where each entry is:
+  # [first_scan, last_scan, charge]
+  attr_accessor :index
+  # Byte offset of the first dta record for the current version, or nil when
+  # the version is not one of the known ones.
+  def dta_start_byte
+    case @version
+    when '3.2' ; 3260
+    when '3.3' ; 3644
+    end
+  end
+  # filename:: optional path to an srf file; when given it is read at once
+  def initialize(filename=nil)
+    @dta_files = []
+    @out_files = []
+    from_file(filename) if filename
+  end
+  # Reads the whole srf file at +filename+ (opened in binary mode).
+  # returns self
+  def from_file(filename)
+    File.open(filename, "rb") do |fh|
+      @header = SRF::Header.new.from_handle(fh)
+      @version = @header.version
+      @dta_files = read_dta_files(fh,@header.num_dta_files)
+      @out_files = read_out_files(fh,@header.num_dta_files)
+      @params = SpecID::Sequest::Params.new.parse_handle(fh)
+      fh.read(12)  ## gap between last params entry and index
+      @index = read_scan_index(fh,@header.num_dta_files)
+    end
+    self
+  end
+  # returns an index where each entry is [first_scan, last_scan, charge]
+  #
+  # fh::  handle positioned at the first index record
+  # num:: number of records to read
+  def read_scan_index(fh, num)
+    # one reusable buffer to receive each record
+    buffer = '0' * Scan_index_record_length
+    Array.new(num) do
+      fh.read(Scan_index_record_length, buffer)
+      buffer.unpack('III')
+    end
+  end
+  # given a zero indexed list where each entry is [first_scan, last_scan,
+  # dta_filename] updates the out info
+  # returns self
+  def update_out_scan_info_from_dta_log(dta_log)
+    index = DTALog.dta_and_scans_by_dta_index(dta_log)
+    @out_files.each_with_index do |ot,i|
+      entry = index[i]
+      # The log may hold fewer entries than there are out files; assigning
+      # nil to the three-slot slice would shrink the record and shift hits.
+      next unless entry
+      ot[4,3] = entry  #contingent on implementation of ot
+    end
+    self
+  end
+  # returns an array of dta_files
+  #
+  # Seeks +fh+ to dta_start_byte and reads +num_files+ SRF::DTA records.
+  # Raises ArgumentError when the version has no known start byte.
+  def read_dta_files(fh, num_files)
+    start = dta_start_byte
+    if start.nil?
+      raise ArgumentError, "no dta start byte known for srf version #{@version.inspect}"
+    end
+    fh.pos = start unless fh.pos == start
+    Array.new(num_files) { SRF::DTA.new.from_handle(fh) }
+  end
+  # filehandle (fh) must be at the start of the outfiles.  'read_dta_files'
+  # will put the fh there.
+  #
+  # Reads +number_files+ SRF::OUT records and returns them as an array.
+  def read_out_files(fh,number_files)
+    Array.new(number_files) { SRF::OUT.new.from_handle(fh) }
+  end
+end
+# The fixed-layout start of an srf file: the version word, the dta generation
+# params and a run of NUL-padded text fields.
+class SRF::Header
+  include BinaryReader
+  # Absolute offsets of the text fields.  Only :enzyme is used for seeking;
+  # the fields after it are read back to back, so with the shorter version
+  # 3.2 :modifications field the offsets listed after it only match the 3.3
+  # layout.
+  Start_byte = {
+    :enzyme => 438,
+    :ion_series => 694,
+    :model => 950,
+    :modifications => 982,
+    :raw_filename => 1822,
+    :db_filename => 2082,
+    :dta_log_filename => 2602,
+    :params_filename => 3122,
+    :sequest_log_filename => 3382,
+  }
+  # Field widths in bytes.
+  Byte_length = {
+    :enzyme => 256,
+    :ion_series => 256,
+    :model => 32,
+    :modifications => 840,
+    :raw_filename => 260,
+    :db_filename => 520,
+    :dta_log_filename => 520,
+    :params_filename => 260,
+    :sequest_log_filename => 262, ## is this really 262?? or should be 260??
+  }
+  # Widths that differ for version 3.2 files.
+  Byte_length_v32 = {
+    :modifications => 456,
+  }
+  # Order in which the text fields follow one another in the file.
+  Field_order = [:enzyme, :ion_series, :model, :modifications, :raw_filename, :db_filename, :dta_log_filename, :params_filename, :sequest_log_filename]
+  # a string such as 3.2 or 3.3
+  attr_accessor :version
+  # a SRF::DTAGen object
+  attr_accessor :dta_gen
+  attr_accessor :enzyme
+  attr_accessor :ion_series
+  attr_accessor :model
+  attr_accessor :modifications
+  attr_accessor :raw_filename
+  attr_accessor :db_filename
+  attr_accessor :dta_log_filename
+  attr_accessor :params_filename
+  attr_accessor :sequest_log_filename
+  # number of dta files, as recorded in the dta generation params
+  def num_dta_files
+    @dta_gen.num_dta_files
+  end
+  # sets fh to 0 and grabs the information it wants
+  # returns self
+  def from_handle(fh)
+    # The version word is read relative to the handle, while everything
+    # after it is read at absolute positions, so start from a known place.
+    fh.pos = 0 unless fh.pos == 0
+    @version = '3.' + fh.read(4).unpack('I').first.to_s
+    @dta_gen = SRF::DTAGen.new.from_handle(fh)
+    ## get the rest of the info
+    byte_length = Byte_length.dup
+    byte_length.merge!(Byte_length_v32) if @version == '3.2'
+    fh.pos = Start_byte[:enzyme]
+    Field_order.each do |param|
+      send("#{param}=", get_null_padded_string(fh, byte_length[param]))
+    end
+    self
+  end
+end
+# the DTA Generation Params
+# (decoded from the first 148 bytes of the file)
+class SRF::DTAGen
+  ## not sure if this is correct
+  attr_accessor :start_time
+  # group scan (not sure if this is correct)
+  attr_accessor :start_mass
+  attr_accessor :end_mass
+  attr_accessor :num_dta_files
+  attr_accessor :group_scan
+  ## not sure if this is correct
+  attr_accessor :min_group_count
+  attr_accessor :min_ion_threshold
+  #attr_accessor :intensity_threshold # can't find yet
+  #attr_accessor :precursor_tolerance # can't find yet
+  attr_accessor :start_scan
+  attr_accessor :end_scan
+  # Rewinds +fh+ if needed, reads 148 bytes and fills in the attributes.
+  # returns self
+  def from_handle(fh)
+    fh.pos = 0 unless fh.pos == 0
+    # three little-endian floats, then six native unsigned ints, with the
+    # 'x' directives skipping the bytes in between
+    values = fh.read(148).unpack('x36ex12ex4ex48Ix12IIIII')
+    @start_time, @start_mass, @end_mass = values[0, 3]
+    @num_dta_files = values[3]
+    @group_scan, @min_group_count, @min_ion_threshold, @start_scan, @end_scan = values[4, 5]
+    self
+  end
+end
+# One dta record of an srf file, stored as an Array with named accessors
+# for each slot.
+class SRF::DTA < Array
+  # is this universal?
+  First_record_start_byte = 3644
+  ## mucky details.  This should be encapsulated into a class to inherit from, etc.
+  ind_keys = {:mh => 0, :dta_tic => 1, :num_peaks => 2, :charge => 3, :ms_level => 4, :unknown => 5, :total_num_possible_charge_states => 6, :peaks => 7}
+  # number of slots in a record
+  @@arr_size = ind_keys.size
+  # maps each attribute name and its "name=" form, as Symbol and as String,
+  # to the slot index
+  @@ind = {}
+  ind_keys.each do |k,v|
+    [k, "#{k}=".to_sym].each do |name|
+      @@ind[name] = v
+      @@ind[name.to_s] = v
+    end
+  end
+  def mh ; self[0] end ; def mh=(oth) ; self[0] = oth end
+  def dta_tic ; self[1] end ; def dta_tic=(oth) ; self[1] = oth end
+  def num_peaks ; self[2] end ; def num_peaks=(oth) ; self[2] = oth end
+  def charge ; self[3] end ; def charge=(oth) ; self[3] = oth end
+  def ms_level ; self[4] end ; def ms_level=(oth) ; self[4] = oth end
+  def unknown ; self[5] end ; def unknown=(oth) ; self[5] = oth end
+  def total_num_possible_charge_states ; self[6] end ; def total_num_possible_charge_states=(oth) ; self[6] = oth end
+  # this is a byte array of floats, you can get the peaks out with
+  # unpack("e*")
+  def peaks ; self[7] end
+  # this is a byte array of floats, you can get the peaks out with
+  # unpack("e*")
+  def peaks=(oth) ; self[7] = oth end
+  # args:: optional Hash of attribute name (Symbol or String) => value
+  def initialize(args=nil)
+    # One nil slot per attribute.  (This used to pass @@arr_size.size, which
+    # is the byte width of the Integer, not the slot count.)
+    super(@@arr_size)
+    if args.is_a? Hash
+      args.each do |k,v|
+        self[@@ind[k]] = v
+      end
+    end
+  end
+  # Like the default inspect, but shows only the byte count of the peaks.
+  def inspect
+    peaks_st = 'nil'
+    if self[7] ; peaks_st = "[#{self[7].size} bytes]" end
+    "<SRF::DTA @mh=#{mh} @dta_tic=#{dta_tic} @num_peaks=#{num_peaks} @charge=#{charge} @ms_level=#{ms_level} @total_num_possible_charge_states=#{total_num_possible_charge_states} @peaks=#{peaks_st} >"
+  end
+  # Reads one record from +fh+, which must be positioned at its first byte.
+  # returns self
+  def from_handle(fh)
+    st = fh.read(24)
+    # get the bulk of the data in single unpack
+    self[0,7] = st.unpack("EeIvvvv")
+    # Scan numbers possibly hidden in this next sequence of bytes (I think)
+    fh.read(24)
+    # 8 bytes are stored per peak; they are kept packed
+    self[7] = fh.read(num_peaks * 8)
+    self
+  end
+end
+# One out record of an srf file (a fixed 96 byte part followed by its hits),
+# stored as an Array with named accessors for each slot.
+class SRF::OUT < Array
+  ## mucky details.  This should be encapsulated into a class to inherit from, etc.
+  ind_keys = {:num_hits => 0, :charge => 1, :computer => 2, :date_time => 3, :first_scan => 4, :last_scan => 5, :filename_noext => 6, :hits => 7}
+  # number of slots in a record
+  @@arr_size = ind_keys.size
+  # maps each attribute name and its "name=" form, as Symbol and as String,
+  # to the slot index
+  @@ind = {}
+  ind_keys.each do |k,v|
+    [k, "#{k}=".to_sym].each do |name|
+      @@ind[name] = v
+      @@ind[name.to_s] = v
+    end
+  end
+  def num_hits ; self[0] end ; def num_hits=(oth) ; self[0] = oth end
+  def charge ; self[1] end ; def charge=(oth) ; self[1] = oth end
+  def computer ; self[2] end ; def computer=(oth) ; self[2] = oth end
+  def date_time ; self[3] end ; def date_time=(oth) ; self[3] = oth end
+  def first_scan ; self[4] end ; def first_scan=(oth) ; self[4] = oth end
+  def last_scan ; self[5] end ; def last_scan=(oth) ; self[5] = oth end
+  def filename_noext ; self[6] end ; def filename_noext=(oth) ; self[6] = oth end
+  # array of SRF::OUT::Hit objects
+  def hits ; self[7] end ; def hits=(oth) ; self[7] = oth end
+  # args:: optional Hash of attribute name (Symbol or String) => value
+  def initialize(args=nil)
+    # One nil slot per attribute.  (This used to pass @@arr_size.size, which
+    # is the byte width of the Integer, not the slot count.)
+    super(@@arr_size)
+    if args.is_a? Hash
+      args.each do |k,v|
+        self[@@ind[k]] = v
+      end
+    end
+  end
+  # The scan and filename attributes are shown only once first_scan is set.
+  def inspect
+    if first_scan
+      ins = "@first_scan=#{first_scan}, @last_scan=#{last_scan}, @filename_noext=#{filename_noext}, "
+    end
+    "<SRF::OUT  @num_hits=#{num_hits}, @charge=#{charge}, @computer=#{computer}, @date_time=#{date_time}, #{ins}@hits=#{hits.inspect}>"
+  end
+  # Reads one record and all of its hits from +fh+, which must be positioned
+  # at the record's first byte.
+  # returns self
+  def from_handle(fh)
+    ## EMPTY out file is 96 bytes
+    ## each hit is 320 bytes
+    ## num_hits and charge:
+    st = fh.read(96)
+    # fills num_hits, charge, computer and date_time
+    self[0,4] = st.unpack("@36vvZ*@60Z*")
+    hit_count = self[0]
+    self[7] = Array.new(hit_count) { SRF::OUT::Hit.new.from_handle(fh) }
+    self
+  end
+end
+# One peptide hit inside an out record, stored as an Array with named
+# accessors for each slot.
+class SRF::OUT::Hit < Array
+  FourNullBytes_as_string = "\0\0\0\0"
+  #NewRecordStart = "\0\0" + 0x3a.chr + 0x1a.chr + "\0\0"
+  NewRecordStart = 0x01.chr + 0x00.chr
+  Sequest_record_start = "[SEQUEST]"
+  ## mucky details.  This should be encapsulated into a class to inherit from, etc.
+  # :num_tot_proteins is included so that it gets a slot up front and can be
+  # given in the Hash passed to new, like every other attribute.
+  ind_keys = {:mh => 0, :deltacn => 1, :sp => 2, :xcorr => 3, :id => 4, :rsp => 5, :ions_matched => 6, :ions_total => 7, :peptide => 8, :reference => 9, :num_tot_proteins => 10 }
+  # number of slots in a record
+  @@arr_size = ind_keys.size
+  # maps each attribute name and its "name=" form, as Symbol and as String,
+  # to the slot index
+  @@ind = {}
+  ind_keys.each do |k,v|
+    [k, "#{k}=".to_sym].each do |name|
+      @@ind[name] = v
+      @@ind[name.to_s] = v
+    end
+  end
+  def mh ; self[0] end ; def mh=(oth) ; self[0] = oth end
+  def deltacn ; self[1] end ; def deltacn=(oth) ; self[1] = oth end
+  def sp ; self[2] end ; def sp=(oth) ; self[2] = oth end
+  def xcorr ; self[3] end ; def xcorr=(oth) ; self[3] = oth end
+  def id ; self[4] end ; def id=(oth) ; self[4] = oth end
+  def rsp ; self[5] end ; def rsp=(oth) ; self[5] = oth end
+  def ions_matched ; self[6] end ; def ions_matched=(oth) ; self[6] = oth end
+  def ions_total ; self[7] end ; def ions_total=(oth) ; self[7] = oth end
+  def peptide ; self[8] end ; def peptide=(oth) ; self[8] = oth end
+  def reference ; self[9] end ; def reference=(oth) ; self[9] = oth end
+  # The number of total proteins sharing this peptide
+  def num_tot_proteins ; self[10] end ; def num_tot_proteins=(oth) ; self[10] = oth end
+  # args:: optional Hash of attribute name (Symbol or String) => value
+  def initialize(args=nil)
+    # One nil slot per attribute.  (This used to pass @@arr_size.size, which
+    # is the byte width of the Integer, not the slot count.)
+    super(@@arr_size)
+    if args.is_a? Hash
+      args.each do |k,v|
+        self[@@ind[k]] = v
+      end
+    end
+  end
+  def inspect
+    "<SRF::OUT::Hit @mh=#{mh}, @deltacn=#{deltacn}, @sp=#{sp}, @xcorr=#{xcorr}, @id=#{id}, @rsp=#{rsp}, @ions_matched=#{ions_matched}, @ions_total=#{ions_total}, @peptide=#{peptide}, @reference=#{reference}, @num_tot_proteins=#{num_tot_proteins}>"
+  end
+  ## There must be a better way to do this.
+  ## We are checking that there are no additional protein references only
+  ## so that we are in register for the next reading
+  #
+  # Starting at the current position of +fh+, counts extra references into
+  # num_tot_proteins, skipping 88 bytes for each one.  On return the handle
+  # sits at the first position that is not an extra reference.
+  # Always returns nil.
+  def read_extra_references(fh)
+    loop do
+      start = fh.pos
+      st = fh.read(4)
+      ## if we see 0000 0000 we are done (and likewise at end of file)
+      if st.nil? || st == FourNullBytes_as_string
+        fh.pos = start
+        return nil
+      end
+      ## NOTE: in context of 4 bytes read above!
+      st = fh.read(36)
+      if st.nil? || st[34,2] == NewRecordStart
+        fh.pos = start
+        return nil
+      end
+      # is this the end of the outfiles?
+      ## BACK to beginning of this section
+      fh.pos = start
+      if fh.read(9) == Sequest_record_start
+        fh.pos = start
+        return nil
+      end
+      ## we have extra references
+      self[10] += 1
+      # skip the rest of this reference (9 bytes were read just above)
+      fh.read(79)
+    end
+  end
+  # Reads one 320 byte hit from +fh+, then consumes any extra references
+  # that follow it.
+  # returns self
+  def from_handle(fh)
+    ## get the first part of the info
+    st = fh.read(320) ## read all the hit data
+    self[0,10] = st.unpack('@64Ex8ex12eeIx18vvvx8Z*@240Z*')
+    # the hit's own reference counts as the first protein
+    self[10] = 1
+    read_extra_references(fh)
+    self
+  end
+end