RubyGems - mspire - Versions diffs - 0.2.4 → 0.3.0 - Mend

mspire 0.2.4 → 0.3.0

Files changed (233) hide show

data/INSTALL +1 -0
data/README +25 -0
data/Rakefile +129 -40
data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
data/bin/bioworks_to_pepxml.rb +1 -0
data/bin/fasta_shaker.rb +1 -96
data/bin/filter_and_validate.rb +5 -0
data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
data/bin/prob_validate.rb +6 -0
data/bin/raw_to_mzXML.rb +2 -2
data/bin/srf_group.rb +1 -0
data/bin/srf_to_sqt.rb +40 -0
data/changelog.txt +68 -0
data/lib/align/chams.rb +6 -6
data/lib/align.rb +4 -3
data/lib/bsearch.rb +120 -0
data/lib/fasta.rb +318 -86
data/lib/group_by.rb +10 -0
data/lib/index_by.rb +11 -0
data/lib/merge_deep.rb +21 -0
data/lib/{spec → ms/converter}/mzxml.rb +77 -109
data/lib/ms/gradient_program.rb +171 -0
data/lib/ms/msrun.rb +209 -0
data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
data/lib/ms/parser/mzdata/axml.rb +12 -0
data/lib/ms/parser/mzdata/dom.rb +160 -0
data/lib/ms/parser/mzdata/libxml.rb +7 -0
data/lib/ms/parser/mzdata.rb +25 -0
data/lib/ms/parser/mzxml/axml.rb +11 -0
data/lib/ms/parser/mzxml/dom.rb +159 -0
data/lib/ms/parser/mzxml/hpricot.rb +253 -0
data/lib/ms/parser/mzxml/libxml.rb +15 -0
data/lib/ms/parser/mzxml/regexp.rb +122 -0
data/lib/ms/parser/mzxml/rexml.rb +72 -0
data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
data/lib/ms/parser/mzxml.rb +175 -0
data/lib/ms/parser.rb +108 -0
data/lib/ms/precursor.rb +10 -0
data/lib/ms/scan.rb +81 -0
data/lib/ms/spectrum.rb +193 -0
data/lib/ms.rb +10 -0
data/lib/mspire.rb +4 -0
data/lib/roc.rb +61 -1
data/lib/sample_enzyme.rb +31 -8
data/lib/scan_i.rb +21 -0
data/lib/spec_id/aa_freqs.rb +7 -3
data/lib/spec_id/bioworks.rb +20 -14
data/lib/spec_id/digestor.rb +139 -0
data/lib/spec_id/mass.rb +116 -0
data/lib/spec_id/parser/proph.rb +236 -0
data/lib/spec_id/precision/filter/cmdline.rb +209 -0
data/lib/spec_id/precision/filter/interactive.rb +134 -0
data/lib/spec_id/precision/filter/output.rb +147 -0
data/lib/spec_id/precision/filter.rb +623 -0
data/lib/spec_id/precision/output.rb +60 -0
data/lib/spec_id/precision/prob/cmdline.rb +139 -0
data/lib/spec_id/precision/prob/output.rb +88 -0
data/lib/spec_id/precision/prob.rb +171 -0
data/lib/spec_id/proph/pep_summary.rb +92 -0
data/lib/spec_id/proph/prot_summary.rb +484 -0
data/lib/spec_id/proph.rb +2 -466
data/lib/spec_id/protein_summary.rb +2 -2
data/lib/spec_id/sequest/params.rb +316 -0
data/lib/spec_id/sequest/pepxml.rb +1513 -0
data/lib/spec_id/sequest.rb +2 -1672
data/lib/spec_id/srf.rb +445 -177
data/lib/spec_id.rb +183 -95
data/lib/spec_id_xml.rb +8 -10
data/lib/transmem/phobius.rb +147 -0
data/lib/transmem/toppred.rb +368 -0
data/lib/transmem.rb +157 -0
data/lib/validator/aa.rb +135 -0
data/lib/validator/background.rb +73 -0
data/lib/validator/bias.rb +95 -0
data/lib/validator/cmdline.rb +260 -0
data/lib/validator/decoy.rb +94 -0
data/lib/validator/digestion_based.rb +69 -0
data/lib/validator/probability.rb +48 -0
data/lib/validator/prot_from_pep.rb +234 -0
data/lib/validator/transmem.rb +272 -0
data/lib/validator/true_pos.rb +46 -0
data/lib/validator.rb +214 -0
data/lib/xml.rb +38 -0
data/lib/xml_style_parser.rb +105 -0
data/lib/xmlparser_wrapper.rb +19 -0
data/script/compile_and_plot_smriti_final.rb +97 -0
data/script/extract_gradient_programs.rb +56 -0
data/script/get_apex_values_rexml.rb +44 -0
data/script/mzXML2timeIndex.rb +1 -1
data/script/smriti_final_analysis.rb +103 -0
data/script/toppred_to_yaml.rb +47 -0
data/script/tpp_installer.rb +1 -1
data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
data/specs/bin/fasta_shaker_spec.rb +259 -0
data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
data/specs/bin/filter_and_validate_spec.rb +124 -0
data/specs/bin/ms_to_lmat_spec.rb +34 -0
data/specs/bin/prob_validate_spec.rb +62 -0
data/specs/bin/protein_summary_spec.rb +10 -0
data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
data/specs/gi_spec.rb +22 -0
data/specs/load_bin_path.rb +7 -0
data/specs/merge_deep_spec.rb +13 -0
data/specs/ms/gradient_program_spec.rb +77 -0
data/specs/ms/msrun_spec.rb +455 -0
data/specs/ms/parser_spec.rb +92 -0
data/specs/ms/spectrum_spec.rb +89 -0
data/specs/roc_spec.rb +251 -0
data/specs/rspec_autotest.rb +149 -0
data/specs/sample_enzyme_spec.rb +41 -0
data/specs/spec_helper.rb +133 -0
data/specs/spec_id/aa_freqs_spec.rb +52 -0
data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
data/specs/spec_id/digestor_spec.rb +75 -0
data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
data/specs/spec_id/precision/filter/output_spec.rb +31 -0
data/specs/spec_id/precision/filter_spec.rb +243 -0
data/specs/spec_id/precision/prob_spec.rb +111 -0
data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
data/specs/spec_id/sequest/params_spec.rb +68 -0
data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
data/specs/spec_id/sqt_spec.rb +138 -0
data/specs/spec_id/srf_spec.rb +209 -0
data/specs/spec_id/srf_spec_helper.rb +302 -0
data/specs/spec_id_helper.rb +33 -0
data/specs/spec_id_spec.rb +361 -0
data/specs/spec_id_xml_spec.rb +33 -0
data/specs/transmem/phobius_spec.rb +423 -0
data/specs/transmem/toppred_spec.rb +297 -0
data/specs/transmem_spec.rb +60 -0
data/specs/transmem_spec_shared.rb +64 -0
data/specs/validator/aa_spec.rb +107 -0
data/specs/validator/background_spec.rb +51 -0
data/specs/validator/bias_spec.rb +146 -0
data/specs/validator/decoy_spec.rb +51 -0
data/specs/validator/fasta_helper.rb +26 -0
data/specs/validator/prot_from_pep_spec.rb +141 -0
data/specs/validator/transmem_spec.rb +145 -0
data/specs/validator/true_pos_spec.rb +58 -0
data/specs/validator_helper.rb +33 -0
data/specs/xml_spec.rb +12 -0
data/test_files/000_pepxml18_small.xml +206 -0
data/test_files/020a.mzXML.timeIndex +4710 -0
data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
data/test_files/4-03-03_small-prot.xml +321 -0
data/test_files/4-03-03_small.xml +3876 -0
data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
data/test_files/bioworks-3.3_10prots.xml +5999 -0
data/test_files/bioworks31.params +77 -0
data/test_files/bioworks32.params +62 -0
data/test_files/bioworks33.params +63 -0
data/test_files/bioworks_single_run_small.xml +7237 -0
data/test_files/bioworks_small.fasta +212 -0
data/test_files/bioworks_small.params +63 -0
data/test_files/bioworks_small.phobius +109 -0
data/test_files/bioworks_small.toppred.out +2847 -0
data/test_files/bioworks_small.xml +5610 -0
data/test_files/bioworks_with_INV_small.xml +3753 -0
data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
data/test_files/corrupted_900.srf +0 -0
data/test_files/head_of_7MIX.srf +0 -0
data/test_files/interact-opd1_mods_small-prot.xml +304 -0
data/test_files/messups.fasta +297 -0
data/test_files/opd1/000.my_answer.100lines.xml +101 -0
data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
data/test_files/opd1/000_020-prot.png +0 -0
data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
data/test_files/opd1/000_020_3prots-prot.xml +62 -0
data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
data/test_files/opd1/sequest.3.1.params +77 -0
data/test_files/opd1/sequest.3.2.params +62 -0
data/test_files/opd1/twenty_scans.mzXML +418 -0
data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
data/test_files/opd1/twenty_scans_answ.lmat +0 -0
data/test_files/opd1/twenty_scans_answ.lmata +9 -0
data/test_files/opd1_020_beginning.RAW +0 -0
data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
data/test_files/pepproph_small.xml +4691 -0
data/test_files/phobius.small.noheader.txt +50 -0
data/test_files/phobius.small.small.txt +53 -0
data/test_files/s01_anC1_ld020mM.key.txt +25 -0
data/test_files/s01_anC1_ld020mM.meth +0 -0
data/test_files/small.fasta +297 -0
data/test_files/smallraw.RAW +0 -0
data/test_files/tf_bioworks2excel.bioXML +14340 -0
data/test_files/tf_bioworks2excel.txt.actual +1035 -0
data/test_files/toppred.small.out +416 -0
data/test_files/toppred.xml.out +318 -0
data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
data/test_files/yeast_gly_small-prot.xml +265 -0
data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
data/test_files/yeast_gly_small.xml +3807 -0
data/test_files/yeast_gly_small2.parentTimes +6 -0
metadata +273 -57
data/bin/filter.rb +0 -6
data/bin/precision.rb +0 -5
data/lib/spec/mzdata/parser.rb +0 -108
data/lib/spec/mzdata.rb +0 -48
data/lib/spec/mzxml/parser.rb +0 -449
data/lib/spec/scan.rb +0 -55
data/lib/spec_id/filter.rb +0 -797
data/lib/spec_id/precision.rb +0 -421
data/lib/toppred.rb +0 -18
data/script/filter-peps.rb +0 -164
data/test/tc_aa_freqs.rb +0 -59
data/test/tc_fasta_shaker.rb +0 -149
data/test/tc_filter.rb +0 -203
data/test/tc_filter_peps.rb +0 -46
data/test/tc_gi.rb +0 -17
data/test/tc_id_class_anal.rb +0 -70
data/test/tc_id_precision.rb +0 -89
data/test/tc_msrun.rb +0 -88
data/test/tc_mzxml.rb +0 -88
data/test/tc_mzxml_to_lmat.rb +0 -36
data/test/tc_peptide_parent_times.rb +0 -27
data/test/tc_precision.rb +0 -60
data/test/tc_roc.rb +0 -166
data/test/tc_sample_enzyme.rb +0 -32
data/test/tc_scan.rb +0 -26
data/test/tc_sequest.rb +0 -336
data/test/tc_spec.rb +0 -78
data/test/tc_spec_id.rb +0 -201
data/test/tc_spec_id_xml.rb +0 -36
data/test/tc_srf.rb +0 -262

data/lib/spec_id/bioworks.rb CHANGED Viewed

@@ -5,10 +5,11 @@ require 'xmlparser'
 require 'spec_id'
 require 'zlib'
 require 'hash_by'
-require 'set_from_hash'
 require 'array_class'
+require 'fasta'
 ## have to pre-declare some guys
+module ProteinReferenceable; end
 module SpecID; end
 module SpecID::Prot; end
 module SpecID::Pep; end
@@ -274,7 +275,7 @@ class Bioworks::XMLParser < XMLParser
   def endElement(name)
     case name
     when "peptide"
-      @current_obj.set_from_hash(@current_hash)
+      @current_obj.set_from_hash_given_text(@current_hash)
     when "protein"
     else
       @current_hash[name] = @current_data
@@ -293,6 +294,7 @@ module Bioworks::XML
 end
 class Bioworks::Prot
+  include ProteinReferenceable
   include SpecID::Prot
   include Bioworks::XML
@@ -357,20 +359,20 @@ class Bioworks::Prot
     hash.delete("bioworksinfo")
     hash["sf"] = hash.delete("Sf")
     hash["pi"] = hash.delete("pI")
-    set_from_hash(hash)
+    set_from_xml_hash(hash)
   end
   # changes the sf to Sf and pI to pi
   def set_from_xml_hash(hash)
     @reference = hash["reference"]
-    @protein_probability = hash["protein_probability"]
-    @probability = @protein_probability.to_f
-    @consensus_score = hash["consensus_score"]
-    @sf = hash["Sf"]
-    @unified_score = hash["unified_score"]
-    @coverage = hash["coverage"]
-    @pi = hash["pI"]
-    @weight = hash["weight"]
+    @protein_probability = hash["protein_probability"].to_f
+    #@probability = @protein_probability.to_f
+    @consensus_score = hash["consensus_score"].to_f
+    @sf = hash["Sf"].to_f
+    @unified_score = hash["unified_score"].to_f
+    @coverage = hash["coverage"].to_f
+    @pi = hash["pI"].to_f
+    @weight = hash["weight"].to_f
     @accession = hash["accession"]
   end
 end
@@ -392,6 +394,8 @@ class Bioworks::Pep
   ## NOTE! the mass is really the theoretical MH+!!!!
   ## NOTE! ALL values stored as strings, except peptide_probability!
+  #ions is a string 'x/y'
   ## other accessors:
   def probability ; self[15] end
   def mh ; self[1] end
@@ -449,14 +453,16 @@ class Bioworks::Pep
   end
   $VERBOSE = tmp_verb
+  undef_method :inspect
   def inspect
     "<Bioworks::Pep sequence: #{sequence}, mass: #{mass}, deltamass: #{deltamass}, charge: #{charge}, xcorr: #{xcorr}, deltacn: #{deltacn}, prots(count):#{prots.size}, base_name: #{base_name}, first_scan: #{first_scan}, last_scan: #{last_scan}, file: #{file}, peptide_probability: #{peptide_probability}, aaseq:#{aaseq}>"
   end
-  def set_from_hash(hash)
-    self[0,11] = [hash["sequence"], hash["mass"], hash["deltamass"], hash["charge"], hash["xcorr"], hash["deltacn"], hash["sp"], hash["rsp"], hash["ions"], hash["count"], hash["tic"]]
+  # expects a hash of text (String) values; numeric fields are cast with to_f/to_i
+  def set_from_hash_given_text(hash)
+    self[0,11] = [hash["sequence"], hash["mass"].to_f, hash["deltamass"].to_f, hash["charge"].to_i, hash["xcorr"].to_f, hash["deltacn"].to_f, hash["sp"].to_f, hash["rsp"].to_i, hash["ions"], hash["count"].to_i, hash["tic"].to_i]
     self.file = hash["file"]
     self[15] = hash["peptide_probability"].to_f
     self[19] = SpecID::Pep.sequence_to_aaseq(self[0])  ## aaseq
@@ -470,7 +476,7 @@ class Bioworks::Pep
         hash[$1] = $2
         #puts "IN PEP: " + $1 + ": " + $2
       elsif line =~ @@end_pep_re
-        set_from_hash(hash)
+        set_from_hash_given_text(hash)
         #puts "SELF[12]: #{self[12]}"
         #puts "SELF[12]: #{self[12]}"
         break

data/lib/spec_id/digestor.rb ADDED Viewed

@@ -0,0 +1,139 @@
+require 'spec_id/sequest/pepxml'
+require 'spec_id/mass'
+# A digestor must be able to respond to these methods:
+class Digestor
+  # min_mh_mass = min molecular mass of peptide (M+H)+
+  attr_accessor :min_mh_mass
+  # max_mh_mass = max molecular mass of peptide (M+H)+
+  attr_accessor :max_mh_mass
+  # the number of allowable missed cleavages
+  attr_accessor :missed_cleavages
+  # sample_enzyme = SampleEnzyme object
+  attr_accessor :sample_enzyme
+  # hash of masses to use (matching keys of Mass::AVG or Mass::MONO)
+  # In addition, the following keys (as symbols) are recognized.
+  # add_C_term_protein
+  # add_C_term_peptide
+  # add_N_term_protein
+  # add_N_term_peptide
+  attr_accessor :mass_hash
+  # returns a list of peptide objects created from a digestion of the fasta
+  # proteins using the sequest params (variable mods not supported yet)
+  def self.digest(fasta_obj, params_obj)
+    dig = self.new
+    dig.set_from_params(params_obj)
+    dig.create_peptide_hash(fasta_obj).values
+  end
+  def initialize
+  end
+  # takes a parameters object and fills in the necessary values
+  def set_from_params(params_obj, include_variable_mods=false)
+    raise NotImplementedError, "no variable mods yet" if include_variable_mods
+    if params_obj.is_a? Sequest::Params
+      @sample_enzyme = params_obj.sample_enzyme
+      @missed_cleavages = params_obj.max_num_internal_cleavage_sites.to_i
+      (@min_mh_mass, @max_mh_mass) = params_obj.digest_mass_range.split(' ').map {|v| v.to_f }
+      (static_mods, static_terminal_mods) = Sequest::PepXML::Modifications.new.create_static_mods(params_obj)
+      monoisotopic_parents = case params_obj.mass_type_parent
+                             when '0' ; false
+                             when '1' ; true
+                             end
+      @mass_hash = Mass.add_static_masses(monoisotopic_parents, static_mods, static_terminal_mods)
+    else
+      raise ArgumentError, "Don't recognize params object of type: #{params_obj.class}"
+    end
+  end
+  # aka 'digestion'
+  # will return a hash of SpecID::GenericPep objects (with 'aaseq' and
+  # 'prots') hashed by aminoacid sequence.  Each prot is an entry of fasta_obj.
+  def create_peptide_hash(fasta_obj)
+    pep_to_prots_hash = {}
+    pep_objs = nil
+    pep_aaseqs_ar = fasta_obj.map do |prot|
+      @sample_enzyme.digest(prot.aaseq, @missed_cleavages)
+    end
+    prot_aaseqs = fasta_obj.map {|prot| prot.aaseq }
+    passing_pep_seqs_ar = limit_sizes(prot_aaseqs, pep_aaseqs_ar, @min_mh_mass, @max_mh_mass, @mass_hash)
+    #pep_aaseqs_ar.each_with_index do |before_peps,i|
+    #  after_peps = passing_pep_seqs_ar[i]
+    #  puts "before: #{before_peps.size} after: #{after_peps.size}"
+    #  puts "Losing: #{(before_peps - after_peps).inspect}"
+    #  puts "Keeping: #{after_peps.inspect}"
+    #end
+    fasta_obj.each_with_index do |prot, i|
+      pep_seqs = passing_pep_seqs_ar[i]
+      pep_seqs.each do |pep_seq|
+        pep_obj =
+          if pep_to_prots_hash.key?(pep_seq)
+            pep_to_prots_hash[pep_seq]
+          else
+            pep_ob = SpecID::GenericPep.new
+            pep_ob.prots = []
+            pep_ob.aaseq = pep_seq
+            pep_to_prots_hash[pep_seq] = pep_ob
+          end
+        pep_obj.prots << prot
+      end
+    end
+    #pep_to_prots_hash.each do |k,v|
+    #  p v.aaseq
+    #  puts v.prots.size
+    #end
+    pep_to_prots_hash
+  end
+  # min max are both in terms of the M+H(+)
+  #
+  # h_plus:
+  #   On this website:
+  #   http://db.systemsbiology.net:8080/proteomicsToolkit/FragIonServlet.html
+  #   They use the mass of 'H' not 'H+' to find the (M+H)+ weight.
+  #
+  #   The prot_aaseq is used if the mass_hash contains the keys
+  #   :add_C_term_protein or :add_N_term_protein
+  #
+  #   prot_aaseqs is parallel to pep_aaseqs_ar where each is a group of
+  #   peptides matching a protein aaseq
+  #   returns another parallel array holding the peptides that pass the mass limits
+  def limit_sizes(prot_aaseqs, pep_aaseqs_ar, min_mh, max_mh, mass_hash, h_plus=false)
+    if mass_hash.key?(:add_C_term_protein) or mass_hash.key?(:add_N_term_protein)
+      raise NotImplementedError, "need to add ability to change weights of peptides from the ends of proteins"
+    else
+      # figure out how much must be added to each peptide
+      # include the h2o, the h, and N and C terminal static mods
+      h_key = h_plus ? :h_plus : :h
+      final_add = mass_hash[:h2o] + mass_hash[h_key]
+      [:add_N_term_peptide, :add_C_term_peptide].each do |sym|
+        if mass_hash.key?(sym)
+          final_add += mass_hash[sym]
+        end
+      end
+      hash_by_aa_string = {}
+      mass_hash.each {|k,v| hash_by_aa_string[k.to_s] = mass_hash[k] }
+      pep_aaseqs_ar.map do  |pep_aaseqs|
+        pep_aaseqs.select do |aaseq|
+          sum = 0.0
+          aaseq.split('').each do |let|
+            if !hash_by_aa_string.key? let
+              puts 'NOT FOUND'
+              p let
+            end
+            sum += hash_by_aa_string[let]
+          end
+          mh_plus = sum + final_add
+          ( (mh_plus >= min_mh) and (mh_plus <= max_mh) )
+        end
+      end
+    end
+  end
+end

data/lib/spec_id/mass.rb ADDED Viewed

@@ -0,0 +1,116 @@
+class Mass
+  # http://expasy.org/tools/findmod/findmod_masses.html
+  # still need to add the modifications
+  MONO = {
+    :A => 71.03711,
+    :R => 156.10111,
+    :N => 114.04293,
+    :D => 115.02694,
+    :C => 103.00919,
+    :E => 129.04259,
+    :Q => 128.05858,
+    :G => 57.02146,
+    :H => 137.05891,
+    :I => 113.08406,
+    :L => 113.08406,
+    :K => 128.09496,
+    :M => 131.04049,
+    :F => 147.06841,
+    :P => 97.05276,
+    :S => 87.03203,
+    :T => 101.04768,
+    :W => 186.07931,
+    :Y => 163.06333,
+    :V => 99.06841,
+     # uncommon
+    :B => 172.048405, # average of aspartic acid and asparagine
+    :U => 150.95364,   # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
+    :X => 118.805716,  # the average of the mono masses of the 20 amino acids
+    :* => 118.805716, # same as X
+    # elements etc.
+    :h => 1.00783,
+    :h_plus => 1.00728,
+    :o => 15.9949146,
+    :h2o => 18.01056,
+  }
+  AVG = {
+    :A => 71.0788,
+    :R => 156.1875,
+    :N => 114.1038,
+    :D => 115.0886,
+    :C => 103.1388,
+    :E => 129.1155,
+    :Q => 128.1307,
+    :G => 57.0519,
+    :H => 137.1411,
+    :I => 113.1594,
+    :L => 113.1594,
+    :K => 128.1741,
+    :M => 131.1926,
+    :F => 147.1766,
+    :P => 97.1167,
+    :S => 87.0782,
+    :T => 101.1051,
+    :W => 186.2132,
+    :Y => 163.1760,
+    :V => 99.1326,
+    # uncommon
+    :B => 172.1405, # average of aspartic acid and asparagine
+    :U => 150.03,   # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
+    :X => 118.88603, # the average of the masses of the 20 amino acids
+    :* => 118.88603, # same as X
+    # elements etc.
+    :h => 1.00794,
+    :h_plus => 1.00739,
+    :o => 15.9994,
+    :h2o => 18.01524,
+  }
+  # returns a fresh hash in which each amino acid's mass has been increased
+  # by the amount specified in the array of a PepXML::Modifications object.
+  # If static_terminal_mods is given, then the following keys will be created
+  # as symbols as necessary:
+  # add_C_term_protein
+  # add_C_term_peptide
+  # add_N_term_protein
+  # add_N_term_peptide
+  def self.add_static_masses(monoisotopic, static_mods, static_terminal_mods=nil)
+    hash_to_use =
+      if monoisotopic
+        Mass::MONO
+      else
+        Mass::AVG
+      end
+    copy_hash = hash_to_use.dup
+    static_mods.each do |mod|
+      copy_hash[mod.aminoacid.to_sym] += mod.massdiff
+    end
+    static_terminal_mods.each do |mod|
+      if x = mod.protein_terminus
+        # it's a protein terminus modification
+        case x
+        when 'n'
+          copy_hash[:add_N_term_protein] = mod.massdiff
+        when 'c'
+          copy_hash[:add_C_term_protein] = mod.massdiff
+        end
+      else
+        # it's a peptide terminus modification
+        case mod.terminus
+        when 'n'
+          copy_hash[:add_N_term_peptide] = mod.massdiff
+        when 'c'
+          copy_hash[:add_C_term_peptide] = mod.massdiff
+        end
+      end
+    end
+    copy_hash
+  end
+end

data/lib/spec_id/parser/proph.rb ADDED Viewed

@@ -0,0 +1,236 @@
+require 'xml_style_parser'
+require 'spec_id/sequest/pepxml'
+module SpecID ; end
+module SpecID::Parser ; end
+class SpecID::Parser::PepProph
+  include XMLStyleParser
+  def initialize(parse_type=:spec_id, version='3.0')
+    @method = parse_type
+    @version = version
+    implemented = %w(AXML LibXML)
+    klass_s = XMLStyleParser.available_xml_parsers.select {|v| implemented.include?(v) }.first
+    case klass_s
+    when 'AXML'
+      @get_root_node_from_file = Proc.new do |file|
+      AXML.parse_file(file)
+      end
+    when 'LibXML'  # LibXML is buggy on some machines...
+      @get_root_node_from_file = Proc.new do |file|
+        doc = XML::Document.file(file)
+        doc.root
+      end
+    else
+      raise NotImplementedError, "Can only parse with #{implemented.join(', ')} right now"
+    end
+  end
+  # returns the spec_id object
+  def spec_id(file, opts={})
+    raise NotImplementedError, "cannot do #{@version} yet" if @version.nil? or @version < '3.0'
+    spec_id_obj =
+      if x = opts[:spec_id]
+        x
+      else
+        Proph::PepSummary.new
+      end
+    msms_pipeline_analysis_n = @get_root_node_from_file.call(file)
+    spec_id_obj.peptideprophet_summary = msms_pipeline_analysis_n.find_first("descendant::peptideprophet_summary")
+    msms_run_summary_n = msms_pipeline_analysis_n.find_first('child::msms_run_summary')
+    spec_id_obj.from_pepxml_node(msms_run_summary_n)
+  end
+end
+class SpecID::Parser::ProtProph
+  include XMLStyleParser
+  Split_unique_stripped_peptides_re = /\+/
+  def initialize(parse_type=:spec_id, version='4')
+    @method = parse_type
+    @version = version
+    implemented = %w(AXML LibXML)
+    klass_s = XMLStyleParser.available_xml_parsers.select {|v| implemented.include?(v) }.first
+    case klass_s
+    when 'AXML'
+      #puts "parsing with AXML (XMLParser based)" if $VERBOSE
+      @get_root_node_from_file = Proc.new do |file|
+        AXML.parse_file(file)
+      end
+    when 'LibXML'  # LibXML is buggy on some machines...
+      #puts "parsing with LibXML" if $VERBOSE
+      @get_root_node_from_file = Proc.new do |file|
+        doc = XML::Document.file(file)
+        doc.root
+      end
+    else
+      raise NotImplementedError, "Can only parse with #{implemented.join(', ')} right now"
+    end
+  end
+  # returns the spec_id object
+  def spec_id(file, opts={})
+    raise NotImplementedError, "cannot do #{@version} yet" if @version != '4'
+    spec_id_obj =
+      if x = opts[:spec_id]
+        x
+      else
+        Proph::ProtSummary.new
+      end
+    protein_summary_n = @get_root_node_from_file.call(file)
+    #protein_summary_n = scan_for_first(doc, 'protein_summary')
+    # protein_summary_header_n = protein_summary_n.child
+    # could grab some of this info if we wanted...
+    pep_hash = {}
+    prot_hash = {}
+    protein_groups = []
+    # get all the proteins from inside protein groups
+    protein_group_name = 'protein_group'
+    get_protein_summary_header = true
+    protein_summary_n.each do |protein_group_n|
+      if get_protein_summary_header
+        protein_summary_header_n = protein_group_n
+        get_protein_summary_header = false
+      elsif protein_group_n.name == protein_group_name
+        protein_groups << get_proteins(protein_group_n, pep_hash, prot_hash)
+      end
+    end
+    # need to finalize hash stuff
+    pep_hash.each do |k,pep|
+      new_prots = []
+      pep.prots.each do |prot_or_string|
+        if prot_or_string.is_a?(Proph::Prot)
+          new_prots << prot_or_string
+        else
+          prt = prot_hash[prot_or_string]
+          if prt.nil?
+            # this is an indistinguishable protein!
+          else
+            new_prots << prt
+          end
+        end
+      end
+      pep.prots = new_prots
+    end
+    spec_id_obj.peps = pep_hash.values
+    spec_id_obj.prots = prot_hash.values
+    spec_id_obj.prot_groups = protein_groups
+    spec_id_obj
+  end
+  # takes a Y or N and gives true/false
+  def booleanize(string)
+    case string
+    when 'Y'
+      true
+    when 'N'
+      false
+    else
+      nil
+    end
+  end
+  # assumes that all the rest of the nodes are protein_groups
+  # pep_hash is hashed on aaseq OR modified peptide amino acid sequence (if
+  # modified) + charge
+  # (as far as I can tell, all protein entries are unique!)
+  # returns a ProtGroup object
+  def get_proteins(protein_group_node, pep_hash, prot_hash)
+    protein_group_proteins = []
+    protein_group_node.each do |protein_n|
+      raise(Exception, "not expecting anything but protein's, got: #{protein_n.name}") if protein_n.name != 'protein'
+      # probability peps protein_name n_indistinguishable_proteins percent_coverage unique_stripped_peptides group_sibling_id total_number_peptides pct_spectrum_ids description
+      # get the description
+      # INITIALIZE the protein and set key
+      n = protein_n
+      protein_name = n['protein_name']
+      peps = []
+      protein = Proph::Prot.new( [protein_name, n['probability'].to_f,
+                      n['n_indistinguishable_proteins'].to_i,
+                      n['percent_coverage'].to_f,
+                      n['unique_stripped_peptides'].split(Split_unique_stripped_peptides_re),
+                      n['group_sibling_id'], n['total_number_peptides'].to_i,
+                      n['pct_spectrum_ids'].to_f, nil,
+                      peps ])
+      protein_group_proteins << protein
+      prot_hash[protein_name] = protein
+      # traverse through the peptides (and annotation)
+      protein_n.each do |protein_sub_n|
+        # create a proteins array for each peptide
+        proteins = [protein]
+        if protein_sub_n.name == 'annotation'
+          protein.description = protein_sub_n['protein_description']
+        end
+        if protein_sub_n.name == 'peptide'
+          peptide_n = protein_sub_n
+          # peptide_sequence charge initial_probability nsp_adjusted_probability weight is_nondegenerate_evidence n_enzymatic_termini n_sibling_peptides n_sibling_peptides_bin n_instances is_contributing_evidence calc_neutral_pep_mass modification_info prots
+          # get modifications, if any
+          n = peptide_n
+          peptide_sequence = n['peptide_sequence']
+          charge = n['charge'].to_i
+          # GET list of all proteins and modifications
+          mod_info = nil
+          peptide_hash_string = peptide_sequence
+          if peptide_n.child?
+            peptide_n.each do |pep_sub_n|
+              case pep_sub_n.name
+              when 'peptide_parent_protein'
+                # NOTE! the proteins list will have strings until the assoc.
+                # prot is found!
+                proteins << pep_sub_n['protein_name']
+              when 'modification_info'
+                masses = pep_sub_n.map do |mod_aa_mass_n|
+                Sequest::PepXML::SearchHit::ModificationInfo::ModAminoacidMass.new([mod_aa_mass_n['position'].to_i, mod_aa_mass_n['mass'].to_f])
+                end
+                peptide_hash_string = pep_sub_n['modified_peptide']
+              mod_info = Sequest::PepXML::SearchHit::ModificationInfo.new([peptide_hash_string, masses])
+              end
+            end
+          end
+          key = [peptide_hash_string, charge]
+          peptide =
+            if pep_hash.key? key
+              pep_hash[key]
+            else
+              pep = Proph::Prot::Pep.new([peptide_sequence, charge,
+                             n['initial_probability'].to_f, n['nsp_adjusted_probability'].to_f,
+                             n['weight'].to_f, booleanize(n['is_nondegenerate_evidence']),
+                             n['n_enzymatic_termini'].to_i, n['n_sibling_peptides'].to_f,
+                             n['n_sibling_peptides'].to_i, n['n_instances'].to_i,
+                             booleanize(n['is_contributing_evidence']),
+                             n['calc_neutral_pep_mass'].to_f, mod_info, proteins] )
+              pep_hash[key] = pep
+              pep
+            end
+          peps << peptide
+        end
+      end  # end protein children
+    end
+    Proph::ProtGroup.new(:prots => protein_group_proteins, :group_number => protein_group_node['group_number'].to_i, :probability => protein_group_node['probability'].to_f)
+  end
+  def parse(file, opts)
+    send(@method, file, opts)
+  end
+end