RubyGems - mspire - Versions diffs - 0.2.4 → 0.3.0 - Mend

mspire 0.2.4 → 0.3.0

Files changed (233) hide show

data/INSTALL +1 -0
data/README +25 -0
data/Rakefile +129 -40
data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
data/bin/bioworks_to_pepxml.rb +1 -0
data/bin/fasta_shaker.rb +1 -96
data/bin/filter_and_validate.rb +5 -0
data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
data/bin/prob_validate.rb +6 -0
data/bin/raw_to_mzXML.rb +2 -2
data/bin/srf_group.rb +1 -0
data/bin/srf_to_sqt.rb +40 -0
data/changelog.txt +68 -0
data/lib/align/chams.rb +6 -6
data/lib/align.rb +4 -3
data/lib/bsearch.rb +120 -0
data/lib/fasta.rb +318 -86
data/lib/group_by.rb +10 -0
data/lib/index_by.rb +11 -0
data/lib/merge_deep.rb +21 -0
data/lib/{spec → ms/converter}/mzxml.rb +77 -109
data/lib/ms/gradient_program.rb +171 -0
data/lib/ms/msrun.rb +209 -0
data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
data/lib/ms/parser/mzdata/axml.rb +12 -0
data/lib/ms/parser/mzdata/dom.rb +160 -0
data/lib/ms/parser/mzdata/libxml.rb +7 -0
data/lib/ms/parser/mzdata.rb +25 -0
data/lib/ms/parser/mzxml/axml.rb +11 -0
data/lib/ms/parser/mzxml/dom.rb +159 -0
data/lib/ms/parser/mzxml/hpricot.rb +253 -0
data/lib/ms/parser/mzxml/libxml.rb +15 -0
data/lib/ms/parser/mzxml/regexp.rb +122 -0
data/lib/ms/parser/mzxml/rexml.rb +72 -0
data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
data/lib/ms/parser/mzxml.rb +175 -0
data/lib/ms/parser.rb +108 -0
data/lib/ms/precursor.rb +10 -0
data/lib/ms/scan.rb +81 -0
data/lib/ms/spectrum.rb +193 -0
data/lib/ms.rb +10 -0
data/lib/mspire.rb +4 -0
data/lib/roc.rb +61 -1
data/lib/sample_enzyme.rb +31 -8
data/lib/scan_i.rb +21 -0
data/lib/spec_id/aa_freqs.rb +7 -3
data/lib/spec_id/bioworks.rb +20 -14
data/lib/spec_id/digestor.rb +139 -0
data/lib/spec_id/mass.rb +116 -0
data/lib/spec_id/parser/proph.rb +236 -0
data/lib/spec_id/precision/filter/cmdline.rb +209 -0
data/lib/spec_id/precision/filter/interactive.rb +134 -0
data/lib/spec_id/precision/filter/output.rb +147 -0
data/lib/spec_id/precision/filter.rb +623 -0
data/lib/spec_id/precision/output.rb +60 -0
data/lib/spec_id/precision/prob/cmdline.rb +139 -0
data/lib/spec_id/precision/prob/output.rb +88 -0
data/lib/spec_id/precision/prob.rb +171 -0
data/lib/spec_id/proph/pep_summary.rb +92 -0
data/lib/spec_id/proph/prot_summary.rb +484 -0
data/lib/spec_id/proph.rb +2 -466
data/lib/spec_id/protein_summary.rb +2 -2
data/lib/spec_id/sequest/params.rb +316 -0
data/lib/spec_id/sequest/pepxml.rb +1513 -0
data/lib/spec_id/sequest.rb +2 -1672
data/lib/spec_id/srf.rb +445 -177
data/lib/spec_id.rb +183 -95
data/lib/spec_id_xml.rb +8 -10
data/lib/transmem/phobius.rb +147 -0
data/lib/transmem/toppred.rb +368 -0
data/lib/transmem.rb +157 -0
data/lib/validator/aa.rb +135 -0
data/lib/validator/background.rb +73 -0
data/lib/validator/bias.rb +95 -0
data/lib/validator/cmdline.rb +260 -0
data/lib/validator/decoy.rb +94 -0
data/lib/validator/digestion_based.rb +69 -0
data/lib/validator/probability.rb +48 -0
data/lib/validator/prot_from_pep.rb +234 -0
data/lib/validator/transmem.rb +272 -0
data/lib/validator/true_pos.rb +46 -0
data/lib/validator.rb +214 -0
data/lib/xml.rb +38 -0
data/lib/xml_style_parser.rb +105 -0
data/lib/xmlparser_wrapper.rb +19 -0
data/script/compile_and_plot_smriti_final.rb +97 -0
data/script/extract_gradient_programs.rb +56 -0
data/script/get_apex_values_rexml.rb +44 -0
data/script/mzXML2timeIndex.rb +1 -1
data/script/smriti_final_analysis.rb +103 -0
data/script/toppred_to_yaml.rb +47 -0
data/script/tpp_installer.rb +1 -1
data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
data/specs/bin/fasta_shaker_spec.rb +259 -0
data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
data/specs/bin/filter_and_validate_spec.rb +124 -0
data/specs/bin/ms_to_lmat_spec.rb +34 -0
data/specs/bin/prob_validate_spec.rb +62 -0
data/specs/bin/protein_summary_spec.rb +10 -0
data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
data/specs/gi_spec.rb +22 -0
data/specs/load_bin_path.rb +7 -0
data/specs/merge_deep_spec.rb +13 -0
data/specs/ms/gradient_program_spec.rb +77 -0
data/specs/ms/msrun_spec.rb +455 -0
data/specs/ms/parser_spec.rb +92 -0
data/specs/ms/spectrum_spec.rb +89 -0
data/specs/roc_spec.rb +251 -0
data/specs/rspec_autotest.rb +149 -0
data/specs/sample_enzyme_spec.rb +41 -0
data/specs/spec_helper.rb +133 -0
data/specs/spec_id/aa_freqs_spec.rb +52 -0
data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
data/specs/spec_id/digestor_spec.rb +75 -0
data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
data/specs/spec_id/precision/filter/output_spec.rb +31 -0
data/specs/spec_id/precision/filter_spec.rb +243 -0
data/specs/spec_id/precision/prob_spec.rb +111 -0
data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
data/specs/spec_id/sequest/params_spec.rb +68 -0
data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
data/specs/spec_id/sqt_spec.rb +138 -0
data/specs/spec_id/srf_spec.rb +209 -0
data/specs/spec_id/srf_spec_helper.rb +302 -0
data/specs/spec_id_helper.rb +33 -0
data/specs/spec_id_spec.rb +361 -0
data/specs/spec_id_xml_spec.rb +33 -0
data/specs/transmem/phobius_spec.rb +423 -0
data/specs/transmem/toppred_spec.rb +297 -0
data/specs/transmem_spec.rb +60 -0
data/specs/transmem_spec_shared.rb +64 -0
data/specs/validator/aa_spec.rb +107 -0
data/specs/validator/background_spec.rb +51 -0
data/specs/validator/bias_spec.rb +146 -0
data/specs/validator/decoy_spec.rb +51 -0
data/specs/validator/fasta_helper.rb +26 -0
data/specs/validator/prot_from_pep_spec.rb +141 -0
data/specs/validator/transmem_spec.rb +145 -0
data/specs/validator/true_pos_spec.rb +58 -0
data/specs/validator_helper.rb +33 -0
data/specs/xml_spec.rb +12 -0
data/test_files/000_pepxml18_small.xml +206 -0
data/test_files/020a.mzXML.timeIndex +4710 -0
data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
data/test_files/4-03-03_small-prot.xml +321 -0
data/test_files/4-03-03_small.xml +3876 -0
data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
data/test_files/bioworks-3.3_10prots.xml +5999 -0
data/test_files/bioworks31.params +77 -0
data/test_files/bioworks32.params +62 -0
data/test_files/bioworks33.params +63 -0
data/test_files/bioworks_single_run_small.xml +7237 -0
data/test_files/bioworks_small.fasta +212 -0
data/test_files/bioworks_small.params +63 -0
data/test_files/bioworks_small.phobius +109 -0
data/test_files/bioworks_small.toppred.out +2847 -0
data/test_files/bioworks_small.xml +5610 -0
data/test_files/bioworks_with_INV_small.xml +3753 -0
data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
data/test_files/corrupted_900.srf +0 -0
data/test_files/head_of_7MIX.srf +0 -0
data/test_files/interact-opd1_mods_small-prot.xml +304 -0
data/test_files/messups.fasta +297 -0
data/test_files/opd1/000.my_answer.100lines.xml +101 -0
data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
data/test_files/opd1/000_020-prot.png +0 -0
data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
data/test_files/opd1/000_020_3prots-prot.xml +62 -0
data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
data/test_files/opd1/sequest.3.1.params +77 -0
data/test_files/opd1/sequest.3.2.params +62 -0
data/test_files/opd1/twenty_scans.mzXML +418 -0
data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
data/test_files/opd1/twenty_scans_answ.lmat +0 -0
data/test_files/opd1/twenty_scans_answ.lmata +9 -0
data/test_files/opd1_020_beginning.RAW +0 -0
data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
data/test_files/pepproph_small.xml +4691 -0
data/test_files/phobius.small.noheader.txt +50 -0
data/test_files/phobius.small.small.txt +53 -0
data/test_files/s01_anC1_ld020mM.key.txt +25 -0
data/test_files/s01_anC1_ld020mM.meth +0 -0
data/test_files/small.fasta +297 -0
data/test_files/smallraw.RAW +0 -0
data/test_files/tf_bioworks2excel.bioXML +14340 -0
data/test_files/tf_bioworks2excel.txt.actual +1035 -0
data/test_files/toppred.small.out +416 -0
data/test_files/toppred.xml.out +318 -0
data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
data/test_files/yeast_gly_small-prot.xml +265 -0
data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
data/test_files/yeast_gly_small.xml +3807 -0
data/test_files/yeast_gly_small2.parentTimes +6 -0
metadata +273 -57
data/bin/filter.rb +0 -6
data/bin/precision.rb +0 -5
data/lib/spec/mzdata/parser.rb +0 -108
data/lib/spec/mzdata.rb +0 -48
data/lib/spec/mzxml/parser.rb +0 -449
data/lib/spec/scan.rb +0 -55
data/lib/spec_id/filter.rb +0 -797
data/lib/spec_id/precision.rb +0 -421
data/lib/toppred.rb +0 -18
data/script/filter-peps.rb +0 -164
data/test/tc_aa_freqs.rb +0 -59
data/test/tc_fasta_shaker.rb +0 -149
data/test/tc_filter.rb +0 -203
data/test/tc_filter_peps.rb +0 -46
data/test/tc_gi.rb +0 -17
data/test/tc_id_class_anal.rb +0 -70
data/test/tc_id_precision.rb +0 -89
data/test/tc_msrun.rb +0 -88
data/test/tc_mzxml.rb +0 -88
data/test/tc_mzxml_to_lmat.rb +0 -36
data/test/tc_peptide_parent_times.rb +0 -27
data/test/tc_precision.rb +0 -60
data/test/tc_roc.rb +0 -166
data/test/tc_sample_enzyme.rb +0 -32
data/test/tc_scan.rb +0 -26
data/test/tc_sequest.rb +0 -336
data/test/tc_spec.rb +0 -78
data/test/tc_spec_id.rb +0 -201
data/test/tc_spec_id_xml.rb +0 -36
data/test/tc_srf.rb +0 -262

data/lib/spec_id.rb CHANGED Viewed

@@ -1,75 +1,19 @@
 require 'ostruct'
 require 'set'
 require 'hash_by'
-require 'spec_id/precision'
 require 'roc'
 require 'sample_enzyme'  # for others
 require 'spec_id/bioworks'
 require 'spec_id/sequest'
-require 'spec_id/proph'
+require 'spec_id/proph/prot_summary'
 require 'spec_id_xml'
+require 'spec_id/mass'
+require 'fasta'
+module ProteinReferenceable ; end
 class SampleEnzyme ; end
-class Mass
-  # http://expasy.org/tools/findmod/findmod_masses.html
-  # still need to add the modifications
-  MONO = {
-    :A => 71.03711,
-    :R => 156.10111,
-    :N => 114.04293,
-    :D => 115.02694,
-    :C => 103.00919,
-    :E => 129.04259,
-    :Q => 128.05858,
-    :G => 57.02146,
-    :H => 137.05891,
-    :I => 113.08406,
-    :L => 113.08406,
-    :K => 128.09496,
-    :M => 131.04049,
-    :F => 147.06841,
-    :P => 97.05276,
-    :S => 87.03203,
-    :T => 101.04768,
-    :W => 186.07931,
-    :Y => 163.06333,
-    :V => 99.06841,
-    :h => 1.00783,
-    :h_plus => 1.00728,
-    :o => 15.9949146,
-    :h2o => 18.01056,
-  }
-  AVG = {
-    :A => 71.0788,
-    :R => 156.1875,
-    :N => 114.1038,
-    :D => 115.0886,
-    :C => 103.1388,
-    :E => 129.1155,
-    :Q => 128.1307,
-    :G => 57.0519,
-    :H => 137.1411,
-    :I => 113.1594,
-    :L => 113.1594,
-    :K => 128.1741,
-    :M => 131.1926,
-    :F => 147.1766,
-    :P => 97.1167,
-    :S => 87.0782,
-    :T => 101.1051,
-    :W => 186.2132,
-    :Y => 163.1760,
-    :V => 99.1326,
-    :h => 1.00794,
-    :h_plus => 1.00739,
-    :o => 15.9994,
-    :h2o => 18.01524,
-  }
-end
 module SpecID ; end
@@ -91,8 +35,14 @@ module SpecID
   # Will return a SpecID object (really, the object corresponding to the
   # file type which mixes in SpecID [is_a?(SpecID) == true])
   # If no file is given, will return a GenericSpecID object.
+  # If file is an array, this is assumed to be a group of srf files which is
+  # converted into an SRFGroup Object and run.
   def self.new(file=nil, tp=nil)
-    if file
+    # this will need to be specialized for other groups later
+    if file.is_a?(Array)
+      # takes an array of srf filenames
+      SRFGroup.new(file)
+    elsif file
       from_file(file, tp)
     else
       GenericSpecID.new
@@ -100,22 +50,27 @@ module SpecID
   end
   # tp = file_type
-  # only takes an array if they are srf files!
+  # a single srf file will be packaged into an SRFGroup object
   def self.from_file(file, tp=nil)
     obj = nil
     unless tp
       tp = file_type(file)
     end
     obj = case tp
+    when 'srf'
+      #@hi_prob_best = false
+      SRFGroup.new([file])
     when 'srg'
-      @hi_prob_best = false
+      #@hi_prob_best = false
       SRFGroup.new(file)
     when 'bioworks'
-      @hi_prob_best = false
+      #@hi_prob_best = false
       Bioworks.new(file)
     when 'protproph'
-      @hi_prob_best = true
+      #@hi_prob_best = true
       Proph::ProtSummary.new(file)
+    when 'pepproph'
+      Proph::PepSummary.new(file)
     else
       abort "UNRECOGNIZED file type for #{file}"
     end
@@ -123,9 +78,76 @@ module SpecID
   end
   def inspect
-    "<#{self.class} #peps=\"#{peps.size}\">"
+    peps_string =
+      if peps
+        "peps(#)=#{peps.size}"
+      else
+        "peps=(nil)"
+      end
+    "<#{self.class} #{peps_string}>"
+  end
+  # given some list of SpecID::Pep based objects, returns the list of proteins
+  # associated with those peptides
+  # kind must be a symbol:
+  # :no_update (current proteins are returned, but their peps attribute
+  # is not updated)
+  # :update (current proteins returned with peps attribute updated)
+  # :new (new proteins are created complete with peps attribute)
+  def self.protein_list(pephits, kind=:no_update)
+    orig_pephits_prts = []
+    if kind == :new
+      new_prots = {}
+      pephits.each_with_index do |pep,i|
+        orig_pephits_prts[i] = pep.prots
+        peps_new_prts = pep.prots.map do |prt|
+          if new_prots.key? prt.reference
+            already_exists = new_prots[prt.reference]
+          else
+            np = prt.dup
+            np.peps = []
+            new_prots[np.reference] = np
+            np
+          end
+        end
+        pep.prots = peps_new_prts
+      end
+    end
+    if kind == :update
+      pephits.each do |pep|
+        pep.prots.each do |prt|
+          prt.peps = []
+        end
+      end
+    end
+    prot_set = {}
+    pephits.each do |pep|
+      prts = pep.prots
+      prts.each do |prt|
+        prot_set[ prt.reference ] = prt
+      end
+      if (kind == :update || kind == :new)
+        prts.each do |prt|
+          prt.peps << pep
+        end
+      end
+    end
+    ## Reset the original protein hits
+    if kind == :new
+      pephits.each_with_index do |pep,i|
+        pep.prots = orig_pephits_prts[i]
+      end
+    end
+    prot_set.values
   end
   # takes a comma-separated list or array and extends the last item to create
   # an array of the desired size
   def self.extend_args(arg, desired_size)
@@ -193,13 +215,6 @@ module SpecID
     prot_triplets
   end
-  ## basically, this is the command line wrapper
-  def self.precision(argv)
-    Prec.new.run_cmd_line(argv)
-  end
   # returns number of true positives (array) and the specified output (as
   # parallel array).  Requires the classification method and a sorted array of
   # tp values and an array fp values.
@@ -223,55 +238,100 @@ module SpecID
     pps
   end
-  def classify_by_regex(items, regex, fp_on_match=true)
-    case items
-    when :prots
+  def self.prots?(ar)
+    ar.first.is_a? SpecID::Prot
+  end
+  def self.peps?(ar)
+    ar.first.is_a? SpecID::Pep
+  end
+  # for older stuff
+  def classify_by_regex(items, regex, decoy_on_match=true, ties=:both)
+    objects =
+      case items
+      when :prots
+        prots
+      when :peps
+        peps
+      end
+    SpecID.classify_by_prot(objects, regex, decoy_on_match, ties)
+  end
+  # includes the peptide hit in both
+  # returns (target, decoy)
+  # (for peps) ties can be :both, true (target wins), false (decoy wins)
+  # regardless of ties behavior, will partition out the proteins to be
+  # appropriate for the peptide
+  def self.classify_by_prot(items, regex, decoy_on_match=true, ties=:both)
+    if items.size == 0
+      return [[],[]]
+    elsif prots?(items)
       myproc = proc { |prt|
-        if prt.reference =~ regex ; !fp_on_match
-        else ; fp_on_match end
+        if prt.reference =~ regex ; !decoy_on_match
+        else ; decoy_on_match end
       }
       return classify(items, myproc)
-    when :peps
+    elsif peps?(items)
       match = [] ; nomatch = []
-      peps.each do |pep|
-        match_prots = [] ; nomatch_prots = []
-        (hit, nohit) = pep.prots.partition do |prot|
+      items.each do |pep|
+        (match_prots, nomatch_prots) = pep.prots.partition do |prot|
           prot.reference =~ regex
         end
-        if hit.size == 0
+        if match_prots.size == 0
           nomatch << pep
-        elsif nohit.size == 0
+        elsif nomatch_prots.size == 0
           match << pep
         else ## both have hits
           pep.prots = match_prots
           nomatch_pep = pep.dup
           nomatch_pep.prots = nomatch_prots
-          match << pep
-          nomatch << pep
+          # resolve ties
+          case ties
+          when true
+            if decoy_on_match
+              nomatch << pep
+            else
+              match << pep
+            end
+          when false
+            if decoy_on_match
+              match << pep
+            else
+              nomatch << pep
+            end
+          when :both
+            match << pep
+            nomatch << pep
+          else ; raise ArgumentError
+          end
         end
       end
-      if fp_on_match
+      if decoy_on_match
         return [nomatch , match]
       else
         return [match, nomatch]
       end
     else
-      abort "don't recognize "
+      raise ArgumentError, "arg1 is ar of objects descended from SpecID::Prot/Pep"
     end
   end
   # returns [tp, fp] based on the protein prefix for items where items =
   # (:prots|:peps)
   # this may result in a duplication of some peptides if they match both
   # normal and decoy proteins.  In this case, the protein arrays are split,
   # too, so that each points only to its breed of protein.
-  def classify_by_false_flag(items, flag, fp_on_match=true, prefix=false)
+  def classify_by_decoy_flag(items, flag, decoy_on_match=true, prefix=false)
     if prefix
       regex = /^#{Regexp.escape(flag)}/
     else
       regex = /#{Regexp.escape(flag)}/
     end
-    classify_by_regex(items, regex, fp_on_match)
+    classify_by_regex(items, regex, decoy_on_match)
   end
   # Returns (match, nomatch)
@@ -303,7 +363,7 @@ module SpecID
         classify_item_by.call(item) ]
     end
     roc = ROC.new
-    tp, fp = roc.prep_list(doublets)
+    tp, fp = roc.doublets_to_separate(doublets)
     return tp, fp
   end
@@ -393,11 +453,13 @@ module SpecID
     end
     File.open(file) do |fh|
       lines = ""
-      4.times { lines << fh.readline }
+      8.times { lines << fh.readline }
       if lines =~ /<bioworksinfo>/
         return 'bioworks'
-      elsif lines =~ /<protein_summary/ && lines =~ /xmlns="http:\/\/regis-web.systemsbiology.net\/protXML"/
+      elsif ((lines =~ /<protein_summary/) and ((lines =~ Proph::ProtSummary::Filetype_and_version_re_old) or (lines =~ Proph::ProtSummary::Filetype_and_version_re_new)))
         return 'protproph'
+      elsif lines =~ /<msms_pipeline_analysis.*<peptideprophet_summary/m
+        return 'pepproph'
       end
     end
   end
@@ -521,9 +583,10 @@ module SpecID
   end
 end
 # A Generic spectraID protein
 module SpecID::Prot
+  include ProteinReferenceable
   # probability is always a float!
   attr_accessor :probability, :reference, :peps
@@ -531,6 +594,14 @@ module SpecID::Prot
     self.reference <=> other.reference
   end
+  def inspect
+    pep_string =
+      if peps
+      ", @peps(#)=#{peps.size}"
+      end
+    "<#{self.class} @probability=#{probability}, @reference=#{reference}#{pep_string}>"
+  end
 end
 module SpecID::Pep
@@ -653,6 +724,23 @@ module SpecID::Pep
     when :mmu
     end
   end
+  # calls the method associated with each key and returns the value
+  def values_at(*args)
+    args.map do |arg|
+      send(arg)
+    end
+  end
+  def inspect
+    prot_string =
+      if prots
+      ", @prots(#)=#{prots.size}"
+      end
+    "<#{self.class} @probability=#{probability}, @sequence=#{sequence}, @aaseq=#{aaseq}, @charge=#{charge}#{prot_string}>"
+  end
 end
 class SpecID::GenericProt

data/lib/spec_id_xml.rb CHANGED Viewed

@@ -6,7 +6,7 @@
 # concatenation into a file
 module SpecIDXML
-  Special_chrs_hash = {
+  MSial_chrs_hash = {
     '"' => '&quot;',
     '&' => '&amp;',
     "'" => '&apos;',
@@ -17,8 +17,8 @@ module SpecIDXML
   # substitutes special xml chars
   def escape_special_chars(string)
     string.split('').map do |char|
-      if Special_chrs_hash.key? char ; Special_chrs_hash[char]
-        # if x = Special_chrs_hash[char] ; x  # <-- that's slightly slower
+      if MSial_chrs_hash.key? char ; MSial_chrs_hash[char]
+        # if x = MSial_chrs_hash[char] ; x  # <-- that's slightly slower
       else ; char end
     end.join
   end
@@ -33,13 +33,13 @@ module SpecIDXML
   end
-  def param_xml(symbol)
-    tabs + '<parameter name="' + "#{symbol}" + '" value="' + "#{send(symbol)}" + '"/>'
+  def param_xml(obj, symbol)
+    tabs + '<parameter name="' + "#{symbol}" + '" value="' + "#{obj.send(symbol)}" + '"/>'
   end
-  def params_xml(*symbol_list)
+  def params_xml(obj, *symbol_list)
     symbol_list.collect { |sy|
-      param_xml(sy)
+      param_xml(obj, sy)
     }.join("\n") + "\n"
   end
@@ -92,9 +92,7 @@ module SpecIDXML
   end
   def attrs_xml(list_of_symbols)
-    list_of_symbols.collect {|sy|
-      attr_xml(sy)
-    }.join(" ")
+    list_of_symbols.collect {|sy| attr_xml(sy) }.join(" ")
   end
 end

data/lib/transmem/phobius.rb ADDED Viewed

@@ -0,0 +1,147 @@
+require 'transmem'
+class Phobius ; end
+# This class will probably change its interface somewhat in the future.
+# The Phobius web portal is at:
+# http://phobius.cgb.ki.se/
+# How to run:
+# Select output format as 'Short'
+# then hit 'Submit Query'
+# note: to implement some of the TransmemIndex features, the update_aaseq
+# method must be called!
+class Phobius::Index < Hash
+  include TransmemIndex
+  # will update_aaseq if given a fasta_obj
+  def initialize(file, fasta_obj = nil )
+    Phobius.default_index(file, self)
+    if fasta_obj
+      update_aaseq(fasta_obj)
+    end
+  end
+  # we need to match whatever function toppred uses to generate identifiers if
+  # we want derivative processes to be fast and accurate
+  def reference_to_key(reference)
+    if reference
+      if reference.size > 0
+        index = reference.index(' ')
+        string =
+          if index
+            reference[0...index]
+          else
+            reference
+          end
+        string.gsub('"','')
+      else
+        ''
+      end
+    else
+      nil
+    end
+  end
+  # adds an :aaseq key to each hash (necessary for avg_overlap method)
+  # these are shallow references to the aaseq in the fasta obj
+  def update_aaseq(fasta)
+    fasta.each do |prot|
+      self[reference_to_key(prot.reference)][:aaseq] = prot.aaseq
+    end
+  end
+end
+class Phobius
+  include TransmemIndex
+  # returns the default index
+  def self.default_index(file, index={})
+    parser = Phobius::Parser.new(:short)
+    parser.file_to_index(file, index)
+  end
+end
+module Phobius::Parser
+  def self.new(parser_type=:short)
+    klass =
+      case parser_type
+      when :short
+        Phobius::ParserShort
+      else
+        raise ArgumentError, "don't recognize parser type: #{parser_type}"
+      end
+    klass.new
+  end
+  def file_to_index(file, index={})
+    File.open(file) {|fh| to_index(fh, index) }
+  end
+end
+class Phobius::ParserShort
+  include Phobius::Parser
+  # takes a phobius prediction string (e.g., i12-31o37-56i63-84o96-116i123-143o149-169i)
+  # and returns an array of hashes with the keys :start and :stop
+  def prediction_to_array(string)
+    segments = []
+    string.scan(/[io](\d+)-(\d+)/) do |m1, m2|
+      segments << { :start => m1.to_i, :stop => m2.to_i }
+    end
+    segments
+  end
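+  # returns a hash structure in this form: { identifier => {
+  # :num_certain_transmembrane_segments => Int, :signal_peptide => true|false,
+  # :transmembrane_segments => [{:start => Int, :stop => Int}, ...] } }
+  # (:transmembrane_segments is only set when the segment count is > 0)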
+  # can parse io even if there is no header to key in on.
+  def to_index(io, index={})
+    init_pos = io.pos
+    cnt = 0
+    found_header = false
+    loop do
+      if io.gets =~ /SEQENCE/
+        found_header = true
+        break
+      end
+      cnt += 1
+      break if cnt > 10
+    end
+    if !found_header
+      io.pos = init_pos
+    end
+    current_record = nil
+    io.each do |line|
+      line.chomp!
+      # grab values
+      ar = line.split(/\s+/)
+      next if ar.size != 4
+      (key, num_tms, signal_peptide, prediction) = ar
+      # cast the values
+      num_tms = num_tms.to_i
+      signal_peptide =
+        case signal_peptide
+        when 'Y'
+          true
+        when '0'
+          false
+        end
+      index[key] = {
+        :num_certain_transmembrane_segments => num_tms,
+        :signal_peptide => signal_peptide,
+      }
+      if num_tms > 0
+        index[key][:transmembrane_segments] = prediction_to_array(prediction)
+      end
+    end
+    index
+  end
+end