RubyGems - mspire - Versions diffs - 0.2.4 → 0.3.0 - Mend

mspire 0.2.4 → 0.3.0

Files changed (233) hide show

data/INSTALL +1 -0
data/README +25 -0
data/Rakefile +129 -40
data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
data/bin/bioworks_to_pepxml.rb +1 -0
data/bin/fasta_shaker.rb +1 -96
data/bin/filter_and_validate.rb +5 -0
data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
data/bin/prob_validate.rb +6 -0
data/bin/raw_to_mzXML.rb +2 -2
data/bin/srf_group.rb +1 -0
data/bin/srf_to_sqt.rb +40 -0
data/changelog.txt +68 -0
data/lib/align/chams.rb +6 -6
data/lib/align.rb +4 -3
data/lib/bsearch.rb +120 -0
data/lib/fasta.rb +318 -86
data/lib/group_by.rb +10 -0
data/lib/index_by.rb +11 -0
data/lib/merge_deep.rb +21 -0
data/lib/{spec → ms/converter}/mzxml.rb +77 -109
data/lib/ms/gradient_program.rb +171 -0
data/lib/ms/msrun.rb +209 -0
data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
data/lib/ms/parser/mzdata/axml.rb +12 -0
data/lib/ms/parser/mzdata/dom.rb +160 -0
data/lib/ms/parser/mzdata/libxml.rb +7 -0
data/lib/ms/parser/mzdata.rb +25 -0
data/lib/ms/parser/mzxml/axml.rb +11 -0
data/lib/ms/parser/mzxml/dom.rb +159 -0
data/lib/ms/parser/mzxml/hpricot.rb +253 -0
data/lib/ms/parser/mzxml/libxml.rb +15 -0
data/lib/ms/parser/mzxml/regexp.rb +122 -0
data/lib/ms/parser/mzxml/rexml.rb +72 -0
data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
data/lib/ms/parser/mzxml.rb +175 -0
data/lib/ms/parser.rb +108 -0
data/lib/ms/precursor.rb +10 -0
data/lib/ms/scan.rb +81 -0
data/lib/ms/spectrum.rb +193 -0
data/lib/ms.rb +10 -0
data/lib/mspire.rb +4 -0
data/lib/roc.rb +61 -1
data/lib/sample_enzyme.rb +31 -8
data/lib/scan_i.rb +21 -0
data/lib/spec_id/aa_freqs.rb +7 -3
data/lib/spec_id/bioworks.rb +20 -14
data/lib/spec_id/digestor.rb +139 -0
data/lib/spec_id/mass.rb +116 -0
data/lib/spec_id/parser/proph.rb +236 -0
data/lib/spec_id/precision/filter/cmdline.rb +209 -0
data/lib/spec_id/precision/filter/interactive.rb +134 -0
data/lib/spec_id/precision/filter/output.rb +147 -0
data/lib/spec_id/precision/filter.rb +623 -0
data/lib/spec_id/precision/output.rb +60 -0
data/lib/spec_id/precision/prob/cmdline.rb +139 -0
data/lib/spec_id/precision/prob/output.rb +88 -0
data/lib/spec_id/precision/prob.rb +171 -0
data/lib/spec_id/proph/pep_summary.rb +92 -0
data/lib/spec_id/proph/prot_summary.rb +484 -0
data/lib/spec_id/proph.rb +2 -466
data/lib/spec_id/protein_summary.rb +2 -2
data/lib/spec_id/sequest/params.rb +316 -0
data/lib/spec_id/sequest/pepxml.rb +1513 -0
data/lib/spec_id/sequest.rb +2 -1672
data/lib/spec_id/srf.rb +445 -177
data/lib/spec_id.rb +183 -95
data/lib/spec_id_xml.rb +8 -10
data/lib/transmem/phobius.rb +147 -0
data/lib/transmem/toppred.rb +368 -0
data/lib/transmem.rb +157 -0
data/lib/validator/aa.rb +135 -0
data/lib/validator/background.rb +73 -0
data/lib/validator/bias.rb +95 -0
data/lib/validator/cmdline.rb +260 -0
data/lib/validator/decoy.rb +94 -0
data/lib/validator/digestion_based.rb +69 -0
data/lib/validator/probability.rb +48 -0
data/lib/validator/prot_from_pep.rb +234 -0
data/lib/validator/transmem.rb +272 -0
data/lib/validator/true_pos.rb +46 -0
data/lib/validator.rb +214 -0
data/lib/xml.rb +38 -0
data/lib/xml_style_parser.rb +105 -0
data/lib/xmlparser_wrapper.rb +19 -0
data/script/compile_and_plot_smriti_final.rb +97 -0
data/script/extract_gradient_programs.rb +56 -0
data/script/get_apex_values_rexml.rb +44 -0
data/script/mzXML2timeIndex.rb +1 -1
data/script/smriti_final_analysis.rb +103 -0
data/script/toppred_to_yaml.rb +47 -0
data/script/tpp_installer.rb +1 -1
data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
data/specs/bin/fasta_shaker_spec.rb +259 -0
data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
data/specs/bin/filter_and_validate_spec.rb +124 -0
data/specs/bin/ms_to_lmat_spec.rb +34 -0
data/specs/bin/prob_validate_spec.rb +62 -0
data/specs/bin/protein_summary_spec.rb +10 -0
data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
data/specs/gi_spec.rb +22 -0
data/specs/load_bin_path.rb +7 -0
data/specs/merge_deep_spec.rb +13 -0
data/specs/ms/gradient_program_spec.rb +77 -0
data/specs/ms/msrun_spec.rb +455 -0
data/specs/ms/parser_spec.rb +92 -0
data/specs/ms/spectrum_spec.rb +89 -0
data/specs/roc_spec.rb +251 -0
data/specs/rspec_autotest.rb +149 -0
data/specs/sample_enzyme_spec.rb +41 -0
data/specs/spec_helper.rb +133 -0
data/specs/spec_id/aa_freqs_spec.rb +52 -0
data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
data/specs/spec_id/digestor_spec.rb +75 -0
data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
data/specs/spec_id/precision/filter/output_spec.rb +31 -0
data/specs/spec_id/precision/filter_spec.rb +243 -0
data/specs/spec_id/precision/prob_spec.rb +111 -0
data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
data/specs/spec_id/sequest/params_spec.rb +68 -0
data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
data/specs/spec_id/sqt_spec.rb +138 -0
data/specs/spec_id/srf_spec.rb +209 -0
data/specs/spec_id/srf_spec_helper.rb +302 -0
data/specs/spec_id_helper.rb +33 -0
data/specs/spec_id_spec.rb +361 -0
data/specs/spec_id_xml_spec.rb +33 -0
data/specs/transmem/phobius_spec.rb +423 -0
data/specs/transmem/toppred_spec.rb +297 -0
data/specs/transmem_spec.rb +60 -0
data/specs/transmem_spec_shared.rb +64 -0
data/specs/validator/aa_spec.rb +107 -0
data/specs/validator/background_spec.rb +51 -0
data/specs/validator/bias_spec.rb +146 -0
data/specs/validator/decoy_spec.rb +51 -0
data/specs/validator/fasta_helper.rb +26 -0
data/specs/validator/prot_from_pep_spec.rb +141 -0
data/specs/validator/transmem_spec.rb +145 -0
data/specs/validator/true_pos_spec.rb +58 -0
data/specs/validator_helper.rb +33 -0
data/specs/xml_spec.rb +12 -0
data/test_files/000_pepxml18_small.xml +206 -0
data/test_files/020a.mzXML.timeIndex +4710 -0
data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
data/test_files/4-03-03_small-prot.xml +321 -0
data/test_files/4-03-03_small.xml +3876 -0
data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
data/test_files/bioworks-3.3_10prots.xml +5999 -0
data/test_files/bioworks31.params +77 -0
data/test_files/bioworks32.params +62 -0
data/test_files/bioworks33.params +63 -0
data/test_files/bioworks_single_run_small.xml +7237 -0
data/test_files/bioworks_small.fasta +212 -0
data/test_files/bioworks_small.params +63 -0
data/test_files/bioworks_small.phobius +109 -0
data/test_files/bioworks_small.toppred.out +2847 -0
data/test_files/bioworks_small.xml +5610 -0
data/test_files/bioworks_with_INV_small.xml +3753 -0
data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
data/test_files/corrupted_900.srf +0 -0
data/test_files/head_of_7MIX.srf +0 -0
data/test_files/interact-opd1_mods_small-prot.xml +304 -0
data/test_files/messups.fasta +297 -0
data/test_files/opd1/000.my_answer.100lines.xml +101 -0
data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
data/test_files/opd1/000_020-prot.png +0 -0
data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
data/test_files/opd1/000_020_3prots-prot.xml +62 -0
data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
data/test_files/opd1/sequest.3.1.params +77 -0
data/test_files/opd1/sequest.3.2.params +62 -0
data/test_files/opd1/twenty_scans.mzXML +418 -0
data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
data/test_files/opd1/twenty_scans_answ.lmat +0 -0
data/test_files/opd1/twenty_scans_answ.lmata +9 -0
data/test_files/opd1_020_beginning.RAW +0 -0
data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
data/test_files/pepproph_small.xml +4691 -0
data/test_files/phobius.small.noheader.txt +50 -0
data/test_files/phobius.small.small.txt +53 -0
data/test_files/s01_anC1_ld020mM.key.txt +25 -0
data/test_files/s01_anC1_ld020mM.meth +0 -0
data/test_files/small.fasta +297 -0
data/test_files/smallraw.RAW +0 -0
data/test_files/tf_bioworks2excel.bioXML +14340 -0
data/test_files/tf_bioworks2excel.txt.actual +1035 -0
data/test_files/toppred.small.out +416 -0
data/test_files/toppred.xml.out +318 -0
data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
data/test_files/yeast_gly_small-prot.xml +265 -0
data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
data/test_files/yeast_gly_small.xml +3807 -0
data/test_files/yeast_gly_small2.parentTimes +6 -0
metadata +273 -57
data/bin/filter.rb +0 -6
data/bin/precision.rb +0 -5
data/lib/spec/mzdata/parser.rb +0 -108
data/lib/spec/mzdata.rb +0 -48
data/lib/spec/mzxml/parser.rb +0 -449
data/lib/spec/scan.rb +0 -55
data/lib/spec_id/filter.rb +0 -797
data/lib/spec_id/precision.rb +0 -421
data/lib/toppred.rb +0 -18
data/script/filter-peps.rb +0 -164
data/test/tc_aa_freqs.rb +0 -59
data/test/tc_fasta_shaker.rb +0 -149
data/test/tc_filter.rb +0 -203
data/test/tc_filter_peps.rb +0 -46
data/test/tc_gi.rb +0 -17
data/test/tc_id_class_anal.rb +0 -70
data/test/tc_id_precision.rb +0 -89
data/test/tc_msrun.rb +0 -88
data/test/tc_mzxml.rb +0 -88
data/test/tc_mzxml_to_lmat.rb +0 -36
data/test/tc_peptide_parent_times.rb +0 -27
data/test/tc_precision.rb +0 -60
data/test/tc_roc.rb +0 -166
data/test/tc_sample_enzyme.rb +0 -32
data/test/tc_scan.rb +0 -26
data/test/tc_sequest.rb +0 -336
data/test/tc_spec.rb +0 -78
data/test/tc_spec_id.rb +0 -201
data/test/tc_spec_id_xml.rb +0 -36
data/test/tc_srf.rb +0 -262

data/lib/validator.rb ADDED Viewed

@@ -0,0 +1,214 @@
# Shared behaviour for the Validator::* classes that are required at the
# bottom of this file.  The incremental precision code expects the receiver to
# supply +partition+ and +calc_precision+ (neither is defined in this class).
class Validator

  # Maps validator class names (as strings) and shorthand symbols to the short
  # label that sensible_validator_hashes reports under :type.
  Validator_to_string = {
    'Validator::AA' => 'badAA',
    'Validator::Decoy' => 'decoy',
    'Validator::Transmem::Protein' => 'tmm',
    'Validator::TruePos' => 'tps',
    'Validator::Bias' => 'bias',
    'Validator::Probability' => 'prob',
    :bad_aa => 'badAA',
    :decoy => 'decoy',
    :tmm => 'tmm',
    :tps => 'tps',
    :bias => 'bias',
    :prob => 'prob',
  }

  # Resets the running tallies used by increment_pephits_precision and marks
  # them as initialized.
  def initialize_increment
    @increment_tps = 0
    @increment_fps = 0
    @increment_total_submitted = 0
    @increment_initialized = true
  end

  # if adding pephits in groups at a time, the entire group does not need to be
  # queried, just the individual hit.  Use this OR pephits_precision (NOT
  # both).  The initial query to this method will begin a running tally that
  # is saved by the validator.
  # takes either an array or a single pephit (determined by if it is a
  # SpecID::Pep)
  # Returns whatever calc_precision returns for the tallies so far.
  def increment_pephits_precision(peps)
    # defined? avoids the "instance variable not initialized" warning without
    # having to switch the global $VERBOSE off and on again
    initialize_increment unless defined?(@increment_initialized) && @increment_initialized
    to_submit = peps.is_a?(SpecID::Pep) ? [peps] : peps
    @increment_total_submitted += to_submit.size
    tps, fps = partition(to_submit)
    @increment_tps += tps.size
    @increment_fps += fps.size
    num_tps, num_fps =
      if respond_to?(:calc_precision_prep)  # for digestion based validators
        calc_precision_prep(@increment_tps, @increment_fps)
      else
        [@increment_tps, @increment_fps]
      end
    calc_precision(num_tps, num_fps)
  end

  # returns an adjusted number of false positives (a float not to drop below
  # 0.0) based on a background of 'false'-false positive hits to total hits.
  # Also sets the @calculated_background attribute (num_fps / total hits
  # before adjustment).  Accepts floats or ints.
  # NOTE: when both counts are zero @calculated_background becomes NaN.
  def adjust_fps_for_background(num_tps, num_fps, background)
    num_fps = num_fps.to_f
    total_peps = num_tps + num_fps
    @calculated_background = num_fps / total_peps
    num_fps -= (total_peps.to_f * background)
    num_fps = 0.0 if num_fps < 0.0
    num_fps
  end

  # copied from libjtp: vec
  # returns [mean, std_dev] where std_dev is the sample standard deviation
  # (n - 1 denominator; a single value gives 0.0).  An empty array gives NaN
  # for both values.
  def sample_stats(array)
    count = array.size
    sum = 0.0
    sum_of_squares = 0.0
    array.each do |val|
      sum += val
      sum_of_squares += val * val
    end
    variance = sum_of_squares - ((sum * sum) / count)
    variance /= (count > 1 ? count - 1 : 1)
    # on occasion, a very small negative number occurs
    std_dev = variance < 0.0 ? 0.0 : Math.sqrt(variance)
    [sum / count, std_dev]
  end

  # takes an array of validators and returns a fresh array where each has been
  # turned into a sensible hash (with symbols as the keys!)
  # Every hash also carries :type (short label from Validator_to_string) and
  # :class (the class name).  Raises ArgumentError for an unknown validator.
  def self.sensible_validator_hashes(validators)
    validators.map do |val|
      hash = {}
      case val
      when Validator::TruePos
        # merge! (not merge): the values have to land in +hash+ itself
        hash.merge!( {:correct_wins => val.correct_wins, :file => val.fasta.filename } )
      when Validator::AA
        [:frequency, :false_to_total_ratio, :background, :calculated_background].each do |cat|
          hash[cat] = val.send(cat)
        end
      when Validator::Decoy
        [:correct_wins, :decoy_on_match].each do |cat|
          hash[cat] = val.send(cat)
        end
        hash[:constraint] = val.constraint.inspect if val.constraint
      when Validator::Bias
        [:correct_wins, :proteins_expected, :background, :calculated_background, :false_to_total_ratio].each do |cat|
          hash[cat] = val.send(cat)
        end
        hash[:file] = val.fasta.filename
      when Validator::Transmem::Protein
        [:false_to_total_ratio, :min_num_tms, :soluble_fraction, :correct_wins, :no_include_tm_peps, :background, :calculated_background, :transmem_file].each do |cat|
          hash[cat] = val.send(cat)
        end
      when Validator::Probability
        hash[:prob_method] = val.send(:prob_method)
      else ; raise ArgumentError, "Don't know the validator class #{val}"
      end
      klass_as_s = val.class.to_s
      hash[:type] = Validator_to_string[klass_as_s]
      hash[:class] = klass_as_s
      hash
    end
  end

=begin
  ## THIS IS WITH STRINGS AS KEYS!
  # takes an array of validators and returns a fresh array where each has been
  # turned into a sensible hash (with symbols as the keys!)
  def self.sensible_validator_hashes(validators)
    validators.map do |val|
      hash = {}
      case val
      when Validator::TruePos
        hash.merge( {'correct_wins' => val.correct_wins, 'file' => val.fasta.filename } )
      when Validator::AA
        %w(frequency false_to_total_ratio background calculated_background false_to_total_ratio).each do |cat|
          hash[cat] = val.send(cat.to_sym)
        end
      when Validator::Decoy
        %w(correct_wins decoy_on_match).each do |cat|
          hash[cat] = val.send(cat.to_sym)
        end
        hash['constraint'] = val.constraint.inspect if val.constraint
      when Validator::Bias
        %w(correct_wins proteins_expected background calculated_background false_to_total_ratio).each do |cat|
          hash[cat] = val.send(cat.to_sym)
        end
        hash['file'] = val.fasta.filename
      when Validator::Transmem::Protein
        %w(false_to_total_ratio min_num_tms soluble_fraction correct_wins no_include_tm_peps background calculated_background transmem_file).each do |cat|
          hash[cat] = val.send(cat.to_sym)
        end
      when Validator::Probability
      else ; raise ArgumentError, "Don't know the validator class #{val}"
      end
      klass_as_s = val.class.to_s
      hash['type'] = Validator_to_string[klass_as_s]
      hash['class'] = klass_as_s
      hash
    end
  end
=end
end
module Precision::Calculator
  # Precision when the first count is entirely true hits and the second is
  # entirely false hits:  true / (true + false), computed in floats.
  # When both counts are zero the answer is defined to be 1.0.
  def calc_precision(num_true_hits, num_false_hits)
    true_hits = num_true_hits.to_f
    false_hits = num_false_hits.to_f
    return 1.0 if true_hits == 0.0 && false_hits == 0.0
    true_hits / (true_hits + false_hits)
  end
end
# Precision for groups of hits where the first count holds the normal hits
# (each of which may be true or false) and the second holds the decoy hits.
# The number of true positives is estimated as normal - decoy.
# edge case:  with no normal hits the result is 0.0 if there were any decoy
# hits and 1.0 otherwise.
module Precision::Calculator::Decoy
  def calc_precision(num_normal, num_decoy)
    # floats throughout, in case fractional amounts are passed in for
    # whatever reason
    normal = num_normal.to_f
    estimated_true_pos = normal - num_decoy
    if normal == 0.0
      num_decoy.to_f > 0.0 ? 0.0 : 1.0
    else
      estimated_true_pos / normal
    end
  end
end
+require 'validator/true_pos'
+require 'validator/aa'
+require 'validator/bias'
+require 'validator/decoy'
+require 'validator/transmem'
+require 'validator/probability'
+require 'validator/prot_from_pep'

data/lib/xml.rb ADDED Viewed

@@ -0,0 +1,38 @@
module XML
  # Matches a time remainder that carries an hour (H) or minute (M) designator.
  HourMinuteMatch = /[MH]/

  # Converts a duration string of the form 'PT[nH][nM][n.nS]' into a Float
  # number of seconds, e.g. 'PT1.223434S' => 1.223434 and 'PT5M' => 300.0.
  # Hours and minutes are read as integers, seconds as a float.
  # doesn't support year month, etc, yet: any string that does not start with
  # 'PT' aborts the program.
  def self.duration_to_seconds(string)
    abort 'need to include support for other durations' unless string[0, 2] == 'PT'
    rest = string[2..-1]
    # usually it will be this 'PT1.223434S':
    return rest[0...-1].to_f if rest !~ HourMinuteMatch

    whole_secs = 0
    seconds_part = 0.0   # stays 0.0 when there is no 'S' component
    digits = ''
    rest.each_char do |char|
      case char
      when 'H'
        whole_secs += digits.to_i * 3600
        digits = ''
      when 'M'
        whole_secs += digits.to_i * 60
        digits = ''
      when 'S'
        seconds_part = digits.to_f
        digits = ''
      else
        digits << char
      end
    end
    whole_secs + seconds_part
  end
end

data/lib/xml_style_parser.rb ADDED Viewed

@@ -0,0 +1,105 @@
# Mixin and module-level helpers for picking one of several interchangeable
# XML parser back ends (see Parser_precedence) and dispatching a parse to it.
module XMLStyleParser
  @done_once = nil
  # names (as in Parser_precedence) whose library require_parsers managed to
  # require
  @loaded_parsers = []

  Parser_precedence = %w(AXML LibXML XMLParser Regexp REXML)

  # currently AXML requires 'xmlparser' to be installed.... (may not always be
  # the case...)
  # Parsers listed here need an external library; the pattern is matched
  # against the entries of $" .  Parsers not listed are always available.
  File_required = {'AXML' => /^axml/, 'LibXML' => /^xml\/libxml/, 'XMLParser' => /^xmlparser/}

  # the method that the parser will call on the given file at parse!
  # NOTE(review): this accessor shadows Object#method on including classes.
  attr_accessor :method

  # parses the given file by sending to @method
  # Raises NoMethodError if the receiver does not respond to @method.
  def parse(file, opts={})
    if respond_to? @method
      send(@method, file, opts)
    else
      raise NoMethodError, "Parser of class #{self.class} can't parse #{@method} yet"
    end
  end

  # Requires whichever external parser libraries are installed (only on the
  # first call; later calls do nothing).  Always returns true.
  # XMLParser and xml/libxml are incompatible, so if xmlparser is available,
  # libxml will not be loaded (XMLParser#parse is clobbered by
  # XML::Parser#parse [don't ask me why])
  def self.require_parsers
    return true if @done_once
    @loaded_parsers ||= []
    have_xmlparser = try_require('xmlparser', 'XMLParser')
    try_require('axml', 'AXML')
    if !have_xmlparser && try_require('xml/libxml', 'LibXML', 'xml/libxml')
      ################################################################
      # IMPORTANT!
      # This magic line makes the parser behave like it ought to!!
      XML::Parser.default_keep_blanks = false
      ################################################################
    end
    @done_once = true
  end

  # returns an array of strings depending on File_required (in the order of
  # Parser_precedence)
  def self.available_xml_parsers
    require_parsers
    Parser_precedence.reject do |name|
      pattern = File_required[name]
      pattern && !parser_loaded?(name, pattern)
    end
  end

  ## appends downcase to each parser type here and tries to require it
  # returns all those that were required without a load error
  def self.require_parse_files(base_dir)
    XMLStyleParser.available_xml_parsers.select do |v|
      to_require = base_dir + '/' + v.downcase
      begin
        require to_require
        true
      rescue LoadError
        false
      end
    end
  end

  # seeks a subclass that has the public_method @method
  # +const+ is the namespace searched for a constant named after each
  # available parser; the first one (in precedence order) that defines
  # +method+ publicly is returned.  Raises NoMethodError if there is none.
  # NOTE(review): const_defined?/const_get also see top-level constants on
  # current rubies (e.g. ::Regexp) -- confirm that is harmless for callers.
  def self.choose_parser(const, method)
    candidates = available_xml_parsers.select {|v| const.const_defined?(v) }
    chosen = candidates.map {|v| const.const_get(v) }.find do |subclass|
      subclass.public_method_defined? method
    end
    return chosen if chosen
    raise NoMethodError, "No parser of class #{const} can parse :#{method}\n** Is 'axml' (or another xml parser) installed and working? **"
  end

  # Requires +feature+, remembering +name+ on success.  Returns true if the
  # library is usable, false on LoadError.
  def self.try_require(feature, name, label=name)
    require feature
    puts "Loaded #{label}" if $VERBOSE
    @loaded_parsers << name
    true
  rescue LoadError
    false
  end

  # True if require_parsers loaded +name+ itself, or an entry of $" matches
  # +pattern+.  The first check is needed because ruby >= 1.9 stores absolute
  # paths in $", which the ^-anchored patterns of File_required never match.
  def self.parser_loaded?(name, pattern)
    @loaded_parsers.include?(name) || $".any? {|req_file| req_file.match(pattern) }
  end

  private_class_method :try_require, :parser_loaded?
end

data/lib/xmlparser_wrapper.rb ADDED Viewed

@@ -0,0 +1,19 @@
# Mixin for classes that nest their parser classes as constants.  Each method
# instantiates the parser class named +const+ (looked up on the receiver's
# class), feeds it the input through #parse and returns the result of calling
# +report_method+ on the parser.
module XMLParserWrapper

  # Reads the whole of +file+ (a path) and parses its contents.
  # File.read is used so the path is only ever treated as a file name.
  def parse_and_report(file, const, report_method=:report)
    parse_and_report_string(File.read(file), const, report_method)
  end

  # Parses the given string.
  def parse_and_report_string(string, const, report_method=:report)
    parse_source_and_report(string, const, report_method)
  end

  # Parses from an io object, which is handed to the parser as is.
  def parse_and_report_io(io, const, report_method=:report)
    parse_source_and_report(io, const, report_method)
  end

  private

  # Shared implementation: build the parser, parse +source+, return the report.
  def parse_source_and_report(source, const, report_method)
    parser = self.class.const_get(const).new
    parser.parse(source)
    parser.send(report_method)
  end
end

data/script/compile_and_plot_smriti_final.rb ADDED Viewed

@@ -0,0 +1,97 @@
#!/usr/bin/ruby -w

# Reads one or more tab delimited files of (score, file:seq:charge, T/F)
# rows, turns each into a precision vs. number-of-hits curve using the 'roc'
# library, and writes all curves to <base>.to_plot and <base>.csv.

require 'roc'
require 'optparse'

$decoy = false
$base = "precision_vs_numhits"

opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} smriti.csv ..."
  op.separator ""
  op.separator "smriti.csv = (tab delimited) prob, file:seq:charge, T/F"
  op.separator ""
  op.on("--decoy", "'F' indicates this is a decoy") {|v| $decoy = true }
  op.on("-o", "--outfile <filename>", "base outfile name (#{$base})") {|v| $base = v}
end

opts.parse!

if ARGV.size <= 0
  puts opts
  exit
end

files = ARGV.to_a

# series labels: the file name minus its extension (whatever its length)
labels = files.map do |file|
  ext = File.extname(file)
  ext.empty? ? file : file[0...-ext.size]
end

xys = files.map do |file|
  # lines starting with '#' are comments
  triplets = IO.readlines(file).reject{|v| v =~ /^#/}.map do |line|
    line.chomp.split("\t")
  end
  abort "no data rows in #{file}" if triplets.empty?
  # check that they're all OK:
  triplets.each do |trip|
    if trip.size != 3 ; abort "bad triplet" end
  end
  # figure out the ordering (and correct if necessary):
  # scores are negated when the file runs from high to low
  higher_better = triplets[0][0].to_f > triplets.last[0].to_f
  doublets = triplets.map do |trip|
    value = trip[0].to_f
    value *= -1 if higher_better
    [value, trip[2] == 'T']
  end
  roc = ROC.new
  (tps, fps) = roc.doublets_to_separate(doublets)
  (x, y) =
    if $decoy
      DecoyROC.new.pred_and_ppv(tps, fps)
    else
      roc.numhits_and_ppv(doublets)
    end
  # every curve starts at 0 hits with a precision of 1 (in both output files)
  x.unshift(0)
  y.unshift(1)
  [x,y]
end

## PLOT TO to_plot
File.open( $base + ".to_plot", 'w') do |fh|
  fh.puts "XYData"
  fh.puts $base
  fh.puts "precision vs. num hits"
  fh.puts "num hits"
  fh.puts "precision"
  labels.zip(xys) do |label,xy|
    (x,y) = xy
    fh.puts label
    fh.puts x.join(" ")
    fh.puts y.join(" ")
  end
end

## CSV: two columns per input file, each headed by a label
File.open( $base + ".csv", 'w') do |fh|
  columns = []
  labels.zip(xys) do |label,xy|
    (x,y) = xy
    columns << (["#Hits: #{label}"] + x) << (["Precision: #{label}"] + y)
  end
  # columns may differ in length; shorter ones are padded with empty cells
  num_rows = columns.map {|col| col.size }.max || 0
  num_rows.times do |i|
    fh.puts columns.map {|col| col[i] }.join("\t")
  end
end

data/script/extract_gradient_programs.rb ADDED Viewed

@@ -0,0 +1,56 @@
#!/usr/bin/ruby

# Prints every gradient program found in the given .meth files as a table
# (one row per time point: time, solvent percentages, flow rate).

require 'optparse'
require 'table'
# the library lives under lib/ms in this release; keep the older path as a
# fallback for trees that still have it
begin
  require 'ms/gradient_program'
rescue LoadError
  require 'spec/gradient_program'
end

# NOTE(review): presumably the class moved into the MS namespace together with
# the file -- use whichever constant is actually defined; confirm.
gradient_program_class = defined?(MS::GradientProgram) ? MS::GradientProgram : GradientProgram

delimiter = "\t"
table_format = false

opts = OptionParser.new do |op|
  op.banner = "#{File.basename(__FILE__)} [OPTIONS] <file>.meth"
  op.on("-d", "--delimiter <tab|space|format>", "delimiter (tab default)", "format = space delimited, formatted ascii table") do |v|
    case v
    when 'space'
      delimiter = " "
    when 'tab'
      delimiter = "\t"
    when 'format'
      table_format = true
    else
      abort "don't recognize #{v}"
    end
  end
end

opts.parse!

if ARGV.size == 0
  puts opts
  exit
end

# read everything first so nothing is printed if a file cannot be read
sets_of_tables = {}
ARGV.each do |file|
  File.open(file) do |fh|
    sets_of_tables[file] = gradient_program_class.all_from_handle(fh)
  end
end

col_labels = ["time(min)", "%A", "%B", "%C", "%D", "ul/min"]

sets_of_tables.each do |file, tables|
  puts "FILE: #{file}"
  tables.each do |gp|
    puts "PUMP_TYPE: #{gp.pump_type}"
    data = gp.time_points.map do |tp|
      [tp.time, *(tp.percentages)] << tp.flow_rate
    end
    table = Table.new(data, nil, col_labels)
    if table_format
      puts table.to_formatted_string
    else
      puts table.to_s(delimiter)
    end
  end
end

data/script/get_apex_values_rexml.rb ADDED Viewed

@@ -0,0 +1,44 @@
#!/usr/bin/ruby

# Stream-parses each given <file>-prot.xml and writes one tab delimited row
# per <protein> element (name, probability, total number of peptides) to
# output.csv, highest probability first within each input file.

require 'rexml/document'

if ARGV.size == 0
  puts "usage: #{File.basename(__FILE__)} <file>-prot.xml ..."
  puts "outputs a .csv file"
  exit
end

# One <protein> element: its name, probability (pi) and total number of
# peptides (ni).
class Protein
  attr_accessor :name, :pi, :ni

  def initialize(name, pi, ni)
    @name, @pi, @ni = name, pi, ni
  end
end

# REXML stream listener that collects a Protein for every <protein> start
# tag and ignores every other parser event.
class Listener
  attr_accessor :proteins

  def initialize
    @proteins = []
  end

  def tag_start(name, attrs)
    if name == "protein"
      protein = Protein.new( attrs['protein_name'], attrs['probability'].to_f, attrs['total_number_peptides'].to_i)
      @proteins.push( protein )
    end
  end

  # swallow all the stream events we are not interested in
  def method_missing(*args) ; end
end

# the output is opened once, so the rows of every input file are kept
# (reopening it with 'w' per file would keep only the last file's rows)
File.open("output.csv", 'w') do |out|
  ARGV.each do |file|
    listener = Listener.new
    # block form closes the input as soon as it has been parsed
    File.open(file) do |xml|
      REXML::Document.parse_stream(xml, listener)
    end
    listener.proteins.sort_by {|prot| [prot.pi, prot.ni, prot.name] }.reverse.each do |protein|
      out.puts [protein.name, protein.pi, protein.ni].join("\t")
    end
  end
end

data/script/mzXML2timeIndex.rb CHANGED Viewed

@@ -18,7 +18,7 @@ end
 ARGV.each do |file|
   puts "READING: " + file
   outfile = file + '.timeIndex'
-  obj = Spec::MSRunIndex.new(file)
+  obj = MS::MSRunIndex.new(file)
   puts "WRITING: " + outfile
   obj.to_index_file(outfile)
 end

data/script/smriti_final_analysis.rb ADDED Viewed

@@ -0,0 +1,103 @@
#!/usr/bin/ruby -w

# Groups the peptide hits of a spec_id file by file+aaseq+charge and prints,
# sorted by probability, one tab delimited line per group:
#   probability, file:aaseq:charge, T/F
# T/F is decided from the protein references of the group's lowest
# probability hits, using either a fasta file or a reference prefix.

require 'spec_id'
require 'fasta'
require 'optparse'

$top = false
opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} bioworks.xml <file>.fasta|prefix"
  op.separator "outputs stdout (tab del sorted by probability) probability, file:aaseq:charge T/F"
  op.separator "hashes on file+aaseq+charge"
  op.on("-t", "--top", "only top peptide (by prob) per scan+charge") do
    $top = true
  end
end

opts.parse!

if ARGV.size < 2
  puts opts.to_s
  exit
end

specid_file = ARGV.shift
file_or_prefix = ARGV.shift

specid = SpecID.new(specid_file)

# an existing file is read as fasta; anything else is used as a plain string
# that is looked for inside the protein references
indicator =
  if File.exist? file_or_prefix
    Fasta.new.read_file(file_or_prefix)
  else
    file_or_prefix
  end

# returns an array containing the min prob peptides (in case of a tie)
def lowest_peps(ar)
  min_prob = ar.map {|pep| pep.probability.to_f }.min
  ar.select {|v| v.probability.to_f == min_prob }
end

peps = specid.peps
if $top
  # keep only the lowest probability hit(s) of every scan
  top_by_scan = []
  peps.hash_by(:base_name, :first_scan).each do |k,v|
    top_by_scan.push( *lowest_peps(v) )
  end
  peps = top_by_scan
end

results = peps.hash_by(:base_name, :aaseq, :charge).map do |k,v|
  low_peps = lowest_peps(v)
  all_prot_references = []
  low_peps.each do |pep|
    all_prot_references.push( *(pep.prots.map {|prot| prot.reference }) )
  end
  all_prot_references.uniq!
  is_true =
    if indicator.is_a? Fasta
      all_prot_references.any? do |ref|
        indicator.included_in_header?(ref)
      end
    else
      !(all_prot_references.all? {|ref| ref.include?( indicator )})
    end
  # every pep in low_peps shares the minimum probability
  [low_peps.first.probability.to_f, k, is_true]
end

results.sort.each do |result|
  report = [result[0], result[1].join(':'), (result[2] ? 'T' : 'F')]
  puts report.join("\t")
end

=begin
# ORIGINAL CODE
peps = specid.peps
if $top
  peps = peps.hash_by(:base_name, :first_scan).map do |k,v|
    v.min {|a,b| a.probability.to_f <=> b.probability.to_f }
  end
end

results = peps.hash_by(:base_name, :aaseq, :charge).map do |k,v|
  min_pep = v.min {|a,b| a.probability.to_f <=> b.probability.to_f }
  references = min_pep.prots.map {|v| v.reference }.uniq
  is_true =
    if indicator.is_a? Fasta
      references.any? do |ref|
        indicator.included_in_header?(ref)
      end
    else
      !(references.all? {|ref| ref.include?( indicator )})
    end
  [min_pep.probability.to_f, k, is_true]
end

results.sort.each do |result|
  report = [result[0], result[1].join(':'), (result[2] ? 'T' : 'F')]
  puts report.join("\t")
end
=end