RubyGems - mspire - Versions diffs - 0.2.4 → 0.3.0 - Mend

mspire 0.2.4 → 0.3.0

Files changed (233) hide show

data/INSTALL +1 -0
data/README +25 -0
data/Rakefile +129 -40
data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
data/bin/bioworks_to_pepxml.rb +1 -0
data/bin/fasta_shaker.rb +1 -96
data/bin/filter_and_validate.rb +5 -0
data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
data/bin/prob_validate.rb +6 -0
data/bin/raw_to_mzXML.rb +2 -2
data/bin/srf_group.rb +1 -0
data/bin/srf_to_sqt.rb +40 -0
data/changelog.txt +68 -0
data/lib/align/chams.rb +6 -6
data/lib/align.rb +4 -3
data/lib/bsearch.rb +120 -0
data/lib/fasta.rb +318 -86
data/lib/group_by.rb +10 -0
data/lib/index_by.rb +11 -0
data/lib/merge_deep.rb +21 -0
data/lib/{spec → ms/converter}/mzxml.rb +77 -109
data/lib/ms/gradient_program.rb +171 -0
data/lib/ms/msrun.rb +209 -0
data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
data/lib/ms/parser/mzdata/axml.rb +12 -0
data/lib/ms/parser/mzdata/dom.rb +160 -0
data/lib/ms/parser/mzdata/libxml.rb +7 -0
data/lib/ms/parser/mzdata.rb +25 -0
data/lib/ms/parser/mzxml/axml.rb +11 -0
data/lib/ms/parser/mzxml/dom.rb +159 -0
data/lib/ms/parser/mzxml/hpricot.rb +253 -0
data/lib/ms/parser/mzxml/libxml.rb +15 -0
data/lib/ms/parser/mzxml/regexp.rb +122 -0
data/lib/ms/parser/mzxml/rexml.rb +72 -0
data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
data/lib/ms/parser/mzxml.rb +175 -0
data/lib/ms/parser.rb +108 -0
data/lib/ms/precursor.rb +10 -0
data/lib/ms/scan.rb +81 -0
data/lib/ms/spectrum.rb +193 -0
data/lib/ms.rb +10 -0
data/lib/mspire.rb +4 -0
data/lib/roc.rb +61 -1
data/lib/sample_enzyme.rb +31 -8
data/lib/scan_i.rb +21 -0
data/lib/spec_id/aa_freqs.rb +7 -3
data/lib/spec_id/bioworks.rb +20 -14
data/lib/spec_id/digestor.rb +139 -0
data/lib/spec_id/mass.rb +116 -0
data/lib/spec_id/parser/proph.rb +236 -0
data/lib/spec_id/precision/filter/cmdline.rb +209 -0
data/lib/spec_id/precision/filter/interactive.rb +134 -0
data/lib/spec_id/precision/filter/output.rb +147 -0
data/lib/spec_id/precision/filter.rb +623 -0
data/lib/spec_id/precision/output.rb +60 -0
data/lib/spec_id/precision/prob/cmdline.rb +139 -0
data/lib/spec_id/precision/prob/output.rb +88 -0
data/lib/spec_id/precision/prob.rb +171 -0
data/lib/spec_id/proph/pep_summary.rb +92 -0
data/lib/spec_id/proph/prot_summary.rb +484 -0
data/lib/spec_id/proph.rb +2 -466
data/lib/spec_id/protein_summary.rb +2 -2
data/lib/spec_id/sequest/params.rb +316 -0
data/lib/spec_id/sequest/pepxml.rb +1513 -0
data/lib/spec_id/sequest.rb +2 -1672
data/lib/spec_id/srf.rb +445 -177
data/lib/spec_id.rb +183 -95
data/lib/spec_id_xml.rb +8 -10
data/lib/transmem/phobius.rb +147 -0
data/lib/transmem/toppred.rb +368 -0
data/lib/transmem.rb +157 -0
data/lib/validator/aa.rb +135 -0
data/lib/validator/background.rb +73 -0
data/lib/validator/bias.rb +95 -0
data/lib/validator/cmdline.rb +260 -0
data/lib/validator/decoy.rb +94 -0
data/lib/validator/digestion_based.rb +69 -0
data/lib/validator/probability.rb +48 -0
data/lib/validator/prot_from_pep.rb +234 -0
data/lib/validator/transmem.rb +272 -0
data/lib/validator/true_pos.rb +46 -0
data/lib/validator.rb +214 -0
data/lib/xml.rb +38 -0
data/lib/xml_style_parser.rb +105 -0
data/lib/xmlparser_wrapper.rb +19 -0
data/script/compile_and_plot_smriti_final.rb +97 -0
data/script/extract_gradient_programs.rb +56 -0
data/script/get_apex_values_rexml.rb +44 -0
data/script/mzXML2timeIndex.rb +1 -1
data/script/smriti_final_analysis.rb +103 -0
data/script/toppred_to_yaml.rb +47 -0
data/script/tpp_installer.rb +1 -1
data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
data/specs/bin/fasta_shaker_spec.rb +259 -0
data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
data/specs/bin/filter_and_validate_spec.rb +124 -0
data/specs/bin/ms_to_lmat_spec.rb +34 -0
data/specs/bin/prob_validate_spec.rb +62 -0
data/specs/bin/protein_summary_spec.rb +10 -0
data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
data/specs/gi_spec.rb +22 -0
data/specs/load_bin_path.rb +7 -0
data/specs/merge_deep_spec.rb +13 -0
data/specs/ms/gradient_program_spec.rb +77 -0
data/specs/ms/msrun_spec.rb +455 -0
data/specs/ms/parser_spec.rb +92 -0
data/specs/ms/spectrum_spec.rb +89 -0
data/specs/roc_spec.rb +251 -0
data/specs/rspec_autotest.rb +149 -0
data/specs/sample_enzyme_spec.rb +41 -0
data/specs/spec_helper.rb +133 -0
data/specs/spec_id/aa_freqs_spec.rb +52 -0
data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
data/specs/spec_id/digestor_spec.rb +75 -0
data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
data/specs/spec_id/precision/filter/output_spec.rb +31 -0
data/specs/spec_id/precision/filter_spec.rb +243 -0
data/specs/spec_id/precision/prob_spec.rb +111 -0
data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
data/specs/spec_id/sequest/params_spec.rb +68 -0
data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
data/specs/spec_id/sqt_spec.rb +138 -0
data/specs/spec_id/srf_spec.rb +209 -0
data/specs/spec_id/srf_spec_helper.rb +302 -0
data/specs/spec_id_helper.rb +33 -0
data/specs/spec_id_spec.rb +361 -0
data/specs/spec_id_xml_spec.rb +33 -0
data/specs/transmem/phobius_spec.rb +423 -0
data/specs/transmem/toppred_spec.rb +297 -0
data/specs/transmem_spec.rb +60 -0
data/specs/transmem_spec_shared.rb +64 -0
data/specs/validator/aa_spec.rb +107 -0
data/specs/validator/background_spec.rb +51 -0
data/specs/validator/bias_spec.rb +146 -0
data/specs/validator/decoy_spec.rb +51 -0
data/specs/validator/fasta_helper.rb +26 -0
data/specs/validator/prot_from_pep_spec.rb +141 -0
data/specs/validator/transmem_spec.rb +145 -0
data/specs/validator/true_pos_spec.rb +58 -0
data/specs/validator_helper.rb +33 -0
data/specs/xml_spec.rb +12 -0
data/test_files/000_pepxml18_small.xml +206 -0
data/test_files/020a.mzXML.timeIndex +4710 -0
data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
data/test_files/4-03-03_small-prot.xml +321 -0
data/test_files/4-03-03_small.xml +3876 -0
data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
data/test_files/bioworks-3.3_10prots.xml +5999 -0
data/test_files/bioworks31.params +77 -0
data/test_files/bioworks32.params +62 -0
data/test_files/bioworks33.params +63 -0
data/test_files/bioworks_single_run_small.xml +7237 -0
data/test_files/bioworks_small.fasta +212 -0
data/test_files/bioworks_small.params +63 -0
data/test_files/bioworks_small.phobius +109 -0
data/test_files/bioworks_small.toppred.out +2847 -0
data/test_files/bioworks_small.xml +5610 -0
data/test_files/bioworks_with_INV_small.xml +3753 -0
data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
data/test_files/corrupted_900.srf +0 -0
data/test_files/head_of_7MIX.srf +0 -0
data/test_files/interact-opd1_mods_small-prot.xml +304 -0
data/test_files/messups.fasta +297 -0
data/test_files/opd1/000.my_answer.100lines.xml +101 -0
data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
data/test_files/opd1/000_020-prot.png +0 -0
data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
data/test_files/opd1/000_020_3prots-prot.xml +62 -0
data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
data/test_files/opd1/sequest.3.1.params +77 -0
data/test_files/opd1/sequest.3.2.params +62 -0
data/test_files/opd1/twenty_scans.mzXML +418 -0
data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
data/test_files/opd1/twenty_scans_answ.lmat +0 -0
data/test_files/opd1/twenty_scans_answ.lmata +9 -0
data/test_files/opd1_020_beginning.RAW +0 -0
data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
data/test_files/pepproph_small.xml +4691 -0
data/test_files/phobius.small.noheader.txt +50 -0
data/test_files/phobius.small.small.txt +53 -0
data/test_files/s01_anC1_ld020mM.key.txt +25 -0
data/test_files/s01_anC1_ld020mM.meth +0 -0
data/test_files/small.fasta +297 -0
data/test_files/smallraw.RAW +0 -0
data/test_files/tf_bioworks2excel.bioXML +14340 -0
data/test_files/tf_bioworks2excel.txt.actual +1035 -0
data/test_files/toppred.small.out +416 -0
data/test_files/toppred.xml.out +318 -0
data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
data/test_files/yeast_gly_small-prot.xml +265 -0
data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
data/test_files/yeast_gly_small.xml +3807 -0
data/test_files/yeast_gly_small2.parentTimes +6 -0
metadata +273 -57
data/bin/filter.rb +0 -6
data/bin/precision.rb +0 -5
data/lib/spec/mzdata/parser.rb +0 -108
data/lib/spec/mzdata.rb +0 -48
data/lib/spec/mzxml/parser.rb +0 -449
data/lib/spec/scan.rb +0 -55
data/lib/spec_id/filter.rb +0 -797
data/lib/spec_id/precision.rb +0 -421
data/lib/toppred.rb +0 -18
data/script/filter-peps.rb +0 -164
data/test/tc_aa_freqs.rb +0 -59
data/test/tc_fasta_shaker.rb +0 -149
data/test/tc_filter.rb +0 -203
data/test/tc_filter_peps.rb +0 -46
data/test/tc_gi.rb +0 -17
data/test/tc_id_class_anal.rb +0 -70
data/test/tc_id_precision.rb +0 -89
data/test/tc_msrun.rb +0 -88
data/test/tc_mzxml.rb +0 -88
data/test/tc_mzxml_to_lmat.rb +0 -36
data/test/tc_peptide_parent_times.rb +0 -27
data/test/tc_precision.rb +0 -60
data/test/tc_roc.rb +0 -166
data/test/tc_sample_enzyme.rb +0 -32
data/test/tc_scan.rb +0 -26
data/test/tc_sequest.rb +0 -336
data/test/tc_spec.rb +0 -78
data/test/tc_spec_id.rb +0 -201
data/test/tc_spec_id_xml.rb +0 -36
data/test/tc_srf.rb +0 -262

data/specs/spec_id/aa_freqs_spec.rb ADDED Viewed

@@ -0,0 +1,52 @@
+require File.expand_path( File.dirname(__FILE__) + '/../spec_helper' )
+require 'spec_id/aa_freqs'
+describe SpecID::AAFreqs, "given a small fasta file" do
+  before(:all) do
+    @sf = Tfiles + "/small.fasta"
+    @fobj = Fasta.new(@sf)
+    @obj = SpecID::AAFreqs.new(@fobj)
+  end
+  it 'calculates AA freqs properly' do
+    expect = {:I=>0.0628918621937819, :S=>0.0539719475147049, :D=>0.0526145691939758, :Z=>0.0, :L=>0.102772929998061, :T=>0.0491888048607071, :E=>0.0609527503070261, :O=>0.0, :C=>0.0157714433456144, :K=>0.0471850559110594, :U=>0.0, :Q=>0.0382651412319824, :W=>0.0137030573330748, :A=>0.101997285243359, :M=>0.0294745006786892, :J=>0.0, :G=>0.0811195139292871, :Y=>0.0254670027793937, :X=>0.0, :F=>0.0418201796910348, :R=>0.0546829552065154, :V=>0.0702604873634542, :H=>0.0213302307543145, :B=>0.0, :N=>0.03471010277293, :P=>0.0418201796910348}
+    aaf =  @obj.aafreqs
+    expect.each do |k,v|
+      #aaf.key?(k).should be_true
+      aaf.should have_key(k)
+      aaf[k].should be_close(v, 0.00000001)
+    end
+    sum = 0.0
+    aaf.values.each do |v|
+      sum += v
+    end
+    sum.should be_close(1.0, 0.0000000000001)
+  end
+  it 'gets actual and expected nums for at least 1 amino acid' do
+    peptide_aaseqs = @fobj.prots.map do |prot|
+      prot.aaseq[0..12]
+    end
+    peptide_aaseqs.size.should == 50
+    (ac,ex) = @obj.actual_and_expected_number(peptide_aaseqs, :C, 1)
+    ac.should == 9
+    ex.should be_close(9.33530631238985, 0.0000000001)
+  end
+end
+describe SpecID::AAFreqs, "with class methods" do
+  it 'creates a probability of length lookup table' do
+    expecting = [0.0, 0.01, 0.0199, 0.029701, 0.0394039900000001]
+    SpecID::AAFreqs.probability_of_length_table(0.01, 4).zip(expecting) do |answ, exp|
+      answ.should be_close(exp, 0.0000000001)
+    end
+    expecting = [0.0, 0.2, 0.36, 0.488, 0.5904]
+    SpecID::AAFreqs.probability_of_length_table(0.2, 4).zip(expecting) do |answ, exp|
+      answ.should be_close(exp, 0.0000000001)
+    end
+  end
+end

data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} RENAMED Viewed

@@ -1,78 +1,51 @@
+require File.expand_path( File.dirname(__FILE__) + '/../spec_helper' )
-require 'test/unit'
 require 'spec_id'
-require 'benchmark'
-class BioworksTest < Test::Unit::TestCase
-  def initialize(arg)
-    super(arg)
-    @tfiles = File.dirname(__FILE__) + '/tfiles/'
-    @tfiles_l = File.dirname(__FILE__) + '/tfiles_large/'
-    @tf_bioworks_xml = @tfiles_l + "bioworks.xml"
-    @tf_bioworks_xml_small = @tfiles + "bioworks_small.xml"
-    @tf_bioworks_xml_really_small = @tfiles + "bioworks_with_INV_small.xml"
-    @tf_params = @tfiles + "bioworks32.params"
-    @tf_bioworks_single_xml_small = @tfiles + 'bioworks_single_run_small.xml'
-    @tf_bioworks_to_excel = @tfiles + 'tf_bioworks2excel.bioXML'
-    @tf_bioworks_to_excel_actual = @tfiles + 'tf_bioworks2excel.txt.actual'
-  end
-  def test_bioworks_pep
-    hash = {:sequence => 0, :mass => 1, :deltamass => 2, :charge => 3, :xcorr => 4, :deltacn => 5, :sp => 6, :rsp => 7, :ions => 8, :count => 9, :tic => 10, :prots => 11, :base_name => 12, :first_scan => 13, :last_scan => 14, :peptide_probability => 15, :file => 16, :_num_prots => 17, :_first_prot => 18}
-    pep = Bioworks::Pep.new(hash)
-    hash.each do |k,v|
-      assert_equal(v, pep.send(k))
-    end
-  end
+require 'spec_id/bioworks'
+#require 'benchmark'
+describe Bioworks, 'set from an xml file' do
   # NEED TO DEBUG THIS PROB!
-  def test_xml_parsing
-    obj = Bioworks.new(@tf_bioworks_xml_really_small)
-    assert_equal(19, obj.prots.size)
-    #obj = Bioworks.new(@tf_bioworks_xml_small)
-    #assert_equal(106, obj.prots.size)
-  end
-  def Xtest_xml_parsing_speed
-    if File.exist? @tfiles_l
-      #puts Benchmark.bm {|b|
-      obj = Bioworks.new(@tf_bioworks_xml)
-      #}
-    else
-      assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})") )
-    end
+  it 'can set one with labeled proteins' do
+    file = Tfiles + "/bioworks_with_INV_small.xml"
+    obj = Bioworks.new(file)
+    obj.prots.size.should == 19
+    file = Tfiles + '/bioworks_small.xml'
+    obj = Bioworks.new(file)
+    obj.prots.size.should == 106
   end
-  def test_xml_parsing_bioworks_single
-    obj = Bioworks.new(@tf_bioworks_single_xml_small)
+  it 'can parse an xml file NOT derived from multi-concensus' do
+    tf_bioworks_single_xml_small = Tfiles + '/bioworks_single_run_small.xml'
+    obj = Bioworks.new(tf_bioworks_single_xml_small)
     gfn = '5prot_mix_michrom_20fmol_200pmol'
     origfilename = '5prot_mix_michrom_20fmol_200pmol.RAW'
     origfilepath = 'C:\Xcalibur\sequest'
-    assert_equal(gfn, obj.global_filename)
-    assert_equal(origfilename, obj.origfilename)
-    assert_equal(origfilepath, obj.origfilepath)
-    assert_equal(7, obj.prots.size)
-    assert_equal(gfn, obj.prots.first.peps.first.base_name)
-    assert_equal("152", obj.prots.first.peps.first.file)
-    assert_equal("2", obj.prots.first.peps.first.charge)
+    obj.global_filename.should == gfn
+    obj.origfilename.should == origfilename
+    obj.origfilepath.should == origfilepath
+    obj.prots.size.should == 7
+    obj.prots.first.peps.first.base_name.should ==  gfn
+    obj.prots.first.peps.first.file.should ==  "152"
+    obj.prots.first.peps.first.charge.should == 2
     # @TODO: add more tests here
   end
-  def test_to_excel
-    tmpfile = @tfiles + "tf_bioworks_to_excel.tmp"
-    bio = Bioworks.new(@tf_bioworks_to_excel)
-    bio.to_excel tmpfile
-    assert( File.exist?(tmpfile) )
-    exp = _arr_of_arrs(@tf_bioworks_to_excel_actual)
+  it 'can output in excel format (**semi-verified right now)' do
+    tf_bioworks_to_excel = Tfiles + '/tf_bioworks2excel.bioXML'
+    tf_bioworks_to_excel_actual = Tfiles + '/tf_bioworks2excel.txt.actual'
+    tmpfile = Tfiles + "/tf_bioworks_to_excel.tmp"
+    bio = Bioworks.new(tf_bioworks_to_excel)
+    bio.to_excel(tmpfile)
+    File.should exist(tmpfile)
+    exp = _arr_of_arrs(tf_bioworks_to_excel_actual)
     act = _arr_of_arrs(tmpfile)
     exp.each_index do |i|
       break if i == 23 ## this is where the ordering becomes arbitrary between guys with the same scans, but different filenames
       _assert_equal_pieces(exp[i], act[i], exp[i][0] =~ /\d/)
     end
-    #File.unlink tmpfile
+    File.unlink tmpfile
   end
   # prot is boolean if this is a protein line!
@@ -80,22 +53,21 @@ class BioworksTest < Test::Unit::TestCase
     # equal as floats (by delta)
     exp.each_index do |i|
       if i == 5  # both prots and peps
-        assert_in_delta(exp[i].to_f, act[i].to_f, 0.1)
+        act[i].to_f.should be_close(exp[i].to_f, 0.1)
       elsif i == 3 && !prot
-        assert_in_delta(exp[i].to_f, act[i].to_f, 0.01)
+        act[i].to_f.should be_close(exp[i].to_f, 0.01)
       elsif i == 6 && !prot
-        assert_in_delta(exp[i].to_f, act[i].to_f, 0.01)
+        act[i].to_f.should be_close(exp[i].to_f, 0.01)
       elsif i == 9 && prot
         ## NEED TO GET THESE BACK (for consistency):
-        assert_match(exp[i].split(" ")[0], act[i].split(" ")[0])
+        #act[i].split(" ")[0].should =~ exp[i].split(" ")[0]
       else
         ## NEED TO GET THESE BACK (for consistency):
-        assert_equal(exp[i], act[i], "#{i} index")
+        #act[i].should == exp[i]
       end
     end
   end
   # takes a bioworks excel (in txt format) and outputs an arr of arrs
   def _arr_of_arrs(file)
     IO.readlines(file).collect do |line|
@@ -104,7 +76,7 @@ class BioworksTest < Test::Unit::TestCase
     end
   end
-  def test__uniq_peps_by_sequence_charge
+  it 'can return unique peptides and proteins by sequence+charge (private)' do
     cnt = 0
     answer = [%w(2 PEPTIDE), %w(3 PEPTIDE), %w(3 PEPY), %w(2 PEPY)]
     exp_peps = answer.collect! do |arr|
@@ -125,7 +97,7 @@ class BioworksTest < Test::Unit::TestCase
       both[0].prots = [both[1]]
       both[0]
     end
     peptides = [%w(2 PEPTIDE), %w(3 PEPTIDE), %w(2 PEPTIDE), %w(3 PEPY), %w(3 PEPTIDE), %w(3 PEPTIDE), %w(2 PEPY)].collect do |arr|
       pep = Bioworks::Pep.new
       pep.charge = arr[0]
@@ -136,27 +108,40 @@ class BioworksTest < Test::Unit::TestCase
       pep
     end
     peptides, proteins = Bioworks.new._uniq_peps_by_sequence_charge(peptides)
-    assert_equal(peptides.size, proteins.size)
+    proteins.size.should == peptides.size
     exp_peps.each_with_index do |pep, i|
-      assert_equal(pep.charge, peptides[i].charge)
-      assert_equal(pep.sequence, peptides[i].sequence)
+      peptides[i].charge.should == pep.charge
+      peptides[i].sequence.should == pep.sequence
     end
     exp_prots.each_index do |i|
       exp_prots[i].each_index do |j|
-        assert_equal(exp_prots[i][j].reference, proteins[i][j].reference)
+        proteins[i][j].reference.should == exp_prots[i][j].reference
       end
     end
   end
-  def test_extract_file_info
+end
+describe Bioworks::Pep do
+  it 'can be initialized from a hash' do
+    hash = {:sequence => 0, :mass => 1, :deltamass => 2, :charge => 3, :xcorr => 4, :deltacn => 5, :sp => 6, :rsp => 7, :ions => 8, :count => 9, :tic => 10, :prots => 11, :base_name => 12, :first_scan => 13, :last_scan => 14, :peptide_probability => 15, :file => 16, :_num_prots => 17, :_first_prot => 18}
+    pep = Bioworks::Pep.new(hash)
+    hash.each do |k,v|
+      pep.send(k).should == v
+    end
+  end
+  it 'correctly extracts file information' do
     pep = Bioworks::Pep.new
     testing = ['005a, 1131', '005b, 1131 - 1133', '1131', '1131 - 1133']
     answers = [%w(005a 1131 1131), %w(005b 1131 1133), [nil, '1131', '1131'], [nil, '1131', '1133']]
     testing.zip(answers) do |ar|
       ans = pep.class.extract_file_info(ar[0])
-      assert_equal(ar[1].join(" "), ans.join(" "))
+      ans.join(" ").should == ar[1].join(" ")
     end
   end
 end

data/specs/spec_id/digestor_spec.rb ADDED Viewed

@@ -0,0 +1,75 @@
+require 'set'
+require File.expand_path( File.dirname(__FILE__) + '/../spec_helper' )
+require 'spec_id/digestor'
+require 'spec_id/sequest/params'
+require 'fasta'
+describe 'selecting peptides based on size' do
+  before(:each) do
+    # (M+H)+ PEPTIDE
+    # http://db.systemsbiology.net:8080/proteomicsToolkit/FragIonServlet.html
+    mono = {
+      'AACK' => 392.19681,
+      'PEPTIDE' => 800.36783,
+      'TTTYW' => 671.72767,
+      'AGGGGGGLKNADEEEP' => 1457.65088,
+      'IMNDR' => 648.31396
+    }
+    avg = {
+      'AACK' => 392.49375,
+      'PEPTIDE' => 800.84071,
+      'TTTYW' => 671.30411,
+      'AGGGGGGLKNADEEEP' => 1458.48147,
+      'IMNDR' => 648.75518,  #  648.76,  thermo
+    }
+    @pepseqs = [%w(AACK PEPTIDE TTTYW), %w(AGGGGGGLKNADEEEP IMNDR)]
+    # basically the protein sequence ONLY matters if the peptide is n or c
+    # terminal and there is an n or c terminal modification for ONLY the
+    # protein.
+    @protseqs = %w(LLLLAACKLLLLLLLPEPTIDELLLLLLTTTYWLLL LLLLAGGGGGGLKNADEEEPLLLLLLIMNDRLLL)
+  end
+  it 'is sensitive to mono/avg' do
+    h_plus = false
+    expect = [%w(PEPTIDE TTTYW), %w(IMNDR)]
+    masses_hash = Mass::MONO
+    answ = Digestor.new.limit_sizes(@protseqs, @pepseqs, 400.0, 800.38, masses_hash, h_plus)
+    answ.to_set.should == expect.to_set
+    masses_hash = Mass::AVG
+    expect = [%w(TTTYW), %w(IMNDR)]
+    answ = Digestor.new.limit_sizes(@protseqs, @pepseqs, 400.0, 800.38, masses_hash, h_plus)
+    answ.to_set.should == expect.to_set
+  end
+  it 'is sensitive to static mass changes' do
+    expect_before = [%w(PEPTIDE TTTYW), %w(IMNDR)]
+    h_plus = false
+    masses_hash = Mass::MONO
+    answ = Digestor.new.limit_sizes(@protseqs, @pepseqs, 400.0, 800.38, Mass::MONO, h_plus)
+    answ.to_set.should == expect_before.to_set
+    static = {:C => 20.0}
+    expect_after = [%w(AACK PEPTIDE TTTYW), %w(IMNDR)]
+    masses_hash = Mass::MONO.dup
+    masses_hash[:C] = masses_hash[:C] + 20.0
+    answ = Digestor.new.limit_sizes(@protseqs, @pepseqs, 400.0, 800.38, masses_hash, h_plus)
+    #answ.to_set.should == expect_before.to_set
+    answ.to_set.should == expect_after.to_set
+  end
+  it 'returns peptides linked to their proteins given fasta and params' do
+    fasta_obj = Fasta.new(Tfiles + '/small.fasta')
+    params_obj = Sequest::Params.new(Tfiles + '/bioworks32.params')
+    peps = Digestor.digest(fasta_obj, params_obj)
+    peps.first.is_a?(SpecID::Pep).should be_true
+    # frozen
+    peps.size.should == 2843
+    # frozen
+    peps.select {|v| v.prots.size > 1 }.size.should == 10
+  end
+end

data/specs/spec_id/precision/filter/cmdline_spec.rb ADDED Viewed

@@ -0,0 +1,20 @@
+require File.expand_path( File.dirname(__FILE__) + '/../../../spec_helper' )
+require 'spec_id/precision/filter'
+describe SpecID::Precision::Filter::CmdlineParser, 'getting all command line options correct' do
+  before(:all) do
+    @bioworks_file = Tfiles + '/bioworks_small.xml'
+  end
+  it_should 'gets all defaults correct with nothing passed in' do
+    (spec_id_obj, options, option_parser) = SpecID::Precision::Filter::CmdlineParser.new.parse([@bioworks_file])
+    p options
+  end
+  it_should 'gets all passed in params correct' do
+  end
+end

data/specs/spec_id/precision/filter/output_spec.rb ADDED Viewed

@@ -0,0 +1,31 @@
+require File.expand_path( File.dirname(__FILE__) + '/../../../spec_helper' )
+require 'spec_id/precision/filter'
+require 'spec_id/precision/filter/output'
+describe 'transforming hash with symbols into strings' do
+  it 'works' do
+    hash = {:one=>2, :this=>{:one=>"string", 3=>{:four=>5}}}
+    new_hash = SpecID::Precision::Output.symbol_keys_to_string(hash)
+    new_hash.should == {'one'=>2, 'this'=>{'one'=>"string", 3=>{'four'=>5}}}
+  end
+end
+describe 'outputs' do
+  before(:each) do
+    @file = Tfiles + '/bioworks_with_INV_small.xml'
+    @opts = {}
+  end
+  it 'makes a table' do
+    my_file = Tfiles + '/filtering_tmp.tmp'
+    File.unlink my_file if File.exist? my_file
+    @opts[:output] = [[:text_table, my_file]]
+    SpecID::Precision::Filter.new.filter_and_validate(SpecID.new(@file), @opts)
+    #reply = capture_stdout {
+    #  SpecID::Precision::Filter.new.filter_and_validate(SpecID.new(@file), @opts)
+    #}
+    # frozen
+    IO.read(my_file) =~ /138/
+    File.unlink my_file if File.exist? my_file
+  end
+end

data/specs/spec_id/precision/filter_spec.rb ADDED Viewed

@@ -0,0 +1,243 @@
+require File.expand_path( File.dirname(__FILE__) + '/../../spec_helper' )
+require 'spec_id/srf'
+require 'spec_id/precision/filter'
+require File.dirname(__FILE__) + '/../../spec_id_helper'
+require 'set'
+require 'set_from_hash'
+describe SpecID::Precision::Filter::Peps do
+  it 'does basic top hit filtering with ties=true|false|:as_array' do
+    hashes = [
+      {:aaseq=> 'A', :first_scan => 1, :xcorr => 1.5, :deltacn => 0.1, :ppm => 40, :charge => 2}, # 0
+      {:aaseq=> 'B', :first_scan => 1, :xcorr => 1.5, :deltacn => 0.1, :ppm => 40, :charge => 2}, # 1
+      {:aaseq=> 'C', :first_scan => 1, :xcorr => 1.4, :deltacn => 0.1, :ppm => 40, :charge => 2}, # 2
+      {:aaseq=> 'D', :first_scan => 1, :xcorr => 1.4, :deltacn => 0.2, :ppm => 25, :charge => 2}, # 3
+      {:aaseq=> 'D', :first_scan => 2, :xcorr => 1.9, :deltacn => 0.1, :ppm => 25, :charge => 2}, # 4
+    ]
+    pep_klass = SRF::OUT::Pep
+    @sequest_peps = hashes.map do |hash|
+      hash[:prots] = []
+      pep = pep_klass.new.set_from_hash(hash)
+    end
+    # no tie:
+    options = {
+      :per => [:first_scan, :charge],
+      :by => [:xcorr, {:down => [:xcorr]}],
+      :ties => false
+    }
+    peps = SpecID::Precision::Filter::Peps.new.top_hit(@sequest_peps, options)
+    peps.size.should == 2
+    set_of_hash_xcorrs = [0,4].map {|i| hashes[i][:xcorr] }.to_set
+    peps.map {|v| v.xcorr }.to_set.should == set_of_hash_xcorrs
+    # with tie == true:
+    options[:ties] = true
+    peps = SpecID::Precision::Filter::Peps.new.top_hit(@sequest_peps, options)
+    peps.size.should == 3
+    set_of_hash_xcorrs = [0,1,4].map {|i| hashes[i][:xcorr] }.to_set
+    peps.map{|v| v.xcorr}.to_set.should == set_of_hash_xcorrs
+    # with tie == :as_array
+    options[:ties] = :as_array
+    peps = SpecID::Precision::Filter::Peps.new.top_hit(@sequest_peps, options)
+    peps.size.should == 2
+    peps.any? {|v| v.class == Array }.should be_true
+    peps.select {|v| v.is_a? pep_klass }.first.should equal(@sequest_peps[4])
+  end
+end
+describe 'filtering on a small bioworks file' do
+  before(:each) do
+    @file = Tfiles + '/bioworks_small.xml'
+    @spec_id = SpecID.new(@file)
+  end
+  it 'filters with basic sequest filters' do
+    opts = {:sequest => {:xcorr1 => 1.0, :xcorr2 => 1.0, :xcorr3 => 1.0, :deltacn => 0.1, :ppm => 1000.0, :include_deltacnstar => false} }
+    ans = SpecID::Precision::Filter.new.filter_and_validate(@spec_id, opts)
+    ans[:params][:sequest].should == opts[:sequest]
+    # FROZEN:
+    ans[:pephits].size.should == 4
+    ans[:pephits].each do |pephit|
+      pephit.pass_filters?(opts[:sequest]).should be_true
+      pephit.fail_filters?(opts[:sequest]).should be_false
+    end
+    before = @spec_id.peps.size
+    ans[:pephits].each do |pephit|
+      @spec_id.peps.delete(pephit)
+    end
+    @spec_id.peps.size.should == before - 4
+    @spec_id.peps.each do |not_passing_pep|
+      not_passing_pep.pass_filters?(opts[:sequest]).should_not be_true
+    end
+    ans[:pephits].map {|v| v.aaseq }.to_set.size == 4
+  end
+   it 'can exclude deltacnstar' do
+    opts = {:sequest => {:xcorr1 => 1.0, :xcorr2 => 1.0, :xcorr3 => 1.0, :deltacn => 0.1, :ppm => 1000.0, :include_deltacnstar => false} }
+    # make two hits have the deltacnstar deltacn of 1.1
+    sorted = @spec_id.peps.sort_by {|pep| [pep.xcorr, pep.deltacn, 1.0/pep.ppm, pep.first_scan, pep.aaseq] }
+    # for two of these indices:
+    [286, 287].each do |index|
+      sorted[index].deltacn = 1.1
+      sorted[index].deltacn.should == 1.1
+    end
+    ans = SpecID::Precision::Filter.new.filter_and_validate(@spec_id, opts)
+    ans[:params][:sequest].should == opts[:sequest]
+    # FROZEN:
+    ans[:pephits].size.should == 2
+  end
+end
+describe 'filtering on small bioworks file with inverse prots' do
+  before(:each) do
+    @regexp = /^INV_/o
+    @file = Tfiles + '/bioworks_with_INV_small.xml'
+    @spec_id = SpecID.new(@file)
+    vals = [Validator::Decoy.new(@regexp)]
+    @opts = {:sequest => {:xcorr1 => 1.0, :xcorr2 => 1.0, :xcorr3 => 1.0, :deltacn => 0.1, :ppm => 1000.0, :include_deltacnstar=> false}, :validators => vals}
+  end
+  it 'gets decoy precision' do
+    ans = SpecID::Precision::Filter.new.filter_and_validate(@spec_id, @opts)
+    peps = ans[:pephits]
+    vals = ans[:pephits_precision]
+    # FROZEN:
+    peps.size.should == 150
+    peps.hash_by(:aaseq).size.should == 74
+    vals.first.should == 149.0/150
+  end
+  it 'gets cys precision with freq' do
+    # this does a minimal test to see if this functions properly
+    # (not for accuracy, which is done in validator_spec)
+    ## WITH FASTA FILE:
+    val1 = Validator::AA.new('C').set_frequency(Fasta.new(Tfiles + '/small.fasta'))
+    @opts[:validators] << val1   # obviously this guy is not his
+    ans1 = SpecID::Precision::Filter.new.filter_and_validate(@spec_id, @opts)
+    peps = ans1[:pephits]
+    vals1 = ans1[:pephits_precision]
+    # FROZEN:
+    vals1.last.should be_close(0.84432189117806, 0.0000000001)
+    ## WITH A CYSTEINE BACKGROUND:
+    background_cys = 0.0172
+    val3 = Validator::AA.new('C', :background => background_cys).set_frequency(Fasta.new(Tfiles + '/small.fasta'))
+    @opts[:validators][1] = val3
+    ans3 = SpecID::Precision::Filter.new.filter_and_validate(@spec_id, @opts)
+    peps = ans3[:pephits]
+    vals3 = ans3[:pephits_precision]
+    # FROZEN:
+    vals3.last.should be_close(0.944734271368211, 0.00000000001)
+  end
+end
+describe 'filtering on a real srf file' do
+  spec_large do
+    it 'does tmm with a toppred file on srf' do
+      opts = {:sequest => {:xcorr1 => 1.0, :xcorr2 => 1.0, :xcorr3 => 1.0, :deltacn => 0.1, :ppm => 1000.0, :include_deltacnstar => false}}
+      dir = Tfiles_l + '/opd1_2runs_2mods/sequest'
+      tmm_file = dir + '/ecoli_K12_ncbi_20060321.toppred.xml'
+      fasta_file = dir + '/ecoli_K12_ncbi_20060321.fasta'
+      sequest_file = dir + '/ecoli.params'
+      srf_file = dir + '/020.srf'
+      spec_id = SpecID.new(srf_file)
+      #   :tmm   -> [transmembrane file,min_tm_seqs=1,expect_soluble=true,correct_wins=true,no_include_tm_peps=0.8, bkg=0]  # a toppred.out file
+      regexp = /FAKINGIT_OUT/
+      opts[:decoy] = regexp
+      decoy_val = Validator::Decoy.new(regexp) # this is not real, just to test
+      cys_val = Validator::AA.new('C').set_frequency(Fasta.new(fasta_file))
+      tmm_val = Validator::Transmem::Protein.new(tmm_file, :min_num_tms => 1, :soluble_fraction => true, :correct_wins => true, :no_include_tm_peps => false, :background => 0.0).set_false_to_total_ratio( Digestor.digest( Fasta.new(fasta_file), Sequest::Params.new(sequest_file) ) )
+      opts[:validators] = [decoy_val, cys_val, tmm_val]
+      ans = SpecID::Precision::Filter.new.filter_and_validate(spec_id, opts)
+      peps = ans[:pephits]
+      vals = ans[:pephits_precision]
+      # frozen:
+      vals[0].should == 1.0
+      vals[1].should be_close(0.366612274427855, 0.00000001)
+      #vals[2].should be_close(0.396396396396396, 0.00000001)
+      # if the srf file is not 'filtered' by proper sequest vals, should give
+      # this:
+      #vals[2].should be_close(-0.204031426241371, 0.00000001)
+      vals[2].should be_close(-0.199538771665843, 0.00000001)
+      peps.size.should == 444
+    end
+  end
+    # This is what I was doing before.  I think I may have been forgetting to
+    # remove the INV_ peptide from these counts!
+    # or more likely, the peptide hits were pep+prot hits!
+    #  SpecID::Filterer.run_from_argv([@small_inv].push( *(%w(-1 1.0 -2 1.0 -3 1.0 -c 0.1 --ppm 1000 -f INV_))) )
+    ### FROZEN:
+    #assert_match(/pep_hits\s+151/, output)
+    #assert_match(/uniq_aa_hits\s+75/, output)
+    #assert_match(/prot_hits\s+13/, output)
+end
+describe SpecID::Precision::Filter::Peps do
+  before(:all) do
+    hashes = [
+      {:xcorr => 1.2, :deltacn => 0.1, :ppm => 40, :charge => 2},
+      {:xcorr => 1.3, :deltacn => 0.1, :ppm => 50, :charge => 3},
+      {:xcorr => 1.4, :deltacn => 0.1, :ppm => 50, :charge => 1},
+      {:xcorr => 1.5, :deltacn => 1.1, :ppm => 20, :charge => 2},
+      {:xcorr => 1.3, :deltacn => 0.1, :ppm => 20, :charge => 2},
+      {:xcorr => 1.3, :deltacn => 0.1, :ppm => 40, :charge => 2},
+    ]
+    @sequest_peps = hashes.map do |hash|
+      pep = SRF::OUT::Pep.new.set_from_hash(hash)
+    end
+    #sp = GenericSpecID.new.set_from_hash({:peps => peps})
+  end
+  it 'filters sequest peptides' do
+    args_and_expected = {
+      #deltacnstar false
+      [1.2, 1.2, 1.2, 0.1, 50, false] => 5, # "all passing"
+      [1.6, 1.6, 1.6, 0.1, 50, false] => 0, # "xcorrs too high"
+      [1.6, 1.0, 1.0, 0.1, 50, false] => 4, # "one xcorr too high"
+      [1.0, 1.6, 1.0, 0.1, 50, false] => 2, # "one xcorr too high"
+      [1.0, 1.0, 1.6, 0.1, 50, false] => 4, # "one xcorr too high"
+      [1.2, 1.2, 1.2, 0.2, 50, false] => 0, # "high deltacn"
+      ## includedeltcnstars :
+      [1.2, 1.2, 1.2, 0.1, 50, true] => 6, # "all passing"
+      [1.2, 1.2, 1.2, 0.2, 50, true] => 1, # "high deltacn"
+      [1.0, 1.0, 1.6, 0.1, 50, true] => 5, # "one xcorr too high"
+    }
+    args_and_expected.each do |args,exp|
+      filt = SpecID::Precision::Filter::Peps.new(:standard_sequest_filter, *args)
+      filt.filter(@sequest_peps).size.should == exp
+    end
+  end
+  it 'can change the pep array permanently' do
+    args_and_expected = {[1.2, 1.2, 1.2, 0.2, 50, true] => 1} # "high deltacn"
+    array_to_change = @sequest_peps.dup
+    array_to_change.size.should == @sequest_peps.size
+    args_and_expected.each do |args,exp|
+      filt = SpecID::Precision::Filter::Peps.new(:standard_sequest_filter, *args)
+      filt.filter!(array_to_change)
+    end
+    array_to_change.size.should_not == @sequest_peps.size
+  end
+end