RubyGems - mspire - Versions diffs - 0.1.7 → 0.2.0 - Mend

mspire 0.1.7 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

data/Rakefile +41 -14
data/bin/bioworks2excel.rb +1 -1
data/bin/bioworks_to_pepxml.rb +46 -59
data/bin/fasta_shaker.rb +1 -1
data/bin/filter.rb +6 -0
data/bin/find_aa_freq.rb +23 -0
data/bin/id_precision.rb +3 -2
data/bin/mzxml_to_lmat.rb +2 -1
data/bin/pepproph_filter.rb +1 -1
data/bin/precision.rb +1 -1
data/bin/protein_summary.rb +2 -451
data/bin/raw_to_mzXML.rb +55 -0
data/bin/srf_group.rb +26 -0
data/changelog.txt +7 -0
data/lib/align.rb +3 -3
data/lib/fasta.rb +6 -1
data/lib/gi.rb +9 -4
data/lib/roc.rb +2 -0
data/lib/sample_enzyme.rb +2 -1
data/lib/spec/mzxml/parser.rb +2 -43
data/lib/spec/mzxml.rb +65 -2
data/lib/spec_id/aa_freqs.rb +10 -7
data/lib/spec_id/bioworks.rb +67 -87
data/lib/spec_id/filter.rb +794 -0
data/lib/spec_id/precision.rb +29 -36
data/lib/spec_id/proph.rb +5 -3
data/lib/spec_id/protein_summary.rb +459 -0
data/lib/spec_id/sequest.rb +323 -271
data/lib/spec_id/srf.rb +189 -135
data/lib/spec_id.rb +276 -227
data/lib/spec_id_xml.rb +101 -0
data/lib/toppred.rb +18 -0
data/script/degenerate_peptides.rb +47 -0
data/script/filter-peps.rb +5 -1
data/test/tc_align.rb +1 -1
data/test/tc_bioworks.rb +25 -22
data/test/tc_bioworks_to_pepxml.rb +37 -4
data/test/tc_fasta.rb +3 -1
data/test/tc_fasta_shaker.rb +8 -6
data/test/tc_filter.rb +203 -0
data/test/tc_gi.rb +6 -9
data/test/tc_id_precision.rb +31 -0
data/test/tc_mzxml.rb +8 -6
data/test/tc_peptide_parent_times.rb +2 -1
data/test/tc_precision.rb +1 -1
data/test/tc_proph.rb +5 -5
data/test/tc_protein_summary.rb +36 -13
data/test/tc_sequest.rb +78 -33
data/test/tc_spec_id.rb +128 -6
data/test/tc_srf.rb +84 -38
metadata +67 -62
data/bin/fasta_cat.rb +0 -39
data/bin/fasta_cat_mod.rb +0 -59
data/bin/fasta_mod.rb +0 -57
data/bin/filter_spec_id.rb +0 -365
data/bin/raw2mzXML.rb +0 -21
data/script/gen_database_searching.rb +0 -258

data/test/tc_filter.rb ADDED Viewed

@@ -0,0 +1,203 @@
+require 'test/unit'
+require 'spec_id/filter'
+require 'spec_id/srf'
+require 'set_from_hash'
+require File.dirname(__FILE__) + '/test_helper'
+$VERBOSE = false
+class TestFilter < Test::Unit::TestCase
+  def initialize(arg)
+    super(arg)
+    @tfiles = File.dirname(__FILE__) + '/tfiles/'
+    @tfiles_l = File.dirname(__FILE__) + '/tfiles_large/'
+    @small_inv = @tfiles + 'bioworks_with_INV_small.xml'
+    @small = @tfiles + 'bioworks_small.xml'
+    ## SRF:
+    @zero_srf = @tfiles_l + 'opd1_cat_inv/000.srf'
+    @twenty_srf = @tfiles_l + 'opd1_cat_inv/020.srf'
+    @zero_srg = @tfiles_l + 'bioworks_000.srg'
+    @both_srg = @tfiles_l + 'bioworks_both.srg'
+    ## FASTA:
+    @opd1_fasta = @tfiles_l + 'opd1_cat_inv/ecoli_K12_ncbi_20060321.fasta'
+    @opd1_correct_fasta = @tfiles_l + 'opd1_cat_inv/correct_fictitious_314.fasta'
+    if File.exist? @tfiles_l
+      File.open(@zero_srg, 'w') {|fh| fh.puts( File.expand_path(@zero_srf) ) }
+      File.open(@both_srg, 'w') {|fh| fh.puts( File.expand_path(@zero_srf) ); fh.puts( File.expand_path(@twenty_srf) ) }
+    end
+  end
+  def test_protein_fppr
+    peps_per_prot = [4,4,3,2,2]
+    (num, mean_fppr, std_num, std_fppr) = SpecID::Filter.new.protein_fppr(peps_per_prot, 1, 10)
+    assert_equal(0, mean_fppr, "no prots completely wrong")
+    assert_equal(0, std_fppr, "no prots completely wrong")
+    (num, mean_fppr, std_num, std_fppr) = SpecID::Filter.new.protein_fppr(peps_per_prot, 14, 10)
+    assert_equal(4.0/5, mean_fppr, "only one prot right")
+    assert_equal(0.0, std_fppr, "only one prot right")
+  end
+  def test_filter_sequest
+    hashes = [
+      {:xcorr => 1.2, :deltacn => 0.1, :ppm => 40, :charge => 2},
+      {:xcorr => 1.3, :deltacn => 0.1, :ppm => 50, :charge => 3},
+      {:xcorr => 1.4, :deltacn => 0.1, :ppm => 50, :charge => 1},
+      {:xcorr => 1.5, :deltacn => 1.1, :ppm => 20, :charge => 2},
+      {:xcorr => 1.3, :deltacn => 0.1, :ppm => 20, :charge => 2},
+      {:xcorr => 1.3, :deltacn => 0.1, :ppm => 40, :charge => 2},
+    ]
+    peps = hashes.map do |hash|
+      pep = SRF::OUT::Pep.new.set_from_hash(hash)
+    end
+    sp = GenericSpecID.new.set_from_hash({:peps => peps})
+    before_size = sp.peps.size
+    assert_filter([1.2, 1.2, 1.2, 0.1, 50], sp, 5, "all passing")
+    assert_filter([1.6, 1.6, 1.6, 0.1, 50], sp, 0, "xcorrs too high")
+    assert_filter([1.6, 1.0, 1.0, 0.1, 50], sp, 4, "one xcorr too high")
+    assert_filter([1.0, 1.6, 1.0, 0.1, 50], sp, 2, "one xcorr too high")
+    assert_filter([1.0, 1.0, 1.6, 0.1, 50], sp, 4, "one xcorr too high")
+    assert_filter([1.2, 1.2, 1.2, 0.2, 50], sp, 0, "high deltacn")
+    ## with deltcnstars:
+    assert_filter([1.2, 1.2, 1.2, 0.1, 50], sp, 6, "all passing", true)
+    assert_filter([1.2, 1.2, 1.2, 0.2, 50], sp, 1, "high deltacn", true)
+    assert_filter([1.0, 1.0, 1.6, 0.1, 50], sp, 5, "one xcorr too high", true)
+  end
+  def assert_filter(filter_args, spec_id, expected_passing, message, include_deltcn=false)
+    npeps = spec_id.filter_sequest(filter_args, include_deltcn)
+    assert_equal(expected_passing, npeps.size, message)
+  end
+  def test_passing_proteins
+    hash_prots = (0..7).map do |n|
+      SpecID::GenericProt.new.set_from_hash({:reference => "prot_"+n.to_s, :peps => []})
+    end
+    arr_prots = (0..7).map do |n|
+      SRF::OUT::Prot.new.set_from_hash({:reference => "prot_"+n.to_s, :peps => []})
+    end
+    [hash_prots, arr_prots].each do |prots|
+      hashes = [
+        {:aaseq => 'PEP0', :xcorr => 1.2, :deltacn => 0.1, :ppm => 40, :charge => 2, :prots => [prots[0],prots[1]]},
+        {:aaseq => 'PEP1', :xcorr => 1.3, :deltacn => 0.1, :ppm => 50, :charge => 3, :prots => [prots[1],prots[2]]},
+        {:aaseq => 'PEP2', :xcorr => 1.4, :deltacn => 0.1, :ppm => 50, :charge => 1, :prots => [prots[3]]},
+        {:aaseq => 'PEP3', :xcorr => 1.5, :deltacn => 1.1, :ppm => 20, :charge => 2, :prots => [prots[4]]},
+        {:aaseq => 'PEP4', :xcorr => 1.3, :deltacn => 0.1, :ppm => 20, :charge => 2, :prots => [prots[0]]},
+        {:aaseq => 'PEP5', :xcorr => 1.3, :deltacn => 0.1, :ppm => 40, :charge => 2, :prots => prots[1,2]},
+      ]
+      peps = hashes.map do |hash|
+        SRF::OUT::Pep.new.set_from_hash(hash)
+      end
+      prts = SpecID.passing_proteins(peps)
+      exp = (0..4).map do |n|
+      "prot_" + n.to_s
+      end
+      refs = prts.map { |v| v.reference }.sort
+      assert_equal(exp, refs)
+      prts = SpecID.passing_proteins(peps, :update)
+      prot_0_before = prts.select {|v| v.reference == 'prot_0'}.first
+      assert_protein_match(prts, 'prot_0', %w(PEP0 PEP4))
+      assert_protein_match(prts, 'prot_1', %w(PEP0 PEP1 PEP5))
+      assert_protein_match(prts, 'prot_2', %w(PEP1 PEP5))
+      assert_protein_match(prts, 'prot_3', %w(PEP2))
+      assert_protein_match(prts, 'prot_4', %w(PEP3))
+      srt_ref = prts.map {|v| v.reference}.sort
+      assert_equal(%w(prot_0 prot_1 prot_2 prot_3 prot_4), srt_ref, "just the right number of prots")
+      prot_0 = prts.select {|v| v.reference == 'prot_0'}.first
+      assert_equal(prot_0_before.__id__, prot_0.__id__, "proteins are identical")
+      prot_0_before = prts.select {|v| v.reference == 'prot_0'}.first.__id__
+      prts = SpecID.passing_proteins(peps, :new)
+      assert_protein_match(prts, 'prot_0', %w(PEP0 PEP4))
+      assert_protein_match(prts, 'prot_1', %w(PEP0 PEP1 PEP5))
+      assert_protein_match(prts, 'prot_2', %w(PEP1 PEP5))
+      assert_protein_match(prts, 'prot_3', %w(PEP2))
+      assert_protein_match(prts, 'prot_4', %w(PEP3))
+      srt_ref = prts.map {|v| v.reference}.sort
+      assert_equal(%w(prot_0 prot_1 prot_2 prot_3 prot_4), srt_ref, "just the right number of prots")
+      prot_0 = prts.select {|v| v.reference == 'prot_0'}.first
+      assert_not_equal(prot_0_before, prot_0.__id__, "proteins are not identical")
+    end
+  end
+  def assert_protein_match(prts, ref, pepseqs, message='')
+    prt = prts.select{|v| v.reference == ref }.first
+    sorted_prt_peps_aaseqs = prt.peps.map {|v| v.aaseq }.sort
+    sorted_pepseqs = pepseqs.sort
+    assert_equal(pepseqs, sorted_prt_peps_aaseqs, message)
+  end
+  def test_usage
+    output = capture_stdout {
+      SpecID::Filter.run_from_argv([])
+    }
+    assert_match('usage:', output)
+  end
+  def test_basic_bioworks_xml
+    output = capture_stdout {
+      SpecID::Filter.run_from_argv([@small].push( *(%w(-1 1.0 -2 1.0 -3 1.0 -c 0.1 --ppm 1000))) )
+    }
+    ## FROZEN:
+    assert_match(/pep_hits\s+4/, output)
+    assert_match(/uniq_aa_hits\s+4/, output)
+    assert_match(/prot_hits\s+4/, output)
+    output = capture_stdout {
+      SpecID::Filter.run_from_argv([@small_inv].push( *(%w(-1 1.0 -2 1.0 -3 1.0 -c 0.1 --ppm 1000 -f INV_))) )
+    }
+    #puts ""
+    #puts output
+    ## FROZEN:
+    assert_match(/pep_hits\s+151/, output)
+    assert_match(/uniq_aa_hits\s+75/, output)
+    assert_match(/prot_hits\s+13/, output)
+  end
+  def test_srf
+    if File.exist? @tfiles_l
+      ## dcy
+      output = capture_stdout {
+        SpecID::Filter.run_from_argv([@zero_srg].push( *(%w(-1 1.0 -2 1.0 -3 1.0 -c 0.1 --ppm 1000 -f INV_))) )
+      }
+      ## FROZEN:
+      #puts ""
+      #puts output
+      assert_match(/pep_hits\s+2111\s+107\.2/, output)
+      assert_match(/uniq_aa_hits\s+2034\s+106\.6/, output)
+      assert_match(/prot_hits\s+1454\s+100\.0/, output)
+      ## cys tps fps COMBINED
+      # tps are fictitious!
+      output = capture_stdout {
+        # that's the background freq for ecoli that this file's from
+        SpecID::Filter.run_from_argv([@zero_srg].push( *(%w(-1 1.0 -2 1.0 -3 1.0 -c 0.1 --ppm 1000 --occams_razor --cys 0.0115866200193321 --t).push(@opd1_correct_fasta))))
+      }
+      #puts ""
+      #puts output
+      ## FROZEN:
+      assert_match(/num\s+tps%\s+cys%/, output, "header")
+      assert_match(/pep_hits\s+4374\s+9\d\.\d.*\s+83\.7/, output)
+      assert_match(/uniq_aa_hits\s+4203\s+9\d\.\d.*\s+82\.8/, output)
+      assert_match(/prot_hits\s+2986\s+9\d\..*\s+7\d\./, output)
+      assert_match(/occams.*\s+2986\s+8\d\..*\s+7\d\./, output)
+    else
+      assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})" ))
+    end
+  end
+end

data/test/tc_gi.rb CHANGED Viewed

@@ -7,14 +7,11 @@ class Gi2AnnotTest < Test::Unit::TestCase
   ROOT_DIR = File.join(File.dirname(__FILE__), '..')
   def test_single_query
-    #begin
-      annot = GI.gi2annot([16130548]).first
-    #rescue
-      puts "SKIPPING gi2annot test since no internet connection available:"
-      puts "#{$!}"
-      assert true
-    #else
-      assert_equal('CP4-57 prophage; RNase LS [Escherichia coli K12]'+"\n", annot)
-    #end
+    annot = GI.gi2annot([16130548])
+    if annot
+      assert_equal('CP4-57 prophage; RNase LS [Escherichia coli K12]', annot.first)
+    else
+      assert_nil( puts("SKIPPING gi test (no internet connection available)") )
+    end
   end
 end

data/test/tc_id_precision.rb CHANGED Viewed

@@ -30,6 +30,30 @@ class IDPrecisionTest < Test::Unit::TestCase
 PepProts: NH,PepProts: PR,SeqCharge: NH,SeqCharge: PR,Scan(TopHit): NH,Scan(TopHit): PR,Scan(Top10): NH,Scan(Top10): PR,ScanCharge(TopHit): NH,ScanCharge(TopHit): PR,ScanCharge(Top10): NH,ScanCharge(Top10): PR
 75, 1.0, 37, 1.0, 75, 1.0, 75, 1.0, 75, 1.0, 75, 1.0
 95, 1.0, 49, 1.0, 95, 1.0, 95, 1.0, 95, 1.0, 95, 1.0
+155, 1.0, 67, 1.0, 123, 1.0, 155, 1.0, 125, 1.0, 155, 1.0
+186, 1.0, 85, 1.0, 154, 1.0, 186, 1.0, 156, 1.0, 186, 1.0
+196, 1.0, 90, 1.0, 161, 1.0, 196, 1.0, 163, 1.0, 196, 1.0
+214, 1.0, 94, 1.0, 168, 1.0, 214, 1.0, 170, 1.0, 214, 1.0
+215, 1.0, 95, 1.0, 169, 1.0, 215, 1.0, 171, 1.0, 215, 1.0
+217, 0.995391705069124, 97, 0.989690721649485, 171, 0.994152046783626, 217, 0.995391705069124, 173, 0.994219653179191, 217, 0.995391705069124
+219, 0.995433789954338, 99, 0.98989898989899, 172, 0.994186046511628, 219, 0.995433789954338, 175, 0.994285714285714, 219, 0.995433789954338
+227, 0.995594713656388, 106, 0.990566037735849, 180, 0.994444444444444, 227, 0.995594713656388, 183, 0.994535519125683, 227, 0.995594713656388
+228, 0.995614035087719, 107, 0.990654205607477, 181, 0.994475138121547, 228, 0.995614035087719, 184, 0.994565217391304, 228, 0.995614035087719
+229, 0.991266375545852, 108, 0.981481481481482, 182, 0.989010989010989, 229, 0.991266375545852, 185, 0.989189189189189, 229, 0.991266375545852
+END
+    # This was the result we were getting before first hashing on protein
+    # sequences and doing unique peptide hits.  It is very similar (but not
+    # exactly the same) to what we are doing now.  Must have something to do
+    # with the way things are hashed out.
+    before_doing_uniq_peptides=<<END
+#  NH = number of hits
+#  TP = true positives
+#  FP = false positives
+#  PR = precision = TP/(TP+FP)
+PepProts: NH,PepProts: PR,SeqCharge: NH,SeqCharge: PR,Scan(TopHit): NH,Scan(TopHit): PR,Scan(Top10): NH,Scan(Top10): PR,ScanCharge(TopHit): NH,ScanCharge(TopHit): PR,ScanCharge(Top10): NH,ScanCharge(Top10): PR
+75, 1.0, 37, 1.0, 75, 1.0, 75, 1.0, 75, 1.0, 75, 1.0
+95, 1.0, 49, 1.0, 95, 1.0, 95, 1.0, 95, 1.0, 95, 1.0
 125, 1.0, 67, 1.0, 123, 1.0, 125, 1.0, 125, 1.0, 125, 1.0
 155, 1.0, 85, 1.0, 154, 1.0, 155, 1.0, 156, 1.0, 155, 1.0
 186, 1.0, 90, 1.0, 161, 1.0, 186, 1.0, 163, 1.0, 186, 1.0
@@ -49,10 +73,17 @@ END
     cmd = "#{@cmd} INV_ #{@tf_bioworks_inv_xml} -a"
     #puts "RUNNING: #{cmd}"
     reply = `#{cmd}`
+    # This is what we were getting before hashing for unique peptides.
+    # It is very similar (but not identical to previous output)
     string =<<END
 Filename PepProts SeqCharge Scan(TopHit) Scan(Top10) ScanCharge(TopHit) ScanCharge(Top10)
 ./test/tfiles/bioworks_with_INV_small.xml 228.925377117814 107.877585995136 181.929045912105 228.925377117814 184.924437525838 228.925377117814
 END
+    string =<<NEWEND
+Filename PepProts SeqCharge Scan(TopHit) Scan(Top10) ScanCharge(TopHit) ScanCharge(Top10)
+./test/tfiles/bioworks_with_INV_small.xml 228.939375794224 107.877585995136 181.929045912105 228.939375794224 184.924437525838 228.939375794224
+NEWEND
     assert_equal(string, reply, "area under the curve")
   end
 end

data/test/tc_mzxml.rb CHANGED Viewed

@@ -1,7 +1,6 @@
 require 'test/unit'
 require 'spec/mzxml/parser'
 class SpecMzXML < Test::Unit::TestCase
   def initialize(arg)
     super(arg)
@@ -49,23 +48,24 @@ class SpecMzXML < Test::Unit::TestCase
       sr_raw = @tfiles + 'smallraw.RAW'
       sr_noext = @tfiles + 'smallraw'
       sr_mzxml = @tfiles + 'smallraw.mzXML'
-      ob = Spec::MzXML::Parser.new
+      klass = Spec::MzXML
       # given raw
-      file = ob.file_to_mzxml(sr_raw)
+      file = klass.file_to_mzxml(sr_raw)
       file_to_mzxml_assert(file)
       # given mzXML
-      file = ob.file_to_mzxml(sr_mzxml)
+      file = klass.file_to_mzxml(sr_mzxml)
       file_to_mzxml_assert(file)
       File.unlink(sr_mzxml)
       # given basename (and no mzXML)
-      file = ob.file_to_mzxml(sr_noext)
+      file = klass.file_to_mzxml(sr_noext)
       file_to_mzxml_assert(file)
       # given basename (and mzXML)
-      file = ob.file_to_mzxml(sr_noext)
+      file = klass.file_to_mzxml(sr_noext)
       file_to_mzxml_assert(file)
       File.unlink(sr_mzxml)
     else
       puts "SKIPPING tests requiring 't2x' to convert RAW to mzXML"
+      puts "(look in the archive folder of the gem for t2x binary for linux)"
     end
   end
@@ -84,3 +84,5 @@ class SpecMzXML < Test::Unit::TestCase
   end
 end

data/test/tc_peptide_parent_times.rb CHANGED Viewed

@@ -9,7 +9,8 @@ class PeptideParentTimesTest < Test::Unit::TestCase
   end
   def test_blank
-    puts "NOT RUNNING ANY TESTS FOR PEPTIDE_PARENT_TIMES RIGHT NOW"
+    ## need to finish this guy up:
+    puts "\nSKIPPING: tests for peptide_parent_times"
   end
   def Xtest_run

data/test/tc_precision.rb CHANGED Viewed

@@ -3,7 +3,7 @@ require 'test/unit'
 require File.dirname(File.expand_path(__FILE__)) + '/load_bin_path'
-class PrecisionTest < Test::Unit::TestCase
+class PrecTest < Test::Unit::TestCase
   ROOT_DIR = File.join(File.dirname(__FILE__), "..")
   def initialize(arg)

data/test/tc_proph.rb CHANGED Viewed

@@ -16,8 +16,8 @@ class ProphTest < Test::Unit::TestCase
   def test_parse_protxml_file
     file = @tfiles + 'opd1/000_020_3prots-prot.xml'
-    #obj = SpecID::Proph::ProtSummary.new
-    obj = SpecID::Proph::ProtSummary.new(file)
+    #obj = Proph::ProtSummary.new
+    obj = Proph::ProtSummary.new(file)
     assert_equal(3, obj.prot_groups.size)
     assert_equal("1.00", obj.prot_groups.first.probability)
     assert_equal("0.98", obj.prot_groups[2].probability)
@@ -38,7 +38,7 @@ class ProphTest < Test::Unit::TestCase
   def Xtest_filter_by_min_pep_prob
-    obj = SpecID::Proph::Pep::Parser.new
+    obj = Proph::Pep::Parser.new
     new_file = "tfiles/tmp.xml"
     assert_match(/peptideprophet_result probability="0.[0-5]/, IO.read(@pepproph_xml))
     obj.filter_by_min_pep_prob(@pepproph_xml, new_file, 0.50)
@@ -48,7 +48,7 @@ class ProphTest < Test::Unit::TestCase
   end
   def Xtest_uniq_by_seqcharge
-    cls = SpecID::Proph::Pep
+    cls = Proph::Pep
     p1 = cls.new({ :charge => '2', :sequence => 'PEPTIDE' })
     p2 = cls.new({ :charge => '3', :sequence => 'PEPTIDE' })
     p3 = cls.new({ :charge => '2', :sequence => 'PEPTIDE' })
@@ -91,7 +91,7 @@ class ProphTest < Test::Unit::TestCase
     s1 = Spec::Scan.new(1,2,0.10, 300.2, i1, p1)
     s2 = Spec::Scan.new(2,2,0.20, 301.1, i2, p2)
     s3 = Spec::Scan.new(3,2,0.30, 302.0, i3, p3)
-    scan = SpecID::Proph::Pep.new({:scans => [s1,s2,s3]}).arithmetic_avg_scan_by_parent_time
+    scan = Proph::Pep.new({:scans => [s1,s2,s3]}).arithmetic_avg_scan_by_parent_time
     tot_inten = i1 + i2 + i3
     tm = ( t1 * (i1/tot_inten) + t2 * (i2/tot_inten) + t3 * (i3/tot_inten) )
     {:ms_level => 2, :prec_inten => 130115.0/3, :num => nil, :prec_mz => 301.1.to_f, :time => tm }.each do |k,v|

data/test/tc_protein_summary.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 require 'test/unit'
-require File.dirname(File.expand_path(__FILE__)) + '/load_bin_path'
+require 'spec_id/protein_summary'
+require File.dirname(__FILE__) + '/test_helper'
@@ -20,16 +21,29 @@ class ProphProtSummaryTest < Test::Unit::TestCase
     @tf_proph_cat_inv_summary_html = @tfiles + 'opd1/opd1_cat_inv_small-prot.summary.html'
     @tf_proph_cat_inv_summary_png = @tfiles + 'opd1/opd1_cat_inv_small-prot.summary.png'
     @tf_peptide_count = @tfiles + "peptide_counts.tmp.txt"
-    @cmd = "ruby -I#{File.join(File.dirname(__FILE__), "..", "lib")} -S protein_summary.rb "
   end
+  def runit(string_or_args)
+    args = if string_or_args.is_a? String
+             string_or_args.split(/\s+/)
+           else
+             string_or_args
+           end
+    ProteinSummary.new.create_from_command_line_args(args)
+  end
   def test_usage
-    assert_match(/usage:/, `#{@cmd}`)
+    output = capture_stdout {
+      runit('')
+    }
+    assert_match(/usage:/, output)
   end
-  def Xtest_proph_basic
+  def test_proph_basic
     if File.exist? @tfiles_l
-      print `#{@cmd} -c 5.0 #{@tf_proph}`
+      runit "-c 5.0 #{@tf_proph}"
+      ProteinSummary.new.create_from_command_line_args([@tf_proph, '-c', '5.0'])
       assert(File.exist?(@tf_summary), "file #{@tf_summary} exists")
       string = IO.read(@tf_summary)
       assert_match(/gi\|16132176\|ref\|NP_418775\.1\|/, string)
@@ -41,7 +55,7 @@ class ProphProtSummaryTest < Test::Unit::TestCase
   end
   def test_bioworks_basic
-    print `#{@cmd} #{@tf_bioworks_small}`
+    runit "#{@tf_bioworks_small}"
     assert(File.exist?(@tf_bioworks_small_summary_html), "file #{@tf_bioworks_small_summary_html} exists")
     File.unlink @tf_bioworks_small_summary_html unless NODELETE
@@ -49,23 +63,32 @@ class ProphProtSummaryTest < Test::Unit::TestCase
   end
   def test_bioworks_with_precision
-    `#{@cmd} -f #{@tf_bioworks_small} #{@tf_bioworks_small} --precision`
-    assert_match('TP : 106', IO.read(@tf_bioworks_small_summary_html))
-    assert_match(/False Positive Rate.*: 0.500/, IO.read(@tf_bioworks_small_summary_html))
+    ## Could reimplement a separate file approach?
+    #reply = `#{@cmd} -f #{@tf_bioworks_small} #{@tf_bioworks_small} --precision`
+    runit "#{@tf_bioworks_small} --precision"
+    assert_match(/# hits.*106/m, IO.read(@tf_bioworks_small_summary_html))
+    #assert_match(/False Positive Rate.*: 0.500/, IO.read(@tf_bioworks_small_summary_html))
+    #assert_match(/False Positive Rate.*: 0.500/, IO.read(@tf_bioworks_small_summary_html))
     assert(File.exist?(@tf_bioworks_small_summary_html), "file #{@tf_bioworks_small_summary_html} exists")
     File.unlink @tf_bioworks_small_summary_html unless NODELETE
   end
-  def Xtest_proph_with_precision
+  def test_proph_with_precision
     #puts @cmd
-    print `#{@cmd} #{@tf_proph_cat_inv} -f INV_ --precision`
+    runit "#{@tf_proph_cat_inv} -f INV_ --precision"
+    html =  IO.read(@tf_proph_cat_inv_summary_html)
+    assert_match(/# hits/, html, "in #{@tf_proph_cat_inv_summary_html}")
+    assert_match(/2.*0\.0000/m, html, "in #{@tf_proph_cat_inv_summary_html}")
+    assert_match(/3.*0\.3333/m, html, "in #{@tf_proph_cat_inv_summary_html}")
+    assert_match(/7.*0\.5714/m, html, "in #{@tf_proph_cat_inv_summary_html}")
     File.unlink @tf_proph_cat_inv_summary_html unless NODELETE
     File.unlink @tf_proph_cat_inv_summary_png unless NODELETE
   end
-  def Xtest_peptide_count
+  def test_peptide_count
     if File.exist? @tfiles_l
-      print `#{@cmd} -c 5.0 #{@tf_proph} --peptide_count #{@tf_peptide_count}`
+      runit "-c 5.0 #{@tf_proph} --peptide_count #{@tf_peptide_count}"
       assert(File.exist?(@tf_peptide_count), "file #{@tf_peptide_count} exists")
       file = IO.read(@tf_peptide_count)
       assert_match("gi|16132176|ref|NP_418775.1|\t2", file)

data/test/tc_sequest.rb CHANGED Viewed

@@ -4,8 +4,10 @@
 require 'spec_id'
 require 'spec_id/sequest'
 require 'test/unit'
+require 'spec/mzxml'
+NODELETE = false
 class SequestTest < Test::Unit::TestCase
@@ -18,10 +20,10 @@ class SequestTest < Test::Unit::TestCase
     @tf_bioworks_xml = @tfiles + "bioworks_small.xml"
   end
-  def test_set_from_bioworks
+  def Xtest_set_from_bioworks
     if File.exist? @tfiles_l
       out_path = '.'
-      pepxml_objs = SpecID::Sequest::PepXML.set_from_bioworks(@tf_params, @tf_bioworks_xml, @tf_mzxml_path, out_path)
+      pepxml_objs = Sequest::PepXML.set_from_bioworks_xml(@tf_bioworks_xml, @tf_params, {:ms_path => @tf_mzxml_path, :out_path => out_path})
       pepxml_objs.each do |obj|
         assert(obj.spectrum_queries.size > 2)
         assert(obj.spectrum_queries.first.search_results.first.search_hits.size > 0)
@@ -55,10 +57,10 @@ class SequestTest < Test::Unit::TestCase
       mzxml_path = @tfiles + "opd1"
       out_path = @tfiles
       pepxml_version = 18
-      pepxml_objs = SpecID::Sequest::PepXML.set_from_bioworks(params, bioworks_xml, mzxml_path, out_path, pepxml_version, "trypsin")
+      pepxml_objs = Sequest::PepXML.set_from_bioworks_xml(bioworks_xml, params, {:ms_data => mzxml_path, :out_path => out_path, :pepxml_version => pepxml_version, :sample_enzyme => "trypsin"})
       puts "TOOK #{Time.new - st}secs"
       po = pepxml_objs.first
-      assert_equal(pepxml_version, SpecID::Sequest::PepXML.pepxml_version)
+      assert_equal(pepxml_version, Sequest::PepXML.pepxml_version)
       # MSMSPipelineAnalysis
       pipe = po.msms_pipeline_analysis
@@ -197,9 +199,9 @@ class SequestTest < Test::Unit::TestCase
-  def test_calc_num_tol_term
-    params = SpecID::Sequest::Params.new(@tf_params)
-    scall = SpecID::Sequest::PepXML::SearchHit
+  def Xtest_calc_num_tol_term
+    params = Sequest::Params.new(@tf_params)
+    scall = Sequest::PepXML::SearchHit
     sym = :calc_num_tol_term
     assert_equal(2, scall.send(sym, params, "K.EPTIDR.E"))
     assert_equal(1, scall.send(sym, params, "K.PEPTIDR.E"))
@@ -207,9 +209,9 @@ class SequestTest < Test::Unit::TestCase
     assert_equal(0, scall.send(sym, params, "F.PEPTIDW.R"))
   end
-  def test_calc_num_missed_cleavages
-    params = SpecID::Sequest::Params.new(@tf_params)
-    scall = SpecID::Sequest::PepXML::SearchHit
+  def Xtest_calc_num_missed_cleavages
+    params = Sequest::Params.new(@tf_params)
+    scall = Sequest::PepXML::SearchHit
     sym = :calc_num_missed_cleavages
     assert_equal(0, scall.send(sym, params, "K.EPTIDR.E"))
     assert_equal(0, scall.send(sym, params, "K.PEPTIDR.E"))
@@ -225,35 +227,27 @@ class SequestTest < Test::Unit::TestCase
   end
-  def test_sys_ind_basename
-    assert_equal("hello.fasta", SpecID::Sequest::Params.new._sys_ind_basename("C:\\Xcalibur\\database\\hello.fasta"))
-    assert_equal("hello.fasta", SpecID::Sequest::Params.new._sys_ind_basename("/work/john/hello.fasta"))
+  def Xtest_sys_ind_basename
+    assert_equal("hello.fasta", Sequest::Params.new._sys_ind_basename("C:\\Xcalibur\\database\\hello.fasta"))
+    assert_equal("hello.fasta", Sequest::Params.new._sys_ind_basename("/work/john/hello.fasta"))
   end
-  def test_modifications
-    obj = SpecID::Sequest::PepXML::Modifications.new(nil, "(M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000) ")
+  def Xtest_modifications
+    obj = Sequest::PepXML::Modifications.new(nil, "(M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000) ")
     answ = {[:C, 12.0]=>"^", [:S, 80.0]=>"@", [:M, 29.0]=>"#", [:M, 15.9]=>"*", [:ct, 12.33]=>"[", [:nt, 14.2]=>"]"}
     assert_equal(answ, obj.mod_symbols_hash, "mod_symbols_hash")
     ## need more here
   end
-  def test_non_standard_aa_removal
-    hash = {"K.PEPTIDE.Z" => "K.PEPTIDE.Z", "K.*M" => "K.M", "aI" => 'I', "YI.&" => "YI.", "EI.!@#\$%^&*(){}[]|\\;:'\"<>,?/EI" => 'EI.EI'}
-    cl = proc {|v| SpecID::Sequest::PepXML::SearchHit.remove_non_amino_acids(v) }
-    hash.each do |k,v|
-      assert_equal(v, cl.call(k))
-    end
-  end
-  def test_modification_info
+  def Xtest_modification_info
     hash = {
       :mod_nterm_mass => 520.2,
       :modified_peptide => "MOD*IFI^E&D",
       :mod_aminoacid_mass => [[3, 150.3], [6, 345.2]],
     }
     answ = "<modification_info mod_nterm_mass=\"520.2\" modified_peptide=\"MOD*IFI^E&amp;D\">\n\t<mod_aminoacid_mass position=\"3\" mass=\"150.3\"/>\n\t<mod_aminoacid_mass position=\"6\" mass=\"345.2\"/>\n</modification_info>\n"
-    string = SpecID::Sequest::PepXML::SearchHit::ModificationInfo.new(hash).to_pepxml
+    string = Sequest::PepXML::SearchHit::ModificationInfo.new(hash).to_pepxml
     assert_match(_re('<modification_info'), answ)
     assert_match(_re(" mod_nterm_mass=\"520.2\""), answ)
     assert_match(_re(" modified_peptide=\"MOD*IFI^E&amp;D\""), answ)
@@ -270,22 +264,73 @@ class SequestTest < Test::Unit::TestCase
   end
   def test_modifications
-    params = SpecID::Sequest::Params.new(@tf_params)
+    params = Sequest::Params.new(@tf_params)
     mod_string = "(M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000) "
     params.diff_search_options = "15.90000 M 29.00000 M 80.00000 S 12.00000 C"
     params.term_diff_search_options = "14.20000 12.33000"
-    assert 1
-=begin
-    mod = SpecID::Sequest::PepXML::Modifications(params, mod_string)
-SpecID::Sequest::PepXML::Modifications
-    peptide = "PEPTIDE"
+    mod = Sequest::PepXML::Modifications.new(params, mod_string)
     ## no mods
+    peptide = "PEPTIDE"
     assert_equal(nil, mod.modification_info(peptide))
     peptide = "]M*EC^S@IDM#M*EMSCM["
-    p mod.modification_info(peptide)
-=end
+    modinfo = mod.modification_info(peptide)
+    assert_equal(peptide, modinfo.modified_peptide)
+    assert_in_delta(146.40054, modinfo.mod_nterm_mass, 0.000001)
+    assert_in_delta(160.52994, modinfo.mod_cterm_mass, 0.000001)
+  end
+  # splits string on ' ' and asserts that each piece matches every line in
+  # lines that is found by find_line_regexp
+  def match_modline_pieces(lines, find_line_regexp, string)
+    pieces = string.split(' ').map {|v| /#{Regexp.escape(v)}/ }
+    lines.each do |line|
+      if line =~ find_line_regexp
+        pieces.each do |piece|
+          assert_match(piece, line)
+        end
+      end
+    end
   end
+  def test_modifications_in_run
+    if File.exist? @tfiles_l
+      modfiles_sequest_dir = @tfiles_l + 'opd1_2runs_2mods/sequest/'
+      modfiles_data_dir = @tfiles_l + 'opd1_2runs_2mods/data/'
+      srgfile = modfiles_sequest_dir + 'tmp.srg'
+      out_path = modfiles_sequest_dir + 'pepxml'
+      modfiles = %w(020 040).map do |file|
+        modfiles_sequest_dir + file + ".srf"
+      end
+      objs = Sequest::PepXML.set_from_bioworks( SRFGroup.new(modfiles).to_srg(srgfile), {:ms_data => modfiles_data_dir, :out_path => out_path, :print => true, :backup_db_path => '/project/marcotte/marcotte/ms/database'} )
+    %w(020 040).each do |file|
+      fn = out_path + '/' + file + '.xml'
+      assert(File.exist?(fn), "file #{fn} exists")
+      beginning = IO.read(fn)
+      lines = beginning.split("\n")
+      [
+        [/aminoacid="M"/, '<aminoacid_modification symbol="*" massdiff="+15.9994" aminoacid="M" variable="Y" binary="N" mass="147.192"'],
+        [/aminoacid="S"/, '<aminoacid_modification symbol="#" massdiff="+79.9799" aminoacid="S" variable="Y" binary="N" mass="167.0581"'],
+        [/aminoacid="T"/, '<aminoacid_modification symbol="#" massdiff="+79.9799" aminoacid="T" variable="Y" binary="N" mass="181.085"'],
+        [/aminoacid="Y"/, '<aminoacid_modification symbol="#" massdiff="+79.9799" aminoacid="Y" variable="Y" binary="N" mass="243.1559"'],
+        [/parameter name="diff_search_options"/, '<parameter name="diff_search_options" value="15.999400 M 79.979900 STY 0.000000 M 0.000000 X 0.000000 T 0.000000 Y"/>'],
+      ].each do |a,b|
+        match_modline_pieces(lines, a, b)
+      end
+      [
+        '<modification_info modified_peptide="Y#RLGGS#T#K">',
+        '<mod_aminoacid_mass position="1" mass="243.1559"/>',
+        '<mod_aminoacid_mass position="7" mass="167.0581"/>',
+        '</modification_info>',
+        '<mod_aminoacid_mass position="9" mass="181.085"/>'
+      ].each do |line|
+        assert_match(/#{Regexp.escape(line)}/, beginning, "a modification info for a peptide")
+      end
+      File.unlink(fn) unless NODELETE
+    end
+    else
+      assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})") )
+    end
+  end
 end