RubyGems - mspire - Versions diffs - 0.1.5 → 0.1.7 - Mend

mspire 0.1.5 → 0.1.7

Files changed (47) hide show

data/Rakefile +5 -2
data/bin/bioworks_to_pepxml.rb +84 -40
data/bin/fasta_shaker.rb +100 -0
data/bin/filter_spec_id.rb +185 -23
data/bin/gi2annot.rb +2 -110
data/bin/id_class_anal.rb +31 -21
data/bin/id_precision.rb +12 -8
data/bin/{false_positive_rate.rb → precision.rb} +1 -1
data/bin/protein_summary.rb +55 -62
data/changelog.txt +34 -0
data/lib/align.rb +0 -1
data/lib/fasta.rb +88 -24
data/lib/gi.rb +114 -0
data/lib/roc.rb +64 -58
data/lib/spec_id/aa_freqs.rb +166 -0
data/lib/spec_id/bioworks.rb +5 -1
data/lib/spec_id/precision.rb +427 -0
data/lib/spec_id/proph.rb +2 -2
data/lib/spec_id/sequest.rb +810 -113
data/lib/spec_id/srf.rb +486 -0
data/lib/spec_id.rb +107 -23
data/release_notes.txt +11 -0
data/script/estimate_fpr_by_cysteine.rb +226 -0
data/script/filter-peps.rb +3 -3
data/script/find_cysteine_background.rb +137 -0
data/script/gen_database_searching.rb +11 -7
data/script/genuine_tps_and_probs.rb +136 -0
data/script/top_hit_per_scan.rb +5 -2
data/test/tc_aa_freqs.rb +59 -0
data/test/tc_bioworks.rb +6 -1
data/test/tc_bioworks_to_pepxml.rb +25 -18
data/test/tc_fasta.rb +81 -3
data/test/tc_fasta_shaker.rb +147 -0
data/test/tc_gi.rb +20 -0
data/test/tc_id_class_anal.rb +9 -12
data/test/tc_id_precision.rb +12 -11
data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
data/test/tc_protein_summary.rb +31 -22
data/test/tc_roc.rb +95 -50
data/test/tc_sequest.rb +212 -145
data/test/tc_spec.rb +10 -5
data/test/tc_spec_id.rb +0 -2
data/test/tc_spec_id_xml.rb +36 -0
data/test/tc_srf.rb +216 -0
metadata +35 -21
data/lib/spec_id/false_positive_rate.rb +0 -476
data/test/tc_gi2annot.rb +0 -12

data/script/genuine_tps_and_probs.rb ADDED Viewed

@@ -0,0 +1,136 @@
+#!/usr/bin/ruby -w
+# Here is what I plotted: take each peptide-protein pair identified on the top-hit scans -- the same pair will likely be identified on multiple scans -- and plot the top probability of each such pair.
+# There are 43 such identified peptides for Sashimi, whereas SEQUEST identifies about 66. So you'll have 66 (1 - p-value) points to plot, where I had 43. Similarly for OMICS.
+require 'spec_id'
+require 'fasta'
+require 'optparse'
+require 'ostruct'
+# returns an accession number if available, or the entire reference (less the
+# starting '>'
+def get_fasta_accession(fasta_prot)
+  head = fasta_prot.header
+  if head =~ ACC_REGEX
+    $1.dup
+  else
+    head.sub(/^>/, '').rstrip
+  end
+end
+# returns the accession number from a reference, or the complete reference
+def accession_from_ref(pep)
+  ref = pep.prot.reference
+  if ref =~ ACC_REGEX
+    $1.dup
+  else
+    ref.rstrip
+  end
+end
+def get_pep_prot_accession(pep)
+  acc = pep.prot.accession
+  if !acc || acc == '0' || acc == 0
+    accession_from_ref(pep)
+  else
+    acc
+  end
+end
+#####################################################################
+# MAIN
+#####################################################################
+opt = OpenStruct.new
+opt.p = 'prob'
+opts = OptionParser.new do |op|
+  op.banner = "usage: #{File.basename(__FILE__)} bioworks.xml true_hits.fasta"
+  op.separator "     [prints to stdout tab delimited table]"
+  op.on('-t', '--ties', 'allow ties on best hit') {|v| opt.t = v }
+  op.on('-p', '--param <s>', 'param: (xcorr | prob)') {|v| opt.p = v}
+end
+opts.parse!
+if ARGV.size < 2
+  puts opts
+  exit
+end
+case opt.p
+when 'prob'
+  param = :peptide_probability
+  best = :first
+when 'xcorr'
+  param = :xcorr
+  best = :last
+else
+  abort "incorrect param: #{opt.p}"
+end
+############################
+# GLOBALS
+DELIM = "\t"
+ACC_REGEX = /\|(.*?)\|/o
+############################
+bioworks = ARGV[0]
+fasta_file = ARGV[1]
+fprots = Fasta.new.read_file(fasta_file).prots
+gi_nums = fprots.map {|prot| get_fasta_accession(prot) }
+peptides = SpecID.new(bioworks).peps
+## Get the best peptide(s) per scan
+top_peps_per_scan = []
+peptides.hash_by(:base_name, :first_scan).each do |bn_scan, pep_array|
+  sorted_list = pep_array.sort_by {|pep| pep.send(param).to_f }
+  top_peps = if best == :first ; [sorted_list.shift] ; else [sorted_list.pop] end
+  found_another = false
+  sorted_list.each do |pep|
+    if pep.send(param).to_f == top_peps.send(best).send(param).to_f
+      if opt.t
+        top_peps << pep
+      else
+        found_another = true
+      end
+    end
+  end
+  unless found_another
+    top_peps_per_scan.push( *top_peps )
+  end
+end
+## Get the best scoring peptide per peptide/prot from list of best
+## peptides/scan
+top_pep_seq_prots = top_peps_per_scan.hash_by {|pep| [pep.sequence, get_pep_prot_accession(pep)] }.map do |k,pep_array|
+  pep_array.sort_by {|pep| pep.send(param).to_f }.send(best)
+end
+## sort the peptides by best score
+sorted_top_pep_seq_prots = top_pep_seq_prots.sort_by {|pep| pep.send(param).to_f }
+if best == :last ; sorted_top_pep_seq_prots.reverse! end
+## plot the probability vs. the number of tps
+puts ['#TPs', param, 'sequence', 'protein accession', 'xcorr'].join(DELIM)
+tps = 0
+sorted_top_pep_seq_prots.each do |pep|
+  if gi_nums.include?( get_pep_prot_accession(pep) )
+    tps += 1
+    puts [tps.to_s, pep.send(param), pep.sequence, get_pep_prot_accession(pep), pep.xcorr].join(DELIM)
+  end
+end

data/script/top_hit_per_scan.rb CHANGED Viewed

@@ -4,7 +4,6 @@
 cats = %w(base_name sequence xcorr deltacn first_scan last_scan)
 ###################################################################
-require 'pp'
 require 'spec_id'
 require 'hash_by'
@@ -46,7 +45,11 @@ outfile_top = file.sub(/\.xml$/, extension_top)
 outfile_all = file.sub(/\.xml$/, extension_all)
 sp = SpecID.new(file)
-pep_hash = sp.peps.hash_by(:first_scan, :last_scan)
+# The old (incorrect) version:
+# pep_hash = sp.peps.hash_by(:first_scan, :last_scan)
+# The correct version:
+pep_hash = sp.peps.hash_by(:base_name, :first_scan, :last_scan)
 top_per_scan = pep_hash.map {|k,v| v.sort_by {|ob| ob.xcorr.to_f }.last }
 top_per_scan = top_per_scan.sort_by {|pep| pep.first_scan.to_i }

data/test/tc_aa_freqs.rb ADDED Viewed

@@ -0,0 +1,59 @@
+require 'test/unit'
+require 'spec_id/aa_freqs'
+class FastaTest < Test::Unit::TestCase
+  def initialize(arg)
+    super(arg)
+    @tfiles = File.dirname(__FILE__) + '/tfiles/'
+    @sf = @tfiles + "small.fasta"
+  end
+  def test_basic
+    obj = SpecID::AAFreqs.new(@sf)
+    expect = {:I=>0.0628918621937819, :S=>0.0539719475147049, :D=>0.0526145691939758, :Z=>0.0, :L=>0.102772929998061, :T=>0.0491888048607071, :E=>0.0609527503070261, :O=>0.0, :C=>0.0157714433456144, :K=>0.0471850559110594, :U=>0.0, :Q=>0.0382651412319824, :W=>0.0137030573330748, :A=>0.101997285243359, :M=>0.0294745006786892, :J=>0.0, :G=>0.0811195139292871, :Y=>0.0254670027793937, :X=>0.0, :F=>0.0418201796910348, :R=>0.0546829552065154, :V=>0.0702604873634542, :H=>0.0213302307543145, :B=>0.0, :N=>0.03471010277293, :P=>0.0418201796910348}
+    aaf =  obj.aafreqs
+    expect.each do |k,v|
+      assert(aaf.key?(k))
+      assert_in_delta(v, aaf[k], 0.00000001, "freqs match up")
+    end
+    sum = 0.0
+    aaf.values.each do |v|
+      sum += v
+    end
+    assert_in_delta(1.0, sum, 0.0000000000001, "all freqs add to 1")
+  end
+  def test_probability_of_length_table
+    # p SpecID::AAFreqs.probability_of_length_table(0.01, 4)
+    assert_equal_arrs_in_delta([0.0, 0.01, 0.0199, 0.029701, 0.0394039900000001],  SpecID::AAFreqs.probability_of_length_table(0.01, 4), 0.000000001)
+    assert_equal_arrs_in_delta([0.0, 0.2, 0.36, 0.488, 0.5904], SpecID::AAFreqs.probability_of_length_table(0.2, 4), 0.000000001)
+  end
+  def test_actual_and_expected_number
+    fobj = Fasta.new.read_file(@sf)
+    obj = SpecID::AAFreqs.new
+    obj.aafreqs = obj.calculate_frequencies(fobj)
+    peptide_aaseqs = fobj.prots.map do |prot|
+      prot.aaseq[0..12]
+    end
+    assert_equal(50, peptide_aaseqs.size, 'sanity check')
+    (ac,ex) = obj.actual_and_expected_number(peptide_aaseqs, :C, 1)
+    assert_equal(9, ac)
+    assert_in_delta( 9.33530631238985, ex, 0.0000000001)
+  end
+  private
+  def assert_equal_arrs_in_delta(expect, actual, delta)
+    expect.each_with_index do |v,i|
+      assert_in_delta(v, actual[i], delta)
+    end
+  end
+end

data/test/tc_bioworks.rb CHANGED Viewed

@@ -8,7 +8,8 @@ class BioworksTest < Test::Unit::TestCase
   def initialize(arg)
     super(arg)
     @tfiles = File.dirname(__FILE__) + '/tfiles/'
-    @tf_bioworks_xml = @tfiles + "bioworks.xml"
+    @tfiles_l = File.dirname(__FILE__) + '/tfiles_large/'
+    @tf_bioworks_xml = @tfiles_l + "bioworks.xml"
     @tf_bioworks_xml_small = @tfiles + "bioworks_small.xml"
     @tf_bioworks_xml_really_small = @tfiles + "bioworks_with_INV_small.xml"
     @tf_params = @tfiles + "bioworks32.params"
@@ -34,9 +35,13 @@ class BioworksTest < Test::Unit::TestCase
   end
   def Xtest_xml_parsing_speed
+    if File.exist? @tfiles_l
     #puts Benchmark.bm {|b|
       obj = SpecID::Bioworks.new(@tf_bioworks_xml)
     #}
+    else
+      assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})") )
+    end
   end
   def test_xml_parsing_bioworks_single

data/test/tc_bioworks_to_pepxml.rb CHANGED Viewed

@@ -9,11 +9,10 @@ class BioworksToPepXMLTest < Test::Unit::TestCase
   def initialize(arg)
     super(arg)
     @tfiles = File.dirname(__FILE__) + '/tfiles/'
-    @tf_mzxml_path = @tfiles + "yeast_gly_mzXML"
+    @tfiles_l = File.dirname(__FILE__) + '/tfiles_large/'
+    @tf_mzxml_path = @tfiles_l + "yeast_gly_mzXML"
     @tf_bioworks_xml = @tfiles + "bioworks_small.xml"
     @tf_params = @tfiles + "bioworks32.params"
-    @tf_opd1 = @tfiles + "opd1/bioworks.000.oldparams.xml"
-    @tf_opd1_mzxml = @tfiles + "opd1/000.mzXML.timeIndex"
     @no_delete = false
     @out_path = @tfiles + 'pepxml/'
     @cmd = "ruby -I#{File.join(File.dirname(__FILE__), "..", "lib")} -S bioworks_to_pepxml.rb "
@@ -33,23 +32,31 @@ class BioworksToPepXMLTest < Test::Unit::TestCase
   end
   def test_basic
-    cmd = "#{@cmd} -p #{@tf_params} -o #{@out_path} #{@tf_bioworks_xml} -m #{@tf_mzxml_path} -d /work/special/path"
-    prc = proc {|file|
-      assert(File.exist?(file), "#{file} exists")
-    }
-    _basic(cmd, prc)
-    unless @no_delete then FileUtils.rm_rf(@out_path) end
+    if File.exist? @tfiles_l
+      cmd = "#{@cmd} -p #{@tf_params} -o #{@out_path} #{@tf_bioworks_xml} -m #{@tf_mzxml_path} -d /work/special/path"
+      prc = proc {|file|
+        assert(File.exist?(file), "#{file} exists")
+      }
+      _basic(cmd, prc)
+      unless @no_delete then FileUtils.rm_rf(@out_path) end
+    else
+      assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})") )
+    end
   end
   def test_database
-    cmd = "#{@cmd} -p #{@tf_params} -o #{@out_path} #{@tf_bioworks_xml} -m #{@tf_mzxml_path}"
-    db_re = /C:\\Xcalibur\\database\\ecoli_K12_ncbi_20060321.fasta/
-    assert_match(db_re, IO.read(@tf_params))
-    prc = proc {|file|
-      assert(File.exist?(file))
-      assert_no_match(db_re, IO.read(file))
-    }
-    _basic(cmd, prc)
-    unless @no_delete then FileUtils.rm_rf(@out_path) end
+    if File.exist? @tfiles_l
+      cmd = "#{@cmd} -p #{@tf_params} -o #{@out_path} #{@tf_bioworks_xml} -m #{@tf_mzxml_path}"
+      db_re = /C:\\Xcalibur\\database\\ecoli_K12_ncbi_20060321.fasta/
+      assert_match(db_re, IO.read(@tf_params))
+      prc = proc {|file|
+        assert(File.exist?(file))
+        assert_no_match(db_re, IO.read(file))
+      }
+      _basic(cmd, prc)
+      unless @no_delete then FileUtils.rm_rf(@out_path) end
+    else
+      assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})") )
+    end
   end
 end

data/test/tc_fasta.rb CHANGED Viewed

@@ -4,6 +4,8 @@ require File.dirname(File.expand_path(__FILE__)) + '/load_bin_path'
 require 'test/unit'
 require 'fasta'
 require 'assert_files'
+require 'sample_enzyme'
+require 'set'
 module Test::Unit::Assertions
@@ -11,6 +13,7 @@ module Test::Unit::Assertions
 end
 class FastaTest < Test::Unit::TestCase
+  NODELETE = false
   def initialize(arg)
     super(arg)
@@ -73,7 +76,7 @@ class FastaTest < Test::Unit::TestCase
   def test_mod
     ## Testing shuffle:
-    puts `#{@fasta_mod_cmd + 'shuffle ' + @sf}`
+    `#{@fasta_mod_cmd + 'shuffle ' + @sf}`
     assert(File.exist?(@sf_shuffle), "output file #{@sf_shuffle} exists")
     ob1 = Fasta.new.read_file(@sf)
     ob2 = Fasta.new.read_file(@sf_shuffle)
@@ -83,7 +86,7 @@ class FastaTest < Test::Unit::TestCase
     assert(_are_shuffled?(ob1,ob2))
     ## Testing invert:
-    puts `#{@fasta_mod_cmd + 'invert ' + @sf}`
+    `#{@fasta_mod_cmd + 'invert ' + @sf}`
     assert(File.exist?(@sf_invert), "output file #{@sf_invert} exists")
     ob1 = Fasta.new.read_file(@sf)
     ob2 = Fasta.new.read_file(@sf_invert)
@@ -94,7 +97,7 @@ class FastaTest < Test::Unit::TestCase
     ## Testing prefix
     #puts "#{@fasta_mod_cmd + '-p _HELLO_ invert ' + @sf}"
-    puts `#{@fasta_mod_cmd + 'invert -p _HELLO_ ' + @sf}` # NOT WORKING!
+    `#{@fasta_mod_cmd + 'invert -p _HELLO_ ' + @sf}` # NOT WORKING!
     assert(File.exist?(@sf_invert), "output file #{@sf_invert} exists")
     ob1 = Fasta.new.read_file(@sf)
     ob2 = Fasta.new.read_file(@sf_invert)
@@ -176,6 +179,81 @@ class FastaTest < Test::Unit::TestCase
     end
   end
+  def test_invert_tryptic_peptides
+    # FOR INDIVIDUAL PROTEINS:
+    seq = 'ABCKCDERDEKDGEKWXYRRKDER'
+    # tryptic = ABCK, CDER, DEK, DGEK, WXYR, R, K, DER
+    tryp = SampleEnzyme.tryptic(seq)
+    reverse_tryptic = %w(CBAK EDCR EDK EGDK YXWR R K EDR)
+    prot = Fasta::Prot.new(nil, seq)
+    prot.invert_tryptic_peptides!
+    assert_equal(reverse_tryptic.join(''), prot.aaseq, "reversing tryptic peptides")
+    seq = 'XYRABCD'
+    prot = Fasta::Prot.new(nil, seq)
+    prot.invert_tryptic_peptides!
+    assert_equal('YXRDCBA', prot.aaseq, 'last peptide treated special')
+    seq = 'XYRPABCD'
+    prot = Fasta::Prot.new(nil, seq)
+    prot.invert_tryptic_peptides!
+    assert_equal('DCBAPRYX', prot.aaseq, 'with a proline')
+  end
+  def test_fraction_of_prots
+    peps = [['>silly1', "PEPTIDE"], ['>silly2', "ANOTHER"], ['>silly3', "AGAIN"], ['>silly4', "LARMA"]]
+    prots = peps.map do |header, seq|
+      Fasta::Prot.new(header, seq)
+    end
+    f = Fasta.new(prots)
+    # simple:
+    n = f.fraction_of_prots(1.0)
+    assert_equal(f.prots.map{|v| v.header }.to_set, n.prots.map{|v| v.header }.to_set, "same headers")
+    assert_equal(f.prots.map{|v| v.aaseq }.to_set, n.prots.map{|v| v.aaseq }.to_set, "same aaseqs")
+    pre = proc {|cnt| "SHUFF_f#{cnt}_" }
+    # test prefix
+    n = f.fraction_of_prots(1.0, pre)
+    n.prots.each do |prot|
+      assert_match(/^>SHUFF_f0_/, prot.header, "contains new prefix")
+    end
+    # smaller
+    n = f.fraction_of_prots(0.75, pre)
+    assert_equal(3, n.prots.size, "correct number of proteins")
+    # bigger
+    n = f.fraction_of_prots(2.5, pre)
+    assert_equal(10, n.prots.size, "correct number of proteins")
+    n.prots[0..3].each {|prt| assert_match(/^>SHUFF_f0_/, prt.header ) }
+    n.prots[4..7].each {|prt| assert_match(/^>SHUFF_f1_/, prt.header ) }
+    n.prots[8..9].each {|prt| assert_match(/^>SHUFF_f2_/, prt.header ) }
+    # crazy
+    n = f.fraction_of_prots(1.33, pre)
+    assert_equal(6, n.prots.size, "correct number of proteins")
+  end
+  def test_inverted_tryptic_peptides_for_file
+    # for a file:
+    tmpfile = @tfiles + "fasta.tmp"
+    fasta = Fasta.new.read_file(@sf)
+    fasta.aaseq_invert_tryptic_peptides!
+    fasta.write_file(tmpfile)
+    lines = IO.readlines(tmpfile)
+    #normal = 'MKRISTTITTTITITTGNGAG'
+    inverted_tryptic = 'MKRGAGNGTTITITTTITTSI' ## ?????
+    assert_equal(inverted_tryptic, lines[1].chomp)
+    #normal =  'MATYLIGDVHGCYDELIALLHKVEFTPGKDTLWLTGDLVARGPGSLDVLRYVKSLGDSVRLVLGNHDLHL
+    # LAVFAGISRNKPKDRLTPLLEAPDADELLNWLRRQPLLQIDEEKKLVMAHAGITPQWDLQTAKECARDVE
+    # AVLSSDSYPFFLDAMYGDMPNNWSPELRGLGRLRFITNAFTRMRFCFPNGQLDMYSKESPEEAPAPLKPW
+    # FAIPGPVAEEYSIAFGHWASLEGKGTPEGIYALDTGCCWGGTLTCLRWEDKQYFVQPSNRHKDLGEAAAS'
+    inverted_tryptic = 'HLLAILEDYCGHVDGILYTAMKGPTFEVKAVLDGTLWLTDRLVDLSGPGRVYKVSDGLSRSIGAFVALLHLDHNGLVLRPKNKDRLWNLLEDADPAELLPTLRREEDIQLLPQKKATQLDWQPTIGAHAMVLKACERLEPSWNNPMDGYMADLFFPYSDSSLVAEVDRGLGRLRTFANTIFRMRSYMDLQGNPFCFKGELSAWHGFAISYEEAVPGPIAFWPKLPAPAEEPSEKLCTLTGGWCCGTDLAYIGEPTGRDEWKNSPQVFYQRHKSAAAEGLD'
+    assert_equal(inverted_tryptic, lines[-1].chomp)
+    File.unlink(tmpfile) unless NODELETE
+  end
   ## HELPER ASSERTIONS:
   def _are_inverted?(obj1, obj2)

data/test/tc_fasta_shaker.rb ADDED Viewed

@@ -0,0 +1,147 @@
+require 'test/unit'
+require 'fasta'
+Filestring = ">gi|P1
+AMKRGAN
+>gi|P2
+CRGATKKTAGRPMEK
+>gi|P3
+PEPTIDE
+"
+Rev = ">gi|P1
+NAGRKMA
+>gi|P2
+KEMPRGATKKTAGRC
+>gi|P3
+EDITPEP
+"
+RevTryptic = ">gi|P1
+MAKRNAG
+>gi|P2
+CRTAGKKEMPRGATK
+>gi|P3
+EDITPEP
+"
+ShuffTryptic = ">gi|P1
+MAKRNAG
+>gi|P2
+CRTAGKKEMPRGATK
+>gi|P3
+EDITPEP
+"
+class TestBasic < Test::Unit::TestCase
+  def setup
+    testdir = File.dirname(__FILE__)
+    libdir = testdir + '/../lib'
+    bindir = testdir +  '/../bin'
+    progname = "fasta_shaker.rb"
+    @cmd = "ruby -I #{libdir} #{bindir}/#{progname} "
+    @tfiles = testdir + '/tfiles/'
+    @tmpfile = @tfiles + "littlefasta.trash.fasta"
+    File.open(@tmpfile, "w") {|fh| fh.print Filestring }
+    @f = @tfiles + "trash.fasta"
+  end
+  def teardown
+    File.unlink @tmpfile if File.exist? @tmpfile
+    File.unlink @f if File.exist? @f
+  end
+  def test_reverse
+    cmd = @cmd + "reverse #{@tmpfile} -o #{@f}"
+    system cmd
+    assert_equal(Rev, fastap(@f).to_s)
+  end
+  def test_reverse_tryptic
+    cmd = @cmd + "reverse #{@tmpfile} -o #{@f} --tryptic_peptides"
+    system cmd
+    assert_equal(RevTryptic, fastap(@f).to_s)
+  end
+  def test_shuff_tryptic
+    cmd = @cmd + "shuffle #{@tmpfile} -o #{@f} --tryptic_peptides"
+    system cmd
+    lns = fastap(@f).to_s.split("\n")
+    assert_equal('KR', lns[1][2..3])
+    assert_equal('R', lns[3][1..1])
+    assert_equal('CRGATKKTAGRPMEK'.size, lns[3].size, "sequence is same size")
+    assert_not_equal('CRGATKKTAGRPMEK', lns[3], "sequence is randomised from original [remote chance of failure] rerun to make sure")
+  end
+  def test_shuffle
+    cmd = @cmd + "shuffle #{@tmpfile} -o #{@f}"
+    system cmd
+    clines = strlns(Filestring)
+    lns = fastalns(@f)
+    lns.each_with_index do |line,i|
+      assert_equal(clines[i].size, line.size, "same size lines: A: <<#{clines[i]}>> B: <<#{line}>>")
+    end
+    assert_equal('CRGATKKTAGRPMEK'.size, lns[3].size, "sequence is same size")
+    assert_not_equal('CRGATKKTAGRPMEK', lns[3], "sequence is randomised from original [remote chance of failure] rerun to make sure")
+  end
+  def test_cat
+    cmd = @cmd + "reverse #{@tmpfile} -c -o #{@f}"
+    `#{cmd}`  ## suppress warning
+    lns = fastalns(@f)
+    assert_equal(strlns(Filestring), lns[0..5], "first part equal")
+    assert_equal(strlns(Rev), lns[6..-1], "second part equal")
+  end
+  def test_fraction
+    cmd = @cmd + "reverse #{@tmpfile} -f 2.6 -o #{@f}"
+    `#{cmd}`
+    assert_equal(8, fastap(@f).size)
+    cmd = @cmd + "shuffle #{@tmpfile} -f 2.0 -c -p MINE_ -o #{@f}"
+    `#{cmd}`
+    assert_equal(9, fastap(@f).size)
+    fp = fastap(@f)
+    fp[0..2].each do |prt|
+      assert_match(/^>/, prt.header, "prefix matches")
+    end
+    fp[3..5].each do |prt|
+      assert_match(/^>MINE_f0_/, prt.header, "prefix matches")
+    end
+    fp[6..8].each do |prt|
+      assert_match(/^>MINE_f1_/, prt.header, "prefix matches")
+    end
+    #cmd = @cmd + "reverse #{@tmpfile} -c -f 2.0 -o #{@f}"
+  end
+  def test_prefix
+    cmd = @cmd + "reverse #{@tmpfile} -p SILLY_ -o #{@f}"
+    `#{cmd}`
+    fp = fastap(@f)
+    fp.each do |prt|
+      assert_match(/^>SILLY_.+/, prt.header)
+    end
+  end
+  private
+  def strlns(str)
+    str.split("\n")
+  end
+  def fastalns(fn)
+    assert(File.exist?(fn), "FILE: #{fn} exists")
+    IO.read(fn).split("\n")
+  end
+  # returns the fasta object proteins
+  def fastap(fn)
+    assert(File.exist?(fn), "FILE: #{fn} exists")
+    Fasta.new.read_file(fn).prots
+  end
+end

data/test/tc_gi.rb ADDED Viewed

@@ -0,0 +1,20 @@
+require 'test/unit'
+require 'gi'
+class Gi2AnnotTest < Test::Unit::TestCase
+  ROOT_DIR = File.join(File.dirname(__FILE__), '..')
+  def test_single_query
+    #begin
+      annot = GI.gi2annot([16130548]).first
+    #rescue
+      puts "SKIPPING gi2annot test since no internet connection available:"
+      puts "#{$!}"
+      assert true
+    #else
+      assert_equal('CP4-57 prophage; RNase LS [Escherichia coli K12]'+"\n", annot)
+    #end
+  end
+end

data/test/tc_id_class_anal.rb CHANGED Viewed

@@ -23,14 +23,12 @@ class IDClassAnalTest < Test::Unit::TestCase
     output = `#{@cmd} -p INV_ #{@tf_proph_inv}`
     fps = [1.00, 1.00, 0.97]
     tps = [1.00, 1.00, 0.98, 0.97, 0.97, 0.97, 0.97]
-    puts output
     #File.open("tmp.csv","w") do |fh| fh.print output end
     assert 1
   end
-  def Xtest_basic
+  def test_basic
     output = `#{@cmd} -p INV_ #{@tf_bioworks_esmall_xml}`
-    # @TODO: that's the output, need to grab for consistency sake
     exp = [
       [1, 1.0, 0.0],
       [2, 1.0, 0.0],
@@ -40,11 +38,11 @@ class IDClassAnalTest < Test::Unit::TestCase
       [6, 1.0, 0.0],
       [9, 1.0, 0.0],
       [10, 1.0, 0.0],
-      [11, 0.916666666666667, 0.166666666666667],
-      [12, 0.923076923076923, 0.153846153846154],
-      [13, 0.928571428571429, 0.142857142857143],
-      [14, 0.933333333333333, 0.133333333333333],
-      [15, 0.882352941176471, 0.235294117647059]
+      [11, 0.909090909090909],
+      [12, 0.916666666666667],
+      [13, 0.923076923076923],
+      [14, 0.928571428571429],
+      [15, 0.866666666666667],
     ]
     outarr = output.split($/)
     exp.each_with_index do |line,i|
@@ -55,18 +53,17 @@ class IDClassAnalTest < Test::Unit::TestCase
     end
   end
-  def Xtest_multiple_output
+  def test_multiple_output
     myplot = 'class_anal.toplot'
     output = `#{@cmd} -j -p INV_,SHUFF_ #{@tf_bioworks_esmall_xml} #{@tf_bioworks_shuff}`
     assert(output.size > 10) ## @TODO: BETTER HERE
     assert(File.exist?(myplot), "file #{myplot} exists")
     File.unlink myplot
   end
-  def Xtest_jtplot_output
+  def test_jtplot_output
     myplot = 'class_anal.toplot'
-    `#{@cmd} -p INV_ -j #{@tf_bioworks_esmall_xml}`
+    output = `#{@cmd} -p INV_ -j #{@tf_bioworks_esmall_xml}`
     assert(File.exist?(myplot), "file #{myplot} exists")
     File.unlink myplot
   end