RubyGems - mspire - Versions diffs - 0.2.4 → 0.3.0 - Mend

mspire 0.2.4 → 0.3.0

Files changed (233) hide show

data/INSTALL +1 -0
data/README +25 -0
data/Rakefile +129 -40
data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
data/bin/bioworks_to_pepxml.rb +1 -0
data/bin/fasta_shaker.rb +1 -96
data/bin/filter_and_validate.rb +5 -0
data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
data/bin/prob_validate.rb +6 -0
data/bin/raw_to_mzXML.rb +2 -2
data/bin/srf_group.rb +1 -0
data/bin/srf_to_sqt.rb +40 -0
data/changelog.txt +68 -0
data/lib/align/chams.rb +6 -6
data/lib/align.rb +4 -3
data/lib/bsearch.rb +120 -0
data/lib/fasta.rb +318 -86
data/lib/group_by.rb +10 -0
data/lib/index_by.rb +11 -0
data/lib/merge_deep.rb +21 -0
data/lib/{spec → ms/converter}/mzxml.rb +77 -109
data/lib/ms/gradient_program.rb +171 -0
data/lib/ms/msrun.rb +209 -0
data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
data/lib/ms/parser/mzdata/axml.rb +12 -0
data/lib/ms/parser/mzdata/dom.rb +160 -0
data/lib/ms/parser/mzdata/libxml.rb +7 -0
data/lib/ms/parser/mzdata.rb +25 -0
data/lib/ms/parser/mzxml/axml.rb +11 -0
data/lib/ms/parser/mzxml/dom.rb +159 -0
data/lib/ms/parser/mzxml/hpricot.rb +253 -0
data/lib/ms/parser/mzxml/libxml.rb +15 -0
data/lib/ms/parser/mzxml/regexp.rb +122 -0
data/lib/ms/parser/mzxml/rexml.rb +72 -0
data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
data/lib/ms/parser/mzxml.rb +175 -0
data/lib/ms/parser.rb +108 -0
data/lib/ms/precursor.rb +10 -0
data/lib/ms/scan.rb +81 -0
data/lib/ms/spectrum.rb +193 -0
data/lib/ms.rb +10 -0
data/lib/mspire.rb +4 -0
data/lib/roc.rb +61 -1
data/lib/sample_enzyme.rb +31 -8
data/lib/scan_i.rb +21 -0
data/lib/spec_id/aa_freqs.rb +7 -3
data/lib/spec_id/bioworks.rb +20 -14
data/lib/spec_id/digestor.rb +139 -0
data/lib/spec_id/mass.rb +116 -0
data/lib/spec_id/parser/proph.rb +236 -0
data/lib/spec_id/precision/filter/cmdline.rb +209 -0
data/lib/spec_id/precision/filter/interactive.rb +134 -0
data/lib/spec_id/precision/filter/output.rb +147 -0
data/lib/spec_id/precision/filter.rb +623 -0
data/lib/spec_id/precision/output.rb +60 -0
data/lib/spec_id/precision/prob/cmdline.rb +139 -0
data/lib/spec_id/precision/prob/output.rb +88 -0
data/lib/spec_id/precision/prob.rb +171 -0
data/lib/spec_id/proph/pep_summary.rb +92 -0
data/lib/spec_id/proph/prot_summary.rb +484 -0
data/lib/spec_id/proph.rb +2 -466
data/lib/spec_id/protein_summary.rb +2 -2
data/lib/spec_id/sequest/params.rb +316 -0
data/lib/spec_id/sequest/pepxml.rb +1513 -0
data/lib/spec_id/sequest.rb +2 -1672
data/lib/spec_id/srf.rb +445 -177
data/lib/spec_id.rb +183 -95
data/lib/spec_id_xml.rb +8 -10
data/lib/transmem/phobius.rb +147 -0
data/lib/transmem/toppred.rb +368 -0
data/lib/transmem.rb +157 -0
data/lib/validator/aa.rb +135 -0
data/lib/validator/background.rb +73 -0
data/lib/validator/bias.rb +95 -0
data/lib/validator/cmdline.rb +260 -0
data/lib/validator/decoy.rb +94 -0
data/lib/validator/digestion_based.rb +69 -0
data/lib/validator/probability.rb +48 -0
data/lib/validator/prot_from_pep.rb +234 -0
data/lib/validator/transmem.rb +272 -0
data/lib/validator/true_pos.rb +46 -0
data/lib/validator.rb +214 -0
data/lib/xml.rb +38 -0
data/lib/xml_style_parser.rb +105 -0
data/lib/xmlparser_wrapper.rb +19 -0
data/script/compile_and_plot_smriti_final.rb +97 -0
data/script/extract_gradient_programs.rb +56 -0
data/script/get_apex_values_rexml.rb +44 -0
data/script/mzXML2timeIndex.rb +1 -1
data/script/smriti_final_analysis.rb +103 -0
data/script/toppred_to_yaml.rb +47 -0
data/script/tpp_installer.rb +1 -1
data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
data/specs/bin/fasta_shaker_spec.rb +259 -0
data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
data/specs/bin/filter_and_validate_spec.rb +124 -0
data/specs/bin/ms_to_lmat_spec.rb +34 -0
data/specs/bin/prob_validate_spec.rb +62 -0
data/specs/bin/protein_summary_spec.rb +10 -0
data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
data/specs/gi_spec.rb +22 -0
data/specs/load_bin_path.rb +7 -0
data/specs/merge_deep_spec.rb +13 -0
data/specs/ms/gradient_program_spec.rb +77 -0
data/specs/ms/msrun_spec.rb +455 -0
data/specs/ms/parser_spec.rb +92 -0
data/specs/ms/spectrum_spec.rb +89 -0
data/specs/roc_spec.rb +251 -0
data/specs/rspec_autotest.rb +149 -0
data/specs/sample_enzyme_spec.rb +41 -0
data/specs/spec_helper.rb +133 -0
data/specs/spec_id/aa_freqs_spec.rb +52 -0
data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
data/specs/spec_id/digestor_spec.rb +75 -0
data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
data/specs/spec_id/precision/filter/output_spec.rb +31 -0
data/specs/spec_id/precision/filter_spec.rb +243 -0
data/specs/spec_id/precision/prob_spec.rb +111 -0
data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
data/specs/spec_id/sequest/params_spec.rb +68 -0
data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
data/specs/spec_id/sqt_spec.rb +138 -0
data/specs/spec_id/srf_spec.rb +209 -0
data/specs/spec_id/srf_spec_helper.rb +302 -0
data/specs/spec_id_helper.rb +33 -0
data/specs/spec_id_spec.rb +361 -0
data/specs/spec_id_xml_spec.rb +33 -0
data/specs/transmem/phobius_spec.rb +423 -0
data/specs/transmem/toppred_spec.rb +297 -0
data/specs/transmem_spec.rb +60 -0
data/specs/transmem_spec_shared.rb +64 -0
data/specs/validator/aa_spec.rb +107 -0
data/specs/validator/background_spec.rb +51 -0
data/specs/validator/bias_spec.rb +146 -0
data/specs/validator/decoy_spec.rb +51 -0
data/specs/validator/fasta_helper.rb +26 -0
data/specs/validator/prot_from_pep_spec.rb +141 -0
data/specs/validator/transmem_spec.rb +145 -0
data/specs/validator/true_pos_spec.rb +58 -0
data/specs/validator_helper.rb +33 -0
data/specs/xml_spec.rb +12 -0
data/test_files/000_pepxml18_small.xml +206 -0
data/test_files/020a.mzXML.timeIndex +4710 -0
data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
data/test_files/4-03-03_small-prot.xml +321 -0
data/test_files/4-03-03_small.xml +3876 -0
data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
data/test_files/bioworks-3.3_10prots.xml +5999 -0
data/test_files/bioworks31.params +77 -0
data/test_files/bioworks32.params +62 -0
data/test_files/bioworks33.params +63 -0
data/test_files/bioworks_single_run_small.xml +7237 -0
data/test_files/bioworks_small.fasta +212 -0
data/test_files/bioworks_small.params +63 -0
data/test_files/bioworks_small.phobius +109 -0
data/test_files/bioworks_small.toppred.out +2847 -0
data/test_files/bioworks_small.xml +5610 -0
data/test_files/bioworks_with_INV_small.xml +3753 -0
data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
data/test_files/corrupted_900.srf +0 -0
data/test_files/head_of_7MIX.srf +0 -0
data/test_files/interact-opd1_mods_small-prot.xml +304 -0
data/test_files/messups.fasta +297 -0
data/test_files/opd1/000.my_answer.100lines.xml +101 -0
data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
data/test_files/opd1/000_020-prot.png +0 -0
data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
data/test_files/opd1/000_020_3prots-prot.xml +62 -0
data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
data/test_files/opd1/sequest.3.1.params +77 -0
data/test_files/opd1/sequest.3.2.params +62 -0
data/test_files/opd1/twenty_scans.mzXML +418 -0
data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
data/test_files/opd1/twenty_scans_answ.lmat +0 -0
data/test_files/opd1/twenty_scans_answ.lmata +9 -0
data/test_files/opd1_020_beginning.RAW +0 -0
data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
data/test_files/pepproph_small.xml +4691 -0
data/test_files/phobius.small.noheader.txt +50 -0
data/test_files/phobius.small.small.txt +53 -0
data/test_files/s01_anC1_ld020mM.key.txt +25 -0
data/test_files/s01_anC1_ld020mM.meth +0 -0
data/test_files/small.fasta +297 -0
data/test_files/smallraw.RAW +0 -0
data/test_files/tf_bioworks2excel.bioXML +14340 -0
data/test_files/tf_bioworks2excel.txt.actual +1035 -0
data/test_files/toppred.small.out +416 -0
data/test_files/toppred.xml.out +318 -0
data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
data/test_files/yeast_gly_small-prot.xml +265 -0
data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
data/test_files/yeast_gly_small.xml +3807 -0
data/test_files/yeast_gly_small2.parentTimes +6 -0
metadata +273 -57
data/bin/filter.rb +0 -6
data/bin/precision.rb +0 -5
data/lib/spec/mzdata/parser.rb +0 -108
data/lib/spec/mzdata.rb +0 -48
data/lib/spec/mzxml/parser.rb +0 -449
data/lib/spec/scan.rb +0 -55
data/lib/spec_id/filter.rb +0 -797
data/lib/spec_id/precision.rb +0 -421
data/lib/toppred.rb +0 -18
data/script/filter-peps.rb +0 -164
data/test/tc_aa_freqs.rb +0 -59
data/test/tc_fasta_shaker.rb +0 -149
data/test/tc_filter.rb +0 -203
data/test/tc_filter_peps.rb +0 -46
data/test/tc_gi.rb +0 -17
data/test/tc_id_class_anal.rb +0 -70
data/test/tc_id_precision.rb +0 -89
data/test/tc_msrun.rb +0 -88
data/test/tc_mzxml.rb +0 -88
data/test/tc_mzxml_to_lmat.rb +0 -36
data/test/tc_peptide_parent_times.rb +0 -27
data/test/tc_precision.rb +0 -60
data/test/tc_roc.rb +0 -166
data/test/tc_sample_enzyme.rb +0 -32
data/test/tc_scan.rb +0 -26
data/test/tc_sequest.rb +0 -336
data/test/tc_spec.rb +0 -78
data/test/tc_spec_id.rb +0 -201
data/test/tc_spec_id_xml.rb +0 -36
data/test/tc_srf.rb +0 -262

data/script/toppred_to_yaml.rb ADDED Viewed

@@ -0,0 +1,47 @@
+#!/usr/bin/ruby -w
+require 'optparse'
+opt = {}
+opt[:probability] = 1.0
+opts = OptionParser.new do |op|
+  op.banner = "USAGE: #{File.basename(__FILE__)} toppred.out"
+  op.separator "Outputs toppred.yaml"
+  op.separator "takes the highest probability structure"
+  op.separator "for best structures of equal probability, takes first given"
+  op.separator "Each line contains:"
+  op.separator "<identifier>: String :"
+  op.separator "                      num_found: Int"
+  op.separator "                      num_certain_transmembrane_segments: Int"
+  op.separator "                      num_putative_transmembrane_segments: Int"
+  op.separator "                      best_structure_probability: Float"
+  op.separator "                      transmembrane_segments:"
+  op.separator "                        - probability: Float"
+  op.separator "                          start: Int"
+  op.separator "                          stop: Int"
+  op.separator "                          aaseq: String"
+  op.separator ""
+  op.separator "OPTIONS:"
+  op.on("-p", "--probability", Float, "min structure prob threshold (default #{opt[:probability]})") {|v| opt[:probability] = v}
+end
+opts.parse!
+if ARGV.size == 0
+  puts opts
+  exit
+end
+file = ARGV.shift
+File.open(file) do |fh|
+  hash = Transmem.read_toppred(fh)
+end
+puts hash.to_yaml

data/script/tpp_installer.rb CHANGED Viewed

@@ -202,7 +202,7 @@ chmod(0777, TPP_DATA_PATH.chomp('/'))
 mkpath TPP_VIS_PATH.chomp('/')
 ## VERY SPECIFIC to OUR SYSTEM
-soft_link('/project/marcotte/ms', TPP_DATA_PATH.chomp('/') + '/ms')
+soft_link('/project/marcotte/marcotte/ms', TPP_DATA_PATH.chomp('/') + '/ms')
 system "sudo chown john:marcotte #{TPP_DATA_PATH.chomp('/')}"
 system "sudo chown john:marcotte #{TPP_VIS_PATH.chomp('/')}"

data/{test/tc_align.rb → specs/align_spec.rb} RENAMED Viewed

@@ -1,20 +1,17 @@
+require File.expand_path( File.dirname(__FILE__) + '/spec_helper' )
-require 'test/unit'
 require 'align'
-require 'pp'
-class AlignTest < Test::Unit::TestCase
+describe Align do
-  def initialize(arg)
-    super(arg)
-    @tfiles = File.dirname(__FILE__) + '/tfiles/'
-    @mz1 = @tfiles + '4-03-03_mzXML/000.mzXML.timeIndex'
-    @mz2 = @tfiles + '4-03-03_mzXML/020.mzXML.timeIndex'
-    @prt = @tfiles + '4-03-03_small-prot.xml'
-    @pep = @tfiles + '4-03-03_small.xml'
+  before(:each) do
+    @mz1 = Tfiles + '4-03-03_mzXML/000.mzXML.timeIndex'
+    @mz2 = Tfiles + '4-03-03_mzXML/020.mzXML.timeIndex'
+    @prt = Tfiles + '4-03-03_small-prot.xml'
+    @pep = Tfiles + '4-03-03_small.xml'
   end
-  def test_overlapping_peps_by_seqcharge
+  it_should 'finds overlapping peptides of same seq+charge' do
     s1 = 'DETTIVEGAGDAEAIQGR'
     c1 = '2'
     s2 = 'TDDVAGDGTTTATVLAQALVR'
@@ -35,28 +32,25 @@ class AlignTest < Test::Unit::TestCase
           has_seqcharges << false
         end
       end
-      has_seqcharges.each do |c| assert c end
+      has_seqcharges.each { |c| c.should be_true }
     end
   end
   ### !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   # @TODO: CURRENT WORK!
-  def test_overlapping_peps_by_seqcharge_with_filter
-    assert true
-    if false
-      al = Align.new
-      pep1 = al.peps_with_scans([@mz1], @prt, @pep, 0.0 ,0.0 ,0.0  )
-      pep2 = al.peps_with_scans(@mz2, @prt, @pep, 0.0, 0.0, 0.0 )
-      max_dups = nil
-      outlier_cutoff = 0.0
-      olap = al.overlapping_peps_by_seqcharge_with_filter([pep1, pep2], max_dups, outlier_cutoff)
-      olap.each do |peps|
-        p peps
-      end
+  it_should 'should find overlapping peptides at a seqcharge with a filter' do
+    al = Align.new
+    pep1 = al.peps_with_scans([@mz1], @prt, @pep, 0.0 ,0.0 ,0.0  )
+    pep2 = al.peps_with_scans(@mz2, @prt, @pep, 0.0, 0.0, 0.0 )
+    max_dups = nil
+    outlier_cutoff = 0.0
+    olap = al.overlapping_peps_by_seqcharge_with_filter([pep1, pep2], max_dups, outlier_cutoff)
+    olap.each do |peps|
+      p peps
     end
   end
-  def test_toss_outliers
+  it_should 'should toss outliers' do
     # Consistency/sanity checks right now (not accuracy)
     x = [-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,10,0 ,1,2,3,4,5,6,7,8,9]
@@ -65,7 +59,7 @@ class AlignTest < Test::Unit::TestCase
     expy2 = [-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,1,2,3,4,5,6,7,8,9]
     pcls = Proph::Pep
-    scls = Spec::Scan
+    scls = MS::Scan
     pep_groups = [x,y].collect do |arr|
       arr.collect do |val|
@@ -79,7 +73,7 @@ class AlignTest < Test::Unit::TestCase
     deviations = 3.2
     size_before = pep_groups.first.size
     al.toss_outliers(pep_groups, deviations)
-    assert_equal(2, size_before - pep_groups.first.size)
+    (size_before - pep_groups.first.size).should == 2
   end
 end

data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} RENAMED Viewed

@@ -1,12 +1,8 @@
-require 'test/unit'
-require File.dirname(File.expand_path(__FILE__)) + '/load_bin_path'
+require File.expand_path( File.dirname(__FILE__) + '/../spec_helper' )
 require 'fileutils'
-tmp = $VERBOSE
-$VERBOSE = 5
-$XML_SANITY_LINES = ['<sample_enzyme name="trypsin">', '<specificity cut="KR" no_cut="P" sense="C"/>', '<parameter name="diff_search_options" value="0.000000 S 0.000000 C 0.000000 M 0.000000 X 0.000000 T 0.000000 Y"/>']
+$XML_SANITY_LINES = ['<sample_enzyme name="Trypsin">', '<specificity cut="KR" no_cut="P" sense="C"/>', '<parameter name="diff_search_options" value="0.000000 S 0.000000 C 0.000000 M 0.000000 X 0.000000 T 0.000000 Y"/>']
 $XML_SANITY_MATCHES = [/<spectrum_query spectrum="0\d0.\d+.\d+.[123]" start_scan="\d+" end_scan="\d+" precursor_neutral_mass="[\d\.]+" assumed_charge="[123]" index="\d+">/,
   /	<search_hit hit_rank="\d" peptide="[\w\-\.]+" peptide_prev_aa="." peptide_next_aa="." protein=".*" num_tot_proteins="\d+" num_matched_ions="\d+" tot_num_ions="\d+" calc_neutral_pep_mass="[\d\.]+" massdiff="[\+\-][\d\.]+" num_tol_term="\d" num_missed_cleavages="\d" is_rejected="[01]">/,
@@ -18,78 +14,66 @@ $XML_SANITY_MATCHES = [/<spectrum_query spectrum="0\d0.\d+.\d+.[123]" start_scan
 ]
-class BioworksToPepXMLTest < Test::Unit::TestCase
-  def initialize(arg)
-    super(arg)
-    @tfiles = File.dirname(__FILE__) + '/tfiles/'
-    @tfiles_l = File.dirname(__FILE__) + '/tfiles_large/'
-    @tf_mzxml_path = @tfiles_l + "yeast_gly_mzXML"
-    @tf_bioworks_xml = @tfiles + "bioworks_small.xml"
-    @tf_params = @tfiles + "bioworks32.params"
-    @no_delete = true
-    @out_path = @tfiles + 'pepxml/'
-    @cmd = "ruby -I#{File.join(File.dirname(__FILE__), "..", "lib")} -S bioworks_to_pepxml.rb "
+describe 'bioworks_to_pepxml.rb' do
+  before(:all) do
+    @tf_mzxml_path = Tfiles_l + "/yeast_gly_mzXML"
+    @tf_bioworks_xml = Tfiles + "/bioworks_small.xml"
+    @tf_params = Tfiles + '/bioworks32.params'
+    @out_path = Tfiles + '/pepxml/'
+    @progname = 'bioworks_to_pepxml.rb'
+    @no_delete = false
   end
-  def test_usage
-    assert_match(/usage:/, `#{@cmd}`)
-  end
+  it_should_behave_like "a cmdline program"
   def _basic(cmd, prc)
-    puts "Performing: #{cmd}" if $VERBOSE
+    puts "Performing: #{cmd}" if $DEBUG
     reply = `#{cmd}`
-    puts reply if $VERBOSE
+    puts reply if $DEBUG
     %w(000 020).each do |file|
       ffile = @out_path + file + ".xml"
       prc.call(ffile)
     end
   end
-  def test_basic
-    if File.exist? @tfiles_l
+  spec_large do
+    it 'works on a real bioworks.xml file' do
       cmd = "#{@cmd} -p #{@tf_params} -o #{@out_path} #{@tf_bioworks_xml} -m #{@tf_mzxml_path} -d /work/special/path --copy_mzxml"
       ## FILES EXIST:
       prc = proc {|file|
-        assert(File.exist?(file), "#{file} exists")
+        file.should exist
         beginning = IO.readlines(file)[0,50].join("\n")
         $XML_SANITY_LINES.each do |line|
-          assert(beginning.include?(line), "xml includes line: #{line}")
+          beginning.should include(line)
+          #beginning.include?(line).should be_true
         end
         $XML_SANITY_MATCHES.each do |match|
-          assert_match(match, beginning, "matches")
+          beginning.should =~ match
         end
       }
       _basic(cmd, prc)
       ## COPY MZXML:
       %w(000 020).each do |file|
         mzxml_file = File.join(@out_path, "#{file}.mzXML")
-        assert(File.exist?( mzxml_file ), "file: #{mzxml_file} exists")
+        mzxml_file.should exist
       end
       ## CLEANUP:
       unless @no_delete then FileUtils.rm_rf(@out_path) end
-    else
-      assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})") )
     end
   end
-  def test_database
-    if File.exist? @tfiles_l
+  spec_large do
+    it 'transforms database name when its proper to do so' do
       cmd = "#{@cmd} -p #{@tf_params} -o #{@out_path} #{@tf_bioworks_xml} -m #{@tf_mzxml_path}"
       db_re = /C:\\Xcalibur\\database\\ecoli_K12_ncbi_20060321.fasta/
-      assert_match(db_re, IO.read(@tf_params))
+      IO.read(@tf_params).should =~ db_re
       prc = proc {|file|
-        assert(File.exist?(file))
-        assert_no_match(db_re, IO.read(file))
+        file.should exist
+        IO.read(file).should_not =~ db_re
       }
       _basic(cmd, prc)
       unless @no_delete then FileUtils.rm_rf(@out_path) end
-    else
-      assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})") )
     end
   end
 end
-$VERBOSE = tmp

data/specs/bin/fasta_shaker_spec.rb ADDED Viewed

@@ -0,0 +1,259 @@
+require File.expand_path( File.dirname(__FILE__) + '/../spec_helper' )
+require 'fasta'
+class Fasta
+  def same_sized_proteins?(other_fasta_obj_or_file)
+    other = Fasta.to_fasta(other_fasta_obj_or_file)
+    @prots.zip(other.prots).all? do |a,b|
+      a.aaseq.size == b.aaseq.size
+    end
+  end
+  # This is tough to say 'for sure'  Right now, we consider the proteins
+  # shuffled if they are all the same size and 2/3 or more of the peptides are
+  # different than the other (this is designed for small sets of proteins
+  # where it is possible one of the peptides is equal to the other).
+  def shuffled?(other_fasta_obj_or_file)
+    other = Fasta.to_fasta(other_fasta_obj_or_file)
+    if !same_sized_proteins?(other)
+      false
+    else
+      (same, different) = @prots.zip(other.prots).partition do |prota, protb|
+        prota == protb
+      end
+      fraction_different = different.size.to_f / (same.size + different.size)
+      fraction_different >= 2.0/3
+    end
+  end
+end
+describe "a manipulator of a fasta file", :shared => true do
+  before(:all) do
+    @filestring = ">gi|P1
+AMKRGAN
+>gi|P2
+CRGATKKTAGRPMEK
+>gi|P3
+PEPTIDE
+"
+    @rev_filestring = ">gi|P1
+NAGRKMA
+>gi|P2
+KEMPRGATKKTAGRC
+>gi|P3
+EDITPEP
+"
+    @rev_pref_filestring = ">REV_gi|P1
+NAGRKMA
+>REV_gi|P2
+KEMPRGATKKTAGRC
+>REV_gi|P3
+EDITPEP
+"
+    @rev_tryptic_filestring = ">gi|P1
+MAKRNAG
+>gi|P2
+CRTAGKKEMPRGATK
+>gi|P3
+EDITPEP
+"
+  end
+  before(:each) do
+    testdir = File.dirname(__FILE__)
+    @tmpfile = Tfiles + "/littlefasta.trash.fasta"
+    @f = Tfiles + "/trash.fasta"
+    File.open(@tmpfile, "w") {|fh| fh.print @filestring }
+  end
+  after(:each) do
+    File.unlink @tmpfile if File.exist? @tmpfile
+    File.unlink @f if File.exist? @f
+  end
+  it 'reverses protein sequences' do
+    reverse_the_file
+    fastap(@f).to_s.should == @rev_filestring
+  end
+  def reverse_the_file
+    do_it(:reverse)
+  end
+  it 'shuffles protein sequences' do
+    shuffle_the_file
+    Fasta.new(@f).shuffled?(Fasta.from_string(@filestring)).should be_true
+  end
+  def shuffle_the_file
+    do_it(:shuffle)
+  end
+  it 'concatenates sequences' do
+    concatenate_sequences
+    lns = fastalns(@f)
+    strlns(@filestring).should == lns[0..5] # first part equal
+    strlns(@rev_pref_filestring).should == lns[6..-1] # "second part equal")
+  end
+  def concatenate_sequences
+    do_it(:reverse, :cat => true, :prefix => 'REV_')
+  end
+  it 'makes prefixes' do
+    make_prefixes
+    #@shaker.reverse(@tmpfile, :out => @f, :prefix => 'SILLY_')
+    fp = fastap(@f)
+    fp.each do |prt|
+      prt.header.should match(/^>SILLY_.+/)
+    end
+  end
+  def make_prefixes
+    do_it(:reverse, :prefix => 'SILLY_')
+  end
+  it 'makes fractions of proteins' do
+    make_fractions_of_proteins(1.0/3)
+    fastap(@f).size.should == 1
+    fastap(@f).first.header.should =~ /^>[^M]/
+    # this guy gets rounded up on the command line so that it fails there
+    #make_fractions_of_proteins(2.0/3)
+    #fastap(@f).size.should == 2
+    #fastap(@f).each do |prt|
+    #  prt.header.should =~ /^>[^M]/
+    #end
+    make_fractions_of_proteins(1.0)
+    fastap(@f).size.should == 3
+    fastap(@f).each do |prt|
+      prt.header.should =~ /^>[^M]/
+    end
+  end
+  def make_fractions_of_proteins(fraction)
+    do_it(:shuffle, :fraction => fraction)
+  end
+  it 'makes fractions with labels (for > 1)' do
+    make_fractions_of_proteins(1.1)
+    fastap(@f).size.should == 4
+    fastap(@f).any? do |prt|
+      prt.header =~ /^>[^M]/
+    end.should be_true
+    make_fractions_of_proteins(2.6)
+    fastap(@f).size.should == 8
+    make_reverse_cat_fractions(2.0)
+    fastap(@f).size.should == 9
+    fp = Fasta.new(@f)
+    fp[0..2].each do |prt|
+      prt.header.should =~ /^>/
+    end
+    fp[3..5].each do |prt|
+      prt.header.should =~ /^>MINE_f0_/
+    end
+    fp[6..8].each do |prt|
+      prt.header.should =~ /^>MINE_f1_/
+    end
+  end
+  def make_reverse_cat_fractions(fraction, prefix='MINE_')
+    do_it(:reverse, :fraction => fraction, :cat => true, :prefix => prefix)
+  end
+  def reverse_tryptic_peptides
+    do_it(:reverse, :tryptic_peptides => true)
+  end
+    it 'reverses tryptic peptides' do
+      reverse_tryptic_peptides
+      Fasta.from_string(@rev_tryptic_filestring).should == Fasta.new(@f)
+    end
+  def shuffle_tryptic_peptides
+    do_it(:shuffle, :tryptic_peptides => true)
+  end
+  it 'shuffles tryptic peptides (rerun on failure to recheck)' do
+    shuffle_tryptic_peptides
+    lns = fastap(@f).to_s.split("\n")
+    lns[1][2..3].should == 'KR'
+    lns[3][1..1].should == 'R'
+    lns[3].size.should == 'CRGATKKTAGRPMEK'.size
+    lns[3].should_not == 'CRGATKKTAGRPMEK' #sequence is randomised from original [remote chance of failure] rerun to make sure
+  end
+    def strlns(str)
+      str.split("\n")
+    end
+  def fastalns(fn)
+    fn.should exist
+    IO.read(fn).split("\n")
+  end
+  # returns the fasta object proteins
+  def fastap(fn)
+    @f.should exist
+    Fasta.new(fn).prots
+  end
+end
+describe FastaShaker, "by method call" do
+  before(:all) do
+    @shaker = FastaShaker.new
+  end
+  it_should_behave_like "a manipulator of a fasta file"
+  def do_it(method, additional_opts={})
+    opts = {:out => @f}
+    @shaker.send(method, @tmpfile, opts.merge(additional_opts))
+  end
+end
+describe FastaShaker, "by command line long args" do
+  before(:all) do
+    @progname = 'fasta_shaker.rb'
+  end
+  it_should_behave_like "a cmdline program"
+  it_should_behave_like "a manipulator of a fasta file"
+  # returns an array of the args
+  def opts_to_cmd_args(hash)
+    opts = []
+    hash.each do |k,v|
+      opts.push('--' + k.to_s)
+      unless (v == true) or (v == false)
+        opts.push(v)
+      end
+    end
+    opts
+  end
+  def do_it(method, additional_opts={})
+    opts = {:out => @f}
+    opts.merge!(additional_opts)
+    cmd = [@cmd, method, @tmpfile, *(opts_to_cmd_args(opts))].join(" ")
+    #puts cmd
+    system cmd
+  end
+end