RubyGems - mspire - Versions diffs - 0.2.4 → 0.3.0 - Mend

mspire 0.2.4 → 0.3.0

Files changed (233) hide show

data/INSTALL +1 -0
data/README +25 -0
data/Rakefile +129 -40
data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
data/bin/bioworks_to_pepxml.rb +1 -0
data/bin/fasta_shaker.rb +1 -96
data/bin/filter_and_validate.rb +5 -0
data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
data/bin/prob_validate.rb +6 -0
data/bin/raw_to_mzXML.rb +2 -2
data/bin/srf_group.rb +1 -0
data/bin/srf_to_sqt.rb +40 -0
data/changelog.txt +68 -0
data/lib/align/chams.rb +6 -6
data/lib/align.rb +4 -3
data/lib/bsearch.rb +120 -0
data/lib/fasta.rb +318 -86
data/lib/group_by.rb +10 -0
data/lib/index_by.rb +11 -0
data/lib/merge_deep.rb +21 -0
data/lib/{spec → ms/converter}/mzxml.rb +77 -109
data/lib/ms/gradient_program.rb +171 -0
data/lib/ms/msrun.rb +209 -0
data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
data/lib/ms/parser/mzdata/axml.rb +12 -0
data/lib/ms/parser/mzdata/dom.rb +160 -0
data/lib/ms/parser/mzdata/libxml.rb +7 -0
data/lib/ms/parser/mzdata.rb +25 -0
data/lib/ms/parser/mzxml/axml.rb +11 -0
data/lib/ms/parser/mzxml/dom.rb +159 -0
data/lib/ms/parser/mzxml/hpricot.rb +253 -0
data/lib/ms/parser/mzxml/libxml.rb +15 -0
data/lib/ms/parser/mzxml/regexp.rb +122 -0
data/lib/ms/parser/mzxml/rexml.rb +72 -0
data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
data/lib/ms/parser/mzxml.rb +175 -0
data/lib/ms/parser.rb +108 -0
data/lib/ms/precursor.rb +10 -0
data/lib/ms/scan.rb +81 -0
data/lib/ms/spectrum.rb +193 -0
data/lib/ms.rb +10 -0
data/lib/mspire.rb +4 -0
data/lib/roc.rb +61 -1
data/lib/sample_enzyme.rb +31 -8
data/lib/scan_i.rb +21 -0
data/lib/spec_id/aa_freqs.rb +7 -3
data/lib/spec_id/bioworks.rb +20 -14
data/lib/spec_id/digestor.rb +139 -0
data/lib/spec_id/mass.rb +116 -0
data/lib/spec_id/parser/proph.rb +236 -0
data/lib/spec_id/precision/filter/cmdline.rb +209 -0
data/lib/spec_id/precision/filter/interactive.rb +134 -0
data/lib/spec_id/precision/filter/output.rb +147 -0
data/lib/spec_id/precision/filter.rb +623 -0
data/lib/spec_id/precision/output.rb +60 -0
data/lib/spec_id/precision/prob/cmdline.rb +139 -0
data/lib/spec_id/precision/prob/output.rb +88 -0
data/lib/spec_id/precision/prob.rb +171 -0
data/lib/spec_id/proph/pep_summary.rb +92 -0
data/lib/spec_id/proph/prot_summary.rb +484 -0
data/lib/spec_id/proph.rb +2 -466
data/lib/spec_id/protein_summary.rb +2 -2
data/lib/spec_id/sequest/params.rb +316 -0
data/lib/spec_id/sequest/pepxml.rb +1513 -0
data/lib/spec_id/sequest.rb +2 -1672
data/lib/spec_id/srf.rb +445 -177
data/lib/spec_id.rb +183 -95
data/lib/spec_id_xml.rb +8 -10
data/lib/transmem/phobius.rb +147 -0
data/lib/transmem/toppred.rb +368 -0
data/lib/transmem.rb +157 -0
data/lib/validator/aa.rb +135 -0
data/lib/validator/background.rb +73 -0
data/lib/validator/bias.rb +95 -0
data/lib/validator/cmdline.rb +260 -0
data/lib/validator/decoy.rb +94 -0
data/lib/validator/digestion_based.rb +69 -0
data/lib/validator/probability.rb +48 -0
data/lib/validator/prot_from_pep.rb +234 -0
data/lib/validator/transmem.rb +272 -0
data/lib/validator/true_pos.rb +46 -0
data/lib/validator.rb +214 -0
data/lib/xml.rb +38 -0
data/lib/xml_style_parser.rb +105 -0
data/lib/xmlparser_wrapper.rb +19 -0
data/script/compile_and_plot_smriti_final.rb +97 -0
data/script/extract_gradient_programs.rb +56 -0
data/script/get_apex_values_rexml.rb +44 -0
data/script/mzXML2timeIndex.rb +1 -1
data/script/smriti_final_analysis.rb +103 -0
data/script/toppred_to_yaml.rb +47 -0
data/script/tpp_installer.rb +1 -1
data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
data/specs/bin/fasta_shaker_spec.rb +259 -0
data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
data/specs/bin/filter_and_validate_spec.rb +124 -0
data/specs/bin/ms_to_lmat_spec.rb +34 -0
data/specs/bin/prob_validate_spec.rb +62 -0
data/specs/bin/protein_summary_spec.rb +10 -0
data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
data/specs/gi_spec.rb +22 -0
data/specs/load_bin_path.rb +7 -0
data/specs/merge_deep_spec.rb +13 -0
data/specs/ms/gradient_program_spec.rb +77 -0
data/specs/ms/msrun_spec.rb +455 -0
data/specs/ms/parser_spec.rb +92 -0
data/specs/ms/spectrum_spec.rb +89 -0
data/specs/roc_spec.rb +251 -0
data/specs/rspec_autotest.rb +149 -0
data/specs/sample_enzyme_spec.rb +41 -0
data/specs/spec_helper.rb +133 -0
data/specs/spec_id/aa_freqs_spec.rb +52 -0
data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
data/specs/spec_id/digestor_spec.rb +75 -0
data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
data/specs/spec_id/precision/filter/output_spec.rb +31 -0
data/specs/spec_id/precision/filter_spec.rb +243 -0
data/specs/spec_id/precision/prob_spec.rb +111 -0
data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
data/specs/spec_id/sequest/params_spec.rb +68 -0
data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
data/specs/spec_id/sqt_spec.rb +138 -0
data/specs/spec_id/srf_spec.rb +209 -0
data/specs/spec_id/srf_spec_helper.rb +302 -0
data/specs/spec_id_helper.rb +33 -0
data/specs/spec_id_spec.rb +361 -0
data/specs/spec_id_xml_spec.rb +33 -0
data/specs/transmem/phobius_spec.rb +423 -0
data/specs/transmem/toppred_spec.rb +297 -0
data/specs/transmem_spec.rb +60 -0
data/specs/transmem_spec_shared.rb +64 -0
data/specs/validator/aa_spec.rb +107 -0
data/specs/validator/background_spec.rb +51 -0
data/specs/validator/bias_spec.rb +146 -0
data/specs/validator/decoy_spec.rb +51 -0
data/specs/validator/fasta_helper.rb +26 -0
data/specs/validator/prot_from_pep_spec.rb +141 -0
data/specs/validator/transmem_spec.rb +145 -0
data/specs/validator/true_pos_spec.rb +58 -0
data/specs/validator_helper.rb +33 -0
data/specs/xml_spec.rb +12 -0
data/test_files/000_pepxml18_small.xml +206 -0
data/test_files/020a.mzXML.timeIndex +4710 -0
data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
data/test_files/4-03-03_small-prot.xml +321 -0
data/test_files/4-03-03_small.xml +3876 -0
data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
data/test_files/bioworks-3.3_10prots.xml +5999 -0
data/test_files/bioworks31.params +77 -0
data/test_files/bioworks32.params +62 -0
data/test_files/bioworks33.params +63 -0
data/test_files/bioworks_single_run_small.xml +7237 -0
data/test_files/bioworks_small.fasta +212 -0
data/test_files/bioworks_small.params +63 -0
data/test_files/bioworks_small.phobius +109 -0
data/test_files/bioworks_small.toppred.out +2847 -0
data/test_files/bioworks_small.xml +5610 -0
data/test_files/bioworks_with_INV_small.xml +3753 -0
data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
data/test_files/corrupted_900.srf +0 -0
data/test_files/head_of_7MIX.srf +0 -0
data/test_files/interact-opd1_mods_small-prot.xml +304 -0
data/test_files/messups.fasta +297 -0
data/test_files/opd1/000.my_answer.100lines.xml +101 -0
data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
data/test_files/opd1/000_020-prot.png +0 -0
data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
data/test_files/opd1/000_020_3prots-prot.xml +62 -0
data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
data/test_files/opd1/sequest.3.1.params +77 -0
data/test_files/opd1/sequest.3.2.params +62 -0
data/test_files/opd1/twenty_scans.mzXML +418 -0
data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
data/test_files/opd1/twenty_scans_answ.lmat +0 -0
data/test_files/opd1/twenty_scans_answ.lmata +9 -0
data/test_files/opd1_020_beginning.RAW +0 -0
data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
data/test_files/pepproph_small.xml +4691 -0
data/test_files/phobius.small.noheader.txt +50 -0
data/test_files/phobius.small.small.txt +53 -0
data/test_files/s01_anC1_ld020mM.key.txt +25 -0
data/test_files/s01_anC1_ld020mM.meth +0 -0
data/test_files/small.fasta +297 -0
data/test_files/smallraw.RAW +0 -0
data/test_files/tf_bioworks2excel.bioXML +14340 -0
data/test_files/tf_bioworks2excel.txt.actual +1035 -0
data/test_files/toppred.small.out +416 -0
data/test_files/toppred.xml.out +318 -0
data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
data/test_files/yeast_gly_small-prot.xml +265 -0
data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
data/test_files/yeast_gly_small.xml +3807 -0
data/test_files/yeast_gly_small2.parentTimes +6 -0
metadata +273 -57
data/bin/filter.rb +0 -6
data/bin/precision.rb +0 -5
data/lib/spec/mzdata/parser.rb +0 -108
data/lib/spec/mzdata.rb +0 -48
data/lib/spec/mzxml/parser.rb +0 -449
data/lib/spec/scan.rb +0 -55
data/lib/spec_id/filter.rb +0 -797
data/lib/spec_id/precision.rb +0 -421
data/lib/toppred.rb +0 -18
data/script/filter-peps.rb +0 -164
data/test/tc_aa_freqs.rb +0 -59
data/test/tc_fasta_shaker.rb +0 -149
data/test/tc_filter.rb +0 -203
data/test/tc_filter_peps.rb +0 -46
data/test/tc_gi.rb +0 -17
data/test/tc_id_class_anal.rb +0 -70
data/test/tc_id_precision.rb +0 -89
data/test/tc_msrun.rb +0 -88
data/test/tc_mzxml.rb +0 -88
data/test/tc_mzxml_to_lmat.rb +0 -36
data/test/tc_peptide_parent_times.rb +0 -27
data/test/tc_precision.rb +0 -60
data/test/tc_roc.rb +0 -166
data/test/tc_sample_enzyme.rb +0 -32
data/test/tc_scan.rb +0 -26
data/test/tc_sequest.rb +0 -336
data/test/tc_spec.rb +0 -78
data/test/tc_spec_id.rb +0 -201
data/test/tc_spec_id_xml.rb +0 -36
data/test/tc_srf.rb +0 -262

data/specs/spec_id/sequest/pepxml_spec.rb ADDED Viewed

@@ -0,0 +1,452 @@
+require File.expand_path( File.dirname(__FILE__) + '/../../spec_helper' )
+require 'spec_id'
+require 'spec_id/sequest/pepxml'
+#require 'ms/mzxml'
+NODELETE = false
+describe Sequest::PepXML::SearchHit, 'making enzyme calculations on sequences' do
+  before(:each) do
+    @tf_params_fullKRP = Tfiles + "/bioworks32.params"
+    # The enzyme is: 1 KR P
+    @tf_params_justKR = Tfiles + "/bioworks33.params"
+  end
+  it 'calculates the number of tolerant termini' do
+    exp = [{
+      # full KR/P
+      'K.EPTIDR.E' => 2,
+      'K.PEPTIDR.E' => 1,
+      'F.EEPTIDR.E' => 1,
+      'F.PEPTIDW.R' => 0,
+    },
+    {
+      # just KR
+      'K.EPTIDR.E' => 2,
+      'K.PEPTIDR.E' => 2,
+      'F.EEPTIDR.E' => 1,
+      'F.PEPTIDW.R' => 0,
+    }
+    ]
+    scall = Sequest::PepXML::SearchHit
+    sym = :calc_num_tol_term
+    params_ar = [Sequest::Params.new(@tf_params_fullKRP), Sequest::Params.new(@tf_params_justKR)]
+    params_ar.zip(exp) do |params,hash|
+      hash.each do |seq, val|
+        scall.send(sym, params, seq).should == val
+      end
+    end
+  end
+  it 'calculates number of missed cleavages' do
+    exp = [{
+    "K.EPTIDR.E" => 0,
+    "K.PEPTIDR.E" => 0,
+    "F.EEPTIDR.E" => 0,
+    "F.PEPTIDW.R" => 0,
+    "F.PERPTIDW.R" => 0,
+    "F.PEPKPTIDW.R" => 0,
+    "F.PEPKTIDW.R" => 1,
+    "K.RTTIDR.E" => 1,
+    "K.RTTIKK.E" => 2,
+    "F.PKEPRTIDW.R" => 2,
+    "F.PKEPRTIDKP.R" => 2,
+    "F.PKEPRAALKPEERPTIDKW.R" => 3,
+    },
+    {
+    "K.EPTIDR.E" => 0,
+    "K.PEPTIDR.E" => 0,
+    "F.EEPTIDR.E" => 0,
+    "F.PEPTIDW.R" => 0,
+    "F.PERPTIDW.R" => 1,
+    "F.PEPKPTIDW.R" => 1,
+    "F.PEPKTIDW.R" => 1,
+    "K.RTTIDR.E" => 1,
+    "K.RTTIKK.E" => 2,
+    "F.PKEPRTIDW.R" => 2,
+    "F.PKEPRTIDKP.R" => 3,
+    "F.PKEPRAALKPEERPTIDKW.R" => 5,
+    }
+    ]
+    params_ar = [Sequest::Params.new(@tf_params_fullKRP), Sequest::Params.new(@tf_params_justKR)]
+    scall = Sequest::PepXML::SearchHit
+    sym = :calc_num_missed_cleavages
+    #params_ar[1] = params_ar[0]
+    params_ar.zip(exp) do |params, hash|
+      hash.each do |seq, val|
+        scall.send(sym, params, seq).should == val
+      end
+    end
+  end
+end
+describe Sequest::PepXML, " created from small bioworks.xml" do
+  spec_large do
+    before(:all) do
+      tf_mzxml_path = Tfiles_l + "/yeast_gly_mzXML"
+      tf_params = Tfiles + "/bioworks32.params"
+      tf_bioworks_xml = Tfiles + "/bioworks_small.xml"
+      out_path = Tfiles
+      @pepxml_objs = Sequest::PepXML.set_from_bioworks(tf_bioworks_xml, :params => tf_params, :ms_data => tf_mzxml_path, :out_path => out_path)
+    end
+    it 'gets some spectrum queries' do
+      @pepxml_objs.each do |obj|
+        (obj.spectrum_queries.size > 2).should be_true
+        (obj.spectrum_queries.first.search_results.first.search_hits.size > 0).should be_true
+      end
+      #@pepxml_objs.each do |pep| puts pep.to_pepxml end
+    end
+  end
+end
+describe Sequest::PepXML, " created from large bioworks.xml" do
+  # assert_equal_by_pairs (really any old array)
+  # Each element of arrs is an [expected, message] pair: the message (arr[1])
+  # is sent to obj and the result is checked against the expected value
+  # (arr[0]).  Float expectations are compared with be_close using a
+  # tolerance of 0.0000000001; everything else must be == to the expectation.
+  def assert_equal_pairs(obj, arrs)
+    arrs.each do |arr|
+      # (disabled debugging output for a mismatching pair)
+      #if obj.send(arr[1]) != arr[0]
+      #  puts "HELLO"
+      #  puts "OBJ answer"
+      #  p obj.send(arr[1])
+      #  puts "ar0"
+      #  p arr[0]
+      #  puts "ar1"
+      #  p arr[1]
+      #end
+      if arr[0].is_a? Float
+        obj.send(arr[1]).should be_close(arr[0], 0.0000000001)
+      else
+        obj.send(arr[1]).should == arr[0]
+      end
+    end
+  end
+  #swap the first to guys first
+  def assert_equal_pairs_swapped(obj, arrs)
+    arrs.each do |arr|
+      arr[0], arr[1] = arr[1], arr[0]
+    end
+    assert_equal_pairs(obj, arrs)
+  end
+  spec_large do
+    before(:all) do
+      st = Time.new
+      params = Tfiles + "/opd1/sequest.3.2.params"
+      bioworks_xml = Tfiles_l + "/opd1/bioworks.000.oldparams.xml"
+      mzxml_path = Tfiles_l + "/opd1"
+      out_path = Tfiles
+      @pepxml_version = 18
+      @pepxml_objs = Sequest::PepXML.set_from_bioworks_xml(bioworks_xml, params, {:ms_data => mzxml_path, :out_path => out_path, :pepxml_version => @pepxml_version})
+      puts "- takes #{Time.new - st} secs"
+    end
+    it 'extracts MSMSPipelineAnalysis' do
+      ######## HMMMMM...
+      Sequest::PepXML.pepxml_version.should == @pepxml_version
+      # MSMSPipelineAnalysis
+      po = @pepxml_objs.first
+      msms_pipeline = po.msms_pipeline_analysis
+      msms_pipeline.xmlns.should == 'http://regis-web.systemsbiology.net/pepXML'
+      msms_pipeline.xmlns_xsi.should == 'http://www.w3.org/2001/XMLSchema-instance'
+      msms_pipeline.xsi_schema_location.should == 'http://regis-web.systemsbiology.net/pepXML /tools/bin/TPP/tpp/schema/pepXML_v18.xsd'
+      msms_pipeline.summary_xml.should == '000.xml'
+    end
+    it 'extracts MSmSRunSummary' do
+      # MSMSRunSummary
+      rs = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary
+      rs.base_name.should =~ /\/000/
+      assert_equal_pairs(rs, [ ['ThermoFinnigan', :ms_manufacturer], ['LCQ Deca XP Plus', :ms_model], ['ESI', :ms_ionization], ['Ion Trap', :ms_mass_analyzer], ['UNKNOWN', :ms_detector], ['raw', :raw_data_type], ['.mzXML', :raw_data], ])
+    end
+    it 'extracts SampleEnzyme' do
+      # SampleEnzyme
+      se = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.sample_enzyme
+      assert_equal_pairs(se, [ ['Trypsin', :name], ['KR', :cut], [nil, :no_cut], ['C', :sense], ])
+    end
+    it 'extracts SearchSummary' do
+      # SearchSummary
+      ss = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.search_summary
+      ss.is_a?(Sequest::PepXML::SearchSummary).should be_true
+      ss.base_name.should =~ /\/000/
+      ss.peptide_mass_tol.should =~ /1\.500/
+      assert_equal_pairs_swapped(ss, [ # normal attributes
+                                 [:search_engine, "SEQUEST"], [:precursor_mass_type, "average"], [:fragment_mass_type, "average"], [:out_data_type, "out"], [:out_data, ".tgz"], [:search_id, "1"],
+                                 # enzymatic_search_constraint
+                                 [:enzyme, 'Trypsin'], [:max_num_internal_cleavages, '2'], [:min_number_termini, '2'],
+                                 # parameters
+                                 [:fragment_ion_tol, "1.0000"], [:ion_series, "0 1 1 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0"], [:max_num_differential_AA_per_mod, "3"], [:nucleotide_reading_frame, "0"], [:num_output_lines, "10"], [:remove_precursor_peak, "0"], [:ion_cutoff_percentage, "0.0000"], [:match_peak_count, "0"], [:match_peak_allowed_error, "1"], [:match_peak_tolerance, "1.0000"], [:protein_mass_filter, "0 0"],
+      ])
+    end
+    it 'extracts SearchDatabase' do
+      # SearchDatabase
+      sd = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.search_summary.search_database
+      sd.is_a?(Sequest::PepXML::SearchDatabase).should be_true
+      assert_equal_pairs_swapped(sd, [ [:local_path, "C:\\Xcalibur\\database\\ecoli_K12.fasta"], [:seq_type, 'AA'], ])
+    end
+    it 'returns SpectrumQueries' do
+      # SpectrumQueries
+      sq = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.spectrum_queries
+      spec = sq.first
+      assert_equal_pairs_swapped(spec, [
+                                 [:spectrum, "000.100.100.1"], [:start_scan, "100"], [:end_scan, "100"],
+                                 #[:precursor_neutral_mass, "1074.5920"], # out2summary
+                                 [:precursor_neutral_mass, 1074.666926], # mine
+                                 [:assumed_charge, 1], [:index, "1"],
+      ])
+      sh = spec.search_results.first.search_hits.first
+      assert_equal_pairs_swapped(sh, [
+                                 # normal attributes
+                                 [:hit_rank, 1],
+                                 [:peptide, "SIYFRNFK"],
+                                 [:peptide_prev_aa, "R"],
+                                 [:peptide_next_aa, "G"],
+                                 [:protein, "gi|16130084|ref|NP_416651.1|"],
+                                 [:num_tot_proteins, 1],
+                                 [:num_matched_ions, 4],
+                                 [:tot_num_ions, 14],
+                                 #[:calc_neutral_pep_mass, "1074.1920"], # out2summary
+                                 [:calc_neutral_pep_mass, 1074.23261], # mine
+                                 #[:massdiff, "+0.400000"], # out2summary
+                                 [:massdiff, 0.434316000000081],  # mine
+                                 [:num_tol_term, 2], [:num_missed_cleavages, 1], [:is_rejected, 0],
+                                 # search_score
+                                 [:xcorr, 0.4], [:deltacn, 0.023], [:deltacnstar, "0"], [:spscore, 78.8], [:sprank, 1],
+      ])
+      spec = sq[1]
+      assert_equal_pairs_swapped(spec, [
+                                 [:spectrum, "000.1000.1000.1"], [:start_scan, "1000"], [:end_scan, "1000"], #[:precursor_neutral_mass, "663.1920"], # out2summary
+                                 [:precursor_neutral_mass, 663.206111], # mine
+                                 [:assumed_charge, 1], [:index, "2"],
+      ])
+      sh = spec.search_results.first.search_hits.first
+      assert_equal_pairs_swapped(sh, [
+                                 # normal attributes
+                                 [:hit_rank, 1], [:peptide, "ALADFK"], [:peptide_prev_aa, "R"], [:peptide_next_aa, "S"], [:protein, "gi|16128765|ref|NP_415318.1|"], [:num_tot_proteins, 1], [:num_matched_ions, 5], [:tot_num_ions, 10],
+                                 [:num_tol_term, 2], [:num_missed_cleavages, 0], [:is_rejected, 0],
+                                 #[:massdiff, "-0.600000"], # out2summary
+                                 [:massdiff, -0.556499000000031],  # mine
+                                 #[:calc_neutral_pep_mass, 663.7920], # out2summary
+                                 [:calc_neutral_pep_mass, 663.76261], # mine
+                                 # search_score
+                                 [:xcorr, 0.965], [:deltacn, 0.132], [:deltacnstar, "0"], [:spscore, 81.1], [:sprank, 1],
+      ])
+      spec = sq[9]
+      assert_equal_pairs_swapped(spec, [
+                                 [:spectrum, "000.1008.1008.2"], [:start_scan, "1008"], [:end_scan, "1008"], [:assumed_charge, 2],
+                                 #[:precursor_neutral_mass, "691.0920"], # out2summary
+                                 [:precursor_neutral_mass, 691.150992], # mine
+      ])
+      sh = spec.search_results.first.search_hits.first
+      assert_equal_pairs_swapped(sh, [
+                                 # normal attributes
+                                 [:hit_rank, 1], [:peptide, "RLFTR"], [:peptide_prev_aa, "R"], [:peptide_next_aa, "A"], [:protein, "gi|16130457|ref|NP_417027.1|"], [:num_tot_proteins, 1], [:num_matched_ions, 5], [:tot_num_ions, 8], [:num_tol_term, 2],
+                                 #[:num_missed_cleavages, "0"],  # out2summary misses this!
+                                 [:num_missed_cleavages, 1],
+                                 [:is_rejected, 0],
+                                 #[:calc_neutral_pep_mass, "691.7920"], # out2summary
+                                 [:calc_neutral_pep_mass, 691.82261], # mine
+                                 #[:massdiff, "-0.700000"], # out2summary
+                                 [:massdiff, -0.67161800000008],  # mine
+                                 # search_score
+                                 [:xcorr, 0.903], [:deltacn, 0.333], [:deltacnstar, "0"], [:spscore, 172.8], [:sprank, 1],
+      ])
+    end
+    it 'can generate correct pepxml file' do
+      ## IF OUR OBJECT IS CORRECT, THEN WE GET THE OUTPUT:
+      string = @pepxml_objs.first.to_pepxml
+      ans_lines = IO.read(Tfiles + "/opd1/000.my_answer.100lines.xml").split("\n")
+      base_name_re = /base_name=".*?files\//o
+      date_re = /date=".*?"/
+      string.split("\n").each_with_index do |line,i|
+        if i > 99 ; break end
+        ans, exp =
+          if i == 1
+            [line.sub(date_re,''), ans_lines[i].sub(date_re,'')]
+          elsif i == 2
+            [line.sub(base_name_re,''), ans_lines[i].sub(base_name_re, '').sub(/^\s+/, "\t")]
+          elsif i == 6
+            [line.sub(base_name_re,''), ans_lines[i].sub(base_name_re, '').sub(/^\s+/, "\t\t")]
+          else
+            [line, ans_lines[i]]
+          end
+        #ans.split('').zip(exp.split('')) do |l,a|
+        #  if l != a
+        #    puts line
+        #    puts ans_lines[i]
+        #    puts l
+        #    puts a
+        #  end
+        #end
+        if ans != exp
+          puts ans
+          puts exp
+        end
+        ans.should == exp
+        #line.sub(base_name_re,'').should == ans_lines[i].sub(base_name_re,'')
+      end
+    end
+  end
+end
+describe Sequest::PepXML::Modifications do
+  before(:each) do
+    tf_params = Tfiles + "/bioworks32.params"
+    @params = Sequest::Params.new(tf_params)
+    # The params object here is completely unnecessary for this test, except
+    # that it sets up the mass table
+    @obj = Sequest::PepXML::Modifications.new(@params, "(M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000) ")
+  end
+  it 'creates a mod_symbols_hash' do
+    answ = {[:C, 12.0]=>"^", [:S, 80.0]=>"@", [:M, 29.0]=>"#", [:M, 15.9]=>"*", [:ct, 12.33]=>"[", [:nt, 14.2]=>"]"}
+    @obj.mod_symbols_hash.should == answ
+    ## need more here
+  end
+  it 'creates a ModificationInfo object given a special peptide sequence' do
+    mod_string = "(M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000) "
+    @params.diff_search_options = "15.90000 M 29.00000 M 80.00000 S 12.00000 C"
+    @params.term_diff_search_options = "14.20000 12.33000"
+    mod = Sequest::PepXML::Modifications.new(@params, mod_string)
+    ## no mods
+    peptide = "PEPTIDE"
+    mod.modification_info(peptide).should be_nil
+    peptide = "]M*EC^S@IDM#M*EMSCM["
+    modinfo = mod.modification_info(peptide)
+    modinfo.modified_peptide.should == peptide
+    modinfo.mod_nterm_mass.should be_close(146.40054, 0.000001)
+    modinfo.mod_cterm_mass.should be_close(160.52994, 0.000001)
+  end
+end
+describe Sequest::PepXML::SearchHit::ModificationInfo do
+  before(:each) do
+    modaaobjs = [[3, 150.3], [6, 345.2]].map do |ar|
+      Sequest::PepXML::SearchHit::ModificationInfo::ModAminoacidMass.new(ar)
+    end
+    hash = {
+      :mod_nterm_mass => 520.2,
+      :modified_peptide => "MOD*IFI^E&D",
+      :mod_aminoacid_masses => modaaobjs,
+    }
+    #answ = "<modification_info mod_nterm_mass=\"520.2\" modified_peptide=\"MOD*IFI^E&amp;D\">\n\t<mod_aminoacid_mass position=\"3\" mass=\"150.3\"/>\n\t<mod_aminoacid_mass position=\"6\" mass=\"345.2\"/>\n</modification_info>\n"
+    @obj = Sequest::PepXML::SearchHit::ModificationInfo.new(hash)
+  end
+  def _re(st)
+    /#{Regexp.escape(st)}/
+  end
+  it 'can produce pepxml' do
+    answ = @obj.to_pepxml
+    answ.should =~ _re('<modification_info')
+    answ.should =~ _re(" mod_nterm_mass=\"520.2\"")
+    answ.should =~ _re(" modified_peptide=\"MOD*IFI^E&amp;D\"")
+    answ.should =~ _re("<mod_aminoacid_mass")
+    answ.should =~ _re(" position=\"3\"")
+    answ.should =~ _re(" mass=\"150.3\"")
+    answ.should =~ _re(" position=\"6\"")
+    answ.should =~ _re(" mass=\"345.2\"")
+    answ.should =~ _re("</modification_info>")
+  end
+end
+describe 'bioworks file with modifications transformed into pepxml' do
+  spec_large do
+    before(:all) do
+      modfiles_sequest_dir = Tfiles_l + '/opd1_2runs_2mods/sequest/'
+      modfiles_data_dir = Tfiles_l + '/opd1_2runs_2mods/data/'
+      @srgfile = modfiles_sequest_dir + 'tmp.srg'
+      @out_path = modfiles_sequest_dir + 'pepxml'
+      modfiles = %w(020 040).map do |file|
+        modfiles_sequest_dir + file + ".srf"
+      end
+      objs = Sequest::PepXML.set_from_bioworks( SRFGroup.new(modfiles).to_srg(@srgfile), {:ms_data => modfiles_data_dir, :out_path => @out_path, :print => true, :backup_db_path => '/project/marcotte/marcotte/ms/database'} )
+      @out_files = %w(020 040).map do |file|
+        @out_path + '/' + file + '.xml'
+      end
+    end
+    after(:all) do
+      File.unlink(@srgfile) unless NODELETE
+      FileUtils.rm_r(@out_path)
+      #@out_files.each do |fn|
+      #  File.unlink(fn) unless NODELETE
+      #end
+    end
+    # splits string on ' 'and matches the line found by find_line_regexp in
+    # lines
+    def match_modline_pieces(lines, find_line_regexp, string)
+      pieces = string.split(' ').map {|v| /#{Regexp.escape(v)}/ }
+      lines.each do |line|
+        if line =~ find_line_regexp
+          pieces.each do |piece|
+            line.should =~ piece
+          end
+        end
+      end
+    end
+    it 'gets modifications right in real run' do
+      @out_files.each do |fn|
+        fn.should exist
+        beginning = IO.read(fn)
+        lines = beginning.split("\n")
+        [
+          [/aminoacid="M"/, '<aminoacid_modification symbol="*" massdiff="+15.9994" aminoacid="M" variable="Y" binary="N" mass="147.192"'],
+          [/aminoacid="S"/, '<aminoacid_modification symbol="#" massdiff="+79.9799" aminoacid="S" variable="Y" binary="N" mass="167.0581"'],
+          [/aminoacid="T"/, '<aminoacid_modification symbol="#" massdiff="+79.9799" aminoacid="T" variable="Y" binary="N" mass="181.085"'],
+          [/aminoacid="Y"/, '<aminoacid_modification symbol="#" massdiff="+79.9799" aminoacid="Y" variable="Y" binary="N" mass="243.1559"'],
+          [/parameter name="diff_search_options"/, '<parameter name="diff_search_options" value="15.999400 M 79.979900 STY 0.000000 M 0.000000 X 0.000000 T 0.000000 Y"/>'],
+        ].each do |a,b|
+          match_modline_pieces(lines, a, b)
+        end
+        [
+        '<modification_info modified_peptide="Y#RLGGS#T#K">',
+        '<mod_aminoacid_mass position="1" mass="243.1559"/>',
+        '<mod_aminoacid_mass position="7" mass="167.0581"/>',
+        '</modification_info>',
+        '<mod_aminoacid_mass position="9" mass="181.085"/>'
+        ].each do |line|
+          beginning.should =~ /#{Regexp.escape(line)}/ # "a modification info for a peptide")
+        end
+      end
+    end
+  end
+end

data/specs/spec_id/sqt_spec.rb ADDED Viewed

@@ -0,0 +1,138 @@
+require File.expand_path( File.dirname(__FILE__) + '/../spec_helper' )
+require 'spec_id/srf'
+SpecHelperHeaderHash = {
+  'SQTGenerator' => 'mspire',
+  'SQTGeneratorVersion' => String,
+  'Database' => 'C:\\Xcalibur\\database\\ecoli_K12_ncbi_20060321.fasta',
+  'FragmentMasses' => 'AVG',
+  'PrecursorMasses' => 'AVG',
+  'StartTime' => nil,
+  'Alg-MSModel' => 'LCQ Deca XP',
+  'Alg-PreMassUnits' => 'amu',
+  'DBLocusCount' => '4237',
+  'Alg-FragMassTol' => '1.0000',
+  'Alg-PreMassTol' => '1.4000',
+  'Alg-IonSeries' => '0 1 1 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0',
+  'Alg-Enzyme' => 'Trypsin(KR/P) (2)',
+  'Comment' => ['Created from Bioworks .srf file'],
+  'StaticMod' => ['C=160.1901','Cterm=10.1230','E=161.4455'],
+  'DynamicMod' => ['STY*=+79.97990', 'M#=+14.02660'],
+}
+SpecHelperOtherLines =<<END
+S	2	2	1	0.0	VELA	391.04541015625	3021.5419921875	0.0	0
+S	3	3	1	0.0	VELA	446.009033203125	1743.96911621094	0.0	122
+M	1	1	445.5769264522	0.0	0.245620265603065	16.6666660308838	1	6	R.SNSK.S	U
+L	gi|16128266|ref|NP_414815.1|
+END
+SpecHelperOtherLinesEnd =<<END
+L	gi|90111093|ref|NP_414704.4|
+M	10	17	1298.5350544522	0.235343858599663	0.823222815990448	151.717300415039	12	54	K.LQKIITNSY*K	U
+L	gi|90111124|ref|NP_414904.2|
+END
+describe 'converting a large srf to sqt' do
+  def del(file)
+    if File.exist?(file)
+      File.unlink(file)
+    end
+  end
+  # returns true or false
+  def header_hash_match(header_lines, hash)
+    header_lines.all? do |line|
+      (h, k, v) = line.chomp.split("\t")
+      if hash[k].is_a? Array
+        if hash[k].include?(v)
+          true
+        else
+          puts "FAILED: "
+          p k
+          p v
+          p hash[k]
+          false
+        end
+      elsif hash[k] == String
+        v.is_a?(String)
+      else
+        if v == hash[k]
+          true
+        else
+          puts "FAILED: "
+          p k
+          p v
+          p hash[k]
+          false
+        end
+      end
+    end
+  end
+  spec_large do
+    before(:all) do
+      @file = Tfiles_l + '/opd1_static_diff_mods/000.srf'
+      @output = Tfiles_l + '/opd1_static_diff_mods/000.sqt.tmp'
+      @srf = SRF.new(@file)
+      @original_db_filename = @srf.header.db_filename
+    end
+    it 'converts without bothering with the database' do
+      @srf.to_sqt(@output)
+      @output.should exist
+      lines = File.readlines(@output)
+      lines.size.should == 80910
+      header_lines = lines.grep(/^H/)
+      (header_lines.size > 10).should be_true
+      header_hash_match(header_lines, SpecHelperHeaderHash).should be_true
+      other_lines = lines.grep(/^[^H]/)
+      other_lines[0,4].join('').should == SpecHelperOtherLines
+      other_lines[-3,3].join('').should == SpecHelperOtherLinesEnd
+      del(@output)
+    end
+    it 'warns if the db path is incorrect and we want to update db info' do
+      # requires some knowledge of how the database file is extracted
+      # internally
+      wacky_path = '/not/a/real/path/wacky.fasta'
+      @srf.header.db_filename = wacky_path
+      my_error_string = ''
+      StringIO.open(my_error_string, 'w') do |strio|
+        $stderr = strio
+        @srf.to_sqt(@output, :db_info => true)
+      end
+      my_error_string.should include(wacky_path)
+      @srf.header.db_filename = @original_db_filename
+      $stderr = STDERR
+      @output.should exist
+      IO.readlines(@output).size.should == 80910
+      del(@output)
+    end
+    it 'can get db info with correct path' do
+      @srf.to_sqt(@output, :db_info => true, :new_db_path => Tfiles_l + '/opd1_2runs_2mods/sequest')
+      @output.should exist
+      lines = IO.readlines(@output)
+      has_md5 = lines.any? do |line|
+        line =~ /DBMD5Sum\s+202b1d95e91f2da30191174a7f13a04e/
+      end
+      has_md5.should be_true
+      has_seq_len = lines.any? do |line|
+        # frozen
+        line =~ /DBSeqLength\s+1342842/
+      end
+      has_seq_len.should be_true
+      lines.size.should == 80912
+      del(@output)
+    end
+    it 'can update the Database' do
+      @srf.to_sqt(@output, :new_db_path => Tfiles_l + '/opd1_2runs_2mods/sequest', :update_db_path => true)
+      regexp = Regexp.new("Database\t/.*/opd1_2runs_2mods/sequest/ecoli_K12_ncbi_20060321.fasta")
+      updated_db = IO.readlines(@output).any? do |line|
+        line =~ regexp
+      end
+      updated_db.should be_true
+      del(@output)
+    end
+  end
+end