mspire 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +1 -0
- data/README +25 -0
- data/Rakefile +129 -40
- data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
- data/bin/bioworks_to_pepxml.rb +1 -0
- data/bin/fasta_shaker.rb +1 -96
- data/bin/filter_and_validate.rb +5 -0
- data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
- data/bin/prob_validate.rb +6 -0
- data/bin/raw_to_mzXML.rb +2 -2
- data/bin/srf_group.rb +1 -0
- data/bin/srf_to_sqt.rb +40 -0
- data/changelog.txt +68 -0
- data/lib/align/chams.rb +6 -6
- data/lib/align.rb +4 -3
- data/lib/bsearch.rb +120 -0
- data/lib/fasta.rb +318 -86
- data/lib/group_by.rb +10 -0
- data/lib/index_by.rb +11 -0
- data/lib/merge_deep.rb +21 -0
- data/lib/{spec → ms/converter}/mzxml.rb +77 -109
- data/lib/ms/gradient_program.rb +171 -0
- data/lib/ms/msrun.rb +209 -0
- data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
- data/lib/ms/parser/mzdata/axml.rb +12 -0
- data/lib/ms/parser/mzdata/dom.rb +160 -0
- data/lib/ms/parser/mzdata/libxml.rb +7 -0
- data/lib/ms/parser/mzdata.rb +25 -0
- data/lib/ms/parser/mzxml/axml.rb +11 -0
- data/lib/ms/parser/mzxml/dom.rb +159 -0
- data/lib/ms/parser/mzxml/hpricot.rb +253 -0
- data/lib/ms/parser/mzxml/libxml.rb +15 -0
- data/lib/ms/parser/mzxml/regexp.rb +122 -0
- data/lib/ms/parser/mzxml/rexml.rb +72 -0
- data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
- data/lib/ms/parser/mzxml.rb +175 -0
- data/lib/ms/parser.rb +108 -0
- data/lib/ms/precursor.rb +10 -0
- data/lib/ms/scan.rb +81 -0
- data/lib/ms/spectrum.rb +193 -0
- data/lib/ms.rb +10 -0
- data/lib/mspire.rb +4 -0
- data/lib/roc.rb +61 -1
- data/lib/sample_enzyme.rb +31 -8
- data/lib/scan_i.rb +21 -0
- data/lib/spec_id/aa_freqs.rb +7 -3
- data/lib/spec_id/bioworks.rb +20 -14
- data/lib/spec_id/digestor.rb +139 -0
- data/lib/spec_id/mass.rb +116 -0
- data/lib/spec_id/parser/proph.rb +236 -0
- data/lib/spec_id/precision/filter/cmdline.rb +209 -0
- data/lib/spec_id/precision/filter/interactive.rb +134 -0
- data/lib/spec_id/precision/filter/output.rb +147 -0
- data/lib/spec_id/precision/filter.rb +623 -0
- data/lib/spec_id/precision/output.rb +60 -0
- data/lib/spec_id/precision/prob/cmdline.rb +139 -0
- data/lib/spec_id/precision/prob/output.rb +88 -0
- data/lib/spec_id/precision/prob.rb +171 -0
- data/lib/spec_id/proph/pep_summary.rb +92 -0
- data/lib/spec_id/proph/prot_summary.rb +484 -0
- data/lib/spec_id/proph.rb +2 -466
- data/lib/spec_id/protein_summary.rb +2 -2
- data/lib/spec_id/sequest/params.rb +316 -0
- data/lib/spec_id/sequest/pepxml.rb +1513 -0
- data/lib/spec_id/sequest.rb +2 -1672
- data/lib/spec_id/srf.rb +445 -177
- data/lib/spec_id.rb +183 -95
- data/lib/spec_id_xml.rb +8 -10
- data/lib/transmem/phobius.rb +147 -0
- data/lib/transmem/toppred.rb +368 -0
- data/lib/transmem.rb +157 -0
- data/lib/validator/aa.rb +135 -0
- data/lib/validator/background.rb +73 -0
- data/lib/validator/bias.rb +95 -0
- data/lib/validator/cmdline.rb +260 -0
- data/lib/validator/decoy.rb +94 -0
- data/lib/validator/digestion_based.rb +69 -0
- data/lib/validator/probability.rb +48 -0
- data/lib/validator/prot_from_pep.rb +234 -0
- data/lib/validator/transmem.rb +272 -0
- data/lib/validator/true_pos.rb +46 -0
- data/lib/validator.rb +214 -0
- data/lib/xml.rb +38 -0
- data/lib/xml_style_parser.rb +105 -0
- data/lib/xmlparser_wrapper.rb +19 -0
- data/script/compile_and_plot_smriti_final.rb +97 -0
- data/script/extract_gradient_programs.rb +56 -0
- data/script/get_apex_values_rexml.rb +44 -0
- data/script/mzXML2timeIndex.rb +1 -1
- data/script/smriti_final_analysis.rb +103 -0
- data/script/toppred_to_yaml.rb +47 -0
- data/script/tpp_installer.rb +1 -1
- data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
- data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
- data/specs/bin/fasta_shaker_spec.rb +259 -0
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
- data/specs/bin/filter_and_validate_spec.rb +124 -0
- data/specs/bin/ms_to_lmat_spec.rb +34 -0
- data/specs/bin/prob_validate_spec.rb +62 -0
- data/specs/bin/protein_summary_spec.rb +10 -0
- data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
- data/specs/gi_spec.rb +22 -0
- data/specs/load_bin_path.rb +7 -0
- data/specs/merge_deep_spec.rb +13 -0
- data/specs/ms/gradient_program_spec.rb +77 -0
- data/specs/ms/msrun_spec.rb +455 -0
- data/specs/ms/parser_spec.rb +92 -0
- data/specs/ms/spectrum_spec.rb +89 -0
- data/specs/roc_spec.rb +251 -0
- data/specs/rspec_autotest.rb +149 -0
- data/specs/sample_enzyme_spec.rb +41 -0
- data/specs/spec_helper.rb +133 -0
- data/specs/spec_id/aa_freqs_spec.rb +52 -0
- data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
- data/specs/spec_id/digestor_spec.rb +75 -0
- data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
- data/specs/spec_id/precision/filter/output_spec.rb +31 -0
- data/specs/spec_id/precision/filter_spec.rb +243 -0
- data/specs/spec_id/precision/prob_spec.rb +111 -0
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
- data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
- data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
- data/specs/spec_id/sequest/params_spec.rb +68 -0
- data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
- data/specs/spec_id/sqt_spec.rb +138 -0
- data/specs/spec_id/srf_spec.rb +209 -0
- data/specs/spec_id/srf_spec_helper.rb +302 -0
- data/specs/spec_id_helper.rb +33 -0
- data/specs/spec_id_spec.rb +361 -0
- data/specs/spec_id_xml_spec.rb +33 -0
- data/specs/transmem/phobius_spec.rb +423 -0
- data/specs/transmem/toppred_spec.rb +297 -0
- data/specs/transmem_spec.rb +60 -0
- data/specs/transmem_spec_shared.rb +64 -0
- data/specs/validator/aa_spec.rb +107 -0
- data/specs/validator/background_spec.rb +51 -0
- data/specs/validator/bias_spec.rb +146 -0
- data/specs/validator/decoy_spec.rb +51 -0
- data/specs/validator/fasta_helper.rb +26 -0
- data/specs/validator/prot_from_pep_spec.rb +141 -0
- data/specs/validator/transmem_spec.rb +145 -0
- data/specs/validator/true_pos_spec.rb +58 -0
- data/specs/validator_helper.rb +33 -0
- data/specs/xml_spec.rb +12 -0
- data/test_files/000_pepxml18_small.xml +206 -0
- data/test_files/020a.mzXML.timeIndex +4710 -0
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
- data/test_files/4-03-03_small-prot.xml +321 -0
- data/test_files/4-03-03_small.xml +3876 -0
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +5999 -0
- data/test_files/bioworks31.params +77 -0
- data/test_files/bioworks32.params +62 -0
- data/test_files/bioworks33.params +63 -0
- data/test_files/bioworks_single_run_small.xml +7237 -0
- data/test_files/bioworks_small.fasta +212 -0
- data/test_files/bioworks_small.params +63 -0
- data/test_files/bioworks_small.phobius +109 -0
- data/test_files/bioworks_small.toppred.out +2847 -0
- data/test_files/bioworks_small.xml +5610 -0
- data/test_files/bioworks_with_INV_small.xml +3753 -0
- data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +304 -0
- data/test_files/messups.fasta +297 -0
- data/test_files/opd1/000.my_answer.100lines.xml +101 -0
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
- data/test_files/opd1/000_020_3prots-prot.xml +62 -0
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
- data/test_files/opd1/sequest.3.1.params +77 -0
- data/test_files/opd1/sequest.3.2.params +62 -0
- data/test_files/opd1/twenty_scans.mzXML +418 -0
- data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +9 -0
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
- data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
- data/test_files/pepproph_small.xml +4691 -0
- data/test_files/phobius.small.noheader.txt +50 -0
- data/test_files/phobius.small.small.txt +53 -0
- data/test_files/s01_anC1_ld020mM.key.txt +25 -0
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +297 -0
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +14340 -0
- data/test_files/tf_bioworks2excel.txt.actual +1035 -0
- data/test_files/toppred.small.out +416 -0
- data/test_files/toppred.xml.out +318 -0
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
- data/test_files/yeast_gly_small-prot.xml +265 -0
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
- data/test_files/yeast_gly_small.xml +3807 -0
- data/test_files/yeast_gly_small2.parentTimes +6 -0
- metadata +273 -57
- data/bin/filter.rb +0 -6
- data/bin/precision.rb +0 -5
- data/lib/spec/mzdata/parser.rb +0 -108
- data/lib/spec/mzdata.rb +0 -48
- data/lib/spec/mzxml/parser.rb +0 -449
- data/lib/spec/scan.rb +0 -55
- data/lib/spec_id/filter.rb +0 -797
- data/lib/spec_id/precision.rb +0 -421
- data/lib/toppred.rb +0 -18
- data/script/filter-peps.rb +0 -164
- data/test/tc_aa_freqs.rb +0 -59
- data/test/tc_fasta_shaker.rb +0 -149
- data/test/tc_filter.rb +0 -203
- data/test/tc_filter_peps.rb +0 -46
- data/test/tc_gi.rb +0 -17
- data/test/tc_id_class_anal.rb +0 -70
- data/test/tc_id_precision.rb +0 -89
- data/test/tc_msrun.rb +0 -88
- data/test/tc_mzxml.rb +0 -88
- data/test/tc_mzxml_to_lmat.rb +0 -36
- data/test/tc_peptide_parent_times.rb +0 -27
- data/test/tc_precision.rb +0 -60
- data/test/tc_roc.rb +0 -166
- data/test/tc_sample_enzyme.rb +0 -32
- data/test/tc_scan.rb +0 -26
- data/test/tc_sequest.rb +0 -336
- data/test/tc_spec.rb +0 -78
- data/test/tc_spec_id.rb +0 -201
- data/test/tc_spec_id_xml.rb +0 -36
- data/test/tc_srf.rb +0 -262
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
#!/usr/bin/ruby -w
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
require 'optparse'
|
|
5
|
+
|
|
6
|
+
opt = {}
|
|
7
|
+
opt[:probability] = 1.0
|
|
8
|
+
opts = OptionParser.new do |op|
|
|
9
|
+
op.banner = "USAGE: #{File.basename(__FILE__)} toppred.out"
|
|
10
|
+
op.separator "Outputs toppred.yaml"
|
|
11
|
+
op.separator "takes the highest probability structure"
|
|
12
|
+
op.separator "for best structures of equal probability, takes first given"
|
|
13
|
+
op.separator "Each line contains:"
|
|
14
|
+
op.separator "<identifier>: String :"
|
|
15
|
+
op.separator " num_found: Int"
|
|
16
|
+
op.separator " num_certain_transmembrane_segments: Int"
|
|
17
|
+
op.separator " num_putative_transmembrane_segments: Int"
|
|
18
|
+
op.separator " best_structure_probability: Float"
|
|
19
|
+
op.separator " transmembrane_segments:"
|
|
20
|
+
op.separator " - probability: Float"
|
|
21
|
+
op.separator " start: Int"
|
|
22
|
+
op.separator " stop: Int"
|
|
23
|
+
op.separator " aaseq: String"
|
|
24
|
+
op.separator ""
|
|
25
|
+
op.separator "OPTIONS:"
|
|
26
|
+
op.on("-p", "--probability", Float, "min structure prob threshold (default #{opt[:probability]})") {|v| opt[:probability] = v}
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
opts.parse!
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
if ARGV.size == 0
|
|
33
|
+
puts opts
|
|
34
|
+
exit
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
file = ARGV.shift
|
|
38
|
+
|
|
39
|
+
File.open(file) do |fh|
|
|
40
|
+
hash = Transmem.read_toppred(fh)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
puts hash.to_yaml
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
|
data/script/tpp_installer.rb
CHANGED
|
@@ -202,7 +202,7 @@ chmod(0777, TPP_DATA_PATH.chomp('/'))
|
|
|
202
202
|
mkpath TPP_VIS_PATH.chomp('/')
|
|
203
203
|
|
|
204
204
|
## VERY SPECIFIC to OUR SYSTEM
|
|
205
|
-
soft_link('/project/marcotte/ms', TPP_DATA_PATH.chomp('/') + '/ms')
|
|
205
|
+
soft_link('/project/marcotte/marcotte/ms', TPP_DATA_PATH.chomp('/') + '/ms')
|
|
206
206
|
system "sudo chown john:marcotte #{TPP_DATA_PATH.chomp('/')}"
|
|
207
207
|
system "sudo chown john:marcotte #{TPP_VIS_PATH.chomp('/')}"
|
|
208
208
|
|
|
@@ -1,20 +1,17 @@
|
|
|
1
|
+
require File.expand_path( File.dirname(__FILE__) + '/spec_helper' )
|
|
1
2
|
|
|
2
|
-
require 'test/unit'
|
|
3
3
|
require 'align'
|
|
4
|
-
require 'pp'
|
|
5
4
|
|
|
6
|
-
|
|
5
|
+
describe Align do
|
|
7
6
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@
|
|
11
|
-
@
|
|
12
|
-
@
|
|
13
|
-
@prt = @tfiles + '4-03-03_small-prot.xml'
|
|
14
|
-
@pep = @tfiles + '4-03-03_small.xml'
|
|
7
|
+
before(:each) do
|
|
8
|
+
@mz1 = Tfiles + '4-03-03_mzXML/000.mzXML.timeIndex'
|
|
9
|
+
@mz2 = Tfiles + '4-03-03_mzXML/020.mzXML.timeIndex'
|
|
10
|
+
@prt = Tfiles + '4-03-03_small-prot.xml'
|
|
11
|
+
@pep = Tfiles + '4-03-03_small.xml'
|
|
15
12
|
end
|
|
16
13
|
|
|
17
|
-
|
|
14
|
+
it_should 'finds overlapping peptides of same seq+charge' do
|
|
18
15
|
s1 = 'DETTIVEGAGDAEAIQGR'
|
|
19
16
|
c1 = '2'
|
|
20
17
|
s2 = 'TDDVAGDGTTTATVLAQALVR'
|
|
@@ -35,28 +32,25 @@ class AlignTest < Test::Unit::TestCase
|
|
|
35
32
|
has_seqcharges << false
|
|
36
33
|
end
|
|
37
34
|
end
|
|
38
|
-
has_seqcharges.each
|
|
35
|
+
has_seqcharges.each { |c| c.should be_true }
|
|
39
36
|
end
|
|
40
37
|
end
|
|
41
38
|
|
|
42
39
|
### !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
43
40
|
# @TODO: CURRENT WORK!
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
olap.each do |peps|
|
|
54
|
-
p peps
|
|
55
|
-
end
|
|
41
|
+
it_should 'should find overlapping peptides at a seqcharge with a filter' do
|
|
42
|
+
al = Align.new
|
|
43
|
+
pep1 = al.peps_with_scans([@mz1], @prt, @pep, 0.0 ,0.0 ,0.0 )
|
|
44
|
+
pep2 = al.peps_with_scans(@mz2, @prt, @pep, 0.0, 0.0, 0.0 )
|
|
45
|
+
max_dups = nil
|
|
46
|
+
outlier_cutoff = 0.0
|
|
47
|
+
olap = al.overlapping_peps_by_seqcharge_with_filter([pep1, pep2], max_dups, outlier_cutoff)
|
|
48
|
+
olap.each do |peps|
|
|
49
|
+
p peps
|
|
56
50
|
end
|
|
57
51
|
end
|
|
58
52
|
|
|
59
|
-
|
|
53
|
+
it_should 'should toss outliers' do
|
|
60
54
|
|
|
61
55
|
# Consistency/sanity checks right now (not accuracy)
|
|
62
56
|
x = [-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,10,0 ,1,2,3,4,5,6,7,8,9]
|
|
@@ -65,7 +59,7 @@ class AlignTest < Test::Unit::TestCase
|
|
|
65
59
|
expy2 = [-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,1,2,3,4,5,6,7,8,9]
|
|
66
60
|
|
|
67
61
|
pcls = Proph::Pep
|
|
68
|
-
scls =
|
|
62
|
+
scls = MS::Scan
|
|
69
63
|
|
|
70
64
|
pep_groups = [x,y].collect do |arr|
|
|
71
65
|
arr.collect do |val|
|
|
@@ -79,7 +73,7 @@ class AlignTest < Test::Unit::TestCase
|
|
|
79
73
|
deviations = 3.2
|
|
80
74
|
size_before = pep_groups.first.size
|
|
81
75
|
al.toss_outliers(pep_groups, deviations)
|
|
82
|
-
|
|
76
|
+
(size_before - pep_groups.first.size).should == 2
|
|
83
77
|
end
|
|
84
78
|
|
|
85
79
|
end
|
|
@@ -1,12 +1,8 @@
|
|
|
1
|
-
|
|
2
|
-
require 'test/unit'
|
|
3
|
-
require File.dirname(File.expand_path(__FILE__)) + '/load_bin_path'
|
|
1
|
+
require File.expand_path( File.dirname(__FILE__) + '/../spec_helper' )
|
|
4
2
|
require 'fileutils'
|
|
5
3
|
|
|
6
|
-
tmp = $VERBOSE
|
|
7
|
-
$VERBOSE = 5
|
|
8
4
|
|
|
9
|
-
$XML_SANITY_LINES = ['<sample_enzyme name="
|
|
5
|
+
$XML_SANITY_LINES = ['<sample_enzyme name="Trypsin">', '<specificity cut="KR" no_cut="P" sense="C"/>', '<parameter name="diff_search_options" value="0.000000 S 0.000000 C 0.000000 M 0.000000 X 0.000000 T 0.000000 Y"/>']
|
|
10
6
|
|
|
11
7
|
$XML_SANITY_MATCHES = [/<spectrum_query spectrum="0\d0.\d+.\d+.[123]" start_scan="\d+" end_scan="\d+" precursor_neutral_mass="[\d\.]+" assumed_charge="[123]" index="\d+">/,
|
|
12
8
|
/ <search_hit hit_rank="\d" peptide="[\w\-\.]+" peptide_prev_aa="." peptide_next_aa="." protein=".*" num_tot_proteins="\d+" num_matched_ions="\d+" tot_num_ions="\d+" calc_neutral_pep_mass="[\d\.]+" massdiff="[\+\-][\d\.]+" num_tol_term="\d" num_missed_cleavages="\d" is_rejected="[01]">/,
|
|
@@ -18,78 +14,66 @@ $XML_SANITY_MATCHES = [/<spectrum_query spectrum="0\d0.\d+.\d+.[123]" start_scan
|
|
|
18
14
|
]
|
|
19
15
|
|
|
20
16
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
@
|
|
28
|
-
@
|
|
29
|
-
@tf_mzxml_path = @tfiles_l + "yeast_gly_mzXML"
|
|
30
|
-
@tf_bioworks_xml = @tfiles + "bioworks_small.xml"
|
|
31
|
-
@tf_params = @tfiles + "bioworks32.params"
|
|
32
|
-
@no_delete = true
|
|
33
|
-
@out_path = @tfiles + 'pepxml/'
|
|
34
|
-
@cmd = "ruby -I#{File.join(File.dirname(__FILE__), "..", "lib")} -S bioworks_to_pepxml.rb "
|
|
17
|
+
describe 'bioworks_to_pepxml.rb' do
|
|
18
|
+
before(:all) do
|
|
19
|
+
@tf_mzxml_path = Tfiles_l + "/yeast_gly_mzXML"
|
|
20
|
+
@tf_bioworks_xml = Tfiles + "/bioworks_small.xml"
|
|
21
|
+
@tf_params = Tfiles + '/bioworks32.params'
|
|
22
|
+
@out_path = Tfiles + '/pepxml/'
|
|
23
|
+
@progname = 'bioworks_to_pepxml.rb'
|
|
24
|
+
@no_delete = false
|
|
35
25
|
end
|
|
36
26
|
|
|
37
|
-
|
|
38
|
-
assert_match(/usage:/, `#{@cmd}`)
|
|
39
|
-
end
|
|
27
|
+
it_should_behave_like "a cmdline program"
|
|
40
28
|
|
|
41
29
|
def _basic(cmd, prc)
|
|
42
|
-
puts "Performing: #{cmd}" if $
|
|
30
|
+
puts "Performing: #{cmd}" if $DEBUG
|
|
43
31
|
reply = `#{cmd}`
|
|
44
|
-
puts reply if $
|
|
32
|
+
puts reply if $DEBUG
|
|
45
33
|
%w(000 020).each do |file|
|
|
46
34
|
ffile = @out_path + file + ".xml"
|
|
47
35
|
prc.call(ffile)
|
|
48
36
|
end
|
|
49
37
|
end
|
|
50
38
|
|
|
51
|
-
|
|
52
|
-
|
|
39
|
+
spec_large do
|
|
40
|
+
it 'works on a real bioworks.xml file' do
|
|
53
41
|
cmd = "#{@cmd} -p #{@tf_params} -o #{@out_path} #{@tf_bioworks_xml} -m #{@tf_mzxml_path} -d /work/special/path --copy_mzxml"
|
|
54
42
|
## FILES EXIST:
|
|
55
43
|
prc = proc {|file|
|
|
56
|
-
|
|
44
|
+
file.should exist
|
|
57
45
|
beginning = IO.readlines(file)[0,50].join("\n")
|
|
58
46
|
$XML_SANITY_LINES.each do |line|
|
|
59
|
-
|
|
47
|
+
beginning.should include(line)
|
|
48
|
+
#beginning.include?(line).should be_true
|
|
60
49
|
end
|
|
61
50
|
$XML_SANITY_MATCHES.each do |match|
|
|
62
|
-
|
|
51
|
+
beginning.should =~ match
|
|
63
52
|
end
|
|
64
53
|
}
|
|
65
54
|
_basic(cmd, prc)
|
|
66
55
|
## COPY MZXML:
|
|
67
56
|
%w(000 020).each do |file|
|
|
68
57
|
mzxml_file = File.join(@out_path, "#{file}.mzXML")
|
|
69
|
-
|
|
58
|
+
mzxml_file.should exist
|
|
70
59
|
end
|
|
71
60
|
## CLEANUP:
|
|
72
61
|
unless @no_delete then FileUtils.rm_rf(@out_path) end
|
|
73
|
-
else
|
|
74
|
-
assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})") )
|
|
75
62
|
end
|
|
76
63
|
end
|
|
77
64
|
|
|
78
|
-
|
|
79
|
-
|
|
65
|
+
spec_large do
|
|
66
|
+
it 'transforms database name when its proper to do so' do
|
|
80
67
|
cmd = "#{@cmd} -p #{@tf_params} -o #{@out_path} #{@tf_bioworks_xml} -m #{@tf_mzxml_path}"
|
|
81
68
|
db_re = /C:\\Xcalibur\\database\\ecoli_K12_ncbi_20060321.fasta/
|
|
82
|
-
|
|
69
|
+
IO.read(@tf_params).should =~ db_re
|
|
83
70
|
prc = proc {|file|
|
|
84
|
-
|
|
85
|
-
|
|
71
|
+
file.should exist
|
|
72
|
+
IO.read(file).should_not =~ db_re
|
|
86
73
|
}
|
|
87
74
|
_basic(cmd, prc)
|
|
88
75
|
unless @no_delete then FileUtils.rm_rf(@out_path) end
|
|
89
|
-
else
|
|
90
|
-
assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})") )
|
|
91
76
|
end
|
|
92
77
|
end
|
|
93
78
|
end
|
|
94
79
|
|
|
95
|
-
$VERBOSE = tmp
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
require File.expand_path( File.dirname(__FILE__) + '/../spec_helper' )
|
|
2
|
+
|
|
3
|
+
require 'fasta'
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Fasta
|
|
7
|
+
def same_sized_proteins?(other_fasta_obj_or_file)
|
|
8
|
+
other = Fasta.to_fasta(other_fasta_obj_or_file)
|
|
9
|
+
@prots.zip(other.prots).all? do |a,b|
|
|
10
|
+
a.aaseq.size == b.aaseq.size
|
|
11
|
+
end
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
# This is tough to say 'for sure' Right now, we consider the proteins
|
|
15
|
+
# shuffled if they are all the same size and 2/3 or more of the peptides are
|
|
16
|
+
# different than the other (this is designed for small sets of proteins
|
|
17
|
+
# where it is possible one of the peptides is equal to the other).
|
|
18
|
+
def shuffled?(other_fasta_obj_or_file)
|
|
19
|
+
other = Fasta.to_fasta(other_fasta_obj_or_file)
|
|
20
|
+
if !same_sized_proteins?(other)
|
|
21
|
+
false
|
|
22
|
+
else
|
|
23
|
+
(same, different) = @prots.zip(other.prots).partition do |prota, protb|
|
|
24
|
+
prota == protb
|
|
25
|
+
end
|
|
26
|
+
fraction_different = different.size.to_f / (same.size + different.size)
|
|
27
|
+
fraction_different >= 2.0/3
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
describe "a manipulator of a fasta file", :shared => true do
|
|
33
|
+
before(:all) do
|
|
34
|
+
@filestring = ">gi|P1
|
|
35
|
+
AMKRGAN
|
|
36
|
+
>gi|P2
|
|
37
|
+
CRGATKKTAGRPMEK
|
|
38
|
+
>gi|P3
|
|
39
|
+
PEPTIDE
|
|
40
|
+
"
|
|
41
|
+
|
|
42
|
+
@rev_filestring = ">gi|P1
|
|
43
|
+
NAGRKMA
|
|
44
|
+
>gi|P2
|
|
45
|
+
KEMPRGATKKTAGRC
|
|
46
|
+
>gi|P3
|
|
47
|
+
EDITPEP
|
|
48
|
+
"
|
|
49
|
+
|
|
50
|
+
@rev_pref_filestring = ">REV_gi|P1
|
|
51
|
+
NAGRKMA
|
|
52
|
+
>REV_gi|P2
|
|
53
|
+
KEMPRGATKKTAGRC
|
|
54
|
+
>REV_gi|P3
|
|
55
|
+
EDITPEP
|
|
56
|
+
"
|
|
57
|
+
|
|
58
|
+
@rev_tryptic_filestring = ">gi|P1
|
|
59
|
+
MAKRNAG
|
|
60
|
+
>gi|P2
|
|
61
|
+
CRTAGKKEMPRGATK
|
|
62
|
+
>gi|P3
|
|
63
|
+
EDITPEP
|
|
64
|
+
"
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
before(:each) do
|
|
69
|
+
testdir = File.dirname(__FILE__)
|
|
70
|
+
@tmpfile = Tfiles + "/littlefasta.trash.fasta"
|
|
71
|
+
@f = Tfiles + "/trash.fasta"
|
|
72
|
+
File.open(@tmpfile, "w") {|fh| fh.print @filestring }
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
after(:each) do
|
|
76
|
+
File.unlink @tmpfile if File.exist? @tmpfile
|
|
77
|
+
File.unlink @f if File.exist? @f
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
it 'reverses protein sequences' do
|
|
81
|
+
reverse_the_file
|
|
82
|
+
fastap(@f).to_s.should == @rev_filestring
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def reverse_the_file
|
|
86
|
+
do_it(:reverse)
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
it 'shuffles protein sequences' do
|
|
90
|
+
shuffle_the_file
|
|
91
|
+
Fasta.new(@f).shuffled?(Fasta.from_string(@filestring)).should be_true
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
def shuffle_the_file
|
|
95
|
+
do_it(:shuffle)
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
it 'concatenates sequences' do
|
|
99
|
+
concatenate_sequences
|
|
100
|
+
lns = fastalns(@f)
|
|
101
|
+
strlns(@filestring).should == lns[0..5] # first part equal
|
|
102
|
+
strlns(@rev_pref_filestring).should == lns[6..-1] # "second part equal")
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
def concatenate_sequences
|
|
106
|
+
do_it(:reverse, :cat => true, :prefix => 'REV_')
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
it 'makes prefixes' do
|
|
110
|
+
make_prefixes
|
|
111
|
+
#@shaker.reverse(@tmpfile, :out => @f, :prefix => 'SILLY_')
|
|
112
|
+
fp = fastap(@f)
|
|
113
|
+
fp.each do |prt|
|
|
114
|
+
prt.header.should match(/^>SILLY_.+/)
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
def make_prefixes
|
|
119
|
+
do_it(:reverse, :prefix => 'SILLY_')
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
it 'makes fractions of proteins' do
|
|
123
|
+
make_fractions_of_proteins(1.0/3)
|
|
124
|
+
fastap(@f).size.should == 1
|
|
125
|
+
fastap(@f).first.header.should =~ /^>[^M]/
|
|
126
|
+
|
|
127
|
+
# this guy gets rounded up on the command line so that it fails there
|
|
128
|
+
#make_fractions_of_proteins(2.0/3)
|
|
129
|
+
#fastap(@f).size.should == 2
|
|
130
|
+
#fastap(@f).each do |prt|
|
|
131
|
+
# prt.header.should =~ /^>[^M]/
|
|
132
|
+
#end
|
|
133
|
+
|
|
134
|
+
make_fractions_of_proteins(1.0)
|
|
135
|
+
fastap(@f).size.should == 3
|
|
136
|
+
fastap(@f).each do |prt|
|
|
137
|
+
prt.header.should =~ /^>[^M]/
|
|
138
|
+
end
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
def make_fractions_of_proteins(fraction)
|
|
142
|
+
do_it(:shuffle, :fraction => fraction)
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
it 'makes fractions with labels (for > 1)' do
|
|
147
|
+
make_fractions_of_proteins(1.1)
|
|
148
|
+
fastap(@f).size.should == 4
|
|
149
|
+
fastap(@f).any? do |prt|
|
|
150
|
+
prt.header =~ /^>[^M]/
|
|
151
|
+
end.should be_true
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
make_fractions_of_proteins(2.6)
|
|
155
|
+
fastap(@f).size.should == 8
|
|
156
|
+
|
|
157
|
+
make_reverse_cat_fractions(2.0)
|
|
158
|
+
fastap(@f).size.should == 9
|
|
159
|
+
|
|
160
|
+
fp = Fasta.new(@f)
|
|
161
|
+
fp[0..2].each do |prt|
|
|
162
|
+
prt.header.should =~ /^>/
|
|
163
|
+
end
|
|
164
|
+
fp[3..5].each do |prt|
|
|
165
|
+
prt.header.should =~ /^>MINE_f0_/
|
|
166
|
+
end
|
|
167
|
+
fp[6..8].each do |prt|
|
|
168
|
+
prt.header.should =~ /^>MINE_f1_/
|
|
169
|
+
end
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
def make_reverse_cat_fractions(fraction, prefix='MINE_')
|
|
173
|
+
do_it(:reverse, :fraction => fraction, :cat => true, :prefix => prefix)
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
def reverse_tryptic_peptides
|
|
177
|
+
do_it(:reverse, :tryptic_peptides => true)
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
it 'reverses tryptic peptides' do
|
|
181
|
+
reverse_tryptic_peptides
|
|
182
|
+
Fasta.from_string(@rev_tryptic_filestring).should == Fasta.new(@f)
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
def shuffle_tryptic_peptides
|
|
186
|
+
do_it(:shuffle, :tryptic_peptides => true)
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
it 'shuffles tryptic peptides (rerun on failure to recheck)' do
|
|
190
|
+
shuffle_tryptic_peptides
|
|
191
|
+
lns = fastap(@f).to_s.split("\n")
|
|
192
|
+
lns[1][2..3].should == 'KR'
|
|
193
|
+
lns[3][1..1].should == 'R'
|
|
194
|
+
lns[3].size.should == 'CRGATKKTAGRPMEK'.size
|
|
195
|
+
lns[3].should_not == 'CRGATKKTAGRPMEK' #sequence is randomised from original [remote chance of failure] rerun to make sure
|
|
196
|
+
end
|
|
197
|
+
|
|
198
|
+
def strlns(str)
|
|
199
|
+
str.split("\n")
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
def fastalns(fn)
|
|
203
|
+
fn.should exist
|
|
204
|
+
IO.read(fn).split("\n")
|
|
205
|
+
end
|
|
206
|
+
|
|
207
|
+
# returns the fasta object proteins
|
|
208
|
+
def fastap(fn)
|
|
209
|
+
@f.should exist
|
|
210
|
+
Fasta.new(fn).prots
|
|
211
|
+
end
|
|
212
|
+
|
|
213
|
+
end
|
|
214
|
+
|
|
215
|
+
describe FastaShaker, "by method call" do
|
|
216
|
+
|
|
217
|
+
before(:all) do
|
|
218
|
+
@shaker = FastaShaker.new
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
it_should_behave_like "a manipulator of a fasta file"
|
|
222
|
+
|
|
223
|
+
def do_it(method, additional_opts={})
|
|
224
|
+
opts = {:out => @f}
|
|
225
|
+
@shaker.send(method, @tmpfile, opts.merge(additional_opts))
|
|
226
|
+
end
|
|
227
|
+
|
|
228
|
+
end
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
describe FastaShaker, "by command line long args" do
|
|
232
|
+
before(:all) do
|
|
233
|
+
@progname = 'fasta_shaker.rb'
|
|
234
|
+
end
|
|
235
|
+
|
|
236
|
+
it_should_behave_like "a cmdline program"
|
|
237
|
+
it_should_behave_like "a manipulator of a fasta file"
|
|
238
|
+
|
|
239
|
+
# returns an array of the args
|
|
240
|
+
def opts_to_cmd_args(hash)
|
|
241
|
+
opts = []
|
|
242
|
+
hash.each do |k,v|
|
|
243
|
+
opts.push('--' + k.to_s)
|
|
244
|
+
unless (v == true) or (v == false)
|
|
245
|
+
opts.push(v)
|
|
246
|
+
end
|
|
247
|
+
end
|
|
248
|
+
opts
|
|
249
|
+
end
|
|
250
|
+
|
|
251
|
+
def do_it(method, additional_opts={})
|
|
252
|
+
opts = {:out => @f}
|
|
253
|
+
opts.merge!(additional_opts)
|
|
254
|
+
cmd = [@cmd, method, @tmpfile, *(opts_to_cmd_args(opts))].join(" ")
|
|
255
|
+
#puts cmd
|
|
256
|
+
system cmd
|
|
257
|
+
end
|
|
258
|
+
|
|
259
|
+
end
|