mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -0,0 +1,47 @@
1
+ #!/usr/bin/ruby -w
2
+
3
+
4
+ require 'optparse'
5
+
6
+ opt = {}
7
+ opt[:probability] = 1.0
8
+ opts = OptionParser.new do |op|
9
+ op.banner = "USAGE: #{File.basename(__FILE__)} toppred.out"
10
+ op.separator "Outputs toppred.yaml"
11
+ op.separator "takes the highest probability structure"
12
+ op.separator "for best structures of equal probability, takes first given"
13
+ op.separator "Each line contains:"
14
+ op.separator "<identifier>: String :"
15
+ op.separator " num_found: Int"
16
+ op.separator " num_certain_transmembrane_segments: Int"
17
+ op.separator " num_putative_transmembrane_segments: Int"
18
+ op.separator " best_structure_probability: Float"
19
+ op.separator " transmembrane_segments:"
20
+ op.separator " - probability: Float"
21
+ op.separator " start: Int"
22
+ op.separator " stop: Int"
23
+ op.separator " aaseq: String"
24
+ op.separator ""
25
+ op.separator "OPTIONS:"
26
+ op.on("-p", "--probability", Float, "min structure prob threshold (default #{opt[:probability]})") {|v| opt[:probability] = v}
27
+ end
28
+
29
+ opts.parse!
30
+
31
+
32
+ if ARGV.size == 0
33
+ puts opts
34
+ exit
35
+ end
36
+
37
+ file = ARGV.shift
38
+
39
+ File.open(file) do |fh|
40
+ hash = Transmem.read_toppred(fh)
41
+ end
42
+
43
+ puts hash.to_yaml
44
+
45
+
46
+
47
+
@@ -202,7 +202,7 @@ chmod(0777, TPP_DATA_PATH.chomp('/'))
202
202
  mkpath TPP_VIS_PATH.chomp('/')
203
203
 
204
204
  ## VERY SPECIFIC to OUR SYSTEM
205
- soft_link('/project/marcotte/ms', TPP_DATA_PATH.chomp('/') + '/ms')
205
+ soft_link('/project/marcotte/marcotte/ms', TPP_DATA_PATH.chomp('/') + '/ms')
206
206
  system "sudo chown john:marcotte #{TPP_DATA_PATH.chomp('/')}"
207
207
  system "sudo chown john:marcotte #{TPP_VIS_PATH.chomp('/')}"
208
208
 
@@ -1,20 +1,17 @@
1
+ require File.expand_path( File.dirname(__FILE__) + '/spec_helper' )
1
2
 
2
- require 'test/unit'
3
3
  require 'align'
4
- require 'pp'
5
4
 
6
- class AlignTest < Test::Unit::TestCase
5
+ describe Align do
7
6
 
8
- def initialize(arg)
9
- super(arg)
10
- @tfiles = File.dirname(__FILE__) + '/tfiles/'
11
- @mz1 = @tfiles + '4-03-03_mzXML/000.mzXML.timeIndex'
12
- @mz2 = @tfiles + '4-03-03_mzXML/020.mzXML.timeIndex'
13
- @prt = @tfiles + '4-03-03_small-prot.xml'
14
- @pep = @tfiles + '4-03-03_small.xml'
7
+ before(:each) do
8
+ @mz1 = Tfiles + '4-03-03_mzXML/000.mzXML.timeIndex'
9
+ @mz2 = Tfiles + '4-03-03_mzXML/020.mzXML.timeIndex'
10
+ @prt = Tfiles + '4-03-03_small-prot.xml'
11
+ @pep = Tfiles + '4-03-03_small.xml'
15
12
  end
16
13
 
17
- def test_overlapping_peps_by_seqcharge
14
+ it_should 'finds overlapping peptides of same seq+charge' do
18
15
  s1 = 'DETTIVEGAGDAEAIQGR'
19
16
  c1 = '2'
20
17
  s2 = 'TDDVAGDGTTTATVLAQALVR'
@@ -35,28 +32,25 @@ class AlignTest < Test::Unit::TestCase
35
32
  has_seqcharges << false
36
33
  end
37
34
  end
38
- has_seqcharges.each do |c| assert c end
35
+ has_seqcharges.each { |c| c.should be_true }
39
36
  end
40
37
  end
41
38
 
42
39
  ### !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
43
40
  # @TODO: CURRENT WORK!
44
- def test_overlapping_peps_by_seqcharge_with_filter
45
- assert true
46
- if false
47
- al = Align.new
48
- pep1 = al.peps_with_scans([@mz1], @prt, @pep, 0.0 ,0.0 ,0.0 )
49
- pep2 = al.peps_with_scans(@mz2, @prt, @pep, 0.0, 0.0, 0.0 )
50
- max_dups = nil
51
- outlier_cutoff = 0.0
52
- olap = al.overlapping_peps_by_seqcharge_with_filter([pep1, pep2], max_dups, outlier_cutoff)
53
- olap.each do |peps|
54
- p peps
55
- end
41
+ it_should 'should find overlapping peptides at a seqcharge with a filter' do
42
+ al = Align.new
43
+ pep1 = al.peps_with_scans([@mz1], @prt, @pep, 0.0 ,0.0 ,0.0 )
44
+ pep2 = al.peps_with_scans(@mz2, @prt, @pep, 0.0, 0.0, 0.0 )
45
+ max_dups = nil
46
+ outlier_cutoff = 0.0
47
+ olap = al.overlapping_peps_by_seqcharge_with_filter([pep1, pep2], max_dups, outlier_cutoff)
48
+ olap.each do |peps|
49
+ p peps
56
50
  end
57
51
  end
58
52
 
59
- def test_toss_outliers
53
+ it_should 'should toss outliers' do
60
54
 
61
55
  # Consistency/sanity checks right now (not accuracy)
62
56
  x = [-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,10,0 ,1,2,3,4,5,6,7,8,9]
@@ -65,7 +59,7 @@ class AlignTest < Test::Unit::TestCase
65
59
  expy2 = [-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,1,2,3,4,5,6,7,8,9]
66
60
 
67
61
  pcls = Proph::Pep
68
- scls = Spec::Scan
62
+ scls = MS::Scan
69
63
 
70
64
  pep_groups = [x,y].collect do |arr|
71
65
  arr.collect do |val|
@@ -79,7 +73,7 @@ class AlignTest < Test::Unit::TestCase
79
73
  deviations = 3.2
80
74
  size_before = pep_groups.first.size
81
75
  al.toss_outliers(pep_groups, deviations)
82
- assert_equal(2, size_before - pep_groups.first.size)
76
+ (size_before - pep_groups.first.size).should == 2
83
77
  end
84
78
 
85
79
  end
@@ -1,12 +1,8 @@
1
-
2
- require 'test/unit'
3
- require File.dirname(File.expand_path(__FILE__)) + '/load_bin_path'
1
+ require File.expand_path( File.dirname(__FILE__) + '/../spec_helper' )
4
2
  require 'fileutils'
5
3
 
6
- tmp = $VERBOSE
7
- $VERBOSE = 5
8
4
 
9
- $XML_SANITY_LINES = ['<sample_enzyme name="trypsin">', '<specificity cut="KR" no_cut="P" sense="C"/>', '<parameter name="diff_search_options" value="0.000000 S 0.000000 C 0.000000 M 0.000000 X 0.000000 T 0.000000 Y"/>']
5
+ $XML_SANITY_LINES = ['<sample_enzyme name="Trypsin">', '<specificity cut="KR" no_cut="P" sense="C"/>', '<parameter name="diff_search_options" value="0.000000 S 0.000000 C 0.000000 M 0.000000 X 0.000000 T 0.000000 Y"/>']
10
6
 
11
7
  $XML_SANITY_MATCHES = [/<spectrum_query spectrum="0\d0.\d+.\d+.[123]" start_scan="\d+" end_scan="\d+" precursor_neutral_mass="[\d\.]+" assumed_charge="[123]" index="\d+">/,
12
8
  / <search_hit hit_rank="\d" peptide="[\w\-\.]+" peptide_prev_aa="." peptide_next_aa="." protein=".*" num_tot_proteins="\d+" num_matched_ions="\d+" tot_num_ions="\d+" calc_neutral_pep_mass="[\d\.]+" massdiff="[\+\-][\d\.]+" num_tol_term="\d" num_missed_cleavages="\d" is_rejected="[01]">/,
@@ -18,78 +14,66 @@ $XML_SANITY_MATCHES = [/<spectrum_query spectrum="0\d0.\d+.\d+.[123]" start_scan
18
14
  ]
19
15
 
20
16
 
21
-
22
-
23
- class BioworksToPepXMLTest < Test::Unit::TestCase
24
-
25
- def initialize(arg)
26
- super(arg)
27
- @tfiles = File.dirname(__FILE__) + '/tfiles/'
28
- @tfiles_l = File.dirname(__FILE__) + '/tfiles_large/'
29
- @tf_mzxml_path = @tfiles_l + "yeast_gly_mzXML"
30
- @tf_bioworks_xml = @tfiles + "bioworks_small.xml"
31
- @tf_params = @tfiles + "bioworks32.params"
32
- @no_delete = true
33
- @out_path = @tfiles + 'pepxml/'
34
- @cmd = "ruby -I#{File.join(File.dirname(__FILE__), "..", "lib")} -S bioworks_to_pepxml.rb "
17
+ describe 'bioworks_to_pepxml.rb' do
18
+ before(:all) do
19
+ @tf_mzxml_path = Tfiles_l + "/yeast_gly_mzXML"
20
+ @tf_bioworks_xml = Tfiles + "/bioworks_small.xml"
21
+ @tf_params = Tfiles + '/bioworks32.params'
22
+ @out_path = Tfiles + '/pepxml/'
23
+ @progname = 'bioworks_to_pepxml.rb'
24
+ @no_delete = false
35
25
  end
36
26
 
37
- def test_usage
38
- assert_match(/usage:/, `#{@cmd}`)
39
- end
27
+ it_should_behave_like "a cmdline program"
40
28
 
41
29
  def _basic(cmd, prc)
42
- puts "Performing: #{cmd}" if $VERBOSE
30
+ puts "Performing: #{cmd}" if $DEBUG
43
31
  reply = `#{cmd}`
44
- puts reply if $VERBOSE
32
+ puts reply if $DEBUG
45
33
  %w(000 020).each do |file|
46
34
  ffile = @out_path + file + ".xml"
47
35
  prc.call(ffile)
48
36
  end
49
37
  end
50
38
 
51
- def test_basic
52
- if File.exist? @tfiles_l
39
+ spec_large do
40
+ it 'works on a real bioworks.xml file' do
53
41
  cmd = "#{@cmd} -p #{@tf_params} -o #{@out_path} #{@tf_bioworks_xml} -m #{@tf_mzxml_path} -d /work/special/path --copy_mzxml"
54
42
  ## FILES EXIST:
55
43
  prc = proc {|file|
56
- assert(File.exist?(file), "#{file} exists")
44
+ file.should exist
57
45
  beginning = IO.readlines(file)[0,50].join("\n")
58
46
  $XML_SANITY_LINES.each do |line|
59
- assert(beginning.include?(line), "xml includes line: #{line}")
47
+ beginning.should include(line)
48
+ #beginning.include?(line).should be_true
60
49
  end
61
50
  $XML_SANITY_MATCHES.each do |match|
62
- assert_match(match, beginning, "matches")
51
+ beginning.should =~ match
63
52
  end
64
53
  }
65
54
  _basic(cmd, prc)
66
55
  ## COPY MZXML:
67
56
  %w(000 020).each do |file|
68
57
  mzxml_file = File.join(@out_path, "#{file}.mzXML")
69
- assert(File.exist?( mzxml_file ), "file: #{mzxml_file} exists")
58
+ mzxml_file.should exist
70
59
  end
71
60
  ## CLEANUP:
72
61
  unless @no_delete then FileUtils.rm_rf(@out_path) end
73
- else
74
- assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})") )
75
62
  end
76
63
  end
77
64
 
78
- def test_database
79
- if File.exist? @tfiles_l
65
+ spec_large do
66
+ it 'transforms database name when its proper to do so' do
80
67
  cmd = "#{@cmd} -p #{@tf_params} -o #{@out_path} #{@tf_bioworks_xml} -m #{@tf_mzxml_path}"
81
68
  db_re = /C:\\Xcalibur\\database\\ecoli_K12_ncbi_20060321.fasta/
82
- assert_match(db_re, IO.read(@tf_params))
69
+ IO.read(@tf_params).should =~ db_re
83
70
  prc = proc {|file|
84
- assert(File.exist?(file))
85
- assert_no_match(db_re, IO.read(file))
71
+ file.should exist
72
+ IO.read(file).should_not =~ db_re
86
73
  }
87
74
  _basic(cmd, prc)
88
75
  unless @no_delete then FileUtils.rm_rf(@out_path) end
89
- else
90
- assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})") )
91
76
  end
92
77
  end
93
78
  end
94
79
 
95
- $VERBOSE = tmp
@@ -0,0 +1,259 @@
1
+ require File.expand_path( File.dirname(__FILE__) + '/../spec_helper' )
2
+
3
+ require 'fasta'
4
+
5
+
6
+ class Fasta
7
+ def same_sized_proteins?(other_fasta_obj_or_file)
8
+ other = Fasta.to_fasta(other_fasta_obj_or_file)
9
+ @prots.zip(other.prots).all? do |a,b|
10
+ a.aaseq.size == b.aaseq.size
11
+ end
12
+ end
13
+
14
+ # This is tough to say 'for sure' Right now, we consider the proteins
15
+ # shuffled if they are all the same size and 2/3 or more of the peptides are
16
+ # different than the other (this is designed for small sets of proteins
17
+ # where it is possible one of the peptides is equal to the other).
18
+ def shuffled?(other_fasta_obj_or_file)
19
+ other = Fasta.to_fasta(other_fasta_obj_or_file)
20
+ if !same_sized_proteins?(other)
21
+ false
22
+ else
23
+ (same, different) = @prots.zip(other.prots).partition do |prota, protb|
24
+ prota == protb
25
+ end
26
+ fraction_different = different.size.to_f / (same.size + different.size)
27
+ fraction_different >= 2.0/3
28
+ end
29
+ end
30
+ end
31
+
32
+ describe "a manipulator of a fasta file", :shared => true do
33
+ before(:all) do
34
+ @filestring = ">gi|P1
35
+ AMKRGAN
36
+ >gi|P2
37
+ CRGATKKTAGRPMEK
38
+ >gi|P3
39
+ PEPTIDE
40
+ "
41
+
42
+ @rev_filestring = ">gi|P1
43
+ NAGRKMA
44
+ >gi|P2
45
+ KEMPRGATKKTAGRC
46
+ >gi|P3
47
+ EDITPEP
48
+ "
49
+
50
+ @rev_pref_filestring = ">REV_gi|P1
51
+ NAGRKMA
52
+ >REV_gi|P2
53
+ KEMPRGATKKTAGRC
54
+ >REV_gi|P3
55
+ EDITPEP
56
+ "
57
+
58
+ @rev_tryptic_filestring = ">gi|P1
59
+ MAKRNAG
60
+ >gi|P2
61
+ CRTAGKKEMPRGATK
62
+ >gi|P3
63
+ EDITPEP
64
+ "
65
+ end
66
+
67
+
68
+ before(:each) do
69
+ testdir = File.dirname(__FILE__)
70
+ @tmpfile = Tfiles + "/littlefasta.trash.fasta"
71
+ @f = Tfiles + "/trash.fasta"
72
+ File.open(@tmpfile, "w") {|fh| fh.print @filestring }
73
+ end
74
+
75
+ after(:each) do
76
+ File.unlink @tmpfile if File.exist? @tmpfile
77
+ File.unlink @f if File.exist? @f
78
+ end
79
+
80
+ it 'reverses protein sequences' do
81
+ reverse_the_file
82
+ fastap(@f).to_s.should == @rev_filestring
83
+ end
84
+
85
+ def reverse_the_file
86
+ do_it(:reverse)
87
+ end
88
+
89
+ it 'shuffles protein sequences' do
90
+ shuffle_the_file
91
+ Fasta.new(@f).shuffled?(Fasta.from_string(@filestring)).should be_true
92
+ end
93
+
94
+ def shuffle_the_file
95
+ do_it(:shuffle)
96
+ end
97
+
98
+ it 'concatenates sequences' do
99
+ concatenate_sequences
100
+ lns = fastalns(@f)
101
+ strlns(@filestring).should == lns[0..5] # first part equal
102
+ strlns(@rev_pref_filestring).should == lns[6..-1] # "second part equal")
103
+ end
104
+
105
+ def concatenate_sequences
106
+ do_it(:reverse, :cat => true, :prefix => 'REV_')
107
+ end
108
+
109
+ it 'makes prefixes' do
110
+ make_prefixes
111
+ #@shaker.reverse(@tmpfile, :out => @f, :prefix => 'SILLY_')
112
+ fp = fastap(@f)
113
+ fp.each do |prt|
114
+ prt.header.should match(/^>SILLY_.+/)
115
+ end
116
+ end
117
+
118
+ def make_prefixes
119
+ do_it(:reverse, :prefix => 'SILLY_')
120
+ end
121
+
122
+ it 'makes fractions of proteins' do
123
+ make_fractions_of_proteins(1.0/3)
124
+ fastap(@f).size.should == 1
125
+ fastap(@f).first.header.should =~ /^>[^M]/
126
+
127
+ # this guy gets rounded up on the command line so that it fails there
128
+ #make_fractions_of_proteins(2.0/3)
129
+ #fastap(@f).size.should == 2
130
+ #fastap(@f).each do |prt|
131
+ # prt.header.should =~ /^>[^M]/
132
+ #end
133
+
134
+ make_fractions_of_proteins(1.0)
135
+ fastap(@f).size.should == 3
136
+ fastap(@f).each do |prt|
137
+ prt.header.should =~ /^>[^M]/
138
+ end
139
+ end
140
+
141
+ def make_fractions_of_proteins(fraction)
142
+ do_it(:shuffle, :fraction => fraction)
143
+ end
144
+
145
+
146
+ it 'makes fractions with labels (for > 1)' do
147
+ make_fractions_of_proteins(1.1)
148
+ fastap(@f).size.should == 4
149
+ fastap(@f).any? do |prt|
150
+ prt.header =~ /^>[^M]/
151
+ end.should be_true
152
+
153
+
154
+ make_fractions_of_proteins(2.6)
155
+ fastap(@f).size.should == 8
156
+
157
+ make_reverse_cat_fractions(2.0)
158
+ fastap(@f).size.should == 9
159
+
160
+ fp = Fasta.new(@f)
161
+ fp[0..2].each do |prt|
162
+ prt.header.should =~ /^>/
163
+ end
164
+ fp[3..5].each do |prt|
165
+ prt.header.should =~ /^>MINE_f0_/
166
+ end
167
+ fp[6..8].each do |prt|
168
+ prt.header.should =~ /^>MINE_f1_/
169
+ end
170
+ end
171
+
172
+ def make_reverse_cat_fractions(fraction, prefix='MINE_')
173
+ do_it(:reverse, :fraction => fraction, :cat => true, :prefix => prefix)
174
+ end
175
+
176
+ def reverse_tryptic_peptides
177
+ do_it(:reverse, :tryptic_peptides => true)
178
+ end
179
+
180
+ it 'reverses tryptic peptides' do
181
+ reverse_tryptic_peptides
182
+ Fasta.from_string(@rev_tryptic_filestring).should == Fasta.new(@f)
183
+ end
184
+
185
+ def shuffle_tryptic_peptides
186
+ do_it(:shuffle, :tryptic_peptides => true)
187
+ end
188
+
189
+ it 'shuffles tryptic peptides (rerun on failure to recheck)' do
190
+ shuffle_tryptic_peptides
191
+ lns = fastap(@f).to_s.split("\n")
192
+ lns[1][2..3].should == 'KR'
193
+ lns[3][1..1].should == 'R'
194
+ lns[3].size.should == 'CRGATKKTAGRPMEK'.size
195
+ lns[3].should_not == 'CRGATKKTAGRPMEK' #sequence is randomised from original [remote chance of failure] rerun to make sure
196
+ end
197
+
198
+ def strlns(str)
199
+ str.split("\n")
200
+ end
201
+
202
+ def fastalns(fn)
203
+ fn.should exist
204
+ IO.read(fn).split("\n")
205
+ end
206
+
207
+ # returns the fasta object proteins
208
+ def fastap(fn)
209
+ @f.should exist
210
+ Fasta.new(fn).prots
211
+ end
212
+
213
+ end
214
+
215
+ describe FastaShaker, "by method call" do
216
+
217
+ before(:all) do
218
+ @shaker = FastaShaker.new
219
+ end
220
+
221
+ it_should_behave_like "a manipulator of a fasta file"
222
+
223
+ def do_it(method, additional_opts={})
224
+ opts = {:out => @f}
225
+ @shaker.send(method, @tmpfile, opts.merge(additional_opts))
226
+ end
227
+
228
+ end
229
+
230
+
231
+ describe FastaShaker, "by command line long args" do
232
+ before(:all) do
233
+ @progname = 'fasta_shaker.rb'
234
+ end
235
+
236
+ it_should_behave_like "a cmdline program"
237
+ it_should_behave_like "a manipulator of a fasta file"
238
+
239
+ # returns an array of the args
240
+ def opts_to_cmd_args(hash)
241
+ opts = []
242
+ hash.each do |k,v|
243
+ opts.push('--' + k.to_s)
244
+ unless (v == true) or (v == false)
245
+ opts.push(v)
246
+ end
247
+ end
248
+ opts
249
+ end
250
+
251
+ def do_it(method, additional_opts={})
252
+ opts = {:out => @f}
253
+ opts.merge!(additional_opts)
254
+ cmd = [@cmd, method, @tmpfile, *(opts_to_cmd_args(opts))].join(" ")
255
+ #puts cmd
256
+ system cmd
257
+ end
258
+
259
+ end