mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -0,0 +1,52 @@
1
+ require File.expand_path( File.dirname(__FILE__) + '/../spec_helper'
2
  )
3
+ require 'spec_id/aa_freqs'
4
+
5
+
6
+
7
+ describe SpecID::AAFreqs, "given a small fasta file" do
8
+ before(:all) do
9
+ @sf = Tfiles + "/small.fasta"
10
+ @fobj = Fasta.new(@sf)
11
+ @obj = SpecID::AAFreqs.new(@fobj)
12
+ end
13
+
14
+ it 'calculates AA freqs properly' do
15
+ expect = {:I=>0.0628918621937819, :S=>0.0539719475147049, :D=>0.0526145691939758, :Z=>0.0, :L=>0.102772929998061, :T=>0.0491888048607071, :E=>0.0609527503070261, :O=>0.0, :C=>0.0157714433456144, :K=>0.0471850559110594, :U=>0.0, :Q=>0.0382651412319824, :W=>0.0137030573330748, :A=>0.101997285243359, :M=>0.0294745006786892, :J=>0.0, :G=>0.0811195139292871, :Y=>0.0254670027793937, :X=>0.0, :F=>0.0418201796910348, :R=>0.0546829552065154, :V=>0.0702604873634542, :H=>0.0213302307543145, :B=>0.0, :N=>0.03471010277293, :P=>0.0418201796910348}
16
+ aaf = @obj.aafreqs
17
+ expect.each do |k,v|
18
+ #aaf.key?(k).should be_true
19
+ aaf.should have_key(k)
20
+ aaf[k].should be_close(v, 0.00000001)
21
+ end
22
+ sum = 0.0
23
+ aaf.values.each do |v|
24
+ sum += v
25
+ end
26
+ sum.should be_close(1.0, 0.0000000000001)
27
+ end
28
+
29
+ it 'gets actual and expected nums for at least 1 amino acid' do
30
+ peptide_aaseqs = @fobj.prots.map do |prot|
31
+ prot.aaseq[0..12]
32
+ end
33
+ peptide_aaseqs.size.should == 50
34
+ (ac,ex) = @obj.actual_and_expected_number(peptide_aaseqs, :C, 1)
35
+ ac.should == 9
36
+ ex.should be_close(9.33530631238985, 0.0000000001)
37
+ end
38
+ end
39
+
40
+ describe SpecID::AAFreqs, "with class methods" do
41
+ it 'creates a probability of length lookup table' do
42
+ expecting = [0.0, 0.01, 0.0199, 0.029701, 0.0394039900000001]
43
+ SpecID::AAFreqs.probability_of_length_table(0.01, 4).zip(expecting) do |answ, exp|
44
+ answ.should be_close(exp, 0.0000000001)
45
+ end
46
+ expecting = [0.0, 0.2, 0.36, 0.488, 0.5904]
47
+ SpecID::AAFreqs.probability_of_length_table(0.2, 4).zip(expecting) do |answ, exp|
48
+ answ.should be_close(exp, 0.0000000001)
49
+ end
50
+ end
51
+ end
52
+
53
+
@@ -1,78 +1,51 @@
1
+ require File.expand_path( File.dirname(__FILE__) + '/../spec_helper' )
1
2
 
2
- require 'test/unit'
3
3
  require 'spec_id'
4
- require 'benchmark'
5
-
6
- class BioworksTest < Test::Unit::TestCase
7
-
8
- def initialize(arg)
9
- super(arg)
10
- @tfiles = File.dirname(__FILE__) + '/tfiles/'
11
- @tfiles_l = File.dirname(__FILE__) + '/tfiles_large/'
12
- @tf_bioworks_xml = @tfiles_l + "bioworks.xml"
13
- @tf_bioworks_xml_small = @tfiles + "bioworks_small.xml"
14
- @tf_bioworks_xml_really_small = @tfiles + "bioworks_with_INV_small.xml"
15
- @tf_params = @tfiles + "bioworks32.params"
16
- @tf_bioworks_single_xml_small = @tfiles + 'bioworks_single_run_small.xml'
17
- @tf_bioworks_to_excel = @tfiles + 'tf_bioworks2excel.bioXML'
18
- @tf_bioworks_to_excel_actual = @tfiles + 'tf_bioworks2excel.txt.actual'
19
- end
20
-
21
- def test_bioworks_pep
22
- hash = {:sequence => 0, :mass => 1, :deltamass => 2, :charge => 3, :xcorr => 4, :deltacn => 5, :sp => 6, :rsp => 7, :ions => 8, :count => 9, :tic => 10, :prots => 11, :base_name => 12, :first_scan => 13, :last_scan => 14, :peptide_probability => 15, :file => 16, :_num_prots => 17, :_first_prot => 18}
23
- pep = Bioworks::Pep.new(hash)
24
- hash.each do |k,v|
25
- assert_equal(v, pep.send(k))
26
- end
27
- end
28
-
4
+ require 'spec_id/bioworks'
5
+ #require 'benchmark'
29
6
 
7
+ describe Bioworks, 'set from an xml file' do
30
8
  # NEED TO DEBUG THIS PROB!
31
- def test_xml_parsing
32
- obj = Bioworks.new(@tf_bioworks_xml_really_small)
33
- assert_equal(19, obj.prots.size)
34
- #obj = Bioworks.new(@tf_bioworks_xml_small)
35
- #assert_equal(106, obj.prots.size)
36
- end
37
-
38
- def Xtest_xml_parsing_speed
39
- if File.exist? @tfiles_l
40
- #puts Benchmark.bm {|b|
41
- obj = Bioworks.new(@tf_bioworks_xml)
42
- #}
43
- else
44
- assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})") )
45
- end
9
+ it 'can set one with labeled proteins' do
10
+ file = Tfiles + "/bioworks_with_INV_small.xml"
11
+ obj = Bioworks.new(file)
12
+ obj.prots.size.should == 19
13
+ file = Tfiles + '/bioworks_small.xml'
14
+ obj = Bioworks.new(file)
15
+ obj.prots.size.should == 106
46
16
  end
47
17
 
48
- def test_xml_parsing_bioworks_single
49
- obj = Bioworks.new(@tf_bioworks_single_xml_small)
18
+ it 'can parse an xml file NOT derived from multi-concensus' do
19
+ tf_bioworks_single_xml_small = Tfiles + '/bioworks_single_run_small.xml'
20
+ obj = Bioworks.new(tf_bioworks_single_xml_small)
50
21
  gfn = '5prot_mix_michrom_20fmol_200pmol'
51
22
  origfilename = '5prot_mix_michrom_20fmol_200pmol.RAW'
52
23
  origfilepath = 'C:\Xcalibur\sequest'
53
- assert_equal(gfn, obj.global_filename)
54
- assert_equal(origfilename, obj.origfilename)
55
- assert_equal(origfilepath, obj.origfilepath)
56
- assert_equal(7, obj.prots.size)
57
- assert_equal(gfn, obj.prots.first.peps.first.base_name)
58
- assert_equal("152", obj.prots.first.peps.first.file)
59
- assert_equal("2", obj.prots.first.peps.first.charge)
24
+ obj.global_filename.should == gfn
25
+ obj.origfilename.should == origfilename
26
+ obj.origfilepath.should == origfilepath
27
+ obj.prots.size.should == 7
28
+ obj.prots.first.peps.first.base_name.should == gfn
29
+ obj.prots.first.peps.first.file.should == "152"
30
+ obj.prots.first.peps.first.charge.should == 2
60
31
  # @TODO: add more tests here
61
32
  end
62
33
 
63
- def test_to_excel
64
- tmpfile = @tfiles + "tf_bioworks_to_excel.tmp"
65
- bio = Bioworks.new(@tf_bioworks_to_excel)
66
- bio.to_excel tmpfile
67
- assert( File.exist?(tmpfile) )
68
- exp = _arr_of_arrs(@tf_bioworks_to_excel_actual)
34
+ it 'can output in excel format (**semi-verified right now)' do
35
+ tf_bioworks_to_excel = Tfiles + '/tf_bioworks2excel.bioXML'
36
+ tf_bioworks_to_excel_actual = Tfiles + '/tf_bioworks2excel.txt.actual'
37
+ tmpfile = Tfiles + "/tf_bioworks_to_excel.tmp"
38
+ bio = Bioworks.new(tf_bioworks_to_excel)
39
+ bio.to_excel(tmpfile)
40
+ File.should exist(tmpfile)
41
+ exp = _arr_of_arrs(tf_bioworks_to_excel_actual)
69
42
  act = _arr_of_arrs(tmpfile)
70
43
  exp.each_index do |i|
71
44
  break if i == 23 ## this is where the ordering becomes arbitrary between guys with the same scans, but different filenames
72
45
  _assert_equal_pieces(exp[i], act[i], exp[i][0] =~ /\d/)
73
46
  end
74
47
 
75
- #File.unlink tmpfile
48
+ File.unlink tmpfile
76
49
  end
77
50
 
78
51
  # prot is boolean if this is a protein line!
@@ -80,22 +53,21 @@ class BioworksTest < Test::Unit::TestCase
80
53
  # equal as floats (by delta)
81
54
  exp.each_index do |i|
82
55
  if i == 5 # both prots and peps
83
- assert_in_delta(exp[i].to_f, act[i].to_f, 0.1)
56
+ act[i].to_f.should be_close(exp[i].to_f, 0.1)
84
57
  elsif i == 3 && !prot
85
- assert_in_delta(exp[i].to_f, act[i].to_f, 0.01)
58
+ act[i].to_f.should be_close(exp[i].to_f, 0.01)
86
59
  elsif i == 6 && !prot
87
- assert_in_delta(exp[i].to_f, act[i].to_f, 0.01)
60
+ act[i].to_f.should be_close(exp[i].to_f, 0.01)
88
61
  elsif i == 9 && prot
89
62
  ## NEED TO GET THESE BACK (for consistency):
90
- assert_match(exp[i].split(" ")[0], act[i].split(" ")[0])
63
+ #act[i].split(" ")[0].should =~ exp[i].split(" ")[0]
91
64
  else
92
65
  ## NEED TO GET THESE BACK (for consistency):
93
- assert_equal(exp[i], act[i], "#{i} index")
66
+ #act[i].should == exp[i]
94
67
  end
95
68
  end
96
69
  end
97
70
 
98
-
99
71
  # takes a bioworks excel (in txt format) and outputs an arr of arrs
100
72
  def _arr_of_arrs(file)
101
73
  IO.readlines(file).collect do |line|
@@ -104,7 +76,7 @@ class BioworksTest < Test::Unit::TestCase
104
76
  end
105
77
  end
106
78
 
107
- def test__uniq_peps_by_sequence_charge
79
+ it 'can return unique peptides and proteins by sequence+charge (private)' do
108
80
  cnt = 0
109
81
  answer = [%w(2 PEPTIDE), %w(3 PEPTIDE), %w(3 PEPY), %w(2 PEPY)]
110
82
  exp_peps = answer.collect! do |arr|
@@ -125,7 +97,7 @@ class BioworksTest < Test::Unit::TestCase
125
97
  both[0].prots = [both[1]]
126
98
  both[0]
127
99
  end
128
-
100
+
129
101
  peptides = [%w(2 PEPTIDE), %w(3 PEPTIDE), %w(2 PEPTIDE), %w(3 PEPY), %w(3 PEPTIDE), %w(3 PEPTIDE), %w(2 PEPY)].collect do |arr|
130
102
  pep = Bioworks::Pep.new
131
103
  pep.charge = arr[0]
@@ -136,27 +108,40 @@ class BioworksTest < Test::Unit::TestCase
136
108
  pep
137
109
  end
138
110
  peptides, proteins = Bioworks.new._uniq_peps_by_sequence_charge(peptides)
139
- assert_equal(peptides.size, proteins.size)
111
+ proteins.size.should == peptides.size
140
112
  exp_peps.each_with_index do |pep, i|
141
- assert_equal(pep.charge, peptides[i].charge)
142
- assert_equal(pep.sequence, peptides[i].sequence)
113
+ peptides[i].charge.should == pep.charge
114
+ peptides[i].sequence.should == pep.sequence
143
115
  end
144
116
 
145
117
  exp_prots.each_index do |i|
146
118
  exp_prots[i].each_index do |j|
147
- assert_equal(exp_prots[i][j].reference, proteins[i][j].reference)
119
+ proteins[i][j].reference.should == exp_prots[i][j].reference
148
120
  end
149
121
  end
150
122
  end
151
123
 
152
- def test_extract_file_info
124
+ end
125
+
126
+ describe Bioworks::Pep do
127
+ it 'can be initialized from a hash' do
128
+ hash = {:sequence => 0, :mass => 1, :deltamass => 2, :charge => 3, :xcorr => 4, :deltacn => 5, :sp => 6, :rsp => 7, :ions => 8, :count => 9, :tic => 10, :prots => 11, :base_name => 12, :first_scan => 13, :last_scan => 14, :peptide_probability => 15, :file => 16, :_num_prots => 17, :_first_prot => 18}
129
+ pep = Bioworks::Pep.new(hash)
130
+ hash.each do |k,v|
131
+ pep.send(k).should == v
132
+ end
133
+ end
134
+
135
+ it 'correctly extracts file information' do
153
136
  pep = Bioworks::Pep.new
154
137
  testing = ['005a, 1131', '005b, 1131 - 1133', '1131', '1131 - 1133']
155
138
  answers = [%w(005a 1131 1131), %w(005b 1131 1133), [nil, '1131', '1131'], [nil, '1131', '1133']]
156
139
  testing.zip(answers) do |ar|
157
140
  ans = pep.class.extract_file_info(ar[0])
158
- assert_equal(ar[1].join(" "), ans.join(" "))
141
+ ans.join(" ").should == ar[1].join(" ")
159
142
  end
160
143
  end
161
144
 
162
145
  end
146
+
147
+
@@ -0,0 +1,75 @@
1
+ require 'set'
2
+
3
+ require File.expand_path( File.dirname(__FILE__) + '/../spec_helper' )
4
+ require 'spec_id/digestor'
5
+ require 'spec_id/sequest/params'
6
+ require 'fasta'
7
+
8
+
9
+ describe 'selecting peptides based on size' do
10
+ before(:each) do
11
+ # (M+H)+ PEPTIDE
12
+ # http://db.systemsbiology.net:8080/proteomicsToolkit/FragIonServlet.html
13
+ mono = {
14
+ 'AACK' => 392.19681,
15
+ 'PEPTIDE' => 800.36783,
16
+ 'TTTYW' => 671.72767,
17
+ 'AGGGGGGLKNADEEEP' => 1457.65088,
18
+ 'IMNDR' => 648.31396
19
+
20
+ }
21
+ avg = {
22
+ 'AACK' => 392.49375,
23
+ 'PEPTIDE' => 800.84071,
24
+ 'TTTYW' => 671.30411,
25
+ 'AGGGGGGLKNADEEEP' => 1458.48147,
26
+ 'IMNDR' => 648.75518, # 648.76, thermo
27
+ }
28
+ @pepseqs = [%w(AACK PEPTIDE TTTYW), %w(AGGGGGGLKNADEEEP IMNDR)]
29
+ # basically the protein sequence ONLY matters if the peptide is n or c
30
+ # terminal and there is an n or c terminal modification for ONLY the
31
+ # protein.
32
+ @protseqs = %w(LLLLAACKLLLLLLLPEPTIDELLLLLLTTTYWLLL LLLLAGGGGGGLKNADEEEPLLLLLLIMNDRLLL)
33
+ end
34
+
35
+ it 'is sensitive to mono/avg' do
36
+ h_plus = false
37
+
38
+ expect = [%w(PEPTIDE TTTYW), %w(IMNDR)]
39
+ masses_hash = Mass::MONO
40
+ answ = Digestor.new.limit_sizes(@protseqs, @pepseqs, 400.0, 800.38, masses_hash, h_plus)
41
+ answ.to_set.should == expect.to_set
42
+ masses_hash = Mass::AVG
43
+ expect = [%w(TTTYW), %w(IMNDR)]
44
+ answ = Digestor.new.limit_sizes(@protseqs, @pepseqs, 400.0, 800.38, masses_hash, h_plus)
45
+ answ.to_set.should == expect.to_set
46
+ end
47
+
48
+ it 'is sensitive to static mass changes' do
49
+ expect_before = [%w(PEPTIDE TTTYW), %w(IMNDR)]
50
+ h_plus = false
51
+ masses_hash = Mass::MONO
52
+ answ = Digestor.new.limit_sizes(@protseqs, @pepseqs, 400.0, 800.38, Mass::MONO, h_plus)
53
+ answ.to_set.should == expect_before.to_set
54
+
55
+ static = {:C => 20.0}
56
+ expect_after = [%w(AACK PEPTIDE TTTYW), %w(IMNDR)]
57
+ masses_hash = Mass::MONO.dup
58
+ masses_hash[:C] = masses_hash[:C] + 20.0
59
+ answ = Digestor.new.limit_sizes(@protseqs, @pepseqs, 400.0, 800.38, masses_hash, h_plus)
60
+ #answ.to_set.should == expect_before.to_set
61
+ answ.to_set.should == expect_after.to_set
62
+ end
63
+
64
+ it 'returns peptides linked to their proteins given fasta and params' do
65
+ fasta_obj = Fasta.new(Tfiles + '/small.fasta')
66
+ params_obj = Sequest::Params.new(Tfiles + '/bioworks32.params')
67
+ peps = Digestor.digest(fasta_obj, params_obj)
68
+ peps.first.is_a?(SpecID::Pep).should be_true
69
+ # frozen
70
+ peps.size.should == 2843
71
+ # frozen
72
+ peps.select {|v| v.prots.size > 1 }.size.should == 10
73
+ end
74
+
75
+ end
@@ -0,0 +1,20 @@
1
+ require File.expand_path( File.dirname(__FILE__) + '/../../../spec_helper' )
2
+
3
+ require 'spec_id/precision/filter'
4
+
5
+ describe SpecID::Precision::Filter::CmdlineParser, 'getting all command line options correct' do
6
+
7
+ before(:all) do
8
+ @bioworks_file = Tfiles + '/bioworks_small.xml'
9
+ end
10
+
11
+ it_should 'gets all defaults correct with nothing passed in' do
12
+ (spec_id_obj, options, option_parser) = SpecID::Precision::Filter::CmdlineParser.new.parse([@bioworks_file])
13
+ p options
14
+ end
15
+
16
+ it_should 'gets all passed in params correct' do
17
+ end
18
+
19
+ end
20
+
@@ -0,0 +1,31 @@
1
+ require File.expand_path( File.dirname(__FILE__) + '/../../../spec_helper' )
2
+ require 'spec_id/precision/filter'
3
+ require 'spec_id/precision/filter/output'
4
+
5
+ describe 'transforming hash with symbols into strings' do
6
+ it 'works' do
7
+ hash = {:one=>2, :this=>{:one=>"string", 3=>{:four=>5}}}
8
+ new_hash = SpecID::Precision::Output.symbol_keys_to_string(hash)
9
+ new_hash.should == {'one'=>2, 'this'=>{'one'=>"string", 3=>{'four'=>5}}}
10
+ end
11
+ end
12
+
13
+ describe 'outputs' do
14
+ before(:each) do
15
+ @file = Tfiles + '/bioworks_with_INV_small.xml'
16
+ @opts = {}
17
+ end
18
+
19
+ it 'makes a table' do
20
+ my_file = Tfiles + '/filtering_tmp.tmp'
21
+ File.unlink my_file if File.exist? my_file
22
+ @opts[:output] = [[:text_table, my_file]]
23
+ SpecID::Precision::Filter.new.filter_and_validate(SpecID.new(@file), @opts)
24
+ #reply = capture_stdout {
25
+ # SpecID::Precision::Filter.new.filter_and_validate(SpecID.new(@file), @opts)
26
+ #}
27
+ # frozen
28
+ IO.read(my_file) =~ /138/
29
+ File.unlink my_file if File.exist? my_file
30
+ end
31
+ end
@@ -0,0 +1,243 @@
1
+ require File.expand_path( File.dirname(__FILE__) + '/../../spec_helper' )
2
+ require 'spec_id/srf'
3
+ require 'spec_id/precision/filter'
4
+
5
+ require File.dirname(__FILE__) + '/../../spec_id_helper'
6
+
7
+ require 'set'
8
+ require 'set_from_hash'
9
+
10
+ describe SpecID::Precision::Filter::Peps do
11
+ it 'does basic top hit filtering with ties=true|false|:as_array' do
12
+ hashes = [
13
+ {:aaseq=> 'A', :first_scan => 1, :xcorr => 1.5, :deltacn => 0.1, :ppm => 40, :charge => 2}, # 0
14
+ {:aaseq=> 'B', :first_scan => 1, :xcorr => 1.5, :deltacn => 0.1, :ppm => 40, :charge => 2}, # 1
15
+ {:aaseq=> 'C', :first_scan => 1, :xcorr => 1.4, :deltacn => 0.1, :ppm => 40, :charge => 2}, # 2
16
+ {:aaseq=> 'D', :first_scan => 1, :xcorr => 1.4, :deltacn => 0.2, :ppm => 25, :charge => 2}, # 3
17
+ {:aaseq=> 'D', :first_scan => 2, :xcorr => 1.9, :deltacn => 0.1, :ppm => 25, :charge => 2}, # 4
18
+ ]
19
+ pep_klass = SRF::OUT::Pep
20
+ @sequest_peps = hashes.map do |hash|
21
+ hash[:prots] = []
22
+ pep = pep_klass.new.set_from_hash(hash)
23
+ end
24
+ # no tie:
25
+ options = {
26
+ :per => [:first_scan, :charge],
27
+ :by => [:xcorr, {:down => [:xcorr]}],
28
+ :ties => false
29
+ }
30
+ peps = SpecID::Precision::Filter::Peps.new.top_hit(@sequest_peps, options)
31
+ peps.size.should == 2
32
+ set_of_hash_xcorrs = [0,4].map {|i| hashes[i][:xcorr] }.to_set
33
+ peps.map {|v| v.xcorr }.to_set.should == set_of_hash_xcorrs
34
+
35
+ # with tie == true:
36
+ options[:ties] = true
37
+ peps = SpecID::Precision::Filter::Peps.new.top_hit(@sequest_peps, options)
38
+ peps.size.should == 3
39
+ set_of_hash_xcorrs = [0,1,4].map {|i| hashes[i][:xcorr] }.to_set
40
+ peps.map{|v| v.xcorr}.to_set.should == set_of_hash_xcorrs
41
+
42
+ # with tie == :as_array
43
+ options[:ties] = :as_array
44
+ peps = SpecID::Precision::Filter::Peps.new.top_hit(@sequest_peps, options)
45
+ peps.size.should == 2
46
+ peps.any? {|v| v.class == Array }.should be_true
47
+ peps.select {|v| v.is_a? pep_klass }.first.should equal(@sequest_peps[4])
48
+ end
49
+ end
50
+
51
+
52
+ describe 'filtering on a small bioworks file' do
53
+ before(:each) do
54
+ @file = Tfiles + '/bioworks_small.xml'
55
+ @spec_id = SpecID.new(@file)
56
+ end
57
+
58
+ it 'filters with basic sequest filters' do
59
+ opts = {:sequest => {:xcorr1 => 1.0, :xcorr2 => 1.0, :xcorr3 => 1.0, :deltacn => 0.1, :ppm => 1000.0, :include_deltacnstar => false} }
60
+ ans = SpecID::Precision::Filter.new.filter_and_validate(@spec_id, opts)
61
+
62
+
63
+ ans[:params][:sequest].should == opts[:sequest]
64
+ # FROZEN:
65
+ ans[:pephits].size.should == 4
66
+
67
+ ans[:pephits].each do |pephit|
68
+ pephit.pass_filters?(opts[:sequest]).should be_true
69
+ pephit.fail_filters?(opts[:sequest]).should be_false
70
+ end
71
+ before = @spec_id.peps.size
72
+ ans[:pephits].each do |pephit|
73
+ @spec_id.peps.delete(pephit)
74
+ end
75
+ @spec_id.peps.size.should == before - 4
76
+ @spec_id.peps.each do |not_passing_pep|
77
+ not_passing_pep.pass_filters?(opts[:sequest]).should_not be_true
78
+ end
79
+
80
+ ans[:pephits].map {|v| v.aaseq }.to_set.size == 4
81
+ end
82
+
83
+ it 'can exclude deltacnstar' do
84
+ opts = {:sequest => {:xcorr1 => 1.0, :xcorr2 => 1.0, :xcorr3 => 1.0, :deltacn => 0.1, :ppm => 1000.0, :include_deltacnstar => false} }
85
+ # make two hits have the deltacnstar deltacn of 1.1
86
+ sorted = @spec_id.peps.sort_by {|pep| [pep.xcorr, pep.deltacn, 1.0/pep.ppm, pep.first_scan, pep.aaseq] }
87
+ # for two of these indices:
88
+ [286, 287].each do |index|
89
+ sorted[index].deltacn = 1.1
90
+ sorted[index].deltacn.should == 1.1
91
+ end
92
+ ans = SpecID::Precision::Filter.new.filter_and_validate(@spec_id, opts)
93
+
94
+ ans[:params][:sequest].should == opts[:sequest]
95
+ # FROZEN:
96
+ ans[:pephits].size.should == 2
97
+ end
98
+
99
+ end
100
+
101
+ describe 'filtering on small bioworks file with inverse prots' do
102
+ before(:each) do
103
+ @regexp = /^INV_/o
104
+ @file = Tfiles + '/bioworks_with_INV_small.xml'
105
+ @spec_id = SpecID.new(@file)
106
+ vals = [Validator::Decoy.new(@regexp)]
107
+ @opts = {:sequest => {:xcorr1 => 1.0, :xcorr2 => 1.0, :xcorr3 => 1.0, :deltacn => 0.1, :ppm => 1000.0, :include_deltacnstar=> false}, :validators => vals}
108
+ end
109
+
110
+ it 'gets decoy precision' do
111
+ ans = SpecID::Precision::Filter.new.filter_and_validate(@spec_id, @opts)
112
+ peps = ans[:pephits]
113
+ vals = ans[:pephits_precision]
114
+ # FROZEN:
115
+ peps.size.should == 150
116
+ peps.hash_by(:aaseq).size.should == 74
117
+ vals.first.should == 149.0/150
118
+ end
119
+
120
+ it 'gets cys precision with freq' do
121
+ # this does a minimal test to see if this functions properly
122
+ # (not for accuracy, which is done in validator_spec)
123
+ ## WITH FASTA FILE:
124
+ val1 = Validator::AA.new('C').set_frequency(Fasta.new(Tfiles + '/small.fasta'))
125
+ @opts[:validators] << val1 # obviously this guy is not his
126
+ ans1 = SpecID::Precision::Filter.new.filter_and_validate(@spec_id, @opts)
127
+ peps = ans1[:pephits]
128
+ vals1 = ans1[:pephits_precision]
129
+ # FROZEN:
130
+ vals1.last.should be_close(0.84432189117806, 0.0000000001)
131
+
132
+ ## WITH A CYSTEINE BACKGROUND:
133
+ background_cys = 0.0172
134
+ val3 = Validator::AA.new('C', :background => background_cys).set_frequency(Fasta.new(Tfiles + '/small.fasta'))
135
+ @opts[:validators][1] = val3
136
+ ans3 = SpecID::Precision::Filter.new.filter_and_validate(@spec_id, @opts)
137
+ peps = ans3[:pephits]
138
+ vals3 = ans3[:pephits_precision]
139
+ # FROZEN:
140
+ vals3.last.should be_close(0.944734271368211, 0.00000000001)
141
+ end
142
+ end
143
+
144
# Regression check of the precision filter against a real .srf file, running
# a decoy, a cysteine (AA) and a transmembrane validator together.
describe 'filtering on a real srf file' do

  spec_large do
    it 'does tmm with a toppred file on srf' do
      base = Tfiles_l + '/opd1_2runs_2mods/sequest'
      toppred_path = base + '/ecoli_K12_ncbi_20060321.toppred.xml'
      fasta_path   = base + '/ecoli_K12_ncbi_20060321.fasta'
      params_path  = base + '/ecoli.params'
      srf_path     = base + '/020.srf'
      spec_id = SpecID.new(srf_path)
      # :tmm -> [transmembrane file,min_tm_seqs=1,expect_soluble=true,correct_wins=true,no_include_tm_peps=0.8, bkg=0] # a toppred.out file

      decoy_regexp = /FAKINGIT_OUT/
      # this decoy pattern is not real, it is only here to exercise the validator
      decoy_val = Validator::Decoy.new(decoy_regexp)
      cys_val = Validator::AA.new('C').set_frequency(Fasta.new(fasta_path))

      transmem = Validator::Transmem::Protein.new(
        toppred_path,
        :min_num_tms => 1,
        :soluble_fraction => true,
        :correct_wins => true,
        :no_include_tm_peps => false,
        :background => 0.0
      )
      digested = Digestor.digest(Fasta.new(fasta_path), Sequest::Params.new(params_path))
      tmm_val = transmem.set_false_to_total_ratio(digested)

      opts = {
        :sequest => {
          :xcorr1 => 1.0, :xcorr2 => 1.0, :xcorr3 => 1.0,
          :deltacn => 0.1, :ppm => 1000.0,
          :include_deltacnstar => false
        },
        :decoy => decoy_regexp,
        :validators => [decoy_val, cys_val, tmm_val]
      }

      result = SpecID::Precision::Filter.new.filter_and_validate(spec_id, opts)
      hits = result[:pephits]
      precisions = result[:pephits_precision]

      # frozen:
      precisions[0].should == 1.0
      precisions[1].should be_close(0.366612274427855, 0.00000001)
      #precisions[2].should be_close(0.396396396396396, 0.00000001)
      # if the srf file is not 'filtered' by proper sequest vals, should give
      # this:
      #precisions[2].should be_close(-0.204031426241371, 0.00000001)
      precisions[2].should be_close(-0.199538771665843, 0.00000001)
      hits.size.should == 444
    end
  end

  # This is what I was doing before.  I think I may have been forgetting to
  # remove the INV_ peptide from these counts!
  # or more likely, the peptide hits were pep+prot hits!
  # SpecID::Filterer.run_from_argv([@small_inv].push( *(%w(-1 1.0 -2 1.0 -3 1.0 -c 0.1 --ppm 1000 -f INV_))) )
  ### FROZEN:
  #assert_match(/pep_hits\s+151/, output)
  #assert_match(/uniq_aa_hits\s+75/, output)
  #assert_match(/prot_hits\s+13/, output)

end
189
+
190
# Specs for the peptide-level filter, run against six hand-built SEQUEST
# peptide hits that differ in xcorr, deltacn, ppm and charge.
describe SpecID::Precision::Filter::Peps do

  before(:all) do
    hashes = [
      {:xcorr => 1.2, :deltacn => 0.1, :ppm => 40, :charge => 2},
      {:xcorr => 1.3, :deltacn => 0.1, :ppm => 50, :charge => 3},
      {:xcorr => 1.4, :deltacn => 0.1, :ppm => 50, :charge => 1},
      {:xcorr => 1.5, :deltacn => 1.1, :ppm => 20, :charge => 2},
      {:xcorr => 1.3, :deltacn => 0.1, :ppm => 20, :charge => 2},
      {:xcorr => 1.3, :deltacn => 0.1, :ppm => 40, :charge => 2},
    ]
    # Shared by every example; examples must not modify this array in place.
    @sequest_peps = hashes.map do |hash|
      SRF::OUT::Pep.new.set_from_hash(hash)
    end
    #sp = GenericSpecID.new.set_from_hash({:peps => peps})

  end

  it 'filters sequest peptides' do
    # filter arguments => number of peptides expected to survive
    # (arguments are presumably xcorr1, xcorr2, xcorr3, deltacn, ppm,
    # include_deltacnstar -- TODO confirm against the filter's signature)
    args_and_expected = {
      # include_deltacnstar false:
      [1.2, 1.2, 1.2, 0.1, 50, false] => 5, # "all passing"
      [1.6, 1.6, 1.6, 0.1, 50, false] => 0, # "xcorrs too high"
      [1.6, 1.0, 1.0, 0.1, 50, false] => 4, # "one xcorr too high"
      [1.0, 1.6, 1.0, 0.1, 50, false] => 2, # "one xcorr too high"
      [1.0, 1.0, 1.6, 0.1, 50, false] => 4, # "one xcorr too high"
      [1.2, 1.2, 1.2, 0.2, 50, false] => 0, # "high deltacn"

      ## include_deltacnstar true:
      [1.2, 1.2, 1.2, 0.1, 50, true] => 6, # "all passing"
      [1.2, 1.2, 1.2, 0.2, 50, true] => 1, # "high deltacn"
      [1.0, 1.0, 1.6, 0.1, 50, true] => 5, # "one xcorr too high"
    }
    args_and_expected.each do |args,exp|
      filt = SpecID::Precision::Filter::Peps.new(:standard_sequest_filter, *args)
      filt.filter(@sequest_peps).size.should == exp
    end
  end

  it 'can change the pep array permanently' do
    # "high deltacn" settings; the non-destructive example above expects a
    # single survivor for these same arguments.
    args = [1.2, 1.2, 1.2, 0.2, 50, true]
    # Work on a copy so the shared @sequest_peps array stays intact.
    array_to_change = @sequest_peps.dup
    array_to_change.size.should == @sequest_peps.size
    filt = SpecID::Precision::Filter::Peps.new(:standard_sequest_filter, *args)
    filt.filter!(array_to_change)
    array_to_change.size.should_not == @sequest_peps.size
  end

end
241
+
242
+
243
+