mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -1,310 +1,354 @@
1
-
2
-
3
- require File.dirname(File.expand_path(__FILE__)) + '/load_bin_path'
4
- require 'test/unit'
5
- require 'fasta'
6
- require 'assert_files'
7
- require 'sample_enzyme'
8
- require 'set'
9
-
10
-
11
- module Test::Unit::Assertions
12
- @@file_display_length = 10000
13
- end
14
-
15
- class FastaTest < Test::Unit::TestCase
16
- NODELETE = false
17
-
18
- def initialize(arg)
19
- super(arg)
20
-
21
- @cat_shuffle_postfix = Fasta::CAT_SHUFF_FILE_POSTFIX
22
- @connector = Fasta::FILE_CONNECTOR
23
- @shuff_prefix = Fasta::SHUFF_PREFIX
24
- @inv_prefix = Fasta::SHUFF_PREFIX
25
- @shuff_ext = Fasta::SHUFF_FILE_POSTFIX
26
- @inv_ext = Fasta::INV_FILE_POSTFIX
27
-
28
- @tfiles = File.dirname(__FILE__) + '/tfiles/'
29
- @base_cmd = "ruby -I #{File.join(File.dirname(__FILE__), "..", "lib")} -S "
30
- @fasta_mod_cmd = @base_cmd + "fasta_mod.rb "
31
- @fasta_cat_mod_cmd = @base_cmd + "fasta_cat_mod.rb "
32
- @fasta_cat_cmd = @base_cmd + "fasta_cat.rb "
33
- @sf = @tfiles + "small.fasta"
34
- @sf_shuffle = @tfiles + "small#{@shuff_ext}.fasta"
35
- @sf_invert = @tfiles + "small#{@inv_ext}.fasta"
36
- @sf_cat = @tfiles + "small__small_SHUFF.fasta"
37
- @sf_cat_mod = @tfiles + "small#{@cat_shuffle_postfix}.fasta"
38
- @mf = @tfiles + "messups.fasta"
39
- end
40
-
41
- def test_read_file
42
- obj = Fasta.new.read_file(@sf)
43
- @tmpfile = @tfiles + "tmp.tmp"
44
- obj.write_file(@tmpfile)
45
- assert_not_equal_file_content(@tmpfile, @sf)
46
- obj2 = Fasta.new.read_file(@tmpfile)
47
- File.unlink(@tmpfile)
48
- assert_equal(obj, obj2)
49
- end
50
-
51
- def test_cat
52
- obj = Fasta.new.read_file(@sf)
53
- first_size = obj.prots.size
54
- obj << obj
55
- assert_equal(2, obj.prots.size/first_size)
56
- end
57
-
58
- def test_dup
59
- obj = Fasta.new.read_file(@sf)
60
- objd = obj.dup
61
- obj_prots = obj.prots
62
- objd.prots.each do |prot|
63
- assert(obj_prots.include?(prot))
64
- end
65
- end
66
-
67
- def test_prefix_extension
68
- assert('f_howdy.ext', Fasta.prefix_extension('f.ext', '_howdy'))
69
- assert('f.ext_howdy.ext', Fasta.prefix_extension('f.ext.ext', '_howdy'))
70
- end
71
-
72
- def test_cat_filenames
73
- assert('f1f2.ext1', Fasta.cat_filenames(['f1.ext1', 'f2.ext2']))
74
- assert('f1__f2.ext1', Fasta.cat_filenames(['f1.ext1', 'f2.ext2'], '__'))
75
- end
76
-
77
- =begin
78
- def test_mod
79
- ## Testing shuffle:
80
- `#{@fasta_mod_cmd + 'shuffle ' + @sf}`
81
- assert(File.exist?(@sf_shuffle), "output file #{@sf_shuffle} exists")
82
- ob1 = Fasta.new.read_file(@sf)
83
- ob2 = Fasta.new.read_file(@sf_shuffle)
84
- assert_not_equal_file_content(@sf_shuffle, @sf)
85
- File.unlink @sf_shuffle
86
- assert(_same_headers?(ob1,ob2))
87
- assert(_are_shuffled?(ob1,ob2))
88
-
89
- ## Testing invert:
90
- `#{@fasta_mod_cmd + 'invert ' + @sf}`
91
- assert(File.exist?(@sf_invert), "output file #{@sf_invert} exists")
92
- ob1 = Fasta.new.read_file(@sf)
93
- ob2 = Fasta.new.read_file(@sf_invert)
94
- assert_not_equal_file_content(@sf_invert, @sf)
95
- File.unlink(@sf_invert)
96
- assert(_same_headers?(ob1,ob2))
97
- assert(_are_inverted?(ob1,ob2))
98
-
99
- ## Testing prefix
100
- #puts "#{@fasta_mod_cmd + '-p _HELLO_ invert ' + @sf}"
101
- `#{@fasta_mod_cmd + 'invert -p _HELLO_ ' + @sf}` # NOT WORKING!
102
- assert(File.exist?(@sf_invert), "output file #{@sf_invert} exists")
103
- ob1 = Fasta.new.read_file(@sf)
104
- ob2 = Fasta.new.read_file(@sf_invert)
105
- assert(_are_inverted?(ob1,ob2))
106
- assert_equal(ob1.prots.size, IO.read(@sf_invert).scan(/>_HELLO_/).size)
107
- File.unlink(@sf_invert)
108
- end
109
- =end
110
-
111
- def test_gi
112
- gis = ['>gi|7427923|pir||PHRBG glycogen phosphorylase (EC 2.4.1.1), muscle - rabbit', '>sp|lollygag|helloyou', '>only has one bar | and thats it', 'notme|me|nome', '>lots|an|lots|of|bars|heehee']
113
- answ = ['7427923', 'lollygag', nil, 'me','an']
114
- actual = gis.map do |head|
115
- Fasta::Prot.new(head).gi
116
- end
117
- assert_equal(answ, actual, "testing gi regex")
118
- end
119
-
120
- ## IN PROGRESS:
121
- def Xtest_cat_mod
122
-
123
- assert(File.exist?(@sf), "prerequisite for cat tests")
124
-
125
- ## Single file to cat shuffle test
126
- puts `#{@fcat_mod_cmd + @sf}`
127
- assert(File.exist?(@sf_cat_single), "output file exists")
128
- ob1 = Fasta.new.read_file(@sf)
129
- ob2 = Fasta.new.read_file(@sf_cat_single)
130
- assert_equal(2, ob2.prots.size/ob1.prots.size)
131
- assert_equal(ob1.prots, ob2.prots[0, (ob1.prots.size)])
132
- assert_not_equal(ob1.prots, ob2.prots[(ob1.prots.size)..-1])
133
- File.unlink @sf_cat_single
134
- end
135
-
136
- ## IN PROGRESS:
137
- def Xtest_cat
138
-
139
- ## Concatenate files test:
140
- puts `#{@cat_cmd + @sf} -p ,#{@shuff_prefix} #{@sfn}`
141
- assert(File.exist?(@sf_cat), "output file #{@sf_cat} exists")
142
- ob1 = Fasta.new.read_file(@sf)
143
- ob2 = Fasta.new.read_file(@sfn)
144
- ob3 = Fasta.new.read_file(@sf_cat)
145
- assert_not_equal_file_content(@sf_cat, @sf)
146
- assert_not_equal_file_content(@sf_cat, @sfn)
147
- [@sfn,@sf_cat].each { |f| File.unlink f }
148
-
149
- ob2.header_prefix!(@shuff_prefix)
150
- ob3_prots = ob3.prots
151
- [ob1, ob2].each do |ob|
152
- ob.prots.each do |prot|
153
- unless ob3_prots.include? prot
154
- p prot
155
- flunk "protein not found in cat version"
156
- end
157
- end
158
- end
159
-
160
-
161
- # test catenation
162
- sfci = "small_CAT_INV.fasta"
163
- cat_inverted = @tfiles + sfci
164
- iccmd = @base_cmd + "fasta_cat_inverse.rb "
165
- cmd = iccmd + @sf
166
- puts `#{cmd}`
167
- assert(File.exist?(cat_inverted), "file #{cat_inverted} exists")
168
-
169
- norm = Fasta.new.read_file(@sf)
170
- cat_inv = Fasta.new.read_file(cat_inverted)
171
- File.unlink(cat_inverted)
172
-
173
- num_prots = norm.prots.size
174
- cat_norm_prots = cat_inv.prots[0, num_prots]
175
- cat_inv_prots = cat_inv.prots[num_prots..-1]
176
- norm.prots.each_with_index do |prot,i|
177
- assert_equal(prot.header, cat_norm_prots[i].header)
178
- assert_not_equal(prot.header, cat_inv_prots[i].header)
179
- assert_equal(prot.aaseq, cat_norm_prots[i].aaseq)
180
- assert_equal(prot.aaseq.reverse!, cat_inv_prots[i].aaseq)
181
- end
182
- end
183
-
184
- def test_invert_tryptic_peptides
185
- # FOR INDIVIDUAL PROTEINS:
186
- seq = 'ABCKCDERDEKDGEKWXYRRKDER'
187
- # tryptic = ABCK, CDER, DEK, DGEK, WXYR, R, K, DER
188
- tryp = SampleEnzyme.tryptic(seq)
189
- reverse_tryptic = %w(CBAK EDCR EDK EGDK YXWR R K EDR)
190
- prot = Fasta::Prot.new(nil, seq)
191
- prot.invert_tryptic_peptides!
192
- assert_equal(reverse_tryptic.join(''), prot.aaseq, "reversing tryptic peptides")
193
-
194
- seq = 'XYRABCD'
195
- prot = Fasta::Prot.new(nil, seq)
196
- prot.invert_tryptic_peptides!
197
- assert_equal('YXRDCBA', prot.aaseq, 'last peptide treated special')
198
-
199
- seq = 'XYRPABCD'
200
- prot = Fasta::Prot.new(nil, seq)
201
- prot.invert_tryptic_peptides!
202
- assert_equal('DCBAPRYX', prot.aaseq, 'with a proline')
203
-
204
- end
205
-
206
- def test_fraction_of_prots
207
- peps = [['>silly1', "PEPTIDE"], ['>silly2', "ANOTHER"], ['>silly3', "AGAIN"], ['>silly4', "LARMA"]]
208
- prots = peps.map do |header, seq|
209
- Fasta::Prot.new(header, seq)
210
- end
211
- f = Fasta.new(prots)
212
- # simple:
213
- n = f.fraction_of_prots(1.0)
214
- assert_equal(f.prots.map{|v| v.header }.to_set, n.prots.map{|v| v.header }.to_set, "same headers")
215
- assert_equal(f.prots.map{|v| v.aaseq }.to_set, n.prots.map{|v| v.aaseq }.to_set, "same aaseqs")
216
-
217
- pre = proc {|cnt| "SHUFF_f#{cnt}_" }
218
- # test prefix
219
- n = f.fraction_of_prots(1.0, pre)
220
- n.prots.each do |prot|
221
- assert_match(/^>SHUFF_f0_/, prot.header, "contains new prefix")
222
- end
223
-
224
- # smaller
225
- n = f.fraction_of_prots(0.75, pre)
226
- assert_equal(3, n.prots.size, "correct number of proteins")
227
- # bigger
228
- n = f.fraction_of_prots(2.5, pre)
229
- assert_equal(10, n.prots.size, "correct number of proteins")
230
- n.prots[0..3].each {|prt| assert_match(/^>SHUFF_f0_/, prt.header ) }
231
- n.prots[4..7].each {|prt| assert_match(/^>SHUFF_f1_/, prt.header ) }
232
- n.prots[8..9].each {|prt| assert_match(/^>SHUFF_f2_/, prt.header ) }
233
- # crazy
234
- n = f.fraction_of_prots(1.33, pre)
235
- assert_equal(6, n.prots.size, "correct number of proteins")
236
- end
237
-
238
- def test_inverted_tryptic_peptides_for_file
239
- # for a file:
240
- tmpfile = @tfiles + "fasta.tmp"
241
- fasta = Fasta.new.read_file(@sf)
242
- fasta.aaseq_invert_tryptic_peptides!
243
- fasta.write_file(tmpfile)
244
- lines = IO.readlines(tmpfile)
245
- #normal = 'MKRISTTITTTITITTGNGAG'
246
- inverted_tryptic = 'MKRGAGNGTTITITTTITTSI' ## ?????
247
- assert_equal(inverted_tryptic, lines[1].chomp)
248
- #normal = 'MATYLIGDVHGCYDELIALLHKVEFTPGKDTLWLTGDLVARGPGSLDVLRYVKSLGDSVRLVLGNHDLHL
249
- # LAVFAGISRNKPKDRLTPLLEAPDADELLNWLRRQPLLQIDEEKKLVMAHAGITPQWDLQTAKECARDVE
250
- # AVLSSDSYPFFLDAMYGDMPNNWSPELRGLGRLRFITNAFTRMRFCFPNGQLDMYSKESPEEAPAPLKPW
251
- # FAIPGPVAEEYSIAFGHWASLEGKGTPEGIYALDTGCCWGGTLTCLRWEDKQYFVQPSNRHKDLGEAAAS'
252
- inverted_tryptic = 'HLLAILEDYCGHVDGILYTAMKGPTFEVKAVLDGTLWLTDRLVDLSGPGRVYKVSDGLSRSIGAFVALLHLDHNGLVLRPKNKDRLWNLLEDADPAELLPTLRREEDIQLLPQKKATQLDWQPTIGAHAMVLKACERLEPSWNNPMDGYMADLFFPYSDSSLVAEVDRGLGRLRTFANTIFRMRSYMDLQGNPFCFKGELSAWHGFAISYEEAVPGPIAFWPKLPAPAEEPSEKLCTLTGGWCCGTDLAYIGEPTGRDEWKNSPQVFYQRHKSAAAEGLD'
253
- assert_equal(inverted_tryptic, lines[-1].chomp)
254
- File.unlink(tmpfile) unless NODELETE
255
- end
256
-
257
-
258
-
259
- ## HELPER ASSERTIONS:
260
-
261
- def _are_inverted?(obj1, obj2)
262
- obj2_prots = obj2.prots
263
- obj1.prots.each_with_index do |prot,i|
264
- if prot.aaseq.reverse != obj2_prots[i].aaseq
265
- return false
266
- end
267
- end
268
- return true
269
- end
270
-
271
- def _same_headers?(obj1, obj2)
272
- obj1.prots.each_with_index do |prot,ind|
273
- oprot = obj2.prots[ind]
274
- if prot.header != oprot.header
275
- return false
276
- end
277
- end
278
- return true
279
- end
280
-
281
- # true if all prot AA seq's are the same
282
- def _same_aaseqs?(obj1, obj2)
283
- obj2_prots = obj2.prots
284
- obj1.prots.each_with_index do |prot,i|
285
- if prot.aaseq != obj2_prots[i].aaseq
286
- return false
287
- end
288
- end
289
- return true
290
- end
291
-
292
- # for two parallel fasta objects, determines if the list of proteins
293
- # are shuffled by examining the proteins and asking of > 4 are different
294
- # returns true or false
295
- def _are_shuffled?(obj1, obj2)
296
- cnt = 0
297
- obj1.prots.each_with_index do |prot,ind|
298
- oprot = obj2.prots[ind]
299
- if prot.header == oprot.header && prot.aaseq != oprot.aaseq
300
- cnt += 1
301
- end
302
- end
303
- if cnt > 4
304
- return true
305
- else
306
- return false
307
- end
308
- end
309
-
310
- end
1
+ require File.expand_path( File.dirname(__FILE__) + '/spec_helper' )
2
+
3
+ require 'fasta'
4
+
5
+ Filestring = ">gi|P1
6
+ AMKRGAN
7
+ >gi|P2
8
+ CRGATKKTAGRPMEK
9
+ >gi|P3
10
+ PEPTIDE
11
+ "
12
+
13
+ class Fasta
14
+ def proteins?
15
+ (@prots.size > 0) and
16
+ @prots.first.is_a? Fasta::Prot
17
+ end
18
+ end
19
+
20
+ describe Fasta do
21
+
22
+ it 'can be set from a string' do
23
+ obj = Fasta.from_string(Filestring)
24
+ obj.is_a?(Fasta).should be_true
25
+ obj.proteins?.should be_true
26
+ obj.size.should == 3
27
+ matches_filestring(obj)
28
+ end
29
+
30
+ # given a fasta obj, asks if it matches filestring
31
+ def matches_filestring(obj)
32
+ heads = %w(>gi|P1 >gi|P2 >gi|P3)
33
+ seqs = %w(AMKRGAN CRGATKKTAGRPMEK PEPTIDE)
34
+ obj.zip(heads, seqs) do |prot, head, seq|
35
+ prot.header.should == head
36
+ prot.aaseq.should == seq
37
+ end
38
+ end
39
+
40
+ end
41
+
42
+ describe Fasta::Prot do
43
+
44
+ it 'can extract a gi code out of ncbi sequences' do
45
+ gis = ['>gi|7427923|pir||PHRBG glycogen phosphorylase (EC 2.4.1.1), muscle - rabbit', '>sp|lollygag|helloyou', '>only has one bar | and thats it', 'notme|me|nome', '>lots|an|lots|of|bars|heehee']
46
+ answ = ['7427923', 'lollygag', nil, 'me','an']
47
+ actual = gis.map do |head|
48
+ Fasta::Prot.new(head).gi
49
+ end
50
+ actual.should == answ
51
+ end
52
+ end
53
+
54
+ =begin
55
+
56
+ require File.dirname(File.expand_path(__FILE__)) + '/load_bin_path'
57
+ require 'test/unit'
58
+ require 'fasta'
59
+ require 'assert_files'
60
+ require 'sample_enzyme'
61
+ require 'set'
62
+
63
+
64
+ module Test::Unit::Assertions
65
+ @@file_display_length = 10000
66
+ end
67
+
68
+ class FastaTest < Test::Unit::TestCase
69
+ NODELETE = false
70
+
71
+ def initialize(arg)
72
+ super(arg)
73
+
74
+ @cat_shuffle_postfix = Fasta::CAT_SHUFF_FILE_POSTFIX
75
+ @connector = Fasta::FILE_CONNECTOR
76
+ @shuff_prefix = Fasta::SHUFF_PREFIX
77
+ @inv_prefix = Fasta::SHUFF_PREFIX
78
+ @shuff_ext = Fasta::SHUFF_FILE_POSTFIX
79
+ @inv_ext = Fasta::INV_FILE_POSTFIX
80
+
81
+ @tfiles = File.dirname(__FILE__) + '/tfiles/'
82
+ @base_cmd = "ruby -I #{File.join(File.dirname(__FILE__), "..", "lib")} -S "
83
+ @fasta_mod_cmd = @base_cmd + "fasta_mod.rb "
84
+ @fasta_cat_mod_cmd = @base_cmd + "fasta_cat_mod.rb "
85
+ @fasta_cat_cmd = @base_cmd + "fasta_cat.rb "
86
+ @sf = @tfiles + "small.fasta"
87
+ @sf_shuffle = @tfiles + "small#{@shuff_ext}.fasta"
88
+ @sf_invert = @tfiles + "small#{@inv_ext}.fasta"
89
+ @sf_cat = @tfiles + "small__small_SHUFF.fasta"
90
+ @sf_cat_mod = @tfiles + "small#{@cat_shuffle_postfix}.fasta"
91
+ @mf = @tfiles + "messups.fasta"
92
+ end
93
+
94
+ def test_read_file
95
+ obj = Fasta.new.read_file(@sf)
96
+ @tmpfile = @tfiles + "tmp.tmp"
97
+ obj.write_file(@tmpfile)
98
+ assert_not_equal_file_content(@tmpfile, @sf)
99
+ obj2 = Fasta.new.read_file(@tmpfile)
100
+ File.unlink(@tmpfile)
101
+ assert_equal(obj, obj2)
102
+ end
103
+
104
+ def test_cat
105
+ obj = Fasta.new.read_file(@sf)
106
+ first_size = obj.prots.size
107
+ obj << obj
108
+ assert_equal(2, obj.prots.size/first_size)
109
+ end
110
+
111
+ def test_dup
112
+ obj = Fasta.new.read_file(@sf)
113
+ objd = obj.dup
114
+ obj_prots = obj.prots
115
+ objd.prots.each do |prot|
116
+ assert(obj_prots.include?(prot))
117
+ end
118
+ end
119
+
120
+ def test_prefix_extension
121
+ assert('f_howdy.ext', Fasta.prefix_extension('f.ext', '_howdy'))
122
+ assert('f.ext_howdy.ext', Fasta.prefix_extension('f.ext.ext', '_howdy'))
123
+ end
124
+
125
+ def test_cat_filenames
126
+ assert('f1f2.ext1', Fasta.cat_filenames(['f1.ext1', 'f2.ext2']))
127
+ assert('f1__f2.ext1', Fasta.cat_filenames(['f1.ext1', 'f2.ext2'], '__'))
128
+ end
129
+
130
+ def test_mod
131
+ ## Testing shuffle:
132
+ `#{@fasta_mod_cmd + 'shuffle ' + @sf}`
133
+ assert(File.exist?(@sf_shuffle), "output file #{@sf_shuffle} exists")
134
+ ob1 = Fasta.new.read_file(@sf)
135
+ ob2 = Fasta.new.read_file(@sf_shuffle)
136
+ assert_not_equal_file_content(@sf_shuffle, @sf)
137
+ File.unlink @sf_shuffle
138
+ assert(_same_headers?(ob1,ob2))
139
+ assert(_are_shuffled?(ob1,ob2))
140
+
141
+ ## Testing invert:
142
+ `#{@fasta_mod_cmd + 'invert ' + @sf}`
143
+ assert(File.exist?(@sf_invert), "output file #{@sf_invert} exists")
144
+ ob1 = Fasta.new.read_file(@sf)
145
+ ob2 = Fasta.new.read_file(@sf_invert)
146
+ assert_not_equal_file_content(@sf_invert, @sf)
147
+ File.unlink(@sf_invert)
148
+ assert(_same_headers?(ob1,ob2))
149
+ assert(_are_inverted?(ob1,ob2))
150
+
151
+ ## Testing prefix
152
+ #puts "#{@fasta_mod_cmd + '-p _HELLO_ invert ' + @sf}"
153
+ `#{@fasta_mod_cmd + 'invert -p _HELLO_ ' + @sf}` # NOT WORKING!
154
+ assert(File.exist?(@sf_invert), "output file #{@sf_invert} exists")
155
+ ob1 = Fasta.new.read_file(@sf)
156
+ ob2 = Fasta.new.read_file(@sf_invert)
157
+ assert(_are_inverted?(ob1,ob2))
158
+ assert_equal(ob1.prots.size, IO.read(@sf_invert).scan(/>_HELLO_/).size)
159
+ File.unlink(@sf_invert)
160
+ end
161
+
162
+ ## IN PROGRESS:
163
+ def Xtest_cat_mod
164
+
165
+ assert(File.exist?(@sf), "prerequisite for cat tests")
166
+
167
+ ## Single file to cat shuffle test
168
+ puts `#{@fcat_mod_cmd + @sf}`
169
+ assert(File.exist?(@sf_cat_single), "output file exists")
170
+ ob1 = Fasta.new.read_file(@sf)
171
+ ob2 = Fasta.new.read_file(@sf_cat_single)
172
+ assert_equal(2, ob2.prots.size/ob1.prots.size)
173
+ assert_equal(ob1.prots, ob2.prots[0, (ob1.prots.size)])
174
+ assert_not_equal(ob1.prots, ob2.prots[(ob1.prots.size)..-1])
175
+ File.unlink @sf_cat_single
176
+ end
177
+
178
+ ## IN PROGRESS:
179
+ def Xtest_cat
180
+
181
+ ## Concatenate files test:
182
+ puts `#{@cat_cmd + @sf} -p ,#{@shuff_prefix} #{@sfn}`
183
+ assert(File.exist?(@sf_cat), "output file #{@sf_cat} exists")
184
+ ob1 = Fasta.new.read_file(@sf)
185
+ ob2 = Fasta.new.read_file(@sfn)
186
+ ob3 = Fasta.new.read_file(@sf_cat)
187
+ assert_not_equal_file_content(@sf_cat, @sf)
188
+ assert_not_equal_file_content(@sf_cat, @sfn)
189
+ [@sfn,@sf_cat].each { |f| File.unlink f }
190
+
191
+ ob2.header_prefix!(@shuff_prefix)
192
+ ob3_prots = ob3.prots
193
+ [ob1, ob2].each do |ob|
194
+ ob.prots.each do |prot|
195
+ unless ob3_prots.include? prot
196
+ p prot
197
+ flunk "protein not found in cat version"
198
+ end
199
+ end
200
+ end
201
+
202
+
203
+ # test catenation
204
+ sfci = "small_CAT_INV.fasta"
205
+ cat_inverted = @tfiles + sfci
206
+ iccmd = @base_cmd + "fasta_cat_inverse.rb "
207
+ cmd = iccmd + @sf
208
+ puts `#{cmd}`
209
+ assert(File.exist?(cat_inverted), "file #{cat_inverted} exists")
210
+
211
+ norm = Fasta.new.read_file(@sf)
212
+ cat_inv = Fasta.new.read_file(cat_inverted)
213
+ File.unlink(cat_inverted)
214
+
215
+ num_prots = norm.prots.size
216
+ cat_norm_prots = cat_inv.prots[0, num_prots]
217
+ cat_inv_prots = cat_inv.prots[num_prots..-1]
218
+ norm.prots.each_with_index do |prot,i|
219
+ assert_equal(prot.header, cat_norm_prots[i].header)
220
+ assert_not_equal(prot.header, cat_inv_prots[i].header)
221
+ assert_equal(prot.aaseq, cat_norm_prots[i].aaseq)
222
+ assert_equal(prot.aaseq.reverse!, cat_inv_prots[i].aaseq)
223
+ end
224
+ end
225
+
226
+ def test_invert_tryptic_peptides
227
+ # FOR INDIVIDUAL PROTEINS:
228
+ seq = 'ABCKCDERDEKDGEKWXYRRKDER'
229
+ # tryptic = ABCK, CDER, DEK, DGEK, WXYR, R, K, DER
230
+ tryp = SampleEnzyme.tryptic(seq)
231
+ reverse_tryptic = %w(CBAK EDCR EDK EGDK YXWR R K EDR)
232
+ prot = Fasta::Prot.new(nil, seq)
233
+ prot.invert_tryptic_peptides!
234
+ assert_equal(reverse_tryptic.join(''), prot.aaseq, "reversing tryptic peptides")
235
+
236
+ seq = 'XYRABCD'
237
+ prot = Fasta::Prot.new(nil, seq)
238
+ prot.invert_tryptic_peptides!
239
+ assert_equal('YXRDCBA', prot.aaseq, 'last peptide treated special')
240
+
241
+ seq = 'XYRPABCD'
242
+ prot = Fasta::Prot.new(nil, seq)
243
+ prot.invert_tryptic_peptides!
244
+ assert_equal('DCBAPRYX', prot.aaseq, 'with a proline')
245
+
246
+ end
247
+
248
+ def test_fraction_of_prots
249
+ peps = [['>silly1', "PEPTIDE"], ['>silly2', "ANOTHER"], ['>silly3', "AGAIN"], ['>silly4', "LARMA"]]
250
+ prots = peps.map do |header, seq|
251
+ Fasta::Prot.new(header, seq)
252
+ end
253
+ f = Fasta.new(prots)
254
+ # simple:
255
+ n = f.fraction_of_prots(1.0)
256
+ assert_equal(f.prots.map{|v| v.header }.to_set, n.prots.map{|v| v.header }.to_set, "same headers")
257
+ assert_equal(f.prots.map{|v| v.aaseq }.to_set, n.prots.map{|v| v.aaseq }.to_set, "same aaseqs")
258
+
259
+ pre = proc {|cnt| "SHUFF_f#{cnt}_" }
260
+ # test prefix
261
+ n = f.fraction_of_prots(1.0, pre)
262
+ n.prots.each do |prot|
263
+ assert_match(/^>SHUFF_f0_/, prot.header, "contains new prefix")
264
+ end
265
+
266
+ # smaller
267
+ n = f.fraction_of_prots(0.75, pre)
268
+ assert_equal(3, n.prots.size, "correct number of proteins")
269
+ # bigger
270
+ n = f.fraction_of_prots(2.5, pre)
271
+ assert_equal(10, n.prots.size, "correct number of proteins")
272
+ n.prots[0..3].each {|prt| assert_match(/^>SHUFF_f0_/, prt.header ) }
273
+ n.prots[4..7].each {|prt| assert_match(/^>SHUFF_f1_/, prt.header ) }
274
+ n.prots[8..9].each {|prt| assert_match(/^>SHUFF_f2_/, prt.header ) }
275
+ # crazy
276
+ n = f.fraction_of_prots(1.33, pre)
277
+ assert_equal(6, n.prots.size, "correct number of proteins")
278
+ end
279
+
280
+ def test_inverted_tryptic_peptides_for_file
281
+ # for a file:
282
+ tmpfile = @tfiles + "fasta.tmp"
283
+ fasta = Fasta.new.read_file(@sf)
284
+ fasta.aaseq_invert_tryptic_peptides!
285
+ fasta.write_file(tmpfile)
286
+ lines = IO.readlines(tmpfile)
287
+ #normal = 'MKRISTTITTTITITTGNGAG'
288
+ inverted_tryptic = 'MKRGAGNGTTITITTTITTSI' ## ?????
289
+ assert_equal(inverted_tryptic, lines[1].chomp)
290
+ #normal = 'MATYLIGDVHGCYDELIALLHKVEFTPGKDTLWLTGDLVARGPGSLDVLRYVKSLGDSVRLVLGNHDLHL
291
+ # LAVFAGISRNKPKDRLTPLLEAPDADELLNWLRRQPLLQIDEEKKLVMAHAGITPQWDLQTAKECARDVE
292
+ # AVLSSDSYPFFLDAMYGDMPNNWSPELRGLGRLRFITNAFTRMRFCFPNGQLDMYSKESPEEAPAPLKPW
293
+ # FAIPGPVAEEYSIAFGHWASLEGKGTPEGIYALDTGCCWGGTLTCLRWEDKQYFVQPSNRHKDLGEAAAS'
294
+ inverted_tryptic = 'HLLAILEDYCGHVDGILYTAMKGPTFEVKAVLDGTLWLTDRLVDLSGPGRVYKVSDGLSRSIGAFVALLHLDHNGLVLRPKNKDRLWNLLEDADPAELLPTLRREEDIQLLPQKKATQLDWQPTIGAHAMVLKACERLEPSWNNPMDGYMADLFFPYSDSSLVAEVDRGLGRLRTFANTIFRMRSYMDLQGNPFCFKGELSAWHGFAISYEEAVPGPIAFWPKLPAPAEEPSEKLCTLTGGWCCGTDLAYIGEPTGRDEWKNSPQVFYQRHKSAAAEGLD'
295
+ assert_equal(inverted_tryptic, lines[-1].chomp)
296
+ File.unlink(tmpfile) unless NODELETE
297
+ end
298
+
299
+
300
+
301
+ ## HELPER ASSERTIONS:
302
+
303
+ def _are_inverted?(obj1, obj2)
304
+ obj2_prots = obj2.prots
305
+ obj1.prots.each_with_index do |prot,i|
306
+ if prot.aaseq.reverse != obj2_prots[i].aaseq
307
+ return false
308
+ end
309
+ end
310
+ return true
311
+ end
312
+
313
+ def _same_headers?(obj1, obj2)
314
+ obj1.prots.each_with_index do |prot,ind|
315
+ oprot = obj2.prots[ind]
316
+ if prot.header != oprot.header
317
+ return false
318
+ end
319
+ end
320
+ return true
321
+ end
322
+
323
+ # true if all prot AA seq's are the same
324
+ def _same_aaseqs?(obj1, obj2)
325
+ obj2_prots = obj2.prots
326
+ obj1.prots.each_with_index do |prot,i|
327
+ if prot.aaseq != obj2_prots[i].aaseq
328
+ return false
329
+ end
330
+ end
331
+ return true
332
+ end
333
+
334
+ # for two parallel fasta objects, determines if the list of proteins
335
+ # are shuffled by examining the proteins and asking of > 4 are different
336
+ # returns true or false
337
+ def _are_shuffled?(obj1, obj2)
338
+ cnt = 0
339
+ obj1.prots.each_with_index do |prot,ind|
340
+ oprot = obj2.prots[ind]
341
+ if prot.header == oprot.header && prot.aaseq != oprot.aaseq
342
+ cnt += 1
343
+ end
344
+ end
345
+ if cnt > 4
346
+ return true
347
+ else
348
+ return false
349
+ end
350
+ end
351
+
352
+ end
353
+
354
+ =end