mspire 0.4.9 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (255) hide show
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/fasta.rb DELETED
@@ -1,626 +0,0 @@
1
- require 'sample_enzyme'
2
- require 'each_index'
3
- require 'optparse'
4
- require 'delegate'
5
- require 'hash_by'
6
- require 'digest/md5'
7
-
8
-
9
# Warnings are switched off while String is reopened (presumably to hide
# redefinition warnings) and the previous level is restored afterwards.
saved_verbose = $VERBOSE
$VERBOSE = nil
class String

  # Yields each character index of the string, from 0 up to size - 1.
  def each_index
    (0...size).each { |idx| yield idx }
  end

  # In-place Fisher-Yates shuffle of the characters.
  # modifies and returns self
  def shuffle!
    each_index do |pos|
      pick = pos + rand(size - pos)
      self[pos], self[pick] = self[pick], self[pos]
    end
    self
  end

  # Returns a shuffled copy; the receiver is left untouched.
  def shuffle
    dup.shuffle!
  end

end
$VERBOSE = saved_verbose
32
-
33
-
34
require 'stringio' # Fasta.from_string wraps its argument in a StringIO

# Forward declaration so Fasta can mix the module in; its methods are added
# where the module is reopened further down the file.
module FastaManipulation ; end

# A list of Fasta::Prot entries read from (or destined for) a fasta file.
# Array methods are delegated to the protein list.
class Fasta < DelegateClass(Array)
  include FastaManipulation
  SHUFF_PREFIX = "SHUFF_"
  SHUFF_FILE_POSTFIX = "_SHUFF"
  CAT_SHUFF_FILE_POSTFIX = "_CAT_SHUFF"
  FILE_CONNECTOR = "__"
  INV_PREFIX = "INV_"
  INV_FILE_POSTFIX = "_INV"
  CAT_INV_FILE_POSTFIX = "_CAT_INV"

  # this will probably be relative
  attr_accessor :filename

  # for backwards compatibility
  def prots
    @prots
  end

  # Replaces the protein list. The delegate target is re-pointed as well so
  # that the delegated Array methods (size, each, ...) see the new list.
  def prots=(new_prots)
    @prots = new_prots
    __setobj__(new_prots)
  end

  # Returns file_or_obj unchanged when it already is a Fasta, otherwise
  # builds a new Fasta from it (see #initialize for the accepted values).
  def self.to_fasta(file_or_obj)
    file_or_obj.is_a?(Fasta) ? file_or_obj : Fasta.new(file_or_obj)
  end

  # arg can be:
  #   Fasta::Prot objects (Array)  - used as the protein list (not copied)
  #   filename (String)            - read with #read_file
  #   Another Fasta object (Fasta) - shallow copy: the protein list is
  #                                  shared and its filename is taken over
  def initialize(arg=nil, filename=nil)
    @filename = filename
    @prots = []
    if arg.is_a?(Fasta)
      @prots = arg.prots
      @filename = arg.filename
    elsif arg.is_a?(Array)
      @prots = arg
    elsif arg
      read_file(arg)
    end
    super(@prots)
  end

  # MD5 hex digest of the file named by :filename.
  # Returns nil when no filename is set or the file does not exist.
  def md5_sum
    return nil unless @filename && File.exist?(@filename)
    Digest::MD5.hexdigest(File.read(@filename))
  end

  # returns the length of the file (in terms of the total number of amino
  # acids represented)
  def aa_seq_length
    prots.inject(0) { |total, prot| total + prot.aaseq.size }
  end

  # Looks up the proteins whose sequence is exactly aaseq and returns an
  # Array of their headers (with > & no newline); nil when nothing matches.
  # NOTE(review): relies on the project's hash_by extension returning an
  # enumerable (not nil) for a missing key -- confirm.
  def header_from_exact_sequence(aaseq)
    headers = self.hash_by(:aaseq)[aaseq].map { |prot| prot.header }
    headers.empty? ? nil : headers
  end

  # searches all headers to see if they include input string
  # returns true if one matches, false otherwise
  # (remember that headers are not stored with newline chars but do contain
  # beginning '>')
  def included_in_header?(input)
    @prots.any? { |prot| prot.header.include?(input) }
  end

  # takes an io object (anything whose #each yields lines) holding fasta
  # data. This is not as stringent as 'read_file', which is recommended for
  # industrial type use. For instance, this will fail if your newlines are
  # different in your file from those defined on your operating system. If
  # you have a string, simply pass in StringIO.new(your_string) to be read.
  # Any proteins already held are discarded first.
  # returns self
  def load(io)
    @prots.clear
    current_aaseq = nil
    io.each do |line|
      if line[0, 1] == '>'
        current_aaseq = ''
        prot = Prot.new
        prot.header = line.chomp
        prot.aaseq = current_aaseq
        @prots << prot
      elsif (line =~ /[^ ]/) && (line.size > 1)
        current_aaseq << line.chomp
      end
    end
    self
  end

  # uses 'load' to create a fasta object from a fasta string
  def self.from_string(string)
    Fasta.new.load(StringIO.new(string))
  end

  # Reads fasta files (under windows or unix newlines)
  # Checks that the first character per line is '>' or character class [A-Za-z*]
  # and raises otherwise (also when sequence data precedes the first header).
  # Proteins are appended to those already held.
  # returns a fasta object for stringing commands
  # if fn not given, will read the :filename attribute
  # will set :filename to fn if given
  def read_file(fn=nil)
    @filename = fn if fn
    # block form closes the handle even when reading fails
    lines = File.open(fn || @filename, 'rb') { |fh| fh.read }.split(/\r\n|\n/)
    current = nil
    lines.each do |line|
      next unless line =~ /[^ \n\r]/
      first_char = line[0, 1]
      if first_char == '>'
        current = Prot.new
        @prots << current
        current.header = line.dup
      elsif first_char =~ /[A-Za-z*]/
        unless current
          raise "Sequence found before any fasta header (between arrows): -->#{line}<--"
        end
        current.aaseq << line.chomp
      else
        raise "Line not in fasta format (between arrows): -->#{line}<--"
      end
    end
    self
  end

  # Writes every protein (via its to_s) to fn in binary mode.
  # if no fn, will write to :filename attribute
  def write_file(fn=nil)
    fn ||= @filename
    File.open(fn, "wb") do |out|
      @prots.each { |prot| out.print(prot.to_s) }
    end
  end

  # duplicates the object (deep copy: every protein is dup'ed as well)
  def dup
    copy = self.class.new
    copy.filename = filename
    prots.each { |prot| copy.prots << prot.dup }
    copy
  end

end
209
-
210
# Creates modified (reversed or shuffled) versions of fasta files, e.g. for
# use as decoy databases.
class FastaShaker

  # the only values accepted as <method> on the command line
  METHODS = %w(reverse shuffle).freeze

  # Reverses each protein sequence and writes the result (see shake_it for
  # the recognized opts).
  def reverse(fasta_file_or_obj, opts={})
    shake_it(:reverse, fasta_file_or_obj, opts)
  end

  # Shuffles each protein sequence and writes the result (see shake_it for
  # the recognized opts).
  def shuffle(fasta_file_or_obj, opts={})
    shake_it(:shuffle, fasta_file_or_obj, opts)
  end

  # Returns a descriptive output filename built from the fasta's filename
  # ('fasta' when it has none), the method and the given opts. Everything
  # from the first '.' of the last path component is dropped, so directory
  # names containing dots (and './' prefixes) are kept intact.
  def create_filename(fasta, method, opts={})
    file = fasta.filename || 'fasta'
    parts = [file.sub(/\.[^\/]*\z/, '')]
    parts << 'cat' if opts[:cat]
    parts << method
    parts << 'prefix' << opts[:prefix] if opts[:prefix]
    parts << 'fraction' << opts[:fraction] if opts[:fraction]
    parts << 'tryptic_peptides' if opts[:tryptic_peptides]
    parts.join("_") << ".fasta"
  end

  # takes command line input, and sends it to shake
  # Prints the usage and exits when fewer than two positional arguments
  # remain; raises ArgumentError when <method> is not reverse or shuffle
  # (the name is checked before dispatch so that arbitrary methods cannot
  # be invoked from the command line).
  def FastaShaker.shake_from_argv(argv)
    opt = {}

    opts = OptionParser.new do |op|
      prog = File.basename(__FILE__)
      op.banner = "USAGE: #{prog} <method> [OPTIONS] <file>.fasta"
      op.separator " <method> = reverse | shuffle"
      op.separator ""
      op.separator "fasta_shaker is kind of like a salt shaker:"
      op.separator "shake up your fasta proteins and let them"
      op.separator "season your dinner (hopefully a protein dinner). Mmmm."
      op.separator "false identification rates never tasted so good :)"
      op.separator ""
      op.on("-c", "--cat", "catenates the output to copy of original") {|v| opt[:cat] = v }
      op.on("-o", "--out <string>", "name of output file (default is descriptive)") {|v| opt[:out] = v }
      op.on("-p", "--prefix <string>", "give a header prefix to modified prots") {|v| opt[:prefix] = v }
      op.on("-f", "--fraction <float>", Float, "creates some fraction of proteins") {|v| opt[:fraction] = v }
      op.separator " [if fraction > 1 then the tag 'f<frac#>_' prefixed to proteins"
      op.separator " (after any given prefix) so that proteins are unique]"
      op.on("--tryptic_peptides", "applies method to [KR][^P] peptides") {|v| opt[:tryptic_peptides] = v }

      op.separator ""
      op.separator "EXAMPLES: "
      op.separator " #{prog} reverse file.fasta -o protein_aa_sequence_reversed.fasta"
      op.separator " #{prog} shuffle file.fasta -o protein_aa_sequence_shuffled.fasta"
      op.separator " #{prog} shuffle file.fasta -c -p SH_ -o normal_cat_shuffled_with_prefix.fasta"
      op.separator " #{prog} reverse file.fasta --tryptic_peptides -o tryptic_peptides_reversed.fasta"
    end

    opts.parse!(argv)

    if argv.size < 2
      puts opts
      exit
    end

    (method, file) = argv
    unless METHODS.include?(method)
      raise ArgumentError, "unknown method '#{method}' (expected one of: #{METHODS.join(', ')})"
    end
    FastaShaker.new.send(method.to_sym, file, opt)
  end

  protected

  # Applies method (:reverse or :shuffle) to every protein and writes the
  # resulting fasta file.
  #
  # Recognized keys in opt:
  #   :out              output filename (filled in via create_filename when
  #                     absent; note that opt itself is updated)
  #   :cat              append the modified proteins to a copy of the originals
  #   :fraction         keep only this fraction of the proteins (may be > 1)
  #   :prefix           header prefix given to the modified proteins
  #   :tryptic_peptides modify each tryptic peptide instead of the whole sequence
  #
  # NOTE: without :cat and :fraction a Fasta object passed in is modified
  # in place.
  def shake_it(method, fasta_file_or_obj, opt)
    fasta = Fasta.to_fasta(fasta_file_or_obj)
    if opt[:cat] && !opt[:prefix]
      warn "WARNING: concatenated proteins don't have unique headers\n[you probably wanted to use the '--prefix' option!]"
    end

    opt[:out] ||= create_filename(fasta, method, opt)

    ## CAT (save an original copy)
    fasta_orig = fasta.dup if opt[:cat]

    ## FRACTION the proteins
    if (f = opt[:fraction])
      # more than one round of proteins needs a per-round tag to stay unique
      prefix = f > 1.0 ? proc {|cnt| "f#{cnt}_" } : nil
      fasta = fasta.fraction_of_prots(f, prefix)
    end

    ## PREFIX the proteins
    if (pre = opt[:prefix])
      fasta.header_prefix!(pre)
    end

    ## MODIFY the proteins
    fasta.aaseq!((method.to_s + '!').to_sym, opt[:tryptic_peptides])

    ## CAT (finish it up)
    if opt[:cat]
      fasta_orig << fasta
      fasta = fasta_orig
    end

    ## WRITE out the file
    fasta.write_file(opt[:out])
  end

end
330
-
331
# Bulk operations on the proteins of a Fasta object. The instance methods
# expect the including object to keep its proteins in @prots (also
# reachable through #prots).
module FastaManipulation

  # concatenates the filenames like this:
  #   cat_filenames(['fn1.ext1', 'fn2.ext2'], '__') # -> 'fn1__fn2.ext1'
  # the path and extension of the first filename are kept intact.
  # other files only use the basename (with no extension)
  # A nil connector is treated as "". The filenames array is not modified.
  def self.cat_filenames(filenames, connector="")
    joiner = connector.to_s
    first, *others = filenames
    first_ext = File.extname(first)
    other_bases = others.map { |fn| File.basename(fn, File.extname(fn)) }
    tail = joiner + other_bases.join(joiner) + first_ext
    # block form: the replacement is inserted literally (no \1 expansion)
    first.sub(/#{Regexp.escape(first_ext)}\z/) { tail }
  end

  # returns a new fasta object using some fraction of proteins randomly
  # selected (fraction may be > 1). Always rounds up. Will not choose a
  # protein twice unless all other proteins have been chosen
  #
  # prefix_proc ensures that a unique header is given even if multiple
  # rounds of proteins are being created. It is called with the round count,
  # which is 0 for the first n proteins chosen, 1 for the next n, etc.
  # (n being the number of proteins held).
  #   e.g. prefix_proc = proc {|frac_cnt| "f#{frac_cnt}_" }
  #   would give headers like this: >f0_<some_real_header>,
  #   >f1_<some_real_header>, ...
  # The proteins held by self are dup'ed, never modified.
  def fraction_of_prots(fraction=1, prefix_proc=nil)
    wanted = (fraction.to_f * self.prots.size).ceil
    picked = []
    pool = @prots.map {|prt| prt.dup }
    round = 0
    wanted.times do
      picked << pool.delete_at(rand(pool.size))
      picked.last.header_prefix!(prefix_proc.call(round)) if prefix_proc
      if pool.empty?
        # every protein was used once: start the next round with fresh copies
        round += 1
        pool = @prots.map {|prt| prt.dup }
      end
    end
    Fasta.new(picked)
  end

  # Convenience method to concatenate an array of fasta files. Filenames are
  # concatenated according to 'cat_filenames' and the proteins of each file
  # are prefixed according to the value at the same index in the
  # 'file_prot_header_prefixes' array (nil entries are skipped).
  # Writes the combined file and returns its name.
  def self.cat_and_prefix(files, file_prot_header_prefixes=nil, file_connector=nil)
    fastas = files.map { |file| Fasta.new.read_file(file) }
    outfile = cat_filenames(files, file_connector)
    (file_prot_header_prefixes || []).each_with_index do |prefix, i|
      fastas[i].header_prefix!(prefix) if prefix
    end
    combined = fastas.shift
    fastas.each { |fasta| combined << fasta }
    combined.write_file(outfile)
    outfile
  end

  # Appends all proteins of another Fasta, or a single Fasta::Prot.
  # Objects of any other class are ignored.
  def <<(other)
    # case when with class names uses === operator
    case other
    when Fasta
      @prots.push(*(other.prots))
    when Fasta::Prot
      @prots.push(other)
    end
  end

  # method = :shuffle! | :reverse!
  # With tryptic_peptides the method is applied to each tryptic peptide
  # rather than to the whole sequence.
  def aaseq!(method_as_symbol=:shuffle!, tryptic_peptides=false)
    if tryptic_peptides
      @prots.each {|prot| prot.tryptic_peptides!(method_as_symbol) }
    else
      @prots.each {|prot| prot.aaseq!(method_as_symbol) }
    end
  end

  # shuffles the aa sequence of each protein (each protein within itself)
  def aaseq_shuffle!
    @prots.each {|prot| prot.shuffle! }
  end

  # inverts the aa sequence of each protein (each protein within itself)
  def aaseq_invert!
    @prots.each {|prot| prot.invert! }
  end

  # inverts the tryptic peptides of each protein
  def aaseq_invert_tryptic_peptides!
    @prots.each {|prot| prot.invert_tryptic_peptides! }
  end

  # shuffles the tryptic peptides of each protein
  def aaseq_shuffle_tryptic_peptides!
    @prots.each {|prot| prot.shuffle_tryptic_peptides! }
  end

  # adds the prefix to the header of every protein
  def header_prefix!(prefix)
    @prots.each {|prot| prot.header_prefix!(prefix) }
  end

end
453
-
454
# requires that object respond_to? :reference
module ProteinReferenceable

  # Gives the reference up to (not including) its first space, ignoring
  # leading whitespace; the whole stripped reference when it has no space.
  # Returns nil for a nil reference and '' when the reference is at most
  # one character long.
  # NOTE(review): a one-character reference therefore yields '' -- confirm
  # this is intended.
  def first_entry
    ref = reference
    return nil unless ref
    return '' unless ref.size > 1

    stripped = ref.lstrip
    cut = stripped.index(' ')
    cut ? stripped[0...cut] : stripped.dup
  end

end
477
-
478
-
479
-
480
-
481
# A single fasta entry: a header line plus its amino acid sequence.
class Fasta::Prot
  include ProteinReferenceable

  # header given as full line with starting '>' (but no newline chars!).
  # aaseq also given without any newline chars
  attr_accessor :header, :aaseq

  # Both values default to an empty string.
  def initialize(header=nil, aaseq=nil)
    @header = header || ''
    @aaseq = aaseq || ""
  end

  # Equal when other is of the same class and has the same sequence and header.
  def ==(other)
    other && other.class == self.class && other.aaseq == self.aaseq && other.header == self.header
  end

  # gives the string up to the first space (without the leading '>')
  # Returns '' when the header is at most one character long and nil when
  # there is no header at all.
  def first_entry
    return nil unless @header
    return '' unless @header.size > 1

    stop = @header.index(' ')
    stop ? @header[1...stop] : @header[1..-1]
  end

  # returns the fasta header information without the leading '>'
  def reference
    @header[1..-1]
  end

  # returns the value after the first '|' and before the second '|'
  # according to this regexp: /\|(.*?)\|/
  # This will typically be the gi code
  # Returns nil if it doesn't match
  def gi
    @header =~ /\|(.*?)\|/ ? $1.dup : nil
  end

  # convenience
  def invert_tryptic_peptides! ; tryptic_peptides!(:reverse) end
  def shuffle_tryptic_peptides! ; tryptic_peptides!(:shuffle) end

  # modifies tryptic peptides as given by SampleEnzyme.tryptic(@aaseq)
  # [cuts after K or R but not if followed by a P]
  # method_as_symbol: :reverse | :shuffle OR :reverse! | :shuffle!
  #
  # The method is applied to each peptide minus its last residue, which
  # stays in place. If the last peptide doesn't end in R or K it is
  # modified completely.
  #   e.g. with :reverse, and assuming the digest of
  #   'ABCKCDERDEKDGEKWXYRRKDER' is ABCK CDER DEK DGEK WXYR R K DER:
  #     -> 'CBAKEDCREDKEGDKYXWRRKEDR'
  # Returns the new sequence; a sequence that yields no peptides is left
  # as it is.
  def tryptic_peptides!(method_as_symbol)
    peps = SampleEnzyme.tryptic(@aaseq)
    return @aaseq if peps.empty?

    ## if the last peptide doesn't end in R or K we want to flip it completely
    last_pep_special = peps.last[-1, 1] =~ /[KR]/ ? nil : peps.pop
    changed = peps.map { |pep| pep[0..-2].send(method_as_symbol) << pep[-1, 1] }
    changed << last_pep_special.send(method_as_symbol) if last_pep_special
    @aaseq = changed.join
  end

  # takes :reverse! | :shuffle! and applies it to the sequence in place
  def aaseq!(method_as_symbol)
    @aaseq.send(method_as_symbol)
  end

  # reverses the sequence in place
  def invert!
    @aaseq.reverse!
  end

  # shuffles the sequence in place
  def shuffle!
    @aaseq.shuffle!
  end

  # adds a prefix to the protein header (which comes after the '>' char) if
  # one is not already there.
  def header_prefix!(prefix)
    unless @header =~ /^>#{Regexp.escape(prefix)}/
      # block form: the prefix is inserted literally (no \1 expansion)
      @header.gsub!(/^>/) { ">#{prefix}" }
    end
  end

  # copies header and sequence, so the copy can be modified independently
  def dup
    self.class.new(@header.dup, @aaseq.dup)
  end

  # returns the header line and aaseq with trailing newlines as one might find
  # in a fasta file
  def to_s
    @header + "\n" + @aaseq + "\n"
  end

end
594
-
595
-
596
- # For reference, my code is about 15X faster than the first code I wrote
597
- # below! It turns out that the major slowdown is in the randomize routine.
598
- # Using my own randomize routine with the below way of reading fasta
599
- # files is 2X faster than below (in other words, my reader is 2X as fasta).
600
- #
601
- ##!/usr/bin/ruby -w
602
- #
603
- #require 'bio'
604
- #
605
- #SHUFF_EXT = "_shuffled"
606
- #
607
- #if ARGV.size < 1
608
- # puts <<END
609
- #usage: #{File.basename(__FILE__)} file.fasta ... # -> file#{SHUFF_EXT}.fasta ...
610
- #Shuffles the amino acid sequence of each protein.
611
- #END
612
- # exit
613
- #end
614
- #
615
- #ARGV.each do |fn|
616
- # fn_ext = File.extname(fn)
617
- # fn_out = fn.gsub(fn_ext, SHUFF_EXT + fn_ext)
618
- # File.open(fn_out, "w") do |fh|
619
- # f = Bio::FlatFile.auto(fn)
620
- # f.each_entry do |e|
621
- # fh.puts '>' + e.definition
622
- # fh.puts e.aaseq.randomize
623
- # end
624
- # end
625
- #end
626
- by=:protein, num=1