mspire 0.4.9 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (255) hide show
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/fasta.rb DELETED
@@ -1,626 +0,0 @@
1
require 'sample_enzyme'
require 'each_index'
require 'optparse'
require 'delegate'
require 'hash_by'
require 'digest/md5'
require 'stringio'
7
-
8
-
9
# Warnings are switched off while String is re-opened (presumably to hide
# "method redefined" warnings, since 'each_index' is also required above --
# TODO confirm). The previous setting is restored in an ensure clause so a
# failure inside the patch cannot leave warnings disabled.
saved_verbose = $VERBOSE
$VERBOSE = nil
begin
  class String

    # Yields each index of the string (0 up to size - 1) in ascending order.
    # Returns an Enumerator when no block is given instead of raising
    # LocalJumpError.
    def each_index
      return enum_for(:each_index) unless block_given?
      (0...size).each { |idx| yield idx }
    end

    # Shuffles the characters of the string in place and returns self.
    # Position j is swapped with a random position in j...size.
    def shuffle!
      each_index do |j|
        k = j + rand(size - j)
        self[j], self[k] = self[k], self[j]
      end
      self
    end

    # Returns a shuffled copy; the receiver is left untouched.
    def shuffle
      dup.shuffle!
    end

  end
ensure
  $VERBOSE = saved_verbose
end
32
-
33
-
34
# Forward declaration so that Fasta can mix the module in; its methods are
# defined further down in this file.
module FastaManipulation ; end

# A list of Fasta::Prot objects that delegates Array behaviour to the
# underlying protein array (@prots) and remembers the file it came from.
class Fasta < DelegateClass(Array)
  include FastaManipulation

  SHUFF_PREFIX = "SHUFF_"
  SHUFF_FILE_POSTFIX = "_SHUFF"
  CAT_SHUFF_FILE_POSTFIX = "_CAT_SHUFF"
  FILE_CONNECTOR = "__"
  INV_PREFIX = "INV_"
  INV_FILE_POSTFIX = "_INV"
  CAT_INV_FILE_POSTFIX = "_CAT_INV"

  # this will probably be relative
  attr_accessor :filename

  # for backwards compatibility
  def prots
    @prots
  end

  # Replaces the protein array. The delegate target is re-pointed as well so
  # that Array methods called on the Fasta object see the new list.
  def prots=(list)
    @prots = list
    __setobj__(list)
  end

  # Returns file_or_obj unchanged if it already is a Fasta, otherwise builds
  # a new Fasta from it (see #initialize for accepted arguments).
  def self.to_fasta(file_or_obj)
    file_or_obj.is_a?(Fasta) ? file_or_obj : Fasta.new(file_or_obj)
  end

  # arg can be:
  #   Fasta::Prot objects (Array)
  #   filename (String)
  #   Another Fasta object (Fasta) (shallow copy: the protein array is shared)
  # A filename passed as arg overrides the filename parameter.
  def initialize(arg=nil, filename=nil)
    @filename = filename
    @prots = []
    case arg
    when nil, false
      # nothing to load
    when Fasta
      @prots = arg.prots
      @filename = arg.filename
    when Array
      @prots = arg
    else
      read_file(arg)
    end
    super(@prots)
  end

  # Returns the md5 hex digest of the file named by :filename, or nil when no
  # filename is set or the file does not exist.
  def md5_sum
    return nil unless @filename && File.exist?(@filename)
    Digest::MD5.hexdigest(File.read(@filename))
  end

  # returns the length of the file (in terms of the total number of amino
  # acids represented)
  def aa_seq_length
    @prots.inject(0) { |total, prot| total + prot.aaseq.size }
  end

  # Searches the proteins for an exact match to aaseq and returns an Array of
  # the matching headers (with > & no newline); nil if there are no matches.
  # Relies on hash_by from the 'hash_by' library -- presumably it groups the
  # proteins by the given attribute; verify against that library.
  def header_from_exact_sequence(aaseq)
    matches = self.hash_by(:aaseq)[aaseq]
    return nil if matches.nil? || matches.empty?
    matches.map { |prot| prot.header }
  end

  # searches all headers to see if they include input string
  # returns true if one matches, false otherwise
  # (remember that headers are not stored with newline chars but do contain
  # the beginning '>')
  def included_in_header?(input)
    @prots.any? { |prot| prot.header.include?(input) }
  end

  # Takes an io object (anything whose #each yields lines) holding fasta data
  # and replaces the current proteins with its contents. This is not as
  # stringent as 'read_file': lines are split by the io itself and sequence
  # characters are not validated. If you have a string, pass in
  # StringIO.new(your_string) or use Fasta.from_string.
  # Raises RuntimeError if sequence data appears before the first header.
  # returns self
  def load(io)
    current_aaseq = nil
    @prots.clear
    io.each do |line|
      content = line.chomp
      if line[0, 1] == '>'
        current_aaseq = ''.dup
        prot = Prot.new
        prot.header = content
        prot.aaseq = current_aaseq
        @prots << prot
      elsif content =~ /[^ ]/
        # chomp first so that a final line without a newline is not dropped
        raise "Sequence data before first fasta header: -->#{content}<--" unless current_aaseq
        current_aaseq << content
      end
    end
    self
  end

  # uses 'load' to create a fasta object from a fasta string
  def self.from_string(string)
    Fasta.new.load(StringIO.new(string))
  end

  # Reads fasta files (under windows or unix newlines)
  # Checks that the first character per line is '>' or character class [A-Za-z*]
  # returns a fasta object for stringing commands
  # if fn not given, will read the :filename attribute
  # will set :filename to fn if given
  # Proteins are appended to any already held.
  # Raises ArgumentError when there is no filename at all and RuntimeError on
  # a malformed line (including sequence data before the first header).
  def read_file(fn=nil)
    @filename = fn if fn
    path = fn || @filename
    raise ArgumentError, "no filename given and :filename is not set" unless path
    # block form closes the handle even if reading fails
    data = File.open(path, 'rb') { |fh| fh.read }
    current = nil
    data.split(/\r?\n/).each do |line|
      next unless line =~ /[^ \n\r]/
      first_char = line[0, 1]
      if first_char == '>'
        current = Prot.new
        @prots << current
        current.header = line.dup
      elsif current && first_char =~ /[A-Za-z*]/
        current.aaseq << line.chomp
      else
        raise "Line not in fasta format (between arrows): -->#{line}<--"
      end
    end
    self
  end

  # Writes every protein (via its to_s) to fn in binary mode.
  # if no fn, will write to :filename attribute
  def write_file(fn=nil)
    fn ||= @filename
    raise ArgumentError, "no filename given and :filename is not set" unless fn
    File.open(fn, "wb") do |out|
      @prots.each { |prot| out.print(prot.to_s) }
    end
  end

  # duplicates the object (deep copy: every protein is dup'ed)
  def dup
    other = self.class.new
    other.filename = self.filename
    self.prots.each { |prot| other.prots << prot.dup }
    other
  end

end
209
-
210
# Creates decoy fasta files by reversing or shuffling the amino acid
# sequences of a fasta file (or Fasta object) and writing the result out.
class FastaShaker

  # The only method names accepted from the command line.
  SHAKE_METHODS = %w[reverse shuffle].freeze

  # Reverses every protein sequence and writes the result (see shake_it for
  # the recognised opts).
  def reverse(fasta_file_or_obj, opts={})
    shake_it(:reverse, fasta_file_or_obj, opts)
  end

  # Shuffles every protein sequence and writes the result (see shake_it for
  # the recognised opts).
  def shuffle(fasta_file_or_obj, opts={})
    shake_it(:shuffle, fasta_file_or_obj, opts)
  end

  # Builds and returns a descriptive output filename from the fasta's
  # filename ('fasta' if it has none), the method and the opts used.
  # Everything from the first '.' of the file's basename on is dropped; the
  # directory part is kept, so '../d/x.fasta' gives '../d/x_<method>.fasta'.
  def create_filename(fasta, method, opts={})
    file = fasta.filename || 'fasta'
    dir, base = File.split(file)
    stem = base.sub(/\..*$/,'')
    filebase = (dir == '.') ? stem : File.join(dir, stem)
    parts = [filebase]
    parts << 'cat' if opts[:cat]
    parts << method
    parts << 'prefix' << opts[:prefix] if opts[:prefix]
    parts << 'fraction' << opts[:fraction] if opts[:fraction]
    parts << 'tryptic_peptides' if opts[:tryptic_peptides]
    parts.join("_") << ".fasta"
  end

  # takes command line input, and sends it to shake
  # argv is modified in place by option parsing. Prints the usage and exits
  # when fewer than two positional arguments remain. Raises ArgumentError if
  # <method> is not one of SHAKE_METHODS, so arbitrary method names from the
  # command line are never dispatched.
  def self.shake_from_argv(argv)
    opt = {}

    opts = OptionParser.new do |op|
      prog = File.basename(__FILE__)
      op.banner = "USAGE: #{prog} <method> [OPTIONS] <file>.fasta"
      op.separator " <method> = reverse | shuffle"
      op.separator ""
      op.separator "fasta_shaker is kind of like a salt shaker:"
      op.separator "shake up your fasta proteins and let them"
      op.separator "season your dinner (hopefully a protein dinner). Mmmm."
      op.separator "false identification rates never tasted so good :)"
      op.separator ""
      op.on("-c", "--cat", "catenates the output to copy of original") {|v| opt[:cat] = v }
      op.on("-o", "--out <string>", "name of output file (default is descriptive)") {|v| opt[:out] = v }
      op.on("-p", "--prefix <string>", "give a header prefix to modified prots") {|v| opt[:prefix] = v }
      op.on("-f", "--fraction <float>", Float, "creates some fraction of proteins") {|v| opt[:fraction] = v }
      op.separator " [if fraction > 1 then the tag 'f<frac#>_' prefixed to proteins"
      op.separator " (after any given prefix) so that proteins are unique]"
      op.on("--tryptic_peptides", "applies method to [KR][^P] peptides") {|v| opt[:tryptic_peptides] = v }

      op.separator ""
      op.separator "EXAMPLES: "
      op.separator " #{prog} reverse file.fasta -o protein_aa_sequence_reversed.fasta"
      op.separator " #{prog} shuffle file.fasta -o protein_aa_sequence_shuffled.fasta"
      op.separator " #{prog} shuffle file.fasta -c -p SH_ -o normal_cat_shuffled_with_prefix.fasta"
      op.separator " #{prog} reverse file.fasta --tryptic_peptides tryptic_peptides_reversed.fasta"
    end

    opts.parse!(argv)

    if argv.size < 2
      puts opts
      exit
    end

    method, file = argv
    unless SHAKE_METHODS.include?(method)
      raise ArgumentError, "unknown method '#{method}' (expected: reverse | shuffle)"
    end
    new.public_send(method, file, opt)
  end

  protected

  # Does the actual work for #reverse and #shuffle.
  # method is :reverse or :shuffle; opt keys read here are :cat, :prefix,
  # :out, :fraction and :tryptic_peptides. opt[:out] is filled in (the opt
  # hash is modified) when it was not given.
  def shake_it(method, fasta_file_or_obj, opt)
    fasta = Fasta.to_fasta(fasta_file_or_obj)
    if opt[:cat] && !opt[:prefix]
      message = "WARNING: concatenated proteins don't have unique headers\n[you probably wanted to use the '--prefix' option!]"
      warn message
    end

    opt[:out] ||= create_filename(fasta, method, opt)

    ## CAT (save an original copy)
    fasta_orig = fasta.dup if opt[:cat]

    ## FRACTION the proteins
    fraction = opt[:fraction]
    if fraction
      # more than one full copy: tag each pass so headers stay unique
      prefix = fraction > 1.0 ? proc {|cnt| "f#{cnt}_" } : nil
      fasta = fasta.fraction_of_prots(fraction, prefix)
    end

    ## PREFIX the proteins
    fasta.header_prefix!(opt[:prefix]) if opt[:prefix]

    ## MODIFY the proteins
    fasta.aaseq!((method.to_s + '!').to_sym, opt[:tryptic_peptides])

    ## CAT (finish it up)
    if opt[:cat]
      fasta_orig << fasta
      fasta = fasta_orig
    end

    ## WRITE out the file
    fasta.write_file(opt[:out])
  end

end
330
-
331
# Manipulation helpers mixed into Fasta. The instance methods expect the
# including object to keep its proteins in @prots.
module FastaManipulation

  # concatenates the filenames like this:
  #   cat_filenames(['fn1.ext1', 'fn2.ext2'], '__') # -> 'fn1__fn2.ext1'
  # the path and extension of the first filename are kept intact.
  # other files only use the basename (with no extension)
  # The filenames array is not modified; a nil connector is treated as "".
  # Raises ArgumentError when filenames is empty.
  def self.cat_filenames(filenames, connector="")
    raise ArgumentError, "need at least one filename" if filenames.empty?
    connector = connector.to_s
    first, *others = filenames
    first_ext = File.extname(first)
    first_stem = first[0, first.size - first_ext.size]
    other_bases = others.map { |fn| File.basename(fn, File.extname(fn)) }
    first_stem + connector + other_bases.join(connector) + first_ext
  end

  # returns a new fasta object using some fraction of proteins randomly
  # selected (fraction may be > 1). Always rounds up. Will not choose a
  # protein twice unless all other proteins have been chosen
  #
  # prefix_proc ensures that a unique header is given even if multiple
  # fractions of proteins are being created. It is called with the number of
  # complete passes made through the proteins so far:
  #   for the first n proteins it will be 0,
  #   for the next n proteins it will be 1, etc.
  #   e.g. prefix_proc = proc {|frac_cnt| "f#{frac_cnt}_" }
  #   would give headers like this: >f0_<some_real_header>,
  #   >f1_<some_real_header>, ...
  def fraction_of_prots(fraction=1, prefix_proc=nil)
    wanted = (fraction.to_f * @prots.size).ceil
    chosen = []
    pool = []
    pass = -1
    wanted.times do
      if pool.empty?
        # start a new pass over fresh copies of every protein
        pool = @prots.map { |prt| prt.dup }
        pass += 1
      end
      pick = pool.delete_at(rand(pool.size))
      pick.header_prefix!(prefix_proc.call(pass)) if prefix_proc
      chosen << pick
    end
    Fasta.new(chosen)
  end

  # Convenience method to concatenate an array of fasta files. Filenames are
  # concatenated according to 'cat_filenames' and the proteins are prefixed
  # according to the values in the 'file_prot_header_prefixes' array (a nil
  # entry leaves that file's proteins alone). Writes the combined file and
  # returns its name.
  def self.cat_and_prefix(files, file_prot_header_prefixes=nil, file_connector=nil)
    fastas = files.map { |file| Fasta.new.read_file(file) }
    outfile = cat_filenames(files, file_connector)
    if file_prot_header_prefixes
      file_prot_header_prefixes.each_with_index do |prefix, i|
        fastas[i].header_prefix!(prefix) if prefix
      end
    end
    combined = fastas.shift
    fastas.each { |fasta| combined << fasta }
    combined.write_file(outfile)
    outfile
  end

  # Appends all proteins of another Fasta, or a single Fasta::Prot.
  # Objects of any other class are ignored.
  def <<(other)
    # case when with class names uses === operator
    case other
    when Fasta
      @prots.push(*(other.prots))
    when Fasta::Prot
      @prots.push(other)
    end
  end

  # Applies method_as_symbol to the sequence of every protein, or to each
  # tryptic peptide of every protein when tryptic_peptides is true.
  # method = :shuffle! | :reverse!
  def aaseq!(method_as_symbol=:shuffle!, tryptic_peptides=false)
    if tryptic_peptides
      @prots.each {|prot| prot.tryptic_peptides!(method_as_symbol) }
    else
      @prots.each {|prot| prot.aaseq!(method_as_symbol) }
    end
  end

  # shuffles the aa sequence of each protein (each protein within itself)
  def aaseq_shuffle!
    @prots.each {|prot| prot.shuffle! }
  end

  # inverts the aa sequence of each protein (each protein within itself)
  def aaseq_invert!
    @prots.each {|prot| prot.invert! }
  end

  # inverts the tryptic peptides of each protein
  def aaseq_invert_tryptic_peptides!
    @prots.each {|prot| prot.invert_tryptic_peptides! }
  end

  # shuffles the tryptic peptides of each protein
  def aaseq_shuffle_tryptic_peptides!
    @prots.each {|prot| prot.shuffle_tryptic_peptides! }
  end

  # adds the prefix to the header of every protein
  def header_prefix!(prefix)
    @prots.each {|prot| prot.header_prefix!(prefix) }
  end

end
453
-
454
# requires that object respond_to? :reference
module ProteinReferenceable
  # Gives the reference up to (not including) the first space, ignoring any
  # leading whitespace. Returns the whole stripped reference when it holds no
  # space, '' for an empty reference and nil when reference is nil.
  def first_entry
    ref = reference
    return nil unless ref
    return '' if ref.empty?
    trimmed = ref.lstrip
    cut = trimmed.index(' ')
    cut ? trimmed[0...cut] : trimmed.dup
  end

end
477
-
478
-
479
-
480
-
481
# One fasta entry: a header line and its amino acid sequence.
class Fasta::Prot
  include ProteinReferenceable

  # header given as full line with starting '>' (but no newline chars!).
  # aaseq also given without any newline chars
  attr_accessor :header, :aaseq

  # Both arguments default to an empty string.
  def initialize(header=nil, aaseq=nil)
    @header = header || ''
    @aaseq = aaseq || "".dup
  end

  # Two proteins are equal when they are of the same class and have the same
  # header and sequence.
  def ==(other)
    other && other.class == self.class && other.aaseq == self.aaseq && other.header == self.header
  end

  # gives the string up to the first space (without the leading '>')
  # Returns '' when the header is at most one character long and nil when
  # the header is nil.
  def first_entry
    return nil unless @header
    return '' unless @header.size > 1
    index = @header.index(' ')
    index ? @header[1...index] : @header[1..-1]
  end

  # returns the fasta header information without the leading '>'
  def reference
    @header[1..-1]
  end

  # returns the value after the first '|' and before the second '|'
  # according to this regexp: /\|(.*?)\|/
  # This will typically be the gi code
  # Returns nil if it doesn't match
  def gi
    @header =~ /\|(.*?)\|/ ? $1.dup : nil
  end

  # convenience
  def invert_tryptic_peptides! ; tryptic_peptides!(:reverse) end
  def shuffle_tryptic_peptides! ; tryptic_peptides!(:shuffle) end

  # Modifies the peptides returned by SampleEnzyme.tryptic(@aaseq)
  # [per the original note: cuts after K or R but not if followed by a P --
  # SampleEnzyme is defined elsewhere, confirm there] and stores the joined
  # result in @aaseq.
  # method_as_symbol: :reverse | :shuffle OR :reverse! | :shuffle!
  # Each peptide keeps its final residue in place while the rest of it is
  # transformed; a last peptide that does not end in K or R is transformed
  # completely. An empty peptide list leaves the sequence untouched.
  def tryptic_peptides!(method_as_symbol)
    peps = SampleEnzyme.tryptic(@aaseq)
    return @aaseq if peps.empty?

    ## if the last peptide doesn't end in R or K we want to flip it completely
    last_pep_special = nil
    last_pep_special = peps.pop if peps.last[-1, 1] !~ /[KR]/

    changed = peps.map { |pep| pep[0..-2].send(method_as_symbol) << pep[-1, 1] }
    changed << last_pep_special.send(method_as_symbol) if last_pep_special
    @aaseq = changed.join
  end

  # Sends the given method to the sequence string.
  # takes :reverse! | :shuffle!
  def aaseq!(method_as_symbol)
    @aaseq.send(method_as_symbol)
  end

  # reverses the sequence in place
  def invert!
    @aaseq.reverse!
  end

  # shuffles the sequence in place
  def shuffle!
    @aaseq.shuffle!
  end

  # adds a prefix to the protein header (which comes after the '>' char) if
  # one is not already there.
  # Returns the modified header, or nil when nothing was changed.
  def header_prefix!(prefix)
    return nil if @header =~ /\A>#{Regexp.escape(prefix)}/
    # block form: the prefix is inserted literally, backslashes included
    @header.sub!(/\A>/) { ">#{prefix}" }
  end

  # Returns a copy with its own header and sequence strings.
  def dup
    self.class.new(@header.dup, @aaseq.dup)
  end

  # returns the header line and aaseq with trailing newlines as one might find
  # in a fasta file
  def to_s
    @header + "\n" + @aaseq + "\n"
  end

end
594
-
595
-
596
- # For reference, my code is about 15X faster than the first code I wrote
597
- # below! It turns out that the major slowdown is in the randomize routine.
598
- # Using my own randomize routine with the below way of reading fasta
599
- # files is 2X faster than below (in other words, my reader is 2X as fasta).
600
- #
601
- ##!/usr/bin/ruby -w
602
- #
603
- #require 'bio'
604
- #
605
- #SHUFF_EXT = "_shuffled"
606
- #
607
- #if ARGV.size < 1
608
- # puts <<END
609
- #usage: #{File.basename(__FILE__)} file.fasta ... # -> file#{SHUFF_EXT}.fasta ...
610
- #Shuffles the amino acid sequence of each protein.
611
- #END
612
- # exit
613
- #end
614
- #
615
- #ARGV.each do |fn|
616
- # fn_ext = File.extname(fn)
617
- # fn_out = fn.gsub(fn_ext, SHUFF_EXT + fn_ext)
618
- # File.open(fn_out, "w") do |fh|
619
- # f = Bio::FlatFile.auto(fn)
620
- # f.each_entry do |e|
621
- # fh.puts '>' + e.definition
622
- # fh.puts e.aaseq.randomize
623
- # end
624
- # end
625
- #end
626
- by=:protein, num=1