mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
data/lib/bsearch.rb ADDED
@@ -0,0 +1,120 @@
1
+ #
2
+ # Ruby/Bsearch - a binary search library for Ruby.
3
+ #
4
+ # Copyright (C) 2001 Satoru Takabayashi <satoru@namazu.org>
5
+ # All rights reserved.
6
+ # This is free software with ABSOLUTELY NO WARRANTY.
7
+ #
8
+ # You can redistribute it and/or modify it under the terms of
9
+ # the Ruby's licence.
10
+ #
11
+ # Example:
12
+ #
13
+ # % irb -r ./bsearch.rb
14
+ # >> %w(a b c c c d e f).bsearch_first {|x| x <=> "c"}
15
+ # => 2
16
+ # >> %w(a b c c c d e f).bsearch_last {|x| x <=> "c"}
17
+ # => 4
18
+ # >> %w(a b c e f).bsearch_first {|x| x <=> "c"}
19
+ # => 2
20
+ # >> %w(a b e f).bsearch_first {|x| x <=> "c"}
21
+ # => nil
22
+ # >> %w(a b e f).bsearch_last {|x| x <=> "c"}
23
+ # => nil
24
+ # >> %w(a b e f).bsearch_lower_boundary {|x| x <=> "c"}
25
+ # => 2
26
+ # >> %w(a b e f).bsearch_upper_boundary {|x| x <=> "c"}
27
+ # => 2
28
+ # >> %w(a b c c c d e f).bsearch_range {|x| x <=> "c"}
29
+ # => 2...5
30
+ # >> %w(a b c d e f).bsearch_range {|x| x <=> "c"}
31
+ # => 2...3
32
+ # >> %w(a b d e f).bsearch_range {|x| x <=> "c"}
33
+ # => 2...2
34
+
35
# Namespace that only carries the version of the bundled Ruby/Bsearch
# library; the search methods themselves are added to Array.
module Bsearch
  # Release of Ruby/Bsearch this file was taken from.
  VERSION = '1.5'
end
38
+
39
class Array
  #
  # The binary search algorithm is extracted from Jon Bentley's
  # Programming Pearls 2nd ed. p.93
  #
  # Every method below takes an optional index +range+ (default: the whole
  # array) and a block that compares one element against the search target
  # in the manner of <=>: negative when the element sorts before the target,
  # zero on a match, positive when it sorts after it.
  #

  #
  # Return the lower boundary. (inside)
  #
  # This is the smallest index in +range+ whose element does not compare
  # below the target, or the exclusive end of +range+ when every element
  # compares below it.
  #
  def bsearch_lower_boundary(range = 0...length, &block)
    lower = range.first - 1
    upper = range.exclude_end? ? range.last : range.last + 1
    while lower + 1 != upper
      mid = ((lower + upper) / 2).to_i # for working with mathn.rb (Rational)
      if yield(self[mid]) < 0
        lower = mid
      else
        upper = mid
      end
    end
    upper
  end

  #
  # This method searches the FIRST occurrence which satisfies a
  # condition given by a block in binary fashion and return the
  # index of the first occurrence. Return nil if not found.
  #
  # Only indices inside +range+ are ever reported: when the lower boundary
  # is the outer edge of +range+ there is no candidate, so the element
  # sitting just past the range is not probed.
  #
  def bsearch_first(range = 0...length, &block)
    boundary = bsearch_lower_boundary(range, &block)
    limit = range.exclude_end? ? range.last : range.last + 1
    return nil if boundary >= limit || boundary >= length

    yield(self[boundary]) == 0 ? boundary : nil
  end

  # NOTE: on Ruby >= 2.0 this alias replaces the built-in Array#bsearch
  # (which returns an element, not an index). Kept for backwards
  # compatibility with existing callers of this library.
  alias bsearch bsearch_first

  #
  # Return the upper boundary. (outside)
  #
  # This is one past the largest index in +range+ whose element does not
  # compare above the target, or the start of +range+ when every element
  # compares above it.
  #
  def bsearch_upper_boundary(range = 0...length, &block)
    lower = range.first - 1
    upper = range.exclude_end? ? range.last : range.last + 1
    while lower + 1 != upper
      mid = ((lower + upper) / 2).to_i # for working with mathn.rb (Rational)
      if yield(self[mid]) <= 0
        lower = mid
      else
        upper = mid
      end
    end
    lower + 1 # outside of the matching range.
  end

  #
  # This method searches the LAST occurrence which satisfies a
  # condition given by a block in binary fashion and return the
  # index of the last occurrence. Return nil if not found.
  #
  # Only indices inside +range+ are ever reported: when the candidate falls
  # before the start of +range+ the element there is not probed.
  #
  def bsearch_last(range = 0...length, &block)
    # `- 1' for canceling `lower + 1' in bsearch_upper_boundary.
    boundary = bsearch_upper_boundary(range, &block) - 1
    return nil if boundary < range.first || boundary < 0

    yield(self[boundary]) == 0 ? boundary : nil
  end

  #
  # Return the search result as a Range object.
  #
  # The result is the half-open range lower_boundary...upper_boundary; it is
  # empty (for example 2...2) when nothing matches.
  #
  def bsearch_range(range = 0...length, &block)
    lower = bsearch_lower_boundary(range, &block)
    upper = bsearch_upper_boundary(range, &block)
    lower...upper
  end
end
120
+
data/lib/fasta.rb CHANGED
@@ -1,5 +1,9 @@
1
1
  require 'sample_enzyme'
2
2
  require 'each_index'
3
+ require 'optparse'
4
+ require 'delegate'
5
+ require 'hash_by'
6
+ require 'digest/md5'
3
7
 
4
8
 
5
9
  tmp = $VERBOSE ; $VERBOSE = nil
@@ -27,8 +31,10 @@ end
27
31
  $VERBOSE = tmp
28
32
 
29
33
 
34
+ module FastaManipulation ; end
30
35
 
31
- class Fasta
36
+ class Fasta < DelegateClass(Array)
37
+ include FastaManipulation
32
38
  SHUFF_PREFIX = "SHUFF_"
33
39
  SHUFF_FILE_POSTFIX = "_SHUFF"
34
40
  CAT_SHUFF_FILE_POSTFIX = "_CAT_SHUFF"
@@ -37,21 +43,124 @@ class Fasta
37
43
  INV_FILE_POSTFIX = "_INV"
38
44
  CAT_INV_FILE_POSTFIX = "_CAT_INV"
39
45
 
40
- attr_accessor :prots
46
+ attr_writer :prots
47
+ # this will probably be relative
48
+ attr_accessor :filename
41
49
 
42
- def initialize(prots=nil)
43
- if prots
44
- @prots = prots
50
# Hand-written reader for the protein list (the class only declares an
# attr_writer for it); kept for backwards compatibility.
def prots
  @prots
end
54
+
55
# Coerces the argument into a Fasta object: an existing Fasta is handed back
# untouched (same object), anything else is passed to Fasta.new.
def self.to_fasta(file_or_obj)
  file_or_obj.is_a?(Fasta) ? file_or_obj : Fasta.new(file_or_obj)
end
62
+
63
# arg can be:
#     Fasta::Prot objects (Array) -- used directly as the protein list
#     filename (String)           -- handed to read_file
#     Another Fasta object (Fasta) -- shallow copy: the protein array and the
#                                     filename of the other object are shared
# Anything that is not a Fasta or an Array (and is not nil/false) is treated
# as a filename. The optional second argument presets the :filename
# attribute. The resulting protein array is what the object delegates to.
def initialize(arg=nil, filename=nil)
  @filename = filename
  @prots = []
  if arg.is_a?(Fasta)
    self.prots = arg.prots
    self.filename = arg.filename
  elsif arg.is_a?(Array)
    @prots = arg
  elsif arg
    read_file(arg)
  end
  super(@prots)
end
82
+
83
# Returns the MD5 hex digest of the file named by the :filename attribute.
# Returns nil when no filename is set or the file does not exist.
#
# The file is read in binary mode so that the digest reflects the bytes on
# disk regardless of platform newline translation.
def md5_sum
  return nil unless @filename && File.exist?(@filename)

  File.open(@filename, 'rb') { |io| Digest::MD5.hexdigest(io.read) }
end
49
91
 
92
# Total number of amino acids represented: the sum of aaseq.size over every
# protein yielded by each.
def aa_seq_length
  total = 0
  each { |prot| total += prot.aaseq.size }
  total
end
101
+
102
# Searches the proteins for exact matches to the given amino acid sequence.
#
# Returns an Array of the headers (with '>' and no newline) of every protein
# whose aaseq equals the argument, or nil if there are no matches.
#
# NOTE(review): the earlier comment promised "a single protein header", but
# the code has always returned an Array even for one match; that return
# shape is preserved here -- confirm with callers before changing it.
def header_from_exact_sequence(aaseq)
  matches = hash_by(:aaseq)[aaseq]
  # the lookup gives nil (not an empty list) when the sequence is absent
  return nil if matches.nil? || matches.empty?

  matches.map { |prot| prot.header }
end
116
+
117
# Tells whether the input string occurs inside any protein header.
# Returns true as soon as one header includes it, false otherwise.
# (Remember that headers are stored without newline chars but do contain
# the beginning '>'.)
def included_in_header?(input)
  @prots.each do |prot|
    return true if prot.header.include?(input)
  end
  false
end
126
+
127
# Takes an io object whose lines are fasta data and replaces the current
# proteins with the ones read from it. This is not as stringent as
# 'read_file', which is recommended for industrial type use. If you have a
# string, simply pass in StringIO.new(your_string) to be read.
#
# A line starting with '>' begins a new protein (the chomped line becomes its
# header); every other non-blank line is chomped and appended to the current
# protein's sequence. Blank and whitespace-only lines are skipped, and a
# final line without a trailing newline is kept even if it is one character.
#
# Raises ArgumentError if sequence data appears before the first header.
# Returns self.
def load(io)
  current_aaseq = nil
  @prots.clear
  io.each do |line|
    if line[0, 1] == '>'
      # the protein keeps a reference to this string, so appending to it
      # below grows the protein's sequence in place
      current_aaseq = ''.dup
      prot = Prot.new
      @prots << prot
      prot.header = line.chomp
      prot.aaseq = current_aaseq
    else
      residues = line.chomp
      next if residues.strip.empty?
      unless current_aaseq
        raise ArgumentError, "fasta sequence data found before any '>' header line"
      end
      current_aaseq << residues
    end
  end
  self
end
150
+
151
# Builds a Fasta object from a string of fasta data by wrapping the string
# in a StringIO and passing it to 'load' on a fresh Fasta.
# NOTE(review): relies on StringIO already being loaded -- confirm that
# 'stringio' is required somewhere before this is called.
def self.from_string(string)
  fasta = Fasta.new
  fasta.load(StringIO.new(string))
end
155
+
50
156
  # Reads fasta files (under windows or unix newlines)
51
157
  # Always outputs LF separated files
52
158
  # Checks that the first character per line is '>' or character class [A-Za-z*]
53
159
  # returns a fasta object for stringing commands
54
- def read_file(fn)
160
+ # if fn not given, will read the :filename attribute
161
+ # will set :filename to fn is given
162
+ def read_file(fn=nil)
163
+ @filename = fn if fn
55
164
  first_char_re = /[A-Za-z*]/o
56
165
  obj = nil
57
166
  regex = /(\r\n)|\n/o
@@ -76,14 +185,151 @@ class Fasta
76
185
  self
77
186
  end
78
187
 
79
- # Returns filename with the extension (including the '.' prefixed with
80
- # the extension_prefix (given as a string)
81
- def self.prefix_extension(filename, extension_prefix)
82
- ext = File.extname(filename)
83
- ext_regex = /#{Regexp.escape(ext)}$/o
84
- new_filename = filename.gsub(ext_regex, extension_prefix + ext)
188
# Writes every protein (via its to_s) to the given file, opened in binary
# mode. If no fn is given, writes to the :filename attribute.
#
# Raises ArgumentError when neither fn nor :filename is available.
def write_file(fn=nil)
  # fall back to the object's own filename (previously read an unset ivar)
  fn ||= @filename
  raise ArgumentError, "no filename given and :filename attribute is not set" unless fn

  File.open(fn, "wb") do |out|
    @prots.each do |prot|
      out.print(prot.to_s)
    end
  end
end
197
+
198
# Deep copy: builds a new object of the same class, carries over the
# filename, and fills it with a dup of every protein.
def dup
  copy = self.class.new
  copy.filename = filename
  prots.each { |prot| copy.prots << prot.dup }
  copy
end
207
+
208
+ end
209
+
210
# Creates decoy fasta files by reversing or shuffling protein sequences,
# optionally prefixing headers, sampling a fraction of the proteins and
# concatenating the result onto a copy of the original.
class FastaShaker
  # The only method names that may be requested by name from the command line.
  SHAKE_METHODS = %w(reverse shuffle).freeze

  # Reverses the sequences of the given fasta (filename or Fasta object) and
  # writes the result; see shake_it for the recognised opts.
  def reverse(fasta_file_or_obj, opts={})
    shake_it(:reverse, fasta_file_or_obj, opts)
  end

  # Shuffles the sequences of the given fasta (filename or Fasta object) and
  # writes the result; see shake_it for the recognised opts.
  def shuffle(fasta_file_or_obj, opts={})
    shake_it(:shuffle, fasta_file_or_obj, opts)
  end

  # Builds a descriptive output filename from the fasta's filename (or
  # 'fasta' if it has none), the method name and the options in use.
  #
  # Everything from the first '.' of the file's basename onward is dropped;
  # any directory part is kept untouched, so dots in directory names (or a
  # leading './') no longer truncate the path.
  def create_filename(fasta, method, opts={})
    file = fasta.filename || 'fasta'
    base = File.basename(file)
    stem = base.sub(/\..*$/, '')
    # swap only the trailing basename for its stem (block form so the
    # replacement text is taken literally)
    filebase = file.sub(/#{Regexp.escape(base)}\z/) { stem }
    parts = [filebase]
    parts << 'cat' if opts[:cat]
    parts << method
    parts << 'prefix' << opts[:prefix] if opts[:prefix]
    parts << 'fraction' << opts[:fraction] if opts[:fraction]
    parts << 'tryptic_peptides' if opts[:tryptic_peptides]
    parts.join("_") << ".fasta"
  end

  # takes command line input, and sends it to shake
  #
  # argv is parsed destructively; what remains must be <method> <file>.
  # Prints the usage and exits when fewer than two arguments remain or when
  # <method> is not one of SHAKE_METHODS (the name is checked before being
  # dispatched so arbitrary methods cannot be invoked).
  def self.shake_from_argv(argv)
    opt = {}

    opts = OptionParser.new do |op|
      prog = File.basename(__FILE__)
      op.banner = "USAGE: #{prog} <method> [OPTIONS] <file>.fasta"
      op.separator " <method> = reverse | shuffle"
      op.separator ""
      op.separator "fasta_shaker is kind of like a salt shaker:"
      op.separator "shake up your fasta proteins and let them"
      op.separator "season your dinner (hopefully a protein dinner). Mmmm."
      op.separator "false identification rates never tasted so good :)"
      op.separator ""
      op.on("-c", "--cat", "catenates the output to copy of original") {|v| opt[:cat] = v }
      op.on("-o", "--out <string>", "name of output file (default is descriptive)") {|v| opt[:out] = v }
      op.on("-p", "--prefix <string>", "give a header prefix to modified prots") {|v| opt[:prefix] = v }
      op.on("-f", "--fraction <float>", Float, "creates some fraction of proteins") {|v| opt[:fraction] = v }
      op.separator " [if fraction > 1 then the tag 'f<frac#>_' prefixed to proteins"
      op.separator " (after any given prefix) so that proteins are unique]"
      op.on("--tryptic_peptides", "applies method to [KR][^P] peptides") {|v| opt[:tryptic_peptides] = v }

      op.separator ""
      op.separator "EXAMPLES: "
      op.separator " #{prog} reverse file.fasta -o protein_aa_sequence_reversed.fasta"
      op.separator " #{prog} shuffle file.fasta -o protein_aa_sequence_shuffled.fasta"
      op.separator " #{prog} shuffle file.fasta -c -p SH_ -o normal_cat_shuffled_with_prefix.fasta"
      op.separator " #{prog} reverse file.fasta --tryptic_peptides tryptic_peptides_reversed.fasta"
    end

    opts.parse!(argv)

    if argv.size < 2
      puts opts
      exit
    end

    (method, file) = argv
    unless SHAKE_METHODS.include?(method)
      warn "unknown method '#{method}' (expected: #{SHAKE_METHODS.join(' | ')})"
      puts opts
      exit 1
    end

    fs = FastaShaker.new
    fs.send(method.to_sym, file, opt)
  end

  protected

  # Does the actual work for reverse/shuffle.
  #
  # Recognised keys of opt:
  #   :cat              append the modified proteins to a copy of the original
  #   :prefix           string passed to header_prefix! on the modified proteins
  #   :fraction         passed to fraction_of_prots; above 1.0 a "f<cnt>_"
  #                     prefix proc is supplied as well
  #   :tryptic_peptides passed through to aaseq!
  #   :out              output filename; when absent it is filled in (opt is
  #                     modified) from create_filename
  #
  # When a Fasta object is passed and neither :cat nor :fraction is given,
  # that same object is the one being modified.
  def shake_it(method, fasta_file_or_obj, opt)
    fasta = Fasta.to_fasta(fasta_file_or_obj)
    if opt[:cat] && !opt[:prefix]
      warn "WARNING: concatenated proteins don't have unique headers\n[you probably wanted to use the '--prefix' option!]"
    end

    opt[:out] ||= create_filename(fasta, method, opt)

    ## CAT (save an original copy)
    fasta_orig = fasta.dup if opt[:cat]

    ## FRACTION the proteins
    fraction = opt[:fraction]
    if fraction
      counter_prefix = fraction > 1.0 ? proc { |cnt| "f#{cnt}_" } : nil
      fasta = fasta.fraction_of_prots(fraction, counter_prefix)
    end

    ## PREFIX the proteins
    fasta.header_prefix!(opt[:prefix]) if opt[:prefix]

    ## MODIFY the proteins
    fasta.aaseq!("#{method}!".to_sym, opt[:tryptic_peptides])

    ## CAT (finish it up)
    if opt[:cat]
      fasta_orig << fasta
      fasta = fasta_orig
    end

    ## WRITE out the file
    fasta.write_file(opt[:out])
  end
end
330
+
331
+ module FastaManipulation
332
+
87
333
  # concatenates the filenames like this:
88
334
  # cat_filenames('fn1.ext1', 'fn2.ext2', '__') # -> 'fn1__fn2.ext1'
89
335
  # the path and extension of the first filename are kept intact.
@@ -99,18 +345,6 @@ class Fasta
99
345
  fn1.gsub(/#{Regexp.escape(fn1_ext)}$/, connector + con_filenames + fn1_ext)
100
346
  end
101
347
 
102
- # Convenience method for creating a modified file with a particular method
103
- # from Fasta. Returns the name of the output file.
104
- def self.modify_file(file, method, file_postfix="", prot_header_prefix=nil)
105
- file_out = prefix_extension(file, file_postfix)
106
- fasta = Fasta.new
107
- fasta.read_file(file)
108
- fasta.send(method)
109
- fasta.header_prefix!(prot_header_prefix) if prot_header_prefix
110
- fasta.write_file(file_out)
111
- file_out
112
- end
113
-
114
348
  # returns a new fasta object using some fraction of proteins randomly
115
349
  # selected (fraction may be > 1). Always rounds up. Will not choose a
116
350
  # protein twice unless all other proteins have been chosen
@@ -150,30 +384,6 @@ class Fasta
150
384
  fasta_fraction = Fasta.new(arr)
151
385
  end
152
386
 
153
- # Convenience method for modifying some fraction of the proteins of a file
154
- # and concatenating it to a copy of the original. Returns the name of the
155
- # output file.
156
- def self.modify_fraction_and_cat_to_file(file, method, fraction=1, file_postfix=nil, prot_header_prefix=nil)
157
- #puts [file, method, fraction, file_postfix, prot_header_prefix].join("*")
158
- file_postfix = "" unless file_postfix
159
- fasta = Fasta.new
160
- fasta.read_file(file)
161
- outfile = prefix_extension(file, file_postfix)
162
- other_fasta = fasta.fraction_of_prots(fraction)
163
- other_fasta.send(method)
164
- other_fasta.header_prefix!(prot_header_prefix) if prot_header_prefix
165
- fasta << other_fasta
166
- fasta.write_file(outfile)
167
- return outfile
168
- end
169
-
170
- # Convenience method for modifying a file and concatenating it to a copy of
171
- # the original. Returns th name of the output file.
172
- def self.modify_and_cat_to_file(file, method, file_postfix=nil, prot_header_prefix=nil)
173
- fraction = 1
174
- modify_fraction_and_cat_to_file(file, method, fraction, file_postfix, prot_header_prefix)
175
- end
176
-
177
387
  # Convenience method to concatenate an array of fasta files. Filenames are
178
388
  # concatenated according to 'cat_filenames') and prefixes the proteins
179
389
  # according to the values in 'file_prot_header_prefixes' array
@@ -196,35 +406,13 @@ class Fasta
196
406
  end
197
407
 
198
408
  def <<(other)
199
- @prots.push(*(other.prots))
200
- end
201
-
202
- # @TODO: this should be in terms of sets, right now depends on order!!
203
- def ==(other)
204
- other_prots = other.prots
205
- @prots.each_with_index do |prot, index|
206
- if other_prots[index] != prot
207
- return false
208
- end
209
- end
210
- return true
211
- end
212
-
213
- def write_file(fn)
214
- File.open(fn, "wb") do |out|
215
- @prots.each do |prot|
216
- out.print(prot.to_s)
217
- end
218
- end
219
- end
220
-
221
- # duplicates the object (deep copy)
222
- def dup
223
- other = self.class.new
224
- self.prots.each do |prot|
225
- other.prots << prot.dup
409
+ # case when with class names uses === operator
410
+ case other
411
+ when Fasta
412
+ @prots.push(*(other.prots))
413
+ when Fasta::Prot
414
+ @prots.push(other)
226
415
  end
227
- other
228
416
  end
229
417
 
230
418
  # method = :shuffle! | :reverse!
@@ -260,10 +448,39 @@ class Fasta
260
448
  prot.header_prefix!(prefix)
261
449
  end
262
450
  end
263
-
451
+
452
+ end
453
+
454
# requires that object respond_to? :reference
module ProteinReferenceable
  # Gives the reference string up to (not including) the first space, after
  # stripping leading whitespace. Always a new string.
  #
  # Returns nil when reference is nil and '' when it is empty or blank. A
  # one-character reference is returned as is (it used to come back as '').
  def first_entry
    ref = reference
    return nil unless ref

    # lstrip yields a fresh string; the pattern takes everything before the
    # first space (possibly nothing)
    ref.lstrip[/\A[^ ]*/]
  end
end
265
477
 
478
+
479
+
480
+
266
481
  class Fasta::Prot
482
+ include ProteinReferenceable
483
+
267
484
  # header given as full line with starting '>' (but no newline chars!).
268
485
  # aaseq also given without any newline chars
269
486
  attr_accessor :header, :aaseq
@@ -280,15 +497,30 @@ class Fasta::Prot
280
497
  other && other.class == self.class && other.aaseq == self.aaseq && other.header == self.header
281
498
  end
282
499
 
283
- # returns the fasta header information without the leading '>'
284
- def reference
285
- if @header =~ /^>(.*)/
286
- $1.dup
500
# Gives the part of the header between the leading '>' and the first space
# (or the end of the header if it has no space).
# Returns nil when there is no header and '' when the header is at most one
# character long.
def first_entry
  return nil unless @header
  return '' unless @header.size > 1

  stop = @header.index(' ')
  stop ? @header[1...stop] : @header[1..-1]
end
291
518
 
519
# Returns the fasta header information without the leading '>' (that is,
# everything after the header's first character).
# Returns nil when no header has been set instead of raising.
def reference
  @header && @header[1..-1]
end
523
+
292
524
  # returns the value after the first '|' and before the second '|'
293
525
  # according to this regexp: /\|(.*?)\|/
294
526
  # This will typically be the gi code
@@ -314,7 +546,7 @@ class Fasta::Prot
314
546
  def tryptic_peptides!(method_as_symbol)
315
547
  peps = SampleEnzyme.tryptic(@aaseq)
316
548
  ends_in_RK = /[KR]/o
317
-
549
+
318
550
  ## if the last peptide doesn't end in R or K we want to flip it completely
319
551
  last_pep_special = nil
320
552
  if peps.last[-1,1] !~ /[KR]/
@@ -360,7 +592,7 @@ class Fasta::Prot
360
592
 
361
593
  end
362
594
 
363
-
595
+
364
596
  # For reference, my code is about 15X faster than the first code I wrote
365
597
  # below! It turns out that the major slowdown is in the randomize routine.
366
598
  # Using my own randomize routine with the below way of reading fasta
@@ -391,4 +623,4 @@ end
391
623
  # end
392
624
  # end
393
625
  #end
394
-
626
+ by=:protein, num=1
data/lib/group_by.rb ADDED
@@ -0,0 +1,10 @@
1
+
2
# taken from rails; Ruby itself ships Enumerable#group_by from 1.8.7 on
module Enumerable
  # Only fill the gap on Rubies that lack group_by: redefining it here would
  # replace the built-in version (which also works without a block).
  unless method_defined?(:group_by)
    # Groups the elements by the value the block returns for each of them.
    # Returns a Hash mapping each block result to the Array of elements
    # (in iteration order) that produced it.
    def group_by
      inject({}) do |groups, element|
        (groups[yield(element)] ||= []) << element
        groups
      end
    end
  end
end
data/lib/index_by.rb ADDED
@@ -0,0 +1,11 @@
1
+
2
# taken from rails (will be in Ruby 1.9??)

module Enumerable
  # Defined only when nothing else has provided index_by already, so an
  # existing implementation is not overwritten by loading this file.
  unless method_defined?(:index_by)
    # Builds a Hash keyed by the block's result for each element; when two
    # elements produce the same key the later one wins.
    def index_by
      indexed = {}
      each { |elem| indexed[yield(elem)] = elem }
      indexed
    end
  end
end
data/lib/merge_deep.rb ADDED
@@ -0,0 +1,21 @@
1
+
2
class Hash

  # Returns a new hash that is this hash merged with hash2, where hashes
  # found under the same key in both are merged too, down to the level
  # specified (values from hash2 win everywhere else).
  #
  #   level 1 (default): hashes directly inside the two hashes are merged
  #   level n:           hashes nested up to n levels deep are merged
  #
  # Neither receiver nor argument is modified.
  # Raises ArgumentError if level is less than 1.
  def merge_deep(hash2, level=1)
    raise ArgumentError, "level must be >= 1 (got #{level.inspect})" if level < 1

    merge(hash2) do |_key, mine, theirs|
      if mine.is_a?(Hash) && theirs.is_a?(Hash)
        # descend while levels remain; the last level is a plain merge
        level > 1 ? mine.merge_deep(theirs, level - 1) : mine.merge(theirs)
      else
        theirs
      end
    end
  end
end
21
+