mspire 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
data/lib/bsearch.rb ADDED
@@ -0,0 +1,120 @@
1
+ #
2
+ # Ruby/Bsearch - a binary search library for Ruby.
3
+ #
4
+ # Copyright (C) 2001 Satoru Takabayashi <satoru@namazu.org>
5
+ # All rights reserved.
6
+ # This is free software with ABSOLUTELY NO WARRANTY.
7
+ #
8
+ # You can redistribute it and/or modify it under the terms of
9
+ # the Ruby's licence.
10
+ #
11
+ # Example:
12
+ #
13
+ # % irb -r ./bsearch.rb
14
+ # >> %w(a b c c c d e f).bsearch_first {|x| x <=> "c"}
15
+ # => 2
16
+ # >> %w(a b c c c d e f).bsearch_last {|x| x <=> "c"}
17
+ # => 4
18
+ # >> %w(a b c e f).bsearch_first {|x| x <=> "c"}
19
+ # => 2
20
+ # >> %w(a b e f).bsearch_first {|x| x <=> "c"}
21
+ # => nil
22
+ # >> %w(a b e f).bsearch_last {|x| x <=> "c"}
23
+ # => nil
24
+ # >> %w(a b e f).bsearch_lower_boundary {|x| x <=> "c"}
25
+ # => 2
26
+ # >> %w(a b e f).bsearch_upper_boundary {|x| x <=> "c"}
27
+ # => 2
28
+ # >> %w(a b c c c d e f).bsearch_range {|x| x <=> "c"}
29
+ # => 2...5
30
+ # >> %w(a b c d e f).bsearch_range {|x| x <=> "c"}
31
+ # => 2...3
32
+ # >> %w(a b d e f).bsearch_range {|x| x <=> "c"}
33
+ # => 2...2
34
+
35
+ module Bsearch
36
+ VERSION = '1.5'
37
+ end
38
+
39
+ class Array
40
+ #
41
+ # The binary search algorithm is extracted from Jon Bentley's
42
+ # Programming Pearls 2nd ed. p.93
43
+ #
44
+
45
+ #
46
+ # Return the lower boundary. (inside)
47
+ #
48
+ def bsearch_lower_boundary (range = 0 ... self.length, &block)
49
+ lower = range.first() -1
50
+ upper = if range.exclude_end? then range.last else range.last + 1 end
51
+ while lower + 1 != upper
52
+ mid = ((lower + upper) / 2).to_i # for working with mathn.rb (Rational)
53
+ if yield(self[mid]) < 0
54
+ lower = mid
55
+ else
56
+ upper = mid
57
+ end
58
+ end
59
+ return upper
60
+ end
61
+
62
+ #
63
+ # This method searches the FIRST occurrence which satisfies a
64
+ # condition given by a block in binary fashion and return the
65
+ # index of the first occurrence. Return nil if not found.
66
+ #
67
+ def bsearch_first (range = 0 ... self.length, &block)
68
+ boundary = bsearch_lower_boundary(range, &block)
69
+ if boundary >= self.length || yield(self[boundary]) != 0
70
+ return nil
71
+ else
72
+ return boundary
73
+ end
74
+ end
75
+
76
+ alias bsearch bsearch_first
77
+
78
+ #
79
+ # Return the upper boundary. (outside)
80
+ #
81
+ def bsearch_upper_boundary (range = 0 ... self.length, &block)
82
+ lower = range.first() -1
83
+ upper = if range.exclude_end? then range.last else range.last + 1 end
84
+ while lower + 1 != upper
85
+ mid = ((lower + upper) / 2).to_i # for working with mathn.rb (Rational)
86
+ if yield(self[mid]) <= 0
87
+ lower = mid
88
+ else
89
+ upper = mid
90
+ end
91
+ end
92
+ return lower + 1 # outside of the matching range.
93
+ end
94
+
95
+ #
96
+ # This method searches the LAST occurrence which satisfies a
97
+ # condition given by a block in binary fashion and return the
98
+ # index of the last occurrence. Return nil if not found.
99
+ #
100
+ def bsearch_last (range = 0 ... self.length, &block)
101
+ # `- 1' for canceling `lower + 1' in bsearch_upper_boundary.
102
+ boundary = bsearch_upper_boundary(range, &block) - 1
103
+
104
+ if (boundary <= -1 || yield(self[boundary]) != 0)
105
+ return nil
106
+ else
107
+ return boundary
108
+ end
109
+ end
110
+
111
+ #
112
+ # Return the search result as a Range object.
113
+ #
114
+ def bsearch_range (range = 0 ... self.length, &block)
115
+ lower = bsearch_lower_boundary(range, &block)
116
+ upper = bsearch_upper_boundary(range, &block)
117
+ return lower ... upper
118
+ end
119
+ end
120
+
data/lib/fasta.rb CHANGED
@@ -1,5 +1,9 @@
1
1
  require 'sample_enzyme'
2
2
  require 'each_index'
3
+ require 'optparse'
4
+ require 'delegate'
5
+ require 'hash_by'
6
+ require 'digest/md5'
3
7
 
4
8
 
5
9
  tmp = $VERBOSE ; $VERBOSE = nil
@@ -27,8 +31,10 @@ end
27
31
  $VERBOSE = tmp
28
32
 
29
33
 
34
+ module FastaManipulation ; end
30
35
 
31
- class Fasta
36
+ class Fasta < DelegateClass(Array)
37
+ include FastaManipulation
32
38
  SHUFF_PREFIX = "SHUFF_"
33
39
  SHUFF_FILE_POSTFIX = "_SHUFF"
34
40
  CAT_SHUFF_FILE_POSTFIX = "_CAT_SHUFF"
@@ -37,21 +43,124 @@ class Fasta
37
43
  INV_FILE_POSTFIX = "_INV"
38
44
  CAT_INV_FILE_POSTFIX = "_CAT_INV"
39
45
 
40
- attr_accessor :prots
46
+ attr_writer :prots
47
+ # this will probably be relative
48
+ attr_accessor :filename
41
49
 
42
- def initialize(prots=nil)
43
- if prots
44
- @prots = prots
50
+ # for backwards compatibility
51
+ def prots
52
+ @prots
53
+ end
54
+
55
+ def self.to_fasta(file_or_obj)
56
+ if file_or_obj.is_a? Fasta
57
+ file_or_obj
58
+ else
59
+ Fasta.new(file_or_obj)
60
+ end
61
+ end
62
+
63
+ # arg can be:
64
+ # Fasta::Prot objects (Array)
65
+ # filename (String)
66
+ # Another Fasta object (Fasta) (shallow copy!)
67
+ def initialize(arg=nil, filename=nil)
68
+ @filename = filename
69
+ @prots = []
70
+ if arg
71
+ if arg.is_a? Fasta
72
+ self.prots = arg.prots
73
+ self.filename = arg.filename
74
+ elsif arg.is_a? Array
75
+ @prots = arg
76
+ else
77
+ read_file(arg)
78
+ end
79
+ end
80
+ super(@prots)
81
+ end
82
+
83
+ # uses the filename (if available, otherwise returning nil) to grab the md5 sum of the file
84
+ def md5_sum
85
+ if File.exist?(@filename)
86
+ Digest::MD5.hexdigest(File.read(@filename))
45
87
  else
46
- @prots = []
88
+ nil
47
89
  end
48
90
  end
49
91
 
92
+ # returns the length of the file (in terms of the total number of amino
93
+ # acids represented)
94
+ def aa_seq_length
95
+ tot = 0
96
+ self.each do |prot|
97
+ tot += prot.aaseq.size
98
+ end
99
+ tot
100
+ end
101
+
102
+ # searches proteins for a match to the exact sequence and returns a single
103
+ # protein header (with > & no newline)
104
+ # exact matches). nil if no matches
105
+ def header_from_exact_sequence(aaseq)
106
+ hash = self.hash_by(:aaseq)
107
+ answ = hash[aaseq].map{|v| v.header}
108
+ if answ.size == 1
109
+ answ
110
+ elsif answ.size == 0
111
+ nil
112
+ else
113
+ answ
114
+ end
115
+ end
116
+
117
+ # searches all headers to see if they include input string
118
+ # returns true if one matches, false otherwise
119
+ # (remember that headers are not stored with newline chars but do contain
120
+ # beginning '>'
121
+ def included_in_header?(input)
122
+ @prots.any? do |prot|
123
+ prot.header.include? input
124
+ end
125
+ end
126
+
127
+ # takes an io object or string (which is the fasta data) This is not as
128
+ # stringent as 'read_file' which is recommended for industrial type use. For
129
+ # instance, this will fail if your newlines are different in your file from
130
+ # those defined on your operating system. If you have a string, simply pass
131
+ # in StringIO.new(your_string) to be read.
132
+ # returns self
133
+ def load(io)
134
+ current_prot = nil
135
+ current_aaseq = nil
136
+ @prots.clear
137
+ io.each do |line|
138
+ if line[0,1] == '>'
139
+ current_prot = Prot.new
140
+ @prots << current_prot
141
+ current_prot.header = line.chomp
142
+ current_aaseq = ''
143
+ current_prot.aaseq = current_aaseq
144
+ elsif (line =~ /[^ ]/) && (line.size > 1)
145
+ current_aaseq << line.chomp
146
+ end
147
+ end
148
+ self
149
+ end
150
+
151
+ # uses 'load' to create a fasta object from a fasta string
152
+ def self.from_string(string)
153
+ Fasta.new.load(StringIO.new(string))
154
+ end
155
+
50
156
  # Reads fasta files (under windows or unix newlines)
51
157
  # Always outputs LF separated files
52
158
  # Checks that the first character per line is '>' or character class [A-Za-z*]
53
159
  # returns a fasta object for stringing commands
54
- def read_file(fn)
160
+ # if fn not given, will read the :filename attribute
161
+ # will set :filename to fn is given
162
+ def read_file(fn=nil)
163
+ @filename = fn if fn
55
164
  first_char_re = /[A-Za-z*]/o
56
165
  obj = nil
57
166
  regex = /(\r\n)|\n/o
@@ -76,14 +185,151 @@ class Fasta
76
185
  self
77
186
  end
78
187
 
79
- # Returns filename with the extension (including the '.' prefixed with
80
- # the extension_prefix (given as a string)
81
- def self.prefix_extension(filename, extension_prefix)
82
- ext = File.extname(filename)
83
- ext_regex = /#{Regexp.escape(ext)}$/o
84
- new_filename = filename.gsub(ext_regex, extension_prefix + ext)
188
+ # if no fn, will write to :filename attribute
189
+ def write_file(fn=nil)
190
+ fn = @out unless fn
191
+ File.open(fn, "wb") do |out|
192
+ @prots.each do |prot|
193
+ out.print(prot.to_s)
194
+ end
195
+ end
196
+ end
197
+
198
+ # duplicates the object (deep copy)
199
+ def dup
200
+ other = self.class.new
201
+ other.filename = self.filename
202
+ self.prots.each do |prot|
203
+ other.prots << prot.dup
204
+ end
205
+ other
206
+ end
207
+
208
+ end
209
+
210
+ class FastaShaker
211
+
212
+ def reverse(fasta_file_or_obj, opts={})
213
+ shake_it(:reverse, fasta_file_or_obj, opts)
214
+ end
215
+
216
+ def shuffle(fasta_file_or_obj, opts={})
217
+ shake_it(:shuffle, fasta_file_or_obj, opts)
218
+ end
219
+
220
+ # sets the outbound filename attribute from opts
221
+ def create_filename(fasta, method, opts={})
222
+ file = fasta.filename || 'fasta'
223
+ filebase = file.sub(/\..*$/,'')
224
+ parts = [filebase]
225
+ parts << 'cat' if opts[:cat]
226
+ parts << method
227
+ parts << 'prefix' << opts[:prefix] if opts[:prefix]
228
+ parts << 'fraction' << opts[:fraction] if opts[:fraction]
229
+ parts << 'tryptic_peptides' if opts[:tryptic_peptides]
230
+ parts.join("_") << ".fasta"
231
+ end
232
+
233
+ protected
234
+ def shake_it(method, fasta_file_or_obj, opt)
235
+ fasta = Fasta.to_fasta(fasta_file_or_obj)
236
+ if opt[:cat] && !opt[:prefix]
237
+ message = "WARNING: concatenated proteins don't have unique headers\n[you probably wanted to use the '--prefix' option!]"
238
+ warn message
239
+ end
240
+
241
+ unless opt[:out]
242
+ opt[:out] = create_filename(fasta, method, opt)
243
+ end
244
+
245
+ ## CAT (save an original copy)
246
+ fasta_orig = fasta.dup if opt[:cat]
247
+
248
+ ## FRACTION the proteins
249
+ if f = opt[:fraction]
250
+ prefix = nil
251
+ if f > 1.0
252
+ prefix = proc {|cnt| "f#{cnt}_" }
253
+ end
254
+ fasta = fasta.fraction_of_prots(f, prefix)
255
+ end
256
+
257
+ ## PREFIX the proteins
258
+ if pre = opt[:prefix]
259
+ fasta.header_prefix!(pre)
260
+ end
261
+
262
+ ## MODIFY the proteins
263
+ fasta.aaseq!((method.to_s + '!').to_sym, opt[:tryptic_peptides])
264
+
265
+ ## CAT (finish it up)
266
+ if opt[:cat]
267
+ fasta_orig << fasta
268
+ fasta = fasta_orig
269
+ end
270
+
271
+ ## WRITE out the file
272
+ fasta.write_file(opt[:out])
273
+ end
274
+
275
+
276
+
277
+
278
+ #############################################
279
+ # END MAIN METHODS
280
+ #############################################
281
+
282
+ # takes command line input, and sends it to shake
283
+ def FastaShaker.shake_from_argv(argv)
284
+ opt = {}
285
+
286
+ opts = OptionParser.new do |op|
287
+ prog = File.basename(__FILE__)
288
+ op.banner = "USAGE: #{prog} <method> [OPTIONS] <file>.fasta"
289
+ op.separator " <method> = reverse | shuffle"
290
+ op.separator ""
291
+ op.separator "fasta_shaker is kind of like a salt shaker:"
292
+ op.separator "shake up your fasta proteins and let them"
293
+ op.separator "season your dinner (hopefully a protein dinner). Mmmm."
294
+ op.separator "false identification rates never tasted so good :)"
295
+ op.separator ""
296
+ op.on("-c", "--cat", "catenates the output to copy of original") {|v| opt[:cat] = v }
297
+ op.on("-o", "--out <string>", "name of output file (default is descriptive)") {|v| opt[:out] = v }
298
+ op.on("-p", "--prefix <string>", "give a header prefix to modified prots") {|v| opt[:prefix] = v }
299
+ op.on("-f", "--fraction <float>", Float, "creates some fraction of proteins") {|v| opt[:fraction] = v }
300
+ op.separator " [if fraction > 1 then the tag 'f<frac#>_' prefixed to proteins"
301
+ op.separator " (after any given prefix) so that proteins are unique]"
302
+ op.on("--tryptic_peptides", "applies method to [KR][^P] peptides") {|v| opt[:tryptic_peptides] = v }
303
+
304
+ op.separator ""
305
+ op.separator "EXAMPLES: "
306
+ op.separator " #{prog} reverse file.fasta -o protein_aa_sequence_reversed.fasta"
307
+ op.separator " #{prog} shuffle file.fasta -o protein_aa_sequence_shuffled.fasta"
308
+ op.separator " #{prog} shuffle file.fasta -c -p SH_ -o normal_cat_shuffled_with_prefix.fasta"
309
+ op.separator " #{prog} reverse file.fasta --tryptic_peptides tryptic_peptides_reversed.fasta"
310
+ end
311
+
312
+ #p argv
313
+ opts.parse!(argv)
314
+
315
+ if argv.size < 2
316
+ puts opts
317
+ exit
318
+ end
319
+
320
+ (method, file) = argv
321
+ fs = FastaShaker.new
322
+ fs.send(method.to_sym, file, opt)
85
323
  end
86
324
 
325
+ private
326
+
327
+
328
+
329
+ end
330
+
331
+ module FastaManipulation
332
+
87
333
  # concatenates the filenames like this:
88
334
  # cat_filenames('fn1.ext1', 'fn2.ext2', '__') # -> 'fn1__fn2.ext1'
89
335
  # the path and extension of the first filename are kept intact.
@@ -99,18 +345,6 @@ class Fasta
99
345
  fn1.gsub(/#{Regexp.escape(fn1_ext)}$/, connector + con_filenames + fn1_ext)
100
346
  end
101
347
 
102
- # Convenience method for creating a modified file with a particular method
103
- # from Fasta. Returns the name of the output file.
104
- def self.modify_file(file, method, file_postfix="", prot_header_prefix=nil)
105
- file_out = prefix_extension(file, file_postfix)
106
- fasta = Fasta.new
107
- fasta.read_file(file)
108
- fasta.send(method)
109
- fasta.header_prefix!(prot_header_prefix) if prot_header_prefix
110
- fasta.write_file(file_out)
111
- file_out
112
- end
113
-
114
348
  # returns a new fasta object using some fraction of proteins randomly
115
349
  # selected (fraction may be > 1). Always rounds up. Will not choose a
116
350
  # protein twice unless all other proteins have been chosen
@@ -150,30 +384,6 @@ class Fasta
150
384
  fasta_fraction = Fasta.new(arr)
151
385
  end
152
386
 
153
- # Convenience method for modifying some fraction of the proteins of a file
154
- # and concatenating it to a copy of the original. Returns the name of the
155
- # output file.
156
- def self.modify_fraction_and_cat_to_file(file, method, fraction=1, file_postfix=nil, prot_header_prefix=nil)
157
- #puts [file, method, fraction, file_postfix, prot_header_prefix].join("*")
158
- file_postfix = "" unless file_postfix
159
- fasta = Fasta.new
160
- fasta.read_file(file)
161
- outfile = prefix_extension(file, file_postfix)
162
- other_fasta = fasta.fraction_of_prots(fraction)
163
- other_fasta.send(method)
164
- other_fasta.header_prefix!(prot_header_prefix) if prot_header_prefix
165
- fasta << other_fasta
166
- fasta.write_file(outfile)
167
- return outfile
168
- end
169
-
170
- # Convenience method for modifying a file and concatenating it to a copy of
171
- # the original. Returns th name of the output file.
172
- def self.modify_and_cat_to_file(file, method, file_postfix=nil, prot_header_prefix=nil)
173
- fraction = 1
174
- modify_fraction_and_cat_to_file(file, method, fraction, file_postfix, prot_header_prefix)
175
- end
176
-
177
387
  # Convenience method to concatenate an array of fasta files. Filenames are
178
388
  # concatenated according to 'cat_filenames') and prefixes the proteins
179
389
  # according to the values in 'file_prot_header_prefixes' array
@@ -196,35 +406,13 @@ class Fasta
196
406
  end
197
407
 
198
408
  def <<(other)
199
- @prots.push(*(other.prots))
200
- end
201
-
202
- # @TODO: this should be in terms of sets, right now depends on order!!
203
- def ==(other)
204
- other_prots = other.prots
205
- @prots.each_with_index do |prot, index|
206
- if other_prots[index] != prot
207
- return false
208
- end
209
- end
210
- return true
211
- end
212
-
213
- def write_file(fn)
214
- File.open(fn, "wb") do |out|
215
- @prots.each do |prot|
216
- out.print(prot.to_s)
217
- end
218
- end
219
- end
220
-
221
- # duplicates the object (deep copy)
222
- def dup
223
- other = self.class.new
224
- self.prots.each do |prot|
225
- other.prots << prot.dup
409
+ # case when with class names uses === operator
410
+ case other
411
+ when Fasta
412
+ @prots.push(*(other.prots))
413
+ when Fasta::Prot
414
+ @prots.push(other)
226
415
  end
227
- other
228
416
  end
229
417
 
230
418
  # method = :shuffle! | :reverse!
@@ -260,10 +448,39 @@ class Fasta
260
448
  prot.header_prefix!(prefix)
261
449
  end
262
450
  end
263
-
451
+
452
+ end
453
+
454
+ # requires that object respond_to? :reference
455
+ module ProteinReferenceable
456
+ # gives the string up to the first space (without the leading '>')
457
+ def first_entry
458
+ ref = reference
459
+ if ref
460
+ if ref.size > 1
461
+ ls_ref = ref.lstrip
462
+ index = ls_ref.index(' ')
463
+ if index
464
+ ls_ref[0...index]
465
+ else
466
+ ls_ref.dup
467
+ end
468
+ else
469
+ ''
470
+ end
471
+ else
472
+ nil
473
+ end
474
+ end
475
+
264
476
  end
265
477
 
478
+
479
+
480
+
266
481
  class Fasta::Prot
482
+ include ProteinReferenceable
483
+
267
484
  # header given as full line with starting '>' (but no newline chars!).
268
485
  # aaseq also given without any newline chars
269
486
  attr_accessor :header, :aaseq
@@ -280,15 +497,30 @@ class Fasta::Prot
280
497
  other && other.class == self.class && other.aaseq == self.aaseq && other.header == self.header
281
498
  end
282
499
 
283
- # returns the fasta header information without the leading '>'
284
- def reference
285
- if @header =~ /^>(.*)/
286
- $1.dup
500
+ # gives the string up to the first space (without the leading '>')
501
+ def first_entry
502
+
503
+ if @header
504
+ if @header.size > 1
505
+ index = @header.index(' ')
506
+ if index
507
+ @header[1...index]
508
+ else
509
+ @header[1..-1]
510
+ end
511
+ else
512
+ ''
513
+ end
287
514
  else
288
- @header
515
+ nil
289
516
  end
290
517
  end
291
518
 
519
+ # returns the fasta header information without the leading '>'
520
+ def reference
521
+ @header[1..-1]
522
+ end
523
+
292
524
  # returns the value after the first '|' and before the second '|'
293
525
  # according to this regexp: /\|(.*?)\|/
294
526
  # This will typically be the gi code
@@ -314,7 +546,7 @@ class Fasta::Prot
314
546
  def tryptic_peptides!(method_as_symbol)
315
547
  peps = SampleEnzyme.tryptic(@aaseq)
316
548
  ends_in_RK = /[KR]/o
317
-
549
+
318
550
  ## if the last peptide doesn't end in R or K we want to flip it completely
319
551
  last_pep_special = nil
320
552
  if peps.last[-1,1] !~ /[KR]/
@@ -360,7 +592,7 @@ class Fasta::Prot
360
592
 
361
593
  end
362
594
 
363
-
595
+
364
596
  # For reference, my code is about 15X faster than the first code I wrote
365
597
  # below! It turns out that the major slowdown is in the randomize routine.
366
598
  # Using my own randomize routine with the below way of reading fasta
@@ -391,4 +623,4 @@ end
391
623
  # end
392
624
  # end
393
625
  #end
394
-
626
+ by=:protein, num=1
data/lib/group_by.rb ADDED
@@ -0,0 +1,10 @@
1
+
2
+ #taken from rails, will be in Ruby 1.9
3
+ module Enumerable
4
+ def group_by
5
+ inject({}) do |groups, element|
6
+ (groups[yield(element)] ||= []) << element
7
+ groups
8
+ end
9
+ end
10
+ end
data/lib/index_by.rb ADDED
@@ -0,0 +1,11 @@
1
+
2
+ # taken from rails (will be in Ruby 1.9??)
3
+
4
+ module Enumerable
5
+ def index_by
6
+ inject({}) do |accum, elem|
7
+ accum[yield(elem)] = elem
8
+ accum
9
+ end
10
+ end
11
+ end
data/lib/merge_deep.rb ADDED
@@ -0,0 +1,21 @@
1
+
2
+ class Hash
3
+
4
+ # any hashes within the hash will also be merged to the level specifid
5
+ def merge_deep(hash2, level=1)
6
+ if level == 1
7
+ tmp_opts = {}
8
+ self.each do |k,v|
9
+ if (v.is_a?(Hash) and hash2[k].is_a?(Hash))
10
+ tmp_opts[k] = v.merge(hash2[k])
11
+ end
12
+ end
13
+ opts = self.merge(hash2)
14
+ opts.merge!(tmp_opts)
15
+ opts
16
+ else
17
+ raise NotImplementedError, "need to implement level > 1"
18
+ end
19
+ end
20
+ end
21
+