BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,400 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # rubocop: disable LineLength
29
+ module BioDSL
30
+ # == Slice aligned sequences in the stream to obtain subsequences.
31
+ #
32
+ # +slice_align+ slices an alignment to extract subsequence from all sequences
33
+ # in the stream. This is done by either specifying a range or a set of primers
34
+ # that is then used to locate the range to be sliced from the sequences.
35
+ #
36
+ # If a range is given with the +slice+ option the positions (0-based) must
36
+ # correspond to the aligned sequence, i.e. with gaps.
38
+ #
39
+ # If a set of primers are given with the +forward+ and +reverse+ options (or
40
+ # the +forward_rc+ and +reverse_rc+ options) these primers are used to locate
41
+ # the matching positions in the first entry and this range is used to slice
42
+ # this and any following sequences. It is possible to specify fuzzy primer
43
+ # matching by using the +max_mismatches+, +max_insertions+ and +max_deletions+
44
+ # options. Moreover, IUPAC ambiguity codes are allowed.
45
+ #
46
+ # It is also possible to specify a template file using the +template_file+
47
+ # option. The template file should be a file with one FASTA formatted sequence
48
+ # from the alignment (with gaps). If a template file and a range is specified
49
+ # the nucleotide positions from the ungapped template will be used. If both
50
+ # template file and primers are specified the template sequence is used for
51
+ # the primer search and the positions will be used for slicing.
52
+ #
53
+ # The sequences in the stream are replaced with the sliced subsequences.
54
+ #
55
+ # == Usage
56
+ #
57
+ # slice_align(<slice: <index>|<range>> |
58
+ # <forward: <string> | forward_rc: <string>>,
59
+ # <reverse: <string> | reverse_rc: <string>>
60
+ # [, max_mismatches: <uint>[, max_insertions: <uint>
61
+ # [, max_deletions: <uint>[, template_file: <file>]]]])
62
+ #
63
+ # === Options
64
+ #
65
+ # * slice: <index> - Slice a one residue subsequence.
66
+ # * slice: <range> - Slice a range from the sequence.
67
+ # * forward: <string> - Forward primer (5'-3').
68
+ # * forward_rc: <string> - Forward primer (3'-5').
69
+ # * reverse: <string> - Reverse primer (3'-5').
70
+ # * reverse_rc: <string> - Reverse primer (5'-3').
71
+ # * max_mismatches: <uint> - Max number of mismatches (default=2).
72
+ # * max_insertions: <uint> - Max number of insertions (default=1).
73
+ # * max_deletions: <uint> - Max number of deletions (default=1).
74
+ # * template_file: <file> - File with one aligned sequence in FASTA format.
75
+ #
76
+ # == Examples
77
+ #
78
+ # Consider the following alignment in the file `test.fna`
79
+ #
80
+ # >ID00000000
81
+ # CCGCATACG-------CCCTGAGGGG----
82
+ # >ID00000001
83
+ # CCGCATGAT-------ACCTGAGGGT----
84
+ # >ID00000002
85
+ # CCGCATATACTCTTGACGCTAAAGCGTAGT
86
+ # >ID00000003
87
+ # CCGTATGTG-------CCCTTCGGGG----
88
+ # >ID00000004
89
+ # CCGGATAAG-------CCCTTACGGG----
90
+ # >ID00000005
91
+ # CCGGATAAG-------CCCTTACGGG----
92
+ #
93
+ # We can slice the alignment with +slice_align+ using a range:
94
+ #
95
+ # BP.new.
96
+ # read_fasta(input: "test.fna").
97
+ # slice_align(slice: 14 .. 27).
98
+ # dump.
99
+ # run
100
+ #
101
+ # {:SEQ_NAME=>"ID00000000", :SEQ=>"--CCCTGAGGGG--", :SEQ_LEN=>14}
102
+ # {:SEQ_NAME=>"ID00000001", :SEQ=>"--ACCTGAGGGT--", :SEQ_LEN=>14}
103
+ # {:SEQ_NAME=>"ID00000002", :SEQ=>"GACGCTAAAGCGTA", :SEQ_LEN=>14}
104
+ # {:SEQ_NAME=>"ID00000003", :SEQ=>"--CCCTTCGGGG--", :SEQ_LEN=>14}
105
+ # {:SEQ_NAME=>"ID00000004", :SEQ=>"--CCCTTACGGG--", :SEQ_LEN=>14}
106
+ # {:SEQ_NAME=>"ID00000005", :SEQ=>"--CCCTTACGGG--", :SEQ_LEN=>14}
107
+ #
108
+ # Or we could slice the alignment using a set of primers:
109
+ #
110
+ # BP.new.
111
+ # read_fasta(input: "test.fna").
112
+ # slice_align(forward: "CGCATACG", reverse: "GAGGGG", max_mismatches: 0,
113
+ # max_insertions: 0, max_deletions: 0).
114
+ # dump.run
115
+ #
116
+ # {:SEQ_NAME=>"ID00000000", :SEQ=>"CGCATACG-------CCCTGAGGGG", :SEQ_LEN=>25}
117
+ # {:SEQ_NAME=>"ID00000001", :SEQ=>"CGCATGAT-------ACCTGAGGGT", :SEQ_LEN=>25}
118
+ # {:SEQ_NAME=>"ID00000002", :SEQ=>"CGCATATACTCTTGACGCTAAAGCG", :SEQ_LEN=>25}
119
+ # {:SEQ_NAME=>"ID00000003", :SEQ=>"CGTATGTG-------CCCTTCGGGG", :SEQ_LEN=>25}
120
+ # {:SEQ_NAME=>"ID00000004", :SEQ=>"CGGATAAG-------CCCTTACGGG", :SEQ_LEN=>25}
121
+ # {:SEQ_NAME=>"ID00000005", :SEQ=>"CGGATAAG-------CCCTTACGGG", :SEQ_LEN=>25}
122
+ #
123
+ # Now, if we have a template file with the following FASTA entry:
124
+ #
125
+ # >template
126
+ # CTGAATACG-------CCATTCGATGG---
127
+ #
128
+ # and specifying primers these will be matched to the template and the hit
129
+ # positions used for slicing:
130
+ #
131
+ # BP.new.
132
+ # read_fasta(input: "test.fna").
133
+ # slice_align(template_file: "template.fna", forward: "GAATACG",
134
+ # reverse: "ATTCGAT", max_mismatches: 0, max_insertions: 0,
135
+ # max_deletions: 0).
136
+ # dump.run
137
+ #
138
+ # {:SEQ_NAME=>"ID00000000", :SEQ=>"GCATACG-------CCCTGAGGG", :SEQ_LEN=>23}
139
+ # {:SEQ_NAME=>"ID00000001", :SEQ=>"GCATGAT-------ACCTGAGGG", :SEQ_LEN=>23}
140
+ # {:SEQ_NAME=>"ID00000002", :SEQ=>"GCATATACTCTTGACGCTAAAGC", :SEQ_LEN=>23}
141
+ # {:SEQ_NAME=>"ID00000003", :SEQ=>"GTATGTG-------CCCTTCGGG", :SEQ_LEN=>23}
142
+ # {:SEQ_NAME=>"ID00000004", :SEQ=>"GGATAAG-------CCCTTACGG", :SEQ_LEN=>23}
143
+ # {:SEQ_NAME=>"ID00000005", :SEQ=>"GGATAAG-------CCCTTACGG", :SEQ_LEN=>23}
144
+ #
145
+ # Finally, specifying a template file and an interval the positions used for
146
+ # slicing will be the ungapped positions from the template sequence. This
147
+ # is useful if you are slicing 16S rRNA alignments and want the _E.coli_
148
+ # corresponding positions - simply use the _E.coli_ sequence as template.
149
+ #
150
+ # BP.new.
151
+ # read_fasta(input: "test.fna").
152
+ # slice_align(template_file: "template.fna", slice: 4 .. 14).
153
+ # dump.run
154
+ #
155
+ # {:SEQ_NAME=>"ID00000000", :SEQ=>"ATACG-------CCCTGA", :SEQ_LEN=>18}
156
+ # {:SEQ_NAME=>"ID00000001", :SEQ=>"ATGAT-------ACCTGA", :SEQ_LEN=>18}
157
+ # {:SEQ_NAME=>"ID00000002", :SEQ=>"ATATACTCTTGACGCTAA", :SEQ_LEN=>18}
158
+ # {:SEQ_NAME=>"ID00000003", :SEQ=>"ATGTG-------CCCTTC", :SEQ_LEN=>18}
159
+ # {:SEQ_NAME=>"ID00000004", :SEQ=>"ATAAG-------CCCTTA", :SEQ_LEN=>18}
160
+ # {:SEQ_NAME=>"ID00000005", :SEQ=>"ATAAG-------CCCTTA", :SEQ_LEN=>18}
161
+ #
162
+ # rubocop: enable LineLength
163
+ # rubocop: disable ClassLength
164
class SliceAlign
  # Names of the status counters maintained while the command runs.
  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out)

  # Constructor for SliceAlign.
  #
  # @param options [Hash] Options hash.
  # @option options [Range,Integer] :slice
  # @option options [String] :forward
  # @option options [String] :forward_rc
  # @option options [String] :reverse
  # @option options [String] :reverse_rc
  # @option options [Integer] :max_mismatches
  # @option options [Integer] :max_insertions
  # @option options [Integer] :max_deletions
  # @option options [String] :template_file
  #
  # @return [SliceAlign] Class instance.
  def initialize(options)
    @options  = options
    @forward  = forward
    @reverse  = reverse
    # Sorted so that '-' does not end up between two other characters, where
    # String#delete would interpret it as a character range.
    @indels   = BioDSL::Seq::INDELS.sort.join
    @template = nil
    @slice    = options[:slice]

    check_options
    defaults
  end

  # Return the command lambda for slice_align.
  #
  # Records holding a :SEQ key are sliced in place; all records are passed
  # on to the output stream.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      parse_template_file
      setup_template_slice

      input.each do |record|
        @status[:records_in] += 1
        slice_align(record) if record.key? :SEQ
        output << record
        @status[:records_out] += 1
      end
    end
  end

  private

  # Check options.
  def check_options
    options_allowed(@options, :slice, :forward, :forward_rc, :reverse,
                    :reverse_rc, :max_mismatches, :max_insertions,
                    :max_deletions, :template_file)
    options_conflict(@options, slice: :forward)
    options_files_exist(@options, :template_file)
    options_assert(@options, ':max_mismatches >= 0')
    options_assert(@options, ':max_insertions >= 0')
    options_assert(@options, ':max_deletions >= 0')
    options_assert(@options, ':max_mismatches <= 5')
    options_assert(@options, ':max_insertions <= 5')
    options_assert(@options, ':max_deletions <= 5')
  end

  # Setup default primer matching attributes.
  def defaults
    @max_mis = @options[:max_mismatches] || 2
    @max_ins = @options[:max_insertions] || 1
    @max_del = @options[:max_deletions]  || 1
  end

  # Parse FASTA file with one gapped template sequence if specified.
  def parse_template_file
    return unless @options[:template_file]

    @template = BioDSL::Fasta.read(@options[:template_file]).first
  end

  # Set the slice positions using the template sequence. Does nothing unless
  # a template sequence was loaded.
  #
  # @raise [BioDSL::SeqError] If a slice position is outside the template.
  def setup_template_slice
    return unless @template

    pos_index = PosIndex.new(@template, @indels)

    start, stop = if @slice
                    setup_template_slice_range(pos_index)
                  else
                    setup_template_slice_primers(pos_index)
                  end

    # A nil position means the requested residue does not exist in the
    # template. Range.new would either fail obscurely or build an open-ended
    # range, so fail with a clear message instead.
    if start.nil? || stop.nil?
      fail BioDSL::SeqError,
           "slice #{@slice.inspect} is outside the template sequence"
    end

    @slice = Range.new(start, stop)
  end

  # Given a position index use slice positions to locate equivalent
  # positions in the template sequence.
  #
  # Both forms of the +slice+ option are supported: a single index is
  # translated to a one-column range, and an exclusive range (a...b) stops
  # at the residue before +b+.
  #
  # @param pos_index [PosIndex] Position index.
  #
  # @return [Array] Gapped start and stop positions (both inclusive).
  def setup_template_slice_range(pos_index)
    unless @slice.respond_to?(:exclude_end?)
      pos = pos_index[@slice]

      return [pos, pos]
    end

    last = @slice.last
    # The caller builds an inclusive range, so step back one residue for
    # exclusive ranges. An exclusive range ending at 0 is left untouched so
    # that it does not wrap around to -1 (the last residue).
    last -= 1 if @slice.exclude_end? && last != 0

    [pos_index[@slice.first], pos_index[last]]
  end

  # Given a position index use primers to locate the slice positions in the
  # template sequence.
  #
  # @param pos_index [PosIndex] Position index.
  #
  # @return [Array] Gapped start and stop positions.
  def setup_template_slice_primers(pos_index)
    compact = Seq.new(seq: @template.seq.delete(@indels))
    fmatch  = find_match(@forward, compact)
    rmatch  = find_match(@reverse, compact)

    [pos_index[fmatch.start], pos_index[rmatch.stop]]
  end

  # Return the forward primer sequence and reverse-complement it if need be.
  # When +forward_rc+ is given the result is also stored under the :forward
  # key of the options hash.
  #
  # @return [String] Forward primer sequence.
  def forward
    return @options[:forward] unless @options[:forward_rc]

    primer = Seq.new(seq: @options[:forward_rc], type: :dna)

    @options[:forward] = primer.reverse.complement.seq
  end

  # Return the reverse primer sequence and reverse-complement it if need be.
  # When +reverse_rc+ is given the result is also stored under the :reverse
  # key of the options hash.
  #
  # @return [String] Reverse primer sequence.
  def reverse
    return @options[:reverse] unless @options[:reverse_rc]

    primer = Seq.new(seq: @options[:reverse_rc], type: :dna)

    @options[:reverse] = primer.reverse.complement.seq
  end

  # Slice sequence in given record according to slice positions.
  #
  # @param record [Hash] BioDSL record.
  def slice_align(record)
    entry = BioDSL::Seq.new_bp(record)

    @status[:sequences_in] += 1
    @status[:residues_in]  += entry.length

    # Without a preset slice the primers are located in the first sequence
    # seen and the resulting range is reused for all following sequences.
    setup_slice(entry) unless @slice

    entry = entry[@slice]

    record.merge! entry.to_bp

    @status[:sequences_out] += 1
    @status[:residues_out]  += entry.length
  end

  # Use primers to locate slice positions in entry.
  #
  # @param entry [BioDSL::Seq] Sequence entry.
  def setup_slice(entry)
    pos_index = PosIndex.new(entry, @indels)
    compact   = Seq.new(seq: entry.seq.delete(@indels))

    fmatch = find_match(@forward, compact)
    rmatch = find_match(@reverse, compact)

    @slice = Range.new(pos_index[fmatch.start], pos_index[rmatch.stop])
  end

  # Find pattern in entry and return match.
  #
  # @param pattern [String] Search pattern.
  # @param entry [BioDSL::Seq] Sequence to search.
  #
  # @return [BioDSL::Seq::Match] Pattern match.
  #
  # @raise [BioDSL::SeqError] If no match.
  def find_match(pattern, entry)
    match = entry.patmatch(pattern,
                           max_mismatches: @max_mis,
                           max_insertions: @max_ins,
                           max_deletions: @max_del)

    return match unless match.nil?

    fail BioDSL::SeqError, "pattern not found: #{pattern}"
  end

  # Class for indexing non-gapped sequence positions to gapped sequence
  # positions.
  class PosIndex
    # Constructor for PosIndex.
    #
    # @param entry [BioDSL::Seq] Gapped sequence entry.
    # @param indels [String] String with indel alphabet.
    #
    # @return [PosIndex] Class instance.
    def initialize(entry, indels)
      @entry  = entry
      @indels = indels
      @index  = index_positions
    end

    # Given a non-gapped sequence position return the gapped position.
    #
    # @param pos [Integer] Non-gapped sequence position.
    #
    # @return [Integer,nil] Gapped sequence position, or nil if the
    #   non-gapped position does not exist.
    def [](pos)
      @index[pos]
    end

    private

    # Return an index where element +n+ holds the gapped position of the
    # n'th non-gap residue.
    #
    # @return [Array] Position index.
    def index_positions
      positions = []

      @entry.seq.each_char.with_index do |residue, gapped_pos|
        positions << gapped_pos unless @indels.include?(residue)
      end

      positions
    end
  end
end
400
+ end
@@ -0,0 +1,151 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Slice sequences in the stream and obtain subsequences.
30
+ #
31
+ # Slice subsequences from sequences using index positions, that is single
32
+ # position residues, or using ranges for stretches of residues.
33
+ #
34
+ # All positions are 0-based.
35
+ #
36
+ # If the records also contain quality SCORES these are also sliced.
37
+ #
38
+ # == Usage
39
+ #
40
+ # slice_seq(<slice: <index>|<range>>)
41
+ #
42
+ # === Options
43
+ #
44
+ # * slice: <index> - Slice a one residue subsequence.
45
+ # * slice: <range> - Slice a range from the sequence.
46
+ #
47
+ # == Examples
48
+ #
49
+ # Consider the following FASTQ entry in the file test.fq:
50
+ #
51
+ # @HWI-EAS157_20FFGAAXX:2:1:888:434
52
+ # TTGGTCGCTCGCTCCGCGACCTCAGATCAGACGTGGGCGAT
53
+ # +
54
+ # !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
55
+ #
56
+ # To slice the second residue from the beginning do:
57
+ #
58
+ # BP.new.read_fastq(input: "test.fq").slice_seq(slice: 2).dump.run
59
+ #
60
+ # {:SEQ_NAME=>"HWI-EAS157_20FFGAAXX:2:1:888:434",
61
+ # :SEQ=>"G",
62
+ # :SEQ_LEN=>1,
63
+ # :SCORES=>"#"}
64
+ #
65
+ # To slice the last residue do:
66
+ #
67
+ # BP.new.read_fastq(input: "test.fq").slice_seq(slice: -1).dump.run
68
+ #
69
+ # {:SEQ_NAME=>"HWI-EAS157_20FFGAAXX:2:1:888:434",
70
+ # :SEQ=>"T",
71
+ # :SEQ_LEN=>1,
72
+ # :SCORES=>"I"}
73
+ #
74
+ # To slice the first 5 residues do:
75
+ #
76
+ # BP.new.read_fastq(input: "test.fq").slice_seq(slice: 0 ... 5).dump.run
77
+ #
78
+ # {:SEQ_NAME=>"HWI-EAS157_20FFGAAXX:2:1:888:434",
79
+ # :SEQ=>"TTGGT",
80
+ # :SEQ_LEN=>5,
81
+ # :SCORES=>"!\"\#$%"}
82
+ #
83
+ # To slice the last 5 residues do:
84
+ #
85
+ # BP.new.read_fastq(input: "test.fq").slice_seq(slice: -5 .. -1).dump.run
86
+ #
87
+ # {:SEQ_NAME=>"HWI-EAS157_20FFGAAXX:2:1:888:434",
88
+ # :SEQ=>"GCGAT",
89
+ # :SEQ_LEN=>5,
90
+ # :SCORES=>"EFGHI"}
91
+ class SliceSeq
92
+ STATS = %i(records_in records_out sequences_in sequences_out residues_in
93
+ residues_out)
94
+
95
+ # Constructor for SliceSeq.
96
+ #
97
+ # @param options [Hash] Options hash.
98
+ # @option options [Range,Integer] :slice
99
+ #
100
+ # @return [SliceSeq] Class instance.
101
+ def initialize(options)
102
+ @options = options
103
+
104
+ check_options
105
+ end
106
+
107
+ # Return lambda for command.
108
+ #
109
+ # @return [Proc] Command lambda.
110
+ def lmb
111
+ lambda do |input, output, status|
112
+ status_init(status, STATS)
113
+
114
+ input.each do |record|
115
+ @status[:records_in] += 1
116
+
117
+ slice_seq(record) if record.key? :SEQ
118
+
119
+ output << record
120
+
121
+ @status[:records_out] += 1
122
+ end
123
+ end
124
+ end
125
+
126
+ private
127
+
128
+ # Check options.
129
+ def check_options
130
+ options_allowed(@options, :slice)
131
+ options_required(@options, :slice)
132
+ end
133
+
134
+ # Slice sequence in given record.
135
+ #
136
+ # @param record [Hash] BioDSL record.
137
+ def slice_seq(record)
138
+ entry = BioDSL::Seq.new_bp(record)
139
+
140
+ @status[:sequences_in] += 1
141
+ @status[:residues_in] += entry.length
142
+
143
+ entry = entry[@options[:slice]]
144
+
145
+ @status[:sequences_out] += 1
146
+ @status[:residues_out] += entry.length
147
+
148
+ record.merge! entry.to_bp
149
+ end
150
+ end
151
+ end