BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,400 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
# rubocop: disable LineLength
|
|
29
|
+
module BioDSL
|
|
30
|
+
# == Slice aligned sequences in the stream to obtain subsequences.
|
|
31
|
+
#
|
|
32
|
+
# +slice_align+ slices an alignment to extract subsequence from all sequences
|
|
33
|
+
# in the stream. This is done by either specifying a range or a set of primers
|
|
34
|
+
# that is then used to locate the range to be sliced from the sequences.
|
|
35
|
+
#
|
|
36
|
+
  # If a range is given with the +slice+ option the positions (0-based) must be
|
|
37
|
+
  # corresponding to the aligned sequence, i.e. with gaps.
|
|
38
|
+
#
|
|
39
|
+
# If a set of primers are given with the +forward+ and +reverse+ options (or
|
|
40
|
+
# the +forward_rc+ and +reverse_rc+ options) these primers are used to locate
|
|
41
|
+
# the matching positions in the first entry and this range is used to slice
|
|
42
|
+
# this and any following sequences. It is possible to specify fuzzy primer
|
|
43
|
+
# matching by using the +max_mismatches+, +max_insertions+ and +max_deletions+
|
|
44
|
+
  # options. Moreover, IUPAC ambiguity codes are allowed.
|
|
45
|
+
#
|
|
46
|
+
# It is also possible to specify a template file using the +template_file+
|
|
47
|
+
# option. The template file should be a file with one FASTA formatted sequence
|
|
48
|
+
# from the alignment (with gaps). If a template file and a range is specified
|
|
49
|
+
# the nucleotide positions from the ungapped template will be used. If both
|
|
50
|
+
# template file and primers are specified the template sequence is used for
|
|
51
|
+
# the primer search and the positions will be used for slicing.
|
|
52
|
+
#
|
|
53
|
+
# The sequences in the stream are replaced with the sliced subsequences.
|
|
54
|
+
#
|
|
55
|
+
# == Usage
|
|
56
|
+
#
|
|
57
|
+
# slice_align(<slice: <index>|<range>> |
|
|
58
|
+
# <forward: <string> | forward_rc: <string>>,
|
|
59
|
+
  #               <reverse: <string> | reverse_rc: <string>>
|
|
60
|
+
# [, max_mismatches: <uint>[, max_insertions: <uint>
|
|
61
|
+
# [, max_deletions: <uint>[, template_file: <file>]]]])
|
|
62
|
+
#
|
|
63
|
+
# === Options
|
|
64
|
+
#
|
|
65
|
+
# * slice: <index> - Slice a one residue subsequence.
|
|
66
|
+
# * slice: <range> - Slice a range from the sequence.
|
|
67
|
+
# * forward: <string> - Forward primer (5'-3').
|
|
68
|
+
# * forward_rc: <string> - Forward primer (3'-5').
|
|
69
|
+
# * reverse: <string> - Reverse primer (3'-5').
|
|
70
|
+
# * reverse_rc: <string> - Reverse primer (5'-3').
|
|
71
|
+
  # * max_mismatches: <uint> - Max number of mismatches (default=2).
|
|
72
|
+
# * max_insertions: <uint> - Max number of insertions (default=1).
|
|
73
|
+
# * max_deletions: <uint> - Max number of deletions (default=1).
|
|
74
|
+
# * template_file: <file> - File with one aligned sequence in FASTA format.
|
|
75
|
+
#
|
|
76
|
+
# == Examples
|
|
77
|
+
#
|
|
78
|
+
# Consider the following alignment in the file `test.fna`
|
|
79
|
+
#
|
|
80
|
+
# >ID00000000
|
|
81
|
+
# CCGCATACG-------CCCTGAGGGG----
|
|
82
|
+
# >ID00000001
|
|
83
|
+
# CCGCATGAT-------ACCTGAGGGT----
|
|
84
|
+
# >ID00000002
|
|
85
|
+
# CCGCATATACTCTTGACGCTAAAGCGTAGT
|
|
86
|
+
# >ID00000003
|
|
87
|
+
# CCGTATGTG-------CCCTTCGGGG----
|
|
88
|
+
# >ID00000004
|
|
89
|
+
# CCGGATAAG-------CCCTTACGGG----
|
|
90
|
+
# >ID00000005
|
|
91
|
+
# CCGGATAAG-------CCCTTACGGG----
|
|
92
|
+
#
|
|
93
|
+
# We can slice the alignment with +slice_align+ using a range:
|
|
94
|
+
#
|
|
95
|
+
# BP.new.
|
|
96
|
+
# read_fasta(input: "test.fna").
|
|
97
|
+
# slice_align(slice: 14 .. 27).
|
|
98
|
+
# dump.
|
|
99
|
+
# run
|
|
100
|
+
#
|
|
101
|
+
# {:SEQ_NAME=>"ID00000000", :SEQ=>"--CCCTGAGGGG--", :SEQ_LEN=>14}
|
|
102
|
+
# {:SEQ_NAME=>"ID00000001", :SEQ=>"--ACCTGAGGGT--", :SEQ_LEN=>14}
|
|
103
|
+
# {:SEQ_NAME=>"ID00000002", :SEQ=>"GACGCTAAAGCGTA", :SEQ_LEN=>14}
|
|
104
|
+
# {:SEQ_NAME=>"ID00000003", :SEQ=>"--CCCTTCGGGG--", :SEQ_LEN=>14}
|
|
105
|
+
# {:SEQ_NAME=>"ID00000004", :SEQ=>"--CCCTTACGGG--", :SEQ_LEN=>14}
|
|
106
|
+
# {:SEQ_NAME=>"ID00000005", :SEQ=>"--CCCTTACGGG--", :SEQ_LEN=>14}
|
|
107
|
+
#
|
|
108
|
+
# Or we could slice the alignment using a set of primers:
|
|
109
|
+
#
|
|
110
|
+
# BP.new.
|
|
111
|
+
# read_fasta(input: "test.fna").
|
|
112
|
+
# slice_align(forward: "CGCATACG", reverse: "GAGGGG", max_mismatches: 0,
|
|
113
|
+
# max_insertions: 0, max_deletions: 0).
|
|
114
|
+
# dump.run
|
|
115
|
+
#
|
|
116
|
+
# {:SEQ_NAME=>"ID00000000", :SEQ=>"CGCATACG-------CCCTGAGGGG", :SEQ_LEN=>25}
|
|
117
|
+
# {:SEQ_NAME=>"ID00000001", :SEQ=>"CGCATGAT-------ACCTGAGGGT", :SEQ_LEN=>25}
|
|
118
|
+
# {:SEQ_NAME=>"ID00000002", :SEQ=>"CGCATATACTCTTGACGCTAAAGCG", :SEQ_LEN=>25}
|
|
119
|
+
# {:SEQ_NAME=>"ID00000003", :SEQ=>"CGTATGTG-------CCCTTCGGGG", :SEQ_LEN=>25}
|
|
120
|
+
# {:SEQ_NAME=>"ID00000004", :SEQ=>"CGGATAAG-------CCCTTACGGG", :SEQ_LEN=>25}
|
|
121
|
+
# {:SEQ_NAME=>"ID00000005", :SEQ=>"CGGATAAG-------CCCTTACGGG", :SEQ_LEN=>25}
|
|
122
|
+
#
|
|
123
|
+
# Now, if we have a template file with the following FASTA entry:
|
|
124
|
+
#
|
|
125
|
+
# >template
|
|
126
|
+
# CTGAATACG-------CCATTCGATGG---
|
|
127
|
+
#
|
|
128
|
+
  # and specifying primers these will be matched to the template and the hit
|
|
129
|
+
# positions used for slicing:
|
|
130
|
+
#
|
|
131
|
+
# BP.new.
|
|
132
|
+
# read_fasta(input: "test.fna").
|
|
133
|
+
# slice_align(template_file: "template.fna", forward: "GAATACG",
|
|
134
|
+
# reverse: "ATTCGAT", max_mismatches: 0, max_insertions: 0,
|
|
135
|
+
# max_deletions: 0).
|
|
136
|
+
# dump.run
|
|
137
|
+
#
|
|
138
|
+
# {:SEQ_NAME=>"ID00000000", :SEQ=>"GCATACG-------CCCTGAGGG", :SEQ_LEN=>23}
|
|
139
|
+
# {:SEQ_NAME=>"ID00000001", :SEQ=>"GCATGAT-------ACCTGAGGG", :SEQ_LEN=>23}
|
|
140
|
+
# {:SEQ_NAME=>"ID00000002", :SEQ=>"GCATATACTCTTGACGCTAAAGC", :SEQ_LEN=>23}
|
|
141
|
+
# {:SEQ_NAME=>"ID00000003", :SEQ=>"GTATGTG-------CCCTTCGGG", :SEQ_LEN=>23}
|
|
142
|
+
# {:SEQ_NAME=>"ID00000004", :SEQ=>"GGATAAG-------CCCTTACGG", :SEQ_LEN=>23}
|
|
143
|
+
# {:SEQ_NAME=>"ID00000005", :SEQ=>"GGATAAG-------CCCTTACGG", :SEQ_LEN=>23}
|
|
144
|
+
#
|
|
145
|
+
# Finally, specifying a template file and an interval the positions used for
|
|
146
|
+
# slicing will be the ungapped positions from the template sequence. This
|
|
147
|
+
# is useful if you are slicing 16S rRNA alignments and want the _E.coli_
|
|
148
|
+
# corresponding positions - simply use the _E.coli_ sequence as template.
|
|
149
|
+
#
|
|
150
|
+
# BP.new.
|
|
151
|
+
# read_fasta(input: "test.fna").
|
|
152
|
+
# slice_align(template_file: "template.fna", slice: 4 .. 14).
|
|
153
|
+
# dump.run
|
|
154
|
+
#
|
|
155
|
+
# {:SEQ_NAME=>"ID00000000", :SEQ=>"ATACG-------CCCTGA", :SEQ_LEN=>18}
|
|
156
|
+
# {:SEQ_NAME=>"ID00000001", :SEQ=>"ATGAT-------ACCTGA", :SEQ_LEN=>18}
|
|
157
|
+
# {:SEQ_NAME=>"ID00000002", :SEQ=>"ATATACTCTTGACGCTAA", :SEQ_LEN=>18}
|
|
158
|
+
# {:SEQ_NAME=>"ID00000003", :SEQ=>"ATGTG-------CCCTTC", :SEQ_LEN=>18}
|
|
159
|
+
# {:SEQ_NAME=>"ID00000004", :SEQ=>"ATAAG-------CCCTTA", :SEQ_LEN=>18}
|
|
160
|
+
# {:SEQ_NAME=>"ID00000005", :SEQ=>"ATAAG-------CCCTTA", :SEQ_LEN=>18}
|
|
161
|
+
#
|
|
162
|
+
# rubocop: enable LineLength
|
|
163
|
+
# rubocop: disable ClassLength
|
|
164
|
+
class SliceAlign
|
|
165
|
+
STATS = %i(records_in records_out sequences_in sequences_out residues_in
|
|
166
|
+
residues_out)
|
|
167
|
+
|
|
168
|
+
# Constructor for SliceAlign.
|
|
169
|
+
#
|
|
170
|
+
# @param options [Hash] Options hash.
|
|
171
|
+
# @option options [Range,Integer] :slice
|
|
172
|
+
# @option options [String] :forward
|
|
173
|
+
# @option options [String] :forward_rc
|
|
174
|
+
# @option options [String] :reverse
|
|
175
|
+
# @option options [String] :reverse_rc
|
|
176
|
+
# @option options [Integer] :max_mismatches
|
|
177
|
+
# @option options [Integer] :max_insertions
|
|
178
|
+
# @option options [Integer] :max_deletions
|
|
179
|
+
# @option options [String] :template_file
|
|
180
|
+
#
|
|
181
|
+
# @return [SliceAlign] Class instance.
|
|
182
|
+
def initialize(options)
|
|
183
|
+
@options = options
|
|
184
|
+
@forward = forward
|
|
185
|
+
@reverse = reverse
|
|
186
|
+
@indels = BioDSL::Seq::INDELS.sort.join
|
|
187
|
+
@template = nil
|
|
188
|
+
@slice = options[:slice]
|
|
189
|
+
|
|
190
|
+
check_options
|
|
191
|
+
defaults
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
# Return the comman lamba for slice_align.
|
|
195
|
+
#
|
|
196
|
+
# @return [Proc] Command lambda.
|
|
197
|
+
def lmb
|
|
198
|
+
lambda do |input, output, status|
|
|
199
|
+
status_init(status, STATS)
|
|
200
|
+
|
|
201
|
+
parse_template_file
|
|
202
|
+
setup_template_slice
|
|
203
|
+
|
|
204
|
+
input.each do |record|
|
|
205
|
+
@status[:records_in] += 1
|
|
206
|
+
slice_align(record) if record.key? :SEQ
|
|
207
|
+
output << record
|
|
208
|
+
@status[:records_out] += 1
|
|
209
|
+
end
|
|
210
|
+
end
|
|
211
|
+
end
|
|
212
|
+
|
|
213
|
+
private
|
|
214
|
+
|
|
215
|
+
# Check options.
|
|
216
|
+
def check_options
|
|
217
|
+
options_allowed(@options, :slice, :forward, :forward_rc, :reverse,
|
|
218
|
+
:reverse_rc, :max_mismatches, :max_insertions,
|
|
219
|
+
:max_deletions, :template_file)
|
|
220
|
+
options_conflict(@options, slice: :forward)
|
|
221
|
+
options_files_exist(@options, :template_file)
|
|
222
|
+
options_assert(@options, ':max_mismatches >= 0')
|
|
223
|
+
options_assert(@options, ':max_insertions >= 0')
|
|
224
|
+
options_assert(@options, ':max_deletions >= 0')
|
|
225
|
+
options_assert(@options, ':max_mismatches <= 5')
|
|
226
|
+
options_assert(@options, ':max_insertions <= 5')
|
|
227
|
+
options_assert(@options, ':max_deletions <= 5')
|
|
228
|
+
end
|
|
229
|
+
|
|
230
|
+
# Setup default primer matching attributes.
|
|
231
|
+
def defaults
|
|
232
|
+
@max_mis = @options[:max_mismatches] || 2
|
|
233
|
+
@max_ins = @options[:max_insertions] || 1
|
|
234
|
+
@max_del = @options[:max_deletions] || 1
|
|
235
|
+
end
|
|
236
|
+
|
|
237
|
+
# Parse FASTA file with one gapped template sequence if specified.
|
|
238
|
+
def parse_template_file
|
|
239
|
+
return unless @options[:template_file]
|
|
240
|
+
|
|
241
|
+
@template = BioDSL::Fasta.read(@options[:template_file]).first
|
|
242
|
+
end
|
|
243
|
+
|
|
244
|
+
# Set the slice positions using the template sequence.
|
|
245
|
+
def setup_template_slice
|
|
246
|
+
return unless @template
|
|
247
|
+
|
|
248
|
+
pos_index = PosIndex.new(@template, @indels)
|
|
249
|
+
|
|
250
|
+
if @slice
|
|
251
|
+
start, stop = setup_template_slice_range(pos_index)
|
|
252
|
+
else
|
|
253
|
+
start, stop = setup_template_slice_primers(pos_index)
|
|
254
|
+
end
|
|
255
|
+
|
|
256
|
+
@slice = Range.new(start, stop)
|
|
257
|
+
end
|
|
258
|
+
|
|
259
|
+
# Given a position index use slice positions to locate equivalent postitions
|
|
260
|
+
# in the template sequence.
|
|
261
|
+
#
|
|
262
|
+
# @param pos_index [PosIndex] Position index.
|
|
263
|
+
def setup_template_slice_range(pos_index)
|
|
264
|
+
start = pos_index[@slice.first]
|
|
265
|
+
stop = pos_index[@slice.last]
|
|
266
|
+
|
|
267
|
+
[start, stop]
|
|
268
|
+
end
|
|
269
|
+
|
|
270
|
+
# Given a position index use primers to locate the slice positions in the
|
|
271
|
+
# template sequence.
|
|
272
|
+
#
|
|
273
|
+
# @param pos_index [PosIndex] Position index.
|
|
274
|
+
def setup_template_slice_primers(pos_index)
|
|
275
|
+
compact = Seq.new(seq: @template.seq.dup.delete(@indels))
|
|
276
|
+
fmatch = find_match(@forward, compact)
|
|
277
|
+
rmatch = find_match(@reverse, compact)
|
|
278
|
+
start = pos_index[fmatch.start]
|
|
279
|
+
stop = pos_index[rmatch.stop]
|
|
280
|
+
|
|
281
|
+
[start, stop]
|
|
282
|
+
end
|
|
283
|
+
|
|
284
|
+
# Return the forward primer sequence and reverse-complement it if need be.
|
|
285
|
+
#
|
|
286
|
+
# @return [String] Forward primer sequence.
|
|
287
|
+
def forward
|
|
288
|
+
if @options[:forward_rc]
|
|
289
|
+
@options[:forward] = Seq.new(seq: @options[:forward_rc], type: :dna).
|
|
290
|
+
reverse.complement.seq
|
|
291
|
+
else
|
|
292
|
+
@options[:forward]
|
|
293
|
+
end
|
|
294
|
+
end
|
|
295
|
+
|
|
296
|
+
# Return the reverse primer sequence and reverse-complement it if need be.
|
|
297
|
+
#
|
|
298
|
+
# @return [String] Reverse primer sequence.
|
|
299
|
+
def reverse
|
|
300
|
+
if @options[:reverse_rc]
|
|
301
|
+
@options[:reverse] = Seq.new(seq: @options[:reverse_rc], type: :dna).
|
|
302
|
+
reverse.complement.seq
|
|
303
|
+
else
|
|
304
|
+
@options[:reverse]
|
|
305
|
+
end
|
|
306
|
+
end
|
|
307
|
+
|
|
308
|
+
# Slice sequence in given record accoding to slice positions.
|
|
309
|
+
#
|
|
310
|
+
# @param record [Hash] BioDSL record.
|
|
311
|
+
def slice_align(record)
|
|
312
|
+
entry = BioDSL::Seq.new_bp(record)
|
|
313
|
+
|
|
314
|
+
@status[:sequences_in] += 1
|
|
315
|
+
@status[:residues_in] += entry.length
|
|
316
|
+
|
|
317
|
+
setup_slice(entry) unless @slice
|
|
318
|
+
|
|
319
|
+
entry = entry[@slice]
|
|
320
|
+
|
|
321
|
+
record.merge! entry.to_bp
|
|
322
|
+
|
|
323
|
+
@status[:sequences_out] += 1
|
|
324
|
+
@status[:residues_out] += entry.length
|
|
325
|
+
end
|
|
326
|
+
|
|
327
|
+
# Usings primers to locate slice positions in entry.
|
|
328
|
+
#
|
|
329
|
+
# @param entry [BioDSL::Seq] Sequence entry.
|
|
330
|
+
def setup_slice(entry)
|
|
331
|
+
pos_index = PosIndex.new(entry, @indels)
|
|
332
|
+
compact = Seq.new(seq: entry.seq.dup.delete(@indels))
|
|
333
|
+
|
|
334
|
+
fmatch = find_match(@forward, compact)
|
|
335
|
+
rmatch = find_match(@reverse, compact)
|
|
336
|
+
|
|
337
|
+
@slice = Range.new(pos_index[fmatch.start], pos_index[rmatch.stop])
|
|
338
|
+
end
|
|
339
|
+
|
|
340
|
+
# Find pattern in entry and return match.
|
|
341
|
+
#
|
|
342
|
+
# @param pattern [String] Search pattern.
|
|
343
|
+
# @param entry [BioDSL::Seq] Sequence to search.
|
|
344
|
+
#
|
|
345
|
+
# @return [BioDSL::Seq::Match] Pattern match.
|
|
346
|
+
#
|
|
347
|
+
# @raise [BioDSL::SeqError] If no match.
|
|
348
|
+
def find_match(pattern, entry)
|
|
349
|
+
match = entry.patmatch(pattern,
|
|
350
|
+
max_mismatches: @max_mis,
|
|
351
|
+
max_insertions: @max_ins,
|
|
352
|
+
max_deletions: @max_del)
|
|
353
|
+
|
|
354
|
+
return match unless match.nil?
|
|
355
|
+
|
|
356
|
+
fail BioDSL::SeqError, "pattern not found: #{pattern}"
|
|
357
|
+
end
|
|
358
|
+
|
|
359
|
+
# Class for indexing gapped sequence positions to non-gapped sequence
|
|
360
|
+
# positions.
|
|
361
|
+
class PosIndex
|
|
362
|
+
# Constructor for PosIndex.
|
|
363
|
+
#
|
|
364
|
+
# @param entry [BioDSL::Seq] Gapped sequence entry.
|
|
365
|
+
# @param indels [String] String with indel alphabet.
|
|
366
|
+
#
|
|
367
|
+
# @return [PosIndex] Class instance.
|
|
368
|
+
def initialize(entry, indels)
|
|
369
|
+
@entry = entry
|
|
370
|
+
@indels = indels
|
|
371
|
+
@index = index_positions
|
|
372
|
+
end
|
|
373
|
+
|
|
374
|
+
# Given a non-gapped sequence postion return the gapped position.
|
|
375
|
+
#
|
|
376
|
+
# @param pos [Integer] Non-gapped sequence position.
|
|
377
|
+
#
|
|
378
|
+
# @return [Integer] Gapped sequence position
|
|
379
|
+
def [](pos)
|
|
380
|
+
@index[pos]
|
|
381
|
+
end
|
|
382
|
+
|
|
383
|
+
private
|
|
384
|
+
|
|
385
|
+
# Return an index mapping gapped sequence positions to non-gapped
|
|
386
|
+
# positions.
|
|
387
|
+
#
|
|
388
|
+
# @return [Array] Position index.
|
|
389
|
+
def index_positions
|
|
390
|
+
pos_index = []
|
|
391
|
+
|
|
392
|
+
@entry.seq.chars.each_with_index do |c, i|
|
|
393
|
+
pos_index << i unless @indels.include? c
|
|
394
|
+
end
|
|
395
|
+
|
|
396
|
+
pos_index
|
|
397
|
+
end
|
|
398
|
+
end
|
|
399
|
+
end
|
|
400
|
+
end
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Slice sequences in the stream and obtain subsequences.
|
|
30
|
+
#
|
|
31
|
+
# Slice subsequences from sequences using index positions, that is single
|
|
32
|
+
  # position residues, or using ranges for stretches of residues.
|
|
33
|
+
#
|
|
34
|
+
# All positions are 0-based.
|
|
35
|
+
#
|
|
36
|
+
# If the records also contain quality SCORES these are also sliced.
|
|
37
|
+
#
|
|
38
|
+
# == Usage
|
|
39
|
+
#
|
|
40
|
+
# slice_seq(<slice: <index>|<range>>)
|
|
41
|
+
#
|
|
42
|
+
# === Options
|
|
43
|
+
#
|
|
44
|
+
# * slice: <index> - Slice a one residue subsequence.
|
|
45
|
+
# * slice: <range> - Slice a range from the sequence.
|
|
46
|
+
#
|
|
47
|
+
# == Examples
|
|
48
|
+
#
|
|
49
|
+
# Consider the following FASTQ entry in the file test.fq:
|
|
50
|
+
#
|
|
51
|
+
# @HWI-EAS157_20FFGAAXX:2:1:888:434
|
|
52
|
+
# TTGGTCGCTCGCTCCGCGACCTCAGATCAGACGTGGGCGAT
|
|
53
|
+
# +
|
|
54
|
+
# !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
|
|
55
|
+
#
|
|
56
|
+
  # To slice the third residue from the beginning (index 2, 0-based) do:
|
|
57
|
+
#
|
|
58
|
+
# BP.new.read_fastq(input: "test.fq").slice_seq(slice: 2).dump.run
|
|
59
|
+
#
|
|
60
|
+
# {:SEQ_NAME=>"HWI-EAS157_20FFGAAXX:2:1:888:434",
|
|
61
|
+
# :SEQ=>"G",
|
|
62
|
+
# :SEQ_LEN=>1,
|
|
63
|
+
# :SCORES=>"#"}
|
|
64
|
+
#
|
|
65
|
+
# To slice the last residue do:
|
|
66
|
+
#
|
|
67
|
+
# BP.new.read_fastq(input: "test.fq").slice_seq(slice: -1).dump.run
|
|
68
|
+
#
|
|
69
|
+
# {:SEQ_NAME=>"HWI-EAS157_20FFGAAXX:2:1:888:434",
|
|
70
|
+
# :SEQ=>"T",
|
|
71
|
+
# :SEQ_LEN=>1,
|
|
72
|
+
# :SCORES=>"I"}
|
|
73
|
+
#
|
|
74
|
+
# To slice the first 5 residues do:
|
|
75
|
+
#
|
|
76
|
+
# BP.new.read_fastq(input: "test.fq").slice_seq(slice: 0 ... 5).dump.run
|
|
77
|
+
#
|
|
78
|
+
# {:SEQ_NAME=>"HWI-EAS157_20FFGAAXX:2:1:888:434",
|
|
79
|
+
# :SEQ=>"TTGGT",
|
|
80
|
+
# :SEQ_LEN=>5,
|
|
81
|
+
# :SCORES=>"!\"\#$%"}
|
|
82
|
+
#
|
|
83
|
+
# To slice the last 5 residues do:
|
|
84
|
+
#
|
|
85
|
+
# BP.new.read_fastq(input: "test.fq").slice_seq(slice: -5 .. -1).dump.run
|
|
86
|
+
#
|
|
87
|
+
# {:SEQ_NAME=>"HWI-EAS157_20FFGAAXX:2:1:888:434",
|
|
88
|
+
# :SEQ=>"GCGAT",
|
|
89
|
+
# :SEQ_LEN=>5,
|
|
90
|
+
# :SCORES=>"EFGHI"}
|
|
91
|
+
class SliceSeq
|
|
92
|
+
STATS = %i(records_in records_out sequences_in sequences_out residues_in
|
|
93
|
+
residues_out)
|
|
94
|
+
|
|
95
|
+
# Constructor for SliceSeq.
|
|
96
|
+
#
|
|
97
|
+
# @param options [Hash] Options hash.
|
|
98
|
+
# @option options [Range,Integer] :slice
|
|
99
|
+
#
|
|
100
|
+
# @return [SliceSeq] Class instance.
|
|
101
|
+
def initialize(options)
|
|
102
|
+
@options = options
|
|
103
|
+
|
|
104
|
+
check_options
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
# Return lambda for command.
|
|
108
|
+
#
|
|
109
|
+
# @return [Proc] Command lambda.
|
|
110
|
+
def lmb
|
|
111
|
+
lambda do |input, output, status|
|
|
112
|
+
status_init(status, STATS)
|
|
113
|
+
|
|
114
|
+
input.each do |record|
|
|
115
|
+
@status[:records_in] += 1
|
|
116
|
+
|
|
117
|
+
slice_seq(record) if record.key? :SEQ
|
|
118
|
+
|
|
119
|
+
output << record
|
|
120
|
+
|
|
121
|
+
@status[:records_out] += 1
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
private
|
|
127
|
+
|
|
128
|
+
# Check options.
|
|
129
|
+
def check_options
|
|
130
|
+
options_allowed(@options, :slice)
|
|
131
|
+
options_required(@options, :slice)
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
# Slice sequence in given record.
|
|
135
|
+
#
|
|
136
|
+
# @param record [Hash] BioDSL record.
|
|
137
|
+
def slice_seq(record)
|
|
138
|
+
entry = BioDSL::Seq.new_bp(record)
|
|
139
|
+
|
|
140
|
+
@status[:sequences_in] += 1
|
|
141
|
+
@status[:residues_in] += entry.length
|
|
142
|
+
|
|
143
|
+
entry = entry[@options[:slice]]
|
|
144
|
+
|
|
145
|
+
@status[:sequences_out] += 1
|
|
146
|
+
@status[:residues_out] += entry.length
|
|
147
|
+
|
|
148
|
+
record.merge! entry.to_bp
|
|
149
|
+
end
|
|
150
|
+
end
|
|
151
|
+
end
|