BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Create taxonomy index from sequences in the stream.
|
|
30
|
+
#
|
|
31
|
+
# +index_taxonomy+ is used to create a taxonomy index to allow subsequent
|
|
32
|
+
# taxonomic classification with +classify_seq+. The records with taxonomic
|
|
33
|
+
# information must contain :SEQ_NAME and :SEQ keys where the :SEQ_NAME value
|
|
34
|
+
# must be formatted with an initial ID number followed by a space and then the
|
|
35
|
+
# taxonomy string progressing from kingdom to species level. Only the
|
|
36
|
+
# following levels are accepted:
|
|
37
|
+
#
|
|
38
|
+
# * K - kingdom
|
|
39
|
+
# * P - phylum
|
|
40
|
+
# * C - class
|
|
41
|
+
# * O - order
|
|
42
|
+
# * F - family
|
|
43
|
+
# * G - genus
|
|
44
|
+
# * S - species
|
|
45
|
+
#
|
|
46
|
+
# Truncated taxonomic strings are allowed, e.g. a string that stops at family
|
|
47
|
+
# level. Below is an example of a full taxonomic string:
|
|
48
|
+
#
|
|
49
|
+
# 32 K#Bacteria;P#Actinobacteria;C#Actinobacteria;O#Acidimicrobiales; \
|
|
50
|
+
# F#Acidimicrobiaceae;G#Ferrimicrobium;S#Ferrimicrobium acidiphilum
|
|
51
|
+
#
|
|
52
|
+
# The resulting index consists of the following files (here using the default
|
|
53
|
+
# "taxonomy" as prefix) which are saved to a specified +output_dir+:
|
|
54
|
+
#
|
|
55
|
+
# * taxonomy_tax_index.dat - return node for a given node id.
|
|
56
|
+
# * taxonomy_kmer_index.dat - return list of node ids for a given level and
|
|
57
|
+
# kmer.
|
|
58
|
+
#
|
|
59
|
+
# The index is constructed by breaking the sequences into kmers of a given
|
|
60
|
+
# kmer_size and using a given step_size:
|
|
61
|
+
#
|
|
62
|
+
# Example FASTA entry:
|
|
63
|
+
#
|
|
64
|
+
# >2 K#Bacteria;P#Proteobacteria;C#Gammaproteobacteria;O#Vibrionales; \
|
|
65
|
+
# F#Vibrionaceae;G#Vibrio;S#Vibrio
|
|
66
|
+
# UCCUACGGGAGGCAGCAGUGGGGAAUAUUGCACAAUGGGCGCAAGCCUGAUGCAGCCAUGCCGCGUGUAUGA
|
|
67
|
+
#
|
|
68
|
+
# This sequence is broken down to a list of oligos using the default kmer_size
|
|
69
|
+
# and step_size of 8 and 1, respectively:
|
|
70
|
+
#
|
|
71
|
+
# UCCUACGG
|
|
72
|
+
# CCUACGGG
|
|
73
|
+
# CUACGGGA
|
|
74
|
+
# UACGGGAG
|
|
75
|
+
# ACGGGAGG
|
|
76
|
+
# ...
|
|
77
|
+
#
|
|
78
|
+
# Oligos containing ambiguity codes are skipped. Each oligo is encoded as an
|
|
79
|
+
# kmer (integer) by encoding two bits per nucleotide:
|
|
80
|
+
#
|
|
81
|
+
# * A = 00
|
|
82
|
+
# * U = 01
|
|
83
|
+
# * C = 10
|
|
84
|
+
# * G = 11
|
|
85
|
+
#
|
|
86
|
+
# E.g. UCCUACGG = 0110100100101111 = 26927
|
|
87
|
+
#
|
|
88
|
+
# For each node in the tree a vector is kept containing information of all
|
|
89
|
+
# observed oligos for that particular node. Thus all child nodes contain a
|
|
90
|
+
# subset of oligos compared to the parent node. Finally, the tree is saved to
|
|
91
|
+
# files.
|
|
92
|
+
#
|
|
93
|
+
# It should be noted that the speed and accuracy of the classification is
|
|
94
|
+
# strongly dependent on the size and quality of the taxonomic database used
|
|
95
|
+
# (RDP, GreenGenes or Silva) and for a particular amplicon it is strongly
|
|
96
|
+
# recommended to create a slice from the database alignment matching the
|
|
97
|
+
# amplicon.
|
|
98
|
+
#
|
|
99
|
+
# == Usage
|
|
100
|
+
#
|
|
101
|
+
# index_taxonomy(<output_dir: <dir>>[, kmer_size: <uint>
|
|
102
|
+
# [, step_size: <uint>[, prefix: <string>
|
|
103
|
+
# [, force: <bool>]]]])
|
|
104
|
+
#
|
|
105
|
+
# === Options
|
|
106
|
+
#
|
|
107
|
+
# * output_dir: <dir> - Output directory to contain index files.
|
|
108
|
+
# * kmer_size: <uint> - Size of kmer to use (default=8).
|
|
109
|
+
# * step_size: <uint> - Size of steps (default=1).
|
|
110
|
+
# * prefix: <string> - Prefix to use with index file names
|
|
111
|
+
# (default="taxonomy").
|
|
112
|
+
# * force: <bool> - Force overwrite existing index files.
|
|
113
|
+
#
|
|
114
|
+
# == Examples
|
|
115
|
+
#
|
|
116
|
+
# BP.new.
|
|
117
|
+
# read_fasta(input: "RDP_11_Bacteria.fna").
|
|
118
|
+
# index_taxonomy(output_dir: "RDP_11").
|
|
119
|
+
# run
|
|
120
|
+
class IndexTaxonomy
  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out)

  # Constructor for IndexTaxonomy.
  #
  # Fills in default option values, validates the options, makes sure the
  # output directory exists and refuses to continue if index files are
  # already present there (unless :force is given).
  #
  # @param options [Hash] Options hash.
  # @option options [String]  :output_dir Path to output directory.
  # @option options [String]  :prefix     Database file name prefix.
  # @option options [Integer] :kmer_size  Kmer size to use for indexing.
  # @option options [Integer] :step_size  Step size to use for indexing.
  # @option options [Boolean] :force      Flag for force-overwriting output
  #                                       files.
  #
  # @raise [BioDSL::OptionError] If an index file exists and :force is not
  #   set.
  # @raise [SystemCallError] If :output_dir exists but is not a directory.
  #
  # @return [IndexTaxonomy] Instance of class.
  def initialize(options)
    @options = options

    # Defaults are applied before validation so the asserts on :kmer_size
    # and :step_size always see a value.
    defaults
    check_options
    create_output_dir
    check_output_files

    @index = BioDSL::Taxonomy::Index.new(options)
  end

  # Return command lambda for index_taxonomy.
  #
  # The lambda passes every input record through to the output unchanged,
  # adds records holding both :SEQ_NAME and :SEQ to the index, and saves the
  # index once the input is exhausted.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      # status_init is defined outside this class; it is expected to make
      # the status hash available as @status.
      status_init(status, STATS)

      input.each do |record|
        @status[:records_in] += 1

        add_to_index(record) if record[:SEQ_NAME] && record[:SEQ]

        output << record
        @status[:records_out] += 1
      end

      @index.save
    end
  end

  private

  # Check options.
  #
  # The former options_files_exist_force(@options, :report) call was dropped:
  # :report is not among the allowed options, so that check could never see
  # a value.
  def check_options
    options_allowed(@options, :output_dir, :kmer_size, :step_size, :prefix,
                    :force)
    options_required(@options, :output_dir)
    options_allowed_values(@options, force: [nil, true, false])
    options_assert(@options, ':kmer_size > 0')
    options_assert(@options, ':kmer_size <= 12')
    options_assert(@options, ':step_size > 0')
    options_assert(@options, ':step_size <= 12')
  end

  # Set the default options hash values.
  def defaults
    @options[:prefix]    ||= 'taxonomy'
    @options[:kmer_size] ||= 8
    @options[:step_size] ||= 1
  end

  # Create the output directory specified in the options hash if this does
  # not already exist.
  #
  # The guard tests for a directory rather than mere existence, so that a
  # regular file sitting at the output path makes mkdir_p fail right away
  # instead of being silently accepted.
  #
  # @raise [SystemCallError] If the path exists but is not a directory.
  def create_output_dir
    dir = @options[:output_dir]

    return if File.directory?(dir)

    FileUtils.mkdir_p(dir)
  end

  # Check if the output files already exist and raise an exception if so and
  # the force option is not used.
  #
  # @raise [BioDSL::OptionError] If file exists and force option not used.
  def check_output_files
    return if @options[:force]

    existing = index_files.find { |file| File.exist?(file) }

    return unless existing

    msg = "File exists: #{existing} - use 'force: true' to overwrite"
    raise BioDSL::OptionError, msg
  end

  # Paths of the two index files written below the output directory.
  #
  # @return [Array<String>] Index file paths.
  def index_files
    %w(tax_index kmer_index).map do |type|
      File.join(@options[:output_dir], "#{@options[:prefix]}_#{type}.dat")
    end
  end

  # Add to the taxonomy index the sequence information from a given record.
  #
  # The leading ID token of :SEQ_NAME (everything up to the first space) is
  # discarded; the remainder is used as the sequence name.
  #
  # @param record [Hash] BioDSL record with sequence info.
  def add_to_index(record)
    @status[:sequences_in] += 1

    _, seq_name = record[:SEQ_NAME].split(' ', 2)

    @index.add(BioDSL::Seq.new(seq_name: seq_name, seq: record[:SEQ]))
  end
end
|
|
226
|
+
end
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Mask sequences in the stream based on quality scores.
|
|
30
|
+
#
|
|
31
|
+
# +mask_seq+ masks sequences in the stream using either hard masking or
|
|
32
|
+
# soft masking (default). Hard masking is replacing residues with
|
|
33
|
+
# corresponding quality score below a specified +quality_min+ with an N,
|
|
34
|
+
# while soft is replacing such residues with lower case. The sequences are
|
|
35
|
+
# values to SEQ keys and the quality scores are values to SCORES keys. The
|
|
36
|
+
# SCORES are encoded as ranges of ASCII characters from '!' to 'I'
|
|
37
|
+
# indicating scores from 0 to 40.
|
|
38
|
+
#
|
|
39
|
+
# == Usage
|
|
40
|
+
#
|
|
41
|
+
# mask_seq([quality_min: <uint>[, mask: <:soft|:hard>]])
|
|
42
|
+
#
|
|
43
|
+
# === Options
|
|
44
|
+
#
|
|
45
|
+
# * quality_min: <uint> - Minimum quality (default=20).
|
|
46
|
+
# * mask: <string> - Soft or Hard mask (default=soft).
|
|
47
|
+
#
|
|
48
|
+
# == Examples
|
|
49
|
+
#
|
|
50
|
+
# Consider the following FASTQ entry in the file test.fq:
|
|
51
|
+
#
|
|
52
|
+
# @HWI-EAS157_20FFGAAXX:2:1:888:434
|
|
53
|
+
# TTGGTCGCTCGCTCCGCGACCTCAGATCAGACGTGGGCGAT
|
|
54
|
+
# +HWI-EAS157_20FFGAAXX:2:1:888:434
|
|
55
|
+
# !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
|
|
56
|
+
#
|
|
57
|
+
# We can read in this sequence using +read_fastq+ and then soft mask the
|
|
58
|
+
# sequence with mask_seq like this:
|
|
59
|
+
#
|
|
60
|
+
# BP.new.read_fastq(input: "test.fq").mask_seq.dump.run
|
|
61
|
+
#
|
|
62
|
+
# {:SEQ_NAME=>"HWI-EAS157_20FFGAAXX:2:1:888:434",
|
|
63
|
+
# :SEQ=>"ttggtcgctcgctccgcgacCTCAGATCAGACGTGGGCGAT",
|
|
64
|
+
# :SEQ_LEN=>41,
|
|
65
|
+
# :SCORES=>"!\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI"}
|
|
66
|
+
#
|
|
67
|
+
# Using the +quality_min+ option we can change the cutoff:
|
|
68
|
+
#
|
|
69
|
+
# BP.new.read_fastq(input: "test.fq").mask_seq(quality_min: 25).dump.run
|
|
70
|
+
#
|
|
71
|
+
# {:SEQ_NAME=>"HWI-EAS157_20FFGAAXX:2:1:888:434",
|
|
72
|
+
# :SEQ=>"ttggtcgctcgctccgcgacctcagATCAGACGTGGGCGAT",
|
|
73
|
+
# :SEQ_LEN=>41,
|
|
74
|
+
# :SCORES=>"!\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI"}
|
|
75
|
+
#
|
|
76
|
+
# Using the +mask+ option for hard masking:
|
|
77
|
+
#
|
|
78
|
+
# BP.new.read_fastq(input: "test.fq").mask_seq(mask: :hard).dump.run
|
|
79
|
+
#
|
|
80
|
+
# {:SEQ_NAME=>"HWI-EAS157_20FFGAAXX:2:1:888:434",
|
|
81
|
+
# :SEQ=>"NNNNNNNNNNNNNNNNNNNNCTCAGATCAGACGTGGGCGAT",
|
|
82
|
+
# :SEQ_LEN=>41,
|
|
83
|
+
# :SCORES=>"!\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI"}
|
|
84
|
+
class MaskSeq
  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out masked)

  # Constructor for MaskSeq.
  #
  # @param options [Hash] Options hash.
  # @option options [Integer]       :quality_min Minimum quality score
  #                                              (default=20).
  # @option options [Symbol,String] :mask        Mask scheme, soft or hard
  #                                              (default=:soft).
  #
  # @return [MaskSeq] Instance of MaskSeq.
  def initialize(options)
    @options = options

    check_options
    defaults

    # :mask may be given as a String; normalise it once for the comparison
    # done per record.
    @mask = options[:mask].to_sym
  end

  # Return command lambda for mask_seq.
  #
  # Records holding both :SEQ and :SCORES are masked; all records are passed
  # on to the output. When the input is exhausted the percentage of masked
  # residues is stored in the status under :masked_percent.
  #
  # @return [Proc] command lambda.
  def lmb
    lambda do |input, output, status|
      # status_init is defined outside this class; it is expected to make
      # the status hash available as @status.
      status_init(status, STATS)

      input.each do |record|
        @status[:records_in] += 1

        mask_seq(record) if record[:SEQ] && record[:SCORES]

        output << record

        @status[:records_out] += 1
      end

      @status[:masked_percent] = masked_percent
    end
  end

  private

  # Check options.
  def check_options
    options_allowed(@options, :quality_min, :mask)
    options_allowed_values(@options, mask: [:soft, :hard, 'soft', 'hard'])
    options_assert(@options, ':quality_min >= 0')
    options_assert(@options, ':quality_min <= 40')
  end

  # Set default options.
  def defaults
    @options[:quality_min] ||= 20
    @options[:mask]        ||= :soft
  end

  # Percentage of residues counted as masked, rounded to two decimals.
  #
  # Yields 0.0 when no residues were seen at all, which avoids the NaN a
  # division by zero would otherwise put into the status.
  #
  # @return [Float] Masked percentage.
  def masked_percent
    total = @status[:residues_in]

    return 0.0 if total.zero?

    (100 * @status[:masked].to_f / total).round(2)
  end

  # Mask sequence in given record.
  #
  # @param record [Hash] BioDSL record.
  def mask_seq(record)
    entry = BioDSL::Seq.new_bp(record)

    @status[:sequences_in] += 1
    @status[:residues_in]  += entry.length

    if @mask == :soft
      mask_seq_soft(entry)
    else
      mask_seq_hard(entry)
    end

    @status[:sequences_out] += 1
    @status[:residues_out]  += entry.length

    record.merge! entry.to_bp
  end

  # Soft mask sequences in given entry.
  #
  # NOTE(review): every lower case residue present after masking is counted,
  # so residues that were lower case on input are included - confirm this is
  # intended.
  #
  # @param entry [BioDSL::Seq] sequences entry.
  def mask_seq_soft(entry)
    entry.mask_seq_soft!(@options[:quality_min])
    @status[:masked] += entry.seq.count('a-z')
  end

  # Hard mask sequences in given entry.
  #
  # NOTE(review): every N present after masking is counted, so Ns that were
  # in the input sequence are included - confirm this is intended.
  #
  # @param entry [BioDSL::Seq] sequences entry.
  def mask_seq_hard(entry)
    entry.mask_seq_hard!(@options[:quality_min])
    @status[:masked] += entry.seq.count('N')
  end
end
|
|
175
|
+
end
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Calculate the mean or local mean of quality SCORES in the stream.
|
|
30
|
+
#
|
|
31
|
+
# +mean_scores+ calculates either the global or local mean value of quality
|
|
32
|
+
# SCORES in the stream. The quality SCORES are encoded Phred style in
|
|
33
|
+
# character string.
|
|
34
|
+
#
|
|
35
|
+
# The global (default) behaviour calculates the SCORES_MEAN as the sum of all
|
|
36
|
+
# the scores over the length of the SCORES string.
|
|
37
|
+
#
|
|
38
|
+
# The local means SCORES_MEAN_LOCAL are calculated using means from a sliding
|
|
39
|
+
# window, where the smallest mean is returned.
|
|
40
|
+
#
|
|
41
|
+
# Thus, subquality records, with either an overall low mean quality or with
|
|
42
|
+
# local dip in quality, can be filtered using +grab+.
|
|
43
|
+
#
|
|
44
|
+
# == Usage
|
|
45
|
+
#
|
|
46
|
+
# mean_scores([local: <bool>[, window_size: <uint>]])
|
|
47
|
+
#
|
|
48
|
+
# === Options
|
|
49
|
+
#
|
|
50
|
+
# * local: <bool> - Calculate local mean score (default=false).
|
|
51
|
+
# * window_size: <uint> - Size of sliding window (default=5).
|
|
52
|
+
#
|
|
53
|
+
# == Examples
|
|
54
|
+
#
|
|
55
|
+
# Consider the following FASTQ entry in the file test.fq:
|
|
56
|
+
#
|
|
57
|
+
# @HWI-EAS157_20FFGAAXX:2:1:888:434
|
|
58
|
+
# TTGGTCGCTCGCTCGACCTCAGATCAGACGTGG
|
|
59
|
+
# +
|
|
60
|
+
# BCDEFGHIIIIIII,,,,,IFFIIIIIIIIIII
|
|
61
|
+
#
|
|
62
|
+
# The values of the scores in decimal are:
|
|
63
|
+
#
|
|
64
|
+
# SCORES: 33;34;35;36;37;38;39;40;40;40;40;40;40;40;11;11;11;11;11;40;37;
|
|
65
|
+
# 37;40;40;40;40;40;40;40;40;40;40;40;
|
|
66
|
+
#
|
|
67
|
+
# To calculate the mean score do:
|
|
68
|
+
#
|
|
69
|
+
# BP.new.read_fastq(input: "test.fq").mean_scores.dump.run
|
|
70
|
+
#
|
|
71
|
+
# {:SEQ_NAME=>"HWI-EAS157_20FFGAAXX:2:1:888:434",
|
|
72
|
+
# :SEQ=>"TTGGTCGCTCGCTCGACCTCAGATCAGACGTGG",
|
|
73
|
+
# :SEQ_LEN=>33,
|
|
74
|
+
# :SCORES=>"BCDEFGHIIIIIII,,,,,IFFIIIIIIIIIII",
|
|
75
|
+
# :SCORES_MEAN=>34.58}
|
|
76
|
+
#
|
|
77
|
+
# To calculate local means for a sliding window, do:
|
|
78
|
+
#
|
|
79
|
+
# BP.new.read_fastq(input: "test.fq").mean_scores(local: true).dump.run
|
|
80
|
+
#
|
|
81
|
+
# {:SEQ_NAME=>"HWI-EAS157_20FFGAAXX:2:1:888:434",
|
|
82
|
+
# :SEQ=>"TTGGTCGCTCGCTCGACCTCAGATCAGACGTGG",
|
|
83
|
+
# :SEQ_LEN=>33,
|
|
84
|
+
# :SCORES=>"BCDEFGHIIIIIII,,,,,IFFIIIIIIIIIII",
|
|
85
|
+
# :SCORES_MEAN_LOCAL=>11.0}
|
|
86
|
+
#
|
|
87
|
+
# Which indicates a local minimum was located at the stretch of ,,,,, =
|
|
88
|
+
# (11+11+11+11+11) / 5 = 11.0
|
|
89
|
+
class MeanScores
  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out min_mean max_mean mean_mean)

  # Constructor for MeanScores.
  #
  # @param options [Hash] Options hash.
  # @option options [Boolean] :local       Calculate local mean score.
  # @option options [Integer] :window_size Size of sliding window
  #                                        (default=5).
  #
  # @return [MeanScores] Class instance.
  def initialize(options)
    @options = options
    @min     = Float::INFINITY # smallest per-record mean seen so far
    @max     = 0               # largest per-record mean seen so far
    @sum     = 0               # sum of all per-record means
    @count   = 0               # number of records a mean was computed for

    # Options are checked before the defaults are applied, so the
    # :window_size/:local tie is evaluated on what the caller passed only.
    check_options
    defaults
  end

  # Return command lambda for mean_scores.
  #
  # Records with a non-empty :SCORES value get a :SCORES_MEAN (or
  # :SCORES_MEAN_LOCAL) key added; all records are passed on to the output.
  # When the input is exhausted the mean of the per-record means is stored in
  # the status under :mean_mean (0.0 if no record had scores).
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      # status_init is defined outside this class; it is expected to make
      # the status hash available as @status.
      status_init(status, STATS)

      input.each do |record|
        @status[:records_in] += 1

        calc_mean(record) if record[:SCORES] && record[:SCORES].length > 0

        output << record

        @status[:records_out] += 1
      end

      # Guard against 0/0, which would store NaN in the status.
      @status[:mean_mean] =
        @count.zero? ? 0.0 : (@sum.to_f / @count).round(2)
    end
  end

  private

  # Check options
  def check_options
    options_allowed(@options, :local, :window_size)
    options_tie(@options, window_size: :local)
    options_allowed_values(@options, local: [true, false])
    options_assert(@options, ':window_size > 1')
  end

  # Set default options.
  def defaults
    @options[:window_size] ||= 5
  end

  # Calculate the mean score for a given record and record
  # count, sum, min and max.
  #
  # Min and max are tracked in @min/@max (@min starts at infinity) and
  # mirrored into the status, so the first mean always becomes the minimum
  # regardless of the value the status was initialised with.
  #
  # @param record [Hash] BioDSL record.
  def calc_mean(record)
    entry = BioDSL::Seq.new_bp(record)

    if @options[:local]
      mean = entry.scores_mean_local(@options[:window_size]).round(2)
      record[:SCORES_MEAN_LOCAL] = mean
    else
      mean = entry.scores_mean.round(2)
      record[:SCORES_MEAN] = mean
    end

    @sum   += mean
    @count += 1
    @min    = mean if mean < @min
    @max    = mean if mean > @max

    @status[:min_mean] = @min
    @status[:max_mean] = @max
  end
end
|
|
168
|
+
end
|