BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Run usearch_local on sequences in the stream.
|
|
30
|
+
#
|
|
31
|
+
# This is a wrapper for the +usearch+ tool to run the program usearch_local.
|
|
32
|
+
# Basically sequence type records are searched against a reference database
|
|
33
|
+
# and records with hit information are output.
|
|
34
|
+
#
|
|
35
|
+
# Please refer to the manual:
|
|
36
|
+
#
|
|
37
|
+
# http://drive5.com/usearch/manual/cmd_usearch_local.html
|
|
38
|
+
#
|
|
39
|
+
# Usearch 7.0 must be installed for +usearch+ to work. Read more here:
|
|
40
|
+
#
|
|
41
|
+
# http://www.drive5.com/usearch/
|
|
42
|
+
#
|
|
43
|
+
# == Usage
|
|
44
|
+
#
|
|
45
|
+
# usearch_local(database: <file>, identity: <float>,
|
|
46
|
+
# <strand: "plus|both">[, cpus: <uint>])
|
|
47
|
+
#
|
|
48
|
+
# === Options
|
|
49
|
+
#
|
|
50
|
+
# * database: <file> - Database to search (in FASTA format).
|
|
51
|
+
# * identity: <float> - Similarity for matching as a fraction between 0.0 and
|
|
52
|
+
# 1.0.
|
|
53
|
+
# * strand: <string> - For nucleotide search report hits from plus or both
|
|
54
|
+
# strands.
|
|
55
|
+
# * cpus: <uint> - Number of CPU cores to use (default=1).
|
|
56
|
+
#
|
|
57
|
+
# == Examples
|
|
58
|
+
#
|
|
59
|
+
class UsearchLocal
  require 'BioDSL/helpers/aux_helper'

  include AuxHelper

  STATS = %i(records_in records_out sequences_in hits_out)

  # Build a new usearch_local command object.
  #
  # The given options hash is stored as-is (not copied) and +:cpus+ is set to
  # 1 when the caller did not supply a truthy value. Afterwards
  # +aux_exist('usearch')+ is invoked (presumably verifying that the usearch
  # executable can be found - confirm in AuxHelper) and the options are
  # validated.
  #
  # @param options [Hash] Options hash.
  # @option options [String]        :database Path to the database file.
  # @option options [Float]         :identity Identity threshold.
  # @option options [String,Symbol] :strand   'plus' or 'both'.
  # @option options [Integer]       :cpus     Number of CPU cores.
  #
  # @return [UsearchLocal] Class instance.
  def initialize(options)
    @options = options
    @options[:cpus] = 1 unless @options[:cpus]

    aux_exist('usearch')
    check_options
  end

  # Build the lambda that executes the command.
  #
  # When called, the lambda initialises the status counters, then - inside a
  # temporary directory block - dumps the sequences to a FASTA file, runs
  # usearch_local on that file and finally emits the resulting hits.
  #
  # @return [Proc] Command lambda taking input, output and status.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      TmpDir.create('in', 'out') do |query_file, hits_file|
        process_input(input, output, query_file)
        run_usearch_local(query_file, hits_file)
        process_output(output, hits_file)
      end
    end
  end

  private

  # Validate the options hash using the shared option helpers: allowed keys,
  # required keys, allowed +:strand+ values, existence of the database file
  # and the numeric bounds on +:identity+ and +:cpus+.
  def check_options
    opts = @options

    options_allowed(opts, :database, :identity, :strand, :cpus)
    options_required(opts, :database, :identity)
    options_allowed_values(opts, strand: ['plus', 'both', :plus, :both])
    options_files_exist(opts, :database)

    options_assert(opts, ':identity > 0.0')
    options_assert(opts, ':identity <= 1.0')
    options_assert(opts, ':cpus >= 1')
    options_assert(opts, ":cpus <= #{BioDSL::Config::CORES_MAX}")
  end

  # Pass every input record straight through to the output stream and, for
  # each record that carries a +:SEQ+ value, append a FASTA entry to the
  # temporary query file. Records lacking +:SEQ_NAME+ are named after their
  # position in the input stream.
  #
  # @param input  [Enumerator]          Input stream.
  # @param output [Enumerator::Yielder] Output stream.
  # @param tmp_in [String]              Path to the temporary FASTA file.
  def process_input(input, output, tmp_in)
    BioDSL::Fasta.open(tmp_in, 'w') do |fasta|
      input.each_with_index do |rec, idx|
        @status[:records_in] += 1
        output << rec
        @status[:records_out] += 1

        if rec[:SEQ]
          @status[:sequences_in] += 1

          name = rec[:SEQ_NAME] || idx.to_s

          fasta.puts BioDSL::Seq.new(seq_name: name, seq: rec[:SEQ]).to_fasta
        end
      end
    end
  end

  # Run usearch_local with the temporary query file as input and write the
  # result to the temporary output file. A BioDSL::UsearchError whose message
  # mentions an empty input file is swallowed; any other UsearchError is
  # re-raised.
  #
  # @param tmp_in  [String] Path to the query FASTA file.
  # @param tmp_out [String] Path to the file receiving the usearch result.
  def run_usearch_local(tmp_in, tmp_out)
    settings = {
      input:    tmp_in,
      output:   tmp_out,
      database: @options[:database],
      strand:   @options[:strand],
      identity: @options[:identity],
      cpus:     @options[:cpus],
      verbose:  @options[:verbose]
    }

    BioDSL::Usearch.usearch_local(settings)
  rescue BioDSL::UsearchError => error
    # An empty query file is not an error for the pipeline: no hits result.
    return if error.message =~ /Empty input file/

    raise
  end

  # Read the usearch result file in +:uc+ format, tag each parsed record with
  # RECORD_TYPE 'usearch' and emit it to the output stream while updating the
  # hit and record counters.
  #
  # @param output  [Enumerator::Yielder] Output stream.
  # @param tmp_out [String]              Path to the usearch result file.
  def process_output(output, tmp_out)
    BioDSL::Usearch.open(tmp_out) do |reader|
      reader.each(:uc) do |hit|
        hit[:RECORD_TYPE] = 'usearch'

        output << hit

        @status[:hits_out]    += 1
        @status[:records_out] += 1
      end
    end
  end
end
|
|
171
|
+
end
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Write sequences from stream in FASTA format.
|
|
30
|
+
#
|
|
31
|
+
# Description
|
|
32
|
+
#
|
|
33
|
+
# +write_fasta+ writes sequence from the data stream in FASTA format.
|
|
34
|
+
# However, a FASTA entry will only be written if a SEQ key and a SEQ_NAME key
|
|
35
|
+
# is present. An example FASTA entry:
|
|
36
|
+
#
|
|
37
|
+
# >test1
|
|
38
|
+
# TATGACGCGCATCGACAGCAGCACGAGCATGCATCGACTG
|
|
39
|
+
# TGCACTGACTACGAGCATCACTATATCATCATCATAATCT
|
|
40
|
+
# TACGACATCTAGGGACTAC
|
|
41
|
+
#
|
|
42
|
+
# For more about the FASTA format:
|
|
43
|
+
#
|
|
44
|
+
# http://en.wikipedia.org/wiki/FASTA_format
|
|
45
|
+
#
|
|
46
|
+
# == Usage
|
|
47
|
+
# write_fasta([wrap: <uint>[, output: <file>[, force: <bool>
|
|
48
|
+
# [, gzip: <bool> | bzip2: <bool>]]]])
|
|
49
|
+
#
|
|
50
|
+
# === Options
|
|
51
|
+
# * output <file> - Output file.
|
|
52
|
+
# * force <bool> - Force overwrite existing output file.
|
|
53
|
+
# * wrap <uint> - Wrap sequence into lines of wrap length.
|
|
54
|
+
# * gzip <bool> - Write gzipped output file.
|
|
55
|
+
# * bzip2 <bool> - Write bzipped output file.
|
|
56
|
+
#
|
|
57
|
+
# == Examples
|
|
58
|
+
#
|
|
59
|
+
# To write FASTA entries to STDOUT.
|
|
60
|
+
#
|
|
61
|
+
# write_fasta
|
|
62
|
+
#
|
|
63
|
+
# To write FASTA entries wrapped in lines of length of 80 to STDOUT.
|
|
64
|
+
#
|
|
65
|
+
# write_fasta(wrap: 80)
|
|
66
|
+
#
|
|
67
|
+
# To write FASTA entries to a file 'test.fna'.
|
|
68
|
+
#
|
|
69
|
+
# write_fasta(output: "test.fna")
|
|
70
|
+
#
|
|
71
|
+
# To overwrite output file if this exists use the force option:
|
|
72
|
+
#
|
|
73
|
+
# write_fasta(output: "test.fna", force: true)
|
|
74
|
+
#
|
|
75
|
+
# To write gzipped FASTA entries to file 'test.fna.gz'.
|
|
76
|
+
#
|
|
77
|
+
# write_fasta(output: "test.fna.gz", gzip: true)
|
|
78
|
+
#
|
|
79
|
+
# To write bzipped FASTA entries to file 'test.fna.bz2'.
|
|
80
|
+
#
|
|
81
|
+
# write_fasta(output: "test.fna.bz2", bzip2: true)
|
|
82
|
+
class WriteFasta
  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out)

  # Constructor for the WriteFasta class.
  #
  # The options are validated first; afterwards +:output+ falls back to
  # +$stdout+ when no output file was given.
  #
  # @param [Hash] options Options hash.
  # @option options [Bool]    :force  Flag allowing overwriting files.
  # @option options [String]  :output Output file path.
  # @option options [Integer] :wrap   Wrap sequences at this length (default
  #                                   no wrap)
  # @option options [Bool]    :gzip   Output will be gzip'ed.
  # @option options [Bool]    :bzip2  Output will be bzip2'ed.
  #
  # @return [WriteFasta] Returns an instance of the class.
  def initialize(options)
    @options = options
    check_options
    @options[:output] ||= $stdout
  end

  # Return a lambda for the write_fasta command.
  #
  # The lambda initialises the status counters and then writes either to
  # stdout or to the configured output file.
  #
  # @return [Proc] Returns the write_fasta command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      if @options[:output] == $stdout
        write_stdout(input, output)
      else
        write_file(input, output)
      end
    end
  end

  private

  # Check the options: only known keys, gzip and bzip2 are mutually
  # exclusive and both are tied to +:output+, and the output file is checked
  # through options_files_exist_force.
  def check_options
    options_allowed(@options, :force, :output, :wrap, :gzip, :bzip2)
    options_unique(@options, :gzip, :bzip2)
    options_tie(@options, gzip: :output, bzip2: :output)
    options_files_exist_force(@options, :output)
  end

  # Write all sequence entries to stdout.
  #
  # @param input  [Enumerator]          The input stream.
  # @param output [Enumerator::Yielder] The output stream.
  def write_stdout(input, output)
    write_entries(input, output, $stdout)
  end

  # Write all sequence entries to the file given by the +:output+ option,
  # compressed according to the +:gzip+ / +:bzip2+ options.
  #
  # @param input  [Enumerator]          The input stream.
  # @param output [Enumerator::Yielder] The output stream.
  def write_file(input, output)
    Fasta.open(@options[:output], 'w', compress: compress) do |ios|
      write_entries(input, output, ios)
    end
  end

  # Shared record loop used for both stdout and file output. Every record is
  # counted and forwarded to the output stream; records holding both
  # SEQ_NAME and SEQ are additionally written to +ios+ in FASTA format and
  # counted as sequences/residues in and out.
  #
  # @param input  [Enumerator]               The input stream.
  # @param output [Enumerator::Yielder, nil] The output stream.
  # @param ios    [#puts]                    IO receiving the FASTA text.
  def write_entries(input, output, ios)
    wrap = @options[:wrap]

    input.each do |record|
      @status[:records_in] += 1

      if (entry = record2entry(record))
        ios.puts entry.to_fasta(wrap)
        @status[:sequences_in]  += 1
        @status[:sequences_out] += 1
        @status[:residues_in]   += entry.length
        @status[:residues_out]  += entry.length
      end

      write_output(output, record)
    end
  end

  # Write a given record to the output stream if this exist.
  #
  # @param output [Enumerator::Yielder, nil] Output stream.
  # @param record [Hash] BioDSL record to write.
  def write_output(output, record)
    return unless output

    output << record
    @status[:records_out] += 1
  end

  # Creates a Seq object from a given record if SEQ_NAME and SEQ is present.
  #
  # @param record [Hash] BioDSL record to convert.
  #
  # @return [BioDSL::Seq, nil] Sequence entry, or nil when the record lacks
  #   the SEQ_NAME or the SEQ key.
  def record2entry(record)
    return unless record.key? :SEQ_NAME
    return unless record.key? :SEQ

    BioDSL::Seq.new_bp(record)
  end

  # Determine what compression should be used for output.
  #
  # @return [Symbol, nil] Compression flag or nil if no compression.
  def compress
    if @options[:gzip]
      :gzip
    elsif @options[:bzip2]
      :bzip2
    end
  end
end
|
|
207
|
+
end
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Write sequences from stream in FASTQ format.
|
|
30
|
+
#
|
|
31
|
+
# Description
|
|
32
|
+
#
|
|
33
|
+
# +write_fastq+ writes sequence from the data stream in FASTQ format. However,
|
|
34
|
+
# a FASTQ entry will only be written if SEQ, SEQ_NAME and SCORES keys are
|
|
35
|
+
# present. An example FASTQ entry:
|
|
36
|
+
#
|
|
37
|
+
# >test1
|
|
38
|
+
# TATGACGCGCATCGACAGCAGCACGAGCATGCATCGACTG
|
|
39
|
+
# TGCACTGACTACGAGCATCACTATATCATCATCATAATCT
|
|
40
|
+
# TACGACATCTAGGGACTAC
|
|
41
|
+
#
|
|
42
|
+
# For more about the FASTQ format:
|
|
43
|
+
#
|
|
44
|
+
# http://en.wikipedia.org/wiki/FASTQ_format
|
|
45
|
+
#
|
|
46
|
+
# == Usage
|
|
47
|
+
# write_fastq([encoding: <:base_33|:base_64>[, output: <file>
|
|
48
|
+
# [, force: <bool>[, gzip: <bool> | bzip2: <bool>]]])
|
|
49
|
+
#
|
|
50
|
+
# === Options
|
|
51
|
+
# * encoding <base> - Encoding quality scores using :base_33 (default) or
|
|
52
|
+
# :base_64.
|
|
53
|
+
# * output <file> - Output file.
|
|
54
|
+
# * force <bool> - Force overwrite existing output file.
|
|
55
|
+
# * gzip <bool> - Write gzipped output file.
|
|
56
|
+
# * bzip2 <bool> - Write bzipped output file.
|
|
57
|
+
#
|
|
58
|
+
# == Examples
|
|
59
|
+
#
|
|
60
|
+
# To write FASTQ entries to STDOUT.
|
|
61
|
+
#
|
|
62
|
+
# write_fastq
|
|
63
|
+
#
|
|
64
|
+
# To write FASTQ entries to a file 'test.fq'.
|
|
65
|
+
#
|
|
66
|
+
# write_fastq(output: "test.fq")
|
|
67
|
+
#
|
|
68
|
+
# To overwrite output file if this exists use the force option:
|
|
69
|
+
#
|
|
70
|
+
# write_fastq(output: "test.fq", force: true)
|
|
71
|
+
#
|
|
72
|
+
# To write gzipped FASTQ entries to file 'test.fq.gz'.
|
|
73
|
+
#
|
|
74
|
+
# write_fastq(output: "test.fq.gz", gzip: true)
|
|
75
|
+
#
|
|
76
|
+
# To write bzipped FASTQ entries to file 'test.fq.bz2'.
|
|
77
|
+
#
|
|
78
|
+
# write_fastq(output: "test.fq.bz2", bzip2: true)
|
|
79
|
+
class WriteFastq
  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out)

  # Set up a write_fastq command.
  #
  # Options are validated before +:output+ falls back to +$stdout+; the
  # compression and quality score encoding are resolved once and cached.
  #
  # @param options [Hash] Options hash.
  # @option options [String,Symbol] :encoding Quality score encoding.
  # @option options [Boolean]       :force    Allow overwriting the output.
  # @option options [String]        :output   Output file path.
  # @option options [Boolean]       :gzip     Gzip the output file.
  # @option options [Boolean]       :bzip2    Bzip2 the output file.
  #
  # @return [WriteFastq] Class instance.
  def initialize(options)
    @options = options
    check_options

    @options[:output] ||= $stdout
    @compress = choose_compression
    @encoding = choose_encoding
  end

  # Build the lambda executing write_fastq: FASTQ data goes to stdout when
  # that is the configured output, otherwise to the output file opened
  # through Fastq.open with the chosen compression.
  #
  # @return [Proc] Command lambda taking input, output and status.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      target = @options[:output]

      next process_input(input, output, $stdout) if target == $stdout

      Fastq.open(target, 'w', compress: @compress) do |fastq|
        process_input(input, output, fastq)
      end
    end
  end

  private

  # Validate the options: allowed keys, allowed encoding values, gzip and
  # bzip2 being mutually exclusive and tied to +:output+, and the output
  # file check done by options_files_exist_force.
  def check_options
    encodings = [:base_33, :base_64, 'base_33', 'base_64']

    options_allowed(@options, :encoding, :force, :output, :gzip, :bzip2)
    options_allowed_values(@options, encoding: encodings)
    options_unique(@options, :gzip, :bzip2)
    options_tie(@options, gzip: :output, bzip2: :output)
    options_files_exist_force(@options, :output)
  end

  # Walk the input stream. Records with a SEQ value are counted as incoming
  # sequences/residues and, when SEQ_NAME and SCORES are present as well,
  # written as FASTQ to the given IO. Every record is forwarded to the
  # output stream when one is given.
  #
  # @param input  [Enumerable]                Input stream.
  # @param output [Enumerator::Yielder, nil]  Output stream.
  # @param ios    [BioDSL::Fastq::IO,STDOUT]  Output IO.
  def process_input(input, output, ios)
    input.each do |rec|
      @status[:records_in] += 1

      seq = rec[:SEQ]

      if seq
        @status[:sequences_in] += 1
        @status[:residues_in]  += seq.length

        write_fastq(rec, ios) if rec[:SEQ_NAME] && rec[:SCORES]
      end

      next unless output

      output << rec
      @status[:records_out] += 1
    end
  end

  # Convert a BioDSL record to a sequence entry, convert its quality scores
  # from :base_33 to the configured encoding and write the entry in FASTQ
  # format to the specified IO, updating the outgoing counters.
  #
  # @param record [Hash]                      BioDSL record.
  # @param ios    [BioDSL::Fastq::IO,STDOUT]  Output IO.
  def write_fastq(record, ios)
    entry = BioDSL::Seq.new_bp(record).tap do |seq_entry|
      seq_entry.qual_convert!(:base_33, @encoding)
    end

    ios.puts entry.to_fastq

    @status[:sequences_out] += 1
    @status[:residues_out]  += entry.length
  end

  # Choose the compression to use: gzip takes precedence over bzip2, and nil
  # means no compression.
  #
  # @return [Symbol,nil] Compression.
  def choose_compression
    return :gzip if @options[:gzip]

    :bzip2 if @options[:bzip2]
  end

  # Choose the quality score encoding: the +:encoding+ option as a symbol,
  # or :base_33 when the option is not set.
  #
  # @return [Symbol] Encoding.
  def choose_encoding
    (@options[:encoding] || :base_33).to_sym
  end
end
|
|
191
|
+
end
|