BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Filter rRNA sequences from the stream.
|
|
30
|
+
#
|
|
31
|
+
# Description
|
|
32
|
+
#
|
|
33
|
+
# +filter_rrna+ utilizes +sortmerna+ to identify and filter ribosomal RNA
|
|
34
|
+
# sequences from the stream. The +sortmerna+ and +indexdb_rna+ executables
|
|
35
|
+
# must be installed for +filter_rrna+ to work.
|
|
36
|
+
#
|
|
37
|
+
# Indexed reference files are produced using +indexdb_rna+.
|
|
38
|
+
#
|
|
39
|
+
# For more about sortmerna look here:
|
|
40
|
+
#
|
|
41
|
+
# http://bioinfo.lifl.fr/RNA/sortmerna/
|
|
42
|
+
#
|
|
43
|
+
# == Usage
|
|
44
|
+
# filter_rrna(ref_fasta: <file(s)>, ref_index: <file(s)>)
|
|
45
|
+
#
|
|
46
|
+
# === Options
|
|
47
|
+
# * ref_fasta <file(s)> - One or more reference FASTA files.
|
|
48
|
+
# * ref_index <file(s)> - One or more index reference files.
|
|
49
|
+
#
|
|
50
|
+
# == Examples
|
|
51
|
+
#
|
|
52
|
+
# To filter all reads matching the SILVA archaea 23S rRNA do:
|
|
53
|
+
#
|
|
54
|
+
# BP.new.
|
|
55
|
+
# read_fastq(input: "reads.fq").
|
|
56
|
+
# filter_rrna(ref_fasta: ["silva-arc-23s-id98.fasta"],
|
|
57
|
+
# ref_index: ["silva-arc-23s-id98.fasta.idx*"]).
|
|
58
|
+
# write_fastq(output: "clean.fq").
|
|
59
|
+
# run
|
|
60
|
+
#
|
|
61
|
+
# rubocop:disable ClassLength
|
|
62
|
+
class FilterRrna
|
|
63
|
+
require 'English'
|
|
64
|
+
require 'set'
|
|
65
|
+
require 'BioDSL/helpers/aux_helper'
|
|
66
|
+
|
|
67
|
+
include AuxHelper
|
|
68
|
+
|
|
69
|
+
# Names of the status counters handed to status_init by the command lambda.
STATS = [:records_in, :records_out,
         :sequences_in, :sequences_out,
         :residues_in, :residues_out]
|
|
71
|
+
|
|
72
|
+
# Constructor for the FilterRrna class.
#
# Stores the options, creates the empty set that later holds the indices of
# the sequences to filter, and then runs the dependency and option checks.
#
# @param options [Hash] Options hash.
# @option options [String,Array] :ref_fasta Path(s) to reference FASTA files.
# @option options [String,Array] :ref_index Path(s) to reference index files.
#
# @return [FilterRrna] Class instance of FilterRrna.
def initialize(options)
  @filter  = Set.new
  @options = options

  # NOTE(review): aux_exist comes from AuxHelper; presumably it verifies that
  # the 'sortmerna' executable is available - confirm against the helper.
  aux_exist('sortmerna')
  check_options
end
|
|
86
|
+
|
|
87
|
+
# Return the command lambda for filter_rrna.
#
# When called, the lambda initializes the status counters, writes the input
# stream to temporary files, runs 'sortmerna' on the sequences, collects the
# indices of the aligned sequences and finally emits the records that were
# not filtered.
#
# @return [Proc] Command lambda taking input, output and status.
def lmb
  ->(input, output, status) do
    status_init(status, STATS)

    TmpDir.create('tmp', 'seq', 'out') do |records_path, reads_path, hits_path|
      ref_string = process_ref_files

      process_input(input, records_path, reads_path)
      execute_sortmerna(ref_string, reads_path, hits_path)
      parse_sortme_output(hits_path)
      process_output(output, records_path)
    end
  end
end
|
|
103
|
+
|
|
104
|
+
private
|
|
105
|
+
|
|
106
|
+
# Check options: +ref_fasta+ and +ref_index+ are the only keys passed as
# allowed, and both are handed to the file existence check.
def check_options
  keys = %i(ref_fasta ref_index)

  options_allowed(@options, *keys)
  options_files_exist(@options, *keys)
end
|
|
111
|
+
|
|
112
|
+
# Given reference index and fasta files in the options hash, process these
# into a string of the format read by 'sortmerna': fasta1,id1:fasta2,id2:...
#
# Both options may be given either as an Array of paths or as a single String
# with comma separated paths. A trailing '*' on an index path is removed. The
# values in the options hash are never modified, so frozen strings are safe.
#
# @return [String] Reference file string for sortmerna.
def process_ref_files
  fastas  = @options[:ref_fasta]
  indexes = @options[:ref_index]

  # Split comma separated strings into flat lists so that the fasta and index
  # files are paired one-to-one by zip below.
  fastas  = fastas.split(',')  if fastas.is_a? String
  indexes = indexes.split(',') if indexes.is_a? String

  # Non-destructive sub: strip the trailing glob star from every index path
  # without touching the caller's strings.
  indexes = indexes.map { |file| file.sub(/\*$/, '') }

  fastas.zip(indexes).map { |pair| pair.join(',') }.join(':')
end
|
|
131
|
+
|
|
132
|
+
# Execute 'sortmerna'.
#
# The command is run from an argument vector, not through a shell, so paths
# containing spaces or shell metacharacters are passed to 'sortmerna'
# verbatim.
#
# @param ref_files [String] Reference file string for sortmerna.
# @param seq_file  [String] Path to input file with reads.
# @param out_file  [String] Path to output file.
#
# @raise [RuntimeError] if execution of 'sortmerna' fails.
def execute_sortmerna(ref_files, seq_file, out_file)
  cmd = ['sortmerna',
         '--ref', ref_files,
         '--reads', seq_file,
         '--aligned', out_file,
         '--fastx']
  cmd << '-v' if BioDSL.verbose

  # Human readable form, used for logging and the error message only.
  cmd_line = cmd.join(' ')

  $stderr.puts "Running command: #{cmd_line}" if BioDSL.verbose

  # system returns true only for a zero exit status; false (non-zero exit)
  # and nil (command could not be executed) both count as failure.
  fail "command failed: #{cmd_line}" unless system(*cmd.map(&:to_s))
end
|
|
155
|
+
|
|
156
|
+
# Parse the 'sortmerna' output FASTA file and add the sequence name of every
# entry, converted to an integer, to the filter set.
#
# @param out_file [String] Path to output file; '.fasta' is appended to this
#   path before the file is opened.
def parse_sortme_output(out_file)
  hits_path = "#{out_file}.fasta"

  BioDSL::Fasta.open(hits_path, 'r') do |fasta_io|
    fasta_io.each { |hit| @filter << hit.seq_name.to_i }
  end
end
|
|
167
|
+
|
|
168
|
+
# Process the input stream: every record is serialized to a temporary file,
# and every record with a :SEQ key is additionally written to a temporary
# FASTA file, named by its index in the stream.
#
# @param input    [Enumerator] Input stream.
# @param tmp_file [String]     Path to tmp file for serialized records.
# @param seq_file [String]     Path to tmp FASTA sequence file.
def process_input(input, tmp_file, seq_file)
  BioDSL::Fasta.open(seq_file, 'w') do |fasta_io|
    File.open(tmp_file, 'wb') do |records_io|
      BioDSL::Serializer.new(records_io) do |serializer|
        input.each_with_index do |record, index|
          @status[:records_in] += 1
          serializer << record

          next unless record.key? :SEQ

          # FIXME: need << method
          fasta_io.puts record2entry(record, index).to_fasta
        end
      end
    end
  end
end
|
|
189
|
+
|
|
190
|
+
# Build a sequence entry from a BioDSL record, using the given index as the
# sequence name, and update the sequences_in and residues_in counters.
#
# @param record [Hash]    BioDSL record
# @param i      [Integer] Index.
#
# @return [BioDSL::Seq] Sequence entry.
def record2entry(record, i)
  BioDSL::Seq.new(seq_name: i, seq: record[:SEQ]).tap do |entry|
    @status[:sequences_in] += 1
    @status[:residues_in]  += entry.length
  end
end
|
|
203
|
+
|
|
204
|
+
# Read the serialized records back from the temporary file and pass each
# one, together with its index, to output_record, which emits the records
# that do not match the filter.
#
# @param output   [Enumerator::Yielder] Output stream.
# @param tmp_file [String] Path to tmp file with serialized records.
def process_output(output, tmp_file)
  File.open(tmp_file, 'rb') do |records_io|
    BioDSL::Serializer.new(records_io) do |serializer|
      serializer.each_with_index do |record, index|
        output_record(output, record, index)
      end
    end
  end
end
|
|
218
|
+
|
|
219
|
+
# Output a record to the output stream unless it has a :SEQ key and its index
# is in the filter set. Records without :SEQ are always emitted. The
# records_out counter is updated for every emitted record; sequences_out and
# residues_out only for emitted sequence records.
#
# @param output [Enumerator::Yielder] Output stream.
# @param record [Hash]                BioDSL record.
# @param i      [Integer]             Index.
def output_record(output, record, i)
  has_seq = record.key? :SEQ

  return if has_seq && @filter.include?(i)

  output << record
  @status[:records_out] += 1

  return unless has_seq

  @status[:sequences_out] += 1
  @status[:residues_out]  += record[:SEQ].length
end
|
|
238
|
+
end
|
|
239
|
+
end
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Genecall sequences in the stream.
|
|
30
|
+
#
|
|
31
|
+
# +Genecall+ predicts genes in prokaryotic single genomes or metagenomes using
|
|
32
|
+
# Prodigal 2.6 which must be installed:
|
|
33
|
+
#
|
|
34
|
+
# http://prodigal.ornl.gov/
|
|
35
|
+
#
|
|
36
|
+
# The records produced are of the type:
|
|
37
|
+
#
|
|
38
|
+
# {:RECORD_TYPE=>"genecall",
|
|
39
|
+
# :S_BEG=>2, :S_END=>109,
|
|
40
|
+
# :S_LEN=>108,
|
|
41
|
+
# :STRAND=>"-",
|
|
42
|
+
# :SEQ_NAME=>"contig1",
|
|
43
|
+
# :SEQ=>"MGKVIGIDLGTTNSCVAVMDGKTAKVIENAEGMRTT",
|
|
44
|
+
# :SEQ_LEN=>36}
|
|
45
|
+
#
|
|
46
|
+
# == Usage
|
|
47
|
+
#
|
|
48
|
+
# genecall([type: <string>[, procedure: <string>[, closed_ends: <bool>
|
|
49
|
+
# [, masked: <bool>]]]])
|
|
50
|
+
#
|
|
51
|
+
# === Options
|
|
52
|
+
#
|
|
53
|
+
# * type: <string> - Output dna or protein sequence (default: dna).
|
|
54
|
+
# * procedure: <string> - Single or meta (default: single).
|
|
55
|
+
# * closed_ends: <bool> - Don't allow truncated gene at ends.
|
|
56
|
+
# * masked: <bool> - Ignore stretch of Ns.
|
|
57
|
+
#
|
|
58
|
+
# == Examples
|
|
59
|
+
#
|
|
60
|
+
# To genecall a genome do:
|
|
61
|
+
#
|
|
62
|
+
# BP.new.
|
|
63
|
+
# read_fasta(input: "contigs.fna").
|
|
64
|
+
# genecall.
|
|
65
|
+
# grab(select: "genecall", key: :type, exact: true).
|
|
66
|
+
# write_fasta(output: "genes.fna").
|
|
67
|
+
# run
|
|
68
|
+
#
|
|
69
|
+
# To add genecall data to the sequence name use +merge_values+:
|
|
70
|
+
#
|
|
71
|
+
# BP.new.
|
|
72
|
+
# read_fasta(input: "contigs.fna").
|
|
73
|
+
# genecall(type: "protein").
|
|
74
|
+
# grab(select: "genecall", key: :type, exact: true).
|
|
75
|
+
# merge_values(keys: [:SEQ_NAME, :S_BEG, :S_END, :S_LEN, :STRAND]).
|
|
76
|
+
# write_fasta(output: "genes.faa").
|
|
77
|
+
# run
|
|
78
|
+
class Genecall
|
|
79
|
+
require 'English'
|
|
80
|
+
require 'BioDSL/helpers/aux_helper'
|
|
81
|
+
|
|
82
|
+
include AuxHelper
|
|
83
|
+
|
|
84
|
+
# Names of the status counters handed to status_init by the command lambda.
STATS = [:records_in, :records_out,
         :sequences_in, :sequences_out,
         :residues_in, :residues_out]
|
|
86
|
+
|
|
87
|
+
# Constructor for the Genecall class.
#
# Stores the options, creates the empty hash that later maps stream indices
# to sequence names, applies the default option values and checks the
# options. The output type is finally stored as a Symbol.
#
# @param [Hash] options Options hash.
# @option options [Symbol,String] :type of output.
# @option options [Symbol,String] :procedure used for genecalling.
# @option options [Boolean] :closed_ends disallow truncated genes at ends.
# @option options [Boolean] :masked ignore stretch of Ns.
#
# @return [Genecall] Returns an instance of the class.
def initialize(options)
  @names   = {}
  @options = options

  # NOTE(review): aux_exist comes from AuxHelper; presumably it verifies that
  # the 'prodigal' executable is available - confirm against the helper.
  aux_exist('prodigal')
  defaults
  check_options

  @type = @options[:type].to_sym
end
|
|
106
|
+
|
|
107
|
+
# Return a lambda for the genecall command.
#
# When called, the lambda initializes the status counters, writes the input
# sequences to a temporary FASTA file, runs Prodigal on that file and emits
# the resulting genecall records.
#
# @return [Proc] Returns the command lambda.
def lmb
  ->(input, output, status) do
    status_init(status, STATS)

    TmpDir.create('i.fa', 'o.fna', 'o.faa') do |fa_in, fna_out, faa_out|
      process_input(input, output, fa_in)
      run_prodigal(fa_in, fna_out, faa_out)
      process_output(output, fna_out, faa_out)
    end
  end
end
|
|
121
|
+
|
|
122
|
+
private
|
|
123
|
+
|
|
124
|
+
# Run Prodigal on the input file.
#
# The command is run from an argument vector, not through a shell, so paths
# containing spaces or shell metacharacters are passed to 'prodigal'
# verbatim. Unless BioDSL.verbose is set, '-q' is added and stdout and stderr
# of the child process are redirected to the null device.
#
# @param tmp_in  [String] Path to input FASTA file.
# @param tmp_fna [String] Path to output FASTA DNA file.
# @param tmp_faa [String] Path to output FASTA Protein file.
#
# @raise [RuntimeError] if execution of 'prodigal' fails; the message is the
#   command line.
def run_prodigal(tmp_in, tmp_fna, tmp_faa)
  quiet = !BioDSL.verbose

  cmd = ['prodigal', '-f', 'gff']
  cmd << '-c' if @options[:closed_ends]
  cmd << '-m' if @options[:masked]
  cmd.push('-p', @options[:procedure])
  cmd.push('-i', tmp_in, '-d', tmp_fna, '-a', tmp_faa)
  cmd << '-q' if quiet

  argv = cmd.map(&:to_s)

  # Human readable form, used for logging and the error message only. The
  # redirection suffix mirrors what is done with spawn options below.
  cmd_line = argv.join(' ')
  cmd_line += ' > /dev/null 2>&1' if quiet

  $stderr.puts "Running: #{cmd_line}" unless quiet

  # system returns true only for a zero exit status; false (non-zero exit)
  # and nil (command could not be executed) both count as failure.
  ok = if quiet
         system(*argv, out: File::NULL, err: File::NULL)
       else
         system(*argv)
       end

  fail cmd_line unless ok
end
|
|
149
|
+
|
|
150
|
+
# Check the options: only :type, :procedure, :closed_ends and :masked are
# passed as allowed keys, and each of them is checked against its list of
# allowed values.
def check_options
  options_allowed(@options, :type, :procedure, :closed_ends, :masked)

  {
    type:        [:dna, :protein, 'dna', 'protein'],
    procedure:   ['single', 'meta', :single, :meta],
    closed_ends: [nil, true, false],
    masked:      [nil, true, false]
  }.each do |key, allowed|
    options_allowed_values(@options, key => allowed)
  end
end
|
|
160
|
+
|
|
161
|
+
# Set the default option values: :type falls back to :dna and :procedure to
# :single when they are missing (or falsy) in the options hash.
def defaults
  { type: :dna, procedure: :single }.each do |key, value|
    @options[key] ||= value
  end
end
|
|
166
|
+
|
|
167
|
+
# Read all records from input and emit every one of them, sequence records
# included, to the output stream. Records with a :SEQ key are additionally
# written to a temporary FASTA file, named by their index in the stream, and
# their original name (or the index, if they have none) is remembered so it
# can be restored when the Prodigal output is parsed.
#
# @param input  [Enumerator]          input stream.
# @param output [Enumerator::Yielder] Output stream.
# @param fa_in  [String]              Path to temporary FASTA file.
def process_input(input, output, fa_in)
  BioDSL::Fasta.open(fa_in, 'w') do |fasta_io|
    input.each_with_index do |record, index|
      @status[:records_in] += 1

      if record.key? :SEQ
        entry    = BioDSL::Seq.new(seq_name: index, seq: record[:SEQ])
        residues = entry.length

        @names[index] = record[:SEQ_NAME] || index

        %i(sequences_in sequences_out).each { |key| @status[key] += 1 }
        %i(residues_in residues_out).each { |key| @status[key] += residues }

        fasta_io.puts entry.to_fasta
      end

      @status[:records_out] += 1
      output << record
    end
  end
end
|
|
195
|
+
|
|
196
|
+
# Read the Prodigal output from file and emit one genecall record per entry
# to the output stream. The DNA file is read when the type is :dna, the
# protein file otherwise.
#
# @param output  [Enumerator::Yielder] Output stream.
# @param tmp_fna [String] Path to output FASTA DNA file.
# @param tmp_faa [String] Path to output FASTA Protein file.
def process_output(output, tmp_fna, tmp_faa)
  genes_path = if @type == :dna
                 tmp_fna
               else
                 tmp_faa
               end

  BioDSL::Fasta.open(genes_path, 'r') do |fasta_io|
    fasta_io.each do |gene|
      output << parse_entry(gene)

      @status[:records_out]   += 1
      @status[:sequences_out] += 1
      @status[:residues_out]  += gene.length
    end
  end
end
|
|
214
|
+
|
|
215
|
+
# Parse Prodigal genecall data from sequence name.
#
# The sequence name is split on ' # '. The second and third fields are the
# begin and end positions, which are decremented by one; the fourth field
# gives the strand ('1' is '+', anything else '-'). The integer before the
# first '_' of the first field is used to look up the original sequence
# name.
#
# @param entry [BioDSL::Seq] Sequence object.
#
# @return [Hash] BioDSL record.
def parse_entry(entry)
  id, first, last, direction = entry.seq_name.split(' # ')

  s_beg  = first.to_i - 1
  s_end  = last.to_i - 1
  strand = direction == '1' ? '+' : '-'

  {
    RECORD_TYPE: 'genecall',
    S_BEG:       s_beg,
    S_END:       s_end,
    S_LEN:       s_end - s_beg + 1,
    STRAND:      strand,
    SEQ_NAME:    @names[id.split('_').first.to_i],
    SEQ:         entry.seq,
    SEQ_LEN:     entry.length,
    SEQ_TYPE:    @type.to_s
  }
end
|
|
236
|
+
end
|
|
237
|
+
end
|