BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Sort records in the stream.
|
|
30
|
+
#
|
|
31
|
+
# +sort+ records in the stream given a specific key. Sorting on multiple keys
|
|
32
|
+
# is currently not supported.
|
|
33
|
+
#
|
|
34
|
+
# == Usage
|
|
35
|
+
#
|
|
36
|
+
# sort(key: <value>[, reverse: <bool>[, block_size: <uint>]])
|
|
37
|
+
#
|
|
38
|
+
# === Options
|
|
39
|
+
#
|
|
40
|
+
# * key: <value> - Sort records on the value for key.
|
|
41
|
+
# * reverse: <bool> - Reverse sort.
|
|
42
|
+
# * block_size: <uint> - Block size used for disk based sorting
|
|
43
|
+
# (default=250_000_000).
|
|
44
|
+
#
|
|
45
|
+
# == Examples
|
|
46
|
+
#
|
|
47
|
+
# Consider the following table in the file `test.tab`:
|
|
48
|
+
#
|
|
49
|
+
# #COUNT ORGANISM
|
|
50
|
+
# 4 Dog
|
|
51
|
+
# 3 Cat
|
|
52
|
+
# 1 Eel
|
|
53
|
+
#
|
|
54
|
+
# To sort this according to COUNT in ascending order do:
|
|
55
|
+
#
|
|
56
|
+
# BP.new.read_table(input: "test.tab").sort(key: :COUNT).dump.run
|
|
57
|
+
#
|
|
58
|
+
# {:COUNT=>1, :ORGANISM=>"Eel"}
|
|
59
|
+
# {:COUNT=>3, :ORGANISM=>"Cat"}
|
|
60
|
+
# {:COUNT=>4, :ORGANISM=>"Dog"}
|
|
61
|
+
#
|
|
62
|
+
# And in descending order:
|
|
63
|
+
#
|
|
64
|
+
# BP.new.
|
|
65
|
+
# read_table(input: "test.tab").
|
|
66
|
+
# sort(key: :COUNT, reverse: true).
|
|
67
|
+
# dump.
|
|
68
|
+
# run
|
|
69
|
+
#
|
|
70
|
+
# {:COUNT=>4, :ORGANISM=>"Dog"}
|
|
71
|
+
# {:COUNT=>3, :ORGANISM=>"Cat"}
|
|
72
|
+
# {:COUNT=>1, :ORGANISM=>"Eel"}
|
|
73
|
+
#
|
|
74
|
+
# The type of value determines the sorting, alphabetical order:
|
|
75
|
+
#
|
|
76
|
+
# BP.new.read_table(input: "test.tab").sort(key: :ORGANISM).dump.run
|
|
77
|
+
#
|
|
78
|
+
# {:COUNT=>3, :ORGANISM=>"Cat"}
|
|
79
|
+
# {:COUNT=>4, :ORGANISM=>"Dog"}
|
|
80
|
+
# {:COUNT=>1, :ORGANISM=>"Eel"}
|
|
81
|
+
#
|
|
82
|
+
# And reverse alphabetic order:
|
|
83
|
+
#
|
|
84
|
+
# BP.new.
|
|
85
|
+
# read_table(input: "test.tab").
|
|
86
|
+
# sort(key: :ORGANISM, reverse: true).
|
|
87
|
+
# dump.
|
|
88
|
+
# run
|
|
89
|
+
#
|
|
90
|
+
# {:COUNT=>1, :ORGANISM=>"Eel"}
|
|
91
|
+
# {:COUNT=>4, :ORGANISM=>"Dog"}
|
|
92
|
+
# {:COUNT=>3, :ORGANISM=>"Cat"}
|
|
93
|
+
class Sort
|
|
94
|
+
require 'pqueue'
|
|
95
|
+
|
|
96
|
+
STATS = %i(records_in records_out)
|
|
97
|
+
SORT_BLOCK_SIZE = 250_000_000 # max bytes to hold in memory.
|
|
98
|
+
|
|
99
|
+
# Constructor for Sort.
#
# @param options [Hash] Options hash.
#
# @option options [String,Symbol] :key
#   Key whose value the records are sorted on (converted with +to_sym+).
# @option options [Boolean] :reverse
#   Reverse the sort order when true.
# @option options [Integer] :block_size
#   Size threshold for a block of buffered records; defaults to
#   SORT_BLOCK_SIZE.
#
# @return [Sort] Class instance.
def initialize(options)
  @options = options

  # Validate before deriving anything from the options, so that a missing
  # :key can be reported by check_options instead of surfacing as a
  # NoMethodError from nil.to_sym below.
  check_options

  @block_size = options[:block_size] || SORT_BLOCK_SIZE
  @key        = options[:key].to_sym
  @files      = []
  @records    = []
  @size       = 0
  @pqueue     = pqueue_init
  @fds        = nil
end
|
|
120
|
+
|
|
121
|
+
# Return command lambda for Sort.
#
# The lambda buffers every input record and saves a block whenever the
# accumulated size (measured as +record.to_s.size+) exceeds the block size.
# When the input is exhausted the remaining records are saved, the block
# files are opened, the pqueue is filled and its content is written to
# the output.
#
# @return [Proc] Command lambda.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    input.each do |rec|
      @status[:records_in] += 1
      @size += rec.to_s.size
      @records.push(rec)

      save_block unless @size <= @block_size
    end

    # Flush whatever is still buffered before merging the saved blocks.
    save_block
    open_block_files
    fill_pqueue
    output_pqueue(output)
  end
end
|
|
141
|
+
|
|
142
|
+
private
|
|
143
|
+
|
|
144
|
+
# Check the options given to Sort: only :key, :reverse and :block_size are
# passed as allowed, :key is passed as required, :reverse is limited to
# nil/true/false, and the assertion ':block_size > 0' is handed on.
def check_options
  opts = @options

  options_allowed(opts, :key, :reverse, :block_size)
  options_required(opts, :key)
  options_allowed_values(opts, reverse: [nil, true, false])
  options_assert(opts, ':block_size > 0')
end
|
|
151
|
+
|
|
152
|
+
# Initialize the pqueue used for merging.
#
# Queue items are expected to respond to +first+ with a record; the
# comparison is done on the record value for the sort key, with the operand
# order depending on the :reverse option at the time of comparison.
#
# @return [PQueue] New queue with the comparison block installed.
def pqueue_init
  PQueue.new do |left, right|
    left_val  = left.first[@key]
    right_val = right.first[@key]

    @options[:reverse] ? left_val <=> right_val : right_val <=> left_val
  end
end
|
|
162
|
+
|
|
163
|
+
# Sort the buffered records on the sort key (reversed when the :reverse
# option is set), save them with serialize_records and reset the buffer.
# Does nothing when no records are buffered.
def save_block
  return if @records.empty?

  # Use the key symbol prepared by the constructor instead of converting
  # the option again for every record.
  @records.sort_by! { |record| record[@key] }
  @records.reverse! if @options[:reverse]

  serialize_records

  @records = []
  @size    = 0
end
|
|
175
|
+
|
|
176
|
+
# Save the buffered records to a new temporary file and remember that file
# in the list of block files.
def serialize_records
  file = Tempfile.new('sort')

  # The data is written through a separate binary handle below, so the
  # descriptor held by the Tempfile itself is released right away instead of
  # staying open for every saved block. Closing without unlinking keeps the
  # file on disk.
  file.close

  File.open(file.path, 'wb') do |ios|
    BioDSL::Serializer.new(ios) do |serializer|
      @records.each { |record| serializer << record }
    end
  end

  @files << file
end
|
|
188
|
+
|
|
189
|
+
# Open every saved block file for binary reading and register an at_exit
# hook that closes the opened descriptors.
def open_block_files
  @fds = @files.map { |file| File.open(file, 'rb') }
  at_exit { @fds.each(&:close) }
end
|
|
194
|
+
|
|
195
|
+
# Fill the pqueue with the first record from each of the file descriptors.
# Each record is queued together with the index of the descriptor it was
# read from; descriptors that are already at EOF contribute nothing.
def fill_pqueue
  @fds.each.with_index do |fd, index|
    BioDSL::Serializer.new(fd) do |serializer|
      next if fd.eof?

      @pqueue << [serializer.next_entry, index]
    end
  end
end
|
|
203
|
+
|
|
204
|
+
# Output all records from the pqueue, refilling it after every pop with the
# next record (if any) from the file descriptor the popped record came
# from.
#
# @param output [Enumerator::Yielder] Output stream.
def output_pqueue(output)
  while !@pqueue.empty?
    record, index = @pqueue.pop

    output << record
    @status[:records_out] += 1

    source = @fds[index]

    BioDSL::Serializer.new(source) do |serializer|
      next if source.eof?

      @pqueue << [serializer.next_entry, index]
    end
  end
end
|
|
222
|
+
end
|
|
223
|
+
end
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Split pair-end sequences in the stream.
|
|
30
|
+
#
|
|
31
|
+
# split_pair_seq splits sequences in the stream previously merged with
|
|
32
|
+
# merge_pair_seq. Sequence names must be in either Illumina1.3/1.5 format
|
|
33
|
+
# trailing a /1 or /2 or Illumina1.8 containing 1: or 2:. A sequence split
|
|
34
|
+
# into two will be output as two records where the first will be named with 1
|
|
35
|
+
# and the second with 2.
|
|
36
|
+
#
|
|
37
|
+
# == Usage
|
|
38
|
+
#
|
|
39
|
+
# split_pair_seq
|
|
40
|
+
#
|
|
41
|
+
# === Options
|
|
42
|
+
#
|
|
43
|
+
# == Examples
|
|
44
|
+
#
|
|
45
|
+
# Consider the following records created with merge_pair_seq:
|
|
46
|
+
#
|
|
47
|
+
# {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:14862:1868 1:N:0:14",
|
|
48
|
+
# :SEQ=>"TGGGGAATATTGGACAATGGCCTGTTTGCTACCCACGCTT",
|
|
49
|
+
# :SEQ_LEN=>40,
|
|
50
|
+
# :SCORES=>"<??????BDDDDDDDDGGGG?????BB<-<BDDDDDFEEF",
|
|
51
|
+
# :SEQ_LEN_LEFT=>20,
|
|
52
|
+
# :SEQ_LEN_RIGHT=>20}
|
|
53
|
+
# {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14",
|
|
54
|
+
# :SEQ=>"TAGGGAATCTTGCACAATGGACTCTTCGCTACCCATGCTT",
|
|
55
|
+
# :SEQ_LEN=>40,
|
|
56
|
+
# :SCORES=>"<???9?BBBDBDDBDDFFFF,5<??BB?DDABDBDDFFFF",
|
|
57
|
+
# :SEQ_LEN_LEFT=>20,
|
|
58
|
+
# :SEQ_LEN_RIGHT=>20}
|
|
59
|
+
# {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14",
|
|
60
|
+
# :SEQ=>"TAGGGAATCTTGCACAATGGCCTCTTCGCTACCCATGCTT",
|
|
61
|
+
# :SEQ_LEN=>40,
|
|
62
|
+
# :SCORES=>"?????BBBBBDDBDDBFFFF??,<??B?BB?BBBBBFF?F",
|
|
63
|
+
# :SEQ_LEN_LEFT=>20,
|
|
64
|
+
# :SEQ_LEN_RIGHT=>20}
|
|
65
|
+
#
|
|
66
|
+
# These can be split using split_pair_seq:
|
|
67
|
+
#
|
|
68
|
+
# BP.new.
|
|
69
|
+
# read_fastq(input: "test.fq", encoding: :base_33).
|
|
70
|
+
# merge_pair_seq.
|
|
71
|
+
# split_pair_seq.
|
|
72
|
+
# dump.
|
|
73
|
+
# run
|
|
74
|
+
#
|
|
75
|
+
# {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:14862:1868 1:N:0:14",
|
|
76
|
+
# :SEQ=>"TGGGGAATATTGGACAATGG",
|
|
77
|
+
# :SEQ_LEN=>20,
|
|
78
|
+
# :SCORES=>"<??????BDDDDDDDDGGGG"}
|
|
79
|
+
# {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:14862:1868 2:N:0:14",
|
|
80
|
+
# :SEQ=>"CCTGTTTGCTACCCACGCTT",
|
|
81
|
+
# :SEQ_LEN=>20,
|
|
82
|
+
# :SCORES=>"?????BB<-<BDDDDDFEEF"}
|
|
83
|
+
# {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14",
|
|
84
|
+
# :SEQ=>"TAGGGAATCTTGCACAATGG",
|
|
85
|
+
# :SEQ_LEN=>20,
|
|
86
|
+
# :SCORES=>"<???9?BBBDBDDBDDFFFF"}
|
|
87
|
+
# {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:13906:2139 2:N:0:14",
|
|
88
|
+
# :SEQ=>"ACTCTTCGCTACCCATGCTT",
|
|
89
|
+
# :SEQ_LEN=>20,
|
|
90
|
+
# :SCORES=>",5<??BB?DDABDBDDFFFF"}
|
|
91
|
+
# {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14",
|
|
92
|
+
# :SEQ=>"TAGGGAATCTTGCACAATGG",
|
|
93
|
+
# :SEQ_LEN=>20,
|
|
94
|
+
# :SCORES=>"?????BBBBBDDBDDBFFFF"}
|
|
95
|
+
# {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:14865:2158 2:N:0:14",
|
|
96
|
+
# :SEQ=>"CCTCTTCGCTACCCATGCTT",
|
|
97
|
+
# :SEQ_LEN=>20,
|
|
98
|
+
# :SCORES=>"??,<??B?BB?BBBBBFF?F"}
|
|
99
|
+
class SplitPairSeq
|
|
100
|
+
STATS = %i(records_in records_out sequences_in sequences_out residues_in
|
|
101
|
+
residues_out)
|
|
102
|
+
|
|
103
|
+
# Constructor for SplitPairSeq.
#
# Stores the options hash and runs the option check.
#
# @param options [Hash] Options hash.
#
# @return [SplitPairSeq] Class instance.
def initialize(options)
  @options = options
  check_options
end
|
|
113
|
+
|
|
114
|
+
# Return command lambda for split_pair_seq.
#
# Records holding truthy values for SEQ_NAME, SEQ, SEQ_LEN_LEFT and
# SEQ_LEN_RIGHT are handed to split_pair_seq; all other records are passed
# on unchanged.
#
# @return [Proc] Command lambda.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    input.each do |record|
      @status[:records_in] += 1

      if %i(SEQ_NAME SEQ SEQ_LEN_LEFT SEQ_LEN_RIGHT).all? { |k| record[k] }
        split_pair_seq(output, record)
        next
      end

      output << record
      @status[:records_out] += 1
    end
  end
end
|
|
135
|
+
|
|
136
|
+
private
|
|
137
|
+
|
|
138
|
+
# Check options: the options are validated against an allowed list that
# holds nothing but nil.
def check_options
  opts = @options

  options_allowed(opts, nil)
end
|
|
142
|
+
|
|
143
|
+
# Output two sequence entries from a sequence in the given record that has
# been split at the position returned by get_split_pos, and update the
# sequence, residue and record counters in the status.
#
# @param output [Enumerator::Yielder] Output stream.
# @param record [Hash] BioDSL record.
#
# rubocop: disable Metrics/AbcSize
def split_pair_seq(output, record)
  entry = BioDSL::Seq.new_bp(record)

  @status[:sequences_in] += 1
  @status[:residues_in]  += entry.length

  left, right = split_entry(entry, get_split_pos(record, entry))

  [left, right].each { |half| output << half.to_bp }

  @status[:sequences_out] += 2
  @status[:residues_out]  += left.length + right.length
  @status[:records_out]   += 2
end
# rubocop: enable Metrics/AbcSize
|
|
167
|
+
|
|
168
|
+
# Given a record locate the sequence split position, which is the integer
# value of SEQ_LEN_LEFT.
#
# @param record [Hash] BioDSL record.
# @param entry [BioDSL::Seq] Sequence entry.
#
# @return [Integer] Sequence split position.
#
# @raise [BioDSL::SeqError]
#   If left and right lengths don't add up to the entry length.
def get_split_pos(record, entry)
  left  = record[:SEQ_LEN_LEFT].to_i
  right = record[:SEQ_LEN_RIGHT].to_i
  total = entry.length

  return left if left + right == total

  raise BioDSL::SeqError, 'SEQ_LEN_LEFT + SEQ_LEN_RIGHT != SEQ_LEN ' \
                          "#{left} + #{right} != #{total}"
end
|
|
188
|
+
|
|
189
|
+
# Split the given entry at the given position and return two new entries.
# The name of the second entry is adjusted by fix_seq_names, which is
# given the unsplit entry for recognizing the name format.
#
# @param entry [BioDSL::Seq] Sequence entry.
# @param pos [Integer] Split position.
#
# @return [Array] Tuple with the two new entries.
def split_entry(entry, pos)
  [entry[0...pos], entry[pos..-1]].tap do |_, second|
    fix_seq_names(entry, second)
  end
end
|
|
203
|
+
|
|
204
|
+
# Fix sequence names: depending on the name format recognized in entry1,
# the name of entry2 is changed in place so that it is marked as read 2.
#
# @param entry1 [BioDSL::Seq] Sequence entry1.
# @param entry2 [BioDSL::Seq] Sequence entry2.
#
# @raise [RuntimeError] If the name of entry1 matches neither format.
def fix_seq_names(entry1, entry2)
  case entry1.seq_name
  when /^[^ ]+ \d:/
    # Name with a space followed by a digit and a colon.
    entry2.seq_name.sub!(/ \d:/, ' 2:')
  when /^.+\/\d$/
    # Name ending in a slash and a digit.
    entry2.seq_name[-1] = '2'
  else
    fail "Could not match sequence name: #{entry1.seq_name}"
  end
end
|
|
219
|
+
end
|
|
220
|
+
end
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Split the values of a key into new key/value pairs.
|
|
30
|
+
#
|
|
31
|
+
# +split_values+ splits the value of a given key into multiple values that are
|
|
32
|
+
# added to the record. The keys used for the values are per default based on
|
|
33
|
+
# the given key with an added index, but using the +keys+ option allows
|
|
34
|
+
# specifying a list of keys to use instead.
|
|
35
|
+
#
|
|
36
|
+
# == Usage
|
|
37
|
+
#
|
|
38
|
+
# split_values(key: <string>[, delimiter: <string>[, keys: <list>]])
|
|
39
|
+
#
|
|
40
|
+
# === Options
|
|
41
|
+
#
|
|
42
|
+
# * key: <string> - Key whose value to split.
|
|
43
|
+
# * keys: <list> - List of keys to use with split values.
|
|
44
|
+
# * delimiter: <string> - Delimiter (default='_').
|
|
45
|
+
#
|
|
46
|
+
# == Examples
|
|
47
|
+
#
|
|
48
|
+
# Consider the following records:
|
|
49
|
+
#
|
|
50
|
+
# {ID: "FOO:count=10", SEQ: "gataag"}
|
|
51
|
+
# {ID: "FOO_10_20", SEQ: "gataag"}
|
|
52
|
+
#
|
|
53
|
+
# To split the value belonging to ID do:
|
|
54
|
+
#
|
|
55
|
+
# split_values(key: :ID)
|
|
56
|
+
#
|
|
57
|
+
# {:ID=>"FOO:count=10", :SEQ=>"gataag"}
|
|
58
|
+
# {:ID=>"FOO_10_20", :SEQ=>"gataag", :ID_0=>"FOO", :ID_1=>10, :ID_2=>20}
|
|
59
|
+
#
|
|
60
|
+
# Using a different delimiter:
|
|
61
|
+
#
|
|
62
|
+
# split_values(key: "ID", delimiter: ':count=')
|
|
63
|
+
#
|
|
64
|
+
# {:ID=>"FOO:count=10", :SEQ=>"gataag", :ID_0=>"FOO", :ID_1=>10}
|
|
65
|
+
# {:ID=>"FOO_10_20", :SEQ=>"gataag"}
|
|
66
|
+
#
|
|
67
|
+
# Using a different delimiter and a list of keys:
|
|
68
|
+
#
|
|
69
|
+
# split_values(key: "ID", keys: ["ID", :COUNT], delimiter: ':count=')
|
|
70
|
+
#
|
|
71
|
+
# {:ID=>"FOO", :SEQ=>"gataag", :COUNT=>10}
|
|
72
|
+
# {:ID=>"FOO_10_20", :SEQ=>"gataag"}
|
|
73
|
+
class SplitValues
|
|
74
|
+
STATS = %i(records_in records_out)
|
|
75
|
+
|
|
76
|
+
# Constructor for SplitValues.
#
# The options are checked before any state is derived from them.
#
# @param options [Hash] Options hash.
# @option options [String,Symbol] :key       Key whose value is split.
# @option options [Array]         :keys      Keys to use for the split values.
# @option options [String]        :delimiter Delimiter to split on ('_' when
#                                            not given).
#
# @return [SplitValues] Class instance.
def initialize(options)
  @options = options
  check_options

  @delimiter = @options[:delimiter] || '_'
  @key       = @options[:key].to_sym
  @keys      = @options[:keys]
  @convert   = []
  @first     = true
end
|
|
95
|
+
|
|
96
|
+
# Return command lambda for split_values.
#
# For every record the value of the configured key is split on the
# delimiter. When this yields more than one value the values are added to
# the record by split_values; the value types are determined once, from the
# first value list that is split. Every record is passed on to the output.
#
# @return [Proc] Command lambda.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    input.each do |record|
      @status[:records_in] += 1

      value = record[@key]

      # Only strings can be split on a delimiter. Other values (e.g. numbers)
      # are passed through untouched instead of raising NoMethodError.
      if value.is_a?(String)
        values = value.split(@delimiter)

        if values.size > 1
          determine_types(values) if @first

          split_values(values, record)
        end
      end

      output << record

      @status[:records_out] += 1
    end
  end
end
|
|
122
|
+
|
|
123
|
+
private
|
|
124
|
+
|
|
125
|
+
# Check the options given to SplitValues: :key, :keys and :delimiter are
# passed as allowed and :key is passed as required.
def check_options
  opts = @options

  options_allowed(opts, :key, :keys, :delimiter)
  options_required(opts, :key)
end
|
|
130
|
+
|
|
131
|
+
# Given an array of values determine which of them must be converted to
# integers or floats, and save the conversion method per value index in the
# @convert instance variable. Afterwards @first is cleared, which the
# command lambda uses to run this only once.
#
# @param values [Array] List of values.
def determine_types(values)
  values.each_with_index do |val, i|
    # Integer replaces Fixnum, which was deprecated in Ruby 2.4 and removed
    # in Ruby 3.2; it matches on every Ruby version.
    case val.to_num
    when Integer then @convert[i] = :to_i
    when Float   then @convert[i] = :to_f
    end
  end

  @first = false
end
|
|
148
|
+
|
|
149
|
+
# Convert values and add to record.
#
# Each value is converted with the method saved for its index (if any) and
# stored under the key from the +keys+ option at the same index. When no
# keys were given, or the key list is shorter than the value list, the
# value is stored under the split key with the index appended.
#
# @param values [Array] List of values.
# @param record [Hash] BioDSL record.
def split_values(values, record)
  values.each_with_index do |val, i|
    val = val.send(@convert[i]) if @convert[i]

    # Falling back to the indexed key keeps surplus values instead of
    # failing with NoMethodError on a missing key.
    name = (@keys && @keys[i]) || "#{@key}_#{i}"

    record[name.to_sym] = val
  end
end
|
|
164
|
+
end
|
|
165
|
+
end
|