bio-gadget 0.4.8 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +10 -20
- data/.travis.yml +5 -0
- data/LICENSE +1 -1
- data/README.org +0 -21
- data/Rakefile +5 -1
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/bio-gadget.gemspec +20 -14
- data/exe/bio-gadget +14 -0
- data/exe/fq1l +5 -0
- data/exe/rbg +1 -0
- data/exe/strt +5 -0
- data/ext/bio_gadget/bio_gadget.c +313 -0
- data/ext/bio_gadget/bio_gadget.h +8 -0
- data/ext/bio_gadget/extconf.rb +3 -0
- data/lib/bio/gadget.rb +171 -0
- data/lib/bio/gadget/fq1l.rb +457 -0
- data/lib/bio/gadget/strt.rb +605 -0
- data/lib/bio/gadget/strt/count.rb +53 -0
- data/lib/bio/gadget/strt/depth.rb +124 -0
- data/lib/bio/gadget/strt/prepare_transcriptome.rb +230 -0
- data/lib/bio/gadgets.rb +135 -0
- data/test/bio/gadget_test.rb +11 -0
- data/test/test_helper.rb +4 -0
- metadata +109 -40
- data/Gthorfile +0 -2
- data/bin/bio-gadget +0 -5
- data/lib/bio-gadget.rb +0 -44
- data/lib/bio-gadget/dedup.rb +0 -33
- data/lib/bio-gadget/demlt.rb +0 -149
- data/lib/bio-gadget/femrg.rb +0 -61
- data/lib/bio-gadget/fqxz.rb +0 -30
- data/lib/bio-gadget/peak.rb +0 -94
- data/lib/bio-gadget/qvstat.rb +0 -34
- data/lib/bio-gadget/rgt2mtx.rb +0 -60
- data/lib/bio-gadget/version.rb +0 -9
- data/lib/bio-gadget/wig5p.rb +0 -51
- data/lib/bio-gadget/wigchr.rb +0 -28
data/lib/bio/gadget.rb
ADDED
@@ -0,0 +1,171 @@
|
|
1
|
+
require 'mkfifo'
|
2
|
+
require 'open3'
|
3
|
+
require 'tempfile'
|
4
|
+
require 'thor'
|
5
|
+
|
6
|
+
module Bio
|
7
|
+
class Gadget < Thor
|
8
|
+
|
9
|
+
# Option name and settings for the sort buffer size; splatted into
# `method_option` by the commands that accept it.
OPT_BUFFER_SIZE = [
  :buffer_size,
  {
    aliases: '-S',
    banner: 'SIZE',
    desc: 'Use SIZE for main memory buffer',
    type: :string
  }
]
|
17
|
+
|
18
|
+
# Option name and settings selecting the download behavior
# ('yes' by default; one of 'yes', 'no', 'only').
OPT_DOWNLOAD = [
  :download,
  {
    banner: 'BEHAVIOR',
    default: 'yes',
    desc: 'Download and process, no download or only',
    enum: %w[yes no only]
  }
]
|
23
|
+
|
24
|
+
# Option name and settings for the number of concurrent sorts.
# The default is computed once, when this file is loaded: the output of
# `gnproc` if that command is found, else of `nproc`, else 2.
OPT_PARALLEL = [
  :parallel,
  {
    banner: 'N',
    default: (
      if system('which gnproc >/dev/null 2>&1')
        `gnproc`.to_i
      elsif system('which nproc >/dev/null 2>&1')
        `nproc`.to_i
      else
        2
      end
    ),
    desc: 'Change the number of sorts run concurrently',
    type: :numeric
  }
]
|
36
|
+
|
37
|
+
# Option name and settings for the prefix put in front of coreutils command
# names. Defaults to 'g' when a `gnproc` command is found at load time,
# otherwise to the empty string.
OPT_COREUTILS_PREFIX = [
  :coreutils_prefix,
  {
    banner: 'PREFIX',
    default: (system('which gnproc >/dev/null 2>&1') ? 'g' : ''),
    desc: 'A prefix character for GNU coreutils',
    type: :string
  }
]
|
45
|
+
|
46
|
+
# Option name and settings for the prefix put in front of `grep`.
# Defaults to 'g' when a `ggrep` command is found at load time,
# otherwise to the empty string.
OPT_GREP_PREFIX = [
  :grep_prefix,
  {
    banner: 'PREFIX',
    default: (system('which ggrep >/dev/null 2>&1') ? 'g' : ''),
    desc: 'A prefix character for GNU grep',
    type: :string
  }
]
|
54
|
+
|
55
|
+
#
|
56
|
+
|
57
|
+
no_commands do
|
58
|
+
|
59
|
+
# Build the usage banner "<basename> [<package name> ]<command usage>".
# The package name part is omitted when @package_name is nil.
# `namespace` and `subcommand` are accepted but not used.
def self.banner(command, namespace = true, subcommand = false)
  package = @package_name.nil? ? '' : "#{@package_name} "
  "#{basename} #{package}#{command.usage}"
end
|
62
|
+
|
63
|
+
# Command-line fragment forwarding --buffer-size, or '' when the option
# was not given.
def buffer_size_option
  return '' unless options.key?(:buffer_size)

  ' --buffer-size=' + options.buffer_size
end
|
66
|
+
|
67
|
+
# Name of the `cat` executable with the configured coreutils prefix.
def cat_command
  prefix = options.coreutils_prefix
  "#{prefix}cat"
end
|
70
|
+
|
71
|
+
# Command-line fragment forwarding --coreutils-prefix to a sub-command,
# or '' when the option is not set.
def coreutils_prefix_option
  return '' unless options.key?(:coreutils_prefix)

  " --coreutils-prefix=#{options.coreutils_prefix}"
end
|
74
|
+
|
75
|
+
# Name of the `cut` executable with the configured coreutils prefix.
def cut_command
  prefix = options.coreutils_prefix
  "#{prefix}cut"
end
|
78
|
+
|
79
|
+
# Download +url+ to +path+ with curl (-R -f -s -S -o PATH URL).
# Returns true on success; otherwise exits the process with curl's exit
# status.
#
# curl is run without a shell, so paths and URLs containing spaces, quotes
# or other shell metacharacters are passed through unchanged.
def download_file(url, path)
  return true if system('curl', '-R', '-f', '-s', '-S', '-o', path.to_s, url.to_s)

  # exitstatus is nil when curl was terminated by a signal; still fail.
  exit($?.exitstatus || 1)
end
|
82
|
+
|
83
|
+
# Name of the `fold` executable with the coreutils prefix from +options+.
def fold_command(options)
  prefix = options.coreutils_prefix
  "#{prefix}fold"
end
|
86
|
+
|
87
|
+
# Command line for `fq1l convert`, forwarding the coreutils prefix.
# NOTE: the +options+ argument is not read here; the prefix comes from
# coreutils_prefix_option, i.e. from this instance's own options.
def fq1l_convert_command(options)
  ['fq1l convert', coreutils_prefix_option].join
end
|
90
|
+
|
91
|
+
# Command line for `fq1l count`, forwarding the coreutils prefix (from this
# instance's options) and the parallelism (from +options+).
def fq1l_count_command(options)
  flags = [coreutils_prefix_option, parallel_option(options)].join
  "fq1l count#{flags}"
end
|
94
|
+
|
95
|
+
# Command line for `fq1l sort`, forwarding the coreutils prefix (from this
# instance's options) and the parallelism (from +options+).
def fq1l_sort_command(options)
  flags = [coreutils_prefix_option, parallel_option(options)].join
  "fq1l sort#{flags}"
end
|
98
|
+
|
99
|
+
# Reserve a temporary path named "rbg.<prefix>.<generated>.<suffix>" and
# return it. Nothing is created at that path here. Unless +cleanup+ is
# false, an at_exit hook removes the path if it exists when the process ends.
def get_temporary_path(prefix, suffix, cleanup=true)
  path = Dir::Tmpname.create(["rbg.#{prefix}.", ".#{suffix}"]) { |_candidate| }
  at_exit { File.unlink(path) if FileTest.exist?(path) } if cleanup
  path
end
|
106
|
+
|
107
|
+
# Create a named pipe at a fresh temporary path (its prefix gets ".fifo"
# appended) and return that path. +cleanup+ is passed on to
# get_temporary_path.
def get_fifo(prefix, suffix, cleanup=true)
  get_temporary_path("#{prefix}.fifo", suffix, cleanup).tap do |path|
    File.mkfifo(path)
  end
end
|
112
|
+
|
113
|
+
# Name of the `grep` executable with the configured grep prefix.
def grep_command
  prefix = options.grep_prefix
  "#{prefix}grep"
end
|
116
|
+
|
117
|
+
# Command-line fragment forwarding --grep-prefix from +options+ to a
# sub-command, or '' when the option is not set.
def grep_prefix_option(options)
  return '' unless options.key?(:grep_prefix)

  " --grep-prefix=#{options.grep_prefix}"
end
|
120
|
+
|
121
|
+
# Name of the `head` executable with the coreutils prefix from +options+.
def head_command(options)
  prefix = options.coreutils_prefix
  "#{prefix}head"
end
|
124
|
+
|
125
|
+
# Command-line fragment forwarding --parallel from +options+ to a
# sub-command, or '' when the option is not set.
def parallel_option(options)
  return '' unless options.key?(:parallel)

  " --parallel=#{options.parallel}"
end
|
128
|
+
|
129
|
+
# Run +cmds+ as one pipeline via Open3.pipeline and return the list of exit
# statuses. Raises a RuntimeError naming the first command whose status is
# not successful.
def pipeline(*cmds)
  statuses = Open3.pipeline(*cmds)
  statuses.each_with_index do |status, index|
    next if status.success?

    raise "Fail at process #{index}; #{status}; #{cmds[index]}"
  end
end
|
135
|
+
|
136
|
+
# Command line for `sort` with the configured coreutils prefix, the optional
# --buffer-size and --parallel settings, and pigz as compression program.
def sort_command
  parallel = options.key?(:parallel) ? " --parallel=#{options.parallel}" : ''
  "#{options.coreutils_prefix}sort#{buffer_size_option}#{parallel} --compress-program=pigz"
end
|
139
|
+
|
140
|
+
# Run +cmd+ with Kernel#system and raise a RuntimeError describing the
# process when it did not finish successfully. Returns nil on success.
def sh(cmd)
  system(cmd)
  status = $?
  return if status.success?

  raise "Fail at process #{status.pid}; #{status}; #{cmd}"
end
|
144
|
+
|
145
|
+
# Name of the `tee` executable with the coreutils prefix from +options+.
def tee_command(options)
  prefix = options.coreutils_prefix
  "#{prefix}tee"
end
|
148
|
+
|
149
|
+
# Name of the `uniq` executable with the coreutils prefix from +options+.
def uniq_command(options)
  prefix = options.coreutils_prefix
  "#{prefix}uniq"
end
|
152
|
+
|
153
|
+
# Delete every path in +files+ that currently exists; missing paths are
# skipped. Returns +files+.
def unlink_files(files)
  files.each { |path| File.unlink(path) if File.exist?(path) }
end
|
158
|
+
|
159
|
+
# Name of the `wc` executable with the coreutils prefix from +options+.
def wc_command(options)
  prefix = options.coreutils_prefix
  "#{prefix}wc"
end
|
162
|
+
|
163
|
+
end
|
164
|
+
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
require 'bio/gadgets'
|
169
|
+
require 'bio/gadget/fq1l'
|
170
|
+
require 'bio/gadget/strt'
|
171
|
+
require 'bio/gadget/bio_gadget'
|
@@ -0,0 +1,457 @@
|
|
1
|
+
require 'damerau-levenshtein'
|
2
|
+
require 'io/wait'
|
3
|
+
require 'open3'
|
4
|
+
|
5
|
+
module Bio
|
6
|
+
class Gadget
|
7
|
+
class Fq1l < Bio::Gadget
|
8
|
+
|
9
|
+
# Option name and settings for the boolean flag that inverts matching.
OPT_INVERT_MATCH = [
  :invert_match,
  { desc: 'The sense of matching', type: :boolean }
]
|
15
|
+
|
16
|
+
# Option name and settings for the minimum read length kept after trimming
# (40 by default).
OPT_MINIMUM_LENGTH = [
  :minimum_length,
  {
    banner: 'NT',
    default: 40,
    desc: 'Minimum length after trimming',
    type: :numeric
  }
]
|
24
|
+
|
25
|
+
# fq1l:annotate_index
|
26
|
+
|
27
|
+
desc 'annotate_index', 'Annotate sequence identifier by index sequence at the specified region'

method_option :first_cycle, type: :numeric, default: 7, desc: 'The first cycle of index'
method_option :last_cycle, type: :numeric, default: 12, desc: 'The last cycle of index'

# Hands the --first-cycle/--last-cycle range to BioGadget.i2i.
# Exits quietly when STDIN.wait returns a falsy value.
def annotate_index
  exit unless STDIN.wait
  first = options.first_cycle
  last = options.last_cycle
  BioGadget.i2i(first, last)
end
|
43
|
+
|
44
|
+
# fq1l:annotate_umi
|
45
|
+
|
46
|
+
desc 'annotate_umi', 'Annotate sequence identifier by UMI sequence at the specified region'

method_option :first_cycle, type: :numeric, default: 1, desc: 'The first cycle of UMI'
method_option :last_cycle, type: :numeric, default: 6, desc: 'The last cycle of UMI'

# Hands the --first-cycle/--last-cycle range to BioGadget.u2i.
# Exits quietly when STDIN.wait returns a falsy value.
def annotate_umi
  exit unless STDIN.wait
  first = options.first_cycle
  last = options.last_cycle
  BioGadget.u2i(first, last)
end
|
62
|
+
|
63
|
+
# fq1l:convert
|
64
|
+
|
65
|
+
desc 'convert', 'Convert fastq from 4 lines/read to 1 line/read for this utility'

method_option(*OPT_COREUTILS_PREFIX)

# Replaces this process with `paste - - - -` (with the coreutils prefix),
# which joins each group of four input lines into one tab-separated line.
# Exits quietly when STDIN.wait returns a falsy value.
def convert
  exit unless STDIN.wait
  paste = "#{options.coreutils_prefix}paste"
  exec "#{paste} - - - -"
end
|
73
|
+
|
74
|
+
# fq1l:count
|
75
|
+
|
76
|
+
desc 'count [CSV]', 'Count sequences by the length'

method_option(*OPT_COREUTILS_PREFIX)
method_option(*OPT_PARALLEL)

# Without CSV: print a "length,reads" table of sequence lengths (2nd
# tab-separated field of each STDIN line) to STDOUT.
# With CSV: copy STDIN to STDOUT with tee while a spawned `fq1l count`
# reads the same data through a FIFO and writes its table to CSV.
# Exits quietly when STDIN.wait returns a falsy value.
def count(csv = nil)
  exit unless STDIN.wait

  unless csv.nil?
    fifo = get_fifo('fq1l.count', 'fq1l')
    pid = Kernel.spawn("fq1l count#{coreutils_prefix_option} < #{fifo} > #{csv}")
    system "#{tee_command(options)} #{fifo}"
    # Do not return before the counting process has finished.
    return Process.waitpid(pid)
  end

  puts 'length,reads'
  stages = [
    "#{cut_command} -f 2",
    "ruby -nle 'puts $_.length'",
    "#{sort_command} -n",
    "#{uniq_command(options)} -c",
    %q(ruby -anle 'puts $F.reverse.join(",")')
  ]
  pipeline(*stages)
end
|
97
|
+
|
98
|
+
# fq1l:demultiplex
|
99
|
+
|
100
|
+
desc 'demultiplex BASE MAP', 'Demultiplex based on a barcode MAP, and restore sequence files with BASE names'

method_option :maximum_distance,
              default: 1,
              desc: 'Maximum distance between barcode and sequence',
              type: :numeric

# Split 1-line/read records from STDIN into per-well 4-line FASTQ files.
#
# MAP is a CSV of "barcode,well". The barcode of a read is taken from the
# tail of its identifier (as long as the shortest barcode in MAP) and is
# compared with every barcode in MAP; the read is appended to
# "BASE.<well>.fq" of the closest barcode within --maximum-distance, or to
# "BASE.NA.fq" otherwise. The outputs are finally compressed with pigz.
#
# Raises a RuntimeError when MAP contains no barcode.
# Exits quietly when STDIN.wait returns a falsy value.
def demultiplex(base, map)
  dl = DamerauLevenshtein

  exit unless STDIN.wait

  max_distance = options.maximum_distance

  # Records are appended below, so stale outputs are removed first.
  bc2fq = {}
  File.foreach(map) do |line|
    bc, well = line.rstrip.split(',')
    next if bc.nil? || bc.empty? # blank line in MAP

    bc2fq[bc] = fq = "#{base}.#{well}.fq"
    File.unlink(fq) if File.exist?(fq)
  end
  raise "No barcode found in #{map}" if bc2fq.empty?

  na = "#{base}.NA.fq"
  File.unlink(na) if File.exist?(na)

  bcl = bc2fq.keys.map(&:length).min

  fp = nil
  pbc = nil
  begin
    STDIN.set_encoding('BINARY').each do |line|
      acc, seq, sep, qual = line.rstrip.split(/\t/)
      bc = acc[-bcl, bcl]
      # The output is re-selected only when the barcode differs from the
      # previous read's barcode.
      if bc != pbc
        mindist = max_distance + 1
        minbc = nil
        bc2fq.each_key do |key|
          dist = dl.distance(key, bc, 0, max_distance)
          if dist < mindist
            mindist = dist
            minbc = key
          end
          break if dist == 0
        end
        fp.close unless fp.nil?
        fp = File.open(mindist <= max_distance ? bc2fq[minbc] : na, 'a')
        pbc = bc
      end
      fp.puts "#{acc}\n#{seq}\n#{sep}\n#{qual}"
    end
  ensure
    # Close the current output even when a record could not be processed.
    fp.close unless fp.nil?
  end

  # pigz is run without a shell so that BASE may contain spaces or quotes.
  [*bc2fq.values, na].each { |fq| system('pigz', fq) if File.exist?(fq) }
end
|
152
|
+
|
153
|
+
# fq1l:exclude_degenerate
|
154
|
+
|
155
|
+
desc 'exclude_degenerate', 'Exclude degenerated reads in the order'

# Delegates to BioGadget.nr_deg.
# Exits quietly when STDIN.wait returns a falsy value.
def exclude_degenerate
  exit unless STDIN.wait
  BioGadget.nr_deg
end
|
161
|
+
|
162
|
+
# fq1l:exclude_duplicate
|
163
|
+
|
164
|
+
desc 'exclude_duplicate', 'Exclude duplicated reads in the order'

# Delegates to BioGadget.nr_std.
# Exits quietly when STDIN.wait returns a falsy value.
def exclude_duplicate
  exit unless STDIN.wait
  BioGadget.nr_std
end
|
170
|
+
|
171
|
+
# fq1l:match_3end
|
172
|
+
|
173
|
+
desc 'match_3end PATTERN', 'Select sequences that match the 3\'-end with a given PATTERN'
|
174
|
+
|
175
|
+
method_option *OPT_INVERT_MATCH
|
176
|
+
method_option *OPT_GREP_PREFIX
|
177
|
+
|
178
|
+
# Filter STDIN through grep, selecting the lines whose 2nd tab-separated
# field ends with PATTERN (or, with --invert-match, the other lines).
#
# Always ends the process: with status 0 when grep exited with 0 (lines
# selected) or 1 (no line selected), otherwise with grep's own exit status.
# grep is run without a shell, so PATTERN reaches it unchanged even when it
# contains quotes or other shell metacharacters.
def match_3end(pattern)
  exit unless STDIN.wait
  # PCRE was faster than BRE and ERE in GNU grep 2.25
  argv = [grep_command]
  argv << '-v' if options.invert_match
  argv.push('-P', '-e', "^[^\\t]+\\t[^\\t]*#{pattern}\\t")
  system(*argv)
  # Process::Status#to_i is the raw, platform-dependent wait status rather
  # than the exit code, so the decision must be made on #exitstatus
  # (nil when grep was terminated by a signal).
  status = $?.exitstatus
  exit(status == 0 || status == 1 ? 0 : (status || 1))
end
|
184
|
+
|
185
|
+
# fq1l:match_5end
|
186
|
+
|
187
|
+
desc 'match_5end PATTERN', 'Select sequences that match the 5\'-end with a given PATTERN'
|
188
|
+
|
189
|
+
method_option *OPT_INVERT_MATCH
|
190
|
+
method_option *OPT_GREP_PREFIX
|
191
|
+
|
192
|
+
# Filter STDIN through grep, selecting the lines whose 2nd tab-separated
# field starts with PATTERN (or, with --invert-match, the other lines).
#
# Always ends the process: with status 0 when grep exited with 0 (lines
# selected) or 1 (no line selected), otherwise with grep's own exit status.
# grep is run without a shell, so PATTERN reaches it unchanged even when it
# contains quotes or other shell metacharacters.
def match_5end(pattern)
  exit unless STDIN.wait
  # PCRE was faster than BRE and ERE in GNU grep 2.25
  argv = [grep_command]
  argv << '-v' if options.invert_match
  argv.push('-P', '-e', "^[^\\t]+\\t#{pattern}")
  system(*argv)
  # Process::Status#to_i is the raw, platform-dependent wait status rather
  # than the exit code, so the decision must be made on #exitstatus
  # (nil when grep was terminated by a signal).
  status = $?.exitstatus
  exit(status == 0 || status == 1 ? 0 : (status || 1))
end
|
198
|
+
|
199
|
+
# fq1l:restore
|
200
|
+
|
201
|
+
desc 'restore', 'Convert fastq from 1 line/read to 4 lines/read'

method_option(*OPT_COREUTILS_PREFIX)

# Replaces this process with `tr "\t" "\n"` (with the coreutils prefix),
# turning every tab of the input into a newline.
# Exits quietly when STDIN.wait returns a falsy value.
def restore
  exit unless STDIN.wait
  exec %(#{options.coreutils_prefix}tr "\\t" "\\n")
end
|
209
|
+
|
210
|
+
# fq1l:slice
|
211
|
+
|
212
|
+
desc 'slice Nth SLICE', 'Slice the sequences'

# Converts both arguments to integers and delegates to BioGadget.slice.
# Exits quietly when STDIN.wait returns a falsy value.
def slice(nth, slice)
  exit unless STDIN.wait
  nth_number = nth.to_i
  slice_number = slice.to_i
  BioGadget.slice(nth_number, slice_number)
end
|
218
|
+
|
219
|
+
# fq1l:sort
|
220
|
+
|
221
|
+
desc 'sort [FQ1Ls]', 'Sort by sequence and the quality in descending order'

method_option(*OPT_COREUTILS_PREFIX)
method_option(*OPT_BUFFER_SIZE)
method_option(*OPT_PARALLEL)

# Replaces this process with a reverse `sort` on tab-separated fields 2-4.
# Without arguments STDIN is sorted (exiting quietly when STDIN.wait returns
# a falsy value); with FQ1Ls the given files are merged (`-m`).
def sort(*fq1ls)
  base = "#{sort_command} -t '\t' -r -k2,4"
  if fq1ls.empty?
    exit unless STDIN.wait
    exec base
  else
    exec "#{base} -m #{fq1ls.join(' ')}"
  end
end
|
235
|
+
|
236
|
+
# fq1l:sort_index
|
237
|
+
|
238
|
+
desc 'sort_index', 'Sort by index'

method_option(*OPT_COREUTILS_PREFIX)
method_option(*OPT_BUFFER_SIZE)
method_option(*OPT_PARALLEL)

# Replaces this process with `sort -k2` built by sort_command.
# Exits quietly when STDIN.wait returns a falsy value.
def sort_index
  exit unless STDIN.wait
  command = sort_command
  exec "#{command} -k2"
end
|
248
|
+
|
249
|
+
# fq1l:sum_counts
|
250
|
+
|
251
|
+
desc 'sum_counts CSV ...', 'Sum counts of sequences by the length'
|
252
|
+
|
253
|
+
# Sum the counts per length over the given "length,<count>" CSV files and
# print the merged table to STDOUT under a "length,count" header, sorted by
# length. Header rows (first column "length") and blank lines are skipped.
def sum_counts(*csvs)
  length2count = Hash.new(0)
  csvs.each do |csv|
    # File.foreach closes the file when done and, unlike Kernel#open, never
    # runs a name starting with "|" as a command.
    File.foreach(csv) do |line|
      l, c = line.rstrip.split(/,/)
      next if l.nil? || l == 'length'

      length2count[l.to_i] += c.to_i
    end
  end
  puts "length,count"
  length2count.keys.sort.each do |length|
    puts "#{length},#{length2count[length]}"
  end
end
|
269
|
+
|
270
|
+
# fq1l:thin_out
|
271
|
+
|
272
|
+
desc 'thin_out DRAW SKIP', 'Thin out the sequences'

# The command is defined by the method name `to`, while the usage above
# advertises `thin_out`; map the advertised name onto the command so that
# both `fq1l thin_out` and `fq1l to` are accepted.
map 'thin_out' => :to

# Converts both arguments to integers and delegates to BioGadget.to.
# Exits quietly when STDIN.wait returns a falsy value.
def to(draw, skip)
  exit unless STDIN.wait
  BioGadget.to(draw.to_i, skip.to_i)
end
|
278
|
+
|
279
|
+
# fq1l:trim_3end
|
280
|
+
|
281
|
+
desc 'trim_3end SEQUENCE', 'Trim 3\'-end that match with a given SEQUENCE'

method_option(*OPT_COREUTILS_PREFIX)
method_option(*OPT_GREP_PREFIX)
method_option(*OPT_MINIMUM_LENGTH)

method_option :trimmed,
              banner: 'FILE',
              desc: 'FILE for trimmed reads; STDOUT if not specified',
              type: :string

# STDIN is duplicated with tee: one copy goes through
# `fq1l match_3end SEQUENCE --invert-match` to STDOUT, the other goes
# through a FIFO to a forked child that calls BioGadget.t3 on the output of
# `fq1l match_3end SEQUENCE`, with the trimmed-reads file as last argument
# (presumably where t3 writes its result -- confirm in the extension).
# Without --trimmed that file is a temporary one, printed afterwards and
# removed.
def trim_3end(sequence)
  # exit unless STDIN.wait
  grep_prefix = grep_prefix_option(options)
  keep_trimmed = options.key?(:trimmed)
  fifo = get_fifo('fq1l.trim_3end', 'fq1l', false)
  tmpfile = nil
  begin
    tmpfile = keep_trimmed ? File.expand_path(options.trimmed) : get_temporary_path('fq1l.trim_3end', 'fq1l', false)
    begin
      pid = Process.fork do
        BioGadget.t3("fq1l match_3end#{grep_prefix} #{sequence} < #{fifo}", sequence.length, options.minimum_length, tmpfile)
      end
      pipeline("#{tee_command(options)} #{fifo}",
               "fq1l match_3end#{grep_prefix} #{sequence} --invert-match")
      # The child may still be running when the pipeline above has ended;
      # wait for it so that the trimmed file is complete before it is
      # printed, removed, or read by the caller.
      Process.waitpid(pid)
    ensure
      system "#{cat_command} #{tmpfile}" unless keep_trimmed
    end
  ensure
    File.unlink(fifo) if File.exist?(fifo)
    File.unlink(tmpfile) if !keep_trimmed && tmpfile && File.exist?(tmpfile)
  end
end
|
312
|
+
|
313
|
+
# fq1l:trim_3end_length
|
314
|
+
|
315
|
+
desc 'trim_3end_length', 'Trim 3\'-end by a specific length'

method_option(*OPT_MINIMUM_LENGTH)

method_option :trimming_length, type: :numeric, default: 1, desc: 'Length of the trimming'

# Delegates to BioGadget.t3 with no command and no output file, passing
# --trimming-length and --minimum-length.
# Exits quietly when STDIN.wait returns a falsy value.
def trim_3end_length
  exit unless STDIN.wait
  trimming = options.trimming_length
  minimum = options.minimum_length
  BioGadget.t3(nil, trimming, minimum, nil)
end
|
328
|
+
|
329
|
+
# fq1l:trim_3end_primer
|
330
|
+
|
331
|
+
desc 'trim_3end_primer', 'Trim 3\'-end that match with a given primer'

method_option(*OPT_COREUTILS_PREFIX)
method_option(*OPT_GREP_PREFIX)
method_option(*OPT_MINIMUM_LENGTH)
method_option(*OPT_PARALLEL)

method_option :primers,
              default: 'AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG,CTCGTATGCCGTCTTCTGCTTG',
              desc: 'Comma-separated primer sequences to be used for trimming',
              type: :string

# Chains one `fq1l trim_3end` per distinct leading fragment of the primers,
# longest fragments first, each writing its trimmed reads to its own
# temporary file; the untrimmed reads flow through the chain to STDOUT and
# the temporary files are printed after it. The temporary files are removed
# in every case, and a RuntimeError is raised when a stage fails.
# Exits quietly when STDIN.wait returns a falsy value.
def trim_3end_primer
  opt_minimum_length = "--minimum-length=#{options.minimum_length}"

  # Every distinct leading substring of every primer, grouped by length.
  fragments = Hash.new { |hash, len| hash[len] = [] }
  seen = {}
  options.primers.split(',').each do |primer|
    (1..primer.length).each do |len|
      fragment = primer[0, len]
      next if seen.key?(fragment)

      seen[fragment] = true
      fragments[len] << fragment
    end
  end

  exit unless STDIN.wait

  tmpfiles = []
  commands = []
  forwarded = "#{coreutils_prefix_option}#{grep_prefix_option(options)}"

  fragments.keys.sort.reverse_each do |length|
    # When all 4**length possible sequences of this length are fragments,
    # one fixed-length trimming replaces them and all shorter fragments.
    if 4**length == fragments[length].size
      commands << "fq1l trim_3end_length --trimming-length=#{length} #{opt_minimum_length}"
      break
    end

    fragments[length].sort.reverse_each do |fragment|
      tmpfile = get_temporary_path("fq1l.trim_3end_primer.#{fragment}", 'fq1l', false)
      tmpfiles << tmpfile
      commands << "fq1l trim_3end#{forwarded} #{opt_minimum_length} --trimmed=#{tmpfile} #{fragment}"
    end
  end

  begin
    pipeline(*commands)
    system "#{cat_command} #{tmpfiles.join(' ')}"
  ensure
    unlink_files(tmpfiles)
  end
end
|
390
|
+
|
391
|
+
# fq1l:trim_3end_quality
|
392
|
+
|
393
|
+
desc 'trim_3end_quality', 'Trim 3\'-end from a low quality base'

method_option(*OPT_MINIMUM_LENGTH)

method_option :low_qualities, type: :string, banner: 'CHARACTERS', default: "!\"#", desc: 'Low quality characters'

# Delegates to BioGadget.t3q with the --low-qualities characters and
# --minimum-length. Unlike the sibling commands, STDIN.wait is not checked.
def trim_3end_quality
  low_qualities = options.low_qualities
  minimum = options.minimum_length
  BioGadget.t3q(low_qualities, minimum)
end
|
406
|
+
|
407
|
+
# fq1l:trim_5end
|
408
|
+
|
409
|
+
desc 'trim_5end PATTERN', 'Trim 5\'-end that match with a given PATTERN'

# This command declares its own --minimum-length (default 24) instead of
# using OPT_MINIMUM_LENGTH (default 40).
method_option :minimum_length, type: :numeric, banner: 'NT', default: 24, desc: 'Minimum length after trimming'

# Delegates to BioGadget.t5 with PATTERN and --minimum-length.
# Exits quietly when STDIN.wait returns a falsy value.
def trim_5end(pattern)
  exit unless STDIN.wait
  minimum = options.minimum_length
  BioGadget.t5(pattern, minimum)
end
|
422
|
+
|
423
|
+
# #
|
424
|
+
|
425
|
+
# no_commands do
|
426
|
+
|
427
|
+
# def pipeline(parallel, *commands)
|
428
|
+
# stats = Array.new
|
429
|
+
# tmpin = nil
|
430
|
+
# tmpout = nil
|
431
|
+
# begin
|
432
|
+
# while commands.size > 0
|
433
|
+
# cmds = commands.shift(parallel)
|
434
|
+
# cmds[0] = cmds[0] + " < #{tmpin}" unless tmpin.nil?
|
435
|
+
# if commands.size > 0
|
436
|
+
# tmpout = get_temporary_path('pipeline', 'tmp', false)
|
437
|
+
# cmds[-1] = cmds[-1] + " > #{tmpout}"
|
438
|
+
# end
|
439
|
+
# tmpstats = Open3.pipeline(*cmds)
|
440
|
+
# stats.concat(tmpstats)
|
441
|
+
# tmpstats.each {|tmpstat| commands = nil unless tmpstat.success? }
|
442
|
+
# break if commands.nil?
|
443
|
+
# File.unlink(tmpin) unless tmpin.nil?
|
444
|
+
# tmpin = tmpout
|
445
|
+
# end
|
446
|
+
# ensure
|
447
|
+
# File.unlink(tmpin) if !tmpin.nil? && File.exist?(tmpin)
|
448
|
+
# File.unlink(tmpout) if !tmpout.nil? && File.exist?(tmpout)
|
449
|
+
# end
|
450
|
+
# stats
|
451
|
+
# end
|
452
|
+
|
453
|
+
# end
|
454
|
+
|
455
|
+
end
|
456
|
+
end
|
457
|
+
end
|