bio-gadget 0.4.8 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +10 -20
- data/.travis.yml +5 -0
- data/LICENSE +1 -1
- data/README.org +0 -21
- data/Rakefile +5 -1
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/bio-gadget.gemspec +20 -14
- data/exe/bio-gadget +14 -0
- data/exe/fq1l +5 -0
- data/exe/rbg +1 -0
- data/exe/strt +5 -0
- data/ext/bio_gadget/bio_gadget.c +313 -0
- data/ext/bio_gadget/bio_gadget.h +8 -0
- data/ext/bio_gadget/extconf.rb +3 -0
- data/lib/bio/gadget.rb +171 -0
- data/lib/bio/gadget/fq1l.rb +457 -0
- data/lib/bio/gadget/strt.rb +605 -0
- data/lib/bio/gadget/strt/count.rb +53 -0
- data/lib/bio/gadget/strt/depth.rb +124 -0
- data/lib/bio/gadget/strt/prepare_transcriptome.rb +230 -0
- data/lib/bio/gadgets.rb +135 -0
- data/test/bio/gadget_test.rb +11 -0
- data/test/test_helper.rb +4 -0
- metadata +109 -40
- data/Gthorfile +0 -2
- data/bin/bio-gadget +0 -5
- data/lib/bio-gadget.rb +0 -44
- data/lib/bio-gadget/dedup.rb +0 -33
- data/lib/bio-gadget/demlt.rb +0 -149
- data/lib/bio-gadget/femrg.rb +0 -61
- data/lib/bio-gadget/fqxz.rb +0 -30
- data/lib/bio-gadget/peak.rb +0 -94
- data/lib/bio-gadget/qvstat.rb +0 -34
- data/lib/bio-gadget/rgt2mtx.rb +0 -60
- data/lib/bio-gadget/version.rb +0 -9
- data/lib/bio-gadget/wig5p.rb +0 -51
- data/lib/bio-gadget/wigchr.rb +0 -28
data/lib/bio/gadget.rb
ADDED
@@ -0,0 +1,171 @@
|
|
1
|
+
require 'mkfifo'
|
2
|
+
require 'open3'
|
3
|
+
require 'tempfile'
|
4
|
+
require 'thor'
|
5
|
+
|
6
|
+
module Bio
|
7
|
+
class Gadget < Thor
|
8
|
+
|
9
|
+
# Thor option specification (name, settings) for --buffer-size / -S.
# Commands splat it into `method_option`; the value ends up in
# `buffer_size_option`.
OPT_BUFFER_SIZE = [
  :buffer_size,
  {
    aliases: '-S',
    banner: 'SIZE',
    desc: 'Use SIZE for main memory buffer',
    type: :string
  }
]
|
17
|
+
|
18
|
+
# Thor option specification (name, settings) for --download.
# Accepts 'yes' (default), 'no' or 'only'.
OPT_DOWNLOAD = [
  :download,
  {
    banner: 'BEHAVIOR',
    default: 'yes',
    desc: 'Download and process, no download or only',
    enum: %w[yes no only]
  }
]
|
23
|
+
|
24
|
+
# Thor option specification (name, settings) for --parallel.
# The default is computed once, when this file is loaded: the output of
# `gnproc` if that program is on PATH, else the output of `nproc`, else 2.
OPT_PARALLEL = [
  :parallel,
  {
    banner: 'N',
    default: (
      if system('which gnproc >/dev/null 2>&1')
        `gnproc`.to_i
      elsif system('which nproc >/dev/null 2>&1')
        `nproc`.to_i
      else
        2
      end
    ),
    desc: 'Change the number of sorts run concurrently',
    type: :numeric
  }
]
|
36
|
+
|
37
|
+
# Thor option specification (name, settings) for --coreutils-prefix.
# Defaults to 'g' when `gnproc` is found on PATH at load time, else ''.
OPT_COREUTILS_PREFIX = [
  :coreutils_prefix,
  {
    banner: 'PREFIX',
    default: (system('which gnproc >/dev/null 2>&1') ? 'g' : ''),
    desc: 'A prefix character for GNU coreutils',
    type: :string
  }
]
|
45
|
+
|
46
|
+
# Thor option specification (name, settings) for --grep-prefix.
# Defaults to 'g' when `ggrep` is found on PATH at load time, else ''.
OPT_GREP_PREFIX = [
  :grep_prefix,
  {
    banner: 'PREFIX',
    default: (system('which ggrep >/dev/null 2>&1') ? 'g' : ''),
    desc: 'A prefix character for GNU grep',
    type: :string
  }
]
|
54
|
+
|
55
|
+
#
|
56
|
+
|
57
|
+
no_commands do
|
58
|
+
|
59
|
+
# Builds the usage banner for +command+: "<basename> [<package name> ]<usage>".
# The package name is included only when @package_name is set.
# +namespace+ and +subcommand+ are accepted but not used.
def self.banner(command, namespace = true, subcommand = false)
  package = @package_name.nil? ? '' : "#{@package_name} "
  "#{basename} #{package}#{command.usage}"
end
|
62
|
+
|
63
|
+
# Returns " --buffer-size=<SIZE>" when the :buffer_size option was given,
# otherwise an empty string.
def buffer_size_option
  return '' unless options.key?(:buffer_size)

  ' --buffer-size=' + options.buffer_size
end
|
66
|
+
|
67
|
+
# Name of the `cat` program, prefixed with the :coreutils_prefix option.
def cat_command
  prefix = options.coreutils_prefix
  "#{prefix}cat"
end
|
70
|
+
|
71
|
+
# Returns " --coreutils-prefix=<PREFIX>" when the :coreutils_prefix option
# is present, otherwise an empty string.
def coreutils_prefix_option
  return '' unless options.key?(:coreutils_prefix)

  " --coreutils-prefix=#{options.coreutils_prefix}"
end
|
74
|
+
|
75
|
+
# Name of the `cut` program, prefixed with the :coreutils_prefix option.
def cut_command
  prefix = options.coreutils_prefix
  "#{prefix}cut"
end
|
78
|
+
|
79
|
+
# Downloads +url+ to +path+ with curl (-R keep remote time, -f fail on HTTP
# errors, -s -S silent but show errors). Exits the current process with
# curl's exit status when the download fails.
#
# curl is invoked with an argument vector instead of a shell string, so a
# path containing spaces or a URL containing quotes can neither break the
# command nor inject shell syntax. The path is expanded first so that a
# leading "~" keeps working without a shell.
def download_file(url, path)
  system('curl', '-R', '-f', '-s', '-S', '-o', File.expand_path(path), url) ||
    exit($?.exitstatus)
end
|
82
|
+
|
83
|
+
# Name of the `fold` program, prefixed with +options.coreutils_prefix+.
def fold_command(options)
  prefix = options.coreutils_prefix
  "#{prefix}fold"
end
|
86
|
+
|
87
|
+
# Command line for `fq1l convert`, forwarding the coreutils prefix option.
# The +options+ argument is accepted but not consulted here;
# `coreutils_prefix_option` is called without arguments.
def fq1l_convert_command(options)
  ['fq1l convert', coreutils_prefix_option].join
end
|
90
|
+
|
91
|
+
# Command line for `fq1l count`, forwarding the coreutils prefix and the
# --parallel setting taken from +options+.
def fq1l_count_command(options)
  ['fq1l count', coreutils_prefix_option, parallel_option(options)].join
end
|
94
|
+
|
95
|
+
# Command line for `fq1l sort`, forwarding the coreutils prefix and the
# --parallel setting taken from +options+.
def fq1l_sort_command(options)
  ['fq1l sort', coreutils_prefix_option, parallel_option(options)].join
end
|
98
|
+
|
99
|
+
# Generates a temporary path named "rbg.<prefix>.<generated>.<suffix>"
# via Dir::Tmpname. No file is created here; only the name is chosen.
# When +cleanup+ is true, an at_exit hook removes the path if it exists
# when the process terminates.
def get_temporary_path(prefix, suffix, cleanup=true)
  path = Dir::Tmpname.create(["rbg.#{prefix}.", ".#{suffix}"]) { }
  at_exit { File.unlink(path) if FileTest.exist?(path) } if cleanup
  path
end
|
106
|
+
|
107
|
+
# Creates a named pipe at a fresh temporary path (prefix gets ".fifo"
# appended) and returns that path. +cleanup+ is passed through to
# `get_temporary_path`.
def get_fifo(prefix, suffix, cleanup=true)
  get_temporary_path("#{prefix}.fifo", suffix, cleanup).tap do |path|
    File.mkfifo(path)
  end
end
|
112
|
+
|
113
|
+
# Name of the `grep` program, prefixed with the :grep_prefix option.
def grep_command
  prefix = options.grep_prefix
  "#{prefix}grep"
end
|
116
|
+
|
117
|
+
# Returns " --grep-prefix=<PREFIX>" when +options+ has a :grep_prefix key,
# otherwise an empty string.
def grep_prefix_option(options)
  return '' unless options.key?(:grep_prefix)

  " --grep-prefix=#{options.grep_prefix}"
end
|
120
|
+
|
121
|
+
# Name of the `head` program, prefixed with +options.coreutils_prefix+.
def head_command(options)
  prefix = options.coreutils_prefix
  "#{prefix}head"
end
|
124
|
+
|
125
|
+
# Returns " --parallel=<N>" when +options+ has a :parallel key, otherwise
# an empty string.
def parallel_option(options)
  return '' unless options.key?(:parallel)

  " --parallel=#{options.parallel}"
end
|
128
|
+
|
129
|
+
# Runs +cmds+ as one pipeline through Open3.pipeline and raises a
# RuntimeError naming the first command (by position) whose exit status
# is not successful. Returns the array of statuses otherwise.
def pipeline(*cmds)
  Open3.pipeline(*cmds).each_with_index do |status, index|
    next if status.success?

    raise "Fail at process #{index}; #{status}; #{cmds[index]}"
  end
end
|
135
|
+
|
136
|
+
# Command line for `sort`: the coreutils-prefixed program name, the
# optional --buffer-size and --parallel settings, and pigz as the
# compression program for temporary files.
def sort_command
  parts = ["#{options.coreutils_prefix}sort", buffer_size_option]
  parts << ' --parallel=' + options.parallel.to_s if options.key?(:parallel)
  parts << ' --compress-program=pigz'
  parts.join
end
|
139
|
+
|
140
|
+
# Runs +cmd+ with Kernel#system and raises a RuntimeError describing the
# process status when it did not finish successfully.
def sh(cmd)
  system(cmd)
  status = $?
  return if status.success?

  raise "Fail at process #{status.pid}; #{status}; #{cmd}"
end
|
144
|
+
|
145
|
+
# Name of the `tee` program, prefixed with +options.coreutils_prefix+.
def tee_command(options)
  prefix = options.coreutils_prefix
  "#{prefix}tee"
end
|
148
|
+
|
149
|
+
# Name of the `uniq` program, prefixed with +options.coreutils_prefix+.
def uniq_command(options)
  prefix = options.coreutils_prefix
  "#{prefix}uniq"
end
|
152
|
+
|
153
|
+
# Deletes every path in +files+ that currently exists; missing paths are
# skipped silently.
def unlink_files(files)
  files.each { |path| File.unlink(path) if File.exist?(path) }
end
|
158
|
+
|
159
|
+
# Name of the `wc` program, prefixed with +options.coreutils_prefix+.
def wc_command(options)
  prefix = options.coreutils_prefix
  "#{prefix}wc"
end
|
162
|
+
|
163
|
+
end
|
164
|
+
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
require 'bio/gadgets'
|
169
|
+
require 'bio/gadget/fq1l'
|
170
|
+
require 'bio/gadget/strt'
|
171
|
+
require 'bio/gadget/bio_gadget'
|
@@ -0,0 +1,457 @@
|
|
1
|
+
require 'damerau-levenshtein'
|
2
|
+
require 'io/wait'
|
3
|
+
require 'open3'
|
4
|
+
|
5
|
+
module Bio
|
6
|
+
class Gadget
|
7
|
+
class Fq1l < Bio::Gadget
|
8
|
+
|
9
|
+
# Thor option specification (name, settings) for the boolean
# --invert-match flag.
OPT_INVERT_MATCH = [
  :invert_match,
  {
    desc: 'The sense of matching',
    type: :boolean
  }
]
|
15
|
+
|
16
|
+
# Thor option specification (name, settings) for --minimum-length
# (numeric, default 40).
OPT_MINIMUM_LENGTH = [
  :minimum_length,
  {
    banner: 'NT',
    default: 40,
    desc: 'Minimum length after trimming',
    type: :numeric
  }
]
|
24
|
+
|
25
|
+
# fq1l:annotate_index
|
26
|
+
|
27
|
+
desc 'annotate_index', 'Annotate sequence identifier by index sequence at the specified region'
|
28
|
+
|
29
|
+
method_option :first_cycle,
|
30
|
+
default: 7,
|
31
|
+
desc: 'The first cycle of index',
|
32
|
+
type: :numeric
|
33
|
+
|
34
|
+
method_option :last_cycle,
|
35
|
+
default: 12,
|
36
|
+
desc: 'The last cycle of index',
|
37
|
+
type: :numeric
|
38
|
+
|
39
|
+
# Thor command: exits when STDIN.wait reports nothing, otherwise hands the
# :first_cycle / :last_cycle options to BioGadget.i2i.
# NOTE(review): BioGadget is presumably the C extension required by
# bio/gadget; its behavior is not visible here.
def annotate_index
  exit unless STDIN.wait
  first = options.first_cycle
  last = options.last_cycle
  BioGadget.i2i(first, last)
end
|
43
|
+
|
44
|
+
# fq1l:annotate_umi
|
45
|
+
|
46
|
+
desc 'annotate_umi', 'Annotate sequence identifier by UMI sequence at the specified region'
|
47
|
+
|
48
|
+
method_option :first_cycle,
|
49
|
+
default: 1,
|
50
|
+
desc: 'The first cycle of UMI',
|
51
|
+
type: :numeric
|
52
|
+
|
53
|
+
method_option :last_cycle,
|
54
|
+
default: 6,
|
55
|
+
desc: 'The last cycle of UMI',
|
56
|
+
type: :numeric
|
57
|
+
|
58
|
+
# Thor command: exits when STDIN.wait reports nothing, otherwise hands the
# :first_cycle / :last_cycle options to BioGadget.u2i.
# NOTE(review): BioGadget is presumably the C extension required by
# bio/gadget; its behavior is not visible here.
def annotate_umi
  exit unless STDIN.wait
  first = options.first_cycle
  last = options.last_cycle
  BioGadget.u2i(first, last)
end
|
62
|
+
|
63
|
+
# fq1l:convert
|
64
|
+
|
65
|
+
desc 'convert', 'Convert fastq from 4 lines/read to 1 line/read for this utility'
|
66
|
+
|
67
|
+
method_option *OPT_COREUTILS_PREFIX
|
68
|
+
|
69
|
+
# Thor command: replaces the current process with `paste - - - -`
# (coreutils-prefixed), which joins every four input lines into one
# tab-separated line. Exits first when STDIN.wait reports nothing.
def convert
  exit unless STDIN.wait
  paste = "#{options.coreutils_prefix}paste"
  exec "#{paste} - - - -"
end
|
73
|
+
|
74
|
+
# fq1l:count
|
75
|
+
|
76
|
+
desc 'count [CSV]', 'Count sequences by the length'
|
77
|
+
|
78
|
+
method_option *OPT_COREUTILS_PREFIX
|
79
|
+
method_option *OPT_PARALLEL
|
80
|
+
|
81
|
+
# Thor command: counts reads per sequence length.
#
# Without +csv+, prints a "length,reads" header and then runs a pipeline
# that takes field 2 of each line, measures its length and tallies the
# lengths. With +csv+, the input is passed through unchanged via `tee`
# while a background `fq1l count` reads a copy from a fifo and writes its
# table to +csv+.
def count(csv = nil)
  exit unless STDIN.wait

  unless csv.nil?
    fifo = get_fifo('fq1l.count', 'fq1l')
    pid = Kernel.spawn("fq1l count#{coreutils_prefix_option} < #{fifo} > #{csv}")
    system "#{tee_command(options)} #{fifo}"
    return Process.waitpid(pid)
  end

  puts "length,reads"
  stages = [
    "#{cut_command} -f 2",
    "ruby -nle 'puts $_.length'",
    "#{sort_command} -n",
    "#{uniq_command(options)} -c",
    "ruby -anle 'puts $F.reverse.join(\",\")'"
  ]
  pipeline(*stages)
end
|
97
|
+
|
98
|
+
# fq1l:demultiplex
|
99
|
+
|
100
|
+
desc 'demultiplex BASE MAP', 'Demultiplex based on a barcode MAP, and restore sequence files with BASE names'
|
101
|
+
|
102
|
+
method_option :maximum_distance,
|
103
|
+
default: 1,
|
104
|
+
desc: 'Maximum distance between barcode and sequence',
|
105
|
+
type: :numeric
|
106
|
+
|
107
|
+
# Thor command: splits reads from STDIN into per-well FASTQ files.
#
# +map+ is a CSV file of "barcode,well" lines; reads whose barcode matches
# a map entry within :maximum_distance (Damerau-Levenshtein) are appended
# to "<base>.<well>.fq", all others to "<base>.NA.fq". The barcode of a
# read is the tail of its first tab-separated field, as long as the
# shortest barcode in the map. Finished files are compressed with pigz.
#
# Compared with the previous implementation:
# * the map is read with File.foreach, so the handle is closed and a path
#   starting with "|" is never run as a command;
# * blank map lines and empty barcodes are skipped instead of crashing on
#   a nil barcode;
# * the current output file is closed even if processing raises;
# * pigz receives the file name as an argument, not via a shell string.
def demultiplex(base, map)
  dl = DamerauLevenshtein

  exit unless STDIN.wait

  bc2fq = {}
  File.foreach(map) do |line|
    bc, well = line.rstrip.split(',')
    next if bc.nil? || bc.empty?

    fq = "#{base}.#{well}.fq"
    bc2fq[bc] = fq
    # Output files are opened in append mode below, so start from scratch.
    File.unlink(fq) if File.exist?(fq)
  end
  na = "#{base}.NA.fq"
  File.unlink(na) if File.exist?(na)

  # Length of the shortest barcode.
  bcl = bc2fq.keys.map(&:length).min

  fp = nil
  pbc = nil
  begin
    STDIN.set_encoding('BINARY').each do |line|
      acc, seq, sep, qual = line.rstrip.split(/\t/)
      bc = acc[-bcl, bcl]
      # The output file is re-resolved only when the barcode changes from
      # the previous read.
      if bc != pbc
        mindist = options.maximum_distance + 1
        minbc = nil
        bc2fq.each_key do |key|
          dist = dl.distance(key, bc, 0, options.maximum_distance)
          if dist < mindist
            mindist = dist
            minbc = key
          end
          break if dist == 0
        end
        fp.close unless fp.nil?
        fp = File.open(mindist <= options.maximum_distance ? bc2fq[minbc] : na, 'a')
        pbc = bc
      end
      fp.puts "#{acc}\n#{seq}\n#{sep}\n#{qual}"
    end
  ensure
    fp.close unless fp.nil?
  end

  bc2fq.each_value { |fq| system('pigz', fq) if File.exist?(fq) }
  system('pigz', na) if File.exist?(na)
end
|
152
|
+
|
153
|
+
# fq1l:exclude_degenerate
|
154
|
+
|
155
|
+
desc 'exclude_degenerate', 'Exclude degenerated reads in the order'
|
156
|
+
|
157
|
+
# Thor command: exits when STDIN.wait reports nothing, otherwise delegates
# to BioGadget.nr_deg.
# NOTE(review): BioGadget is presumably the C extension required by
# bio/gadget; its behavior is not visible here.
def exclude_degenerate
  exit unless STDIN.wait
  BioGadget.nr_deg
end
|
161
|
+
|
162
|
+
# fq1l:exclude_duplicate
|
163
|
+
|
164
|
+
desc 'exclude_duplicate', 'Exclude duplicated reads in the order'
|
165
|
+
|
166
|
+
# Thor command: exits when STDIN.wait reports nothing, otherwise delegates
# to BioGadget.nr_std.
# NOTE(review): BioGadget is presumably the C extension required by
# bio/gadget; its behavior is not visible here.
def exclude_duplicate
  exit unless STDIN.wait
  BioGadget.nr_std
end
|
170
|
+
|
171
|
+
# fq1l:match_3end
|
172
|
+
|
173
|
+
desc 'match_3end PATTERN', 'Select sequences that match the 3\'-end with a given PATTERN'
|
174
|
+
|
175
|
+
method_option *OPT_INVERT_MATCH
|
176
|
+
method_option *OPT_GREP_PREFIX
|
177
|
+
|
178
|
+
# Thor command: filters fq1l lines on STDIN with grep, keeping (or, with
# --invert-match, dropping) reads whose second tab-separated field ends
# with +pattern+ (a PCRE fragment).
#
# Exit status: 0 when grep exits 0 (matches) or 1 (no matches); otherwise
# grep's own exit status, or 1 if grep did not exit normally. The previous
# code compared Process::Status#to_i, which is the raw wait status
# (exit code << 8), so a grep failure such as exit 2 became `exit 512`,
# i.e. status 0, and errors were silently reported as success.
#
# grep is invoked with an argument vector so the pattern is never
# interpreted by a shell.
def match_3end(pattern)
  exit unless STDIN.wait
  # PCRE was faster than BRE and ERE in GNU grep 2.25
  args = [grep_command]
  args << '-v' if options.invert_match
  args.push('-P', '-e', "^[^\\t]+\\t[^\\t]*#{pattern}\\t")
  system(*args)
  status = $?
  normal = status.exited? && status.exitstatus <= 1
  exit(normal ? 0 : (status.exitstatus || 1))
end
|
184
|
+
|
185
|
+
# fq1l:match_5end
|
186
|
+
|
187
|
+
desc 'match_5end PATTERN', 'Select sequences that match the 5\'-end with a given PATTERN'
|
188
|
+
|
189
|
+
method_option *OPT_INVERT_MATCH
|
190
|
+
method_option *OPT_GREP_PREFIX
|
191
|
+
|
192
|
+
# Thor command: filters fq1l lines on STDIN with grep, keeping (or, with
# --invert-match, dropping) reads whose second tab-separated field starts
# with +pattern+ (a PCRE fragment).
#
# Exit status: 0 when grep exits 0 (matches) or 1 (no matches); otherwise
# grep's own exit status, or 1 if grep did not exit normally. The previous
# code compared Process::Status#to_i, which is the raw wait status
# (exit code << 8), so a grep failure such as exit 2 became `exit 512`,
# i.e. status 0, and errors were silently reported as success.
#
# grep is invoked with an argument vector so the pattern is never
# interpreted by a shell.
def match_5end(pattern)
  exit unless STDIN.wait
  # PCRE was faster than BRE and ERE in GNU grep 2.25
  args = [grep_command]
  args << '-v' if options.invert_match
  args.push('-P', '-e', "^[^\\t]+\\t#{pattern}")
  system(*args)
  status = $?
  normal = status.exited? && status.exitstatus <= 1
  exit(normal ? 0 : (status.exitstatus || 1))
end
|
198
|
+
|
199
|
+
# fq1l:restore
|
200
|
+
|
201
|
+
desc 'restore', 'Convert fastq from 1 line/read to 4 lines/read'
|
202
|
+
|
203
|
+
method_option *OPT_COREUTILS_PREFIX
|
204
|
+
|
205
|
+
# Thor command: replaces the current process with `tr "\t" "\n"`
# (coreutils-prefixed), turning each tab into a newline. Exits first when
# STDIN.wait reports nothing.
def restore
  exit unless STDIN.wait
  tr = "#{options.coreutils_prefix}tr"
  exec %(#{tr} "\\t" "\\n")
end
|
209
|
+
|
210
|
+
# fq1l:slice
|
211
|
+
|
212
|
+
desc 'slice Nth SLICE', 'Slice the sequences'
|
213
|
+
|
214
|
+
# Thor command: converts both arguments to integers and passes them to
# BioGadget.slice. Exits first when STDIN.wait reports nothing.
# NOTE(review): the meaning of the two integers is defined by the
# BioGadget extension and is not visible here.
def slice(nth, slice)
  exit unless STDIN.wait
  nth_value, slice_value = [nth, slice].map(&:to_i)
  BioGadget.slice(nth_value, slice_value)
end
|
218
|
+
|
219
|
+
# fq1l:sort
|
220
|
+
|
221
|
+
desc 'sort [FQ1Ls]', 'Sort by sequence and the quality in descending order'
|
222
|
+
|
223
|
+
method_option *OPT_COREUTILS_PREFIX
|
224
|
+
method_option *OPT_BUFFER_SIZE
|
225
|
+
method_option *OPT_PARALLEL
|
226
|
+
|
227
|
+
# Thor command: replaces the current process with `sort`, ordering
# tab-separated lines by fields 2-4 in reverse order. With no arguments
# it sorts STDIN (exiting first when STDIN.wait reports nothing); with
# file arguments it merges them (-m).
def sort(*fq1ls)
  if fq1ls.empty?
    exit unless STDIN.wait
    exec "#{sort_command} -t '\t' -r -k2,4"
  else
    inputs = fq1ls.join(' ')
    exec "#{sort_command} -t '\t' -r -k2,4 -m #{inputs}"
  end
end
|
235
|
+
|
236
|
+
# fq1l:sort_index
|
237
|
+
|
238
|
+
desc 'sort_index', 'Sort by index'
|
239
|
+
|
240
|
+
method_option *OPT_COREUTILS_PREFIX
|
241
|
+
method_option *OPT_BUFFER_SIZE
|
242
|
+
method_option *OPT_PARALLEL
|
243
|
+
|
244
|
+
# Thor command: replaces the current process with `sort -k2`. Exits first
# when STDIN.wait reports nothing.
def sort_index
  exit unless STDIN.wait
  command = sort_command
  exec "#{command} -k2"
end
|
248
|
+
|
249
|
+
# fq1l:sum_counts
|
250
|
+
|
251
|
+
desc 'sum_counts CSV ...', 'Sum counts of sequences by the length'
|
252
|
+
|
253
|
+
# Thor command: adds up "length,count" tables from several CSV files and
# prints the combined table, sorted by length, under a "length,count"
# header. Header lines (first field "length") are ignored.
#
# Files are read with File.foreach, so each handle is closed after
# reading and a path beginning with "|" is treated as a file name rather
# than a command (the previous `open(csv).each` did neither). Blank lines
# are skipped instead of producing a bogus "0,0" row.
def sum_counts(*csvs)
  length2count = Hash.new(0)
  csvs.each do |csv|
    File.foreach(csv) do |line|
      l, c = line.rstrip.split(',')
      next if l.nil? || l == 'length'

      length2count[l.to_i] += c.to_i
    end
  end
  puts "length,count"
  length2count.keys.sort.each do |length|
    puts "#{length},#{length2count[length]}"
  end
end
|
269
|
+
|
270
|
+
# fq1l:thin_out
|
271
|
+
|
272
|
+
desc 'thin_out DRAW SKIP', 'Thin out the sequences'
|
273
|
+
|
274
|
+
# Thor command: converts both arguments to integers and passes them to
# BioGadget.to. Exits first when STDIN.wait reports nothing.
# NOTE(review): the preceding `desc` advertises this command as
# "thin_out DRAW SKIP" while the method is named `to` - confirm which
# name users are expected to type.
def to(draw, skip)
  exit unless STDIN.wait
  draw_count, skip_count = [draw, skip].map(&:to_i)
  BioGadget.to(draw_count, skip_count)
end
|
278
|
+
|
279
|
+
# fq1l:trim_3end
|
280
|
+
|
281
|
+
desc 'trim_3end SEQUENCE', 'Trim 3\'-end that match with a given SEQUENCE'
|
282
|
+
|
283
|
+
method_option *OPT_COREUTILS_PREFIX
|
284
|
+
method_option *OPT_GREP_PREFIX
|
285
|
+
method_option *OPT_MINIMUM_LENGTH
|
286
|
+
|
287
|
+
# --trimmed=FILE: where trim_3end writes the trimmed reads.
# (Help text typo "speficied" corrected.)
method_option :trimmed,
              banner: 'FILE',
              desc: 'FILE for trimmed reads; STDOUT if not specified',
              type: :string
|
291
|
+
|
292
|
+
# Thor command: trims reads whose 3'-end matches +sequence+.
#
# STDIN is duplicated with `tee`: one copy goes through a fifo to a forked
# child that runs BioGadget.t3 on the reads selected by
# `fq1l match_3end SEQUENCE` (presumably writing the trimmed reads to
# +tmpfile+ - the extension is not visible here); the other copy goes to
# `fq1l match_3end SEQUENCE --invert-match`, whose output (the reads that
# do not match) is emitted directly. Unless --trimmed=FILE was given, the
# trimmed reads are then appended to STDOUT from a temporary file.
#
# The forked child is now waited for before the temporary file is read.
# Previously nothing waited on it, so `cat` (or the caller that passed
# --trimmed) could read the file before the child had finished writing.
def trim_3end(sequence)
  # exit unless STDIN.wait
  grep_prefix = options.key?(:grep_prefix) ? " --grep-prefix=#{options.grep_prefix}" : ''
  fifo = get_fifo('fq1l.trim_3end', 'fq1l', false)
  begin
    tmpfile = options.key?(:trimmed) ? File.expand_path(options.trimmed) : get_temporary_path('fq1l.trim_3end', 'fq1l', false)
    begin
      pid = Process.fork do
        BioGadget.t3("fq1l match_3end#{grep_prefix} #{sequence} < #{fifo}", sequence.length, options.minimum_length, tmpfile)
      end
      pipeline("#{tee_command(options)} #{fifo}",
               "fq1l match_3end#{grep_prefix} #{sequence} --invert-match")
      # Only reached when the pipeline succeeded, i.e. tee has written and
      # closed the fifo, so the child will see end-of-input and terminate.
      Process.waitpid(pid)
      raise "Fail at process #{pid}; #{$?}; fq1l match_3end#{grep_prefix} #{sequence} < #{fifo}" unless $?.success?
    ensure
      system "#{cat_command} #{tmpfile}" unless options.key?(:trimmed)
    end
  ensure
    File.unlink(fifo) if File.exist?(fifo)
    File.unlink(tmpfile) if File.exist?(tmpfile) && !options.key?(:trimmed)
  end
end
|
312
|
+
|
313
|
+
# fq1l:trim_3end_length
|
314
|
+
|
315
|
+
desc 'trim_3end_length', 'Trim 3\'-end by a specific length'
|
316
|
+
|
317
|
+
method_option *OPT_MINIMUM_LENGTH
|
318
|
+
|
319
|
+
method_option :trimming_length,
|
320
|
+
default: 1,
|
321
|
+
desc: 'Length of the trimming',
|
322
|
+
type: :numeric
|
323
|
+
|
324
|
+
# Thor command: passes the :trimming_length and :minimum_length options to
# BioGadget.t3, with nil for both the command and the output file
# arguments. Exits first when STDIN.wait reports nothing.
def trim_3end_length
  exit unless STDIN.wait
  length = options.trimming_length
  minimum = options.minimum_length
  BioGadget.t3(nil, length, minimum, nil)
end
|
328
|
+
|
329
|
+
# fq1l:trim_3end_primer
|
330
|
+
|
331
|
+
desc 'trim_3end_primer', 'Trim 3\'-end that match with a given primer'
|
332
|
+
|
333
|
+
method_option *OPT_COREUTILS_PREFIX
|
334
|
+
method_option *OPT_GREP_PREFIX
|
335
|
+
method_option *OPT_MINIMUM_LENGTH
|
336
|
+
method_option *OPT_PARALLEL
|
337
|
+
|
338
|
+
# --primers: comma-separated primer sequences for trim_3end_primer.
# (Help text grammar corrected: "that be used" -> "to be used".)
method_option :primers,
              default: 'AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG,CTCGTATGCCGTCTTCTGCTTG',
              desc: 'Comma-separated primer sequences to be used for trimming',
              type: :string
|
342
|
+
|
343
|
+
# Thor command: trims primer sequence from the 3'-end of reads.
#
# Every distinct 5'-prefix of every primer in --primers is a candidate
# "fragment". Fragments are processed from longest to shortest as a
# pipeline of `fq1l trim_3end` commands, each writing its trimmed reads to
# its own temporary file. As soon as a length is reached for which all
# 4**length possible fragments are present, a single
# `fq1l trim_3end_length` command is appended instead and no shorter
# fragments are processed. After the pipeline succeeds, the temporary
# files are concatenated to STDOUT.
#
# Temporary files are now removed in an `ensure`, so they are also cleaned
# up when Open3.pipeline itself raises, not only when a stage fails.
#
# Raises RuntimeError when any pipeline stage exits unsuccessfully.
def trim_3end_primer
  opt_minimum_length = "--minimum-length=#{options.minimum_length}"

  # Distinct primer prefixes, grouped by their length.
  fragments = options.primers.split(',')
                     .flat_map { |primer| (1..primer.length).map { |n| primer[0, n] } }
                     .uniq
                     .group_by(&:length)

  exit unless STDIN.wait

  trim_3end = 'fq1l trim_3end'
  trim_3end += " --coreutils-prefix=#{options.coreutils_prefix}" if options.key?(:coreutils_prefix)
  trim_3end += " --grep-prefix=#{options.grep_prefix}" if options.key?(:grep_prefix)

  tmpfiles = []
  commands = []
  fragments.keys.sort.reverse_each do |length|
    candidates = fragments[length]
    if 4**length == candidates.size
      # Every possible sequence of this length is a fragment: trimming by
      # length is equivalent, and shorter fragments need no processing.
      commands << "fq1l trim_3end_length --trimming-length=#{length} #{opt_minimum_length}"
      break
    end

    candidates.sort.reverse_each do |fragment|
      tmpfile = get_temporary_path("fq1l.trim_3end_primer.#{fragment}", 'fq1l', false)
      tmpfiles << tmpfile
      commands << "#{trim_3end} #{opt_minimum_length} --trimmed=#{tmpfile} #{fragment}"
    end
  end

  begin
    Open3.pipeline(*commands).each_with_index do |status, i|
      raise "Fail at process #{i}; #{status}; #{commands[i]}" unless status.success?
    end
    system "#{cat_command} #{tmpfiles.join(' ')}"
  ensure
    unlink_files(tmpfiles)
  end
end
|
390
|
+
|
391
|
+
# fq1l:trim_3end_quality
|
392
|
+
|
393
|
+
desc 'trim_3end_quality', 'Trim 3\'-end from a low quality base'
|
394
|
+
|
395
|
+
method_option *OPT_MINIMUM_LENGTH
|
396
|
+
|
397
|
+
method_option :low_qualities,
|
398
|
+
banner: 'CHARACTERS',
|
399
|
+
default: '!"#',
|
400
|
+
desc: 'Low quality characters',
|
401
|
+
type: :string
|
402
|
+
|
403
|
+
# Thor command: passes the :low_qualities and :minimum_length options to
# BioGadget.t3q. Unlike most sibling commands, there is no STDIN.wait
# check before the call.
def trim_3end_quality
  low_qualities = options.low_qualities
  minimum = options.minimum_length
  BioGadget.t3q(low_qualities, minimum)
end
|
406
|
+
|
407
|
+
# fq1l:trim_5end
|
408
|
+
|
409
|
+
desc 'trim_5end PATTERN', 'Trim 5\'-end that match with a given PATTERN'
|
410
|
+
|
411
|
+
method_option :minimum_length,
|
412
|
+
banner: 'NT',
|
413
|
+
default: 24,
|
414
|
+
desc: 'Minimum length after trimming',
|
415
|
+
type: :numeric
|
416
|
+
|
417
|
+
|
418
|
+
# Thor command: passes +pattern+ and the :minimum_length option to
# BioGadget.t5. Exits first when STDIN.wait reports nothing.
def trim_5end(pattern)
  exit unless STDIN.wait
  minimum = options.minimum_length
  BioGadget.t5(pattern, minimum)
end
|
422
|
+
|
423
|
+
# #
|
424
|
+
|
425
|
+
# no_commands do
|
426
|
+
|
427
|
+
# def pipeline(parallel, *commands)
|
428
|
+
# stats = Array.new
|
429
|
+
# tmpin = nil
|
430
|
+
# tmpout = nil
|
431
|
+
# begin
|
432
|
+
# while commands.size > 0
|
433
|
+
# cmds = commands.shift(parallel)
|
434
|
+
# cmds[0] = cmds[0] + " < #{tmpin}" unless tmpin.nil?
|
435
|
+
# if commands.size > 0
|
436
|
+
# tmpout = get_temporary_path('pipeline', 'tmp', false)
|
437
|
+
# cmds[-1] = cmds[-1] + " > #{tmpout}"
|
438
|
+
# end
|
439
|
+
# tmpstats = Open3.pipeline(*cmds)
|
440
|
+
# stats.concat(tmpstats)
|
441
|
+
# tmpstats.each {|tmpstat| commands = nil unless tmpstat.success? }
|
442
|
+
# break if commands.nil?
|
443
|
+
# File.unlink(tmpin) unless tmpin.nil?
|
444
|
+
# tmpin = tmpout
|
445
|
+
# end
|
446
|
+
# ensure
|
447
|
+
# File.unlink(tmpin) if !tmpin.nil? && File.exist?(tmpin)
|
448
|
+
# File.unlink(tmpout) if !tmpout.nil? && File.exist?(tmpout)
|
449
|
+
# end
|
450
|
+
# stats
|
451
|
+
# end
|
452
|
+
|
453
|
+
# end
|
454
|
+
|
455
|
+
end
|
456
|
+
end
|
457
|
+
end
|