bio-gadget 0.4.8 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ #ifndef BIO_GADGET_H
2
+ #define BIO_GADGET_H 1
3
+
4
+ #include "ruby.h"
5
+
6
+ #define BUFSIZE 65536
7
+
8
+ #endif /* BIO_GADGET_H */
@@ -0,0 +1,3 @@
1
# Build script for the native extension: writes the Makefile that builds
# the bio/gadget/bio_gadget shared object.
require 'mkmf'

create_makefile 'bio/gadget/bio_gadget'
@@ -0,0 +1,171 @@
1
+ require 'mkfifo'
2
+ require 'open3'
3
+ require 'tempfile'
4
+ require 'thor'
5
+
6
+ module Bio
7
+ class Gadget < Thor
8
+
9
# Shared Thor option specs. Each constant is a [name, settings] pair meant to
# be splatted into `method_option`.

# -S / --buffer-size: SIZE string for the main memory buffer.
OPT_BUFFER_SIZE = [
  :buffer_size,
  {
    aliases: '-S',
    banner: 'SIZE',
    desc: 'Use SIZE for main memory buffer',
    type: :string
  }
]

# --download: one of 'yes', 'no' or 'only' (default 'yes').
OPT_DOWNLOAD = [
  :download,
  {
    banner: 'BEHAVIOR',
    default: 'yes',
    desc: 'Download and process, no download or only',
    enum: ['yes', 'no', 'only']
  }
]

# --parallel: defaults to the output of `gnproc` when it is on the PATH,
# otherwise `nproc`, otherwise 2. Probed once, when this file is loaded.
OPT_PARALLEL = [
  :parallel,
  {
    banner: 'N',
    default: if system('which gnproc >/dev/null 2>&1')
               `gnproc`.to_i
             elsif system('which nproc >/dev/null 2>&1')
               `nproc`.to_i
             else
               2
             end,
    desc: 'Change the number of sorts run concurrently',
    type: :numeric
  }
]

# --coreutils-prefix: 'g' when `gnproc` is on the PATH, otherwise empty.
OPT_COREUTILS_PREFIX = [
  :coreutils_prefix,
  {
    banner: 'PREFIX',
    default: (system('which gnproc >/dev/null 2>&1') ? 'g' : ''),
    desc: 'A prefix character for GNU coreutils',
    type: :string
  }
]

# --grep-prefix: 'g' when `ggrep` is on the PATH, otherwise empty.
OPT_GREP_PREFIX = [
  :grep_prefix,
  {
    banner: 'PREFIX',
    default: (system('which ggrep >/dev/null 2>&1') ? 'g' : ''),
    desc: 'A prefix character for GNU grep',
    type: :string
  }
]
54
+
55
+ #
56
+
57
+ no_commands do
58
+
59
# Usage line for +command+: "<basename> [<package name> ]<command usage>".
# +namespace+ and +subcommand+ are accepted but not used.
def self.banner(command, namespace = true, subcommand = false)
  package = @package_name.nil? ? '' : @package_name.to_s + ' '
  [basename, ' ', package, command.usage].join
end
62
+
63
# " --buffer-size=SIZE" when the :buffer_size option is present, else "".
def buffer_size_option
  return '' unless options.key?(:buffer_size)

  ' --buffer-size=' + options.buffer_size
end
66
+
67
# Name of the cat executable, with the configured coreutils prefix.
def cat_command
  prefix = options.coreutils_prefix
  "#{prefix}cat"
end
70
+
71
# " --coreutils-prefix=PREFIX" when the option is present, else "".
def coreutils_prefix_option
  return '' unless options.key?(:coreutils_prefix)

  " --coreutils-prefix=#{options.coreutils_prefix}"
end
74
+
75
# Name of the cut executable, with the configured coreutils prefix.
def cut_command
  format('%scut', options.coreutils_prefix)
end
78
+
79
# Download +url+ to +path+ with curl (-R -f -s -S). Returns true on success;
# on failure the process exits with curl's exit status.
#
# curl is invoked with an argument vector rather than through a shell, so
# spaces or quotes in +path+ or +url+ cannot break or alter the command.
# Note that shell expansions such as "~" in +path+ are therefore not applied.
def download_file(url, path)
  return true if system('curl', '-R', '-f', '-s', '-S', '-o', path.to_s, url.to_s)

  # exitstatus is nil when curl was terminated by a signal
  exit($?.exitstatus || 1)
end
82
+
83
# Name of the fold executable, prefixed per the given +options+.
def fold_command(options)
  prefix = options.coreutils_prefix
  "#{prefix}fold"
end
86
+
87
# Command line for `fq1l convert`, forwarding the coreutils prefix.
# The +options+ argument is accepted but not read here; the prefix comes
# from coreutils_prefix_option.
def fq1l_convert_command(options)
  'fq1l convert' + coreutils_prefix_option
end
90
+
91
# Command line for `fq1l count`, forwarding the coreutils prefix and the
# --parallel setting taken from +options+.
def fq1l_count_command(options)
  ['fq1l count', coreutils_prefix_option, parallel_option(options)].join
end
94
+
95
# Command line for `fq1l sort`, forwarding the coreutils prefix and the
# --parallel setting taken from +options+.
def fq1l_sort_command(options)
  ['fq1l sort', coreutils_prefix_option, parallel_option(options)].join
end
98
+
99
# Reserve a unique temporary path named "rbg.<prefix>.<unique>.<suffix>".
# Only the name is generated; no file is created. With +cleanup+ an at_exit
# hook removes whatever exists at that path when the process ends.
def get_temporary_path(prefix, suffix, cleanup=true)
  path = Dir::Tmpname.create(["rbg.#{prefix}.", ".#{suffix}"]) { }
  at_exit { File.unlink(path) if FileTest.exist?(path) } if cleanup
  path
end
106
+
107
# Create a named pipe at a fresh temporary path ("<prefix>.fifo" is used as
# the path prefix) and return that path.
def get_fifo(prefix, suffix, cleanup=true)
  get_temporary_path("#{prefix}.fifo", suffix, cleanup).tap do |path|
    File.mkfifo(path)
  end
end
112
+
113
# Name of the grep executable, with the configured grep prefix.
def grep_command
  prefix = options.grep_prefix
  "#{prefix}grep"
end
116
+
117
# " --grep-prefix=PREFIX" when +options+ has :grep_prefix, else "".
def grep_prefix_option(options)
  return '' unless options.key?(:grep_prefix)

  " --grep-prefix=#{options.grep_prefix}"
end
120
+
121
# Name of the head executable, prefixed per the given +options+.
def head_command(options)
  format('%shead', options.coreutils_prefix)
end
124
+
125
# " --parallel=N" when +options+ has :parallel, else "".
def parallel_option(options)
  return '' unless options.key?(:parallel)

  " --parallel=#{options.parallel}"
end
128
+
129
# Run +cmds+ as one shell pipeline (Open3.pipeline) and raise a RuntimeError
# naming the first stage that did not exit successfully. Returns the array
# of Process::Status objects when every stage succeeds.
def pipeline(*cmds)
  Open3.pipeline(*cmds).each_with_index do |status, idx|
    next if status.success?

    raise "Fail at process #{idx}; #{status}; #{cmds[idx]}"
  end
end
135
+
136
# Base sort command line: prefixed executable, optional --buffer-size and
# --parallel settings, and pigz as the compression program.
def sort_command
  parallel = options.key?(:parallel) ? ' --parallel=' + options.parallel.to_s : ''
  "#{options.coreutils_prefix}sort#{buffer_size_option}#{parallel} --compress-program=pigz"
end
139
+
140
# Run +cmd+ through the shell; raise a RuntimeError describing the process
# when it does not exit successfully. Returns nil on success.
def sh(cmd)
  system cmd
  status = $?
  return if status.success?

  raise "Fail at process #{status.pid}; #{status}; #{cmd}"
end
144
+
145
# Name of the tee executable, prefixed per the given +options+.
def tee_command(options)
  prefix = options.coreutils_prefix
  "#{prefix}tee"
end
148
+
149
# Name of the uniq executable, prefixed per the given +options+.
def uniq_command(options)
  format('%suniq', options.coreutils_prefix)
end
152
+
153
# Delete every path in +files+ that currently exists; missing paths are
# skipped. Returns +files+.
def unlink_files(files)
  files.each { |path| File.exist?(path) && File.unlink(path) }
end
158
+
159
# Name of the wc executable, prefixed per the given +options+.
def wc_command(options)
  prefix = options.coreutils_prefix
  "#{prefix}wc"
end
162
+
163
+ end
164
+
165
+ end
166
+ end
167
+
168
+ require 'bio/gadgets'
169
+ require 'bio/gadget/fq1l'
170
+ require 'bio/gadget/strt'
171
+ require 'bio/gadget/bio_gadget'
@@ -0,0 +1,457 @@
1
+ require 'damerau-levenshtein'
2
+ require 'io/wait'
3
+ require 'open3'
4
+
5
+ module Bio
6
+ class Gadget
7
+ class Fq1l < Bio::Gadget
8
+
9
# --invert-match: boolean switch read by the match_* commands.
OPT_INVERT_MATCH = [
  :invert_match,
  { desc: 'The sense of matching', type: :boolean }
]

# --minimum-length: minimum read length after trimming (default 40).
OPT_MINIMUM_LENGTH = [
  :minimum_length,
  {
    banner: 'NT',
    default: 40,
    desc: 'Minimum length after trimming',
    type: :numeric
  }
]
24
+
25
+ # fq1l:annotate_index
26
+
27
+ desc 'annotate_index', 'Annotate sequence identifier by index sequence at the specified region'
28
+
29
+ method_option :first_cycle,
30
+ default: 7,
31
+ desc: 'The first cycle of index',
32
+ type: :numeric
33
+
34
+ method_option :last_cycle,
35
+ default: 12,
36
+ desc: 'The last cycle of index',
37
+ type: :numeric
38
+
39
# Exit when STDIN.wait reports nothing to read; otherwise hand the
# first/last index cycles to the native BioGadget.i2i.
def annotate_index
  STDIN.wait || exit

  first = options.first_cycle
  last = options.last_cycle
  BioGadget.i2i(first, last)
end
43
+
44
+ # fq1l:annotate_umi
45
+
46
+ desc 'annotate_umi', 'Annotate sequence identifier by UMI sequence at the specified region'
47
+
48
+ method_option :first_cycle,
49
+ default: 1,
50
+ desc: 'The first cycle of UMI',
51
+ type: :numeric
52
+
53
+ method_option :last_cycle,
54
+ default: 6,
55
+ desc: 'The last cycle of UMI',
56
+ type: :numeric
57
+
58
# Exit when STDIN.wait reports nothing to read; otherwise hand the
# first/last UMI cycles to the native BioGadget.u2i.
def annotate_umi
  STDIN.wait || exit

  first = options.first_cycle
  last = options.last_cycle
  BioGadget.u2i(first, last)
end
62
+
63
+ # fq1l:convert
64
+
65
+ desc 'convert', 'Convert fastq from 4 lines/read to 1 line/read for this utility'
66
+
67
+ method_option *OPT_COREUTILS_PREFIX
68
+
69
# Exit when STDIN.wait reports nothing to read; otherwise replace this
# process with `paste - - - -`, which joins every four input lines into one.
def convert
  STDIN.wait || exit

  paste = "#{options.coreutils_prefix}paste"
  exec "#{paste} - - - -"
end
73
+
74
+ # fq1l:count
75
+
76
+ desc 'count [CSV]', 'Count sequences by the length'
77
+
78
+ method_option *OPT_COREUTILS_PREFIX
79
+ method_option *OPT_PARALLEL
80
+
81
# Count reads by sequence length.
#
# Without +csv+: print a "length,reads" table for STDIN to STDOUT.
# With +csv+: pass STDIN through to STDOUT unchanged (via tee and a FIFO)
# while a child `fq1l count` writes the table to the file +csv+.
#
# Raises RuntimeError when tee or the child counter fails.
def count(csv = nil)
  exit unless STDIN.wait
  if csv.nil?
    puts "length,reads"
    pipeline("#{cut_command} -f 2",
             "ruby -nle 'puts $_.length'",
             "#{sort_command} -n",
             "#{uniq_command(options)} -c",
             "ruby -anle 'puts $F.reverse.join(\",\")'")
  else
    fifo = get_fifo('fq1l.count', 'fq1l')
    # The output file is attached with spawn's :out redirection instead of
    # "> csv" in the shell string, so spaces or metacharacters in the path are
    # safe. The FIFO must still be opened by the child's shell: opening it in
    # this process would block before tee (its writer) is started.
    pid = Kernel.spawn("fq1l count#{coreutils_prefix_option} < #{fifo}", out: csv)
    tee = "#{tee_command(options)} #{fifo}"
    tee_ok = system tee
    _, status = Process.wait2(pid)
    raise "Fail at tee; #{tee}" unless tee_ok
    raise "Fail at process #{status.pid}; #{status}; fq1l count" unless status.success?
  end
end
97
+
98
+ # fq1l:demultiplex
99
+
100
+ desc 'demultiplex BASE MAP', 'Demultiplex based on a barcode MAP, and restore sequence files with BASE names'
101
+
102
+ method_option :maximum_distance,
103
+ default: 1,
104
+ desc: 'Maximum distance between barcode and sequence',
105
+ type: :numeric
106
+
107
# Split the 1-line/read stream on STDIN into per-well 4-line FASTQ files.
#
# +map+ is a CSV file of "barcode,well" lines; reads go to
# "<base>.<well>.fq", or "<base>.NA.fq" when no barcode is within
# options.maximum_distance. The barcode of a read is the tail of its
# identifier (first tab-separated field), as long as the shortest barcode in
# the map. Outputs left from a previous run are removed first because files
# are opened in append mode, and every output is finally compressed with pigz.
#
# Raises ArgumentError when +map+ contains no barcode.
def demultiplex(base, map)

  dl = DamerauLevenshtein

  exit unless STDIN.wait

  max_distance = options.maximum_distance

  bc2fq = {}
  # File.foreach closes the handle and, unlike Kernel#open, never treats a
  # leading "|" in the path as a command to run.
  File.foreach(map) do |line|
    bc, well = line.rstrip.split(',')
    next if bc.nil? || bc.empty? # blank line: no barcode to register
    bc2fq[bc] = fq = "#{base}.#{well}.fq"
    File.unlink(fq) if File.exist?(fq)
  end
  raise ArgumentError, "no barcode found in #{map}" if bc2fq.empty?
  na = "#{base}.NA.fq"
  File.unlink(na) if File.exist?(na)

  bcl = bc2fq.keys.map(&:length).min

  fp = nil
  pbc = nil
  begin
    STDIN.set_encoding('BINARY').each do |line|
      acc, seq, sep, qual = line.rstrip.split(/\t/)
      next if acc.nil? # blank input line
      bc = acc[-bcl, bcl] # nil when the identifier is shorter than a barcode
      # The output is re-selected only when the barcode changes from the
      # previous read, so consecutive reads with one barcode share a handle.
      if fp.nil? || bc != pbc
        mindist = max_distance + 1
        minbc = nil
        unless bc.nil?
          bc2fq.each_key do |key|
            dist = dl.distance(key, bc, 0, max_distance)
            if dist < mindist
              mindist = dist
              minbc = key
            end
            break if dist == 0
          end
        end
        fp.close unless fp.nil?
        fp = File.open(mindist <= max_distance ? bc2fq[minbc] : na, 'a')
        pbc = bc
      end
      fp.puts "#{acc}\n#{seq}\n#{sep}\n#{qual}"
    end
  ensure
    fp.close unless fp.nil?
  end

  # Argument-vector form keeps file names with spaces intact.
  (bc2fq.values.uniq << na).each { |fq| system('pigz', fq) if File.exist?(fq) }

end
152
+
153
+ # fq1l:exclude_degenerate
154
+
155
+ desc 'exclude_degenerate', 'Exclude degenerated reads in the order'
156
+
157
# Exit when STDIN.wait reports nothing to read; otherwise run the native
# BioGadget.nr_deg filter.
def exclude_degenerate
  STDIN.wait || exit

  BioGadget.nr_deg
end
161
+
162
+ # fq1l:exclude_duplicate
163
+
164
+ desc 'exclude_duplicate', 'Exclude duplicated reads in the order'
165
+
166
# Exit when STDIN.wait reports nothing to read; otherwise run the native
# BioGadget.nr_std filter.
def exclude_duplicate
  STDIN.wait || exit

  BioGadget.nr_std
end
170
+
171
+ # fq1l:match_3end
172
+
173
+ desc 'match_3end PATTERN', 'Select sequences that match the 3\'-end with a given PATTERN'
174
+
175
+ method_option *OPT_INVERT_MATCH
176
+ method_option *OPT_GREP_PREFIX
177
+
178
# Print the reads on STDIN whose sequence (second tab-separated field) ends
# with +pattern+, or the other reads with --invert-match.
#
# Exits 0 when grep ran cleanly, whether or not any line matched, and with
# grep's own exit status when grep reported an error.
def match_3end(pattern)
  exit unless STDIN.wait
  # PCRE was faster than BRE and ERE in GNU grep 2.25
  argv = [grep_command]
  argv << '-v' if options.invert_match
  argv.push('-P', '-e', "^[^\\t]+\\t[^\\t]*#{pattern}\\t")
  # Argument-vector form: the pattern is never parsed by a shell.
  system(*argv)
  # Process::Status#to_i is the raw wait status (exit code shifted left by
  # 8 bits), so the exit code must be read through #exitstatus.
  code = $?.exitstatus
  code = 128 + $?.termsig.to_i if code.nil? # grep was killed by a signal
  exit(code <= 1 ? 0 : code)
end
184
+
185
+ # fq1l:match_5end
186
+
187
+ desc 'match_5end PATTERN', 'Select sequences that match the 5\'-end with a given PATTERN'
188
+
189
+ method_option *OPT_INVERT_MATCH
190
+ method_option *OPT_GREP_PREFIX
191
+
192
# Print the reads on STDIN whose sequence (second tab-separated field)
# starts with +pattern+, or the other reads with --invert-match.
#
# Exits 0 when grep ran cleanly, whether or not any line matched, and with
# grep's own exit status when grep reported an error.
def match_5end(pattern)
  exit unless STDIN.wait
  # PCRE was faster than BRE and ERE in GNU grep 2.25
  argv = [grep_command]
  argv << '-v' if options.invert_match
  argv.push('-P', '-e', "^[^\\t]+\\t#{pattern}")
  # Argument-vector form: the pattern is never parsed by a shell.
  system(*argv)
  # Process::Status#to_i is the raw wait status (exit code shifted left by
  # 8 bits), so the exit code must be read through #exitstatus.
  code = $?.exitstatus
  code = 128 + $?.termsig.to_i if code.nil? # grep was killed by a signal
  exit(code <= 1 ? 0 : code)
end
198
+
199
+ # fq1l:restore
200
+
201
+ desc 'restore', 'Convert fastq from 1 line/read to 4 lines/read'
202
+
203
+ method_option *OPT_COREUTILS_PREFIX
204
+
205
# Exit when STDIN.wait reports nothing to read; otherwise replace this
# process with `tr "\t" "\n"`, turning each tab into a newline.
def restore
  STDIN.wait || exit

  tr = "#{options.coreutils_prefix}tr"
  exec %(#{tr} "\\t" "\\n")
end
209
+
210
+ # fq1l:slice
211
+
212
+ desc 'slice Nth SLICE', 'Slice the sequences'
213
+
214
# Exit when STDIN.wait reports nothing to read; otherwise pass both
# arguments, converted with to_i, to the native BioGadget.slice.
def slice(nth, slice)
  STDIN.wait || exit

  BioGadget.slice(*[nth, slice].map(&:to_i))
end
218
+
219
+ # fq1l:sort
220
+
221
+ desc 'sort [FQ1Ls]', 'Sort by sequence and the quality in descending order'
222
+
223
+ method_option *OPT_COREUTILS_PREFIX
224
+ method_option *OPT_BUFFER_SIZE
225
+ method_option *OPT_PARALLEL
226
+
227
# Sort reads by fields 2-4 (tab-separated) in descending order.
#
# With no arguments STDIN is sorted; exits quietly when STDIN.wait reports
# nothing to read. With file arguments the already-sorted +fq1ls+ are merged
# (sort -m). The process is replaced by sort in both cases.
def sort(*fq1ls)
  # Loaded here because this method is the only user; file names are escaped
  # so that spaces and shell metacharacters survive the shell command line.
  require 'shellwords'

  if fq1ls.size == 0
    exit unless STDIN.wait
    exec "#{sort_command} -t '\t' -r -k2,4"
  else
    files = fq1ls.map { |fq1l| Shellwords.escape(fq1l) }.join(' ')
    exec "#{sort_command} -t '\t' -r -k2,4 -m #{files}"
  end
end
235
+
236
+ # fq1l:sort_index
237
+
238
+ desc 'sort_index', 'Sort by index'
239
+
240
+ method_option *OPT_COREUTILS_PREFIX
241
+ method_option *OPT_BUFFER_SIZE
242
+ method_option *OPT_PARALLEL
243
+
244
# Exit when STDIN.wait reports nothing to read; otherwise replace this
# process with sort keyed from field 2 (`-k2`).
def sort_index
  STDIN.wait || exit

  exec [sort_command, '-k2'].join(' ')
end
248
+
249
+ # fq1l:sum_counts
250
+
251
+ desc 'sum_counts CSV ...', 'Sum counts of sequences by the length'
252
+
253
# Sum per-length read counts over several "length,<count>" CSV files and
# print one "length,count" table sorted by length.
#
# Header lines (first field "length") and blank lines are skipped.
def sum_counts(*csvs)
  length2count = Hash.new(0)
  csvs.each do |csv|
    # File.foreach closes the file when done and, unlike Kernel#open, never
    # treats a leading "|" in the path as a command to run.
    File.foreach(csv) do |line|
      l, c = line.rstrip.split(/,/)
      next if l.nil? || l == 'length'
      length2count[l.to_i] += c.to_i
    end
  end
  puts "length,count"
  length2count.keys.sort.each do |length|
    puts "#{length},#{length2count[length]}"
  end
end
269
+
270
+ # fq1l:thin_out
271
+
272
# The usage string advertises `thin_out`, but the method below, and so the
# Thor command, is named `to`. Map the advertised name onto it so that both
# `fq1l thin_out DRAW SKIP` and `fq1l to DRAW SKIP` work.
map 'thin_out' => :to

desc 'thin_out DRAW SKIP', 'Thin out the sequences'

# Exit when STDIN.wait reports nothing to read; otherwise pass DRAW and SKIP,
# converted with to_i, to the native BioGadget.to.
def to(draw, skip)
  exit unless STDIN.wait
  BioGadget.to(draw.to_i, skip.to_i)
end
278
+
279
+ # fq1l:trim_3end
280
+
281
+ desc 'trim_3end SEQUENCE', 'Trim 3\'-end that match with a given SEQUENCE'
282
+
283
+ method_option *OPT_COREUTILS_PREFIX
284
+ method_option *OPT_GREP_PREFIX
285
+ method_option *OPT_MINIMUM_LENGTH
286
+
287
method_option :trimmed,
              banner: 'FILE',
              desc: 'FILE for trimmed reads; STDOUT if not specified',
              type: :string

# Trim a 3'-end that matches SEQUENCE.
#
# STDIN is duplicated with tee: one copy goes through a FIFO to a forked
# child that runs the native BioGadget.t3 on the reads matching +sequence+
# and writes to the trimmed-reads file; the other copy is filtered by
# `fq1l match_3end --invert-match` to STDOUT. Without --trimmed the
# trimmed-reads file is a temporary one that is printed and then removed.
#
# Raises RuntimeError when a pipeline stage or the forked trimmer fails.
def trim_3end(sequence)
  # exit unless STDIN.wait
  grep_opt = options.key?(:grep_prefix) ? " --grep-prefix=#{options.grep_prefix}" : ''
  fifo = get_fifo('fq1l.trim_3end', 'fq1l', false)
  tmpfile = nil
  begin
    tmpfile = options.key?(:trimmed) ? File.expand_path(options.trimmed) : get_temporary_path('fq1l.trim_3end', 'fq1l', false)
    begin
      pid = Process.fork do
        BioGadget.t3("fq1l match_3end#{grep_opt} #{sequence} < #{fifo}", sequence.length, options.minimum_length, tmpfile)
      end
      pipeline("#{tee_command(options)} #{fifo}",
               "fq1l match_3end#{grep_opt} #{sequence} --invert-match")
      # The trimmer may still be writing when the tee pipeline returns; wait
      # for it so the trimmed-reads file is complete before it is printed.
      _, status = Process.wait2(pid)
      raise "Fail at process #{status.pid}; #{status}; trimming #{sequence}" unless status.success?
    ensure
      system "#{cat_command} #{tmpfile}" unless options.key?(:trimmed)
    end
  ensure
    File.unlink(fifo) if File.exist?(fifo)
    # tmpfile is still nil when reserving the temporary path itself failed
    File.unlink(tmpfile) if tmpfile && File.exist?(tmpfile) && !options.key?(:trimmed)
  end
end
312
+
313
+ # fq1l:trim_3end_length
314
+
315
+ desc 'trim_3end_length', 'Trim 3\'-end by a specific length'
316
+
317
+ method_option *OPT_MINIMUM_LENGTH
318
+
319
+ method_option :trimming_length,
320
+ default: 1,
321
+ desc: 'Length of the trimming',
322
+ type: :numeric
323
+
324
# Exit when STDIN.wait reports nothing to read; otherwise call the native
# BioGadget.t3 with the trimming length and minimum length. The first and
# last arguments are nil here.
def trim_3end_length
  STDIN.wait || exit

  trimming = options.trimming_length
  minimum = options.minimum_length
  BioGadget.t3(nil, trimming, minimum, nil)
end
328
+
329
+ # fq1l:trim_3end_primer
330
+
331
+ desc 'trim_3end_primer', 'Trim 3\'-end that match with a given primer'
332
+
333
+ method_option *OPT_COREUTILS_PREFIX
334
+ method_option *OPT_GREP_PREFIX
335
+ method_option *OPT_MINIMUM_LENGTH
336
+ method_option *OPT_PARALLEL
337
+
338
+ method_option :primers,
339
+ default: 'AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG,CTCGTATGCCGTCTTCTGCTTG',
340
+ desc: 'Comma-separated primer sequences that be used for trimming',
341
+ type: :string
342
+
343
# Trim 3'-ends that match any leading fragment of the given primers.
#
# Every distinct prefix of every primer is collected. From the longest
# length down, one `fq1l trim_3end` stage is queued per fragment, each
# writing its trimmed reads to its own temporary file. As soon as a length
# is reached at which all 4**length possible fragments occur, any read end
# would match, so a single `fq1l trim_3end_length` stage is queued instead
# and shorter lengths are dropped. The stages run as one pipeline; the
# per-fragment files are then printed and removed.
#
# Raises RuntimeError when a pipeline stage fails.
def trim_3end_primer

  opt_minimum_length = "--minimum-length=#{options.minimum_length}"

  # fragment length => distinct primer prefixes of that length
  fragments = options.primers.split(',')
                     .flat_map { |primer| (1..primer.length).map { |len| primer[0, len] } }
                     .uniq
                     .group_by(&:length)

  exit unless STDIN.wait

  prefix_opts = ''
  prefix_opts += " --coreutils-prefix=#{options.coreutils_prefix}" if options.key?(:coreutils_prefix)
  prefix_opts += " --grep-prefix=#{options.grep_prefix}" if options.key?(:grep_prefix)

  tmpfiles = []
  commands = []

  fragments.keys.sort.reverse_each do |length|
    if 4**length == fragments[length].size
      commands << "fq1l trim_3end_length --trimming-length=#{length} #{opt_minimum_length}"
      break
    end
    fragments[length].sort.reverse_each do |fragment|
      tmpfiles << tmpfile = get_temporary_path("fq1l.trim_3end_primer.#{fragment}", 'fq1l', false)
      commands << "fq1l trim_3end#{prefix_opts} #{opt_minimum_length} --trimmed=#{tmpfile} #{fragment}"
    end
  end

  begin
    stats = Open3.pipeline(*commands)
    stats.each_with_index do |stat, i|
      raise "Fail at process #{i}; #{stat}; #{commands[i]}" unless stat.success?
    end
    # With no per-fragment files a bare `cat` would read STDIN instead.
    system "#{cat_command} #{tmpfiles.join(' ')}" unless tmpfiles.empty?
  ensure
    # Runs on every exit path so temporary files never outlive a failure.
    unlink_files(tmpfiles)
  end

end
390
+
391
+ # fq1l:trim_3end_quality
392
+
393
+ desc 'trim_3end_quality', 'Trim 3\'-end from a low quality base'
394
+
395
+ method_option *OPT_MINIMUM_LENGTH
396
+
397
+ method_option :low_qualities,
398
+ banner: 'CHARACTERS',
399
+ default: '!"#',
400
+ desc: 'Low quality characters',
401
+ type: :string
402
+
403
# Hand the low-quality characters and the minimum length to the native
# BioGadget.t3q. Unlike its sibling commands this one has no STDIN.wait guard.
def trim_3end_quality
  low = options.low_qualities
  minimum = options.minimum_length
  BioGadget.t3q(low, minimum)
end
406
+
407
+ # fq1l:trim_5end
408
+
409
+ desc 'trim_5end PATTERN', 'Trim 5\'-end that match with a given PATTERN'
410
+
411
+ method_option :minimum_length,
412
+ banner: 'NT',
413
+ default: 24,
414
+ desc: 'Minimum length after trimming',
415
+ type: :numeric
416
+
417
+
418
# Exit when STDIN.wait reports nothing to read; otherwise hand +pattern+ and
# the minimum length to the native BioGadget.t5.
def trim_5end(pattern)
  STDIN.wait || exit

  minimum = options.minimum_length
  BioGadget.t5(pattern, minimum)
end
422
+
423
+ # #
424
+
425
+ # no_commands do
426
+
427
+ # def pipeline(parallel, *commands)
428
+ # stats = Array.new
429
+ # tmpin = nil
430
+ # tmpout = nil
431
+ # begin
432
+ # while commands.size > 0
433
+ # cmds = commands.shift(parallel)
434
+ # cmds[0] = cmds[0] + " < #{tmpin}" unless tmpin.nil?
435
+ # if commands.size > 0
436
+ # tmpout = get_temporary_path('pipeline', 'tmp', false)
437
+ # cmds[-1] = cmds[-1] + " > #{tmpout}"
438
+ # end
439
+ # tmpstats = Open3.pipeline(*cmds)
440
+ # stats.concat(tmpstats)
441
+ # tmpstats.each {|tmpstat| commands = nil unless tmpstat.success? }
442
+ # break if commands.nil?
443
+ # File.unlink(tmpin) unless tmpin.nil?
444
+ # tmpin = tmpout
445
+ # end
446
+ # ensure
447
+ # File.unlink(tmpin) if !tmpin.nil? && File.exist?(tmpin)
448
+ # File.unlink(tmpout) if !tmpout.nil? && File.exist?(tmpout)
449
+ # end
450
+ # stats
451
+ # end
452
+
453
+ # end
454
+
455
+ end
456
+ end
457
+ end