bio-gadget 0.4.8 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,8 @@
1
+ #ifndef BIO_GADGET_H
2
+ #define BIO_GADGET_H 1
3
+
4
+ #include "ruby.h"
5
+
6
+ #define BUFSIZE 65536
7
+
8
+ #endif /* BIO_GADGET_H */
@@ -0,0 +1,3 @@
1
+ require "mkmf"
2
+
3
+ create_makefile("bio/gadget/bio_gadget")
@@ -0,0 +1,171 @@
1
+ require 'mkfifo'
2
+ require 'open3'
3
+ require 'tempfile'
4
+ require 'thor'
5
+
6
+ module Bio
7
+ class Gadget < Thor
8
+
9
# Shared Thor option specifications. Each constant is a [name, settings]
# pair intended to be splatted into `method_option`.

# --buffer-size / -S SIZE
OPT_BUFFER_SIZE = [
  :buffer_size,
  {
    aliases: '-S',
    banner: 'SIZE',
    desc: 'Use SIZE for main memory buffer',
    type: :string
  }
]

# --download BEHAVIOR (one of yes / no / only; defaults to yes)
OPT_DOWNLOAD = [
  :download,
  {
    banner: 'BEHAVIOR',
    default: 'yes',
    desc: 'Download and process, no download or only',
    enum: ['yes', 'no', 'only']
  }
]

# --parallel N. The default is probed once, when this file is loaded:
# `gnproc` if it is on the PATH, otherwise `nproc`, otherwise 2.
OPT_PARALLEL = [
  :parallel,
  {
    banner: 'N',
    default: (
      if system('which gnproc >/dev/null 2>&1')
        `gnproc`.to_i
      elsif system('which nproc >/dev/null 2>&1')
        `nproc`.to_i
      else
        2
      end
    ),
    desc: 'Change the number of sorts run concurrently',
    type: :numeric
  }
]

# --coreutils-prefix PREFIX. Defaults to 'g' when `gnproc` is on the PATH,
# otherwise to the empty string.
OPT_COREUTILS_PREFIX = [
  :coreutils_prefix,
  {
    banner: 'PREFIX',
    default: system('which gnproc >/dev/null 2>&1') ? 'g' : '',
    desc: 'A prefix character for GNU coreutils',
    type: :string
  }
]

# --grep-prefix PREFIX. Defaults to 'g' when `ggrep` is on the PATH,
# otherwise to the empty string.
OPT_GREP_PREFIX = [
  :grep_prefix,
  {
    banner: 'PREFIX',
    default: system('which ggrep >/dev/null 2>&1') ? 'g' : '',
    desc: 'A prefix character for GNU grep',
    type: :string
  }
]
54
+
55
+ #
56
+
57
+ no_commands do
58
+
59
# Builds the usage banner for a command: the program basename, the package
# name followed by a space (when @package_name is set), then the command's
# usage string. `namespace` and `subcommand` are accepted but not used.
def self.banner(command, namespace = true, subcommand = false)
  package = @package_name.nil? ? '' : "#{@package_name} "
  "#{basename} #{package}#{command.usage}"
end
62
+
63
# Returns ' --buffer-size=SIZE' when the :buffer_size option was given,
# otherwise an empty string.
def buffer_size_option
  return '' unless options.key?(:buffer_size)

  ' --buffer-size=' + options.buffer_size
end
66
+
67
# Name of the `cat` executable, with the configured coreutils prefix
# prepended.
def cat_command
  prefix = options.coreutils_prefix
  "#{prefix}cat"
end
70
+
71
# Returns ' --coreutils-prefix=PREFIX' when the :coreutils_prefix option is
# present, otherwise an empty string.
def coreutils_prefix_option
  return '' unless options.key?(:coreutils_prefix)

  " --coreutils-prefix=#{options.coreutils_prefix}"
end
74
+
75
# Name of the `cut` executable, with the configured coreutils prefix
# prepended.
def cut_command
  [options.coreutils_prefix, 'cut'].join
end
78
+
79
# Downloads +url+ to +path+ with curl (-R keep remote time, -f fail on HTTP
# errors, -s -S silent but show errors).
#
# curl is invoked with an argument vector, not a shell command line, so
# spaces, quotes and shell metacharacters in the URL or path are passed
# through literally and cannot break or inject into the command. The path
# is expanded first so that a leading "~", which the shell used to expand,
# keeps working.
#
# Returns true on success. On failure the whole process exits with curl's
# exit status (1 if curl was terminated by a signal and has none).
def download_file(url, path)
  return true if system('curl', '-R', '-f', '-s', '-S', '-o', File.expand_path(path.to_s), url.to_s)

  exit($?.exitstatus || 1)
end
82
+
83
# Name of the `fold` executable, prefixed with the coreutils prefix taken
# from the given options object.
def fold_command(options)
  prefix = options.coreutils_prefix
  "#{prefix}fold"
end
86
+
87
# Command line for `fq1l convert`, including the coreutils-prefix option
# when one is set. The +options+ argument is accepted but not read here;
# the prefix comes from coreutils_prefix_option.
def fq1l_convert_command(options)
  ['fq1l convert', coreutils_prefix_option].join
end
90
+
91
# Command line for `fq1l count`, followed by the coreutils-prefix and
# parallel option strings (each may be empty).
def fq1l_count_command(options)
  ['fq1l count', coreutils_prefix_option, parallel_option(options)].join
end
94
+
95
# Command line for `fq1l sort`, followed by the coreutils-prefix and
# parallel option strings (each may be empty).
def fq1l_sort_command(options)
  ['fq1l sort', coreutils_prefix_option, parallel_option(options)].join
end
98
+
99
# Reserves a unique temporary path named "rbg.<prefix>.<unique>.<suffix>"
# without creating the file itself. Unless +cleanup+ is false, an at_exit
# hook is registered that removes the path if it exists when the process
# ends. Returns the path.
def get_temporary_path(prefix, suffix, cleanup=true)
  path = Dir::Tmpname.create(["rbg.#{prefix}.", ".#{suffix}"]) { }
  at_exit { File.unlink(path) if FileTest.exist?(path) } if cleanup
  path
end
106
+
107
# Creates a named pipe at a fresh temporary path ("<prefix>.fifo" is used as
# the name prefix) and returns that path. +cleanup+ is forwarded to
# get_temporary_path.
def get_fifo(prefix, suffix, cleanup=true)
  get_temporary_path("#{prefix}.fifo", suffix, cleanup).tap do |path|
    File.mkfifo(path)
  end
end
112
+
113
# Name of the `grep` executable, with the configured grep prefix prepended.
def grep_command
  prefix = options.grep_prefix
  "#{prefix}grep"
end
116
+
117
# Returns ' --grep-prefix=PREFIX' when the given options contain
# :grep_prefix, otherwise an empty string.
def grep_prefix_option(options)
  return '' unless options.key?(:grep_prefix)

  " --grep-prefix=#{options.grep_prefix}"
end
120
+
121
# Name of the `head` executable, prefixed with the coreutils prefix taken
# from the given options object.
def head_command(options)
  [options.coreutils_prefix, 'head'].join
end
124
+
125
# Returns ' --parallel=N' when the given options contain :parallel,
# otherwise an empty string.
def parallel_option(options)
  return '' unless options.key?(:parallel)

  " --parallel=#{options.parallel}"
end
128
+
129
# Runs the given commands as a single pipeline via Open3.pipeline and raises
# a RuntimeError naming the first stage whose exit status is not
# successful. Returns the array of Process::Status objects when every stage
# succeeded.
def pipeline(*cmds)
  statuses = Open3.pipeline(*cmds)
  statuses.each_with_index do |status, index|
    next if status.success?

    raise "Fail at process #{index}; #{status}; #{cmds[index]}"
  end
end
135
+
136
# Command line for `sort`: the prefixed executable name, the optional
# --buffer-size and --parallel settings, and pigz as the compression
# program for temporary files.
def sort_command
  parallel = options.key?(:parallel) ? " --parallel=#{options.parallel}" : ''
  "#{options.coreutils_prefix}sort#{buffer_size_option}#{parallel} --compress-program=pigz"
end
139
+
140
# Runs +cmd+ with Kernel#system and raises a RuntimeError describing the
# process status when it did not succeed. Returns nil on success.
def sh(cmd)
  system(cmd)
  status = $?
  return if status.success?

  raise "Fail at process #{status.pid}; #{status}; #{cmd}"
end
144
+
145
# Name of the `tee` executable, prefixed with the coreutils prefix taken
# from the given options object.
def tee_command(options)
  prefix = options.coreutils_prefix
  "#{prefix}tee"
end
148
+
149
# Name of the `uniq` executable, prefixed with the coreutils prefix taken
# from the given options object.
def uniq_command(options)
  [options.coreutils_prefix, 'uniq'].join
end
152
+
153
# Deletes every path in +files+ that currently exists; paths that are
# already gone are skipped. Returns +files+.
def unlink_files(files)
  files.each { |path| File.unlink(path) if File.exist?(path) }
end
158
+
159
# Name of the `wc` executable, prefixed with the coreutils prefix taken
# from the given options object.
def wc_command(options)
  prefix = options.coreutils_prefix
  "#{prefix}wc"
end
162
+
163
+ end
164
+
165
+ end
166
+ end
167
+
168
+ require 'bio/gadgets'
169
+ require 'bio/gadget/fq1l'
170
+ require 'bio/gadget/strt'
171
+ require 'bio/gadget/bio_gadget'
@@ -0,0 +1,457 @@
1
+ require 'damerau-levenshtein'
2
+ require 'io/wait'
3
+ require 'open3'
4
+
5
+ module Bio
6
+ class Gadget
7
+ class Fq1l < Bio::Gadget
8
+
9
# Option specifications shared by several fq1l commands. Each constant is a
# [name, settings] pair that is splatted into `method_option`.

# --invert-match (boolean)
OPT_INVERT_MATCH = [
  :invert_match,
  { desc: 'The sense of matching', type: :boolean }
]

# --minimum-length NT (numeric, defaults to 40)
OPT_MINIMUM_LENGTH = [
  :minimum_length,
  {
    banner: 'NT',
    default: 40,
    desc: 'Minimum length after trimming',
    type: :numeric
  }
]
24
+
25
+ # fq1l:annotate_index
26
+
27
+ desc 'annotate_index', 'Annotate sequence identifier by index sequence at the specified region'
28
+
29
+ method_option :first_cycle,
30
+ default: 7,
31
+ desc: 'The first cycle of index',
32
+ type: :numeric
33
+
34
+ method_option :last_cycle,
35
+ default: 12,
36
+ desc: 'The last cycle of index',
37
+ type: :numeric
38
+
39
# Command body for annotate_index: exits right away when STDIN.wait returns
# a falsy value, otherwise hands the configured first and last index cycles
# to BioGadget.i2i (presumably provided by the gem's native extension).
def annotate_index
  exit unless STDIN.wait
  first_cycle = options.first_cycle
  last_cycle = options.last_cycle
  BioGadget.i2i(first_cycle, last_cycle)
end
43
+
44
+ # fq1l:annotate_umi
45
+
46
+ desc 'annotate_umi', 'Annotate sequence identifier by UMI sequence at the specified region'
47
+
48
+ method_option :first_cycle,
49
+ default: 1,
50
+ desc: 'The first cycle of UMI',
51
+ type: :numeric
52
+
53
+ method_option :last_cycle,
54
+ default: 6,
55
+ desc: 'The last cycle of UMI',
56
+ type: :numeric
57
+
58
# Command body for annotate_umi: exits right away when STDIN.wait returns a
# falsy value, otherwise hands the configured first and last UMI cycles to
# BioGadget.u2i (presumably provided by the gem's native extension).
def annotate_umi
  exit unless STDIN.wait
  first_cycle = options.first_cycle
  last_cycle = options.last_cycle
  BioGadget.u2i(first_cycle, last_cycle)
end
62
+
63
+ # fq1l:convert
64
+
65
+ desc 'convert', 'Convert fastq from 4 lines/read to 1 line/read for this utility'
66
+
67
+ method_option *OPT_COREUTILS_PREFIX
68
+
69
# Command body for convert: replaces this process with `paste - - - -`
# (using the configured coreutils prefix), which joins every four input
# lines into one tab-separated line. Exits first when STDIN.wait returns a
# falsy value.
def convert
  exit unless STDIN.wait
  paste = "#{options.coreutils_prefix}paste"
  exec "#{paste} - - - -"
end
73
+
74
+ # fq1l:count
75
+
76
+ desc 'count [CSV]', 'Count sequences by the length'
77
+
78
+ method_option *OPT_COREUTILS_PREFIX
79
+ method_option *OPT_PARALLEL
80
+
81
# Command body for count.
#
# Without +csv+: prints a "length,reads" header and then runs a pipeline
# that takes the second tab-separated field of every input line, measures
# its length, and prints one "length,count" row per distinct length.
#
# With +csv+: spawns a second `fq1l count` that reads from a fifo and writes
# its table to +csv+, while `tee` copies standard input both to that fifo
# and to standard output; then waits for the spawned counter to finish.
#
# Exits at once when STDIN.wait returns a falsy value.
def count(csv = nil)
  exit unless STDIN.wait
  if csv.nil?
    puts "length,reads"
    stages = [
      "#{cut_command} -f 2",
      "ruby -nle 'puts $_.length'",
      "#{sort_command} -n",
      "#{uniq_command(options)} -c",
      "ruby -anle 'puts $F.reverse.join(\",\")'"
    ]
    pipeline(*stages)
  else
    fifo = get_fifo('fq1l.count', 'fq1l')
    counter = Kernel.spawn("fq1l count#{coreutils_prefix_option} < #{fifo} > #{csv}")
    system "#{tee_command(options)} #{fifo}"
    Process.waitpid(counter)
  end
end
97
+
98
+ # fq1l:demultiplex
99
+
100
+ desc 'demultiplex BASE MAP', 'Demultiplex based on a barcode MAP, and restore sequence files with BASE names'
101
+
102
+ method_option :maximum_distance,
103
+ default: 1,
104
+ desc: 'Maximum distance between barcode and sequence',
105
+ type: :numeric
106
+
107
# Command body for demultiplex.
#
# +map+ is a CSV file of "barcode,well" lines. Each input line on STDIN is a
# tab-separated record (identifier, sequence, separator, quality). The
# barcode of a record is the tail of its identifier, as long as the shortest
# barcode in the map. The record is appended, restored to four lines, to
# "<base>.<well>.fq" for the closest barcode within --maximum-distance, or
# to "<base>.NA.fq" when no barcode is close enough. Existing output files
# are removed up front, and every output file is compressed with pigz at the
# end.
#
# The map is read with File.foreach, which closes the file and never treats
# a leading "|" in the name as a command. The current output handle is
# closed even if an exception interrupts the loop, and pigz is invoked with
# an argument vector so file names need no shell quoting.
def demultiplex(base, map)
  dl = DamerauLevenshtein

  exit unless STDIN.wait

  bc2fq = {}
  File.foreach(map) do |line|
    bc, well = line.rstrip.split(',')
    bc2fq[bc] = fq = "#{base}.#{well}.fq"
    File.unlink(fq) if File.exist?(fq)
  end
  na = "#{base}.NA.fq"
  File.unlink(na) if File.exist?(na)

  # Length of the shortest barcode; nil when the map is empty.
  bcl = bc2fq.keys.map(&:length).min

  fp = nil
  pbc = nil
  begin
    STDIN.set_encoding('BINARY').each do |line|
      acc, seq, sep, qual = line.rstrip.split(/\t/)
      bc = acc[-bcl, bcl]
      # Only look up the target file again when the barcode changes, so runs
      # of reads with the same barcode reuse the open handle.
      if bc != pbc
        mindist = options.maximum_distance + 1
        minbc = nil
        bc2fq.each_key do |key|
          dist = dl.distance(key, bc, 0, options.maximum_distance)
          if dist < mindist
            mindist = dist
            minbc = key
          end
          break if dist == 0
        end
        unless fp.nil?
          fp.close
          fp = nil
        end
        target = mindist <= options.maximum_distance ? bc2fq[minbc] : na
        fp = File.open(target, 'a')
        pbc = bc
      end
      fp.puts "#{acc}\n#{seq}\n#{sep}\n#{qual}"
    end
  ensure
    fp.close unless fp.nil?
  end

  bc2fq.each_value { |fq| system('pigz', fq) if File.exist?(fq) }
  system('pigz', na) if File.exist?(na)
end
152
+
153
+ # fq1l:exclude_degenerate
154
+
155
+ desc 'exclude_degenerate', 'Exclude degenerated reads in the order'
156
+
157
# Command body for exclude_degenerate: delegates to BioGadget.nr_deg
# (presumably provided by the gem's native extension), unless STDIN.wait
# returns a falsy value, in which case the process exits.
def exclude_degenerate
  ready = STDIN.wait
  exit unless ready
  BioGadget.nr_deg
end
161
+
162
+ # fq1l:exclude_duplicate
163
+
164
+ desc 'exclude_duplicate', 'Exclude duplicated reads in the order'
165
+
166
# Command body for exclude_duplicate: delegates to BioGadget.nr_std
# (presumably provided by the gem's native extension), unless STDIN.wait
# returns a falsy value, in which case the process exits.
def exclude_duplicate
  ready = STDIN.wait
  exit unless ready
  BioGadget.nr_std
end
170
+
171
+ # fq1l:match_3end
172
+
173
+ desc 'match_3end PATTERN', 'Select sequences that match the 3\'-end with a given PATTERN'
174
+
175
+ method_option *OPT_INVERT_MATCH
176
+ method_option *OPT_GREP_PREFIX
177
+
178
# Command body for match_3end: filters standard input with grep, keeping
# lines whose second tab-separated field ends with +pattern+ (or, with
# --invert-match, the lines that do not).
#
# grep is run with an argument vector, so the pattern is passed as-is and
# cannot break out of shell quoting.
#
# Always ends the process: exit status 0 when grep exited with 0 (lines
# selected) or 1 (nothing selected), otherwise grep's own exit status. The
# status is read with Process::Status#exitstatus; the raw #to_i value holds
# the exit code in its high byte, so comparing it with 1 never matched and
# grep's error status 2 was passed to exit as 512, which the operating
# system reports as 0.
def match_3end(pattern)
  exit unless STDIN.wait
  # PCRE was faster than BRE and ERE in GNU grep 2.25
  argv = [grep_command]
  argv << '-v' if options.invert_match
  argv.push('-P', '-e', "^[^\\t]+\\t[^\\t]*#{pattern}\\t")
  system(*argv)
  status = $?
  # A signal-terminated grep has no exit status; report it shell-style.
  code = status.exitstatus || (128 + status.termsig.to_i)
  exit(code > 1 ? code : 0)
end
184
+
185
+ # fq1l:match_5end
186
+
187
+ desc 'match_5end PATTERN', 'Select sequences that match the 5\'-end with a given PATTERN'
188
+
189
+ method_option *OPT_INVERT_MATCH
190
+ method_option *OPT_GREP_PREFIX
191
+
192
# Command body for match_5end: filters standard input with grep, keeping
# lines whose second tab-separated field starts with +pattern+ (or, with
# --invert-match, the lines that do not).
#
# grep is run with an argument vector, so the pattern is passed as-is and
# cannot break out of shell quoting.
#
# Always ends the process: exit status 0 when grep exited with 0 (lines
# selected) or 1 (nothing selected), otherwise grep's own exit status. The
# status is read with Process::Status#exitstatus; the raw #to_i value holds
# the exit code in its high byte, so comparing it with 1 never matched and
# grep's error status 2 was passed to exit as 512, which the operating
# system reports as 0.
def match_5end(pattern)
  exit unless STDIN.wait
  # PCRE was faster than BRE and ERE in GNU grep 2.25
  argv = [grep_command]
  argv << '-v' if options.invert_match
  argv.push('-P', '-e', "^[^\\t]+\\t#{pattern}")
  system(*argv)
  status = $?
  # A signal-terminated grep has no exit status; report it shell-style.
  code = status.exitstatus || (128 + status.termsig.to_i)
  exit(code > 1 ? code : 0)
end
198
+
199
+ # fq1l:restore
200
+
201
+ desc 'restore', 'Convert fastq from 1 line/read to 4 lines/read'
202
+
203
+ method_option *OPT_COREUTILS_PREFIX
204
+
205
# Command body for restore: replaces this process with `tr "\t" "\n"`
# (using the configured coreutils prefix), turning each tab-separated
# one-line record back into separate lines. Exits first when STDIN.wait
# returns a falsy value.
def restore
  exit unless STDIN.wait
  tr = "#{options.coreutils_prefix}tr"
  exec "#{tr} \"\\t\" \"\\n\""
end
209
+
210
+ # fq1l:slice
211
+
212
+ desc 'slice Nth SLICE', 'Slice the sequences'
213
+
214
# Command body for slice: converts both arguments to integers and passes
# them to BioGadget.slice (presumably provided by the gem's native
# extension). Exits first when STDIN.wait returns a falsy value.
def slice(nth, slice)
  exit unless STDIN.wait
  index = nth.to_i
  total = slice.to_i
  BioGadget.slice(index, total)
end
218
+
219
+ # fq1l:sort
220
+
221
+ desc 'sort [FQ1Ls]', 'Sort by sequence and the quality in descending order'
222
+
223
+ method_option *OPT_COREUTILS_PREFIX
224
+ method_option *OPT_BUFFER_SIZE
225
+ method_option *OPT_PARALLEL
226
+
227
# Command body for sort: replaces this process with a reverse sort on
# tab-separated fields 2 to 4. With no file arguments it sorts standard
# input (exiting first when STDIN.wait returns a falsy value); with file
# arguments it merges those files (-m), which assumes they are already
# sorted.
def sort(*fq1ls)
  exit if fq1ls.empty? && !STDIN.wait
  merge_args = fq1ls.empty? ? '' : " -m #{fq1ls.join(' ')}"
  exec "#{sort_command} -t '\t' -r -k2,4#{merge_args}"
end
235
+
236
+ # fq1l:sort_index
237
+
238
+ desc 'sort_index', 'Sort by index'
239
+
240
+ method_option *OPT_COREUTILS_PREFIX
241
+ method_option *OPT_BUFFER_SIZE
242
+ method_option *OPT_PARALLEL
243
+
244
# Command body for sort_index: replaces this process with the configured
# sort command keyed from field 2 onwards (-k2). Exits first when
# STDIN.wait returns a falsy value.
def sort_index
  exit unless STDIN.wait
  command = "#{sort_command} -k2"
  exec command
end
248
+
249
+ # fq1l:sum_counts
250
+
251
+ desc 'sum_counts CSV ...', 'Sum counts of sequences by the length'
252
+
253
# Command body for sum_counts: adds up the per-length counts from any number
# of "length,count" CSV files and prints the combined table, sorted by
# length, under a "length,count" header.
#
# Header rows (first field "length") are skipped. Blank lines are skipped
# too, so they no longer add a spurious "0,0" row. Each file is read with
# File.foreach, which closes it afterwards and never treats a leading "|"
# in the name as a command.
def sum_counts(*csvs)
  length2count = Hash.new(0)
  csvs.each do |csv|
    File.foreach(csv) do |line|
      l, c = line.rstrip.split(',')
      next if l.nil? || l == 'length'

      length2count[l.to_i] += c.to_i
    end
  end
  puts "length,count"
  length2count.keys.sort.each do |length|
    puts "#{length},#{length2count[length]}"
  end
end
269
+
270
+ # fq1l:thin_out
271
+
272
+ desc 'thin_out DRAW SKIP', 'Thin out the sequences'
273
+
274
# Command body for thinning out reads: converts both arguments to integers
# and passes them to BioGadget.to (presumably provided by the gem's native
# extension). Exits first when STDIN.wait returns a falsy value.
# NOTE(review): the accompanying `desc` advertises this command as
# "thin_out" while the method is named `to` — confirm which name the CLI is
# meant to expose.
def to(draw, skip)
  exit unless STDIN.wait
  draw_count = draw.to_i
  skip_count = skip.to_i
  BioGadget.to(draw_count, skip_count)
end
278
+
279
+ # fq1l:trim_3end
280
+
281
+ desc 'trim_3end SEQUENCE', 'Trim 3\'-end that match with a given SEQUENCE'
282
+
283
+ method_option *OPT_COREUTILS_PREFIX
284
+ method_option *OPT_GREP_PREFIX
285
+ method_option *OPT_MINIMUM_LENGTH
286
+
287
# --trimmed FILE: where trim_3end writes the trimmed reads.
# (Help text typo "speficied" corrected.)
method_option :trimmed,
              banner: 'FILE',
              desc: 'FILE for trimmed reads; STDOUT if not specified',
              type: :string
291
+
292
# Command body for trim_3end.
#
# Standard input is duplicated with `tee`: one copy goes through a fifo to a
# forked child that runs BioGadget.t3 on the reads matching +sequence+ at
# the 3'-end (presumably trimming them and writing the result to the
# trimmed file — confirm against the native extension), while the other
# copy is filtered with `fq1l match_3end --invert-match`, so reads that do
# not match go straight to standard output. Unless --trimmed names a file,
# the trimmed reads are collected in a temporary file and appended to
# standard output afterwards.
#
# The forked child is waited for before the trimmed file is read or this
# command returns; previously nothing waited for it, so the file could be
# read while the child was still writing and the child was never reaped.
# When the pipeline fails, the child is terminated first so that waiting
# for it cannot block on a fifo that never gets a writer.
def trim_3end(sequence)
  # exit unless STDIN.wait
  grep_opt = options.key?(:grep_prefix) ? " --grep-prefix=#{options.grep_prefix}" : ''
  fifo = get_fifo('fq1l.trim_3end', 'fq1l', false)
  tmpfile = nil
  begin
    tmpfile = options.key?(:trimmed) ? File.expand_path(options.trimmed) : get_temporary_path('fq1l.trim_3end', 'fq1l', false)
    pid = nil
    begin
      pid = Process.fork do
        BioGadget.t3("fq1l match_3end#{grep_opt} #{sequence} < #{fifo}", sequence.length, options.minimum_length, tmpfile)
      end
      pipeline("#{tee_command(options)} #{fifo}",
               "fq1l match_3end#{grep_opt} #{sequence} --invert-match")
      # tee has closed the fifo by now, so the child sees end-of-input.
      Process.waitpid(pid)
      pid = nil
    ensure
      unless pid.nil?
        # Failure path: the child may still be blocked on the fifo.
        begin
          Process.kill('TERM', pid)
        rescue Errno::ESRCH
          # The child is already gone; nothing to terminate.
        end
        Process.waitpid(pid)
      end
      system "#{cat_command} #{tmpfile}" unless options.key?(:trimmed)
    end
  ensure
    File.unlink(fifo) if File.exist?(fifo)
    File.unlink(tmpfile) if !tmpfile.nil? && File.exist?(tmpfile) && !options.key?(:trimmed)
  end
end
312
+
313
+ # fq1l:trim_3end_length
314
+
315
+ desc 'trim_3end_length', 'Trim 3\'-end by a specific length'
316
+
317
+ method_option *OPT_MINIMUM_LENGTH
318
+
319
+ method_option :trimming_length,
320
+ default: 1,
321
+ desc: 'Length of the trimming',
322
+ type: :numeric
323
+
324
# Command body for trim_3end_length: calls BioGadget.t3 with no command and
# no output file, passing only the trimming length and the minimum length
# (BioGadget is presumably the gem's native extension). Exits first when
# STDIN.wait returns a falsy value.
def trim_3end_length
  exit unless STDIN.wait
  trimming_length = options.trimming_length
  minimum_length = options.minimum_length
  BioGadget.t3(nil, trimming_length, minimum_length, nil)
end
328
+
329
+ # fq1l:trim_3end_primer
330
+
331
+ desc 'trim_3end_primer', 'Trim 3\'-end that match with a given primer'
332
+
333
+ method_option *OPT_COREUTILS_PREFIX
334
+ method_option *OPT_GREP_PREFIX
335
+ method_option *OPT_MINIMUM_LENGTH
336
+ method_option *OPT_PARALLEL
337
+
338
# --primers: comma-separated primer sequences for trim_3end_primer.
# (Help text grammar corrected: "that be used" -> "to be used".)
method_option :primers,
              default: 'AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG,CTCGTATGCCGTCTTCTGCTTG',
              desc: 'Comma-separated primer sequences to be used for trimming',
              type: :string
342
+
343
# Command body for trim_3end_primer.
#
# Every leading fragment (prefix) of every primer is collected and grouped
# by length. Going from the longest length to the shortest, one
# `fq1l trim_3end` stage is added to a pipeline for each fragment, each
# writing its trimmed reads to its own temporary file. As soon as a length
# is reached for which all 4**length possible sequences are fragments, a
# single `fq1l trim_3end_length` stage replaces the remaining stages. After
# the pipeline has run, the temporary files are concatenated to standard
# output.
#
# The pipeline is run through the shared `pipeline` helper, which raises
# with the same message on a failed stage, and the option strings come from
# the shared option helpers. Temporary files are removed in an `ensure`, so
# they are cleaned up on any exception, not only on a failed stage.
def trim_3end_primer
  opt_minimum_length = "--minimum-length=#{options.minimum_length}"
  primers = options.primers.split(',')

  # Distinct primer prefixes, keyed by their length.
  fragments = primers
    .flat_map { |primer| (1..primer.length).map { |len| primer[0, len] } }
    .uniq
    .group_by(&:length)

  exit unless STDIN.wait

  tmpfiles = []
  commands = []

  fragments.keys.sort.reverse_each do |length|
    if 4**length == fragments[length].size
      # Every possible sequence of this length is a fragment, so trimming a
      # fixed length covers this and all shorter fragments.
      commands << "fq1l trim_3end_length --trimming-length=#{length} #{opt_minimum_length}"
      break
    end

    fragments[length].sort.reverse_each do |fragment|
      tmpfile = get_temporary_path("fq1l.trim_3end_primer.#{fragment}", 'fq1l', false)
      tmpfiles << tmpfile
      commands << "fq1l trim_3end#{coreutils_prefix_option}#{grep_prefix_option(options)} #{opt_minimum_length} --trimmed=#{tmpfile} #{fragment}"
    end
  end

  begin
    pipeline(*commands)
    system "#{cat_command} #{tmpfiles.join(' ')}"
  ensure
    unlink_files(tmpfiles)
  end
end
390
+
391
+ # fq1l:trim_3end_quality
392
+
393
+ desc 'trim_3end_quality', 'Trim 3\'-end from a low quality base'
394
+
395
+ method_option *OPT_MINIMUM_LENGTH
396
+
397
+ method_option :low_qualities,
398
+ banner: 'CHARACTERS',
399
+ default: '!"#',
400
+ desc: 'Low quality characters',
401
+ type: :string
402
+
403
# Command body for trim_3end_quality: passes the low-quality characters and
# the minimum length to BioGadget.t3q (presumably provided by the gem's
# native extension). Unlike most sibling commands it does not check
# STDIN.wait first.
def trim_3end_quality
  low_qualities = options.low_qualities
  minimum_length = options.minimum_length
  BioGadget.t3q(low_qualities, minimum_length)
end
406
+
407
+ # fq1l:trim_5end
408
+
409
+ desc 'trim_5end PATTERN', 'Trim 5\'-end that match with a given PATTERN'
410
+
411
+ method_option :minimum_length,
412
+ banner: 'NT',
413
+ default: 24,
414
+ desc: 'Minimum length after trimming',
415
+ type: :numeric
416
+
417
+
418
# Command body for trim_5end: passes +pattern+ and the minimum length to
# BioGadget.t5 (presumably provided by the gem's native extension). Exits
# first when STDIN.wait returns a falsy value.
def trim_5end(pattern)
  exit unless STDIN.wait
  minimum_length = options.minimum_length
  BioGadget.t5(pattern, minimum_length)
end
422
+
423
+ # #
424
+
425
+ # no_commands do
426
+
427
+ # def pipeline(parallel, *commands)
428
+ # stats = Array.new
429
+ # tmpin = nil
430
+ # tmpout = nil
431
+ # begin
432
+ # while commands.size > 0
433
+ # cmds = commands.shift(parallel)
434
+ # cmds[0] = cmds[0] + " < #{tmpin}" unless tmpin.nil?
435
+ # if commands.size > 0
436
+ # tmpout = get_temporary_path('pipeline', 'tmp', false)
437
+ # cmds[-1] = cmds[-1] + " > #{tmpout}"
438
+ # end
439
+ # tmpstats = Open3.pipeline(*cmds)
440
+ # stats.concat(tmpstats)
441
+ # tmpstats.each {|tmpstat| commands = nil unless tmpstat.success? }
442
+ # break if commands.nil?
443
+ # File.unlink(tmpin) unless tmpin.nil?
444
+ # tmpin = tmpout
445
+ # end
446
+ # ensure
447
+ # File.unlink(tmpin) if !tmpin.nil? && File.exist?(tmpin)
448
+ # File.unlink(tmpout) if !tmpout.nil? && File.exist?(tmpout)
449
+ # end
450
+ # stats
451
+ # end
452
+
453
+ # end
454
+
455
+ end
456
+ end
457
+ end