seqtrimnext 2.0.51 → 2.0.52
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +7 -0
- data/Manifest.txt +3 -3
- data/README.rdoc +18 -3
- data/Rakefile +2 -1
- data/bin/parse_params.rb +5 -1
- data/bin/seqtrimnext +53 -21
- data/lib/seqtrimnext/actions/{action_classify.rb → action_user_contaminant.rb} +2 -2
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +64 -20
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +375 -240
- data/lib/seqtrimnext/classes/extract_stats.rb +26 -23
- data/lib/seqtrimnext/classes/params.rb +109 -123
- data/lib/seqtrimnext/classes/plugin_manager.rb +2 -4
- data/lib/seqtrimnext/classes/seqtrim.rb +24 -29
- data/lib/seqtrimnext/classes/sequence.rb +2 -2
- data/lib/seqtrimnext/classes/sequence_group.rb +21 -1
- data/lib/seqtrimnext/classes/sequence_with_action.rb +25 -13
- data/lib/seqtrimnext/plugins/plugin.rb +42 -12
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +1 -8
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +0 -9
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +0 -12
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +5 -8
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +1 -10
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +1 -11
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +1 -7
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +1 -8
- data/lib/seqtrimnext/plugins/plugin_key.rb +1 -9
- data/lib/seqtrimnext/plugins/plugin_linker.rb +0 -9
- data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +6 -21
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +3 -13
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +126 -330
- data/lib/seqtrimnext/plugins/plugin_mids.rb +0 -11
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +1 -10
- data/lib/seqtrimnext/plugins/plugin_user_contaminants.rb +40 -32
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +0 -9
- data/lib/seqtrimnext/templates/amplicons.txt +1 -8
- data/lib/seqtrimnext/templates/genomics_454.txt +12 -8
- data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +19 -1
- data/lib/seqtrimnext/templates/genomics_short_reads.txt +26 -1
- data/lib/seqtrimnext/templates/genomics_short_reads_2.txt +24 -1
- data/lib/seqtrimnext/templates/only_quality.txt +24 -0
- data/lib/seqtrimnext/templates/sanger.txt +25 -0
- data/lib/seqtrimnext/templates/transcriptomics_454.txt +18 -1
- data/lib/seqtrimnext/templates/transcriptomics_plants.txt +22 -1
- data/lib/seqtrimnext/templates/transcriptomics_short_reads.txt +23 -1
- data/lib/seqtrimnext.rb +1 -1
- metadata +20 -7
- data/lib/seqtrimnext/plugins/plugin_adapters_old.rb +0 -165
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +0 -245
data/History.txt
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
=== 2.0.52 2012-06-26
|
2
|
+
|
3
|
+
* Added new plugin for user contaminants.
|
4
|
+
* Sequences contaminated with user contaminants are stored in separate files.
|
5
|
+
* Processing of both illumina fastq paired-end files in the same execution.
|
6
|
+
* Template reorganization.
|
7
|
+
|
1
8
|
=== 2.0.51 2012-06-20
|
2
9
|
|
3
10
|
Added cont_viruses database
|
data/Manifest.txt
CHANGED
@@ -24,7 +24,7 @@ History.txt
|
|
24
24
|
lib/seqtrimnext/actions/action_ab_adapter.rb
|
25
25
|
lib/seqtrimnext/actions/action_ab_far_adapter.rb
|
26
26
|
lib/seqtrimnext/actions/action_ab_left_adapter.rb
|
27
|
-
lib/seqtrimnext/actions/
|
27
|
+
lib/seqtrimnext/actions/action_user_contaminant.rb
|
28
28
|
lib/seqtrimnext/actions/action_empty_insert.rb
|
29
29
|
lib/seqtrimnext/actions/action_ignore_repeated.rb
|
30
30
|
lib/seqtrimnext/actions/action_indetermination.rb
|
@@ -75,7 +75,6 @@ lib/seqtrimnext/classes/sequence_with_action.rb
|
|
75
75
|
lib/seqtrimnext/plugins/plugin.rb
|
76
76
|
lib/seqtrimnext/plugins/plugin_ab_adapters.rb
|
77
77
|
lib/seqtrimnext/plugins/plugin_adapters.rb
|
78
|
-
lib/seqtrimnext/plugins/plugin_adapters_old.rb
|
79
78
|
lib/seqtrimnext/plugins/plugin_amplicons.rb
|
80
79
|
lib/seqtrimnext/plugins/plugin_contaminants.rb
|
81
80
|
lib/seqtrimnext/plugins/plugin_user_contaminants.rb
|
@@ -89,10 +88,11 @@ lib/seqtrimnext/plugins/plugin_low_complexity.rb
|
|
89
88
|
lib/seqtrimnext/plugins/plugin_low_high_size.rb
|
90
89
|
lib/seqtrimnext/plugins/plugin_low_quality.rb
|
91
90
|
lib/seqtrimnext/plugins/plugin_mids.rb
|
92
|
-
lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb
|
93
91
|
lib/seqtrimnext/plugins/plugin_short_insert.rb
|
94
92
|
lib/seqtrimnext/plugins/plugin_vectors.rb
|
95
93
|
lib/seqtrimnext/templates/amplicons.txt
|
94
|
+
lib/seqtrimnext/templates/sanger.txt
|
95
|
+
lib/seqtrimnext/templates/only_quality.txt
|
96
96
|
lib/seqtrimnext/templates/genomics_454.txt
|
97
97
|
lib/seqtrimnext/templates/genomics_454_with_paired.txt
|
98
98
|
lib/seqtrimnext/templates/genomics_short_reads.txt
|
data/README.rdoc
CHANGED
@@ -48,7 +48,7 @@ To install core databases (it should be done at installation time):
|
|
48
48
|
|
49
49
|
$> seqtrimnext -i core
|
50
50
|
|
51
|
-
Databases will be installed nearby SeqtrimNEXT by default, but you can override this location by setting the environment variable +
|
51
|
+
Databases will be installed nearby SeqtrimNEXT by default, but you can override this location by setting the environment variable +BLASTDB+. Eg.:
|
52
52
|
|
53
53
|
If you want your database installed at /var:
|
54
54
|
|
@@ -56,6 +56,10 @@ If you with your database installed at /var:
|
|
56
56
|
|
57
57
|
Be sure that this environment variable is always loaded before SeqtrimNEXT execution (Eg.: add it to /etc/profile.local).
|
58
58
|
|
59
|
+
There are additional databases. To list them:
|
60
|
+
|
61
|
+
$> seqtrimnext -i LIST
|
62
|
+
|
59
63
|
To perform an analysis using a predefined template with a FASTQ file format using 4 cpus:
|
60
64
|
|
61
65
|
$> seqtrimnext -t genomics_454.txt -Q input_file_in_FASTQ -w 4
|
@@ -64,6 +68,13 @@ To perform an analisys using a predefined template with a FASTQ file format:
|
|
64
68
|
|
65
69
|
$> seqtrimnext -t genomics_454.txt -f input_file_in_FASTA -q input_file_in_QUAL
|
66
70
|
|
71
|
+
To clean illumina fastq files, with paired-ends and qualities encoded in illumina 1.5 format, using 4 cpus and disabling verbose output:
|
72
|
+
|
73
|
+
$> seqtrimnext -t genomics_short_reads.txt -F illumina15 -Q p1.fastq,p2.fastq -w 4 -K
|
74
|
+
|
75
|
+
To clean illumina fastq files, with paired-ends and qualities encoded in standard phred format, using 4 cpus and disabling verbose output:
|
76
|
+
|
77
|
+
$> seqtrimnext -t genomics_short_reads.txt -Q p1.fastq,p2.fastq -w 4 -K
|
67
78
|
|
68
79
|
To get additional help and list available templates and databases:
|
69
80
|
|
@@ -186,13 +197,17 @@ SeqtrimNEXT needs some core databases to work. To install them:
|
|
186
197
|
|
187
198
|
seqtrimnext -i core
|
188
199
|
|
189
|
-
You can change default database location by setting the environment variable +
|
200
|
+
You can change default database location by setting the environment variable +BLASTDB+. Refer to SYNOPSIS for an example.
|
201
|
+
|
202
|
+
There are additional databases that can be listed with:
|
203
|
+
|
204
|
+
seqtrimnext -i LIST
|
190
205
|
|
191
206
|
=== Database modifications
|
192
207
|
|
193
208
|
Included databases will be useful for a lot of people, but if you prefer, you can modify them, or add more elements to be searched against your sequences.
|
194
209
|
|
195
|
-
You only need to drop new fasta files to each respective directory:
|
210
|
+
You only need to drop new fasta files to each respective directory, or even create new directories with new fasta files inside. Each directory with fasta files will be used as a database:
|
196
211
|
|
197
212
|
DB/vectors to add more vectors
|
198
213
|
DB/contaminants to add more contaminants
|
data/Rakefile
CHANGED
@@ -16,7 +16,7 @@ $hoe = Hoe.spec 'seqtrimnext' do
|
|
16
16
|
self.rubyforge_name = self.name # TODO this is default value
|
17
17
|
# self.extra_deps = ['narray','gnuplot','term-ansicolor','xml-simple','scbi_blast','scbi_drb','scbi_fasta','scbi_fastq','scbi_plot','scbi_math']
|
18
18
|
|
19
|
-
self.extra_deps
|
19
|
+
self.extra_deps = []
|
20
20
|
self.extra_deps << ['narray','>=0']
|
21
21
|
self.extra_deps << ['gnuplot','>=0']
|
22
22
|
self.extra_deps << ['term-ansicolor','>=1.0.5']
|
@@ -27,6 +27,7 @@ $hoe = Hoe.spec 'seqtrimnext' do
|
|
27
27
|
self.extra_deps << ['scbi_fastq','>=0.0.16']
|
28
28
|
self.extra_deps << ['scbi_plot','>=0.0.6']
|
29
29
|
self.extra_deps << ['scbi_math','>=0.0.1']
|
30
|
+
self.extra_deps << ['scbi_headers','>=0.0.2']
|
30
31
|
|
31
32
|
end
|
32
33
|
|
data/bin/parse_params.rb
CHANGED
@@ -26,6 +26,7 @@ params={}
|
|
26
26
|
params['vector_db_field']='vectors_db'
|
27
27
|
params['primers_db_field']='primers_db'
|
28
28
|
params['contaminants_db_field']='contaminants_db'
|
29
|
+
params['user_contaminants_db_field']='user_contaminants_db'
|
29
30
|
params['species_field']='genus'
|
30
31
|
params['min_insert_size_field']='min_insert_size_trimmed'
|
31
32
|
params['min_paired_insert_size_field']='min_insert_size_paired'
|
@@ -53,6 +54,7 @@ end
|
|
53
54
|
sq_params=File.open(params_file,'r')
|
54
55
|
|
55
56
|
data=get_json_data(input_file)
|
57
|
+
|
56
58
|
# puts data.keys
|
57
59
|
# puts data['vector_db_field']
|
58
60
|
|
@@ -69,10 +71,12 @@ data=get_json_data(input_file)
|
|
69
71
|
|
70
72
|
sq_params=File.open(params_file,'a+')
|
71
73
|
|
74
|
+
sq_params.puts ""
|
75
|
+
|
72
76
|
data.each do |k,v|
|
73
77
|
|
74
78
|
sq_name=params[k]
|
75
|
-
# puts k,sq_name
|
79
|
+
# puts k,sq_name
|
76
80
|
|
77
81
|
if sq_name && v && !v.empty?
|
78
82
|
sq_params.puts "#{sq_name}=#{v}"
|
data/bin/seqtrimnext
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
2
4
|
# SeqTrimNext: Next generation sequencing preprocessor
|
3
5
|
# Copyright (C) <2011>
|
4
6
|
# Authors: Almudena Bocinos Rioboo, Diego Dario Guerrero Fernandez,
|
@@ -57,9 +59,35 @@
|
|
57
59
|
# $: << File.expand_path(ROOT_PATH)
|
58
60
|
|
59
61
|
$: << File.expand_path('~/progs/ruby/gems/seqtrimnext/lib/')
|
60
|
-
$: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib/')
|
62
|
+
# $: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib/')
|
61
63
|
|
62
64
|
require 'seqtrimnext'
|
65
|
+
require 'scbi_headers'
|
66
|
+
|
67
|
+
|
68
|
+
def put_header
|
69
|
+
header = ScbiHeader.new('SeqTrimNEXT',Seqtrimnext::SEQTRIM_VERSION)
|
70
|
+
|
71
|
+
header.description="SeqtrimNEXT is a customizable and distributed pre-processing software for NGS (Next Generation Sequencing) biological data. It makes use of scbi_mapreduce gem to be able to run in parallel and distributed environments. It is specially suited for Roche 454 (normal and paired-end) & Ilumina datasets, although it could be easyly adapted to any other situation."
|
72
|
+
|
73
|
+
header.copyright='2011'
|
74
|
+
|
75
|
+
header.authors<< "Darío Guerrero"
|
76
|
+
header.authors<< "Almudena Bocinos"
|
77
|
+
header.authors<< "Rocío Bautista"
|
78
|
+
header.authors<< "Noé Fernández"
|
79
|
+
header.authors<< "Juan Falgueras"
|
80
|
+
header.authors<< "M. Gonzalo Claros"
|
81
|
+
|
82
|
+
# header.articles<< "Article one: with one description line"
|
83
|
+
# header.articles<< "Article two: with one description line"
|
84
|
+
|
85
|
+
# To output the header
|
86
|
+
puts header
|
87
|
+
|
88
|
+
end
|
89
|
+
|
90
|
+
put_header
|
63
91
|
|
64
92
|
############ PATHS #######################
|
65
93
|
$SEQTRIM_PATH = ROOT_PATH
|
@@ -192,7 +220,7 @@ optparse = OptionParser.new do |opts|
|
|
192
220
|
end
|
193
221
|
|
194
222
|
end
|
195
|
-
|
223
|
+
|
196
224
|
end
|
197
225
|
|
198
226
|
|
@@ -211,12 +239,12 @@ optparse = OptionParser.new do |opts|
|
|
211
239
|
opts.on( '-C', '--use_checkpoint', 'Restore at checkpoint if scbi_mapreduce_checkpoint file is available' ) do
|
212
240
|
options[:use_checkpoint] = true
|
213
241
|
end
|
214
|
-
|
242
|
+
|
215
243
|
# options[:skip_initial_stats] = false
|
216
244
|
# opts.on( '-k', '--skip_initial_stats', 'Skip initial stats' ) do
|
217
245
|
# options[:skip_initial_stats] = true
|
218
246
|
# end
|
219
|
-
|
247
|
+
|
220
248
|
|
221
249
|
options[:install_db] = nil
|
222
250
|
opts.on( '-i', '--install_databases TYPE', 'Install base databases and reformat them if necessary') do |db_type|
|
@@ -229,10 +257,12 @@ optparse = OptionParser.new do |opts|
|
|
229
257
|
end
|
230
258
|
|
231
259
|
options[:fastq] = nil
|
232
|
-
opts.on( '-Q', '--fastq
|
260
|
+
opts.on( '-Q', '--fastq FILE1,FILE2',Array, 'Fastq input file. Use - for <STDIN>' ) do |file|
|
233
261
|
options[:fastq] = file
|
262
|
+
puts "FILES:",file,file.class
|
263
|
+
|
234
264
|
end
|
235
|
-
|
265
|
+
|
236
266
|
options[:format] = nil
|
237
267
|
opts.on( '-F', '--fastq_quality_format FORMAT', 'Fastq input quality format use sanger or illumina18 for phred+33 based scores. Use illumina15 for phred+64 based scores (default is sanger) file. Use - for <STDIN>' ) do |value|
|
238
268
|
options[:format] = value
|
@@ -241,7 +271,7 @@ optparse = OptionParser.new do |opts|
|
|
241
271
|
exit
|
242
272
|
end
|
243
273
|
end
|
244
|
-
|
274
|
+
|
245
275
|
|
246
276
|
options[:fasta] = nil
|
247
277
|
opts.on( '-f', '--fasta FILE', 'Fasta input file' ) do |file|
|
@@ -255,7 +285,7 @@ optparse = OptionParser.new do |opts|
|
|
255
285
|
|
256
286
|
options[:list_db] = nil
|
257
287
|
options[:list_db_name] = 'ALL'
|
258
|
-
|
288
|
+
|
259
289
|
opts.on( '-L', '--list_db [DB_NAME]', 'List entries IDs in DB_NAME. Use "-L all" to view all available databases' ) do |value|
|
260
290
|
options[:list_db] = true
|
261
291
|
options[:list_db_name] = value if value
|
@@ -281,12 +311,12 @@ optparse = OptionParser.new do |opts|
|
|
281
311
|
opts.on( '-j', '--json', 'Save results in json file' ) do
|
282
312
|
options[:json] = true
|
283
313
|
end
|
284
|
-
|
314
|
+
|
285
315
|
options[:skip_output] = false
|
286
316
|
opts.on( '-K', '--no-verbose', 'Change to no verbose mode. Every sequence will not be written to output log' ) do
|
287
317
|
options[:skip_output] = true
|
288
318
|
end
|
289
|
-
|
319
|
+
|
290
320
|
options[:skip_report] = false
|
291
321
|
opts.on( '-R', '--no-report', 'Do not generate final PDF report (gem scbi_seqtrimnext_report required if you want to generate PDF report).' ) do
|
292
322
|
options[:skip_report] = true
|
@@ -335,7 +365,7 @@ $LOG.info("Using options: "+ options.to_json)
|
|
335
365
|
if options[:install_db] then
|
336
366
|
#install databases
|
337
367
|
InstallDatabase.new(options[:install_db],$DB_PATH)
|
338
|
-
|
368
|
+
|
339
369
|
# reformat databases
|
340
370
|
MakeBlastDb.new($DB_PATH)
|
341
371
|
exit
|
@@ -376,13 +406,17 @@ end
|
|
376
406
|
$LOG.info "Using init file: #{$SEQTRIMNEXT_INIT}"
|
377
407
|
$LOG.info "Using params file: #{options[:template]}"
|
378
408
|
|
379
|
-
#
|
380
|
-
if (!options[:fastq].nil? && options[:fastq]!='-' && !File.exists?(options[:fastq]))
|
381
|
-
$LOG.error "Input file: #{options[:fasta]} doesn't exists"
|
382
|
-
exit
|
383
|
-
end
|
384
|
-
|
409
|
+
# check file existence
|
385
410
|
|
411
|
+
if options[:fastq]
|
412
|
+
options[:fastq].each do |fastq_file|
|
413
|
+
# fastq file
|
414
|
+
if (!fastq_file.nil? && fastq_file!='-' && !File.exists?(File.expand_path(fastq_file)))
|
415
|
+
$LOG.error "Input file: #{fastq_file} doesn't exists"
|
416
|
+
exit
|
417
|
+
end
|
418
|
+
end
|
419
|
+
end
|
386
420
|
|
387
421
|
# fasta file
|
388
422
|
if (!options[:fasta].nil? && !File.exists?(options[:fasta]))
|
@@ -398,8 +432,6 @@ end
|
|
398
432
|
|
399
433
|
s = Seqtrim.new(options)
|
400
434
|
|
401
|
-
|
402
|
-
|
403
435
|
#generate report
|
404
436
|
|
405
437
|
if !options[:skip_report] && system("which generate_report.rb > /dev/null ")
|
@@ -408,10 +440,10 @@ if !options[:skip_report] && system("which generate_report.rb > /dev/null ")
|
|
408
440
|
`#{cmd}`
|
409
441
|
else
|
410
442
|
skip_text='.'
|
411
|
-
|
443
|
+
|
412
444
|
if options[:skip_report]
|
413
445
|
skip_text=' and remove the -R option from the command line.'
|
414
446
|
end
|
415
|
-
|
447
|
+
|
416
448
|
$LOG.info "If you want a detailed report in PDF format, be sure you have installed the optional seqtrimnext_report gem (gem install seqtrimnext_report)#{skip_text}"
|
417
449
|
end
|
@@ -7,10 +7,10 @@ require "seqtrim_action"
|
|
7
7
|
# Inherit: Plugin
|
8
8
|
########################################################
|
9
9
|
|
10
|
-
class
|
10
|
+
class ActionUserContaminant < SeqtrimAction
|
11
11
|
|
12
12
|
def initialize(start_pos,end_pos)
|
13
|
-
super(start_pos,end_pos)
|
13
|
+
super(start_pos,end_pos)
|
14
14
|
@cut =false
|
15
15
|
end
|
16
16
|
|
@@ -13,7 +13,7 @@ STATS_PATH=File.join(OUTPUT_PATH,'stats.json')
|
|
13
13
|
|
14
14
|
class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
15
15
|
|
16
|
-
def self.init_work_manager(
|
16
|
+
def self.init_work_manager(sequence_readers, params, chunk_size = 100, use_json=false, skip_output=false)
|
17
17
|
@@full_stats={}
|
18
18
|
@@params= params
|
19
19
|
@@exit = false
|
@@ -22,7 +22,7 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
22
22
|
@@ongoing_stats[:sequence_count] = 0
|
23
23
|
@@ongoing_stats[:smallest_sequence_size] = 900000000000000
|
24
24
|
@@ongoing_stats[:biggest_sequence_size] = 0
|
25
|
-
|
25
|
+
|
26
26
|
@@skip_output=skip_output
|
27
27
|
|
28
28
|
@@chunk_size = chunk_size
|
@@ -36,17 +36,20 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
36
36
|
end
|
37
37
|
|
38
38
|
#open input file
|
39
|
-
@@
|
39
|
+
@@sequence_readers=sequence_readers
|
40
40
|
|
41
41
|
# @@use_qual = @@fqr.with_qual?
|
42
42
|
# @@use_json = use_json
|
43
43
|
|
44
|
-
@@params.set_param('use_qual',@@
|
44
|
+
@@params.set_param('use_qual',@@sequence_readers.first.with_qual?)
|
45
45
|
@@params.set_param('use_json',use_json)
|
46
|
+
@@params.set_param('tuple_size',@@sequence_readers.count)
|
46
47
|
|
47
48
|
@@use_json=use_json
|
48
49
|
|
49
|
-
@@
|
50
|
+
@@sequence_readers.each do |sequence_reader|
|
51
|
+
sequence_reader.rewind
|
52
|
+
end
|
50
53
|
|
51
54
|
# open output files
|
52
55
|
|
@@ -77,6 +80,8 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
77
80
|
|
78
81
|
@@low_sffinfo_files={}
|
79
82
|
|
83
|
+
@@tuple_id=0
|
84
|
+
|
80
85
|
end
|
81
86
|
|
82
87
|
def self.end_work_manager
|
@@ -94,13 +99,12 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
94
99
|
f.puts JSON.pretty_generate(@@ongoing_stats)
|
95
100
|
end
|
96
101
|
end
|
97
|
-
|
98
102
|
|
99
103
|
# load stats
|
100
104
|
r=File.read(STATS_PATH)
|
101
105
|
stats=JSON::parse(r)
|
102
106
|
|
103
|
-
|
107
|
+
|
104
108
|
|
105
109
|
# make graphs
|
106
110
|
gs=GraphStats.new(stats)
|
@@ -198,7 +202,7 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
198
202
|
# puts "Loaded Stats"
|
199
203
|
# puts "FULL STATS:\n" +JSON.pretty_generate(@@full_stats)
|
200
204
|
|
201
|
-
# TODO - remove sequences from rejected file that were added by cloned
|
205
|
+
# TODO - remove sequences from rejected file that were added by cloned
|
202
206
|
|
203
207
|
super
|
204
208
|
# return checkpoint
|
@@ -218,20 +222,16 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
218
222
|
warn "Deprecated: trash_checkpointed_work was deprecated, it is automatic now"
|
219
223
|
end
|
220
224
|
|
221
|
-
def
|
222
|
-
|
223
|
-
if @@exit
|
224
|
-
return nil
|
225
|
-
end
|
226
|
-
|
225
|
+
def get_next_seq_from_file(file)
|
226
|
+
# find a valid and no repeated sequence in file
|
227
227
|
begin
|
228
228
|
|
229
|
-
n,f,q,c =
|
229
|
+
n,f,q,c = file.next_seq
|
230
230
|
|
231
231
|
if !n.nil? && @@params.repeated_seq?(n)
|
232
232
|
@@full_stats.add_stats({'sequences' => {'count' => {'rejected' => 1}}})
|
233
233
|
@@full_stats.add_stats({'sequences' => {'rejected' => {'repeated' => 1}}})
|
234
|
-
|
234
|
+
|
235
235
|
get_file(File.join(OUTPUT_PATH,'rejected.txt')).puts('>'+n+ ' repeated')
|
236
236
|
|
237
237
|
end
|
@@ -240,17 +240,61 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
240
240
|
@@ongoing_stats[:sequence_count] += 1
|
241
241
|
@@ongoing_stats[:smallest_sequence_size] = [f.size, @@ongoing_stats[:smallest_sequence_size]].min
|
242
242
|
@@ongoing_stats[:biggest_sequence_size] = [f.size, @@ongoing_stats[:smallest_sequence_size]].max
|
243
|
-
|
243
|
+
|
244
244
|
@@full_stats.add_stats({'sequences' => {'count' => {'input_count' => 1}}})
|
245
245
|
end
|
246
|
+
|
246
247
|
end while (!n.nil? && @@params.repeated_seq?(n))
|
247
248
|
|
248
|
-
|
249
|
-
|
250
|
-
|
249
|
+
return n,f,q,c
|
250
|
+
|
251
|
+
end
|
252
|
+
|
253
|
+
def next_work
|
254
|
+
|
255
|
+
if @@exit
|
251
256
|
return nil
|
252
257
|
end
|
253
258
|
|
259
|
+
tuple=[]
|
260
|
+
order_in_tuple=0
|
261
|
+
|
262
|
+
@@tuple_id += 1
|
263
|
+
tuple_size=@@sequence_readers.count
|
264
|
+
|
265
|
+
@@sequence_readers.each do |sequence_reader|
|
266
|
+
n,f,q,c = get_next_seq_from_file(sequence_reader)
|
267
|
+
|
268
|
+
if !n.nil?
|
269
|
+
seq=SequenceWithAction.new(n,f.upcase,q,c)
|
270
|
+
seq.tuple_id=@@tuple_id
|
271
|
+
seq.order_in_tuple=order_in_tuple
|
272
|
+
seq.tuple_size=tuple_size
|
273
|
+
tuple << seq
|
274
|
+
order_in_tuple+=1
|
275
|
+
end
|
276
|
+
|
277
|
+
end
|
278
|
+
|
279
|
+
if tuple_size>1
|
280
|
+
# check duplicated names
|
281
|
+
names = tuple.map{|s| s.seq_name}
|
282
|
+
|
283
|
+
if names.uniq.count!=tuple_size
|
284
|
+
# puts "NAMES EQUAL IN TUPLE"
|
285
|
+
tuple.each_with_index do |seq,i|
|
286
|
+
# puts seq.class # seq_name
|
287
|
+
seq.seq_name = "#{seq.seq_name}/#{i+1}"
|
288
|
+
end
|
289
|
+
end
|
290
|
+
end
|
291
|
+
|
292
|
+
# tuple is complete
|
293
|
+
if tuple.count==tuple_size
|
294
|
+
return tuple
|
295
|
+
else
|
296
|
+
return nil
|
297
|
+
end
|
254
298
|
|
255
299
|
end
|
256
300
|
|