seqtrimnext 2.0.45 → 2.0.46

Sign up to get free protection for your applications and to get access to all the features.
Files changed (32) hide show
  1. data/History.txt +4 -0
  2. data/Manifest.txt +7 -2
  3. data/bin/filter_database.rb +39 -0
  4. data/bin/join_big_illumina_paired.sh +122 -0
  5. data/bin/seqtrimnext +2 -1
  6. data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +87 -121
  7. data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +4 -1
  8. data/lib/seqtrimnext/classes/graph_stats.rb +7 -2
  9. data/lib/seqtrimnext/classes/seqtrim.rb +3 -2
  10. data/lib/seqtrimnext/classes/sequence_with_action.rb +1 -1
  11. data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +2 -2
  12. data/lib/seqtrimnext/plugins/plugin_adapters.rb +2 -2
  13. data/lib/seqtrimnext/plugins/plugin_adapters_old.rb +165 -0
  14. data/lib/seqtrimnext/plugins/plugin_amplicons.rb +2 -2
  15. data/lib/seqtrimnext/plugins/plugin_contaminants.rb +3 -3
  16. data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +1 -1
  17. data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +1 -1
  18. data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +1 -1
  19. data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +1 -1
  20. data/lib/seqtrimnext/plugins/plugin_key.rb +1 -1
  21. data/lib/seqtrimnext/plugins/plugin_linker.rb +2 -2
  22. data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +1 -1
  23. data/lib/seqtrimnext/plugins/plugin_low_quality.rb +1 -1
  24. data/lib/seqtrimnext/plugins/plugin_mids.rb +2 -2
  25. data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +1 -1
  26. data/lib/seqtrimnext/plugins/plugin_short_insert.rb +2 -2
  27. data/lib/seqtrimnext/plugins/plugin_vectors.rb +2 -2
  28. data/lib/seqtrimnext/templates/genomics_illumina.txt +5 -0
  29. data/lib/seqtrimnext/templates/transcriptomics_illumina.txt +8 -0
  30. data/lib/seqtrimnext/utils/hash_stats.rb +2 -1
  31. data/lib/seqtrimnext.rb +1 -1
  32. metadata +14 -5
data/History.txt CHANGED
@@ -1,3 +1,7 @@
1
+ === 2.0.46 2012-04-13
2
+
3
+ Checkpointing activated. Jobs can be restarted where stopped.
4
+
1
5
  === 2.0.45 2012-03-05
2
6
 
3
7
  Improved LowComplexity plugin to ignore low complexity regions inside low qual regions
data/Manifest.txt CHANGED
@@ -4,9 +4,11 @@ bin/extract_seqs_from_fasta.rb
4
4
  bin/extract_seqs_from_fastq.rb
5
5
  bin/fasta2fastq.rb
6
6
  bin/fastq2fasta.rb
7
+ bin/filter_database.rb
7
8
  bin/gen_qual.rb
8
9
  bin/get_seq.rb
9
10
  bin/group_by_range.rb
11
+ bin/join_big_illumina_paired.sh
10
12
  bin/join_ilumina_paired.rb
11
13
  bin/parse_amplicons.rb
12
14
  bin/parse_json_results.rb
@@ -18,6 +20,7 @@ bin/seqtrimnext
18
20
  bin/split_fastq.rb
19
21
  bin/split_ilumina_paired.rb
20
22
  bin/split_paired.rb
23
+ History.txt
21
24
  lib/seqtrimnext/actions/action_ab_adapter.rb
22
25
  lib/seqtrimnext/actions/action_ab_far_adapter.rb
23
26
  lib/seqtrimnext/actions/action_ab_left_adapter.rb
@@ -35,13 +38,13 @@ lib/seqtrimnext/actions/action_low_complexity.rb
35
38
  lib/seqtrimnext/actions/action_low_high_size.rb
36
39
  lib/seqtrimnext/actions/action_low_quality.rb
37
40
  lib/seqtrimnext/actions/action_mid.rb
41
+ lib/seqtrimnext/actions/action_middle_adapter.rb
38
42
  lib/seqtrimnext/actions/action_multiple_linker.rb
39
43
  lib/seqtrimnext/actions/action_paired_reads.rb
40
44
  lib/seqtrimnext/actions/action_poly_a.rb
41
45
  lib/seqtrimnext/actions/action_poly_t.rb
42
46
  lib/seqtrimnext/actions/action_rem_adit_artifacts.rb
43
47
  lib/seqtrimnext/actions/action_right_adapter.rb
44
- lib/seqtrimnext/actions/action_middle_adapter.rb
45
48
  lib/seqtrimnext/actions/action_right_primer.rb
46
49
  lib/seqtrimnext/actions/action_short_insert.rb
47
50
  lib/seqtrimnext/actions/action_unexpected_poly_t.rb
@@ -71,6 +74,7 @@ lib/seqtrimnext/classes/sequence_with_action.rb
71
74
  lib/seqtrimnext/plugins/plugin.rb
72
75
  lib/seqtrimnext/plugins/plugin_ab_adapters.rb
73
76
  lib/seqtrimnext/plugins/plugin_adapters.rb
77
+ lib/seqtrimnext/plugins/plugin_adapters_old.rb
74
78
  lib/seqtrimnext/plugins/plugin_amplicons.rb
75
79
  lib/seqtrimnext/plugins/plugin_contaminants.rb
76
80
  lib/seqtrimnext/plugins/plugin_extract_inserts.rb
@@ -89,9 +93,11 @@ lib/seqtrimnext/plugins/plugin_vectors.rb
89
93
  lib/seqtrimnext/templates/amplicons.txt
90
94
  lib/seqtrimnext/templates/genomics_454.txt
91
95
  lib/seqtrimnext/templates/genomics_454_with_paired.txt
96
+ lib/seqtrimnext/templates/genomics_illumina.txt
92
97
  lib/seqtrimnext/templates/low_quality.txt
93
98
  lib/seqtrimnext/templates/low_quality_and_low_complexity.txt
94
99
  lib/seqtrimnext/templates/transcriptomics_454.txt
100
+ lib/seqtrimnext/templates/transcriptomics_illumina.txt
95
101
  lib/seqtrimnext/templates/transcriptomics_plants.txt
96
102
  lib/seqtrimnext/utils/extract_samples.rb
97
103
  lib/seqtrimnext/utils/fasta2xml.rb
@@ -103,7 +109,6 @@ lib/seqtrimnext/utils/load_qual_in_hash.rb
103
109
  lib/seqtrimnext/utils/recover_mid.rb
104
110
  lib/seqtrimnext/utils/string_utils.rb
105
111
  lib/seqtrimnext.rb
106
- History.txt
107
112
  Manifest.txt
108
113
  PostInstall.txt
109
114
  Rakefile
@@ -0,0 +1,39 @@
#!/usr/bin/env ruby

# Prints, in FASTA format, every entry of a sequence database whose sequence
# length is at most +min_size+ and whose comment contains (ignoring case) at
# least one of the keywords listed in +name_list+ (one keyword per line).
#
# Usage: filter_database.rb database min_size name_list

require 'scbi_fasta'

if ARGV.count != 3
  puts "Usage: #{File.basename($0)} database min_size name_list"
  # A usage error is a failure: report it through the exit status.
  exit 1
end

# NOTE(review): despite its name, this value is applied as an UPPER bound on
# the sequence length (sequences longer than it are skipped) -- confirm that
# this is the intended meaning.
min_size = ARGV[1].to_i

# Read the keywords, upcased so the comparison is case-insensitive.
# Surrounding whitespace (e.g. a trailing "\r") is removed and blank lines are
# dropped: an empty keyword is a substring of every comment and would make the
# filter select every sequence.
keywords = File.read(ARGV[2]).split("\n").map { |keyword| keyword.strip.upcase }.reject(&:empty?)

fqr = FastaQualFile.new(ARGV[0])

begin
  # Each entry is yielded as name, sequence and comment.
  fqr.each do |n, s, c|
    # The size filter does not depend on the keyword, so test it only once.
    next if s.length > min_size

    comment = c.upcase

    # Print the entry once, as soon as any keyword matches.
    puts ">#{n} #{c}\n#{s}" if keywords.any? { |keyword| comment.include?(keyword) }
  end
ensure
  # Release the database file even if reading or printing fails.
  fqr.close
end
@@ -0,0 +1,122 @@
#!/usr/bin/env bash

# Sort two big Illumina FASTQ files from a paired-end experiment and split
# their reads into separate files: reads whose name is present in both inputs
# and reads present in only one of them.
#
# Usage: join_big_illumina_paired.sh file1.fastq file2.fastq base_output_name [tmp_dir]
#
# Output files:
#   <base_output_name>_paired1.fastq / _paired2.fastq   reads found in both inputs
#   <base_output_name>_normal1.fastq / _normal2.fastq   reads found in one input only
#
# Intermediate files are written to tmp_dir (default: current directory).
# An intermediate file that already exists is reused instead of rebuilt, so an
# interrupted run can be restarted.

# tmp_dir is optional, so only three arguments are required.
# (-lt is the numeric test; "<" inside [ ] is an input redirection.)
if [ "$#" -lt 3 ]; then
    echo ""
    echo "Use: $0 file1.fastq file2.fastq base_output_name [tmp_dir]"
    echo ""
    exit 1
fi

base_name=$3

if [[ -z "$base_name" ]]; then
    echo "Missing base_output_name"
    exit 1
fi

# Fall back to the current directory when no tmp_dir is given.
tmp_dir=${4:-$(pwd)}

if [[ ! -e "$tmp_dir" ]]; then
    echo "Tmp dir: $4 doesn't exists"
    exit 1
fi

echo "Using TMPDIR $tmp_dir"

f1_path=$1
f2_path=$2

f1_name=$(basename "$f1_path")
f2_name=$(basename "$f2_path")

f1_tmp="$tmp_dir/${f1_name}"
f2_tmp="$tmp_dir/${f2_name}"

common_names="$tmp_dir/comm.names"

only_in_1="$tmp_dir/only_in_1.txt"
only_in_2="$tmp_dir/only_in_2.txt"
in_both="$tmp_dir/in_both.txt"

# flatten_and_sort FASTQ MATE OUT
#
# Rewrites every 4-line FASTQ record of FASTQ as a single line
#     key <TAB> header <TAB> sequence <TAB> plus-line <TAB> quality
# and writes the lines to OUT sorted by key. The key is the first
# whitespace-delimited token of the header with a trailing "/MATE" removed, so
# both mates of a pair get the same key. The tab after the key is always
# written, also for read names that carry no "/MATE" suffix.
# The result is built in OUT.part and renamed at the end, so an interrupted
# sort never leaves a truncated OUT that a later run would reuse.
flatten_and_sort() {
    awk -v mate="$2" '
        {
            n++
            if (n % 4 == 1) {
                split($0, a, " ")
                sub("/" mate "$", "", a[1])
                printf("%s\t", a[1])
            }
            printf("%s", $0)
            if (n % 4 == 0) { printf("\n") } else { printf("\t") }
        }' "$1" | sort -T "$tmp_dir" -k1,1 -t $'\t' > "$3.part" && mv "$3.part" "$3"
}

echo "Starting sorting"

# Both files are sorted in parallel.
if [[ ! -e "$f1_tmp.sorted" ]]; then
    echo "Sorting $f1_name"
    flatten_and_sort "$f1_path" 1 "$f1_tmp.sorted" &
fi

if [[ ! -e "$f2_tmp.sorted" ]]; then
    echo "Sorting $f2_name"
    flatten_and_sort "$f2_path" 2 "$f2_tmp.sorted" &
fi
wait

echo "Starting name extraction"
if [[ ! -e "$f1_tmp.names" ]]; then
    echo "Extracting names from $f1_tmp.sorted"
    cut -f1 "$f1_tmp.sorted" > "$f1_tmp.names" &
fi
if [[ ! -e "$f2_tmp.names" ]]; then
    echo "Extracting names from $f2_tmp.sorted"
    cut -f1 "$f2_tmp.sorted" > "$f2_tmp.names" &
fi
wait

echo "Starting names comparison"
if [[ ! -e "$common_names" ]]; then
    echo "Making comm file"
    comm "$f1_tmp.names" "$f2_tmp.names" > "$common_names"
fi

echo "Starting names extraction"

# comm output columns: no leading tab = only in file 1, one leading tab = only
# in file 2, two leading tabs = present in both.
grep -P '^[^\t]' "$common_names" > "$only_in_1" &
grep -P '^\t[^\t]' "$common_names" | tr -d "\t" > "$only_in_2" &
grep -P '^\t\t[^\t]' "$common_names" | tr -d "\t" > "$in_both" &
wait

echo "Num seqs only in 1) $f1_name"
wc -l "$only_in_1"

echo "Num seqs only in 2) $f2_name"
wc -l "$only_in_2"

echo "Num seqs in both $f1_name and $f2_name"
wc -l "$in_both"

echo "Starting extracting seqs"

# Join each name list with the flattened records, drop the key column and
# restore the 4-line FASTQ layout.
join -t $'\t' -1 1 -2 1 "$only_in_1" "$f1_tmp.sorted" | cut -f 2,3,4,5 | tr "\t" "\n" > "${base_name}_normal1.fastq" &
join -t $'\t' -1 1 -2 1 "$only_in_2" "$f2_tmp.sorted" | cut -f 2,3,4,5 | tr "\t" "\n" > "${base_name}_normal2.fastq" &

join -t $'\t' -1 1 -2 1 "$in_both" "$f1_tmp.sorted" | cut -f 2,3,4,5 | tr "\t" "\n" > "${base_name}_paired1.fastq" &
join -t $'\t' -1 1 -2 1 "$in_both" "$f2_tmp.sorted" | cut -f 2,3,4,5 | tr "\t" "\n" > "${base_name}_paired2.fastq" &
wait

# Remove the intermediate files.
rm "$f1_tmp.names"
rm "$f2_tmp.names"

rm "$f1_tmp.sorted"
rm "$f2_tmp.sorted"

rm "$only_in_2"
rm "$only_in_1"
rm "$in_both"

rm "$common_names"
data/bin/seqtrimnext CHANGED
@@ -57,6 +57,7 @@
57
57
  # $: << File.expand_path(ROOT_PATH)
58
58
 
59
59
  $: << File.expand_path('~/progs/ruby/gems/seqtrimnext/lib/')
60
+ $: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib/')
60
61
 
61
62
  require 'seqtrimnext'
62
63
 
@@ -272,7 +273,7 @@ optparse = OptionParser.new do |opts|
272
273
  end
273
274
 
274
275
  options[:skip_report] = false
275
- opts.on( '-R', '--no-report', 'Change to no verbose mode. Every sequence will not be written to output log' ) do
276
+ opts.on( '-R', '--no-report', 'Do not generate final PDF report (gem scbi_seqtrimnext_report required if you want to generate PDF report).' ) do
276
277
  options[:skip_report] = true
277
278
  end
278
279
 
@@ -18,29 +18,21 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
18
18
  @@params= params
19
19
  @@exit = false
20
20
 
21
+ @@ongoing_stats={}
22
+ @@ongoing_stats[:sequence_count] = 0
23
+ @@ongoing_stats[:smallest_sequence_size] = 900000000000000
24
+ @@ongoing_stats[:biggest_sequence_size] = 0
25
+
21
26
  @@skip_output=skip_output
22
27
 
23
28
  @@chunk_size = chunk_size
24
29
 
25
-
26
- # puts "CHECKPOINT: #{self.checkpoint}\n"*20
27
-
28
- checkpoint_exists=File.exists?('scbi_drb_checkpoint')
30
+ checkpoint_exists=File.exists?(ScbiMapreduce::CHECKPOINT_FILE)
29
31
 
30
32
  # @@use_qual = !qual_path.nil? and File.exists?(qual_path)
31
33
  @@open_mode='w'
32
34
  if checkpoint_exists
33
35
  @@open_mode = 'a'
34
- if File.exists?(STATS_PATH)
35
- # load stats
36
- text = File.read(STATS_PATH)
37
-
38
- # wipe text
39
- # text=text.grep(/^\s*[^#]/).to_s
40
-
41
- # decode json
42
- @@full_stats = JSON.parse(text)
43
- end
44
36
  end
45
37
 
46
38
  #open input file
@@ -91,67 +83,38 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
91
83
 
92
84
  puts "FULL STATS:\n" +JSON.pretty_generate(@@full_stats)
93
85
 
86
+ # create stats file
94
87
  f = File.open(STATS_PATH,'w')
95
88
  f.puts JSON.pretty_generate(@@full_stats)
96
89
  f.close
97
90
 
98
- r=File.read(STATS_PATH)
91
+ # if initial files doesn't exists, create it
92
+ if !File.exists?(File.join(OUTPUT_PATH,'initial_stats.json'))
93
+ File.open(File.join(OUTPUT_PATH,'initial_stats.json'),'w') do |f|
94
+ f.puts JSON.pretty_generate(@@ongoing_stats)
95
+ end
96
+ end
97
+
99
98
 
99
+ # load stats
100
+ r=File.read(STATS_PATH)
100
101
  stats=JSON::parse(r)
101
102
 
103
+
102
104
 
105
+ # make graphs
103
106
  gs=GraphStats.new(stats)
104
107
 
105
- #gs=GraphStats.new(@@full_stats)
106
-
107
-
108
108
  #close all files
109
-
110
- # @@fqr.close
111
109
  if @@use_json
112
110
  @@json_output.close
113
111
  end
114
112
  @@errors_file.close
115
- # @@rejected_output_file.close
116
-
117
- # @@paired_output_files.each do |k,file|
118
- # file.close
119
- # end
120
113
 
121
114
  @@files.each do |k,file|
122
115
  file.close
123
116
  end
124
117
 
125
- # @@paired_qual_output_files.each do |k,file|
126
- # file.close
127
- # end
128
-
129
- # @@sequences_output_files.each do |k,file|
130
- # file.close
131
- # end
132
- #
133
- # @@low_complexity_output_files.each do |k,file|
134
- # file.close
135
- # end
136
- #
137
- # @@sffinfo_files.each do |k,file|
138
- # file.close
139
- # end
140
- #
141
- # @@low_sffinfo_files.each do |k,file|
142
- # file.close
143
- # end
144
-
145
-
146
- # @@qual_output_files.each do |k,file|
147
- # file.close
148
- # end
149
-
150
- # more than one MID found
151
- # if @@full_stats['mid_id'] && @@full_stats['mid_id'].count>1
152
- #
153
- # end
154
-
155
118
  if File.exists?('scbi_drb_checkpoint')
156
119
  File.delete('scbi_drb_checkpoint')
157
120
  end
@@ -172,21 +135,71 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
172
135
  end
173
136
 
174
137
  def load_user_checkpoint(checkpoint)
138
+ # load full_stats from file !!!!!!!!!!!!!
175
139
 
176
- # reset count stats since they are repeated by checkpointing
140
+ if File.exists?(STATS_PATH)
177
141
 
178
- if @@full_stats['sequences'] && @@full_stats['repeated']
179
- @@full_stats['sequences']['count']['repeated']=0
180
- end
181
-
182
- if @@full_stats['sequences'] && @@full_stats['processed']
183
- @@full_stats['sequences']['processed']['count']=0
142
+ # load stats
143
+ text = File.read(STATS_PATH)
144
+
145
+ # wipe text
146
+ # text=text.grep(/^\s*[^#]/).to_s
147
+
148
+ # decode json
149
+ @@full_stats = JSON.parse(text)
184
150
  end
185
-
186
- if @@full_stats['sequences'] && @@full_stats['total']
187
- @@full_stats['sequences']['total']['count']=0
151
+
152
+ # reset count stats since they are repeated by checkpointing
153
+
154
+ # {
155
+ # "sequences": {
156
+ # "count": {
157
+ # "input_count": 1600,
158
+ # "output_seqs": 933,
159
+ # "rejected": 67
160
+ # },
161
+ # "rejected": {
162
+ # "short insert": 39,
163
+ # "contaminated": 26,
164
+ # "unexpected vector": 2
165
+ # }
166
+ # }
167
+ # }
168
+
169
+ if @@full_stats['sequences']
170
+ if @@full_stats['sequences']['count']
171
+ # set input count to 0
172
+ @@full_stats['sequences']['count']['input_count']=0
173
+
174
+ # do not remove outputseqs
175
+ # @@full_stats['sequences']['count']['output_seqs']=0
176
+ end
177
+
178
+ # remove rejected due to repetitions from rejected count
179
+ if @@full_stats['sequences']['rejected']
180
+
181
+ # it there are repeated
182
+ if (@@full_stats['sequences']['rejected']['repeated'])
183
+
184
+ # if repeated count > 0 and there count exists
185
+ if (@@full_stats['sequences']['rejected']['repeated'] > 0) and @@full_stats['sequences']['count']
186
+
187
+ # discount repeated from rejected, since they are going to be added again by checkout process
188
+ @@full_stats['sequences']['count']['rejected'] -= @@full_stats['sequences']['rejected']['repeated']
189
+ end
190
+
191
+ # set repeated to 0
192
+ @@full_stats['sequences']['rejected']['repeated']=0
193
+ end
194
+ end
188
195
  end
189
196
 
197
+
198
+ # puts "Loaded Stats"
199
+ # puts "FULL STATS:\n" +JSON.pretty_generate(@@full_stats)
200
+
201
+ # TODO - remove sequences from rejected file that were added by cloned
202
+
190
203
  super
191
204
  # return checkpoint
192
205
  end
@@ -202,17 +215,7 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
202
215
 
203
216
  # read a work that will not be processed, only to skip until checkpoint
204
217
  def trash_checkpointed_work
205
-
206
- @@chunk_size.times do
207
- begin
208
- n,f,q,c = @@fqr.next_seq
209
- end while (!n.nil? && @@params.repeated_seq?(n))
210
-
211
- if n.nil?
212
- break
213
- end
214
- end
215
-
218
+ warn "Deprecated: trash_checkpointed_work was deprecated, it is automatic now"
216
219
  end
217
220
 
218
221
  def next_work
@@ -228,12 +231,16 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
228
231
  if !n.nil? && @@params.repeated_seq?(n)
229
232
  @@full_stats.add_stats({'sequences' => {'count' => {'rejected' => 1}}})
230
233
  @@full_stats.add_stats({'sequences' => {'rejected' => {'repeated' => 1}}})
231
-
234
+
232
235
  get_file(File.join(OUTPUT_PATH,'rejected.txt')).puts('>'+n+ ' repeated')
233
-
236
+
234
237
  end
235
-
238
+
236
239
  if !n.nil?
240
+ @@ongoing_stats[:sequence_count] += 1
241
+ @@ongoing_stats[:smallest_sequence_size] = [f.size, @@ongoing_stats[:smallest_sequence_size]].min
242
+ @@ongoing_stats[:biggest_sequence_size] = [f.size, @@ongoing_stats[:biggest_sequence_size]].max # running maximum must be compared with the previous maximum, not with :smallest_sequence_size
243
+
237
244
  @@full_stats.add_stats({'sequences' => {'count' => {'input_count' => 1}}})
238
245
  end
239
246
  end while (!n.nil? && @@params.repeated_seq?(n))
@@ -247,50 +254,9 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
247
254
 
248
255
  end
249
256
 
250
- # def next_work
251
- #
252
- # if @@exit
253
- # return nil
254
- # end
255
- # group = SequenceGroup.new
256
- #
257
- # @@chunk_size.times do
258
- # begin
259
- #
260
- # n,f,q,c = @@fqr.next_seq
261
- #
262
- # if !n.nil? && @@params.repeated_seq?(n)
263
- # @@full_stats.add_stats({'sequences' => {'count' => {'rejected' => 1}}})
264
- # @@full_stats.add_stats({'sequences' => {'rejected' => {'repeated' => 1}}})
265
- #
266
- # get_file(File.join(OUTPUT_PATH,'rejected.txt')).puts('>'+n+ ' repeated')
267
- #
268
- # end
269
- # if !n.nil?
270
- # @@full_stats.add_stats({'sequences' => {'count' => {'input_count' => 1}}})
271
- # end
272
- # end while (!n.nil? && @@params.repeated_seq?(n))
273
- #
274
- # if !n.nil?
275
- # # @@full_stats.add_stats({'sequences' => {'count' => {'processed' => 1}}})
276
- # group.push SequenceWithAction.new(n,f.upcase,q,c)
277
- # else
278
- # break
279
- # end
280
- # end
281
- #
282
- # # puts "Processing #{group.inspect}"
283
- #
284
- # if group.empty?
285
- # return nil
286
- # else
287
- # return group
288
- # end
289
- #
290
- # end
291
257
 
292
258
  def work_received(obj)
293
-
259
+
294
260
  res = obj
295
261
 
296
262
  # collect stats
@@ -19,6 +19,9 @@
19
19
  #
20
20
  # $: << File.expand_path(ROOT_PATH)
21
21
 
22
+ $: << File.expand_path('~/progs/ruby/gems/seqtrimnext/lib/')
23
+ $: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib')
24
+
22
25
  require 'seqtrimnext'
23
26
 
24
27
  $SEQTRIM_PATH = ROOT_PATH
@@ -37,7 +40,7 @@ ENV['BLASTDB']=$FORMATTED_DB_PATH
37
40
  OUTPUT_PATH='output_files'
38
41
 
39
42
  puts "FORMATTED_DB_BLAST in workers: #{$FORMATTED_DB_PATH}"
40
- # $: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib')
43
+
41
44
 
42
45
  require 'scbi_mapreduce'
43
46
  require 'params'
@@ -10,8 +10,13 @@ class GraphStats
10
10
  init_stats=initial_stats
11
11
 
12
12
  if init_stats.nil?
13
- r=File.read(File.join(OUTPUT_PATH,'initial_stats.json'))
14
- init_stats= JSON::parse(r)
13
+ if File.exists?(File.join(OUTPUT_PATH,'initial_stats.json'))
14
+ r=File.read(File.join(OUTPUT_PATH,'initial_stats.json'))
15
+ init_stats= JSON::parse(r)
16
+ else
17
+ init_stats=[]
18
+ end
19
+
15
20
  end
16
21
  # puts init_stats.to_json
17
22
  #r=File.read(File.join(File.dirname(__FILE__),'stats.json'))
@@ -5,8 +5,6 @@
5
5
 
6
6
  require 'extract_stats'
7
7
 
8
- # $: << File.expand_path('~/progs/ruby/gems/scbi_drb/lib')
9
-
10
8
  require 'scbi_mapreduce'
11
9
  require 'seqtrim_work_manager'
12
10
  require 'action_manager'
@@ -340,6 +338,9 @@ class Seqtrim
340
338
  # server = ScbiMapreduce::Manager.new(ip,port, workers, SeqtrimWorkManager,custom_worker_file, STDOUT,'~/.seqtrimnext')
341
339
  server = ScbiMapreduce::Manager.new(ip,port, workers, SeqtrimWorkManager,custom_worker_file, STDOUT,$SEQTRIMNEXT_INIT)
342
340
  server.chunk_size=chunk_size
341
+ server.checkpointing=true
342
+ server.keep_order=true
343
+ server.retry_stuck_jobs=true
343
344
  server.start_server
344
345
 
345
346
  # close sequence reader
@@ -77,7 +77,7 @@ class SequenceWithAction < Sequence
77
77
 
78
78
  # Adds a new action to the sequence
79
79
  def add_action(a)
80
- $LOG.info("Adding action #{a.type} to #{seq_name}")
80
+ $LOG.debug("Adding action #{a.type} to #{seq_name}")
81
81
 
82
82
  @actions.push a
83
83
 
@@ -25,7 +25,7 @@ class PluginAbAdapters < Plugin
25
25
  def do_blasts(seqs)
26
26
  # find MIDS with less results than max_target_seqs value
27
27
  blast=BatchBlast.new("-db #{@params.get_param('adapters_ab_db')}",'blastn'," -task blastn-short -perc_identity #{@params.get_param('blast_percent_ab')} -word_size #{MIN_ADAPTER_SIZE}")
28
- $LOG.info('BLAST:'+blast.get_blast_cmd)
28
+ $LOG.debug('BLAST:'+blast.get_blast_cmd)
29
29
 
30
30
  fastas=[]
31
31
 
@@ -61,7 +61,7 @@ class PluginAbAdapters < Plugin
61
61
  raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
62
62
  end
63
63
 
64
- $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for adapters into the sequence"
64
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for adapters into the sequence"
65
65
 
66
66
 
67
67
  # blast=BatchBlast.new("-db #{File.join($FORMATTED_DB_PATH,'adapters_ab.fasta')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_ab')} -perc_identity #{@params.get_param('blast_percent_ab')} -word_size #{MIN_ADAPTER_SIZE}")
@@ -25,7 +25,7 @@ class PluginAdapters < Plugin
25
25
  def do_blasts(seqs)
26
26
  # find MIDS with less results than max_target_seqs value
27
27
  blast=BatchBlast.new("-db #{@params.get_param('adapters_db')}",'blastn'," -task blastn-short -perc_identity #{@params.get_param('blast_percent_adapters')} -word_size #{MIN_ADAPTER_SIZE}")
28
- $LOG.info('BLAST:'+blast.get_blast_cmd)
28
+ $LOG.debug('BLAST:'+blast.get_blast_cmd)
29
29
 
30
30
  fastas=[]
31
31
 
@@ -64,7 +64,7 @@ class PluginAdapters < Plugin
64
64
  # raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
65
65
  end
66
66
 
67
- $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for adapters into the sequence"
67
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for adapters into the sequence"
68
68
 
69
69
 
70
70
  # blast=BatchBlast.new("-db #{File.join($FORMATTED_DB_PATH,'adapters.fasta')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_adapters')} -perc_identity #{@params.get_param('blast_percent_adapters')} -word_size #{MIN_ADAPTER_SIZE}")
@@ -0,0 +1,165 @@
require "plugin"

########################################################
# Author: Almudena Bocinos Rioboo
#
# Previous implementation of the adapters plugin: blasts the sequences
# against the adapters database and adds a left or right adapter action
# to each sequence for every adapter found.
# Inherit: Plugin
########################################################

class PluginAdaptersOld < Plugin

  # Chooses the action type for an adapter located at p_start..p_end.
  # If the adapter start is nearer to the left end of the sequence the
  # result is "ActionLeftAdapter", otherwise "ActionRightAdapter".
  # NOTE: an adapter that is very near to both ends means that almost the
  # whole sequence is adapter, so the sequence is not valid.
  def get_type_adapter(p_start, p_end, seq)
    adapter_end = p_end.to_i
    adapter_start = p_start.to_i

    distance_to_left = adapter_start + seq.insert_start
    distance_to_right = seq.seq_fasta.length - adapter_end - 1 + seq.seq_fasta_orig.length - seq.insert_end - 1

    distance_to_left < distance_to_right ? "ActionLeftAdapter" : "ActionRightAdapter"
  end

  # Returns true when the insert part on the left of the adapter is bigger
  # than half of the part on its right (negative sizes count as 0).
  def cut_by_right(adapter, seq)
    left_size = [adapter.q_beg - seq.insert_start + 1, 0].max
    right_size = [seq.insert_end - adapter.q_end + 1, 0].max

    left_size > (right_size / 2).to_i
  end

  # Begins the plugin execution: blasts all the sequences in one batch and
  # then processes each sequence with the blast query at the same position.
  def execute(seqs)
    blast_results = do_blasts(seqs)

    seqs.each_with_index { |sequence, position| exec_seq(sequence, blast_results.querys[position]) }
  end

  # Runs a blastn-short search of all the sequences against the adapters
  # database and returns the result given by BatchBlast#do_blast.
  def do_blasts(seqs)
    db_option = "-db #{@params.get_param('adapters_db')}"
    extra_options = " -task blastn-short -evalue #{@params.get_param('blast_evalue_adapters')} -perc_identity #{@params.get_param('blast_percent_adapters')}"

    blast = BatchBlast.new(db_option, 'blastn', extra_options)
    $LOG.debug('BLAST:' + blast.get_blast_cmd)

    # every sequence adds two entries: its ">name" line and its fasta
    fastas = []
    seqs.each { |seq| fastas.push(">" + seq.seq_name, seq.seq_fasta) }

    blast.do_blast(fastas)
  end

  # Adds to seq one adapter action for every adapter found in blast_query.
  # Raises when the blast query does not belong to the sequence.
  def exec_seq(seq, blast_query)
    raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}" if blast_query.query_id != seq.seq_name

    $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for adapters into the sequence"

    # first round to save adapters without overlap
    # (merge_hits is not defined in this class -- presumably inherited from Plugin)
    adapters = []
    merge_hits(blast_query, adapters)

    # following rounds: merge again until a round does not reduce the count
    loop do
      previous = adapters
      adapters = []
      merge_hits(previous, adapters)
      break if previous.count == adapters.count
    end

    # build the correspondent action for every adapter
    actions = adapters.map do |ad|
      action = seq.new_action(ad.q_beg, ad.q_end, get_type_adapter(ad.q_beg, ad.q_end, seq))

      if cut_by_right(ad, seq)
        action.right_action = true # mark right action to get the left insert
      else
        action.left_action = true # mark left action to get the right insert
      end

      action.message = ad.subject_id
      action.reversed = ad.reversed

      add_stats('adapter_size', ad.q_end - ad.q_beg + 1)

      action
    end

    seq.add_actions(actions)
  end

  # Returns an array with the errors due to parameters that are missing
  def self.check_params(params)
    errors = []

    params.check_param(errors, 'blast_evalue_adapters', 'Float', 1e-6,
                       'Blast E-value used as cut-off when searching for adapters or primers')

    params.check_param(errors, 'blast_percent_adapters', 'Integer', 95,
                       'Minimum required identity (%) for a reliable adapter')

    params.check_param(errors, 'adapters_db', 'DB', File.join($FORMATTED_DB_PATH, 'adapters.fasta'),
                       'Path for adapter database')

    errors
  end

end
@@ -25,7 +25,7 @@ class PluginAmplicons < Plugin
25
25
  def do_blasts(seqs)
26
26
  # find MIDS with less results than max_target_seqs value
27
27
  blast=BatchBlast.new("-db #{@params.get_param('primers_db')}",'blastn'," -task blastn-short -perc_identity #{@params.get_param('blast_percent_primers')}")
28
- $LOG.info('BLAST:'+blast.get_blast_cmd)
28
+ $LOG.debug('BLAST:'+blast.get_blast_cmd)
29
29
 
30
30
  fastas=[]
31
31
 
@@ -49,7 +49,7 @@ class PluginAmplicons < Plugin
49
49
  raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
50
50
  end
51
51
 
52
- $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for primers into the sequence"
52
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for primers into the sequence"
53
53
 
54
54
  # puts blast_query.inspect
55
55
 
@@ -36,7 +36,7 @@ class PluginContaminants < Plugin
36
36
 
37
37
  blast = BatchBlast.new("-db #{@params.get_param('contaminants_db')}",'blastn'," -task blastn -evalue #{@params.get_param('blast_evalue_contaminants')} -perc_identity #{@params.get_param('blast_percent_contaminants')} -culling_limit 1") #get contaminants -max_target_seqs #{MAX_TARGETS_SEQS}
38
38
 
39
- $LOG.info('BLAST:'+blast.get_blast_cmd(:xml))
39
+ $LOG.debug('BLAST:'+blast.get_blast_cmd(:xml))
40
40
 
41
41
  fastas=[]
42
42
 
@@ -67,7 +67,7 @@ class PluginContaminants < Plugin
67
67
  # raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
68
68
  end
69
69
 
70
- $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for contaminants into the sequence"
70
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for contaminants into the sequence"
71
71
 
72
72
 
73
73
  #blast = BatchBlast.new('-db DB/formatted/contaminants.fasta','blastn',' -task blastn -evalue 1e-10 -perc_identity 95') #get contaminants
@@ -143,7 +143,7 @@ class PluginContaminants < Plugin
143
143
 
144
144
  end
145
145
  else
146
- $LOG.info('Contaminant ignored due to genus match: '+c.definition)
146
+ $LOG.debug('Contaminant ignored due to genus match: '+c.definition)
147
147
  end
148
148
  end
149
149
 
@@ -286,7 +286,7 @@ class PluginExtractInserts < Plugin
286
286
 
287
287
 
288
288
  def exec_seq(seq)
289
- $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: extract inserts"
289
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: extract inserts"
290
290
 
291
291
  # puts "INSERTO ANTES LINKER INSERT:"+seq.seq_fasta
292
292
 
@@ -319,7 +319,7 @@ class PluginFindPolyAt < Plugin
319
319
 
320
320
 
321
321
  def exec_seq(seq)
322
- $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for strings of polyAT's into the sequence with a length indicated by the param <poly_at_length>"
322
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for strings of polyAT's into the sequence with a length indicated by the param <poly_at_length>"
323
323
 
324
324
  find_polyT(seq)
325
325
  find_polyA(seq)
@@ -21,7 +21,7 @@ def execute(seqs)
21
21
 
22
22
 
23
23
  def exec_seq(seq)
24
- $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: searching sequence repeated at input file"
24
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: searching sequence repeated at input file"
25
25
 
26
26
  fasta_input=@params.get_param('truncated_input_file')
27
27
 
@@ -149,7 +149,7 @@ class PluginIndeterminations < Plugin
149
149
 
150
150
 
151
151
  def exec_seq(seq)
152
- $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: removing indeterminations N+"
152
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: removing indeterminations N+"
153
153
 
154
154
  actions=[]
155
155
 
@@ -21,7 +21,7 @@ class PluginKey < Plugin
21
21
 
22
22
 
23
23
  def exec_seq(seq)
24
- $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: marking key into the sequence"
24
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: marking key into the sequence"
25
25
  # blast_table_results.inspect
26
26
 
27
27
  actions=[]
@@ -83,7 +83,7 @@ class PluginLinker < Plugin
83
83
  # find MIDS with less results than max_target_seqs value
84
84
  blast = BatchBlast.new("-db #{@params.get_param('linkers_db')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_linkers')} -perc_identity #{@params.get_param('blast_percent_linkers')}") #get linkers
85
85
 
86
- $LOG.info('BLAST:'+blast.get_blast_cmd)
86
+ $LOG.debug('BLAST:'+blast.get_blast_cmd)
87
87
 
88
88
  fastas=[]
89
89
 
@@ -106,7 +106,7 @@ class PluginLinker < Plugin
106
106
  if blast_query.query_id != seq.seq_name
107
107
  raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
108
108
  end
109
- $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for linker into the sequence"
109
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for linker into the sequence"
110
110
 
111
111
  # key_beg,key_end=search_key(seq,0,3) if false
112
112
  # blast = BatchBlast.new("-subject #{File.join($FORMATTED_DB_PATH,'linkers.fasta')}",'blastn'," -task blastn -evalue #{@params.get_param('blast_evalue_linkers')} -perc_identity #{@params.get_param('blast_percent_linkers')}") #get linkers
@@ -22,7 +22,7 @@ class PluginLowHighSize < Plugin
22
22
 
23
23
 
24
24
  def exec_seq(seq)
25
- $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low or high size of the sequence"
25
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low or high size of the sequence"
26
26
 
27
27
  min_size = @params.get_param('min_sequence_size_raw').to_i #min_size is: mean - 2dev
28
28
  max_size = @params.get_param('max_sequence_size_raw').to_i #max_size is: mean + 2dev
@@ -278,7 +278,7 @@ class PluginLowQuality < Plugin
278
278
  if ((self.class.to_s=='PluginLowQuality') && seq.seq_qual.nil? )
279
279
  $LOG.error " Quality File haven't been provided. It's impossible to execute " + self.class.to_s
280
280
  elsif (seq.seq_qual.size>0)
281
- $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence"
281
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence"
282
282
 
283
283
  @low=@params.get_param('min_quality').to_i
284
284
 
@@ -29,7 +29,7 @@ class PluginMids < Plugin
29
29
  def do_blasts(seqs)
30
30
  # find MIDS with less results than max_target_seqs value
31
31
  blast = BatchBlast.new("-db #{@params.get_param('mids_db')}",'blastn'," -task blastn-short -perc_identity #{@params.get_param('blast_percent_mids')} -max_target_seqs 4 ") #get mids
32
- $LOG.info('BLAST:'+blast.get_blast_cmd)
32
+ $LOG.debug('BLAST:'+blast.get_blast_cmd)
33
33
 
34
34
  fastas=[]
35
35
 
@@ -54,7 +54,7 @@ class PluginMids < Plugin
54
54
  end
55
55
 
56
56
 
57
- $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for mids into the sequence"
57
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for mids into the sequence"
58
58
 
59
59
 
60
60
  # blast_table_results = blast.do_blast(seq.seq_fasta[0..SIZE_SEARCH_MID]) # execute blast to find mids
@@ -27,7 +27,7 @@ class PluginRemAditArtifacts < Plugin
27
27
 
28
28
  def exec_seq(seq)
29
29
 
30
- $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: removing artifacts into the sequence"
30
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: removing artifacts into the sequence"
31
31
  seq2 = seq.seq_fasta
32
32
  first = 0
33
33
  last = seq2.size-1
@@ -96,7 +96,7 @@ class PluginShortInsert < Plugin
96
96
 
97
97
  def exec_seq(seq)
98
98
 
99
- $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking if insert of sequence has enought size"
99
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking if insert of sequence has enought size"
100
100
  # puts "inserto #{seq.insert_start}, #{seq.insert_end} size #{seq.seq_fasta.size}"
101
101
 
102
102
  if (seq.seq_fasta.size > 0)
@@ -183,7 +183,7 @@ class PluginShortInsert < Plugin
183
183
 
184
184
  #Begins the plugin1's execution to warn if the inserted is so short
185
185
  def execute_no_cut_quality(seq)
186
- $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking if insert of sequence has enought size"
186
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking if insert of sequence has enought size"
187
187
 
188
188
 
189
189
 
@@ -38,7 +38,7 @@ class PluginVectors < Plugin
38
38
  # find MIDS with less results than max_target_seqs value
39
39
  blast = BatchBlast.new("-db #{@params.get_param('vectors_db')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_vectors')} -perc_identity #{@params.get_param('blast_percent_vectors')} -culling_limit 1") #get vectors
40
40
 
41
- $LOG.info('BLAST:'+blast.get_blast_cmd)
41
+ $LOG.debug('BLAST:'+blast.get_blast_cmd)
42
42
 
43
43
  fastas=[]
44
44
 
@@ -62,7 +62,7 @@ class PluginVectors < Plugin
62
62
  # raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
63
63
  end
64
64
 
65
- $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for vectors into the sequence "
65
+ $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for vectors into the sequence "
66
66
 
67
67
  #blast contra contaminantes
68
68
 
@@ -0,0 +1,5 @@
1
+ # ======================================
2
+ # General parameters GENOMICS illumina
3
+ # ======================================
4
+
5
+ plugin_list = PluginLowHighSize,PluginIndeterminations,PluginContaminants,PluginLowQuality
@@ -0,0 +1,8 @@
1
+ # ======================================
2
+ # General parameters
3
+ # ======================================
4
+
5
+
6
+ plugin_list = PluginLowHighSize,PluginIndeterminations,PluginFindPolyAt,PluginContaminants,PluginLowQuality,PluginLowComplexity
7
+
8
+ contaminants_db="contaminants.fasta cont_ribosome.fasta"
@@ -15,8 +15,9 @@ def add_stats(h_stats)
15
15
  add_stats.each do |property,hash_value|
16
16
  h[plugin_hash][property]={} if h[plugin_hash][property].nil?
17
17
 
18
+ # values need to be in string format because of later loading from json file
18
19
  hash_value.each do |value, count|
19
- h[plugin_hash][property][value]=(h[plugin_hash][property][value]||0) + count
20
+ h[plugin_hash][property][value.to_s]=(h[plugin_hash][property][value.to_s]||0) + count
20
21
  end
21
22
  end
22
23
  end
data/lib/seqtrimnext.rb CHANGED
@@ -30,7 +30,7 @@ module Seqtrimnext
30
30
  # SEQTRIM_VERSION_STAGE = 'b'
31
31
  # SEQTRIM_VERSION = "2.0.0#{SEQTRIM_VERSION_STAGE}#{SEQTRIM_VERSION_REVISION}"
32
32
 
33
- VERSION = '2.0.45'
33
+ VERSION = '2.0.46'
34
34
 
35
35
  SEQTRIM_VERSION = VERSION
36
36
 
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: seqtrimnext
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 2.0.45
5
+ version: 2.0.46
6
6
  platform: ruby
7
7
  authors:
8
8
  - Dario Guerrero & Almudena Bocinos
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2012-03-05 00:00:00 Z
13
+ date: 2012-04-13 00:00:00 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: narray
@@ -143,9 +143,11 @@ executables:
143
143
  - extract_seqs_from_fastq.rb
144
144
  - fasta2fastq.rb
145
145
  - fastq2fasta.rb
146
+ - filter_database.rb
146
147
  - gen_qual.rb
147
148
  - get_seq.rb
148
149
  - group_by_range.rb
150
+ - join_big_illumina_paired.sh
149
151
  - join_ilumina_paired.rb
150
152
  - parse_amplicons.rb
151
153
  - parse_json_results.rb
@@ -160,14 +162,16 @@ executables:
160
162
  extensions: []
161
163
 
162
164
  extra_rdoc_files:
165
+ - History.txt
163
166
  - lib/seqtrimnext/templates/amplicons.txt
164
167
  - lib/seqtrimnext/templates/genomics_454.txt
165
168
  - lib/seqtrimnext/templates/genomics_454_with_paired.txt
169
+ - lib/seqtrimnext/templates/genomics_illumina.txt
166
170
  - lib/seqtrimnext/templates/low_quality.txt
167
171
  - lib/seqtrimnext/templates/low_quality_and_low_complexity.txt
168
172
  - lib/seqtrimnext/templates/transcriptomics_454.txt
173
+ - lib/seqtrimnext/templates/transcriptomics_illumina.txt
169
174
  - lib/seqtrimnext/templates/transcriptomics_plants.txt
170
- - History.txt
171
175
  - Manifest.txt
172
176
  - PostInstall.txt
173
177
  files:
@@ -177,9 +181,11 @@ files:
177
181
  - bin/extract_seqs_from_fastq.rb
178
182
  - bin/fasta2fastq.rb
179
183
  - bin/fastq2fasta.rb
184
+ - bin/filter_database.rb
180
185
  - bin/gen_qual.rb
181
186
  - bin/get_seq.rb
182
187
  - bin/group_by_range.rb
188
+ - bin/join_big_illumina_paired.sh
183
189
  - bin/join_ilumina_paired.rb
184
190
  - bin/parse_amplicons.rb
185
191
  - bin/parse_json_results.rb
@@ -191,6 +197,7 @@ files:
191
197
  - bin/split_fastq.rb
192
198
  - bin/split_ilumina_paired.rb
193
199
  - bin/split_paired.rb
200
+ - History.txt
194
201
  - lib/seqtrimnext/actions/action_ab_adapter.rb
195
202
  - lib/seqtrimnext/actions/action_ab_far_adapter.rb
196
203
  - lib/seqtrimnext/actions/action_ab_left_adapter.rb
@@ -208,13 +215,13 @@ files:
208
215
  - lib/seqtrimnext/actions/action_low_high_size.rb
209
216
  - lib/seqtrimnext/actions/action_low_quality.rb
210
217
  - lib/seqtrimnext/actions/action_mid.rb
218
+ - lib/seqtrimnext/actions/action_middle_adapter.rb
211
219
  - lib/seqtrimnext/actions/action_multiple_linker.rb
212
220
  - lib/seqtrimnext/actions/action_paired_reads.rb
213
221
  - lib/seqtrimnext/actions/action_poly_a.rb
214
222
  - lib/seqtrimnext/actions/action_poly_t.rb
215
223
  - lib/seqtrimnext/actions/action_rem_adit_artifacts.rb
216
224
  - lib/seqtrimnext/actions/action_right_adapter.rb
217
- - lib/seqtrimnext/actions/action_middle_adapter.rb
218
225
  - lib/seqtrimnext/actions/action_right_primer.rb
219
226
  - lib/seqtrimnext/actions/action_short_insert.rb
220
227
  - lib/seqtrimnext/actions/action_unexpected_poly_t.rb
@@ -244,6 +251,7 @@ files:
244
251
  - lib/seqtrimnext/plugins/plugin.rb
245
252
  - lib/seqtrimnext/plugins/plugin_ab_adapters.rb
246
253
  - lib/seqtrimnext/plugins/plugin_adapters.rb
254
+ - lib/seqtrimnext/plugins/plugin_adapters_old.rb
247
255
  - lib/seqtrimnext/plugins/plugin_amplicons.rb
248
256
  - lib/seqtrimnext/plugins/plugin_contaminants.rb
249
257
  - lib/seqtrimnext/plugins/plugin_extract_inserts.rb
@@ -262,9 +270,11 @@ files:
262
270
  - lib/seqtrimnext/templates/amplicons.txt
263
271
  - lib/seqtrimnext/templates/genomics_454.txt
264
272
  - lib/seqtrimnext/templates/genomics_454_with_paired.txt
273
+ - lib/seqtrimnext/templates/genomics_illumina.txt
265
274
  - lib/seqtrimnext/templates/low_quality.txt
266
275
  - lib/seqtrimnext/templates/low_quality_and_low_complexity.txt
267
276
  - lib/seqtrimnext/templates/transcriptomics_454.txt
277
+ - lib/seqtrimnext/templates/transcriptomics_illumina.txt
268
278
  - lib/seqtrimnext/templates/transcriptomics_plants.txt
269
279
  - lib/seqtrimnext/utils/extract_samples.rb
270
280
  - lib/seqtrimnext/utils/fasta2xml.rb
@@ -276,7 +286,6 @@ files:
276
286
  - lib/seqtrimnext/utils/recover_mid.rb
277
287
  - lib/seqtrimnext/utils/string_utils.rb
278
288
  - lib/seqtrimnext.rb
279
- - History.txt
280
289
  - Manifest.txt
281
290
  - PostInstall.txt
282
291
  - Rakefile