seqtrimnext 2.0.45 → 2.0.46
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest.txt +7 -2
- data/bin/filter_database.rb +39 -0
- data/bin/join_big_illumina_paired.sh +122 -0
- data/bin/seqtrimnext +2 -1
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +87 -121
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +4 -1
- data/lib/seqtrimnext/classes/graph_stats.rb +7 -2
- data/lib/seqtrimnext/classes/seqtrim.rb +3 -2
- data/lib/seqtrimnext/classes/sequence_with_action.rb +1 -1
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +2 -2
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +2 -2
- data/lib/seqtrimnext/plugins/plugin_adapters_old.rb +165 -0
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +2 -2
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +3 -3
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +1 -1
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +1 -1
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +1 -1
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +1 -1
- data/lib/seqtrimnext/plugins/plugin_key.rb +1 -1
- data/lib/seqtrimnext/plugins/plugin_linker.rb +2 -2
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +1 -1
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +1 -1
- data/lib/seqtrimnext/plugins/plugin_mids.rb +2 -2
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +1 -1
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +2 -2
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +2 -2
- data/lib/seqtrimnext/templates/genomics_illumina.txt +5 -0
- data/lib/seqtrimnext/templates/transcriptomics_illumina.txt +8 -0
- data/lib/seqtrimnext/utils/hash_stats.rb +2 -1
- data/lib/seqtrimnext.rb +1 -1
- metadata +14 -5
data/History.txt
CHANGED
data/Manifest.txt
CHANGED
@@ -4,9 +4,11 @@ bin/extract_seqs_from_fasta.rb
|
|
4
4
|
bin/extract_seqs_from_fastq.rb
|
5
5
|
bin/fasta2fastq.rb
|
6
6
|
bin/fastq2fasta.rb
|
7
|
+
bin/filter_database.rb
|
7
8
|
bin/gen_qual.rb
|
8
9
|
bin/get_seq.rb
|
9
10
|
bin/group_by_range.rb
|
11
|
+
bin/join_big_illumina_paired.sh
|
10
12
|
bin/join_ilumina_paired.rb
|
11
13
|
bin/parse_amplicons.rb
|
12
14
|
bin/parse_json_results.rb
|
@@ -18,6 +20,7 @@ bin/seqtrimnext
|
|
18
20
|
bin/split_fastq.rb
|
19
21
|
bin/split_ilumina_paired.rb
|
20
22
|
bin/split_paired.rb
|
23
|
+
History.txt
|
21
24
|
lib/seqtrimnext/actions/action_ab_adapter.rb
|
22
25
|
lib/seqtrimnext/actions/action_ab_far_adapter.rb
|
23
26
|
lib/seqtrimnext/actions/action_ab_left_adapter.rb
|
@@ -35,13 +38,13 @@ lib/seqtrimnext/actions/action_low_complexity.rb
|
|
35
38
|
lib/seqtrimnext/actions/action_low_high_size.rb
|
36
39
|
lib/seqtrimnext/actions/action_low_quality.rb
|
37
40
|
lib/seqtrimnext/actions/action_mid.rb
|
41
|
+
lib/seqtrimnext/actions/action_middle_adapter.rb
|
38
42
|
lib/seqtrimnext/actions/action_multiple_linker.rb
|
39
43
|
lib/seqtrimnext/actions/action_paired_reads.rb
|
40
44
|
lib/seqtrimnext/actions/action_poly_a.rb
|
41
45
|
lib/seqtrimnext/actions/action_poly_t.rb
|
42
46
|
lib/seqtrimnext/actions/action_rem_adit_artifacts.rb
|
43
47
|
lib/seqtrimnext/actions/action_right_adapter.rb
|
44
|
-
lib/seqtrimnext/actions/action_middle_adapter.rb
|
45
48
|
lib/seqtrimnext/actions/action_right_primer.rb
|
46
49
|
lib/seqtrimnext/actions/action_short_insert.rb
|
47
50
|
lib/seqtrimnext/actions/action_unexpected_poly_t.rb
|
@@ -71,6 +74,7 @@ lib/seqtrimnext/classes/sequence_with_action.rb
|
|
71
74
|
lib/seqtrimnext/plugins/plugin.rb
|
72
75
|
lib/seqtrimnext/plugins/plugin_ab_adapters.rb
|
73
76
|
lib/seqtrimnext/plugins/plugin_adapters.rb
|
77
|
+
lib/seqtrimnext/plugins/plugin_adapters_old.rb
|
74
78
|
lib/seqtrimnext/plugins/plugin_amplicons.rb
|
75
79
|
lib/seqtrimnext/plugins/plugin_contaminants.rb
|
76
80
|
lib/seqtrimnext/plugins/plugin_extract_inserts.rb
|
@@ -89,9 +93,11 @@ lib/seqtrimnext/plugins/plugin_vectors.rb
|
|
89
93
|
lib/seqtrimnext/templates/amplicons.txt
|
90
94
|
lib/seqtrimnext/templates/genomics_454.txt
|
91
95
|
lib/seqtrimnext/templates/genomics_454_with_paired.txt
|
96
|
+
lib/seqtrimnext/templates/genomics_illumina.txt
|
92
97
|
lib/seqtrimnext/templates/low_quality.txt
|
93
98
|
lib/seqtrimnext/templates/low_quality_and_low_complexity.txt
|
94
99
|
lib/seqtrimnext/templates/transcriptomics_454.txt
|
100
|
+
lib/seqtrimnext/templates/transcriptomics_illumina.txt
|
95
101
|
lib/seqtrimnext/templates/transcriptomics_plants.txt
|
96
102
|
lib/seqtrimnext/utils/extract_samples.rb
|
97
103
|
lib/seqtrimnext/utils/fasta2xml.rb
|
@@ -103,7 +109,6 @@ lib/seqtrimnext/utils/load_qual_in_hash.rb
|
|
103
109
|
lib/seqtrimnext/utils/recover_mid.rb
|
104
110
|
lib/seqtrimnext/utils/string_utils.rb
|
105
111
|
lib/seqtrimnext.rb
|
106
|
-
History.txt
|
107
112
|
Manifest.txt
|
108
113
|
PostInstall.txt
|
109
114
|
Rakefile
|
@@ -0,0 +1,39 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'scbi_fasta'
|
4
|
+
|
5
|
+
if ARGV.count!=3
|
6
|
+
puts "Usage: #{File.basename($0)} database min_size name_list"
|
7
|
+
exit
|
8
|
+
end
|
9
|
+
min_size = ARGV[1].to_i
|
10
|
+
|
11
|
+
# read keywords
|
12
|
+
keywords=File.read(ARGV[2]).split("\n")
|
13
|
+
|
14
|
+
# convert all to upcase
|
15
|
+
keywords.map { |keyword| keyword.upcase!}
|
16
|
+
|
17
|
+
# puts "Search keywords"
|
18
|
+
# keywords.each { |keyword| puts keyword}
|
19
|
+
|
20
|
+
fqr=FastaQualFile.new(ARGV[0])
|
21
|
+
|
22
|
+
all=[]
|
23
|
+
|
24
|
+
fqr.each do |n,s,c|
|
25
|
+
keywords.each do |keyword|
|
26
|
+
if s.length<=min_size
|
27
|
+
# all+=c.split(" ")
|
28
|
+
if c.upcase.index(keyword)
|
29
|
+
# puts "[#{s.length.to_s}] - #{n} - #{c}"
|
30
|
+
puts ">#{n} #{c}\n#{s}"
|
31
|
+
break
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
# puts all.sort.uniq.reject{|e| e=~/\d/}
|
38
|
+
|
39
|
+
fqr.close
|
@@ -0,0 +1,122 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
# Sort two big Illumina files from a paired-end experiment, then write the sequences common to both into paired output files. Sequences not in common go to separate files.
|
4
|
+
|
5
|
+
# cat $1 | awk '{split($0, a, " "); n++; if (n%1==0){printf("%s\t",a[1]);}; printf("%s",$0); if(n%4==0) { printf("\n");} else { printf("\t");} }'
|
6
|
+
#
|
7
|
+
# exit
|
8
|
+
|
9
|
+
if [ "$#" -lt 3 ]; # numeric test ('<' inside [ ] is a redirection); tmp_dir (4th arg) is optional and defaults to the current directory
|
10
|
+
then
|
11
|
+
echo ""
|
12
|
+
echo "Use: $0 file1.fastq file2.fastq base_output_name tmp_dir"
|
13
|
+
echo ""
|
14
|
+
exit
|
15
|
+
fi
|
16
|
+
|
17
|
+
base_name=$3
|
18
|
+
|
19
|
+
if [[ -z "$base_name" ]]; then
|
20
|
+
echo "Use a : $base_name doesn't exists"
|
21
|
+
exit -1
|
22
|
+
fi
|
23
|
+
|
24
|
+
tmp_dir=$4
|
25
|
+
|
26
|
+
if [[ -z "$4" ]]; then
|
27
|
+
tmp_dir=`pwd`
|
28
|
+
fi
|
29
|
+
|
30
|
+
if [[ ! -e "$tmp_dir" ]]; then
|
31
|
+
echo "Tmp dir: $4 doesn't exists"
|
32
|
+
exit -1
|
33
|
+
fi
|
34
|
+
|
35
|
+
echo "Using TMPDIR $tmp_dir"
|
36
|
+
|
37
|
+
f1_path=$1
|
38
|
+
f2_path=$2
|
39
|
+
|
40
|
+
f1_name=`basename $1`
|
41
|
+
f2_name=`basename $2`
|
42
|
+
|
43
|
+
f1_tmp="$tmp_dir/${f1_name}"
|
44
|
+
f2_tmp="$tmp_dir/${f2_name}"
|
45
|
+
|
46
|
+
common_names="$tmp_dir/comm.names"
|
47
|
+
|
48
|
+
only_in_1="$tmp_dir/only_in_1.txt"
|
49
|
+
only_in_2="$tmp_dir/only_in_2.txt"
|
50
|
+
in_both="$tmp_dir/in_both.txt"
|
51
|
+
|
52
|
+
|
53
|
+
echo "Starting sorting"
|
54
|
+
|
55
|
+
if [[ ! -e "$f1_tmp.sorted" ]]; then
|
56
|
+
echo "Sorting $f1_name"
|
57
|
+
cat $f1_path | awk '{split($0, a, " "); sub(/\/1$/,"\t", a[1]); n++; if (n%4==1){printf("%s",a[1]);}; printf("%s",$0); if(n%4==0) { printf("\n");} else { printf("\t");} }' | sort -T $tmp_dir -k1,1 -t $'\t' > $f1_tmp.sorted &
|
58
|
+
|
59
|
+
fi
|
60
|
+
|
61
|
+
if [[ ! -e "$f2_tmp.sorted" ]]; then
|
62
|
+
echo "Sorting $f2_name"
|
63
|
+
cat $f2_path | awk '{split($0, a, " "); sub(/\/2$/,"\t", a[1]); n++; if (n%4==1){printf("%s",a[1]);}; printf("%s",$0); if(n%4==0) { printf("\n");} else { printf("\t");} }' | sort -T $tmp_dir -k1,1 -t $'\t' > $f2_tmp.sorted &
|
64
|
+
fi
|
65
|
+
wait
|
66
|
+
|
67
|
+
echo "Starting name extraction"
|
68
|
+
if [[ ! -e "$f1_tmp.names" ]]; then
|
69
|
+
echo "Extracting names from $f1_tmp.sorted"
|
70
|
+
# cat $1.sorted | cut -f1 | sed 's/\(.*\)\/1$/\1/' > $1.names &
|
71
|
+
cat $f1_tmp.sorted | cut -f1 > $f1_tmp.names &
|
72
|
+
fi
|
73
|
+
if [[ ! -e "$f2_tmp.names" ]]; then
|
74
|
+
echo "Extracting names from $f2_tmp.sorted"
|
75
|
+
cat $f2_tmp.sorted | cut -f1 > $f2_tmp.names &
|
76
|
+
fi
|
77
|
+
wait
|
78
|
+
|
79
|
+
echo "Starting names comparison"
|
80
|
+
if [[ ! -e "$common_names" ]]; then
|
81
|
+
echo "Making comm file"
|
82
|
+
# diff $1.names $2.names > names.diff
|
83
|
+
comm $f1_tmp.names $f2_tmp.names > $common_names
|
84
|
+
fi
|
85
|
+
|
86
|
+
echo "Starting names extraction"
|
87
|
+
# grep '^>' names.diff | cut -d ' ' -f2 | awk '{ printf("%s/2\n",$0) }' > only_in_2.txt &
|
88
|
+
# grep '^<' names.diff | cut -d ' ' -f2 | awk '{ printf("%s/1\n",$0) }' > only_in_1.txt &
|
89
|
+
|
90
|
+
grep -P '^[^\t]' $common_names > $only_in_1 &
|
91
|
+
grep -P '^\t[^\t]' $common_names |tr -d "\t" > $only_in_2 &
|
92
|
+
grep -P '^\t\t[^\t]' $common_names |tr -d "\t" > $in_both &
|
93
|
+
wait
|
94
|
+
|
95
|
+
echo "Num seqs only in 1) $f1_name"
|
96
|
+
wc -l $only_in_1
|
97
|
+
|
98
|
+
echo "Num seqs only in 2) $f2_name"
|
99
|
+
wc -l $only_in_2
|
100
|
+
|
101
|
+
echo "Num seqs in both $f1_name and $f2_name"
|
102
|
+
wc -l $in_both
|
103
|
+
|
104
|
+
echo "Starting extracting seqs"
|
105
|
+
join -t $'\t' -1 1 -2 1 $only_in_1 $f1_tmp.sorted |cut -f 2,3,4,5| tr "\t" "\n" > ${base_name}_normal1.fastq &
|
106
|
+
join -t $'\t' -1 1 -2 1 $only_in_2 $f2_tmp.sorted |cut -f 2,3,4,5| tr "\t" "\n" > ${base_name}_normal2.fastq &
|
107
|
+
|
108
|
+
join -t $'\t' -1 1 -2 1 $in_both $f1_tmp.sorted |cut -f 2,3,4,5| tr "\t" "\n" > ${base_name}_paired1.fastq &
|
109
|
+
join -t $'\t' -1 1 -2 1 $in_both $f2_tmp.sorted |cut -f 2,3,4,5| tr "\t" "\n" > ${base_name}_paired2.fastq &
|
110
|
+
wait
|
111
|
+
|
112
|
+
rm $f1_tmp.names
|
113
|
+
rm $f2_tmp.names
|
114
|
+
|
115
|
+
rm $f1_tmp.sorted
|
116
|
+
rm $f2_tmp.sorted
|
117
|
+
|
118
|
+
rm $only_in_2
|
119
|
+
rm $only_in_1
|
120
|
+
rm $in_both
|
121
|
+
|
122
|
+
rm $common_names
|
data/bin/seqtrimnext
CHANGED
@@ -57,6 +57,7 @@
|
|
57
57
|
# $: << File.expand_path(ROOT_PATH)
|
58
58
|
|
59
59
|
$: << File.expand_path('~/progs/ruby/gems/seqtrimnext/lib/')
|
60
|
+
$: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib/')
|
60
61
|
|
61
62
|
require 'seqtrimnext'
|
62
63
|
|
@@ -272,7 +273,7 @@ optparse = OptionParser.new do |opts|
|
|
272
273
|
end
|
273
274
|
|
274
275
|
options[:skip_report] = false
|
275
|
-
opts.on( '-R', '--no-report', '
|
276
|
+
opts.on( '-R', '--no-report', 'Do not generate final PDF report (gem scbi_seqtrimnext_report required if you want to generate PDF report).' ) do
|
276
277
|
options[:skip_report] = true
|
277
278
|
end
|
278
279
|
|
@@ -18,29 +18,21 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
18
18
|
@@params= params
|
19
19
|
@@exit = false
|
20
20
|
|
21
|
+
@@ongoing_stats={}
|
22
|
+
@@ongoing_stats[:sequence_count] = 0
|
23
|
+
@@ongoing_stats[:smallest_sequence_size] = 900000000000000
|
24
|
+
@@ongoing_stats[:biggest_sequence_size] = 0
|
25
|
+
|
21
26
|
@@skip_output=skip_output
|
22
27
|
|
23
28
|
@@chunk_size = chunk_size
|
24
29
|
|
25
|
-
|
26
|
-
# puts "CHECKPOINT: #{self.checkpoint}\n"*20
|
27
|
-
|
28
|
-
checkpoint_exists=File.exists?('scbi_drb_checkpoint')
|
30
|
+
checkpoint_exists=File.exists?(ScbiMapreduce::CHECKPOINT_FILE)
|
29
31
|
|
30
32
|
# @@use_qual = !qual_path.nil? and File.exists?(qual_path)
|
31
33
|
@@open_mode='w'
|
32
34
|
if checkpoint_exists
|
33
35
|
@@open_mode = 'a'
|
34
|
-
if File.exists?(STATS_PATH)
|
35
|
-
# load stats
|
36
|
-
text = File.read(STATS_PATH)
|
37
|
-
|
38
|
-
# wipe text
|
39
|
-
# text=text.grep(/^\s*[^#]/).to_s
|
40
|
-
|
41
|
-
# decode json
|
42
|
-
@@full_stats = JSON.parse(text)
|
43
|
-
end
|
44
36
|
end
|
45
37
|
|
46
38
|
#open input file
|
@@ -91,67 +83,38 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
91
83
|
|
92
84
|
puts "FULL STATS:\n" +JSON.pretty_generate(@@full_stats)
|
93
85
|
|
86
|
+
# create stats file
|
94
87
|
f = File.open(STATS_PATH,'w')
|
95
88
|
f.puts JSON.pretty_generate(@@full_stats)
|
96
89
|
f.close
|
97
90
|
|
98
|
-
|
91
|
+
# if the initial stats file doesn't exist, create it
|
92
|
+
if !File.exists?(File.join(OUTPUT_PATH,'initial_stats.json'))
|
93
|
+
File.open(File.join(OUTPUT_PATH,'initial_stats.json'),'w') do |f|
|
94
|
+
f.puts JSON.pretty_generate(@@ongoing_stats)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
99
98
|
|
99
|
+
# load stats
|
100
|
+
r=File.read(STATS_PATH)
|
100
101
|
stats=JSON::parse(r)
|
101
102
|
|
103
|
+
|
102
104
|
|
105
|
+
# make graphs
|
103
106
|
gs=GraphStats.new(stats)
|
104
107
|
|
105
|
-
#gs=GraphStats.new(@@full_stats)
|
106
|
-
|
107
|
-
|
108
108
|
#close all files
|
109
|
-
|
110
|
-
# @@fqr.close
|
111
109
|
if @@use_json
|
112
110
|
@@json_output.close
|
113
111
|
end
|
114
112
|
@@errors_file.close
|
115
|
-
# @@rejected_output_file.close
|
116
|
-
|
117
|
-
# @@paired_output_files.each do |k,file|
|
118
|
-
# file.close
|
119
|
-
# end
|
120
113
|
|
121
114
|
@@files.each do |k,file|
|
122
115
|
file.close
|
123
116
|
end
|
124
117
|
|
125
|
-
# @@paired_qual_output_files.each do |k,file|
|
126
|
-
# file.close
|
127
|
-
# end
|
128
|
-
|
129
|
-
# @@sequences_output_files.each do |k,file|
|
130
|
-
# file.close
|
131
|
-
# end
|
132
|
-
#
|
133
|
-
# @@low_complexity_output_files.each do |k,file|
|
134
|
-
# file.close
|
135
|
-
# end
|
136
|
-
#
|
137
|
-
# @@sffinfo_files.each do |k,file|
|
138
|
-
# file.close
|
139
|
-
# end
|
140
|
-
#
|
141
|
-
# @@low_sffinfo_files.each do |k,file|
|
142
|
-
# file.close
|
143
|
-
# end
|
144
|
-
|
145
|
-
|
146
|
-
# @@qual_output_files.each do |k,file|
|
147
|
-
# file.close
|
148
|
-
# end
|
149
|
-
|
150
|
-
# more than one MID found
|
151
|
-
# if @@full_stats['mid_id'] && @@full_stats['mid_id'].count>1
|
152
|
-
#
|
153
|
-
# end
|
154
|
-
|
155
118
|
if File.exists?('scbi_drb_checkpoint')
|
156
119
|
File.delete('scbi_drb_checkpoint')
|
157
120
|
end
|
@@ -172,21 +135,71 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
172
135
|
end
|
173
136
|
|
174
137
|
def load_user_checkpoint(checkpoint)
|
138
|
+
# load full_stats from file !!!!!!!!!!!!!
|
175
139
|
|
176
|
-
|
140
|
+
if File.exists?(STATS_PATH)
|
177
141
|
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
142
|
+
# load stats
|
143
|
+
text = File.read(STATS_PATH)
|
144
|
+
|
145
|
+
# wipe text
|
146
|
+
# text=text.grep(/^\s*[^#]/).to_s
|
147
|
+
|
148
|
+
# decode json
|
149
|
+
@@full_stats = JSON.parse(text)
|
184
150
|
end
|
185
|
-
|
186
|
-
|
187
|
-
|
151
|
+
|
152
|
+
# reset count stats since they are repeated by checkpointing
|
153
|
+
|
154
|
+
# {
|
155
|
+
# "sequences": {
|
156
|
+
# "count": {
|
157
|
+
# "input_count": 1600,
|
158
|
+
# "output_seqs": 933,
|
159
|
+
# "rejected": 67
|
160
|
+
# },
|
161
|
+
# "rejected": {
|
162
|
+
# "short insert": 39,
|
163
|
+
# "contaminated": 26,
|
164
|
+
# "unexpected vector": 2
|
165
|
+
# }
|
166
|
+
# }
|
167
|
+
# }
|
168
|
+
|
169
|
+
if @@full_stats['sequences']
|
170
|
+
if @@full_stats['sequences']['count']
|
171
|
+
# set input count to 0
|
172
|
+
@@full_stats['sequences']['count']['input_count']=0
|
173
|
+
|
174
|
+
# do not remove outputseqs
|
175
|
+
# @@full_stats['sequences']['count']['output_seqs']=0
|
176
|
+
end
|
177
|
+
|
178
|
+
# remove rejected due to repetitions from rejected count
|
179
|
+
if @@full_stats['sequences']['rejected']
|
180
|
+
|
181
|
+
# if there are repeated sequences
|
182
|
+
if (@@full_stats['sequences']['rejected']['repeated'])
|
183
|
+
|
184
|
+
# if repeated count > 0 and the count section exists
|
185
|
+
if (@@full_stats['sequences']['rejected']['repeated'] > 0) and @@full_stats['sequences']['count']
|
186
|
+
|
187
|
+
# discount repeated from rejected, since they are going to be added again by the checkpoint process
|
188
|
+
@@full_stats['sequences']['count']['rejected'] -= @@full_stats['sequences']['rejected']['repeated']
|
189
|
+
end
|
190
|
+
|
191
|
+
# set repeated to 0
|
192
|
+
@@full_stats['sequences']['rejected']['repeated']=0
|
193
|
+
end
|
194
|
+
end
|
188
195
|
end
|
189
196
|
|
197
|
+
|
198
|
+
# puts "Loaded Stats"
|
199
|
+
# puts "FULL STATS:\n" +JSON.pretty_generate(@@full_stats)
|
200
|
+
|
201
|
+
# TODO - remove sequences from rejected file that were added by cloned
|
202
|
+
|
190
203
|
super
|
191
204
|
# return checkpoint
|
192
205
|
end
|
@@ -202,17 +215,7 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
202
215
|
|
203
216
|
# read a work that will not be processed, only to skip until checkpoint
|
204
217
|
def trash_checkpointed_work
|
205
|
-
|
206
|
-
@@chunk_size.times do
|
207
|
-
begin
|
208
|
-
n,f,q,c = @@fqr.next_seq
|
209
|
-
end while (!n.nil? && @@params.repeated_seq?(n))
|
210
|
-
|
211
|
-
if n.nil?
|
212
|
-
break
|
213
|
-
end
|
214
|
-
end
|
215
|
-
|
218
|
+
warn "Deprecated: trash_checkpointed_work was deprecated, it is automatic now"
|
216
219
|
end
|
217
220
|
|
218
221
|
def next_work
|
@@ -228,12 +231,16 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
228
231
|
if !n.nil? && @@params.repeated_seq?(n)
|
229
232
|
@@full_stats.add_stats({'sequences' => {'count' => {'rejected' => 1}}})
|
230
233
|
@@full_stats.add_stats({'sequences' => {'rejected' => {'repeated' => 1}}})
|
231
|
-
|
234
|
+
|
232
235
|
get_file(File.join(OUTPUT_PATH,'rejected.txt')).puts('>'+n+ ' repeated')
|
233
|
-
|
236
|
+
|
234
237
|
end
|
235
|
-
|
238
|
+
|
236
239
|
if !n.nil?
|
240
|
+
@@ongoing_stats[:sequence_count] += 1
|
241
|
+
@@ongoing_stats[:smallest_sequence_size] = [f.size, @@ongoing_stats[:smallest_sequence_size]].min
|
242
|
+
@@ongoing_stats[:biggest_sequence_size] = [f.size, @@ongoing_stats[:biggest_sequence_size]].max # running maximum: compare against the previous biggest, not the smallest
|
243
|
+
|
237
244
|
@@full_stats.add_stats({'sequences' => {'count' => {'input_count' => 1}}})
|
238
245
|
end
|
239
246
|
end while (!n.nil? && @@params.repeated_seq?(n))
|
@@ -247,50 +254,9 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
247
254
|
|
248
255
|
end
|
249
256
|
|
250
|
-
# def next_work
|
251
|
-
#
|
252
|
-
# if @@exit
|
253
|
-
# return nil
|
254
|
-
# end
|
255
|
-
# group = SequenceGroup.new
|
256
|
-
#
|
257
|
-
# @@chunk_size.times do
|
258
|
-
# begin
|
259
|
-
#
|
260
|
-
# n,f,q,c = @@fqr.next_seq
|
261
|
-
#
|
262
|
-
# if !n.nil? && @@params.repeated_seq?(n)
|
263
|
-
# @@full_stats.add_stats({'sequences' => {'count' => {'rejected' => 1}}})
|
264
|
-
# @@full_stats.add_stats({'sequences' => {'rejected' => {'repeated' => 1}}})
|
265
|
-
#
|
266
|
-
# get_file(File.join(OUTPUT_PATH,'rejected.txt')).puts('>'+n+ ' repeated')
|
267
|
-
#
|
268
|
-
# end
|
269
|
-
# if !n.nil?
|
270
|
-
# @@full_stats.add_stats({'sequences' => {'count' => {'input_count' => 1}}})
|
271
|
-
# end
|
272
|
-
# end while (!n.nil? && @@params.repeated_seq?(n))
|
273
|
-
#
|
274
|
-
# if !n.nil?
|
275
|
-
# # @@full_stats.add_stats({'sequences' => {'count' => {'processed' => 1}}})
|
276
|
-
# group.push SequenceWithAction.new(n,f.upcase,q,c)
|
277
|
-
# else
|
278
|
-
# break
|
279
|
-
# end
|
280
|
-
# end
|
281
|
-
#
|
282
|
-
# # puts "Processing #{group.inspect}"
|
283
|
-
#
|
284
|
-
# if group.empty?
|
285
|
-
# return nil
|
286
|
-
# else
|
287
|
-
# return group
|
288
|
-
# end
|
289
|
-
#
|
290
|
-
# end
|
291
257
|
|
292
258
|
def work_received(obj)
|
293
|
-
|
259
|
+
|
294
260
|
res = obj
|
295
261
|
|
296
262
|
# collect stats
|
@@ -19,6 +19,9 @@
|
|
19
19
|
#
|
20
20
|
# $: << File.expand_path(ROOT_PATH)
|
21
21
|
|
22
|
+
$: << File.expand_path('~/progs/ruby/gems/seqtrimnext/lib/')
|
23
|
+
$: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib')
|
24
|
+
|
22
25
|
require 'seqtrimnext'
|
23
26
|
|
24
27
|
$SEQTRIM_PATH = ROOT_PATH
|
@@ -37,7 +40,7 @@ ENV['BLASTDB']=$FORMATTED_DB_PATH
|
|
37
40
|
OUTPUT_PATH='output_files'
|
38
41
|
|
39
42
|
puts "FORMATTED_DB_BLAST in workers: #{$FORMATTED_DB_PATH}"
|
40
|
-
|
43
|
+
|
41
44
|
|
42
45
|
require 'scbi_mapreduce'
|
43
46
|
require 'params'
|
@@ -10,8 +10,13 @@ class GraphStats
|
|
10
10
|
init_stats=initial_stats
|
11
11
|
|
12
12
|
if init_stats.nil?
|
13
|
-
|
14
|
-
|
13
|
+
if File.exists?(File.join(OUTPUT_PATH,'initial_stats.json'))
|
14
|
+
r=File.read(File.join(OUTPUT_PATH,'initial_stats.json'))
|
15
|
+
init_stats= JSON::parse(r)
|
16
|
+
else
|
17
|
+
init_stats=[]
|
18
|
+
end
|
19
|
+
|
15
20
|
end
|
16
21
|
# puts init_stats.to_json
|
17
22
|
#r=File.read(File.join(File.dirname(__FILE__),'stats.json'))
|
@@ -5,8 +5,6 @@
|
|
5
5
|
|
6
6
|
require 'extract_stats'
|
7
7
|
|
8
|
-
# $: << File.expand_path('~/progs/ruby/gems/scbi_drb/lib')
|
9
|
-
|
10
8
|
require 'scbi_mapreduce'
|
11
9
|
require 'seqtrim_work_manager'
|
12
10
|
require 'action_manager'
|
@@ -340,6 +338,9 @@ class Seqtrim
|
|
340
338
|
# server = ScbiMapreduce::Manager.new(ip,port, workers, SeqtrimWorkManager,custom_worker_file, STDOUT,'~/.seqtrimnext')
|
341
339
|
server = ScbiMapreduce::Manager.new(ip,port, workers, SeqtrimWorkManager,custom_worker_file, STDOUT,$SEQTRIMNEXT_INIT)
|
342
340
|
server.chunk_size=chunk_size
|
341
|
+
server.checkpointing=true
|
342
|
+
server.keep_order=true
|
343
|
+
server.retry_stuck_jobs=true
|
343
344
|
server.start_server
|
344
345
|
|
345
346
|
# close sequence reader
|
@@ -25,7 +25,7 @@ class PluginAbAdapters < Plugin
|
|
25
25
|
def do_blasts(seqs)
|
26
26
|
# find MIDS with less results than max_target_seqs value
|
27
27
|
blast=BatchBlast.new("-db #{@params.get_param('adapters_ab_db')}",'blastn'," -task blastn-short -perc_identity #{@params.get_param('blast_percent_ab')} -word_size #{MIN_ADAPTER_SIZE}")
|
28
|
-
$LOG.
|
28
|
+
$LOG.debug('BLAST:'+blast.get_blast_cmd)
|
29
29
|
|
30
30
|
fastas=[]
|
31
31
|
|
@@ -61,7 +61,7 @@ class PluginAbAdapters < Plugin
|
|
61
61
|
raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
|
62
62
|
end
|
63
63
|
|
64
|
-
$LOG.
|
64
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for adapters into the sequence"
|
65
65
|
|
66
66
|
|
67
67
|
# blast=BatchBlast.new("-db #{File.join($FORMATTED_DB_PATH,'adapters_ab.fasta')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_ab')} -perc_identity #{@params.get_param('blast_percent_ab')} -word_size #{MIN_ADAPTER_SIZE}")
|
@@ -25,7 +25,7 @@ class PluginAdapters < Plugin
|
|
25
25
|
def do_blasts(seqs)
|
26
26
|
# find MIDS with less results than max_target_seqs value
|
27
27
|
blast=BatchBlast.new("-db #{@params.get_param('adapters_db')}",'blastn'," -task blastn-short -perc_identity #{@params.get_param('blast_percent_adapters')} -word_size #{MIN_ADAPTER_SIZE}")
|
28
|
-
$LOG.
|
28
|
+
$LOG.debug('BLAST:'+blast.get_blast_cmd)
|
29
29
|
|
30
30
|
fastas=[]
|
31
31
|
|
@@ -64,7 +64,7 @@ class PluginAdapters < Plugin
|
|
64
64
|
# raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
|
65
65
|
end
|
66
66
|
|
67
|
-
$LOG.
|
67
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for adapters into the sequence"
|
68
68
|
|
69
69
|
|
70
70
|
# blast=BatchBlast.new("-db #{File.join($FORMATTED_DB_PATH,'adapters.fasta')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_adapters')} -perc_identity #{@params.get_param('blast_percent_adapters')} -word_size #{MIN_ADAPTER_SIZE}")
|
@@ -0,0 +1,165 @@
|
|
1
|
+
require "plugin"
|
2
|
+
|
3
|
+
########################################################
|
4
|
+
# Author: Almudena Bocinos Rioboo
|
5
|
+
#
|
6
|
+
# Defines the main methods that are necessary to execute PluginAdapters
|
7
|
+
# Inherit: Plugin
|
8
|
+
########################################################
|
9
|
+
|
10
|
+
class PluginAdaptersOld < Plugin
|
11
|
+
|
12
|
+
def get_type_adapter(p_start,p_end,seq)
|
13
|
+
#if q_beg is nearer the left, add adapter action by the left,
|
14
|
+
#if q_end esta is nearer the right , add adapter action by the right
|
15
|
+
#NOTE: If the adapter is very near from left and rigth,
|
16
|
+
#then the sequence isn't valid, because almost sequence is adapter.
|
17
|
+
|
18
|
+
|
19
|
+
v1= p_end.to_i
|
20
|
+
v2= p_start.to_i
|
21
|
+
|
22
|
+
# puts " startadapter #{v2} endadapter #{v1} insert_start #{seq.insert_start} insert_end #{seq.insert_end}"
|
23
|
+
|
24
|
+
# puts " #{v2+seq.insert_start} <? #{seq.seq_fasta.length - v1 - 1 + seq.seq_fasta_orig.length - seq.insert_end-1}"
|
25
|
+
if (v2+seq.insert_start < (seq.seq_fasta.length - v1 - 1+ seq.seq_fasta_orig.length - seq.insert_end-1)) #IF THE NEAREST ONE IS THE LEFT
|
26
|
+
type = "ActionLeftAdapter"
|
27
|
+
|
28
|
+
else
|
29
|
+
type = "ActionRightAdapter"
|
30
|
+
|
31
|
+
end
|
32
|
+
return type
|
33
|
+
end
|
34
|
+
|
35
|
+
|
36
|
+
def cut_by_right(adapter,seq)
|
37
|
+
|
38
|
+
left_size = adapter.q_beg-seq.insert_start+1
|
39
|
+
right_size = seq.insert_end-adapter.q_end+1
|
40
|
+
left_size=0 if (left_size<0)
|
41
|
+
right_size=0 if (right_size<0)
|
42
|
+
|
43
|
+
return (left_size>(right_size/2).to_i)
|
44
|
+
|
45
|
+
end
|
46
|
+
|
47
|
+
#Begins the plugin1's execution to warn that there are contaminants in the sequence "seq"
|
48
|
+
def execute(seqs)
|
49
|
+
blasts= do_blasts(seqs)
|
50
|
+
|
51
|
+
seqs.each_with_index do |s,i|
|
52
|
+
exec_seq(s,blasts.querys[i])
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
def do_blasts(seqs)
|
57
|
+
# find MIDS with less results than max_target_seqs value
|
58
|
+
blast=BatchBlast.new("-db #{@params.get_param('adapters_db')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_adapters')} -perc_identity #{@params.get_param('blast_percent_adapters')}")
|
59
|
+
$LOG.debug('BLAST:'+blast.get_blast_cmd)
|
60
|
+
|
61
|
+
fastas=[]
|
62
|
+
|
63
|
+
seqs.each do |seq|
|
64
|
+
fastas.push ">"+seq.seq_name
|
65
|
+
fastas.push seq.seq_fasta
|
66
|
+
end
|
67
|
+
|
68
|
+
# fastas=fastas.join("\n")
|
69
|
+
|
70
|
+
blast_table_results = blast.do_blast(fastas)
|
71
|
+
|
72
|
+
# puts blast_table_results.inspect
|
73
|
+
|
74
|
+
return blast_table_results
|
75
|
+
end
|
76
|
+
|
77
|
+
|
78
|
+
def exec_seq(seq,blast_query)
|
79
|
+
if blast_query.query_id != seq.seq_name
|
80
|
+
raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
|
81
|
+
end
|
82
|
+
|
83
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for adapters into the sequence"
|
84
|
+
|
85
|
+
|
86
|
+
# blast=BatchBlast.new("-db #{File.join($FORMATTED_DB_PATH,'adapters.fasta')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_adapters')} -perc_identity #{@params.get_param('blast_percent_adapters')}")
|
87
|
+
|
88
|
+
# blast with only one sequence, no with many sequences from a database
|
89
|
+
#---------------------------------------------------------------------
|
90
|
+
|
91
|
+
# blast_table_results = blast.do_blast(seq.seq_fasta) #rise seq to adapterss executing over blast
|
92
|
+
|
93
|
+
#blast_table_results = BlastTableResult.new(res)
|
94
|
+
|
95
|
+
# blast_table_results.inspect
|
96
|
+
|
97
|
+
adapters=[]
|
98
|
+
# blast_table_results.querys.each do |query| # first round to save adapters without overlap
|
99
|
+
merge_hits(blast_query,adapters)
|
100
|
+
# end
|
101
|
+
|
102
|
+
begin
|
103
|
+
adapters2=adapters # second round to save adapters without overlap
|
104
|
+
adapters = []
|
105
|
+
merge_hits(adapters2,adapters)
|
106
|
+
end until (adapters2.count == adapters.count)
|
107
|
+
|
108
|
+
actions=[]
|
109
|
+
adapter_size=0
|
110
|
+
# @stats['adapter_size']={}
|
111
|
+
adapters.each do |ad| # adds the correspondent action to the sequence
|
112
|
+
|
113
|
+
type = get_type_adapter(ad.q_beg,ad.q_end,seq)
|
114
|
+
a = seq.new_action(ad.q_beg,ad.q_end,type)
|
115
|
+
# puts " state left_action #{a.left_action} right_action #{a.right_action}"
|
116
|
+
|
117
|
+
|
118
|
+
adapter_size=ad.q_end-ad.q_beg+1
|
119
|
+
|
120
|
+
if cut_by_right(ad,seq)
|
121
|
+
|
122
|
+
# puts "action right end1 #{seq.insert_end}"
|
123
|
+
|
124
|
+
a.right_action=true #mark rigth action to get the left insert
|
125
|
+
else
|
126
|
+
|
127
|
+
# puts " cut1 by left #{seq.insert_start} ad #{ad.q_beg+seq.insert_start} #{ad.q_end+seq.insert_start}"
|
128
|
+
|
129
|
+
a.left_action = true #mark left action to get the right insert
|
130
|
+
|
131
|
+
end
|
132
|
+
|
133
|
+
a.message = ad.subject_id
|
134
|
+
a.reversed = ad.reversed
|
135
|
+
actions.push a
|
136
|
+
|
137
|
+
# @stats[:adapter_size]={adapter_size => 1}
|
138
|
+
add_stats('adapter_size',adapter_size)
|
139
|
+
|
140
|
+
end
|
141
|
+
seq.add_actions(actions)
|
142
|
+
#
|
143
|
+
end
|
144
|
+
|
145
|
+
#Returns an array with the errors due to parameters are missing
|
146
|
+
def self.check_params(params)
|
147
|
+
errors=[]
|
148
|
+
|
149
|
+
comment='Blast E-value used as cut-off when searching for adapters or primers'
|
150
|
+
default_value = 1e-6
|
151
|
+
params.check_param(errors,'blast_evalue_adapters','Float',default_value,comment)
|
152
|
+
|
153
|
+
comment='Minimum required identity (%) for a reliable adapter'
|
154
|
+
default_value = 95
|
155
|
+
params.check_param(errors,'blast_percent_adapters','Integer',default_value,comment)
|
156
|
+
|
157
|
+
comment='Path for adapter database'
|
158
|
+
default_value = File.join($FORMATTED_DB_PATH,'adapters.fasta')
|
159
|
+
params.check_param(errors,'adapters_db','DB',default_value,comment)
|
160
|
+
|
161
|
+
return errors
|
162
|
+
end
|
163
|
+
|
164
|
+
|
165
|
+
end
|
@@ -25,7 +25,7 @@ class PluginAmplicons < Plugin
|
|
25
25
|
def do_blasts(seqs)
|
26
26
|
# find MIDS with less results than max_target_seqs value
|
27
27
|
blast=BatchBlast.new("-db #{@params.get_param('primers_db')}",'blastn'," -task blastn-short -perc_identity #{@params.get_param('blast_percent_primers')}")
|
28
|
-
$LOG.
|
28
|
+
$LOG.debug('BLAST:'+blast.get_blast_cmd)
|
29
29
|
|
30
30
|
fastas=[]
|
31
31
|
|
@@ -49,7 +49,7 @@ class PluginAmplicons < Plugin
|
|
49
49
|
raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
|
50
50
|
end
|
51
51
|
|
52
|
-
$LOG.
|
52
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for primers into the sequence"
|
53
53
|
|
54
54
|
# puts blast_query.inspect
|
55
55
|
|
@@ -36,7 +36,7 @@ class PluginContaminants < Plugin
|
|
36
36
|
|
37
37
|
blast = BatchBlast.new("-db #{@params.get_param('contaminants_db')}",'blastn'," -task blastn -evalue #{@params.get_param('blast_evalue_contaminants')} -perc_identity #{@params.get_param('blast_percent_contaminants')} -culling_limit 1") #get contaminants -max_target_seqs #{MAX_TARGETS_SEQS}
|
38
38
|
|
39
|
-
$LOG.
|
39
|
+
$LOG.debug('BLAST:'+blast.get_blast_cmd(:xml))
|
40
40
|
|
41
41
|
fastas=[]
|
42
42
|
|
@@ -67,7 +67,7 @@ class PluginContaminants < Plugin
|
|
67
67
|
# raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
|
68
68
|
end
|
69
69
|
|
70
|
-
$LOG.
|
70
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for contaminants into the sequence"
|
71
71
|
|
72
72
|
|
73
73
|
#blast = BatchBlast.new('-db DB/formatted/contaminants.fasta','blastn',' -task blastn -evalue 1e-10 -perc_identity 95') #get contaminants
|
@@ -143,7 +143,7 @@ class PluginContaminants < Plugin
|
|
143
143
|
|
144
144
|
end
|
145
145
|
else
|
146
|
-
$LOG.
|
146
|
+
$LOG.debug('Contaminant ignored due to genus match: '+c.definition)
|
147
147
|
end
|
148
148
|
end
|
149
149
|
|
@@ -286,7 +286,7 @@ class PluginExtractInserts < Plugin
|
|
286
286
|
|
287
287
|
|
288
288
|
def exec_seq(seq)
|
289
|
-
$LOG.
|
289
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: extract inserts"
|
290
290
|
|
291
291
|
# puts "INSERTO ANTES LINKER INSERT:"+seq.seq_fasta
|
292
292
|
|
@@ -319,7 +319,7 @@ class PluginFindPolyAt < Plugin
|
|
319
319
|
|
320
320
|
|
321
321
|
def exec_seq(seq)
|
322
|
-
$LOG.
|
322
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for strings of polyAT's into the sequence with a length indicated by the param <poly_at_length>"
|
323
323
|
|
324
324
|
find_polyT(seq)
|
325
325
|
find_polyA(seq)
|
@@ -21,7 +21,7 @@ def execute(seqs)
|
|
21
21
|
|
22
22
|
|
23
23
|
def exec_seq(seq)
|
24
|
-
$LOG.
|
24
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: searching sequence repeated at input file"
|
25
25
|
|
26
26
|
fasta_input=@params.get_param('truncated_input_file')
|
27
27
|
|
@@ -149,7 +149,7 @@ class PluginIndeterminations < Plugin
|
|
149
149
|
|
150
150
|
|
151
151
|
def exec_seq(seq)
|
152
|
-
$LOG.
|
152
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: removing indeterminations N+"
|
153
153
|
|
154
154
|
actions=[]
|
155
155
|
|
@@ -21,7 +21,7 @@ class PluginKey < Plugin
|
|
21
21
|
|
22
22
|
|
23
23
|
def exec_seq(seq)
|
24
|
-
$LOG.
|
24
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: marking key into the sequence"
|
25
25
|
# blast_table_results.inspect
|
26
26
|
|
27
27
|
actions=[]
|
@@ -83,7 +83,7 @@ class PluginLinker < Plugin
|
|
83
83
|
# find MIDS with less results than max_target_seqs value
|
84
84
|
blast = BatchBlast.new("-db #{@params.get_param('linkers_db')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_linkers')} -perc_identity #{@params.get_param('blast_percent_linkers')}") #get linkers
|
85
85
|
|
86
|
-
$LOG.
|
86
|
+
$LOG.debug('BLAST:'+blast.get_blast_cmd)
|
87
87
|
|
88
88
|
fastas=[]
|
89
89
|
|
@@ -106,7 +106,7 @@ class PluginLinker < Plugin
|
|
106
106
|
if blast_query.query_id != seq.seq_name
|
107
107
|
raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
|
108
108
|
end
|
109
|
-
$LOG.
|
109
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for linker into the sequence"
|
110
110
|
|
111
111
|
# key_beg,key_end=search_key(seq,0,3) if false
|
112
112
|
# blast = BatchBlast.new("-subject #{File.join($FORMATTED_DB_PATH,'linkers.fasta')}",'blastn'," -task blastn -evalue #{@params.get_param('blast_evalue_linkers')} -perc_identity #{@params.get_param('blast_percent_linkers')}") #get linkers
|
@@ -22,7 +22,7 @@ class PluginLowHighSize < Plugin
|
|
22
22
|
|
23
23
|
|
24
24
|
def exec_seq(seq)
|
25
|
-
$LOG.
|
25
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low or high size of the sequence"
|
26
26
|
|
27
27
|
min_size = @params.get_param('min_sequence_size_raw').to_i #min_size is: mean - 2dev
|
28
28
|
max_size = @params.get_param('max_sequence_size_raw').to_i #max_size is: mean + 2dev
|
@@ -278,7 +278,7 @@ class PluginLowQuality < Plugin
|
|
278
278
|
if ((self.class.to_s=='PluginLowQuality') && seq.seq_qual.nil? )
|
279
279
|
$LOG.error " Quality File haven't been provided. It's impossible to execute " + self.class.to_s
|
280
280
|
elsif (seq.seq_qual.size>0)
|
281
|
-
$LOG.
|
281
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence"
|
282
282
|
|
283
283
|
@low=@params.get_param('min_quality').to_i
|
284
284
|
|
@@ -29,7 +29,7 @@ class PluginMids < Plugin
|
|
29
29
|
def do_blasts(seqs)
|
30
30
|
# find MIDS with less results than max_target_seqs value
|
31
31
|
blast = BatchBlast.new("-db #{@params.get_param('mids_db')}",'blastn'," -task blastn-short -perc_identity #{@params.get_param('blast_percent_mids')} -max_target_seqs 4 ") #get mids
|
32
|
-
$LOG.
|
32
|
+
$LOG.debug('BLAST:'+blast.get_blast_cmd)
|
33
33
|
|
34
34
|
fastas=[]
|
35
35
|
|
@@ -54,7 +54,7 @@ class PluginMids < Plugin
|
|
54
54
|
end
|
55
55
|
|
56
56
|
|
57
|
-
$LOG.
|
57
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for mids into the sequence"
|
58
58
|
|
59
59
|
|
60
60
|
# blast_table_results = blast.do_blast(seq.seq_fasta[0..SIZE_SEARCH_MID]) # execute blast to find mids
|
@@ -27,7 +27,7 @@ class PluginRemAditArtifacts < Plugin
|
|
27
27
|
|
28
28
|
def exec_seq(seq)
|
29
29
|
|
30
|
-
$LOG.
|
30
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: removing artifacts into the sequence"
|
31
31
|
seq2 = seq.seq_fasta
|
32
32
|
first = 0
|
33
33
|
last = seq2.size-1
|
@@ -96,7 +96,7 @@ class PluginShortInsert < Plugin
|
|
96
96
|
|
97
97
|
def exec_seq(seq)
|
98
98
|
|
99
|
-
$LOG.
|
99
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking if insert of sequence has enought size"
|
100
100
|
# puts "inserto #{seq.insert_start}, #{seq.insert_end} size #{seq.seq_fasta.size}"
|
101
101
|
|
102
102
|
if (seq.seq_fasta.size > 0)
|
@@ -183,7 +183,7 @@ class PluginShortInsert < Plugin
|
|
183
183
|
|
184
184
|
#Begins the plugin1's execution to warn if the inserted is so short
|
185
185
|
def execute_no_cut_quality(seq)
|
186
|
-
$LOG.
|
186
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking if insert of sequence has enought size"
|
187
187
|
|
188
188
|
|
189
189
|
|
@@ -38,7 +38,7 @@ class PluginVectors < Plugin
|
|
38
38
|
# find MIDS with less results than max_target_seqs value
|
39
39
|
blast = BatchBlast.new("-db #{@params.get_param('vectors_db')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_vectors')} -perc_identity #{@params.get_param('blast_percent_vectors')} -culling_limit 1") #get vectors
|
40
40
|
|
41
|
-
$LOG.
|
41
|
+
$LOG.debug('BLAST:'+blast.get_blast_cmd)
|
42
42
|
|
43
43
|
fastas=[]
|
44
44
|
|
@@ -62,7 +62,7 @@ class PluginVectors < Plugin
|
|
62
62
|
# raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
|
63
63
|
end
|
64
64
|
|
65
|
-
$LOG.
|
65
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for vectors into the sequence "
|
66
66
|
|
67
67
|
#blast contra contaminantes
|
68
68
|
|
@@ -0,0 +1,8 @@
|
|
1
|
+
# ======================================
|
2
|
+
# General parameters
|
3
|
+
# ======================================
|
4
|
+
|
5
|
+
|
6
|
+
plugin_list = PluginLowHighSize,PluginIndeterminations,PluginFindPolyAt,PluginContaminants,PluginLowQuality,PluginLowComplexity
|
7
|
+
|
8
|
+
contaminants_db="contaminants.fasta cont_ribosome.fasta"
|
@@ -15,8 +15,9 @@ def add_stats(h_stats)
|
|
15
15
|
add_stats.each do |property,hash_value|
|
16
16
|
h[plugin_hash][property]={} if h[plugin_hash][property].nil?
|
17
17
|
|
18
|
+
# values need to be in string format because of later loading from json file
|
18
19
|
hash_value.each do |value, count|
|
19
|
-
h[plugin_hash][property][value]=(h[plugin_hash][property][value]||0) + count
|
20
|
+
h[plugin_hash][property][value.to_s]=(h[plugin_hash][property][value.to_s]||0) + count
|
20
21
|
end
|
21
22
|
end
|
22
23
|
end
|
data/lib/seqtrimnext.rb
CHANGED
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: seqtrimnext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 2.0.
|
5
|
+
version: 2.0.46
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Dario Guerrero & Almudena Bocinos
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2012-
|
13
|
+
date: 2012-04-13 00:00:00 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: narray
|
@@ -143,9 +143,11 @@ executables:
|
|
143
143
|
- extract_seqs_from_fastq.rb
|
144
144
|
- fasta2fastq.rb
|
145
145
|
- fastq2fasta.rb
|
146
|
+
- filter_database.rb
|
146
147
|
- gen_qual.rb
|
147
148
|
- get_seq.rb
|
148
149
|
- group_by_range.rb
|
150
|
+
- join_big_illumina_paired.sh
|
149
151
|
- join_ilumina_paired.rb
|
150
152
|
- parse_amplicons.rb
|
151
153
|
- parse_json_results.rb
|
@@ -160,14 +162,16 @@ executables:
|
|
160
162
|
extensions: []
|
161
163
|
|
162
164
|
extra_rdoc_files:
|
165
|
+
- History.txt
|
163
166
|
- lib/seqtrimnext/templates/amplicons.txt
|
164
167
|
- lib/seqtrimnext/templates/genomics_454.txt
|
165
168
|
- lib/seqtrimnext/templates/genomics_454_with_paired.txt
|
169
|
+
- lib/seqtrimnext/templates/genomics_illumina.txt
|
166
170
|
- lib/seqtrimnext/templates/low_quality.txt
|
167
171
|
- lib/seqtrimnext/templates/low_quality_and_low_complexity.txt
|
168
172
|
- lib/seqtrimnext/templates/transcriptomics_454.txt
|
173
|
+
- lib/seqtrimnext/templates/transcriptomics_illumina.txt
|
169
174
|
- lib/seqtrimnext/templates/transcriptomics_plants.txt
|
170
|
-
- History.txt
|
171
175
|
- Manifest.txt
|
172
176
|
- PostInstall.txt
|
173
177
|
files:
|
@@ -177,9 +181,11 @@ files:
|
|
177
181
|
- bin/extract_seqs_from_fastq.rb
|
178
182
|
- bin/fasta2fastq.rb
|
179
183
|
- bin/fastq2fasta.rb
|
184
|
+
- bin/filter_database.rb
|
180
185
|
- bin/gen_qual.rb
|
181
186
|
- bin/get_seq.rb
|
182
187
|
- bin/group_by_range.rb
|
188
|
+
- bin/join_big_illumina_paired.sh
|
183
189
|
- bin/join_ilumina_paired.rb
|
184
190
|
- bin/parse_amplicons.rb
|
185
191
|
- bin/parse_json_results.rb
|
@@ -191,6 +197,7 @@ files:
|
|
191
197
|
- bin/split_fastq.rb
|
192
198
|
- bin/split_ilumina_paired.rb
|
193
199
|
- bin/split_paired.rb
|
200
|
+
- History.txt
|
194
201
|
- lib/seqtrimnext/actions/action_ab_adapter.rb
|
195
202
|
- lib/seqtrimnext/actions/action_ab_far_adapter.rb
|
196
203
|
- lib/seqtrimnext/actions/action_ab_left_adapter.rb
|
@@ -208,13 +215,13 @@ files:
|
|
208
215
|
- lib/seqtrimnext/actions/action_low_high_size.rb
|
209
216
|
- lib/seqtrimnext/actions/action_low_quality.rb
|
210
217
|
- lib/seqtrimnext/actions/action_mid.rb
|
218
|
+
- lib/seqtrimnext/actions/action_middle_adapter.rb
|
211
219
|
- lib/seqtrimnext/actions/action_multiple_linker.rb
|
212
220
|
- lib/seqtrimnext/actions/action_paired_reads.rb
|
213
221
|
- lib/seqtrimnext/actions/action_poly_a.rb
|
214
222
|
- lib/seqtrimnext/actions/action_poly_t.rb
|
215
223
|
- lib/seqtrimnext/actions/action_rem_adit_artifacts.rb
|
216
224
|
- lib/seqtrimnext/actions/action_right_adapter.rb
|
217
|
-
- lib/seqtrimnext/actions/action_middle_adapter.rb
|
218
225
|
- lib/seqtrimnext/actions/action_right_primer.rb
|
219
226
|
- lib/seqtrimnext/actions/action_short_insert.rb
|
220
227
|
- lib/seqtrimnext/actions/action_unexpected_poly_t.rb
|
@@ -244,6 +251,7 @@ files:
|
|
244
251
|
- lib/seqtrimnext/plugins/plugin.rb
|
245
252
|
- lib/seqtrimnext/plugins/plugin_ab_adapters.rb
|
246
253
|
- lib/seqtrimnext/plugins/plugin_adapters.rb
|
254
|
+
- lib/seqtrimnext/plugins/plugin_adapters_old.rb
|
247
255
|
- lib/seqtrimnext/plugins/plugin_amplicons.rb
|
248
256
|
- lib/seqtrimnext/plugins/plugin_contaminants.rb
|
249
257
|
- lib/seqtrimnext/plugins/plugin_extract_inserts.rb
|
@@ -262,9 +270,11 @@ files:
|
|
262
270
|
- lib/seqtrimnext/templates/amplicons.txt
|
263
271
|
- lib/seqtrimnext/templates/genomics_454.txt
|
264
272
|
- lib/seqtrimnext/templates/genomics_454_with_paired.txt
|
273
|
+
- lib/seqtrimnext/templates/genomics_illumina.txt
|
265
274
|
- lib/seqtrimnext/templates/low_quality.txt
|
266
275
|
- lib/seqtrimnext/templates/low_quality_and_low_complexity.txt
|
267
276
|
- lib/seqtrimnext/templates/transcriptomics_454.txt
|
277
|
+
- lib/seqtrimnext/templates/transcriptomics_illumina.txt
|
268
278
|
- lib/seqtrimnext/templates/transcriptomics_plants.txt
|
269
279
|
- lib/seqtrimnext/utils/extract_samples.rb
|
270
280
|
- lib/seqtrimnext/utils/fasta2xml.rb
|
@@ -276,7 +286,6 @@ files:
|
|
276
286
|
- lib/seqtrimnext/utils/recover_mid.rb
|
277
287
|
- lib/seqtrimnext/utils/string_utils.rb
|
278
288
|
- lib/seqtrimnext.rb
|
279
|
-
- History.txt
|
280
289
|
- Manifest.txt
|
281
290
|
- PostInstall.txt
|
282
291
|
- Rakefile
|