seqtrimnext 2.0.45 → 2.0.46
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/Manifest.txt +7 -2
- data/bin/filter_database.rb +39 -0
- data/bin/join_big_illumina_paired.sh +122 -0
- data/bin/seqtrimnext +2 -1
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +87 -121
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +4 -1
- data/lib/seqtrimnext/classes/graph_stats.rb +7 -2
- data/lib/seqtrimnext/classes/seqtrim.rb +3 -2
- data/lib/seqtrimnext/classes/sequence_with_action.rb +1 -1
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +2 -2
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +2 -2
- data/lib/seqtrimnext/plugins/plugin_adapters_old.rb +165 -0
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +2 -2
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +3 -3
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +1 -1
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +1 -1
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +1 -1
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +1 -1
- data/lib/seqtrimnext/plugins/plugin_key.rb +1 -1
- data/lib/seqtrimnext/plugins/plugin_linker.rb +2 -2
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +1 -1
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +1 -1
- data/lib/seqtrimnext/plugins/plugin_mids.rb +2 -2
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +1 -1
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +2 -2
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +2 -2
- data/lib/seqtrimnext/templates/genomics_illumina.txt +5 -0
- data/lib/seqtrimnext/templates/transcriptomics_illumina.txt +8 -0
- data/lib/seqtrimnext/utils/hash_stats.rb +2 -1
- data/lib/seqtrimnext.rb +1 -1
- metadata +14 -5
data/History.txt
CHANGED
data/Manifest.txt
CHANGED
@@ -4,9 +4,11 @@ bin/extract_seqs_from_fasta.rb
|
|
4
4
|
bin/extract_seqs_from_fastq.rb
|
5
5
|
bin/fasta2fastq.rb
|
6
6
|
bin/fastq2fasta.rb
|
7
|
+
bin/filter_database.rb
|
7
8
|
bin/gen_qual.rb
|
8
9
|
bin/get_seq.rb
|
9
10
|
bin/group_by_range.rb
|
11
|
+
bin/join_big_illumina_paired.sh
|
10
12
|
bin/join_ilumina_paired.rb
|
11
13
|
bin/parse_amplicons.rb
|
12
14
|
bin/parse_json_results.rb
|
@@ -18,6 +20,7 @@ bin/seqtrimnext
|
|
18
20
|
bin/split_fastq.rb
|
19
21
|
bin/split_ilumina_paired.rb
|
20
22
|
bin/split_paired.rb
|
23
|
+
History.txt
|
21
24
|
lib/seqtrimnext/actions/action_ab_adapter.rb
|
22
25
|
lib/seqtrimnext/actions/action_ab_far_adapter.rb
|
23
26
|
lib/seqtrimnext/actions/action_ab_left_adapter.rb
|
@@ -35,13 +38,13 @@ lib/seqtrimnext/actions/action_low_complexity.rb
|
|
35
38
|
lib/seqtrimnext/actions/action_low_high_size.rb
|
36
39
|
lib/seqtrimnext/actions/action_low_quality.rb
|
37
40
|
lib/seqtrimnext/actions/action_mid.rb
|
41
|
+
lib/seqtrimnext/actions/action_middle_adapter.rb
|
38
42
|
lib/seqtrimnext/actions/action_multiple_linker.rb
|
39
43
|
lib/seqtrimnext/actions/action_paired_reads.rb
|
40
44
|
lib/seqtrimnext/actions/action_poly_a.rb
|
41
45
|
lib/seqtrimnext/actions/action_poly_t.rb
|
42
46
|
lib/seqtrimnext/actions/action_rem_adit_artifacts.rb
|
43
47
|
lib/seqtrimnext/actions/action_right_adapter.rb
|
44
|
-
lib/seqtrimnext/actions/action_middle_adapter.rb
|
45
48
|
lib/seqtrimnext/actions/action_right_primer.rb
|
46
49
|
lib/seqtrimnext/actions/action_short_insert.rb
|
47
50
|
lib/seqtrimnext/actions/action_unexpected_poly_t.rb
|
@@ -71,6 +74,7 @@ lib/seqtrimnext/classes/sequence_with_action.rb
|
|
71
74
|
lib/seqtrimnext/plugins/plugin.rb
|
72
75
|
lib/seqtrimnext/plugins/plugin_ab_adapters.rb
|
73
76
|
lib/seqtrimnext/plugins/plugin_adapters.rb
|
77
|
+
lib/seqtrimnext/plugins/plugin_adapters_old.rb
|
74
78
|
lib/seqtrimnext/plugins/plugin_amplicons.rb
|
75
79
|
lib/seqtrimnext/plugins/plugin_contaminants.rb
|
76
80
|
lib/seqtrimnext/plugins/plugin_extract_inserts.rb
|
@@ -89,9 +93,11 @@ lib/seqtrimnext/plugins/plugin_vectors.rb
|
|
89
93
|
lib/seqtrimnext/templates/amplicons.txt
|
90
94
|
lib/seqtrimnext/templates/genomics_454.txt
|
91
95
|
lib/seqtrimnext/templates/genomics_454_with_paired.txt
|
96
|
+
lib/seqtrimnext/templates/genomics_illumina.txt
|
92
97
|
lib/seqtrimnext/templates/low_quality.txt
|
93
98
|
lib/seqtrimnext/templates/low_quality_and_low_complexity.txt
|
94
99
|
lib/seqtrimnext/templates/transcriptomics_454.txt
|
100
|
+
lib/seqtrimnext/templates/transcriptomics_illumina.txt
|
95
101
|
lib/seqtrimnext/templates/transcriptomics_plants.txt
|
96
102
|
lib/seqtrimnext/utils/extract_samples.rb
|
97
103
|
lib/seqtrimnext/utils/fasta2xml.rb
|
@@ -103,7 +109,6 @@ lib/seqtrimnext/utils/load_qual_in_hash.rb
|
|
103
109
|
lib/seqtrimnext/utils/recover_mid.rb
|
104
110
|
lib/seqtrimnext/utils/string_utils.rb
|
105
111
|
lib/seqtrimnext.rb
|
106
|
-
History.txt
|
107
112
|
Manifest.txt
|
108
113
|
PostInstall.txt
|
109
114
|
Rakefile
|
@@ -0,0 +1,39 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'scbi_fasta'
|
4
|
+
|
5
|
+
if ARGV.count!=3
|
6
|
+
puts "Usage: #{File.basename($0)} database min_size name_list"
|
7
|
+
exit
|
8
|
+
end
|
9
|
+
min_size = ARGV[1].to_i
|
10
|
+
|
11
|
+
# read keywords
|
12
|
+
keywords=File.read(ARGV[2]).split("\n")
|
13
|
+
|
14
|
+
# convert all to upcase
|
15
|
+
keywords.map { |keyword| keyword.upcase!}
|
16
|
+
|
17
|
+
# puts "Search keywords"
|
18
|
+
# keywords.each { |keyword| puts keyword}
|
19
|
+
|
20
|
+
fqr=FastaQualFile.new(ARGV[0])
|
21
|
+
|
22
|
+
all=[]
|
23
|
+
|
24
|
+
fqr.each do |n,s,c|
|
25
|
+
keywords.each do |keyword|
|
26
|
+
if s.length<=min_size
|
27
|
+
# all+=c.split(" ")
|
28
|
+
if c.upcase.index(keyword)
|
29
|
+
# puts "[#{s.length.to_s}] - #{n} - #{c}"
|
30
|
+
puts ">#{n} #{c}\n#{s}"
|
31
|
+
break
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
# puts all.sort.uniq.reject{|e| e=~/\d/}
|
38
|
+
|
39
|
+
fqr.close
|
@@ -0,0 +1,122 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
# Sort two big Illumina files from a paired-end experiment and then split them by name: sequences common to both go to paired output files, and sequences not in common go to separate files.
|
4
|
+
|
5
|
+
# cat $1 | awk '{split($0, a, " "); n++; if (n%1==0){printf("%s\t",a[1]);}; printf("%s",$0); if(n%4==0) { printf("\n");} else { printf("\t");} }'
|
6
|
+
#
|
7
|
+
# exit
|
8
|
+
|
9
|
+
if [ "$#" -lt 3 ]; # numeric test; "<" inside [ ] is a redirection. tmp_dir ($4) is optional and defaults to the current directory below
|
10
|
+
then
|
11
|
+
echo ""
|
12
|
+
echo "Use: $0 file1.fastq file2.fastq base_output_name tmp_dir"
|
13
|
+
echo ""
|
14
|
+
exit
|
15
|
+
fi
|
16
|
+
|
17
|
+
base_name=$3
|
18
|
+
|
19
|
+
if [[ -z "$base_name" ]]; then
|
20
|
+
echo "Use a : $base_name doesn't exists"
|
21
|
+
exit -1
|
22
|
+
fi
|
23
|
+
|
24
|
+
tmp_dir=$4
|
25
|
+
|
26
|
+
if [[ -z "$4" ]]; then
|
27
|
+
tmp_dir=`pwd`
|
28
|
+
fi
|
29
|
+
|
30
|
+
if [[ ! -e "$tmp_dir" ]]; then
|
31
|
+
echo "Tmp dir: $4 doesn't exists"
|
32
|
+
exit -1
|
33
|
+
fi
|
34
|
+
|
35
|
+
echo "Using TMPDIR $tmp_dir"
|
36
|
+
|
37
|
+
f1_path=$1
|
38
|
+
f2_path=$2
|
39
|
+
|
40
|
+
f1_name=`basename $1`
|
41
|
+
f2_name=`basename $2`
|
42
|
+
|
43
|
+
f1_tmp="$tmp_dir/${f1_name}"
|
44
|
+
f2_tmp="$tmp_dir/${f2_name}"
|
45
|
+
|
46
|
+
common_names="$tmp_dir/comm.names"
|
47
|
+
|
48
|
+
only_in_1="$tmp_dir/only_in_1.txt"
|
49
|
+
only_in_2="$tmp_dir/only_in_2.txt"
|
50
|
+
in_both="$tmp_dir/in_both.txt"
|
51
|
+
|
52
|
+
|
53
|
+
echo "Starting sorting"
|
54
|
+
|
55
|
+
if [[ ! -e "$f1_tmp.sorted" ]]; then
|
56
|
+
echo "Sorting $f1_name"
|
57
|
+
cat $f1_path | awk '{split($0, a, " "); sub(/\/1$/,"\t", a[1]); n++; if (n%4==1){printf("%s",a[1]);}; printf("%s",$0); if(n%4==0) { printf("\n");} else { printf("\t");} }' | sort -T $tmp_dir -k1,1 -t $'\t' > $f1_tmp.sorted &
|
58
|
+
|
59
|
+
fi
|
60
|
+
|
61
|
+
if [[ ! -e "$f2_tmp.sorted" ]]; then
|
62
|
+
echo "Sorting $f2_name"
|
63
|
+
cat $f2_path | awk '{split($0, a, " "); sub(/\/2$/,"\t", a[1]); n++; if (n%4==1){printf("%s",a[1]);}; printf("%s",$0); if(n%4==0) { printf("\n");} else { printf("\t");} }' | sort -T $tmp_dir -k1,1 -t $'\t' > $f2_tmp.sorted &
|
64
|
+
fi
|
65
|
+
wait
|
66
|
+
|
67
|
+
echo "Starting name extraction"
|
68
|
+
if [[ ! -e "$f1_tmp.names" ]]; then
|
69
|
+
echo "Extracting names from $f1_tmp.sorted"
|
70
|
+
# cat $1.sorted | cut -f1 | sed 's/\(.*\)\/1$/\1/' > $1.names &
|
71
|
+
cat $f1_tmp.sorted | cut -f1 > $f1_tmp.names &
|
72
|
+
fi
|
73
|
+
if [[ ! -e "$f2_tmp.names" ]]; then
|
74
|
+
echo "Extracting names from $f2_tmp.sorted"
|
75
|
+
cat $f2_tmp.sorted | cut -f1 > $f2_tmp.names &
|
76
|
+
fi
|
77
|
+
wait
|
78
|
+
|
79
|
+
echo "Starting names comparison"
|
80
|
+
if [[ ! -e "$common_names" ]]; then
|
81
|
+
echo "Making comm file"
|
82
|
+
# diff $1.names $2.names > names.diff
|
83
|
+
comm $f1_tmp.names $f2_tmp.names > $common_names
|
84
|
+
fi
|
85
|
+
|
86
|
+
echo "Starting names extraction"
|
87
|
+
# grep '^>' names.diff | cut -d ' ' -f2 | awk '{ printf("%s/2\n",$0) }' > only_in_2.txt &
|
88
|
+
# grep '^<' names.diff | cut -d ' ' -f2 | awk '{ printf("%s/1\n",$0) }' > only_in_1.txt &
|
89
|
+
|
90
|
+
grep -P '^[^\t]' $common_names > $only_in_1 &
|
91
|
+
grep -P '^\t[^\t]' $common_names |tr -d "\t" > $only_in_2 &
|
92
|
+
grep -P '^\t\t[^\t]' $common_names |tr -d "\t" > $in_both &
|
93
|
+
wait
|
94
|
+
|
95
|
+
echo "Num seqs only in 1) $f1_name"
|
96
|
+
wc -l $only_in_1
|
97
|
+
|
98
|
+
echo "Num seqs only in 2) $f2_name"
|
99
|
+
wc -l $only_in_2
|
100
|
+
|
101
|
+
echo "Num seqs in both $f1_name and $f2_name"
|
102
|
+
wc -l $in_both
|
103
|
+
|
104
|
+
echo "Starting extracting seqs"
|
105
|
+
join -t $'\t' -1 1 -2 1 $only_in_1 $f1_tmp.sorted |cut -f 2,3,4,5| tr "\t" "\n" > ${base_name}_normal1.fastq &
|
106
|
+
join -t $'\t' -1 1 -2 1 $only_in_2 $f2_tmp.sorted |cut -f 2,3,4,5| tr "\t" "\n" > ${base_name}_normal2.fastq &
|
107
|
+
|
108
|
+
join -t $'\t' -1 1 -2 1 $in_both $f1_tmp.sorted |cut -f 2,3,4,5| tr "\t" "\n" > ${base_name}_paired1.fastq &
|
109
|
+
join -t $'\t' -1 1 -2 1 $in_both $f2_tmp.sorted |cut -f 2,3,4,5| tr "\t" "\n" > ${base_name}_paired2.fastq &
|
110
|
+
wait
|
111
|
+
|
112
|
+
rm $f1_tmp.names
|
113
|
+
rm $f2_tmp.names
|
114
|
+
|
115
|
+
rm $f1_tmp.sorted
|
116
|
+
rm $f2_tmp.sorted
|
117
|
+
|
118
|
+
rm $only_in_2
|
119
|
+
rm $only_in_1
|
120
|
+
rm $in_both
|
121
|
+
|
122
|
+
rm $common_names
|
data/bin/seqtrimnext
CHANGED
@@ -57,6 +57,7 @@
|
|
57
57
|
# $: << File.expand_path(ROOT_PATH)
|
58
58
|
|
59
59
|
$: << File.expand_path('~/progs/ruby/gems/seqtrimnext/lib/')
|
60
|
+
$: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib/')
|
60
61
|
|
61
62
|
require 'seqtrimnext'
|
62
63
|
|
@@ -272,7 +273,7 @@ optparse = OptionParser.new do |opts|
|
|
272
273
|
end
|
273
274
|
|
274
275
|
options[:skip_report] = false
|
275
|
-
opts.on( '-R', '--no-report', '
|
276
|
+
opts.on( '-R', '--no-report', 'Do not generate final PDF report (gem scbi_seqtrimnext_report required if you want to generate PDF report).' ) do
|
276
277
|
options[:skip_report] = true
|
277
278
|
end
|
278
279
|
|
@@ -18,29 +18,21 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
18
18
|
@@params= params
|
19
19
|
@@exit = false
|
20
20
|
|
21
|
+
@@ongoing_stats={}
|
22
|
+
@@ongoing_stats[:sequence_count] = 0
|
23
|
+
@@ongoing_stats[:smallest_sequence_size] = 900000000000000
|
24
|
+
@@ongoing_stats[:biggest_sequence_size] = 0
|
25
|
+
|
21
26
|
@@skip_output=skip_output
|
22
27
|
|
23
28
|
@@chunk_size = chunk_size
|
24
29
|
|
25
|
-
|
26
|
-
# puts "CHECKPOINT: #{self.checkpoint}\n"*20
|
27
|
-
|
28
|
-
checkpoint_exists=File.exists?('scbi_drb_checkpoint')
|
30
|
+
checkpoint_exists=File.exists?(ScbiMapreduce::CHECKPOINT_FILE)
|
29
31
|
|
30
32
|
# @@use_qual = !qual_path.nil? and File.exists?(qual_path)
|
31
33
|
@@open_mode='w'
|
32
34
|
if checkpoint_exists
|
33
35
|
@@open_mode = 'a'
|
34
|
-
if File.exists?(STATS_PATH)
|
35
|
-
# load stats
|
36
|
-
text = File.read(STATS_PATH)
|
37
|
-
|
38
|
-
# wipe text
|
39
|
-
# text=text.grep(/^\s*[^#]/).to_s
|
40
|
-
|
41
|
-
# decode json
|
42
|
-
@@full_stats = JSON.parse(text)
|
43
|
-
end
|
44
36
|
end
|
45
37
|
|
46
38
|
#open input file
|
@@ -91,67 +83,38 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
91
83
|
|
92
84
|
puts "FULL STATS:\n" +JSON.pretty_generate(@@full_stats)
|
93
85
|
|
86
|
+
# create stats file
|
94
87
|
f = File.open(STATS_PATH,'w')
|
95
88
|
f.puts JSON.pretty_generate(@@full_stats)
|
96
89
|
f.close
|
97
90
|
|
98
|
-
|
91
|
+
# if the initial stats file doesn't exist, create it
|
92
|
+
if !File.exists?(File.join(OUTPUT_PATH,'initial_stats.json'))
|
93
|
+
File.open(File.join(OUTPUT_PATH,'initial_stats.json'),'w') do |f|
|
94
|
+
f.puts JSON.pretty_generate(@@ongoing_stats)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
99
98
|
|
99
|
+
# load stats
|
100
|
+
r=File.read(STATS_PATH)
|
100
101
|
stats=JSON::parse(r)
|
101
102
|
|
103
|
+
|
102
104
|
|
105
|
+
# make graphs
|
103
106
|
gs=GraphStats.new(stats)
|
104
107
|
|
105
|
-
#gs=GraphStats.new(@@full_stats)
|
106
|
-
|
107
|
-
|
108
108
|
#close all files
|
109
|
-
|
110
|
-
# @@fqr.close
|
111
109
|
if @@use_json
|
112
110
|
@@json_output.close
|
113
111
|
end
|
114
112
|
@@errors_file.close
|
115
|
-
# @@rejected_output_file.close
|
116
|
-
|
117
|
-
# @@paired_output_files.each do |k,file|
|
118
|
-
# file.close
|
119
|
-
# end
|
120
113
|
|
121
114
|
@@files.each do |k,file|
|
122
115
|
file.close
|
123
116
|
end
|
124
117
|
|
125
|
-
# @@paired_qual_output_files.each do |k,file|
|
126
|
-
# file.close
|
127
|
-
# end
|
128
|
-
|
129
|
-
# @@sequences_output_files.each do |k,file|
|
130
|
-
# file.close
|
131
|
-
# end
|
132
|
-
#
|
133
|
-
# @@low_complexity_output_files.each do |k,file|
|
134
|
-
# file.close
|
135
|
-
# end
|
136
|
-
#
|
137
|
-
# @@sffinfo_files.each do |k,file|
|
138
|
-
# file.close
|
139
|
-
# end
|
140
|
-
#
|
141
|
-
# @@low_sffinfo_files.each do |k,file|
|
142
|
-
# file.close
|
143
|
-
# end
|
144
|
-
|
145
|
-
|
146
|
-
# @@qual_output_files.each do |k,file|
|
147
|
-
# file.close
|
148
|
-
# end
|
149
|
-
|
150
|
-
# more than one MID found
|
151
|
-
# if @@full_stats['mid_id'] && @@full_stats['mid_id'].count>1
|
152
|
-
#
|
153
|
-
# end
|
154
|
-
|
155
118
|
if File.exists?('scbi_drb_checkpoint')
|
156
119
|
File.delete('scbi_drb_checkpoint')
|
157
120
|
end
|
@@ -172,21 +135,71 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
172
135
|
end
|
173
136
|
|
174
137
|
def load_user_checkpoint(checkpoint)
|
138
|
+
# load full_stats from file !!!!!!!!!!!!!
|
175
139
|
|
176
|
-
|
140
|
+
if File.exists?(STATS_PATH)
|
177
141
|
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
142
|
+
# load stats
|
143
|
+
text = File.read(STATS_PATH)
|
144
|
+
|
145
|
+
# wipe text
|
146
|
+
# text=text.grep(/^\s*[^#]/).to_s
|
147
|
+
|
148
|
+
# decode json
|
149
|
+
@@full_stats = JSON.parse(text)
|
184
150
|
end
|
185
|
-
|
186
|
-
|
187
|
-
|
151
|
+
|
152
|
+
# reset count stats since they are repeated by checkpointing
|
153
|
+
|
154
|
+
# {
|
155
|
+
# "sequences": {
|
156
|
+
# "count": {
|
157
|
+
# "input_count": 1600,
|
158
|
+
# "output_seqs": 933,
|
159
|
+
# "rejected": 67
|
160
|
+
# },
|
161
|
+
# "rejected": {
|
162
|
+
# "short insert": 39,
|
163
|
+
# "contaminated": 26,
|
164
|
+
# "unexpected vector": 2
|
165
|
+
# }
|
166
|
+
# }
|
167
|
+
# }
|
168
|
+
|
169
|
+
if @@full_stats['sequences']
|
170
|
+
if @@full_stats['sequences']['count']
|
171
|
+
# set input count to 0
|
172
|
+
@@full_stats['sequences']['count']['input_count']=0
|
173
|
+
|
174
|
+
# do not remove outputseqs
|
175
|
+
# @@full_stats['sequences']['count']['output_seqs']=0
|
176
|
+
end
|
177
|
+
|
178
|
+
# remove rejected due to repetitions from rejected count
|
179
|
+
if @@full_stats['sequences']['rejected']
|
180
|
+
|
181
|
+
# if there are repeated sequences
|
182
|
+
if (@@full_stats['sequences']['rejected']['repeated'])
|
183
|
+
|
184
|
+
# if repeated count > 0 and the count entry exists
|
185
|
+
if (@@full_stats['sequences']['rejected']['repeated'] > 0) and @@full_stats['sequences']['count']
|
186
|
+
|
187
|
+
# discount repeated from rejected, since they are going to be added again by the checkpoint process
|
188
|
+
@@full_stats['sequences']['count']['rejected'] -= @@full_stats['sequences']['rejected']['repeated']
|
189
|
+
end
|
190
|
+
|
191
|
+
# set repeated to 0
|
192
|
+
@@full_stats['sequences']['rejected']['repeated']=0
|
193
|
+
end
|
194
|
+
end
|
188
195
|
end
|
189
196
|
|
197
|
+
|
198
|
+
# puts "Loaded Stats"
|
199
|
+
# puts "FULL STATS:\n" +JSON.pretty_generate(@@full_stats)
|
200
|
+
|
201
|
+
# TODO - remove sequences from rejected file that were added by cloned
|
202
|
+
|
190
203
|
super
|
191
204
|
# return checkpoint
|
192
205
|
end
|
@@ -202,17 +215,7 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
202
215
|
|
203
216
|
# read a work that will not be processed, only to skip until checkpoint
|
204
217
|
def trash_checkpointed_work
|
205
|
-
|
206
|
-
@@chunk_size.times do
|
207
|
-
begin
|
208
|
-
n,f,q,c = @@fqr.next_seq
|
209
|
-
end while (!n.nil? && @@params.repeated_seq?(n))
|
210
|
-
|
211
|
-
if n.nil?
|
212
|
-
break
|
213
|
-
end
|
214
|
-
end
|
215
|
-
|
218
|
+
warn "Deprecated: trash_checkpointed_work was deprecated, it is automatic now"
|
216
219
|
end
|
217
220
|
|
218
221
|
def next_work
|
@@ -228,12 +231,16 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
228
231
|
if !n.nil? && @@params.repeated_seq?(n)
|
229
232
|
@@full_stats.add_stats({'sequences' => {'count' => {'rejected' => 1}}})
|
230
233
|
@@full_stats.add_stats({'sequences' => {'rejected' => {'repeated' => 1}}})
|
231
|
-
|
234
|
+
|
232
235
|
get_file(File.join(OUTPUT_PATH,'rejected.txt')).puts('>'+n+ ' repeated')
|
233
|
-
|
236
|
+
|
234
237
|
end
|
235
|
-
|
238
|
+
|
236
239
|
if !n.nil?
|
240
|
+
@@ongoing_stats[:sequence_count] += 1
|
241
|
+
@@ongoing_stats[:smallest_sequence_size] = [f.size, @@ongoing_stats[:smallest_sequence_size]].min
|
242
|
+
@@ongoing_stats[:biggest_sequence_size] = [f.size, @@ongoing_stats[:biggest_sequence_size]].max # running maximum: compare against the previous biggest, not the smallest
|
243
|
+
|
237
244
|
@@full_stats.add_stats({'sequences' => {'count' => {'input_count' => 1}}})
|
238
245
|
end
|
239
246
|
end while (!n.nil? && @@params.repeated_seq?(n))
|
@@ -247,50 +254,9 @@ class SeqtrimWorkManager < ScbiMapreduce::WorkManager
|
|
247
254
|
|
248
255
|
end
|
249
256
|
|
250
|
-
# def next_work
|
251
|
-
#
|
252
|
-
# if @@exit
|
253
|
-
# return nil
|
254
|
-
# end
|
255
|
-
# group = SequenceGroup.new
|
256
|
-
#
|
257
|
-
# @@chunk_size.times do
|
258
|
-
# begin
|
259
|
-
#
|
260
|
-
# n,f,q,c = @@fqr.next_seq
|
261
|
-
#
|
262
|
-
# if !n.nil? && @@params.repeated_seq?(n)
|
263
|
-
# @@full_stats.add_stats({'sequences' => {'count' => {'rejected' => 1}}})
|
264
|
-
# @@full_stats.add_stats({'sequences' => {'rejected' => {'repeated' => 1}}})
|
265
|
-
#
|
266
|
-
# get_file(File.join(OUTPUT_PATH,'rejected.txt')).puts('>'+n+ ' repeated')
|
267
|
-
#
|
268
|
-
# end
|
269
|
-
# if !n.nil?
|
270
|
-
# @@full_stats.add_stats({'sequences' => {'count' => {'input_count' => 1}}})
|
271
|
-
# end
|
272
|
-
# end while (!n.nil? && @@params.repeated_seq?(n))
|
273
|
-
#
|
274
|
-
# if !n.nil?
|
275
|
-
# # @@full_stats.add_stats({'sequences' => {'count' => {'processed' => 1}}})
|
276
|
-
# group.push SequenceWithAction.new(n,f.upcase,q,c)
|
277
|
-
# else
|
278
|
-
# break
|
279
|
-
# end
|
280
|
-
# end
|
281
|
-
#
|
282
|
-
# # puts "Processing #{group.inspect}"
|
283
|
-
#
|
284
|
-
# if group.empty?
|
285
|
-
# return nil
|
286
|
-
# else
|
287
|
-
# return group
|
288
|
-
# end
|
289
|
-
#
|
290
|
-
# end
|
291
257
|
|
292
258
|
def work_received(obj)
|
293
|
-
|
259
|
+
|
294
260
|
res = obj
|
295
261
|
|
296
262
|
# collect stats
|
@@ -19,6 +19,9 @@
|
|
19
19
|
#
|
20
20
|
# $: << File.expand_path(ROOT_PATH)
|
21
21
|
|
22
|
+
$: << File.expand_path('~/progs/ruby/gems/seqtrimnext/lib/')
|
23
|
+
$: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib')
|
24
|
+
|
22
25
|
require 'seqtrimnext'
|
23
26
|
|
24
27
|
$SEQTRIM_PATH = ROOT_PATH
|
@@ -37,7 +40,7 @@ ENV['BLASTDB']=$FORMATTED_DB_PATH
|
|
37
40
|
OUTPUT_PATH='output_files'
|
38
41
|
|
39
42
|
puts "FORMATTED_DB_BLAST in workers: #{$FORMATTED_DB_PATH}"
|
40
|
-
|
43
|
+
|
41
44
|
|
42
45
|
require 'scbi_mapreduce'
|
43
46
|
require 'params'
|
@@ -10,8 +10,13 @@ class GraphStats
|
|
10
10
|
init_stats=initial_stats
|
11
11
|
|
12
12
|
if init_stats.nil?
|
13
|
-
|
14
|
-
|
13
|
+
if File.exists?(File.join(OUTPUT_PATH,'initial_stats.json'))
|
14
|
+
r=File.read(File.join(OUTPUT_PATH,'initial_stats.json'))
|
15
|
+
init_stats= JSON::parse(r)
|
16
|
+
else
|
17
|
+
init_stats=[]
|
18
|
+
end
|
19
|
+
|
15
20
|
end
|
16
21
|
# puts init_stats.to_json
|
17
22
|
#r=File.read(File.join(File.dirname(__FILE__),'stats.json'))
|
@@ -5,8 +5,6 @@
|
|
5
5
|
|
6
6
|
require 'extract_stats'
|
7
7
|
|
8
|
-
# $: << File.expand_path('~/progs/ruby/gems/scbi_drb/lib')
|
9
|
-
|
10
8
|
require 'scbi_mapreduce'
|
11
9
|
require 'seqtrim_work_manager'
|
12
10
|
require 'action_manager'
|
@@ -340,6 +338,9 @@ class Seqtrim
|
|
340
338
|
# server = ScbiMapreduce::Manager.new(ip,port, workers, SeqtrimWorkManager,custom_worker_file, STDOUT,'~/.seqtrimnext')
|
341
339
|
server = ScbiMapreduce::Manager.new(ip,port, workers, SeqtrimWorkManager,custom_worker_file, STDOUT,$SEQTRIMNEXT_INIT)
|
342
340
|
server.chunk_size=chunk_size
|
341
|
+
server.checkpointing=true
|
342
|
+
server.keep_order=true
|
343
|
+
server.retry_stuck_jobs=true
|
343
344
|
server.start_server
|
344
345
|
|
345
346
|
# close sequence reader
|
@@ -25,7 +25,7 @@ class PluginAbAdapters < Plugin
|
|
25
25
|
def do_blasts(seqs)
|
26
26
|
# find MIDS with less results than max_target_seqs value
|
27
27
|
blast=BatchBlast.new("-db #{@params.get_param('adapters_ab_db')}",'blastn'," -task blastn-short -perc_identity #{@params.get_param('blast_percent_ab')} -word_size #{MIN_ADAPTER_SIZE}")
|
28
|
-
$LOG.
|
28
|
+
$LOG.debug('BLAST:'+blast.get_blast_cmd)
|
29
29
|
|
30
30
|
fastas=[]
|
31
31
|
|
@@ -61,7 +61,7 @@ class PluginAbAdapters < Plugin
|
|
61
61
|
raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
|
62
62
|
end
|
63
63
|
|
64
|
-
$LOG.
|
64
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for adapters into the sequence"
|
65
65
|
|
66
66
|
|
67
67
|
# blast=BatchBlast.new("-db #{File.join($FORMATTED_DB_PATH,'adapters_ab.fasta')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_ab')} -perc_identity #{@params.get_param('blast_percent_ab')} -word_size #{MIN_ADAPTER_SIZE}")
|
@@ -25,7 +25,7 @@ class PluginAdapters < Plugin
|
|
25
25
|
def do_blasts(seqs)
|
26
26
|
# find MIDS with less results than max_target_seqs value
|
27
27
|
blast=BatchBlast.new("-db #{@params.get_param('adapters_db')}",'blastn'," -task blastn-short -perc_identity #{@params.get_param('blast_percent_adapters')} -word_size #{MIN_ADAPTER_SIZE}")
|
28
|
-
$LOG.
|
28
|
+
$LOG.debug('BLAST:'+blast.get_blast_cmd)
|
29
29
|
|
30
30
|
fastas=[]
|
31
31
|
|
@@ -64,7 +64,7 @@ class PluginAdapters < Plugin
|
|
64
64
|
# raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
|
65
65
|
end
|
66
66
|
|
67
|
-
$LOG.
|
67
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for adapters into the sequence"
|
68
68
|
|
69
69
|
|
70
70
|
# blast=BatchBlast.new("-db #{File.join($FORMATTED_DB_PATH,'adapters.fasta')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_adapters')} -perc_identity #{@params.get_param('blast_percent_adapters')} -word_size #{MIN_ADAPTER_SIZE}")
|
@@ -0,0 +1,165 @@
|
|
1
|
+
require "plugin"
|
2
|
+
|
3
|
+
########################################################
|
4
|
+
# Author: Almudena Bocinos Rioboo
|
5
|
+
#
|
6
|
+
# Defines the main methods that are necessary to execute PluginAdapters
|
7
|
+
# Inherit: Plugin
|
8
|
+
########################################################
|
9
|
+
|
10
|
+
class PluginAdaptersOld < Plugin
|
11
|
+
|
12
|
+
def get_type_adapter(p_start,p_end,seq)
|
13
|
+
#if q_beg is nearer the left, add adapter action by the left,
|
14
|
+
#if q_end is nearer the right, add adapter action by the right
|
15
|
+
#NOTE: If the adapter is very near both the left and the right,
|
16
|
+
#then the sequence isn't valid, because almost the whole sequence is adapter.
|
17
|
+
|
18
|
+
|
19
|
+
v1= p_end.to_i
|
20
|
+
v2= p_start.to_i
|
21
|
+
|
22
|
+
# puts " startadapter #{v2} endadapter #{v1} insert_start #{seq.insert_start} insert_end #{seq.insert_end}"
|
23
|
+
|
24
|
+
# puts " #{v2+seq.insert_start} <? #{seq.seq_fasta.length - v1 - 1 + seq.seq_fasta_orig.length - seq.insert_end-1}"
|
25
|
+
if (v2+seq.insert_start < (seq.seq_fasta.length - v1 - 1+ seq.seq_fasta_orig.length - seq.insert_end-1)) #IF THE NEAREST ONE IS THE LEFT
|
26
|
+
type = "ActionLeftAdapter"
|
27
|
+
|
28
|
+
else
|
29
|
+
type = "ActionRightAdapter"
|
30
|
+
|
31
|
+
end
|
32
|
+
return type
|
33
|
+
end
|
34
|
+
|
35
|
+
|
36
|
+
def cut_by_right(adapter,seq)
|
37
|
+
|
38
|
+
left_size = adapter.q_beg-seq.insert_start+1
|
39
|
+
right_size = seq.insert_end-adapter.q_end+1
|
40
|
+
left_size=0 if (left_size<0)
|
41
|
+
right_size=0 if (right_size<0)
|
42
|
+
|
43
|
+
return (left_size>(right_size/2).to_i)
|
44
|
+
|
45
|
+
end
|
46
|
+
|
47
|
+
#Begins the plugin1's execution to warn that there are contaminants in the sequence "seq"
|
48
|
+
def execute(seqs)
|
49
|
+
blasts= do_blasts(seqs)
|
50
|
+
|
51
|
+
seqs.each_with_index do |s,i|
|
52
|
+
exec_seq(s,blasts.querys[i])
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
def do_blasts(seqs)
|
57
|
+
# find MIDS with less results than max_target_seqs value
|
58
|
+
blast=BatchBlast.new("-db #{@params.get_param('adapters_db')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_adapters')} -perc_identity #{@params.get_param('blast_percent_adapters')}")
|
59
|
+
$LOG.debug('BLAST:'+blast.get_blast_cmd)
|
60
|
+
|
61
|
+
fastas=[]
|
62
|
+
|
63
|
+
seqs.each do |seq|
|
64
|
+
fastas.push ">"+seq.seq_name
|
65
|
+
fastas.push seq.seq_fasta
|
66
|
+
end
|
67
|
+
|
68
|
+
# fastas=fastas.join("\n")
|
69
|
+
|
70
|
+
blast_table_results = blast.do_blast(fastas)
|
71
|
+
|
72
|
+
# puts blast_table_results.inspect
|
73
|
+
|
74
|
+
return blast_table_results
|
75
|
+
end
|
76
|
+
|
77
|
+
|
78
|
+
def exec_seq(seq,blast_query)
|
79
|
+
if blast_query.query_id != seq.seq_name
|
80
|
+
raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
|
81
|
+
end
|
82
|
+
|
83
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for adapters into the sequence"
|
84
|
+
|
85
|
+
|
86
|
+
# blast=BatchBlast.new("-db #{File.join($FORMATTED_DB_PATH,'adapters.fasta')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_adapters')} -perc_identity #{@params.get_param('blast_percent_adapters')}")
|
87
|
+
|
88
|
+
# blast with only one sequence, no with many sequences from a database
|
89
|
+
#---------------------------------------------------------------------
|
90
|
+
|
91
|
+
# blast_table_results = blast.do_blast(seq.seq_fasta) #rise seq to adapterss executing over blast
|
92
|
+
|
93
|
+
#blast_table_results = BlastTableResult.new(res)
|
94
|
+
|
95
|
+
# blast_table_results.inspect
|
96
|
+
|
97
|
+
adapters=[]
|
98
|
+
# blast_table_results.querys.each do |query| # first round to save adapters without overlap
|
99
|
+
merge_hits(blast_query,adapters)
|
100
|
+
# end
|
101
|
+
|
102
|
+
begin
|
103
|
+
adapters2=adapters # second round to save adapters without overlap
|
104
|
+
adapters = []
|
105
|
+
merge_hits(adapters2,adapters)
|
106
|
+
end until (adapters2.count == adapters.count)
|
107
|
+
|
108
|
+
actions=[]
|
109
|
+
adapter_size=0
|
110
|
+
# @stats['adapter_size']={}
|
111
|
+
adapters.each do |ad| # adds the correspondent action to the sequence
|
112
|
+
|
113
|
+
type = get_type_adapter(ad.q_beg,ad.q_end,seq)
|
114
|
+
a = seq.new_action(ad.q_beg,ad.q_end,type)
|
115
|
+
# puts " state left_action #{a.left_action} right_action #{a.right_action}"
|
116
|
+
|
117
|
+
|
118
|
+
adapter_size=ad.q_end-ad.q_beg+1
|
119
|
+
|
120
|
+
if cut_by_right(ad,seq)
|
121
|
+
|
122
|
+
# puts "action right end1 #{seq.insert_end}"
|
123
|
+
|
124
|
+
a.right_action=true #mark rigth action to get the left insert
|
125
|
+
else
|
126
|
+
|
127
|
+
# puts " cut1 by left #{seq.insert_start} ad #{ad.q_beg+seq.insert_start} #{ad.q_end+seq.insert_start}"
|
128
|
+
|
129
|
+
a.left_action = true #mark left action to get the right insert
|
130
|
+
|
131
|
+
end
|
132
|
+
|
133
|
+
a.message = ad.subject_id
|
134
|
+
a.reversed = ad.reversed
|
135
|
+
actions.push a
|
136
|
+
|
137
|
+
# @stats[:adapter_size]={adapter_size => 1}
|
138
|
+
add_stats('adapter_size',adapter_size)
|
139
|
+
|
140
|
+
end
|
141
|
+
seq.add_actions(actions)
|
142
|
+
#
|
143
|
+
end
|
144
|
+
|
145
|
+
#Returns an array with the errors caused by missing parameters
|
146
|
+
def self.check_params(params)
|
147
|
+
errors=[]
|
148
|
+
|
149
|
+
comment='Blast E-value used as cut-off when searching for adapters or primers'
|
150
|
+
default_value = 1e-6
|
151
|
+
params.check_param(errors,'blast_evalue_adapters','Float',default_value,comment)
|
152
|
+
|
153
|
+
comment='Minimum required identity (%) for a reliable adapter'
|
154
|
+
default_value = 95
|
155
|
+
params.check_param(errors,'blast_percent_adapters','Integer',default_value,comment)
|
156
|
+
|
157
|
+
comment='Path for adapter database'
|
158
|
+
default_value = File.join($FORMATTED_DB_PATH,'adapters.fasta')
|
159
|
+
params.check_param(errors,'adapters_db','DB',default_value,comment)
|
160
|
+
|
161
|
+
return errors
|
162
|
+
end
|
163
|
+
|
164
|
+
|
165
|
+
end
|
@@ -25,7 +25,7 @@ class PluginAmplicons < Plugin
|
|
25
25
|
def do_blasts(seqs)
|
26
26
|
# find MIDS with less results than max_target_seqs value
|
27
27
|
blast=BatchBlast.new("-db #{@params.get_param('primers_db')}",'blastn'," -task blastn-short -perc_identity #{@params.get_param('blast_percent_primers')}")
|
28
|
-
$LOG.
|
28
|
+
$LOG.debug('BLAST:'+blast.get_blast_cmd)
|
29
29
|
|
30
30
|
fastas=[]
|
31
31
|
|
@@ -49,7 +49,7 @@ class PluginAmplicons < Plugin
|
|
49
49
|
raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
|
50
50
|
end
|
51
51
|
|
52
|
-
$LOG.
|
52
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for primers into the sequence"
|
53
53
|
|
54
54
|
# puts blast_query.inspect
|
55
55
|
|
@@ -36,7 +36,7 @@ class PluginContaminants < Plugin
|
|
36
36
|
|
37
37
|
blast = BatchBlast.new("-db #{@params.get_param('contaminants_db')}",'blastn'," -task blastn -evalue #{@params.get_param('blast_evalue_contaminants')} -perc_identity #{@params.get_param('blast_percent_contaminants')} -culling_limit 1") #get contaminants -max_target_seqs #{MAX_TARGETS_SEQS}
|
38
38
|
|
39
|
-
$LOG.
|
39
|
+
$LOG.debug('BLAST:'+blast.get_blast_cmd(:xml))
|
40
40
|
|
41
41
|
fastas=[]
|
42
42
|
|
@@ -67,7 +67,7 @@ class PluginContaminants < Plugin
|
|
67
67
|
# raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
|
68
68
|
end
|
69
69
|
|
70
|
-
$LOG.
|
70
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for contaminants into the sequence"
|
71
71
|
|
72
72
|
|
73
73
|
#blast = BatchBlast.new('-db DB/formatted/contaminants.fasta','blastn',' -task blastn -evalue 1e-10 -perc_identity 95') #get contaminants
|
@@ -143,7 +143,7 @@ class PluginContaminants < Plugin
|
|
143
143
|
|
144
144
|
end
|
145
145
|
else
|
146
|
-
$LOG.
|
146
|
+
$LOG.debug('Contaminant ignored due to genus match: '+c.definition)
|
147
147
|
end
|
148
148
|
end
|
149
149
|
|
@@ -286,7 +286,7 @@ class PluginExtractInserts < Plugin
|
|
286
286
|
|
287
287
|
|
288
288
|
def exec_seq(seq)
|
289
|
-
$LOG.
|
289
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: extract inserts"
|
290
290
|
|
291
291
|
# puts "INSERTO ANTES LINKER INSERT:"+seq.seq_fasta
|
292
292
|
|
@@ -319,7 +319,7 @@ class PluginFindPolyAt < Plugin
|
|
319
319
|
|
320
320
|
|
321
321
|
def exec_seq(seq)
|
322
|
-
$LOG.
|
322
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for strings of polyAT's into the sequence with a length indicated by the param <poly_at_length>"
|
323
323
|
|
324
324
|
find_polyT(seq)
|
325
325
|
find_polyA(seq)
|
@@ -21,7 +21,7 @@ def execute(seqs)
|
|
21
21
|
|
22
22
|
|
23
23
|
def exec_seq(seq)
|
24
|
-
$LOG.
|
24
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: searching sequence repeated at input file"
|
25
25
|
|
26
26
|
fasta_input=@params.get_param('truncated_input_file')
|
27
27
|
|
@@ -149,7 +149,7 @@ class PluginIndeterminations < Plugin
|
|
149
149
|
|
150
150
|
|
151
151
|
def exec_seq(seq)
|
152
|
-
$LOG.
|
152
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: removing indeterminations N+"
|
153
153
|
|
154
154
|
actions=[]
|
155
155
|
|
@@ -21,7 +21,7 @@ class PluginKey < Plugin
|
|
21
21
|
|
22
22
|
|
23
23
|
def exec_seq(seq)
|
24
|
-
$LOG.
|
24
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: marking key into the sequence"
|
25
25
|
# blast_table_results.inspect
|
26
26
|
|
27
27
|
actions=[]
|
@@ -83,7 +83,7 @@ class PluginLinker < Plugin
|
|
83
83
|
# find MIDS with less results than max_target_seqs value
|
84
84
|
blast = BatchBlast.new("-db #{@params.get_param('linkers_db')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_linkers')} -perc_identity #{@params.get_param('blast_percent_linkers')}") #get linkers
|
85
85
|
|
86
|
-
$LOG.
|
86
|
+
$LOG.debug('BLAST:'+blast.get_blast_cmd)
|
87
87
|
|
88
88
|
fastas=[]
|
89
89
|
|
@@ -106,7 +106,7 @@ class PluginLinker < Plugin
|
|
106
106
|
if blast_query.query_id != seq.seq_name
|
107
107
|
raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
|
108
108
|
end
|
109
|
-
$LOG.
|
109
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for linker into the sequence"
|
110
110
|
|
111
111
|
# key_beg,key_end=search_key(seq,0,3) if false
|
112
112
|
# blast = BatchBlast.new("-subject #{File.join($FORMATTED_DB_PATH,'linkers.fasta')}",'blastn'," -task blastn -evalue #{@params.get_param('blast_evalue_linkers')} -perc_identity #{@params.get_param('blast_percent_linkers')}") #get linkers
|
@@ -22,7 +22,7 @@ class PluginLowHighSize < Plugin
|
|
22
22
|
|
23
23
|
|
24
24
|
def exec_seq(seq)
|
25
|
-
$LOG.
|
25
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low or high size of the sequence"
|
26
26
|
|
27
27
|
min_size = @params.get_param('min_sequence_size_raw').to_i #min_size is: mean - 2dev
|
28
28
|
max_size = @params.get_param('max_sequence_size_raw').to_i #max_size is: mean + 2dev
|
@@ -278,7 +278,7 @@ class PluginLowQuality < Plugin
|
|
278
278
|
if ((self.class.to_s=='PluginLowQuality') && seq.seq_qual.nil? )
|
279
279
|
$LOG.error " Quality File haven't been provided. It's impossible to execute " + self.class.to_s
|
280
280
|
elsif (seq.seq_qual.size>0)
|
281
|
-
$LOG.
|
281
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence"
|
282
282
|
|
283
283
|
@low=@params.get_param('min_quality').to_i
|
284
284
|
|
@@ -29,7 +29,7 @@ class PluginMids < Plugin
|
|
29
29
|
def do_blasts(seqs)
|
30
30
|
# find MIDS with less results than max_target_seqs value
|
31
31
|
blast = BatchBlast.new("-db #{@params.get_param('mids_db')}",'blastn'," -task blastn-short -perc_identity #{@params.get_param('blast_percent_mids')} -max_target_seqs 4 ") #get mids
|
32
|
-
$LOG.
|
32
|
+
$LOG.debug('BLAST:'+blast.get_blast_cmd)
|
33
33
|
|
34
34
|
fastas=[]
|
35
35
|
|
@@ -54,7 +54,7 @@ class PluginMids < Plugin
|
|
54
54
|
end
|
55
55
|
|
56
56
|
|
57
|
-
$LOG.
|
57
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for mids into the sequence"
|
58
58
|
|
59
59
|
|
60
60
|
# blast_table_results = blast.do_blast(seq.seq_fasta[0..SIZE_SEARCH_MID]) # execute blast to find mids
|
@@ -27,7 +27,7 @@ class PluginRemAditArtifacts < Plugin
|
|
27
27
|
|
28
28
|
def exec_seq(seq)
|
29
29
|
|
30
|
-
$LOG.
|
30
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: removing artifacts into the sequence"
|
31
31
|
seq2 = seq.seq_fasta
|
32
32
|
first = 0
|
33
33
|
last = seq2.size-1
|
@@ -96,7 +96,7 @@ class PluginShortInsert < Plugin
|
|
96
96
|
|
97
97
|
def exec_seq(seq)
|
98
98
|
|
99
|
-
$LOG.
|
99
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking if insert of sequence has enought size"
|
100
100
|
# puts "inserto #{seq.insert_start}, #{seq.insert_end} size #{seq.seq_fasta.size}"
|
101
101
|
|
102
102
|
if (seq.seq_fasta.size > 0)
|
@@ -183,7 +183,7 @@ class PluginShortInsert < Plugin
|
|
183
183
|
|
184
184
|
#Begins the plugin1's execution to warn if the inserted is so short
|
185
185
|
def execute_no_cut_quality(seq)
|
186
|
-
$LOG.
|
186
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking if insert of sequence has enought size"
|
187
187
|
|
188
188
|
|
189
189
|
|
@@ -38,7 +38,7 @@ class PluginVectors < Plugin
|
|
38
38
|
# find MIDS with less results than max_target_seqs value
|
39
39
|
blast = BatchBlast.new("-db #{@params.get_param('vectors_db')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_vectors')} -perc_identity #{@params.get_param('blast_percent_vectors')} -culling_limit 1") #get vectors
|
40
40
|
|
41
|
-
$LOG.
|
41
|
+
$LOG.debug('BLAST:'+blast.get_blast_cmd)
|
42
42
|
|
43
43
|
fastas=[]
|
44
44
|
|
@@ -62,7 +62,7 @@ class PluginVectors < Plugin
|
|
62
62
|
# raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
|
63
63
|
end
|
64
64
|
|
65
|
-
$LOG.
|
65
|
+
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for vectors into the sequence "
|
66
66
|
|
67
67
|
#blast contra contaminantes
|
68
68
|
|
@@ -0,0 +1,8 @@
|
|
1
|
+
# ======================================
|
2
|
+
# General parameters
|
3
|
+
# ======================================
|
4
|
+
|
5
|
+
|
6
|
+
plugin_list = PluginLowHighSize,PluginIndeterminations,PluginFindPolyAt,PluginContaminants,PluginLowQuality,PluginLowComplexity
|
7
|
+
|
8
|
+
contaminants_db="contaminants.fasta cont_ribosome.fasta"
|
@@ -15,8 +15,9 @@ def add_stats(h_stats)
|
|
15
15
|
add_stats.each do |property,hash_value|
|
16
16
|
h[plugin_hash][property]={} if h[plugin_hash][property].nil?
|
17
17
|
|
18
|
+
# values need to be in string format because of later loading from json file
|
18
19
|
hash_value.each do |value, count|
|
19
|
-
h[plugin_hash][property][value]=(h[plugin_hash][property][value]||0) + count
|
20
|
+
h[plugin_hash][property][value.to_s]=(h[plugin_hash][property][value.to_s]||0) + count
|
20
21
|
end
|
21
22
|
end
|
22
23
|
end
|
data/lib/seqtrimnext.rb
CHANGED
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: seqtrimnext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 2.0.45
|
5
|
+
version: 2.0.46
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Dario Guerrero & Almudena Bocinos
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2012-
|
13
|
+
date: 2012-04-13 00:00:00 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: narray
|
@@ -143,9 +143,11 @@ executables:
|
|
143
143
|
- extract_seqs_from_fastq.rb
|
144
144
|
- fasta2fastq.rb
|
145
145
|
- fastq2fasta.rb
|
146
|
+
- filter_database.rb
|
146
147
|
- gen_qual.rb
|
147
148
|
- get_seq.rb
|
148
149
|
- group_by_range.rb
|
150
|
+
- join_big_illumina_paired.sh
|
149
151
|
- join_ilumina_paired.rb
|
150
152
|
- parse_amplicons.rb
|
151
153
|
- parse_json_results.rb
|
@@ -160,14 +162,16 @@ executables:
|
|
160
162
|
extensions: []
|
161
163
|
|
162
164
|
extra_rdoc_files:
|
165
|
+
- History.txt
|
163
166
|
- lib/seqtrimnext/templates/amplicons.txt
|
164
167
|
- lib/seqtrimnext/templates/genomics_454.txt
|
165
168
|
- lib/seqtrimnext/templates/genomics_454_with_paired.txt
|
169
|
+
- lib/seqtrimnext/templates/genomics_illumina.txt
|
166
170
|
- lib/seqtrimnext/templates/low_quality.txt
|
167
171
|
- lib/seqtrimnext/templates/low_quality_and_low_complexity.txt
|
168
172
|
- lib/seqtrimnext/templates/transcriptomics_454.txt
|
173
|
+
- lib/seqtrimnext/templates/transcriptomics_illumina.txt
|
169
174
|
- lib/seqtrimnext/templates/transcriptomics_plants.txt
|
170
|
-
- History.txt
|
171
175
|
- Manifest.txt
|
172
176
|
- PostInstall.txt
|
173
177
|
files:
|
@@ -177,9 +181,11 @@ files:
|
|
177
181
|
- bin/extract_seqs_from_fastq.rb
|
178
182
|
- bin/fasta2fastq.rb
|
179
183
|
- bin/fastq2fasta.rb
|
184
|
+
- bin/filter_database.rb
|
180
185
|
- bin/gen_qual.rb
|
181
186
|
- bin/get_seq.rb
|
182
187
|
- bin/group_by_range.rb
|
188
|
+
- bin/join_big_illumina_paired.sh
|
183
189
|
- bin/join_ilumina_paired.rb
|
184
190
|
- bin/parse_amplicons.rb
|
185
191
|
- bin/parse_json_results.rb
|
@@ -191,6 +197,7 @@ files:
|
|
191
197
|
- bin/split_fastq.rb
|
192
198
|
- bin/split_ilumina_paired.rb
|
193
199
|
- bin/split_paired.rb
|
200
|
+
- History.txt
|
194
201
|
- lib/seqtrimnext/actions/action_ab_adapter.rb
|
195
202
|
- lib/seqtrimnext/actions/action_ab_far_adapter.rb
|
196
203
|
- lib/seqtrimnext/actions/action_ab_left_adapter.rb
|
@@ -208,13 +215,13 @@ files:
|
|
208
215
|
- lib/seqtrimnext/actions/action_low_high_size.rb
|
209
216
|
- lib/seqtrimnext/actions/action_low_quality.rb
|
210
217
|
- lib/seqtrimnext/actions/action_mid.rb
|
218
|
+
- lib/seqtrimnext/actions/action_middle_adapter.rb
|
211
219
|
- lib/seqtrimnext/actions/action_multiple_linker.rb
|
212
220
|
- lib/seqtrimnext/actions/action_paired_reads.rb
|
213
221
|
- lib/seqtrimnext/actions/action_poly_a.rb
|
214
222
|
- lib/seqtrimnext/actions/action_poly_t.rb
|
215
223
|
- lib/seqtrimnext/actions/action_rem_adit_artifacts.rb
|
216
224
|
- lib/seqtrimnext/actions/action_right_adapter.rb
|
217
|
-
- lib/seqtrimnext/actions/action_middle_adapter.rb
|
218
225
|
- lib/seqtrimnext/actions/action_right_primer.rb
|
219
226
|
- lib/seqtrimnext/actions/action_short_insert.rb
|
220
227
|
- lib/seqtrimnext/actions/action_unexpected_poly_t.rb
|
@@ -244,6 +251,7 @@ files:
|
|
244
251
|
- lib/seqtrimnext/plugins/plugin.rb
|
245
252
|
- lib/seqtrimnext/plugins/plugin_ab_adapters.rb
|
246
253
|
- lib/seqtrimnext/plugins/plugin_adapters.rb
|
254
|
+
- lib/seqtrimnext/plugins/plugin_adapters_old.rb
|
247
255
|
- lib/seqtrimnext/plugins/plugin_amplicons.rb
|
248
256
|
- lib/seqtrimnext/plugins/plugin_contaminants.rb
|
249
257
|
- lib/seqtrimnext/plugins/plugin_extract_inserts.rb
|
@@ -262,9 +270,11 @@ files:
|
|
262
270
|
- lib/seqtrimnext/templates/amplicons.txt
|
263
271
|
- lib/seqtrimnext/templates/genomics_454.txt
|
264
272
|
- lib/seqtrimnext/templates/genomics_454_with_paired.txt
|
273
|
+
- lib/seqtrimnext/templates/genomics_illumina.txt
|
265
274
|
- lib/seqtrimnext/templates/low_quality.txt
|
266
275
|
- lib/seqtrimnext/templates/low_quality_and_low_complexity.txt
|
267
276
|
- lib/seqtrimnext/templates/transcriptomics_454.txt
|
277
|
+
- lib/seqtrimnext/templates/transcriptomics_illumina.txt
|
268
278
|
- lib/seqtrimnext/templates/transcriptomics_plants.txt
|
269
279
|
- lib/seqtrimnext/utils/extract_samples.rb
|
270
280
|
- lib/seqtrimnext/utils/fasta2xml.rb
|
@@ -276,7 +286,6 @@ files:
|
|
276
286
|
- lib/seqtrimnext/utils/recover_mid.rb
|
277
287
|
- lib/seqtrimnext/utils/string_utils.rb
|
278
288
|
- lib/seqtrimnext.rb
|
279
|
-
- History.txt
|
280
289
|
- Manifest.txt
|
281
290
|
- PostInstall.txt
|
282
291
|
- Rakefile
|