full_lengther_next 0.0.6 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +8 -0
- data/Manifest.txt +1 -0
- data/Rakefile +1 -1
- data/bin/full_lengther_next +8 -1
- data/bin/make_user_db.rb +5 -5
- data/lib/full_lengther_next.rb +3 -3
- data/lib/full_lengther_next/classes/chimeric_seqs.rb +78 -0
- data/lib/full_lengther_next/classes/fln_stats.rb +148 -36
- data/lib/full_lengther_next/classes/my_worker.rb +53 -5
- data/lib/full_lengther_next/classes/my_worker_manager.rb +93 -29
- metadata +5 -4
data/History.txt
CHANGED
data/Manifest.txt
CHANGED
@@ -3,6 +3,7 @@ bin/make_user_db.rb
|
|
3
3
|
bin/full_lengther_next
|
4
4
|
History.txt
|
5
5
|
lib/full_lengther_next/classes/common_functions.rb
|
6
|
+
lib/full_lengther_next/classes/chimeric_seqs.rb
|
6
7
|
lib/full_lengther_next/classes/fl_analysis.rb
|
7
8
|
lib/full_lengther_next/classes/fl_string_utils.rb
|
8
9
|
lib/full_lengther_next/classes/fln_stats.rb
|
data/Rakefile
CHANGED
@@ -20,7 +20,7 @@ $hoe = Hoe.spec 'full_lengther_next' do
|
|
20
20
|
# self.extra_deps << ['gnuplot','>=0']
|
21
21
|
# self.extra_deps << ['term-ansicolor','>=1.0.5']
|
22
22
|
self.extra_deps << ['xml-simple','>=1.0.12']
|
23
|
-
self.extra_deps << ['scbi_blast','>=0.0.
|
23
|
+
self.extra_deps << ['scbi_blast','>=0.0.37']
|
24
24
|
self.extra_deps << ['scbi_mapreduce','>=0.0.29']
|
25
25
|
self.extra_deps << ['scbi_fasta','>=0.1.7']
|
26
26
|
# self.extra_deps << ['scbi_fastq','>=0.0.13']
|
data/bin/full_lengther_next
CHANGED
@@ -50,6 +50,11 @@ optparse = OptionParser.new do |opts|
|
|
50
50
|
options[:distance] = distance.to_i
|
51
51
|
end
|
52
52
|
|
53
|
+
options[:chimera] = nil
|
54
|
+
opts.on( '-q', '--chimera_detection', "apply chimera detection mode\n\n" ) do |chimera|
|
55
|
+
options[:chimera] = chimera
|
56
|
+
end
|
57
|
+
|
53
58
|
options[:workers] = 2
|
54
59
|
opts.on( '-w', '--workers INTEGER/FILE', 'Number of CPUs, or a file containing machine names to launch workers with ssh' ) do |workers|
|
55
60
|
if File.exists?(workers)
|
@@ -91,7 +96,7 @@ optparse = OptionParser.new do |opts|
|
|
91
96
|
|
92
97
|
|
93
98
|
# Set a banner, displayed at the top of the help screen.
|
94
|
-
opts.banner = "
|
99
|
+
opts.banner = "\nUsage: full_lengther_next -f input.fasta -g [fungi|human|invertebrates|mammals|plants|rodents|vertebrates] [options]\n\n"
|
95
100
|
|
96
101
|
# This displays the help screen
|
97
102
|
opts.on( '-h', '--help', 'Display this screen' ) do
|
@@ -172,6 +177,8 @@ require 'my_worker_manager'
|
|
172
177
|
$LOG = Logger.new(STDOUT)
|
173
178
|
$LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
|
174
179
|
|
180
|
+
# puts "ROOT_PATH: #{ROOT_PATH}"
|
181
|
+
|
175
182
|
custom_worker_file = File.join(File.dirname(ROOT_PATH),'lib','full_lengther_next','classes','my_worker.rb')
|
176
183
|
|
177
184
|
$LOG.info 'Starting server'
|
data/bin/make_user_db.rb
CHANGED
@@ -126,16 +126,16 @@ end
|
|
126
126
|
ENV['BLASTDB']=formatted_db_path
|
127
127
|
|
128
128
|
|
129
|
-
if !File.exists?(File.join(
|
130
|
-
Dir.mkdir(
|
129
|
+
if !File.exists?(File.join(formatted_db_path, my_group))
|
130
|
+
Dir.mkdir(File.join(formatted_db_path,my_group))
|
131
131
|
end
|
132
132
|
|
133
|
-
output_file_path=File.join(
|
133
|
+
output_file_path=File.join(formatted_db_path,my_group,my_group+".fasta")
|
134
134
|
|
135
135
|
output_file = File.new(output_file_path, "w")
|
136
136
|
|
137
|
-
filter_incomplete_seqs(output_file, File.join(
|
138
|
-
filter_incomplete_seqs(output_file, File.join(
|
137
|
+
filter_incomplete_seqs(output_file, File.join(formatted_db_path, "uniprot_sprot_#{uniprot_group}.dat"), my_group)
|
138
|
+
filter_incomplete_seqs(output_file, File.join(formatted_db_path, "uniprot_trembl_#{uniprot_group}.dat"), my_group)
|
139
139
|
|
140
140
|
output_file.close
|
141
141
|
|
data/lib/full_lengther_next.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
$:.unshift(File.dirname(__FILE__)) unless
|
2
2
|
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
3
3
|
|
4
|
-
|
4
|
+
# ROOT_PATH=File.join(File.dirname(__FILE__),'full_lengther_next')
|
5
5
|
|
6
|
-
$: << File.expand_path(File.join(
|
6
|
+
$: << File.expand_path(File.join(File.dirname(__FILE__), 'full_lengther_next', 'classes'))
|
7
7
|
|
8
8
|
|
9
9
|
module FullLengtherNext
|
10
|
-
VERSION = '0.0.
|
10
|
+
VERSION = '0.0.8'
|
11
11
|
|
12
12
|
FULLLENGHTER_VERSION = VERSION
|
13
13
|
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
|
2
|
+
require 'scbi_blast'
|
3
|
+
|
4
|
+
# Helpers used in chimera detection mode: they flag sequences whose BLAST
# hits point to two different subjects on non-overlapping query regions, and
# prune a BLAST result so that only the best subject of each kept query remains.
module ChimericSeqs

  # Looks for evidence that +seq+ is a putative chimera and, if found, stores a
  # two-row, tab-separated report on the sequence via
  # seq.annotate(:chimera, text, false).
  #
  # seq         - object responding to seq_name, fasta_length and annotate.
  # blast_query - BLAST query result responding to query_def and hits; each hit
  #               responds to acc, q_beg, q_end, e_val, ident, q_frame, s_beg,
  #               s_end and definition.
  # options     - not used here; kept so existing callers keep working.
  # db_name     - database label written into the report rows.
  #
  # Raises RuntimeError when the sequence name and the BLAST query name differ.
  # Returns nil.
  def search_chimeras(seq, blast_query, options, db_name)
    # used to detect if the sequence and the blast are from different query
    if seq.seq_name != blast_query.query_def
      puts "#{seq.seq_name} --> #{blast_query.query_def}"
      raise "BLAST query name and sequence are different"
    end

    hits = blast_query.hits
    # A query without hits has no reference region, so it cannot be a chimera.
    return nil if hits.nil? || hits.empty?

    best_hit = hits[0]
    ref_hit_beg = best_hit.q_beg
    ref_hit_end = best_hit.q_end

    # True when +hit+ overlaps (or is contained in) the current reference
    # region. The closure reads ref_hit_beg/ref_hit_end at call time, so it
    # follows the region as it grows below.
    overlaps_ref = lambda do |hit|
      ((ref_hit_beg <= hit.q_beg) && (ref_hit_end > hit.q_beg)) ||
        ((hit.q_beg <= ref_hit_beg) && (hit.q_end > ref_hit_beg))
    end

    # First pass: grow the reference region with every hit that overlaps it.
    hits.each do |hit|
      next unless overlaps_ref.call(hit)
      ref_hit_beg = [ref_hit_beg, hit.q_beg].min
      ref_hit_end = [ref_hit_end, hit.q_end].max
    end

    # Second pass: only the FIRST hit lying outside the reference region is
    # examined; the search stops there whether or not it is a different subject.
    # NOTE(review): this mirrors the original control flow -- confirm that later
    # non-overlapping hits are meant to be ignored.
    outlier = hits.find { |hit| !overlaps_ref.call(hit) }
    return nil if outlier.nil? || outlier.acc == best_hit.acc

    reason = "Putative chimera detected showing similarity with two different genes #{best_hit.acc} - #{outlier.acc}"

    # One report row per hit, same column layout for both hits.
    report_row = lambda do |hit|
      "#{blast_query.query_def}\t#{seq.fasta_length}\t#{hit.acc}\t#{db_name}\tPutative chimera\t\t#{hit.e_val}\t#{hit.ident}\t\t\t#{reason}\t#{hit.q_frame}\t#{hit.q_beg}\t#{hit.q_end}\t#{hit.s_beg.to_i}\t#{hit.s_end.to_i}\t#{hit.definition}\t"
    end

    chimera_annotations = "\n#{report_row.call(best_hit)}\n#{report_row.call(outlier)}"
    seq.annotate(:chimera, chimera_annotations, false)

    nil
  end

  # Prunes +tmp_blast_obj+ in place so that it only describes +new_seqs+.
  #
  # Queries whose query_def is not the seq_name of any sequence in +new_seqs+
  # are removed (sequences flagged as chimeras are dropped so that they are not
  # used later on). In every remaining query, only the hits sharing the
  # accession of the first hit are kept.
  #
  # Returns the same, modified, tmp_blast_obj.
  def select_best_blast(tmp_blast_obj, new_seqs)
    kept_names = {}
    new_seqs.each { |seq| kept_names[seq.seq_name] = true }

    tmp_blast_obj.querys.delete_if { |query| !kept_names[query.query_def] }

    tmp_blast_obj.querys.each do |query|
      next if query.hits.empty?

      best_acc = query.hits[0].acc
      query.hits.delete_if { |hit| hit.acc != best_acc }
    end

    tmp_blast_obj
  end

end
|
@@ -4,20 +4,28 @@ module FlnStats
|
|
4
4
|
def summary_stats
|
5
5
|
stats_file = File.open('fln_results/summary_stats.html', 'w')
|
6
6
|
|
7
|
+
size_filter1 = 200
|
8
|
+
size_filter2 = 500
|
9
|
+
|
7
10
|
# recogemos los trozos de html fijos
|
8
11
|
(html_head, html_st, html_uni, html_db, html_as, html_end) = html_code
|
9
12
|
|
10
13
|
total_seqs = 0
|
11
14
|
status_suma = 0
|
12
15
|
#recogemos los datos que necesitamos de los ficheros de resultados
|
13
|
-
(status_array, db_usage, seqs_number1, error_1_num, seq_uniq, complete_uniq, db_uni_500, db_uni_200, db_longest_one) = annotation_stats
|
14
|
-
(tcode_array, seqs_number2,
|
15
|
-
(ncrna_total, nc_uni_500, nc_uni_200, nc_longest_one)=ncrna_stats
|
16
|
+
(status_array, db_usage, seqs_number1, error_1_num, seq_uniq, complete_uniq, db_uni_500, db_uni_200, db_longest_one) = annotation_stats(size_filter1,size_filter2)
|
17
|
+
(tcode_array, seqs_number2, tc_uni_500, tc_uni_200, tc_longest_one) = testcode_stats(size_filter1,size_filter2)
|
18
|
+
(ncrna_total, nc_uni_500, nc_uni_200, nc_longest_one)=ncrna_stats(size_filter1,size_filter2)
|
19
|
+
(chimera_total, ch_uni_500, ch_uni_200, ch_longest_one, ch_db_usage)=chimera_stats(size_filter1,size_filter2)
|
16
20
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
+
seqs_number1 = (seqs_number1+chimera_total.to_i)
|
22
|
+
total_seqs = (seqs_number1 + seqs_number2 + ncrna_total.to_i)
|
23
|
+
uni_500 = (db_uni_500 + tc_uni_500 + nc_uni_500 + ch_uni_500)
|
24
|
+
uni_200 = (db_uni_200 + tc_uni_200 + nc_uni_200 + ch_uni_200)
|
25
|
+
longest_one = [db_longest_one, tc_longest_one, nc_longest_one, ch_longest_one].max
|
26
|
+
db_usage[0] += ch_db_usage[0]
|
27
|
+
db_usage[1] += ch_db_usage[1]
|
28
|
+
db_usage[2] += ch_db_usage[2]
|
21
29
|
stats_file.puts html_head
|
22
30
|
|
23
31
|
if (total_seqs.to_i > 0)
|
@@ -46,6 +54,15 @@ module FlnStats
|
|
46
54
|
end
|
47
55
|
status_suma += status[0]
|
48
56
|
end
|
57
|
+
|
58
|
+
# adding chimeric seqs
|
59
|
+
stats_file.puts ' <tr>
|
60
|
+
<td colspan="2" align="left">Putative chimera</td>
|
61
|
+
<td align="right">'+chimera_total.to_s+'</td>
|
62
|
+
<td align="right">'+'%.2f' % (100*chimera_total.to_f/total_seqs.to_f).to_s+' %</td>
|
63
|
+
</tr>'
|
64
|
+
status_suma += chimera_total
|
65
|
+
|
49
66
|
# añadimos los coding, P.coding
|
50
67
|
tcode_array.each do |status|
|
51
68
|
if (status[1] == 'Coding')
|
@@ -64,6 +81,7 @@ module FlnStats
|
|
64
81
|
end
|
65
82
|
status_suma += status[0]
|
66
83
|
end
|
84
|
+
|
67
85
|
# se ponen los ncRNA
|
68
86
|
stats_file.puts ' <tr>
|
69
87
|
<td colspan="2" align="left">Putative ncRNA</td>
|
@@ -156,12 +174,12 @@ module FlnStats
|
|
156
174
|
<td align="right">'+'%.2f' % (100*total_seqs.to_f/total_seqs.to_f).to_s+' %</td>
|
157
175
|
</tr>'
|
158
176
|
stats_file.puts ' <tr>
|
159
|
-
<td align="left">Unigenes >
|
177
|
+
<td align="left">Unigenes >'+size_filter2.to_s+'pb</td>
|
160
178
|
<td align="right">'+uni_500.to_s+'</td>
|
161
179
|
<td align="right">'+'%.2f' % (100*uni_500.to_f/total_seqs.to_f).to_s+' %</td>
|
162
180
|
</tr>'
|
163
181
|
stats_file.puts ' <tr>
|
164
|
-
<td align="left">Unigenes >
|
182
|
+
<td align="left">Unigenes >'+size_filter1.to_s+'pb</td>
|
165
183
|
<td align="right">'+uni_200.to_s+'</td>
|
166
184
|
<td align="right">'+'%.2f' % (100*uni_200.to_f/total_seqs.to_f).to_s+' %</td>
|
167
185
|
</tr>'
|
@@ -175,6 +193,8 @@ module FlnStats
|
|
175
193
|
<td align="right">'+seqs_number1.to_s+'</td>
|
176
194
|
<td align="right">'+'%.2f' % (100*seqs_number1.to_f/total_seqs.to_f).to_s+' %</td>
|
177
195
|
</tr>'
|
196
|
+
|
197
|
+
if (seqs_number1.to_i > 0)
|
178
198
|
stats_file.puts ' <tr>
|
179
199
|
<td align="left"> Different orthologue IDs</td>
|
180
200
|
<td align="right">'+seq_uniq.to_s+'</td>
|
@@ -195,21 +215,49 @@ module FlnStats
|
|
195
215
|
<td align="right">'+error_1_num.to_s+'</td>
|
196
216
|
<td align="right">'+'%.2f' % (100*error_1_num.to_f/seqs_number1.to_f).to_s+' %</td>
|
197
217
|
</tr>'
|
218
|
+
stats_file.puts ' <tr>
|
219
|
+
<td align="left"> Putative chimera</td>
|
220
|
+
<td align="right">'+chimera_total.to_s+'</td>
|
221
|
+
<td align="right">'+'%.2f' % (100*chimera_total.to_f/seqs_number1.to_f).to_s+' %</td>
|
222
|
+
</tr>'
|
223
|
+
end
|
198
224
|
stats_file.puts ' <tr>
|
199
225
|
<td align="left">Without orthologue <sup>1</sup></td>
|
200
226
|
<td align="right">'+no_db.to_s+'</td>
|
201
227
|
<td align="right">'+'%.2f' % (100*seqs_number2.to_f/total_seqs.to_f).to_s+' %</td>
|
202
228
|
</tr>'
|
229
|
+
|
230
|
+
if (no_db.to_i > 0) && (seqs_number2.to_i > 0)
|
203
231
|
stats_file.puts ' <tr>
|
204
|
-
<td align="left"> Coding</td>
|
232
|
+
<td align="left"> Coding (all)</td>
|
205
233
|
<td align="right">'+tcode_array[0][0].to_s+'</td>
|
206
234
|
<td align="right">'+'%.2f' % (100*tcode_array[0][0].to_f/no_db.to_f).to_s+' %</td>
|
207
235
|
</tr>'
|
208
236
|
stats_file.puts ' <tr>
|
209
|
-
<td align="left">
|
237
|
+
<td align="left"> Coding > '+size_filter1.to_s+'bp</td>
|
238
|
+
<td align="right">'+tcode_array[0][2].to_s+'</td>
|
239
|
+
<td align="right">'+'%.2f' % (100*tcode_array[0][2].to_f/no_db.to_f).to_s+' %</td>
|
240
|
+
</tr>'
|
241
|
+
stats_file.puts ' <tr>
|
242
|
+
<td align="left"> Coding > '+size_filter2.to_s+'bp</td>
|
243
|
+
<td align="right">'+tcode_array[0][3].to_s+'</td>
|
244
|
+
<td align="right">'+'%.2f' % (100*tcode_array[0][3].to_f/no_db.to_f).to_s+' %</td>
|
245
|
+
</tr>'
|
246
|
+
stats_file.puts ' <tr>
|
247
|
+
<td align="left"> Putative Coding (all)</td>
|
210
248
|
<td align="right">'+tcode_array[1][0].to_s+'</td>
|
211
249
|
<td align="right">'+'%.2f' % (100*tcode_array[1][0].to_f/no_db.to_f).to_s+' %</td>
|
212
250
|
</tr>'
|
251
|
+
stats_file.puts ' <tr>
|
252
|
+
<td align="left"> Putative Coding > '+size_filter1.to_s+'bp</td>
|
253
|
+
<td align="right">'+tcode_array[1][2].to_s+'</td>
|
254
|
+
<td align="right">'+'%.2f' % (100*tcode_array[1][2].to_f/no_db.to_f).to_s+' %</td>
|
255
|
+
</tr>'
|
256
|
+
stats_file.puts ' <tr>
|
257
|
+
<td align="left"> Putative Coding > '+size_filter2.to_s+'bp</td>
|
258
|
+
<td align="right">'+tcode_array[1][3].to_s+'</td>
|
259
|
+
<td align="right">'+'%.2f' % (100*tcode_array[1][3].to_f/no_db.to_f).to_s+' %</td>
|
260
|
+
</tr>'
|
213
261
|
stats_file.puts ' <tr>
|
214
262
|
<td align="left"> Putative ncRNA</td>
|
215
263
|
<td align="right">'+ncrna_total.to_s+'</td>
|
@@ -221,16 +269,19 @@ module FlnStats
|
|
221
269
|
<td align="right">'+'%.2f' % (100*tcode_array[2][0].to_f/no_db.to_f).to_s+' %</td>
|
222
270
|
</tr>'
|
223
271
|
stats_file.puts ' <tr>
|
224
|
-
<td align="left"> Unknown
|
225
|
-
<td align="right">'+
|
226
|
-
<td align="right">'+'%.2f' % (100*
|
227
|
-
</tr>
|
228
|
-
|
272
|
+
<td align="left"> Unknown > '+size_filter1.to_s+'bp</td>
|
273
|
+
<td align="right">'+tcode_array[2][2].to_s+'</td>
|
274
|
+
<td align="right">'+'%.2f' % (100*tcode_array[2][2].to_f/no_db.to_f).to_s+' %</td>
|
275
|
+
</tr>'
|
276
|
+
stats_file.puts ' <tr>
|
277
|
+
<td align="left"> Unknown > '+size_filter2.to_s+'bp</td>
|
278
|
+
<td align="right">'+tcode_array[2][3].to_s+'</td>
|
279
|
+
<td align="right">'+'%.2f' % (100*tcode_array[2][3].to_f/no_db.to_f).to_s+' %</td>
|
280
|
+
</tr>'
|
281
|
+
end
|
282
|
+
stats_file.puts ' </table>
|
229
283
|
<sup>1</sup> Percents for subclassifications of this category were calculated using this line as 100% reference.'
|
230
284
|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
285
|
end
|
235
286
|
stats_file.puts html_end
|
236
287
|
|
@@ -309,7 +360,7 @@ module FlnStats
|
|
309
360
|
end
|
310
361
|
|
311
362
|
|
312
|
-
def annotation_stats
|
363
|
+
def annotation_stats(size_filter1,size_filter2)
|
313
364
|
|
314
365
|
seqs_number = 0
|
315
366
|
array_of_all_accs = []
|
@@ -353,10 +404,10 @@ module FlnStats
|
|
353
404
|
end
|
354
405
|
|
355
406
|
# -------------------------------------------------------------------------
|
356
|
-
if (fasta_length.to_i >=
|
407
|
+
if (fasta_length.to_i >= size_filter1)
|
357
408
|
uni_200 += 1
|
358
409
|
end
|
359
|
-
if (fasta_length.to_i >=
|
410
|
+
if (fasta_length.to_i >= size_filter2)
|
360
411
|
uni_500 += 1
|
361
412
|
end
|
362
413
|
# -------------------------------------------------------------------------
|
@@ -394,18 +445,17 @@ module FlnStats
|
|
394
445
|
end
|
395
446
|
|
396
447
|
|
397
|
-
def testcode_stats
|
448
|
+
def testcode_stats(size_filter1,size_filter2)
|
398
449
|
|
399
450
|
seqs_number = 0
|
400
|
-
unk_200 = 0
|
401
451
|
uni_500 = 0
|
402
452
|
uni_200 = 0
|
403
453
|
longest_one = 0
|
404
454
|
|
405
455
|
# total, status
|
406
|
-
coding_stats = [0,'Coding']
|
407
|
-
p_coding_stats = [0,'Putative Coding']
|
408
|
-
unknown_stats = [0,'Unknown']
|
456
|
+
coding_stats = [0,'Coding',0,0]
|
457
|
+
p_coding_stats = [0,'Putative Coding',0,0]
|
458
|
+
unknown_stats = [0,'Unknown',0,0]
|
409
459
|
|
410
460
|
File.open('fln_results/new_coding.txt').each do |line|
|
411
461
|
line.chomp!
|
@@ -419,17 +469,31 @@ module FlnStats
|
|
419
469
|
end
|
420
470
|
|
421
471
|
# -------------------------------------------------------------------------
|
422
|
-
if (fasta_length.to_i >=
|
472
|
+
if (fasta_length.to_i >= size_filter1)
|
423
473
|
uni_200 += 1
|
424
474
|
end
|
425
|
-
if (fasta_length.to_i >=
|
475
|
+
if (fasta_length.to_i >= size_filter2)
|
426
476
|
uni_500 += 1
|
427
477
|
end
|
428
478
|
# -------------------------------------------------------------------------
|
429
479
|
|
430
|
-
if (fasta_length.to_i
|
431
|
-
if (status == '
|
432
|
-
|
480
|
+
if (fasta_length.to_i > size_filter1)
|
481
|
+
if (status == 'coding')
|
482
|
+
coding_stats[2] += 1
|
483
|
+
elsif (status == 'putative_coding')
|
484
|
+
p_coding_stats[2] += 1
|
485
|
+
elsif (status == 'unknown')
|
486
|
+
unknown_stats[2] += 1
|
487
|
+
end
|
488
|
+
end
|
489
|
+
|
490
|
+
if (fasta_length.to_i > size_filter2)
|
491
|
+
if (status == 'coding')
|
492
|
+
coding_stats[3] += 1
|
493
|
+
elsif (status == 'putative_coding')
|
494
|
+
p_coding_stats[3] += 1
|
495
|
+
elsif (status == 'unknown')
|
496
|
+
unknown_stats[3] += 1
|
433
497
|
end
|
434
498
|
end
|
435
499
|
|
@@ -447,11 +511,11 @@ module FlnStats
|
|
447
511
|
|
448
512
|
status_array = [coding_stats, p_coding_stats, unknown_stats]
|
449
513
|
|
450
|
-
return [status_array, seqs_number,
|
514
|
+
return [status_array, seqs_number, uni_500, uni_200, longest_one]
|
451
515
|
end
|
452
516
|
|
453
517
|
|
454
|
-
def ncrna_stats
|
518
|
+
def ncrna_stats(size_filter1,size_filter2)
|
455
519
|
|
456
520
|
uni_500 = 0
|
457
521
|
uni_200 = 0
|
@@ -468,10 +532,10 @@ module FlnStats
|
|
468
532
|
longest_one = fasta_length.to_i
|
469
533
|
end
|
470
534
|
# -------------------------------------------------------------------------
|
471
|
-
if (fasta_length.to_i >=
|
535
|
+
if (fasta_length.to_i >= size_filter1)
|
472
536
|
uni_200 += 1
|
473
537
|
end
|
474
|
-
if (fasta_length.to_i >=
|
538
|
+
if (fasta_length.to_i >= size_filter2)
|
475
539
|
uni_500 += 1
|
476
540
|
end
|
477
541
|
# -------------------------------------------------------------------------
|
@@ -484,5 +548,53 @@ module FlnStats
|
|
484
548
|
return [nc_total, uni_500, uni_200, longest_one]
|
485
549
|
end
|
486
550
|
|
551
|
+
# Summarises fln_results/chimeric_sequences.txt (relative to the current
# working directory).
#
# Only rows whose fifth tab-separated field is exactly 'Putative chimera' are
# counted; blank lines and any other rows are ignored. Every counter is halved
# (integer division) before being returned.
# NOTE(review): the halving presumably reflects two report rows per chimeric
# sequence -- confirm against the code that writes the file.
#
# size_filter1 - minimum length (inclusive) counted in the third return value.
# size_filter2 - minimum length (inclusive) counted in the second return value.
#
# Returns [chimera_total, count >= size_filter2, count >= size_filter1,
# longest_length, db_usage], where db_usage is [other, sp_*, tr_*] counts
# taken from the prefix of the database-name column. When the file does not
# exist, returns [0, 0, 0, 0, [0, 0, 0]].
def chimera_stats(size_filter1,size_filter2)

  uni_500 = 0
  uni_200 = 0
  ch_total = 0
  longest_one = 0
  db_usage = [0,0,0]

  chimera_path = 'fln_results/chimeric_sequences.txt'

  # File.exist? replaces File.exists?, which no longer exists in Ruby >= 3.2.
  return [0, 0, 0, longest_one, db_usage] unless File.exist?(chimera_path)

  # File.foreach closes the file once iteration finishes.
  File.foreach(chimera_path) do |line|
    line.chomp!
    next if line.empty?

    (_name, fasta_length, _acc, db_name, status) = line.split("\t")
    next unless status == 'Putative chimera'

    seq_length = fasta_length.to_i
    longest_one = seq_length if seq_length > longest_one
    # -------------------------------------------------------------------------
    uni_200 += 1 if seq_length >= size_filter1
    uni_500 += 1 if seq_length >= size_filter2
    # -------------------------------------------------------------------------
    if (db_name =~ /^sp_/)
      db_usage[1] += 1
    elsif (db_name =~ /^tr_/)
      db_usage[2] += 1
    else
      db_usage[0] += 1
    end
    # -------------------------------------------------------------------------
    ch_total += 1
  end

  db_usage.map! { |count| count / 2 }

  return [(ch_total/2), (uni_500/2), (uni_200/2), longest_one, db_usage]
end
|
597
|
+
end
|
598
|
+
|
487
599
|
|
488
600
|
end
|
@@ -8,6 +8,9 @@ require 'fl_string_utils'
|
|
8
8
|
require "lcs" # like the class simliar of seqtrim, return the longest common sequence
|
9
9
|
require "test_code"
|
10
10
|
|
11
|
+
require 'chimeric_seqs'
|
12
|
+
include ChimericSeqs
|
13
|
+
|
11
14
|
require 'fl_analysis'
|
12
15
|
include FlAnalysis
|
13
16
|
|
@@ -46,7 +49,12 @@ class MyWorker < ScbiMapreduce::Worker
|
|
46
49
|
# ejecuta blast utilizando los parametros fichero de entrada, base de datos, fichero de salida y tipo de blast
|
47
50
|
def run_blast(input, database, blast_type, evalue)
|
48
51
|
|
49
|
-
|
52
|
+
if (@options[:chimera].nil?)
|
53
|
+
blast=BatchBlast.new("-db #{database}",blast_type,"-evalue #{evalue} -max_target_seqs 1")
|
54
|
+
else
|
55
|
+
blast=BatchBlast.new("-db #{database}",blast_type,"-evalue #{evalue}")
|
56
|
+
end
|
57
|
+
|
50
58
|
blast_result = blast.do_blast_seqs(input, :xml)
|
51
59
|
|
52
60
|
return blast_result
|
@@ -73,34 +81,71 @@ class MyWorker < ScbiMapreduce::Worker
|
|
73
81
|
# do blast
|
74
82
|
my_blast = run_blast(seqs, "#{@options[:user_db]}", 'blastx', '1e-6')
|
75
83
|
|
84
|
+
# chimera detection
|
85
|
+
if (!@options[:chimera].nil?)
|
86
|
+
seqs.each_with_index do |seq,i|
|
87
|
+
if (!my_blast.querys[i].hits[0].nil?)
|
88
|
+
search_chimeras(seq, my_blast.querys[i], @options, user_db_name)
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
seqs=seqs.select{|s| s.get_annotations(:chimera).empty?}
|
93
|
+
my_blast = select_best_blast(my_blast, seqs)
|
94
|
+
end
|
95
|
+
|
76
96
|
# split and parse blast
|
77
97
|
seqs.each_with_index do |seq,i|
|
78
98
|
analiza_orf_y_fl(seq, my_blast.querys[i], @options, user_db_name)
|
79
99
|
end
|
80
100
|
|
81
|
-
new_seqs=seqs.select{|s| s.get_annotations(:complete).empty?}
|
101
|
+
new_seqs=seqs.select{|s| (s.get_annotations(:complete).empty? && s.get_annotations(:chimera).empty?)}
|
82
102
|
|
83
103
|
else
|
84
104
|
new_seqs = seqs
|
85
105
|
end
|
106
|
+
|
107
|
+
return if new_seqs.empty?
|
86
108
|
|
87
109
|
# -------------------------------------------- UniProt (sp)
|
88
110
|
# blast
|
89
111
|
sp_path=File.join("sp_#{@options[:tax_group]}","sp_#{@options[:tax_group]}.fasta")
|
90
112
|
my_blast = run_blast(new_seqs, sp_path, 'blastx', '1e-6')
|
91
113
|
|
114
|
+
# chimera detection
|
115
|
+
if (!@options[:chimera].nil?)
|
116
|
+
new_seqs.each_with_index do |seq,i|
|
117
|
+
if (!my_blast.querys[i].hits[0].nil?)
|
118
|
+
search_chimeras(seq, my_blast.querys[i], @options, "sp_#{@options[:tax_group]}")
|
119
|
+
end
|
120
|
+
end
|
121
|
+
new_seqs=seqs.select{|s| s.get_annotations(:chimera).empty?}
|
122
|
+
my_blast = select_best_blast(my_blast, new_seqs)
|
123
|
+
end
|
124
|
+
|
92
125
|
# split and parse blast
|
93
126
|
new_seqs.each_with_index do |seq,i|
|
94
127
|
analiza_orf_y_fl(seq, my_blast.querys[i], @options, "sp_#{@options[:tax_group]}")
|
95
128
|
end
|
96
129
|
|
97
|
-
new_seqs=seqs.select{|s| s.get_annotations(:complete).empty?}
|
130
|
+
new_seqs=seqs.select{|s| (s.get_annotations(:complete).empty? && s.get_annotations(:chimera).empty?)}
|
131
|
+
return if new_seqs.empty?
|
98
132
|
|
99
133
|
# -------------------------------------------- UniProt (tr)
|
100
134
|
# blast
|
101
135
|
tr_path=File.join("tr_#{@options[:tax_group]}","tr_#{@options[:tax_group]}.fasta")
|
102
136
|
my_blast = run_blast(new_seqs, tr_path, 'blastx', '1e-6')
|
103
137
|
|
138
|
+
# chimera detection
|
139
|
+
if (!@options[:chimera].nil?)
|
140
|
+
new_seqs.each_with_index do |seq,i|
|
141
|
+
if (!my_blast.querys[i].hits[0].nil?)
|
142
|
+
search_chimeras(seq, my_blast.querys[i], @options, "tr_#{@options[:tax_group]}")
|
143
|
+
end
|
144
|
+
end
|
145
|
+
new_seqs=new_seqs.select{|s| s.get_annotations(:chimera).empty?}
|
146
|
+
my_blast = select_best_blast(my_blast, new_seqs)
|
147
|
+
end
|
148
|
+
|
104
149
|
# split and parse blast
|
105
150
|
new_seqs.each_with_index do |seq,i|
|
106
151
|
analiza_orf_y_fl(seq, my_blast.querys[i], @options, "tr_#{@options[:tax_group]}")
|
@@ -108,8 +153,9 @@ class MyWorker < ScbiMapreduce::Worker
|
|
108
153
|
|
109
154
|
# -------------------------------------------- Test Code
|
110
155
|
# the sequences without a reliable similarity with an orthologue are processed with Test Code
|
111
|
-
testcode_input=seqs.select{|s| !s.get_annotations(:apply_tcode).empty?}
|
112
|
-
|
156
|
+
testcode_input=seqs.select{|s| (!s.get_annotations(:apply_tcode).empty? && s.get_annotations(:chimera).empty?)}
|
157
|
+
return if testcode_input.empty?
|
158
|
+
|
113
159
|
# active this line to test tcode, and comment all lines above in this function
|
114
160
|
# testcode_input=seqs
|
115
161
|
|
@@ -119,6 +165,8 @@ class MyWorker < ScbiMapreduce::Worker
|
|
119
165
|
|
120
166
|
# -------------------------------------------- nc RNA
|
121
167
|
unknown_seqs=seqs.select{|s| !s.get_annotations(:tcode_unknown).empty?}
|
168
|
+
return if unknown_seqs.empty?
|
169
|
+
|
122
170
|
# run blastn
|
123
171
|
ncrna_path=File.join('nc_rna_db','ncrna_fln_100.fasta')
|
124
172
|
my_blast = run_blast(unknown_seqs, ncrna_path, 'blastn', '1e-3')
|
@@ -34,6 +34,15 @@ class MyWorkerManager < ScbiMapreduce::WorkManager
|
|
34
34
|
@@nc_rna_file = File.open("fln_results/nc_rnas.txt", 'w')
|
35
35
|
@@nc_rna_file.puts file_head
|
36
36
|
|
37
|
+
if (!options[:chimera].nil?)
|
38
|
+
@@chimera_file = File.open("fln_results/chimeric_sequences.txt", 'w')
|
39
|
+
@@chimera_file.puts file_head
|
40
|
+
else
|
41
|
+
if File.exists?("fln_results/chimeric_sequences.txt")
|
42
|
+
File.delete("fln_results/chimeric_sequences.txt")
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
37
46
|
# @@error_fasta_file = File.open("fln_results/error_seqs.fasta", 'w')
|
38
47
|
# @@error_file = File.open("fln_results/errors_info.txt", 'w')
|
39
48
|
|
@@ -50,6 +59,10 @@ class MyWorkerManager < ScbiMapreduce::WorkManager
|
|
50
59
|
@@tcode_file.close
|
51
60
|
@@nc_rna_file.close
|
52
61
|
|
62
|
+
if (!@@options[:chimera].nil?)
|
63
|
+
@@chimera_file.close
|
64
|
+
end
|
65
|
+
|
53
66
|
# @@error_fasta_file.close
|
54
67
|
# @@error_file.close
|
55
68
|
|
@@ -113,50 +126,101 @@ class MyWorkerManager < ScbiMapreduce::WorkManager
|
|
113
126
|
|
114
127
|
def write_seq(seq)
|
115
128
|
begin
|
116
|
-
# --------------------------------------------------------
|
117
|
-
if (
|
129
|
+
# -------------------------------------------------------- Chimeric Seqs
|
130
|
+
if (!@@options[:chimera].nil?)
|
131
|
+
if (q=seq.get_annotations(:chimera).first)
|
132
|
+
@@chimera_file.puts q[:message]
|
133
|
+
# -------------------------------------------------- Complete Seqs
|
134
|
+
elsif (e=seq.get_annotations(:complete).first)
|
135
|
+
|
136
|
+
@@annotation_file.puts e[:message]
|
137
|
+
|
138
|
+
if (a=seq.get_annotations(:alignment).first)
|
139
|
+
@@alignment_file.puts a[:message]
|
140
|
+
end
|
141
|
+
|
142
|
+
if (p=seq.get_annotations(:protein).first)
|
143
|
+
@@prot_file.puts p[:message]
|
144
|
+
end
|
145
|
+
|
146
|
+
if (n=seq.get_annotations(:nucleotide).first)
|
147
|
+
@@nts_file.puts n[:message]
|
148
|
+
end
|
149
|
+
# --------------------------------------------------- Non Complete Seqs
|
150
|
+
elsif (e=seq.get_annotations(:tmp_annotation).first)
|
118
151
|
|
119
|
-
|
152
|
+
@@annotation_file.puts e[:message][0]
|
120
153
|
|
121
|
-
|
122
|
-
|
123
|
-
|
154
|
+
if (a=seq.get_annotations(:alignment).first)
|
155
|
+
if !a[:message].empty?
|
156
|
+
@@alignment_file.puts a[:message]
|
157
|
+
end
|
158
|
+
end
|
124
159
|
|
125
|
-
|
126
|
-
|
127
|
-
|
160
|
+
if (p=seq.get_annotations(:protein).first)
|
161
|
+
if !p[:message].empty?
|
162
|
+
@@prot_file.puts p[:message]
|
163
|
+
end
|
164
|
+
end
|
128
165
|
|
129
|
-
|
130
|
-
|
166
|
+
if (n=seq.get_annotations(:nucleotide).first)
|
167
|
+
@@nts_file.puts n[:message]
|
168
|
+
end
|
169
|
+
# ------------------------------------------------- nc RNA
|
170
|
+
elsif (nc=seq.get_annotations(:ncrna).first)
|
171
|
+
@@nc_rna_file.puts nc[:message]
|
172
|
+
# ------------------------------------------------- Test Code
|
173
|
+
elsif (t=seq.get_annotations(:tcode).first)
|
174
|
+
@@tcode_file.puts t[:message]
|
131
175
|
end
|
132
|
-
#
|
133
|
-
|
176
|
+
# ---------------------------------------------------------------------------------
|
177
|
+
# -------------------------------------------------------- without Chimeric Seqs Mode
|
178
|
+
else
|
179
|
+
# ------------------------------------------------- Complete Seqs
|
180
|
+
if (e=seq.get_annotations(:complete).first)
|
134
181
|
|
135
|
-
|
182
|
+
@@annotation_file.puts e[:message]
|
136
183
|
|
137
|
-
|
138
|
-
if !a[:message].empty?
|
184
|
+
if (a=seq.get_annotations(:alignment).first)
|
139
185
|
@@alignment_file.puts a[:message]
|
140
186
|
end
|
141
|
-
end
|
142
187
|
|
143
|
-
|
144
|
-
if !p[:message].empty?
|
188
|
+
if (p=seq.get_annotations(:protein).first)
|
145
189
|
@@prot_file.puts p[:message]
|
146
190
|
end
|
147
|
-
end
|
148
191
|
|
149
|
-
|
150
|
-
|
192
|
+
if (n=seq.get_annotations(:nucleotide).first)
|
193
|
+
@@nts_file.puts n[:message]
|
194
|
+
end
|
195
|
+
# ------------------------------------------------- Non Complete Seqs
|
196
|
+
elsif (e=seq.get_annotations(:tmp_annotation).first)
|
197
|
+
|
198
|
+
@@annotation_file.puts e[:message][0]
|
199
|
+
|
200
|
+
if (a=seq.get_annotations(:alignment).first)
|
201
|
+
if !a[:message].empty?
|
202
|
+
@@alignment_file.puts a[:message]
|
203
|
+
end
|
204
|
+
end
|
205
|
+
|
206
|
+
if (p=seq.get_annotations(:protein).first)
|
207
|
+
if !p[:message].empty?
|
208
|
+
@@prot_file.puts p[:message]
|
209
|
+
end
|
210
|
+
end
|
211
|
+
|
212
|
+
if (n=seq.get_annotations(:nucleotide).first)
|
213
|
+
@@nts_file.puts n[:message]
|
214
|
+
end
|
215
|
+
# ------------------------------------------------- nc RNA
|
216
|
+
elsif (nc=seq.get_annotations(:ncrna).first)
|
217
|
+
@@nc_rna_file.puts nc[:message]
|
218
|
+
# ------------------------------------------------- Test Code
|
219
|
+
elsif (t=seq.get_annotations(:tcode).first)
|
220
|
+
@@tcode_file.puts t[:message]
|
151
221
|
end
|
152
|
-
# -------------------------------------------------------- nc RNA
|
153
|
-
elsif (nc=seq.get_annotations(:ncrna).first)
|
154
|
-
@@nc_rna_file.puts nc[:message]
|
155
|
-
# -------------------------------------------------------- Test Code
|
156
|
-
elsif (t=seq.get_annotations(:tcode).first)
|
157
|
-
@@tcode_file.puts t[:message]
|
158
222
|
end
|
159
|
-
#
|
223
|
+
# ------------------------------------------------- errors
|
160
224
|
# if e=seq.get_annotations(:error).first
|
161
225
|
# if !e[:message].empty?
|
162
226
|
# @@error_fasta_file.puts ">#{seq.seq_name}\n#{seq.seq_fasta}"
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: full_lengther_next
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.0.
|
5
|
+
version: 0.0.8
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Noe Fernandez & Dario Guerrero
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2012-
|
13
|
+
date: 2012-11-28 00:00:00 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: xml-simple
|
@@ -31,7 +31,7 @@ dependencies:
|
|
31
31
|
requirements:
|
32
32
|
- - ">="
|
33
33
|
- !ruby/object:Gem::Version
|
34
|
-
version: 0.0.
|
34
|
+
version: 0.0.37
|
35
35
|
type: :runtime
|
36
36
|
version_requirements: *id002
|
37
37
|
- !ruby/object:Gem::Dependency
|
@@ -97,6 +97,7 @@ files:
|
|
97
97
|
- bin/full_lengther_next
|
98
98
|
- History.txt
|
99
99
|
- lib/full_lengther_next/classes/common_functions.rb
|
100
|
+
- lib/full_lengther_next/classes/chimeric_seqs.rb
|
100
101
|
- lib/full_lengther_next/classes/fl_analysis.rb
|
101
102
|
- lib/full_lengther_next/classes/fl_string_utils.rb
|
102
103
|
- lib/full_lengther_next/classes/fln_stats.rb
|
@@ -142,7 +143,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
142
143
|
requirements: []
|
143
144
|
|
144
145
|
rubyforge_project: full_lengther_next
|
145
|
-
rubygems_version: 1.
|
146
|
+
rubygems_version: 1.8.24
|
146
147
|
signing_key:
|
147
148
|
specification_version: 3
|
148
149
|
summary: FULL-LENGTHERNEXT is a tool adapted to NGS technologies, able to work in parallel and in a distributed way to minimise computing time
|