full_lengther_next 0.0.6 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +8 -0
- data/Manifest.txt +1 -0
- data/Rakefile +1 -1
- data/bin/full_lengther_next +8 -1
- data/bin/make_user_db.rb +5 -5
- data/lib/full_lengther_next.rb +3 -3
- data/lib/full_lengther_next/classes/chimeric_seqs.rb +78 -0
- data/lib/full_lengther_next/classes/fln_stats.rb +148 -36
- data/lib/full_lengther_next/classes/my_worker.rb +53 -5
- data/lib/full_lengther_next/classes/my_worker_manager.rb +93 -29
- metadata +5 -4
data/History.txt
CHANGED
data/Manifest.txt
CHANGED
@@ -3,6 +3,7 @@ bin/make_user_db.rb
|
|
3
3
|
bin/full_lengther_next
|
4
4
|
History.txt
|
5
5
|
lib/full_lengther_next/classes/common_functions.rb
|
6
|
+
lib/full_lengther_next/classes/chimeric_seqs.rb
|
6
7
|
lib/full_lengther_next/classes/fl_analysis.rb
|
7
8
|
lib/full_lengther_next/classes/fl_string_utils.rb
|
8
9
|
lib/full_lengther_next/classes/fln_stats.rb
|
data/Rakefile
CHANGED
@@ -20,7 +20,7 @@ $hoe = Hoe.spec 'full_lengther_next' do
|
|
20
20
|
# self.extra_deps << ['gnuplot','>=0']
|
21
21
|
# self.extra_deps << ['term-ansicolor','>=1.0.5']
|
22
22
|
self.extra_deps << ['xml-simple','>=1.0.12']
|
23
|
-
self.extra_deps << ['scbi_blast','>=0.0.
|
23
|
+
self.extra_deps << ['scbi_blast','>=0.0.37']
|
24
24
|
self.extra_deps << ['scbi_mapreduce','>=0.0.29']
|
25
25
|
self.extra_deps << ['scbi_fasta','>=0.1.7']
|
26
26
|
# self.extra_deps << ['scbi_fastq','>=0.0.13']
|
data/bin/full_lengther_next
CHANGED
@@ -50,6 +50,11 @@ optparse = OptionParser.new do |opts|
|
|
50
50
|
options[:distance] = distance.to_i
|
51
51
|
end
|
52
52
|
|
53
|
+
options[:chimera] = nil
|
54
|
+
opts.on( '-q', '--chimera_detection', "apply chimera detection mode\n\n" ) do |chimera|
|
55
|
+
options[:chimera] = chimera
|
56
|
+
end
|
57
|
+
|
53
58
|
options[:workers] = 2
|
54
59
|
opts.on( '-w', '--workers INTEGER/FILE', 'Number of CPUs, or a file containing machine names to launch workers with ssh' ) do |workers|
|
55
60
|
if File.exists?(workers)
|
@@ -91,7 +96,7 @@ optparse = OptionParser.new do |opts|
|
|
91
96
|
|
92
97
|
|
93
98
|
# Set a banner, displayed at the top of the help screen.
|
94
|
-
opts.banner = "
|
99
|
+
opts.banner = "\nUsage: full_lengther_next -f input.fasta -g [fungi|human|invertebrates|mammals|plants|rodents|vertebrates] [options]\n\n"
|
95
100
|
|
96
101
|
# This displays the help screen
|
97
102
|
opts.on( '-h', '--help', 'Display this screen' ) do
|
@@ -172,6 +177,8 @@ require 'my_worker_manager'
|
|
172
177
|
$LOG = Logger.new(STDOUT)
|
173
178
|
$LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
|
174
179
|
|
180
|
+
# puts "ROOT_PATH: #{ROOT_PATH}"
|
181
|
+
|
175
182
|
custom_worker_file = File.join(File.dirname(ROOT_PATH),'lib','full_lengther_next','classes','my_worker.rb')
|
176
183
|
|
177
184
|
$LOG.info 'Starting server'
|
data/bin/make_user_db.rb
CHANGED
@@ -126,16 +126,16 @@ end
|
|
126
126
|
ENV['BLASTDB']=formatted_db_path
|
127
127
|
|
128
128
|
|
129
|
-
if !File.exists?(File.join(
|
130
|
-
Dir.mkdir(
|
129
|
+
if !File.exists?(File.join(formatted_db_path, my_group))
|
130
|
+
Dir.mkdir(File.join(formatted_db_path,my_group))
|
131
131
|
end
|
132
132
|
|
133
|
-
output_file_path=File.join(
|
133
|
+
output_file_path=File.join(formatted_db_path,my_group,my_group+".fasta")
|
134
134
|
|
135
135
|
output_file = File.new(output_file_path, "w")
|
136
136
|
|
137
|
-
filter_incomplete_seqs(output_file, File.join(
|
138
|
-
filter_incomplete_seqs(output_file, File.join(
|
137
|
+
filter_incomplete_seqs(output_file, File.join(formatted_db_path, "uniprot_sprot_#{uniprot_group}.dat"), my_group)
|
138
|
+
filter_incomplete_seqs(output_file, File.join(formatted_db_path, "uniprot_trembl_#{uniprot_group}.dat"), my_group)
|
139
139
|
|
140
140
|
output_file.close
|
141
141
|
|
data/lib/full_lengther_next.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
$:.unshift(File.dirname(__FILE__)) unless
|
2
2
|
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
3
3
|
|
4
|
-
|
4
|
+
# ROOT_PATH=File.join(File.dirname(__FILE__),'full_lengther_next')
|
5
5
|
|
6
|
-
$: << File.expand_path(File.join(
|
6
|
+
$: << File.expand_path(File.join(File.dirname(__FILE__), 'full_lengther_next', 'classes'))
|
7
7
|
|
8
8
|
|
9
9
|
module FullLengtherNext
|
10
|
-
VERSION = '0.0.
|
10
|
+
VERSION = '0.0.8'
|
11
11
|
|
12
12
|
FULLLENGHTER_VERSION = VERSION
|
13
13
|
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
|
2
|
+
require 'scbi_blast'
|
3
|
+
|
4
|
+
# Helpers to flag putative chimeric sequences from BLAST results and to prune
# a BLAST result set once the chimeric sequences have been discarded.
module ChimericSeqs

  # Annotates +seq+ as a putative chimera when its BLAST hits point to two
  # different subjects on non-overlapping regions of the query.
  #
  # The query region covered by the best hit (hits[0]) is first widened with
  # every hit that overlaps it. The first hit lying outside that region is then
  # inspected: when its accession differs from the best hit's accession, a
  # two-row, tab-separated message is stored with
  # seq.annotate(:chimera, message, false).
  #
  # @param seq          object responding to seq_name, fasta_length and annotate
  # @param blast_query  object responding to query_def and hits
  # @param options      unused in this method; kept for interface compatibility
  # @param db_name      database label written into the annotation rows
  # @raise [RuntimeError] when seq and blast_query belong to different queries
  # @return the value is not meaningful; callers should ignore it
  def search_chimeras(seq, blast_query, options, db_name)

    # used to detect if the sequence and the blast are from different query
    if seq.seq_name != blast_query.query_def
      puts "#{seq.seq_name} --> #{blast_query.query_def}"
      raise "BLAST query name and sequence are different"
    end

    hits = blast_query.hits
    best_hit = hits[0]
    # a query without hits cannot be a chimera (and has no reference region)
    return if best_hit.nil?

    ref_hit_beg = best_hit.q_beg
    ref_hit_end = best_hit.q_end

    # widen the reference region with every hit that overlaps it
    hits.each do |hit|
      next unless chimera_region_overlap?(ref_hit_beg, ref_hit_end, hit)

      ref_hit_beg = [ref_hit_beg, hit.q_beg].min
      ref_hit_end = [ref_hit_end, hit.q_end].max
    end

    hits.each do |hit|
      next if chimera_region_overlap?(ref_hit_beg, ref_hit_end, hit)

      if hit.acc != best_hit.acc
        gene_pair = "#{best_hit.acc} - #{hit.acc}"
        chimera_annotations =
          chimera_annotation_row(blast_query.query_def, seq.fasta_length, best_hit, db_name, gene_pair) +
          chimera_annotation_row(blast_query.query_def, seq.fasta_length, hit, db_name, gene_pair)
        seq.annotate(:chimera, chimera_annotations, false)
      end

      # NOTE(review): only the first hit outside the reference region is ever
      # examined, even when it shares the best hit's accession. Kept as in the
      # original; confirm that later hits should not be checked as well.
      return
    end

  end

  # Prunes +tmp_blast_obj+ in place: queries whose name is not among
  # +new_seqs+ are removed, and every remaining query keeps only the hits that
  # share the accession of its first hit.
  #
  # @param tmp_blast_obj  object whose querys is an Array of queries
  #                       (each responding to query_def and hits)
  # @param new_seqs       sequences (responding to seq_name) to keep
  # @return the same +tmp_blast_obj+, modified in place
  def select_best_blast(tmp_blast_obj, new_seqs)

    kept_names = {}
    new_seqs.each { |seq| kept_names[seq.seq_name] = true }

    # queries flagged as chimeras are removed so they are not used afterwards
    tmp_blast_obj.querys.delete_if { |query| !kept_names[query.query_def] }

    tmp_blast_obj.querys.each do |query|
      best_hit = query.hits[0]
      next if best_hit.nil?

      best_acc = best_hit.acc
      query.hits.delete_if { |hit| hit.acc != best_acc }
    end

    return tmp_blast_obj
  end

  private

  # True when +hit+ overlaps the query region ref_beg..ref_end.
  def chimera_region_overlap?(ref_beg, ref_end, hit)
    ((ref_beg <= hit.q_beg) && (ref_end > hit.q_beg)) ||
      ((hit.q_beg <= ref_beg) && (hit.q_end > ref_beg))
  end

  # Builds one tab-separated report row (with a leading newline) for +hit+.
  def chimera_annotation_row(query_def, seq_length, hit, db_name, gene_pair)
    "\n#{query_def}\t#{seq_length}\t#{hit.acc}\t#{db_name}\tPutative chimera\t\t#{hit.e_val}\t#{hit.ident}\t\t\tPutative chimera detected showing similarity with two different genes #{gene_pair}\t#{hit.q_frame}\t#{hit.q_beg}\t#{hit.q_end}\t#{hit.s_beg.to_i}\t#{hit.s_end.to_i}\t#{hit.definition}\t"
  end

end
|
@@ -4,20 +4,28 @@ module FlnStats
|
|
4
4
|
def summary_stats
|
5
5
|
stats_file = File.open('fln_results/summary_stats.html', 'w')
|
6
6
|
|
7
|
+
size_filter1 = 200
|
8
|
+
size_filter2 = 500
|
9
|
+
|
7
10
|
# recogemos los trozos de html fijos
|
8
11
|
(html_head, html_st, html_uni, html_db, html_as, html_end) = html_code
|
9
12
|
|
10
13
|
total_seqs = 0
|
11
14
|
status_suma = 0
|
12
15
|
#recogemos los datos que necesitamos de los ficheros de resultados
|
13
|
-
(status_array, db_usage, seqs_number1, error_1_num, seq_uniq, complete_uniq, db_uni_500, db_uni_200, db_longest_one) = annotation_stats
|
14
|
-
(tcode_array, seqs_number2,
|
15
|
-
(ncrna_total, nc_uni_500, nc_uni_200, nc_longest_one)=ncrna_stats
|
16
|
+
(status_array, db_usage, seqs_number1, error_1_num, seq_uniq, complete_uniq, db_uni_500, db_uni_200, db_longest_one) = annotation_stats(size_filter1,size_filter2)
|
17
|
+
(tcode_array, seqs_number2, tc_uni_500, tc_uni_200, tc_longest_one) = testcode_stats(size_filter1,size_filter2)
|
18
|
+
(ncrna_total, nc_uni_500, nc_uni_200, nc_longest_one)=ncrna_stats(size_filter1,size_filter2)
|
19
|
+
(chimera_total, ch_uni_500, ch_uni_200, ch_longest_one, ch_db_usage)=chimera_stats(size_filter1,size_filter2)
|
16
20
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
+
seqs_number1 = (seqs_number1+chimera_total.to_i)
|
22
|
+
total_seqs = (seqs_number1 + seqs_number2 + ncrna_total.to_i)
|
23
|
+
uni_500 = (db_uni_500 + tc_uni_500 + nc_uni_500 + ch_uni_500)
|
24
|
+
uni_200 = (db_uni_200 + tc_uni_200 + nc_uni_200 + ch_uni_200)
|
25
|
+
longest_one = [db_longest_one, tc_longest_one, nc_longest_one, ch_longest_one].max
|
26
|
+
db_usage[0] += ch_db_usage[0]
|
27
|
+
db_usage[1] += ch_db_usage[1]
|
28
|
+
db_usage[2] += ch_db_usage[2]
|
21
29
|
stats_file.puts html_head
|
22
30
|
|
23
31
|
if (total_seqs.to_i > 0)
|
@@ -46,6 +54,15 @@ module FlnStats
|
|
46
54
|
end
|
47
55
|
status_suma += status[0]
|
48
56
|
end
|
57
|
+
|
58
|
+
# adding chimeric seqs
|
59
|
+
stats_file.puts ' <tr>
|
60
|
+
<td colspan="2" align="left">Putative chimera</td>
|
61
|
+
<td align="right">'+chimera_total.to_s+'</td>
|
62
|
+
<td align="right">'+'%.2f' % (100*chimera_total.to_f/total_seqs.to_f).to_s+' %</td>
|
63
|
+
</tr>'
|
64
|
+
status_suma += chimera_total
|
65
|
+
|
49
66
|
# añadimos los coding, P.coding
|
50
67
|
tcode_array.each do |status|
|
51
68
|
if (status[1] == 'Coding')
|
@@ -64,6 +81,7 @@ module FlnStats
|
|
64
81
|
end
|
65
82
|
status_suma += status[0]
|
66
83
|
end
|
84
|
+
|
67
85
|
# se ponen los ncRNA
|
68
86
|
stats_file.puts ' <tr>
|
69
87
|
<td colspan="2" align="left">Putative ncRNA</td>
|
@@ -156,12 +174,12 @@ module FlnStats
|
|
156
174
|
<td align="right">'+'%.2f' % (100*total_seqs.to_f/total_seqs.to_f).to_s+' %</td>
|
157
175
|
</tr>'
|
158
176
|
stats_file.puts ' <tr>
|
159
|
-
<td align="left">Unigenes >
|
177
|
+
<td align="left">Unigenes >'+size_filter2.to_s+'pb</td>
|
160
178
|
<td align="right">'+uni_500.to_s+'</td>
|
161
179
|
<td align="right">'+'%.2f' % (100*uni_500.to_f/total_seqs.to_f).to_s+' %</td>
|
162
180
|
</tr>'
|
163
181
|
stats_file.puts ' <tr>
|
164
|
-
<td align="left">Unigenes >
|
182
|
+
<td align="left">Unigenes >'+size_filter1.to_s+'pb</td>
|
165
183
|
<td align="right">'+uni_200.to_s+'</td>
|
166
184
|
<td align="right">'+'%.2f' % (100*uni_200.to_f/total_seqs.to_f).to_s+' %</td>
|
167
185
|
</tr>'
|
@@ -175,6 +193,8 @@ module FlnStats
|
|
175
193
|
<td align="right">'+seqs_number1.to_s+'</td>
|
176
194
|
<td align="right">'+'%.2f' % (100*seqs_number1.to_f/total_seqs.to_f).to_s+' %</td>
|
177
195
|
</tr>'
|
196
|
+
|
197
|
+
if (seqs_number1.to_i > 0)
|
178
198
|
stats_file.puts ' <tr>
|
179
199
|
<td align="left"> Different orthologue IDs</td>
|
180
200
|
<td align="right">'+seq_uniq.to_s+'</td>
|
@@ -195,21 +215,49 @@ module FlnStats
|
|
195
215
|
<td align="right">'+error_1_num.to_s+'</td>
|
196
216
|
<td align="right">'+'%.2f' % (100*error_1_num.to_f/seqs_number1.to_f).to_s+' %</td>
|
197
217
|
</tr>'
|
218
|
+
stats_file.puts ' <tr>
|
219
|
+
<td align="left"> Putative chimera</td>
|
220
|
+
<td align="right">'+chimera_total.to_s+'</td>
|
221
|
+
<td align="right">'+'%.2f' % (100*chimera_total.to_f/seqs_number1.to_f).to_s+' %</td>
|
222
|
+
</tr>'
|
223
|
+
end
|
198
224
|
stats_file.puts ' <tr>
|
199
225
|
<td align="left">Without orthologue <sup>1</sup></td>
|
200
226
|
<td align="right">'+no_db.to_s+'</td>
|
201
227
|
<td align="right">'+'%.2f' % (100*seqs_number2.to_f/total_seqs.to_f).to_s+' %</td>
|
202
228
|
</tr>'
|
229
|
+
|
230
|
+
if (no_db.to_i > 0) && (seqs_number2.to_i > 0)
|
203
231
|
stats_file.puts ' <tr>
|
204
|
-
<td align="left"> Coding</td>
|
232
|
+
<td align="left"> Coding (all)</td>
|
205
233
|
<td align="right">'+tcode_array[0][0].to_s+'</td>
|
206
234
|
<td align="right">'+'%.2f' % (100*tcode_array[0][0].to_f/no_db.to_f).to_s+' %</td>
|
207
235
|
</tr>'
|
208
236
|
stats_file.puts ' <tr>
|
209
|
-
<td align="left">
|
237
|
+
<td align="left"> Coding > '+size_filter1.to_s+'bp</td>
|
238
|
+
<td align="right">'+tcode_array[0][2].to_s+'</td>
|
239
|
+
<td align="right">'+'%.2f' % (100*tcode_array[0][2].to_f/no_db.to_f).to_s+' %</td>
|
240
|
+
</tr>'
|
241
|
+
stats_file.puts ' <tr>
|
242
|
+
<td align="left"> Coding > '+size_filter2.to_s+'bp</td>
|
243
|
+
<td align="right">'+tcode_array[0][3].to_s+'</td>
|
244
|
+
<td align="right">'+'%.2f' % (100*tcode_array[0][3].to_f/no_db.to_f).to_s+' %</td>
|
245
|
+
</tr>'
|
246
|
+
stats_file.puts ' <tr>
|
247
|
+
<td align="left"> Putative Coding (all)</td>
|
210
248
|
<td align="right">'+tcode_array[1][0].to_s+'</td>
|
211
249
|
<td align="right">'+'%.2f' % (100*tcode_array[1][0].to_f/no_db.to_f).to_s+' %</td>
|
212
250
|
</tr>'
|
251
|
+
stats_file.puts ' <tr>
|
252
|
+
<td align="left"> Putative Coding > '+size_filter1.to_s+'bp</td>
|
253
|
+
<td align="right">'+tcode_array[1][2].to_s+'</td>
|
254
|
+
<td align="right">'+'%.2f' % (100*tcode_array[1][2].to_f/no_db.to_f).to_s+' %</td>
|
255
|
+
</tr>'
|
256
|
+
stats_file.puts ' <tr>
|
257
|
+
<td align="left"> Putative Coding > '+size_filter2.to_s+'bp</td>
|
258
|
+
<td align="right">'+tcode_array[1][3].to_s+'</td>
|
259
|
+
<td align="right">'+'%.2f' % (100*tcode_array[1][3].to_f/no_db.to_f).to_s+' %</td>
|
260
|
+
</tr>'
|
213
261
|
stats_file.puts ' <tr>
|
214
262
|
<td align="left"> Putative ncRNA</td>
|
215
263
|
<td align="right">'+ncrna_total.to_s+'</td>
|
@@ -221,16 +269,19 @@ module FlnStats
|
|
221
269
|
<td align="right">'+'%.2f' % (100*tcode_array[2][0].to_f/no_db.to_f).to_s+' %</td>
|
222
270
|
</tr>'
|
223
271
|
stats_file.puts ' <tr>
|
224
|
-
<td align="left"> Unknown
|
225
|
-
<td align="right">'+
|
226
|
-
<td align="right">'+'%.2f' % (100*
|
227
|
-
</tr>
|
228
|
-
|
272
|
+
<td align="left"> Unknown > '+size_filter1.to_s+'bp</td>
|
273
|
+
<td align="right">'+tcode_array[2][2].to_s+'</td>
|
274
|
+
<td align="right">'+'%.2f' % (100*tcode_array[2][2].to_f/no_db.to_f).to_s+' %</td>
|
275
|
+
</tr>'
|
276
|
+
stats_file.puts ' <tr>
|
277
|
+
<td align="left"> Unknown > '+size_filter2.to_s+'bp</td>
|
278
|
+
<td align="right">'+tcode_array[2][3].to_s+'</td>
|
279
|
+
<td align="right">'+'%.2f' % (100*tcode_array[2][3].to_f/no_db.to_f).to_s+' %</td>
|
280
|
+
</tr>'
|
281
|
+
end
|
282
|
+
stats_file.puts ' </table>
|
229
283
|
<sup>1</sup> Percents for subclassifications of this category were calculated using this line as 100% reference.'
|
230
284
|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
285
|
end
|
235
286
|
stats_file.puts html_end
|
236
287
|
|
@@ -309,7 +360,7 @@ module FlnStats
|
|
309
360
|
end
|
310
361
|
|
311
362
|
|
312
|
-
def annotation_stats
|
363
|
+
def annotation_stats(size_filter1,size_filter2)
|
313
364
|
|
314
365
|
seqs_number = 0
|
315
366
|
array_of_all_accs = []
|
@@ -353,10 +404,10 @@ module FlnStats
|
|
353
404
|
end
|
354
405
|
|
355
406
|
# -------------------------------------------------------------------------
|
356
|
-
if (fasta_length.to_i >=
|
407
|
+
if (fasta_length.to_i >= size_filter1)
|
357
408
|
uni_200 += 1
|
358
409
|
end
|
359
|
-
if (fasta_length.to_i >=
|
410
|
+
if (fasta_length.to_i >= size_filter2)
|
360
411
|
uni_500 += 1
|
361
412
|
end
|
362
413
|
# -------------------------------------------------------------------------
|
@@ -394,18 +445,17 @@ module FlnStats
|
|
394
445
|
end
|
395
446
|
|
396
447
|
|
397
|
-
def testcode_stats
|
448
|
+
def testcode_stats(size_filter1,size_filter2)
|
398
449
|
|
399
450
|
seqs_number = 0
|
400
|
-
unk_200 = 0
|
401
451
|
uni_500 = 0
|
402
452
|
uni_200 = 0
|
403
453
|
longest_one = 0
|
404
454
|
|
405
455
|
# total, status
|
406
|
-
coding_stats = [0,'Coding']
|
407
|
-
p_coding_stats = [0,'Putative Coding']
|
408
|
-
unknown_stats = [0,'Unknown']
|
456
|
+
coding_stats = [0,'Coding',0,0]
|
457
|
+
p_coding_stats = [0,'Putative Coding',0,0]
|
458
|
+
unknown_stats = [0,'Unknown',0,0]
|
409
459
|
|
410
460
|
File.open('fln_results/new_coding.txt').each do |line|
|
411
461
|
line.chomp!
|
@@ -419,17 +469,31 @@ module FlnStats
|
|
419
469
|
end
|
420
470
|
|
421
471
|
# -------------------------------------------------------------------------
|
422
|
-
if (fasta_length.to_i >=
|
472
|
+
if (fasta_length.to_i >= size_filter1)
|
423
473
|
uni_200 += 1
|
424
474
|
end
|
425
|
-
if (fasta_length.to_i >=
|
475
|
+
if (fasta_length.to_i >= size_filter2)
|
426
476
|
uni_500 += 1
|
427
477
|
end
|
428
478
|
# -------------------------------------------------------------------------
|
429
479
|
|
430
|
-
if (fasta_length.to_i
|
431
|
-
if (status == '
|
432
|
-
|
480
|
+
if (fasta_length.to_i > size_filter1)
|
481
|
+
if (status == 'coding')
|
482
|
+
coding_stats[2] += 1
|
483
|
+
elsif (status == 'putative_coding')
|
484
|
+
p_coding_stats[2] += 1
|
485
|
+
elsif (status == 'unknown')
|
486
|
+
unknown_stats[2] += 1
|
487
|
+
end
|
488
|
+
end
|
489
|
+
|
490
|
+
if (fasta_length.to_i > size_filter2)
|
491
|
+
if (status == 'coding')
|
492
|
+
coding_stats[3] += 1
|
493
|
+
elsif (status == 'putative_coding')
|
494
|
+
p_coding_stats[3] += 1
|
495
|
+
elsif (status == 'unknown')
|
496
|
+
unknown_stats[3] += 1
|
433
497
|
end
|
434
498
|
end
|
435
499
|
|
@@ -447,11 +511,11 @@ module FlnStats
|
|
447
511
|
|
448
512
|
status_array = [coding_stats, p_coding_stats, unknown_stats]
|
449
513
|
|
450
|
-
return [status_array, seqs_number,
|
514
|
+
return [status_array, seqs_number, uni_500, uni_200, longest_one]
|
451
515
|
end
|
452
516
|
|
453
517
|
|
454
|
-
def ncrna_stats
|
518
|
+
def ncrna_stats(size_filter1,size_filter2)
|
455
519
|
|
456
520
|
uni_500 = 0
|
457
521
|
uni_200 = 0
|
@@ -468,10 +532,10 @@ module FlnStats
|
|
468
532
|
longest_one = fasta_length.to_i
|
469
533
|
end
|
470
534
|
# -------------------------------------------------------------------------
|
471
|
-
if (fasta_length.to_i >=
|
535
|
+
if (fasta_length.to_i >= size_filter1)
|
472
536
|
uni_200 += 1
|
473
537
|
end
|
474
|
-
if (fasta_length.to_i >=
|
538
|
+
if (fasta_length.to_i >= size_filter2)
|
475
539
|
uni_500 += 1
|
476
540
|
end
|
477
541
|
# -------------------------------------------------------------------------
|
@@ -484,5 +548,53 @@ module FlnStats
|
|
484
548
|
return [nc_total, uni_500, uni_200, longest_one]
|
485
549
|
end
|
486
550
|
|
551
|
+
# Collects summary figures from 'fln_results/chimeric_sequences.txt'.
#
# Only rows whose fifth tab-separated column is exactly 'Putative chimera'
# are counted; empty lines and any other row are ignored. Every counter is
# halved (integer division) before being returned because the chimera report
# holds two rows per detected chimera.
#
# @param size_filter1 [Integer] lower length threshold (reported as uni_200)
# @param size_filter2 [Integer] upper length threshold (reported as uni_500)
# @return [Array] [chimera_total, uni_500, uni_200, longest_one, db_usage],
#   where db_usage is [other databases, sp_* databases, tr_* databases];
#   all zeros when the report file does not exist
def chimera_stats(size_filter1,size_filter2)

  uni_500 = 0
  uni_200 = 0
  ch_total = 0
  longest_one = 0
  db_usage = [0,0,0]

  chimera_path = 'fln_results/chimeric_sequences.txt'

  # File.exist? replaces File.exists?, which was removed in Ruby 3.2
  return [0, 0, 0, longest_one, db_usage] unless File.exist?(chimera_path)

  # File.foreach closes the file once the block finishes
  File.foreach(chimera_path) do |line|
    line.chomp!
    next if line.empty?

    # columns: name, fasta_length, acc, db_name, status, ...
    fasta_length, db_name, status = line.split("\t").values_at(1, 3, 4)
    next unless status == 'Putative chimera'

    seq_length = fasta_length.to_i
    longest_one = seq_length if seq_length > longest_one
    # -------------------------------------------------------------------------
    uni_200 += 1 if seq_length >= size_filter1
    uni_500 += 1 if seq_length >= size_filter2
    # -------------------------------------------------------------------------
    if (db_name =~ /^sp_/)
      db_usage[1] += 1
    elsif (db_name =~ /^tr_/)
      db_usage[2] += 1
    else
      db_usage[0] += 1
    end
    # -------------------------------------------------------------------------
    ch_total += 1
  end

  # two rows are written per chimera, so every count is halved
  db_usage = db_usage.map { |count| count / 2 }

  return [(ch_total/2), (uni_500/2), (uni_200/2), longest_one, db_usage]
end
|
597
|
+
end
|
598
|
+
|
487
599
|
|
488
600
|
end
|
@@ -8,6 +8,9 @@ require 'fl_string_utils'
|
|
8
8
|
require "lcs" # like the class simliar of seqtrim, return the longest common sequence
|
9
9
|
require "test_code"
|
10
10
|
|
11
|
+
require 'chimeric_seqs'
|
12
|
+
include ChimericSeqs
|
13
|
+
|
11
14
|
require 'fl_analysis'
|
12
15
|
include FlAnalysis
|
13
16
|
|
@@ -46,7 +49,12 @@ class MyWorker < ScbiMapreduce::Worker
|
|
46
49
|
# ejecuta blast utilizando los parametros fichero de entrada, base de datos, fichero de salida y tipo de blast
|
47
50
|
def run_blast(input, database, blast_type, evalue)
|
48
51
|
|
49
|
-
|
52
|
+
if (@options[:chimera].nil?)
|
53
|
+
blast=BatchBlast.new("-db #{database}",blast_type,"-evalue #{evalue} -max_target_seqs 1")
|
54
|
+
else
|
55
|
+
blast=BatchBlast.new("-db #{database}",blast_type,"-evalue #{evalue}")
|
56
|
+
end
|
57
|
+
|
50
58
|
blast_result = blast.do_blast_seqs(input, :xml)
|
51
59
|
|
52
60
|
return blast_result
|
@@ -73,34 +81,71 @@ class MyWorker < ScbiMapreduce::Worker
|
|
73
81
|
# do blast
|
74
82
|
my_blast = run_blast(seqs, "#{@options[:user_db]}", 'blastx', '1e-6')
|
75
83
|
|
84
|
+
# chimera detection
|
85
|
+
if (!@options[:chimera].nil?)
|
86
|
+
seqs.each_with_index do |seq,i|
|
87
|
+
if (!my_blast.querys[i].hits[0].nil?)
|
88
|
+
search_chimeras(seq, my_blast.querys[i], @options, user_db_name)
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
seqs=seqs.select{|s| s.get_annotations(:chimera).empty?}
|
93
|
+
my_blast = select_best_blast(my_blast, seqs)
|
94
|
+
end
|
95
|
+
|
76
96
|
# split and parse blast
|
77
97
|
seqs.each_with_index do |seq,i|
|
78
98
|
analiza_orf_y_fl(seq, my_blast.querys[i], @options, user_db_name)
|
79
99
|
end
|
80
100
|
|
81
|
-
new_seqs=seqs.select{|s| s.get_annotations(:complete).empty?}
|
101
|
+
new_seqs=seqs.select{|s| (s.get_annotations(:complete).empty? && s.get_annotations(:chimera).empty?)}
|
82
102
|
|
83
103
|
else
|
84
104
|
new_seqs = seqs
|
85
105
|
end
|
106
|
+
|
107
|
+
return if new_seqs.empty?
|
86
108
|
|
87
109
|
# -------------------------------------------- UniProt (sp)
|
88
110
|
# blast
|
89
111
|
sp_path=File.join("sp_#{@options[:tax_group]}","sp_#{@options[:tax_group]}.fasta")
|
90
112
|
my_blast = run_blast(new_seqs, sp_path, 'blastx', '1e-6')
|
91
113
|
|
114
|
+
# chimera detection
|
115
|
+
if (!@options[:chimera].nil?)
|
116
|
+
new_seqs.each_with_index do |seq,i|
|
117
|
+
if (!my_blast.querys[i].hits[0].nil?)
|
118
|
+
search_chimeras(seq, my_blast.querys[i], @options, "sp_#{@options[:tax_group]}")
|
119
|
+
end
|
120
|
+
end
|
121
|
+
new_seqs=seqs.select{|s| s.get_annotations(:chimera).empty?}
|
122
|
+
my_blast = select_best_blast(my_blast, new_seqs)
|
123
|
+
end
|
124
|
+
|
92
125
|
# split and parse blast
|
93
126
|
new_seqs.each_with_index do |seq,i|
|
94
127
|
analiza_orf_y_fl(seq, my_blast.querys[i], @options, "sp_#{@options[:tax_group]}")
|
95
128
|
end
|
96
129
|
|
97
|
-
new_seqs=seqs.select{|s| s.get_annotations(:complete).empty?}
|
130
|
+
new_seqs=seqs.select{|s| (s.get_annotations(:complete).empty? && s.get_annotations(:chimera).empty?)}
|
131
|
+
return if new_seqs.empty?
|
98
132
|
|
99
133
|
# -------------------------------------------- UniProt (tr)
|
100
134
|
# blast
|
101
135
|
tr_path=File.join("tr_#{@options[:tax_group]}","tr_#{@options[:tax_group]}.fasta")
|
102
136
|
my_blast = run_blast(new_seqs, tr_path, 'blastx', '1e-6')
|
103
137
|
|
138
|
+
# chimera detection
|
139
|
+
if (!@options[:chimera].nil?)
|
140
|
+
new_seqs.each_with_index do |seq,i|
|
141
|
+
if (!my_blast.querys[i].hits[0].nil?)
|
142
|
+
search_chimeras(seq, my_blast.querys[i], @options, "tr_#{@options[:tax_group]}")
|
143
|
+
end
|
144
|
+
end
|
145
|
+
new_seqs=new_seqs.select{|s| s.get_annotations(:chimera).empty?}
|
146
|
+
my_blast = select_best_blast(my_blast, new_seqs)
|
147
|
+
end
|
148
|
+
|
104
149
|
# split and parse blast
|
105
150
|
new_seqs.each_with_index do |seq,i|
|
106
151
|
analiza_orf_y_fl(seq, my_blast.querys[i], @options, "tr_#{@options[:tax_group]}")
|
@@ -108,8 +153,9 @@ class MyWorker < ScbiMapreduce::Worker
|
|
108
153
|
|
109
154
|
# -------------------------------------------- Test Code
|
110
155
|
# the sequences without a reliable similarity with an orthologue are processed with Test Code
|
111
|
-
testcode_input=seqs.select{|s| !s.get_annotations(:apply_tcode).empty?}
|
112
|
-
|
156
|
+
testcode_input=seqs.select{|s| (!s.get_annotations(:apply_tcode).empty? && s.get_annotations(:chimera).empty?)}
|
157
|
+
return if testcode_input.empty?
|
158
|
+
|
113
159
|
# active this line to test tcode, and comment all lines above in this function
|
114
160
|
# testcode_input=seqs
|
115
161
|
|
@@ -119,6 +165,8 @@ class MyWorker < ScbiMapreduce::Worker
|
|
119
165
|
|
120
166
|
# -------------------------------------------- nc RNA
|
121
167
|
unknown_seqs=seqs.select{|s| !s.get_annotations(:tcode_unknown).empty?}
|
168
|
+
return if unknown_seqs.empty?
|
169
|
+
|
122
170
|
# run blastn
|
123
171
|
ncrna_path=File.join('nc_rna_db','ncrna_fln_100.fasta')
|
124
172
|
my_blast = run_blast(unknown_seqs, ncrna_path, 'blastn', '1e-3')
|
@@ -34,6 +34,15 @@ class MyWorkerManager < ScbiMapreduce::WorkManager
|
|
34
34
|
@@nc_rna_file = File.open("fln_results/nc_rnas.txt", 'w')
|
35
35
|
@@nc_rna_file.puts file_head
|
36
36
|
|
37
|
+
if (!options[:chimera].nil?)
|
38
|
+
@@chimera_file = File.open("fln_results/chimeric_sequences.txt", 'w')
|
39
|
+
@@chimera_file.puts file_head
|
40
|
+
else
|
41
|
+
if File.exists?("fln_results/chimeric_sequences.txt")
|
42
|
+
File.delete("fln_results/chimeric_sequences.txt")
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
37
46
|
# @@error_fasta_file = File.open("fln_results/error_seqs.fasta", 'w')
|
38
47
|
# @@error_file = File.open("fln_results/errors_info.txt", 'w')
|
39
48
|
|
@@ -50,6 +59,10 @@ class MyWorkerManager < ScbiMapreduce::WorkManager
|
|
50
59
|
@@tcode_file.close
|
51
60
|
@@nc_rna_file.close
|
52
61
|
|
62
|
+
if (!@@options[:chimera].nil?)
|
63
|
+
@@chimera_file.close
|
64
|
+
end
|
65
|
+
|
53
66
|
# @@error_fasta_file.close
|
54
67
|
# @@error_file.close
|
55
68
|
|
@@ -113,50 +126,101 @@ class MyWorkerManager < ScbiMapreduce::WorkManager
|
|
113
126
|
|
114
127
|
def write_seq(seq)
|
115
128
|
begin
|
116
|
-
# --------------------------------------------------------
|
117
|
-
if (
|
129
|
+
# -------------------------------------------------------- Chimeric Seqs
|
130
|
+
if (!@@options[:chimera].nil?)
|
131
|
+
if (q=seq.get_annotations(:chimera).first)
|
132
|
+
@@chimera_file.puts q[:message]
|
133
|
+
# -------------------------------------------------- Complete Seqs
|
134
|
+
elsif (e=seq.get_annotations(:complete).first)
|
135
|
+
|
136
|
+
@@annotation_file.puts e[:message]
|
137
|
+
|
138
|
+
if (a=seq.get_annotations(:alignment).first)
|
139
|
+
@@alignment_file.puts a[:message]
|
140
|
+
end
|
141
|
+
|
142
|
+
if (p=seq.get_annotations(:protein).first)
|
143
|
+
@@prot_file.puts p[:message]
|
144
|
+
end
|
145
|
+
|
146
|
+
if (n=seq.get_annotations(:nucleotide).first)
|
147
|
+
@@nts_file.puts n[:message]
|
148
|
+
end
|
149
|
+
# --------------------------------------------------- Non Complete Seqs
|
150
|
+
elsif (e=seq.get_annotations(:tmp_annotation).first)
|
118
151
|
|
119
|
-
|
152
|
+
@@annotation_file.puts e[:message][0]
|
120
153
|
|
121
|
-
|
122
|
-
|
123
|
-
|
154
|
+
if (a=seq.get_annotations(:alignment).first)
|
155
|
+
if !a[:message].empty?
|
156
|
+
@@alignment_file.puts a[:message]
|
157
|
+
end
|
158
|
+
end
|
124
159
|
|
125
|
-
|
126
|
-
|
127
|
-
|
160
|
+
if (p=seq.get_annotations(:protein).first)
|
161
|
+
if !p[:message].empty?
|
162
|
+
@@prot_file.puts p[:message]
|
163
|
+
end
|
164
|
+
end
|
128
165
|
|
129
|
-
|
130
|
-
|
166
|
+
if (n=seq.get_annotations(:nucleotide).first)
|
167
|
+
@@nts_file.puts n[:message]
|
168
|
+
end
|
169
|
+
# ------------------------------------------------- nc RNA
|
170
|
+
elsif (nc=seq.get_annotations(:ncrna).first)
|
171
|
+
@@nc_rna_file.puts nc[:message]
|
172
|
+
# ------------------------------------------------- Test Code
|
173
|
+
elsif (t=seq.get_annotations(:tcode).first)
|
174
|
+
@@tcode_file.puts t[:message]
|
131
175
|
end
|
132
|
-
#
|
133
|
-
|
176
|
+
# ---------------------------------------------------------------------------------
|
177
|
+
# -------------------------------------------------------- without Chimeric Seqs Mode
|
178
|
+
else
|
179
|
+
# ------------------------------------------------- Complete Seqs
|
180
|
+
if (e=seq.get_annotations(:complete).first)
|
134
181
|
|
135
|
-
|
182
|
+
@@annotation_file.puts e[:message]
|
136
183
|
|
137
|
-
|
138
|
-
if !a[:message].empty?
|
184
|
+
if (a=seq.get_annotations(:alignment).first)
|
139
185
|
@@alignment_file.puts a[:message]
|
140
186
|
end
|
141
|
-
end
|
142
187
|
|
143
|
-
|
144
|
-
if !p[:message].empty?
|
188
|
+
if (p=seq.get_annotations(:protein).first)
|
145
189
|
@@prot_file.puts p[:message]
|
146
190
|
end
|
147
|
-
end
|
148
191
|
|
149
|
-
|
150
|
-
|
192
|
+
if (n=seq.get_annotations(:nucleotide).first)
|
193
|
+
@@nts_file.puts n[:message]
|
194
|
+
end
|
195
|
+
# ------------------------------------------------- Non Complete Seqs
|
196
|
+
elsif (e=seq.get_annotations(:tmp_annotation).first)
|
197
|
+
|
198
|
+
@@annotation_file.puts e[:message][0]
|
199
|
+
|
200
|
+
if (a=seq.get_annotations(:alignment).first)
|
201
|
+
if !a[:message].empty?
|
202
|
+
@@alignment_file.puts a[:message]
|
203
|
+
end
|
204
|
+
end
|
205
|
+
|
206
|
+
if (p=seq.get_annotations(:protein).first)
|
207
|
+
if !p[:message].empty?
|
208
|
+
@@prot_file.puts p[:message]
|
209
|
+
end
|
210
|
+
end
|
211
|
+
|
212
|
+
if (n=seq.get_annotations(:nucleotide).first)
|
213
|
+
@@nts_file.puts n[:message]
|
214
|
+
end
|
215
|
+
# ------------------------------------------------- nc RNA
|
216
|
+
elsif (nc=seq.get_annotations(:ncrna).first)
|
217
|
+
@@nc_rna_file.puts nc[:message]
|
218
|
+
# ------------------------------------------------- Test Code
|
219
|
+
elsif (t=seq.get_annotations(:tcode).first)
|
220
|
+
@@tcode_file.puts t[:message]
|
151
221
|
end
|
152
|
-
# -------------------------------------------------------- nc RNA
|
153
|
-
elsif (nc=seq.get_annotations(:ncrna).first)
|
154
|
-
@@nc_rna_file.puts nc[:message]
|
155
|
-
# -------------------------------------------------------- Test Code
|
156
|
-
elsif (t=seq.get_annotations(:tcode).first)
|
157
|
-
@@tcode_file.puts t[:message]
|
158
222
|
end
|
159
|
-
#
|
223
|
+
# ------------------------------------------------- errors
|
160
224
|
# if e=seq.get_annotations(:error).first
|
161
225
|
# if !e[:message].empty?
|
162
226
|
# @@error_fasta_file.puts ">#{seq.seq_name}\n#{seq.seq_fasta}"
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: full_lengther_next
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.0.
|
5
|
+
version: 0.0.8
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Noe Fernandez & Dario Guerrero
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2012-
|
13
|
+
date: 2012-11-28 00:00:00 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: xml-simple
|
@@ -31,7 +31,7 @@ dependencies:
|
|
31
31
|
requirements:
|
32
32
|
- - ">="
|
33
33
|
- !ruby/object:Gem::Version
|
34
|
-
version: 0.0.
|
34
|
+
version: 0.0.37
|
35
35
|
type: :runtime
|
36
36
|
version_requirements: *id002
|
37
37
|
- !ruby/object:Gem::Dependency
|
@@ -97,6 +97,7 @@ files:
|
|
97
97
|
- bin/full_lengther_next
|
98
98
|
- History.txt
|
99
99
|
- lib/full_lengther_next/classes/common_functions.rb
|
100
|
+
- lib/full_lengther_next/classes/chimeric_seqs.rb
|
100
101
|
- lib/full_lengther_next/classes/fl_analysis.rb
|
101
102
|
- lib/full_lengther_next/classes/fl_string_utils.rb
|
102
103
|
- lib/full_lengther_next/classes/fln_stats.rb
|
@@ -142,7 +143,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
142
143
|
requirements: []
|
143
144
|
|
144
145
|
rubyforge_project: full_lengther_next
|
145
|
-
rubygems_version: 1.
|
146
|
+
rubygems_version: 1.8.24
|
146
147
|
signing_key:
|
147
148
|
specification_version: 3
|
148
149
|
summary: FULL-LENGTHERNEXT is a tool adapted to NGS technologies, able to work in parallel and in a distributed way to minimise computing time
|