full_lengther_next 0.0.6 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,11 @@
1
+ === 0.0.8 2012-11-28
2
+
3
+ Protection against empty seqs when all seqs match against user_db
4
+
5
+ === 0.0.7 2012-07-25
6
+
7
+ Chimera detection
8
+
1
9
  === 0.0.6 2012-04-16
2
10
 
3
11
  Fixed some cosmetic issues and parameter names
@@ -3,6 +3,7 @@ bin/make_user_db.rb
3
3
  bin/full_lengther_next
4
4
  History.txt
5
5
  lib/full_lengther_next/classes/common_functions.rb
6
+ lib/full_lengther_next/classes/chimeric_seqs.rb
6
7
  lib/full_lengther_next/classes/fl_analysis.rb
7
8
  lib/full_lengther_next/classes/fl_string_utils.rb
8
9
  lib/full_lengther_next/classes/fln_stats.rb
data/Rakefile CHANGED
@@ -20,7 +20,7 @@ $hoe = Hoe.spec 'full_lengther_next' do
20
20
  # self.extra_deps << ['gnuplot','>=0']
21
21
  # self.extra_deps << ['term-ansicolor','>=1.0.5']
22
22
  self.extra_deps << ['xml-simple','>=1.0.12']
23
- self.extra_deps << ['scbi_blast','>=0.0.32']
23
+ self.extra_deps << ['scbi_blast','>=0.0.37']
24
24
  self.extra_deps << ['scbi_mapreduce','>=0.0.29']
25
25
  self.extra_deps << ['scbi_fasta','>=0.1.7']
26
26
  # self.extra_deps << ['scbi_fastq','>=0.0.13']
@@ -50,6 +50,11 @@ optparse = OptionParser.new do |opts|
50
50
  options[:distance] = distance.to_i
51
51
  end
52
52
 
53
+ options[:chimera] = nil
54
+ opts.on( '-q', '--chimera_detection', "apply chimera detection mode\n\n" ) do |chimera|
55
+ options[:chimera] = chimera
56
+ end
57
+
53
58
  options[:workers] = 2
54
59
  opts.on( '-w', '--workers INTEGER/FILE', 'Number of CPUs, or a file containing machine names to launch workers with ssh' ) do |workers|
55
60
  if File.exists?(workers)
@@ -91,7 +96,7 @@ optparse = OptionParser.new do |opts|
91
96
 
92
97
 
93
98
  # Set a banner, displayed at the top of the help screen.
94
- opts.banner = "Usage: full_lengther_next -f input.fasta -g [fungi|human|invertebrates|mammals|plants|rodents|vertebrates] [options]\n\n"
99
+ opts.banner = "\nUsage: full_lengther_next -f input.fasta -g [fungi|human|invertebrates|mammals|plants|rodents|vertebrates] [options]\n\n"
95
100
 
96
101
  # This displays the help screen
97
102
  opts.on( '-h', '--help', 'Display this screen' ) do
@@ -172,6 +177,8 @@ require 'my_worker_manager'
172
177
  $LOG = Logger.new(STDOUT)
173
178
  $LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
174
179
 
180
+ # puts "ROOT_PATH: #{ROOT_PATH}"
181
+
175
182
  custom_worker_file = File.join(File.dirname(ROOT_PATH),'lib','full_lengther_next','classes','my_worker.rb')
176
183
 
177
184
  $LOG.info 'Starting server'
@@ -126,16 +126,16 @@ end
126
126
  ENV['BLASTDB']=formatted_db_path
127
127
 
128
128
 
129
- if !File.exists?(File.join(ENV['BLASTDB'], my_group))
130
- Dir.mkdir("blast_dbs/#{my_group}")
129
+ if !File.exists?(File.join(formatted_db_path, my_group))
130
+ Dir.mkdir(File.join(formatted_db_path,my_group))
131
131
  end
132
132
 
133
- output_file_path=File.join(ENV['BLASTDB'],my_group,my_group+".fasta")
133
+ output_file_path=File.join(formatted_db_path,my_group,my_group+".fasta")
134
134
 
135
135
  output_file = File.new(output_file_path, "w")
136
136
 
137
- filter_incomplete_seqs(output_file, File.join(ENV['BLASTDB'], "uniprot_sprot_#{uniprot_group}.dat"), my_group)
138
- filter_incomplete_seqs(output_file, File.join(ENV['BLASTDB'], "uniprot_trembl_#{uniprot_group}.dat"), my_group)
137
+ filter_incomplete_seqs(output_file, File.join(formatted_db_path, "uniprot_sprot_#{uniprot_group}.dat"), my_group)
138
+ filter_incomplete_seqs(output_file, File.join(formatted_db_path, "uniprot_trembl_#{uniprot_group}.dat"), my_group)
139
139
 
140
140
  output_file.close
141
141
 
@@ -1,13 +1,13 @@
1
1
  $:.unshift(File.dirname(__FILE__)) unless
2
2
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
3
 
4
- root_path=File.join(File.dirname(__FILE__),'full_lengther_next')
4
+ # ROOT_PATH=File.join(File.dirname(__FILE__),'full_lengther_next')
5
5
 
6
- $: << File.expand_path(File.join(root_path, 'classes'))
6
+ $: << File.expand_path(File.join(File.dirname(__FILE__), 'full_lengther_next', 'classes'))
7
7
 
8
8
 
9
9
  module FullLengtherNext
10
- VERSION = '0.0.6'
10
+ VERSION = '0.0.8'
11
11
 
12
12
  FULLLENGHTER_VERSION = VERSION
13
13
  end
@@ -0,0 +1,78 @@
1
+
2
+ require 'scbi_blast'
3
+
4
# Helpers used by the chimera detection mode: one method flags a sequence as a
# putative chimera from its BLAST hits, the other prunes a BLAST result object
# so that only the retained sequences and their best subject remain.
module ChimericSeqs

  # Annotates +seq+ with a :chimera annotation when its BLAST hits point to
  # two different subjects located in non-overlapping regions of the query.
  #
  # seq         - object responding to seq_name, fasta_length and annotate.
  # blast_query - BLAST query object responding to query_def and hits.
  # options     - not used by this method; kept for interface compatibility.
  # db_name     - database label written into the annotation rows.
  #
  # Raises RuntimeError when +seq+ and +blast_query+ have different names.
  # The return value is not meaningful (always nil).
  def search_chimeras(seq, blast_query, options, db_name)

    # used to detect if the sequence and the blast are from different query
    if (seq.seq_name != blast_query.query_def)
      puts "#{seq.seq_name} --> #{blast_query.query_def}"
      raise "BLAST query name and sequence are different"
    end

    hits = blast_query.hits
    top_hit = hits[0]

    # a query without hits cannot be a chimera; avoid dereferencing nil
    return nil if top_hit.nil?

    ref_hit_beg = top_hit.q_beg
    ref_hit_end = top_hit.q_end

    # grow the reference region with every hit that overlaps it
    # (single pass, in the order the hits are stored)
    hits.each do |hit|
      if chimera_hit_overlaps?(ref_hit_beg, ref_hit_end, hit)
        ref_hit_beg = [ref_hit_beg, hit.q_beg].min
        ref_hit_end = [ref_hit_end, hit.q_end].max
      end
    end

    # only the FIRST hit outside the reference region is examined
    # NOTE(review): if that hit has the same accession as the top hit, later
    # hits from other subjects are never checked -- confirm this is intended.
    outside_hit = hits.find { |hit| !chimera_hit_overlaps?(ref_hit_beg, ref_hit_end, hit) }
    return nil if outside_hit.nil?

    if (outside_hit.acc != top_hit.acc)
      # one row per gene involved, each preceded by a line break
      chimera_annotations = "\n" + chimera_annotation_row(seq, blast_query, top_hit, top_hit, outside_hit, db_name) +
                            "\n" + chimera_annotation_row(seq, blast_query, outside_hit, top_hit, outside_hit, db_name)
      seq.annotate(:chimera, chimera_annotations, false)
    end

    return nil
  end

  # Removes from +tmp_blast_obj+ every query whose name is not among
  # +new_seqs+, and, for the remaining queries, every hit whose accession
  # differs from the accession of the first hit.
  #
  # The object is modified in place and also returned.
  def select_best_blast(tmp_blast_obj, new_seqs)

    kept_names = {}
    new_seqs.each do |seq|
      kept_names[seq.seq_name] = true
    end

    # sequences flagged as chimeras are absent from new_seqs; their queries
    # are removed so they are not used afterwards
    tmp_blast_obj.querys.delete_if { |query| !kept_names[query.query_def] }

    tmp_blast_obj.querys.each do |query|
      best_hit = query.hits[0]
      next if best_hit.nil?

      best_acc = best_hit.acc
      query.hits.delete_if { |hit| hit.acc != best_acc }
    end

    return tmp_blast_obj
  end

  private

  # True when +hit+ overlaps (or is contained in) the query region
  # ref_beg..ref_end, using the hit's q_beg/q_end coordinates.
  def chimera_hit_overlaps?(ref_beg, ref_end, hit)
    ((ref_beg <= hit.q_beg) && (ref_end > hit.q_beg)) ||
      ((hit.q_beg <= ref_beg) && (hit.q_end > ref_beg))
  end

  # Builds one tab-separated annotation row describing +hit+; the message
  # column names both genes involved (+first_gene+ and +second_gene+).
  def chimera_annotation_row(seq, query, hit, first_gene, second_gene, db_name)
    "#{query.query_def}\t#{seq.fasta_length}\t#{hit.acc}\t#{db_name}\tPutative chimera\t\t#{hit.e_val}\t#{hit.ident}\t\t\tPutative chimera detected showing similarity with two different genes #{first_gene.acc} - #{second_gene.acc}\t#{hit.q_frame}\t#{hit.q_beg}\t#{hit.q_end}\t#{hit.s_beg.to_i}\t#{hit.s_end.to_i}\t#{hit.definition}\t"
  end

end
@@ -4,20 +4,28 @@ module FlnStats
4
4
  def summary_stats
5
5
  stats_file = File.open('fln_results/summary_stats.html', 'w')
6
6
 
7
+ size_filter1 = 200
8
+ size_filter2 = 500
9
+
7
10
  # recogemos los trozos de html fijos
8
11
  (html_head, html_st, html_uni, html_db, html_as, html_end) = html_code
9
12
 
10
13
  total_seqs = 0
11
14
  status_suma = 0
12
15
  #recogemos los datos que necesitamos de los ficheros de resultados
13
- (status_array, db_usage, seqs_number1, error_1_num, seq_uniq, complete_uniq, db_uni_500, db_uni_200, db_longest_one) = annotation_stats
14
- (tcode_array, seqs_number2, unk_200, tc_uni_500, tc_uni_200, tc_longest_one) = testcode_stats
15
- (ncrna_total, nc_uni_500, nc_uni_200, nc_longest_one)=ncrna_stats
16
+ (status_array, db_usage, seqs_number1, error_1_num, seq_uniq, complete_uniq, db_uni_500, db_uni_200, db_longest_one) = annotation_stats(size_filter1,size_filter2)
17
+ (tcode_array, seqs_number2, tc_uni_500, tc_uni_200, tc_longest_one) = testcode_stats(size_filter1,size_filter2)
18
+ (ncrna_total, nc_uni_500, nc_uni_200, nc_longest_one)=ncrna_stats(size_filter1,size_filter2)
19
+ (chimera_total, ch_uni_500, ch_uni_200, ch_longest_one, ch_db_usage)=chimera_stats(size_filter1,size_filter2)
16
20
 
17
- total_seqs = seqs_number1 + seqs_number2 + ncrna_total.to_i
18
- uni_500 = (db_uni_500 + tc_uni_500 + nc_uni_500)
19
- uni_200 = (db_uni_200 + tc_uni_200 + nc_uni_200)
20
- longest_one = [db_longest_one, tc_longest_one, nc_longest_one].max
21
+ seqs_number1 = (seqs_number1+chimera_total.to_i)
22
+ total_seqs = (seqs_number1 + seqs_number2 + ncrna_total.to_i)
23
+ uni_500 = (db_uni_500 + tc_uni_500 + nc_uni_500 + ch_uni_500)
24
+ uni_200 = (db_uni_200 + tc_uni_200 + nc_uni_200 + ch_uni_200)
25
+ longest_one = [db_longest_one, tc_longest_one, nc_longest_one, ch_longest_one].max
26
+ db_usage[0] += ch_db_usage[0]
27
+ db_usage[1] += ch_db_usage[1]
28
+ db_usage[2] += ch_db_usage[2]
21
29
  stats_file.puts html_head
22
30
 
23
31
  if (total_seqs.to_i > 0)
@@ -46,6 +54,15 @@ module FlnStats
46
54
  end
47
55
  status_suma += status[0]
48
56
  end
57
+
58
+ # adding chimeric seqs
59
+ stats_file.puts ' <tr>
60
+ <td colspan="2" align="left">Putative chimera</td>
61
+ <td align="right">'+chimera_total.to_s+'</td>
62
+ <td align="right">'+'%.2f' % (100*chimera_total.to_f/total_seqs.to_f).to_s+' %</td>
63
+ </tr>'
64
+ status_suma += chimera_total
65
+
49
66
  # añadimos los coding, P.coding
50
67
  tcode_array.each do |status|
51
68
  if (status[1] == 'Coding')
@@ -64,6 +81,7 @@ module FlnStats
64
81
  end
65
82
  status_suma += status[0]
66
83
  end
84
+
67
85
  # se ponen los ncRNA
68
86
  stats_file.puts ' <tr>
69
87
  <td colspan="2" align="left">Putative ncRNA</td>
@@ -156,12 +174,12 @@ module FlnStats
156
174
  <td align="right">'+'%.2f' % (100*total_seqs.to_f/total_seqs.to_f).to_s+' %</td>
157
175
  </tr>'
158
176
  stats_file.puts ' <tr>
159
- <td align="left">Unigenes >500pb</td>
177
+ <td align="left">Unigenes >'+size_filter2.to_s+'pb</td>
160
178
  <td align="right">'+uni_500.to_s+'</td>
161
179
  <td align="right">'+'%.2f' % (100*uni_500.to_f/total_seqs.to_f).to_s+' %</td>
162
180
  </tr>'
163
181
  stats_file.puts ' <tr>
164
- <td align="left">Unigenes >200pb</td>
182
+ <td align="left">Unigenes >'+size_filter1.to_s+'pb</td>
165
183
  <td align="right">'+uni_200.to_s+'</td>
166
184
  <td align="right">'+'%.2f' % (100*uni_200.to_f/total_seqs.to_f).to_s+' %</td>
167
185
  </tr>'
@@ -175,6 +193,8 @@ module FlnStats
175
193
  <td align="right">'+seqs_number1.to_s+'</td>
176
194
  <td align="right">'+'%.2f' % (100*seqs_number1.to_f/total_seqs.to_f).to_s+' %</td>
177
195
  </tr>'
196
+
197
+ if (seqs_number1.to_i > 0)
178
198
  stats_file.puts ' <tr>
179
199
  <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Different orthologue IDs</td>
180
200
  <td align="right">'+seq_uniq.to_s+'</td>
@@ -195,21 +215,49 @@ module FlnStats
195
215
  <td align="right">'+error_1_num.to_s+'</td>
196
216
  <td align="right">'+'%.2f' % (100*error_1_num.to_f/seqs_number1.to_f).to_s+' %</td>
197
217
  </tr>'
218
+ stats_file.puts ' <tr>
219
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative chimera</td>
220
+ <td align="right">'+chimera_total.to_s+'</td>
221
+ <td align="right">'+'%.2f' % (100*chimera_total.to_f/seqs_number1.to_f).to_s+' %</td>
222
+ </tr>'
223
+ end
198
224
  stats_file.puts ' <tr>
199
225
  <td align="left">Without orthologue <sup>1</sup></td>
200
226
  <td align="right">'+no_db.to_s+'</td>
201
227
  <td align="right">'+'%.2f' % (100*seqs_number2.to_f/total_seqs.to_f).to_s+' %</td>
202
228
  </tr>'
229
+
230
+ if (no_db.to_i > 0) && (seqs_number2.to_i > 0)
203
231
  stats_file.puts ' <tr>
204
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding</td>
232
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding (all)</td>
205
233
  <td align="right">'+tcode_array[0][0].to_s+'</td>
206
234
  <td align="right">'+'%.2f' % (100*tcode_array[0][0].to_f/no_db.to_f).to_s+' %</td>
207
235
  </tr>'
208
236
  stats_file.puts ' <tr>
209
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding</td>
237
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding > '+size_filter1.to_s+'bp</td>
238
+ <td align="right">'+tcode_array[0][2].to_s+'</td>
239
+ <td align="right">'+'%.2f' % (100*tcode_array[0][2].to_f/no_db.to_f).to_s+' %</td>
240
+ </tr>'
241
+ stats_file.puts ' <tr>
242
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding > '+size_filter2.to_s+'bp</td>
243
+ <td align="right">'+tcode_array[0][3].to_s+'</td>
244
+ <td align="right">'+'%.2f' % (100*tcode_array[0][3].to_f/no_db.to_f).to_s+' %</td>
245
+ </tr>'
246
+ stats_file.puts ' <tr>
247
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding (all)</td>
210
248
  <td align="right">'+tcode_array[1][0].to_s+'</td>
211
249
  <td align="right">'+'%.2f' % (100*tcode_array[1][0].to_f/no_db.to_f).to_s+' %</td>
212
250
  </tr>'
251
+ stats_file.puts ' <tr>
252
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding > '+size_filter1.to_s+'bp</td>
253
+ <td align="right">'+tcode_array[1][2].to_s+'</td>
254
+ <td align="right">'+'%.2f' % (100*tcode_array[1][2].to_f/no_db.to_f).to_s+' %</td>
255
+ </tr>'
256
+ stats_file.puts ' <tr>
257
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding > '+size_filter2.to_s+'bp</td>
258
+ <td align="right">'+tcode_array[1][3].to_s+'</td>
259
+ <td align="right">'+'%.2f' % (100*tcode_array[1][3].to_f/no_db.to_f).to_s+' %</td>
260
+ </tr>'
213
261
  stats_file.puts ' <tr>
214
262
  <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative ncRNA</td>
215
263
  <td align="right">'+ncrna_total.to_s+'</td>
@@ -221,16 +269,19 @@ module FlnStats
221
269
  <td align="right">'+'%.2f' % (100*tcode_array[2][0].to_f/no_db.to_f).to_s+' %</td>
222
270
  </tr>'
223
271
  stats_file.puts ' <tr>
224
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Unknown < 200bp</td>
225
- <td align="right">'+unk_200.to_s+'</td>
226
- <td align="right">'+'%.2f' % (100*unk_200.to_f/no_db.to_f).to_s+' %</td>
227
- </tr>
228
- </table>
272
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Unknown > '+size_filter1.to_s+'bp</td>
273
+ <td align="right">'+tcode_array[2][2].to_s+'</td>
274
+ <td align="right">'+'%.2f' % (100*tcode_array[2][2].to_f/no_db.to_f).to_s+' %</td>
275
+ </tr>'
276
+ stats_file.puts ' <tr>
277
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Unknown > '+size_filter2.to_s+'bp</td>
278
+ <td align="right">'+tcode_array[2][3].to_s+'</td>
279
+ <td align="right">'+'%.2f' % (100*tcode_array[2][3].to_f/no_db.to_f).to_s+' %</td>
280
+ </tr>'
281
+ end
282
+ stats_file.puts ' </table>
229
283
  <sup>1</sup> Percents for subclassifications of this category were calculated using this line as 100% reference.'
230
284
 
231
-
232
-
233
-
234
285
  end
235
286
  stats_file.puts html_end
236
287
 
@@ -309,7 +360,7 @@ module FlnStats
309
360
  end
310
361
 
311
362
 
312
- def annotation_stats
363
+ def annotation_stats(size_filter1,size_filter2)
313
364
 
314
365
  seqs_number = 0
315
366
  array_of_all_accs = []
@@ -353,10 +404,10 @@ module FlnStats
353
404
  end
354
405
 
355
406
  # -------------------------------------------------------------------------
356
- if (fasta_length.to_i >= 200)
407
+ if (fasta_length.to_i >= size_filter1)
357
408
  uni_200 += 1
358
409
  end
359
- if (fasta_length.to_i >= 500)
410
+ if (fasta_length.to_i >= size_filter2)
360
411
  uni_500 += 1
361
412
  end
362
413
  # -------------------------------------------------------------------------
@@ -394,18 +445,17 @@ module FlnStats
394
445
  end
395
446
 
396
447
 
397
- def testcode_stats
448
+ def testcode_stats(size_filter1,size_filter2)
398
449
 
399
450
  seqs_number = 0
400
- unk_200 = 0
401
451
  uni_500 = 0
402
452
  uni_200 = 0
403
453
  longest_one = 0
404
454
 
405
455
  # total, status
406
- coding_stats = [0,'Coding']
407
- p_coding_stats = [0,'Putative Coding']
408
- unknown_stats = [0,'Unknown']
456
+ coding_stats = [0,'Coding',0,0]
457
+ p_coding_stats = [0,'Putative Coding',0,0]
458
+ unknown_stats = [0,'Unknown',0,0]
409
459
 
410
460
  File.open('fln_results/new_coding.txt').each do |line|
411
461
  line.chomp!
@@ -419,17 +469,31 @@ module FlnStats
419
469
  end
420
470
 
421
471
  # -------------------------------------------------------------------------
422
- if (fasta_length.to_i >= 200)
472
+ if (fasta_length.to_i >= size_filter1)
423
473
  uni_200 += 1
424
474
  end
425
- if (fasta_length.to_i >= 500)
475
+ if (fasta_length.to_i >= size_filter2)
426
476
  uni_500 += 1
427
477
  end
428
478
  # -------------------------------------------------------------------------
429
479
 
430
- if (fasta_length.to_i < 200)
431
- if (status == 'unknown')
432
- unk_200 += 1
480
+ if (fasta_length.to_i > size_filter1)
481
+ if (status == 'coding')
482
+ coding_stats[2] += 1
483
+ elsif (status == 'putative_coding')
484
+ p_coding_stats[2] += 1
485
+ elsif (status == 'unknown')
486
+ unknown_stats[2] += 1
487
+ end
488
+ end
489
+
490
+ if (fasta_length.to_i > size_filter2)
491
+ if (status == 'coding')
492
+ coding_stats[3] += 1
493
+ elsif (status == 'putative_coding')
494
+ p_coding_stats[3] += 1
495
+ elsif (status == 'unknown')
496
+ unknown_stats[3] += 1
433
497
  end
434
498
  end
435
499
 
@@ -447,11 +511,11 @@ module FlnStats
447
511
 
448
512
  status_array = [coding_stats, p_coding_stats, unknown_stats]
449
513
 
450
- return [status_array, seqs_number, unk_200, uni_500, uni_200, longest_one]
514
+ return [status_array, seqs_number, uni_500, uni_200, longest_one]
451
515
  end
452
516
 
453
517
 
454
- def ncrna_stats
518
+ def ncrna_stats(size_filter1,size_filter2)
455
519
 
456
520
  uni_500 = 0
457
521
  uni_200 = 0
@@ -468,10 +532,10 @@ module FlnStats
468
532
  longest_one = fasta_length.to_i
469
533
  end
470
534
  # -------------------------------------------------------------------------
471
- if (fasta_length.to_i >= 200)
535
+ if (fasta_length.to_i >= size_filter1)
472
536
  uni_200 += 1
473
537
  end
474
- if (fasta_length.to_i >= 500)
538
+ if (fasta_length.to_i >= size_filter2)
475
539
  uni_500 += 1
476
540
  end
477
541
  # -------------------------------------------------------------------------
@@ -484,5 +548,53 @@ module FlnStats
484
548
  return [nc_total, uni_500, uni_200, longest_one]
485
549
  end
486
550
 
551
# Summarises the putative chimeras listed in fln_results/chimeric_sequences.txt.
#
# size_filter1 - lower length threshold (counted in the "uni_200" figure).
# size_filter2 - upper length threshold (counted in the "uni_500" figure).
#
# Returns [chimera_total, uni_500, uni_200, longest_one, db_usage], where
# db_usage is [other_db, sp_db, tr_db]. All zeros when the file does not exist.
# Counts are taken per row and then halved with integer division; this
# presumably reflects that each chimera is reported as two rows, one per gene.
def chimera_stats(size_filter1,size_filter2)

  chimera_path = 'fln_results/chimeric_sequences.txt'

  uni_500 = 0
  uni_200 = 0
  ch_total = 0
  longest_one = 0
  db_usage = [0,0,0]

  # chimera detection was not run: nothing to report
  return [0, 0, 0, longest_one, db_usage] unless File.exist?(chimera_path)

  # File.foreach closes the file once every line has been read
  File.foreach(chimera_path) do |raw_line|
    line = raw_line.chomp
    next if line.empty?

    fields = line.split("\t")
    fasta_length = fields[1].to_i
    db_name = fields[3]
    status = fields[4]

    # rows with any other status (e.g. a header line) are ignored
    next unless status == 'Putative chimera'

    longest_one = fasta_length if fasta_length > longest_one
    uni_200 += 1 if fasta_length >= size_filter1
    uni_500 += 1 if fasta_length >= size_filter2

    if (db_name =~ /^sp_/)
      db_usage[1] += 1
    elsif (db_name =~ /^tr_/)
      db_usage[2] += 1
    else
      db_usage[0] += 1
    end

    ch_total += 1
  end

  # halve the per-row counts
  db_usage = db_usage.map { |rows| rows/2 }

  return [(ch_total/2), (uni_500/2), (uni_200/2), longest_one, db_usage]
end
598
+
487
599
 
488
600
  end
@@ -8,6 +8,9 @@ require 'fl_string_utils'
8
8
  require "lcs" # like the class simliar of seqtrim, return the longest common sequence
9
9
  require "test_code"
10
10
 
11
+ require 'chimeric_seqs'
12
+ include ChimericSeqs
13
+
11
14
  require 'fl_analysis'
12
15
  include FlAnalysis
13
16
 
@@ -46,7 +49,12 @@ class MyWorker < ScbiMapreduce::Worker
46
49
  # ejecuta blast utilizando los parametros fichero de entrada, base de datos, fichero de salida y tipo de blast
47
50
  def run_blast(input, database, blast_type, evalue)
48
51
 
49
- blast=BatchBlast.new("-db #{database}",blast_type,"-evalue #{evalue} -max_target_seqs 1")
52
+ if (@options[:chimera].nil?)
53
+ blast=BatchBlast.new("-db #{database}",blast_type,"-evalue #{evalue} -max_target_seqs 1")
54
+ else
55
+ blast=BatchBlast.new("-db #{database}",blast_type,"-evalue #{evalue}")
56
+ end
57
+
50
58
  blast_result = blast.do_blast_seqs(input, :xml)
51
59
 
52
60
  return blast_result
@@ -73,34 +81,71 @@ class MyWorker < ScbiMapreduce::Worker
73
81
  # do blast
74
82
  my_blast = run_blast(seqs, "#{@options[:user_db]}", 'blastx', '1e-6')
75
83
 
84
+ # chimera detection
85
+ if (!@options[:chimera].nil?)
86
+ seqs.each_with_index do |seq,i|
87
+ if (!my_blast.querys[i].hits[0].nil?)
88
+ search_chimeras(seq, my_blast.querys[i], @options, user_db_name)
89
+ end
90
+ end
91
+
92
+ seqs=seqs.select{|s| s.get_annotations(:chimera).empty?}
93
+ my_blast = select_best_blast(my_blast, seqs)
94
+ end
95
+
76
96
  # split and parse blast
77
97
  seqs.each_with_index do |seq,i|
78
98
  analiza_orf_y_fl(seq, my_blast.querys[i], @options, user_db_name)
79
99
  end
80
100
 
81
- new_seqs=seqs.select{|s| s.get_annotations(:complete).empty?}
101
+ new_seqs=seqs.select{|s| (s.get_annotations(:complete).empty? && s.get_annotations(:chimera).empty?)}
82
102
 
83
103
  else
84
104
  new_seqs = seqs
85
105
  end
106
+
107
+ return if new_seqs.empty?
86
108
 
87
109
  # -------------------------------------------- UniProt (sp)
88
110
  # blast
89
111
  sp_path=File.join("sp_#{@options[:tax_group]}","sp_#{@options[:tax_group]}.fasta")
90
112
  my_blast = run_blast(new_seqs, sp_path, 'blastx', '1e-6')
91
113
 
114
+ # chimera detection
115
+ if (!@options[:chimera].nil?)
116
+ new_seqs.each_with_index do |seq,i|
117
+ if (!my_blast.querys[i].hits[0].nil?)
118
+ search_chimeras(seq, my_blast.querys[i], @options, "sp_#{@options[:tax_group]}")
119
+ end
120
+ end
121
+ new_seqs=seqs.select{|s| s.get_annotations(:chimera).empty?}
122
+ my_blast = select_best_blast(my_blast, new_seqs)
123
+ end
124
+
92
125
  # split and parse blast
93
126
  new_seqs.each_with_index do |seq,i|
94
127
  analiza_orf_y_fl(seq, my_blast.querys[i], @options, "sp_#{@options[:tax_group]}")
95
128
  end
96
129
 
97
- new_seqs=seqs.select{|s| s.get_annotations(:complete).empty?}
130
+ new_seqs=seqs.select{|s| (s.get_annotations(:complete).empty? && s.get_annotations(:chimera).empty?)}
131
+ return if new_seqs.empty?
98
132
 
99
133
  # -------------------------------------------- UniProt (tr)
100
134
  # blast
101
135
  tr_path=File.join("tr_#{@options[:tax_group]}","tr_#{@options[:tax_group]}.fasta")
102
136
  my_blast = run_blast(new_seqs, tr_path, 'blastx', '1e-6')
103
137
 
138
+ # chimera detection
139
+ if (!@options[:chimera].nil?)
140
+ new_seqs.each_with_index do |seq,i|
141
+ if (!my_blast.querys[i].hits[0].nil?)
142
+ search_chimeras(seq, my_blast.querys[i], @options, "tr_#{@options[:tax_group]}")
143
+ end
144
+ end
145
+ new_seqs=new_seqs.select{|s| s.get_annotations(:chimera).empty?}
146
+ my_blast = select_best_blast(my_blast, new_seqs)
147
+ end
148
+
104
149
  # split and parse blast
105
150
  new_seqs.each_with_index do |seq,i|
106
151
  analiza_orf_y_fl(seq, my_blast.querys[i], @options, "tr_#{@options[:tax_group]}")
@@ -108,8 +153,9 @@ class MyWorker < ScbiMapreduce::Worker
108
153
 
109
154
  # -------------------------------------------- Test Code
110
155
  # the sequences without a reliable similarity with an orthologue are processed with Test Code
111
- testcode_input=seqs.select{|s| !s.get_annotations(:apply_tcode).empty?}
112
-
156
+ testcode_input=seqs.select{|s| (!s.get_annotations(:apply_tcode).empty? && s.get_annotations(:chimera).empty?)}
157
+ return if testcode_input.empty?
158
+
113
159
  # active this line to test tcode, and comment all lines above in this function
114
160
  # testcode_input=seqs
115
161
 
@@ -119,6 +165,8 @@ class MyWorker < ScbiMapreduce::Worker
119
165
 
120
166
  # -------------------------------------------- nc RNA
121
167
  unknown_seqs=seqs.select{|s| !s.get_annotations(:tcode_unknown).empty?}
168
+ return if unknown_seqs.empty?
169
+
122
170
  # run blastn
123
171
  ncrna_path=File.join('nc_rna_db','ncrna_fln_100.fasta')
124
172
  my_blast = run_blast(unknown_seqs, ncrna_path, 'blastn', '1e-3')
@@ -34,6 +34,15 @@ class MyWorkerManager < ScbiMapreduce::WorkManager
34
34
  @@nc_rna_file = File.open("fln_results/nc_rnas.txt", 'w')
35
35
  @@nc_rna_file.puts file_head
36
36
 
37
+ if (!options[:chimera].nil?)
38
+ @@chimera_file = File.open("fln_results/chimeric_sequences.txt", 'w')
39
+ @@chimera_file.puts file_head
40
+ else
41
+ if File.exists?("fln_results/chimeric_sequences.txt")
42
+ File.delete("fln_results/chimeric_sequences.txt")
43
+ end
44
+ end
45
+
37
46
  # @@error_fasta_file = File.open("fln_results/error_seqs.fasta", 'w')
38
47
  # @@error_file = File.open("fln_results/errors_info.txt", 'w')
39
48
 
@@ -50,6 +59,10 @@ class MyWorkerManager < ScbiMapreduce::WorkManager
50
59
  @@tcode_file.close
51
60
  @@nc_rna_file.close
52
61
 
62
+ if (!@@options[:chimera].nil?)
63
+ @@chimera_file.close
64
+ end
65
+
53
66
  # @@error_fasta_file.close
54
67
  # @@error_file.close
55
68
 
@@ -113,50 +126,101 @@ class MyWorkerManager < ScbiMapreduce::WorkManager
113
126
 
114
127
  def write_seq(seq)
115
128
  begin
116
- # -------------------------------------------------------- Complete Seqs
117
- if (e=seq.get_annotations(:complete).first)
129
+ # -------------------------------------------------------- Chimeric Seqs
130
+ if (!@@options[:chimera].nil?)
131
+ if (q=seq.get_annotations(:chimera).first)
132
+ @@chimera_file.puts q[:message]
133
+ # -------------------------------------------------- Complete Seqs
134
+ elsif (e=seq.get_annotations(:complete).first)
135
+
136
+ @@annotation_file.puts e[:message]
137
+
138
+ if (a=seq.get_annotations(:alignment).first)
139
+ @@alignment_file.puts a[:message]
140
+ end
141
+
142
+ if (p=seq.get_annotations(:protein).first)
143
+ @@prot_file.puts p[:message]
144
+ end
145
+
146
+ if (n=seq.get_annotations(:nucleotide).first)
147
+ @@nts_file.puts n[:message]
148
+ end
149
+ # --------------------------------------------------- Non Complete Seqs
150
+ elsif (e=seq.get_annotations(:tmp_annotation).first)
118
151
 
119
- @@annotation_file.puts e[:message]
152
+ @@annotation_file.puts e[:message][0]
120
153
 
121
- if (a=seq.get_annotations(:alignment).first)
122
- @@alignment_file.puts a[:message]
123
- end
154
+ if (a=seq.get_annotations(:alignment).first)
155
+ if !a[:message].empty?
156
+ @@alignment_file.puts a[:message]
157
+ end
158
+ end
124
159
 
125
- if (p=seq.get_annotations(:protein).first)
126
- @@prot_file.puts p[:message]
127
- end
160
+ if (p=seq.get_annotations(:protein).first)
161
+ if !p[:message].empty?
162
+ @@prot_file.puts p[:message]
163
+ end
164
+ end
128
165
 
129
- if (n=seq.get_annotations(:nucleotide).first)
130
- @@nts_file.puts n[:message]
166
+ if (n=seq.get_annotations(:nucleotide).first)
167
+ @@nts_file.puts n[:message]
168
+ end
169
+ # ------------------------------------------------- nc RNA
170
+ elsif (nc=seq.get_annotations(:ncrna).first)
171
+ @@nc_rna_file.puts nc[:message]
172
+ # ------------------------------------------------- Test Code
173
+ elsif (t=seq.get_annotations(:tcode).first)
174
+ @@tcode_file.puts t[:message]
131
175
  end
132
- # -------------------------------------------------------- Non Complete Seqs
133
- elsif (e=seq.get_annotations(:tmp_annotation).first)
176
+ # ---------------------------------------------------------------------------------
177
+ # -------------------------------------------------------- without Chimeric Seqs Mode
178
+ else
179
+ # ------------------------------------------------- Complete Seqs
180
+ if (e=seq.get_annotations(:complete).first)
134
181
 
135
- @@annotation_file.puts e[:message][0]
182
+ @@annotation_file.puts e[:message]
136
183
 
137
- if (a=seq.get_annotations(:alignment).first)
138
- if !a[:message].empty?
184
+ if (a=seq.get_annotations(:alignment).first)
139
185
  @@alignment_file.puts a[:message]
140
186
  end
141
- end
142
187
 
143
- if (p=seq.get_annotations(:protein).first)
144
- if !p[:message].empty?
188
+ if (p=seq.get_annotations(:protein).first)
145
189
  @@prot_file.puts p[:message]
146
190
  end
147
- end
148
191
 
149
- if (n=seq.get_annotations(:nucleotide).first)
150
- @@nts_file.puts n[:message]
192
+ if (n=seq.get_annotations(:nucleotide).first)
193
+ @@nts_file.puts n[:message]
194
+ end
195
+ # ------------------------------------------------- Non Complete Seqs
196
+ elsif (e=seq.get_annotations(:tmp_annotation).first)
197
+
198
+ @@annotation_file.puts e[:message][0]
199
+
200
+ if (a=seq.get_annotations(:alignment).first)
201
+ if !a[:message].empty?
202
+ @@alignment_file.puts a[:message]
203
+ end
204
+ end
205
+
206
+ if (p=seq.get_annotations(:protein).first)
207
+ if !p[:message].empty?
208
+ @@prot_file.puts p[:message]
209
+ end
210
+ end
211
+
212
+ if (n=seq.get_annotations(:nucleotide).first)
213
+ @@nts_file.puts n[:message]
214
+ end
215
+ # ------------------------------------------------- nc RNA
216
+ elsif (nc=seq.get_annotations(:ncrna).first)
217
+ @@nc_rna_file.puts nc[:message]
218
+ # ------------------------------------------------- Test Code
219
+ elsif (t=seq.get_annotations(:tcode).first)
220
+ @@tcode_file.puts t[:message]
151
221
  end
152
- # -------------------------------------------------------- nc RNA
153
- elsif (nc=seq.get_annotations(:ncrna).first)
154
- @@nc_rna_file.puts nc[:message]
155
- # -------------------------------------------------------- Test Code
156
- elsif (t=seq.get_annotations(:tcode).first)
157
- @@tcode_file.puts t[:message]
158
222
  end
159
- # -------------------------------------------------------- errors
223
+ # ------------------------------------------------- errors
160
224
  # if e=seq.get_annotations(:error).first
161
225
  # if !e[:message].empty?
162
226
  # @@error_fasta_file.puts ">#{seq.seq_name}\n#{seq.seq_fasta}"
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: full_lengther_next
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.6
5
+ version: 0.0.8
6
6
  platform: ruby
7
7
  authors:
8
8
  - Noe Fernandez & Dario Guerrero
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2012-04-16 00:00:00 Z
13
+ date: 2012-11-28 00:00:00 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: xml-simple
@@ -31,7 +31,7 @@ dependencies:
31
31
  requirements:
32
32
  - - ">="
33
33
  - !ruby/object:Gem::Version
34
- version: 0.0.32
34
+ version: 0.0.37
35
35
  type: :runtime
36
36
  version_requirements: *id002
37
37
  - !ruby/object:Gem::Dependency
@@ -97,6 +97,7 @@ files:
97
97
  - bin/full_lengther_next
98
98
  - History.txt
99
99
  - lib/full_lengther_next/classes/common_functions.rb
100
+ - lib/full_lengther_next/classes/chimeric_seqs.rb
100
101
  - lib/full_lengther_next/classes/fl_analysis.rb
101
102
  - lib/full_lengther_next/classes/fl_string_utils.rb
102
103
  - lib/full_lengther_next/classes/fln_stats.rb
@@ -142,7 +143,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
142
143
  requirements: []
143
144
 
144
145
  rubyforge_project: full_lengther_next
145
- rubygems_version: 1.7.2
146
+ rubygems_version: 1.8.24
146
147
  signing_key:
147
148
  specification_version: 3
148
149
  summary: FULL-LENGTHERNEXT is a tool adapted to NGS technologies, able to work in parallel and in a distributed way to minimise computing time