full_lengther_next 0.0.6 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,11 @@
1
+ === 0.0.8 2012-11-28
2
+
3
+ Protection against empty seqs when all seqs match against user_db
4
+
5
+ === 0.0.7 2012-07-25
6
+
7
+ Chimera detection
8
+
1
9
  === 0.0.6 2012-04-16
2
10
 
3
11
  Fixed some cosmetic issues and parameters names
@@ -3,6 +3,7 @@ bin/make_user_db.rb
3
3
  bin/full_lengther_next
4
4
  History.txt
5
5
  lib/full_lengther_next/classes/common_functions.rb
6
+ lib/full_lengther_next/classes/chimeric_seqs.rb
6
7
  lib/full_lengther_next/classes/fl_analysis.rb
7
8
  lib/full_lengther_next/classes/fl_string_utils.rb
8
9
  lib/full_lengther_next/classes/fln_stats.rb
data/Rakefile CHANGED
@@ -20,7 +20,7 @@ $hoe = Hoe.spec 'full_lengther_next' do
20
20
  # self.extra_deps << ['gnuplot','>=0']
21
21
  # self.extra_deps << ['term-ansicolor','>=1.0.5']
22
22
  self.extra_deps << ['xml-simple','>=1.0.12']
23
- self.extra_deps << ['scbi_blast','>=0.0.32']
23
+ self.extra_deps << ['scbi_blast','>=0.0.37']
24
24
  self.extra_deps << ['scbi_mapreduce','>=0.0.29']
25
25
  self.extra_deps << ['scbi_fasta','>=0.1.7']
26
26
  # self.extra_deps << ['scbi_fastq','>=0.0.13']
@@ -50,6 +50,11 @@ optparse = OptionParser.new do |opts|
50
50
  options[:distance] = distance.to_i
51
51
  end
52
52
 
53
+ options[:chimera] = nil
54
+ opts.on( '-q', '--chimera_detection', "apply chimera detection mode\n\n" ) do |chimera|
55
+ options[:chimera] = chimera
56
+ end
57
+
53
58
  options[:workers] = 2
54
59
  opts.on( '-w', '--workers INTEGER/FILE', 'Number of CPUs, or a file containing machine names to launch workers with ssh' ) do |workers|
55
60
  if File.exists?(workers)
@@ -91,7 +96,7 @@ optparse = OptionParser.new do |opts|
91
96
 
92
97
 
93
98
  # Set a banner, displayed at the top of the help screen.
94
- opts.banner = "Usage: full_lengther_next -f input.fasta -g [fungi|human|invertebrates|mammals|plants|rodents|vertebrates] [options]\n\n"
99
+ opts.banner = "\nUsage: full_lengther_next -f input.fasta -g [fungi|human|invertebrates|mammals|plants|rodents|vertebrates] [options]\n\n"
95
100
 
96
101
  # This displays the help screen
97
102
  opts.on( '-h', '--help', 'Display this screen' ) do
@@ -172,6 +177,8 @@ require 'my_worker_manager'
172
177
  $LOG = Logger.new(STDOUT)
173
178
  $LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
174
179
 
180
+ # puts "ROOT_PATH: #{ROOT_PATH}"
181
+
175
182
  custom_worker_file = File.join(File.dirname(ROOT_PATH),'lib','full_lengther_next','classes','my_worker.rb')
176
183
 
177
184
  $LOG.info 'Starting server'
@@ -126,16 +126,16 @@ end
126
126
  ENV['BLASTDB']=formatted_db_path
127
127
 
128
128
 
129
- if !File.exists?(File.join(ENV['BLASTDB'], my_group))
130
- Dir.mkdir("blast_dbs/#{my_group}")
129
+ if !File.exists?(File.join(formatted_db_path, my_group))
130
+ Dir.mkdir(File.join(formatted_db_path,my_group))
131
131
  end
132
132
 
133
- output_file_path=File.join(ENV['BLASTDB'],my_group,my_group+".fasta")
133
+ output_file_path=File.join(formatted_db_path,my_group,my_group+".fasta")
134
134
 
135
135
  output_file = File.new(output_file_path, "w")
136
136
 
137
- filter_incomplete_seqs(output_file, File.join(ENV['BLASTDB'], "uniprot_sprot_#{uniprot_group}.dat"), my_group)
138
- filter_incomplete_seqs(output_file, File.join(ENV['BLASTDB'], "uniprot_trembl_#{uniprot_group}.dat"), my_group)
137
+ filter_incomplete_seqs(output_file, File.join(formatted_db_path, "uniprot_sprot_#{uniprot_group}.dat"), my_group)
138
+ filter_incomplete_seqs(output_file, File.join(formatted_db_path, "uniprot_trembl_#{uniprot_group}.dat"), my_group)
139
139
 
140
140
  output_file.close
141
141
 
@@ -1,13 +1,13 @@
1
1
  $:.unshift(File.dirname(__FILE__)) unless
2
2
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
3
 
4
- root_path=File.join(File.dirname(__FILE__),'full_lengther_next')
4
+ # ROOT_PATH=File.join(File.dirname(__FILE__),'full_lengther_next')
5
5
 
6
- $: << File.expand_path(File.join(root_path, 'classes'))
6
+ $: << File.expand_path(File.join(File.dirname(__FILE__), 'full_lengther_next', 'classes'))
7
7
 
8
8
 
9
9
  module FullLengtherNext
10
- VERSION = '0.0.6'
10
+ VERSION = '0.0.8'
11
11
 
12
12
  FULLLENGHTER_VERSION = VERSION
13
13
  end
@@ -0,0 +1,78 @@
1
+
2
+ require 'scbi_blast'
3
+
4
# Helpers for flagging putative chimeric sequences from BLAST results and for
# pruning a BLAST result object down to the best subject of each kept query.
module ChimericSeqs

  # Looks for evidence that +seq+ is a chimera: a hit located outside the
  # query region covered by the first hit (extended by every hit overlapping
  # it) and whose accession differs from the first hit's accession.
  #
  # When such a hit is found, a two-row, tab-separated report (one row for the
  # first hit, one for the offending hit) is stored on the sequence through
  # seq.annotate(:chimera, report, false).
  #
  # seq         - object responding to seq_name, fasta_length and annotate.
  # blast_query - object responding to query_def and hits; each hit responds
  #               to acc, q_beg, q_end, e_val, ident, q_frame, s_beg, s_end
  #               and definition.
  # options     - not used by this method; kept for interface compatibility.
  # db_name     - database label written into the report rows.
  #
  # Raises RuntimeError when the sequence name and the BLAST query name differ.
  # The return value is not meaningful.
  def search_chimeras(seq, blast_query, options, db_name)

    # used to detect if the sequence and the blast are from different query
    if seq.seq_name != blast_query.query_def
      puts "#{seq.seq_name} --> #{blast_query.query_def}"
      raise "BLAST query name and sequence are different"
    end

    hits = blast_query.hits
    # Nothing to compare against when the query produced no hits
    # (previously this dereferenced hits[0] and raised NoMethodError).
    return if hits.nil? || hits.empty?

    best_hit = hits[0]
    ref_hit_beg = best_hit.q_beg
    ref_hit_end = best_hit.q_end

    # true when the hit overlaps, or is contained in, the current reference region
    overlaps_ref = lambda do |hit|
      ((ref_hit_beg <= hit.q_beg) && (ref_hit_end > hit.q_beg)) ||
        ((hit.q_beg <= ref_hit_beg) && (hit.q_end > ref_hit_beg))
    end

    # First pass: grow the reference region with every hit that overlaps it.
    hits.each do |hit|
      next unless overlaps_ref.call(hit)
      ref_hit_beg = [ref_hit_beg, hit.q_beg].min
      ref_hit_end = [ref_hit_end, hit.q_end].max
    end

    # Second pass: inspect the first hit that falls outside the grown region.
    hits.each do |hit|
      next if overlaps_ref.call(hit)

      if hit.acc != best_hit.acc
        message = "Putative chimera detected showing similarity with two different genes #{best_hit.acc} - #{hit.acc}"

        # one report row; the layout (including empty columns and the
        # trailing tab) is what the results writer and stats reader expect
        report_row = lambda do |h|
          "#{blast_query.query_def}\t#{seq.fasta_length}\t#{h.acc}\t#{db_name}\tPutative chimera\t\t#{h.e_val}\t#{h.ident}\t\t\t#{message}\t#{h.q_frame}\t#{h.q_beg}\t#{h.q_end}\t#{h.s_beg.to_i}\t#{h.s_end.to_i}\t#{h.definition}\t"
        end

        chimera_annotations = "\n#{report_row.call(best_hit)}\n#{report_row.call(hit)}"
        seq.annotate(:chimera, chimera_annotations, false)
      end

      # NOTE(review): the scan stops at the first non-overlapping hit even when
      # it shares the first hit's accession, so later hits are never examined.
      # Kept as in the original; confirm this is intended.
      return
    end

  end

  # Prunes +tmp_blast_obj+ in place and returns it.
  #
  # * Queries whose query_def is not the seq_name of any sequence in
  #   +new_seqs+ are removed (sequences flagged as chimeras are dropped so
  #   they are not used afterwards).
  # * For each remaining query, only the hits sharing the accession of its
  #   first hit are kept.
  #
  # tmp_blast_obj - object responding to querys (Array of queries, each
  #                 responding to query_def and hits).
  # new_seqs      - sequences to keep; each responds to seq_name.
  def select_best_blast(tmp_blast_obj, new_seqs)

    kept_names = {}
    new_seqs.each { |seq| kept_names[seq.seq_name] = true }

    # delete_if replaces the manual reverse index bookkeeping + delete_at
    tmp_blast_obj.querys.delete_if { |query| !kept_names[query.query_def] }

    tmp_blast_obj.querys.each do |query|
      next if query.hits.empty?
      best_acc = query.hits[0].acc
      query.hits.delete_if { |hit| hit.acc != best_acc }
    end

    return tmp_blast_obj
  end

end
@@ -4,20 +4,28 @@ module FlnStats
4
4
  def summary_stats
5
5
  stats_file = File.open('fln_results/summary_stats.html', 'w')
6
6
 
7
+ size_filter1 = 200
8
+ size_filter2 = 500
9
+
7
10
  # recogemos los trozos de html fijos
8
11
  (html_head, html_st, html_uni, html_db, html_as, html_end) = html_code
9
12
 
10
13
  total_seqs = 0
11
14
  status_suma = 0
12
15
  #recogemos los datos que necesitamos de los ficheros de resultados
13
- (status_array, db_usage, seqs_number1, error_1_num, seq_uniq, complete_uniq, db_uni_500, db_uni_200, db_longest_one) = annotation_stats
14
- (tcode_array, seqs_number2, unk_200, tc_uni_500, tc_uni_200, tc_longest_one) = testcode_stats
15
- (ncrna_total, nc_uni_500, nc_uni_200, nc_longest_one)=ncrna_stats
16
+ (status_array, db_usage, seqs_number1, error_1_num, seq_uniq, complete_uniq, db_uni_500, db_uni_200, db_longest_one) = annotation_stats(size_filter1,size_filter2)
17
+ (tcode_array, seqs_number2, tc_uni_500, tc_uni_200, tc_longest_one) = testcode_stats(size_filter1,size_filter2)
18
+ (ncrna_total, nc_uni_500, nc_uni_200, nc_longest_one)=ncrna_stats(size_filter1,size_filter2)
19
+ (chimera_total, ch_uni_500, ch_uni_200, ch_longest_one, ch_db_usage)=chimera_stats(size_filter1,size_filter2)
16
20
 
17
- total_seqs = seqs_number1 + seqs_number2 + ncrna_total.to_i
18
- uni_500 = (db_uni_500 + tc_uni_500 + nc_uni_500)
19
- uni_200 = (db_uni_200 + tc_uni_200 + nc_uni_200)
20
- longest_one = [db_longest_one, tc_longest_one, nc_longest_one].max
21
+ seqs_number1 = (seqs_number1+chimera_total.to_i)
22
+ total_seqs = (seqs_number1 + seqs_number2 + ncrna_total.to_i)
23
+ uni_500 = (db_uni_500 + tc_uni_500 + nc_uni_500 + ch_uni_500)
24
+ uni_200 = (db_uni_200 + tc_uni_200 + nc_uni_200 + ch_uni_200)
25
+ longest_one = [db_longest_one, tc_longest_one, nc_longest_one, ch_longest_one].max
26
+ db_usage[0] += ch_db_usage[0]
27
+ db_usage[1] += ch_db_usage[1]
28
+ db_usage[2] += ch_db_usage[2]
21
29
  stats_file.puts html_head
22
30
 
23
31
  if (total_seqs.to_i > 0)
@@ -46,6 +54,15 @@ module FlnStats
46
54
  end
47
55
  status_suma += status[0]
48
56
  end
57
+
58
+ # adding chimeric seqs
59
+ stats_file.puts ' <tr>
60
+ <td colspan="2" align="left">Putative chimera</td>
61
+ <td align="right">'+chimera_total.to_s+'</td>
62
+ <td align="right">'+'%.2f' % (100*chimera_total.to_f/total_seqs.to_f).to_s+' %</td>
63
+ </tr>'
64
+ status_suma += chimera_total
65
+
49
66
  # añadimos los coding, P.coding
50
67
  tcode_array.each do |status|
51
68
  if (status[1] == 'Coding')
@@ -64,6 +81,7 @@ module FlnStats
64
81
  end
65
82
  status_suma += status[0]
66
83
  end
84
+
67
85
  # se ponen los ncRNA
68
86
  stats_file.puts ' <tr>
69
87
  <td colspan="2" align="left">Putative ncRNA</td>
@@ -156,12 +174,12 @@ module FlnStats
156
174
  <td align="right">'+'%.2f' % (100*total_seqs.to_f/total_seqs.to_f).to_s+' %</td>
157
175
  </tr>'
158
176
  stats_file.puts ' <tr>
159
- <td align="left">Unigenes >500pb</td>
177
+ <td align="left">Unigenes >'+size_filter2.to_s+'pb</td>
160
178
  <td align="right">'+uni_500.to_s+'</td>
161
179
  <td align="right">'+'%.2f' % (100*uni_500.to_f/total_seqs.to_f).to_s+' %</td>
162
180
  </tr>'
163
181
  stats_file.puts ' <tr>
164
- <td align="left">Unigenes >200pb</td>
182
+ <td align="left">Unigenes >'+size_filter1.to_s+'pb</td>
165
183
  <td align="right">'+uni_200.to_s+'</td>
166
184
  <td align="right">'+'%.2f' % (100*uni_200.to_f/total_seqs.to_f).to_s+' %</td>
167
185
  </tr>'
@@ -175,6 +193,8 @@ module FlnStats
175
193
  <td align="right">'+seqs_number1.to_s+'</td>
176
194
  <td align="right">'+'%.2f' % (100*seqs_number1.to_f/total_seqs.to_f).to_s+' %</td>
177
195
  </tr>'
196
+
197
+ if (seqs_number1.to_i > 0)
178
198
  stats_file.puts ' <tr>
179
199
  <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Different orthologue IDs</td>
180
200
  <td align="right">'+seq_uniq.to_s+'</td>
@@ -195,21 +215,49 @@ module FlnStats
195
215
  <td align="right">'+error_1_num.to_s+'</td>
196
216
  <td align="right">'+'%.2f' % (100*error_1_num.to_f/seqs_number1.to_f).to_s+' %</td>
197
217
  </tr>'
218
+ stats_file.puts ' <tr>
219
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative chimera</td>
220
+ <td align="right">'+chimera_total.to_s+'</td>
221
+ <td align="right">'+'%.2f' % (100*chimera_total.to_f/seqs_number1.to_f).to_s+' %</td>
222
+ </tr>'
223
+ end
198
224
  stats_file.puts ' <tr>
199
225
  <td align="left">Without orthologue <sup>1</sup></td>
200
226
  <td align="right">'+no_db.to_s+'</td>
201
227
  <td align="right">'+'%.2f' % (100*seqs_number2.to_f/total_seqs.to_f).to_s+' %</td>
202
228
  </tr>'
229
+
230
+ if (no_db.to_i > 0) && (seqs_number2.to_i > 0)
203
231
  stats_file.puts ' <tr>
204
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding</td>
232
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding (all)</td>
205
233
  <td align="right">'+tcode_array[0][0].to_s+'</td>
206
234
  <td align="right">'+'%.2f' % (100*tcode_array[0][0].to_f/no_db.to_f).to_s+' %</td>
207
235
  </tr>'
208
236
  stats_file.puts ' <tr>
209
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding</td>
237
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding > '+size_filter1.to_s+'bp</td>
238
+ <td align="right">'+tcode_array[0][2].to_s+'</td>
239
+ <td align="right">'+'%.2f' % (100*tcode_array[0][2].to_f/no_db.to_f).to_s+' %</td>
240
+ </tr>'
241
+ stats_file.puts ' <tr>
242
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding > '+size_filter2.to_s+'bp</td>
243
+ <td align="right">'+tcode_array[0][3].to_s+'</td>
244
+ <td align="right">'+'%.2f' % (100*tcode_array[0][3].to_f/no_db.to_f).to_s+' %</td>
245
+ </tr>'
246
+ stats_file.puts ' <tr>
247
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding (all)</td>
210
248
  <td align="right">'+tcode_array[1][0].to_s+'</td>
211
249
  <td align="right">'+'%.2f' % (100*tcode_array[1][0].to_f/no_db.to_f).to_s+' %</td>
212
250
  </tr>'
251
+ stats_file.puts ' <tr>
252
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding > '+size_filter1.to_s+'bp</td>
253
+ <td align="right">'+tcode_array[1][2].to_s+'</td>
254
+ <td align="right">'+'%.2f' % (100*tcode_array[1][2].to_f/no_db.to_f).to_s+' %</td>
255
+ </tr>'
256
+ stats_file.puts ' <tr>
257
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding > '+size_filter2.to_s+'bp</td>
258
+ <td align="right">'+tcode_array[1][3].to_s+'</td>
259
+ <td align="right">'+'%.2f' % (100*tcode_array[1][3].to_f/no_db.to_f).to_s+' %</td>
260
+ </tr>'
213
261
  stats_file.puts ' <tr>
214
262
  <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative ncRNA</td>
215
263
  <td align="right">'+ncrna_total.to_s+'</td>
@@ -221,16 +269,19 @@ module FlnStats
221
269
  <td align="right">'+'%.2f' % (100*tcode_array[2][0].to_f/no_db.to_f).to_s+' %</td>
222
270
  </tr>'
223
271
  stats_file.puts ' <tr>
224
- <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Unknown < 200bp</td>
225
- <td align="right">'+unk_200.to_s+'</td>
226
- <td align="right">'+'%.2f' % (100*unk_200.to_f/no_db.to_f).to_s+' %</td>
227
- </tr>
228
- </table>
272
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Unknown > '+size_filter1.to_s+'bp</td>
273
+ <td align="right">'+tcode_array[2][2].to_s+'</td>
274
+ <td align="right">'+'%.2f' % (100*tcode_array[2][2].to_f/no_db.to_f).to_s+' %</td>
275
+ </tr>'
276
+ stats_file.puts ' <tr>
277
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Unknown > '+size_filter2.to_s+'bp</td>
278
+ <td align="right">'+tcode_array[2][3].to_s+'</td>
279
+ <td align="right">'+'%.2f' % (100*tcode_array[2][3].to_f/no_db.to_f).to_s+' %</td>
280
+ </tr>'
281
+ end
282
+ stats_file.puts ' </table>
229
283
  <sup>1</sup> Percents for subclassifications of this category were calculated using this line as 100% reference.'
230
284
 
231
-
232
-
233
-
234
285
  end
235
286
  stats_file.puts html_end
236
287
 
@@ -309,7 +360,7 @@ module FlnStats
309
360
  end
310
361
 
311
362
 
312
- def annotation_stats
363
+ def annotation_stats(size_filter1,size_filter2)
313
364
 
314
365
  seqs_number = 0
315
366
  array_of_all_accs = []
@@ -353,10 +404,10 @@ module FlnStats
353
404
  end
354
405
 
355
406
  # -------------------------------------------------------------------------
356
- if (fasta_length.to_i >= 200)
407
+ if (fasta_length.to_i >= size_filter1)
357
408
  uni_200 += 1
358
409
  end
359
- if (fasta_length.to_i >= 500)
410
+ if (fasta_length.to_i >= size_filter2)
360
411
  uni_500 += 1
361
412
  end
362
413
  # -------------------------------------------------------------------------
@@ -394,18 +445,17 @@ module FlnStats
394
445
  end
395
446
 
396
447
 
397
- def testcode_stats
448
+ def testcode_stats(size_filter1,size_filter2)
398
449
 
399
450
  seqs_number = 0
400
- unk_200 = 0
401
451
  uni_500 = 0
402
452
  uni_200 = 0
403
453
  longest_one = 0
404
454
 
405
455
  # total, status
406
- coding_stats = [0,'Coding']
407
- p_coding_stats = [0,'Putative Coding']
408
- unknown_stats = [0,'Unknown']
456
+ coding_stats = [0,'Coding',0,0]
457
+ p_coding_stats = [0,'Putative Coding',0,0]
458
+ unknown_stats = [0,'Unknown',0,0]
409
459
 
410
460
  File.open('fln_results/new_coding.txt').each do |line|
411
461
  line.chomp!
@@ -419,17 +469,31 @@ module FlnStats
419
469
  end
420
470
 
421
471
  # -------------------------------------------------------------------------
422
- if (fasta_length.to_i >= 200)
472
+ if (fasta_length.to_i >= size_filter1)
423
473
  uni_200 += 1
424
474
  end
425
- if (fasta_length.to_i >= 500)
475
+ if (fasta_length.to_i >= size_filter2)
426
476
  uni_500 += 1
427
477
  end
428
478
  # -------------------------------------------------------------------------
429
479
 
430
- if (fasta_length.to_i < 200)
431
- if (status == 'unknown')
432
- unk_200 += 1
480
+ if (fasta_length.to_i > size_filter1)
481
+ if (status == 'coding')
482
+ coding_stats[2] += 1
483
+ elsif (status == 'putative_coding')
484
+ p_coding_stats[2] += 1
485
+ elsif (status == 'unknown')
486
+ unknown_stats[2] += 1
487
+ end
488
+ end
489
+
490
+ if (fasta_length.to_i > size_filter2)
491
+ if (status == 'coding')
492
+ coding_stats[3] += 1
493
+ elsif (status == 'putative_coding')
494
+ p_coding_stats[3] += 1
495
+ elsif (status == 'unknown')
496
+ unknown_stats[3] += 1
433
497
  end
434
498
  end
435
499
 
@@ -447,11 +511,11 @@ module FlnStats
447
511
 
448
512
  status_array = [coding_stats, p_coding_stats, unknown_stats]
449
513
 
450
- return [status_array, seqs_number, unk_200, uni_500, uni_200, longest_one]
514
+ return [status_array, seqs_number, uni_500, uni_200, longest_one]
451
515
  end
452
516
 
453
517
 
454
- def ncrna_stats
518
+ def ncrna_stats(size_filter1,size_filter2)
455
519
 
456
520
  uni_500 = 0
457
521
  uni_200 = 0
@@ -468,10 +532,10 @@ module FlnStats
468
532
  longest_one = fasta_length.to_i
469
533
  end
470
534
  # -------------------------------------------------------------------------
471
- if (fasta_length.to_i >= 200)
535
+ if (fasta_length.to_i >= size_filter1)
472
536
  uni_200 += 1
473
537
  end
474
- if (fasta_length.to_i >= 500)
538
+ if (fasta_length.to_i >= size_filter2)
475
539
  uni_500 += 1
476
540
  end
477
541
  # -------------------------------------------------------------------------
@@ -484,5 +548,53 @@ module FlnStats
484
548
  return [nc_total, uni_500, uni_200, longest_one]
485
549
  end
486
550
 
551
# Summarises the putative chimeras listed in fln_results/chimeric_sequences.txt.
#
# Only tab-separated lines whose fifth column is 'Putative chimera' are
# counted. Every per-line counter is halved before being returned
# (integer division); presumably because each chimera is reported as two
# rows, one per hit -- confirm against the writer of that file.
#
# size_filter1 - lower length threshold (counted into the "uni_200" figure).
# size_filter2 - upper length threshold (counted into the "uni_500" figure).
#
# Returns [chimera_total, uni_500, uni_200, longest_one, db_usage], where
# db_usage is [other_db, sp_db, tr_db] chosen by the db_name prefix
# ("sp_", "tr_", anything else). Returns all zeros when the file is missing.
def chimera_stats(size_filter1,size_filter2)

  chimera_path = 'fln_results/chimeric_sequences.txt'

  uni_500 = 0
  uni_200 = 0
  ch_total = 0
  longest_one = 0
  db_usage = [0,0,0]

  # File.exist? rather than File.exists?: the latter is a deprecated alias
  # that no longer exists in Ruby >= 3.2.
  return [0, 0, 0, longest_one, db_usage] unless File.exist?(chimera_path)

  # File.foreach closes the file when done (File.open(...).each leaked the handle)
  File.foreach(chimera_path) do |line|
    line.chomp!
    next if line.empty?

    (_name, fasta_length, _acc, db_name, status) = line.split("\t")
    next unless status == 'Putative chimera'

    length = fasta_length.to_i
    longest_one = length if length > longest_one
    # -------------------------------------------------------------------------
    uni_200 += 1 if length >= size_filter1
    uni_500 += 1 if length >= size_filter2
    # -------------------------------------------------------------------------
    if db_name =~ /^sp_/
      db_usage[1] += 1
    elsif db_name =~ /^tr_/
      db_usage[2] += 1
    else
      db_usage[0] += 1
    end
    # -------------------------------------------------------------------------
    ch_total += 1
  end

  db_usage = db_usage.map { |count| count/2 }

  return [(ch_total/2), (uni_500/2), (uni_200/2), longest_one, db_usage]
end
598
+
487
599
 
488
600
  end
@@ -8,6 +8,9 @@ require 'fl_string_utils'
8
8
  require "lcs" # like the class simliar of seqtrim, return the longest common sequence
9
9
  require "test_code"
10
10
 
11
+ require 'chimeric_seqs'
12
+ include ChimericSeqs
13
+
11
14
  require 'fl_analysis'
12
15
  include FlAnalysis
13
16
 
@@ -46,7 +49,12 @@ class MyWorker < ScbiMapreduce::Worker
46
49
  # ejecuta blast utilizando los parametros fichero de entrada, base de datos, fichero de salida y tipo de blast
47
50
  def run_blast(input, database, blast_type, evalue)
48
51
 
49
- blast=BatchBlast.new("-db #{database}",blast_type,"-evalue #{evalue} -max_target_seqs 1")
52
+ if (@options[:chimera].nil?)
53
+ blast=BatchBlast.new("-db #{database}",blast_type,"-evalue #{evalue} -max_target_seqs 1")
54
+ else
55
+ blast=BatchBlast.new("-db #{database}",blast_type,"-evalue #{evalue}")
56
+ end
57
+
50
58
  blast_result = blast.do_blast_seqs(input, :xml)
51
59
 
52
60
  return blast_result
@@ -73,34 +81,71 @@ class MyWorker < ScbiMapreduce::Worker
73
81
  # do blast
74
82
  my_blast = run_blast(seqs, "#{@options[:user_db]}", 'blastx', '1e-6')
75
83
 
84
+ # chimera detection
85
+ if (!@options[:chimera].nil?)
86
+ seqs.each_with_index do |seq,i|
87
+ if (!my_blast.querys[i].hits[0].nil?)
88
+ search_chimeras(seq, my_blast.querys[i], @options, user_db_name)
89
+ end
90
+ end
91
+
92
+ seqs=seqs.select{|s| s.get_annotations(:chimera).empty?}
93
+ my_blast = select_best_blast(my_blast, seqs)
94
+ end
95
+
76
96
  # split and parse blast
77
97
  seqs.each_with_index do |seq,i|
78
98
  analiza_orf_y_fl(seq, my_blast.querys[i], @options, user_db_name)
79
99
  end
80
100
 
81
- new_seqs=seqs.select{|s| s.get_annotations(:complete).empty?}
101
+ new_seqs=seqs.select{|s| (s.get_annotations(:complete).empty? && s.get_annotations(:chimera).empty?)}
82
102
 
83
103
  else
84
104
  new_seqs = seqs
85
105
  end
106
+
107
+ return if new_seqs.empty?
86
108
 
87
109
  # -------------------------------------------- UniProt (sp)
88
110
  # blast
89
111
  sp_path=File.join("sp_#{@options[:tax_group]}","sp_#{@options[:tax_group]}.fasta")
90
112
  my_blast = run_blast(new_seqs, sp_path, 'blastx', '1e-6')
91
113
 
114
+ # chimera detection
115
+ if (!@options[:chimera].nil?)
116
+ new_seqs.each_with_index do |seq,i|
117
+ if (!my_blast.querys[i].hits[0].nil?)
118
+ search_chimeras(seq, my_blast.querys[i], @options, "sp_#{@options[:tax_group]}")
119
+ end
120
+ end
121
+ new_seqs=seqs.select{|s| s.get_annotations(:chimera).empty?}
122
+ my_blast = select_best_blast(my_blast, new_seqs)
123
+ end
124
+
92
125
  # split and parse blast
93
126
  new_seqs.each_with_index do |seq,i|
94
127
  analiza_orf_y_fl(seq, my_blast.querys[i], @options, "sp_#{@options[:tax_group]}")
95
128
  end
96
129
 
97
- new_seqs=seqs.select{|s| s.get_annotations(:complete).empty?}
130
+ new_seqs=seqs.select{|s| (s.get_annotations(:complete).empty? && s.get_annotations(:chimera).empty?)}
131
+ return if new_seqs.empty?
98
132
 
99
133
  # -------------------------------------------- UniProt (tr)
100
134
  # blast
101
135
  tr_path=File.join("tr_#{@options[:tax_group]}","tr_#{@options[:tax_group]}.fasta")
102
136
  my_blast = run_blast(new_seqs, tr_path, 'blastx', '1e-6')
103
137
 
138
+ # chimera detection
139
+ if (!@options[:chimera].nil?)
140
+ new_seqs.each_with_index do |seq,i|
141
+ if (!my_blast.querys[i].hits[0].nil?)
142
+ search_chimeras(seq, my_blast.querys[i], @options, "tr_#{@options[:tax_group]}")
143
+ end
144
+ end
145
+ new_seqs=new_seqs.select{|s| s.get_annotations(:chimera).empty?}
146
+ my_blast = select_best_blast(my_blast, new_seqs)
147
+ end
148
+
104
149
  # split and parse blast
105
150
  new_seqs.each_with_index do |seq,i|
106
151
  analiza_orf_y_fl(seq, my_blast.querys[i], @options, "tr_#{@options[:tax_group]}")
@@ -108,8 +153,9 @@ class MyWorker < ScbiMapreduce::Worker
108
153
 
109
154
  # -------------------------------------------- Test Code
110
155
  # the sequences without a reliable similarity with an orthologue are processed with Test Code
111
- testcode_input=seqs.select{|s| !s.get_annotations(:apply_tcode).empty?}
112
-
156
+ testcode_input=seqs.select{|s| (!s.get_annotations(:apply_tcode).empty? && s.get_annotations(:chimera).empty?)}
157
+ return if testcode_input.empty?
158
+
113
159
  # active this line to test tcode, and comment all lines above in this function
114
160
  # testcode_input=seqs
115
161
 
@@ -119,6 +165,8 @@ class MyWorker < ScbiMapreduce::Worker
119
165
 
120
166
  # -------------------------------------------- nc RNA
121
167
  unknown_seqs=seqs.select{|s| !s.get_annotations(:tcode_unknown).empty?}
168
+ return if unknown_seqs.empty?
169
+
122
170
  # run blastn
123
171
  ncrna_path=File.join('nc_rna_db','ncrna_fln_100.fasta')
124
172
  my_blast = run_blast(unknown_seqs, ncrna_path, 'blastn', '1e-3')
@@ -34,6 +34,15 @@ class MyWorkerManager < ScbiMapreduce::WorkManager
34
34
  @@nc_rna_file = File.open("fln_results/nc_rnas.txt", 'w')
35
35
  @@nc_rna_file.puts file_head
36
36
 
37
+ if (!options[:chimera].nil?)
38
+ @@chimera_file = File.open("fln_results/chimeric_sequences.txt", 'w')
39
+ @@chimera_file.puts file_head
40
+ else
41
+ if File.exists?("fln_results/chimeric_sequences.txt")
42
+ File.delete("fln_results/chimeric_sequences.txt")
43
+ end
44
+ end
45
+
37
46
  # @@error_fasta_file = File.open("fln_results/error_seqs.fasta", 'w')
38
47
  # @@error_file = File.open("fln_results/errors_info.txt", 'w')
39
48
 
@@ -50,6 +59,10 @@ class MyWorkerManager < ScbiMapreduce::WorkManager
50
59
  @@tcode_file.close
51
60
  @@nc_rna_file.close
52
61
 
62
+ if (!@@options[:chimera].nil?)
63
+ @@chimera_file.close
64
+ end
65
+
53
66
  # @@error_fasta_file.close
54
67
  # @@error_file.close
55
68
 
@@ -113,50 +126,101 @@ class MyWorkerManager < ScbiMapreduce::WorkManager
113
126
 
114
127
  def write_seq(seq)
115
128
  begin
116
- # -------------------------------------------------------- Complete Seqs
117
- if (e=seq.get_annotations(:complete).first)
129
+ # -------------------------------------------------------- Chimeric Seqs
130
+ if (!@@options[:chimera].nil?)
131
+ if (q=seq.get_annotations(:chimera).first)
132
+ @@chimera_file.puts q[:message]
133
+ # -------------------------------------------------- Complete Seqs
134
+ elsif (e=seq.get_annotations(:complete).first)
135
+
136
+ @@annotation_file.puts e[:message]
137
+
138
+ if (a=seq.get_annotations(:alignment).first)
139
+ @@alignment_file.puts a[:message]
140
+ end
141
+
142
+ if (p=seq.get_annotations(:protein).first)
143
+ @@prot_file.puts p[:message]
144
+ end
145
+
146
+ if (n=seq.get_annotations(:nucleotide).first)
147
+ @@nts_file.puts n[:message]
148
+ end
149
+ # --------------------------------------------------- Non Complete Seqs
150
+ elsif (e=seq.get_annotations(:tmp_annotation).first)
118
151
 
119
- @@annotation_file.puts e[:message]
152
+ @@annotation_file.puts e[:message][0]
120
153
 
121
- if (a=seq.get_annotations(:alignment).first)
122
- @@alignment_file.puts a[:message]
123
- end
154
+ if (a=seq.get_annotations(:alignment).first)
155
+ if !a[:message].empty?
156
+ @@alignment_file.puts a[:message]
157
+ end
158
+ end
124
159
 
125
- if (p=seq.get_annotations(:protein).first)
126
- @@prot_file.puts p[:message]
127
- end
160
+ if (p=seq.get_annotations(:protein).first)
161
+ if !p[:message].empty?
162
+ @@prot_file.puts p[:message]
163
+ end
164
+ end
128
165
 
129
- if (n=seq.get_annotations(:nucleotide).first)
130
- @@nts_file.puts n[:message]
166
+ if (n=seq.get_annotations(:nucleotide).first)
167
+ @@nts_file.puts n[:message]
168
+ end
169
+ # ------------------------------------------------- nc RNA
170
+ elsif (nc=seq.get_annotations(:ncrna).first)
171
+ @@nc_rna_file.puts nc[:message]
172
+ # ------------------------------------------------- Test Code
173
+ elsif (t=seq.get_annotations(:tcode).first)
174
+ @@tcode_file.puts t[:message]
131
175
  end
132
- # -------------------------------------------------------- Non Complete Seqs
133
- elsif (e=seq.get_annotations(:tmp_annotation).first)
176
+ # ---------------------------------------------------------------------------------
177
+ # -------------------------------------------------------- without Chimeric Seqs Mode
178
+ else
179
+ # ------------------------------------------------- Complete Seqs
180
+ if (e=seq.get_annotations(:complete).first)
134
181
 
135
- @@annotation_file.puts e[:message][0]
182
+ @@annotation_file.puts e[:message]
136
183
 
137
- if (a=seq.get_annotations(:alignment).first)
138
- if !a[:message].empty?
184
+ if (a=seq.get_annotations(:alignment).first)
139
185
  @@alignment_file.puts a[:message]
140
186
  end
141
- end
142
187
 
143
- if (p=seq.get_annotations(:protein).first)
144
- if !p[:message].empty?
188
+ if (p=seq.get_annotations(:protein).first)
145
189
  @@prot_file.puts p[:message]
146
190
  end
147
- end
148
191
 
149
- if (n=seq.get_annotations(:nucleotide).first)
150
- @@nts_file.puts n[:message]
192
+ if (n=seq.get_annotations(:nucleotide).first)
193
+ @@nts_file.puts n[:message]
194
+ end
195
+ # ------------------------------------------------- Non Complete Seqs
196
+ elsif (e=seq.get_annotations(:tmp_annotation).first)
197
+
198
+ @@annotation_file.puts e[:message][0]
199
+
200
+ if (a=seq.get_annotations(:alignment).first)
201
+ if !a[:message].empty?
202
+ @@alignment_file.puts a[:message]
203
+ end
204
+ end
205
+
206
+ if (p=seq.get_annotations(:protein).first)
207
+ if !p[:message].empty?
208
+ @@prot_file.puts p[:message]
209
+ end
210
+ end
211
+
212
+ if (n=seq.get_annotations(:nucleotide).first)
213
+ @@nts_file.puts n[:message]
214
+ end
215
+ # ------------------------------------------------- nc RNA
216
+ elsif (nc=seq.get_annotations(:ncrna).first)
217
+ @@nc_rna_file.puts nc[:message]
218
+ # ------------------------------------------------- Test Code
219
+ elsif (t=seq.get_annotations(:tcode).first)
220
+ @@tcode_file.puts t[:message]
151
221
  end
152
- # -------------------------------------------------------- nc RNA
153
- elsif (nc=seq.get_annotations(:ncrna).first)
154
- @@nc_rna_file.puts nc[:message]
155
- # -------------------------------------------------------- Test Code
156
- elsif (t=seq.get_annotations(:tcode).first)
157
- @@tcode_file.puts t[:message]
158
222
  end
159
- # -------------------------------------------------------- errors
223
+ # ------------------------------------------------- errors
160
224
  # if e=seq.get_annotations(:error).first
161
225
  # if !e[:message].empty?
162
226
  # @@error_fasta_file.puts ">#{seq.seq_name}\n#{seq.seq_fasta}"
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: full_lengther_next
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.6
5
+ version: 0.0.8
6
6
  platform: ruby
7
7
  authors:
8
8
  - Noe Fernandez & Dario Guerrero
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2012-04-16 00:00:00 Z
13
+ date: 2012-11-28 00:00:00 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: xml-simple
@@ -31,7 +31,7 @@ dependencies:
31
31
  requirements:
32
32
  - - ">="
33
33
  - !ruby/object:Gem::Version
34
- version: 0.0.32
34
+ version: 0.0.37
35
35
  type: :runtime
36
36
  version_requirements: *id002
37
37
  - !ruby/object:Gem::Dependency
@@ -97,6 +97,7 @@ files:
97
97
  - bin/full_lengther_next
98
98
  - History.txt
99
99
  - lib/full_lengther_next/classes/common_functions.rb
100
+ - lib/full_lengther_next/classes/chimeric_seqs.rb
100
101
  - lib/full_lengther_next/classes/fl_analysis.rb
101
102
  - lib/full_lengther_next/classes/fl_string_utils.rb
102
103
  - lib/full_lengther_next/classes/fln_stats.rb
@@ -142,7 +143,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
142
143
  requirements: []
143
144
 
144
145
  rubyforge_project: full_lengther_next
145
- rubygems_version: 1.7.2
146
+ rubygems_version: 1.8.24
146
147
  signing_key:
147
148
  specification_version: 3
148
149
  summary: FULL-LENGTHERNEXT is a tool adapted to NGS technologies, able to work in parallel and in a distributed way to minimise computing time