full_lengther_next 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,7 @@
1
+ === 0.0.6 2012-04-16
2
+
3
+ Fixed some cosmetic issues and parameter names
4
+
1
5
  === 0.0.5 2012-03-09
2
6
 
3
7
  Fix NCRNA annotation
@@ -26,7 +26,7 @@ optparse = OptionParser.new do |opts|
26
26
  end
27
27
 
28
28
  options[:user_db] = nil
29
- opts.on( '-d', '--blast_db DB_NAME', 'User blast plus database' ) do |db|
29
+ opts.on( '-u', '--user_db UserDB', 'User blast+ database' ) do |db|
30
30
  options[:user_db] = db
31
31
  end
32
32
 
@@ -46,7 +46,7 @@ optparse = OptionParser.new do |opts|
46
46
  end
47
47
 
48
48
  options[:distance] = 15
49
- opts.on( '-a', '--aas_distance DISTANCE', "distance threshold in aminoacids used for some calculations, the less distance the more strict. Default=15\n\n" ) do |distance|
49
+ opts.on( '-m', '--max_distance maxDIST', "maximal distance between query and subject gene boundaries to be qualified as putative, the less distance the more strict. Default=15\n\n" ) do |distance|
50
50
  options[:distance] = distance.to_i
51
51
  end
52
52
 
@@ -172,7 +172,7 @@ require 'my_worker_manager'
172
172
  $LOG = Logger.new(STDOUT)
173
173
  $LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
174
174
 
175
- custom_worker_file = File.join(ROOT_PATH,'classes','my_worker.rb')
175
+ custom_worker_file = File.join(File.dirname(ROOT_PATH),'lib','full_lengther_next','classes','my_worker.rb')
176
176
 
177
177
  $LOG.info 'Starting server'
178
178
  # initialize work manager (open files, etc)
@@ -1,13 +1,13 @@
1
1
  $:.unshift(File.dirname(__FILE__)) unless
2
2
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
3
 
4
- ROOT_PATH=File.join(File.dirname(__FILE__),'full_lengther_next')
4
+ root_path=File.join(File.dirname(__FILE__),'full_lengther_next')
5
5
 
6
- $: << File.expand_path(File.join(ROOT_PATH, 'classes'))
6
+ $: << File.expand_path(File.join(root_path, 'classes'))
7
7
 
8
8
 
9
9
  module FullLengtherNext
10
- VERSION = '0.0.5'
10
+ VERSION = '0.0.6'
11
11
 
12
12
  FULLLENGHTER_VERSION = VERSION
13
13
  end
@@ -247,7 +247,7 @@ module FlAnalysis
247
247
  if (seq.sec_desc.empty?)
248
248
  if (!q.hits[0].definition.nil?)
249
249
  warnings = "Coding sequence with some errors, #{warnings}"
250
- seq.sec_desc = "#{q.query_def}\t#{seq.seq_fasta.length}\t#{q.hits[0].acc}\t#{db_name}\tCoding Seq\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t#{q.hits[0].full_subject_length}\t#{warnings}\t\t\t\t\t\t#{q.hits[0].definition}\t"
250
+ seq.sec_desc = "#{q.query_def}\t#{seq.fasta_length}\t#{q.hits[0].acc}\t#{db_name}\tMisassembled\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t#{q.hits[0].full_subject_length}\t#{warnings}\t\t\t\t\t\t#{q.hits[0].definition}\t"
251
251
  seq.annotate(:tmp_annotation,[seq.sec_desc, '','',''],true)
252
252
  else
253
253
  seq.annotate(:apply_tcode,'')
@@ -264,7 +264,7 @@ module FlAnalysis
264
264
  if (seq.sec_desc.empty?)
265
265
  if (!q.hits[0].definition.nil?)
266
266
  warnings = "Coding sequence with some errors, #{warnings}"
267
- seq.sec_desc = "#{q.query_def}\t#{seq.seq_fasta.length}\t#{q.hits[0].acc}\t#{db_name}\tCoding Seq\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t#{q.hits[0].full_subject_length}\t#{warnings}\t\t\t\t\t\t#{q.hits[0].definition}\t"
267
+ seq.sec_desc = "#{q.query_def}\t#{seq.fasta_length}\t#{q.hits[0].acc}\t#{db_name}\tMisassembled\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t#{q.hits[0].full_subject_length}\t#{warnings}\t\t\t\t\t\t#{q.hits[0].definition}\t"
268
268
  end
269
269
  end
270
270
  end
@@ -529,7 +529,7 @@ module FlAnalysis
529
529
  tmp_prot = ">#{q.query_def}\n#{final_prot}"
530
530
  tmp_align = "#{q.query_def}\t#{final_hit.q_seq}\n#{final_hit.acc}#{spnum}\t#{final_hit.s_seq}\n\n"
531
531
  tmp_annot = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\t#{final_status}\t\t#{final_hit.e_val}\t#{final_hit.ident}\t#{final_prot.length}\t#{final_hit.full_subject_length}\t#{warnings}\t#{final_hit.q_frame}\t#{final_hit.q_beg.to_i + 1}\t#{final_hit.q_end.to_i + 1}\t#{final_hit.s_beg.to_i + 1}\t#{final_hit.s_end.to_i + 1}\t#{final_hit.definition}\t#{final_prot}"
532
- seq.sec_desc = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\tCoding Seq\t\t#{final_hit.e_val}\t#{final_hit.ident}\t\t#{final_hit.full_subject_length}\t#{warnings}\t\t\t\t\t\t#{final_hit.definition}\t"
532
+ seq.sec_desc = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\tMisassembled\t\t#{final_hit.e_val}\t#{final_hit.ident}\t\t#{final_hit.full_subject_length}\t#{warnings}\t\t\t\t\t\t#{final_hit.definition}\t"
533
533
  seq.annotate(:tmp_annotation,[tmp_annot, tmp_prot,tmp_align,[q, final_hit, final_prot, query_fasta, final_status]])
534
534
 
535
535
  # puts "\n\n\n-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.---#{q.query_def}\t#{final_status}\n#{tmp_prot}"
@@ -4,384 +4,485 @@ module FlnStats
4
4
  def summary_stats
5
5
  stats_file = File.open('fln_results/summary_stats.html', 'w')
6
6
 
7
- (html_head, html_1, html_2, html_3, html_4) = html_code
7
+ # recogemos los trozos de html fijos
8
+ (html_head, html_st, html_uni, html_db, html_as, html_end) = html_code
8
9
 
9
10
  total_seqs = 0
11
+ status_suma = 0
12
+ #recogemos los datos que necesitamos de los ficheros de resultados
13
+ (status_array, db_usage, seqs_number1, error_1_num, seq_uniq, complete_uniq, db_uni_500, db_uni_200, db_longest_one) = annotation_stats
14
+ (tcode_array, seqs_number2, unk_200, tc_uni_500, tc_uni_200, tc_longest_one) = testcode_stats
15
+ (ncrna_total, nc_uni_500, nc_uni_200, nc_longest_one)=ncrna_stats
10
16
 
11
- (status_array, seqs_number1, error_1_num, seq_uniq, complete_uniq, seq_length_stats, complete_seq_length_stats) = annotation_stats
12
- (tcode_array, seqs_number2, tcode_length_stats, coding_length_stats, unknown_length_stats) = testcode_stats
13
- ncrna_array=ncrna_stats
14
-
15
- total_seqs = seqs_number1 + seqs_number2 + ncrna_array[4].to_i
16
-
17
+ total_seqs = seqs_number1 + seqs_number2 + ncrna_total.to_i
18
+ uni_500 = (db_uni_500 + tc_uni_500 + nc_uni_500)
19
+ uni_200 = (db_uni_200 + tc_uni_200 + nc_uni_200)
20
+ longest_one = [db_longest_one, tc_longest_one, nc_longest_one].max
17
21
  stats_file.puts html_head
18
- stats_file.puts "\t\t\t\t"+'<font color="#FF0000">'+total_seqs.to_s+"</font> sequences in your input fasta\n\t\t\t</h2>\n\t\t</center>"
19
22
 
20
23
  if (total_seqs.to_i > 0)
21
- stats_file.puts html_1
22
- stats_file.puts ' <tr>
23
- <td align="center">YES</td>
24
- <td align="right">'+seqs_number1.to_s+'</td>
25
- <td align="right">'+'%.2f' % (100*seqs_number1.to_f/total_seqs.to_f).to_s+' %</td>
26
- <td align="right">'+seq_uniq.to_s+'</td>
27
- <td align="right">'+seq_length_stats[0].to_s+'</td>
28
- <td align="right">'+seq_length_stats[1].to_s+'</td>
29
- <td align="right">'+seq_length_stats[2].to_s+'</td>
30
- <td align="right">'+seq_length_stats[3].to_s+'</td>
31
- </tr>'
32
- stats_file.puts ' <tr>
33
- <td align="center">NO</td>
34
- <td align="right">'+seqs_number2.to_s+'</td>
35
- <td align="right">'+'%.2f' % (100*seqs_number2.to_f/total_seqs.to_f).to_s+' %</td>
36
- <td align="right">-</td>
37
- <td align="right">'+tcode_length_stats[0].to_s+'</td>
38
- <td align="right">'+tcode_length_stats[1].to_s+'</td>
39
- <td align="right">'+tcode_length_stats[2].to_s+'</td>
40
- <td align="right">'+tcode_length_stats[3].to_s+'</td>
41
- </tr>'
42
- stats_file.puts ' <tr>
43
- <td align="center">ncRNA</td>
44
- <td align="right">'+ncrna_array[4].to_s+'</td>
45
- <td align="right">'+'%.2f' % (100*ncrna_array[4].to_f/total_seqs.to_f).to_s+' %</td>
46
- <td align="right">-</td>
47
- <td align="right">'+ncrna_array[0].to_s+'</td>
48
- <td align="right">'+ncrna_array[1].to_s+'</td>
49
- <td align="right">'+ncrna_array[2].to_s+'</td>
50
- <td align="right">'+ncrna_array[3].to_s+'</td>
51
- </tr>
52
- </table>'
53
-
54
- stats_file.puts ' <p><font color="#FF0000">'+error_1_num.to_s+'</font> Sequences with sense and antisense hits error</p>'
55
- stats_file.puts ' <p><font color="#FF0000">'+complete_uniq.to_s+'</font> Complete sequences with different ortologue ID</p>'
56
- stats_file.puts html_2
24
+ # imprimimos la tabla Status Report --------------------------------------------------------------------------------------------
25
+ stats_file.puts html_st
57
26
  status_array.each do |status|
27
+ if (status[1] == 'Internal') || (status[1] == 'Misassembled')
58
28
  stats_file.puts ' <tr>
59
- <td align="right">'+status[4].to_s+'</td>
60
- <td align="right">'+status[0].to_s+'</td>
61
- <td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
62
- <td align="right">'+status[1].to_s+'</td>
63
- <td align="right">'+status[2].to_s+'</td>
64
- <td align="right">'+status[3].to_s+'</td>
65
- </tr>'
29
+ <td colspan="2" align="left">'+status[1].to_s+'</td>
30
+ <td align="right">'+status[0].to_s+'</td>
31
+ <td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
32
+ </tr>'
33
+ elsif (status[1] =~ /^Putative/)
34
+ stats_file.puts ' <tr>
35
+ <td align="left">Putative</td>
36
+ <td align="right">'+status[0].to_s+'</td>
37
+ <td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
38
+ </tr>'
39
+ else
40
+ stats_file.puts ' <tr>
41
+ <td rowspan="2" align="left">'+status[1].to_s+'</td>
42
+ <td align="left">Sure</td>
43
+ <td align="right">'+status[0].to_s+'</td>
44
+ <td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
45
+ </tr>'
46
+ end
47
+ status_suma += status[0]
66
48
  end
67
- stats_file.puts html_3
68
-
49
+ # añadimos los coding, P.coding
69
50
  tcode_array.each do |status|
51
+ if (status[1] == 'Coding')
70
52
  stats_file.puts ' <tr>
71
- <td align="right">'+status[5].to_s+'</td>
72
- <td align="right">'+status[4].to_s+'</td>
73
- <td align="right">'+'%.2f' % (100*status[4].to_f/total_seqs.to_f).to_s+' %</td>
74
- <td align="right">'+status[0].to_s+'</td>
75
- <td align="right">'+status[1].to_s+'</td>
76
- <td align="right">'+status[2].to_s+'</td>
77
- <td align="right">'+status[3].to_s+'</td>
78
- </tr>'
53
+ <td rowspan="2" align="left">'+status[1].to_s+'</td>
54
+ <td align="left">Sure</td>
55
+ <td align="right">'+status[0].to_s+'</td>
56
+ <td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
57
+ </tr>'
58
+ elsif (status[1] == 'Putative Coding')
59
+ stats_file.puts ' <tr>
60
+ <td align="left">Putative</td>
61
+ <td align="right">'+status[0].to_s+'</td>
62
+ <td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
63
+ </tr>'
64
+ end
65
+ status_suma += status[0]
79
66
  end
80
-
81
- # print Non coding RNA
67
+ # se ponen los ncRNA
82
68
  stats_file.puts ' <tr>
83
- <td align="right">Putative ncRNA</td>
84
- <td align="right">'+ncrna_array[4].to_s+'</td>
85
- <td align="right">'+'%.2f' % (100*ncrna_array[4].to_f/total_seqs.to_f).to_s+' %</td>
86
- <td align="right">'+ncrna_array[0].to_s+'</td>
87
- <td align="right">'+ncrna_array[1].to_s+'</td>
88
- <td align="right">'+ncrna_array[2].to_s+'</td>
89
- <td align="right">'+ncrna_array[3].to_s+'</td>
69
+ <td colspan="2" align="left">Putative ncRNA</td>
70
+ <td align="right">'+ncrna_total.to_s+'</td>
71
+ <td align="right">'+'%.2f' % (100*ncrna_total.to_f/total_seqs.to_f).to_s+' %</td>
72
+ </tr>'
73
+ status_suma += ncrna_total
74
+ # se ponen los unknown
75
+ tcode_array.each do |status|
76
+ if (status[1] =~ /Unknown/i)
77
+ stats_file.puts ' <tr>
78
+ <td colspan="2" align="left">'+status[1].to_s+'</td>
79
+ <td align="right">'+status[0].to_s+'</td>
80
+ <td align="right">'+'%.2f' % (100*status[0].to_f/total_seqs.to_f).to_s+' %</td>
81
+ </tr>'
82
+ end
83
+ end
84
+ #se añade el total
85
+ stats_file.puts ' <tr>
86
+ <td colspan="2" align="left">Total</td>
87
+ <td align="right">'+status_suma.to_s+'</td>
88
+ <td align="right">'+'%.2f' % (100*status_suma.to_f/total_seqs.to_f).to_s+' %</td>
90
89
  </tr>
91
- </table>
92
- </center>'
93
-
94
- end
95
- stats_file.puts html_4
96
-
97
- stats_file.close
98
- end
99
-
90
+ </table>'
91
+
92
+
93
+ # imprimimos la tabla Unigene Report --------------------------------------------------------------------------------------------
94
+ new_genes = tcode_array[0][0] + tcode_array[1][0]
95
+ total_uni = (seqs_number1 + new_genes + ncrna_total + tcode_array[2][0])
96
+ stats_file.puts html_uni
97
+ stats_file.puts ' <tr>
98
+ <td align="left">With orthologue in DBs</td>
99
+ <td align="right">'+seqs_number1.to_s+'</td>
100
+ <td align="right">'+'%.2f' % (100*seqs_number1.to_f/total_seqs.to_f).to_s+' %</td>
101
+ </tr>'
102
+ stats_file.puts ' <tr>
103
+ <td align="left">Putative New Genes</td>
104
+ <td align="right">'+new_genes.to_s+'</td>
105
+ <td align="right">'+'%.2f' % (100*new_genes.to_f/total_seqs.to_f).to_s+' %</td>
106
+ </tr>'
107
+ stats_file.puts ' <tr>
108
+ <td align="left">ncRNAs</td>
109
+ <td align="right">'+ncrna_total.to_s+'</td>
110
+ <td align="right">'+'%.2f' % (100*ncrna_total.to_f/total_seqs.to_f).to_s+' %</td>
111
+ </tr>'
112
+ stats_file.puts ' <tr>
113
+ <td align="left">Unknown</td>
114
+ <td align="right">'+tcode_array[2][0].to_s+'</td>
115
+ <td align="right">'+'%.2f' % (100*tcode_array[2][0].to_f/total_seqs.to_f).to_s+' %</td>
116
+ </tr>'
117
+ stats_file.puts ' <tr>
118
+ <td align="left">Total</td>
119
+ <td align="right">'+total_uni.to_s+'</td>
120
+ <td align="right">'+'%.2f' % (100*total_uni.to_f/total_seqs.to_f).to_s+' %</td>
121
+ </tr>
122
+ </table>'
100
123
 
101
- def html_code
102
- html_head = '<html>
103
- <head>
104
- <title>FLN Annotation Summary</title>
105
- </head>
106
-
107
- <body bgcolor="#FFFFFF">
108
- <center>
109
- <h1 ALIGN="center">
110
- Full-LengtherNEXT
111
- <br/>
112
- Annotation summary
113
- </h1>
114
- <h2 align="center">'
115
-
116
- html_1 = '
117
- <center>
118
- <table border=1>
119
- <tr>
120
- <th>Ortologue found</th>
121
- <th>Sequences found</th>
122
- <th>%</th>
123
- <th>Different IDs</th>
124
- <th>&gt;200 bp</th>
125
- <th>&lt;200 bp</th>
126
- <th>&gt;500 bp</th>
127
- <th>&lt;500 bp</th>
124
+ # imprimimos la tabla Database Usage --------------------------------------------------------------------------------------------
125
+ stats_file.puts html_db
126
+ db_names=["UserDB", "SwissProt", "TrEMBL"]
127
+ total_db = 0
128
+
129
+ for i in 0..db_usage.length-1 do i
130
+ total_db += db_usage[i]
131
+ stats_file.puts ' <tr>
132
+ <td align="left">'+db_names[i].to_s+'</td>
133
+ <td align="right">'+db_usage[i].to_s+'</td>
134
+ <td align="right">'+'%.2f' % (100*db_usage[i].to_f/total_seqs.to_f).to_s+' %</td>
128
135
  </tr>'
136
+ end
137
+ no_db = seqs_number2 + ncrna_total.to_i
138
+ stats_file.puts ' <tr>
139
+ <td align="left">None</td>
140
+ <td align="right">'+no_db.to_s+'</td>
141
+ <td align="right">'+'%.2f' % (100*no_db.to_f/total_seqs.to_f).to_s+' %</td>
142
+ </tr>'
143
+ total_db += no_db
144
+ stats_file.puts ' <tr>
145
+ <td align="left">Total</td>
146
+ <td align="right">'+total_db.to_s+'</td>
147
+ <td align="right">'+'%.2f' % (100*total_db.to_f/total_seqs.to_f).to_s+' %</td>
148
+ </tr>
149
+ </table>'
129
150
 
130
- html_2= ' <br/>
131
- <table border=1>
132
- <tr>
133
- <th>Status</th>
134
- <th>Total</th>
135
- <th>%</th>
136
- <th>UserDB</th>
137
- <th>SwissProt</th>
138
- <th>TrEMBL</th>
151
+ # imprimimos la tabla Report guiding assembly quality -------------------------------------------------------------
152
+ stats_file.puts html_as
153
+ stats_file.puts ' <tr>
154
+ <td align="left">Unigenes</td>
155
+ <td align="right">'+total_seqs.to_s+'</td>
156
+ <td align="right">'+'%.2f' % (100*total_seqs.to_f/total_seqs.to_f).to_s+' %</td>
139
157
  </tr>'
140
-
141
- html_3= ' </table>
142
- <br/>
143
- <table border=1>
144
- <tr>
145
- <th>Status</th>
146
- <th>Total</th>
147
- <th>%</th>
148
- <th>&gt;200 bp</th>
149
- <th>&lt;200 bp</th>
150
- <th>&gt;500 bp</th>
151
- <th>&lt;500 bp</th>
158
+ stats_file.puts ' <tr>
159
+ <td align="left">Unigenes >500pb</td>
160
+ <td align="right">'+uni_500.to_s+'</td>
161
+ <td align="right">'+'%.2f' % (100*uni_500.to_f/total_seqs.to_f).to_s+' %</td>
162
+ </tr>'
163
+ stats_file.puts ' <tr>
164
+ <td align="left">Unigenes >200pb</td>
165
+ <td align="right">'+uni_200.to_s+'</td>
166
+ <td align="right">'+'%.2f' % (100*uni_200.to_f/total_seqs.to_f).to_s+' %</td>
167
+ </tr>'
168
+ stats_file.puts ' <tr>
169
+ <td align="left">Longest unigene</td>
170
+ <td align="right">'+longest_one.to_s+'</td>
171
+ <td align="right">-</td>
172
+ </tr>'
173
+ stats_file.puts ' <tr>
174
+ <td align="left">With orthologue <sup>1</sup></td>
175
+ <td align="right">'+seqs_number1.to_s+'</td>
176
+ <td align="right">'+'%.2f' % (100*seqs_number1.to_f/total_seqs.to_f).to_s+' %</td>
177
+ </tr>'
178
+ stats_file.puts ' <tr>
179
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Different orthologue IDs</td>
180
+ <td align="right">'+seq_uniq.to_s+'</td>
181
+ <td align="right">'+'%.2f' % (100*seq_uniq.to_f/seqs_number1.to_f).to_s+' %</td>
152
182
  </tr>'
183
+ stats_file.puts ' <tr>
184
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Complete transcripts</td>
185
+ <td align="right">'+status_array[0][0].to_s+'</td>
186
+ <td align="right">'+'%.2f' % (100*status_array[0][0].to_f/seqs_number1.to_f).to_s+' %</td>
187
+ </tr>'
188
+ stats_file.puts ' <tr>
189
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Different complete transcripts</td>
190
+ <td align="right">'+complete_uniq.to_s+'</td>
191
+ <td align="right">'+'%.2f' % (100*complete_uniq.to_f/seqs_number1.to_f).to_s+' %</td>
192
+ </tr>'
193
+ stats_file.puts ' <tr>
194
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Misassembled</td>
195
+ <td align="right">'+error_1_num.to_s+'</td>
196
+ <td align="right">'+'%.2f' % (100*error_1_num.to_f/seqs_number1.to_f).to_s+' %</td>
197
+ </tr>'
198
+ stats_file.puts ' <tr>
199
+ <td align="left">Without orthologue <sup>1</sup></td>
200
+ <td align="right">'+no_db.to_s+'</td>
201
+ <td align="right">'+'%.2f' % (100*seqs_number2.to_f/total_seqs.to_f).to_s+' %</td>
202
+ </tr>'
203
+ stats_file.puts ' <tr>
204
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding</td>
205
+ <td align="right">'+tcode_array[0][0].to_s+'</td>
206
+ <td align="right">'+'%.2f' % (100*tcode_array[0][0].to_f/no_db.to_f).to_s+' %</td>
207
+ </tr>'
208
+ stats_file.puts ' <tr>
209
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding</td>
210
+ <td align="right">'+tcode_array[1][0].to_s+'</td>
211
+ <td align="right">'+'%.2f' % (100*tcode_array[1][0].to_f/no_db.to_f).to_s+' %</td>
212
+ </tr>'
213
+ stats_file.puts ' <tr>
214
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative ncRNA</td>
215
+ <td align="right">'+ncrna_total.to_s+'</td>
216
+ <td align="right">'+'%.2f' % (100*ncrna_total.to_f/no_db.to_f).to_s+' %</td>
217
+ </tr>'
218
+ stats_file.puts ' <tr>
219
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Unknown (all)</td>
220
+ <td align="right">'+tcode_array[2][0].to_s+'</td>
221
+ <td align="right">'+'%.2f' % (100*tcode_array[2][0].to_f/no_db.to_f).to_s+' %</td>
222
+ </tr>'
223
+ stats_file.puts ' <tr>
224
+ <td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Unknown < 200bp</td>
225
+ <td align="right">'+unk_200.to_s+'</td>
226
+ <td align="right">'+'%.2f' % (100*unk_200.to_f/no_db.to_f).to_s+' %</td>
227
+ </tr>
228
+ </table>
229
+ <sup>1</sup> Percents for subclassifications of this category were calculated using this line as 100% reference.'
153
230
 
154
- html_4 = ' </body>
155
- </html>'
231
+
156
232
 
157
- return [html_head, html_1, html_2, html_3, html_4]
158
233
 
234
+ end
235
+ stats_file.puts html_end
236
+
237
+ stats_file.close
159
238
  end
160
239
 
161
240
 
162
- def stats_my_db(db_name, array)
163
241
 
164
- if (db_name !~ /^sp_/) && (db_name !~ /^tr_/)
165
- array[1] += 1
166
- elsif (db_name =~ /^sp_/)
167
- array[2] += 1
168
- elsif (db_name =~ /^tr_/)
169
- array[3] += 1
242
+
243
+ def html_code
244
+ html_head = '<html>
245
+ <head>
246
+ <title>FLN Summary</title>
247
+ </head>
248
+
249
+ <body bgcolor="#FFFFFF">
250
+ <center>
251
+ <h1 align="center">
252
+ Full-LengtherNEXT Summary
253
+ </h1>'
254
+
255
+
256
+ html_1 = '
257
+ <h2 align="center">
258
+ Status report
259
+ </h2>
260
+
261
+ <table border="2" cellspacing="0" cellpadding="2">
262
+ <tr>
263
+ <th colspan="2">Status</th>
264
+ <th>Unigenes</th>
265
+ <th>%</th>
266
+ </tr>'
267
+
268
+ html_2= '
269
+ <h2 align="center">
270
+ Unigene report
271
+ </h2>
272
+
273
+ <table border="2" cellspacing="0" cellpadding="2">
274
+ <tr>
275
+ <th></th>
276
+ <th>Unigenes</th>
277
+ <th>%</th>
278
+ </tr>'
279
+
280
+ html_3= '
281
+ <h2 align="center">
282
+ Database usage
283
+ </h2>
284
+
285
+ <table border="2" cellspacing="0" cellpadding="2">
286
+ <tr>
287
+ <th></th>
288
+ <th>Unigenes</th>
289
+ <th>%</th>
290
+ </tr>'
291
+
292
+ html_4= '
293
+ <h2 align="center">
294
+ Report guiding assembly quality
295
+ </h2>
296
+
297
+ <table border="2" cellspacing="0" cellpadding="2">
298
+ <tr>
299
+ <th></th>
300
+ <th>Unigenes</th>
301
+ <th>%</th>
302
+ </tr>'
303
+
304
+ html_5 = ' </body>
305
+ </html>'
306
+
307
+ return [html_head, html_1, html_2, html_3, html_4, html_5]
308
+
170
309
  end
171
310
 
172
- return array
173
- end
174
311
 
312
+ def annotation_stats
175
313
 
176
- def annotation_stats
314
+ seqs_number = 0
315
+ array_of_all_accs = []
316
+ array_of_complete_accs = []
317
+ error_1_num = 0
318
+ uni_500 = 0
319
+ uni_200 = 0
320
+ longest_one = 0
177
321
 
178
- seqs_number = 0
179
- array_of_all_accs = []
180
- array_of_complete_accs = []
181
- error_1_num = 0
182
-
183
- # >200, <200, >500, <500
184
- seq_length_stats = [0,0,0,0]
185
-
186
- # >200, <200, >500, <500
187
- complete_seq_length_stats = [0,0,0,0]
188
-
189
- status_array = []
190
- # total, userdb, swissprotdb, trembl, status
191
- complete = [0,0,0,0,'Complete']
192
- putative_complete = [0,0,0,0,'Putative Complete']
193
- c_terminus = [0,0,0,0,'C-terminus']
194
- putative_c_terminus = [0,0,0,0,'Putative C-terminus']
195
- n_terminus = [0,0,0,0,'N-terminus']
196
- putative_n_terminus = [0,0,0,0,'Putative N-terminus']
197
- internal = [0,0,0,0,'Internal']
198
- cod_seq = [0,0,0,0,'Misassembled']
199
-
200
-
201
- File.open('fln_results/annotations.txt').each do |line|
202
- line.chomp!
203
- (name,fasta_length,acc,db_name,status,kk1,kk2,kk3,kk4,kk5,msgs) = line.split("\t")
204
-
205
- if (line !~ /^Query_id\t/) && (!line.empty?)
206
- seqs_number += 1
207
- array_of_all_accs.push acc
208
- # -------------------------------------------------------------------------
209
- if (fasta_length.to_i >= 200)
210
- seq_length_stats[0] += 1
211
- # seqs_longer_200 += 1
212
- else
213
- seq_length_stats[1] += 1
214
- # seqs_shorter_200 += 1
215
- end
216
- if (fasta_length.to_i >= 500)
217
- seq_length_stats[2] += 1
218
- # seqs_longer_500 += 1
219
- else
220
- seq_length_stats[3] += 1
221
- # seqs_shorter_500 += 1
222
- end
223
- # -------------------------------------------------------------------------
224
- if (msgs =~ /ERROR#1/)
225
- error_1_num += 1
226
- end
227
- # -------------------------------------------------------------------------
228
- if (status == 'Complete')
229
- complete[0] += 1
230
- array_of_complete_accs.push acc
231
- complete = stats_my_db(db_name, complete)
232
-
322
+ status_array = []
323
+ # total, status
324
+ complete = [0,'Complete']
325
+ putative_complete = [0,'Putative Complete']
326
+ c_terminus = [0,'C-terminus']
327
+ putative_c_terminus = [0,'Putative C-terminus']
328
+ n_terminus = [0,'N-terminus']
329
+ putative_n_terminus = [0,'Putative N-terminus']
330
+ internal = [0,'Internal']
331
+ cod_seq = [0,'Misassembled']
332
+
333
+ #userdb, SwissProt, TrEMBL
334
+ db_usage = [0,0,0]
335
+
336
+ File.open('fln_results/dbannotated.txt').each do |line|
337
+ line.chomp!
338
+ (name,fasta_length,acc,db_name,status,kk1,kk2,kk3,kk4,kk5,msgs) = line.split("\t")
339
+
340
+ if (line !~ /^Query_id\t/) && (!line.empty?)
341
+ seqs_number += 1
342
+ if (fasta_length.to_i > longest_one)
343
+ longest_one = fasta_length.to_i
344
+ end
345
+ array_of_all_accs.push acc
346
+
347
+ if (db_name !~ /^sp_/) && (db_name !~ /^tr_/)
348
+ db_usage[0] += 1
349
+ elsif (db_name =~ /^sp_/)
350
+ db_usage[1] += 1
351
+ elsif (db_name =~ /^tr_/)
352
+ db_usage[2] += 1
353
+ end
354
+
355
+ # -------------------------------------------------------------------------
233
356
  if (fasta_length.to_i >= 200)
234
- complete_seq_length_stats[0] += 1
235
- # complete_longer_200 += 1
236
- else
237
- complete_seq_length_stats[1] += 1
238
- # complete_shorter_200 += 1
357
+ uni_200 += 1
239
358
  end
240
-
241
359
  if (fasta_length.to_i >= 500)
242
- complete_seq_length_stats[2] += 1
243
- # complete_longer_500 += 1
244
- else
245
- complete_seq_length_stats[3] += 1
246
- # complete_shorter_500 += 1
360
+ uni_500 += 1
361
+ end
362
+ # -------------------------------------------------------------------------
363
+ if (msgs =~ /ERROR#1/)
364
+ error_1_num += 1
247
365
  end
248
-
249
- elsif (status == 'Putative Complete')
250
- putative_complete[0] += 1
251
- putative_complete = stats_my_db(db_name, putative_complete)
252
- elsif (status == 'C-terminus')
253
- c_terminus[0] += 1
254
- c_terminus = stats_my_db(db_name, c_terminus)
255
- elsif (status == 'N-terminus')
256
- n_terminus[0] += 1
257
- n_terminus = stats_my_db(db_name, n_terminus)
258
- elsif (status == 'Putative C-terminus')
259
- putative_c_terminus[0] += 1
260
- putative_c_terminus = stats_my_db(db_name, putative_c_terminus)
261
- elsif (status == 'Putative N-terminus')
262
- putative_n_terminus[0] += 1
263
- putative_n_terminus = stats_my_db(db_name, putative_n_terminus)
264
- elsif (status == 'Internal')
265
- internal[0] += 1
266
- internal = stats_my_db(db_name, internal)
267
- elsif (status == 'Coding Seq')
268
- cod_seq[0] += 1
269
- cod_seq = stats_my_db(db_name, cod_seq)
366
+ # -------------------------------------------------------------------------
367
+ if (status == 'Complete')
368
+ complete[0] += 1
369
+ array_of_complete_accs.push acc
370
+
371
+ elsif (status == 'Putative Complete')
372
+ putative_complete[0] += 1
373
+ elsif (status == 'C-terminus')
374
+ c_terminus[0] += 1
375
+ elsif (status == 'N-terminus')
376
+ n_terminus[0] += 1
377
+ elsif (status == 'Putative C-terminus')
378
+ putative_c_terminus[0] += 1
379
+ elsif (status == 'Putative N-terminus')
380
+ putative_n_terminus[0] += 1
381
+ elsif (status == 'Internal')
382
+ internal[0] += 1
383
+ elsif (status == 'Misassembled')
384
+ cod_seq[0] += 1
385
+ end
386
+ # -------------------------------------------------------------------------
270
387
  end
271
- # -------------------------------------------------------------------------
388
+
272
389
  end
273
390
 
391
+ status_array = [complete, putative_complete, c_terminus, putative_c_terminus, n_terminus, putative_n_terminus, internal, cod_seq]
392
+
393
+ return [status_array, db_usage, seqs_number, error_1_num, array_of_all_accs.uniq.count, array_of_complete_accs.uniq.count, uni_500, uni_200, longest_one]
274
394
  end
275
-
276
- status_array = [complete, putative_complete, c_terminus, putative_c_terminus, n_terminus, putative_n_terminus, internal, cod_seq]
277
-
278
- return [status_array, seqs_number, error_1_num, array_of_all_accs.uniq.count, array_of_complete_accs.uniq.count, seq_length_stats, complete_seq_length_stats]
279
- end
280
395
 
281
396
 
282
- def testcode_stats
397
+ def testcode_stats
283
398
 
284
- seqs_number = 0
285
-
286
- # >200, <200, >500, <500
287
- all_tcode_stats = [0,0,0,0]
399
+ seqs_number = 0
400
+ unk_200 = 0
401
+ uni_500 = 0
402
+ uni_200 = 0
403
+ longest_one = 0
288
404
 
289
- # >200, <200, >500, <500, total, status
290
- coding_length_stats = [0,0,0,0,0,'Coding']
291
- p_coding_length_stats = [0,0,0,0,0,'Putative Coding']
292
- unknown_length_stats = [0,0,0,0,0,'Unknown']
405
+ # total, status
406
+ coding_stats = [0,'Coding']
407
+ p_coding_stats = [0,'Putative Coding']
408
+ unknown_stats = [0,'Unknown']
293
409
 
294
- File.open('fln_results/tcode_result.txt').each do |line|
295
- line.chomp!
296
- (name,fasta_length,acc,db_name,status) = line.split("\t")
410
+ File.open('fln_results/new_coding.txt').each do |line|
411
+ line.chomp!
412
+ (name,fasta_length,acc,db_name,status) = line.split("\t")
297
413
 
298
- if (line !~ /^Query_id\t/) && (!line.empty?)
299
- seqs_number += 1
300
-
301
- if (fasta_length.to_i >= 200)
302
- all_tcode_stats[0] += 1
303
-
304
- if (status == 'coding')
305
- coding_length_stats[4] += 1
306
- coding_length_stats[0] += 1
307
- elsif (status == 'putative_coding')
308
- p_coding_length_stats[4] += 1
309
- p_coding_length_stats[0] += 1
310
- elsif (status == 'unknown')
311
- unknown_length_stats[4] += 1
312
- unknown_length_stats[0] += 1
414
+ if (line !~ /^Query_id\t/) && (!line.empty?)
415
+ seqs_number += 1
416
+
417
+ if (fasta_length.to_i > longest_one)
418
+ longest_one = fasta_length.to_i
313
419
  end
314
- else
315
- all_tcode_stats[1] += 1
316
-
317
- if (status == 'coding')
318
- coding_length_stats[4] += 1
319
- coding_length_stats[1] += 1
320
- elsif (status == 'putative_coding')
321
- p_coding_length_stats[4] += 1
322
- p_coding_length_stats[1] += 1
323
- elsif (status == 'unknown')
324
- unknown_length_stats[4] += 1
325
- unknown_length_stats[1] += 1
420
+
421
+ # -------------------------------------------------------------------------
422
+ if (fasta_length.to_i >= 200)
423
+ uni_200 += 1
326
424
  end
327
- end
328
- if (fasta_length.to_i >= 500)
329
- all_tcode_stats[2] += 1
330
-
331
- if (status == 'coding')
332
- coding_length_stats[2] += 1
333
- elsif (status == 'putative_coding')
334
- p_coding_length_stats[2] += 1
335
- elsif (status == 'unknown')
336
- unknown_length_stats[2] += 1
425
+ if (fasta_length.to_i >= 500)
426
+ uni_500 += 1
337
427
  end
338
- else
339
- all_tcode_stats[3] += 1
340
-
428
+ # -------------------------------------------------------------------------
429
+
430
+ if (fasta_length.to_i < 200)
431
+ if (status == 'unknown')
432
+ unk_200 += 1
433
+ end
434
+ end
435
+
341
436
  if (status == 'coding')
342
- coding_length_stats[3] += 1
437
+ coding_stats[0] += 1
343
438
  elsif (status == 'putative_coding')
344
- p_coding_length_stats[3] += 1
439
+ p_coding_stats[0] += 1
345
440
  elsif (status == 'unknown')
346
- unknown_length_stats[3] += 1
441
+ unknown_stats[0] += 1
347
442
  end
443
+
348
444
  end
349
-
445
+
350
446
  end
351
447
 
448
+ status_array = [coding_stats, p_coding_stats, unknown_stats]
449
+
450
+ return [status_array, seqs_number, unk_200, uni_500, uni_200, longest_one]
352
451
  end
353
-
354
- status_array = [coding_length_stats, p_coding_length_stats, unknown_length_stats]
355
452
 
356
- return [status_array, seqs_number, all_tcode_stats, coding_length_stats, unknown_length_stats]
357
- end
358
453
 
359
- def ncrna_stats
454
+ def ncrna_stats
360
455
 
361
- # >200, <200, >500, <500, total
362
- ncrna_array = [0,0,0,0,0]
456
+ uni_500 = 0
457
+ uni_200 = 0
458
+ nc_total = 0
459
+ longest_one = 0
363
460
 
364
- File.open('fln_results/nc_rna.txt').each do |line|
365
- line.chomp!
366
- (name,fasta_length,acc,db_name,status) = line.split("\t")
461
+ File.open('fln_results/nc_rnas.txt').each do |line|
462
+ line.chomp!
463
+ (name,fasta_length,acc,db_name,status) = line.split("\t")
464
+
465
+ if (status == 'Putative ncRNA')
466
+
467
+ if (fasta_length.to_i > longest_one)
468
+ longest_one = fasta_length.to_i
469
+ end
470
+ # -------------------------------------------------------------------------
471
+ if (fasta_length.to_i >= 200)
472
+ uni_200 += 1
473
+ end
474
+ if (fasta_length.to_i >= 500)
475
+ uni_500 += 1
476
+ end
477
+ # -------------------------------------------------------------------------
478
+
479
+ nc_total += 1
367
480
 
368
- if (status == 'Putative ncRNA')
369
- ncrna_array[4] += 1
370
-
371
- if (fasta_length.to_i >= 200)
372
- ncrna_array[0] += 1
373
- else
374
- ncrna_array[1] += 1
375
- end
376
- if (fasta_length.to_i >= 500)
377
- ncrna_array[2] += 1
378
- else
379
- ncrna_array[3] += 1
380
481
  end
381
482
  end
483
+
484
+ return [nc_total, uni_500, uni_200, longest_one]
382
485
  end
383
-
384
- return ncrna_array
385
- end
386
-
486
+
487
+
387
488
  end
@@ -22,16 +22,16 @@ class MyWorkerManager < ScbiMapreduce::WorkManager
22
22
  @@chunk_size=chunk_size
23
23
  @@options = options
24
24
 
25
- @@annotation_file = File.open("fln_results/annotations.txt", 'w')
25
+ @@annotation_file = File.open("fln_results/dbannotated.txt", 'w')
26
26
  @@annotation_file.puts file_head
27
27
 
28
28
  @@alignment_file = File.open("fln_results/alignments.txt", 'w')
29
29
  @@prot_file = File.open("fln_results/proteins.fasta", 'w')
30
30
  @@nts_file = File.open("fln_results/nt_seq.txt", 'w')
31
- @@tcode_file=File.open("fln_results/tcode_result.txt", 'w')
31
+ @@tcode_file=File.open("fln_results/new_coding.txt", 'w')
32
32
  @@tcode_file.puts file_head
33
33
 
34
- @@nc_rna_file = File.open("fln_results/nc_rna.txt", 'w')
34
+ @@nc_rna_file = File.open("fln_results/nc_rnas.txt", 'w')
35
35
  @@nc_rna_file.puts file_head
36
36
 
37
37
  # @@error_fasta_file = File.open("fln_results/error_seqs.fasta", 'w')
@@ -11,7 +11,7 @@ module NcRna
11
11
  q=blast_query
12
12
 
13
13
  if (!q.hits[0].nil?) # There is match in blast.
14
- nc_annotations = "#{q.query_def}\t#{seq.seq_fasta.length}\t#{q.hits[0].acc}\tncRNA\tPutative ncRNA\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t\t\t#{q.hits[0].q_frame}\t#{q.hits[0].q_beg}\t#{q.hits[0].q_end}\t#{q.hits[0].s_beg.to_i}\t#{q.hits[0].s_end.to_i}\t#{q.hits[0].definition}\t"
14
+ nc_annotations = "#{q.query_def}\t#{seq.fasta_length}\t#{q.hits[0].acc}\tncRNA\tPutative ncRNA\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t\t\t#{q.hits[0].q_frame}\t#{q.hits[0].q_beg}\t#{q.hits[0].q_end}\t#{q.hits[0].s_beg.to_i}\t#{q.hits[0].s_end.to_i}\t#{q.hits[0].definition}\t"
15
15
  seq.annotate(:ncrna,nc_annotations,true)
16
16
  else
17
17
  unknown_annot = seq.get_annotations(:tcode_unknown).first
@@ -3,11 +3,13 @@ require 'orf'
3
3
 
4
4
  class Sequence
5
5
 
6
- attr_accessor :seq_name,:seq_fasta,:seq_qual,:orfs,:sec_desc
6
+ attr_accessor :seq_name,:seq_fasta,:seq_qual,:orfs,:sec_desc,:fasta_length
7
7
 
8
8
  def initialize(seq_name,seq_fasta,seq_qual='')
9
+ fasta_ori = seq_fasta.dup
9
10
  @seq_name=seq_name
10
11
  @seq_fasta = seq_fasta
12
+ @fasta_length = fasta_ori.length
11
13
  change_degenerated_nt!
12
14
  @seq_qual = ''
13
15
  @sec_desc = ''
@@ -18,7 +18,7 @@ class TestCode
18
18
  protein = ''
19
19
  p_long = 0
20
20
 
21
- if (seq.seq_fasta.length < 200)
21
+ if (seq.fasta_length < 200)
22
22
  ref_name = seq.seq_name
23
23
  ref_code = 0.0
24
24
  ref_frame = 0
@@ -26,7 +26,7 @@ class TestCode
26
26
  ref_orf = ''
27
27
  ref_msgs = 'Sequence length < 200 nt'
28
28
 
29
- seq.annotate(:tcode_unknown,"#{ref_name}\t#{seq.seq_fasta.length}\t\ttestcode\t#{ref_status}\t#{ref_code}\t\t\t\t\t#{ref_msgs}\t#{ref_frame}\t#{ref_start}\t#{ref_end}\t\t\t\t",true)
29
+ seq.annotate(:tcode_unknown,"#{ref_name}\t#{seq.fasta_length}\t\ttestcode\t#{ref_status}\t#{ref_code}\t\t\t\t\t#{ref_msgs}\t#{ref_frame}\t#{ref_start}\t#{ref_end}\t\t\t\t",true)
30
30
  # seq.annotate(:tcode,"#{ref_name}\t#{seq.seq_fasta.length}\t\ttestcode\t#{ref_status}\t#{ref_code}\t\t\t\t\t#{ref_msgs}\t#{ref_frame}\t#{ref_start}\t#{ref_end}\t\t\t\t",true)
31
31
  else
32
32
 
@@ -45,9 +45,9 @@ class TestCode
45
45
  # see add_region filter
46
46
  (name,t_code,status,ref_start,ref_end,ref_frame,orf,ref_msgs,stop_before_start,more_than_one_frame) = t_code(seq)
47
47
  if (status == :unknown)
48
- seq.annotate(:tcode_unknown,"#{name}\t#{seq.seq_fasta.length}\t\ttestcode\t#{status}\t#{t_code}\t\t\t\t\t#{ref_msgs}\t#{ref_frame}\t#{ref_start}\t#{ref_end}\t\t\t\t",true)
48
+ seq.annotate(:tcode_unknown,"#{name}\t#{seq.fasta_length}\t\ttestcode\t#{status}\t#{t_code}\t\t\t\t\t#{ref_msgs}\t#{ref_frame}\t#{ref_start}\t#{ref_end}\t\t\t\t",true)
49
49
  else
50
- seq.annotate(:tcode,"#{name}\t#{seq.seq_fasta.length}\t\ttestcode\t#{status}\t#{t_code}\t\t\t\t\t#{ref_msgs}\t#{ref_frame}\t#{ref_start}\t#{ref_end}\t\t\t\t",true)
50
+ seq.annotate(:tcode,"#{name}\t#{seq.fasta_length}\t\ttestcode\t#{status}\t#{t_code}\t\t\t\t\t#{ref_msgs}\t#{ref_frame}\t#{ref_start}\t#{ref_end}\t\t\t\t",true)
51
51
  end
52
52
 
53
53
  # if (ref_msgs.nil?)
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: full_lengther_next
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.5
5
+ version: 0.0.6
6
6
  platform: ruby
7
7
  authors:
8
8
  - Noe Fernandez & Dario Guerrero
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2012-03-09 00:00:00 Z
13
+ date: 2012-04-16 00:00:00 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: xml-simple