full_lengther_next 0.6.2 → 0.9.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +2 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/{README.rdoc → README.md} +0 -0
- data/Rakefile +6 -37
- data/bin/console +14 -0
- data/bin/download_fln_dbs.rb +2 -7
- data/bin/full_lengther_next +85 -6
- data/bin/make_user_db.rb +13 -5
- data/bin/setup +8 -0
- data/full_lengther_next.gemspec +42 -0
- data/lib/full_lengther_next.rb +2 -10
- data/lib/full_lengther_next/artifacts.rb +74 -0
- data/lib/full_lengther_next/{classes/blast_functions.rb → blast_functions.rb} +0 -0
- data/lib/full_lengther_next/{classes/cdhit.rb → cdhit.rb} +0 -0
- data/lib/full_lengther_next/{classes/chimeric_seqs.rb → chimeric_seqs.rb} +0 -0
- data/lib/full_lengther_next/{classes/common_functions.rb → common_functions.rb} +0 -0
- data/lib/full_lengther_next/{classes/exonerate_result.rb → exonerate_result.rb} +0 -0
- data/lib/full_lengther_next/{classes/fl_analysis.rb → fl_analysis.rb} +0 -0
- data/lib/full_lengther_next/{classes/fl_string_utils.rb → fl_string_utils.rb} +0 -0
- data/lib/full_lengther_next/fln_stats.rb +613 -0
- data/lib/full_lengther_next/go_methods.rb +42 -0
- data/lib/full_lengther_next/{classes/handle_db.rb → handle_db.rb} +0 -0
- data/lib/full_lengther_next/mapping.rb +296 -0
- data/lib/full_lengther_next/{classes/my_worker.rb → my_worker.rb} +71 -9
- data/lib/full_lengther_next/{classes/my_worker_EST.rb → my_worker_EST.rb} +0 -0
- data/lib/full_lengther_next/{classes/my_worker_manager_EST.rb → my_worker_manager_EST.rb} +0 -0
- data/lib/full_lengther_next/{classes/my_worker_manager_fln.rb → my_worker_manager_fln.rb} +181 -16
- data/lib/full_lengther_next/{classes/nc_rna.rb → nc_rna.rb} +0 -0
- data/lib/full_lengther_next/{classes/orf.rb → orf.rb} +0 -0
- data/lib/full_lengther_next/{classes/reptrans.rb → reptrans.rb} +9 -5
- data/lib/full_lengther_next/{classes/sequence.rb → sequence.rb} +26 -1
- data/lib/full_lengther_next/{classes/test_code.rb → test_code.rb} +1 -1
- data/lib/full_lengther_next/{classes/types.rb → types.rb} +3 -2
- data/lib/full_lengther_next/{classes/une_los_hit.rb → une_los_hit.rb} +0 -0
- data/lib/full_lengther_next/version.rb +3 -0
- data/lib/full_lengther_next/{classes/warnings.rb → warnings.rb} +0 -0
- data/report_templates/general_summary.erb +140 -0
- data/report_templates/mapping_summary.erb +98 -0
- data/report_templates/reptrans_summary.erb +32 -0
- metadata +112 -134
- data/.gemtest +0 -0
- data/History.txt +0 -32
- data/Manifest.txt +0 -44
- data/PostInstall.txt +0 -6
- data/bin/plot_fln.rb +0 -270
- data/bin/plot_taxonomy.rb +0 -70
- data/lib/expresscanvas.zip +0 -0
- data/lib/full_lengther_next/classes/artifacts.rb +0 -66
- data/lib/full_lengther_next/classes/fln_stats.rb +0 -641
- data/script/console +0 -10
- data/script/destroy +0 -14
- data/script/generate +0 -14
- data/test/test_full_lengther_next.rb +0 -11
- data/test/test_helper.rb +0 -3
data/lib/full_lengther_next.rb
CHANGED
@@ -1,13 +1,5 @@
|
|
1
|
-
|
2
|
-
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
3
|
-
|
4
|
-
# ROOT_PATH=File.join(File.dirname(__FILE__),'full_lengther_next')
|
5
|
-
|
6
|
-
$: << File.expand_path(File.join(File.dirname(__FILE__), 'full_lengther_next', 'classes'))
|
7
|
-
|
1
|
+
require "full_lengther_next/version"
|
8
2
|
|
9
3
|
module FullLengtherNext
|
10
|
-
|
11
|
-
|
12
|
-
FULL_LENGHTER_VERSION = VERSION
|
4
|
+
# Your code goes here...
|
13
5
|
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'blast_functions'
|
2
|
+
require 'types'
|
3
|
+
require 'chimeric_seqs'
|
4
|
+
include ChimericSeqs
|
5
|
+
|
6
|
+
#####################################################################
|
7
|
+
## MAIN FUNCTION
|
8
|
+
#####################################################################
|
9
|
+
# Decides whether +seq+ must be treated as an assembly artifact and, if so,
# flags it so that it is excluded from the output.
#
# seq      - sequence object being analysed; its hit, type, db_name,
#            save_fasta and ignore attributes are written here.
# query    - BLAST result for the sequence, or nil when there is none.
# db_name  - name of the database the query was run against.
# db_path  - path of that database (only forwarded to search_chimeras).
# options  - options hash; options[:chimera] containing 'd' disables the
#            chimera search.
# new_seqs - array that receives the sequences produced by search_chimeras.
#
# Returns true when the sequence was classified as an artifact, false
# otherwise.
def artifact?(seq, query, db_name, db_path, options, new_seqs)
  # The TRUE/FALSE constants are deprecated and absent from Ruby 3, so the
  # true/false literals are used instead.
  artifact = false

  # UNMAPPED CONTIG DETECTION
  if query.nil? && seq.unmapped?
    seq.hit = nil
    seq.type = UNMAPPED
    artifact = true
  end

  unless query.nil?
    # MISASSEMBLED DETECTION
    # If the sequence is misassembled, the chimera analysis is skipped.
    if !artifact && misassembled_detection(query)
      seq.hit = query.hits.first
      seq.type = MISASSEMBLED
      seq.warnings('ERROR#1')
      artifact = true
    end

    # OVERLAPPING HSPS ON SUBJECT DETECTION (disabled in the original code,
    # kept here for reference)
    #
    # if !artifact
    #   hit_reference = query.hits.first.dup
    #   query, overlapping = overlapping_hsps_on_subject(query)
    #   if overlapping
    #     if query.hits.first.nil?
    #       seq.hit = hit_reference
    #     else
    #       seq.hit = query.hits.first
    #     end
    #     artifact = TRUE
    #     seq.type = OTHER
    #     seq.warnings('ERROR#2')
    #   end
    # end

    # MULTIPLE HSP DETECTION
    # Only records a warning; the sequence is not marked as an artifact.
    if !artifact && multiple_hsps(query, 3)
      seq.hit = query.hits.first
      seq.warnings('ERROR#3')
    end

    # CHIMERA DETECTION
    if !artifact && !options[:chimera].include?('d')
      chimera = search_chimeras(seq, query, options, db_name, db_path)
      unless chimera.nil?
        new_seqs.concat(chimera)
        seq.db_name = db_name
        seq.type = CHIMERA
        artifact = true
      end
    end
  end

  if artifact
    # $verbose may be unset when this is used outside the main executable.
    puts seq.prot_annot_calification if !$verbose.nil? && $verbose > 1
    seq.db_name = db_name
    seq.save_fasta = false
    seq.ignore = true
  end

  artifact
end
|
72
|
+
|
73
|
+
|
74
|
+
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
@@ -0,0 +1,613 @@
|
|
1
|
+
require 'report_html'
|
2
|
+
require 'types.rb'
|
3
|
+
require 'go_methods'
|
4
|
+
|
5
|
+
module FlnStats
|
6
|
+
REPORT_FOLDER = File.expand_path(File.join(File.dirname(__FILE__), '..', '..', 'report_templates'))
|
7
|
+
# Builds a fresh statistics hash with every counter set to 0.
#
# The key order is significant: the hash is dumped in iteration order when
# the text summary is written.
#
# Returns a new Hash mapping each counter name (String) to 0.
def initialize_stats_hash
  counter_names = %w[
    input_seqs
    output_seqs
    failed
    full_transcriptome_length
    PRE_FLN_full_transcriptome_length
    mean_length
    PRE_FLN_mean_length
    indeterminations
    PRE_FLN_indeterminations
    gap_number
    PRE_FLN_gap_number
    indetermination_mean_length
    PRE_FLN_indetermination_mean_length
    sequences_>200
    sequences_>500
    PRE_FLN_sequences_>500
    longest_unigene
    n50
    PRE_FLN_n50
    n90
    PRE_FLN_n90
    good_seqs
    artifacts
    misassembled
    chimeras
    unmapped
    other_artifacts
    unknown
    unknown_>200
    unknown_>500
    prot_annotated
    complete
    complete_sure
    complete_putative
    n_terminal
    n_terminal_sure
    n_terminal_putative
    c_terminal
    c_terminal_sure
    c_terminal_putative
    internal
    swissprot
    trembl
    userdb
    ncrna
    coding
    coding_sure
    coding_putative
    coding_>200
    coding_>500
    different_orthologues
    different_completes
    BA_index
  ]
  Hash[counter_names.map { |counter| [counter, 0] }]
end
|
66
|
+
|
67
|
+
# Extracts the organism (first two words of its name) from a hit
# description and increments its counter in +taxonomy+.
#
# name     - hit description String. It is never modified.
# taxonomy - Hash of organism name => number of annotations; updated in
#            place.
#
# Three description layouts are recognised:
#   * "... OS=<organism> GN=..."
#   * descriptions starting with "sp=" or "tr=" that contain
#     "<Genus species> (<common name>)"
#   * anything else: the text after the first ';' up to the first '.'
#
# Returns the updated count for the organism, or nil when no organism could
# be extracted.
def get_taxonomy(name, taxonomy)
  organism = nil
  if name.include?('OS=')
    organism = name.split('OS=', 2).last.to_s.split(' GN=').first.to_s.strip
  elsif name.start_with?('sp=', 'tr=')
    # The original condition used '=' (assignment) instead of '==': it
    # overwrote the first three characters of the caller's string and was
    # always true, so the generic branch below could never run.
    found = name.match(/(\w+ \w+) \(([\w ]+)\) \(([\w ]+)\)/) ||
            name.match(/(\w+ \w+) \(([\w ]+)\)/)
    organism = found[1] unless found.nil?
  else
    organism = name.split(';', 2).last.to_s.split('.', 2).first.to_s
    organism = organism.gsub(/\(\D+\)/, '')
    if organism.split(' ').length > 1
      # NOTE(review): as written, the last gsub removes every space, which
      # collapses a multi-word name into one token. It may originally have
      # targeted double spaces -- confirm against the upstream source.
      organism = organism.gsub('.', '').gsub(/^ /, '').gsub(' ', '').strip
    end
  end
  return nil if organism.nil?

  organism = organism.split(' ')[0..1].join(' ')
  return nil if organism.empty? # nothing usable was extracted

  taxonomy[organism] = (taxonomy[organism] || 0) + 1
end
|
102
|
+
|
103
|
+
# Builds a fresh statistics hash for the representative-transcriptome
# summary, with every counter set to 0.
#
# Returns a new Hash mapping each counter name (String) to 0, in report
# order.
def initialize_stats_hash_reptrans
  counter_names = %w[
    prot_annotated
    est_annotated
    coding_>1
    coding_>0.94
    coding_>0.84
    coding_>0.73
    coding_>0
  ]
  Hash[counter_names.map { |counter| [counter, 0] }]
end
|
115
|
+
|
116
|
+
# Extract sequence stats
|
117
|
+
##################################################
|
118
|
+
# Accumulates the pre-analysis ("PRE_FLN") statistics of one input sequence.
#
# seq        - object responding to #seq_fasta (nucleotide String).
# stats_hash - statistics Hash; the PRE_FLN_* counters and 'input_seqs' are
#              updated in place.
#
# Returns the updated 'PRE_FLN_gap_number' counter.
def sequence_stats(seq, stats_hash)
  nt_seq = seq.seq_fasta
  stats_hash['input_seqs'] += 1
  # Strictly greater than 500, matching both the key name and the
  # post-analysis 'sequences_>500' counter (the original used >= here, so a
  # 500 bp sequence was counted before the analysis but not after it).
  stats_hash['PRE_FLN_sequences_>500'] += 1 if nt_seq.length > 500
  stats_hash['PRE_FLN_full_transcriptome_length'] += nt_seq.length
  stats_hash['PRE_FLN_indeterminations'] += nt_seq.count('nN')
  # Each uninterrupted run of n/N counts as one gap.
  stats_hash['PRE_FLN_gap_number'] += nt_seq.scan(/[nN]+/).length
end
|
126
|
+
|
127
|
+
# Build final stats
|
128
|
+
####################################################
|
129
|
+
# Accumulates the statistics of one batch of analysed sequences.
#
# seqs                    - Array of analysed sequence objects (may be
#                           empty). Each must respond to #type, #seq_fasta,
#                           #fasta_length, #seq_name, #hit, #status and
#                           #db_name.
# stats_hash              - statistics Hash, updated in place.
# diff_ids_array          - accumulator of distinct hit accessions,
#                           updated in place.
# diff_ids_complete_array - same, restricted to COMPLETE sequences.
# all_seq_lengths         - accumulator of the lengths of the non-artifact
#                           sequences, updated in place.
#
# Sequence types are compared with the type constants (FAILED, UNKNOWN,
# COMPLETE, ...) defined outside this method.
#
# Returns [stats_hash, diff_ids_array, diff_ids_complete_array,
# all_seq_lengths].
def summary_stats(seqs, stats_hash, diff_ids_array, diff_ids_complete_array, all_seq_lengths)
  low_limit = 200
  upper_limit = 500

  # All seqs
  #-----------
  stats_hash['output_seqs'] += seqs.length
  good_seqs = seqs.select { |s| s.type >= UNKNOWN }
  stats_hash['good_seqs'] += good_seqs.length

  # Indeterminations (runs of n/N) in the non-artifact sequences
  good_seqs.each do |s|
    stats_hash['indeterminations'] += s.seq_fasta.count('nN')
    stats_hash['gap_number'] += s.seq_fasta.scan(/[nN]+/).length
  end

  # Longest unigene. max is nil for an empty batch, which previously raised
  # NoMethodError on the comparison.
  longest = seqs.map { |s| s.fasta_length }.max
  stats_hash['longest_unigene'] = longest if !longest.nil? && longest > stats_hash['longest_unigene']

  # Load ids of the sequences whose type lies strictly between UNKNOWN and
  # NCRNA
  seqs.each do |s|
    diff_ids_array << s.hit.acc if s.type > UNKNOWN && s.type < NCRNA
  end
  diff_ids_array.uniq!

  # By length
  seq_lengths = good_seqs.map { |s| s.fasta_length }
  all_seq_lengths.concat(seq_lengths)
  stats_hash['full_transcriptome_length'] += seq_lengths.inject(0) { |sum, n| sum + n }
  stats_hash['sequences_>200'] += seq_lengths.count { |l| l > low_limit }
  stats_hash['sequences_>500'] += seq_lengths.count { |l| l > upper_limit }

  stats_hash['failed'] += seqs.count { |s| s.type == FAILED }

  # Unknown
  #-----------------------------
  all_unknown = seqs.select { |s| s.type == UNKNOWN }
  stats_hash['unknown'] += all_unknown.length
  stats_hash['unknown_>200'] += all_unknown.count { |s| s.fasta_length > low_limit }
  stats_hash['unknown_>500'] += all_unknown.count { |s| s.fasta_length > upper_limit }

  # Artifacts
  #----------------
  stats_hash['artifacts'] += seqs.count { |s| s.type < UNKNOWN && s.type > FAILED }
  stats_hash['misassembled'] += seqs.count { |s| s.type == MISASSEMBLED }
  stats_hash['unmapped'] += seqs.count { |s| s.type == UNMAPPED }
  # Fragments named '*_split_*' are skipped so that a multiple chimera is
  # counted only once.
  stats_hash['chimeras'] += seqs.count { |s| s.type == CHIMERA && !s.seq_name.include?('_split_') }
  stats_hash['other_artifacts'] += seqs.count { |s| s.type == OTHER }

  # Annotated with prot
  #---------------------
  prot_annotated = seqs.select { |s| s.type >= COMPLETE && s.type <= INTERNAL }
  stats_hash['prot_annotated'] += prot_annotated.length

  # By annotation
  stats_hash['internal'] += seqs.count { |s| s.type == INTERNAL }
  complete = seqs.select { |s| s.type == COMPLETE }
  n_terminal = seqs.select { |s| s.type == N_TERMINAL }
  c_terminal = seqs.select { |s| s.type == C_TERMINAL }

  stats_hash['complete'] += complete.length
  stats_hash['n_terminal'] += n_terminal.length
  stats_hash['c_terminal'] += c_terminal.length

  # Load complete ids
  complete.each { |s| diff_ids_complete_array << s.hit.acc }
  diff_ids_complete_array.uniq!

  # ----> By status (a truthy status means "sure", otherwise "putative")
  stats_hash['complete_sure'] += complete.count { |s| s.status }
  stats_hash['n_terminal_sure'] += n_terminal.count { |s| s.status }
  stats_hash['c_terminal_sure'] += c_terminal.count { |s| s.status }
  stats_hash['complete_putative'] += complete.count { |s| !s.status }
  stats_hash['n_terminal_putative'] += n_terminal.count { |s| !s.status }
  stats_hash['c_terminal_putative'] += c_terminal.count { |s| !s.status }

  # By database: names starting with 'sp_' / 'tr_' are counted as SwissProt
  # / TrEMBL, everything else as a user database.
  swissprot = prot_annotated.count { |s| s.db_name =~ /^sp_/ }
  trembl = prot_annotated.count { |s| s.db_name =~ /^tr_/ }
  stats_hash['swissprot'] += swissprot
  stats_hash['trembl'] += trembl
  stats_hash['userdb'] += prot_annotated.length - swissprot - trembl

  # ncRNA
  #----------------
  stats_hash['ncrna'] += seqs.count { |s| s.type == NCRNA }

  # Coding sequences
  #----------------
  coding = seqs.select { |s| s.type == CODING }
  stats_hash['coding'] += coding.length

  # By status
  stats_hash['coding_sure'] += coding.count { |s| s.status }
  stats_hash['coding_putative'] += coding.count { |s| !s.status }

  # By length
  stats_hash['coding_>200'] += coding.count { |s| s.fasta_length > low_limit }
  stats_hash['coding_>500'] += coding.count { |s| s.fasta_length > upper_limit }

  return stats_hash, diff_ids_array, diff_ids_complete_array, all_seq_lengths
end
|
239
|
+
|
240
|
+
# Computes the N50 and N90 statistics of a set of sequence lengths and
# stores them in +stats_hash+.
#
# stats_hash  - statistics Hash; stats_hash[f_tot_key] must already hold the
#               summed length of all sequences.
# f_tot_key   - key of the total length.
# n50_key     - key that receives the N50. It is only written while it still
#               holds 0.
# n90_key     - key that receives the N90.
# seq_lengths - Array of sequence lengths; sorted IN PLACE in descending
#               order, as the original implementation did.
#
# Lengths are accumulated from longest to shortest; N50 / N90 is the length
# at which the accumulated fraction first exceeds 0.5 / 0.9.
def calculate_n50_n90(stats_hash, f_tot_key, n50_key, n90_key, seq_lengths)
  f_tot_lengths = stats_hash[f_tot_key].to_f
  cum = 0
  seq_lengths.sort! { |a, b| b <=> a }
  seq_lengths.each do |length|
    cum += length
    fraction = cum / f_tot_lengths
    stats_hash[n50_key] = length if fraction > 0.5 && stats_hash[n50_key] == 0
    # Checked independently of the N50 test: the original if/elsif skipped
    # the N90 check on the iteration that set N50, so when one sequence
    # crossed both thresholds N90 was taken from a later, shorter sequence
    # or never set at all.
    if fraction > 0.9
      stats_hash[n90_key] = length
      break
    end
  end
end
|
254
|
+
|
255
|
+
# Computes the statistics that can only be derived once every batch has been
# accumulated: distinct-ID counts, mean lengths, N50/N90 and the BA index.
#
# stats_hash              - statistics Hash, updated in place.
# diff_ids_array          - distinct hit accessions.
# diff_ids_complete_array - distinct hit accessions of complete sequences.
# pre_fln_seq_lengths     - lengths of the input sequences.
# seq_lengths             - lengths of the non-artifact output sequences.
#
# Returns stats_hash.
def last_stats(stats_hash, diff_ids_array, diff_ids_complete_array, pre_fln_seq_lengths, seq_lengths)
  stats_hash['different_orthologues'] = diff_ids_array.length
  stats_hash['different_completes'] = diff_ids_complete_array.length

  # [target, numerator, divisor]; a mean is left untouched when its divisor
  # is zero.
  mean_definitions = [
    ['mean_length', 'full_transcriptome_length', 'good_seqs'],
    ['indetermination_mean_length', 'indeterminations', 'gap_number'],
    ['PRE_FLN_mean_length', 'PRE_FLN_full_transcriptome_length', 'input_seqs'],
    ['PRE_FLN_indetermination_mean_length', 'PRE_FLN_indeterminations', 'PRE_FLN_gap_number']
  ]
  mean_definitions.each do |target, numerator, divisor|
    next unless stats_hash[divisor] > 0
    stats_hash[target] = stats_hash[numerator].to_f / stats_hash[divisor]
  end

  calculate_n50_n90(stats_hash, 'full_transcriptome_length', 'n50', 'n90', seq_lengths)
  calculate_n50_n90(stats_hash, 'PRE_FLN_full_transcriptome_length', 'PRE_FLN_n50', 'PRE_FLN_n90', pre_fln_seq_lengths)

  # BA index: only computed when every term is positive.
  annotated = stats_hash['prot_annotated']
  completes = stats_hash['complete']
  long_seqs = stats_hash['sequences_>500']
  orthologues = stats_hash['different_orthologues']
  distinct_completes = stats_hash['different_completes']
  if [annotated, completes, long_seqs, orthologues, distinct_completes].all? { |term| term > 0 }
    coef_anot_geom = (annotated * completes * 1.0) / (long_seqs * 10000)
    coef_mejora = (orthologues * 1.0 + distinct_completes) / (annotated + completes)
    stats_hash['BA_index'] = Math.sqrt(coef_anot_geom * coef_mejora)
  end

  stats_hash
end
|
279
|
+
|
280
|
+
# Increments the test-code bucket that +coding_seq+ falls into.
#
# coding_seq - object responding to #t_code (numeric test-code score).
# stats_hash - statistics Hash holding the 'coding_>*' counters; updated in
#              place.
#
# Buckets are tried from the highest threshold down and the first one whose
# threshold is exceeded wins. Note that the 'coding_>0.94' and
# 'coding_>0.84' counters use the thresholds 0.95 and 0.85.
#
# Returns the updated counter, or nil when the score is not above 0.
def coding_stats_reptrans(coding_seq, stats_hash)
  buckets = [
    [1, 'coding_>1'],
    [0.95, 'coding_>0.94'],
    [0.85, 'coding_>0.84'],
    [0.73, 'coding_>0.73'],
    [0, 'coding_>0']
  ]
  selected = buckets.find { |threshold, _counter| coding_seq.t_code > threshold }
  return nil if selected.nil?

  stats_hash[selected.last] += 1
end
|
297
|
+
|
298
|
+
# Builds the tables consumed by the general summary report template.
#
# stats_hash                          - final statistics Hash.
# stats_taxonomy                      - Hash of organism => annotation count.
# stats_functional_annotation_by_seqs - functional annotation data, only
#                                       forwarded to go_for_graph.
#
# Returns a Hash with whatever go_for_graph returns plus the keys
# :general_report, :assembly_report, :structural_data, :status_report,
# :taxonomy and :database_report.
def handle_data_main_summary(stats_hash, stats_taxonomy, stats_functional_annotation_by_seqs)
  container = {}
  identation = ' '

  # Rows are declared as [label, value, denominator] so that each
  # percentage denominator sits next to the row it belongs to; a
  # denominator of 0 means "no percentage for this row".
  split_rows = lambda do |header, rows|
    table = [header] + rows.map { |label, value, _den| [label, value] }
    [table, rows.map { |row| row[2] }]
  end

  # GENERAL REPORT TABLE
  #-------------------------------------------------------
  general_report, denominators = split_rows.call(
    ['', 'Sequences', '%'],
    [
      ['Input', stats_hash['input_seqs'], stats_hash['input_seqs']],
      [identation + 'N50 (bp)', stats_hash['PRE_FLN_n50'], 0],
      [identation + 'N90 (bp)', stats_hash['PRE_FLN_n90'], 0],
      [identation + 'Full transcriptome length (bp)', stats_hash['PRE_FLN_full_transcriptome_length'], 0],
      [identation + 'Mean sequence length (bp)', '%.2f' % stats_hash['PRE_FLN_mean_length'], 0],
      [identation + 'Nucleotide indeterminations (bp)', stats_hash['PRE_FLN_indeterminations'], stats_hash['PRE_FLN_full_transcriptome_length']],
      [identation + 'Mean indetermination length (bp)', '%.2f' % stats_hash['PRE_FLN_indetermination_mean_length'], 0],
      [identation + 'Unigenes >500pb', stats_hash['PRE_FLN_sequences_>500'], stats_hash['input_seqs']],
      [identation + 'Failing sequences', stats_hash['failed'], stats_hash['output_seqs']],
      [identation + 'Artifacts <sup>1</sup>', stats_hash['artifacts'], stats_hash['output_seqs']],
      [identation * 2 + 'Unmapped transcripts', stats_hash['unmapped'], stats_hash['artifacts']],
      [identation * 2 + 'Misassembled', stats_hash['misassembled'], stats_hash['artifacts']],
      [identation * 2 + 'Chimeras', stats_hash['chimeras'], stats_hash['artifacts']],
      [identation * 2 + 'Other', stats_hash['other_artifacts'], stats_hash['artifacts']],
      ['Sequences with resolved chimeras', stats_hash['output_seqs'], stats_hash['input_seqs']],
      ['Sequences without artifacts', stats_hash['good_seqs'], stats_hash['output_seqs']],
      [identation + 'N50 (bp)', stats_hash['n50'], 0],
      [identation + 'N90 (bp)', stats_hash['n90'], 0],
      [identation + 'Full transcriptome length (bp)', stats_hash['full_transcriptome_length'], 0],
      [identation + 'Mean sequence length (bp)', '%.2f' % stats_hash['mean_length'], 0],
      [identation + 'Nucleotide indeterminations (bp)', stats_hash['indeterminations'], stats_hash['full_transcriptome_length']],
      [identation + 'Mean indetermination length (bp)', '%.2f' % stats_hash['indetermination_mean_length'], 0]
    ]
  )
  add_percentages_by_vector(general_report, 1, denominators)
  general_report << ['BA index', "%5.2f" % [stats_hash['BA_index']], '-'] if stats_hash['BA_index'] > 0

  # ASSEMBLY REPORT TABLE
  #-------------------------------------------------------
  without_orthologue = stats_hash['coding'] + stats_hash['unknown']
  assembly_report, denominators = split_rows.call(
    ['', 'Unigenes', '%'],
    [
      ['Unigenes', stats_hash['good_seqs'], stats_hash['good_seqs']],
      ['Unigenes >500pb', stats_hash['sequences_>500'], stats_hash['good_seqs']],
      ['Unigenes >200pb', stats_hash['sequences_>200'], stats_hash['good_seqs']],
      ['Longest unigene', stats_hash['longest_unigene'], 0],
      ['With orthologue <sup>1</sup>', stats_hash['prot_annotated'], stats_hash['good_seqs']],
      [identation + 'Different orthologue IDs', stats_hash['different_orthologues'], stats_hash['prot_annotated']],
      [identation + 'Complete transcripts', stats_hash['complete'], stats_hash['prot_annotated']],
      [identation + 'Different complete transcripts', stats_hash['different_completes'], stats_hash['prot_annotated']],
      ['ncRNA', stats_hash['ncrna'], stats_hash['good_seqs']],
      ['Without orthologue <sup>1</sup>', without_orthologue, stats_hash['good_seqs']],
      [identation + 'Coding (all)', stats_hash['coding'], without_orthologue],
      [identation + 'Coding > 200bp', stats_hash['coding_>200'], without_orthologue],
      [identation + 'Coding > 500bp', stats_hash['coding_>500'], without_orthologue],
      [identation + 'Unknown (all)', stats_hash['unknown'], without_orthologue],
      [identation + 'Unknown > 200bp', stats_hash['unknown_>200'], without_orthologue],
      [identation + 'Unknown > 500bp', stats_hash['unknown_>500'], without_orthologue]
    ]
  )
  add_percentages_by_vector(assembly_report, 1, denominators)

  # STRUCTURAL PROFILE
  #-------------------------------------------------------
  structural_data = [
    ['Category', 'Sure', 'Putative'],
    ['Unknown', stats_hash['unknown'], 0],
    ['Complete', stats_hash['complete_sure'], stats_hash['complete_putative']],
    ['N-terminal', stats_hash['n_terminal_sure'], stats_hash['n_terminal_putative']],
    ['C-terminal', stats_hash['c_terminal_sure'], stats_hash['c_terminal_putative']],
    ['Internal', stats_hash['internal'], 0],
    ['ncrna', stats_hash['ncrna'], 0],
    # The "Sure" cell previously held stats_hash['coding'] (sure + putative),
    # so putative coding sequences were counted in both columns.
    ['Coding', stats_hash['coding_sure'], stats_hash['coding_putative']]
  ]
  # Turn the positive counts into percentages of the non-artifact
  # sequences; zero cells stay as 0.
  structural_data.drop(1).each do |row|
    (1...row.length).each do |j|
      row[j] = row[j] * 100.0 / stats_hash['good_seqs'] if row[j] > 0
    end
  end

  # STATUS REPORT
  #----------------------------------------------------------
  # 'colspan' / 'rowspan' cells are placeholders; presumably the report
  # template interprets them as merged cells.
  status_report = [
    ['Status', 'colspan', 'Unigenes', '%'],
    ['Complete', 'Sure', stats_hash['complete_sure']],
    ['rowspan', 'Putative', stats_hash['complete_putative']],
    ['C-terminus', 'Sure', stats_hash['c_terminal_sure']],
    ['rowspan', 'Putative', stats_hash['c_terminal_putative']],
    ['N-terminus', 'Sure', stats_hash['n_terminal_sure']],
    ['rowspan', 'Putative', stats_hash['n_terminal_putative']],
    ['Internal', 'colspan', stats_hash['internal']],
    ['Coding', 'Sure', stats_hash['coding_sure']],
    ['rowspan', 'Putative', stats_hash['coding_putative']],
    ['ncRNA', 'colspan', stats_hash['ncrna']],
    ['Unknown', 'colspan', stats_hash['unknown']],
    ['Total', 'colspan', stats_hash['good_seqs']]
  ]
  add_percentages_by_scalar(status_report, 2, stats_hash['good_seqs'])

  # TAXONOMY PROFILE (the 21 organisms with most annotations)
  #-------------------------------------------------------
  taxonomy = [
    ['Organism', 'Annotations']
  ].concat(stats_taxonomy.to_a.sort { |s2, s1| s1.last <=> s2.last }[0..20])

  # DATABASE REPORT
  #-------------------------------------------------------
  database_report = [
    ['', 'Unigenes', '%'],
    ['UserDB', stats_hash['userdb']],
    ['SwissProt', stats_hash['swissprot']],
    ['TrEMBL', stats_hash['trembl']],
    ['ncRNA', stats_hash['ncrna']],
    ['None', stats_hash['coding'] + stats_hash['unknown']],
    ['Total', stats_hash['good_seqs']]
  ]
  add_percentages_by_scalar(database_report, 1, stats_hash['good_seqs'])

  # GO ANNOTATION
  #-------------------------------------------------------
  container.merge!(go_for_graph(stats_functional_annotation_by_seqs))

  # BUILD CONTAINER
  #-------------------------------------------------------
  container[:general_report] = general_report
  container[:assembly_report] = assembly_report
  container[:structural_data] = structural_data
  container[:status_report] = status_report
  container[:taxonomy] = taxonomy
  container[:database_report] = database_report
  container
end
|
468
|
+
|
469
|
+
|
470
|
+
# Builds the tables consumed by the representative-transcriptome report
# template.
#
# stats_hash - Hash with the 'prot_annotated', 'est_annotated' and
#              'coding_>*' counters.
#
# Returns a Hash with :general_report (sequences per category) and
# :acumulative_report (running total over the categories, in order).
def handle_data_reptrans_summary(stats_hash)
  # [row label, counter key], in report order.
  categories = [
    ['Annotated with protein', 'prot_annotated'],
    ['Annotated with EST', 'est_annotated'],
    ['Coding test-code > 1', 'coding_>1'],
    ['Coding test-code > 0.94', 'coding_>0.94'],
    ['Coding test-code > 0.84', 'coding_>0.84'],
    ['Coding test-code > 0.73', 'coding_>0.73'],
    ['Coding test-code > 0', 'coding_>0']
  ]

  # GENERAL REPORT
  #-------------------------------------------------------
  all_seqs = stats_hash.values.inject(0) { |sum, v| sum + v }
  general_report = [
    ['', 'Sequences', '%'],
    ['Output', all_seqs]
  ]
  categories.each { |label, key| general_report << [label, stats_hash[key]] }
  add_percentages_by_scalar(general_report, 1, all_seqs)

  # ACCUMULATIVE REPORT
  #-------------------------------------------------------
  # One running total per category. The original skipped every running
  # total equal to 0, so leading empty categories shifted the remaining
  # values onto the wrong labels and left nil cells at the end of the table.
  running_total = 0
  acumulative_report = [
    ['', 'Sequences', '%']
  ]
  categories.each do |label, key|
    running_total += stats_hash[key]
    acumulative_report << [label, running_total]
  end
  add_percentages_by_scalar(acumulative_report, 1, all_seqs)

  # BUILD CONTAINER
  #-------------------------------------------------------
  container = {}
  container[:general_report] = general_report
  container[:acumulative_report] = acumulative_report
  container
end
|
529
|
+
|
530
|
+
# Internal helper shared by the two add_percentages_* methods.
#
# Returns value as a percentage of denominator formatted with two decimals
# and a trailing '%', or '-' when the percentage is undefined (non-numeric
# operand, NaN or infinite result).
def format_percentage(value, denominator)
  return '-' unless value.is_a?(Numeric) && denominator.is_a?(Numeric)

  perc = value * 100.0 / denominator
  return '-' if perc.nan? || perc.infinite?

  format('%.2f', perc) + '%'
end

# Appends a percentage cell to every data row of +table+, using a different
# denominator for each row.
#
# table        - Array of rows; the first row is the header and is skipped.
# col          - index of the column holding the value to convert.
# denominators - one denominator per data row (denominators[0] belongs to
#                table[1]). A denominator that is missing or not greater
#                than 0 yields '-'.
#
# Returns table (modified in place).
def add_percentages_by_vector(table, col, denominators)
  table.each_with_index do |row, i|
    next if i == 0 # Skip header

    den = denominators[i - 1]
    row << (den.is_a?(Numeric) && den > 0 ? format_percentage(row[col], den) : '-')
  end
end

# Appends a percentage cell to every data row of +table+, using the same
# denominator for all rows.
#
# table       - Array of rows; the first row is the header and is skipped.
# col         - index of the column holding the value to convert.
# denominator - common denominator; 0 yields '-'.
#
# Returns table (modified in place).
def add_percentages_by_scalar(table, col, denominator)
  table.each_with_index do |row, i|
    next if i == 0 # Skip header

    row << format_percentage(row[col], denominator)
  end
end
|
558
|
+
|
559
|
+
# Finishes the statistics and writes both the text and the HTML summaries.
#
# stats_hash                          - accumulated statistics Hash.
# stats_taxonomy                      - Hash of organism => annotation count.
# stats_functional_annotation_by_seqs - functional annotation data for the
#                                       report.
# diff_ids_array                      - distinct hit accessions.
# diff_ids_complete_array             - distinct accessions of complete
#                                       sequences.
# pre_fln_seq_lengths                 - lengths of the input sequences.
# seq_lengths                         - lengths of the output sequences.
# txt_file                            - already opened, writable IO for the
#                                       text summary (it is passed straight
#                                       to write_txt, which calls #puts on
#                                       it).
# html_file                           - destination passed to the report
#                                       writer.
def write_summary_stats(stats_hash, stats_taxonomy, stats_functional_annotation_by_seqs, diff_ids_array, diff_ids_complete_array, pre_fln_seq_lengths, seq_lengths, txt_file, html_file)
  stats_hash = last_stats(stats_hash, diff_ids_array, diff_ids_complete_array, pre_fln_seq_lengths, seq_lengths)
  write_txt(stats_hash, txt_file)
  container = handle_data_main_summary(stats_hash, stats_taxonomy, stats_functional_annotation_by_seqs)
  # File.read closes the template after reading; File.open(...).read left
  # the handle open until garbage collection.
  template = File.read(File.join(REPORT_FOLDER, 'general_summary.erb'))
  report = Report_html.new(container, 'FLN Summary')
  report.build(template)
  report.write(html_file)
end
|
568
|
+
|
569
|
+
# Writes the HTML mapping report to fln_results/mapping_summary.html.
# Nothing is written when either +fpkm+ or +coverage_analysis+ is empty.
#
# fpkm                                - expression data, forwarded to
#                                       go_for_graph.
# coverage_analysis                   - Hash whose values are arrays of
#                                       coverage measures. Judging by the
#                                       table headers built below, index 1
#                                       is the "mean_10max" value, index 2
#                                       the mean coverage and index 3 the
#                                       transcript coverage; index 0 is
#                                       paired with index 3 in
#                                       :normalized_partial_coverage
#                                       (NOTE(review): confirm the layout
#                                       against the code that builds
#                                       coverage_analysis).
# stats_functional_annotation_by_seqs - functional annotation data for the
#                                       report.
def write_mapping_report(fpkm, coverage_analysis, stats_functional_annotation_by_seqs)
  return if fpkm.empty? || coverage_analysis.empty?

  container = go_for_graph(stats_functional_annotation_by_seqs, fpkm)
  coverages = coverage_analysis.values

  # [rank, c[1], c[2]] rows sorted by c[2], highest first.
  measured_coverages = coverages.map { |c| [c[1], c[2]] }
  measured_coverages.sort! { |c1, c2| c2[1] <=> c1[1] }
  measured_coverages.each_with_index do |cov, i|
    cov.unshift(i + 1) # x axis: 1, 2, 3 ... (seqs)
  end
  measured_coverages.unshift(%w[transcripts mean_10max mean])
  container[:mean_coverage] = measured_coverages

  # [rank, c[1]] rows sorted by c[1], highest first.
  ranked = coverages.sort { |c1, c2| c2[1] <=> c1[1] }
  container[:max10_coverage] = ranked.each_with_index.map { |c, i| [i + 1, c[1]] }

  container[:normalized_partial_coverage] = coverages.map { |c| [c[3], c[0]] }

  # [c[3], c[2]] rows sorted by c[3], lowest first.
  mean_cov_trasn_cov = coverages.map { |data| [data[3], data[2]] }
  mean_cov_trasn_cov.sort! { |i1, i2| i1[0] <=> i2[0] }
  mean_cov_trasn_cov.unshift(%w[trans_cov mean_coverage])
  container[:normalized_coverages_sorted_by_npc] = mean_cov_trasn_cov

  # File.read closes the template after reading; File.open(...).read left
  # the handle open until garbage collection.
  template = File.read(File.join(REPORT_FOLDER, 'mapping_summary.erb'))
  report = Report_html.new(container, 'FLN Summary')
  report.build(template)
  report.write(File.join('fln_results', 'mapping_summary.html'))
end
|
592
|
+
|
593
|
+
# Writes the representative-transcriptome statistics as a text file and as
# an HTML report.
#
# stats_hash - statistics Hash.
# html_file  - destination passed to the report writer.
# txt_file   - path of the text summary; created or truncated.
def write_reptrans_stats(stats_hash, html_file, txt_file)
  # The block form closes (and therefore flushes) the file. The original
  # never closed it, so the text summary could stay incomplete until the
  # process exited.
  File.open(txt_file, 'w') { |txt| write_txt(stats_hash, txt) }
  container = handle_data_reptrans_summary(stats_hash)
  template = File.read(File.join(REPORT_FOLDER, 'reptrans_summary.erb'))
  report = Report_html.new(container, 'FLN Reptrans Summary')
  report.build(template)
  report.write(html_file)
end
|
602
|
+
|
603
|
+
# Dumps the statistics as tab-separated "value<TAB>name" lines, in the
# hash's iteration order.
#
# stats_hash - Hash of statistic name => value.
# file       - writable object responding to #puts.
#
# Returns stats_hash.
def write_txt(stats_hash, file)
  stats_hash.each_pair { |name, amount| file.puts(format("%s\t%s", amount, name)) }
end
|
608
|
+
|
609
|
+
# Wraps +title+ in the HTML markup used for table headings in the reports.
#
# title - heading text (String). It is inserted verbatim, without HTML
#         escaping.
#
# Returns the HTML fragment as a String.
def table_title(title)
  opening = '<div style="font-size:25px; margin: 10"><b>'
  closing = '</b></div>'
  opening + title + closing
end
|
613
|
+
end
|