full_lengther_next 0.0.8 → 0.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/History.txt +2 -2
- data/Manifest.txt +33 -18
- data/Rakefile +4 -2
- data/bin/download_fln_dbs.rb +310 -158
- data/bin/full_lengther_next +160 -103
- data/bin/make_test_dataset.rb +236 -0
- data/bin/make_user_db.rb +101 -117
- data/bin/plot_fln.rb +270 -0
- data/bin/plot_taxonomy.rb +70 -0
- data/lib/expresscanvas.zip +0 -0
- data/lib/full_lengther_next.rb +3 -3
- data/lib/full_lengther_next/classes/artifacts.rb +66 -0
- data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
- data/lib/full_lengther_next/classes/cdhit.rb +154 -0
- data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
- data/lib/full_lengther_next/classes/common_functions.rb +105 -63
- data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
- data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
- data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
- data/lib/full_lengther_next/classes/handle_db.rb +30 -0
- data/lib/full_lengther_next/classes/my_worker.rb +308 -138
- data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
- data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
- data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
- data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
- data/lib/full_lengther_next/classes/reptrans.rb +210 -0
- data/lib/full_lengther_next/classes/sequence.rb +439 -80
- data/lib/full_lengther_next/classes/test_code.rb +15 -16
- data/lib/full_lengther_next/classes/types.rb +12 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
- data/lib/full_lengther_next/classes/warnings.rb +40 -0
- metadata +207 -93
- data/lib/full_lengther_next/classes/lcs.rb +0 -33
- data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -0,0 +1,54 @@
|
|
1
|
+
#$: << File.expand_path('~/fln/full_lengther_next/lib')
|
2
|
+
|
3
|
+
$: << File.expand_path(File.join(File.dirname(__FILE__)))
|
4
|
+
require 'types'
|
5
|
+
require 'my_worker'
|
6
|
+
|
7
|
+
class MyWorkerEst < MyWorker
|
8
|
+
#####################################################################################
|
9
|
+
# WORKER FUNCTIONS
|
10
|
+
#####################################################################################
|
11
|
+
# Stores the configuration handed to this worker and resets its result buffers.
#
# params - Array whose first element is kept as the options and whose last
#          element is kept as the BLAST database path.
#
# Returns the (empty) unmatched buffer, as the last assignment evaluates to it.
def receive_initial_config(params)
  @options, @blast_path = params.first, params.last
  # Sequences that got at least one hit against the EST database
  @match = []
  # Sequences without EST hit that are still worth keeping
  @unmatch = []
end
|
18
|
+
|
19
|
+
# Processes one chunk of sequences: logs its size, orders it in place by
# descending t_code and runs the EST BLAST step on it.
#
# array_seqs - Array of objects responding to #t_code (sorted in place).
#
# Returns a two-element Array: [@match, @unmatch].
def process_object(array_seqs)
  $WORKER_LOG.info "Worker LIST #{array_seqs.length}"
  # Highest test-code score first
  array_seqs.sort! { |left, right| right.t_code <=> left.t_code }
  blastEST(array_seqs)
  [@match, @unmatch]
end
|
25
|
+
|
26
|
+
#####################################################################################
|
27
|
+
# CUSTOM FUNCTIONS
|
28
|
+
#####################################################################################
|
29
|
+
|
30
|
+
# Runs blastn (e-value 1e-6) for the given sequences against @blast_path and
# hands the result to blast_analysis.
#
# array_seqs - Array of sequences passed to run_blast and blast_analysis.
#
# The process is terminated with status -1 when run_blast returns nil.
def blastEST(array_seqs)
  # `false` literal instead of the FALSE constant, which no longer exists in Ruby >= 3.2
  blast = run_blast(array_seqs, @blast_path, 'blastn', 1e-6, nil, false)
  if blast.nil?
    $LOG.info 'BLAST FAILED'
    Process.exit(-1)
  end
  blast_analysis(blast, array_seqs)
end
|
39
|
+
|
40
|
+
# Splits the sequences of a chunk according to their BLAST result.
#
# blast      - object whose #querys yields one query per input sequence,
#              in the same order as array_seqs.
# array_seqs - Array of sequences that were blasted.
#
# A sequence whose query has a hit receives the first hit (seq.hit=) and is
# appended to @match. A sequence without hits is appended to @unmatch only
# when its type is CODING.
#
# The previous condition `!seq.type == CODING` negated the type before the
# comparison, so it was always false and @unmatch stayed empty.
def blast_analysis(blast, array_seqs)
  blast.querys.each_with_index do |query, i|
    seq = array_seqs[i]
    best_hit = query.hits.first
    if best_hit.nil?
      @unmatch << seq if seq.type == CODING # Keep if is coding
    else
      seq.hit = best_hit
      @match << seq
    end
  end
end
|
53
|
+
|
54
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'scbi_fasta'
|
2
|
+
require 'sequence'
|
3
|
+
|
4
|
+
class MyWorkerManagerEst < ScbiMapreduce::WorkManager
|
5
|
+
#############################################################################################
|
6
|
+
# MANAGER INITIALIZATION
|
7
|
+
#############################################################################################
|
8
|
+
# open files and prepare global data
|
9
|
+
# Prepares the class-level state shared by all manager instances.
#
# putative_seqs - collection of sequences handed out one by one by next_work.
# options       - options forwarded to the workers.
# blast_path    - BLAST database path forwarded to the workers.
def self.init_work_manager(putative_seqs, options, blast_path)
  # Inputs
  @@putative_seqs = putative_seqs
  @@options = options
  @@blast_path = blast_path
  # Result accumulators filled by work_received
  @@match, @@unmatch = [], []
  # Index of the next sequence to dispatch
  @@num_seqs = 0
end
|
17
|
+
|
18
|
+
#############################################################################################
|
19
|
+
# MANAGER TERMINATION
|
20
|
+
#############################################################################################
|
21
|
+
|
22
|
+
# close files
|
23
|
+
# Termination hook. Intentionally does nothing and returns nil.
def self.end_work_manager
  nil
end
|
26
|
+
|
27
|
+
#############################################################################################
|
28
|
+
# MANAGER NATIVE FUNCTIONS
|
29
|
+
#############################################################################################
|
30
|
+
|
31
|
+
# This method is called every time a worker needs new data to work on. It is called as many times as the chunk size indicates.
|
32
|
+
# Return the work data or nil if no more data is available
|
33
|
+
# Supplies the next sequence to send to a worker (manages the workers' input).
#
# Returns the next element of @@putative_seqs, or nil once every element has
# been handed out.
#
# The stop test used to be `@@num_seqs == length - 1`, which ended the
# iteration one element early and never dispatched the last sequence.
def next_work
  return nil if @@num_seqs >= @@putative_seqs.length

  seq = @@putative_seqs[@@num_seqs]
  @@num_seqs += 1
  seq
end
|
42
|
+
|
43
|
+
# This method is executed each time an object is finished
|
44
|
+
# Collects the result of a finished chunk (manages the workers' output).
#
# match_and_unmatch_array - Array whose first element lists the matched
#                           sequences and whose last element lists the
#                           unmatched ones.
def work_received(match_and_unmatch_array)
  matched = match_and_unmatch_array.first
  unmatched = match_and_unmatch_array.last
  @@match.concat(matched)
  @@unmatch.concat(unmatched)
end
|
48
|
+
|
49
|
+
# Prints the object that failed together with the message and backtrace of
# the exception wrapped by worker_error.
#
# worker_error - object responding to #original_exception.
# obj          - the work object being processed when the error happened.
def error_received(worker_error, obj)
  header = "Error while processing object #{obj.inspect}\n"
  cause = worker_error.original_exception
  puts header + cause.message + ":\n" + cause.backtrace.join("\n")
end
|
52
|
+
|
53
|
+
# Logs that the error threshold has been exceeded. Only logs; it does not
# stop anything by itself.
# NOTE(review): @@error_count and @@count are not assigned in this class;
# presumably they are maintained by ScbiMapreduce::WorkManager - confirm.
def too_many_errors_received
  message = "Too many errors: #{@@error_count} errors on #{@@count} executed sequences, exiting before finishing"
  $LOG.error(message)
end
|
56
|
+
|
57
|
+
# send initial config
|
58
|
+
# Builds the initial configuration sent to the workers.
#
# Returns [options, blast_path] as stored by init_work_manager.
def worker_initial_config
  [@@options, @@blast_path]
end
|
61
|
+
|
62
|
+
#############################################################################################
|
63
|
+
# CUSTOM FUNCTIONS
|
64
|
+
#############################################################################################
|
65
|
+
# Gives access to the accumulated results.
#
# Returns a two-element Array: [matched sequences, unmatched sequences].
def self.get_array_seqs
  [@@match, @@unmatch]
end
|
68
|
+
end
|
69
|
+
|
@@ -0,0 +1,389 @@
|
|
1
|
+
require 'types'
|
2
|
+
require 'scbi_fasta'
|
3
|
+
require 'scbi_blast'
|
4
|
+
require 'sequence'
|
5
|
+
require 'exonerate_result'
|
6
|
+
require 'fln_stats'
|
7
|
+
require 'reptrans'
|
8
|
+
|
9
|
+
include FlnStats
|
10
|
+
|
11
|
+
|
12
|
+
|
13
|
+
|
14
|
+
class MyWorkerManagerFln < ScbiMapreduce::WorkManager
|
15
|
+
#############################################################################################
|
16
|
+
# MANAGER INITIALIZATION
|
17
|
+
#############################################################################################
|
18
|
+
attr_accessor :seqs_annotation_prot, :seqs_some_coding, :seqs_unknown
|
19
|
+
# open files and prepare global data
|
20
|
+
# Prepares the class-level state of the manager: statistics containers,
# functional annotations, the input FASTA reader and every output file
# under ./fln_results.
#
# options - Hash read here for :verbose, :fasta, :tax_group and :chimera.
#
# Requires ENV['BLASTDB'] to point at the directory containing the
# sp_<tax_group> index. Uses File.exist? because File.exists? was removed
# in Ruby 3.2.
def self.init_work_manager(options)
  @@stats_hash = initialize_stats_hash
  @@stats_taxonomy = {}
  @@stats_different_prot_id = []
  @@stats_different_prot_id_complete_seqs = []

  @@options = options
  $verbose = options[:verbose]

  input_file = options[:fasta]

  Dir.mkdir('fln_results') unless File.exist?('fln_results')

  # Column of each annotation inside a row of the .index file
  # (rows are indexed with these values in get_functional_annotations)
  @@func_annot_type = {
    :go_id => 5,
    :go_description => 6,
    :kegg_id => 7,
    :interpro_id => 8,
    :interpro_description => 9,
    :ec_id => 10,
    :pfam_id => 11,
    :pfam_desc => 12,
    :unipathway_id => 13
  }

  @@functional_annotations = {}
  @@functional_annotations.merge!(load_functional_annotations(File.join(ENV['BLASTDB'], 'sp_'+options[:tax_group],'sp_'+options[:tax_group]+'.index')))
  #@@functional_annotations.merge!(load_functional_annotations(File.join(ENV['BLASTDB'], 'tr_'+options[:tax_group],'tr_'+options[:tax_group]+'.index'))) if options[:acess_db].include?('t')

  @@fasta_file = FastaQualFile.new(input_file,'')
  file_head = "Query_id\tfasta_length\tSubject_id\tdb_name\tStatus\te_value\tp_ident\ts_length\tprotein_length\tWarning_msgs\tframe\tORF_start\tORF_end\ts_start\ts_end\tDescription\tgo_id\tgo_description\tkegg_id\tinterpro_id\tinterpro_description\tec_id\tpfam_id\tpfam_description\tunipathway_id"

  @@output_files = {}
  # Seq annotation files
  if !options[:chimera].nil?
    @@output_files[CHIMERA] = File.open("fln_results/chimeric_sequences.txt", 'w')
    @@output_files[CHIMERA].puts file_head
  elsif File.exist?("fln_results/chimeric_sequences.txt")
    # Chimera detection is off: drop the file left by a previous run
    File.delete("fln_results/chimeric_sequences.txt")
  end
  @@output_files[OTHER] = File.open('fln_results/artifact_other.txt', 'w')
  @@output_files[MISASSEMBLED] = File.open('fln_results/misassembled.txt', 'w')
  @@output_files[UNKNOWN] = File.open('fln_results/unknown.txt', 'w')
  @@output_files['db'] = File.open('fln_results/pt_seqs', 'w')
  @@output_files[CODING] = File.open('fln_results/new_coding.txt', 'w')
  @@output_files[NCRNA] = File.open('fln_results/nc_rnas.txt', 'w')

  # Complementary files
  @@output_files['align'] = File.open('fln_results/alignments.txt', 'w')
  @@output_files['prot'] = File.open('fln_results/proteins.fasta', 'w') # FASTA
  @@output_files['nts'] = File.open("fln_results/nt_seq.txt", 'w')
  @@output_files['seqs'] = File.open('fln_results/unigenes.fasta', 'w') # FASTA
  @@output_files['stats_html'] = File.open('fln_results/summary_stats.html', 'w')
  @@output_files['stats_txt'] = File.open('fln_results/summary_stats.txt', 'w')

  @@output_files[CODING].puts file_head
  @@output_files['db'].puts file_head
  @@output_files[NCRNA].puts file_head

  #RepTrans module
  @@seqs_annotation_prot = []
  @@seqs_some_coding = []
  @@seqs_unknown = []

  #Transdecoder module
  @@complete_sure = []
  @@seqs_to_analyze = []
end
|
91
|
+
|
92
|
+
#############################################################################################
|
93
|
+
# MANAGER TERMINATION
|
94
|
+
#############################################################################################
|
95
|
+
|
96
|
+
# close files
|
97
|
+
# Finishes the run: optionally predicts ORFs with TransDecoder, writes the
# summary statistics and closes every output file.
#
# NOTE(review): when 'p' is in :acess_db but no training or analysis
# sequences were collected, the sequences withheld by work_received are not
# written here - confirm this is intended.
def self.end_work_manager
  run_transdecoder = @@options[:acess_db].include?('p') && !@@complete_sure.empty? && !@@seqs_to_analyze.empty?
  orf_prediction_with_transdecoder if run_transdecoder
  write_summary_stats(
    @@stats_hash,
    @@stats_taxonomy,
    @@stats_different_prot_id,
    @@stats_different_prot_id_complete_seqs,
    @@output_files['stats_txt'],
    @@output_files['stats_html']
  )
  @@output_files.each_value { |handler| handler.close }
end
|
104
|
+
|
105
|
+
#############################################################################################
|
106
|
+
# MANAGER NATIVE FUNCTIONS
|
107
|
+
#############################################################################################
|
108
|
+
|
109
|
+
# This method is called every time a worker needs new data to work on. It is called as many times as the chunk size indicates.
|
110
|
+
# Return the work data or nil if no more data is available
|
111
|
+
# Reads the next record of the input FASTA (manages the workers' input).
#
# Returns a new Sequence for the record, or nil when the reader yields no
# name. Every delivered record increments the 'input_seqs' counter.
def next_work
  name, fasta, qual = @@fasta_file.next_seq
  return nil if name.nil?

  @@stats_hash['input_seqs'] += 1
  Sequence.new(name, fasta, qual)
end
|
120
|
+
|
121
|
+
# This method is executed each time an object is finished
|
122
|
+
# Handles a finished chunk (manages the workers' output): stores sequences
# for the TransDecoder and RepTrans steps, adds taxonomy and functional
# annotations, writes the sequences and updates the summary statistics.
#
# objs - Array of processed sequences.
def work_received(objs)
  objs.each do |seq|
    transdecoder_keep_seq(seq)
    repTrans_keep_seq(seq)

    has_protein_hit = seq.type > UNKNOWN && seq.type < NCRNA
    if has_protein_hit
      get_taxonomy(seq.hit.definition, @@stats_taxonomy)
      get_functional_annotations(seq)
    end

    # Unknown and coding sequences are not written here when TransDecoder
    # is used ('p' without 'c' in :acess_db)
    db_mode = @@options[:acess_db]
    withheld = !db_mode.include?('c') && db_mode.include?('p') && (seq.type == UNKNOWN || seq.type == CODING)
    write_seq(seq) unless withheld
  end
  @@stats_hash, @@stats_different_prot_id, @@stats_different_prot_id_complete_seqs = summary_stats(objs, @@stats_hash, @@stats_different_prot_id, @@stats_different_prot_id_complete_seqs)
end
|
134
|
+
|
135
|
+
# Prints a warning naming the first sequence of the failed chunk, followed by
# the message and backtrace of the exception wrapped by worker_error.
#
# worker_error - object responding to #original_exception.
# obj          - the failed chunk; its first element must respond to #seq_name.
def error_received(worker_error, obj)
  header = "WARNING!!!!!. CHUNK FAILED:Error while processing object #{obj.first.seq_name}\n"
  cause = worker_error.original_exception
  puts header + cause.message + ":\n" + cause.backtrace.join("\n")
end
|
138
|
+
|
139
|
+
# Logs that the error threshold has been exceeded. Only logs; it does not
# stop anything by itself.
# NOTE(review): @@error_count and @@count are not assigned in this class;
# presumably they are maintained by ScbiMapreduce::WorkManager - confirm.
def too_many_errors_received
  message = "Too many errors: #{@@error_count} errors on #{@@count} executed sequences, exiting before finishing"
  $LOG.error(message)
end
|
142
|
+
|
143
|
+
# send initial config
|
144
|
+
# Builds the initial configuration sent to the workers.
#
# Returns the options stored by init_work_manager.
def worker_initial_config
  @@options
end
|
147
|
+
|
148
|
+
#############################################################################################
|
149
|
+
# CUSTOM FUNCTIONS
|
150
|
+
#############################################################################################
|
151
|
+
|
152
|
+
# Loads a tab-separated annotation index into memory.
#
# annotation_file - path of the file; the first column of every line is the
#                   accession, the remaining columns are its annotations.
#
# Returns a Hash mapping accession => Array of the remaining columns.
# The file is read with File.foreach so the handle is closed when done
# (the previous File.open(...).each left it open).
def self.load_functional_annotations(annotation_file)
  functional_annotations = {}
  File.foreach(annotation_file) do |line|
    acc, *fields = line.chomp.split("\t")
    functional_annotations[acc] = fields
  end
  functional_annotations
end
|
162
|
+
|
163
|
+
# Attaches the functional annotations of the hit's accession to the sequence.
#
# seq - sequence whose hit accession is looked up in the loaded annotations.
#
# Nothing is assigned (and nil is returned) when the accession is not in the
# annotation table.
def get_functional_annotations(seq)
  accession = seq.hit.acc.gsub(/-\d+/,'') # drop the UniProt splicing suffix
  row = @@functional_annotations[accession]
  return nil if row.nil?

  seq.functional_annotations = @@func_annot_type.each_with_object({}) do |(kind, column), picked|
    picked[kind] = row[column]
  end
end
|
173
|
+
|
174
|
+
|
175
|
+
# write results to files
|
176
|
+
# Writes the results of a sequence to the output files.
#
# seq - sequence responding to #write_info and #seq_name.
#
# A failure while writing is reported on stdout and does not abort the run.
# Only StandardError is rescued: the previous `rescue Exception` also
# swallowed Interrupt and SystemExit.
def write_seq(seq)
  seq.write_info(@@output_files)
rescue StandardError => e
  puts "Error printing #{seq.seq_name}"
  puts e.message, e.backtrace.join("\n")
end
|
185
|
+
|
186
|
+
# Instance-level entry point for storing a sequence for the RepTrans step.
#
# The body used to be a verbatim copy of the class method with the same
# name; it now delegates to it so the classification lives in one place.
#
# seq - sequence responding to #type.
def repTrans_keep_seq(seq)
  self.class.repTrans_keep_seq(seq)
end
|
198
|
+
|
199
|
+
# Stores a sequence for the RepTrans step according to its type. Does nothing
# when the :reptrans option is nil.
#
# seq - sequence responding to #type.
#
# Types in COMPLETE..INTERNAL go to the protein-annotated list, CODING to the
# coding list and UNKNOWN to the unknown list; other types are ignored.
def self.repTrans_keep_seq(seq)
  return nil if @@options[:reptrans].nil?

  bucket = case seq.type
           when COMPLETE .. INTERNAL then @@seqs_annotation_prot
           when CODING then @@seqs_some_coding
           when UNKNOWN then @@seqs_unknown
           end
  bucket << seq unless bucket.nil?
end
|
211
|
+
|
212
|
+
# Stores a sequence for the TransDecoder step. Does nothing unless
# :acess_db includes 'p'.
#
# seq - sequence responding to #type, #status and #hit.
#
# COMPLETE sequences with a truthy status and a hit identity of at least
# :training_ident form the training set; CODING and UNKNOWN sequences are
# queued for analysis.
def transdecoder_keep_seq(seq)
  return nil unless @@options[:acess_db].include?('p')

  case seq.type
  when COMPLETE
    @@complete_sure << seq if seq.status && seq.hit.ident >= @@options[:training_ident]
  when CODING, UNKNOWN
    @@seqs_to_analyze << seq
  end
end
|
224
|
+
|
225
|
+
# Predicts ORFs for the queued sequences with the external TransDecoder
# program, trained on representatives of the stored complete sequences.
#
# Works inside the ./temp directory: writes training_set.fasta and
# analyse_set.fasta, runs TransDecoder (adding --reuse when its work
# directory already exists) and parses its output files. Every queued
# sequence is then updated when coding information is available, offered to
# the RepTrans lists and written to the output files.
#
# Uses Dir.exist? because Dir.exists? was removed in Ruby 3.2.
def self.orf_prediction_with_transdecoder
  clusters_seqs_annot_prot = clustering_by_id(@@complete_sure)
  final_seqs = select_representative(clusters_seqs_annot_prot)
  coding_info = nil
  Dir.chdir('temp') do
    orfs = get_seqs(final_seqs)
    File.open('training_set.fasta', 'w') {|f| f.write(orfs)}
    orfs = get_seqs(@@seqs_to_analyze)
    File.open('analyse_set.fasta', 'w') {|f| f.write(orfs)}
    cmd = "TransDecoder -t analyse_set.fasta --workdir transdecoder --train training_set.fasta"
    cmd << ' --reuse' if Dir.exist?('transdecoder')
    system(cmd)
    coding_info = get_coding_info('transdecoder/longest_orfs.pep')
    coding_info = get_scores('transdecoder/longest_orfs.cds.scores', coding_info)
    coding_info = correct_by_selected('transdecoder/longest_orfs.cds.scores.selected', coding_info)
  end
  @@seqs_to_analyze.each do |seq|
    coding = coding_info[seq.seq_name]
    asign_coding_attributes(seq, coding) if !coding.nil?
    repTrans_keep_seq(seq)
    seq.write_info(@@output_files)
  end
end
|
248
|
+
|
249
|
+
# Serialises sequences as FASTA text.
#
# seqs - collection of objects responding to #seq_name and #seq_fasta.
#
# Returns a String with one ">name\nsequence\n" record per element
# (empty when seqs is empty).
def self.get_seqs(seqs)
  seqs.each_with_object(''.dup) do |seq, fasta|
    fasta << ">#{seq.seq_name}\n#{seq.seq_fasta}\n"
  end
end
|
256
|
+
|
257
|
+
# Marks the ORFs that are not listed in the "selected" file.
#
# selected    - path of a file with one "seq_name|orf_id" entry per line.
# coding_info - Hash seq_name => { orf_id => info Array }.
#
# For every ORF whose id is absent from the file, info[1] (the ORF type) is
# replaced by '-'. Returns coding_info (modified in place).
#
# The file is read with File.foreach so the handle is closed, and the ids
# are kept in a Hash so each lookup is constant time.
def self.correct_by_selected(selected, coding_info)
  chosen = {}
  File.foreach(selected) do |line|
    _seq_name, orf_id = line.chomp.split('|', 2)
    chosen[orf_id] = true
  end
  coding_info.each_value do |orfs|
    orfs.each do |orf, info|
      info[1] = '-' unless chosen.key?(orf)
    end
  end
  coding_info
end
|
271
|
+
|
272
|
+
# Turns a sequence into a CODING one using the best ORF predicted for it and
# updates the summary counters accordingly.
#
# seq    - sequence to update (type, status, t_code and hit are assigned).
# coding - Hash orf_id => info for this sequence; info is
#          [x_ratio, type, start, stop, strand, score] as built by
#          get_coding_info and get_scores.
#
# seq.hit receives [start, stop, frame]; the frame is negative for ORFs on
# the '-' strand.
#
# NOTE(review): the 'unknown' counters are decremented for every sequence
# passed in, including those queued as CODING by transdecoder_keep_seq -
# confirm this matches how summary_stats counted them.
def self.asign_coding_attributes(seq, coding)
  seq.type = CODING
  @@stats_hash['unknown'] -= 1
  @@stats_hash['unknown_>200'] -= 1 if seq.seq_fasta.length > 200
  @@stats_hash['unknown_>500'] -= 1 if seq.seq_fasta.length > 500
  @@stats_hash['coding_>200'] += 1 if seq.seq_fasta.length > 200
  @@stats_hash['coding_>500'] += 1 if seq.seq_fasta.length > 500
  @@stats_hash['coding'] += 1
  coding = select_orf(coding)
  if coding[1] == 'complete'
    # `true` literal: the TRUE constant no longer exists in Ruby >= 3.2
    seq.status = true
    @@stats_hash['coding_sure'] += 1
  else
    @@stats_hash['coding_putative'] += 1
  end

  seq.t_code = coding.last
  # The frame is derived from the start coordinate on '+' and from the stop
  # coordinate on '-'
  ind = 2
  ind = 3 if coding[4] == '-'
  frame = (coding[ind]%3)+1
  frame = frame * -1 if coding[4] == '-'
  seq.hit = [coding[2], coding[3], frame]
end
|
295
|
+
|
296
|
+
# Picks the best ORF of a sequence.
#
# orfs_hash - Hash orf_id => info, where info.first is the X ratio, info[1]
#             the ORF type and info.last the score. It is filtered in place.
#
# Selection order: lowest X ratio, then 'complete' ORFs when there are any,
# then highest score. Returns the info Array of the first ORF left.
def self.select_orf(orfs_hash)
  lowest_x = get_min_Xratio(orfs_hash)
  orfs_hash.keep_if { |_id, info| info.first == lowest_x }

  candidates = orfs_hash.select { |_id, info| info[1] == 'complete' }
  candidates = orfs_hash if candidates.empty?

  best_score = get_max_score(candidates)
  candidates.keep_if { |_id, info| info.last == best_score }
  candidates.values.first
end
|
307
|
+
|
308
|
+
# Finds the highest score among the ORFs.
#
# orfs_hash - Hash orf_id => info; the score is the last element of info.
#
# Returns the highest score, or nil when the hash is empty.
def self.get_max_score(orfs_hash)
  orfs_hash.each_value.inject(nil) do |highest, info|
    candidate = info.last
    highest.nil? || candidate > highest ? candidate : highest
  end
end
|
320
|
+
|
321
|
+
# Finds the lowest X ratio among the ORFs.
#
# orfs_hash - Hash orf_id => info; the ratio is the first element of info.
#
# Returns the lowest ratio, or nil when the hash is empty.
def self.get_min_Xratio(orfs_hash)
  orfs_hash.each_value.inject(nil) do |lowest, info|
    candidate = info.first
    lowest.nil? || candidate < lowest ? candidate : lowest
  end
end
|
333
|
+
|
334
|
+
# Parses the ORF proteins predicted by TransDecoder.
#
# file_name - path of the FASTA file; record names have the form
#             "seq_name|orf_id" and the comments carry "type:<type>" and
#             ":<start>-<stop>(<strand>)".
#
# Returns a Hash seq_name => { orf_id => [x_ratio, type, start, stop, strand] }
# where x_ratio is the fraction of 'X' residues in the protein.
#
# NOTE(review): the bare rescue reports every StandardError raised while
# reading as a missing TransDecoder file and returns whatever was parsed so
# far - confirm this best-effort behaviour is wanted.
def self.get_coding_info(file_name)
  coding_info = {}
  begin
    FastaQualFile.new(file_name, '').each do |name, seq, comments, qual|
      total = seq.length.to_f
      x_count = seq.count('X')
      seq_name, orf_id = name.split('|')

      comments =~ /type:(\S+)/
      type = Regexp.last_match(1)

      comments =~ /:(\d+)-(\d+)\(([+-])\)/
      start = Regexp.last_match(1).to_i
      stop = Regexp.last_match(2).to_i
      strand = Regexp.last_match(3)

      coding_info[seq_name] ||= {}
      coding_info[seq_name][orf_id] = [x_count / total, type, start, stop, strand]
    end
  rescue
    puts "Warning!!!!!!!!!!: Transdecoder file is missing. Check if Transdecoder is installed"
  end
  coding_info
end
|
361
|
+
|
362
|
+
# Adds the TransDecoder score to every known ORF and discards the ORFs whose
# score is not positive.
#
# file_name   - path of a tab-separated file: "seq_name|orf_id", then the
#               score in the second column.
# coding_info - Hash seq_name => { orf_id => info Array }.
#
# For a positive score the value is appended to the ORF's info; otherwise the
# ORF is removed, and the sequence entry too once it has no ORFs left.
# Lines naming unknown sequences or ORFs are ignored, and so are blank lines
# (which previously raised NoMethodError). Returns coding_info (modified in
# place).
def self.get_scores(file_name, coding_info)
  # File.foreach closes the handle once the file has been read
  File.foreach(file_name) do |line|
    name, score_field = line.chomp.split("\t")
    next if name.nil?

    seq, orf_id = name.split('|')
    coding = coding_info[seq]
    next if coding.nil?

    orf = coding[orf_id]
    next if orf.nil?

    score = score_field.to_f # a missing column counts as 0.0
    if score > 0
      orf << score
    else
      coding.delete(orf_id)
      coding_info.delete(seq) if coding.empty?
    end
  end
  coding_info
end
|
384
|
+
|
385
|
+
# Gives access to the sequences stored for the RepTrans step.
#
# Returns a three-element Array: [protein-annotated, coding, unknown].
def self.get_annotations
  [@@seqs_annotation_prot, @@seqs_some_coding, @@seqs_unknown]
end
|
388
|
+
end
|
389
|
+
|