full_lengther_next 0.0.8 → 0.5.6
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/History.txt +2 -2
- data/Manifest.txt +33 -18
- data/Rakefile +4 -2
- data/bin/download_fln_dbs.rb +310 -158
- data/bin/full_lengther_next +160 -103
- data/bin/make_test_dataset.rb +236 -0
- data/bin/make_user_db.rb +101 -117
- data/bin/plot_fln.rb +270 -0
- data/bin/plot_taxonomy.rb +70 -0
- data/lib/expresscanvas.zip +0 -0
- data/lib/full_lengther_next.rb +3 -3
- data/lib/full_lengther_next/classes/artifacts.rb +66 -0
- data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
- data/lib/full_lengther_next/classes/cdhit.rb +154 -0
- data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
- data/lib/full_lengther_next/classes/common_functions.rb +105 -63
- data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
- data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
- data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
- data/lib/full_lengther_next/classes/handle_db.rb +30 -0
- data/lib/full_lengther_next/classes/my_worker.rb +308 -138
- data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
- data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
- data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
- data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
- data/lib/full_lengther_next/classes/reptrans.rb +210 -0
- data/lib/full_lengther_next/classes/sequence.rb +439 -80
- data/lib/full_lengther_next/classes/test_code.rb +15 -16
- data/lib/full_lengther_next/classes/types.rb +12 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
- data/lib/full_lengther_next/classes/warnings.rb +40 -0
- metadata +207 -93
- data/lib/full_lengther_next/classes/lcs.rb +0 -33
- data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -0,0 +1,54 @@
|
|
1
|
+
#$: << File.expand_path('~/fln/full_lengther_next/lib')
|
2
|
+
|
3
|
+
$: << File.expand_path(File.join(File.dirname(__FILE__)))
|
4
|
+
require 'types'
|
5
|
+
require 'my_worker'
|
6
|
+
|
7
|
+
class MyWorkerEst < MyWorker
|
8
|
+
#####################################################################################
|
9
|
+
# WORKER FUNCTIONS
|
10
|
+
#####################################################################################
|
11
|
+
# Stores the configuration received from the manager and resets the result buckets.
#
# params - Array; its first element is kept as the options and its last element
#          as the path later handed to run_blast.
def receive_initial_config(params)
  @options, @blast_path = params.first, params.last
  @match = []   # Any sequence matching an EST
  @unmatch = [] # Sequences with :test_code annotation that did not match an EST
end
|
18
|
+
|
19
|
+
# Processes one chunk of sequences: logs the chunk size, sorts the chunk in place
# by descending t_code, runs the EST BLAST step and returns both result buckets.
#
# array_seqs - Array of objects responding to #t_code (sorted in place).
#
# Returns [@match, @unmatch].
def process_object(array_seqs)
  $WORKER_LOG.info "Worker LIST #{array_seqs.length}"
  array_seqs.sort! { |left, right| right.t_code <=> left.t_code } # Order by testcode, highest first
  blastEST(array_seqs)
  [@match, @unmatch]
end
|
25
|
+
|
26
|
+
#####################################################################################
|
27
|
+
# CUSTOM FUNCTIONS
|
28
|
+
#####################################################################################
|
29
|
+
|
30
|
+
# Runs blastn for the given sequences against @blast_path and hands the result
# to blast_analysis. When run_blast returns nil the failure is logged and the
# process exits with status -1.
#
# array_seqs - Array of sequences to search.
#
# NOTE(review): the meaning of the last two run_blast arguments (nil, false) is
# defined by run_blast, which is not visible here.
def blastEST(array_seqs)
  # `false` literal instead of the legacy FALSE constant, which current Ruby
  # versions no longer define.
  blast = run_blast(array_seqs, @blast_path, 'blastn', 1e-6, nil, false)
  if blast.nil?
    $LOG.info 'BLAST FAILED'
    Process.exit(-1)
  end
  blast_analysis(blast, array_seqs)
end
|
39
|
+
|
40
|
+
# Splits the sequences of a chunk according to their BLAST result.
#
# blast      - object whose #querys yields one query per sequence, in the same
#              order as array_seqs.
# array_seqs - Array of sequences that were searched.
#
# A sequence whose query has at least one hit gets the first hit assigned to
# #hit and is appended to @match. A sequence without hits is appended to
# @unmatch only when its type is CODING.
def blast_analysis(blast, array_seqs)
  blast.querys.each_with_index do |query, i|
    seq = array_seqs[i]
    best_hit = query.hits.first
    if best_hit.nil?
      # The previous test `!seq.type == CODING` parsed as `(!seq.type) == CODING`,
      # which is never true, so nothing was ever kept. Keep if is coding.
      @unmatch << seq if seq.type == CODING
    else
      seq.hit = best_hit
      @match << seq
    end
  end
end
|
53
|
+
|
54
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'scbi_fasta'
|
2
|
+
require 'sequence'
|
3
|
+
|
4
|
+
class MyWorkerManagerEst < ScbiMapreduce::WorkManager
|
5
|
+
#############################################################################################
|
6
|
+
# MANAGER INITIALIZATION
|
7
|
+
#############################################################################################
|
8
|
+
# open files and prepare global data
|
9
|
+
# Prepares the class-level state shared by the manager.
#
# putative_seqs - Array of sequences that next_work hands out one by one.
# options       - options object forwarded to the workers.
# blast_path    - path forwarded to the workers.
def self.init_work_manager(putative_seqs, options, blast_path)
  # Input data and configuration
  @@putative_seqs = putative_seqs
  @@options = options
  @@blast_path = blast_path
  # Result accumulators filled by work_received
  @@match = []
  @@unmatch = []
  # Index of the next sequence to hand out
  @@num_seqs = 0
end
|
17
|
+
|
18
|
+
#############################################################################################
|
19
|
+
# MANAGER TERMINATION
|
20
|
+
#############################################################################################
|
21
|
+
|
22
|
+
# close files
|
23
|
+
# Termination hook. Intentionally does nothing and returns nil.
def self.end_work_manager
  nil
end
|
26
|
+
|
27
|
+
#############################################################################################
|
28
|
+
# MANAGER NATIVE FUNCTIONS
|
29
|
+
#############################################################################################
|
30
|
+
|
31
|
+
# This method is called every time a worker needs new data to work on. It is executed as many times as the chunk size indicates.
|
32
|
+
# Return the work data or nil if no more data is available
|
33
|
+
# Manages the workers' input: returns the next pending sequence, or nil once
# every element of @@putative_seqs has been handed out.
def next_work
  # The previous check stopped at `length - 1`, so the last sequence was never
  # dispatched (and a single-element list produced no work at all).
  return nil if @@num_seqs >= @@putative_seqs.length

  seq = @@putative_seqs[@@num_seqs]
  @@num_seqs += 1
  seq
end
|
42
|
+
|
43
|
+
# this method is executed each time an obj is finished
|
44
|
+
# Manages the workers' output: appends the results of one finished chunk to the
# class-level accumulators.
#
# match_and_unmatch_array - Array whose first element is the list of matched
#                           sequences and whose last element is the unmatched list.
def work_received(match_and_unmatch_array)
  matched = match_and_unmatch_array.first
  unmatched = match_and_unmatch_array.last
  @@match.concat(matched)
  @@unmatch.concat(unmatched)
end
|
48
|
+
|
49
|
+
# Prints the object that failed together with the message and backtrace of the
# exception raised in the worker.
#
# worker_error - object exposing the worker's exception via #original_exception.
# obj          - the work object being processed when the error happened.
def error_received(worker_error, obj)
  header = "Error while processing object #{obj.inspect}\n"
  cause = worker_error.original_exception
  puts header + cause.message + ":\n" + cause.backtrace.join("\n")
end
|
52
|
+
|
53
|
+
# Logs that the run is being aborted because too many errors were received.
#
# NOTE(review): @@error_count and @@count are not assigned anywhere in this
# class; presumably they are provided by the superclass - confirm.
def too_many_errors_received
  summary = "Too many errors: #{@@error_count} errors on #{@@count} executed sequences, exiting before finishing"
  $LOG.error summary
end
|
56
|
+
|
57
|
+
# send initial config
|
58
|
+
# Builds the initial configuration sent to each worker.
#
# Returns a two-element Array: the options followed by the blast path.
def worker_initial_config
  [@@options, @@blast_path]
end
|
61
|
+
|
62
|
+
#############################################################################################
|
63
|
+
# CUSTOM FUNCTIONS
|
64
|
+
#############################################################################################
|
65
|
+
# Returns the accumulated results as a two-element Array: the matched
# sequences followed by the unmatched ones.
def self.get_array_seqs
  [@@match, @@unmatch]
end
|
68
|
+
end
|
69
|
+
|
@@ -0,0 +1,389 @@
|
|
1
|
+
require 'types'
|
2
|
+
require 'scbi_fasta'
|
3
|
+
require 'scbi_blast'
|
4
|
+
require 'sequence'
|
5
|
+
require 'exonerate_result'
|
6
|
+
require 'fln_stats'
|
7
|
+
require 'reptrans'
|
8
|
+
|
9
|
+
include FlnStats
|
10
|
+
|
11
|
+
|
12
|
+
|
13
|
+
|
14
|
+
class MyWorkerManagerFln < ScbiMapreduce::WorkManager
|
15
|
+
#############################################################################################
|
16
|
+
# MANAGER INITIALIZATION
|
17
|
+
#############################################################################################
|
18
|
+
attr_accessor :seqs_annotation_prot, :seqs_some_coding, :seqs_unknown
|
19
|
+
# open files and prepare global data
|
20
|
+
# Prepares the class-level state of the manager: statistics accumulators,
# functional annotations, the input FASTA reader and every output file under
# the fln_results directory (created when missing).
#
# options - Hash-like object; this method reads :verbose, :fasta, :tax_group
#           and :chimera from it.
#
# Also reads ENV['BLASTDB'] to locate the annotation index and sets the global
# $verbose.
def self.init_work_manager(options)
  # Statistics accumulators
  @@stats_hash = initialize_stats_hash
  @@stats_taxonomy = {}
  @@stats_different_prot_id = []
  @@stats_different_prot_id_complete_seqs = []

  @@options = options
  $verbose = options[:verbose]

  input_file = options[:fasta]

  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  Dir.mkdir('fln_results') unless File.exist?('fln_results')

  # Position of each annotation type inside a row of the annotation index
  # (rows are stored without their leading accession column).
  @@func_annot_type = {
    :go_id => 5,
    :go_description => 6,
    :kegg_id => 7,
    :interpro_id => 8,
    :interpro_description => 9,
    :ec_id => 10,
    :pfam_id => 11,
    :pfam_desc => 12,
    :unipathway_id => 13
  }

  @@functional_annotations = {}
  @@functional_annotations.merge!(load_functional_annotations(File.join(ENV['BLASTDB'], 'sp_'+options[:tax_group],'sp_'+options[:tax_group]+'.index')))
  #@@functional_annotations.merge!(load_functional_annotations(File.join(ENV['BLASTDB'], 'tr_'+options[:tax_group],'tr_'+options[:tax_group]+'.index'))) if options[:acess_db].include?('t')

  @@fasta_file = FastaQualFile.new(input_file,'')
  file_head = "Query_id\tfasta_length\tSubject_id\tdb_name\tStatus\te_value\tp_ident\ts_length\tprotein_length\tWarning_msgs\tframe\tORF_start\tORF_end\ts_start\ts_end\tDescription\tgo_id\tgo_description\tkegg_id\tinterpro_id\tinterpro_description\tec_id\tpfam_id\tpfam_description\tunipathway_id"

  @@output_files = {}
  # Seq annotation files
  if !options[:chimera].nil?
    @@output_files[CHIMERA] = File.open("fln_results/chimeric_sequences.txt", 'w')
    @@output_files[CHIMERA].puts file_head
  elsif File.exist?("fln_results/chimeric_sequences.txt")
    # Remove the chimera report left by a previous run
    File.delete("fln_results/chimeric_sequences.txt")
  end
  @@output_files[OTHER] = File.open('fln_results/artifact_other.txt', 'w')
  @@output_files[MISASSEMBLED] = File.open('fln_results/misassembled.txt', 'w')
  @@output_files[UNKNOWN] = File.open('fln_results/unknown.txt', 'w')
  @@output_files['db'] = File.open('fln_results/pt_seqs', 'w')
  @@output_files[CODING] = File.open('fln_results/new_coding.txt', 'w')
  @@output_files[NCRNA] = File.open('fln_results/nc_rnas.txt', 'w')

  # Complementary files
  @@output_files['align'] = File.open('fln_results/alignments.txt', 'w')
  @@output_files['prot'] = File.open('fln_results/proteins.fasta', 'w') # FASTA
  @@output_files['nts'] = File.open("fln_results/nt_seq.txt", 'w')
  @@output_files['seqs'] = File.open('fln_results/unigenes.fasta', 'w') # FASTA
  @@output_files['stats_html'] = File.open('fln_results/summary_stats.html', 'w')
  @@output_files['stats_txt'] = File.open('fln_results/summary_stats.txt', 'w')

  @@output_files[CODING].puts file_head
  @@output_files['db'].puts file_head
  @@output_files[NCRNA].puts file_head

  #RepTrans module
  @@seqs_annotation_prot = []
  @@seqs_some_coding = []
  @@seqs_unknown = []

  #Transdecoder module
  @@complete_sure = []
  @@seqs_to_analyze = []
end
|
91
|
+
|
92
|
+
#############################################################################################
|
93
|
+
# MANAGER TERMINATION
|
94
|
+
#############################################################################################
|
95
|
+
|
96
|
+
# close files
|
97
|
+
# Termination hook: optionally runs the TransDecoder ORF prediction, writes the
# summary statistics and closes every output file.
#
# The ORF prediction only runs when option :acess_db contains 'p' and both the
# training set (@@complete_sure) and the set to analyze are non-empty.
def self.end_work_manager
  run_transdecoder = @@options[:acess_db].include?('p') && !@@complete_sure.empty? && !@@seqs_to_analyze.empty?
  orf_prediction_with_transdecoder if run_transdecoder
  write_summary_stats(@@stats_hash, @@stats_taxonomy, @@stats_different_prot_id, @@stats_different_prot_id_complete_seqs, @@output_files['stats_txt'], @@output_files['stats_html'])
  @@output_files.each_value { |handler| handler.close }
end
|
104
|
+
|
105
|
+
#############################################################################################
|
106
|
+
# MANAGER NATIVE FUNCTIONS
|
107
|
+
#############################################################################################
|
108
|
+
|
109
|
+
# This method is called every time a worker needs new data to work on. It is executed as many times as the chunk size indicates.
|
110
|
+
# Return the work data or nil if no more data is available
|
111
|
+
# Manages the workers' input: reads the next record from the FASTA reader and
# wraps it in a Sequence, counting it in the 'input_seqs' statistic.
#
# Returns nil when the reader yields no name (no more data).
def next_work
  name, fasta, qual = @@fasta_file.next_seq
  return nil if name.nil?

  @@stats_hash['input_seqs'] += 1
  Sequence.new(name, fasta, qual)
end
|
120
|
+
|
121
|
+
# this method is executed each time an obj is finished
|
122
|
+
# Manages the workers' output for one finished chunk.
#
# For every sequence: stores it for the TransDecoder and RepTrans steps when
# applicable, collects taxonomy and functional annotations when its type lies
# strictly between UNKNOWN and NCRNA, and writes it to the output files.
# Finally the summary statistics are updated with the whole chunk.
#
# objs - Array of processed sequences.
def work_received(objs)
  objs.each do |seq|
    transdecoder_keep_seq(seq)
    repTrans_keep_seq(seq)

    in_annotated_range = seq.type > UNKNOWN && seq.type < NCRNA
    if in_annotated_range
      get_taxonomy(seq.hit.definition, @@stats_taxonomy)
      get_functional_annotations(seq)
    end

    # Don't write Unknown or coding sequences when use transdecoder
    # (option 'p' without option 'c'); they are written after the ORF prediction.
    postponed = !@@options[:acess_db].include?('c') && @@options[:acess_db].include?('p') && (seq.type == UNKNOWN || seq.type == CODING)
    write_seq(seq) unless postponed
  end
  @@stats_hash, @@stats_different_prot_id, @@stats_different_prot_id_complete_seqs = summary_stats(objs, @@stats_hash, @@stats_different_prot_id, @@stats_different_prot_id_complete_seqs)
end
|
134
|
+
|
135
|
+
# Prints a warning naming the first sequence of the failed chunk, followed by
# the message and backtrace of the exception raised in the worker.
#
# worker_error - object exposing the worker's exception via #original_exception.
# obj          - the failed chunk; its first element must respond to #seq_name.
def error_received(worker_error, obj)
  header = "WARNING!!!!!. CHUNK FAILED:Error while processing object #{obj.first.seq_name}\n"
  cause = worker_error.original_exception
  puts header + cause.message + ":\n" + cause.backtrace.join("\n")
end
|
138
|
+
|
139
|
+
# Logs that the run is being aborted because too many errors were received.
#
# NOTE(review): @@error_count and @@count are not assigned anywhere in this
# class; presumably they are provided by the superclass - confirm.
def too_many_errors_received
  summary = "Too many errors: #{@@error_count} errors on #{@@count} executed sequences, exiting before finishing"
  $LOG.error summary
end
|
142
|
+
|
143
|
+
# send initial config
|
144
|
+
# Returns the initial configuration sent to each worker: the options received
# by init_work_manager.
def worker_initial_config
  @@options
end
|
147
|
+
|
148
|
+
#############################################################################################
|
149
|
+
# CUSTOM FUNCTIONS
|
150
|
+
#############################################################################################
|
151
|
+
|
152
|
+
# Loads a tab-separated annotation index into a Hash.
#
# annotation_file - path of the index; the first column of each line is used as
#                   the key (accession) and the remaining columns as the value.
#
# Returns a Hash of accession => Array of the remaining fields.
def self.load_functional_annotations(annotation_file)
  functional_annotations = {}
  # File.foreach closes the file when done; File.open(...).each left the
  # handle open until garbage collection.
  File.foreach(annotation_file) do |line|
    acc, *fields = line.chomp.split("\t")
    functional_annotations[acc] = fields
  end
  functional_annotations
end
|
162
|
+
|
163
|
+
# Attaches the functional annotations of the sequence's hit to the sequence.
#
# The hit accession is looked up after removing every "-<digits>" part
# (splicing code of the UniProt accession). When the accession is unknown the
# sequence is left untouched and nil is returned.
#
# seq - sequence with a #hit responding to #acc; receives #functional_annotations=.
def get_functional_annotations(seq)
  accession = seq.hit.acc.gsub(/-\d+/,'')
  all_info = @@functional_annotations[accession]
  return if all_info.nil?

  annotations = @@func_annot_type.each_with_object({}) { |(type, position), found| found[type] = all_info[position] }
  seq.functional_annotations = annotations
end
|
173
|
+
|
174
|
+
|
175
|
+
# write results to files
|
176
|
+
# Writes the results of one sequence to the output files. A failure while
# writing is reported on standard output and does not stop the run.
#
# seq - sequence responding to #write_info and #seq_name.
def write_seq(seq)
  seq.write_info(@@output_files)
rescue StandardError => e
  # StandardError instead of Exception, so that Interrupt, SystemExit and
  # similar signals are no longer swallowed here.
  puts "Error printing #{seq.seq_name}"
  puts e.message, e.backtrace.join("\n")
end
|
185
|
+
|
186
|
+
# Instance-level entry point for storing a sequence for the RepTrans module.
#
# Delegates to the class method of the same name, which holds the only copy of
# the classification logic (this method used to be a verbatim duplicate of it).
#
# seq - sequence responding to #type.
def repTrans_keep_seq(seq)
  self.class.repTrans_keep_seq(seq)
end
|
198
|
+
|
199
|
+
# Stores a sequence in the RepTrans bucket that corresponds to its type.
# Does nothing when option :reptrans is nil.
#
#   COMPLETE .. INTERNAL -> @@seqs_annotation_prot
#   CODING               -> @@seqs_some_coding
#   UNKNOWN              -> @@seqs_unknown
#
# seq - sequence responding to #type.
def self.repTrans_keep_seq(seq)
  return if @@options[:reptrans].nil?

  bucket = case seq.type
           when COMPLETE .. INTERNAL then @@seqs_annotation_prot
           when CODING then @@seqs_some_coding
           when UNKNOWN then @@seqs_unknown
           end
  bucket << seq unless bucket.nil?
end
|
211
|
+
|
212
|
+
# Stores a sequence for the TransDecoder step. Only active when option
# :acess_db contains 'p'.
#
# COMPLETE sequences with a truthy #status whose hit identity reaches option
# :training_ident go to @@complete_sure; CODING and UNKNOWN sequences go to
# @@seqs_to_analyze.
#
# seq - sequence responding to #type, #status and #hit.
def transdecoder_keep_seq(seq)
  return unless @@options[:acess_db].include?('p')

  case seq.type
  when COMPLETE
    trusted = seq.status && seq.hit.ident >= @@options[:training_ident]
    @@complete_sure << seq if trusted
  when CODING, UNKNOWN
    @@seqs_to_analyze << seq
  end
end
|
224
|
+
|
225
|
+
# Runs TransDecoder on the stored CODING/UNKNOWN sequences, using
# representatives of the stored COMPLETE sequences as training set, and writes
# the analyzed sequences to the output files.
#
# Works inside the 'temp' directory: writes training_set.fasta and
# analyse_set.fasta, runs the TransDecoder command and parses its
# longest_orfs.* files. Sequences for which coding information was found are
# updated through asign_coding_attributes.
def self.orf_prediction_with_transdecoder
  clusters_seqs_annot_prot = clustering_by_id(@@complete_sure)
  final_seqs = select_representative(clusters_seqs_annot_prot)
  coding_info = nil
  Dir.chdir('temp') do
    training_fasta = get_seqs(final_seqs)
    File.open('training_set.fasta', 'w') { |f| f.write(training_fasta) }
    analyse_fasta = get_seqs(@@seqs_to_analyze)
    File.open('analyse_set.fasta', 'w') { |f| f.write(analyse_fasta) }

    cmd = "TransDecoder -t analyse_set.fasta --workdir transdecoder --train training_set.fasta"
    # Dir.exist? replaces Dir.exists?, which was removed in Ruby 3.2.
    cmd << ' --reuse' if Dir.exist?('transdecoder')
    system(cmd)

    coding_info = get_coding_info('transdecoder/longest_orfs.pep')
    coding_info = get_scores('transdecoder/longest_orfs.cds.scores', coding_info)
    coding_info = correct_by_selected('transdecoder/longest_orfs.cds.scores.selected', coding_info)
  end
  @@seqs_to_analyze.each do |seq|
    coding = coding_info[seq.seq_name]
    asign_coding_attributes(seq, coding) if !coding.nil?
    repTrans_keep_seq(seq)
    seq.write_info(@@output_files)
  end
end
|
248
|
+
|
249
|
+
# Serializes sequences as FASTA text: a ">name" line followed by the sequence
# line, for every element in order.
#
# seqs - Enumerable of objects responding to #seq_name and #seq_fasta.
#
# Returns a String (empty when seqs is empty).
def self.get_seqs(seqs)
  seqs.each_with_object('') do |seq, fasta|
    fasta << ">#{seq.seq_name}\n#{seq.seq_fasta}\n"
  end
end
|
256
|
+
|
257
|
+
# Marks every ORF that is not listed in the "selected" file by overwriting
# position 1 of its info array with '-'.
#
# selected    - path of a file with one "<seq_name>|<orf_id>" entry per line;
#               everything after the first '|' is taken as the ORF id.
# coding_info - Hash of seq_name => { orf_id => info Array }; modified in place.
#
# Returns coding_info.
def self.correct_by_selected(selected, coding_info)
  # Hash used as a set: constant-time membership test instead of Array#include?.
  selected_orfs = {}
  # File.foreach closes the file when done.
  File.foreach(selected) do |line|
    _seq_name, orf_id = line.chomp.split('|', 2)
    selected_orfs[orf_id] = true
  end
  coding_info.each_value do |orfs|
    orfs.each do |orf, info|
      info[1] = '-' unless selected_orfs.key?(orf)
    end
  end
  coding_info
end
|
271
|
+
|
272
|
+
# Turns a sequence into a CODING one using the ORF chosen by select_orf and
# moves it from the 'unknown' to the 'coding' counters of the statistics.
#
# seq    - sequence to update (type, status, t_code and hit are assigned).
# coding - Hash of orf_id => info Array for this sequence. The info layout used
#          here is [x_ratio, type, start, stop, strand, ..., score]: positions
#          0-4 as built by get_coding_info, score as the last element.
#
# NOTE(review): the 'unknown' counters are decremented unconditionally, which
# assumes the sequence had been counted as unknown - confirm for sequences that
# were already CODING.
def self.asign_coding_attributes(seq, coding)
  seq.type = CODING
  @@stats_hash['unknown'] -= 1
  @@stats_hash['unknown_>200'] -= 1 if seq.seq_fasta.length > 200
  @@stats_hash['unknown_>500'] -= 1 if seq.seq_fasta.length > 500
  @@stats_hash['coding_>200'] += 1 if seq.seq_fasta.length > 200
  @@stats_hash['coding_>500'] += 1 if seq.seq_fasta.length > 500
  @@stats_hash['coding'] += 1

  coding = select_orf(coding)
  if coding[1] == 'complete'
    # `true` literal instead of the legacy TRUE constant, which current Ruby
    # versions no longer define.
    seq.status = true
    @@stats_hash['coding_sure'] += 1
  else
    @@stats_hash['coding_putative'] += 1
  end

  seq.t_code = coding.last
  reverse_strand = coding[4] == '-'
  # The frame is derived from the stop coordinate on the '-' strand and from
  # the start coordinate otherwise; it is negative on the '-' strand.
  frame = (coding[reverse_strand ? 3 : 2] % 3) + 1
  frame = -frame if reverse_strand
  seq.hit = [coding[2], coding[3], frame]
end
|
295
|
+
|
296
|
+
# Chooses one ORF among the candidates of a sequence.
#
# Selection steps:
#   1. keep only the ORFs whose first info element (X ratio) is the minimum;
#   2. among them prefer the ones whose info[1] is 'complete', when any exists;
#   3. keep the ones whose last info element (score) is the maximum;
#   4. return the info Array of the first remaining ORF.
#
# orfs_hash - Hash of orf_id => info Array. It is filtered in place by step 1
#             (and by step 3 when no 'complete' ORF exists).
#
# Returns the chosen info Array, or nil when there are no ORFs.
def self.select_orf(orfs_hash)
  lowest_x_ratio = get_min_Xratio(orfs_hash)
  orfs_hash.keep_if { |_id, info| info.first == lowest_x_ratio }

  candidates = orfs_hash.select { |_id, info| info[1] == 'complete' }
  candidates = orfs_hash if candidates.empty?

  best_score = get_max_score(candidates)
  candidates.keep_if { |_id, info| info.last == best_score }
  candidates.values.first
end
|
307
|
+
|
308
|
+
# Returns the highest value found in the last position of the info Arrays.
#
# orfs_hash - Hash of orf_id => info Array.
#
# Returns nil for an empty Hash.
def self.get_max_score(orfs_hash)
  orfs_hash.each_value.reduce(nil) do |best, info|
    candidate = info.last
    (best.nil? || candidate > best) ? candidate : best
  end
end
|
320
|
+
|
321
|
+
# Returns the lowest value found in the first position of the info Arrays.
#
# orfs_hash - Hash of orf_id => info Array.
#
# Returns nil for an empty Hash.
def self.get_min_Xratio(orfs_hash)
  orfs_hash.each_value.reduce(nil) do |lowest, info|
    candidate = info.first
    (lowest.nil? || candidate < lowest) ? candidate : lowest
  end
end
|
333
|
+
|
334
|
+
# Parses a protein FASTA of ORFs into a nested Hash.
#
# file_name - path handed to FastaQualFile. Record names are expected as
#             "<seq_name>|<orf_id>"; the comments are searched for
#             "type:<type>" and ":<start>-<stop>(<strand>)".
#
# Returns a Hash of seq_name => { orf_id => [x_ratio, type, start, stop, strand] },
# where x_ratio is the fraction of 'X' characters in the ORF sequence.
# Any error raised while reading is reported with a warning on standard output
# and the information collected so far is returned.
def self.get_coding_info(file_name)
  coding_info = {}
  begin
    FastaQualFile.new(file_name, '').each do |name, seq, comments, _qual|
      x_ratio = seq.count('X') / seq.length.to_f
      seq_name, orf_id = name.split('|')
      comments =~ /type:(\S+)/
      type = $1
      comments =~ /:(\d+)-(\d+)\(([+-])\)/
      start, stop, strand = $1.to_i, $2.to_i, $3
      (coding_info[seq_name] ||= {})[orf_id] = [x_ratio, type, start, stop, strand]
    end
  rescue
    puts "Warning!!!!!!!!!!: Transdecoder file is missing. Check if Transdecoder is installed"
  end
  coding_info
end
|
361
|
+
|
362
|
+
# Adds the TransDecoder score of every ORF to coding_info and discards the
# ORFs whose score is not positive.
#
# file_name   - path of a tab-separated file whose first column is
#               "<seq_name>|<orf_id>" and whose second column is the score.
# coding_info - Hash of seq_name => { orf_id => info Array }; modified in place.
#               A positive score is appended to the info Array; otherwise the
#               ORF is removed, and so is the sequence once it has no ORFs left.
#               Lines naming an unknown sequence or ORF are ignored.
#
# Returns coding_info.
def self.get_scores(file_name, coding_info)
  # File.foreach closes the file when done.
  File.foreach(file_name) do |line|
    name, raw_score = line.chomp.split("\t")
    next if name.nil? # blank line

    seq, orf_id = name.split('|')
    orfs = coding_info[seq]
    next if orfs.nil?

    orf = orfs[orf_id]
    next if orf.nil?

    score = raw_score.to_f
    if score > 0
      orf << score
    else
      orfs.delete(orf_id)
      coding_info.delete(seq) if orfs.empty?
    end
  end
  coding_info
end
|
384
|
+
|
385
|
+
# Returns the three RepTrans buckets as an Array: sequences annotated with a
# protein, sequences classified as coding and unknown sequences, in that order.
def self.get_annotations
  [@@seqs_annotation_prot, @@seqs_some_coding, @@seqs_unknown]
end
|
388
|
+
end
|
389
|
+
|