full_lengther_next 0.0.8 → 0.5.6
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/History.txt +2 -2
- data/Manifest.txt +33 -18
- data/Rakefile +4 -2
- data/bin/download_fln_dbs.rb +310 -158
- data/bin/full_lengther_next +160 -103
- data/bin/make_test_dataset.rb +236 -0
- data/bin/make_user_db.rb +101 -117
- data/bin/plot_fln.rb +270 -0
- data/bin/plot_taxonomy.rb +70 -0
- data/lib/expresscanvas.zip +0 -0
- data/lib/full_lengther_next.rb +3 -3
- data/lib/full_lengther_next/classes/artifacts.rb +66 -0
- data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
- data/lib/full_lengther_next/classes/cdhit.rb +154 -0
- data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
- data/lib/full_lengther_next/classes/common_functions.rb +105 -63
- data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
- data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
- data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
- data/lib/full_lengther_next/classes/handle_db.rb +30 -0
- data/lib/full_lengther_next/classes/my_worker.rb +308 -138
- data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
- data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
- data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
- data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
- data/lib/full_lengther_next/classes/reptrans.rb +210 -0
- data/lib/full_lengther_next/classes/sequence.rb +439 -80
- data/lib/full_lengther_next/classes/test_code.rb +15 -16
- data/lib/full_lengther_next/classes/types.rb +12 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
- data/lib/full_lengther_next/classes/warnings.rb +40 -0
- metadata +207 -93
- data/lib/full_lengther_next/classes/lcs.rb +0 -33
- data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'scbi_fasta'
|
2
|
+
|
3
|
+
|
4
|
+
# Groups the isoform records of a FASTA file by their parent accession.
#
# A record is indexed when its name contains a tag of the form
# "<db>|<accession>-<number>|" (presumably UniProt splice-variant headers;
# confirm against the downloaded file). Records without such a tag are
# skipped instead of being piled up under a nil key.
#
# file:: path of the FASTA file. A missing file yields an empty hash.
#
# Returns a Hash: accession => String holding every isoform of that accession
# as FASTA text (">tag+description", newline, sequence), joined by newlines.
def load_isoform_hash(file)
  isoform_hash = {}
  # File.exist? instead of File.exists?: the plural form was removed in Ruby 3.2.
  return isoform_hash unless File.exist?(file)

  fasta = FastaQualFile.new(file)
  begin
    fasta.each do |name, seq, desc|
      tag = name.match(/(\w+\|(\w+)\-\d+\|)/)
      next if tag.nil? # no isoform tag: there is no accession to index it by

      record = ">#{tag[1]}#{desc}\n#{seq}"
      if isoform_hash.key?(tag[2])
        isoform_hash[tag[2]] << "\n" << record
      else
        isoform_hash[tag[2]] = record
      end
    end
  ensure
    # Release the reader even when a record makes the iteration fail.
    fasta.close
  end
  isoform_hash
end
|
20
|
+
|
21
|
+
# Builds a BLAST database by piping sequences into `makeblastdb`.
#
# seqs::   text written to the standard input of makeblastdb (the command is run with "-in -")
# output:: path given to -out; its basename is used as the database title
# dbtype:: value given to -dbtype
#
# Whatever makeblastdb prints on its standard output is echoed with puts.
def do_makeblastdb(seqs, output, dbtype)
  # Argument-vector form: no shell is involved, so paths containing spaces or
  # shell metacharacters reach makeblastdb untouched.
  cmd = ['makeblastdb', '-in', '-', '-out', output.to_s,
         '-title', File.basename(output), '-dbtype', dbtype.to_s, '-parse_seqids']
  IO.popen(cmd, 'w+') do |makedb|
    makedb.sync = true # lowercase literal: the TRUE constant is gone from current Ruby
    makedb.write(seqs)
    makedb.close_write # signals end of input so makeblastdb can finish
    puts makedb.readlines
    makedb.close_read
  end
end
|
@@ -1,15 +1,15 @@
|
|
1
1
|
$: << File.expand_path(File.join(File.dirname(__FILE__)))
|
2
|
-
|
2
|
+
|
3
3
|
require 'scbi_mapreduce'
|
4
4
|
require 'scbi_blast'
|
5
|
-
require 'json'
|
6
5
|
require 'sequence'
|
7
6
|
require 'fl_string_utils'
|
8
|
-
require
|
9
|
-
require
|
10
|
-
|
11
|
-
require '
|
12
|
-
|
7
|
+
require 'test_code'
|
8
|
+
require 'types'
|
9
|
+
require 'artifacts'
|
10
|
+
require 'blast_functions'
|
11
|
+
require 'exonerate_result'
|
12
|
+
require 'scbi_fasta'
|
13
13
|
|
14
14
|
require 'fl_analysis'
|
15
15
|
include FlAnalysis
|
@@ -18,166 +18,336 @@ require 'nc_rna'
|
|
18
18
|
include NcRna
|
19
19
|
|
20
20
|
class MyWorker < ScbiMapreduce::Worker
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
rescue Exception => e
|
26
|
-
puts (e.message+ e.backtrace.join("\n"))
|
27
|
-
|
28
|
-
end
|
29
|
-
|
30
|
-
def receive_initial_config(obj)
|
31
|
-
|
21
|
+
#####################################################################################
|
22
|
+
# WORKER FUNCTIONS
|
23
|
+
#####################################################################################
|
24
|
+
# Keeps the options sent by the manager and publishes the verbosity level.
#
# manager_options:: options object; it is indexed with :verbose here.
# Sets @options and the global $verbose; returns the verbosity value.
def receive_initial_config(manager_options)
  verbosity = manager_options[:verbose]
  @options = manager_options
  $verbose = verbosity
end
|
37
30
|
|
38
|
-
def process_object(
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
31
|
+
# Entry point of FLN for one chunk of work.
#
# obj_sequence:: collection of sequence objects; the first one names the chunk in the log.
# Logs the chunk, runs full_lenghter2 on it and hands the same collection back.
def process_object(obj_sequence)
  chunk_label = obj_sequence.first.seq_name
  $WORKER_LOG.info "Processing chunk: #{chunk_label}"
  full_lenghter2(obj_sequence)
  obj_sequence
end
|
44
37
|
|
45
38
|
# Intentionally empty: there is nothing to release.
# NOTE(review): presumably the scbi_mapreduce hook called when the worker stops; confirm.
def closing_worker; end
|
41
|
+
|
42
|
+
#####################################################################################
|
43
|
+
# FLN FUNCTIONS
|
44
|
+
#####################################################################################
|
45
|
+
|
46
|
+
#----------------------------------------------------------------------------------
|
47
|
+
# MAIN FUNCTION
|
48
|
+
#----------------------------------------------------------------------------------
|
49
|
+
|
50
|
+
# Runs the analysis steps selected in @options over a chunk of sequences.
#
# Steps, in order:
# 1. user protein database (when @options[:user_db] is set)
# 2. the "sp_<tax_group>" database (flag 's' in @options[:acess_db])
# 3. the "tr_<tax_group>" database (flag 't')
# 4. ncRNA search over sequences still typed UNKNOWN (flag 'n')
# 5. TestCode over sequences still typed UNKNOWN (flag 'c')
#
# Each protein step receives only the sequences returned by the previous
# protein step. Returns whatever the last enabled step returns (nil when the
# 'c' step is disabled).
def full_lenghter2(seqs)
  #seqs.map{|seq| seq.change_degenerated_nt!} # Clean degenerated nt

  pending = seqs

  # User database: analysed first when the user supplied one.
  custom_db = @options[:user_db]
  if custom_db
    pending = check_prot_db(seqs, custom_db, 'blastx', 1, File.basename(custom_db), @options[:blast])
  end

  # sp_ and tr_ databases share one layout: <name>/<name>.
  [['s', 'sp_'], ['t', 'tr_']].each do |flag, prefix|
    next unless @options[:acess_db].include?(flag)

    db_name = prefix + @options[:tax_group]
    db_path = File.join(db_name, prefix + @options[:tax_group])
    pending = check_prot_db(pending, db_path, 'blastx', 1, db_name, @options[:blast])
  end

  # nc RNA
  if @options[:acess_db].include?('n')
    unresolved = seqs.select { |s| s.type == UNKNOWN }
    check_ncRNA(unresolved, File.join('nc_rna_db', 'ncrna'), 'blastn', 1e-3)
  end

  # Test Code: sequences without a reliable similarity with an orthologue
  if @options[:acess_db].include?('c')
    check_testcode(seqs.select { |s| s.type == UNKNOWN })
  end
end
|
95
|
+
#----------------------------------------------------------------------------------
|
96
|
+
# END MAIN
|
97
|
+
#----------------------------------------------------------------------------------
|
98
|
+
|
99
|
+
# Blasts a chunk against one protein database and analyses every sequence that got a hit.
#
# seqs::      array of sequence objects; modified in place (entries may be replaced
#             by their analysed version and new sequences may be appended)
# db_path::   database path handed to run_blast
# blast_type, evalue, additional_blast_options:: forwarded to run_blast
# db_name::   label used in verbose output and forwarded to the analysis helpers
#
# For each sequence with at least one hit: the blast record is checked against
# the sequence, artifacts are filtered out, and the remaining ones go through
# the full-length analysis. A failure in any of those steps is reported through
# rescue_sequence and the loop continues with the next sequence.
#
# Returns the sequences that are not ignored, plus COMPLETE ones flagged with
# area_without_annotation.
def check_prot_db(seqs, db_path, blast_type, evalue, db_name, additional_blast_options)
  if $verbose > 0
    puts "\e[33m=========================================\e[0m",
         "\e[33m#{db_name}\t#{seqs.length}\e[0m",
         "\e[33m=========================================\e[0m"
  end
  my_blast = run_blast(seqs, db_path, blast_type, evalue, additional_blast_options, @options[:exonerate]) # do blast
  new_seqs = []
  seqs.each_with_index do |seq, i| # parse blast
    puts "\e[31m#{seq.seq_name}\e[0m" if $verbose > 0 ## VERBOSE
    query = my_blast.querys[i]
    next if query.hits.first.nil?

    status = 'Artifact analysis'
    begin
      check_blast(seq, query) # Check if seq and query are the same
      unless artifact?(seq, query, db_name, db_path, @options, new_seqs)
        status = 'Full length analysis'
        best_hits = filter_hits(query, 100)
        seq = search_best_orf_y_fl(seq, best_hits, @options, db_name) # FULL LENGTH ANALYSIS
        seqs[i] = seq # the loop index is the slot to overwrite; no linear search needed
        seq.area_without_annotation? if @options[:chimera] != 'd' && !seq.hit.nil?
      end
    rescue StandardError => e
      # StandardError only: Interrupt, SystemExit and similar must still be able
      # to stop the worker instead of being recorded as a failed sequence.
      rescue_sequence(e, seq, status)
    end
  end
  seqs.concat(new_seqs)
  seqs.select { |s| !s.ignore || (s.type == COMPLETE && s.area_without_annotation) }
end
|
131
|
+
|
132
|
+
|
133
|
+
# Runs blast using the given input sequences, database, blast type and evalue.
#
# input::                    array of sequence objects (nil or empty => nothing is run)
# database::                 database path; its basename is used for log lines and temp file names
# blast_type::               program name handed to BatchBlast
# evalue::                   e-value threshold
# additional_blast_options:: extra command-line options appended after -evalue
# do_exonerate::             when truthy, hits are refined with refine_analysis_with_exonerate
# filter::                   when truthy (default), hits are cleaned with clean_by_identity using
#                            @options[:ident]; callers disable it for RepTrans with my_workerEST
#
# With @options[:hdd] the table output is kept under temp/ and reused if it
# already exists. Returns the blast result, or nil when there is no input.
def run_blast(input, database, blast_type, evalue, additional_blast_options, do_exonerate, filter = true)
  # nil is tested before empty?: the reverse order raised NoMethodError on a nil input.
  return nil if input.nil? || input.empty?

  $WORKER_LOG.info "DB: #{File.basename(database)} #{input.length}"
  blast = BatchBlast.new("-db #{database}", blast_type, "-evalue #{evalue} #{additional_blast_options}")
  chunk_name = input.first.seq_name.gsub(/\W+/, '_')
  file_path = File.join('temp', File.basename(database) + '_' + chunk_name)
  if @options[:hdd] # Write/parse blast on disk
    # Each blast is identified with the database name and the first sequence's name of the chunk
    file_name = file_path + '.blast'
    # File.exist? instead of File.exists?: the plural form was removed in Ruby 3.2.
    if File.exist?(file_name)
      blast_result = BlastTableResult.new(file_name)
    else
      blast_result = blast.do_blast_seqs(input, :table, true, file_name)
    end
  else
    blast_result = blast.do_blast_seqs(input, :table)
  end
  refine_analysis_with_exonerate(blast_result, input, file_path, database, @options[:ident]) if do_exonerate
  if filter # Delete hits with low identity
    clean_by_identity(blast_result, @options[:ident])
    #clean_by_query_length_match(blast_result, 1000)#60 is min length of the match in nt
  end
  $WORKER_LOG.info "#BLAST ENDED"
  blast_result
end
|
62
162
|
|
163
|
+
# Marks a sequence as failed and reports the error on standard output.
#
# e::      the exception caught during the analysis
# seq::    sequence object; save_fasta is cleared, ignore is set, type becomes FAILED
# status:: name of the analysis stage that was running, printed in the report
def rescue_sequence(e, seq, status)
  # Lowercase literals: the TRUE/FALSE constants are gone from current Ruby.
  seq.save_fasta = false
  seq.ignore = true
  seq.type = FAILED
  puts '-- ' + seq.seq_name + ' FAILED ANALYSIS -- ' + status,
       e.message,
       Array(e.backtrace).join("\n") # backtrace is nil for an exception that was never raised
end
|
171
|
+
|
172
|
+
# Blasts the given sequences against the ncRNA database and annotates them.
#
# check_seqs:: sequences to test
# ncrna_path:: database path handed to run_blast
# blast_type, evalue:: forwarded to run_blast
#
# The blast runs without extra options, without exonerate and without the
# identity filter. Every sequence is passed to find_nc_rna together with the
# blast query found at the same position. Nothing is done when run_blast
# returns nil.
def check_ncRNA(check_seqs, ncrna_path, blast_type, evalue)
  # false literal: the FALSE constant is gone from current Ruby.
  my_blast = run_blast(check_seqs, ncrna_path, blast_type, evalue, '', false, nil)
  return if my_blast.nil?

  check_seqs.each_with_index do |seq, i|
    find_nc_rna(seq, my_blast.querys[i])
  end
end
|
63
180
|
|
64
|
-
def
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
181
|
+
# Builds one TestCode object per sequence and returns them as an array.
def check_testcode(check_seqs)
  check_seqs.collect do |sequence|
    TestCode.new(sequence)
  end
end
|
184
|
+
|
185
|
+
# Guards against a sequence being paired with the blast record of another query.
#
# Raises RuntimeError when the sequence name differs from the query definition
# of the blast record; returns nil otherwise.
def check_blast(seq, blast_query)
  return unless seq.seq_name != blast_query.query_def

  raise "BLAST query name and sequence are different"
end
|
190
|
+
|
191
|
+
# Chooses the best ORF / full-length annotation for a sequence among its candidate hits.
#
# seq::       sequence object to analyse
# best_hits:: array of candidate hits (each one is what analiza_orf_y_fl expects as hit)
# options, db_name:: forwarded to analiza_orf_y_fl
#
# With several hits, each one is analysed on a clone of seq. Clones whose type
# is not above UNKNOWN are dropped; among the rest the lowest type wins,
# clones with a truthy status are preferred, and the highest hit.ident breaks
# the tie. The winner gets a 'PositionResult' warning with its 1-based position
# among the retained clones. If no clone is retained, seq itself is returned.
# With a single hit, seq is analysed directly and gets a 'SingleResult' warning.
#
# A seq left as FAILED is reset to UNKNOWN / not ignored and gets no warning.
# Returns the chosen sequence object.
def search_best_orf_y_fl(seq, best_hits, options, db_name)
  warning = nil
  if best_hits.length > 1
    candidates = best_hits.map do |hit|
      candidate = seq.clone
      puts "\n\t\e[35mCheck protein #{hit.first.acc}\e[0m" if $verbose > 1 ## VERBOSE
      analiza_orf_y_fl(candidate, hit, options, db_name)
      candidate
    end
    candidates = candidates.select { |candidate| candidate.type > UNKNOWN }
    if candidates.empty?
      best_option = seq
    else
      top_type = candidates.map { |candidate| candidate.type }.min
      same_type = candidates.select { |candidate| candidate.type == top_type }
      preferred = same_type.select { |candidate| candidate.status } # sure options
      preferred = same_type if preferred.empty? # all options are putative
      # keep the hit with the highest percentage of identity
      best_option = preferred.sort { |left, right| right.hit.ident <=> left.hit.ident }.first
      warning = [['PositionResult', candidates.index(best_option) + 1]]
    end
  else
    analiza_orf_y_fl(seq, best_hits.first, options, db_name)
    best_option = seq
    warning = 'SingleResult'
  end

  if seq.type == FAILED
    seq.type = UNKNOWN
    seq.ignore = FALSE
  elsif !warning.nil?
    best_option.warnings(warning)
  end
  return best_option
end
|
227
|
+
|
228
|
+
# Re-aligns with exonerate the query/subject pairs that blast reported as several HSPs.
#
# blast_result:: blast result; its hits are updated in place through replace_hits
# input::        array of sequence objects, indexed by query position
# file_path::    common prefix of the work files (.dna, .prot, .exonerate)
# database::     database from which the subject proteins are extracted
# ident::        accepted for interface compatibility; not used here
#
# Nothing is done when no pair has more than one HSP. An existing .exonerate
# file is reused instead of running exonerate again.
def refine_analysis_with_exonerate(blast_result, input, file_path, database, ident)
  querys_stats = hits_statistics(blast_result)
  return if querys_stats.empty?

  querys, targets = select_sequences(querys_stats)
  return if querys.empty? || targets.empty?

  write_querys(querys, input, file_path)
  write_targets(targets, file_path, database)
  file_name = file_path + '.exonerate'
  # File.exist? instead of File.exists?: the plural form was removed in Ruby 3.2.
  unless File.exist?(file_name)
    # Argument-vector form with an :out redirection: no shell is involved, so
    # file names with spaces or metacharacters are passed through untouched.
    system('exonerate', '--useaatla', '0', '--showalignment', '0', '--model', 'protein2dna',
           file_path + '.prot', file_path + '.dna', :out => file_name)
  end
  seqs = {}
  querys.each { |position| seqs[input[position].seq_name] = input[position].seq_fasta }
  exonerate_result = ExonerateResult.new(file_name, seqs, get_prot_sequences(file_path))
  clean_subjec_ids_name(exonerate_result)
  replace_hits(blast_result, exonerate_result)
end
|
245
|
+
|
246
|
+
# Swaps fragmented blast hits for their exonerate counterparts.
#
# For every blast query that also appears in the exonerate result, hits are
# grouped with cluster_hsps. A blast group with more than one HSP is replaced
# by the exonerate group of the same accession when that group has fewer HSPs
# (the assumption being that exonerate merged them). Replacement hits inherit
# s_len, q_len and definition from the first blast HSP. Queries missing from
# the exonerate result are left untouched.
def replace_hits(blast_result, exonerate_result)
  blast_result.querys.each do |query|
    exonerate_query = blast_result.find_query(exonerate_result.querys, query.query_def)
    next if exonerate_query.nil?

    blast_hits = cluster_hsps(query.hits)
    exonerate_hits = cluster_hsps(exonerate_query.hits)
    blast_hits.map! do |hsps|
      fragments = hsps.length
      merged = fragments > 1 ? find_hit(hsps.first.acc, exonerate_hits) : nil
      if merged.nil? || merged.length >= fragments
        hsps
      else
        reference = hsps.first
        merged.each do |ex_hit|
          ex_hit.s_len = reference.s_len
          ex_hit.q_len = reference.q_len
          ex_hit.definition = reference.definition
        end
        merged
      end
    end
    query.hits = blast_hits.flatten
  end
end
|
106
274
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
275
|
+
# Removes the first 'lcl|' from the subject_id and acc of every exonerate hit, in place.
def clean_subjec_ids_name(exonerate_result)
  exonerate_result.querys.each do |query|
    query.hits.each do |hit|
      hit.subject_id.sub!('lcl|', '')
      hit.acc.sub!('lcl|', '')
    end
  end
end
|
283
|
+
|
284
|
+
# Counts, per query, how many hits each subject accession has.
#
# Returns an Array indexed by query position. Each used slot is a Hash
# accession => number of hits; queries without hits leave a nil gap (or no
# slot at all when they come after the last query with hits).
def hits_statistics(blast_result)
  stats = []
  blast_result.querys.each_with_index do |query, position|
    query.hits.each do |hit|
      per_subject = (stats[position] ||= {})
      per_subject[hit.acc] = per_subject.fetch(hit.acc, 0) + 1
    end
  end
  stats
end
|
303
|
+
|
304
|
+
# Picks the queries and subjects involved in hits that have more than one HSP.
#
# querys_stats:: array as built by hits_statistics (nil slots are skipped)
#
# Returns [querys, targets]: the positions of the queries and the subject ids
# that have at least one pair with more than one HSP, without duplicates and
# in order of first appearance.
def select_sequences(querys_stats)
  querys = []
  targets = []
  querys_stats.each_with_index do |hits, query_position|
    next if hits.nil?

    hits.each do |hit_id, n_hsps|
      next unless n_hsps > 1

      querys << query_position unless querys.include?(query_position)
      targets << hit_id unless targets.include?(hit_id)
    end
  end
  [querys, targets]
end
|
319
|
+
|
320
|
+
# Writes the selected query sequences to "<file_path>.dna" in FASTA format.
#
# querys::    positions (indexes into input) of the sequences to write
# input::     array of sequence objects answering seq_name and seq_fasta
# file_path:: prefix of the output file
#
# An existing file is left untouched. Returns nil.
def write_querys(querys, input, file_path)
  file_name = file_path + '.dna'
  # File.exist? instead of File.exists?: the plural form was removed in Ruby 3.2.
  return if File.exist?(file_name)

  # Block form closes the file even if a sequence makes the write fail.
  File.open(file_name, 'w') do |fasta|
    querys.each do |query_position|
      seq = input[query_position]
      fasta.puts ">#{seq.seq_name}\n#{seq.seq_fasta}"
    end
  end
  nil
end
|
331
|
+
|
332
|
+
# Extracts the target proteins from a BLAST database into "<file_path>.prot".
#
# targets::   subject ids to extract
# file_path:: prefix of the output file
# database::  database handed to blastdbcmd
#
# A notice is printed when there is nothing to extract. An existing file is
# left untouched.
def write_targets(targets, file_path, database)
  puts "-- This batch has not unigenes for exonerate: #{file_path}" if targets.empty?
  file_name = file_path + '.prot'
  # File.exist? instead of File.exists?: the plural form was removed in Ruby 3.2.
  return if File.exist?(file_name)

  # Entries are requested in slices so the command line never grows too long.
  targets.each_slice(400) do |slice|
    # Argument-vector form with an append redirection: no shell is involved, so
    # ids or paths with metacharacters cannot alter the command.
    system('blastdbcmd', '-db', database.to_s, '-entry', slice.join(','), :out => [file_name, 'a'])
  end
end
|
342
|
+
|
343
|
+
# Loads "<file_path>.prot" into a Hash of sequence name => sequence.
#
# file_path:: prefix of the protein FASTA file
# Returns the Hash; a name that appears twice keeps its last sequence.
def get_prot_sequences(file_path)
  sequences = {}
  fqr = FastaQualFile.new(file_path + '.prot')
  begin
    fqr.each do |name, seq_fasta|
      sequences[name] = seq_fasta
    end
  ensure
    # Release the reader even when the iteration fails.
    fqr.close
  end
  sequences
end
|
181
|
-
|
182
353
|
end
|
183
|
-
|