full_lengther_next 0.0.8 → 0.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/History.txt +2 -2
- data/Manifest.txt +33 -18
- data/Rakefile +4 -2
- data/bin/download_fln_dbs.rb +310 -158
- data/bin/full_lengther_next +160 -103
- data/bin/make_test_dataset.rb +236 -0
- data/bin/make_user_db.rb +101 -117
- data/bin/plot_fln.rb +270 -0
- data/bin/plot_taxonomy.rb +70 -0
- data/lib/expresscanvas.zip +0 -0
- data/lib/full_lengther_next.rb +3 -3
- data/lib/full_lengther_next/classes/artifacts.rb +66 -0
- data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
- data/lib/full_lengther_next/classes/cdhit.rb +154 -0
- data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
- data/lib/full_lengther_next/classes/common_functions.rb +105 -63
- data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
- data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
- data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
- data/lib/full_lengther_next/classes/handle_db.rb +30 -0
- data/lib/full_lengther_next/classes/my_worker.rb +308 -138
- data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
- data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
- data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
- data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
- data/lib/full_lengther_next/classes/reptrans.rb +210 -0
- data/lib/full_lengther_next/classes/sequence.rb +439 -80
- data/lib/full_lengther_next/classes/test_code.rb +15 -16
- data/lib/full_lengther_next/classes/types.rb +12 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
- data/lib/full_lengther_next/classes/warnings.rb +40 -0
- metadata +207 -93
- data/lib/full_lengther_next/classes/lcs.rb +0 -33
- data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'scbi_fasta'
|
2
|
+
|
3
|
+
|
4
|
+
# Groups the records of a FASTA file by the identifier captured from each
# record name by the pattern "<word>|<identifier>-<digits>|".
#
# file - path of the FASTA file to read.
#
# Returns a Hash mapping each identifier to a FASTA-formatted String holding
# every record that shares it (records separated by a newline). The header of
# each record is the matched "<word>|<identifier>-<digits>|" prefix followed
# by the record description. Returns an empty Hash when the file does not exist.
def load_isoform_hash(file)
  isoform_hash = {}
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  return isoform_hash unless File.exist?(file)

  fasta = FastaQualFile.new(file)
  begin
    fasta.each do |name, seq, desc|
      match = /(\w+\|(\w+)\-\d+\|)/.match(name)
      # Names that do not follow the pattern used to be stored under a nil key
      # with an empty header prefix; skip them instead.
      next if match.nil?

      record = ">#{match[1]}#{desc}\n#{seq}"
      if isoform_hash.key?(match[2])
        isoform_hash[match[2]] << "\n" << record
      else
        isoform_hash[match[2]] = record
      end
    end
  ensure
    # Release the reader even when parsing raises.
    fasta.close
  end
  isoform_hash
end
|
20
|
+
|
21
|
+
# Builds a BLAST database by piping sequences into the `makeblastdb` command.
#
# seqs   - data written to the standard input of makeblastdb
#          (presumably FASTA-formatted text; verify against callers).
# output - value passed to -out; its basename is also used as the -title.
# dbtype - value passed to -dbtype.
#
# Whatever makeblastdb prints on its standard output is echoed with puts.
def do_makeblastdb(seqs, output, dbtype)
  cmd = "makeblastdb -in - -out #{output} -title #{File.basename(output)} -dbtype #{dbtype} -parse_seqids"
  IO.popen(cmd, 'w+') do |makedb|
    # The TRUE constant was removed in Ruby 3.2; use the literal.
    makedb.sync = true
    makedb.write(seqs)
    # Closing the write end signals end of input before the output is read.
    makedb.close_write
    puts makedb.readlines
    makedb.close_read
  end
end
|
@@ -1,15 +1,15 @@
|
|
1
1
|
$: << File.expand_path(File.join(File.dirname(__FILE__)))
|
2
|
-
|
2
|
+
|
3
3
|
require 'scbi_mapreduce'
|
4
4
|
require 'scbi_blast'
|
5
|
-
require 'json'
|
6
5
|
require 'sequence'
|
7
6
|
require 'fl_string_utils'
|
8
|
-
require
|
9
|
-
require
|
10
|
-
|
11
|
-
require '
|
12
|
-
|
7
|
+
require 'test_code'
|
8
|
+
require 'types'
|
9
|
+
require 'artifacts'
|
10
|
+
require 'blast_functions'
|
11
|
+
require 'exonerate_result'
|
12
|
+
require 'scbi_fasta'
|
13
13
|
|
14
14
|
require 'fl_analysis'
|
15
15
|
include FlAnalysis
|
@@ -18,166 +18,336 @@ require 'nc_rna'
|
|
18
18
|
include NcRna
|
19
19
|
|
20
20
|
class MyWorker < ScbiMapreduce::Worker
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
rescue Exception => e
|
26
|
-
puts (e.message+ e.backtrace.join("\n"))
|
27
|
-
|
28
|
-
end
|
29
|
-
|
30
|
-
def receive_initial_config(obj)
|
31
|
-
|
21
|
+
#####################################################################################
|
22
|
+
# WORKER FUNCTIONS
|
23
|
+
#####################################################################################
|
24
|
+
# Stores the options sent by the manager and publishes the verbosity level.
#
# manager_options - object indexed with symbols; kept in @options, and its
#                   :verbose entry is copied into the $verbose global.
def receive_initial_config(manager_options)
  # $WORKER_LOG.info "Params received: #{obj.to_json}"
  @options = manager_options
  $verbose = @options[:verbose]
end
|
37
30
|
|
38
|
-
def process_object(
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
31
|
+
# FLN entry point for one chunk of work.
#
# obj_sequence - collection of sequences; the name of its first element is
#                logged, then the whole chunk is passed to full_lenghter2.
#
# Returns the same obj_sequence object that was received.
def process_object(obj_sequence)
  first_name = obj_sequence.first.seq_name
  $WORKER_LOG.info "Processing chunk: #{first_name}"
  full_lenghter2(obj_sequence)
  obj_sequence
end
|
44
37
|
|
45
38
|
# Intentionally empty: this worker has nothing to do here.
# NOTE(review): presumably a shutdown hook called by ScbiMapreduce - confirm.
def closing_worker; end
|
41
|
+
|
42
|
+
#####################################################################################
|
43
|
+
# FLN FUNCTIONS
|
44
|
+
#####################################################################################
|
45
|
+
|
46
|
+
#----------------------------------------------------------------------------------
|
47
|
+
# MAIN FUNCTION
|
48
|
+
#----------------------------------------------------------------------------------
|
49
|
+
|
50
|
+
# Main analysis routine: runs every enabled stage over a chunk of sequences.
#
# seqs - collection of sequences to analyse.
#
# Stages, in order (the letters are looked up in @options[:acess_db]):
#   1. user database, only when @options[:user_db] is set
#   2. 's' -> sp_<tax_group> protein database
#   3. 't' -> tr_<tax_group> protein database
#   4. 'n' -> ncRNA search over sequences whose type is still UNKNOWN
#   5. 'c' -> TestCode over sequences whose type is still UNKNOWN
# Each protein stage only receives what the previous protein stage returned.
def full_lenghter2(seqs)
  #seqs.map{|seq| seq.change_degenerated_nt!} # Clean degenerated nt

  pending = seqs

  # User database: analysed first when one was given in the parameters
  if @options[:user_db]
    user_db = File.basename(@options[:user_db])
    pending = check_prot_db(seqs, @options[:user_db], 'blastx', 1, user_db, @options[:blast])
  end

  # UniProt sp and tr: same procedure, only the database prefix changes
  [['s', 'sp_'], ['t', 'tr_']].each do |flag, prefix|
    next unless @options[:acess_db].include?(flag)

    db_name = prefix + @options[:tax_group]
    db_path = File.join(db_name, prefix + @options[:tax_group])
    pending = check_prot_db(pending, db_path, 'blastx', 1, db_name, @options[:blast])
  end

  # nc RNA
  if @options[:acess_db].include?('n')
    unknown = seqs.select { |s| s.type == UNKNOWN }
    check_ncRNA(unknown, File.join('nc_rna_db', 'ncrna'), 'blastn', 1e-3)
  end

  # Test Code: applied to the sequences that still have no reliable similarity
  check_testcode(seqs.select { |s| s.type == UNKNOWN }) if @options[:acess_db].include?('c')
end
|
95
|
+
#----------------------------------------------------------------------------------
|
96
|
+
# END MAIN
|
97
|
+
#----------------------------------------------------------------------------------
|
98
|
+
|
99
|
+
# BLASTs a set of sequences against one protein database and runs the
# artifact and full-length analyses on every sequence that obtained a hit.
#
# seqs                     - sequences to analyse; modified in place (entries
#                            are replaced and new sequences are appended).
# db_path                  - database path handed to run_blast.
# blast_type               - BLAST program name handed to run_blast.
# evalue                   - e-value handed to run_blast.
# db_name                  - database label used in messages and analyses.
# additional_blast_options - extra BLAST options handed to run_blast.
#
# Returns the sequences that are not ignored, plus those whose type is
# COMPLETE and that report an area without annotation.
def check_prot_db(seqs, db_path, blast_type, evalue, db_name, additional_blast_options)
  if $verbose > 0
    banner = "\e[33m=========================================\e[0m"
    puts banner, "\e[33m#{db_name}\t#{seqs.length}\e[0m", banner
  end

  my_blast = run_blast(seqs, db_path, blast_type, evalue, additional_blast_options, @options[:exonerate]) # do blast
  new_seqs = []
  seqs.each_with_index do |seq, i| # parse blast
    puts "\e[31m#{seq.seq_name}\e[0m" if $verbose > 0 ## VERBOSE
    next if my_blast.querys[i].hits.first.nil? # no hits: nothing to analyse

    status = 'Artifact analysis'
    begin
      query = my_blast.querys[i]
      check_blast(seq, query) # Check that the sequence and the query are the same
      unless artifact?(seq, query, db_name, db_path, @options, new_seqs)
        status = 'Full length analysis'
        best_hits = filter_hits(query, 100)
        record_position = seqs.index(seq)
        seq = search_best_orf_y_fl(seq, best_hits, @options, db_name) # FULL LENGTH ANALYSIS
        seqs[record_position] = seq # Replace the old seq by the new seq
        seq.area_without_annotation? if @options[:chimera] != 'd' && !seq.hit.nil?
      end
    rescue Exception => e
      # NOTE(review): Exception (not only StandardError) is rescued here, so
      # any failure marks the sequence as failed instead of stopping the worker.
      rescue_sequence(e, seq, status)
    end
  end

  seqs.concat(new_seqs)
  seqs.select { |s| !s.ignore || (s.type == COMPLETE && s.area_without_annotation) }
end
|
131
|
+
|
132
|
+
|
133
|
+
# Runs BLAST using the given input sequences, database, BLAST type and e-value
|
134
|
+
# Runs BLAST for a chunk of sequences and returns the parsed result.
#
# input                    - sequences to search; nil or empty means no work.
# database                 - database path, passed to BLAST as "-db <database>".
# blast_type               - BLAST program name given to BatchBlast.
# evalue                   - e-value threshold, passed as "-evalue <evalue>".
# additional_blast_options - extra command-line options appended after -evalue.
# do_exonerate             - when truthy, refine_analysis_with_exonerate is run
#                            on the result.
# filter                   - when truthy (default), clean_by_identity is applied
#                            with @options[:ident].
#
# When @options[:hdd] is set the BLAST table is written to / reused from
# temp/<database basename>_<sanitised first sequence name>.blast.
#
# Returns the BLAST result object, or nil when there is nothing to search.
def run_blast(input, database, blast_type, evalue, additional_blast_options, do_exonerate, filter = true)
  # The nil test must come first: the original asked empty? before nil?, so a
  # nil input raised NoMethodError instead of returning nil.
  return nil if input.nil? || input.empty?

  $WORKER_LOG.info "DB: #{File.basename(database)} #{input.length}"
  blast = BatchBlast.new("-db #{database}", blast_type, "-evalue #{evalue} #{additional_blast_options}")
  chunk_name = input.first.seq_name.gsub(/\W+/, '_')
  file_path = File.join('temp', File.basename(database) + '_' + chunk_name)

  if @options[:hdd] # Write/parse blast on disk
    # Each blast is identified by the database name and the first sequence name of the chunk
    file_name = file_path + '.blast'
    if File.exist?(file_name) # File.exists? was removed in Ruby 3.2
      blast_result = BlastTableResult.new(file_name)
    else
      blast_result = blast.do_blast_seqs(input, :table, true, file_name)
    end
  else
    blast_result = blast.do_blast_seqs(input, :table)
  end

  refine_analysis_with_exonerate(blast_result, input, file_path, database, @options[:ident]) if do_exonerate

  # Delete hits with low identity. Enabled on normal FLN execution and
  # disabled when RepTrans runs through my_workerEST.
  clean_by_identity(blast_result, @options[:ident]) if filter
  #clean_by_query_length_match(blast_result, 1000)#60 is min length of the match in nt

  $WORKER_LOG.info "#BLAST ENDED"
  blast_result
end
|
62
162
|
|
163
|
+
# Marks a sequence as failed after an analysis error and reports the error.
#
# e      - the rescued exception.
# seq    - sequence being analysed; save_fasta is cleared, ignore is set and
#          its type becomes FAILED.
# status - name of the analysis stage that was running.
#
# Prints the sequence name and stage, the exception message and the backtrace.
def rescue_sequence(e, seq, status)
  # TRUE/FALSE constants were removed in Ruby 3.2; use the literals.
  seq.save_fasta = false
  seq.ignore = true
  seq.type = FAILED
  puts "-- #{seq.seq_name} FAILED ANALYSIS -- #{status}",
       e.message,
       Array(e.backtrace).join("\n") # backtrace is nil for never-raised exceptions
end
|
171
|
+
|
172
|
+
# Searches a set of sequences against the ncRNA database.
#
# check_seqs - sequences to search.
# ncrna_path - database path handed to run_blast.
# blast_type - BLAST program name handed to run_blast.
# evalue     - e-value handed to run_blast.
#
# run_blast is called without extra options, without exonerate refinement
# and with the identity filter disabled; find_nc_rna is then called for each
# sequence with the query found at the same position.
#
# Returns nil when run_blast produced no result.
def check_ncRNA(check_seqs, ncrna_path, blast_type, evalue)
  # The FALSE constant was removed in Ruby 3.2; use the literal.
  my_blast = run_blast(check_seqs, ncrna_path, blast_type, evalue, '', false, nil)
  return nil if my_blast.nil?

  check_seqs.each_with_index do |seq, i|
    find_nc_rna(seq, my_blast.querys[i])
  end
end
|
63
180
|
|
64
|
-
def
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
181
|
+
# Builds one TestCode object per sequence.
#
# check_seqs - sequences to process.
#
# Returns the TestCode objects in the same order as the input.
def check_testcode(check_seqs)
  check_seqs.map(&TestCode.method(:new))
end
|
184
|
+
|
185
|
+
# Verifies that a BLAST query belongs to the given sequence.
#
# seq         - sequence whose seq_name is compared.
# blast_query - BLAST query whose query_def is compared.
#
# Returns nil when the names match.
# Raises RuntimeError when the names differ.
def check_blast(seq, blast_query)
  return if seq.seq_name == blast_query.query_def

  # The sequence and the BLAST record come from different queries
  raise "BLAST query name and sequence are different"
end
|
190
|
+
|
191
|
+
# Runs the ORF / full-length analysis against the candidate hits of a
# sequence and returns the best analysed version of it.
#
# seq       - sequence to analyse.
# best_hits - candidate hits. With a single candidate, seq itself is
#             analysed; with several, a clone of seq is analysed per candidate.
# options   - options handed to analiza_orf_y_fl.
# db_name   - database label handed to analiza_orf_y_fl.
#
# With several candidates, only clones whose type is greater than UNKNOWN are
# considered; among those with the lowest type value, clones with a truthy
# status are preferred and the one with the highest hit.ident is chosen. If
# no clone qualifies, seq is returned.
#
# A 'PositionResult' (1-based position of the chosen clone among the
# qualifying ones) or 'SingleResult' warning is recorded on the returned
# sequence, unless seq ended up as FAILED, in which case seq is reset to
# UNKNOWN and un-ignored instead.
#
# Returns the chosen sequence object.
def search_best_orf_y_fl(seq, best_hits, options, db_name)
  warning = nil
  if best_hits.length > 1
    all_options = []
    best_hits.each do |hit|
      new_seq = seq.clone
      puts "\n\t\e[35mCheck protein #{hit.first.acc}\e[0m" if $verbose > 1 ## VERBOSE
      analiza_orf_y_fl(new_seq, hit, options, db_name)
      all_options << new_seq
    end
    all_options.select! { |option| option.type > UNKNOWN }
    best_type = all_options.map { |option| option.type }.min
    best_options = all_options.select { |option| option.type == best_type }
    filtered_options = best_options.select { |option| option.status } # Select sure options
    filtered_options = best_options if filtered_options.empty? # All options are putative
    # Select the hit with the highest identity percentage
    best_option = filtered_options.sort { |seq1, seq2| seq2.hit.ident <=> seq1.hit.ident }.first
    if all_options.empty?
      best_option = seq # No candidate produced a usable result
    else
      warning = [['PositionResult', all_options.index(best_option) + 1]]
    end
  else
    analiza_orf_y_fl(seq, best_hits.first, options, db_name)
    best_option = seq
    warning = 'SingleResult'
  end

  if seq.type == FAILED
    seq.type = UNKNOWN
    seq.ignore = false # the FALSE constant was removed in Ruby 3.2
  else
    best_option.warnings(warning) unless warning.nil?
  end
  best_option
end
|
227
|
+
|
228
|
+
# Re-aligns with exonerate the queries that have several HSPs against the
# same subject and merges the outcome back into the BLAST result.
#
# blast_result - BLAST result to refine (its hits may be replaced).
# input        - sequences of the chunk, indexed by query position.
# file_path    - path prefix for the .dna, .prot and .exonerate files.
# database     - BLAST database from which the protein targets are extracted.
# ident        - not used by this method.
#
# The exonerate command is only run when the .exonerate file does not exist.
#
# Returns nil when there is nothing to refine, otherwise the value returned
# by replace_hits.
def refine_analysis_with_exonerate(blast_result, input, file_path, database, ident)
  querys_stats = hits_statistics(blast_result)
  return nil if querys_stats.empty?

  querys, targets = select_sequences(querys_stats)
  return nil if querys.empty? || targets.empty?

  write_querys(querys, input, file_path)
  write_targets(targets, file_path, database)
  file_name = file_path + '.exonerate'
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  unless File.exist?(file_name)
    system("exonerate --useaatla 0 --showalignment 0 --model protein2dna #{file_path+'.prot'} #{file_path+'.dna'} > #{file_name}")
  end

  # Nucleotide sequences of the selected queries, keyed by sequence name
  seqs = {}
  querys.each { |position| seqs[input[position].seq_name] = input[position].seq_fasta }

  exonerate_result = ExonerateResult.new(file_name, seqs, get_prot_sequences(file_path))
  clean_subjec_ids_name(exonerate_result)
  replace_hits(blast_result, exonerate_result)
end
|
245
|
+
|
246
|
+
# Replaces fragmented BLAST hits by their exonerate counterparts.
#
# blast_result     - BLAST result whose queries are updated in place.
# exonerate_result - exonerate result providing the replacement hits.
#
# For each BLAST query that also appears in the exonerate result, hits are
# clustered with cluster_hsps. A cluster with more than one HSP is swapped
# for the exonerate cluster of the same accession when that one has fewer
# HSPs (it is assumed that exonerate has merged them). The replacement HSPs
# inherit s_len, q_len and definition from the first BLAST HSP.
def replace_hits(blast_result, exonerate_result)
  blast_result.querys.each do |query|
    exonerate_query = blast_result.find_query(exonerate_result.querys, query.query_def)
    next if exonerate_query.nil?

    blast_hits = cluster_hsps(query.hits)
    exonerate_hits = cluster_hsps(exonerate_query.hits)
    blast_hits.map! do |hsps|
      hsp_count = hsps.length
      candidate = hsp_count > 1 ? find_hit(hsps.first.acc, exonerate_hits) : nil
      if candidate.nil? || candidate.length >= hsp_count
        hsps # keep the BLAST cluster untouched
      else
        candidate.each do |ex_hsp|
          ex_hsp.s_len = hsps.first.s_len
          ex_hsp.q_len = hsps.first.q_len
          ex_hsp.definition = hsps.first.definition
        end
        candidate
      end
    end
    query.hits = blast_hits.flatten
  end
end
|
106
274
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
275
|
+
# Removes the first 'lcl|' occurrence from the subject_id and acc strings of
# every hit in an exonerate result. The strings are modified in place.
#
# exonerate_result - result whose queries and hits are visited.
def clean_subjec_ids_name(exonerate_result)
  prefix = 'lcl|'
  exonerate_result.querys.each do |query|
    query.hits.each do |hit|
      hit.subject_id.sub!(prefix, '')
      hit.acc.sub!(prefix, '')
    end
  end
end
|
283
|
+
|
284
|
+
# Counts, for every query, how many hits each subject accession has.
#
# blast_result - result whose queries and hits are visited.
#
# Returns an Array indexed by query position. Each used position holds a Hash
# of accession => number of hits; positions of queries without hits stay nil
# (and trailing queries without hits do not extend the Array).
def hits_statistics(blast_result)
  querys_stats = []
  blast_result.querys.each_with_index do |query, ind|
    query.hits.each do |hit|
      counts = (querys_stats[ind] ||= {})
      counts[hit.acc] = counts.fetch(hit.acc, 0) + 1
    end
  end
  querys_stats
end
|
303
|
+
|
304
|
+
# Selects the queries and subjects that have more than one hit in common.
#
# querys_stats - Array indexed by query position whose entries are nil or a
#                Hash of subject id => number of hits.
#
# Returns two Arrays: the positions of the queries having at least one
# subject with more than one hit, and the ids of those subjects without
# repetitions, both in order of first appearance.
def select_sequences(querys_stats)
  querys = []
  targets = []
  querys_stats.each_with_index do |hits, query_position|
    next if hits.nil?

    repeated_ids = hits.select { |_hit_id, n_hsps| n_hsps > 1 }.map { |hit_id, _n_hsps| hit_id }
    next if repeated_ids.empty?

    querys << query_position
    targets.concat(repeated_ids)
  end
  return querys, targets.uniq
end
|
319
|
+
|
320
|
+
# Writes the selected query sequences to "<file_path>.dna" in FASTA format.
# Nothing is written when that file already exists.
#
# querys    - positions (indexes into input) of the sequences to write.
# input     - sequences of the chunk; each one supplies seq_name and seq_fasta.
# file_path - path prefix of the output file.
#
# Returns nil.
def write_querys(querys, input, file_path)
  file_name = file_path + '.dna'
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  return nil if File.exist?(file_name)

  # The block form closes the file even if writing a record raises.
  File.open(file_name, 'w') do |fasta|
    querys.each do |query_position|
      seq = input[query_position]
      fasta.puts ">#{seq.seq_name}\n#{seq.seq_fasta}"
    end
  end
  nil
end
|
331
|
+
|
332
|
+
# Extracts the target entries from a BLAST database into "<file_path>.prot"
# using blastdbcmd. Nothing is extracted when that file already exists.
#
# targets   - ids of the database entries to extract.
# file_path - path prefix of the output file.
# database  - BLAST database given to blastdbcmd through -db.
#
# A message is printed when targets is empty.
def write_targets(targets, file_path, database)
  puts "-- This batch has not unigenes for exonerate: #{file_path}" if targets.empty?
  file_name = file_path + '.prot'
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  return nil if File.exist?(file_name)

  # Slices keep each command line short when the list of entries is huge.
  targets.each_slice(400) do |slice|
    # The arguments are passed as a list so that no shell parses them: entry
    # ids containing characters such as '|' reach blastdbcmd untouched. The
    # output is appended to file_name, as the '>>' redirection used to do.
    system('blastdbcmd', '-db', database, '-entry', slice.join(','),
           :out => [file_name, File::WRONLY | File::CREAT | File::APPEND, 0644])
  end
end
|
342
|
+
|
343
|
+
# Loads the sequences stored in "<file_path>.prot".
#
# file_path - path prefix of the file to read.
#
# Returns a Hash of sequence name => sequence, as yielded by FastaQualFile.
def get_prot_sequences(file_path)
  sequences = {}
  fqr = FastaQualFile.new(file_path + '.prot')
  begin
    fqr.each do |name, seq_fasta|
      sequences[name] = seq_fasta
    end
  ensure
    # Release the reader even when iteration raises.
    fqr.close
  end
  sequences
end
|
181
|
-
|
182
353
|
end
|
183
|
-
|