full_lengther_next 0.0.8 → 0.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/History.txt +2 -2
- data/Manifest.txt +33 -18
- data/Rakefile +4 -2
- data/bin/download_fln_dbs.rb +310 -158
- data/bin/full_lengther_next +160 -103
- data/bin/make_test_dataset.rb +236 -0
- data/bin/make_user_db.rb +101 -117
- data/bin/plot_fln.rb +270 -0
- data/bin/plot_taxonomy.rb +70 -0
- data/lib/expresscanvas.zip +0 -0
- data/lib/full_lengther_next.rb +3 -3
- data/lib/full_lengther_next/classes/artifacts.rb +66 -0
- data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
- data/lib/full_lengther_next/classes/cdhit.rb +154 -0
- data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
- data/lib/full_lengther_next/classes/common_functions.rb +105 -63
- data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
- data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
- data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
- data/lib/full_lengther_next/classes/handle_db.rb +30 -0
- data/lib/full_lengther_next/classes/my_worker.rb +308 -138
- data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
- data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
- data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
- data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
- data/lib/full_lengther_next/classes/reptrans.rb +210 -0
- data/lib/full_lengther_next/classes/sequence.rb +439 -80
- data/lib/full_lengther_next/classes/test_code.rb +15 -16
- data/lib/full_lengther_next/classes/types.rb +12 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
- data/lib/full_lengther_next/classes/warnings.rb +40 -0
- metadata +207 -93
- data/lib/full_lengther_next/classes/lcs.rb +0 -33
- data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -0,0 +1,70 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'scbi_plot'
|
4
|
+
#############################################
|
5
|
+
### FUNCTIONS
|
6
|
+
#############################################
|
7
|
+
# Counts how many lines of a tab-separated file mention each organism.
#
# For every line the last tab-separated column is taken and the text after its
# first ';' is kept. Parenthesised groups without digits are dropped, entries
# that consist of a single word are skipped, and dots and spaces are removed.
#
# file - path of the file to read.
#
# Returns a Hash mapping the double-quoted organism name to its line count.
def take_taxonomy(file)
  taxonomy = {}
  # File.foreach closes the handle when it finishes; File.open(file).each left it open.
  File.foreach(file) do |line|
    field = line.chomp.split("\t").last
    next if field.nil? # blank line: nothing to parse
    organism = field.split(';', 2).last
    next if organism.nil?
    organism = organism.gsub(/\(\D+\)/, '')
    next if organism.split(' ').length == 1
    # Deleting every space also covers the leading-space strip done previously.
    organism = organism.delete('.').delete(' ')
    key = '"' + organism + '"'
    taxonomy[key] = taxonomy.fetch(key, 0) + 1
  end
  taxonomy
end
|
29
|
+
|
30
|
+
# Builds a ScbiPlot histogram named 'fln_taxonomy_plot.png' from the counts.
#
# taxonomy - Hash; its keys are passed as x values and its values as y values.
#
# Returns whatever ScbiPlot::Histogram#do_graph returns.
def plot(taxonomy)
  histogram = ScbiPlot::Histogram.new('fln_taxonomy_plot.png', 'Group organism representation')
  histogram.add_x(taxonomy.keys)
  histogram.add_y(taxonomy.values)
  histogram.do_graph
end
|
36
|
+
|
37
|
+
#############################################
|
38
|
+
|
39
|
+
require 'optparse'

options = {}

optparse = OptionParser.new do |opts|
  # Default input location, relative to the current working directory.
  options[:path] = File.join('fln_results', 'pt_seqs')
  opts.on('-p', '--path PATH', 'Path to FLN execution') do |path|
    options[:path] = File.join(path, 'fln_results', 'pt_seqs')
  end

  # Set a banner, displayed at the top of the help screen.
  opts.banner = "Usage: plot_taxonomy.rb -p PATH \n\n"

  # This displays the help screen
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end # End opts

# parse options and remove from ARGV
optparse.parse!

# File.exist? is used because File.exists? was removed in Ruby 3.2.
unless File.exist?(options[:path])
  puts 'Path isn\'t valid'
  exit 1 # report the failure to the calling shell instead of exiting with success
end

plot(take_taxonomy(options[:path]))
|
Binary file
|
data/lib/full_lengther_next.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
$:.unshift(File.dirname(__FILE__)) unless
|
2
|
-
|
2
|
+
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
3
3
|
|
4
4
|
# ROOT_PATH=File.join(File.dirname(__FILE__),'full_lengther_next')
|
5
5
|
|
@@ -7,7 +7,7 @@ $: << File.expand_path(File.join(File.dirname(__FILE__), 'full_lengther_next', '
|
|
7
7
|
|
8
8
|
|
9
9
|
module FullLengtherNext
  # Version string of this release; frozen so it cannot be mutated in place.
  VERSION = '0.5.6'.freeze

  # Same object as VERSION, exposed under a second constant name.
  FULL_LENGHTER_VERSION = VERSION
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'blast_functions'
|
2
|
+
require 'types'
|
3
|
+
require 'chimeric_seqs'
|
4
|
+
include ChimericSeqs
|
5
|
+
|
6
|
+
#####################################################################
|
7
|
+
## MAIN FUNCTION
|
8
|
+
#####################################################################
|
9
|
+
# Decides whether a sequence is an artifact and tags it accordingly.
#
# seq      - sequence object; its hit, type, db_name, save_fasta and ignore
#            attributes are written and warnings are added to it.
# query    - blast query whose hits are inspected.
# db_name  - database name stored on the sequence when it is an artifact.
# db_path  - forwarded to search_chimeras.
# options  - Hash; chimera search is skipped when options[:chimera] includes 'd'.
# new_seqs - Array that receives the sequences returned by search_chimeras.
#
# Returns true when the sequence was flagged as an artifact, false otherwise.
# The literals true/false are used because the TRUE/FALSE constants are no
# longer defined in Ruby 3.
def artifact?(seq, query, db_name, db_path, options, new_seqs)
  artifact = false

  # MISASSEMBLED DETECTION
  if misassembled_detection(query) # If seq is misassembled, stop chimera analysis
    seq.hit = query.hits.first
    artifact = true
    seq.type = MISASSEMBLED
    seq.warnings('ERROR#1')
  end

  # OVERLAPPING HSPS ON SUBJECT DETECTION (disabled; kept here commented out)
  # if !artifact
  #   hit_reference = query.hits.first.dup
  #   query, overlapping = overlapping_hsps_on_subject(query)
  #   if overlapping
  #     if query.hits.first.nil?
  #       seq.hit = hit_reference
  #     else
  #       seq.hit = query.hits.first
  #     end
  #     artifact = TRUE
  #     seq.type = OTHER
  #     seq.warnings('ERROR#2')
  #   end
  # end

  # MULTIPLE HSP DETECTION (only adds a warning; the sequence is not flagged)
  if !artifact && multiple_hsps(query, 3)
    seq.hit = query.hits.first
    seq.warnings('ERROR#3')
  end

  # CHIMERA DETECTION
  if !artifact && !options[:chimera].include?('d')
    chimera = search_chimeras(seq, query, options, db_name, db_path)
    unless chimera.nil?
      new_seqs.concat(chimera)
      seq.db_name = db_name
      seq.type = CHIMERA
      artifact = true
    end
  end

  if artifact
    puts seq.prot_annot_calification if $verbose > 1
    seq.db_name = db_name
    seq.save_fasta = false
    seq.ignore = true
  end
  artifact
end
|
64
|
+
|
65
|
+
|
66
|
+
|
@@ -0,0 +1,326 @@
|
|
1
|
+
|
2
|
+
require 'scbi_blast'
|
3
|
+
|
4
|
+
# Selects the best hits of a query.
#
# The HSPs are clustered, the clusters at indexes 0..select_hits are kept and
# then filtered by identity and by subject coverage. When nothing survives,
# the window is widened by 10 and the search repeated, until select_hits
# reaches the number of HSPs or 100; at that point the first cluster is returned.
#
# query       - object exposing #hits.
# select_hits - last cluster index considered in this attempt.
#
# Returns an Array of clustered hits (or query.hits itself when its first
# element is nil and the list is not empty).
def filter_hits(query, select_hits=10) # Select best hits
  candidates = query.hits
  unless candidates.first.nil?
    window = cluster_hsps(candidates)[0..select_hits]
    candidates = select_hits_by_coverage_subject(select_hits_by_identity_query(window))
  end
  return candidates unless candidates.empty?

  # Condition to stop an infinite recursion
  if select_hits >= query.hits.length || select_hits >= 100
    [cluster_hsps(query.hits).first]
  else
    filter_hits(query, select_hits + 10)
  end
end
|
21
|
+
|
22
|
+
# Percentage of the subject length covered by the alignment of an HSP.
#
# For 'ExoBlastHit' objects with frameshifts whose coverage exceeds 100, the
# alignment length is reduced by one per frameshift until the coverage is at
# most 100 or every frameshift has been used (frameshifts can open gaps that
# add extra amino acids to the alignment).
#
# hit - object exposing #align_len and #s_len (and #q_frameshift for ExoBlastHit).
#
# Returns a Float.
def get_coverage_subject(hit)
  coverage = hit.align_len * 100.0 / hit.s_len
  correctable = coverage > 100 && hit.class.to_s == 'ExoBlastHit' && !hit.q_frameshift.empty?
  return coverage unless correctable

  1.upto(hit.q_frameshift.length) do |removed|
    coverage = (hit.align_len - removed) * 100.0 / hit.s_len
    break if coverage <= 100
  end
  coverage
end
|
33
|
+
|
34
|
+
# Keeps the hits having at least one HSP whose subject coverage reaches the
# coverage of the first HSP of the first hit (capped at 100). HSPs whose
# coverage is above 100 are ignored.
#
# hits - Array of hits, each hit being a collection of HSPs.
#
# Returns a new Array with the selected hits, in their original order.
def select_hits_by_coverage_subject(hits)
  reference = get_coverage_subject(hits.first.first)
  threshold = reference > 100 ? 100 : reference

  hits.select do |hsps|
    hsps.any? do |hsp|
      coverage = get_coverage_subject(hsp)
      next false if coverage > 100
      coverage >= threshold
    end
  end
end
|
53
|
+
|
54
|
+
# Keeps the hits having at least one HSP whose identity is not lower than the
# identity of the first HSP of the first hit.
#
# hits - Array of hits, each hit being a collection of HSPs exposing #ident.
#
# Returns a new Array with the selected hits, in their original order.
def select_hits_by_identity_query(hits)
  reference_identity = hits.first.first.ident
  hits.select do |hsps|
    hsps.any? { |hsp| hsp.ident >= reference_identity }
  end
end
|
67
|
+
|
68
|
+
# Keeps the hits having at least one HSP whose e-value is at most +evalue+.
#
# Each hit is returned once. The earlier loop had no `break`, so a hit was
# appended once per passing HSP, unlike the identity/coverage selectors.
#
# hits   - Array of hits, each hit being a collection of HSPs exposing #e_val.
# evalue - maximum accepted e-value.
#
# Returns a new Array with the selected hits, in their original order.
def select_hits_by_evalue(hits, evalue)
  hits.select do |hit|
    hit.any? { |hsp| hsp.e_val <= evalue }
  end
end
|
79
|
+
|
80
|
+
# Keeps the HSPs whose accession is listed in +selected_ids+.
#
# hits         - Array of HSPs exposing #acc.
# selected_ids - collection of accepted accessions (must respond to #include?).
#
# Returns a new Array with the matching HSPs, in their original order.
def select_hsps_by_id(hits, selected_ids)
  hits.select { |hsp| selected_ids.include?(hsp.acc) }
end
|
89
|
+
|
90
|
+
# Derives an e-value threshold from the smallest usable e-value of the hits.
#
# Only e-values different from 0 and below 100 are considered. With exp being
# the integer part of |log10(min)|, the threshold is 10 ** -(exp - ceil(exp / 10)).
#
# hits - Array of objects exposing #e_val.
#
# Returns 0 when no e-value qualifies, otherwise a Float threshold.
def set_thresold_evalue(hits)
  usable = hits.map { |hit| hit.e_val }.select { |e_val| e_val != 0 && e_val < 100 }
  lowest = usable.min
  return 0 if lowest.nil?

  magnitude = Math.log10(lowest).abs.to_i
  margin = (magnitude / 10.0).ceil
  10.0**-(magnitude - margin)
end
|
106
|
+
|
107
|
+
# Tells whether +second_hit+ lies inside +hit+ on the same subject.
#
# True when both share the accession, second_hit starts at or after hit's
# subject start, ends at or before hit's subject end, and its start is more
# than one position away from hit's end.
#
# The end comparison used to read `hit.s_end >= hit.s_end`, which is always
# true; it now compares against second_hit. true/false literals replace the
# TRUE/FALSE constants, which are no longer defined in Ruby 3.
#
# NOTE: this file defines same_subject_hsp a second time further down; the
# later definition is the one in effect once the file is loaded.
def same_subject_hsp(hit, second_hit)
  return false unless hit.acc == second_hit.acc

  hit.s_beg <= second_hit.s_beg &&
    hit.s_end >= second_hit.s_end &&
    (second_hit.s_beg - hit.s_end).abs > 1
end
|
116
|
+
|
117
|
+
# Tells whether +second_hit+ lies inside +hit+ on the query.
#
# True when both share the accession, second_hit starts at or after hit's
# query start, ends at or before hit's query end, and its start is more than
# one position away from hit's end.
#
# The end comparison used to read `hit.q_end >= hit.q_end`, which is always
# true; it now compares against second_hit. true/false literals replace the
# TRUE/FALSE constants, which are no longer defined in Ruby 3.
def same_query_hsp(hit, second_hit)
  return false unless hit.acc == second_hit.acc

  hit.q_beg <= second_hit.q_beg &&
    hit.q_end >= second_hit.q_end &&
    (second_hit.q_beg - hit.q_end).abs > 1
end
|
126
|
+
|
127
|
+
# Tells whether two HSPs have query frames of the same sign.
#
# The sign is taken with `q_frame <=> 0`, so both frames must be negative,
# both zero, or both positive. Returns true/false literals because the
# TRUE/FALSE constants are no longer defined in Ruby 3.
def same_sense?(hit, second_hit)
  (hit.q_frame <=> 0) == (second_hit.q_frame <=> 0)
end
|
136
|
+
|
137
|
+
# Drops, for every query, the hits whose identity is not above +ident+.
#
# Queries whose first hit is nil are left alone. When no hit survives, the
# hit list becomes [nil]. full_query_length is always converted with #to_i
# (scbi_blast returns it as a String instead of an Integer).
#
# blast_result - object exposing #querys.
# ident        - identity that a hit must exceed to be kept.
#
# Returns blast_result.querys.
def clean_by_identity(blast_result, ident)
  blast_result.querys.each do |query|
    unless query.hits.first.nil?
      kept = query.hits.select { |hit| hit.ident > ident }
      query.hits = kept.empty? ? [nil] : kept # [nil] marks "no hit"
    end
    query.full_query_length = query.full_query_length.to_i
  end
end
|
147
|
+
|
148
|
+
# Drops, for every query, the hits whose alignment length times 3 is not
# above +min_len_nt+.
#
# Queries whose first hit is nil are left alone. When no hit survives, the
# hit list becomes [nil]. full_query_length is always converted with #to_i
# (scbi_blast returns it as a String instead of an Integer).
#
# blast_result - object exposing #querys.
# min_len_nt   - length that align_len * 3 must exceed for a hit to be kept.
#
# Returns blast_result.querys.
def clean_by_query_length_match(blast_result, min_len_nt)
  blast_result.querys.each do |query|
    unless query.hits.first.nil?
      long_enough = query.hits.select { |hit| hit.align_len * 3 > min_len_nt }
      query.hits = long_enough.empty? ? [nil] : long_enough # [nil] marks "no hit"
    end
    query.full_query_length = query.full_query_length.to_i
  end
end
|
159
|
+
|
160
|
+
|
161
|
+
# Removes, for every query with more than one hit, the HSPs that
# same_query_hsp reports as lying inside another HSP.
#
# blast_result       - object exposing #querys.
# keep_if_diff_sense - when true, an inner HSP is removed only if same_sense?
#                      says it has the same sense as the outer one.
#
# The default is the literal false: the FALSE constant is no longer defined in
# Ruby 3, so calling this method without the second argument raised NameError.
#
# Returns blast_result.querys.
def clean_overlapping_hsps(blast_result, keep_if_diff_sense = false)
  blast_result.querys.each do |query|
    next unless query.hits.length > 1

    query.hits.each_with_index do |hit, j|
      next if hit.nil? # discarded earlier in this same pass

      query.hits.each_with_index do |second_hit, i|
        next if second_hit.nil? || i == j # Same hit
        next unless same_query_hsp(hit, second_hit) # || same_subject_hsp(hit, second_hit)

        # Slots are set to nil instead of deleted so the indexes stay valid
        # while the list is being walked; they are compacted afterwards.
        query.hits[i] = nil if !keep_if_diff_sense || same_sense?(hit, second_hit)
      end
    end
    query.hits.compact!
  end
end
|
187
|
+
|
188
|
+
#####################################################################
|
189
|
+
## DETECTION FUNCTIONS
|
190
|
+
#####################################################################
|
191
|
+
|
192
|
+
# Detects queries whose hits mix positive and negative query frames.
#
# A clustered hit counts as misassembled when it has more than one HSP and
# only some of them have a negative q_frame. When more than half of the
# clustered hits are misassembled the query is reported as misassembled;
# otherwise the HSPs of the misassembled hits are removed from query.hits in
# place so they do not disturb later analysis.
#
# query - object exposing #hits (HSPs exposing #acc and #q_frame).
#
# Returns true or false (literals: TRUE/FALSE are no longer defined in Ruby 3).
def misassembled_detection(query)
  hits = cluster_hsps(query.hits)
  misassembled_hits = []
  hits.each do |hit|
    next unless hit.length > 1

    negative = hit.count { |hsp| hsp.q_frame < 0 }
    misassembled_hits << hit.first.acc if negative > 0 && negative != hit.length
  end

  # With no hits the ratio is NaN, which is not > 0.5, so the result is false.
  return true if misassembled_hits.length * 1.0 / hits.length > 0.5

  # delete_if filters in place without deleting from the array being iterated.
  query.hits.delete_if { |hsp| misassembled_hits.include?(hsp.acc) }
  false
end
|
215
|
+
|
216
|
+
# Tells whether the first hit of the query has at least +num+ HSPs, i.e.
# whether at least +num+ entries of query.hits share the accession of the
# first one.
#
# query - object exposing #hits (HSPs exposing #acc).
# num   - minimum number of HSPs.
#
# Returns true or false (literals: TRUE/FALSE are no longer defined in Ruby 3).
# A query whose first hit is nil has no HSPs to count and yields false.
def multiple_hsps(query, num)
  reference = query.hits.first
  return false if reference.nil?

  query.hits.count { |hsp| hsp.acc == reference.acc } >= num
end
|
224
|
+
|
225
|
+
# Cleans overlapping HSPs subject by subject and rebuilds query.hits.
#
# Consecutive HSPs sharing an accession are grouped and each group is passed
# to clean_subject_overlapping_hsps, which appends the HSPs it keeps to the
# new hit list.
#
# The overlap flag is now accumulated over all groups. Before, every group
# overwrote it, so only the last subject decided the result. The literal false
# replaces the FALSE constant, which is no longer defined in Ruby 3.
#
# query - object exposing #hits / #hits= (HSPs exposing #acc).
#
# Returns [query, overlapping], overlapping being true when any group was
# reported as overlapping.
def overlapping_hsps_on_subject(query)
  overlapping = false
  cleaned_hits = []

  # Group consecutive HSPs of the same subject.
  groups = []
  query.hits.each do |hsp|
    if groups.empty? || groups.last.first.acc != hsp.acc
      groups << [hsp]
    else
      groups.last << hsp
    end
  end

  groups.each do |subject_hsps|
    _cleaned, group_overlaps = clean_subject_overlapping_hsps(subject_hsps, cleaned_hits)
    overlapping = true if group_overlaps
  end

  query.hits = cleaned_hits
  return query, overlapping
end
|
242
|
+
|
243
|
+
#####################################################################
|
244
|
+
## COMPLEMENTARY FUNCTIONS
|
245
|
+
#####################################################################
|
246
|
+
# Checks one hit (all the HSPs of one subject) for overlapping HSPs and
# appends the resulting HSPs to +cleaned_hits+.
#
# Hits with a single HSP (or none) are appended untouched and reported as not
# overlapping. The flag is initialised to false so the method always returns
# a boolean; it used to return nil in that case.
#
# complete_hit - Array with the HSPs of one subject.
# cleaned_hits - Array that receives the HSPs kept (modified in place).
#
# Returns [complete_hit, overlapping].
def clean_subject_overlapping_hsps(complete_hit, cleaned_hits)
  overlapping = false
  if complete_hit.length > 1
    complete_hit, overlapping = subject_overlapping_hsps(complete_hit)
  end
  cleaned_hits.concat(complete_hit)
  return complete_hit, overlapping
end
|
253
|
+
|
254
|
+
# Decides whether the HSPs of a hit overlap on the subject.
#
# When hsps_relationship_subject finds related HSPs, the HSPs with identity
# below 55 are discarded (clean_hsp_by_identity) and the check is repeated.
# The hit is reported as overlapping when no HSP is left or when related HSPs
# remain.
#
# hit - Array with the HSPs of one subject.
#
# Returns [hit, overlapping], with hit possibly filtered and overlapping being
# true or false (literals: TRUE/FALSE are no longer defined in Ruby 3).
def subject_overlapping_hsps(hit)
  return hit, false if hsps_relationship_subject(hit).empty?

  hit = clean_hsp_by_identity(hit, 55)
  overlapping = hit.empty? || !hsps_relationship_subject(hit).empty?
  return hit, overlapping
end
|
270
|
+
|
271
|
+
# Lists the pairs of HSPs of a hit that same_subject_hsp reports as related.
#
# Every ordered pair of distinct positions is tested; a pair is stored only
# if neither it nor its inverse is already present.
#
# hit - Array with the HSPs of one subject.
#
# Returns an Array of [hsp, second_hsp] pairs.
def hsps_relationship_subject(hit)
  related = []
  hit.each_with_index do |hsp, j|
    hit.each_with_index do |second_hsp, i|
      next if i == j # Same hit
      next unless same_subject_hsp(hsp, second_hsp)

      pair = [hsp, second_hsp]
      # Save only if neither the direct relationship nor its inverse exists
      related << pair unless related.include?(pair) || related.include?(pair.reverse)
    end
  end
  related
end
|
287
|
+
|
288
|
+
# Tells whether +second_hit+ lies inside +hit+ on the same subject.
#
# True when both share the accession, second_hit starts at or after hit's
# subject start, ends at or before hit's subject end, and its start is more
# than one position away from hit's end.
#
# The end comparison used to read `hit.s_end >= hit.s_end`, which is always
# true; it now compares against second_hit. true/false literals replace the
# TRUE/FALSE constants, which are no longer defined in Ruby 3.
#
# NOTE: this is the second definition of same_subject_hsp in this file; being
# the later one, it is the definition in effect once the file is loaded.
def same_subject_hsp(hit, second_hit)
  return false unless hit.acc == second_hit.acc

  hit.s_beg <= second_hit.s_beg &&
    hit.s_end >= second_hit.s_end &&
    (second_hit.s_beg - hit.s_end).abs > 1
end
|
297
|
+
|
298
|
+
# Removes, in place, the HSPs whose identity is below +identity+.
#
# hit      - Array of HSPs exposing #ident (modified in place).
# identity - minimum identity an HSP needs to be kept.
#
# Returns the same Array object that was passed in.
def clean_hsp_by_identity(hit, identity)
  hit.keep_if { |hsp| hsp.ident >= identity }
end
|
302
|
+
|
303
|
+
# Groups consecutive HSPs that share an accession into hits.
#
# A new group is opened whenever the accession differs from the one of the
# current group. The comparison is made against the current group itself
# rather than an '' sentinel, so a first HSP with an empty accession no longer
# fails on an empty result list.
#
# hsps - Array of HSPs exposing #acc.
#
# Returns an Array of Arrays, one inner Array per run of equal accessions.
def cluster_hsps(hsps)
  hits = []
  hsps.each do |hsp|
    if hits.empty? || hits.last.first.acc != hsp.acc
      hits << [hsp]
    else
      hits.last << hsp
    end
  end
  hits
end
|
316
|
+
|
317
|
+
# Looks up a clustered hit by the accession of its first HSP.
#
# hit_acc - accession to look for.
# ar_hits - Array of hits, each hit being an Array of HSPs exposing #acc.
#
# Returns the first matching hit, or nil when there is none.
def find_hit(hit_acc, ar_hits)
  ar_hits.find { |hit| hit.first.acc == hit_acc }
end
|