full_lengther_next 0.0.8 → 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. data/.gemtest +0 -0
  2. data/History.txt +2 -2
  3. data/Manifest.txt +33 -18
  4. data/Rakefile +4 -2
  5. data/bin/download_fln_dbs.rb +310 -158
  6. data/bin/full_lengther_next +160 -103
  7. data/bin/make_test_dataset.rb +236 -0
  8. data/bin/make_user_db.rb +101 -117
  9. data/bin/plot_fln.rb +270 -0
  10. data/bin/plot_taxonomy.rb +70 -0
  11. data/lib/expresscanvas.zip +0 -0
  12. data/lib/full_lengther_next.rb +3 -3
  13. data/lib/full_lengther_next/classes/artifacts.rb +66 -0
  14. data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
  15. data/lib/full_lengther_next/classes/cdhit.rb +154 -0
  16. data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
  17. data/lib/full_lengther_next/classes/common_functions.rb +105 -63
  18. data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
  19. data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
  20. data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
  21. data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
  22. data/lib/full_lengther_next/classes/handle_db.rb +30 -0
  23. data/lib/full_lengther_next/classes/my_worker.rb +308 -138
  24. data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
  25. data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
  26. data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
  27. data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
  28. data/lib/full_lengther_next/classes/reptrans.rb +210 -0
  29. data/lib/full_lengther_next/classes/sequence.rb +439 -80
  30. data/lib/full_lengther_next/classes/test_code.rb +15 -16
  31. data/lib/full_lengther_next/classes/types.rb +12 -0
  32. data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
  33. data/lib/full_lengther_next/classes/warnings.rb +40 -0
  34. metadata +207 -93
  35. data/lib/full_lengther_next/classes/lcs.rb +0 -33
  36. data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'scbi_plot'
4
+ #############################################
5
+ ### FUNCTIONS
6
+ #############################################
7
# Counts how often each organism name appears in a tab-separated file.
#
# For every line, the last tab-separated field is read and everything after
# its first ';' is taken as the organism name (the whole field when it has no
# ';'). Parenthesised, digit-free annotations are removed, names made of a
# single word are skipped, and dots and spaces are stripped. The resulting
# name is wrapped in double quotes and used as the hash key.
#
# file    - path of the file to read.
# Returns a Hash mapping the quoted organism name to its number of lines.
def take_taxonomy(file)
  taxonomy = {}
  # File.foreach closes the handle once iteration finishes (File.open without
  # a block left it open until garbage collection).
  File.foreach(file) do |line|
    field = line.chomp.split("\t").last
    next if field.nil? # blank line: there is no field to parse

    organism = field.split(';', 2).last
    next if organism.nil?

    organism = organism.gsub(/\(\D+\)/, '')
    next if organism.split(' ').length == 1

    organism = organism.delete('.').sub(/^ /, '').delete(' ')
    key = '"' + organism + '"'
    taxonomy[key] = taxonomy.fetch(key, 0) + 1
  end
  taxonomy
end
29
+
30
# Draws a histogram of the taxonomy counts into 'fln_taxonomy_plot.png'.
#
# taxonomy - Hash whose keys are used as the x values and whose values are
#            used as the y values of the histogram.
# Returns whatever ScbiPlot::Histogram#do_graph returns.
def plot(taxonomy)
  histogram = ScbiPlot::Histogram.new('fln_taxonomy_plot.png', 'Group organism representation')
  histogram.add_x(taxonomy.keys)
  histogram.add_y(taxonomy.values)
  histogram.do_graph
end
36
+
37
+ #############################################
38
+
39
require 'optparse'

# Command-line entry point: resolve the pt_seqs path, build the taxonomy
# table from it and plot it.
options = {}

optparse = OptionParser.new do |opts|
  # Default: look for the results below the current directory.
  options[:path] = File.join('fln_results', 'pt_seqs')

  # Set a banner, displayed at the top of the help screen.
  opts.banner = "Usage: plot_taxonomy.rb -p PATH \n\n"

  opts.on('-p', '--path PATH', 'Path to FLN execution') do |path|
    options[:path] = File.join(path, 'fln_results', 'pt_seqs')
  end

  # This displays the help screen
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end

# parse options and remove from ARGV
optparse.parse!

# File.exists? was removed in Ruby 3.2; File.exist? is the supported spelling.
unless File.exist?(options[:path])
  puts 'Path isn\'t valid'
  exit(1) # report the failure to the calling shell instead of exiting with 0
end

plot(take_taxonomy(options[:path]))
Binary file
@@ -1,5 +1,5 @@
1
1
  $:.unshift(File.dirname(__FILE__)) unless
2
- $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
2
+ $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
3
 
4
4
  # ROOT_PATH=File.join(File.dirname(__FILE__),'full_lengther_next')
5
5
 
@@ -7,7 +7,7 @@ $: << File.expand_path(File.join(File.dirname(__FILE__), 'full_lengther_next', '
7
7
 
8
8
 
9
9
  module FullLengtherNext
10
- VERSION = '0.0.8'
10
+ VERSION = '0.5.6'
11
11
 
12
- FULLLENGHTER_VERSION = VERSION
12
+ FULL_LENGHTER_VERSION = VERSION
13
13
  end
@@ -0,0 +1,66 @@
1
+ require 'blast_functions'
2
+ require 'types'
3
+ require 'chimeric_seqs'
4
+ include ChimericSeqs
5
+
6
+ #####################################################################
7
+ ## MAIN FUNCTION
8
+ #####################################################################
9
# Decides whether a sequence is an artifact (misassembled or chimeric) and
# annotates +seq+ accordingly.
#
# seq      - sequence object; its hit, type, db_name, save_fasta and ignore
#            attributes are written and warnings are added to it.
# query    - blast query whose hits are inspected.
# db_name  - database name stored on +seq+ and passed to search_chimeras.
# db_path  - database path passed to search_chimeras.
# options  - option hash; chimera search is skipped when options[:chimera]
#            includes 'd'.
# new_seqs - array that receives the sequences returned by search_chimeras.
#
# Returns true when the sequence was flagged as an artifact, false otherwise.
def artifact?(seq, query, db_name, db_path, options, new_seqs)
  # Lowercase literals: the TRUE/FALSE constants were removed in Ruby 3.2.
  artifact = false

  # MISASSEMBLED DETECTION
  if misassembled_detection(query) # If seq is misassembled, stop the chimera analysis
    seq.hit = query.hits.first
    artifact = true
    seq.type = MISASSEMBLED
    seq.warnings('ERROR#1')
  end

  # OVERLAPPING HSPS ON SUBJECT DETECTION (disabled in the original source)
  # if !artifact
  #   hit_reference = query.hits.first.dup
  #   query, overlapping = overlapping_hsps_on_subject(query)
  #   if overlapping
  #     if query.hits.first.nil?
  #       seq.hit = hit_reference
  #     else
  #       seq.hit = query.hits.first
  #     end
  #     artifact = TRUE
  #     seq.type = OTHER
  #     seq.warnings('ERROR#2')
  #   end
  # end

  # MULTIPLE HSP DETECTION (only adds a warning; the sequence is not flagged)
  if !artifact && multiple_hsps(query, 3)
    seq.hit = query.hits.first
    seq.warnings('ERROR#3')
  end

  # CHIMERA DETECTION
  if !artifact && !options[:chimera].include?('d')
    chimera = search_chimeras(seq, query, options, db_name, db_path)
    unless chimera.nil?
      new_seqs.concat(chimera)
      seq.db_name = db_name
      seq.type = CHIMERA
      artifact = true
    end
  end

  if artifact
    puts seq.prot_annot_calification if $verbose > 1
    seq.db_name = db_name
    seq.save_fasta = false
    seq.ignore = true
  end
  artifact
end
64
+
65
+
66
+
@@ -0,0 +1,326 @@
1
+
2
+ require 'scbi_blast'
3
+
4
# Selects the best hits of a query.
#
# The HSPs are clustered per subject, the leading clusters are kept and then
# filtered by identity and by subject coverage. When nothing survives, the
# search is retried with 10 more clusters until the window covers every HSP or
# reaches 100, at which point the first cluster is returned on its own.
#
# query       - blast query exposing #hits.
# select_hits - upper index of the clusters to keep.
# Returns an Array of clusters (or query.hits untouched when its first element is nil).
def filter_hits(query, select_hits=10) # Select best hits
  candidates = query.hits
  unless candidates.first.nil?
    # NOTE(review): 0..select_hits is inclusive, so select_hits + 1 clusters
    # are kept -- confirm whether that is intended.
    candidates = cluster_hsps(candidates)[0..select_hits]
    candidates = select_hits_by_identity_query(candidates)
    candidates = select_hits_by_coverage_subject(candidates)
  end
  return candidates unless candidates.empty?

  # Condition to stop an infinite recursion
  if select_hits >= query.hits.length || select_hits >= 100
    [cluster_hsps(query.hits).first]
  else
    filter_hits(query, select_hits + 10)
  end
end
21
+
22
# Percentage of the subject covered by the alignment (align_len / s_len * 100).
#
# For hits whose class is named 'ExoBlastHit' and that carry frameshifts, a
# value above 100 is corrected: a frameshift can add extra residues to the
# alignment, so the alignment length is reduced by 1, 2, ... (at most once per
# frameshift) until the coverage is 100 or less.
#
# hit - object exposing align_len and s_len (and q_frameshift for ExoBlastHit).
# Returns the coverage as a Float.
def get_coverage_subject(hit)
  coverage = hit.align_len * 100.0 / hit.s_len
  needs_correction = coverage > 100 && hit.class.to_s == 'ExoBlastHit' && !hit.q_frameshift.empty?
  return coverage unless needs_correction

  1.upto(hit.q_frameshift.length) do |removed|
    coverage = (hit.align_len - removed) * 100.0 / hit.s_len
    break if coverage <= 100
  end
  coverage
end
33
+
34
# Keeps the hits that have at least one HSP whose subject coverage reaches the
# coverage of the very first HSP (capped at 100). HSPs with a coverage above
# 100 are never considered.
#
# hits - Array of hits, each hit being an Array of HSPs.
# Returns a new Array with the selected hits, in their original order.
def select_hits_by_coverage_subject(hits)
  threshold = get_coverage_subject(hits.first.first)
  threshold = 100 if threshold > 100

  hits.select do |hit|
    hit.any? do |hsp|
      coverage = get_coverage_subject(hsp)
      coverage <= 100 && coverage >= threshold
    end
  end
end
53
+
54
# Keeps the hits that have at least one HSP whose identity is not lower than
# the identity of the very first HSP.
#
# hits - Array of hits, each hit being an Array of HSPs exposing #ident.
# Returns a new Array with the selected hits, in their original order.
def select_hits_by_identity_query(hits)
  reference_identity = hits.first.first.ident
  hits.select do |hit|
    hit.any? { |hsp| hsp.ident >= reference_identity }
  end
end
67
+
68
# Keeps the hits that have at least one HSP with an e-value not greater than
# +evalue+.
#
# Each hit is returned at most once, matching the identity and coverage
# selectors; the previous loop appended the hit once per qualifying HSP.
#
# hits   - Array of hits, each hit being an Array of HSPs exposing #e_val.
# evalue - maximum accepted e-value.
# Returns a new Array with the selected hits, in their original order.
def select_hits_by_evalue(hits, evalue)
  hits.select do |hit|
    hit.any? { |hsp| hsp.e_val <= evalue }
  end
end
79
+
80
# Keeps the HSPs whose accession is listed in +selected_ids+.
#
# hits         - flat Array of HSPs exposing #acc.
# selected_ids - collection of accepted accessions (anything with #include?).
# Returns a new Array with the matching HSPs, in their original order.
def select_hsps_by_id(hits, selected_ids)
  hits.select { |hsp| selected_ids.include?(hsp.acc) }
end
89
+
90
# Derives an e-value threshold from the smallest non-zero e-value below 100.
#
# When no such e-value exists the threshold is 0. Otherwise the magnitude of
# the exponent of the smallest e-value is truncated to an integer, reduced by
# a tenth of itself (rounded up), and the threshold is 10 to minus that value.
#
# hits - Array of objects exposing #e_val.
# Returns 0 or a Float threshold.
def set_thresold_evalue(hits)
  usable = hits.map { |hit| hit.e_val }.select { |e_val| e_val != 0 && e_val < 100 }
  return 0 if usable.empty?

  exponent = Math.log10(usable.min).abs.to_i
  margin = (exponent / 10.0).ceil
  10.0**-(exponent - margin)
end
106
+
107
# Tells whether +second_hit+ lies inside +hit+ on the subject.
#
# Both HSPs must share the accession, +second_hit+ must start at or after
# +hit+ and end at or before it, and the start of +second_hit+ must be more
# than one position away from the end of +hit+.
#
# The end check previously compared hit.s_end with itself, so it was always
# true and HSPs extending past +hit+ were reported as contained.
# NOTE(review): this method is defined a second time further down the file;
# the later definition is the one Ruby keeps.
#
# Returns true or false.
def same_subject_hsp(hit, second_hit)
  hit.acc == second_hit.acc &&
    hit.s_beg <= second_hit.s_beg &&
    hit.s_end >= second_hit.s_end &&
    (second_hit.s_beg - hit.s_end).abs > 1
end
116
+
117
# Tells whether +second_hit+ lies inside +hit+ on the query.
#
# Both HSPs must share the accession, +second_hit+ must start at or after
# +hit+ and end at or before it, and the start of +second_hit+ must be more
# than one position away from the end of +hit+.
#
# The end check previously compared hit.q_end with itself, so it was always
# true and HSPs extending past +hit+ were reported as contained.
#
# Returns true or false.
def same_query_hsp(hit, second_hit)
  hit.acc == second_hit.acc &&
    hit.q_beg <= second_hit.q_beg &&
    hit.q_end >= second_hit.q_end &&
    (second_hit.q_beg - hit.q_end).abs > 1
end
126
+
127
# Tells whether two HSPs have query frames of the same sign (both positive,
# both negative or both zero).
#
# Returns true or false (literal booleans; the TRUE/FALSE constants were
# removed in Ruby 3.2).
def same_sense?(hit, second_hit)
  (hit.q_frame <=> 0) == (second_hit.q_frame <=> 0)
end
136
+
137
# Removes from every query the hits whose identity is not above +ident+.
#
# A query left without hits gets [nil] as its hit list. Queries whose first
# hit is already nil are not filtered. full_query_length is converted with
# to_i for every query (scbi_blast returns it as a String).
#
# blast_result - object exposing #querys.
# ident        - identity that a hit must exceed to be kept.
# Returns the value of blast_result.querys.each.
def clean_by_identity(blast_result, ident)
  blast_result.querys.each do |query|
    unless query.hits.first.nil?
      kept = query.hits.select { |hit| hit.ident > ident }
      query.hits = kept.empty? ? [nil] : kept # [nil] marks "no hit"
    end
    query.full_query_length = query.full_query_length.to_i
  end
end
147
+
148
# Removes from every query the hits whose alignment length, multiplied by 3,
# is not above +min_len_nt+.
#
# A query left without hits gets [nil] as its hit list. Queries whose first
# hit is already nil are not filtered. full_query_length is converted with
# to_i for every query (scbi_blast returns it as a String).
#
# blast_result - object exposing #querys.
# min_len_nt   - length that align_len * 3 must exceed for a hit to be kept.
# Returns the value of blast_result.querys.each.
def clean_by_query_length_match(blast_result, min_len_nt)
  blast_result.querys.each do |query|
    unless query.hits.first.nil?
      kept = query.hits.select { |hit| hit.align_len * 3 > min_len_nt }
      query.hits = kept.empty? ? [nil] : kept # [nil] marks "no hit"
    end
    query.full_query_length = query.full_query_length.to_i
  end
end
159
+
160
+
161
# Removes, for every query with more than one hit, the HSPs that
# same_query_hsp reports as lying inside another HSP.
#
# blast_result       - object exposing #querys.
# keep_if_diff_sense - when true, a contained HSP is only removed if
#                      same_sense? says it has the same sense as its
#                      container. The default is the literal false; the FALSE
#                      constant was removed in Ruby 3.2.
# Returns the value of blast_result.querys.each.
def clean_overlapping_hsps(blast_result, keep_if_diff_sense = false)
  blast_result.querys.each do |query|
    next unless query.hits.length > 1

    query.hits.each_with_index do |hit, j|
      next if hit.nil? # already discarded by an earlier comparison

      query.hits.each_with_index do |second_hit, i|
        next if second_hit.nil? || i == j # Same hit
        next unless same_query_hsp(hit, second_hit) #|| same_subject_hsp(hit, second_hit)

        # Slots are set to nil rather than deleted so the indexes stay stable
        # during the iteration; compact! removes them afterwards.
        query.hits[i] = nil if !keep_if_diff_sense || same_sense?(hit, second_hit)
      end
    end
    query.hits.compact!
  end
end
187
+
188
+ #####################################################################
189
+ ## DETECTION FUNCTIONS
190
+ #####################################################################
191
+
192
# Detects a misassembled query: one where more than half of the subjects have
# HSPs on both positive and negative query frames.
#
# A subject counts as misassembled when it has several HSPs and some, but not
# all, of them have a negative q_frame. When the query is not declared
# misassembled, the HSPs of the misassembled subjects are removed from
# query.hits so they do not disturb later analysis.
#
# query - blast query exposing #hits (HSPs exposing #acc and #q_frame).
# Returns true when the query is misassembled, false otherwise.
def misassembled_detection(query)
  hits = cluster_hsps(query.hits)
  misassembled_hits = []
  hits.each do |hit|
    next unless hit.length > 1

    negative = hit.count { |hsp| hsp.q_frame < 0 }
    misassembled_hits << hit.first.acc if negative > 0 && negative != hit.length
  end

  return true if misassembled_hits.length * 1.0 / hits.length > 0.5

  # delete_if replaces the delete-inside-reverse_each loop, which mutated the
  # array it was iterating over.
  query.hits.delete_if { |hsp| misassembled_hits.include?(hsp.acc) }
  false
end
215
+
216
# Tells whether at least +num+ HSPs of the query share the accession of its
# first HSP.
#
# query - blast query exposing #hits (HSPs exposing #acc).
# num   - minimum number of HSPs required.
# Returns true or false (literal booleans; the TRUE/FALSE constants were
# removed in Ruby 3.2).
def multiple_hsps(query, num)
  query.hits.count { |hsp| hsp.acc == query.hits.first.acc } >= num
end
224
+
225
# Cleans, subject by subject, the HSPs that overlap on the subject and
# replaces query.hits with the cleaned list.
#
# Consecutive HSPs with the same accession are treated as one group and each
# group is passed to clean_subject_overlapping_hsps. The returned flag is true
# when ANY group was reported as overlapping; previously the flag was
# overwritten by every group, so only the last one was reflected.
#
# query - blast query exposing #hits and #hits= (HSPs exposing #acc).
# Returns [query, overlapping].
def overlapping_hsps_on_subject(query)
  overlapping = false
  current_acc = query.hits.first.acc
  group = []
  cleaned_hits = []
  query.hits.each do |hsp|
    if hsp.acc != current_acc
      # A new subject starts: flush the group collected so far.
      _, group_overlaps = clean_subject_overlapping_hsps(group, cleaned_hits)
      overlapping = true if group_overlaps
      group = []
    end
    group << hsp
    current_acc = hsp.acc
  end
  # Flush the last group.
  _, group_overlaps = clean_subject_overlapping_hsps(group, cleaned_hits)
  overlapping = true if group_overlaps
  query.hits = cleaned_hits
  return query, overlapping
end
242
+
243
+ #####################################################################
244
+ ## COMPLEMENTARY FUNCTIONS
245
+ #####################################################################
246
# Cleans one group of HSPs and appends the result to +cleaned_hits+.
#
# Groups with more than one HSP are passed through subject_overlapping_hsps;
# smaller groups are appended untouched and reported as not overlapping
# (explicit false; the flag used to be an implicit nil in that case).
#
# complete_hit - Array with the HSPs of one group.
# cleaned_hits - Array that receives the cleaned HSPs (mutated in place).
# Returns [complete_hit, overlapping].
def clean_subject_overlapping_hsps(complete_hit, cleaned_hits)
  overlapping = false
  if complete_hit.length > 1
    complete_hit, overlapping = subject_overlapping_hsps(complete_hit)
  end
  cleaned_hits.concat(complete_hit)
  return complete_hit, overlapping
end
253
+
254
# Checks whether the HSPs of one hit overlap on the subject.
#
# When hsps_relationship_subject finds related HSPs, the HSPs with an identity
# below 55 are dropped (in place, through clean_hsp_by_identity). The hit is
# then reported as overlapping if nothing is left or if the remaining HSPs are
# still related.
#
# hit - Array with the HSPs of one hit.
# Returns [hit, overlapping] with overlapping being true or false (literal
# booleans; the TRUE/FALSE constants were removed in Ruby 3.2).
def subject_overlapping_hsps(hit)
  overlapping = false
  unless hsps_relationship_subject(hit).empty?
    hit = clean_hsp_by_identity(hit, 55)
    overlapping = hit.empty? || !hsps_relationship_subject(hit).empty?
  end
  return hit, overlapping
end
270
+
271
# Lists the pairs of HSPs of a hit that same_subject_hsp reports as related.
#
# Every ordered pair of distinct positions is tested; a pair is stored only
# when neither it nor its inverse has been stored already.
#
# hit - Array with the HSPs of one hit.
# Returns an Array of [hsp, second_hsp] pairs.
def hsps_relationship_subject(hit)
  related = []
  hit.each_with_index do |hsp, j|
    hit.each_with_index do |second_hsp, i|
      next if i == j # Same hit
      next unless same_subject_hsp(hsp, second_hsp)

      pair = [hsp, second_hsp]
      # Save only if neither the direct relationship nor its inverse exists
      related << pair unless related.include?(pair) || related.include?(pair.reverse)
    end
  end
  related
end
287
+
288
# Tells whether +second_hit+ lies inside +hit+ on the subject.
#
# Both HSPs must share the accession, +second_hit+ must start at or after
# +hit+ and end at or before it, and the start of +second_hit+ must be more
# than one position away from the end of +hit+.
#
# The end check previously compared hit.s_end with itself, so it was always
# true and HSPs extending past +hit+ were reported as contained.
# NOTE(review): this repeats a definition made earlier in the file; being the
# later one, it is the definition Ruby keeps.
#
# Returns true or false.
def same_subject_hsp(hit, second_hit)
  hit.acc == second_hit.acc &&
    hit.s_beg <= second_hit.s_beg &&
    hit.s_end >= second_hit.s_end &&
    (second_hit.s_beg - hit.s_end).abs > 1
end
297
+
298
# Drops, in place, the HSPs whose identity is below +identity+.
#
# hit      - Array of HSPs exposing #ident (mutated).
# identity - minimum identity an HSP needs to be kept.
# Returns the same Array object that was passed in.
def clean_hsp_by_identity(hit, identity)
  hit.keep_if { |hsp| hsp.ident >= identity }
end
302
+
303
# Groups consecutive HSPs that share the same accession.
#
# hsps - flat Array of HSPs exposing #acc.
# Returns an Array of Arrays; each inner Array holds one run of HSPs with the
# same accession, in their original order.
def cluster_hsps(hsps)
  previous_acc = ''
  hsps.each_with_object([]) do |hsp, clusters|
    if hsp.acc == previous_acc
      clusters.last << hsp
    else
      clusters << [hsp]
    end
    previous_acc = hsp.acc
  end
end
316
+
317
# Finds the first hit whose first HSP has the given accession.
#
# hit_acc - accession to look for.
# ar_hits - Array of hits, each hit being an Array of HSPs exposing #acc.
# Returns the matching hit, or nil when there is none.
def find_hit(hit_acc, ar_hits)
  ar_hits.find { |hit| hit.first.acc == hit_acc }
end