full_lengther_next 0.0.8 → 0.5.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. data/.gemtest +0 -0
  2. data/History.txt +2 -2
  3. data/Manifest.txt +33 -18
  4. data/Rakefile +4 -2
  5. data/bin/download_fln_dbs.rb +310 -158
  6. data/bin/full_lengther_next +160 -103
  7. data/bin/make_test_dataset.rb +236 -0
  8. data/bin/make_user_db.rb +101 -117
  9. data/bin/plot_fln.rb +270 -0
  10. data/bin/plot_taxonomy.rb +70 -0
  11. data/lib/expresscanvas.zip +0 -0
  12. data/lib/full_lengther_next.rb +3 -3
  13. data/lib/full_lengther_next/classes/artifacts.rb +66 -0
  14. data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
  15. data/lib/full_lengther_next/classes/cdhit.rb +154 -0
  16. data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
  17. data/lib/full_lengther_next/classes/common_functions.rb +105 -63
  18. data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
  19. data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
  20. data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
  21. data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
  22. data/lib/full_lengther_next/classes/handle_db.rb +30 -0
  23. data/lib/full_lengther_next/classes/my_worker.rb +308 -138
  24. data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
  25. data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
  26. data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
  27. data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
  28. data/lib/full_lengther_next/classes/reptrans.rb +210 -0
  29. data/lib/full_lengther_next/classes/sequence.rb +439 -80
  30. data/lib/full_lengther_next/classes/test_code.rb +15 -16
  31. data/lib/full_lengther_next/classes/types.rb +12 -0
  32. data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
  33. data/lib/full_lengther_next/classes/warnings.rb +40 -0
  34. metadata +207 -93
  35. data/lib/full_lengther_next/classes/lcs.rb +0 -33
  36. data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -0,0 +1,54 @@
1
+ #$: << File.expand_path('~/fln/full_lengther_next/lib')
2
+
3
+ $: << File.expand_path(File.join(File.dirname(__FILE__)))
4
+ require 'types'
5
+ require 'my_worker'
6
+
7
# Worker that runs blastn for a chunk of sequences against the database at
# @blast_path and splits the chunk into sequences with a hit and coding
# sequences without one.
class MyWorkerEst < MyWorker
  #####################################################################################
  # WORKER FUNCTIONS
  #####################################################################################

  # Stores the configuration sent by the manager.
  #
  # params - Array; its first element is kept as the options object and its
  #          last element as the BLAST database path.
  def receive_initial_config(params)
    @options = params.first
    @blast_path = params.last
    @match = []   # Any sequence matching an EST
    @unmatch = [] # Coding (test-code annotated) sequences without an EST match
  end

  # Processes one chunk of sequences.
  #
  # array_seqs - Array of sequence objects; it is sorted in place by
  #              descending t_code before running BLAST.
  #
  # Returns [match, unmatch] for this chunk.
  def process_object(array_seqs)
    # Start every chunk with empty accumulators. Without this reset each call
    # returned everything gathered since receive_initial_config, so a manager
    # that concatenates the returned arrays stored earlier chunks repeatedly.
    @match = []
    @unmatch = []
    $WORKER_LOG.info "Worker LIST #{array_seqs.length}"
    array_seqs.sort! { |s1, s2| s2.t_code <=> s1.t_code } # Order by test code
    blastEST(array_seqs)
    [@match, @unmatch]
  end

  #####################################################################################
  # CUSTOM FUNCTIONS
  #####################################################################################

  # Runs blastn (e-value 1e-6) for the chunk and classifies the result.
  # Terminates the process with status -1 when run_blast returns nil.
  def blastEST(array_seqs)
    # `false` replaces the legacy FALSE constant, which modern Ruby no longer defines.
    blast = run_blast(array_seqs, @blast_path, 'blastn', 1e-6, nil, false)
    if blast.nil?
      $LOG.info 'BLAST FAILED'
      Process.exit(-1)
    else
      blast_analysis(blast, array_seqs)
    end
  end

  # Classifies each query of the BLAST result. Query i is paired with
  # array_seqs[i].
  #
  # Sequences with a hit receive the first hit and go to @match; sequences
  # without hits are kept in @unmatch only when their type is CODING.
  def blast_analysis(blast, array_seqs)
    blast.querys.each_with_index do |query, i|
      seq = array_seqs[i]
      best_hit = query.hits.first
      if best_hit.nil?
        # The original test was `!seq.type == CODING`, which negates the type
        # before comparing and is therefore never true.
        @unmatch << seq if seq.type == CODING # Keep if it is coding
      else
        seq.hit = best_hit
        @match << seq
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
+ require 'scbi_fasta'
2
+ require 'sequence'
3
+
4
# Work manager that hands out the elements of a preloaded list one by one and
# gathers the [match, unmatch] pairs returned by the workers.
class MyWorkerManagerEst < ScbiMapreduce::WorkManager
  #############################################################################################
  # MANAGER INITIALIZATION
  #############################################################################################

  # Prepares the shared state.
  #
  # putative_seqs - indexable collection of work items to distribute.
  # options       - options object forwarded to every worker.
  # blast_path    - database path forwarded to every worker.
  def self.init_work_manager(putative_seqs, options, blast_path)
    @@blast_path = blast_path
    @@options = options
    @@putative_seqs = putative_seqs
    @@match = []
    @@unmatch = []
    @@num_seqs = 0 # Index of the next item to dispatch
  end

  #############################################################################################
  # MANAGER TERMINATION
  #############################################################################################

  # Nothing to release: this manager opens no files.
  def self.end_work_manager
    # VOID
  end

  #############################################################################################
  # MANAGER NATIVE FUNCTIONS
  #############################################################################################

  # Called every time a worker needs new data.
  #
  # Returns the next work item, or nil when no more data is available.
  def next_work # Manages the workers' input
    # The original compared against length-1, so the last item was never
    # dispatched (and a one-element list produced no work at all).
    return nil if @@num_seqs >= @@putative_seqs.length

    seq = @@putative_seqs[@@num_seqs]
    @@num_seqs += 1
    seq
  end

  # Called each time a worker finishes an object.
  #
  # match_and_unmatch_array - Array whose first element is the list of matched
  #                           sequences and whose last element is the list of
  #                           unmatched ones.
  def work_received(match_and_unmatch_array) # Manages the workers' output
    @@match.concat(match_and_unmatch_array.first)
    @@unmatch.concat(match_and_unmatch_array.last)
  end

  # Prints the failing object together with the message and backtrace of the
  # exception raised in the worker.
  def error_received(worker_error, obj)
    puts "Error while processing object #{obj.inspect}\n" + worker_error.original_exception.message + ":\n" + worker_error.original_exception.backtrace.join("\n")
  end

  # Logs that the error limit was reached.
  # NOTE(review): @@error_count and @@count are not assigned in this class;
  # presumably the superclass provides them - confirm.
  def too_many_errors_received
    $LOG.error "Too many errors: #{@@error_count} errors on #{@@count} executed sequences, exiting before finishing"
  end

  # Initial configuration sent to each worker: [options, blast_path].
  def worker_initial_config
    [@@options, @@blast_path]
  end

  #############################################################################################
  # CUSTOM FUNCTIONS
  #############################################################################################

  # Returns the accumulated [match, unmatch] arrays.
  def self.get_array_seqs
    return @@match, @@unmatch
  end
end
69
+
@@ -0,0 +1,389 @@
1
+ require 'types'
2
+ require 'scbi_fasta'
3
+ require 'scbi_blast'
4
+ require 'sequence'
5
+ require 'exonerate_result'
6
+ require 'fln_stats'
7
+ require 'reptrans'
8
+
9
+ include FlnStats
10
+
11
+
12
+
13
+
14
+ class MyWorkerManagerFln < ScbiMapreduce::WorkManager
15
+ #############################################################################################
16
+ # MANAGER INITIALIZATION
17
+ #############################################################################################
18
+ attr_accessor :seqs_annotation_prot, :seqs_some_coding, :seqs_unknown
19
# Opens the output files and prepares the shared manager state.
#
# options - options object; the keys read here are :verbose, :fasta,
#           :tax_group and :chimera.
#
# Creates the 'fln_results' directory when missing, loads the functional
# annotation index found under ENV['BLASTDB'] and opens every result file
# for writing.
def self.init_work_manager(options)
  @@stats_hash = initialize_stats_hash
  @@stats_taxonomy = {}
  @@stats_different_prot_id = []
  @@stats_different_prot_id_complete_seqs = []

  @@options = options
  $verbose = options[:verbose]

  input_file = options[:fasta]

  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  Dir.mkdir('fln_results') unless File.exist?('fln_results')

  # Column of the annotation record holding each annotation type.
  @@func_annot_type = {
    :go_id => 5,
    :go_description => 6,
    :kegg_id => 7,
    :interpro_id => 8,
    :interpro_description => 9,
    :ec_id => 10,
    :pfam_id => 11,
    :pfam_desc => 12,
    :unipathway_id => 13
  }

  @@functional_annotations = {}
  @@functional_annotations.merge!(load_functional_annotations(File.join(ENV['BLASTDB'], 'sp_'+options[:tax_group],'sp_'+options[:tax_group]+'.index')))
  #@@functional_annotations.merge!(load_functional_annotations(File.join(ENV['BLASTDB'], 'tr_'+options[:tax_group],'tr_'+options[:tax_group]+'.index'))) if options[:acess_db].include?('t')

  @@fasta_file = FastaQualFile.new(input_file,'')
  file_head = "Query_id\tfasta_length\tSubject_id\tdb_name\tStatus\te_value\tp_ident\ts_length\tprotein_length\tWarning_msgs\tframe\tORF_start\tORF_end\ts_start\ts_end\tDescription\tgo_id\tgo_description\tkegg_id\tinterpro_id\tinterpro_description\tec_id\tpfam_id\tpfam_description\tunipathway_id"

  @@output_files = {}
  # Seq annotation files
  if !options[:chimera].nil?
    @@output_files[CHIMERA] = File.open("fln_results/chimeric_sequences.txt", 'w')
    @@output_files[CHIMERA].puts file_head
  elsif File.exist?("fln_results/chimeric_sequences.txt")
    # Remove a stale report left by a previous run with chimera detection.
    File.delete("fln_results/chimeric_sequences.txt")
  end
  @@output_files[OTHER] = File.open('fln_results/artifact_other.txt', 'w')
  @@output_files[MISASSEMBLED] = File.open('fln_results/misassembled.txt', 'w')
  @@output_files[UNKNOWN] = File.open('fln_results/unknown.txt', 'w')
  @@output_files['db'] = File.open('fln_results/pt_seqs', 'w')
  @@output_files[CODING] = File.open('fln_results/new_coding.txt', 'w')
  @@output_files[NCRNA] = File.open('fln_results/nc_rnas.txt', 'w')

  # Complementary files
  @@output_files['align'] = File.open('fln_results/alignments.txt', 'w')
  @@output_files['prot'] = File.open('fln_results/proteins.fasta', 'w') # FASTA
  @@output_files['nts'] = File.open("fln_results/nt_seq.txt", 'w')
  @@output_files['seqs'] = File.open('fln_results/unigenes.fasta', 'w') # FASTA
  @@output_files['stats_html'] = File.open('fln_results/summary_stats.html', 'w')
  @@output_files['stats_txt'] = File.open('fln_results/summary_stats.txt', 'w')

  @@output_files[CODING].puts file_head
  @@output_files['db'].puts file_head
  @@output_files[NCRNA].puts file_head

  # RepTrans module
  @@seqs_annotation_prot = []
  @@seqs_some_coding = []
  @@seqs_unknown = []

  # Transdecoder module
  @@complete_sure = []
  @@seqs_to_analyze = []
end
91
+
92
+ #############################################################################################
93
+ # MANAGER TERMINATION
94
+ #############################################################################################
95
+
96
# Finishes the run: optionally predicts ORFs with TransDecoder, writes the
# summary statistics and closes every output file.
def self.end_work_manager
  # TransDecoder runs only when the 'p' mode is enabled and both the training
  # set and the set to analyse contain sequences.
  transdecoder_requested = @@options[:acess_db].include?('p')
  if transdecoder_requested && !@@complete_sure.empty? && !@@seqs_to_analyze.empty?
    orf_prediction_with_transdecoder
  end

  write_summary_stats(
    @@stats_hash,
    @@stats_taxonomy,
    @@stats_different_prot_id,
    @@stats_different_prot_id_complete_seqs,
    @@output_files['stats_txt'],
    @@output_files['stats_html']
  )

  @@output_files.each_value { |handler| handler.close }
end
104
+
105
+ #############################################################################################
106
+ # MANAGER NATIVE FUNCTIONS
107
+ #############################################################################################
108
+
109
+ # this method is called every time a worker needs new data to work. This method is executed many times like the chunk size says.
110
+ # Return the work data or nil if no more data is available
111
+ def next_work #Manage INput's worker
112
+ n,f,q = @@fasta_file.next_seq
113
+ if !n.nil?
114
+ @@stats_hash['input_seqs'] += 1
115
+ return Sequence.new(n,f,q)
116
+ else
117
+ return nil
118
+ end
119
+ end
120
+
121
+ # this method is ejecuted each time an obj is finished
122
+ def work_received(objs) #Manage OUTput's worker
123
+ objs.each do |seq|
124
+ transdecoder_keep_seq(seq)
125
+ repTrans_keep_seq(seq)
126
+ if seq.type > UNKNOWN && seq.type < NCRNA
127
+ get_taxonomy(seq.hit.definition, @@stats_taxonomy)
128
+ get_functional_annotations(seq)
129
+ end
130
+ write_seq(seq) if @@options[:acess_db].include?('c') || !@@options[:acess_db].include?('p') || ( seq.type != UNKNOWN && seq.type != CODING ) #Don't write Unknown or coding sequences when use transdecoder
131
+ end
132
+ @@stats_hash, @@stats_different_prot_id, @@stats_different_prot_id_complete_seqs = summary_stats(objs, @@stats_hash, @@stats_different_prot_id, @@stats_different_prot_id_complete_seqs)
133
+ end
134
+
135
+ def error_received(worker_error, obj)
136
+ puts "WARNING!!!!!. CHUNK FAILED:Error while processing object #{obj.first.seq_name}\n" + worker_error.original_exception.message + ":\n" +worker_error.original_exception.backtrace.join("\n")
137
+ end
138
+
139
+ def too_many_errors_received
140
+ $LOG.error "Too many errors: #{@@error_count} errors on #{@@count} executed sequences, exiting before finishing"
141
+ end
142
+
143
+ # send initial config
144
+ def worker_initial_config
145
+ return @@options
146
+ end
147
+
148
+ #############################################################################################
149
+ # CUSTOM FUNCTIONS
150
+ #############################################################################################
151
+
152
+ def self.load_functional_annotations(annotation_file)
153
+ functional_annotations = {}
154
+ File.open(annotation_file).each do |line|
155
+ line.chomp!
156
+ fields = line.split("\t")
157
+ acc = fields.shift
158
+ functional_annotations[acc] = fields
159
+ end
160
+ return functional_annotations
161
+ end
162
+
163
+ def get_functional_annotations(seq)
164
+ all_info = @@functional_annotations[seq.hit.acc.gsub(/-\d+/,'')] #gsub removes splicing code of uniprot accesion
165
+ if !all_info.nil?
166
+ annotations = {}
167
+ @@func_annot_type.each do |type, position|
168
+ annotations[type] = all_info[position]
169
+ end
170
+ seq.functional_annotations = annotations
171
+ end
172
+ end
173
+
174
+
175
# Writes the results of a sequence to the output files.
#
# Errors raised while writing are reported on standard output and do not
# stop the run. Only StandardError is rescued: the previous
# `rescue Exception` also swallowed Interrupt and SystemExit.
def write_seq(seq)
  seq.write_info(@@output_files)
rescue StandardError => e
  puts "Error printing #{seq.seq_name}"
  puts e.message, e.backtrace.join("\n")
end
185
+
186
# Instance-level entry point for the RepTrans bookkeeping.
#
# The class-level method of the same name holds the identical logic: when
# the :reptrans option is set, the sequence is stored by type (COMPLETE to
# INTERNAL, CODING or UNKNOWN).
def repTrans_keep_seq(seq)
  self.class.repTrans_keep_seq(seq)
end
198
+
199
# Stores a sequence for the RepTrans module when the :reptrans option is set.
#
# Types from COMPLETE to INTERNAL go to the annotated-protein list, CODING
# to the coding list and UNKNOWN to the unknown list; any other type is
# ignored.
def self.repTrans_keep_seq(seq)
  return if @@options[:reptrans].nil?

  case seq.type
  when COMPLETE .. INTERNAL then @@seqs_annotation_prot << seq
  when CODING then @@seqs_some_coding << seq
  when UNKNOWN then @@seqs_unknown << seq
  end
end
211
+
212
# Stores a sequence for the TransDecoder step when the 'p' mode is enabled.
#
# COMPLETE sequences whose status is set and whose hit identity reaches
# the :training_ident option are kept as training candidates; CODING and
# UNKNOWN sequences are kept for analysis.
def transdecoder_keep_seq(seq)
  return unless @@options[:acess_db].include?('p')

  case seq.type
  when COMPLETE
    @@complete_sure << seq if seq.status && seq.hit.ident >= @@options[:training_ident]
  when CODING, UNKNOWN
    @@seqs_to_analyze << seq
  end
end
224
+
225
# Predicts ORFs with TransDecoder for the sequences kept for analysis.
#
# Inside the 'temp' directory it writes the training set (representatives of
# the clustered training candidates) and the set to analyse as FASTA files,
# runs the TransDecoder command and parses its 'longest_orfs' output files.
# Every analysed sequence with coding information is then re-annotated, and
# all analysed sequences are registered for RepTrans and written out.
#
# NOTE(review): clustering_by_id and select_representative are not defined in
# this class; presumably they come from the required 'reptrans' file - confirm.
def self.orf_prediction_with_transdecoder
  clusters_seqs_annot_prot = clustering_by_id(@@complete_sure)
  final_seqs = select_representative(clusters_seqs_annot_prot)
  coding_info = nil
  Dir.chdir('temp') do
    orfs = get_seqs(final_seqs)
    File.open('training_set.fasta', 'w') { |f| f.write(orfs) }
    orfs = get_seqs(@@seqs_to_analyze)
    File.open('analyse_set.fasta', 'w') { |f| f.write(orfs) }
    cmd = "TransDecoder -t analyse_set.fasta --workdir transdecoder --train training_set.fasta"
    # Dir.exist? replaces Dir.exists?, which was removed in Ruby 3.2.
    cmd += ' --reuse' if Dir.exist?('transdecoder')
    system(cmd)
    coding_info = get_coding_info('transdecoder/longest_orfs.pep')
    coding_info = get_scores('transdecoder/longest_orfs.cds.scores', coding_info)
    coding_info = correct_by_selected('transdecoder/longest_orfs.cds.scores.selected', coding_info)
  end
  @@seqs_to_analyze.each do |seq|
    coding = coding_info[seq.seq_name]
    asign_coding_attributes(seq, coding) unless coding.nil?
    repTrans_keep_seq(seq)
    seq.write_info(@@output_files)
  end
end
248
+
249
# Serialises sequences as FASTA text.
#
# seqs - collection of objects responding to seq_name and seq_fasta.
#
# Returns a String with one ">name\nsequence\n" record per element (an empty
# String for an empty collection).
def self.get_seqs(seqs)
  seqs.map { |seq| ">#{seq.seq_name}\n#{seq.seq_fasta}\n" }.join
end
256
+
257
# Marks the ORFs that are missing from the selected-ORF file.
#
# selected    - path of a file with one "seq_name|orf_id" entry per line.
# coding_info - Hash seq_name => { orf_id => info Array }.
#
# For every ORF whose id is not listed in the file, the second element of its
# info Array is replaced by '-'. Returns coding_info (modified in place).
def self.correct_by_selected(selected, coding_info)
  # A Hash gives constant-time membership checks; File.foreach closes the
  # file when done (the previous File.open(...).each left the handle open).
  selected_orfs = {}
  File.foreach(selected) do |line|
    _seq_name, orf_id = line.chomp.split('|', 2)
    selected_orfs[orf_id] = true
  end
  coding_info.each_value do |orfs|
    orfs.each do |orf, info|
      info[1] = '-' unless selected_orfs.key?(orf)
    end
  end
  coding_info
end
271
+
272
# Re-annotates a sequence as CODING from its TransDecoder ORFs.
#
# seq    - sequence to update (type, status, t_code and hit are assigned).
# coding - Hash orf_id => info Array for this sequence; as built elsewhere in
#          this class, info is [X ratio, ORF type, start, stop, strand] with
#          the score appended as last element.
#
# The statistics are moved from the 'unknown' counters to the 'coding' ones.
# NOTE(review): the 'unknown' counters are decremented unconditionally, yet
# sequences already typed CODING are also kept for analysis - verify that
# this does not skew the statistics.
def self.asign_coding_attributes(seq, coding)
  seq.type = CODING
  length = seq.seq_fasta.length
  @@stats_hash['unknown'] -= 1
  @@stats_hash['unknown_>200'] -= 1 if length > 200
  @@stats_hash['unknown_>500'] -= 1 if length > 500
  @@stats_hash['coding_>200'] += 1 if length > 200
  @@stats_hash['coding_>500'] += 1 if length > 500
  @@stats_hash['coding'] += 1

  best_orf = select_orf(coding)
  if best_orf[1] == 'complete'
    # `true` replaces the legacy TRUE constant, which modern Ruby no longer defines.
    seq.status = true
    @@stats_hash['coding_sure'] += 1
  else
    @@stats_hash['coding_putative'] += 1
  end

  seq.t_code = best_orf.last
  strand = best_orf[4]
  # The frame is derived from the start coordinate, or from the stop
  # coordinate (and negated) for ORFs on the '-' strand.
  anchor = strand == '-' ? best_orf[3] : best_orf[2]
  frame = (anchor % 3) + 1
  frame = -frame if strand == '-'
  seq.hit = [best_orf[2], best_orf[3], frame]
end
295
+
296
# Chooses the representative ORF of a sequence.
#
# orfs_hash - Hash orf_id => info Array; it is filtered in place.
#
# Keeps the ORFs with the lowest first element (X ratio), prefers those whose
# second element is 'complete' when any exists, and among them takes one
# with the highest last element (score).
#
# Returns the info Array of the chosen ORF, or nil when nothing is left.
def self.select_orf(orfs_hash)
  lowest_ratio = get_min_Xratio(orfs_hash)
  orfs_hash.select! { |_id, info| info.first == lowest_ratio }

  candidates = orfs_hash.select { |_id, info| info[1] == 'complete' }
  candidates = orfs_hash if candidates.empty?

  best_score = get_max_score(candidates)
  candidates.select! { |_id, info| info.last == best_score }
  candidates.values.first
end
307
+
308
# Returns the highest last element (score) among the info Arrays of
# orfs_hash, or nil when the Hash is empty.
def self.get_max_score(orfs_hash)
  orfs_hash.each_value.inject(nil) do |best, info|
    candidate = info.last
    best.nil? || candidate > best ? candidate : best
  end
end
320
+
321
# Returns the lowest first element (X ratio) among the info Arrays of
# orfs_hash, or nil when the Hash is empty.
def self.get_min_Xratio(orfs_hash)
  orfs_hash.each_value.inject(nil) do |lowest, info|
    candidate = info.first
    lowest.nil? || candidate < lowest ? candidate : lowest
  end
end
333
+
334
# Parses a FASTA file of predicted ORFs.
#
# file_name - path of the FASTA file; record names have the form
#             "seq_name|orf_id" and the comments carry "type:<type>" and
#             ":<start>-<stop>(<strand>)".
#
# Returns a Hash seq_name => { orf_id => [X ratio, type, start, stop,
# strand] }, where X ratio is the fraction of 'X' characters in the record
# sequence. Any error raised while reading is rescued: a warning is printed
# and the information gathered so far is returned.
def self.get_coding_info(file_name)
  coding_info = {}
  begin
    FastaQualFile.new(file_name, '').each do |name, seq, comments, qual|
      x_ratio = seq.count('X') / seq.length.to_f
      seq_name, orf_id = name.split('|')

      comments =~ /type:(\S+)/
      type = Regexp.last_match(1)

      comments =~ /:(\d+)-(\d+)\(([+-])\)/
      start = Regexp.last_match(1).to_i
      stop = Regexp.last_match(2).to_i
      strand = Regexp.last_match(3)

      info = [x_ratio, type, start, stop, strand]
      if coding_info.key?(seq_name) && !coding_info[seq_name].nil?
        coding_info[seq_name][orf_id] = info
      else
        coding_info[seq_name] = { orf_id => info }
      end
    end
  rescue
    puts "Warning!!!!!!!!!!: Transdecoder file is missing. Check if Transdecoder is installed"
  end
  coding_info
end
361
+
362
# Adds the ORF scores found in a tab-separated score file.
#
# file_name   - path of the score file; the first column is
#               "seq_name|orf_id" and the second one the score.
# coding_info - Hash seq_name => { orf_id => info Array }.
#
# A positive score is appended to the info Array of its ORF. ORFs with a
# score that is not positive are removed, together with their sequence entry
# when it is left without ORFs. Lines naming unknown sequences or ORFs, and
# blank lines, are ignored.
#
# Returns coding_info (modified in place).
def self.get_scores(file_name, coding_info)
  # File.foreach closes the file when done; the previous File.open(...).each
  # left the handle open.
  File.foreach(file_name) do |line|
    fields = line.chomp.split("\t")
    name = fields.shift
    next if name.nil? # blank line

    seq, orf_id = name.split('|')
    coding = coding_info[seq]
    next if coding.nil?

    orf = coding[orf_id]
    next if orf.nil?

    score = fields.first.to_f
    if score > 0
      orf << score
    else
      coding.delete(orf_id)
      coding_info.delete(seq) if coding.empty?
    end
  end
  coding_info
end
384
+
385
# Returns the three RepTrans lists as
# [annotated-protein seqs, coding seqs, unknown seqs].
def self.get_annotations
  [@@seqs_annotation_prot, @@seqs_some_coding, @@seqs_unknown]
end
388
+ end
389
+