full_lengther_next 0.0.8 → 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. data/.gemtest +0 -0
  2. data/History.txt +2 -2
  3. data/Manifest.txt +33 -18
  4. data/Rakefile +4 -2
  5. data/bin/download_fln_dbs.rb +310 -158
  6. data/bin/full_lengther_next +160 -103
  7. data/bin/make_test_dataset.rb +236 -0
  8. data/bin/make_user_db.rb +101 -117
  9. data/bin/plot_fln.rb +270 -0
  10. data/bin/plot_taxonomy.rb +70 -0
  11. data/lib/expresscanvas.zip +0 -0
  12. data/lib/full_lengther_next.rb +3 -3
  13. data/lib/full_lengther_next/classes/artifacts.rb +66 -0
  14. data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
  15. data/lib/full_lengther_next/classes/cdhit.rb +154 -0
  16. data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
  17. data/lib/full_lengther_next/classes/common_functions.rb +105 -63
  18. data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
  19. data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
  20. data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
  21. data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
  22. data/lib/full_lengther_next/classes/handle_db.rb +30 -0
  23. data/lib/full_lengther_next/classes/my_worker.rb +308 -138
  24. data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
  25. data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
  26. data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
  27. data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
  28. data/lib/full_lengther_next/classes/reptrans.rb +210 -0
  29. data/lib/full_lengther_next/classes/sequence.rb +439 -80
  30. data/lib/full_lengther_next/classes/test_code.rb +15 -16
  31. data/lib/full_lengther_next/classes/types.rb +12 -0
  32. data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
  33. data/lib/full_lengther_next/classes/warnings.rb +40 -0
  34. metadata +207 -93
  35. data/lib/full_lengther_next/classes/lcs.rb +0 -33
  36. data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -0,0 +1,54 @@
1
+ #$: << File.expand_path('~/fln/full_lengther_next/lib')
2
+
3
+ $: << File.expand_path(File.join(File.dirname(__FILE__)))
4
+ require 'types'
5
+ require 'my_worker'
6
+
7
# Worker that aligns each chunk of sequences against an EST database with
# blastn and separates the sequences that obtained a hit from the CODING
# sequences that did not.
class MyWorkerEst < MyWorker
  #####################################################################################
  # WORKER FUNCTIONS
  #####################################################################################

  # Stores the configuration sent by the manager.
  #
  # params - Array whose first element is the options object and whose last
  #          element is the path handed to run_blast.
  def receive_initial_config(params)
    @options = params.first
    @blast_path = params.last
    @match = []   # Any sequence matching with an EST
    @unmatch = [] # CODING sequences that did not match any EST
  end

  # Processes one chunk of sequences.
  #
  # array_seqs - Array of sequence objects responding to t_code, type and hit=.
  #              It is sorted in place by descending t_code.
  #
  # Returns [match, unmatch] for this chunk.
  def process_object(array_seqs)
    $WORKER_LOG.info "Worker LIST #{array_seqs.length}"
    # Start every chunk with empty lists: the original kept appending to the
    # arrays created in receive_initial_config, so each chunk returned the
    # results of all the previous chunks again.
    @match = []
    @unmatch = []
    array_seqs.sort! { |s1, s2| s2.t_code <=> s1.t_code } # Order by testcode
    blastEST(array_seqs)
    [@match, @unmatch]
  end

  #####################################################################################
  # CUSTOM FUNCTIONS
  #####################################################################################

  # Runs blastn for the chunk and classifies the sequences with the result.
  # Terminates the process with status -1 when run_blast returns nil.
  def blastEST(array_seqs)
    # `false` replaces the deprecated FALSE constant. The meaning of the
    # positional arguments is defined by run_blast (not visible here);
    # 1e-6 is presumably the e-value threshold -- TODO confirm.
    blast = run_blast(array_seqs, @blast_path, 'blastn', 1e-6, nil, false)
    if blast.nil?
      $LOG.info 'BLAST FAILED'
      Process.exit(-1)
    else
      blast_analysis(blast, array_seqs)
    end
  end

  # Classifies every query of the BLAST result. Query i is assumed to
  # correspond to array_seqs[i].
  #
  # Sequences with a hit receive their first hit and go to @match; sequences
  # without hits go to @unmatch only when their type is CODING.
  def blast_analysis(blast, array_seqs)
    blast.querys.each_with_index do |query, i|
      seq = array_seqs[i]
      first_hit = query.hits.first
      if first_hit.nil?
        # The original condition `!seq.type == CODING` was parsed as
        # `(!seq.type) == CODING`, which is always false, so nothing was kept.
        @unmatch << seq if seq.type == CODING # Keep if is coding
      else
        seq.hit = first_hit
        @match << seq
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
+ require 'scbi_fasta'
2
+ require 'sequence'
3
+
4
# Work manager that distributes the putative sequences among the EST workers
# and gathers the matched / unmatched sequences they send back.
class MyWorkerManagerEst < ScbiMapreduce::WorkManager
  #############################################################################################
  # MANAGER INITIALIZATION
  #############################################################################################

  # Prepares the global data shared by the manager.
  #
  # putative_seqs - Array with the objects to hand out, one per next_work call.
  # options       - options object forwarded untouched to the workers.
  # blast_path    - value forwarded to the workers together with the options.
  def self.init_work_manager(putative_seqs, options, blast_path)
    @@blast_path = blast_path
    @@options = options
    @@putative_seqs = putative_seqs
    @@match = []
    @@unmatch = []
    @@num_seqs = 0 # Index of the next element of @@putative_seqs to hand out
  end

  #############################################################################################
  # MANAGER TERMINATION
  #############################################################################################

  # Nothing to close or release for this manager.
  def self.end_work_manager
    # VOID
  end

  #############################################################################################
  # MANAGER NATIVE FUNCTIONS
  #############################################################################################

  # Called every time a worker needs new data (manages the workers' input).
  #
  # Returns the next pending element, or nil when no more data is available.
  def next_work
    # The original test was `@@num_seqs == @@putative_seqs.length-1`, which
    # stopped one element early and never dispatched the last sequence.
    return nil if @@num_seqs >= @@putative_seqs.length

    seq = @@putative_seqs[@@num_seqs]
    @@num_seqs += 1
    seq
  end

  # Called each time a worker finishes an object (manages the workers' output).
  #
  # match_and_unmatch_array - two-element Array: matched sequences first,
  #                           unmatched sequences last.
  def work_received(match_and_unmatch_array)
    @@match.concat(match_and_unmatch_array.first)
    @@unmatch.concat(match_and_unmatch_array.last)
  end

  # Prints the object that failed together with the message and backtrace of
  # the exception raised in the worker.
  def error_received(worker_error, obj)
    puts "Error while processing object #{obj.inspect}\n" + worker_error.original_exception.message + ":\n" + worker_error.original_exception.backtrace.join("\n")
  end

  # Logs that the error limit was reached.
  # NOTE(review): @@error_count and @@count are not assigned in this class;
  # they are presumably maintained by the parent class -- confirm.
  def too_many_errors_received
    $LOG.error "Too many errors: #{@@error_count} errors on #{@@count} executed sequences, exiting before finishing"
  end

  # Returns the initial configuration sent to every worker.
  def worker_initial_config
    [@@options, @@blast_path]
  end

  #############################################################################################
  # CUSTOM FUNCTIONS
  #############################################################################################

  # Returns [match, unmatch] with everything collected so far.
  def self.get_array_seqs
    [@@match, @@unmatch]
  end
end
69
+
@@ -0,0 +1,389 @@
1
+ require 'types'
2
+ require 'scbi_fasta'
3
+ require 'scbi_blast'
4
+ require 'sequence'
5
+ require 'exonerate_result'
6
+ require 'fln_stats'
7
+ require 'reptrans'
8
+
9
+ include FlnStats
10
+
11
+
12
+
13
+
14
+ class MyWorkerManagerFln < ScbiMapreduce::WorkManager
15
+ #############################################################################################
16
+ # MANAGER INITIALIZATION
17
+ #############################################################################################
18
+ attr_accessor :seqs_annotation_prot, :seqs_some_coding, :seqs_unknown
19
+ # open files and prepare global data
20
# Opens files and prepares the global data used by the manager: statistics
# containers, functional annotations, the input FASTA reader and the output
# files written under fln_results/.
#
# options - options hash; this method reads :verbose, :fasta, :tax_group and
#           :chimera. The BLASTDB environment variable must be set because it
#           is used to locate the annotation index.
def self.init_work_manager(options)
  @@stats_hash = initialize_stats_hash
  @@stats_taxonomy = {}
  @@stats_different_prot_id = []
  @@stats_different_prot_id_complete_seqs = []

  @@options = options
  $verbose = options[:verbose]

  input_file = options[:fasta]

  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  Dir.mkdir('fln_results') unless File.exist?('fln_results')

  # Position of every annotation inside the fields stored for an accession
  # (see get_functional_annotations).
  @@func_annot_type = {
    :go_id => 5,
    :go_description => 6,
    :kegg_id => 7,
    :interpro_id => 8,
    :interpro_description => 9,
    :ec_id => 10,
    :pfam_id => 11,
    :pfam_desc => 12,
    :unipathway_id => 13
  }

  @@functional_annotations = {}
  @@functional_annotations.merge!(load_functional_annotations(File.join(ENV['BLASTDB'], 'sp_'+options[:tax_group],'sp_'+options[:tax_group]+'.index')))
  #@@functional_annotations.merge!(load_functional_annotations(File.join(ENV['BLASTDB'], 'tr_'+options[:tax_group],'tr_'+options[:tax_group]+'.index'))) if options[:acess_db].include?('t')

  @@fasta_file = FastaQualFile.new(input_file,'')
  file_head = "Query_id\tfasta_length\tSubject_id\tdb_name\tStatus\te_value\tp_ident\ts_length\tprotein_length\tWarning_msgs\tframe\tORF_start\tORF_end\ts_start\ts_end\tDescription\tgo_id\tgo_description\tkegg_id\tinterpro_id\tinterpro_description\tec_id\tpfam_id\tpfam_description\tunipathway_id"

  @@output_files = {}
  # Seq annotation files
  if !options[:chimera].nil?
    @@output_files[CHIMERA] = File.open("fln_results/chimeric_sequences.txt", 'w')
    @@output_files[CHIMERA].puts file_head
  elsif File.exist?("fln_results/chimeric_sequences.txt")
    # Remove the stale report of a previous run made with chimera detection.
    File.delete("fln_results/chimeric_sequences.txt")
  end
  @@output_files[OTHER] = File.open('fln_results/artifact_other.txt', 'w')
  @@output_files[MISASSEMBLED] = File.open('fln_results/misassembled.txt', 'w')
  @@output_files[UNKNOWN] = File.open('fln_results/unknown.txt', 'w')
  @@output_files['db'] = File.open('fln_results/pt_seqs', 'w')
  @@output_files[CODING] = File.open('fln_results/new_coding.txt', 'w')
  @@output_files[NCRNA] = File.open('fln_results/nc_rnas.txt', 'w')

  # Complementary files
  @@output_files['align'] = File.open('fln_results/alignments.txt', 'w')
  @@output_files['prot'] = File.open('fln_results/proteins.fasta', 'w') # FASTA
  @@output_files['nts'] = File.open("fln_results/nt_seq.txt", 'w')
  @@output_files['seqs'] = File.open('fln_results/unigenes.fasta', 'w') # FASTA
  @@output_files['stats_html'] = File.open('fln_results/summary_stats.html', 'w')
  @@output_files['stats_txt'] = File.open('fln_results/summary_stats.txt', 'w')

  @@output_files[CODING].puts file_head
  @@output_files['db'].puts file_head
  @@output_files[NCRNA].puts file_head

  #RepTrans module
  @@seqs_annotation_prot = []
  @@seqs_some_coding = []
  @@seqs_unknown = []

  #Transdecoder module
  @@complete_sure = []
  @@seqs_to_analyze = []
end
91
+
92
+ #############################################################################################
93
+ # MANAGER TERMINATION
94
+ #############################################################################################
95
+
96
+ # close files
97
# Finishes the run: launches the TransDecoder ORF prediction when the 'p'
# database flag is set and both the training and the analysis lists contain
# sequences, writes the summary statistics and closes every output file.
def self.end_work_manager
  if @@options[:acess_db].include?('p') && !@@complete_sure.empty? && !@@seqs_to_analyze.empty?
    orf_prediction_with_transdecoder
  end
  write_summary_stats(
    @@stats_hash,
    @@stats_taxonomy,
    @@stats_different_prot_id,
    @@stats_different_prot_id_complete_seqs,
    @@output_files['stats_txt'],
    @@output_files['stats_html']
  )
  @@output_files.each_value { |handler| handler.close }
end
104
+
105
+ #############################################################################################
106
+ # MANAGER NATIVE FUNCTIONS
107
+ #############################################################################################
108
+
109
+ # This method is called every time a worker needs new data to work on. It is invoked as many times as the chunk size indicates.
110
+ # Return the work data or nil if no more data is available
111
# Manages the workers' input: reads the next record of the FASTA file.
#
# Returns a Sequence built from the record (counting it in the 'input_seqs'
# statistic), or nil when the reader gives no more names.
def next_work
  name, fasta, qual = @@fasta_file.next_seq
  return nil if name.nil?

  @@stats_hash['input_seqs'] += 1
  Sequence.new(name, fasta, qual)
end
120
+
121
+ # this method is executed each time an obj is finished
122
# Manages the workers' output for one finished chunk.
#
# objs - Array of sequences returned by a worker.
#
# Every sequence is offered to the TransDecoder and RepTrans collectors,
# annotated with taxonomy and functional data when its type lies strictly
# between UNKNOWN and NCRNA, and written to the output files. Afterwards the
# summary statistics are updated with the whole chunk.
def work_received(objs)
  objs.each do |seq|
    transdecoder_keep_seq(seq)
    repTrans_keep_seq(seq)
    if seq.type > UNKNOWN && seq.type < NCRNA
      get_taxonomy(seq.hit.definition, @@stats_taxonomy)
      get_functional_annotations(seq)
    end
    # Don't write Unknown or coding sequences when use transdecoder
    # ('p' flag without the 'c' flag).
    access_db = @@options[:acess_db]
    deferred = !access_db.include?('c') && access_db.include?('p') && (seq.type == UNKNOWN || seq.type == CODING)
    write_seq(seq) unless deferred
  end
  updated_stats = summary_stats(objs, @@stats_hash, @@stats_different_prot_id, @@stats_different_prot_id_complete_seqs)
  @@stats_hash, @@stats_different_prot_id, @@stats_different_prot_id_complete_seqs = updated_stats
end
134
+
135
# Reports a failed chunk on standard output.
#
# worker_error - error object exposing the exception raised in the worker
#                through original_exception.
# obj          - the failed chunk; the name of its first sequence is printed.
def error_received(worker_error, obj)
  header = "WARNING!!!!!. CHUNK FAILED:Error while processing object #{obj.first.seq_name}\n"
  failure = worker_error.original_exception
  puts header + failure.message + ":\n" + failure.backtrace.join("\n")
end
138
+
139
# Logs that the error limit was reached.
# NOTE(review): @@error_count and @@count are not assigned in the visible
# code; they are presumably maintained by the parent class -- confirm.
def too_many_errors_received
  message = "Too many errors: #{@@error_count} errors on #{@@count} executed sequences, exiting before finishing"
  $LOG.error(message)
end
142
+
143
+ # send initial config
144
# Returns the initial configuration sent to the workers: the options
# received by init_work_manager.
def worker_initial_config
  @@options
end
147
+
148
+ #############################################################################################
149
+ # CUSTOM FUNCTIONS
150
+ #############################################################################################
151
+
152
# Loads a tab-separated annotation index.
#
# annotation_file - path of the index; the first column of every line is the
#                   accession and the remaining columns are its annotations.
#
# Returns a Hash mapping accession => Array with the remaining columns.
# Raises the usual Errno errors when the file cannot be read.
def self.load_functional_annotations(annotation_file)
  functional_annotations = {}
  # File.foreach closes the file when the iteration finishes; the original
  # File.open(...).each left the handle open.
  File.foreach(annotation_file) do |line|
    fields = line.chomp.split("\t")
    acc = fields.shift
    next if acc.nil? # Blank line: nothing to index

    functional_annotations[acc] = fields
  end
  functional_annotations
end
162
+
163
# Attaches the functional annotations of the hit to the sequence.
#
# seq - sequence whose hit accession is looked up in the loaded annotations
#       after removing the "-<digits>" splicing code of the UniProt accession.
#
# When the accession is known, seq.functional_annotations receives a Hash
# with one entry per annotation type; otherwise the sequence is left as is.
def get_functional_annotations(seq)
  accession = seq.hit.acc.gsub(/-\d+/,'')
  all_info = @@functional_annotations[accession]
  return if all_info.nil?

  seq.functional_annotations = @@func_annot_type.each_with_object({}) do |(type, position), annotations|
    annotations[type] = all_info[position]
  end
end
173
+
174
+
175
+ # write results to files
176
# Writes the results of a sequence to the output files.
#
# seq - sequence responding to write_info and seq_name.
#
# Errors raised while writing are reported on standard output and do not
# stop the run. Only StandardError is rescued: the original
# `rescue Exception` also swallowed Interrupt and SystemExit, which made the
# process ignore Ctrl-C and exit requests while writing.
def write_seq(seq)
  seq.write_info(@@output_files)
rescue StandardError => e
  puts "Error printing #{seq.seq_name}"
  puts e.message, e.backtrace.join("\n")
end
185
+
186
# Keeps the sequence for the RepTrans module when the :reptrans option is set.
#
# The body used to be an exact copy of the class-level method of the same
# name; it now delegates to it so both entry points cannot drift apart.
def repTrans_keep_seq(seq)
  self.class.repTrans_keep_seq(seq)
end
198
+
199
# Keeps the sequence for the RepTrans module when the :reptrans option is set.
#
# seq - sequence whose type selects the list: COMPLETE..INTERNAL goes to the
#       protein-annotated list, CODING to the coding list and UNKNOWN to the
#       unknown list. Any other type is ignored.
def self.repTrans_keep_seq(seq)
  return if @@options[:reptrans].nil?

  bucket = case seq.type
           when COMPLETE .. INTERNAL then @@seqs_annotation_prot
           when CODING then @@seqs_some_coding
           when UNKNOWN then @@seqs_unknown
           end
  bucket << seq unless bucket.nil?
end
211
+
212
# Collects the sequences needed by the TransDecoder step; does nothing
# unless the 'p' database flag is set.
#
# COMPLETE sequences with a truthy status and a hit identity of at least
# :training_ident are kept in the list later used as training set; CODING
# and UNKNOWN sequences are queued for analysis.
def transdecoder_keep_seq(seq)
  return unless @@options[:acess_db].include?('p')

  case seq.type
  when COMPLETE
    reliable = seq.status && seq.hit.ident >= @@options[:training_ident]
    @@complete_sure << seq if reliable
  when CODING, UNKNOWN
    @@seqs_to_analyze << seq
  end
end
224
+
225
# Predicts ORFs with TransDecoder for the queued sequences.
#
# A training set is built from representatives of the clustered reliable
# complete sequences, TransDecoder is run inside the temp/ directory (which
# must already exist) and its output files are parsed. Every queued sequence
# with coding information is reclassified; all queued sequences are then
# offered to RepTrans and written to the output files.
def self.orf_prediction_with_transdecoder
  clusters_seqs_annot_prot = clustering_by_id(@@complete_sure)
  final_seqs = select_representative(clusters_seqs_annot_prot)
  coding_info = nil
  Dir.chdir('temp') do
    orfs = get_seqs(final_seqs)
    File.open('training_set.fasta', 'w') {|f| f.write(orfs)}
    orfs = get_seqs(@@seqs_to_analyze)
    File.open('analyse_set.fasta', 'w') {|f| f.write(orfs)}
    cmd = "TransDecoder -t analyse_set.fasta --workdir transdecoder --train training_set.fasta"
    # Dir.exist? replaces Dir.exists?, which was removed in Ruby 3.2.
    cmd << ' --reuse' if Dir.exist?('transdecoder')
    system(cmd)
    coding_info = get_coding_info('transdecoder/longest_orfs.pep')
    coding_info = get_scores('transdecoder/longest_orfs.cds.scores', coding_info)
    coding_info = correct_by_selected('transdecoder/longest_orfs.cds.scores.selected', coding_info)
  end
  @@seqs_to_analyze.each do |seq|
    coding = coding_info[seq.seq_name]
    asign_coding_attributes(seq, coding) if !coding.nil?
    repTrans_keep_seq(seq)
    seq.write_info(@@output_files)
  end
end
248
+
249
# Builds a FASTA-formatted String from the given sequences.
#
# seqs - enumerable of objects responding to seq_name and seq_fasta.
#
# Returns one ">name\nsequence\n" record per element, or '' when empty.
def self.get_seqs(seqs)
  seqs.inject('') do |fasta, seq|
    fasta << ">#{seq.seq_name}\n#{seq.seq_fasta}\n"
  end
end
256
+
257
# Marks the ORFs that are absent from the TransDecoder selection file.
#
# selected    - path of a file with one "seq_name|orf_id" entry per line.
# coding_info - Hash seq_name => { orf_id => info Array }; modified in place.
#
# For every ORF whose id is not listed in the file, info[1] is replaced by '-'.
#
# Returns coding_info.
def self.correct_by_selected(selected, coding_info)
  # Hash used as a set so that membership tests do not scan a list per ORF.
  seqs_selected = {}
  # File.foreach closes the file when done; File.open(...).each leaked it.
  File.foreach(selected) do |line|
    _seq_name, orf_id = line.chomp.split('|', 2)
    seqs_selected[orf_id] = true
  end
  coding_info.each_value do |orfs|
    orfs.each do |orf, info|
      info[1] = '-' unless seqs_selected.key?(orf)
    end
  end
  coding_info
end
271
+
272
# Reclassifies a sequence as CODING using the TransDecoder prediction.
#
# seq    - sequence to update (type, status, t_code and hit are assigned).
# coding - Hash orf_id => info Array for this sequence; select_orf picks the
#          ORF used (and filters this Hash in place).
#
# The statistics are moved from the 'unknown' counters to the 'coding'
# counters. The info Array is read as
# [x_ratio, type, start, stop, strand, ..., score].
def self.asign_coding_attributes(seq, coding)
  seq.type = CODING
  @@stats_hash['unknown'] -= 1
  @@stats_hash['unknown_>200'] -= 1 if seq.seq_fasta.length > 200
  @@stats_hash['unknown_>500'] -= 1 if seq.seq_fasta.length > 500
  @@stats_hash['coding_>200'] += 1 if seq.seq_fasta.length > 200
  @@stats_hash['coding_>500'] += 1 if seq.seq_fasta.length > 500
  @@stats_hash['coding'] += 1
  coding = select_orf(coding)
  if coding[1] == 'complete'
    # `true` replaces the deprecated TRUE constant, which modern Ruby
    # versions no longer define.
    seq.status = true
    @@stats_hash['coding_sure'] += 1
  else
    @@stats_hash['coding_putative'] += 1
  end

  seq.t_code = coding.last
  # The frame is derived from the start coordinate on the '+' strand and
  # from the stop coordinate on the '-' strand, where it is made negative.
  ind = 2
  ind = 3 if coding[4] == '-'
  frame = (coding[ind]%3)+1
  frame = frame * -1 if coding[4] == '-'
  seq.hit = [coding[2], coding[3], frame]
end
295
+
296
# Chooses the ORF that represents a sequence.
#
# orfs_hash - Hash orf_id => info Array; it is filtered in place.
#
# Only the ORFs with the lowest X ratio (info.first) are kept. Among them
# the 'complete' ones are preferred when there is any, and the ORF with the
# highest score (info.last) wins.
#
# Returns the info Array of the chosen ORF, or nil when nothing is left.
def self.select_orf(orfs_hash)
  lowest_x_ratio = get_min_Xratio(orfs_hash)
  orfs_hash.keep_if { |_id, info| info.first == lowest_x_ratio }

  candidates = orfs_hash.select { |_id, info| info[1] == 'complete' }
  candidates = orfs_hash if candidates.empty?

  best_score = get_max_score(candidates)
  candidates.keep_if { |_id, info| info.last == best_score }
  candidates.values.first
end
307
+
308
# Returns the highest score (last element of every info Array) found in
# orfs_hash, or nil when the collection is empty.
def self.get_max_score(orfs_hash)
  orfs_hash.inject(nil) do |score, (_id, info)|
    local = info.last
    (score.nil? || local > score) ? local : score
  end
end
320
+
321
# Returns the lowest X ratio (first element of every info Array) found in
# orfs_hash, or nil when the collection is empty.
def self.get_min_Xratio(orfs_hash)
  orfs_hash.inject(nil) do |ratio, (_id, info)|
    local = info.first
    (ratio.nil? || local < ratio) ? local : ratio
  end
end
333
+
334
# Parses a TransDecoder peptide FASTA file.
#
# file_name - path of the file; record names are expected as
#             "seq_name|orf_id" and the comments are searched for
#             "type:<type>" and ":<start>-<stop>(<strand>)".
#
# Returns a Hash seq_name => { orf_id => [x_ratio, type, start, stop, strand] }
# where x_ratio is the proportion of 'X' characters in the peptide. Any
# error raised while reading prints a warning and returns what was
# collected so far.
def self.get_coding_info(file_name)
  coding_info = {}
  begin
    FastaQualFile.new(file_name, '').each do |name, seq, comments, _qual|
      x_ratio = seq.count('X') / seq.length.to_f
      seq_name, orf_id = name.split('|')

      comments =~ /type:(\S+)/
      type = Regexp.last_match(1)

      # Unmatched coordinates end up as 0, 0 and a nil strand.
      comments =~ /:(\d+)-(\d+)\(([+-])\)/
      start = Regexp.last_match(1).to_i
      stop = Regexp.last_match(2).to_i
      strand = Regexp.last_match(3)

      (coding_info[seq_name] ||= {})[orf_id] = [x_ratio, type, start, stop, strand]
    end
  rescue
    puts "Warning!!!!!!!!!!: Transdecoder file is missing. Check if Transdecoder is installed"
  end
  coding_info
end
361
+
362
# Adds the TransDecoder scores to the coding information.
#
# file_name   - path of a tab-separated file whose first column is
#               "seq_name|orf_id" and whose second column is the score.
# coding_info - Hash seq_name => { orf_id => info Array }; modified in place.
#
# ORFs with a positive score get it appended to their info Array. ORFs with
# a score of zero or less are removed, together with their sequence entry
# when it runs out of ORFs. Lines naming unknown sequences or ORFs, and
# blank lines, are ignored.
#
# Returns coding_info.
def self.get_scores(file_name, coding_info)
  # File.foreach closes the file when done; File.open(...).each leaked it.
  File.foreach(file_name) do |line|
    fields = line.chomp.split("\t")
    name = fields.shift
    next if name.nil? # Blank line (the original crashed on it)

    seq, orf_id = name.split('|')
    coding = coding_info[seq]
    next if coding.nil?

    orf = coding[orf_id]
    next if orf.nil?

    score = fields.first.to_f
    if score > 0
      orf << score
    else
      coding.delete(orf_id)
      coding_info.delete(seq) if coding.empty?
    end
  end
  coding_info
end
384
+
385
# Returns the three lists collected for RepTrans as
# [protein-annotated, coding, unknown].
def self.get_annotations
  [@@seqs_annotation_prot, @@seqs_some_coding, @@seqs_unknown]
end
388
+ end
389
+