gene_assembler 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. data/.gitignore +22 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +29 -0
  5. data/Rakefile +2 -0
  6. data/bin/GeneAssembler +233 -0
  7. data/bin/phytozome_scan +60 -0
  8. data/gene_assembler.gemspec +25 -0
  9. data/lib/gene_assembler.rb +5 -0
  10. data/lib/gene_assembler/blast_type_parser.rb +41 -0
  11. data/lib/gene_assembler/contig.rb +643 -0
  12. data/lib/gene_assembler/dataset.rb +532 -0
  13. data/lib/gene_assembler/exonerate_result.rb +230 -0
  14. data/lib/gene_assembler/gff_contig.rb +67 -0
  15. data/lib/gene_assembler/gff_dataset.rb +152 -0
  16. data/lib/gene_assembler/gff_feature.rb +175 -0
  17. data/lib/gene_assembler/gff_frameshift.rb +6 -0
  18. data/lib/gene_assembler/gff_go.rb +13 -0
  19. data/lib/gene_assembler/gff_hit.rb +53 -0
  20. data/lib/gene_assembler/gff_hsp.rb +6 -0
  21. data/lib/gene_assembler/gff_localization.rb +6 -0
  22. data/lib/gene_assembler/gff_master_feature.rb +5 -0
  23. data/lib/gene_assembler/gff_parser.rb +35 -0
  24. data/lib/gene_assembler/gff_snp.rb +21 -0
  25. data/lib/gene_assembler/gff_stop.rb +6 -0
  26. data/lib/gene_assembler/go.rb +13 -0
  27. data/lib/gene_assembler/hit.rb +191 -0
  28. data/lib/gene_assembler/hsp.rb +100 -0
  29. data/lib/gene_assembler/other_functions.rb +228 -0
  30. data/lib/gene_assembler/parser.rb +25 -0
  31. data/lib/gene_assembler/parser_blast.rb +12 -0
  32. data/lib/gene_assembler/parser_exonerate.rb +16 -0
  33. data/lib/gene_assembler/rebuild.rb +975 -0
  34. data/lib/gene_assembler/report.rb +13 -0
  35. data/lib/gene_assembler/report_gff.rb +30 -0
  36. data/lib/gene_assembler/snp.rb +13 -0
  37. data/lib/gene_assembler/version.rb +3 -0
  38. metadata +149 -0
@@ -0,0 +1,532 @@
1
+ require 'contig'
2
+
3
# Container for Contig objects and for clusters of contigs. A cluster is an
# Array of contigs whose first hit carries the same name (see #clustering).
# The class also holds the filtering, sorting and alignment passes that walk
# those collections.
class Dataset
  attr_accessor :type, :contigs, :clusters, :references_hash

  # type:: stored in @type; no method of this class reads it.
  def initialize(type)
    @type = type
    @contigs = []
    @clusters = []
    @references_hash = '' # replaced by a Hash once #load_references has run
  end

  # Creates a Contig called +name+, appends it to @contigs and returns it.
  def add_contig(name)
    contig = Contig.new(name)
    @contigs << contig
    contig
  end

  # Appends contigs to @contigs.
  #
  # add_contigs:: a contig or an Array of contigs.
  # limit:: 0 (default) appends everything and flattens @contigs; any other
  #         value appends at most +limit+ leading elements, and only when
  #         +add_contigs+ is an Array (anything else is silently ignored).
  def transfer_contigs(add_contigs, limit=0)
    if limit == 0
      @contigs << add_contigs
      @contigs.flatten!
    elsif add_contigs.is_a?(Array)
      add_contigs.each_with_index do |contig, i|
        break if i == limit
        @contigs << contig
      end
    end
  end

  # Appends +cluster+ to @clusters.
  def transfer_cluster(cluster)
    @clusters << cluster
  end

  # Removes every element of @clusters equal to +cluster+.
  def delete_cluster(cluster)
    @clusters.delete(cluster)
  end

  # Removes the cluster stored at index +ind+.
  def delete_cluster_at(ind)
    @clusters.delete_at(ind)
  end

  # Iterators

  # Yields every contig in @contigs.
  def each_contig
    @contigs.each { |contig| yield contig }
  end

  # Yields every contig in @contigs together with its index.
  def each_contig_with_index
    @contigs.each_with_index { |contig, i| yield contig, i }
  end

  # Yields every cluster in @clusters.
  def each_cluster
    @clusters.each { |cluster| yield cluster }
  end

  # Yields every cluster in @clusters together with its index.
  def each_cluster_with_index
    @clusters.each_with_index { |cluster, i| yield cluster, i }
  end

  # Returns the number of contigs (an Integer, despite the trailing '?').
  def n_contigs?
    @contigs.length
  end

  # Empties @contigs by assigning a new Array.
  def clr_contigs
    @contigs = []
  end

  # Returns true when @clusters holds no cluster at all.
  def clusters_empty?
    @clusters.empty?
  end

  # Returns the number of contigs.
  def contig_count
    @contigs.length
  end

  # Returns the number of clusters.
  def cluster_count
    @clusters.length
  end

  # Restores attributes that the exonerate-derived dataset lacks by copying
  # them from +dataset+ (per the original note, the blast-derived one).
  # For each contig of self, the first contig of +dataset+ with the same name
  # provides length, seq and, for every hit, the s_length of its first hit.
  def attrib_recover(dataset)
    each_contig do |self_contig|
      dataset.each_contig do |dataset_contig|
        next unless self_contig.name == dataset_contig.name
        self_contig.length = dataset_contig.length
        self_contig.seq = dataset_contig.seq
        self_contig.each_hit do |hit|
          hit.s_length = dataset_contig.first_hit.s_length
        end
        break # only the first contig with a matching name is used
      end
    end
  end

  # Calls correct_hsps(blast_coor_type) on every contig.
  def correct_hsp_contigs(blast_coor_type)
    each_contig { |contig| contig.correct_hsps(blast_coor_type) }
  end

  # Groups the contigs by the name of their first hit and appends one Array
  # per distinct name to @clusters. Groups appear in order of first
  # occurrence and keep the contig order of @contigs.
  def clustering
    @clusters.concat(@contigs.group_by { |contig| contig.first_hit.name }.values)
  end

  # When $verbose is set, prints for every cluster the first-hit name and
  # contig name of its members, then asks every contig to draw itself.
  def info_clusters
    return unless $verbose
    each_cluster do |cl|
      puts '............................'
      cl.each do |c|
        puts "#{c.first_hit.name}\t#{c.name}"
      end
      puts "............................"
    end

    each_cluster do |clust|
      puts "\n********************MAP*************************\n"
      clust.each { |contig| contig.draw }
    end
    puts "\n"
  end

  # Filter battery over @contigs. Contigs that are mixed, gapped, truncated
  # or for which hsp_minor_than?(15) holds (15 is in nt per the original
  # note) are dropped. Single-HSP contigs are set aside and returned.
  # @contigs is replaced by the survivors.
  def filtering
    putative_contigs = []
    uni_hsp = []
    each_contig do |contig|
      next if contig.mixed?
      if contig.is_one_hsp? # single-HSP contigs are kept apart
        uni_hsp << contig
        next
      end
      next if contig.is_gapped? || contig.is_truncated? || contig.hsp_minor_than?(15)
      putative_contigs << contig
      puts "#{contig.first_hit.name}\t#{contig.name}" if $verbose
    end
    @contigs = putative_contigs
    uni_hsp
  end

  # Assigns hash[contig.name] to the seq of every contig and upcases it in
  # place. The String stored in +hash+ is the object that gets upcased, and a
  # name missing from +hash+ raises NoMethodError on nil.
  def load_seq(hash)
    each_contig do |contig|
      contig.seq = hash[contig.name]
      contig.seq.upcase!
    end
  end

  # Calls rev_comp_if_hit on every contig.
  def rev_comp
    each_contig { |contig| contig.rev_comp_if_hit }
  end

  # Calls stop_codon_search on every contig.
  def parse_stops
    each_contig { |contig| contig.stop_codon_search }
  end

  # Writes every contig to +fasta_file+ as ">name" followed by its sequence.
  # The block form of File.open closes the file even if a contig raises.
  def fasta(fasta_file)
    File.open(fasta_file, 'w') do |out|
      each_contig do |contig|
        out.print ">#{contig.name}\n"
        out.puts contig.seq
      end
    end
  end

  # Filter battery over @clusters. Inside each cluster, contigs that are
  # mixed, truncated, single-HSP or gapped are rejected; the remainder
  # replaces the cluster (possibly leaving it empty). Rejected single-HSP
  # contigs are collected and returned for later processing. With $verbose
  # set, accepted and rejected contigs are printed and accepted ones drawn.
  def filtering_clust
    gene_clusters = []
    uni_hsp = []
    each_cluster do |clust|
      puts "\n********************CLUSTER*************************\n" if $verbose
      putative_ex = []
      trash_ex = []
      clust.each do |contig|
        if contig.mixed? || contig.is_truncated?
          trash_ex << "#{contig.first_hit.name}\t#{contig.name}"
        elsif contig.is_one_hsp?
          trash_ex << "#{contig.first_hit.name}\t#{contig.name}"
          uni_hsp << contig # kept for later processing
        elsif contig.is_gapped?
          trash_ex << "#{contig.first_hit.name}\t#{contig.name}"
        else
          putative_ex << contig
        end
      end
      if $verbose
        putative_ex.each do |contig|
          # Original note: every hsp carries the same score, so it really
          # belongs to the whole alignment.
          puts "#{contig.first_hit.name}\t#{contig.name}\t\t\tsc:#{contig.first_hit.first_hsp.score}"
        end
        puts ',,,,,,,,,,,,,REJECTED,,,,,,,,,,,,,'
        trash_ex.each { |line| puts line }
        puts "\n= = = = = = = = = =MAP= = = = = = = = = = = =\n"
        putative_ex.each { |contig| contig.draw }
      end
      gene_clusters << putative_ex
    end
    @clusters = gene_clusters
    uni_hsp
  end

  # Sorts the contigs inside every cluster in place (see #sort_cluster).
  def sort_cont_clust
    each_cluster { |cluster| sort_cluster(cluster) }
  end

  # Sorts +cluster+ in place, ascending by the s_beg of the first HSP of each
  # contig's first hit, and returns it.
  def sort_cluster(cluster)
    cluster.sort! { |e1, e2| e1.first_hit.first_hsp.s_beg <=> e2.first_hit.first_hsp.s_beg }
  end

  # Builds @references_hash from +references_file+ and returns it. A missing
  # file yields an empty Hash.
  #
  # Each line is "name structures", split on whitespace. +structures+ holds
  # gene models separated by '|'; each model holds "q_beg-q_end" exons
  # separated by ';'. Every model becomes a Contig with one hit whose HSPs
  # carry the exon coordinates, plus subject coordinates derived as
  # exon_length / 3 with the remainder carried over to the next exon.
  # Lines without a second field are skipped.
  def load_references(references_file)
    hash = {}
    if File.exist?(references_file)
      File.foreach(references_file) do |line|
        contig_name, structures = line.split
        next if structures.nil?
        hash[contig_name] = structures.split('|').map do |structure|
          contig = Contig.new(contig_name)
          contig.add_hit(contig_name, 0, 1, :nt)
          s_end = 0
          nt_add = 0 # nucleotides left over from the previous exon
          structure.split(';').each do |hsp|
            coords = hsp.split('-')
            q_beg = coords[0].to_i
            q_end = coords[1].to_i
            s_beg = s_end + 1
            exon_length = q_end - q_beg + nt_add
            s_end += exon_length / 3
            nt_add = exon_length.modulo(3)
            contig.first_hit.add_hsp(q_beg, q_end, s_beg, s_end, 0, 0, 0, 0)
          end
          contig.length = contig.first_hit.last_hsp.q_end
          contig
        end
      end
    end
    @references_hash = hash
  end

  # Moves into self the clusters of +dataset+ that self lacks.
  # When self has no clusters, every cluster of +dataset+ is transferred and
  # +dataset+ is cleared. Otherwise a cluster is transferred (as a shallow
  # copy) and removed from +dataset+ only when no cluster of self shares the
  # first-hit name of its first contig.
  def missing_cluster_transfer(dataset)
    if clusters_empty?
      dataset.each_cluster { |clust| transfer_cluster(clust) }
      dataset.clear_clusters
    else
      add = []
      delete = []
      dataset.each_cluster_with_index do |uni_cluster, ind|
        known = @clusters.any? do |cluster|
          uni_cluster.first.first_hit.name == cluster.first.first_hit.name
        end
        next if known
        add << uni_cluster
        delete << ind
      end
      add.each { |clust| transfer_cluster(clust.dup) }
      # Delete from the highest index down so the remaining indexes stay valid.
      delete.sort.reverse_each { |ind| dataset.delete_cluster_at(ind) }
    end
  end

  # Empties @clusters by assigning a new Array.
  def clear_clusters
    @clusters = []
  end

  # Adds factor * (contig.n_intron) to the score of every HSP of the first
  # hit of every contig.
  def score_correction(factor)
    each_contig do |contig|
      n_intron = contig.n_intron
      contig.first_hit.each_hsp do |hsp|
        hsp.score += factor * n_intron
      end
    end
  end

  # Computes the shift needed to correct negative indexes in the gff (per the
  # original note). Contigs are compared against +contig_base+ when it is
  # given, otherwise each contig against the previous one. The most negative
  # value returned by coord_prot for an overlapping pair is returned with its
  # sign flipped; 0 when no pair yields a negative value.
  def correct_left_side_contigs(contig_base)
    last_contig = nil
    limit = 0 # without a reference the first contig has nothing to compare to
    correct = 0
    unless contig_base.nil?
      limit = -1
      last_contig = contig_base
    end
    each_contig_with_index do |contig, i|
      if i > limit
        overlap_exon_with_last, _ex = contig.compare(last_contig)
        if overlap_exon_with_last > -1
          overlap_exon_current, _ex = last_contig.compare(contig)
          diference = coord_prot(last_contig.hsp_at(overlap_exon_with_last), contig.hsp_at(overlap_exon_current))
          correct = diference if diference < correct
        end
      end
      last_contig = contig_base.nil? ? contig : contig_base
    end
    -correct
  end

  # Aligns the contigs against +contig_base+ or, when it is nil, each contig
  # against the previous one, shifting coordinates and length by the computed
  # displacement. Without a reference the displacement accumulates; a contig
  # with no overlap is placed after the previous contig's length. With a
  # reference the displacement is reset per contig, and contigs that do not
  # overlap the reference are left untouched.
  def align_contigs(contig_base)
    limit = 0
    last_contig = nil
    unless contig_base.nil?
      limit = -1
      last_contig = contig_base
    end
    add = 0
    align = true
    each_contig_with_index do |contig, i|
      if i > limit
        # Displacement of this contig relative to the previous one in the gff.
        overlap_exon_with_last, _ex = contig.compare(last_contig)
        if overlap_exon_with_last == -1
          if contig_base.nil?
            add += last_contig.length
          else
            align = false
          end
        else
          overlap_exon_current, _ex = last_contig.compare(contig)
          add += coord_prot(last_contig.hsp_at(overlap_exon_with_last), contig.hsp_at(overlap_exon_current))
          align = true unless contig_base.nil?
        end
      end

      # Modify only when there is no reference or the contig aligned to it.
      if align || contig_base.nil?
        contig.modified_coordenates(add)
        contig.length += add
      end
      if contig_base.nil?
        last_contig = contig
      else
        last_contig = contig_base
        add = 0 # reset the displacement when a reference is used
      end
    end
  end

  # Aligns the contigs against every reference in +array_contig_base+, then
  # shifts both the references and the contigs of self by the largest
  # left-side correction found. Returns that correction.
  #
  # mod_contig_base:: accepted for interface compatibility; not used.
  #                   NOTE(review): the original forwarded it to
  #                   #align_contigs, which takes one argument, so the call
  #                   always raised ArgumentError. Confirm the intended
  #                   meaning before wiring it in.
  def multiple_align_contigs(array_contig_base, mod_contig_base=false)
    correct = 0
    array_contig_base.each do |contig_base|
      local_correct = correct_left_side_contigs(contig_base)
      correct = local_correct if local_correct > correct
      align_contigs(contig_base)
    end
    # Apply the global displacement on top of the local ones already applied.
    if correct > 0
      array_contig_base.each do |contig|
        contig.modified_coordenates(correct)
        contig.length += correct
      end
      each_contig do |contig|
        contig.modified_coordenates(correct)
        contig.length += correct
      end
    end
    correct
  end

  # Looks in +dataset+ for the clusters that share the first-hit name of
  # +cluster+, sets the type of the first hit of their contigs to
  # 'pseudogene' and transfers them into @contigs (see #transfer_contigs for
  # the meaning of +limit+). Does nothing when +cluster+ is nil or empty.
  #
  # NOTE(review): +new_hit_type+ is ignored and 'pseudogene' is hard-coded,
  # as in the original; confirm whether the parameter was meant to be used.
  def transfer_n_contigs_def_hit_type(dataset, cluster, new_hit_type, limit)
    return if cluster.nil? || cluster.empty?
    dataset.each_cluster do |dat_cluster|
      next if dat_cluster.nil? || dat_cluster.empty?
      next unless dat_cluster.first.first_hit.name == cluster.first.first_hit.name
      dat_cluster.each do |contig|
        contig.first_hit.type = 'pseudogene'
      end
      transfer_contigs(dat_cluster, limit)
    end
  end

  # For every cluster of self, finds the cluster(s) of +dataset+ with the
  # same first-hit name and moves into the cluster of self those contigs
  # whose compare against every contig of self returns a position of -1.
  # Moved contigs are deleted from the cluster of +dataset+.
  # Per the original note, +dataset+ is the single-HSP dataset.
  def missing_contigs_transfer(dataset)
    each_cluster_with_index do |self_cluster, s|
      dataset.each_cluster do |dataset_cluster|
        next if dataset_cluster.nil? || dataset_cluster.empty?
        next unless self_cluster.first.first_hit.name == dataset_cluster.first.first_hit.name
        # Collect first, transfer afterwards: neither cluster is modified
        # while it is being iterated.
        unaligned = dataset_cluster.reject do |dataset_contig|
          self_cluster.any? do |self_contig|
            position, _n_exones = dataset_contig.compare(self_contig)
            position > -1
          end
        end
        unaligned.each do |contig|
          transfer_contig_to_cluster(contig, s)
          dataset_cluster.delete(contig)
        end
      end
    end
  end

  # Appends +contig+ to the cluster stored at index +n_cluster+.
  def transfer_contig_to_cluster(contig, n_cluster)
    @clusters[n_cluster] << contig
  end

  # For every contig whose first HSP has s_beg <= 10, writes a
  # "gene\tcontig\tq_beg" line to +file+ and the sequence slice
  # seq[0..q_beg] to +fasta+. Both files are closed even if an error occurs.
  #
  # NOTE(review): the header line written to +fasta+ has no leading '>';
  # kept as in the original, confirm whether consumers expect real FASTA.
  def generate_file_5_prime(file, fasta)
    File.open(file, 'w') do |prime5_file|
      File.open(fasta, 'w') do |fasta_file|
        each_cluster do |cluster|
          next if cluster.nil? || cluster.empty?
          gene_name = cluster.first.first_hit.name
          cluster.each do |contig|
            first_hsp = contig.first_hit.first_hsp
            next unless first_hsp.s_beg <= 10
            prime5_end = first_hsp.q_beg
            prime5_file.puts "#{gene_name}\t#{contig.name}\t#{prime5_end}"
            seq = contig.seq[0..prime5_end]
            fasta_file.puts "#{gene_name}\n#{seq}" unless seq.nil?
          end
        end
      end
    end
  end

end