gene_assembler 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. data/.gitignore +22 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +29 -0
  5. data/Rakefile +2 -0
  6. data/bin/GeneAssembler +233 -0
  7. data/bin/phytozome_scan +60 -0
  8. data/gene_assembler.gemspec +25 -0
  9. data/lib/gene_assembler.rb +5 -0
  10. data/lib/gene_assembler/blast_type_parser.rb +41 -0
  11. data/lib/gene_assembler/contig.rb +643 -0
  12. data/lib/gene_assembler/dataset.rb +532 -0
  13. data/lib/gene_assembler/exonerate_result.rb +230 -0
  14. data/lib/gene_assembler/gff_contig.rb +67 -0
  15. data/lib/gene_assembler/gff_dataset.rb +152 -0
  16. data/lib/gene_assembler/gff_feature.rb +175 -0
  17. data/lib/gene_assembler/gff_frameshift.rb +6 -0
  18. data/lib/gene_assembler/gff_go.rb +13 -0
  19. data/lib/gene_assembler/gff_hit.rb +53 -0
  20. data/lib/gene_assembler/gff_hsp.rb +6 -0
  21. data/lib/gene_assembler/gff_localization.rb +6 -0
  22. data/lib/gene_assembler/gff_master_feature.rb +5 -0
  23. data/lib/gene_assembler/gff_parser.rb +35 -0
  24. data/lib/gene_assembler/gff_snp.rb +21 -0
  25. data/lib/gene_assembler/gff_stop.rb +6 -0
  26. data/lib/gene_assembler/go.rb +13 -0
  27. data/lib/gene_assembler/hit.rb +191 -0
  28. data/lib/gene_assembler/hsp.rb +100 -0
  29. data/lib/gene_assembler/other_functions.rb +228 -0
  30. data/lib/gene_assembler/parser.rb +25 -0
  31. data/lib/gene_assembler/parser_blast.rb +12 -0
  32. data/lib/gene_assembler/parser_exonerate.rb +16 -0
  33. data/lib/gene_assembler/rebuild.rb +975 -0
  34. data/lib/gene_assembler/report.rb +13 -0
  35. data/lib/gene_assembler/report_gff.rb +30 -0
  36. data/lib/gene_assembler/snp.rb +13 -0
  37. data/lib/gene_assembler/version.rb +3 -0
  38. metadata +149 -0
@@ -0,0 +1,532 @@
1
+ require 'contig'
2
+
3
# Holds the contigs read from an alignment report together with the clusters
# (arrays of contigs whose first hit has the same name) derived from them.
#
# Contigs are duck-typed: this class only relies on the methods it calls on
# them (name, seq, length, first_hit, compare, ...). The free function
# coord_prot used by the alignment methods is defined outside this class.
class Dataset
  attr_accessor :type, :contigs, :clusters, :references_hash

  # type - stored in @type; nothing in this class reads it back.
  def initialize(type)
    @type = type
    @contigs = []
    @clusters = []
    @references_hash = '' # replaced by a Hash once load_references runs
  end

  # Creates a Contig called +name+, appends it to @contigs and returns it.
  def add_contig(name)
    contig = Contig.new(name)
    @contigs << contig
    contig
  end

  # Appends contigs to @contigs.
  #
  # add_contigs - a contig or an (optionally nested) array of contigs.
  # limit       - 0 appends everything and flattens @contigs; a positive value
  #               appends at most that many elements of an Array argument; a
  #               negative value appends the whole Array. With a non-zero
  #               limit a non-Array argument is ignored.
  def transfer_contigs(add_contigs, limit = 0)
    if limit == 0
      @contigs << add_contigs
      @contigs.flatten!
    elsif add_contigs.is_a?(Array)
      selected = limit > 0 ? add_contigs.first(limit) : add_contigs
      @contigs.concat(selected)
    end
  end

  # Appends +cluster+ to @clusters.
  def transfer_cluster(cluster)
    @clusters << cluster
  end

  # Removes every element of @clusters equal to +cluster+.
  def delete_cluster(cluster)
    @clusters.delete(cluster)
  end

  # Removes the cluster stored at index +ind+.
  def delete_cluster_at(ind)
    @clusters.delete_at(ind)
  end

  # Iterators. Without a block each of them returns an Enumerator.

  # Yields every contig in @contigs.
  def each_contig
    return to_enum(:each_contig) unless block_given?
    @contigs.each { |contig| yield contig }
  end

  # Yields every contig in @contigs together with its index.
  def each_contig_with_index
    return to_enum(:each_contig_with_index) unless block_given?
    @contigs.each_with_index { |contig, i| yield contig, i }
  end

  # Yields every cluster in @clusters.
  def each_cluster
    return to_enum(:each_cluster) unless block_given?
    @clusters.each { |cluster| yield cluster }
  end

  # Yields every cluster in @clusters together with its index.
  def each_cluster_with_index
    return to_enum(:each_cluster_with_index) unless block_given?
    @clusters.each_with_index { |cluster, i| yield cluster, i }
  end

  # Number of contigs (despite the "?" suffix this returns an Integer).
  def n_contigs?
    @contigs.length
  end

  # Empties @contigs.
  def clr_contigs
    @contigs = []
  end

  # True when @clusters holds no cluster at all.
  def clusters_empty?
    @clusters.empty?
  end

  # Number of contigs.
  def contig_count
    @contigs.length
  end

  # Number of clusters.
  def cluster_count
    @clusters.length
  end

  # Restores attributes on this dataset's contigs from +dataset+ (the original
  # comment says exonerate results lack them and they are recovered from the
  # blast dataset). For each contig, the first contig of +dataset+ with the
  # same name supplies length, seq and the s_length of its first hit, which is
  # copied to every hit of the receiving contig.
  def attrib_recover(dataset)
    donors = {}
    dataset.each_contig do |donor|
      # Keep only the first donor per name, as the original nested scan did.
      donors[donor.name] = donor unless donors.key?(donor.name)
    end
    each_contig do |contig|
      donor = donors[contig.name]
      next if donor.nil?
      contig.length = donor.length
      contig.seq = donor.seq
      contig.each_hit { |hit| hit.s_length = donor.first_hit.s_length }
    end
  end

  # Calls correct_hsps(blast_coor_type) on every contig.
  def correct_hsp_contigs(blast_coor_type)
    each_contig { |contig| contig.correct_hsps(blast_coor_type) }
  end

  # Groups the contigs by the name of their first hit and appends one array
  # per distinct name to @clusters. Clusters appear in order of first
  # occurrence and contigs keep their relative order inside each cluster.
  def clustering
    by_name = {}
    each_contig do |contig|
      key = contig.first_hit.name
      unless by_name.key?(key)
        by_name[key] = []
        @clusters << by_name[key]
      end
      by_name[key] << contig
    end
  end

  # When $verbose is set, prints for every cluster its "hit name<TAB>contig
  # name" pairs and then asks every contig to draw itself.
  def info_clusters
    return unless $verbose
    each_cluster do |cluster|
      puts '............................'
      cluster.each { |contig| puts "#{contig.first_hit.name}\t#{contig.name}" }
      puts "............................"
    end
    each_cluster do |cluster|
      puts "\n********************MAP*************************\n"
      cluster.each { |contig| contig.draw }
    end
    puts "\n"
  end

  # Filter battery over @contigs. Mixed, gapped and truncated contigs and
  # those for which hsp_minor_than?(min_hsp_length) holds are dropped;
  # single-HSP contigs are set aside. The checks run in that fixed order:
  # mixed?, is_one_hsp?, is_gapped?, is_truncated?, hsp_minor_than?.
  #
  # min_hsp_length - threshold handed to Contig#hsp_minor_than? (the original
  #                  comment says it is expressed in nucleotides).
  #
  # Replaces @contigs with the survivors and returns the single-HSP contigs.
  def filtering(min_hsp_length = 15)
    putative_contigs = []
    uni_hsp = []
    each_contig do |contig|
      next if contig.mixed?
      if contig.is_one_hsp? # single-HSP contigs are kept apart
        uni_hsp << contig
        next
      end
      next if contig.is_gapped?
      next if contig.is_truncated?
      next if contig.hsp_minor_than?(min_hsp_length)
      putative_contigs << contig
      puts "#{contig.first_hit.name}\t#{contig.name}" if $verbose
    end
    @contigs = putative_contigs
    uni_hsp
  end

  # Assigns hash[contig.name] to each contig's seq and upcases it in place.
  # A contig whose name is missing from +hash+ gets a nil seq.
  def load_seq(hash)
    each_contig do |contig|
      seq = hash[contig.name]
      contig.seq = seq
      seq.upcase! unless seq.nil?
    end
  end

  # Calls rev_comp_if_hit on every contig.
  def rev_comp
    each_contig { |contig| contig.rev_comp_if_hit }
  end

  # Calls stop_codon_search on every contig.
  def parse_stops
    each_contig { |contig| contig.stop_codon_search }
  end

  # Writes @contigs to +fasta_file+ as ">name" / sequence records.
  def fasta(fasta_file)
    # Block form so the handle is closed even if a contig raises.
    File.open(fasta_file, 'w') do |out|
      each_contig do |contig|
        out.print ">#{contig.name}\n"
        out.puts contig.seq
      end
    end
  end

  # Filter battery over @clusters. Inside each cluster, mixed, truncated,
  # single-HSP and gapped contigs are rejected (checked in that order) and the
  # rest are kept. When $verbose is set the kept and rejected contigs are
  # printed and the kept ones are drawn.
  #
  # Replaces @clusters with the filtered clusters (a cluster may end up
  # empty) and returns the single-HSP contigs for later processing.
  def filtering_clust
    gene_clusters = []
    uni_hsp = []
    each_cluster do |clust|
      puts "\n********************CLUSTER*************************\n" if $verbose
      putative_ex = []
      trash_ex = []
      clust.each do |contig|
        if contig.mixed? || contig.is_truncated?
          trash_ex << rejection_label(contig)
        elsif contig.is_one_hsp?
          trash_ex << rejection_label(contig)
          uni_hsp << contig # kept for later processing
        elsif contig.is_gapped?
          trash_ex << rejection_label(contig)
        else
          putative_ex << contig
        end
      end
      if $verbose
        putative_ex.each do |contig|
          # Original note: every hsp carries the same score, so it really
          # belongs to the whole alignment.
          puts "#{contig.first_hit.name}\t#{contig.name}\t\t\tsc:#{contig.first_hit.first_hsp.score}"
        end
        puts ',,,,,,,,,,,,,REJECTED,,,,,,,,,,,,,'
        trash_ex.each { |label| puts label }
        puts "\n= = = = = = = = = =MAP= = = = = = = = = = = =\n"
        putative_ex.each { |contig| contig.draw }
      end
      gene_clusters << putative_ex
    end
    @clusters = gene_clusters
    uni_hsp
  end

  # Sorts, in place, the contigs of every cluster (see sort_cluster).
  def sort_cont_clust
    each_cluster { |cluster| sort_cluster(cluster) }
  end

  # Sorts +cluster+ in place by the s_beg of each contig's first hit's first
  # hsp, ascending, and returns it.
  def sort_cluster(cluster)
    cluster.sort! { |a, b| a.first_hit.first_hsp.s_beg <=> b.first_hit.first_hsp.s_beg }
  end

  # Loads reference gene models from +references_file+ into @references_hash
  # as { contig_name => [Contig, ...] }.
  #
  # Each line is "name structures", where structures is a "|"-separated list
  # of models and each model a ";"-separated list of "q_beg-q_end" exons.
  # Lines without a second field and empty models are skipped. A missing file
  # yields an empty hash. Returns the hash.
  def load_references(references_file)
    references = {}
    if File.exist?(references_file)
      # File.foreach closes the file once the last line has been read.
      File.foreach(references_file) do |line|
        contig_name, structures = line.split
        next if structures.nil?
        models = []
        structures.split('|').each do |structure|
          model = build_reference_model(contig_name, structure)
          models << model unless model.nil?
        end
        references[contig_name] = models
      end
    end
    @references_hash = references
  end

  # Moves into this dataset the clusters of +dataset+ that it lacks.
  #
  # When this dataset has no clusters, every cluster of +dataset+ is
  # transferred and +dataset+ is cleared. Otherwise only the clusters whose
  # first-hit name matches none of this dataset's clusters are copied (dup)
  # and removed from +dataset+. Empty or nil clusters are ignored on both
  # sides because they carry no name to compare.
  def missing_cluster_transfer(dataset)
    if clusters_empty?
      dataset.each_cluster { |clust| transfer_cluster(clust) }
      dataset.clear_clusters
      return
    end

    add = []
    delete = []
    dataset.each_cluster_with_index do |uni_cluster, ind|
      next if uni_cluster.nil? || uni_cluster.empty?
      name = uni_cluster.first.first_hit.name
      already_present = @clusters.any? do |cluster|
        !cluster.nil? && !cluster.empty? && cluster.first.first_hit.name == name
      end
      next if already_present
      add << uni_cluster
      delete << ind
    end
    add.each { |clust| transfer_cluster(clust.dup) }
    # Delete from the highest index down so earlier removals do not shift
    # the positions still pending.
    delete.sort.reverse_each { |ind| dataset.delete_cluster_at(ind) }
  end

  # Empties @clusters.
  def clear_clusters
    @clusters = []
  end

  # Adds factor * (number of introns of the contig) to the score of every hsp
  # of each contig's first hit.
  def score_correction(factor)
    each_contig do |contig|
      n_intron = contig.n_intron
      contig.first_hit.each_hsp { |hsp| hsp.score += factor * n_intron }
    end
  end

  # Computes the shift needed to avoid negative coordinates (per the original
  # comment, negative indexes in the gff).
  #
  # contig_base - reference contig every contig is compared against, or nil to
  #               compare each contig with the previous one.
  #
  # Returns the most negative displacement reported by coord_prot with its
  # sign flipped, or 0 when no displacement is negative.
  def correct_left_side_contigs(contig_base)
    correct = 0
    last_contig = contig_base
    # Without a reference the first contig has no predecessor to compare with.
    limit = contig_base.nil? ? 0 : -1
    each_contig_with_index do |contig, i|
      if i > limit
        overlap_exon_with_last, _n_exons = contig.compare(last_contig)
        if overlap_exon_with_last > -1
          overlap_exon_current, _n_exons = last_contig.compare(contig)
          diference = coord_prot(last_contig.hsp_at(overlap_exon_with_last),
                                 contig.hsp_at(overlap_exon_current))
          correct = diference if diference < correct
        end
      end
      last_contig = contig_base.nil? ? contig : contig_base
    end
    -correct
  end

  # Aligns the contigs against each other (contig_base nil) or against the
  # reference +contig_base+, shifting coordinates and length of each contig.
  #
  # Without a reference the displacement accumulates from contig to contig; a
  # contig that does not overlap the previous one is placed after it by adding
  # the previous contig's length. With a reference the displacement is reset
  # for every contig, and a contig is only modified once some contig has
  # overlapped the reference.
  #
  # mod_contig_base - accepted because multiple_align_contigs passes it;
  #                   this implementation does not use it.
  def align_contigs(contig_base, mod_contig_base = false)
    last_contig = contig_base
    limit = contig_base.nil? ? 0 : -1
    add = 0
    align = true
    each_contig_with_index do |contig, i|
      if i > limit
        # Displacement of this contig with respect to the previous one.
        overlap_exon_with_last, _n_exons = contig.compare(last_contig)
        if overlap_exon_with_last == -1
          if contig_base.nil?
            add += last_contig.length
          else
            align = false
          end
        else
          overlap_exon_current, _n_exons = last_contig.compare(contig)
          add += coord_prot(last_contig.hsp_at(overlap_exon_with_last),
                            contig.hsp_at(overlap_exon_current))
          align = true unless contig_base.nil?
        end
      end

      # Modify when there is no reference or the contig aligned against it.
      if align || contig_base.nil?
        contig.modified_coordenates(add)
        contig.length += add
      end

      if contig_base.nil?
        last_contig = contig
      else
        last_contig = contig_base
        add = 0 # displacement is per contig when a reference is used
      end
    end
  end

  # Aligns the contigs against every reference in +array_contig_base+ and then
  # shifts both the references and the contigs by the largest left-side
  # correction found. Returns that correction (0 when none was needed).
  #
  # mod_contig_base - forwarded to align_contigs.
  def multiple_align_contigs(array_contig_base, mod_contig_base = false)
    correct = 0
    array_contig_base.each do |contig_base|
      local_correct = correct_left_side_contigs(contig_base)
      correct = local_correct if local_correct > correct
      align_contigs(contig_base, mod_contig_base)
    end
    # Global correction applied on top of the local displacements above.
    if correct > 0
      array_contig_base.each do |contig|
        contig.modified_coordenates(correct)
        contig.length += correct
      end
      each_contig do |contig|
        contig.modified_coordenates(correct)
        contig.length += correct
      end
    end
    correct
  end

  # Looks in +dataset+ for the clusters whose first-hit name matches that of
  # +cluster+, sets the first-hit type of all their contigs and transfers up
  # to +limit+ of them into @contigs (see transfer_contigs). Does nothing when
  # +cluster+ is nil or empty.
  #
  # NOTE(review): new_hit_type is not used; the type is hard-coded to
  # 'pseudogene', as in the original code. Confirm against callers before
  # wiring the parameter in.
  def transfer_n_contigs_def_hit_type(dataset, cluster, new_hit_type, limit)
    return if cluster.nil? || cluster.empty?
    gene_name = cluster.first.first_hit.name
    dataset.each_cluster do |dat_cluster|
      next if dat_cluster.nil? || dat_cluster.empty?
      next unless dat_cluster.first.first_hit.name == gene_name
      dat_cluster.each { |contig| contig.first_hit.type = 'pseudogene' }
      transfer_contigs(dat_cluster, limit)
    end
  end

  # For every cluster shared (same first-hit name) between this dataset and
  # +dataset+, moves into this dataset's cluster the contigs of +dataset+ for
  # which compare reports no overlap (-1) with any contig of the cluster, and
  # deletes them from the cluster in +dataset+. Empty or nil clusters are
  # skipped on both sides.
  def missing_contigs_transfer(dataset)
    each_cluster_with_index do |self_cluster, s|
      next if self_cluster.nil? || self_cluster.empty?
      dataset.each_cluster do |dataset_cluster|
        next if dataset_cluster.nil? || dataset_cluster.empty?
        next unless self_cluster.first.first_hit.name == dataset_cluster.first.first_hit.name
        # Collect first, move afterwards, so neither array changes while it
        # is being iterated.
        missing = dataset_cluster.reject do |dataset_contig|
          self_cluster.any? do |self_contig|
            position, _n_exones = dataset_contig.compare(self_contig)
            position > -1
          end
        end
        missing.each do |contig|
          transfer_contig_to_cluster(contig, s)
          dataset_cluster.delete(contig)
        end
      end
    end
  end

  # Appends +contig+ to the cluster stored at index +n_cluster+.
  def transfer_contig_to_cluster(contig, n_cluster)
    @clusters[n_cluster] << contig
  end

  # For every contig whose first hsp starts at subject position <= 10, writes
  # "gene<TAB>contig<TAB>q_beg" to +file+ and the contig sequence from index 0
  # up to and including index q_beg to +fasta+. Contigs without a sequence
  # only get the line in +file+.
  #
  # NOTE(review): the records written to +fasta+ start with the bare gene
  # name, without the ">" used by #fasta. Kept as in the original; confirm
  # what the consumer of this file expects.
  def generate_file_5_prime(file, fasta)
    # Block form so both handles are closed even if a contig raises.
    File.open(file, 'w') do |prime5_file|
      File.open(fasta, 'w') do |fasta_file|
        each_cluster do |cluster|
          next if cluster.nil? || cluster.empty?
          gene_name = cluster.first.first_hit.name
          cluster.each do |contig|
            first_hsp = contig.first_hit.first_hsp
            next unless first_hsp.s_beg <= 10
            prime5_end = first_hsp.q_beg
            prime5_file.puts "#{gene_name}\t#{contig.name}\t#{prime5_end}"
            next if contig.seq.nil?
            seq = contig.seq[0..prime5_end]
            fasta_file.puts "#{gene_name}\n#{seq}" unless seq.nil?
          end
        end
      end
    end
  end

  private

  # Text printed for a contig rejected by filtering_clust.
  def rejection_label(contig)
    "#{contig.first_hit.name}\t#{contig.name}"
  end

  # Builds one reference Contig from a ";"-separated list of "q_beg-q_end"
  # exons. Subject coordinates are derived from the exon lengths divided by
  # three, carrying the remainder over to the next exon. Returns nil when the
  # structure holds no exon.
  def build_reference_model(contig_name, structure)
    hsps = structure.split(';')
    return nil if hsps.empty?
    contig = Contig.new(contig_name)
    contig.add_hit(contig_name, 0, 1, :nt)
    s_end = 0
    nt_add = 0
    hsps.each do |hsp|
      coords = hsp.split('-')
      q_beg = coords[0].to_i
      q_end = coords[1].to_i
      s_beg = s_end + 1
      exon_length = q_end - q_beg + nt_add
      s_end += exon_length / 3
      nt_add = exon_length.modulo(3)
      contig.first_hit.add_hsp(q_beg, q_end, s_beg, s_end, 0, 0, 0, 0)
    end
    contig.length = contig.first_hit.last_hsp.q_end
    contig
  end
end