gene_assembler 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. data/.gitignore +22 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +29 -0
  5. data/Rakefile +2 -0
  6. data/bin/GeneAssembler +233 -0
  7. data/bin/phytozome_scan +60 -0
  8. data/gene_assembler.gemspec +25 -0
  9. data/lib/gene_assembler.rb +5 -0
  10. data/lib/gene_assembler/blast_type_parser.rb +41 -0
  11. data/lib/gene_assembler/contig.rb +643 -0
  12. data/lib/gene_assembler/dataset.rb +532 -0
  13. data/lib/gene_assembler/exonerate_result.rb +230 -0
  14. data/lib/gene_assembler/gff_contig.rb +67 -0
  15. data/lib/gene_assembler/gff_dataset.rb +152 -0
  16. data/lib/gene_assembler/gff_feature.rb +175 -0
  17. data/lib/gene_assembler/gff_frameshift.rb +6 -0
  18. data/lib/gene_assembler/gff_go.rb +13 -0
  19. data/lib/gene_assembler/gff_hit.rb +53 -0
  20. data/lib/gene_assembler/gff_hsp.rb +6 -0
  21. data/lib/gene_assembler/gff_localization.rb +6 -0
  22. data/lib/gene_assembler/gff_master_feature.rb +5 -0
  23. data/lib/gene_assembler/gff_parser.rb +35 -0
  24. data/lib/gene_assembler/gff_snp.rb +21 -0
  25. data/lib/gene_assembler/gff_stop.rb +6 -0
  26. data/lib/gene_assembler/go.rb +13 -0
  27. data/lib/gene_assembler/hit.rb +191 -0
  28. data/lib/gene_assembler/hsp.rb +100 -0
  29. data/lib/gene_assembler/other_functions.rb +228 -0
  30. data/lib/gene_assembler/parser.rb +25 -0
  31. data/lib/gene_assembler/parser_blast.rb +12 -0
  32. data/lib/gene_assembler/parser_exonerate.rb +16 -0
  33. data/lib/gene_assembler/rebuild.rb +975 -0
  34. data/lib/gene_assembler/report.rb +13 -0
  35. data/lib/gene_assembler/report_gff.rb +30 -0
  36. data/lib/gene_assembler/snp.rb +13 -0
  37. data/lib/gene_assembler/version.rb +3 -0
  38. metadata +149 -0
@@ -0,0 +1,643 @@
1
+ require 'hit'
2
+ require 'snp'
3
+ require 'go'
4
+
5
+
6
# A query sequence (contig) together with the alignment hits found for it,
# plus the annotations derived from those hits (SNPs, GO terms,
# localizations, frameshifts and in-frame stop codons).
#
# Hit objects are expected to expose the interface used below
# (each_hsp, each_hsp_with_index, hsps, hsp_at, hsp_count, first_hsp,
# last_hsp, ...); HSP objects expose q_beg/q_end (query coordinates) and
# s_beg/s_end (subject coordinates).
class Contig
  # Default maximum distance, on the subject, tolerated between two
  # consecutive HSPs before the contig is reported as gapped.
  DEFAULT_MAX_GAP = 3

  # Codons recorded by #stop_codon_search (upper-case only).
  STOP_CODONS = %w[TAG TAA TGA].freeze

  attr_accessor :name, :seq, :type, :hits, :completed, :q_frameshift, :s_frameshift, :stops, :mod_coord
  attr_reader :length

  # @param name [String] contig identifier
  def initialize(name)
    @name = name
    @seq = ''
    @type = nil
    @length = ''
    @hits = []
    @snps = []
    @gos = []
    @completed = ''
    @localization = []
    @q_frameshift = []
    @s_frameshift = []
    @stops = []
    @mod_coord = false # Tells whether the contig coordinates have already been shifted
  end

  # Stores the contig length; a String is converted with #to_i, any other
  # value is stored untouched.
  def length=(length)
    @length = length.is_a?(String) ? length.to_i : length
  end

  # @return [Object, nil] first hit, or nil when the contig has no hits
  def first_hit
    @hits.first
  end

  # @return [Object, nil] last hit, or nil when the contig has no hits
  def last_hit
    @hits.last
  end

  # @return [Integer] number of hits
  def hit_count
    @hits.length
  end

  def add_localization(localization)
    @localization << localization
  end

  def each_localization
    @localization.each { |localization| yield localization }
  end

  def each_localization_with_index
    @localization.each_with_index { |localization, i| yield localization, i }
  end

  def each_stop
    @stops.each { |stop| yield stop }
  end

  def each_stop_with_index
    @stops.each_with_index { |stop, i| yield stop, i }
  end

  # Builds a GO from the three arguments, stores it and returns it.
  def add_go(go, name, obsolete)
    go = GO.new(go, name, obsolete)
    @gos << go
    go
  end

  def each_go
    @gos.each { |go| yield go }
  end

  # Builds a SNP at +position+, stores it and returns it.
  def add_snp(position)
    snp = SNP.new(position)
    @snps << snp
    snp
  end

  # Builds a Hit from the four arguments, stores it and returns it.
  def add_hit(hit_name, s_length, reversed, type)
    hit = Hit.new(hit_name, s_length, reversed, type)
    @hits << hit
    hit
  end

  # @return [Boolean] true when at least one hit is stored
  def has_hit?
    !@hits.empty?
  end

  def each_hit
    @hits.each { |hit| yield hit }
  end

  def each_hit_with_index
    @hits.each_with_index { |hit, i| yield hit, i }
  end

  # @return [Integer] number of hits (same value as #hit_count)
  def n_hits?
    @hits.length
  end

  def each_snp
    @snps.each { |snp| yield snp }
  end

  def each_snp_with_index
    @snps.each_with_index { |snp, i| yield snp, i }
  end

  # Sorts the HSPs of every hit in place.
  def hits_sort!
    each_hit { |hit| hit.hsps_sort! }
  end

  # Shifts every query-level coordinate of the contig by +add+ and marks
  # the contig as modified.
  #
  # Stops and frameshifts belong to the contig, not to a hit, so they are
  # shifted exactly once regardless of how many hits are stored.
  #
  # @return [Integer, nil] q_end of the last HSP of the last hit after the
  #   shift, or nil when the contig has no hits
  def modified_coordenates(add)
    @mod_coord = true
    each_hit { |hit| hit.modified_coordenates(add) }
    stop_modified_coordenates(add)
    frameshift_modified_coordenates(add)
    tail = last_hit
    tail.nil? ? nil : tail.last_hsp.q_end
  end

  # Adds +add+ to every stored stop position.
  def stop_modified_coordenates(add)
    @stops.map! { |stop| stop + add }
  end

  # Adds +add+ to every stored query frameshift position.
  def frameshift_modified_coordenates(add)
    @q_frameshift.map! { |frameshift| frameshift + add }
  end

  def each_q_frameshift
    @q_frameshift.each { |qfs| yield qfs }
  end

  # Prints an ASCII sketch of the first hit at subject level: each HSP is
  # rendered as its 1-based number centred in a field as wide as the HSP.
  def draw
    last_hsp_end = 0
    first_hit.each_hsp_with_index do |hsp, c|
      dif = hsp.s_beg - last_hsp_end
      overlap = 0 # only the HSP that actually overlaps gets a narrower label
      if dif < 0
        print '&'
        overlap = dif
      elsif dif <= 2
        print '/' # Adjacent boundary, one HSP right after the other (normal exon layout)
      else
        print '-' * dif
        print '|'
      end
      print((c + 1).to_s.center(hsp.s_end - hsp.s_beg + overlap - 1))
      last_hsp_end = hsp.s_end
    end
    print '|-'
    ending = first_hit.s_length - last_hsp_end
    ending = 0 if ending < 0
    print '-' * ending
    puts "\n"
  end

  # Prints one tab-separated line per HSP with its query and subject
  # coordinates, followed by the contig name, length and mod_coord flag.
  def indices
    each_hit_with_index do |hit, ind|
      hit.each_hsp_with_index do |hsp, i|
        puts "#{ind+1}.#{i+1})\t#{hsp.q_beg}\t#{hsp.q_end}\t#{hsp.s_beg}\t#{hsp.s_end}\t#{@name}\t#{@length}\t#{@mod_coord}"
      end
    end
  end

  # @return [Array<Integer>] |s_end - s_beg| of every HSP of every hit
  def exones_s
    sizes = []
    each_hit do |hit|
      hit.each_hsp { |hsp| sizes << (hsp.s_end - hsp.s_beg).abs }
    end
    sizes
  end

  # @return [Array<Integer>] |q_end - q_beg| of every HSP of every hit
  def exones_q
    sizes = []
    each_hit do |hit|
      hit.each_hsp { |hsp| sizes << (hsp.q_end - hsp.q_beg).abs }
    end
    sizes
  end

  # Query-level distance between each HSP and the next HSP of the same hit.
  #
  # @return [Array<Integer>]
  def intrones_q
    sizes = []
    each_hit do |hit|
      hit.each_hsp_with_index do |hsp, ind|
        following = hit.hsp_at(ind + 1)
        break unless following # last HSP of this hit
        sizes << (following.q_beg - hsp.q_end).abs
      end
    end
    sizes
  end

  # NOTE(review): despite its name this has always returned the number of
  # hits, not the number of introns; kept as is because callers may rely
  # on it - confirm the intended meaning.
  def n_intron
    @hits.length
  end

  # @return [Integer] sum of all the sizes reported by #exones_q
  def exon_acumulative
    exones_q.inject(0) { |total, size| total + size }
  end

  # Contig validity checks

  # Tells whether the HSPs of the first hit are out of order on the query,
  # i.e. whether the direction of q_beg changes between consecutive pairs.
  def mixed?
    forward_steps = @hits.first.hsps.each_cons(2).map do |previous, current|
      current.q_beg - previous.q_beg >= 0
    end
    forward_steps.each_cons(2).any? { |a, b| a != b }
  end

  # Tells whether the first hit consists of a single HSP.
  def is_one_hsp?
    first_hit.hsp_count == 1
  end

  # Tells whether two consecutive HSPs of the first hit are further apart
  # on the subject than +gap+. Overlapping HSPs give a negative distance
  # and therefore never count as a gap.
  #
  # @param gap [Integer] maximum distance allowed (set to 1 at least)
  def is_gapped?(gap = DEFAULT_MAX_GAP)
    @hits.first.hsps.each_cons(2).any? do |previous, current|
      previous.s_end > 0 && current.s_beg - previous.s_end > gap
    end
  end

  # Tells whether the alignment is cut by a contig border: either the
  # first HSP starts at query position 1 while the subject start is past 1,
  # or the last HSP ends exactly at the contig length.
  def is_truncated?
    hit = first_hit
    head = hit.first_hsp
    return true if head.s_beg > 1 && head.q_beg == 1 # truncated at the start
    hit.last_hsp.q_end == @length # truncated at the end
  end

  # Tells whether any hit reports, through Hit#hsp_minor_than?, an HSP
  # shorter than +hsp_length+ (in nt).
  #
  # NOTE(review): the guard compares a hit index with an HSP count and is
  # true in practically every case; it is preserved verbatim - confirm
  # what it was meant to filter.
  def hsp_minor_than?(hsp_length)
    @hits.each_with_index.any? do |hit, i|
      (i > 0 || i < hit.hsp_count) && hit.hsp_minor_than?(hsp_length)
    end
  end

  def correct_hsps(blast_coor_type)
    each_hit { |hit| hit.correct_hsps(blast_coor_type) }
  end

  #-------------------------------------------------------------------------

  # Aligns this contig against +contig+ using HSP#compare on every pair
  # of HSPs.
  #
  # @return [Array(Integer, Integer)] index (counted over +contig+'s HSPs)
  #   of the best-covered HSP, -1 when none matched, and the number of
  #   comparisons whose coverage exceeded 0.2
  def compare(contig)
    exon_match = -1
    exones = 0
    match_found = false
    each_hit do |self_hit|
      break if match_found # a definitive match stops every loop
      self_hit.each_hsp do |self_hsp|
        break if match_found
        best = 0
        contig_hsp_count = 0
        contig.each_hit do |contig_hit|
          contig_hit.each_hsp do |contig_hsp|
            coverage = self_hsp.compare(contig_hsp)
            if coverage > best # keep this match until a better one shows up
              exon_match = contig_hsp_count
              best = coverage
            end
            exones += 1 if coverage > 0.2
            # A match exists already and this HSP does not overlap: done
            if coverage == 0 && exon_match > -1
              match_found = true
              break
            end
            contig_hsp_count += 1
          end
          break if match_found
        end
      end
    end
    [exon_match, exones]
  end

  # Reverse-complements the sequence when the first hit is flagged as
  # reversed, and clears that flag.
  def rev_comp_if_hit
    return unless first_hit.reversed
    rev_comp
    first_hit.reversed = false
  end

  # Replaces the sequence with its upper-cased reverse complement. Only
  # A, C, G and T are complemented; any other symbol is kept.
  def rev_comp
    @seq = @seq.upcase.tr('ATGC', 'TACG').reverse
  end

  # Reverses the coordinates of every hit against the contig length and
  # re-sorts its HSPs.
  def rev_coord
    each_hit do |hit|
      hit.rev_coord(@length)
      hit.hsps_sort!
    end
  end

  # Looks for a start codon when the first HSP of the first hit begins
  # within the first 10 subject positions. On success the HSP is extended
  # (s_beg = 1, q_beg = 1-based position of the chosen ATG).
  #
  # @return [Boolean] true when a start codon was accepted
  def start_codon_search
    first_hsp = @hits.first.hsps.first
    return false unless first_hsp.s_beg <= 10
    q_beg = first_hsp.q_beg
    q_end = first_hsp.q_end
    index = 0
    search_from = 0
    loop do
      pos = @seq.index('ATG', search_from)
      break if pos.nil? # no further ATG: stop before converting to 1-based
      find = pos + 1
      if find == q_beg
        index = find
        break
      end
      break if find > q_beg + 2
      index = find if (find - q_end).modulo(3).zero?
      search_from = pos + 3 # ATG cannot overlap itself, resume right after it
    end
    return false unless index > 0
    first_hsp.s_beg = 1
    first_hsp.q_beg = index
    true
  end

  # Finds the first occurrence of +codon+ whose offset differs from the
  # reference by a multiple of 3.
  #
  # With two arguments the search starts at +ends+ and the reference is
  # +ends+ - 1. With a third argument the search starts there and +ends+
  # itself is the reference.
  #
  # @return [Integer, nil] 0-based offset in the sequence, nil if not found
  def stop_codon(codon, ends, *beg)
    reference = ends - 1
    unless beg.empty?
      reference = ends
      ends = beg.first
    end
    loop do
      pos = @seq.index(codon, ends)
      return nil if pos.nil?
      return pos if (pos - reference).abs.modulo(3).zero?
      ends = pos + 1
    end
  end

  # @return [Array<Array(Integer, Integer)>] [q_end, next q_beg] for each
  #   pair of consecutive HSPs of the first hit
  def coor_intrones
    intrones = []
    return intrones unless first_hit.hsp_count > 1
    previous = nil
    first_hit.each_hsp_with_index do |hsp, i|
      intrones << [previous.q_end, hsp.q_beg] if i > 0
      previous = hsp
    end
    intrones
  end

  # Reads the sequence codon by codon from the q_beg of the first HSP to
  # the q_end of the last HSP of the first hit, skipping the positions that
  # lie strictly between two HSPs, and appends to @stops the position of
  # the first nucleotide of every stop codon found.
  #
  # @return [Boolean] true when at least one stop codon was found
  def stop_codon_search
    exists = false
    homology_start = first_hit.first_hsp.q_beg
    homology_end = first_hit.last_hsp.q_end
    intrones = coor_intrones
    codon = ''
    position = nil
    @seq.each_char.with_index(1) do |c, n|
      next if n < homology_start # start at the beginning of the first exon
      break if n > homology_end
      # Skip introns; exon borders are not part of the intron
      next if intrones.any? { |int| n > int[0] && n < int[1] }
      position = n if codon.empty? # remember where the codon starts
      codon += c
      next unless codon.length == 3
      if STOP_CODONS.include?(codon)
        @stops << position
        exists = true
      end
      codon = ''
    end
    exists
  end

  # @return [Array<String>] one GFF "exon" line per HSP of the first hit,
  #   in query coordinates shifted by +add+
  def gff(id, parent, add)
    text = []
    first_hit.each_hsp do |hsp|
      text << "#{id}\t.\texon\t#{hsp.q_beg+add}\t#{hsp.q_end+add}\t.\t+\t.\tID=#{parent}_exon;Parent=#{parent};Name=#{parent}_exon"
    end
    text
  end

  # @return [Array<String>] one GFF "protein_match" line per HSP of the
  #   first hit, in subject coordinates
  def gff_prot(id, prot_name)
    text = []
    first_hit.each_hsp do |hsp|
      text << "#{id}\t.\tprotein_match\t#{hsp.s_beg}\t#{hsp.s_end}\t.\t+\t.\tID=#{prot_name}_prot;Parent=#{prot_name};Name=#{id}_prot"
    end
    text
  end

  # Appends every hit of +contig+ to this contig.
  def transfer_contig_hits(contig)
    contig.each_hit { |hit| transfer_hit(hit) }
  end

  def transfer_hit(hit)
    @hits << hit
  end

  # Collects what every hit reports through Hit#hsp_overlap, plus each
  # negative value of Hit#overlap_with between a hit and the previous one.
  #
  # @return [Array] flattened list of overlaps
  def overlap
    overlaps = []
    previous = nil
    each_hit_with_index do |hit, i|
      overlaps << hit.hsp_overlap
      if i > 0
        difference = hit.overlap_with(previous)
        overlaps << difference if difference < 0
      end
      previous = hit
    end
    overlaps.flatten
  end

  # @param position [Integer] 0-based index counted across all hits
  # @return [Object, nil] the HSP at that index, nil when out of range
  def hsp_at(position)
    count = 0
    each_hit do |hit|
      hit.each_hsp do |hsp|
        return hsp if count == position
        count += 1
      end
    end
    nil
  end
end # class