gene_assembler 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. data/.gitignore +22 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +29 -0
  5. data/Rakefile +2 -0
  6. data/bin/GeneAssembler +233 -0
  7. data/bin/phytozome_scan +60 -0
  8. data/gene_assembler.gemspec +25 -0
  9. data/lib/gene_assembler.rb +5 -0
  10. data/lib/gene_assembler/blast_type_parser.rb +41 -0
  11. data/lib/gene_assembler/contig.rb +643 -0
  12. data/lib/gene_assembler/dataset.rb +532 -0
  13. data/lib/gene_assembler/exonerate_result.rb +230 -0
  14. data/lib/gene_assembler/gff_contig.rb +67 -0
  15. data/lib/gene_assembler/gff_dataset.rb +152 -0
  16. data/lib/gene_assembler/gff_feature.rb +175 -0
  17. data/lib/gene_assembler/gff_frameshift.rb +6 -0
  18. data/lib/gene_assembler/gff_go.rb +13 -0
  19. data/lib/gene_assembler/gff_hit.rb +53 -0
  20. data/lib/gene_assembler/gff_hsp.rb +6 -0
  21. data/lib/gene_assembler/gff_localization.rb +6 -0
  22. data/lib/gene_assembler/gff_master_feature.rb +5 -0
  23. data/lib/gene_assembler/gff_parser.rb +35 -0
  24. data/lib/gene_assembler/gff_snp.rb +21 -0
  25. data/lib/gene_assembler/gff_stop.rb +6 -0
  26. data/lib/gene_assembler/go.rb +13 -0
  27. data/lib/gene_assembler/hit.rb +191 -0
  28. data/lib/gene_assembler/hsp.rb +100 -0
  29. data/lib/gene_assembler/other_functions.rb +228 -0
  30. data/lib/gene_assembler/parser.rb +25 -0
  31. data/lib/gene_assembler/parser_blast.rb +12 -0
  32. data/lib/gene_assembler/parser_exonerate.rb +16 -0
  33. data/lib/gene_assembler/rebuild.rb +975 -0
  34. data/lib/gene_assembler/report.rb +13 -0
  35. data/lib/gene_assembler/report_gff.rb +30 -0
  36. data/lib/gene_assembler/snp.rb +13 -0
  37. data/lib/gene_assembler/version.rb +3 -0
  38. metadata +149 -0
@@ -0,0 +1,643 @@
1
+ require 'hit'
2
+ require 'snp'
3
+ require 'go'
4
+
5
+
6
# A contig (query nucleotide sequence) together with the alignment hits,
# SNPs, GO terms, localizations, frameshifts and stop codons attached to it.
#
# Hits are expected to respond to the Hit interface used below (each_hsp,
# each_hsp_with_index, first_hsp, last_hsp, hsp_count, hsps, ...), and HSPs to
# q_beg/q_end (query coordinates) and s_beg/s_end (subject coordinates).
class Contig
  # Codons recognised as translation stops by #stop_codon_search.
  STOP_CODONS = %w[TAG TAA TGA].freeze

  attr_accessor :name, :seq, :type, :hits, :completed, :q_frameshift, :s_frameshift, :stops, :mod_coord
  attr_reader :length

  # name - identifier of the contig.
  def initialize(name)
    @name = name
    @seq = ''
    @type = nil
    @length = ''
    @hits = []
    @snps = []
    @gos = []
    @completed = ''
    @localization = []
    @q_frameshift = []
    @s_frameshift = []
    @stops = []
    @mod_coord = false # true once the contig coordinates have been shifted
  end

  # Stores the contig length; String values are converted with to_i.
  def length=(length)
    @length = length.is_a?(String) ? length.to_i : length
  end

  # Returns the first hit, or nil when the contig has no hits.
  def first_hit
    @hits.first
  end

  # Returns the last hit, or nil when the contig has no hits.
  def last_hit
    @hits.last
  end

  # Returns the number of hits.
  def hit_count
    @hits.length
  end

  def add_localization(localization)
    @localization << localization
  end

  # Yields every localization.
  def each_localization(&block)
    @localization.each(&block)
  end

  # Yields every localization together with its index.
  def each_localization_with_index(&block)
    @localization.each_with_index(&block)
  end

  # Yields every stored stop codon position.
  def each_stop(&block)
    @stops.each(&block)
  end

  # Yields every stored stop codon position together with its index.
  def each_stop_with_index(&block)
    @stops.each_with_index(&block)
  end

  # Builds a GO from the arguments, stores it and returns it.
  def add_go(go, name, obsolete)
    go = GO.new(go, name, obsolete)
    @gos << go
    go
  end

  # Yields every GO term.
  def each_go(&block)
    @gos.each(&block)
  end

  # Builds a SNP at +position+, stores it and returns it.
  def add_snp(position)
    snp = SNP.new(position)
    @snps << snp
    snp
  end

  # Builds a Hit from the arguments, stores it and returns it.
  def add_hit(hit_name, s_length, reversed, type)
    hit = Hit.new(hit_name, s_length, reversed, type)
    @hits << hit
    hit
  end

  # Returns true when the contig has at least one hit.
  def has_hit?
    !@hits.empty?
  end

  # Yields every hit.
  def each_hit(&block)
    @hits.each(&block)
  end

  # Yields every hit together with its index.
  def each_hit_with_index(&block)
    @hits.each_with_index(&block)
  end

  # Returns the number of hits (an Integer, despite the predicate-style name).
  def n_hits?
    @hits.length
  end

  # Yields every SNP.
  def each_snp(&block)
    @snps.each(&block)
  end

  # Yields every SNP together with its index.
  def each_snp_with_index(&block)
    @snps.each_with_index(&block)
  end

  # Sorts the HSPs of every hit in place.
  def hits_sort!
    each_hit { |hit| hit.hsps_sort! }
  end

  # Shifts every coordinate of the contig by +add+ and flags the contig as
  # modified. Returns the query end of the last HSP of the last hit.
  def modified_coordenates(add)
    @mod_coord = true
    each_hit { |hit| hit.modified_coordenates(add) }
    # Stops and frameshifts belong to the contig, not to a hit, so they are
    # shifted exactly once regardless of how many hits there are.
    stop_modified_coordenates(add)
    frameshift_modified_coordenates(add)
    last_hit.last_hsp.q_end
  end

  # Adds +add+ to every stored stop codon position (in place).
  def stop_modified_coordenates(add)
    @stops.map! { |stop| stop + add }
  end

  # Adds +add+ to every stored query frameshift position (in place).
  def frameshift_modified_coordenates(add)
    @q_frameshift.map! { |frameshift| frameshift + add }
  end

  # Yields every query frameshift position.
  def each_q_frameshift(&block)
    @q_frameshift.each(&block)
  end

  # Prints an ASCII representation of the first hit's HSPs along the subject.
  def draw
    last_hsp_end = 0
    # NOTE(review): once an overlap is seen it keeps shrinking the label of
    # every later HSP as well; behaviour kept as in the original.
    overlap = 0
    first_hit.each_hsp_with_index do |hsp, c|
      dif = hsp.s_beg - last_hsp_end
      if dif > 2
        print '-' * dif
        print '|'
      elsif dif >= 0
        print '/' # contiguous HSPs, one right after the other (normal exon layout)
      else
        print '&'
        overlap = dif
      end
      print((c + 1).to_s.center(hsp.s_end - hsp.s_beg + overlap - 1)) # HSP number
      last_hsp_end = hsp.s_end
    end
    print '|-'
    ending = first_hit.s_length - last_hsp_end
    ending = 0 if ending < 0
    print '-' * ending
    puts "\n"
  end

  # Prints query and subject coordinates of every HSP of every hit.
  def indices
    each_hit_with_index do |hit, ind|
      hit.each_hsp_with_index do |hsp, i|
        puts "#{ind+1}.#{i+1})\t#{hsp.q_beg}\t#{hsp.q_end}\t#{hsp.s_beg}\t#{hsp.s_end}\t#{@name}\t#{@length}\t#{@mod_coord}"
      end
    end
  end

  # Returns an array with the size of every HSP/exon measured on the subject.
  def exones_s
    sizes = []
    each_hit do |hit|
      hit.each_hsp { |hsp| sizes << (hsp.s_end - hsp.s_beg).abs }
    end
    sizes
  end

  # Returns an array with the size of every HSP/exon measured on the query.
  def exones_q
    sizes = []
    each_hit do |hit|
      hit.each_hsp { |hsp| sizes << (hsp.q_end - hsp.q_beg).abs }
    end
    sizes
  end

  # Returns an array with the size of every intron measured on the query.
  # NOTE(review): the following HSP is always looked up in the first hit, even
  # while iterating over other hits; kept as in the original.
  def intrones_q
    sizes = []
    each_hit do |hit|
      hit.each_hsp_with_index do |hsp, ind|
        next_hsp = first_hit.hsp_at(ind + 1)
        break unless next_hsp
        sizes << (next_hsp.q_beg - hsp.q_end).abs
      end
    end
    sizes
  end

  # NOTE(review): despite its name this returns the number of hits, not the
  # number of introns; kept because callers may rely on it.
  def n_intron
    @hits.length
  end

  # Returns the sum of the query lengths of all exons.
  def exon_acumulative
    exones_q.inject(0) { |total, exon| total + exon }
  end

  # --- Contig validity checks ------------------------------------------------

  # Returns true when the HSPs of the first hit are out of order, i.e. the
  # direction of q_beg changes between consecutive pairs of HSPs.
  def mixed?
    directions = @hits.first.hsps.each_cons(2).map do |previous, current|
      current.q_beg - previous.q_beg >= 0
    end
    directions.each_cons(2).any? { |before, after| before != after }
  end

  # Returns true when the first hit consists of a single HSP.
  def is_one_hsp?
    first_hit.hsp_count == 1
  end

  # Returns true when consecutive HSPs of the first hit are separated on the
  # subject by more than +gap+ positions, which may point to a partially or
  # completely missing exon. Overlapping HSPs give a negative difference and
  # therefore never count as a gap.
  #
  # gap - maximum allowed gap (the original comment states amino acids, at
  #       least 1).
  def is_gapped?(gap = 3)
    @hits.first.hsps.each_cons(2).any? do |previous, current|
      previous.s_end > 0 && current.s_beg - previous.s_end > gap
    end
  end

  # Returns true when the exons at the contig borders look truncated: the
  # first HSP starts at query position 1 without starting at subject position
  # 1, or the last HSP ends exactly at the contig length.
  def is_truncated?
    first_hsp = first_hit.first_hsp
    return true if first_hsp.s_beg > 1 && first_hsp.q_beg == 1
    first_hit.last_hsp.q_end == @length
  end

  # Returns true when any hit reports an HSP shorter than +hsp_length+ (nt).
  def hsp_minor_than?(hsp_length)
    each_hit_with_index do |hit, i|
      # NOTE(review): this guard compares the hit index with an HSP count and
      # is only false for a first hit without HSPs; kept as in the original.
      next unless i > 0 || i < hit.hsp_count
      return true if hit.hsp_minor_than?(hsp_length)
    end
    false
  end

  # Delegates HSP correction to every hit.
  def correct_hsps(blast_coor_type)
    each_hit { |hit| hit.correct_hsps(blast_coor_type) }
  end

  # ---------------------------------------------------------------------------

  # Aligns this contig against +contig+ using the value returned by
  # Hsp#compare for each pair of HSPs.
  #
  # Returns [exon_match, exones]: the index (counted across all HSPs of
  # +contig+) of the best matching HSP, or -1 when none matched, and the
  # number of HSP pairs whose coverage was above 0.2.
  def compare(contig)
    exon_match = -1
    exones = 0
    match_found = false
    each_hit do |self_hit|
      break if match_found # stop once a definitive match has been found
      self_hit.each_hsp do |self_hsp|
        break if match_found
        best_coverage = 0
        contig_hsp_count = 0
        contig.each_hit do |contig_hit|
          contig_hit.each_hsp do |contig_hsp|
            coverage = self_hsp.compare(contig_hsp)
            if coverage > best_coverage # keep this match until a better one shows up
              exon_match = contig_hsp_count
              best_coverage = coverage
            end
            exones += 1 if coverage > 0.2 # exon counting
            # A match already exists and the following exons do not match.
            if coverage == 0 && exon_match > -1
              match_found = true
              break
            end
            contig_hsp_count += 1
          end
          break if match_found
        end
      end
    end
    return exon_match, exones
  end

  # Reverse-complements the sequence when the first hit is flagged as
  # reversed, and clears that flag.
  def rev_comp_if_hit
    if first_hit.reversed
      rev_comp
      first_hit.reversed = false
    end
  end

  # Replaces the sequence with its upper-cased reverse complement. Characters
  # other than A, T, G and C are kept as they are.
  def rev_comp
    @seq = @seq.upcase.tr('ATGC', 'TACG').reverse
  end

  # Reverses the coordinates of every hit against the contig length and
  # re-sorts its HSPs.
  def rev_coord
    each_hit do |hit|
      hit.rev_coord(@length)
      hit.hsps_sort!
    end
  end

  # Looks for a start codon (ATG) at or before the start of the first HSP.
  # When one is found, the first HSP is extended to it (s_beg = 1, q_beg =
  # 1-based codon position). The search only runs when the first HSP starts
  # within the first 10 subject positions.
  #
  # Returns true when the first HSP was extended, false otherwise.
  def start_codon_search
    first_hsp = @hits.first.hsps.first
    return false unless first_hsp.s_beg <= 10

    q_beg = first_hsp.q_beg
    q_end = first_hsp.q_end
    index = 0
    search_from = 0
    # The loop ends when no further ATG exists; the nil result is tested
    # before any arithmetic is done on it.
    while (offset = @seq.index('ATG', search_from))
      find = offset + 1 # 1-based position of the codon
      if find == q_beg
        index = find
        break
      elsif find > q_beg + 2
        break
      end
      index = find if (find - q_end).modulo(3) == 0
      search_from = offset + 1
    end
    return false unless index > 0

    first_hsp.s_beg = 1
    first_hsp.q_beg = index
    true
  end

  # Searches for +codon+ at a distance that is a multiple of 3 from a
  # reference position.
  #
  # codon - codon string to look for.
  # ends  - 0-based position to search from; the reference is ends - 1.
  # beg   - optional; when given, +ends+ itself is the reference and the
  #         search starts at beg.first instead.
  #
  # Returns the 0-based position of the codon, or nil when none is found.
  def stop_codon(codon, ends, *beg)
    reference = ends - 1
    unless beg.empty?
      reference = ends
      ends = beg.first
    end
    while (pos = @seq.index(codon, ends))
      return pos if (pos - reference).abs.modulo(3) == 0
      ends = pos + 1
    end
    nil
  end

  # Returns the intron borders of the first hit as [previous q_end, next
  # q_beg] pairs; empty when the hit has a single HSP.
  def coor_intrones
    intrones = []
    return intrones unless first_hit.hsp_count > 1

    last_hsp = nil
    first_hit.each_hsp_with_index do |hsp, i|
      intrones << [last_hsp.q_end, hsp.q_beg] if i > 0
      last_hsp = hsp
    end
    intrones
  end

  # Reads the sequence codon by codon from the start of the first HSP to the
  # end of the last one, skipping intron interiors, and appends the 1-based
  # position of every stop codon to @stops.
  #
  # Returns true when at least one stop codon was found.
  def stop_codon_search
    exists = false
    homology_start = first_hit.first_hsp.q_beg
    homology_end = first_hit.last_hsp.q_end
    intrones = coor_intrones
    codon = ''
    position = nil
    @seq.each_char.with_index(1) do |c, n|
      next if n < homology_start # start at the beginning of the first exon
      break if n > homology_end
      # Skip introns; the exon borders themselves are not skipped.
      next if intrones.any? { |int| n > int[0] && n < int[1] }

      position = n if codon.empty? # position of the codon's first nucleotide
      codon += c
      next unless codon.length == 3

      if STOP_CODONS.include?(codon)
        @stops << position
        exists = true
      end
      codon = ''
    end
    exists
  end

  # Returns the exons of the first hit as GFF lines in query coordinates
  # shifted by +add+.
  def gff(id, parent, add)
    text = []
    first_hit.each_hsp do |hsp|
      text << "#{id}\t.\texon\t#{hsp.q_beg+add}\t#{hsp.q_end+add}\t.\t+\t.\tID=#{parent}_exon;Parent=#{parent};Name=#{parent}_exon"
    end
    text
  end

  # Returns the exons of the first hit as GFF lines in subject coordinates.
  def gff_prot(id, prot_name)
    text = []
    first_hit.each_hsp do |hsp|
      text << "#{id}\t.\tprotein_match\t#{hsp.s_beg}\t#{hsp.s_end}\t.\t+\t.\tID=#{prot_name}_prot;Parent=#{prot_name};Name=#{id}_prot"
    end
    text
  end

  # Appends every hit of +contig+ to this contig.
  def transfer_contig_hits(contig)
    contig.each_hit { |hit| transfer_hit(hit) }
  end

  # Appends an existing hit object to this contig.
  def transfer_hit(hit)
    @hits << hit
  end

  # Returns a flat array with the overlaps reported by every hit
  # (Hit#hsp_overlap) plus the negative overlaps between consecutive hits
  # (Hit#overlap_with).
  def overlap
    overlaps = []
    previous_hit = nil
    each_hit_with_index do |hit, i|
      overlaps << hit.hsp_overlap
      if i > 0
        diference = hit.overlap_with(previous_hit)
        overlaps << diference if diference < 0
      end
      previous_hit = hit
    end
    overlaps.flatten!
    overlaps
  end

  # Returns the HSP at +position+ (0-based), counting across the HSPs of all
  # hits in order, or nil when there is none.
  def hsp_at(position)
    count_hsp = 0
    each_hit do |hit|
      hit.each_hsp do |hsp|
        return hsp if position == count_hsp
        count_hsp += 1
      end
    end
    nil
  end
end