gene_assembler 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. data/.gitignore +22 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +29 -0
  5. data/Rakefile +2 -0
  6. data/bin/GeneAssembler +233 -0
  7. data/bin/phytozome_scan +60 -0
  8. data/gene_assembler.gemspec +25 -0
  9. data/lib/gene_assembler.rb +5 -0
  10. data/lib/gene_assembler/blast_type_parser.rb +41 -0
  11. data/lib/gene_assembler/contig.rb +643 -0
  12. data/lib/gene_assembler/dataset.rb +532 -0
  13. data/lib/gene_assembler/exonerate_result.rb +230 -0
  14. data/lib/gene_assembler/gff_contig.rb +67 -0
  15. data/lib/gene_assembler/gff_dataset.rb +152 -0
  16. data/lib/gene_assembler/gff_feature.rb +175 -0
  17. data/lib/gene_assembler/gff_frameshift.rb +6 -0
  18. data/lib/gene_assembler/gff_go.rb +13 -0
  19. data/lib/gene_assembler/gff_hit.rb +53 -0
  20. data/lib/gene_assembler/gff_hsp.rb +6 -0
  21. data/lib/gene_assembler/gff_localization.rb +6 -0
  22. data/lib/gene_assembler/gff_master_feature.rb +5 -0
  23. data/lib/gene_assembler/gff_parser.rb +35 -0
  24. data/lib/gene_assembler/gff_snp.rb +21 -0
  25. data/lib/gene_assembler/gff_stop.rb +6 -0
  26. data/lib/gene_assembler/go.rb +13 -0
  27. data/lib/gene_assembler/hit.rb +191 -0
  28. data/lib/gene_assembler/hsp.rb +100 -0
  29. data/lib/gene_assembler/other_functions.rb +228 -0
  30. data/lib/gene_assembler/parser.rb +25 -0
  31. data/lib/gene_assembler/parser_blast.rb +12 -0
  32. data/lib/gene_assembler/parser_exonerate.rb +16 -0
  33. data/lib/gene_assembler/rebuild.rb +975 -0
  34. data/lib/gene_assembler/report.rb +13 -0
  35. data/lib/gene_assembler/report_gff.rb +30 -0
  36. data/lib/gene_assembler/snp.rb +13 -0
  37. data/lib/gene_assembler/version.rb +3 -0
  38. metadata +149 -0
@@ -0,0 +1,100 @@
1
# High-scoring segment pair (HSP): one aligned stretch between a query and a
# subject sequence, together with its alignment statistics.
class Hsp
  attr_accessor :q_beg, :q_end, :s_beg, :s_end, :align_len, :score, :ident, :gaps, :type

  # q_beg/q_end: start/end on the query.
  # s_beg/s_end: start/end on the subject.
  # align_len:   length of the aligned sequence.
  # score, ident, gaps: alignment statistics, stored as given.
  def initialize(q_beg, q_end, s_beg, s_end, align_len, score, ident, gaps)
    @q_beg = q_beg
    @q_end = q_end
    @s_beg = s_beg
    @s_end = s_end
    @align_len = align_len
    @score = score
    @ident = ident
    @gaps = gaps
    @type = nil
  end

  # Compares two HSPs at subject level to decide whether they are the same.
  # Returns 1 for identical or contained ranges, the covered fraction of self
  # (Float) for a partial overlap, and 0 otherwise.
  def compare(hsp)
    overlap_coverage(s_beg, s_end, hsp.s_beg, hsp.s_end)
  end

  # Same comparison as #compare, but on query coordinates.
  def compare_q(hsp)
    overlap_coverage(q_beg, q_end, hsp.q_beg, hsp.q_end)
  end

  # Length of the HSP on the query (end minus start).
  def length_q
    @q_end - @q_beg
  end

  # Turns reverse query coordinates into direct ones by subtracting them from
  # length_hsp, and clears the @reversed flag.
  def rev(length_hsp)
    @q_beg = length_hsp - @q_beg
    @q_end = length_hsp - @q_end
    @reversed = false
  end

  # Checks whether hsp lies inside self (on query or subject) or whether both
  # overlap partially. `long` is the length used to normalise subject coverage.
  # Returns 1 or 0 (NOT true/false: both values are truthy in Ruby, so callers
  # must compare against the number).
  def within?(hsp, long)
    return 1 if q_beg <= hsp.q_beg && q_end >= hsp.q_end
    return 1 if s_beg <= hsp.s_beg && s_end >= hsp.s_end

    self_coverage = (s_end - s_beg) * 1.00 / long
    hsp_coverage = (hsp.s_end - hsp.s_beg) * 1.00 / long
    # If hsp covers more than what self leaves free, it is taken as overlap.
    hsp_coverage > (1 - self_coverage) ? 1 : 0
  end

  # Shifts both query coordinates by `add`.
  def modified_coordenates(add)
    @q_beg += add
    @q_end += add
  end

  # Mirrors the query coordinates on a contig of the given length (1-based),
  # printing the coordinates before and after the change.
  def rev_coord(contig_length)
    puts '---------------------------------'
    puts @q_beg.to_s + ' ' + @q_end.to_s
    @q_beg = contig_length - @q_beg + 1
    @q_end = contig_length - @q_end + 1
    puts @q_beg.to_s + ' ' + @q_end.to_s
  end

  # Returns the (negative) subject overlap between self and the previous HSP,
  # or 0 when self starts at or after the end of last_hsp.
  def overlap_with(last_hsp)
    diference = s_beg - last_hsp.s_end
    diference < 0 ? diference : 0
  end

  private

  # Shared implementation of #compare and #compare_q on one coordinate axis.
  def overlap_coverage(own_beg, own_end, other_beg, other_end)
    if own_beg == other_beg && own_end == other_end
      # Identical ranges (the original compared the start against the other
      # HSP's end, so this branch never matched real identical HSPs).
      1
    elsif own_beg >= other_beg && own_end < other_end
      # self lies inside the other HSP.
      1
    elsif own_beg <= other_beg && own_end > other_beg && (own_end - other_beg).abs > 1
      # Multiplying by 1.00 forces Float arithmetic so the ratio is not truncated.
      (own_end - other_beg * 1.00) / (own_end - own_beg)
    elsif own_beg < other_end && own_end >= other_end && (own_beg - other_end).abs > 1
      # The last condition stops a single amino acid split between two exons
      # from being counted as a shared exon.
      (other_end - own_beg * 1.00) / (own_end - own_beg)
    else
      0
    end
  end
end
@@ -0,0 +1,228 @@
1
+ require 'scbi_fasta'
2
+
3
# Relates each contig to its SAM file (<map_path>/<contig.name>.sam) and
# quantifies the read coverage of every exon.
#
# For each contig whose SAM file exists, a per-base depth array is built from
# the alignment lines, the mean depth of every HSP of the first hit is turned
# into a percentage of the reads, and the values are written into gene_array
# starting at the first cell equal to 1 of that contig's row.
# Contigs without a SAM file are left untouched.
def mapping(contigs, gene_array, map_path)
  contigs.each do |contig|
    ruta = File.join(map_path, "#{contig.name}.sam")
    next unless File.exist?(ruta) # File.exists? no longer exists in Ruby 3.2

    # Parse mapping
    seq_map = Array.new(contig.length, 0)
    n_reads = 0
    File.foreach(ruta) do |line| # foreach closes the file when done
      fields = line.split
      next if fields.empty? || fields[0] =~ /[@]/ # blank and header lines

      n_reads += 1
      start_map = fields[3].to_i - 1 # SAM positions are 1-based
      # Every number found in the CIGAR string is added, whatever its operation.
      end_map = fields[5].split(/[^\d]/).inject(start_map - 1) { |pos, len| pos + len.to_i }
      # Count start_map..end_map inclusive, clamped to the contig (the original
      # loop also incremented the position right after end_map).
      first = [start_map, 0].max
      last = [end_map, seq_map.length - 1].min
      (first..last).each { |pos| seq_map[pos] += 1 }
    end

    # Exon valuation
    exon_stadistic = contig.hits.first.hsps.map do |hsp|
      exon = seq_map[hsp.q_beg - 1..hsp.q_end - 1]
      value = exon.inject(0) { |sum, depth| sum + depth }
      (value * 100.0 / n_reads / exon.length).round(2)
    end
    y = contigs.index(contig)
    x = gene_array[y].index(1)
    exon_stadistic.each_with_index do |item, b|
      gene_array[y][x + b] = item
    end
  end

  if $verbose
    puts "\nGENE ARRAY - EXON VALUATED"
    gene_array.each_with_index do |fila, c|
      print "#{contigs[c].name.center(24)} "
      fila.each do |item|
        print "#{item.to_s}\t"
      end
      puts "\n"
    end
  end

  contigs.each do |contig|
    puts '...................'
    contig.indices
  end
  puts "\n"
end
76
+
77
# Returns the length of the longest array contained in `array`
# (0 when `array` is empty).
def length2D(array)
  array.inject(0) { |longest, item| [longest, item.length].max }
end
87
+
88
# Checks start/stop codons on every contig and stores the outcome in
# contig.completed: true (both found), 'start', 'stop' or false (none).
#
# gene_array - one row per contig; the start codon is only searched when the
#              first cell of the row is > 0 (the contig holds the first exon).
# contigs    - contig objects, index-aligned with gene_array.
#
# The stop codon is searched on every contig.
def parse_contig_index(gene_array, contigs)
  gene_array.each_with_index do |contig, i|
    start = nil # unknown
    start = contigs[i].start_codon_search if contig.first > 0
    stop = contigs[i].stop_codon_search

    contigs[i].completed =
      if start == true && stop == true
        true
      elsif start == true
        'start'
      elsif stop == true
        'stop'
      else
        false
      end
  end
end
110
+
111
# Picks, from a set of contigs, one contig flagged 'start' and one flagged
# 'stop'. When several candidates carry the same flag, the one whose first HSP
# of the first hit has the highest score wins (the earliest one on ties).
# Returns [start, stop]; either can be nil.
def sides_recovery(contigs)
  first_score = lambda { |contig| contig.hits.first.hsps.first.score }
  pick_best = lambda do |label|
    contigs.inject(nil) do |best, contig|
      next best unless contig.completed == label

      (best.nil? || first_score.call(best) < first_score.call(contig)) ? contig : best
    end
  end
  return pick_best.call('start'), pick_best.call('stop')
end
136
+
137
# Adds the contigs carrying a start / stop signal when the array has none.
#
# start is prepended when no contig is flagged 'start' or true; stop is
# appended when no contig is flagged 'stop' or true. nil candidates are ignored.
# Prepending builds a new array, whereas appending alone modifies `contigs` in
# place; use the returned array in both cases.
def sides_add(contigs, start, stop)
  has_start = contigs.any? { |contig| contig.completed == 'start' || contig.completed == true }
  has_stop = contigs.any? { |contig| contig.completed == 'stop' || contig.completed == true }

  contigs = [start].concat(contigs) if !has_start && !start.nil?
  contigs.concat([stop]) if !has_stop && !stop.nil?
  contigs
end
160
+
161
# Removes from cluster and gene_array (same index) every contig flagged 'stop'
# whose first hit has a single HSP spanning less than `length` on the subject.
# Both arrays are modified in place and returned as [gene_array, cluster].
def cluster_filter(gene_array, cluster, length)
  doomed = cluster.each_index.select do |i|
    contig = cluster[i]
    next false unless contig.completed == 'stop'

    hsps = contig.hits.first.hsps
    hsps.last.s_end - hsps.last.s_beg < length && hsps.count == 1
  end
  doomed.each do |i|
    cluster[i] = nil
    gene_array[i] = nil
  end
  cluster.compact!
  gene_array.compact!
  return gene_array, cluster
end
174
+
175
# Returns the position offset between two contigs, derived from where their
# HSPs sit on the protein. The query term accounts for the difference coming
# from the contigs, the subject term (scaled by 3) for the difference between
# the HSP positions on the protein.
def coord_prot(last_contig_hsp, current_contig_hsp)
  query_shift = last_contig_hsp.q_beg - current_contig_hsp.q_beg
  subject_shift = current_contig_hsp.s_beg - last_contig_hsp.s_beg
  query_shift + 3 * subject_shift
end
179
+
180
# Loads the FASTA file at `path` into a hash: the first value yielded by
# FastaFile#each is the key, the second one the stored value.
def fasta_hash(path)
  sequences = {}
  FastaFile.new(path).each { |contig, seq_fasta| sequences[contig] = seq_fasta }
  sequences
end
188
+
189
# Writes the XHTML preamble (doctype, <html>, <head> with the given title and
# the opening <body>) to `file`, one element per line.
def html_header(file, title)
  lines = [
    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
    '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">',
    '<head>',
    '<meta http-equiv="content-type" content="text/html;charset=UTF-8" />',
    '<title>' + title + '</title>',
    '</head>',
    '<body>'
  ]
  file.puts(*lines)
end
198
+
199
# Writes the closing </body> and </html> tags to `file`, one per line.
def html_footer(file)
  closing_tags = ['</body>', '</html>']
  file.puts(*closing_tags)
end
203
+
204
# Opens an HTML table on `file` with the given border width and writes its
# header row. `headers` is an array of strings, one per column.
def html_table_header(file, border, headers)
  file.puts "<table border=\"#{border}\">"
  file.puts '<tr>'
  headers.each { |header| file.puts '<th>' + header + '</th>' }
  file.puts '</tr>'
end
212
+
213
# Writes one table row to `file`. `cells` must be an array; every element is
# rendered with to_s inside its own <td>.
def html_row(file, cells)
  rendered = cells.map { |cell| "<td>#{cell}</td>" }
  file.puts('<tr>', *rendered, '</tr>')
end
220
+
221
# Builds an HTML anchor pointing at `link` (a String) whose caption is
# text.to_s, and returns it as a string.
def html_link(text, link)
  opening = '<a href="' + link + '">'
  opening + text.to_s + '</a>'
end
225
+
226
# Writes the closing </table> tag to `file`.
def html_table_footer(file)
  closing_tag = '</table>'
  file.puts(closing_tag)
end
@@ -0,0 +1,25 @@
1
+ require 'dataset'
2
+
3
# Base parser: builds a dataset, parses the input file and loads the parsed
# data into the dataset. parse_file and load_dataset are empty hooks here and
# are meant to be overridden.
class Parser
  attr_accessor :dataset

  # file - path of the file to parse.
  # type - accepted but not used by this class.
  def initialize(file, type = nil)
    @file = file
    @dataset = create_dataset
    # The data object returned by parse_file is what fills the dataset.
    load_dataset(parse_file(file))
  end

  # Returns a fresh Dataset; the 'unknown' label is not used.
  def create_dataset
    Dataset.new('unknown')
  end

  # Hook: turns `file` into a data object. Returns nil here.
  def parse_file(file); end

  # Hook: fills the dataset with the information held in `data`.
  def load_dataset(data); end
end
@@ -0,0 +1,12 @@
1
+ require 'blast_type_parser'
2
+ require 'scbi_blast' #Si falla, buscar e instalar tb 'gem install xml-simple' de la q depende
3
+
4
+
5
# Parser for tabular BLAST output.
class ParserBlast < BlastTypeParser
  # Wraps `file` in a BlastTableResult and returns it.
  def parse_file(file)
    BlastTableResult.new(file)
  end
end
@@ -0,0 +1,16 @@
1
+ require 'exonerate_result'
2
+ require 'blast_type_parser'
3
+
4
# Parser for exonerate output.
class ParserExonerate < BlastTypeParser
  # Wraps `file` in an ExonerateResult, passing along the @all setting.
  def parse_file(file)
    ExonerateResult.new(file, @all)
  end

  # Copies the frameshifts located by exonerate from `item` to `contig`.
  def populate_extra_atributes(contig, item)
    contig.q_frameshift = item.q_frameshift
    contig.s_frameshift = item.s_frameshift
  end
end
@@ -0,0 +1,975 @@
1
+ require 'dataset'
2
+ require 'other_functions'
3
+ require 'report_gff'
4
+ require 'fileutils'
5
+
6
+ class Rebuild
7
# dataset and dataset_uni_hsp must be dataset objects; path is a hash of
# paths, of which :exonerate_db is read here to load the database sequences.
def initialize(dataset, dataset_uni_hsp, path)
  @dataset, @dataset_uni_hsp, @path = dataset, dataset_uni_hsp, path
  @db_seqs = fasta_hash(path[:exonerate_db])
end
13
+
14
+ ###############################################################################################################
15
+ # MAIN METHOD
16
+ ###############################################################################################################
17
# Main method: builds the gene model of every cluster, writes the GFF output
# and the HTML index, and looks for pseudogenes.
#
# options - hash; :web is handed to web_header and :rebuild to the modelling
#           step.
#
# Errors raised while processing one cluster are reported through gene_error
# and do not stop the remaining clusters. Only StandardError is rescued, so
# interrupts and exit requests still terminate the run. The error file is
# closed even if the loop itself fails.
def rebuild(options)
  gff_dataset_model = Dataset.new(:mix) # Object for save info of GeneAssembler's output
  gff_dataset = Dataset.new(:mix) # Object for save info of GeneAssembler's output
  file_error = File.open(@path[:error], 'w')
  begin
    file_web = web_header(options[:web], @path[:html])
    sequences_hash = {}
    gene_name = nil
    model = nil
    statistics = {:genes => 0, :total_recovered => 0, :total_overlap => 0, :total_fragmentation => 0}
    puts "\nMODELING GENE", '*******************************************'
    @dataset.each_cluster do |cluster|
      begin
        gene_name = cluster.first.first_hit.name unless cluster.nil?
        cluster_complete = cluster.dup
        # Gene reconstruction (alignment, discarding and assembly of the gene)
        model, length_model, length_cluster = iterative_modeling_gene_w_reference(cluster, @dataset.references_hash, options[:rebuild], sequences_hash)

        # GeneAssembler output (gff for Gbrowse)
        unless model.nil?
          # Format Contigs children of model
          gff_dataset.clr_contigs
          gff_dataset.transfer_contigs(cluster_complete)
          gff_dataset.transfer_n_contigs_def_hit_type(@dataset_uni_hsp, cluster, 'pseudogene', 50) # Move pseudogenes to the report

          # Turn arrays into a contig and adjust the alignment by adding Ns
          model = correct_model(model, length_model, gff_dataset, sequences_hash)

          # Checks on the model
          exones = model.exones_s.length # Number of exons
          puts 'Exones: ' + exones.to_s
          recovered = recover_test(model)
          overlap = overlap_test(model)
          fragmentation = ((length_cluster - 1.00) / exones).round(2)
          puts 'Fragmentation: ' + fragmentation.to_s

          # HTML index
          unless file_web.nil?
            gene_link = html_link(model.first_hit.name, @path[:gbrowse_link] + model.first_hit.name)
            html_row(file_web, [gene_link, cluster.first.first_hit.s_length, exones, recovered, overlap, fragmentation])
          end

          # Format Model for Gbrowse
          gff_dataset_model.clr_contigs
          format_model(model) # Adds the _gene particle to the model
          gff_dataset_model.transfer_contigs(model)

          # Write
          write_gbrowse_gff(gff_dataset_model, gff_dataset, @path[:gff], model.name)

          # General statistics
          statistics[:genes] += 1
          statistics[:total_recovered] += recovered
          statistics[:total_overlap] += overlap
          statistics[:total_fragmentation] += fragmentation
        end
      rescue StandardError => e
        gene_error(e, gene_name, file_error, cluster_complete, model)
      end
      puts '* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *'
    end
  ensure
    file_error.close
  end
  web_body(file_web)

  genes = statistics[:genes]
  # With no recovered gene the means are reported as 0 instead of dividing by zero.
  mean = lambda { |total| genes.zero? ? 0 : total / genes }
  puts "\nFINAL STATISTICS\n",
       'Recovered genes: ' + genes.to_s,
       'Mean recover: ' + mean.call(statistics[:total_recovered]).to_s,
       'Mean overlap: ' + mean.call(statistics[:total_overlap]).to_s,
       'Mean fragmentation: ' + mean.call(statistics[:total_fragmentation]).to_s
  write_model_fasta(sequences_hash, @path[:fasta])
end
89
+ ################################################################################################################################
90
+ # end main method
91
+ ################################################################################################################################
92
+
93
# Models a gene first blindly (no reference) and then once per reference
# registered for the cluster's protein, keeping the best model found.
#
# cluster         - contigs of one gene; the first contig's first hit names the
#                   protein used to look up references.
# references_hash - protein name => array of references (may lack the key).
# options         - passed to modeling_gene as its rebuild argument.
# sequences_hash  - receives the sequence of the chosen model under the
#                   protein name.
#
# Returns [model, length, length_cluster].
def iterative_modeling_gene_w_reference(cluster, references_hash, options, sequences_hash)
  prot_reference = cluster.first.first_hit.name
  array_references = references_hash[prot_reference]

  # Blind gene model
  puts "\n", '|||||||||| BLIND MODELING ||||||||||' if $verbose
  model, length, seq, length_cluster = modeling_gene(cluster.dup, nil, options)
  recover, overlap = eval_model(model.dup, length)
  puts "\nRecover: #{recover} Overlap: #{overlap}" if $verbose

  # Guided gene model
  unless array_references.nil?
    array_references.each do |ref|
      puts "\n", '|||||||||| GUIDED MODELING ||||||||||' if $verbose

      guided_model, guided_length, guided_seq, guided_length_cluster = modeling_gene(cluster.dup, ref, options)
      next if guided_model.nil? # A model that went wrong is ignored

      guided_recover, guided_overlap = eval_model(guided_model.dup, guided_length)
      puts "\nRecover: #{guided_recover} Overlap: #{guided_overlap}" if $verbose

      # Decision tree. The flag is evaluated afresh for every reference; it was
      # previously set once before the loop, so after the first accepted
      # reference every later one replaced the model unconditionally.
      guided = false
      if guided_overlap <= 15 # Overlap of 15 % or less
        if guided_overlap >= overlap - overlap * 0.05 && guided_overlap <= overlap + overlap * 0.05 # Same overlap
          guided = true if guided_recover > recover
        else # Different overlap
          recover_dif = guided_recover - recover
          if recover_dif < 0 # The guided model recovers less than the current one
            # Accept when the recovery loss is explained by the overlap that disappeared
            if recover_dif.abs >= overlap - overlap * 0.05 && recover_dif.abs <= overlap + overlap * 0.05
              guided = true
            end
          elsif recover_dif > guided_overlap + guided_overlap * 0.05
            # The recovery gain is not just an overlap increase of the same size
            guided = true
          end
        end
      elsif guided_overlap < overlap # Always keep the lowest overlap, even when it is high
        guided = true
      end
      next unless guided

      model = guided_model
      length = guided_length
      seq = guided_seq
      length_cluster = guided_length_cluster
      recover = guided_recover
      overlap = guided_overlap
    end
  end

  sequences_hash[prot_reference] = seq
  return model, length, length_cluster
end
170
+
171
# Evaluates a gene model and returns [recover, overlap].
#
# local_model - a contig, or an array of contigs that is first merged into a
#               single contig whose length is set to `length`.
#
# The model can be modified here, so callers must pass a copy.
def eval_model(local_model, length)
  if local_model.is_a?(Array)
    local_model = array_contigs_to_contig(local_model)
    local_model.length = length
  end
  recover = recover_test(local_model, false)
  overlap = overlap_test(local_model, false)
  return recover, overlap
end
182
+
183
# Builds the gene model of a cluster.
#
# cluster   - contigs of the gene.
# reference - reference used to place the contigs, or nil.
# rebuild   - when falsy no model is built.
#
# Returns [model, model_length, seq, length_cluster]; the first three are nil
# when no model was produced, length_cluster is the number of contigs left
# after the selection.
def modeling_gene(cluster, reference, rebuild)
  model = nil
  model_length = nil
  seq = nil
  gene_array = []

  # Iterative reduction of the contigs to select the ones that will form the
  # gene model; removes minor fragments that could be taken as new exons.
  # It stops as soon as the gene array width stays the same between two rounds.
  previous_width = nil
  stable = false
  until stable
    cluster, gene_array = gene_array_and_compact(rebuild, cluster, reference)
    current_width = length2D(gene_array)
    stable = current_width == previous_width
    previous_width = current_width
  end
  length_cluster = cluster.length

  # Gene modelling
  if rebuild && !cluster.empty? && !gene_array.empty?
    # Merges adjoining contigs and returns the resulting array
    cluster_comp = cluster.length > 1 ? contig_compact(cluster) : cluster
    if cluster_comp.nil?
      puts cluster.first.first_hit.name + "\tGENE MODEL ABORTED"
    else
      model, model_length, seq = gene_model_cut(cluster_comp, reference)
    end
  end
  return model, model_length, seq, length_cluster
end
220
+
221
# Builds the exon array of the cluster and discards the redundant contigs.
#
# When `rebuild` is truthy the gene array is built (HSPs that overlap end up in
# the same column) and gene_compact prunes cluster and gene array in place.
# When it is falsy nothing is built and an empty gene array is returned
# (previously nil, which made the caller's length2D fail).
#
# Returns [cluster, gene_array].
def gene_array_and_compact(rebuild, cluster, reference)
  gene_array = []
  if rebuild
    gene_array, gene_array_introns = build_gene_array(cluster, reference) # With reference
    if $verbose
      gene_stadistics_report(gene_stadistics(gene_array), 'EXONS')
      gene_stadistics_report(gene_stadistics(gene_array_introns), 'INTRONS')
    end
  end

  # Selection of the contigs used to model the gene
  gene_array_report(gene_array, cluster, rebuild) if $verbose # Cluster info before compacting
  # Redundant contigs are discarded; the ones covering the whole gene remain
  gene_compact(gene_array, cluster) if rebuild
  gene_array_report(gene_array, cluster, rebuild) if $verbose && rebuild # Cluster info after compacting
  return cluster, gene_array
end
247
+
248
# Compares the single-HSP contigs with the model contig(s) to find pseudogenes.
#
# The single-HSP cluster whose first contig hits the same protein as `cluster`
# is looked up in @clusters_uni_hsp; each of its contigs sharing more than one
# exon with a model contig (second value returned by Contig#compare) is
# collected once. Returns the collected contigs ([] when no cluster matches).
def add_uni_hsp(model, cluster)
  pseudogenes = []
  contigs_uni_hsp = nil
  @clusters_uni_hsp.each do |contigs|
    next unless contigs.first.first_hit.name == cluster.first.first_hit.name

    contigs_uni_hsp = contigs
    break
  end
  return pseudogenes if contigs_uni_hsp.nil?

  # The comparison runs over a list, so a single model gets wrapped
  candidates = model.class.to_s == 'Array' ? model : [model]
  candidates.each do |item|
    contigs_uni_hsp.each do |contig|
      _start, exons = item.compare(contig)
      pseudogenes << contig if exons > 1 && !pseudogenes.include?(contig)
    end
  end
  pseudogenes
end
275
+
276
# Builds two arrays (exons and introns) with one row per contig that represent
# the relative position of all the contigs to each other.
#
# contigs   - contigs to place.
# reference - when given, every contig is placed against it; otherwise each
#             contig is placed against the previous one.
#
# A row starts with as many 0 cells (no exon / intron at that position) as
# needed to align it, followed by the contig's exones_s / intrones_q values.
# With a reference, a contig that does not match it (compare returns -1)
# aborts the alignment and both arrays come back empty.
#
# Returns [gene_array, gene_array_introns].
def build_gene_array(contigs, reference = nil)
  gene_array = []
  gene_array_introns = []
  last_contig = reference.nil? ? '' : reference
  contigs.each do |contig|
    array_contig = []
    array_contig_introns = []

    # Work out the empty positions
    if !gene_array.empty? || reference
      # Compare the current contig with the one placed in the previous round
      first_exon, _shared = contig.compare(last_contig)
      if first_exon == -1
        if reference # Abort when a contig does not match the reference
          puts "\n#{contig.name} alignment step OUT OF RANGE" if $verbose
          gene_array = []
          gene_array_introns = []
          break
        end
        void_positions = gene_array.last.count # No overlapping: skip the whole previous row
      elsif reference
        void_positions = first_exon # Position of the contig relative to the reference
      else
        void_positions = first_exon + gene_array.last.count(0)
      end
      void_positions.times do
        array_contig << 0
        array_contig_introns << 0
      end
    end

    # Add the exons and introns of the contig. The non-bang flatten is used
    # because flatten! returns nil when there is nothing to flatten.
    exones = contig.exones_s
    introns = contig.intrones_q
    gene_array << (array_contig << exones).flatten
    gene_array_introns << (array_contig_introns << introns).flatten
    last_contig = contig if reference.nil?
  end
  return gene_array, gene_array_introns
end
328
+
329
# Counts, for every column of gene_array, how many different values it holds,
# ignoring 0 cells and the cells missing in shorter rows.
# Returns one count per column.
def gene_stadistics(gene_array)
  (0...length2D(gene_array)).map do |column|
    distinct = []
    gene_array.each do |row|
      value = row[column]
      distinct << value unless value == 0 || distinct.include?(value)
    end
    distinct.compact.length
  end
end
347
+
348
# Prints one tab-separated line with the statistics of introns or exons:
# a blank line, the tag and then every value followed by a tab.
def gene_stadistics_report(exons_stadistic, tag)
  cells = exons_stadistic.map { |item| "#{item}\t" }
  print "\n#{tag}\t", cells.join, "\n"
end
355
+
356
# Prints the array produced by build_gene_array (only when act_array is
# truthy) followed by a map drawn from every contig's #draw output.
def gene_array_report(gene_array, contigs, act_array)
  if act_array
    puts "\nGENE ARRAY"
    gene_array.each_with_index do |fila, c|
      owner = contigs[c]
      cells = fila.map { |item| "#{item}\t" }.join
      print "#{owner.name.center(24)}\t #{owner.completed}\t#{cells}"
      puts "\n"
    end
  end

  puts "\nMAP"
  contigs.each do |contig|
    print contig.name.center(25).to_s
    print contig.draw
  end
end
375
+
376
# Prunes the gene model by dropping every redundant row that can be dropped.
# gene_array rows and contigs share indexes; a dropped pair is set to nil and
# both arrays are compacted in place at the end.
#
# Rows are compared pairwise:
# * same length: the row with more exons (cells > 0) stays; on a tie the
#   contig whose first HSP scores higher stays (the first one on equal score);
# * first row longer: the second is dropped when the first has no more 0
#   cells than the second;
# * first row shorter: the first is dropped when both have the same number of
#   0 cells.
def gene_compact(gene_array, contigs)
  discard = lambda do |index|
    gene_array[index] = nil
    contigs[index] = nil
  end

  gene_array.each_with_index do |row, i|
    next unless row

    row_exons = row.count { |x| x > 0 }
    gene_array.each_with_index do |other, j|
      next if !other || i == j # Skip dropped rows and self-comparison

      case row.length <=> other.length
      when 0 # EQUAL
        other_exons = other.count { |x| x > 0 }
        if other_exons == row_exons
          if contigs[i].first_hit.first_hsp.score >= contigs[j].first_hit.first_hsp.score
            discard.call(j)
          else
            discard.call(i)
            break
          end
        elsif other_exons > row_exons
          discard.call(i)
          break
        else
          discard.call(j)
        end
      when 1 # GREATER THAN
        discard.call(j) if row.count(0) <= other.count(0)
      else # LESS THAN
        if row.count(0) == other.count(0)
          discard.call(i)
          break
        end
      end
    end
  end
  gene_array.compact!
  contigs.compact!
end
429
+
430
# Takes a set of contigs, finds the consecutive ones, merges their sequences,
# runs exonerate on the merged sequences and returns an array with the new
# contigs in place of the merged ones.
#
# Two neighbouring contigs are consecutive when the subject start of the first
# HSP of one is 0 or 1 positions after the subject end of the last HSP of the
# previous one. When nothing is consecutive the input array itself is
# returned. When exonerate does not yield exactly one contig per merged
# sequence, a copy of the original contigs is returned.
#
# Files <gene>.fasta, <gene>.db and <gene>.ex are written under @path[:local].
def contig_compact(contigs)
  cn_def = []
  cn_backup = contigs.dup
  cn_to_merge = []

  # Flag consecutive, non-overlapping contigs
  fusion = Array.new(contigs.length, false)
  s_end = nil
  contigs.each_with_index do |contig, i|
    if i > 0
      diference = contig.first_hit.first_hsp.s_beg - s_end
      fusion[i] = true if diference == 0 || diference == 1
    end
    s_end = contig.first_hit.last_hsp.s_end
  end
  return contigs unless fusion.include?(true)

  # Build the groups of contigs to merge and keep the remaining ones.
  # In cn_def every group is represented by its number until it gets replaced.
  fusion_contigs = []
  count = 0 # Marks the position of the fusions
  fusion.each_with_index do |cont, i|
    if cont
      fusion_contigs << contigs[i - 1] unless fusion_contigs.include?(contigs[i - 1])
      fusion_contigs << contigs[i] unless fusion_contigs.include?(contigs[i])
    else
      unless fusion_contigs.empty? # Close the current fusion
        cn_to_merge << fusion_contigs
        fusion_contigs = []
        cn_def << count
        count += 1
      end
      # Keep the contigs that take no part in a fusion
      cn_def << contigs[i] unless fusion[i + 1]
    end
    if i + 1 == fusion.length && !fusion_contigs.empty? # End-of-loop control
      cn_to_merge << fusion_contigs
      cn_def << count
      count += 1
    end
  end

  # Write the fasta of the merged contigs and the protein database
  contigs_merge = contigs_seq_merge(cn_to_merge)
  gene = contigs.first.first_hit.name
  fasta_path = File.join(@path[:local], gene + '.fasta')
  db_path = File.join(@path[:local], gene + '.db')
  ex_path = File.join(@path[:local], gene + '.ex')
  unless contigs_merge.empty?
    File.open(fasta_path, 'w') do |temp|
      contigs_merge.each_with_index do |seq, i|
        temp.puts ">Fusion_#{i}\n#{seq}"
      end
    end
    File.open(db_path, 'w') do |temp_db|
      temp_db.puts ">#{gene}\n#{@db_seqs[gene]}"
    end
  end

  # Exonerating. Arguments are passed as a list (no shell involved) so gene
  # names holding characters such as '|' or spaces cannot break the command.
  system('exonerate', '-q', db_path, '-t', fasta_path, '-Q', 'protein', '-T', 'dna',
         '-m', 'protein2genome', '--percent', '1', '--showalignment', '0',
         '--useaatla', '1', '--showvulgar', :out => ex_path)

  # Parsing exonerate
  local = ParserExonerate.new('contig', 'nucleotide_match', ex_path)
  store_local_ex = local.dataset
  store_local_ex.score_correction(30)
  if store_local_ex.contig_count == contigs_merge.length
    # Restore the contig attributes and load the definitive array
    store_local_ex.each_contig_with_index do |contig, i|
      contig.seq = contigs_merge[i]
      contig.length = contigs_merge[i].length
      contig.first_hit.s_length = contigs.first.first_hit.s_length
      # Put the new contig where its fusion number was stored
      cn_def.each_with_index do |contig_def, j|
        cn_def[j] = contig if contig_def == i
      end
    end
  else
    cn_def = cn_backup
  end
  cn_def
end
532
+
533
# Builds one merged sequence per group of contigs.
#
# contigs - Array of groups; each group is an Array of objects responding to
#           #seq. The sequences of a group are joined in order, separated by
#           a run of ten 'n' characters.
#
# Returns an Array with one sequence per group (an empty String for an
# empty group).
def contigs_seq_merge(contigs)
  spacer = 'n' * 10
  contigs.map do |group|
    group.inject('') do |merged, contig|
      # The separator is only inserted once something has been accumulated
      merged.empty? ? contig.seq : merged + spacer + contig.seq
    end
  end
end
549
+
550
# Builds a gene model by cutting and splicing the HSPs (exons) of an ordered
# list of contigs, and assembles the matching nucleotide sequence.
#
# contigs   - ordered collection of contig objects (each exposes #first_hit,
#             #seq, #length, #name, #compare and #exon_acumulative).
# reference - optional object the contigs are positioned against. When given,
#             modelling is aborted as soon as a contig does not align with it
#             (position -1) or would extend past its last HSP.
#
# Returns three values: model, model_length, final_seq.
#   model        - whatever void_contig builds (a single contig, or an Array
#                  of contigs when discontinuities were flagged); nil when
#                  modelling was aborted.
#   model_length - only computed when model is an Array; nil otherwise.
#   final_seq    - the exon fragments joined into one String; nil when
#                  modelling was aborted.
def gene_model_cut(contigs, reference=nil)
  q_beg = []
  q_end = []
  s_beg = []
  s_end = []
  seq = []
  last_contig = nil
  last_score = 0
  length_model = 0
  multiple_lengths = []
  add_length = true # TRUE/FALSE constants no longer exist in Ruby >= 3.2
  add_last = 0
  last_position_ref = nil
  last_position = nil
  out_of_range = false

  # Flags the start of a newly added contig by swapping the case of the first
  # two bases. Non-bang swapcase is used on purpose: swapcase! returns nil
  # when nothing changes, which used to drop the bases or raise.
  mark_contig_start = lambda { |fragment| "#{fragment[0..1].swapcase}#{fragment[2..-1]}" }

  contigs.each do |contig|
    # NOTE(review): if score and length are both Integers this is an integer
    # division -- confirm the intended arithmetic.
    score = contig.first_hit.first_hsp.score/contig.length*contig.exon_acumulative
    n_exones = contig.first_hit.hsp_count
    position_overlap = nil

    # FIRST CONTIG
    #-------------------------------------------------------
    if last_contig.nil?
      q_end_seq = nil # SEQ
      contig.first_hit.each_hsp_with_index { |hsp, i|
        q_beg << hsp.q_beg
        q_end << hsp.q_end
        s_beg << hsp.s_beg
        s_end << hsp.s_end
        # SEQ.................................
        if i == 0
          seq << contig.seq[0..contig.first_hit.first_hsp.q_end-1]
        elsif i+1 == n_exones
          seq << contig.seq[contig.first_hit.hsps[i-1].q_end..contig.length-1]
        else
          seq << contig.seq[q_end_seq..hsp.q_end-1]
        end
        q_end_seq = hsp.q_end
        # ...................................
      }
      length_model += contig.length
      unless reference.nil? # Position the first contig on the reference
        last_position_ref, _exons = contig.compare(reference)
        # Abort modelling when the contig does not align with the reference or
        # overruns it. The position just computed is tested (the previous code
        # tested last_position, which is always nil for the first contig).
        if last_position_ref == -1 || last_position_ref+contig.first_hit.hsp_count-1 > reference.first_hit.hsp_count
          puts contig.name+' OUT OF RANGE'
          out_of_range = true
          break
        end
      end

    # OTHER CONTIG
    #--------------------------------------------------------
    else
      position_overlap, _exons = contig.compare(last_contig)

      # Correct the contig position using the reference
      unless reference.nil?
        last_position_ref, position_overlap = position_reference_guided(contig, last_contig, last_position_ref, reference)
        if last_position_ref == -1 || last_position_ref+contig.first_hit.hsp_count-1 > reference.first_hit.hsp_count
          out_of_range = true
          puts contig.name+' OUT OF RANGE'
          break
        end
      end

      # NOT OVERLAP
      #..........................
      if position_overlap == -1 || contig.first_hit.hsp_count == 1
        if contig.first_hit.first_hsp.s_beg-last_contig.first_hit.last_hsp.s_end > 1
          # Flag a discontinuity when the contig does not directly follow the
          # previous one: a row of zeros is the marker void_contig splits on
          q_beg << 0
          q_end << 0
          s_beg << 0
          s_end << 0
          multiple_lengths << length_model
          length_model = contig.length
          last = length_model
          add_length = false
          seq.last << 'n'*10 # SEQ: gap marker
        else
          last = length_model # Keep the previous length to shift this contig's coordinates
          length_model += contig.length
        end

        q_end_seq = nil # SEQ
        contig.first_hit.hsps.each_with_index do |hsp, i|
          add_no = add_length ? last : 0
          q_beg << hsp.q_beg+add_no # Model length is accumulated onto the coordinates
          q_end << hsp.q_end+add_no
          s_beg << hsp.s_beg
          s_end << hsp.s_end
          # SEQ.................................
          if i == 0
            seq << mark_contig_start.call(contig.seq[0..contig.first_hit.first_hsp.q_end-1])
          elsif i+1 == n_exones
            seq << contig.seq[contig.first_hit.hsps[i-1].q_end..contig.length-1]
          else
            seq << contig.seq[q_end_seq..hsp.q_end-1]
          end
          q_end_seq = hsp.q_end
          # ...................................
        end

      # OVERLAP
      #..........................
      else
        if last_position == -1
          add_last = length_model-last_contig.length
        end
        overlap = last_contig.first_hit.hsp_count-position_overlap
        if last_contig.first_hit.hsp_count == 1
          overlap = 1
        end
        add = 0
        dif = 0
        if last_score >= score
          add = last_contig.first_hit.last_hsp.q_end-contig.first_hit.first_hsp.q_end
          dif = add
          if overlap > 1
            # Drop the last exon of 'last_contig' and replace it with the
            # second exon of 'contig', which is more reliable because it is
            # internal. Since the last exon is dropped, alignment is done on
            # the previous one.
            add = last_contig.first_hit.hsp_at(last_contig.first_hit.hsp_count-overlap).q_end-contig.first_hit.first_hsp.q_end
            dif = contig.first_hit.hsp_at(overlap-1).q_end
            q_beg = q_beg.reverse.drop(1).reverse
            q_end = q_end.reverse.drop(1).reverse
            s_beg = s_beg.reverse.drop(1).reverse
            s_end = s_end.reverse.drop(1).reverse
            seq = seq.reverse.drop(1).reverse # SEQ
          end
          if overlap == 1
            overlap = 2
          end
          (contig.first_hit.hsp_count-(overlap-1)).times do |n| # Add the remaining exons of the contig to the model
            q_beg << contig.first_hit.hsp_at(n+overlap-1).q_beg+add+add_last
            q_end << contig.first_hit.hsp_at(n+overlap-1).q_end+add+add_last
            s_beg << contig.first_hit.hsp_at(n+overlap-1).s_beg
            s_end << contig.first_hit.hsp_at(n+overlap-1).s_end
            # SEQ.......................................
            position_hsp = [n+overlap-2, 0].max
            position_next_hsp = [n+overlap-1, 0].max

            if n == 0
              seq << mark_contig_start.call(contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.first_hit.hsp_at(position_next_hsp).q_end-1])
            elsif position_next_hsp == contig.first_hit.hsp_count-1
              seq << contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.length-1]
            else
              seq << contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.first_hit.hsp_at(position_next_hsp).q_end-1]
            end
            # ............................................
          end
        else
          # Clamp for contigs that only have one hsp
          hsp_position = [last_contig.first_hit.hsp_count-2, 0].max
          add = last_contig.first_hit.hsp_at(hsp_position).q_end
          dif = last_contig.length-add
          drop = 1
          correction = 0
          if overlap > 1
            drop = overlap-1
            correction = 1
            add = last_contig.first_hit.hsp_at(position_overlap).q_end-contig.first_hit.first_hsp.q_end
            dif = length_model-(add+add_last)
          end
          # Remove the poor exons of 'last_contig' (the first one of the overlap is kept)
          q_beg = q_beg.reverse.drop(drop).reverse
          q_end = q_end.reverse.drop(drop).reverse
          s_beg = s_beg.reverse.drop(drop).reverse
          s_end = s_end.reverse.drop(drop).reverse
          seq = seq.reverse.drop(drop).reverse # SEQ

          # Add the exons of 'contig' (skipping the first one when correction is 1)
          (contig.first_hit.hsp_count-correction).times do |n|
            q_beg << contig.first_hit.hsp_at(n+correction).q_beg+add+add_last
            q_end << contig.first_hit.hsp_at(n+correction).q_end+add+add_last
            s_beg << contig.first_hit.hsp_at(n+correction).s_beg
            s_end << contig.first_hit.hsp_at(n+correction).s_end
            # SEQ.............................................................
            if n+1 == (contig.first_hit.hsp_count-correction)
              n_correction = [n+correction-1, 0].max
              seq << contig.seq[contig.first_hit.hsp_at(n_correction).q_end..contig.length-1]
            elsif n == 0
              if n+correction == 0
                # n+correction points at the first exon of the contig
                cn = contig.seq[0..contig.first_hit.hsp_at(n+correction).q_end-1]
              else
                cn = contig.seq[contig.first_hit.hsp_at(n+correction-1).q_end..contig.first_hit.hsp_at(n+correction).q_end-1]
              end
              seq << mark_contig_start.call(cn)
            else
              seq << contig.seq[contig.first_hit.hsp_at(n+correction-1).q_end..contig.first_hit.hsp_at(n+correction).q_end-1]
            end
            # ................................................................
          end
        end
        length_model += (contig.length-dif)
        add_length = true
        add_last += add
      end
    end
    last_position = position_overlap
    last_contig = contig
    last_score = score
  end

  # With discontinuities the model length becomes the list of segment lengths
  unless multiple_lengths.empty?
    multiple_lengths << length_model
    length_model = multiple_lengths
  end

  model = nil
  model_length = nil
  final_seq = nil
  unless out_of_range # Only build the model if every contig aligned with the reference
    model = void_contig(contigs.first.first_hit.name+'_model', length_model, contigs.first.first_hit.s_length, q_beg, q_end, s_beg, s_end, 'contig', 'gene', 'exon')
    # Merge contigs under the sequence reference
    if model.is_a?(Array)
      add = 0
      model.each_with_index do |contig, i|
        contig.modified_coordenates(add)
        add += contig.length
        add += 10 if i < model.length-1 # room for the 10-n gap between segments
      end
      # NOTE(review): 'add' already includes the last contig's length here, so
      # this looks like it counts it twice -- confirm before changing.
      model_length = add+model.last.length
    end

    final_seq = seq.join
  end

  return model, model_length, final_seq
end
802
+
803
# Builds contig object(s) from parallel coordinate arrays.
#
# contig_name   - name for the contig and its hit; when contig_length is an
#                 Array each contig is named "<contig_name>_<n>".
# contig_length - a single length, or an Array with one length per contig.
# s_length      - subject length handed to Contig#add_hit.
# q_beg, q_end, s_beg, s_end - parallel arrays, one entry per HSP. A 0 in
#                 q_beg is treated as a separator between contigs (unless
#                 single is set).
# contig_type, hit_type, hsp_type - values assigned to the #type of the
#                 created contig, hit and HSPs.
# single        - when true, zeros are not separators and one contig is built.
#
# Returns a single contig when single is set or q_beg holds no 0; otherwise
# an Array of contigs. Returns an empty Array when q_beg is empty.
def void_contig(contig_name, contig_length, s_length, q_beg, q_end, s_beg, s_end, contig_type, hit_type, hsp_type, single=false)
  contigs = []
  contig = nil
  n = 0
  last_index = q_beg.length-1
  has_separators = q_beg.include?(0)
  q_beg.each_with_index do |item, ind|
    if item > 0 || single
      if contig.nil?
        if contig_length.is_a?(Array)
          length = contig_length[n]
          name = "#{contig_name}_#{n}"
        else
          length = contig_length
          name = contig_name
        end
        contig = Contig.new(name)
        contig.length = length
        contig.type = contig_type
        hit_v = contig.add_hit(contig_name, s_length, 1, :prot)
        hit_v.type = hit_type
      end
      hsp_v = contig.first_hit.add_hsp(q_beg[ind], q_end[ind], s_beg[ind], s_end[ind], 0, 0, 0, 0)
      hsp_v.type = hsp_type
    end
    # Close the current contig on a separator, or at the last entry
    if (item == 0 && !contig.nil? && !single) || ind == last_index
      if single || !has_separators
        contigs = contig
      else
        contigs << contig
      end
      n += 1
      contig = nil
    end
  end
  return contigs
end
839
+
840
# Positions a contig on the reference and derives its overlap with the
# previous contig from the two reference positions.
#
# Returns two values: the contig's position on the reference (first value of
# contig.compare(reference)) and the overlap position, which is -1 when the
# contig starts beyond the last HSP covered by last_contig, and nil when
# last_position_ref is nil.
def position_reference_guided(contig, last_contig, last_position_ref, reference)
  position_ref, _exons = contig.compare(reference)
  position_overlap = nil
  unless last_position_ref.nil?
    overlaps = position_ref <= last_position_ref + (last_contig.first_hit.hsp_count - 1)
    position_overlap = overlaps ? (last_position_ref - position_ref).abs : -1
  end
  return position_ref, position_overlap
end
851
+
852
# Collapses an Array of contigs into one new Contig.
#
# The new contig takes the name of the first element, receives the hits of
# every element through #transfer_contig_hits (in order) and takes the
# length of the last element.
def array_contigs_to_contig(array_contigs)
  merged = array_contigs.each_with_object(Contig.new(array_contigs.first.name)) do |part, target|
    target.transfer_contig_hits(part)
  end
  merged.length = array_contigs.last.length
  merged
end
860
+
861
# Reports a failed gene: prints "<gene_name> ERROR" to standard output and
# appends a record to file_error with the gene name, the exception message,
# its backtrace and the names of the contigs in the cluster.
#
# e          - Ruby exception object
# gene_name  - String with the gene name
# file_error - writable IO-like object that receives the record
# cluster    - collection of objects responding to #name
# model      - not used by this method
def gene_error(e, gene_name, file_error, cluster, model) #e is a ruby exception object
  puts gene_name+' ERROR'

  header = "\n"+gene_name+"\n.............................."
  file_error.puts header
  file_error.puts e.message
  e.backtrace.each { |frame| file_error.puts frame }

  file_error.puts ',,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,'
  cluster.each { |member| file_error.puts member.name }
  file_error.puts '----------------------------------------------------------------------------------------'
end
876
+
877
# Computes the percentage of the subject covered by overlaps in a model.
#
# model  - object responding to #overlap (collection of overlap lengths) and
#          #first_hit.s_length.
# output - when true (default) a warning listing each overlap (multiplied by
#          -3) and the total percentage is printed to standard output.
#
# Returns 0 when the model reports no overlaps; otherwise the sum of the
# overlaps times -100.0 divided by the subject length, rounded to 2 decimals.
def overlap_test(model, output=true) # `true`, not TRUE: the constant was removed in Ruby 3.2
  overlap = model.overlap
  return 0 if overlap.empty?

  print 'WARNING: overlap/s ' if output
  total = 0
  overlap.each do |length_overlap|
    print((length_overlap*-3).to_s+', ') if output
    total += length_overlap
  end
  perc_overlap = (total*-100.0/model.first_hit.s_length).round(2)
  puts 'nt. % Total overlap '+perc_overlap.to_s if output
  return perc_overlap
end
898
+
899
# Computes the percentage of the subject recovered by a model.
#
# model  - object responding to #exones_s (collection of numbers that are
#          summed) and #first_hit (with #s_length and #name).
# output - when true (default) a "Recovered<TAB>name<TAB>value" line is
#          printed to standard output.
#
# Returns the sum of exones_s times 100.0 divided by the subject length,
# rounded to 2 decimals.
def recover_test(model, output=true) # `true`, not TRUE: the constant was removed in Ruby 3.2
  recovered = model.exones_s.inject(0) { |sum, exon| sum + exon }
  recovered = (recovered*100.0/model.first_hit.s_length).round(2)
  puts "Recovered\t"+model.first_hit.name+"\t#{recovered}" if output
  return recovered
end
910
+
911
# Opens the HTML gene index and writes its header and table header.
#
# web  - nothing is done (and nil is returned) unless this is truthy.
# path - path of the HTML file to create.
#
# Returns the open file handle (left open for the caller), or nil when web
# is falsy.
def web_header(web, path)
  return nil unless web

  index_file = File.open(path,'w')
  html_header(index_file,'Gene index')
  columns = ['Gene model name', 'Protein length', 'Num exon', '% recovered protein', '% overlapping sequence', 'Fragmentation']
  html_table_header(index_file,1,columns)
  index_file
end
920
+
921
# Writes the table footer and page footer to the HTML index and closes it.
# Does nothing when file_web is nil.
def web_body(file_web)
  return if file_web.nil?

  html_table_footer(file_web)
  html_footer(file_web)
  file_web.close
end
928
+
929
# Writes the model sequences to a FASTA file.
#
# sequences_hash - name => sequence pairs; each record is written as
#                  ">name_model" followed by the sequence on the next line.
# path           - path of the file to create (truncated if it exists).
#
# Returns nil.
def write_model_fasta(sequences_hash, path)
  # Block form guarantees the file is closed even if writing raises
  File.open(path,'w') do |model_file|
    sequences_hash.each do |name, sequence|
      model_file.puts '>'+name+"_model\n"+sequence
    end
  end
  nil
end
936
+
937
# Emits two GFF reports to the same path through ReportGff: first the model
# dataset, then the full dataset (the latter also receives `name`).
# Both reports are built with 's' and created with 'a'.
def write_gbrowse_gff(gff_dataset_model, gff_dataset, path, name)
  ReportGff.new(gff_dataset_model,path,'s').create('a')
  ReportGff.new(gff_dataset,path,'s').create('a',name)
end
943
+
944
# Renames the hits of a model in place: with more than one hit each hit gets
# the suffix "_gene_<index>"; otherwise the first hit gets the suffix "_gene".
def format_model(model)
  if model.n_hits? <= 1
    only_hit = model.first_hit
    only_hit.name = only_hit.name+'_gene'
  else
    model.each_hit_with_index do |hit, index|
      hit.name = hit.name+"_gene_#{index}"
    end
  end
end
953
+
954
# Normalises a model and aligns the GFF dataset and stored sequence with it.
#
# model          - a contig, or an Array of contigs which is first collapsed
#                  with array_contigs_to_contig (its trailing "_0" suffix is
#                  removed and its length set to length_model).
# length_model   - length assigned to a model that was given as an Array.
# gff_dataset    - object providing #correct_left_side_contigs(model), whose
#                  result is used as the left-side shift, and
#                  #align_contigs(model).
# sequences_hash - gene name => sequence; when the shift is positive the
#                  model's sequence is prefixed in place with that many 'n'.
#
# Returns the (possibly collapsed) model.
def correct_model(model, length_model, gff_dataset, sequences_hash)
  if model.is_a?(Array)
    model = array_contigs_to_contig(model)
    # Strip only the trailing "_0": gsub removed every "_0" in the name and
    # mangled gene names that contain it (e.g. "g_01_model_0")
    model.name = model.name.sub(/_0\z/, '')
    model.length = length_model
  end

  left_padding = gff_dataset.correct_left_side_contigs(model)
  model.modified_coordenates(left_padding)
  model.length += left_padding
  gff_dataset.align_contigs(model)

  # Pad the sequence so it lines up with the generated features
  if left_padding > 0
    # Only the trailing "_model" suffix is removed to recover the hash key
    gene_name = model.name.sub(/_model\z/, '')
    sequences_hash[gene_name] = 'n'*left_padding+sequences_hash[gene_name]
  end

  return model
end
974
+
975
+ end