gene_assembler 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. data/.gitignore +22 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +29 -0
  5. data/Rakefile +2 -0
  6. data/bin/GeneAssembler +233 -0
  7. data/bin/phytozome_scan +60 -0
  8. data/gene_assembler.gemspec +25 -0
  9. data/lib/gene_assembler.rb +5 -0
  10. data/lib/gene_assembler/blast_type_parser.rb +41 -0
  11. data/lib/gene_assembler/contig.rb +643 -0
  12. data/lib/gene_assembler/dataset.rb +532 -0
  13. data/lib/gene_assembler/exonerate_result.rb +230 -0
  14. data/lib/gene_assembler/gff_contig.rb +67 -0
  15. data/lib/gene_assembler/gff_dataset.rb +152 -0
  16. data/lib/gene_assembler/gff_feature.rb +175 -0
  17. data/lib/gene_assembler/gff_frameshift.rb +6 -0
  18. data/lib/gene_assembler/gff_go.rb +13 -0
  19. data/lib/gene_assembler/gff_hit.rb +53 -0
  20. data/lib/gene_assembler/gff_hsp.rb +6 -0
  21. data/lib/gene_assembler/gff_localization.rb +6 -0
  22. data/lib/gene_assembler/gff_master_feature.rb +5 -0
  23. data/lib/gene_assembler/gff_parser.rb +35 -0
  24. data/lib/gene_assembler/gff_snp.rb +21 -0
  25. data/lib/gene_assembler/gff_stop.rb +6 -0
  26. data/lib/gene_assembler/go.rb +13 -0
  27. data/lib/gene_assembler/hit.rb +191 -0
  28. data/lib/gene_assembler/hsp.rb +100 -0
  29. data/lib/gene_assembler/other_functions.rb +228 -0
  30. data/lib/gene_assembler/parser.rb +25 -0
  31. data/lib/gene_assembler/parser_blast.rb +12 -0
  32. data/lib/gene_assembler/parser_exonerate.rb +16 -0
  33. data/lib/gene_assembler/rebuild.rb +975 -0
  34. data/lib/gene_assembler/report.rb +13 -0
  35. data/lib/gene_assembler/report_gff.rb +30 -0
  36. data/lib/gene_assembler/snp.rb +13 -0
  37. data/lib/gene_assembler/version.rb +3 -0
  38. metadata +149 -0
@@ -0,0 +1,100 @@
1
+ class Hsp
2
+ attr_accessor :q_beg, :q_end, :s_beg, :s_end, :align_len, :score, :ident, :gaps, :type
3
+ def initialize (q_beg, q_end, s_beg, s_end, align_len, score, ident, gaps)
4
+ @q_beg=q_beg #Inicio en query
5
+ @q_end=q_end #Fin en query
6
+ @s_beg=s_beg #Inicio en subject
7
+ @s_end=s_end #Fin en subject
8
+ @align_len=align_len #Tamaño de la secuencia alineada
9
+ #@bit_score=bit_score
10
+ @score=score
11
+ @ident=ident
12
+ @gaps=gaps
13
+ @type=nil
14
+ end
15
+
16
+ def compare(hsp) #Compara hsps distintintos a nivel del subject para saber si son el mismo
17
+ coverage=0
18
+ if self.s_beg==hsp.s_end && self.s_end==hsp.s_end
19
+ coverage=1
20
+ elsif self.s_beg>=hsp.s_beg && self.s_end<hsp.s_end #Caso de q el self este dentro de hsp
21
+ coverage=1
22
+ elsif self.s_beg<=hsp.s_beg && self.s_end>hsp.s_beg && (self.s_end-hsp.s_beg).abs>1
23
+ ext=self.s_end-hsp.s_beg*1.00 # El producto obliga a usar la clase float para impedir q trunque el resultado
24
+ coverage=ext/(self.s_end-self.s_beg)
25
+ elsif self.s_beg<hsp.s_end && self.s_end>=hsp.s_end && (self.s_beg-hsp.s_end).abs>1 #Ultima condicion impide q de como mismo exon el compartir un aa q realmente esta partido entre 2 exones
26
+ ext=hsp.s_end-self.s_beg*1.00
27
+ coverage=ext/(self.s_end-self.s_beg)
28
+ end
29
+ return coverage
30
+ end
31
+
32
+ def compare_q(hsp) #Compara hsps distintintos a nivel del query para saber si son el mismo
33
+ coverage=0
34
+ if self.q_beg==hsp.q_end && self.q_end==hsp.q_end
35
+ coverage=1
36
+ elsif self.q_beg>=hsp.q_beg && self.q_end<hsp.q_end #Caso de q el self este dentro de hsp
37
+ coverage=1
38
+ elsif self.q_beg<=hsp.q_beg && self.q_end>hsp.q_beg && (self.q_end-hsp.q_beg).abs>1
39
+ ext=self.q_end-hsp.q_beg*1.00 # El producto obliga a usar la clase float para impedir q trunque el resultado
40
+ coverage=ext/(self.q_end-self.q_beg)
41
+ elsif self.q_beg<hsp.q_end && self.q_end>=hsp.q_end && (self.q_beg-hsp.q_end).abs>1 #Ultima condicion impide q de como mismo exon el compartir un aa q realmente esta partido entre 2 exones
42
+ ext=hsp.q_end-self.q_beg*1.00
43
+ coverage=ext/(self.q_end-self.q_beg)
44
+ end
45
+ return coverage
46
+ end
47
+
48
+
49
+ def length_q #Longitud del hsp en la query
50
+ length=@q_end-@q_beg
51
+ return length
52
+ end
53
+
54
+ def rev(length_hsp) # Cambia coordenadas de reversas a directas
55
+ @q_beg=length_hsp-@q_beg #Inicio en query
56
+ @q_end=length_hsp-@q_end #Fin en query
57
+ @reversed=FALSE
58
+ end
59
+
60
+ def within?(hsp,long) #Mira si un hsp esta dentro de otro o si hay overlap parcial entre los mismos
61
+ over=0
62
+ if self.q_beg<=hsp.q_beg && self.q_end>=hsp.q_end
63
+ over=1
64
+ end
65
+ if self.s_beg<=hsp.s_beg && self.s_end>=hsp.s_end
66
+ over=1
67
+ end
68
+ if over == 0
69
+ self_coverage=(self.s_end-self.s_beg)*1.00/long
70
+ hsp_coverage=(hsp.s_end-hsp.s_beg)*1.00/long
71
+ if hsp_coverage>(1-self_coverage) #Si el coverage del hsp en mayor que el resto que deja el self, se da como overlap
72
+ over=1
73
+ end
74
+ end
75
+ return over
76
+ end
77
+
78
+ def modified_coordenates(add)
79
+ @q_beg+=add
80
+ @q_end+=add
81
+ end
82
+
83
+ def rev_coord(contig_length)
84
+ puts '---------------------------------'
85
+ puts @q_beg.to_s+' '+@q_end.to_s
86
+ @q_beg=contig_length-@q_beg+1
87
+ @q_end=contig_length-@q_end+1
88
+ puts @q_beg.to_s+' '+@q_end.to_s
89
+ end
90
+
91
+ def overlap_with(last_hsp)
92
+ overlap=0
93
+ diference=self.s_beg-last_hsp.s_end
94
+ #puts "#{self.s_beg} - #{last_hsp.s_end} = #{diference}"
95
+ if diference<0
96
+ overlap=diference
97
+ end
98
+ return overlap
99
+ end
100
+ end
@@ -0,0 +1,228 @@
1
+ require 'scbi_fasta'
2
+
3
# Relates each contig to its SAM file (<map_path>/<contig name>.sam) and writes
# a read-coverage value per exon into gene_array.
#
# contigs    - objects responding to name, length, hits and indices.
# gene_array - one row per contig; the values are written starting at the
#              position of the first 1 found in the contig's row.
# map_path   - directory holding the SAM files; contigs without a file are
#              left untouched.
#
# Each exon value is the percentage (sum of per-base depth * 100 / number of
# reads / exon length), rounded to two decimals.
def mapping(contigs, gene_array, map_path)
  contigs.each_with_index do |contig, y|
    ruta = File.join(map_path, "#{contig.name}.sam")
    # File.exists? was removed in Ruby 3.2.
    next unless File.exist?(ruta)

    # Parse mapping: per-base read depth along the contig.
    seq_map = Array.new(contig.length, 0)
    n_reads = 0
    # File.foreach closes the file when done (the original handle leaked).
    File.foreach(ruta) do |line|
      fields = line.split
      next if fields.length < 6    # blank or truncated line: no POS/CIGAR
      next if fields[0] =~ /[@]/   # header line

      n_reads += 1
      start_map = fields[3].to_i - 1 # POS is 1-based
      # As in the original, every number found in the CIGAR string is added.
      aligned = fields[5].scan(/\d+/).inject(0) { |total, run| total + run.to_i }
      end_map = start_map - 1 + aligned
      # Count start_map..end_map inclusive, clamped to the contig. The original
      # loop also incremented end_map + 1 (and position 0 for unmapped reads).
      first = [start_map, 0].max
      last = [end_map, seq_map.length - 1].min
      (first..last).each { |pos| seq_map[pos] += 1 }
    end

    # Exon valuation, using the HSPs of the first hit as exons.
    exon_stadistic = contig.hits.first.hsps.map do |hsp|
      exon = seq_map[(hsp.q_beg - 1)..(hsp.q_end - 1)]
      value = exon.inject(0) { |total, depth| total + depth }
      (value * 100.0 / n_reads / exon.length).round(2)
    end

    x = gene_array[y].index(1)
    exon_stadistic.each_with_index do |item, b|
      gene_array[y][x + b] = item
    end
  end

  if $verbose
    puts "\nGENE ARRAY - EXON VALUATED"
    gene_array.each_with_index do |fila, c|
      print "#{contigs[c].name.center(24)} "
      fila.each do |item|
        print "#{item.to_s}\t"
      end
      puts "\n"
    end
  end

  contigs.each do |contig|
    puts '...................'
    contig.indices
  end
  puts "\n"
end
76
+
77
# Returns the length of the longest element of +array+ (0 when it is empty).
def length2D(array)
  array.inject(0) { |longest, item| [longest, item.length].max }
end
87
+
88
# Looks for start/stop codons in every contig and stores the outcome in its
# +completed+ attribute:
#   true    - both a start and a stop codon were found
#   'start' - only the start codon was found
#   'stop'  - only the stop codon was found
#   false   - neither was found
# The start codon is only searched when the contig's row in gene_array begins
# with a value greater than 0 (the contig holds the first exon); the stop
# codon is always searched.
def parse_contig_index(gene_array, contigs)
  gene_array.each_with_index do |contig, i|
    start = contig.first > 0 ? contigs[i].start_codon_search : nil # nil: unknown
    stop = contigs[i].stop_codon_search

    # TRUE/FALSE were removed in Ruby 3.2. The comparisons stay strict (== true)
    # because the search methods may return other truthy values.
    contigs[i].completed =
      if start == true && stop == true
        true
      elsif start == true
        'start'
      elsif stop == true
        'stop'
      else
        false
      end
  end
end
110
+
111
# Picks, among +contigs+, one contig labelled 'start' and one labelled 'stop'.
# When several share a label, the one whose first HSP of the first hit has the
# highest score wins (the earliest one on ties). Returns [start, stop]; either
# may be nil.
def sides_recovery(contigs)
  best = { 'start' => nil, 'stop' => nil }
  contigs.each do |contig|
    label = contig.completed
    next unless best.key?(label)

    current = best[label]
    if current.nil? || current.hits.first.hsps.first.score < contig.hits.first.hsps.first.score
      best[label] = contig
    end
  end
  [best['start'], best['stop']]
end
136
+
137
# Adds the +start+ contig at the front and the +stop+ contig at the back of
# +contigs+ when no contig already carries that signal (completed == 'start' /
# 'stop', or completed == true for both). Returns the resulting array.
#
# As in the original, prepending builds a new array while appending modifies
# the array in place.
def sides_add(contigs, start, stop)
  # TRUE/FALSE were removed in Ruby 3.2; use the literals.
  has_start = contigs.any? { |contig| contig.completed == 'start' || contig.completed == true }
  has_stop = contigs.any? { |contig| contig.completed == 'stop' || contig.completed == true }

  contigs = [start].concat(contigs) if !has_start && !start.nil?
  contigs.concat([stop]) if !has_stop && !stop.nil?
  contigs
end
160
+
161
# Removes from +cluster+ (and the matching row of +gene_array+) every contig
# labelled 'stop' whose first hit has a single HSP spanning fewer than
# +length+ subject positions. Both arrays are modified in place and returned
# as [gene_array, cluster].
def cluster_filter(gene_array, cluster, length)
  cluster.each_index do |idx|
    candidate = cluster[idx]
    next unless candidate.completed == 'stop'

    hsps = candidate.hits.first.hsps
    tail = hsps.last
    next unless tail.s_end - tail.s_beg < length && hsps.count == 1

    cluster[idx] = nil
    gene_array[idx] = nil
  end
  cluster.compact!
  gene_array.compact!
  [gene_array, cluster]
end
174
+
175
# Returns the position offset between two contigs, derived from their HSPs:
# the difference of the query starts plus three times the difference of the
# subject starts.
def coord_prot(last_contig_hsp, current_contig_hsp)
  query_shift = last_contig_hsp.q_beg - current_contig_hsp.q_beg
  subject_shift = current_contig_hsp.s_beg - last_contig_hsp.s_beg
  query_shift + 3 * subject_shift
end
179
+
180
# Loads the FASTA file at +path+ through FastaFile and returns a hash that
# maps each first value yielded by the reader to the second one
# (name => sequence, judging by the variable names).
def fasta_hash(path)
  sequences = {}
  FastaFile.new(path).each { |contig, seq_fasta| sequences[contig] = seq_fasta }
  sequences
end
188
+
189
# Writes the XHTML prologue, the <head> with +title+ and the opening <body>
# tag to +file+, one line per element.
#
# +title+ is interpolated, so non-String values are accepted (the original
# string concatenation raised TypeError on them), in line with the other HTML
# helpers in this file. The title is not HTML-escaped.
def html_header(file, title)
  lines = [
    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
    '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">',
    '<head>',
    '<meta http-equiv="content-type" content="text/html;charset=UTF-8" />',
    "<title>#{title}</title>",
    '</head>',
    '<body>'
  ]
  file.puts(*lines)
end
198
+
199
# Writes the closing </body> and </html> tags to +file+, one per line.
def html_footer(file)
  closing_tags = ['</body>', '</html>']
  file.puts(*closing_tags)
end
203
+
204
# Opens an HTML table on +file+ with the given +border+ width and writes one
# header row with a <th> cell per element of +headers+ (an array).
#
# Headers are interpolated so non-String values work, as cells already do in
# html_row (the original concatenation raised TypeError on them).
def html_table_header(file, border, headers)
  file.puts "<table border=\"#{border}\">", '<tr>'
  headers.each do |header|
    file.puts "<th>#{header}</th>"
  end
  file.puts '</tr>'
end
212
+
213
# Writes one table row to +file+: <tr>, a <td> line per element of +cells+
# (which must be an array) and the closing </tr>.
def html_row(file, cells)
  file.puts '<tr>'
  cells.each { |cell| file.puts "<td>#{cell}</td>" }
  file.puts '</tr>'
end
220
+
221
# Returns an HTML anchor showing +text+ and pointing to +link+.
#
# Both values are interpolated: the original coerced +text+ with to_s but
# concatenated +link+ raw, raising TypeError on non-String links. Neither
# value is HTML-escaped.
def html_link(text, link)
  "<a href=\"#{link}\">#{text}</a>"
end
225
+
226
# Writes the closing </table> tag to +file+.
def html_table_footer(file)
  closing_tag = '</table>'
  file.puts(closing_tag)
end
@@ -0,0 +1,25 @@
1
+ require 'dataset'
2
+
3
# Base parser: creates a dataset, parses a file and loads the parsed data into
# the dataset. parse_file and load_dataset are empty here.
class Parser
  attr_accessor :dataset

  # file - the file to parse.
  # type - accepted but not used by this class.
  def initialize(file, type = nil)
    @file = file
    @dataset = create_dataset
    parsed = parse_file(file) # data object used to fill the dataset
    load_dataset(parsed)      # fill the dataset with the parsed information
  end

  # Builds the dataset to fill; the 'unknown' argument is marked as unused in
  # the original code.
  def create_dataset
    Dataset.new('unknown')
  end

  # Empty in this class; returns nil.
  def parse_file(file); end

  # Empty in this class; returns nil.
  def load_dataset(data); end
end
@@ -0,0 +1,12 @@
1
+ require 'blast_type_parser'
2
+ require 'scbi_blast' #Si falla, buscar e instalar tb 'gem install xml-simple' de la q depende
3
+
4
+
5
# BlastTypeParser whose parse_file reads +file+ through BlastTableResult.
class ParserBlast < BlastTypeParser
  # Returns the BlastTableResult built from +file+.
  def parse_file(file)
    BlastTableResult.new(file)
  end
end
@@ -0,0 +1,16 @@
1
+ require 'exonerate_result'
2
+ require 'blast_type_parser'
3
+
4
# BlastTypeParser whose parse_file reads +file+ through ExonerateResult.
class ParserExonerate < BlastTypeParser
  # Returns the ExonerateResult built from +file+ and @all.
  # NOTE(review): @all is not assigned in this class; presumably set by
  # BlastTypeParser - confirm.
  def parse_file(file)
    ExonerateResult.new(file, @all)
  end

  # Copies the query and subject frameshifts of +item+ onto +contig+.
  def populate_extra_atributes(contig, item)
    contig.q_frameshift = item.q_frameshift
    contig.s_frameshift = item.s_frameshift
  end
end
@@ -0,0 +1,975 @@
1
+ require 'dataset'
2
+ require 'other_functions'
3
+ require 'report_gff'
4
+ require 'fileutils'
5
+
6
+ class Rebuild
7
+ def initialize(dataset,dataset_uni_hsp,path) #La clase ha de recibr objetos dataset
8
+ @dataset=dataset
9
+ @dataset_uni_hsp=dataset_uni_hsp
10
+ @path=path
11
+ @db_seqs=fasta_hash(path[:exonerate_db])
12
+ end
13
+
14
+ ###############################################################################################################
15
+ # MAIN METHOD
16
+ ###############################################################################################################
17
+ def rebuild(options) #Genera contigs modelo,gff y busca pseudogenes
18
+ gff_dataset_model=Dataset.new(:mix) #Object for save info of GeneAssembler's output
19
+ gff_dataset=Dataset.new(:mix) #Object for save info of GeneAssembler's output
20
+ file_error=File.open(@path[:error],'w')
21
+ file_web=web_header(options[:web],@path[:html])
22
+ sequences_hash={}
23
+ gene_name=nil
24
+ model=nil
25
+ statistics={:genes => 0, :total_recovered => 0, :total_overlap => 0, :total_fragmentation => 0}
26
+ puts "\nMODELING GENE",'*******************************************'
27
+ @dataset.each_cluster{|cluster|
28
+ begin
29
+ if !cluster.nil?
30
+ gene_name=cluster.first.first_hit.name
31
+ end
32
+ cluster_complete=cluster.dup
33
+ model, length_model, length_cluster = iterative_modeling_gene_w_reference(cluster,@dataset.references_hash,options[:rebuild],sequences_hash) #Realiza la reconstruccion del gen (alineado,descarte y montaje del gen)
34
+
35
+ # GeneAssembler output (gff for Gbrowse)
36
+ #--------------------------------------------------------
37
+ if !model.nil?
38
+ #Format Contigs children of model
39
+ gff_dataset.clr_contigs
40
+ gff_dataset.transfer_contigs(cluster_complete)
41
+ gff_dataset.transfer_n_contigs_def_hit_type(@dataset_uni_hsp,cluster,'pseudogene',50) #Transferir pseudogenes al report
42
+
43
+ # Convertir arrays a contig y ajustar alineamiento añadiendo Ns
44
+ model=correct_model(model, length_model, gff_dataset, sequences_hash)
45
+
46
+ # Comprobaciones en el modelo
47
+ exones=model.exones_s.length # N exones
48
+ puts 'Exones: '+ exones.to_s
49
+ recovered=recover_test(model)
50
+ overlap=overlap_test(model)
51
+ fragmentation=((length_cluster-1.00)/exones).round(2)
52
+ puts 'Fragmentation: ' + fragmentation.to_s
53
+
54
+ # HTML index
55
+ if !file_web.nil?
56
+ gene_link=html_link(model.first_hit.name, @path[:gbrowse_link]+model.first_hit.name)
57
+ html_row(file_web, [gene_link, cluster.first.first_hit.s_length, exones, recovered, overlap, fragmentation])
58
+ end
59
+
60
+ #Format Model for Gbrowse
61
+ gff_dataset_model.clr_contigs
62
+ format_model(model) #Añade la particula _gene al modelo
63
+ gff_dataset_model.transfer_contigs(model)
64
+
65
+ #Write
66
+ write_gbrowse_gff(gff_dataset_model, gff_dataset, @path[:gff], model.name)
67
+
68
+ #General statistics
69
+ statistics[:genes]+=1
70
+ statistics[:total_recovered]+=recovered
71
+ statistics[:total_overlap]+=overlap
72
+ statistics[:total_fragmentation]+=fragmentation
73
+ end
74
+ rescue Exception => e
75
+ gene_error(e, gene_name, file_error, cluster_complete, model)
76
+ end
77
+ puts '* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *'
78
+ }
79
+ file_error.close
80
+ web_body(file_web)
81
+
82
+ puts "\nFINAL STATISTICS\n",
83
+ 'Recovered genes: '+ statistics[:genes].to_s,
84
+ 'Mean recover: ' + (statistics[:total_recovered]/statistics[:genes]).to_s,
85
+ 'Mean overlap: ' + (statistics[:total_overlap]/statistics[:genes]).to_s,
86
+ 'Mean fragmentation: ' + (statistics[:total_fragmentation]/statistics[:genes]).to_s
87
+ write_model_fasta(sequences_hash,@path[:fasta])
88
+ end
89
+ ################################################################################################################################
90
+ # end main method
91
+ ################################################################################################################################
92
+
93
+ def iterative_modeling_gene_w_reference(cluster,references_hash,options,sequences_hash)
94
+ # Model atributes
95
+ model=nil
96
+ length=0
97
+ seq=nil
98
+ cluster_length=0
99
+ length_cluster=0
100
+ prot_reference=cluster.first.first_hit.name
101
+ array_references=references_hash[prot_reference]
102
+
103
+ # Model parameters
104
+ recover=0
105
+ overlap=0
106
+
107
+ #Modelo de gen en ciego
108
+ if $verbose
109
+ puts "\n",'|||||||||| BLIND MODELING ||||||||||'
110
+ end
111
+ model, length, seq, length_cluster= modeling_gene(cluster.dup,nil,options)
112
+
113
+ recover, overlap=eval_model(model.dup, length)
114
+
115
+ if $verbose
116
+ puts "\nRecover: #{recover} Overlap: #{overlap}"
117
+ end
118
+
119
+ guided=FALSE
120
+ #Modelo de gen guiado
121
+ if !array_references.nil?
122
+ array_references.each do |ref|
123
+ if $verbose
124
+ puts "\n",'|||||||||| GUIDED MODELING ||||||||||'
125
+ end
126
+
127
+ guided_model, guided_length, guided_seq, guided_length_cluster = modeling_gene(cluster.dup,ref,options)
128
+ if guided_model.nil? # Si algun modelo sale mal se ignora
129
+ next
130
+ end
131
+ guided_recover, guided_overlap= eval_model(guided_model.dup, guided_length)
132
+ if $verbose
133
+ puts "\nRecover: #{guided_recover} Overlap: #{guided_overlap}"
134
+ end
135
+
136
+ #Arbol de decisiones
137
+ if guided_overlap <= 15 #Si el overlap es menor del 15 %
138
+ if guided_overlap >= overlap-overlap*0.05 && guided_overlap <= overlap+overlap*0.05 # A mismo overlap
139
+ if guided_recover > recover
140
+ guided=TRUE
141
+ end
142
+ else # A distinto overlap
143
+ recover_dif=guided_recover-recover
144
+ if recover_dif < 0 # Si el guided_model tiene menos recuperacion q el anterior
145
+ if recover_dif.abs >= overlap-overlap*0.05 && recover_dif.abs <= overlap+overlap*0.05 #Si la reduccion de la recuperacion se debe a la desaparicion del overlap
146
+ guided=TRUE
147
+ end
148
+ elsif recover_dif> guided_overlap+guided_overlap*0.05 # Comprobar que la diferencia de recover no se debe a u aumento del overlap en la misma magnitud
149
+ guided=TRUE
150
+ end
151
+ end
152
+ elsif guided_overlap < overlap # Quedarnos siempre con los overlap mas bajos aun en situacion de overlap alto
153
+ guided=TRUE
154
+ end
155
+
156
+ if guided
157
+ model=guided_model
158
+ length=guided_length
159
+ seq=guided_seq
160
+ length_cluster=guided_length_cluster
161
+ recover=guided_recover
162
+ overlap=guided_overlap
163
+ end
164
+ end
165
+ end
166
+
167
+ sequences_hash[prot_reference]=seq
168
+ return model, length, length_cluster
169
+ end
170
+
171
+ def eval_model(local_model, length)#modifica model asi q se le ha de pasar una copia
172
+ recover=0
173
+ overlap=0
174
+ if local_model.class.to_s=='Array'
175
+ local_model=array_contigs_to_contig(local_model)
176
+ local_model.length=length
177
+ end
178
+ recover=recover_test(local_model,FALSE)
179
+ overlap=overlap_test(local_model,FALSE)
180
+ return recover, overlap
181
+ end
182
+
183
+ def modeling_gene(cluster, reference, rebuild) #Funcion que devuelve un objeto contig con el modelo de gen, los contigs q se han seleccionado y genera un gff del modelo
184
+ model=nil
185
+ model_length=nil
186
+ seq=nil
187
+ length_cluster=0
188
+ # Reduccion iterativa de los contig para seleccionar los que van a formar parte del modelo de gen, elimina fragmentos menores que se puedan tomar como nuevos exones
189
+ #--------------------------------------------------------------------------------------------------
190
+ gene_array_length_before=nil
191
+ continue=TRUE
192
+ gene_array=[]
193
+
194
+ while continue
195
+ cluster,gene_array=gene_array_and_compact(rebuild,cluster,reference)
196
+ gene_array_length_after=length2D(gene_array)
197
+ if gene_array_length_after == gene_array_length_before
198
+ continue=FALSE
199
+ end
200
+ gene_array_length_before=gene_array_length_after
201
+ end
202
+ length_cluster=cluster.length
203
+
204
+ # Modelado del gen
205
+ #----------------------------------------------------
206
+ if rebuild && !cluster.empty? && !gene_array.empty?
207
+ if cluster.length >1
208
+ cluster_comp=contig_compact(cluster) #Fusiona contigs contiguos y devuelve el array correspondiente
209
+ else
210
+ cluster_comp=cluster
211
+ end
212
+ if !cluster_comp.nil?
213
+ model, model_length, seq=gene_model_cut(cluster_comp, reference)
214
+ else
215
+ puts cluster.first.first_hit.name+"\tGENE MODEL ABORTED"
216
+ end
217
+ end
218
+ return model, model_length, seq, length_cluster
219
+ end
220
+
221
+ def gene_array_and_compact(rebuild,cluster,reference)
222
+ # Contruir array de exones (a partir del cluster) con los hsps de forma que los solapantes se alineen en las mismas columnas
223
+ #---------------------------------------------------------------------------------------------------------------------------
224
+ if rebuild
225
+ gene_array,gene_array_introns=build_gene_array(cluster,reference) #Con referencia
226
+ if $verbose
227
+ gene_exons=gene_stadistics(gene_array)
228
+ gene_stadistics_report(gene_exons,'EXONS')
229
+ gene_introns=gene_stadistics(gene_array_introns)
230
+ gene_stadistics_report(gene_introns,'INTRONS')
231
+ end
232
+ end
233
+
234
+ # Seleccion de contigs para modelado de gen
235
+ #-----------------------------------------------------
236
+ if $verbose #Info cluster before compact array contigs
237
+ gene_array_report(gene_array,cluster,rebuild)
238
+ end
239
+ if rebuild
240
+ gene_compact(gene_array,cluster) #Se descartan los contigs redundantes y quedan aquellos que cubren todo el gen para formar el modelo
241
+ end
242
+ if $verbose && rebuild #Info cluster after compact array contigs
243
+ gene_array_report(gene_array,cluster,rebuild)
244
+ end
245
+ return cluster, gene_array
246
+ end
247
+
248
+ def add_uni_hsp(model,cluster)#Compara contigs uni-hsp con contig modelo para determinar pseudogenes
249
+ contigs_uni_hsp=''
250
+ is_contig=0
251
+ pseudogenes=[]
252
+ @clusters_uni_hsp.each do |contigs|
253
+ if contigs.first.first_hit.name==cluster.first.first_hit.name
254
+ contigs_uni_hsp=contigs
255
+ is_contig=1
256
+ break
257
+ end
258
+ end
259
+
260
+ if is_contig==1 #Si se ha encontrado contigs uni-hsp se realiza la comparacion
261
+ if model.class.to_s!='Array'
262
+ model=[model]
263
+ end
264
+ model.each do |item|
265
+ contigs_uni_hsp.each do |contig|
266
+ start,exons=item.compare(contig)
267
+ if exons>1 && !pseudogenes.include?(contig)
268
+ pseudogenes << contig
269
+ end
270
+ end
271
+ end
272
+ end
273
+ return pseudogenes
274
+ end
275
+
276
# Builds two parallel 2D arrays (one row per contig) that place the contigs
# relative to each other: gene_array from contig.exones_s and
# gene_array_introns from contig.intrones_q. Each row starts with a run of 0s
# (empty positions) followed by the contig's values.
# contigs   - contigs to place, processed in the given order.
# reference - optional; when given, every contig is compared against it
#             instead of against the previously processed contig.
# Returns [gene_array, gene_array_introns]; both are emptied when a contig does
# not match the reference.
+ def build_gene_array(contigs,reference=nil) # Builds the arrays that hold the relative position of all contigs at exon and intron level
277
+ gene_array=[]
278
+ gene_array_introns=[]
279
+ last_contig=''
280
+ if !reference.nil?
281
+ last_contig=reference
282
+ end
283
+ contigs.each do |contig|
284
+ array_contig=[]
285
+ array_contig_introns=[]
286
+ n_exon=contig.first_hit.hsp_count # Number of HSPs in the contig (not used below)
287
+ # Determine the empty positions
288
+ if !gene_array.empty?||reference
289
+ first_exon,ex=contig.compare(last_contig) # Compare this contig with the previous one (or with the reference); presumably first_exon is the column offset and -1 means no match - TODO confirm against Contig#compare
290
+ if reference && first_exon==-1 # Abort the alignment when a contig does not match the reference
291
+ if $verbose
292
+ puts "\n#{contig.name} alignment step OUT OF RANGE"
293
+ end
294
+ gene_array=[]
295
+ gene_array_introns=[]
296
+ break
297
+ end
298
+ if first_exon==-1
299
+ gene_array.last.count.times do # Empty positions when there is NO overlap: one per element of the previous row
300
+ array_contig << 0 # Marks the absence of an exon at that position
301
+ array_contig_introns << 0 # Marks the absence of an intron at that position
302
+ end
303
+ else
304
+ if reference # Position of the contig relative to the reference
305
+ void_positions=first_exon
306
+ else
307
+ void_positions=first_exon+gene_array.last.count(0) # Offset plus the number of 0 entries of the previous row
308
+ end
309
+ void_positions.times do # Empty positions when there IS overlap
310
+ array_contig << 0
311
+ array_contig_introns << 0
312
+ end
313
+ end
314
+ end
315
+ # Append the exons and introns of the contig
316
+ exones=contig.exones_s
317
+ introns=contig.intrones_q
318
+ array_contig << exones # Marks the presence of exons from that position on
319
+ array_contig_introns << introns # Marks the presence of introns from that position on
320
+ gene_array << array_contig.flatten! # NOTE(review): flatten! returns nil when nothing was flattened, so a non-Array exones_s would append nil - confirm exones_s always returns an Array
321
+ gene_array_introns << array_contig_introns.flatten!
322
+ if reference.nil?
323
+ last_contig=contig
324
+ end
325
+ end
326
+ return gene_array, gene_array_introns
327
+ end
328
+
329
+ def gene_stadistics(gene_array) #Calcula el nº exones diferentes que hay por cada posicion del gene_array
330
+ exons=[]
331
+ length=length2D(gene_array)
332
+ length.times do |column|
333
+ exon=[]
334
+ gene_array.each_with_index.each do |item,row|
335
+ if !exon.include?(gene_array[row][column]) && gene_array[row][column]!=0
336
+ exon << gene_array[row][column]
337
+ end
338
+ end
339
+ exons << exon
340
+ end
341
+ exons_stadistic=[]
342
+ exons.each do |ex|
343
+ exons_stadistic << ex.compact.length
344
+ end
345
+ return exons_stadistic
346
+ end
347
+
348
+ def gene_stadistics_report(exons_stadistic,tag) #Muestra estadisticas de intrones o exones
349
+ print "\n#{tag}\t"
350
+ exons_stadistic.each do |item|
351
+ print "#{item}\t"
352
+ end
353
+ print "\n"
354
+ end
355
+
356
+ def gene_array_report(gene_array,contigs,act_array) #Muestra el array de la funncion build_gene_array y una representacion de las secuencias
357
+ if act_array
358
+ puts "\nGENE ARRAY"
359
+ gene_array.each_with_index do |fila,c|
360
+ print "#{contigs[c].name.center(24)}\t "
361
+ print "#{contigs[c].completed}\t"
362
+ fila.each do |item|
363
+ print "#{item}\t"
364
+ end
365
+ puts "\n"
366
+ end
367
+ end
368
+
369
+ puts "\nMAP"
370
+ contigs.each do |contig|
371
+ print "#{contig.name.center(25)}"
372
+ print contig.draw
373
+ end
374
+ end
375
+
376
# Removes redundant contigs: every row of gene_array is compared with every
# other row; the losing row and the contig at the same index are set to nil,
# and both arrays are compacted at the end. Both arguments are modified in
# place.
# gene_array - rows as built by build_gene_array (0 marks an empty position).
# contigs    - contigs matching the rows by index.
+ def gene_compact(gene_array, contigs) # Builds the gene model by removing as many redundant sequences as possible
377
+ gene_array.each_with_index do |contig,c1|
378
+ if !contig
379
+ next
380
+ end
381
+ c1_len=contig.length
382
+ n_exons=contig.count{|x| x>0}
383
+ gene_array.each_with_index do |contig2,c2|
384
+ if !contig2 ||c1==c2 # Skip rows already set to nil and the self-comparison
385
+ next
386
+ end
387
+ c2_len=contig2.length
388
+
389
+ # EQUAL row length
390
+ if c1_len==c2_len
391
+ if contig2.count{|x| x>0}==n_exons # Same number of positive entries: the first-HSP score decides
392
+ if contigs[c1].first_hit.first_hsp.score>=contigs[c2].first_hit.first_hsp.score
393
+ gene_array[c2]=nil
394
+ contigs[c2]=nil
395
+ else
396
+ gene_array[c1]=nil
397
+ contigs[c1]=nil
398
+ break # Row c1 was discarded: stop comparing it
399
+ end
400
+ elsif contig2.count{|x| x>0}>n_exons # The other row has more positive entries: discard c1
401
+ gene_array[c1]=nil
402
+ contigs[c1]=nil
403
+ break
404
+ else
405
+ gene_array[c2]=nil
406
+ contigs[c2]=nil
407
+ end
408
+
409
+ # GREATER THAN: row c1 is longer than row c2
410
+ elsif c1_len>c2_len
411
+ if contig.count(0)<=contig2.count(0) # c2 is discarded when c1 has no more 0 entries than c2
412
+ gene_array[c2]=nil
413
+ contigs[c2]=nil
414
+ end
415
+
416
+ # LESS THAN: row c1 is shorter than row c2
417
+ elsif c1_len<c2_len
418
+ if contig.count(0)==contig2.count(0) # c1 is discarded when both rows have the same number of 0 entries
419
+ gene_array[c1]=nil
420
+ contigs[c1]=nil
421
+ break
422
+ end
423
+ end
424
+ end # end contig2
425
+ end # end contig
426
+ gene_array.compact!
427
+ contigs.compact!
428
+ end
429
+
430
+ def contig_compact(contigs) # Toma un conjunto de contigs, busca los q son correlativos, los fusiona, pasa por el exonerate y devuelve un array con los nuevos contig
431
+ cn_def=[]
432
+ cn_backup=contigs.dup
433
+ #Determinar contigs a fusionar
434
+ cn_to_merge=[]
435
+ s_end=nil
436
+ last_position_ref=nil
437
+ position_overlap=nil
438
+ last_contig=nil
439
+ fusion=[]
440
+ contigs.length.times do
441
+ fusion << FALSE
442
+ end
443
+ #Marcaje de contigs correlativos no solapantes
444
+ contigs.each_with_index do |contig,i|
445
+ if i>0
446
+ diference=contig.first_hit.first_hsp.s_beg-s_end
447
+ if diference==0 || diference==1
448
+ fusion[i]=TRUE
449
+ end
450
+ end
451
+ s_end=contig.first_hit.last_hsp.s_end
452
+ end
453
+
454
+ if fusion.include?(TRUE)
455
+
456
+ #Construccion array contigs a fusionar y guardado de los solapantes
457
+ fusion_contigs=[]
458
+ count=0 # Marca la posicion de las fusiones
459
+ fusion.each_with_index do |cont,i|
460
+ if cont
461
+ if !fusion_contigs.include?(contigs[i-1])
462
+ fusion_contigs << contigs[i-1]
463
+ end
464
+ if !fusion_contigs.include?(contigs[i])
465
+ fusion_contigs << contigs[i]
466
+ end
467
+ else
468
+ if !fusion_contigs.empty?#Marcar fusiones
469
+ cn_to_merge << fusion_contigs
470
+ fusion_contigs=[]
471
+ cn_def << count
472
+ count+=1
473
+ end
474
+ if !fusion[i+1]||fusion[i+1].nil?#Guardar contigs que no participan en las fusiones
475
+ cn_def << contigs[i]
476
+ end
477
+ end
478
+ if i+1==fusion.length && !fusion_contigs.empty? #Control fin de bucle
479
+ cn_to_merge << fusion_contigs
480
+ cn_def << count
481
+ count+=1
482
+ end
483
+ end
484
+
485
+ #Generar fasta de los contig fusionados
486
+ contigs_merge=contigs_seq_merge(cn_to_merge)
487
+ if !contigs_merge.empty?
488
+ temp=File.open(File.join(@path[:local],contigs.first.first_hit.name+'.fasta'),'w')
489
+ contigs_merge.each_with_index do |seq,i|
490
+ temp.puts ">Fusion_#{i}\n#{seq}"
491
+ end
492
+ temp.close
493
+
494
+ temp_db=File.open(File.join(@path[:local],contigs.first.first_hit.name+'.db'),'w')
495
+ temp_db.puts ">#{contigs.first.first_hit.name}\n#{@db_seqs[contigs.first.first_hit.name]}"
496
+ temp_db.close
497
+
498
+ end
499
+
500
+ #Exonerating
501
+ cmd="exonerate -q #{File.join(@path[:local],contigs.first.first_hit.name+'.db')} -t #{File.join(@path[:local],contigs.first.first_hit.name+'.fasta')} -Q protein -T dna -m protein2genome --percent 1 --showalignment 0 --useaatla 1 --showvulgar > #{File.join(@path[:local],contigs.first.first_hit.name+'.ex')}" #LINUX command line
502
+ system(cmd)
503
+
504
+ #Parsing exonerate
505
+ local = ParserExonerate.new('contig','nucleotide_match', File.join(@path[:local],"#{contigs.first.first_hit.name}.ex"))
506
+ store_local_ex = local.dataset
507
+ #store_local_ex.each_contig {|ite| puts ite.name+' '+ite.first_hit.name; ite.indices}
508
+ store_local_ex.score_correction(30)
509
+ #puts "#{store_local_ex.contig_count}\t#{contigs_merge.length}"
510
+ if store_local_ex.contig_count==contigs_merge.length
511
+ #Recuperar atributos en contigs y cargar array con contigs def
512
+ store_local_ex.each_contig_with_index{|contig,i|
513
+ contig.seq=contigs_merge[i]
514
+ contig.length=contigs_merge[i].length
515
+ contig.first_hit.s_length=contigs.first.first_hit.s_length
516
+ cn_def.each_with_index do |contig_def,j| # Busqueda de la posicion de la fusion y asignacion en el array de contigs definitivos
517
+ if contig_def==i
518
+ cn_def[j]=contig
519
+ end
520
+ end
521
+ }
522
+ else
523
+ cn_def=cn_backup
524
+ end
525
+ else
526
+ cn_def=contigs
527
+ end
528
+
529
+ return cn_def
530
+
531
+ end#def
532
+
533
# Builds one merged sequence per group of contigs.
#
# contigs - array of arrays; every inner array lists the contigs whose
#           sequences (read through #seq) must be fused, in order.
#
# Consecutive sequences are separated by a run of ten 'n' characters. While
# the accumulated sequence is still empty the next sequence simply replaces
# it, so no separator is emitted in that case.
#
# Returns an array of strings, one per inner array (an empty inner array
# yields an empty string).
def contigs_seq_merge(contigs)
  gap = 'n' * 10
  contigs.map do |group|
    group.inject('') do |merged, member|
      merged.empty? ? member.seq : merged + gap + member.seq
    end
  end
end
549
+
550
# Builds a gene model by cutting and splicing the exons (HSPs of the first
# hit) of an ordered list of contigs into a single coordinate system.
#
# contigs   - ordered contigs; each must answer #name, #seq, #length,
#             #exon_acumulative, #compare and #first_hit.
# reference - optional contig. When given, every contig is positioned against
#             it and modelling is aborted as soon as a contig does not align
#             with the reference (position -1) or runs past its last exon.
#
# Returns [model, model_length, final_seq]:
#   model        - the value returned by void_contig (a contig, or an array of
#                  contigs when discontinuities were marked), nil when aborted.
#   model_length - only computed when model is an array, otherwise nil.
#   final_seq    - the spliced sequence pieces joined together, nil when aborted.
#
# Side effect: prints "<contig name> OUT OF RANGE" when modelling is aborted.
def gene_model_cut(contigs, reference=nil)
  q_beg=[]
  q_end=[]
  s_beg=[]
  s_end=[]
  seq=[]
  last_contig=nil
  last_score=0
  length_model=0
  multiple_lengths=[]
  add_length=true # TRUE/FALSE constants no longer exist in Ruby 3.x
  add_last=0
  last_position_ref=nil
  last_position=nil
  out_of_range=false

  # Swaps the case of the first two characters of a piece to flag a contig
  # junction. Non-destructive swapcase is used on purpose: swapcase! returns
  # nil when nothing changes, which lost characters or raised on '+'.
  mark_start=lambda { |fragment| "#{fragment[0..1].swapcase}#{fragment[2..-1]}" }
  # Removes the last +count+ exons from every parallel list of the model.
  trim_tail=lambda { |count| [q_beg,q_end,s_beg,s_end,seq].each { |list| list.pop(count) } }

  contigs.each do |contig|
    score = contig.first_hit.first_hsp.score/contig.length*contig.exon_acumulative
    n_exones = contig.first_hit.hsp_count

    # FIRST CONTIG
    #-------------------------------------------------------
    if last_contig.nil?
      q_end_seq=nil #SEQ
      contig.first_hit.each_hsp_with_index{|hsp,i|
        q_beg << hsp.q_beg
        q_end << hsp.q_end
        s_beg << hsp.s_beg
        s_end << hsp.s_end
        #SEQ: slice the contig sequence exon by exon
        if i==0
          seq << contig.seq[0..contig.first_hit.first_hsp.q_end-1]
        elsif i+1==n_exones
          seq << contig.seq[contig.first_hit.hsps[i-1].q_end..contig.length-1]
        else
          seq << contig.seq[q_end_seq..hsp.q_end-1]
        end
        q_end_seq=hsp.q_end
      }
      length_model+=contig.length
      if !reference.nil? # Position the first contig on the reference
        last_position_ref,_=contig.compare(reference)
        # Abort when the contig does not align with the reference or overruns it.
        # The original tested last_position (still nil here) instead of
        # last_position_ref, so the "no alignment" case was never detected.
        if last_position_ref==-1 || last_position_ref+contig.first_hit.hsp_count-1 > reference.first_hit.hsp_count
          puts contig.name+' OUT OF RANGE'
          out_of_range=true
          break
        end
      end

    # OTHER CONTIG
    #--------------------------------------------------------
    else
      position_overlap,_=contig.compare(last_contig)

      # Correct the contig position using the reference
      if !reference.nil?
        last_position_ref,position_overlap=position_reference_guided(contig,last_contig,last_position_ref,reference)
        if last_position_ref==-1 || last_position_ref+contig.first_hit.hsp_count-1 > reference.first_hit.hsp_count
          out_of_range=true
          puts contig.name+' OUT OF RANGE'
          break
        end
      end

      # NOT OVERLAP
      #..........................
      if position_overlap==-1 || contig.first_hit.hsp_count==1
        if contig.first_hit.first_hsp.s_beg-last_contig.first_hit.last_hsp.s_end>1
          # The contig does not directly follow the previous one: mark a
          # discontinuity with an all-zero exon and start a new length count
          q_beg << 0
          q_end << 0
          s_beg << 0
          s_end << 0
          multiple_lengths << length_model
          length_model=contig.length
          last=length_model
          add_length=false
          seq.last << 'n'*10 #SEQ gap marker
        else
          last=length_model # Keep the previous length to shift this contig's coordinates
          length_model+=contig.length
        end

        add_no=add_length ? last : 0 # Offset added to the query coordinates
        q_end_seq=nil #SEQ
        contig.first_hit.hsps.each_with_index do |hsp,i|
          q_beg << hsp.q_beg+add_no
          q_end << hsp.q_end+add_no
          s_beg << hsp.s_beg
          s_end << hsp.s_end
          #SEQ
          if i==0
            seq << mark_start.call(contig.seq[0..contig.first_hit.first_hsp.q_end-1])
          elsif i+1==n_exones
            seq << contig.seq[contig.first_hit.hsps[i-1].q_end..contig.length-1]
          else
            seq << contig.seq[q_end_seq..hsp.q_end-1]
          end
          q_end_seq=hsp.q_end
        end

      # OVERLAP
      #..........................
      else
        if last_position==-1
          add_last=length_model-last_contig.length
        end
        overlap=last_contig.first_hit.hsp_count-position_overlap
        if last_contig.first_hit.hsp_count==1
          overlap=1
        end
        add=0
        dif=0
        if last_score>=score
          add=last_contig.first_hit.last_hsp.q_end-contig.first_hit.first_hsp.q_end
          dif=add
          if overlap>1
            # Drop the last exon of the previous contig and replace it with
            # the second exon of this contig (internal, hence more reliable).
            # As the last exon is dropped, alignment uses the one before it.
            add=last_contig.first_hit.hsp_at(last_contig.first_hit.hsp_count-overlap).q_end-contig.first_hit.first_hsp.q_end
            dif=contig.first_hit.hsp_at(overlap-1).q_end
            trim_tail.call(1)
          end
          if overlap==1
            overlap=2
          end
          # Append the remaining exons of this contig to the model
          (contig.first_hit.hsp_count-(overlap-1)).times do |n|
            q_beg << contig.first_hit.hsp_at(n+overlap-1).q_beg+add+add_last
            q_end << contig.first_hit.hsp_at(n+overlap-1).q_end+add+add_last
            s_beg << contig.first_hit.hsp_at(n+overlap-1).s_beg
            s_end << contig.first_hit.hsp_at(n+overlap-1).s_end
            #SEQ
            position_hsp=n+overlap-2
            if position_hsp<0
              position_hsp=0
            end
            position_next_hsp=n+overlap-1
            if position_next_hsp<0
              position_next_hsp=0
            end

            if n==0
              seq << mark_start.call(contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.first_hit.hsp_at(position_next_hsp).q_end-1])
            elsif position_next_hsp==contig.first_hit.hsp_count-1
              seq << contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.length-1]
            else
              seq << contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.first_hit.hsp_at(position_next_hsp).q_end-1]
            end
          end
        else
          hsp_position=last_contig.first_hit.hsp_count-2
          if hsp_position<0 # Previous contig has a single hsp
            hsp_position=0
          end
          add=last_contig.first_hit.hsp_at(hsp_position).q_end
          dif=last_contig.length-add
          drop=1
          correction=0
          if overlap>1
            drop=overlap-1
            correction=1
            add=last_contig.first_hit.hsp_at(position_overlap).q_end-contig.first_hit.first_hsp.q_end
            dif=length_model-(add+add_last)
          end
          # Remove the unreliable exons of the previous contig (the first
          # exon of the overlap is kept)
          trim_tail.call(drop)

          # Append the exons of this contig, skipping +correction+ leading ones
          (contig.first_hit.hsp_count-correction).times do |n|
            q_beg << contig.first_hit.hsp_at(n+correction).q_beg+add+add_last
            q_end << contig.first_hit.hsp_at(n+correction).q_end+add+add_last
            s_beg << contig.first_hit.hsp_at(n+correction).s_beg
            s_end << contig.first_hit.hsp_at(n+correction).s_end
            #SEQ
            if n+1==(contig.first_hit.hsp_count-correction)
              n_correction=n+correction-1
              if n_correction<0
                n_correction=0
              end
              seq << contig.seq[contig.first_hit.hsp_at(n_correction).q_end..contig.length-1]
            elsif n==0
              if n+correction==0
                cn=contig.seq[0..contig.first_hit.hsp_at(n+correction).q_end-1] # Starts at the first exon of the contig
              else
                cn=contig.seq[contig.first_hit.hsp_at(n+correction-1).q_end..contig.first_hit.hsp_at(n+correction).q_end-1]
              end
              seq << mark_start.call(cn)
            else
              seq << contig.seq[contig.first_hit.hsp_at(n+correction-1).q_end..contig.first_hit.hsp_at(n+correction).q_end-1]
            end
          end
        end
        length_model+=(contig.length-dif)
        add_length=true
        add_last+=add
      end
    end
    last_position=position_overlap
    last_contig=contig
    last_score=score
  end
  if !multiple_lengths.empty?
    # Discontinuities were found: hand one length per fragment to void_contig
    multiple_lengths << length_model
    length_model=multiple_lengths
  end

  model=nil
  if !out_of_range # Build the model only if every contig aligned with the reference
    model=void_contig(contigs.first.first_hit.name+'_model',length_model,contigs.first.first_hit.s_length,q_beg,q_end,s_beg,s_end,'contig','gene','exon')
    # Merge contigs under one sequence reference
    model_length=nil
    if model.is_a?(Array)
      add=0
      model.each_with_index do |contig,i|
        contig.modified_coordenates(add)
        add+=contig.length
        if i<model.length-1
          add+=10
        end
      end
      model_length=add+model.last.length
    end

    final_seq=seq.join
  else # No model when at least one contig failed to align with the reference
    model_length=nil
    final_seq=nil
  end

  return model, model_length, final_seq
end
802
+
803
# Builds contig object(s) from parallel coordinate lists.
#
# contig_name   - base name; also used as the hit name.
# contig_length - an Integer, or an Array with one length per fragment (in
#                 that case fragments are named "<contig_name>_<n>").
# s_length      - subject length handed to Contig#add_hit.
# q_beg, q_end, s_beg, s_end - parallel lists, one entry per hsp. A 0 in
#                 q_beg marks a discontinuity between fragments.
# contig_type, hit_type, hsp_type - values assigned to the #type of the
#                 created contig, hit and hsps.
# single        - when true every entry (zeros included) becomes an hsp of
#                 one single contig.
#
# Returns a single contig when +single+ is set or q_beg holds no 0; otherwise
# an array with one contig per fragment. An empty q_beg returns [].
def void_contig(contig_name,contig_length,s_length,q_beg,q_end,s_beg,s_end,contig_type,hit_type,hsp_type,single=false)
  contigs=[]
  contig=nil
  n=0
  fragmented=q_beg.include?(0) # Loop invariant: q_beg is not modified below
  last_index=q_beg.length-1
  q_beg.each_with_index do |item,ind|
    if item>0 || single
      if contig.nil?
        if contig_length.is_a?(Array)
          length=contig_length[n]
          name="#{contig_name}_#{n}"
        else
          length=contig_length
          name=contig_name
        end
        contig=Contig.new(name)
        contig.length=length
        contig.type=contig_type
        hit_v=contig.add_hit(contig_name,s_length,1,:prot)
        hit_v.type=hit_type
      end
      hsp_v=contig.first_hit.add_hsp(q_beg[ind], q_end[ind], s_beg[ind], s_end[ind], 0, 0, 0, 0)
      hsp_v.type=hsp_type
    end
    # Close the current contig on a discontinuity mark or on the last entry
    if (item==0 && !contig.nil? && !single) || ind==last_index
      if single || !fragmented
        contigs=contig
      else
        contigs << contig
      end
      n+=1
      contig=nil
    end
  end
  return contigs
end
839
+
840
# Locates +contig+ on +reference+ and derives its overlap with the previous
# contig from the two reference positions.
#
# contig            - contig being placed; its #compare result supplies the
#                     reference position (first returned value).
# last_contig       - previously placed contig (only its hsp count is read).
# last_position_ref - reference position of last_contig, or nil.
# reference         - contig used as reference.
#
# Returns [position_ref, position_overlap]. position_overlap is nil when
# last_position_ref is nil and -1 when there is no overlap.
def position_reference_guided(contig,last_contig,last_position_ref,reference)
  position_ref, _ = contig.compare(reference)
  return position_ref, nil if last_position_ref.nil?

  # Reference position of the last exon of the previous contig
  previous_last_exon = last_position_ref + (last_contig.first_hit.hsp_count - 1)
  position_overlap = position_ref > previous_last_exon ? -1 : (last_position_ref - position_ref).abs
  [position_ref, position_overlap]
end
851
+
852
# Collapses a list of contigs into one new contig.
#
# The new contig takes the name of the first element, receives the hits of
# every element through #transfer_contig_hits (in list order) and takes the
# length of the last element.
#
# Returns the new Contig.
def array_contigs_to_contig(array_contigs)
  Contig.new(array_contigs.first.name).tap do |merged|
    array_contigs.each { |part| merged.transfer_contig_hits(part) }
    merged.length = array_contigs.last.length
  end
end
860
+
861
# Reports a failure raised while processing a gene.
#
# e          - Ruby exception object (its message and backtrace are logged).
# gene_name  - name of the gene being processed.
# file_error - writable IO that receives the report.
# cluster    - contigs involved; their names are listed in the report.
# model      - accepted for interface compatibility, not used here.
#
# Prints "<gene_name> ERROR" on standard output and appends to file_error a
# section with the gene name, the exception message, its backtrace and the
# names of the contigs of the cluster.
def gene_error(e, gene_name, file_error, cluster, model) #e is a ruby exception object
  puts gene_name+' ERROR'
  file_error.puts "\n#{gene_name}\n.............................."
  file_error.puts e.message
  e.backtrace.each { |frame| file_error.puts frame }
  file_error.puts ',,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,'
  cluster.each { |member| file_error.puts member.name }
  file_error.puts '----------------------------------------------------------------------------------------'
end
876
+
877
# Computes the percentage of the subject covered by overlapping exons.
#
# model  - object answering #overlap (list of overlap lengths; the arithmetic
#          below assumes overlaps are reported as negative numbers - TODO
#          confirm against the contig class) and #first_hit.s_length.
# output - when true a warning line with every overlap (multiplied by -3) and
#          the total percentage is printed on standard output.
#
# Returns 0 when there is no overlap, otherwise the summed overlaps multiplied
# by -100.0, divided by the subject length and rounded to two decimals.
def overlap_test(model,output=true) # 'true' literal: the TRUE constant no longer exists in Ruby 3.x
  perc_overlap=0
  overlap=model.overlap
  total=0
  if !overlap.empty?
    print 'WARNING: overlap/s ' if output
    overlap.each do |length_overlap|
      print((length_overlap*-3).to_s+', ') if output
      total+=length_overlap
    end
    perc_overlap=(total*-100.0/model.first_hit.s_length).round(2)
    puts 'nt. % Total overlap '+perc_overlap.to_s if output
  end
  return perc_overlap
end
898
+
899
# Computes which percentage of the subject is covered by the exons of a model.
#
# model  - object answering #exones_s (list of numeric exon sizes on the
#          subject), #first_hit.s_length and #first_hit.name.
# output - when true prints "Recovered\t<hit name>\t<percentage>".
#
# Returns the summed exon sizes multiplied by 100.0, divided by the subject
# length and rounded to two decimals.
def recover_test(model,output=true) # 'true' literal: the TRUE constant no longer exists in Ruby 3.x
  recovered=0
  model.exones_s.each do |exon|
    recovered+=exon
  end
  recovered=(recovered*100.0/model.first_hit.s_length).round(2)
  if output
    puts "Recovered\t"+model.first_hit.name+"\t#{recovered}"
  end
  return recovered
end
910
+
911
# Opens the gene index page and writes its header when web output is enabled.
#
# web  - truthy to produce the page; otherwise nothing is opened or written.
# path - path of the file to create (opened in 'w' mode).
#
# Returns the open file handle (left open for the caller), or nil when web
# output is disabled.
def web_header(web, path)
  return nil unless web

  index_page = File.open(path, 'w')
  html_header(index_page, 'Gene index')
  columns = ['Gene model name', 'Protein length', 'Num exon', '% recovered protein', '% overlapping sequence', 'Fragmentation']
  html_table_header(index_page, 1, columns)
  index_page
end
920
+
921
# Finishes the gene index page: writes the table footer and the page footer,
# then closes the file.
#
# file_web - handle to the index page, or nil when no page is being produced
#            (in which case nothing happens).
def web_body(file_web)
  return if file_web.nil?

  html_table_footer(file_web)
  html_footer(file_web)
  file_web.close
end
928
+
929
# Writes every model sequence to a FASTA file.
#
# sequences_hash - pairs of name => sequence; each one is written as
#                  ">NAME_model" followed by the sequence on the next line.
# path           - file to create (overwritten when it exists).
#
# The block form of File.open is used so the file is closed even when a
# write raises. Returns nil.
def write_model_fasta(sequences_hash, path)
  File.open(path,'w') do |model_file|
    sequences_hash.each do |name,sequence|
      model_file.puts '>'+name+"_model\n"+sequence
    end
  end
  nil
end
936
+
937
# Writes two GFF reports to the same path: first the model dataset, then the
# contig dataset.
#
# gff_dataset_model - dataset holding the gene model.
# gff_dataset       - dataset holding the contigs.
# path              - handed to both ReportGff instances.
# name              - extra argument given to the second report's #create.
#
# Both reports are built with the 's' option and created with 'a'.
# Returns whatever the second #create call returns.
def write_gbrowse_gff(gff_dataset_model, gff_dataset, path, name)
  ReportGff.new(gff_dataset_model, path, 's').create('a')
  ReportGff.new(gff_dataset, path, 's').create('a', name)
end
943
+
944
# Renames the hits of a model so they read as genes.
#
# With a single hit (n_hits? not greater than 1) the first hit name gets the
# suffix "_gene"; with several hits each one gets "_gene_<index>".
def format_model(model)
  if model.n_hits? <= 1
    gene_hit = model.first_hit
    gene_hit.name += '_gene'
  else
    model.each_hit_with_index do |hit, index|
      hit.name += "_gene_#{index}"
    end
  end
end
953
+
954
# Normalizes a freshly built model and aligns the dataset contigs to it.
#
# model          - a contig, or an array of contigs (fragmented model). An
#                  array is collapsed with array_contigs_to_contig, loses the
#                  trailing "_0" fragment index of its name and takes
#                  +length_model+ as length.
# length_model   - total length used for a fragmented model.
# gff_dataset    - dataset answering #correct_left_side_contigs(model), whose
#                  result is used as the number of leading positions to add,
#                  and #align_contigs(model).
# sequences_hash - model sequences keyed by the model name without its
#                  "_model" suffix; the matching entry is prefixed with 'n'
#                  so it stays in step with the shifted coordinates.
#
# Only the trailing "_0" / "_model" are stripped. The original removed every
# occurrence, which corrupted names containing those fragments elsewhere
# (e.g. "g_01_model_0") and made the sequences_hash lookup miss.
#
# Returns the corrected model (a single contig).
def correct_model(model, length_model, gff_dataset, sequences_hash)
  if model.is_a?(Array)
    model=array_contigs_to_contig(model)
    model.name=model.name.sub(/_0\z/,'')
    model.length=length_model
  end

  correct_add_Ns=gff_dataset.correct_left_side_contigs(model)
  model.modified_coordenates(correct_add_Ns)
  model.length+=correct_add_Ns
  gff_dataset.align_contigs(model)

  # Pad the sequence so it lines up with the generated features
  if correct_add_Ns>0
    seq_key=model.name.sub(/_model\z/,'')
    sequences_hash[seq_key]='n'*correct_add_Ns+sequences_hash[seq_key]
  end

  return model
end
974
+
975
+ end