gene_assembler 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +22 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +2 -0
- data/bin/GeneAssembler +233 -0
- data/bin/phytozome_scan +60 -0
- data/gene_assembler.gemspec +25 -0
- data/lib/gene_assembler.rb +5 -0
- data/lib/gene_assembler/blast_type_parser.rb +41 -0
- data/lib/gene_assembler/contig.rb +643 -0
- data/lib/gene_assembler/dataset.rb +532 -0
- data/lib/gene_assembler/exonerate_result.rb +230 -0
- data/lib/gene_assembler/gff_contig.rb +67 -0
- data/lib/gene_assembler/gff_dataset.rb +152 -0
- data/lib/gene_assembler/gff_feature.rb +175 -0
- data/lib/gene_assembler/gff_frameshift.rb +6 -0
- data/lib/gene_assembler/gff_go.rb +13 -0
- data/lib/gene_assembler/gff_hit.rb +53 -0
- data/lib/gene_assembler/gff_hsp.rb +6 -0
- data/lib/gene_assembler/gff_localization.rb +6 -0
- data/lib/gene_assembler/gff_master_feature.rb +5 -0
- data/lib/gene_assembler/gff_parser.rb +35 -0
- data/lib/gene_assembler/gff_snp.rb +21 -0
- data/lib/gene_assembler/gff_stop.rb +6 -0
- data/lib/gene_assembler/go.rb +13 -0
- data/lib/gene_assembler/hit.rb +191 -0
- data/lib/gene_assembler/hsp.rb +100 -0
- data/lib/gene_assembler/other_functions.rb +228 -0
- data/lib/gene_assembler/parser.rb +25 -0
- data/lib/gene_assembler/parser_blast.rb +12 -0
- data/lib/gene_assembler/parser_exonerate.rb +16 -0
- data/lib/gene_assembler/rebuild.rb +975 -0
- data/lib/gene_assembler/report.rb +13 -0
- data/lib/gene_assembler/report_gff.rb +30 -0
- data/lib/gene_assembler/snp.rb +13 -0
- data/lib/gene_assembler/version.rb +3 -0
- metadata +149 -0
@@ -0,0 +1,100 @@
|
|
1
|
+
# High-scoring segment pair: one aligned stretch between a query and a
# subject sequence, stored as begin/end coordinates on both sequences plus
# the alignment statistics handed to the constructor.
class Hsp
  attr_accessor :q_beg, :q_end, :s_beg, :s_end, :align_len, :score, :ident, :gaps, :type

  # q_beg/q_end:: start and end on the query
  # s_beg/s_end:: start and end on the subject
  # align_len::   length of the aligned sequence
  # score, ident, gaps:: stored as given
  def initialize(q_beg, q_end, s_beg, s_end, align_len, score, ident, gaps)
    @q_beg = q_beg
    @q_end = q_end
    @s_beg = s_beg
    @s_end = s_end
    @align_len = align_len
    @score = score
    @ident = ident
    @gaps = gaps
    @type = nil
  end

  # Compares this HSP with +hsp+ on the subject coordinates.
  # Returns 1 when both spans are identical or this span lies inside
  # +hsp+, the overlapped fraction of this span (Float) on a partial
  # overlap, and 0 otherwise.
  def compare(hsp)
    interval_coverage(s_beg, s_end, hsp.s_beg, hsp.s_end)
  end

  # Same comparison as #compare, but on the query coordinates.
  def compare_q(hsp)
    interval_coverage(q_beg, q_end, hsp.q_beg, hsp.q_end)
  end

  # Span of the HSP on the query (end minus begin).
  def length_q
    @q_end - @q_beg
  end

  # Converts reverse query coordinates to direct ones by subtracting both
  # from +length_hsp+.
  def rev(length_hsp)
    @q_beg = length_hsp - @q_beg
    @q_end = length_hsp - @q_end
    # Literal `false`: the FALSE constant no longer exists in Ruby >= 3.2.
    @reversed = false
  end

  # Returns 1 when +hsp+ is contained in this HSP (on the query or on the
  # subject), or when the two subject spans together cannot fit in a
  # sequence of length +long+; returns 0 otherwise.
  def within?(hsp, long)
    return 1 if q_beg <= hsp.q_beg && q_end >= hsp.q_end
    return 1 if s_beg <= hsp.s_beg && s_end >= hsp.s_end

    self_coverage = (s_end - s_beg) * 1.00 / long
    hsp_coverage = (hsp.s_end - hsp.s_beg) * 1.00 / long
    # If hsp covers more than what self leaves free, they must overlap.
    hsp_coverage > (1 - self_coverage) ? 1 : 0
  end

  # Shifts both query coordinates by +add+.
  def modified_coordenates(add)
    @q_beg += add
    @q_end += add
  end

  # Mirrors the query coordinates on a contig of +contig_length+
  # (1-based), printing the coordinates before and after the change.
  def rev_coord(contig_length)
    puts '---------------------------------'
    puts @q_beg.to_s + ' ' + @q_end.to_s
    @q_beg = contig_length - @q_beg + 1
    @q_end = contig_length - @q_end + 1
    puts @q_beg.to_s + ' ' + @q_end.to_s
  end

  # Returns the (negative) distance between this HSP's subject start and
  # the subject end of +last_hsp+ when they overlap, 0 otherwise.
  def overlap_with(last_hsp)
    diference = s_beg - last_hsp.s_end
    diference < 0 ? diference : 0
  end

  private

  # Shared implementation of #compare and #compare_q for one pair of
  # intervals (own_beg..own_end against other_beg..other_end).
  def interval_coverage(own_beg, own_end, other_beg, other_end)
    if own_beg == other_beg && own_end == other_end
      # Identical spans. The original compared own_beg with other_end here,
      # so identical spans of length 1 fell through to 0.
      1
    elsif own_beg >= other_beg && own_end < other_end
      # self lies inside the other HSP
      1
    elsif own_beg <= other_beg && own_end > other_beg && (own_end - other_beg).abs > 1
      # Multiplying by 1.00 forces Float arithmetic so the ratio is not truncated.
      (own_end - other_beg * 1.00) / (own_end - own_beg)
    elsif own_beg < other_end && own_end >= other_end && (own_beg - other_end).abs > 1
      # The last condition avoids treating a single shared amino acid, really
      # split between two exons, as the same exon.
      (other_end - own_beg * 1.00) / (own_end - own_beg)
    else
      0
    end
  end
end
|
@@ -0,0 +1,228 @@
|
|
1
|
+
require 'scbi_fasta'
|
2
|
+
|
3
|
+
# Relates each contig with its SAM file (<map_path>/<contig.name>.sam) and
# scores every exon of the contig's first hit by read coverage.
#
# contigs::    objects responding to name, length, hits and indices
# gene_array:: one row per contig; the scores are written into the row
#              starting at the position of its first 1
# map_path::   directory holding the SAM files
#
# Contigs without a SAM file are left untouched. gene_array is modified in
# place. Prints the valuated array when $verbose is set.
def mapping(contigs, gene_array, map_path)
  contigs.each do |contig|
    ruta = File.join(map_path, "#{contig.name}.sam")
    # File.exist? instead of File.exists?, which was removed in Ruby 3.2.
    next unless File.exist?(ruta)

    # Parse mapping: per-position read depth along the contig
    seq_map = Array.new(contig.length, 0)
    n_reads = 0
    # File.foreach closes the file when done (the handle used to leak).
    File.foreach(ruta) do |line|
      fields = line.split
      next if fields.empty?          # blank lines used to crash on nil.split
      next if fields[0] =~ /[@]/     # header lines

      n_reads += 1
      start_map = fields[3].to_i - 1
      end_map = start_map - 1
      fields[5].split(/[^\d]/).each { |e| end_map += e.to_i }
      # NOTE(review): the legacy loop also counted the position right after
      # end_map; that window is kept unchanged here -- confirm whether the
      # extra position is intended.
      first = [start_map, 0].max
      last = [[end_map + 1, 0].max, seq_map.length - 1].min
      (first..last).each { |pos| seq_map[pos] += 1 }
    end

    # Exon valuation: mean depth per exon as a percentage of the reads
    exon_stadistic = contig.hits.first.hsps.map do |hsp|
      exon = seq_map[hsp.q_beg - 1..hsp.q_end - 1]
      value = exon.inject(0) { |acc, depth| acc + depth }
      (value * 100.0 / n_reads / exon.length).round(2)
    end

    y = contigs.index(contig)
    x = gene_array[y].index(1)
    exon_stadistic.each_with_index do |item, b|
      gene_array[y][x + b] = item
    end
  end

  if $verbose
    puts "\nGENE ARRAY - EXON VALUATED"
    gene_array.each_with_index do |fila, c|
      print "#{contigs[c].name.center(24)} "
      fila.each do |item|
        print "#{item.to_s}\t"
      end
      puts "\n"
    end
  end

  contigs.each do |contig|
    puts '...................'
    contig.indices
  end
  puts "\n"
end
|
76
|
+
|
77
|
+
# Returns the largest length found among the elements of +array+
# (0 when +array+ is empty).
def length2D(array)
  array.map { |item| item.length }.max || 0
end
|
87
|
+
|
88
|
+
# Checks start/stop codons on every contig and records the outcome in
# contig.completed:
#   true    -> both start and stop codon found
#   'start' -> only the start codon
#   'stop'  -> only the stop codon
#   false   -> neither
# The start codon is only searched when the contig's row in +gene_array+
# begins with a value greater than 0 (i.e. it holds the first exon); the
# stop codon is always searched.
def parse_contig_index(gene_array, contigs)
  gene_array.each_with_index do |row, i|
    contig = contigs[i]
    start = row.first > 0 ? contig.start_codon_search : nil # nil = unknown
    stop = contig.stop_codon_search
    # Literal true/false: the TRUE/FALSE constants were removed in Ruby 3.2.
    contig.completed =
      if start == true && stop == true
        true
      elsif start == true
        'start'
      elsif stop == true
        'stop'
      else
        false
      end
  end
end
|
110
|
+
|
111
|
+
# Picks, from +contigs+, one contig flagged 'start' and one flagged 'stop'.
# When several share a flag, the one whose first HSP of the first hit has
# the highest score wins (the earliest one on ties).
# Returns [start_contig, stop_contig]; either may be nil.
def sides_recovery(contigs)
  best = { 'start' => nil, 'stop' => nil }
  contigs.each do |contig|
    label = contig.completed
    next unless label == 'start' || label == 'stop'

    current = best[label]
    if current.nil? || current.hits.first.hsps.first.score < contig.hits.first.hsps.first.score
      best[label] = contig
    end
  end
  return best['start'], best['stop']
end
|
136
|
+
|
137
|
+
# Adds the +start+ / +stop+ contigs to +contigs+ when the list has no
# contig carrying that signal (a contig whose completed is true counts as
# having both).
#
# Returns the resulting list. When +start+ is prepended a new array is
# returned and the caller's array is left untouched; when only +stop+ is
# appended, the caller's array is modified in place (legacy behavior).
def sides_add(contigs, start, stop)
  # Literal true: the TRUE/FALSE constants were removed in Ruby 3.2.
  has_start = contigs.any? { |c| c.completed == 'start' || c.completed == true }
  has_stop = contigs.any? { |c| c.completed == 'stop' || c.completed == true }

  contigs = [start] + contigs if !has_start && !start.nil?
  contigs << stop if !has_stop && !stop.nil?
  contigs
end
|
160
|
+
|
161
|
+
# Removes from +cluster+ (and the matching rows of +gene_array+) every
# contig flagged 'stop' whose first hit has a single HSP with a subject
# span shorter than +length+. Both arrays are modified in place.
# Returns [gene_array, cluster].
def cluster_filter(gene_array, cluster, length)
  cluster.each_index do |i|
    contig = cluster[i]
    next unless contig.completed == 'stop'

    hsps = contig.hits.first.hsps
    next unless hsps.last.s_end - hsps.last.s_beg < length && hsps.count == 1

    cluster[i] = nil
    gene_array[i] = nil
  end
  cluster.compact!
  gene_array.compact!
  return gene_array, cluster
end
|
174
|
+
|
175
|
+
# Returns the positional offset between two contigs, derived from their
# HSP coordinates: the difference of the query starts plus three times the
# difference of the subject starts.
def coord_prot(last_contig_hsp, current_contig_hsp)
  query_shift = last_contig_hsp.q_beg - current_contig_hsp.q_beg
  subject_shift = current_contig_hsp.s_beg - last_contig_hsp.s_beg
  query_shift + 3 * subject_shift
end
|
179
|
+
|
180
|
+
# Reads the FASTA file at +path+ through FastaFile and returns a hash
# mapping each yielded name to its sequence (a repeated name keeps the
# last sequence).
def fasta_hash(path)
  seqs = {}
  FastaFile.new(path).each { |contig, seq_fasta| seqs[contig] = seq_fasta }
  seqs
end
|
188
|
+
|
189
|
+
# Writes the XHTML preamble, the <head> section with +title+ and the
# opening <body> tag to +file+, one element per line.
def html_header(file, title)
  lines = [
    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
    '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">',
    '<head>',
    '<meta http-equiv="content-type" content="text/html;charset=UTF-8" />',
    '<title>' + title + '</title>',
    '</head>',
    '<body>'
  ]
  file.puts(*lines)
end
|
198
|
+
|
199
|
+
# Writes the closing </body> and </html> tags to +file+.
def html_footer(file)
  file.puts(*['</body>', '</html>'])
end
|
203
|
+
|
204
|
+
# Opens an HTML table with the given +border+ and writes one header row;
# +headers+ is an array with one string per column.
def html_table_header(file, border, headers)
  file.puts "<table border=\"#{border}\">", '<tr>'
  headers.each { |header| file.puts '<th>' + header + '</th>' }
  file.puts '</tr>'
end
|
212
|
+
|
213
|
+
# Writes one table row to +file+; +cells+ must be an array, and each
# element becomes a <td> cell.
def html_row(file, cells)
  file.puts '<tr>'
  cells.each { |cell| file.puts "<td>#{cell}</td>" }
  file.puts '</tr>'
end
|
220
|
+
|
221
|
+
# Returns an HTML anchor showing +text+ and pointing to +link+.
def html_link(text, link)
  '<a href="' + link + "\">#{text}</a>"
end
|
225
|
+
|
226
|
+
# Writes the closing </table> tag to +file+.
def html_table_footer(file)
  file.puts('</table>')
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'dataset'
|
2
|
+
|
3
|
+
# Base parser: builds an empty dataset, then runs the two hooks
# (parse_file, load_dataset) that subclasses override to fill it.
class Parser
  attr_accessor :dataset

  # file:: input handed to parse_file
  # type:: accepted but not used by this class
  def initialize(file, type = nil)
    @file = file
    @dataset = create_dataset
    # parse_file produces the data object; load_dataset fills the dataset
    # with the information it contains.
    load_dataset(parse_file(file))
  end

  # Returns the dataset that will receive the parsed data.
  def create_dataset
    Dataset.new('unknown') # the argument is not used
  end

  # Hook: turn +file+ into a data object. No-op in the base class.
  def parse_file(file)
  end

  # Hook: load +data+ into the dataset. No-op in the base class.
  def load_dataset(data)
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'blast_type_parser'
|
2
|
+
require 'scbi_blast' #Si falla, buscar e instalar tb 'gem install xml-simple' de la q depende
|
3
|
+
|
4
|
+
|
5
|
+
# Parser for tabular BLAST output.
class ParserBlast < BlastTypeParser
  # Wraps +file+ in a BlastTableResult and returns it.
  def parse_file(file)
    BlastTableResult.new(file)
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'exonerate_result'
|
2
|
+
require 'blast_type_parser'
|
3
|
+
|
4
|
+
# Parser for exonerate output.
class ParserExonerate < BlastTypeParser
  # Wraps +file+ in an ExonerateResult (built with @all) and returns it.
  def parse_file(file)
    ExonerateResult.new(file, @all)
  end

  # Copies the frameshifts located by exonerate from +item+ to +contig+.
  def populate_extra_atributes(contig, item)
    contig.q_frameshift = item.q_frameshift
    contig.s_frameshift = item.s_frameshift
  end
end
|
@@ -0,0 +1,975 @@
|
|
1
|
+
require 'dataset'
|
2
|
+
require 'other_functions'
|
3
|
+
require 'report_gff'
|
4
|
+
require 'fileutils'
|
5
|
+
|
6
|
+
class Rebuild
|
7
|
+
# dataset and dataset_uni_hsp must be dataset objects; +path+ is a hash of
# file locations, of which :exonerate_db is read here to load the database
# sequences.
def initialize(dataset, dataset_uni_hsp, path)
  @dataset, @dataset_uni_hsp = dataset, dataset_uni_hsp
  @path = path
  @db_seqs = fasta_hash(@path[:exonerate_db])
end
|
13
|
+
|
14
|
+
###############################################################################################################
|
15
|
+
# MAIN METHOD
|
16
|
+
###############################################################################################################
|
17
|
+
# Main method: for every cluster of @dataset builds the gene model, writes
# its GFF (and an HTML index row when a web report is open), accumulates
# statistics, prints a final summary and writes the model FASTA.
#
# options:: hash; :web is passed to web_header and :rebuild to the
#           modeling step.
#
# A failure while processing one gene is reported through gene_error and
# does not stop the remaining genes.
def rebuild(options)
  gff_dataset_model = Dataset.new(:mix) # holds the model for the output
  gff_dataset = Dataset.new(:mix)       # holds the model's child contigs
  file_error = File.open(@path[:error], 'w')
  file_web = web_header(options[:web], @path[:html])
  sequences_hash = {}
  gene_name = nil
  model = nil
  statistics = { :genes => 0, :total_recovered => 0, :total_overlap => 0, :total_fragmentation => 0 }
  puts "\nMODELING GENE", '*******************************************'
  begin
    @dataset.each_cluster { |cluster|
      begin
        gene_name = cluster.first.first_hit.name unless cluster.nil?
        cluster_complete = cluster.dup
        # Gene reconstruction (alignment, discarding and assembly)
        model, length_model, length_cluster = iterative_modeling_gene_w_reference(cluster, @dataset.references_hash, options[:rebuild], sequences_hash)

        # GeneAssembler output (gff for Gbrowse)
        unless model.nil?
          # Format contigs that are children of the model
          gff_dataset.clr_contigs
          gff_dataset.transfer_contigs(cluster_complete)
          gff_dataset.transfer_n_contigs_def_hit_type(@dataset_uni_hsp, cluster, 'pseudogene', 50) # pseudogenes go to the report

          # Convert arrays to a contig and adjust the alignment adding Ns
          model = correct_model(model, length_model, gff_dataset, sequences_hash)

          # Checks on the model
          exones = model.exones_s.length
          puts 'Exones: ' + exones.to_s
          recovered = recover_test(model)
          overlap = overlap_test(model)
          fragmentation = ((length_cluster - 1.00) / exones).round(2)
          puts 'Fragmentation: ' + fragmentation.to_s

          # HTML index
          unless file_web.nil?
            gene_link = html_link(model.first_hit.name, @path[:gbrowse_link] + model.first_hit.name)
            html_row(file_web, [gene_link, cluster.first.first_hit.s_length, exones, recovered, overlap, fragmentation])
          end

          # Format model for Gbrowse
          gff_dataset_model.clr_contigs
          format_model(model)
          gff_dataset_model.transfer_contigs(model)

          # Write
          write_gbrowse_gff(gff_dataset_model, gff_dataset, @path[:gff], model.name)

          # General statistics
          statistics[:genes] += 1
          statistics[:total_recovered] += recovered
          statistics[:total_overlap] += overlap
          statistics[:total_fragmentation] += fragmentation
        end
      rescue SystemExit, SignalException
        # Let Ctrl-C and exit requests abort the run instead of being logged
        # as a gene error.
        raise
      rescue Exception => e
        # NOTE(review): model/gene_name may still hold values from the
        # previous cluster when the failure happens early -- confirm that
        # gene_error copes with that.
        gene_error(e, gene_name, file_error, cluster_complete, model)
      end
      puts '* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *'
    }
  ensure
    file_error.close
  end
  web_body(file_web)

  # With no recovered gene the means are reported as 0 instead of raising
  # ZeroDivisionError (which also skipped write_model_fasta).
  genes = statistics[:genes]
  mean = lambda { |total| genes.zero? ? 0 : total / genes }
  puts "\nFINAL STATISTICS\n",
       'Recovered genes: ' + genes.to_s,
       'Mean recover: ' + mean.call(statistics[:total_recovered]).to_s,
       'Mean overlap: ' + mean.call(statistics[:total_overlap]).to_s,
       'Mean fragmentation: ' + mean.call(statistics[:total_fragmentation]).to_s
  write_model_fasta(sequences_hash, @path[:fasta])
end
|
89
|
+
################################################################################################################################
|
90
|
+
# end main method
|
91
|
+
################################################################################################################################
|
92
|
+
|
93
|
+
# Builds a gene model for +cluster+ first without reference ("blind") and
# then once per reference listed in +references_hash+ for the cluster's
# protein, keeping the candidate that wins the recover/overlap decision
# tree below.
#
# Stores the winning sequence in sequences_hash[<protein name>] and
# returns [model, length, length_cluster].
def iterative_modeling_gene_w_reference(cluster, references_hash, options, sequences_hash)
  prot_reference = cluster.first.first_hit.name
  array_references = references_hash[prot_reference]

  # Blind gene model
  puts "\n", '|||||||||| BLIND MODELING ||||||||||' if $verbose
  model, length, seq, length_cluster = modeling_gene(cluster.dup, nil, options)
  # eval_model modifies its argument, so it gets a copy
  recover, overlap = eval_model(model.dup, length)
  puts "\nRecover: #{recover} Overlap: #{overlap}" if $verbose

  # Guided gene models
  unless array_references.nil?
    array_references.each do |ref|
      puts "\n", '|||||||||| GUIDED MODELING ||||||||||' if $verbose

      guided_model, guided_length, guided_seq, guided_length_cluster = modeling_gene(cluster.dup, ref, options)
      next if guided_model.nil? # a failed model is ignored

      guided_recover, guided_overlap = eval_model(guided_model.dup, guided_length)
      puts "\nRecover: #{guided_recover} Overlap: #{guided_overlap}" if $verbose

      # The decision starts from scratch for every reference. The flag used
      # to be initialised once before the loop, so after the first accepted
      # reference every later one replaced the model unconditionally.
      guided = false

      # Decision tree
      if guided_overlap <= 15 # overlap at most 15 %
        if guided_overlap >= overlap - overlap * 0.05 && guided_overlap <= overlap + overlap * 0.05
          # Same overlap: take the better recovery
          guided = true if guided_recover > recover
        else
          recover_dif = guided_recover - recover
          if recover_dif < 0
            # Lower recovery is accepted when the loss matches the overlap
            # that disappeared
            if recover_dif.abs >= overlap - overlap * 0.05 && recover_dif.abs <= overlap + overlap * 0.05
              guided = true
            end
          elsif recover_dif > guided_overlap + guided_overlap * 0.05
            # The recovery gain is not explained by an overlap increase of
            # the same size
            guided = true
          end
        end
      elsif guided_overlap < overlap
        # Always prefer the lower overlap, even when overlap is high
        guided = true
      end

      next unless guided

      model = guided_model
      length = guided_length
      seq = guided_seq
      length_cluster = guided_length_cluster
      recover = guided_recover
      overlap = guided_overlap
    end
  end

  sequences_hash[prot_reference] = seq
  return model, length, length_cluster
end
|
170
|
+
|
171
|
+
# Evaluates a gene model and returns [recover, overlap] as given by
# recover_test and overlap_test (both called with false as second argument).
#
# When +local_model+ is an array of contigs it is first merged into a
# single contig whose length is set to +length+. The model is modified, so
# callers must pass a copy.
def eval_model(local_model, length)
  if local_model.is_a?(Array)
    local_model = array_contigs_to_contig(local_model)
    local_model.length = length
  end
  # Literal false: the FALSE constant was removed in Ruby 3.2.
  recover = recover_test(local_model, false)
  overlap = overlap_test(local_model, false)
  return recover, overlap
end
|
182
|
+
|
183
|
+
# Selects the contigs of +cluster+ that take part in the gene model and,
# when +rebuild+ is set, builds the model from them.
#
# Returns [model, model_length, seq, length_cluster]; the first three are
# nil when no model was built, and length_cluster is the number of contigs
# left after the selection.
def modeling_gene(cluster, reference, rebuild)
  model = nil
  model_length = nil
  seq = nil
  gene_array = []

  # Iterative reduction: repeat the selection until the width of the gene
  # array stops changing, discarding minor fragments that could be taken as
  # new exons.
  previous_width = nil
  stable = false
  until stable
    cluster, gene_array = gene_array_and_compact(rebuild, cluster, reference)
    current_width = length2D(gene_array)
    stable = (current_width == previous_width)
    previous_width = current_width
  end
  length_cluster = cluster.length

  # Gene modeling
  if rebuild && !cluster.empty? && !gene_array.empty?
    # contig_compact merges contiguous contigs and returns the new array
    cluster_comp = cluster.length > 1 ? contig_compact(cluster) : cluster
    if cluster_comp.nil?
      puts cluster.first.first_hit.name + "\tGENE MODEL ABORTED"
    else
      model, model_length, seq = gene_model_cut(cluster_comp, reference)
    end
  end
  return model, model_length, seq, length_cluster
end
|
220
|
+
|
221
|
+
# When +rebuild+ is set, builds the exon array of +cluster+ (overlapping
# HSPs aligned in the same columns) and compacts it with gene_compact;
# prints reports when $verbose is set.
# Returns [cluster, gene_array]; gene_array is nil when +rebuild+ is not set.
def gene_array_and_compact(rebuild, cluster, reference)
  gene_array = nil
  if rebuild
    # Exon and intron arrays, built against the reference when given
    gene_array, intron_array = build_gene_array(cluster, reference)
    if $verbose
      gene_stadistics_report(gene_stadistics(gene_array), 'EXONS')
      gene_stadistics_report(gene_stadistics(intron_array), 'INTRONS')
    end
  end

  # Cluster info before compacting
  gene_array_report(gene_array, cluster, rebuild) if $verbose
  # Redundant contigs are discarded; those covering the gene remain
  gene_compact(gene_array, cluster) if rebuild
  # Cluster info after compacting
  gene_array_report(gene_array, cluster, rebuild) if $verbose && rebuild

  return cluster, gene_array
end
|
247
|
+
|
248
|
+
# Compares the single-HSP contigs that share the cluster's protein with the
# model to find pseudogenes: a contig is reported (once) when its
# comparison with any model contig yields more than one exon.
# Reads @clusters_uni_hsp. Returns the array of pseudogene contigs.
def add_uni_hsp(model, cluster)
  pseudogenes = []
  candidates = @clusters_uni_hsp.find do |contigs|
    contigs.first.first_hit.name == cluster.first.first_hit.name
  end
  return pseudogenes if candidates.nil? # no single-HSP contigs for this protein

  models = model.instance_of?(Array) ? model : [model]
  models.each do |item|
    candidates.each do |contig|
      _start, exons = item.compare(contig)
      pseudogenes << contig if exons > 1 && !pseudogenes.include?(contig)
    end
  end
  pseudogenes
end
|
275
|
+
|
276
|
+
# Builds two parallel arrays (exons and introns) with one row per contig,
# where each row is left-padded with zeros so that it lines up with the
# previous contig (or with +reference+ when one is given).
#
# For every contig after the first (or for all of them when a reference is
# given) contig.compare supplies the first shared exon position:
# * -1 with a reference: the alignment is aborted and two empty arrays are
#   returned;
# * -1 without reference: the row is padded with as many zeros as the
#   previous row has elements (no overlap);
# * otherwise the padding is that position, plus the previous row's zeros
#   when there is no reference.
#
# Returns [gene_array, gene_array_introns].
def build_gene_array(contigs, reference = nil)
  gene_array = []
  gene_array_introns = []
  last_contig = reference.nil? ? '' : reference

  contigs.each do |contig|
    padding = 0
    if !gene_array.empty? || reference
      # Compare with the contig studied in the previous iteration
      first_exon, _shared = contig.compare(last_contig)
      if first_exon == -1
        if reference
          # A contig that does not match the reference aborts the alignment
          puts "\n#{contig.name} alignment step OUT OF RANGE" if $verbose
          gene_array = []
          gene_array_introns = []
          break
        end
        padding = gene_array.last.count
      elsif reference
        padding = first_exon
      else
        padding = first_exon + gene_array.last.count(0)
      end
    end

    # 0 marks the absence of an exon/intron at that position. A negative
    # padding adds nothing, as Integer#times did in the original.
    zeros = [padding, 0].max
    # Non-bang flatten: flatten! returns nil when nothing was flattened,
    # which would have stored nil as the row.
    gene_array << (Array.new(zeros, 0) << contig.exones_s).flatten
    gene_array_introns << (Array.new(zeros, 0) << contig.intrones_q).flatten

    last_contig = contig if reference.nil?
  end
  return gene_array, gene_array_introns
end
|
328
|
+
|
329
|
+
# For every column of +gene_array+, counts how many different values it
# holds, ignoring zeros and positions missing in shorter rows.
# Returns one count per column.
def gene_stadistics(gene_array)
  width = gene_array.map { |row| row.length }.max || 0
  (0...width).map do |column|
    distinct = []
    gene_array.each do |row|
      value = row[column]
      distinct << value unless value == 0 || distinct.include?(value)
    end
    # rows shorter than this column contribute nil, which is not counted
    distinct.compact.length
  end
end
|
347
|
+
|
348
|
+
# Prints one tab-separated line with +tag+ followed by every value of
# +exons_stadistic+ (exon or intron statistics).
def gene_stadistics_report(exons_stadistic, tag)
  cells = exons_stadistic.map { |item| "#{item}\t" }.join
  print "\n#{tag}\t" + cells + "\n"
end
|
355
|
+
|
356
|
+
# Prints the gene array (one line per contig with its name, completed flag
# and row values) when +act_array+ is set, followed by a map made of every
# contig's name and drawing.
def gene_array_report(gene_array, contigs, act_array)
  if act_array
    puts "\nGENE ARRAY"
    gene_array.each_with_index do |fila, c|
      row_text = fila.map { |item| "#{item}\t" }.join
      print "#{contigs[c].name.center(24)}\t ", "#{contigs[c].completed}\t", row_text
      puts "\n"
    end
  end

  puts "\nMAP"
  contigs.each { |contig| print contig.name.center(25), contig.draw }
end
|
375
|
+
|
376
|
+
# Removes redundant contigs so that only those needed for the gene model
# remain. Every pair of rows is compared and one of them may be dropped
# from both +gene_array+ and +contigs+ (which are modified in place):
# * rows of equal length: the one with fewer exons (values > 0) goes; with
#   the same number of exons, the one whose first HSP scores lower goes
#   (the second one on ties);
# * current row longer: the other row goes when it has at least as many
#   zeros;
# * current row shorter: the current row goes when both have the same
#   number of zeros.
def gene_compact(gene_array, contigs)
  discard = lambda do |idx|
    gene_array[idx] = nil
    contigs[idx] = nil
  end

  gene_array.each_index do |c1|
    row = gene_array[c1]
    next unless row # already discarded

    row_exons = row.count { |x| x > 0 }
    gene_array.each_index do |c2|
      other = gene_array[c2]
      next if !other || c1 == c2 # skip discarded rows and self-comparison

      case row.length <=> other.length
      when 0
        other_exons = other.count { |x| x > 0 }
        if other_exons == row_exons
          if contigs[c1].first_hit.first_hsp.score >= contigs[c2].first_hit.first_hsp.score
            discard.call(c2)
          else
            discard.call(c1)
            break
          end
        elsif other_exons > row_exons
          discard.call(c1)
          break
        else
          discard.call(c2)
        end
      when 1
        discard.call(c2) if row.count(0) <= other.count(0)
      when -1
        if row.count(0) == other.count(0)
          discard.call(c1)
          break
        end
      end
    end
  end
  gene_array.compact!
  contigs.compact!
end
|
429
|
+
|
430
|
+
# Takes a list of contigs, finds the ones that are consecutive on the subject,
# merges their sequences, re-aligns the merged sequences with exonerate and
# returns an array with the resulting contigs.
#
# contigs - Array of contigs; each answers first_hit.first_hsp.s_beg,
#           first_hit.last_hsp.s_end, first_hit.name, first_hit.s_length
#           and (through contigs_seq_merge) seq.
#
# Reads @path[:local] (working directory for the temporary .fasta/.db/.ex
# files) and @db_seqs (sequence looked up by hit name).
#
# Returns:
#   * `contigs` itself when no pair of neighbours is consecutive;
#   * a new array where each run of consecutive contigs is replaced by the
#     contig parsed from the exonerate output;
#   * a shallow copy of the input when exonerate does not report exactly one
#     contig per merged sequence.
def contig_compact(contigs)
  cn_def = []              # final list; an Integer entry is a placeholder for fusion number N
  cn_backup = contigs.dup
  cn_to_merge = []         # one array of contigs per fusion
  s_end = nil

  # Flag consecutive, non-overlapping contigs: the first HSP starts 0 or 1
  # positions after the end of the previous contig's last HSP.
  # (`true`/`false` literals: the TRUE/FALSE constants are deprecated and no
  # longer defined in current Ruby versions.)
  fusion = Array.new(contigs.length, false)
  contigs.each_with_index do |contig, i|
    if i > 0
      diference = contig.first_hit.first_hsp.s_beg - s_end
      fusion[i] = true if diference == 0 || diference == 1
    end
    s_end = contig.first_hit.last_hsp.s_end
  end

  return contigs unless fusion.include?(true)

  # Build the groups of contigs to merge and keep the ones not taking part
  fusion_contigs = []
  count = 0 # index of the next fusion placeholder
  fusion.each_with_index do |cont, i|
    if cont
      fusion_contigs << contigs[i - 1] unless fusion_contigs.include?(contigs[i - 1])
      fusion_contigs << contigs[i] unless fusion_contigs.include?(contigs[i])
    else
      unless fusion_contigs.empty? # close the current fusion
        cn_to_merge << fusion_contigs
        fusion_contigs = []
        cn_def << count
        count += 1
      end
      # Keep the contig only if the next one is not going to absorb it
      cn_def << contigs[i] unless fusion[i + 1]
    end
    if i + 1 == fusion.length && !fusion_contigs.empty? # end-of-loop flush
      cn_to_merge << fusion_contigs
      cn_def << count
      count += 1
    end
  end

  hit_name = contigs.first.first_hit.name
  fasta_path = File.join(@path[:local], hit_name + '.fasta')
  db_path = File.join(@path[:local], hit_name + '.db')
  ex_path = File.join(@path[:local], hit_name + '.ex')

  # Write the merged sequences and the subject sequence; the block form of
  # File.open closes the handles even if a write raises.
  contigs_merge = contigs_seq_merge(cn_to_merge)
  unless contigs_merge.empty?
    File.open(fasta_path, 'w') do |temp|
      contigs_merge.each_with_index do |seq, i|
        temp.puts ">Fusion_#{i}\n#{seq}"
      end
    end
    File.open(db_path, 'w') do |temp_db|
      temp_db.puts ">#{hit_name}\n#{@db_seqs[hit_name]}"
    end
  end

  # Exonerating (LINUX command line).
  # NOTE(review): the paths are interpolated into a shell string unescaped; a
  # hit name with spaces or shell metacharacters would break the command.
  cmd = "exonerate -q #{db_path} -t #{fasta_path} -Q protein -T dna -m protein2genome --percent 1 --showalignment 0 --useaatla 1 --showvulgar > #{ex_path}"
  system(cmd)

  # Parsing exonerate
  local = ParserExonerate.new('contig', 'nucleotide_match', ex_path)
  store_local_ex = local.dataset
  store_local_ex.score_correction(30)

  if store_local_ex.contig_count == contigs_merge.length
    # Restore attributes on the new contigs and drop them into their slots
    store_local_ex.each_contig_with_index do |contig, i|
      contig.seq = contigs_merge[i]
      contig.length = contigs_merge[i].length
      contig.first_hit.s_length = contigs.first.first_hit.s_length
      cn_def.each_with_index do |contig_def, j|
        cn_def[j] = contig if contig_def == i
      end
    end
  else
    cn_def = cn_backup
  end

  cn_def
end
|
532
|
+
|
533
|
+
# Returns one merged sequence per group of contigs.
#
# contigs - Array of groups; each group is an Array of objects answering seq.
#
# Within a group the sequences are concatenated in order with a spacer of ten
# 'n' characters between them. While the accumulated sequence is still empty
# the next sequence is taken as is (no spacer).
def contigs_seq_merge(contigs)
  spacer = 'n' * 10
  contigs.map do |group|
    group.reduce('') do |merged, contig|
      merged.empty? ? contig.seq : merged + spacer + contig.seq
    end
  end
end
|
549
|
+
|
550
|
+
# Builds a gene model by cutting and splicing an ordered list of contigs.
#
# contigs   - Array of contigs. Each one is read through first_hit (hsps,
#             first_hsp, last_hsp, hsp_at, hsp_count, each_hsp_with_index,
#             name, s_length), length, seq, name, exon_acumulative and
#             compare.
# reference - optional contig used to position every contig; when given, the
#             modelling is aborted as soon as a contig does not align with it
#             (position -1) or would run past its last HSP.
#
# For each contig the HSP coordinates and the matching sequence fragments are
# appended to the model. When a contig overlaps the previous one, the trailing
# exons of the lower-scoring side are replaced by those of the other contig.
# When a contig is not consecutive to the previous one, a row of zeros is
# inserted in the coordinate arrays and ten 'n' are appended to the sequence.
#
# Returns [model, model_length, final_seq]:
#   model        - whatever void_contig builds from the collected coordinates
#                  (a single contig or an Array of contigs);
#   model_length - only computed when model is an Array, nil otherwise;
#   final_seq    - the collected fragments joined into one String.
# All three are nil when a contig was out of range.
def gene_model_cut(contigs, reference=nil)
  q_beg=[]
  q_end=[]
  s_beg=[]
  s_end=[]
  seq=[] # one sequence fragment per exon kept in the model
  last_contig=nil
  last_score=0
  length_model=0
  multiple_lengths=[]
  add_length=true
  add_last=0
  last_position_ref=nil
  last_position=nil
  out_of_range=false

  # Swaps the case of the first two characters of the first fragment taken
  # from a newly appended contig. Non-bang swapcase is used on purpose:
  # swapcase! returns nil when nothing changes, which used to drop the two
  # characters or raise.
  mark_start = lambda { |fragment| fragment[0..1].swapcase + fragment[2..-1].to_s }

  contigs.each do |contig|
    score = contig.first_hit.first_hsp.score/contig.length*contig.exon_acumulative
    n_exones = contig.first_hit.hsp_count

    # FIRST CONTIG
    #-------------------------------------------------------
    if last_contig.nil?
      q_end_seq=nil #SEQ
      contig.first_hit.each_hsp_with_index{|hsp,i|
        q_beg << hsp.q_beg
        q_end << hsp.q_end
        s_beg << hsp.s_beg
        s_end << hsp.s_end
        #SEQ.................................
        if i==0
          seq << contig.seq[0..contig.first_hit.first_hsp.q_end-1]
        elsif i+1==n_exones
          seq << contig.seq[contig.first_hit.hsps[i-1].q_end..contig.length-1]
        else
          seq << contig.seq[q_end_seq..hsp.q_end-1]
        end
        q_end_seq=hsp.q_end
        # ...................................
      }
      length_model+=contig.length
      if !reference.nil? # Position the first contig on the reference
        last_position_ref,_ex=contig.compare(reference)
        # Abort the modelling when the contig does not align with the reference
        # or runs past it. The position just computed is the one to test
        # (last_position is still nil for the first contig).
        if last_position_ref==-1 || last_position_ref+contig.first_hit.hsp_count-1 > reference.first_hit.hsp_count
          puts contig.name+' OUT OF RANGE'
          out_of_range=true
          break
        end
      end

    # OTHER CONTIG
    #--------------------------------------------------------
    else
      position_overlap,_ex=contig.compare(last_contig)

      # Correct the contig position using the reference
      if !reference.nil?
        last_position_ref,position_overlap=position_reference_guided(contig,last_contig,last_position_ref,reference)
        if last_position_ref==-1 || last_position_ref+contig.first_hit.hsp_count-1 > reference.first_hit.hsp_count
          out_of_range=true
          puts contig.name+' OUT OF RANGE'
          break
        end
      end

      # NOT OVERLAP
      #..........................
      if position_overlap==-1 || contig.first_hit.hsp_count==1
        if contig.first_hit.first_hsp.s_beg-last_contig.first_hit.last_hsp.s_end>1 # Mark a discontinuity when the contig is not consecutive to the previous one
          q_beg << 0
          q_end << 0
          s_beg << 0
          s_end << 0
          multiple_lengths << length_model
          length_model=contig.length
          last=length_model
          add_length=false
          seq.last << 'n'*10 #SEQ gap marker
        else
          last=length_model # Keep the previous length so the contig coordinates can be shifted correctly
          length_model+=contig.length
        end

        q_end_seq=nil #SEQ
        contig.first_hit.hsps.each_with_index do |hsp,i|
          add_no=last
          if !add_length
            add_no=0
          end
          q_beg << hsp.q_beg+add_no # The model length is accumulated onto the coordinates
          q_end << hsp.q_end+add_no
          s_beg << hsp.s_beg
          s_end << hsp.s_end
          #SEQ.................................
          if i==0
            cn=contig.seq[0..contig.first_hit.first_hsp.q_end-1]
            seq << mark_start.call(cn)
          elsif i+1==n_exones
            seq << contig.seq[contig.first_hit.hsps[i-1].q_end..contig.length-1]
          else
            seq << contig.seq[q_end_seq..hsp.q_end-1]
          end
          q_end_seq=hsp.q_end
          # ...................................
        end

      # OVERLAP
      #..........................
      else
        if last_position==-1
          add_last=length_model-last_contig.length
        end
        overlap=last_contig.first_hit.hsp_count-position_overlap
        if last_contig.first_hit.hsp_count ==1
          overlap=1
        end
        add=0
        dif=0
        if last_score>=score
          add=last_contig.first_hit.last_hsp.q_end-contig.first_hit.first_hsp.q_end
          dif=add
          if overlap>1 # Drop the last exon of 'last_contig' and replace it with the second one of 'contig', which is more reliable because it is internal
            add=last_contig.first_hit.hsp_at(last_contig.first_hit.hsp_count-overlap).q_end-contig.first_hit.first_hsp.q_end # The last exon is dropped, so align on the one before it
            dif=contig.first_hit.hsp_at(overlap-1).q_end
            q_beg=q_beg.reverse.drop(1).reverse
            q_end=q_end.reverse.drop(1).reverse
            s_beg=s_beg.reverse.drop(1).reverse
            s_end=s_end.reverse.drop(1).reverse
            seq=seq.reverse.drop(1).reverse #SEQ
          end
          if overlap==1
            overlap=2
          end
          (contig.first_hit.hsp_count-(overlap-1)).times do |n| # Append the remaining exons of the contig to the model
            q_beg << contig.first_hit.hsp_at(n+overlap-1).q_beg+add+add_last
            q_end << contig.first_hit.hsp_at(n+overlap-1).q_end+add+add_last
            s_beg << contig.first_hit.hsp_at(n+overlap-1).s_beg
            s_end << contig.first_hit.hsp_at(n+overlap-1).s_end
            #SEQ.......................................
            position_hsp=n+overlap-2
            if position_hsp <0
              position_hsp= 0
            end
            position_next_hsp=n+overlap-1
            if position_next_hsp < 0
              position_next_hsp =0
            end

            if n==0
              cn=contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.first_hit.hsp_at(position_next_hsp).q_end-1]
              seq << mark_start.call(cn)
            elsif position_next_hsp==contig.first_hit.hsp_count-1
              seq << contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.length-1]
            else
              seq << contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.first_hit.hsp_at(position_next_hsp).q_end-1]
            end
            #............................................
          end
        else
          hsp_position=last_contig.first_hit.hsp_count-2
          if hsp_position<0 # contigs that only have one hsp
            hsp_position=0
          end
          add=last_contig.first_hit.hsp_at(hsp_position).q_end
          dif=last_contig.length-add
          drop=1
          correction=0
          if overlap>1
            drop=overlap-1
            correction=1
            add=last_contig.first_hit.hsp_at(position_overlap).q_end-contig.first_hit.first_hsp.q_end
            dif=length_model-(add+add_last)
          end
          # Remove the bad exons of 'last_contig' (the first one of the overlap is kept)
          q_beg=q_beg.reverse.drop(drop).reverse
          q_end=q_end.reverse.drop(drop).reverse
          s_beg=s_beg.reverse.drop(drop).reverse
          s_end=s_end.reverse.drop(drop).reverse
          seq=seq.reverse.drop(drop).reverse #SEQ

          # Append the exons of 'contig' (skipping the first one when correction is 1)
          (contig.first_hit.hsp_count-correction).times do |n|
            q_beg << contig.first_hit.hsp_at(n+correction).q_beg+add+add_last
            q_end << contig.first_hit.hsp_at(n+correction).q_end+add+add_last
            s_beg << contig.first_hit.hsp_at(n+correction).s_beg
            s_end << contig.first_hit.hsp_at(n+correction).s_end
            #SEQ.............................................................
            if n+1==(contig.first_hit.hsp_count-correction)
              n_correction=n+correction-1
              if n_correction < 0
                n_correction=0
              end
              seq << contig.seq[contig.first_hit.hsp_at(n_correction).q_end..contig.length-1]
            elsif n==0
              if n+correction==0
                cn=contig.seq[0..contig.first_hit.hsp_at(n+correction).q_end-1] # n+correction starts at the first exon of the contig
              else
                cn=contig.seq[contig.first_hit.hsp_at(n+correction-1).q_end..contig.first_hit.hsp_at(n+correction).q_end-1]
              end
              seq << mark_start.call(cn)
            else
              seq << contig.seq[contig.first_hit.hsp_at(n+correction-1).q_end..contig.first_hit.hsp_at(n+correction).q_end-1]
            end
            #................................................................
          end
        end
        length_model+=(contig.length-dif)
        add_length=true
        add_last+=add
      end
    end
    # position_overlap is local to this block, so it is nil after the first contig
    last_position=position_overlap
    last_contig=contig
    last_score=score
  end

  # With discontinuities the model length becomes the list of segment lengths
  if !multiple_lengths.empty?
    multiple_lengths << length_model
    length_model=multiple_lengths
  end

  model=nil
  if !out_of_range # Build the model only when every contig aligned with the reference
    model=void_contig(contigs.first.first_hit.name+'_model',length_model,contigs.first.first_hit.s_length,q_beg,q_end,s_beg,s_end,'contig','gene','exon')
    # Merge contigs under the sequence reference
    model_length=nil
    if model.is_a?(Array)
      add=0
      model.each_with_index do |contig,i|
        contig.modified_coordenates(add)
        add+=contig.length
        if i<model.length-1
          add+=10 # the ten 'n' inserted between segments
        end
      end
      # NOTE(review): `add` already includes the last segment's length, so it
      # is counted twice here — confirm whether this is intended.
      model_length=add+model.last.length
    end

    final_seq=seq.join
  else # Do not build a model when at least one contig does not align with the reference
    model_length=nil
    final_seq=nil
  end

  return model, model_length, final_seq
end
|
802
|
+
|
803
|
+
# Builds contig object(s) from parallel coordinate arrays.
#
# contig_name   - name for the contig (and for its hit).
# contig_length - Integer, or an Array with one length per segment; with an
#                 Array each contig is named "<contig_name>_<n>".
# s_length      - subject length passed to Contig#add_hit.
# q_beg, q_end, s_beg, s_end - parallel arrays, one entry per HSP. A 0 in
#                 q_beg marks a break between segments (unless `single`).
# contig_type, hit_type, hsp_type - values assigned to the `type` of the
#                 contig, its hit and every HSP.
# single        - when true, zeros in q_beg are treated as ordinary entries
#                 and one single contig is built.
#
# Returns a single Contig when `single` is true or q_beg contains no 0;
# otherwise an Array of contigs, one per segment.
#
# The default for `single` is the `false` literal: the FALSE constant is
# deprecated and no longer defined in current Ruby versions.
def void_contig(contig_name,contig_length,s_length,q_beg,q_end,s_beg,s_end,contig_type,hit_type,hsp_type,single=false)
  contigs=[]
  contig=nil
  n=0 # index of the segment being built
  q_beg.each_with_index do |item,ind|
    if item>0 || single
      if contig.nil? # first HSP of a segment: create the contig and its hit
        if contig_length.is_a?(Array)
          length=contig_length[n]
          name="#{contig_name}_#{n}"
        else
          length=contig_length
          name=contig_name
        end
        contig=Contig.new(name)
        contig.length=length
        contig.type=contig_type
        hit_v=contig.add_hit(contig_name,s_length,1,:prot)
        hit_v.type=hit_type
      end
      hsp_v=contig.first_hit.add_hsp(q_beg[ind], q_end[ind], s_beg[ind], s_end[ind], 0, 0, 0, 0)
      hsp_v.type=hsp_type
    end
    # Close the segment at a break marker or at the last entry
    if (item==0 && !contig.nil? && !single) || q_beg.length-1==ind
      if single || !q_beg.include?(0)
        contigs=contig
      else
        contigs << contig
      end
      n+=1
      contig=nil
    end
  end
  return contigs
end
|
839
|
+
|
840
|
+
# Positions `contig` on `reference` and derives its overlap with the
# previous contig from the two reference positions.
#
# Returns [position_ref, position_overlap] where position_ref is the first
# value returned by contig.compare(reference) and position_overlap is:
#   * nil when last_position_ref is nil;
#   * the absolute distance between both reference positions when the contig
#     starts at or before the last HSP of the previous contig;
#   * -1 when there is no overlap.
def position_reference_guided(contig, last_contig, last_position_ref, reference)
  position_ref, _ex = contig.compare(reference)
  return [position_ref, nil] if last_position_ref.nil?

  last_covered = last_position_ref + (last_contig.first_hit.hsp_count - 1)
  position_overlap = position_ref <= last_covered ? (last_position_ref - position_ref).abs : -1
  [position_ref, position_overlap]
end
|
851
|
+
|
852
|
+
# Collapses an array of contigs into one new Contig.
#
# The new contig takes the name of the first element, receives the hits of
# every element through transfer_contig_hits (in order) and takes the length
# of the last element.
def array_contigs_to_contig(array_contigs)
  Contig.new(array_contigs.first.name).tap do |merged|
    array_contigs.each { |part| merged.transfer_contig_hits(part) }
    merged.length = array_contigs.last.length
  end
end
|
860
|
+
|
861
|
+
# Reports a failure while processing a gene.
#
# e          - the Ruby exception object.
# gene_name  - name printed on stdout and in the report.
# file_error - open, writable IO receiving the report.
# cluster    - collection of contigs; the name of each one is listed.
# model      - accepted for interface compatibility; not used here.
#
# Prints "<gene_name> ERROR" on stdout and writes to file_error: the gene
# name, the exception message, its backtrace and the contig names.
def gene_error(e, gene_name, file_error, cluster, model) #e is a ruby exception object
  puts "#{gene_name} ERROR"
  file_error.puts "\n#{gene_name}\n.............................."
  file_error.puts e.message
  e.backtrace.each { |frame| file_error.puts frame }
  file_error.puts ',,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,'
  cluster.each { |contig| file_error.puts contig.name }
  file_error.puts '----------------------------------------------------------------------------------------'
end
|
876
|
+
|
877
|
+
# Computes the percentage of the subject covered by overlaps in a model.
#
# model  - answers overlap (a collection of overlap lengths; the sign flips
#          below suggest they are negative — TODO confirm) and
#          first_hit.s_length.
# output - when true (default) a warning with each overlap (times -3, labelled
#          "nt.") and the total percentage is printed on stdout.
#
# Returns 0 when there are no overlaps, otherwise
# (sum * -100.0 / s_length) rounded to two decimals.
#
# The default is the `true` literal: the TRUE constant is deprecated and no
# longer defined in current Ruby versions.
def overlap_test(model,output=true)
  perc_overlap=0
  overlap=model.overlap
  return perc_overlap if overlap.empty?

  print 'WARNING: overlap/s ' if output
  total=0
  overlap.each do |length_overlap|
    print((length_overlap*-3).to_s+', ') if output
    total+=length_overlap
  end
  perc_overlap=(total*-100.0/model.first_hit.s_length).round(2)
  puts 'nt. % Total overlap '+perc_overlap.to_s if output
  perc_overlap
end
|
898
|
+
|
899
|
+
# Computes the percentage of the subject recovered by a model.
#
# model  - answers exones_s (a collection of numbers that are summed),
#          first_hit.s_length and first_hit.name.
# output - when true (default) prints "Recovered\t<hit name>\t<percentage>".
#
# Returns (sum of exones_s * 100.0 / s_length) rounded to two decimals.
#
# The default is the `true` literal: the TRUE constant is deprecated and no
# longer defined in current Ruby versions.
def recover_test(model,output=true)
  recovered=model.exones_s.inject(0) { |sum, exon| sum + exon }
  recovered=(recovered*100.0/model.first_hit.s_length).round(2)
  puts "Recovered\t"+model.first_hit.name+"\t#{recovered}" if output
  recovered
end
|
910
|
+
|
911
|
+
# Opens the HTML gene index and writes its header.
#
# web  - when falsy nothing is opened and nil is returned.
# path - file to create (opened in 'w' mode).
#
# Returns the open File so rows can be appended by the caller; the file is
# not closed here (web_body closes it).
def web_header(web, path)
  return nil unless web

  index_page = File.open(path, 'w')
  html_header(index_page, 'Gene index')
  html_table_header(index_page, 1, ['Gene model name', 'Protein length', 'Num exon', '% recovered protein', '% overlapping sequence', 'Fragmentation'])
  index_page
end
|
920
|
+
|
921
|
+
# Finishes the HTML gene index: writes the table footer and the page footer
# and closes the file. Does nothing when file_web is nil.
def web_body(file_web)
  return if file_web.nil?

  html_table_footer(file_web)
  html_footer(file_web)
  file_web.close
end
|
928
|
+
|
929
|
+
# Writes the model sequences to a FASTA file.
#
# sequences_hash - enumerable of [name, sequence] pairs (a Hash works); each
#                  pair becomes a ">name_model" header followed by the
#                  sequence.
# path           - file to create (opened in 'w' mode, truncating it).
#
# The block form of File.open guarantees the handle is closed even when a
# write raises (e.g. a nil sequence). Returns nil.
def write_model_fasta(sequences_hash, path)
  File.open(path,'w') do |model_file|
    sequences_hash.each do |model|
      model_file.puts '>'+model[0]+"_model\n"+model[1]
    end
  end
  nil
end
|
936
|
+
|
937
|
+
# Writes two GFF reports to the same path: first the model dataset, then the
# contig dataset (the latter labelled with `name`). Both ReportGff objects are
# built with 's' and created with 'a' as first argument.
# Returns the result of the second create call.
def write_gbrowse_gff(gff_dataset_model, gff_dataset, path, name)
  ReportGff.new(gff_dataset_model, path, 's').create('a')
  ReportGff.new(gff_dataset, path, 's').create('a', name)
end
|
943
|
+
|
944
|
+
# Renames the hits of a model in place.
#
# With more than one hit every hit gets the suffix "_gene_<index>"; otherwise
# the first hit gets the suffix "_gene".
def format_model(model)
  if model.n_hits? <= 1
    only_hit = model.first_hit
    only_hit.name += '_gene'
  else
    model.each_hit_with_index { |hit, idx| hit.name += "_gene_#{idx}" }
  end
end
|
953
|
+
|
954
|
+
# Aligns a model with the contigs of a GFF dataset, shifting it to the right
# when needed.
#
# model          - a contig, or an Array of contigs which is first collapsed
#                  with array_contigs_to_contig.
# length_model   - length assigned to the collapsed contig (Array case only).
# gff_dataset    - answers correct_left_side_contigs(model), whose result is
#                  used as the number of positions to shift, and
#                  align_contigs(model).
# sequences_hash - sequences keyed by model name without the "_model" suffix;
#                  the matching entry is left-padded with 'n' by the shift.
#
# Returns the (possibly collapsed) model.
def correct_model(model, length_model, gff_dataset, sequences_hash)
  if model.is_a?(Array)
    model=array_contigs_to_contig(model)
    # The collapsed contig inherits the first segment's name ("..._0"). Strip
    # only that trailing suffix: removing every "_0" would corrupt names such
    # as "g_01_model_0".
    model.name=model.name.sub(/_0\z/,'')
    model.length=length_model
  end

  correct_add_Ns=gff_dataset.correct_left_side_contigs(model)
  model.modified_coordenates(correct_add_Ns)
  model.length+=correct_add_Ns
  gff_dataset.align_contigs(model)

  # Pad the sequence so it stays aligned with the generated features
  if correct_add_Ns>0
    key=model.name.sub(/_model\z/,'') # only the suffix, not an inner "_model"
    sequences_hash[key]='n'*correct_add_Ns+sequences_hash[key]
  end

  return model
end
|
974
|
+
|
975
|
+
end
|