gene_assembler 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +22 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +2 -0
- data/bin/GeneAssembler +233 -0
- data/bin/phytozome_scan +60 -0
- data/gene_assembler.gemspec +25 -0
- data/lib/gene_assembler.rb +5 -0
- data/lib/gene_assembler/blast_type_parser.rb +41 -0
- data/lib/gene_assembler/contig.rb +643 -0
- data/lib/gene_assembler/dataset.rb +532 -0
- data/lib/gene_assembler/exonerate_result.rb +230 -0
- data/lib/gene_assembler/gff_contig.rb +67 -0
- data/lib/gene_assembler/gff_dataset.rb +152 -0
- data/lib/gene_assembler/gff_feature.rb +175 -0
- data/lib/gene_assembler/gff_frameshift.rb +6 -0
- data/lib/gene_assembler/gff_go.rb +13 -0
- data/lib/gene_assembler/gff_hit.rb +53 -0
- data/lib/gene_assembler/gff_hsp.rb +6 -0
- data/lib/gene_assembler/gff_localization.rb +6 -0
- data/lib/gene_assembler/gff_master_feature.rb +5 -0
- data/lib/gene_assembler/gff_parser.rb +35 -0
- data/lib/gene_assembler/gff_snp.rb +21 -0
- data/lib/gene_assembler/gff_stop.rb +6 -0
- data/lib/gene_assembler/go.rb +13 -0
- data/lib/gene_assembler/hit.rb +191 -0
- data/lib/gene_assembler/hsp.rb +100 -0
- data/lib/gene_assembler/other_functions.rb +228 -0
- data/lib/gene_assembler/parser.rb +25 -0
- data/lib/gene_assembler/parser_blast.rb +12 -0
- data/lib/gene_assembler/parser_exonerate.rb +16 -0
- data/lib/gene_assembler/rebuild.rb +975 -0
- data/lib/gene_assembler/report.rb +13 -0
- data/lib/gene_assembler/report_gff.rb +30 -0
- data/lib/gene_assembler/snp.rb +13 -0
- data/lib/gene_assembler/version.rb +3 -0
- metadata +149 -0
@@ -0,0 +1,100 @@
|
|
1
|
+
# High-scoring segment pair: one aligned stretch between a query and a subject,
# described by its coordinates on both sequences plus alignment statistics.
class Hsp
  attr_accessor :q_beg, :q_end, :s_beg, :s_end, :align_len, :score, :ident, :gaps, :type

  # q_beg/q_end: start and end on the query.
  # s_beg/s_end: start and end on the subject.
  # align_len:   length of the aligned sequence.
  # score, ident, gaps: stored as given; type starts as nil.
  def initialize(q_beg, q_end, s_beg, s_end, align_len, score, ident, gaps)
    @q_beg = q_beg
    @q_end = q_end
    @s_beg = s_beg
    @s_end = s_end
    @align_len = align_len
    @score = score
    @ident = ident
    @gaps = gaps
    @type = nil
  end

  # Compares two HSPs on subject coordinates to decide whether they are the
  # same one. Returns 1 when both spans are identical or self lies inside
  # +hsp+, the overlapped fraction of self (Float) on a partial overlap longer
  # than one position, and 0 otherwise.
  def compare(hsp)
    span_coverage(s_beg, s_end, hsp.s_beg, hsp.s_end)
  end

  # Same as #compare, but on query coordinates.
  def compare_q(hsp)
    span_coverage(q_beg, q_end, hsp.q_beg, hsp.q_end)
  end

  # Length of the HSP on the query.
  def length_q
    @q_end - @q_beg
  end

  # Converts reverse query coordinates into direct ones by subtracting them
  # from +length_hsp+.
  def rev(length_hsp)
    @q_beg = length_hsp - @q_beg
    @q_end = length_hsp - @q_end
    @reversed = false # literal instead of the FALSE constant, which Ruby 3 no longer defines
  end

  # Returns 1 when +hsp+ lies inside self (on query or on subject coordinates)
  # or when both partially overlap, 0 otherwise. +long+ is the length used to
  # turn subject spans into coverage fractions.
  def within?(hsp, long)
    over = 0
    over = 1 if q_beg <= hsp.q_beg && q_end >= hsp.q_end
    over = 1 if s_beg <= hsp.s_beg && s_end >= hsp.s_end
    if over == 0
      self_coverage = (s_end - s_beg) * 1.00 / long
      hsp_coverage = (hsp.s_end - hsp.s_beg) * 1.00 / long
      # If hsp covers more than what self leaves free, both must overlap.
      over = 1 if hsp_coverage > (1 - self_coverage)
    end
    over
  end

  # Shifts both query coordinates by +add+.
  def modified_coordenates(add)
    @q_beg += add
    @q_end += add
  end

  # Mirrors the query coordinates over a contig of +contig_length+ (1-based),
  # printing the coordinates before and after the change.
  def rev_coord(contig_length)
    puts '---------------------------------'
    puts @q_beg.to_s+' '+@q_end.to_s
    @q_beg = contig_length - @q_beg + 1
    @q_end = contig_length - @q_end + 1
    puts @q_beg.to_s+' '+@q_end.to_s
  end

  # Returns the (negative) subject distance between the start of self and the
  # end of +last_hsp+ when they overlap, 0 when they do not.
  def overlap_with(last_hsp)
    diference = s_beg - last_hsp.s_end
    diference < 0 ? diference : 0
  end

  private

  # Shared implementation of #compare and #compare_q over one coordinate axis.
  def span_coverage(own_beg, own_end, other_beg, other_end)
    if own_beg == other_beg && own_end == other_end
      # Identical spans. The original tested own_beg against other_end, so an
      # identical span of length 1 fell through every branch and scored 0.
      1
    elsif own_beg >= other_beg && own_end < other_end
      # self lies inside the other HSP.
      1
    elsif own_beg <= other_beg && own_end > other_beg && (own_end - other_beg).abs > 1
      # The 1.00 factor forces Float arithmetic so the ratio is not truncated.
      (own_end - other_beg * 1.00) / (own_end - own_beg)
    elsif own_beg < other_end && own_end >= other_end && (own_beg - other_end).abs > 1
      # The last condition keeps a single residue that is split between two
      # exons from being reported as the same exon.
      (other_end - own_beg * 1.00) / (own_end - own_beg)
    else
      0
    end
  end
end
|
@@ -0,0 +1,228 @@
|
|
1
|
+
require 'scbi_fasta'
|
2
|
+
|
3
|
+
# Links each contig to its SAM file (<map_path>/<contig name>.sam) and scores
# the read support of every exon (HSP of the contig's first hit).
#
# For each contig with a SAM file, a per-position read-depth array is built and
# every exon receives (sum of depth * 100 / number of reads / exon length),
# rounded to 2 decimals. The values are written into gene_array starting at the
# first cell of the contig's row that equals 1. Contigs without a SAM file are
# left untouched. Finally `indices` is called on every contig.
#
# NOTE(review): a SAM file with no alignment lines leaves n_reads at 0 and the
# exon value is NaN, so `round` raises FloatDomainError (unchanged behaviour).
def mapping(contigs, gene_array, map_path)
  contigs.each do |contig|
    ruta = File.join(map_path, "#{contig.name}.sam")
    # File.exist? replaces File.exists?, which Ruby 3.2 removed.
    next unless File.exist?(ruta)

    # Parse mapping
    seq_map = Array.new(contig.length, 0)
    n_reads = 0
    # File.foreach closes the handle; the original File.open was never closed.
    File.foreach(ruta) do |line|
      fields = line.split
      # Skip blank lines and any line whose first field contains '@' (headers).
      next if fields.empty? || fields[0].include?('@')

      n_reads += 1
      start_map = fields[3].to_i - 1
      end_map = start_map - 1
      # Every number found in the CIGAR field extends the end of the read.
      fields[5].split(/[^\d]/).each { |e| end_map += e.to_i }
      # Count the read on start_map..end_map only. The original loop also
      # incremented position end_map + 1 before breaking.
      first = [start_map, 0].max
      last = [end_map, seq_map.length - 1].min
      (first..last).each { |position| seq_map[position] += 1 }
    end

    # Exon valuation
    exon_stadistic = contig.hits.first.hsps.map do |hsp|
      exon = seq_map[hsp.q_beg-1..hsp.q_end-1]
      value = exon.inject(0) { |total, depth| total + depth }
      (value * 100.0 / n_reads / exon.length).round(2)
    end
    y = contigs.index(contig)
    x = gene_array[y].index(1)
    exon_stadistic.each_with_index do |item, b|
      gene_array[y][x + b] = item
    end
  end

  if $verbose
    puts "\nGENE ARRAY - EXON VALUATED"
    gene_array.each_with_index do |fila, c|
      print "#{contigs[c].name.center(24)} "
      fila.each do |item|
        print "#{item.to_s}\t"
      end
      puts "\n"
    end
  end

  contigs.each do |contig|
    puts '...................'
    contig.indices
  end
  puts "\n"
end
|
76
|
+
|
77
|
+
# Returns the largest length found among the items of +array+
# (0 when +array+ is empty).
def length2D(array)
  array.inject(0) do |longest, item|
    item_length = item.length
    item_length > longest ? item_length : longest
  end
end
|
87
|
+
|
88
|
+
# Checks start/stop codons on every contig and records the outcome in
# contig.completed:
#   true    -> both start and stop codon found
#   'start' -> only the start codon found
#   'stop'  -> only the stop codon found
#   false   -> neither
# The start codon is only searched when the first cell of the contig's
# gene_array row is positive (the contig holds the first exon); the stop codon
# is always searched.
def parse_contig_index(gene_array, contigs)
  gene_array.each_with_index do |row, i|
    start = nil # unknown
    start = contigs[i].start_codon_search if row.first > 0
    stop = contigs[i].stop_codon_search
    # Literal true/false replace the TRUE/FALSE constants, which Ruby 3 no
    # longer defines.
    contigs[i].completed =
      if start == true && stop == true
        true
      elsif start == true
        'start'
      elsif stop == true
        'stop'
      else
        false
      end
  end
end
|
110
|
+
|
111
|
+
# Picks, from a set of contigs, one contig flagged 'start' and one flagged
# 'stop'. When several candidates exist, the one whose first HSP of the first
# hit has the strictly higher score wins. Returns [start, stop]; either may be
# nil.
def sides_recovery(contigs)
  lead_score = ->(candidate) { candidate.hits.first.hsps.first.score }
  start = nil
  stop = nil
  contigs.each do |contig|
    if contig.completed == 'start' && (start.nil? || lead_score.call(start) < lead_score.call(contig))
      start = contig
    end
    if contig.completed == 'stop' && (stop.nil? || lead_score.call(stop) < lead_score.call(contig))
      stop = contig
    end
  end
  return start, stop
end
|
136
|
+
|
137
|
+
# Adds the given start/stop contigs when the set has none.
#
# +start+ is put in front when no contig is completed as 'start' or true;
# +stop+ is appended when no contig is completed as 'stop' or true. A nil
# +start+/+stop+ is never added. Returns the resulting array.
#
# As in the original: prepending builds a new array, while appending mutates
# whichever array is current (the caller's array when nothing was prepended).
def sides_add(contigs, start, stop)
  # Literal true replaces the TRUE/FALSE constants, which Ruby 3 no longer defines.
  has_start = contigs.any? { |contig| contig.completed == 'start' || contig.completed == true }
  has_stop = contigs.any? { |contig| contig.completed == 'stop' || contig.completed == true }

  contigs = [start] + contigs if !has_start && !start.nil?
  contigs << stop if !has_stop && !stop.nil?
  contigs
end
|
160
|
+
|
161
|
+
# Drops, from both cluster and gene_array (same index), every contig flagged
# 'stop' whose first hit has exactly one HSP and whose subject span
# (s_end - s_beg) is shorter than +length+. Both arrays are modified in place
# and compacted. Returns [gene_array, cluster].
def cluster_filter(gene_array, cluster, length)
  cluster.each_with_index do |contig, position|
    next unless contig.completed == 'stop'

    hsps = contig.hits.first.hsps
    tail = hsps.last
    next unless tail.s_end - tail.s_beg < length && hsps.count == 1

    cluster[position] = nil
    gene_array[position] = nil
  end
  cluster.compact!
  gene_array.compact!
  return gene_array, cluster
end
|
174
|
+
|
175
|
+
# Returns the position offset between two contigs, derived from where their
# HSPs fall on the protein: the query-start difference (last - current) plus
# three times the subject-start difference (current - last).
def coord_prot(last_contig_hsp, current_contig_hsp)
  query_shift = last_contig_hsp.q_beg - current_contig_hsp.q_beg
  subject_shift = 3 * (current_contig_hsp.s_beg - last_contig_hsp.s_beg)
  query_shift + subject_shift
end
|
179
|
+
|
180
|
+
# Reads the FASTA file at +path+ through FastaFile and returns a hash mapping
# each sequence name to its sequence.
def fasta_hash(path)
  sequences = {}
  FastaFile.new(path).each { |contig, seq_fasta| sequences[contig] = seq_fasta }
  sequences
end
|
188
|
+
|
189
|
+
# Writes the XHTML prologue (doctype, <html>, <head> with +title+, opening
# <body>) to +file+. +title+ must be a String.
def html_header(file, title)
  prologue = [
    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
    '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">',
    '<head>',
    '<meta http-equiv="content-type" content="text/html;charset=UTF-8" />',
    '<title>' + title + '</title>',
    '</head>',
    '<body>'
  ]
  file.puts(*prologue)
end
|
198
|
+
|
199
|
+
# Writes the closing </body> and </html> tags to +file+.
def html_footer(file)
  closing_tags = ['</body>', '</html>']
  file.puts(*closing_tags)
end
|
203
|
+
|
204
|
+
# Opens an HTML table on +file+ with the given +border+ and writes one header
# row. +headers+ is an array of Strings, one per column.
def html_table_header(file, border, headers)
  file.puts "<table border=\"#{border}\">", '<tr>'
  headers.each { |header| file.puts '<th>' + header + '</th>' }
  file.puts '</tr>'
end
|
212
|
+
|
213
|
+
# Writes one table row to +file+. +cells+ must be an array; each element is
# rendered with its string form inside its own <td>.
def html_row(file, cells)
  file.puts '<tr>'
  cells.each { |cell| file.puts '<td>' + cell.to_s + '</td>' }
  file.puts '</tr>'
end
|
220
|
+
|
221
|
+
# Returns an HTML anchor pointing to +link+ (a String) whose label is the
# string form of +text+.
def html_link(text, link)
  '<a href="' + link + %(">#{text}</a>)
end
|
225
|
+
|
226
|
+
# Closes the HTML table on +file+.
def html_table_footer(file)
  closing_tag = '</table>'
  file.puts(closing_tag)
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'dataset'
|
2
|
+
|
3
|
+
# Base parser. On construction it creates an empty dataset, parses the input
# file and loads the parsed data into the dataset. #parse_file and
# #load_dataset are empty here (they return nil).
class Parser
  attr_accessor :dataset

  # file: input handed to #parse_file.
  # type: accepted but not used by this class.
  def initialize(file, type = nil)
    @file = file
    @dataset = create_dataset
    # Build the data object first, then fill the dataset with its contents.
    parsed = parse_file(file)
    load_dataset(parsed)
  end

  # Returns a new Dataset labelled 'unknown' (the label is not used).
  def create_dataset
    Dataset.new('unknown')
  end

  # Hook: turns +file+ into a data object. Empty here.
  def parse_file(file); end

  # Hook: fills the dataset from +data+. Empty here.
  def load_dataset(data); end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'blast_type_parser'
|
2
|
+
require 'scbi_blast' #Si falla, buscar e instalar tb 'gem install xml-simple' de la q depende
|
3
|
+
|
4
|
+
|
5
|
+
# Parser for tabular BLAST output.
class ParserBlast < BlastTypeParser
  # Wraps +file+ in a BlastTableResult and returns it.
  def parse_file(file)
    BlastTableResult.new(file)
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'exonerate_result'
|
2
|
+
require 'blast_type_parser'
|
3
|
+
|
4
|
+
# Parser for exonerate output.
class ParserExonerate < BlastTypeParser
  # Wraps +file+ in an ExonerateResult, passing along @all, and returns it.
  def parse_file(file)
    ExonerateResult.new(file, @all)
  end

  # Copies the frameshifts reported by exonerate (query and subject) from
  # +item+ onto +contig+.
  def populate_extra_atributes(contig, item)
    contig.q_frameshift = item.q_frameshift
    contig.s_frameshift = item.s_frameshift
  end
end
|
@@ -0,0 +1,975 @@
|
|
1
|
+
require 'dataset'
|
2
|
+
require 'other_functions'
|
3
|
+
require 'report_gff'
|
4
|
+
require 'fileutils'
|
5
|
+
|
6
|
+
class Rebuild
|
7
|
+
# dataset and dataset_uni_hsp must be dataset objects; +path+ is a hash of
# paths, of which :exonerate_db is read here to load the database sequences.
def initialize(dataset, dataset_uni_hsp, path)
  @dataset, @dataset_uni_hsp, @path = dataset, dataset_uni_hsp, path
  @db_seqs = fasta_hash(path[:exonerate_db])
end
|
13
|
+
|
14
|
+
###############################################################################################################
|
15
|
+
# MAIN METHOD
|
16
|
+
###############################################################################################################
|
17
|
+
# Main method: builds the model contig of every cluster, writes the GFF files,
# the optional HTML index and the model FASTA, and reports pseudogenes.
#
# options[:rebuild] is forwarded to the modelling step and options[:web] to
# web_header. Errors raised while processing one cluster are handed to
# gene_error and the run continues with the next cluster.
def rebuild(options)
  gff_dataset_model = Dataset.new(:mix) # Holds the model contig for GeneAssembler's output
  gff_dataset = Dataset.new(:mix) # Holds the contigs hanging from the model
  file_error = File.open(@path[:error], 'w')
  file_web = web_header(options[:web], @path[:html])
  sequences_hash = {}
  gene_name = nil
  model = nil
  statistics = {:genes => 0, :total_recovered => 0, :total_overlap => 0, :total_fragmentation => 0}
  puts "\nMODELING GENE",'*******************************************'
  @dataset.each_cluster{|cluster|
    begin
      gene_name = cluster.first.first_hit.name if !cluster.nil?
      cluster_complete = cluster.dup
      # Gene reconstruction: alignment, contig discarding and gene assembly.
      model, length_model, length_cluster = iterative_modeling_gene_w_reference(cluster, @dataset.references_hash, options[:rebuild], sequences_hash)

      # GeneAssembler output (gff for Gbrowse)
      #--------------------------------------------------------
      if !model.nil?
        # Format contigs that are children of the model
        gff_dataset.clr_contigs
        gff_dataset.transfer_contigs(cluster_complete)
        gff_dataset.transfer_n_contigs_def_hit_type(@dataset_uni_hsp, cluster, 'pseudogene', 50) # Move pseudogenes to the report

        # Turn arrays into a contig and adjust the alignment by adding Ns
        model = correct_model(model, length_model, gff_dataset, sequences_hash)

        # Checks on the model
        exones = model.exones_s.length # Number of exons
        puts 'Exones: '+ exones.to_s
        recovered = recover_test(model)
        overlap = overlap_test(model)
        fragmentation = ((length_cluster-1.00)/exones).round(2)
        puts 'Fragmentation: ' + fragmentation.to_s

        # HTML index
        if !file_web.nil?
          gene_link = html_link(model.first_hit.name, @path[:gbrowse_link]+model.first_hit.name)
          html_row(file_web, [gene_link, cluster.first.first_hit.s_length, exones, recovered, overlap, fragmentation])
        end

        # Format model for Gbrowse
        gff_dataset_model.clr_contigs
        format_model(model) # Adds the _gene suffix to the model
        gff_dataset_model.transfer_contigs(model)

        # Write
        write_gbrowse_gff(gff_dataset_model, gff_dataset, @path[:gff], model.name)

        # General statistics
        statistics[:genes] += 1
        statistics[:total_recovered] += recovered
        statistics[:total_overlap] += overlap
        statistics[:total_fragmentation] += fragmentation
      end
    # NOTE(review): rescuing Exception also swallows Interrupt/SystemExit, so a
    # run cannot be aborted from inside a cluster; kept as-is because narrowing
    # it would change which failures are reported through gene_error.
    rescue Exception => e
      gene_error(e, gene_name, file_error, cluster_complete, model)
    end
    puts '* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *'
  }
  file_error.close
  web_body(file_web)

  # With no modelled gene the totals are still the Integer 0, and 0/0 raised
  # ZeroDivisionError before the FASTA could be written; report 0 instead.
  mean = lambda do |total|
    statistics[:genes] == 0 ? 0 : total/statistics[:genes]
  end
  puts "\nFINAL STATISTICS\n",
       'Recovered genes: '+ statistics[:genes].to_s,
       'Mean recover: ' + mean.call(statistics[:total_recovered]).to_s,
       'Mean overlap: ' + mean.call(statistics[:total_overlap]).to_s,
       'Mean fragmentation: ' + mean.call(statistics[:total_fragmentation]).to_s
  write_model_fasta(sequences_hash, @path[:fasta])
end
|
89
|
+
################################################################################################################################
|
90
|
+
# end main method
|
91
|
+
################################################################################################################################
|
92
|
+
|
93
|
+
# Builds a gene model for +cluster+ blindly and then once per reference listed
# in references_hash for the cluster's protein, keeping the best candidate.
#
# The sequence of the chosen model is stored in sequences_hash under the
# protein name. Returns [model, model length, cluster length].
def iterative_modeling_gene_w_reference(cluster, references_hash, options, sequences_hash)
  prot_reference = cluster.first.first_hit.name
  array_references = references_hash[prot_reference]

  # Blind gene model
  if $verbose
    puts "\n",'|||||||||| BLIND MODELING ||||||||||'
  end
  model, length, seq, length_cluster = modeling_gene(cluster.dup, nil, options)

  # eval_model modifies what it receives, hence the copy.
  recover, overlap = eval_model(model.dup, length)

  if $verbose
    puts "\nRecover: #{recover} Overlap: #{overlap}"
  end

  # Guided gene models
  if !array_references.nil?
    array_references.each do |ref|
      if $verbose
        puts "\n",'|||||||||| GUIDED MODELING ||||||||||'
      end

      guided_model, guided_length, guided_seq, guided_length_cluster = modeling_gene(cluster.dup, ref, options)
      next if guided_model.nil? # A model that failed to build is ignored

      guided_recover, guided_overlap = eval_model(guided_model.dup, guided_length)
      if $verbose
        puts "\nRecover: #{guided_recover} Overlap: #{guided_overlap}"
      end

      # The verdict must start from false for every reference. The original
      # set the flag once before the loop, so after the first accepted guided
      # model every later one was accepted without being evaluated.
      guided = false

      # Decision tree
      if guided_overlap <= 15 # Overlap of at most 15 %
        if guided_overlap >= overlap-overlap*0.05 && guided_overlap <= overlap+overlap*0.05 # Same overlap
          guided = true if guided_recover > recover
        else # Different overlap
          recover_dif = guided_recover-recover
          if recover_dif < 0 # The guided model recovers less than the current one
            # Accept when the loss of recovery is explained by the overlap going away
            if recover_dif.abs >= overlap-overlap*0.05 && recover_dif.abs <= overlap+overlap*0.05
              guided = true
            end
          elsif recover_dif > guided_overlap+guided_overlap*0.05
            # The gain in recovery is not just an equally large gain in overlap
            guided = true
          end
        end
      elsif guided_overlap < overlap # Always prefer the lower overlap, even when it is high
        guided = true
      end

      next unless guided

      model = guided_model
      length = guided_length
      seq = guided_seq
      length_cluster = guided_length_cluster
      recover = guided_recover
      overlap = guided_overlap
    end
  end

  sequences_hash[prot_reference] = seq
  return model, length, length_cluster
end
|
170
|
+
|
171
|
+
# Scores a model: returns [recover, overlap] as given by recover_test and
# overlap_test (both called with false as second argument).
#
# An Array model is first merged into a single contig whose length is set to
# +length+. The model is modified, so callers must pass a copy.
def eval_model(local_model, length)
  if local_model.is_a?(Array)
    local_model = array_contigs_to_contig(local_model)
    local_model.length = length
  end
  # Literal false replaces the FALSE constant, which Ruby 3 no longer defines.
  recover = recover_test(local_model, false)
  overlap = overlap_test(local_model, false)
  return recover, overlap
end
|
182
|
+
|
183
|
+
# Selects the contigs that make up the gene model and, when +rebuild+ is set,
# builds the model from them.
#
# Returns [model, model length, sequence, number of contigs left in the
# cluster]; the first three are nil when no model was built.
def modeling_gene(cluster, reference, rebuild)
  model = nil
  model_length = nil
  seq = nil

  # Iterative contig reduction: repeat until the width of the gene array stops
  # changing, which removes minor fragments that could pass for new exons.
  gene_array = []
  width_before = nil
  loop do
    cluster, gene_array = gene_array_and_compact(rebuild, cluster, reference)
    width_after = length2D(gene_array)
    stable = width_after == width_before
    width_before = width_after
    break if stable
  end
  length_cluster = cluster.length

  # Gene modelling
  if rebuild && !cluster.empty? && !gene_array.empty?
    # Adjacent contigs are merged; a single contig needs no merging.
    cluster_comp = cluster.length > 1 ? contig_compact(cluster) : cluster
    if cluster_comp.nil?
      puts cluster.first.first_hit.name+"\tGENE MODEL ABORTED"
    else
      model, model_length, seq = gene_model_cut(cluster_comp, reference)
    end
  end
  return model, model_length, seq, length_cluster
end
|
220
|
+
|
221
|
+
# Builds the exon array of +cluster+ and discards redundant contigs.
#
# When +rebuild+ is set, the HSPs of the cluster are laid out so that
# overlapping ones share a column, and gene_compact then drops the redundant
# contigs from both the array and the cluster (in place). With $verbose set,
# statistics and reports are printed along the way.
# Returns [cluster, gene_array]; gene_array is [] when +rebuild+ is not set.
def gene_array_and_compact(rebuild, cluster, reference)
  # Start from an empty array: the original left gene_array nil when rebuild
  # was falsy, which made the caller fail on length2D/empty?.
  gene_array = []

  # Build the exon array (with reference)
  if rebuild
    gene_array, gene_array_introns = build_gene_array(cluster, reference)
    if $verbose
      gene_stadistics_report(gene_stadistics(gene_array), 'EXONS')
      gene_stadistics_report(gene_stadistics(gene_array_introns), 'INTRONS')
    end
  end

  # Contig selection for gene modelling
  gene_array_report(gene_array, cluster, rebuild) if $verbose # Cluster before compacting
  # Redundant contigs are discarded; those covering the whole gene remain.
  gene_compact(gene_array, cluster) if rebuild
  gene_array_report(gene_array, cluster, rebuild) if $verbose && rebuild # Cluster after compacting
  return cluster, gene_array
end
|
247
|
+
|
248
|
+
def add_uni_hsp(model,cluster)#Compara contigs uni-hsp con contig modelo para determinar pseudogenes
|
249
|
+
contigs_uni_hsp=''
|
250
|
+
is_contig=0
|
251
|
+
pseudogenes=[]
|
252
|
+
@clusters_uni_hsp.each do |contigs|
|
253
|
+
if contigs.first.first_hit.name==cluster.first.first_hit.name
|
254
|
+
contigs_uni_hsp=contigs
|
255
|
+
is_contig=1
|
256
|
+
break
|
257
|
+
end
|
258
|
+
end
|
259
|
+
|
260
|
+
if is_contig==1 #Si se ha encontrado contigs uni-hsp se realiza la comparacion
|
261
|
+
if model.class.to_s!='Array'
|
262
|
+
model=[model]
|
263
|
+
end
|
264
|
+
model.each do |item|
|
265
|
+
contigs_uni_hsp.each do |contig|
|
266
|
+
start,exons=item.compare(contig)
|
267
|
+
if exons>1 && !pseudogenes.include?(contig)
|
268
|
+
pseudogenes << contig
|
269
|
+
end
|
270
|
+
end
|
271
|
+
end
|
272
|
+
end
|
273
|
+
return pseudogenes
|
274
|
+
end
|
275
|
+
|
276
|
+
# Builds two arrays (exons and introns) with one row per contig, describing
# the relative position of all contigs to each other.
#
# Each row starts with as many 0 cells as positions the contig does not cover,
# followed by the contig's exons (exones_s) or introns (intrones_q). The offset
# comes from contig.compare against the previous contig or, when +reference+
# is given, always against the reference. With a reference, a contig that does
# not match it (first exon == -1) aborts the alignment and both arrays are
# returned empty.
# Returns [gene_array, gene_array_introns].
def build_gene_array(contigs, reference = nil)
  gene_array = []
  gene_array_introns = []
  last_contig = reference.nil? ? '' : reference
  contigs.each do |contig|
    array_contig = []
    array_contig_introns = []

    # Work out the empty positions
    if !gene_array.empty? || reference
      # Compare with the contig of the previous iteration (or the reference)
      first_exon, _ex = contig.compare(last_contig)
      if reference && first_exon == -1 # A contig that does not match the reference aborts the alignment
        if $verbose
          puts "\n#{contig.name} alignment step OUT OF RANGE"
        end
        gene_array = []
        gene_array_introns = []
        break
      end
      void_positions =
        if first_exon == -1
          gene_array.last.count # No overlap: skip the whole previous row
        elsif reference
          first_exon # Position of the contig relative to the reference
        else
          first_exon + gene_array.last.count(0) # Overlap with the previous contig
        end
      void_positions.times do
        array_contig << 0 # No exon at this position
        array_contig_introns << 0 # No intron at this position
      end
    end

    # Append the exons and introns of the contig
    array_contig << contig.exones_s
    array_contig_introns << contig.intrones_q
    # flatten instead of flatten!: the bang form returns nil when nothing had
    # to be flattened, which would store nil as the row.
    gene_array << array_contig.flatten
    gene_array_introns << array_contig_introns.flatten
    last_contig = contig if reference.nil?
  end
  return gene_array, gene_array_introns
end
|
328
|
+
|
329
|
+
# Counts, for every column of gene_array, how many different values it holds,
# ignoring 0 cells and missing cells. Returns one count per column.
def gene_stadistics(gene_array)
  Array.new(length2D(gene_array)) do |column|
    distinct = gene_array.each_with_object([]) do |row, seen|
      value = row[column]
      seen << value unless seen.include?(value) || value == 0
    end
    # Rows shorter than the column index contribute nil; drop it.
    distinct.compact.length
  end
end
|
347
|
+
|
348
|
+
# Prints one statistics line (for exons or introns): a blank line, +tag+, and
# every value of exons_stadistic, all tab-separated.
def gene_stadistics_report(exons_stadistic, tag)
  cells = exons_stadistic.map { |item| "#{item}\t" }
  print "\n#{tag}\t", *cells
  print "\n"
end
|
355
|
+
|
356
|
+
# Prints the array produced by build_gene_array (only when +act_array+ is set)
# followed by a drawing of every contig.
def gene_array_report(gene_array, contigs, act_array)
  if act_array
    puts "\nGENE ARRAY"
    gene_array.each_with_index do |fila, c|
      owner = contigs[c]
      print "#{owner.name.center(24)}\t "
      print "#{owner.completed}\t"
      fila.each { |item| print "#{item}\t" }
      puts "\n"
    end
  end

  puts "\nMAP"
  contigs.each do |contig|
    # The name goes out before draw is called, as in the original.
    print "#{contig.name.center(25)}"
    print contig.draw
  end
end
|
375
|
+
|
376
|
+
# Reduces the gene model to non-redundant contigs. Rows of gene_array are
# compared pairwise and the losing row, together with the contig at the same
# index, is removed. Both arrays are modified in place.
#
# Rows of equal length: the one with fewer exons (cells > 0) loses; on a tie
# the contig whose first HSP of the first hit has the lower score loses.
# Longer row vs shorter row: the shorter one loses when the longer has no more
# 0 cells than it.
# Shorter row vs longer row: the shorter one loses when both have the same
# number of 0 cells.
def gene_compact(gene_array, contigs)
  discard = lambda do |position|
    gene_array[position] = nil
    contigs[position] = nil
  end
  exon_total = ->(row) { row.count { |cell| cell > 0 } }
  lead_score = ->(position) { contigs[position].first_hit.first_hsp.score }

  gene_array.each_with_index do |row, i|
    next unless row

    row_exons = exon_total.call(row)
    gene_array.each_with_index do |other, j|
      next if !other || i == j # Skip discarded rows and self-comparison

      if row.length == other.length
        other_exons = exon_total.call(other)
        if other_exons == row_exons
          if lead_score.call(i) >= lead_score.call(j)
            discard.call(j)
          else
            discard.call(i)
            break
          end
        elsif other_exons > row_exons
          discard.call(i)
          break
        else
          discard.call(j)
        end
      elsif row.length > other.length
        discard.call(j) if row.count(0) <= other.count(0)
      elsif row.length < other.length
        if row.count(0) == other.count(0)
          discard.call(i)
          break
        end
      end
    end
  end
  gene_array.compact!
  contigs.compact!
end
|
429
|
+
|
430
|
+
# Takes a set of contigs, finds the ones that are consecutive on the subject
# (the next contig starts 0 or 1 positions after the previous one ends),
# merges their sequences, realigns the merged sequences with the external
# `exonerate` program and returns an array with the resulting contigs.
#
# Reads @path[:local] (working directory for the temporary .fasta/.db/.ex
# files) and @db_seqs (subject sequences indexed by hit name).
#
# Returns:
# * +contigs+ itself when no pair of contigs is consecutive;
# * a copy of the original list when exonerate does not yield exactly one
#   contig per merged sequence;
# * otherwise a new array where each run of consecutive contigs is replaced
#   by the realigned merged contig.
def contig_compact(contigs)
  cn_def = []
  cn_backup = contigs.dup
  cn_to_merge = []
  s_end = nil

  # Flag consecutive, non-overlapping contigs: fusion[i] is true when
  # contig i must be merged with contig i-1.
  # (true/false literals: the TRUE/FALSE constants do not exist on Ruby 3.x.)
  fusion = Array.new(contigs.length, false)
  contigs.each_with_index do |contig, i|
    if i > 0
      diference = contig.first_hit.first_hsp.s_beg - s_end
      fusion[i] = true if diference == 0 || diference == 1
    end
    s_end = contig.first_hit.last_hsp.s_end
  end

  return contigs unless fusion.include?(true)

  # Build the groups of contigs to merge and keep the untouched ones.
  # An Integer stored in cn_def is a placeholder for the merged group with
  # that index; it is replaced by the realigned contig further down.
  fusion_contigs = []
  count = 0 # index of the next merged group
  fusion.each_with_index do |cont, i|
    if cont
      fusion_contigs << contigs[i - 1] unless fusion_contigs.include?(contigs[i - 1])
      fusion_contigs << contigs[i] unless fusion_contigs.include?(contigs[i])
    else
      unless fusion_contigs.empty? # close the current group
        cn_to_merge << fusion_contigs
        fusion_contigs = []
        cn_def << count
        count += 1
      end
      # Keep contigs that do not take part in any merge.
      cn_def << contigs[i] unless fusion[i + 1]
    end
    # End-of-loop control: flush a group that reaches the last contig.
    if i + 1 == fusion.length && !fusion_contigs.empty?
      cn_to_merge << fusion_contigs
      cn_def << count
      count += 1
    end
  end

  hit_name = contigs.first.first_hit.name
  fasta_path = File.join(@path[:local], hit_name + '.fasta')
  db_path = File.join(@path[:local], hit_name + '.db')
  ex_path = File.join(@path[:local], hit_name + '.ex')

  # Write the FASTA with the merged contigs and the subject "database".
  # Block form guarantees the handles are closed even if a write fails.
  contigs_merge = contigs_seq_merge(cn_to_merge)
  unless contigs_merge.empty?
    File.open(fasta_path, 'w') do |temp|
      contigs_merge.each_with_index do |seq, i|
        temp.puts ">Fusion_#{i}\n#{seq}"
      end
    end
    File.open(db_path, 'w') do |temp_db|
      temp_db.puts ">#{hit_name}\n#{@db_seqs[hit_name]}"
    end
  end

  # Exonerating (LINUX command line).
  # NOTE(review): paths are interpolated unescaped into a shell command;
  # confirm hit names can never contain shell metacharacters.
  cmd = "exonerate -q #{db_path} -t #{fasta_path} -Q protein -T dna -m protein2genome --percent 1 --showalignment 0 --useaatla 1 --showvulgar > #{ex_path}"
  system(cmd)

  # Parsing exonerate
  local = ParserExonerate.new('contig', 'nucleotide_match', ex_path)
  store_local_ex = local.dataset
  store_local_ex.score_correction(30)

  if store_local_ex.contig_count == contigs_merge.length
    # Restore attributes on the realigned contigs and drop each one into the
    # placeholder slot of its merged group.
    store_local_ex.each_contig_with_index { |contig, i|
      contig.seq = contigs_merge[i]
      contig.length = contigs_merge[i].length
      contig.first_hit.s_length = contigs.first.first_hit.s_length
      cn_def.each_with_index do |contig_def, j|
        cn_def[j] = contig if contig_def == i
      end
    }
  else
    cn_def = cn_backup
  end

  return cn_def
end#def
|
532
|
+
|
533
|
+
# Returns an array with one merged sequence per group in +contigs+.
#
# +contigs+ is an array of groups (arrays of contigs). Within a group the
# sequences are concatenated in order, separated by a gap of ten 'n'.
# While the accumulated sequence is still empty, the next contig's sequence
# simply replaces it (so leading empty sequences add no gap).
def contigs_seq_merge(contigs)
  spacer = 'n' * 10
  contigs.map do |group|
    group.inject('') do |joined, member|
      joined.empty? ? member.seq : joined + spacer + member.seq
    end
  end
end
|
549
|
+
|
550
|
+
# Builds a gene model by cutting and splicing the given contigs.
#
# Walks +contigs+ in order, accumulating exon coordinates (query and subject)
# and the matching sequence pieces. Each contig is either appended after the
# previous one (no overlap, optionally flagged as a discontinuity with a
# zero-coordinate marker and a ten-'n' gap) or spliced over the overlapping
# exons of the previous contig, keeping the exons of whichever contig has the
# better score. When +reference+ is given, positions are guided by it and the
# modelling is aborted as soon as one contig does not align with the
# reference or runs past its last exon.
#
# Returns [model, model_length, final_seq]:
# * model        - value returned by void_contig (a contig, or an array of
#                  contigs when discontinuities were found); nil when aborted.
# * model_length - only computed when model is an Array, otherwise nil.
# * final_seq    - the joined sequence pieces; nil when aborted.
def gene_model_cut(contigs, reference=nil)
  q_beg = []
  q_end = []
  s_beg = []
  s_end = []
  seq = []
  last_contig = nil
  last_score = 0
  length_model = 0
  multiple_lengths = []
  add_length = true
  add_last = 0
  last_position_ref = nil
  last_position = nil
  out_of_range = false
  contigs.each do |contig|
    score = contig.first_hit.first_hsp.score/contig.length*contig.exon_acumulative
    n_exones = contig.first_hit.hsp_count

    # FIRST CONTIG
    #-------------------------------------------------------
    if last_contig.nil?
      q_end_seq = nil # SEQ
      contig.first_hit.each_hsp_with_index{|hsp,i|
        q_beg << hsp.q_beg
        q_end << hsp.q_end
        s_beg << hsp.s_beg
        s_end << hsp.s_end
        # SEQ.................................
        if i==0
          seq << contig.seq[0..contig.first_hit.first_hsp.q_end-1]
        elsif i+1==n_exones
          seq << contig.seq[contig.first_hit.hsps[i-1].q_end..contig.length-1]
        else
          seq << contig.seq[q_end_seq..hsp.q_end-1]
        end
        q_end_seq = hsp.q_end
        # ...................................
      }
      length_model += contig.length
      if !reference.nil? # Place the first contig on the reference
        last_position_ref, _ex = contig.compare(reference)
        # Abort modelling when a contig does not align with the reference or
        # runs past it. The check must look at last_position_ref (the value
        # just computed); last_position is always nil for the first contig.
        if last_position_ref==-1 || last_position_ref+contig.first_hit.hsp_count-1 > reference.first_hit.hsp_count
          puts contig.name+' OUT OF RANGE'
          out_of_range = true
          break
        end
      end

    # OTHER CONTIG
    #--------------------------------------------------------
    else
      position_overlap, _ex = contig.compare(last_contig)

      # Correct the contig position using the reference
      if !reference.nil?
        last_position_ref, position_overlap = position_reference_guided(contig,last_contig,last_position_ref,reference)
        if last_position_ref==-1 || last_position_ref+contig.first_hit.hsp_count-1 > reference.first_hit.hsp_count
          out_of_range = true
          puts contig.name+' OUT OF RANGE'
          break
        end
      end

      # NOT OVERLAP
      #..........................
      if position_overlap==-1 || contig.first_hit.hsp_count==1
        if contig.first_hit.first_hsp.s_beg-last_contig.first_hit.last_hsp.s_end>1
          # Flag a discontinuity when the contig does not directly follow the
          # previous one: zero coordinates act as a separator.
          q_beg << 0
          q_end << 0
          s_beg << 0
          s_end << 0
          multiple_lengths << length_model
          length_model = contig.length
          last = length_model
          add_length = false
          seq.last << 'n'*10 # SEQ: gap marker
        else
          last = length_model # Keep previous length to shift this contig's coordinates
          length_model += contig.length
        end

        q_end_seq = nil # SEQ
        contig.first_hit.hsps.each_with_index do |hsp,i|
          add_no = last
          if !add_length
            add_no = 0
          end
          q_beg << hsp.q_beg+add_no # The model length is added to the coordinates
          q_end << hsp.q_end+add_no
          s_beg << hsp.s_beg
          s_end << hsp.s_end
          # SEQ.................................
          if i==0
            cn = contig.seq[0..contig.first_hit.first_hsp.q_end-1]
            # Swap the case of the first two characters of the piece.
            # swapcase (not swapcase!) so nothing is lost when there is no
            # cased character to swap.
            cs = "#{cn[0..1].swapcase}#{cn[2..-1]}"
            seq << cs
          elsif i+1==n_exones
            seq << contig.seq[contig.first_hit.hsps[i-1].q_end..contig.length-1]
          else
            seq << contig.seq[q_end_seq..hsp.q_end-1]
          end
          q_end_seq = hsp.q_end
          # ...................................
        end

      # OVERLAP
      #..........................
      else
        if last_position==-1
          add_last = length_model-last_contig.length
        end
        overlap = last_contig.first_hit.hsp_count-position_overlap
        if last_contig.first_hit.hsp_count ==1
          overlap = 1
        end
        add = 0
        dif = 0
        if last_score>=score
          add = last_contig.first_hit.last_hsp.q_end-contig.first_hit.first_hsp.q_end
          dif = add
          if overlap>1
            # Drop the last exon of 'last_contig' and replace it with the
            # second exon of 'contig', which is more reliable because it is
            # internal. Since the last exon is dropped, align on the
            # second-to-last one.
            add = last_contig.first_hit.hsp_at(last_contig.first_hit.hsp_count-overlap).q_end-contig.first_hit.first_hsp.q_end
            dif = contig.first_hit.hsp_at(overlap-1).q_end
            q_beg = q_beg.reverse.drop(1).reverse
            q_end = q_end.reverse.drop(1).reverse
            s_beg = s_beg.reverse.drop(1).reverse
            s_end = s_end.reverse.drop(1).reverse
            seq = seq.reverse.drop(1).reverse # SEQ
          end
          if overlap==1
            overlap = 2
          end
          (contig.first_hit.hsp_count-(overlap-1)).times do |n| # Add the remaining exons of the contig to the model
            q_beg << contig.first_hit.hsp_at(n+overlap-1).q_beg+add+add_last
            q_end << contig.first_hit.hsp_at(n+overlap-1).q_end+add+add_last
            s_beg << contig.first_hit.hsp_at(n+overlap-1).s_beg
            s_end << contig.first_hit.hsp_at(n+overlap-1).s_end
            # SEQ.......................................
            position_hsp = n+overlap-2
            if position_hsp <0
              position_hsp = 0
            end
            position_next_hsp = n+overlap-1
            if position_next_hsp < 0
              position_next_hsp = 0
            end

            if n==0
              cn = contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.first_hit.hsp_at(position_next_hsp).q_end-1]
              cs = cn[0..1].swapcase+cn[2..-1]
              seq << cs
            elsif position_next_hsp==contig.first_hit.hsp_count-1
              seq << contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.length-1]
            else
              seq << contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.first_hit.hsp_at(position_next_hsp).q_end-1]
            end
            # ............................................
          end
        else
          hsp_position = last_contig.first_hit.hsp_count-2
          if hsp_position<0 # for contigs that only have one hsp
            hsp_position = 0
          end
          add = last_contig.first_hit.hsp_at(hsp_position).q_end
          dif = last_contig.length-add
          drop = 1
          correction = 0
          if overlap>1
            drop = overlap-1
            correction = 1
            add = last_contig.first_hit.hsp_at(position_overlap).q_end-contig.first_hit.first_hsp.q_end
            dif = length_model-(add+add_last)
          end
          # Remove the bad exons of 'last_contig' (the first one of the overlap is kept)
          q_beg = q_beg.reverse.drop(drop).reverse
          q_end = q_end.reverse.drop(drop).reverse
          s_beg = s_beg.reverse.drop(drop).reverse
          s_end = s_end.reverse.drop(drop).reverse
          seq = seq.reverse.drop(drop).reverse # SEQ

          # Add the exons of 'contig' (except the first one)
          (contig.first_hit.hsp_count-correction).times do |n|
            q_beg << contig.first_hit.hsp_at(n+correction).q_beg+add+add_last
            q_end << contig.first_hit.hsp_at(n+correction).q_end+add+add_last
            s_beg << contig.first_hit.hsp_at(n+correction).s_beg
            s_end << contig.first_hit.hsp_at(n+correction).s_end
            # SEQ.............................................................
            if n+1==(contig.first_hit.hsp_count-correction)
              n_correction = n+correction-1
              if n_correction < 0
                n_correction = 0
              end
              seq << contig.seq[contig.first_hit.hsp_at(n_correction).q_end..contig.length-1]
            elsif n==0
              if n+correction==0
                cn = contig.seq[0..contig.first_hit.hsp_at(n+correction).q_end-1] # n+correction starts at the first exon of the contig
              else
                cn = contig.seq[contig.first_hit.hsp_at(n+correction-1).q_end..contig.first_hit.hsp_at(n+correction).q_end-1]
              end
              cs = cn[0..1].swapcase+cn[2..-1]
              seq << cs
            else
              seq << contig.seq[contig.first_hit.hsp_at(n+correction-1).q_end..contig.first_hit.hsp_at(n+correction).q_end-1]
            end
            # ................................................................
          end
        end
        length_model += (contig.length-dif)
        add_length = true
        add_last += add
      end
    end
    last_position = position_overlap
    last_contig = contig
    last_score = score
  end
  if !multiple_lengths.empty?
    # Discontinuities were found: hand void_contig one length per fragment.
    multiple_lengths << length_model
    length_model = multiple_lengths
  end

  model = nil
  if !out_of_range # Build the model only if every contig aligned with the reference
    model = void_contig(contigs.first.first_hit.name+'_model',length_model,contigs.first.first_hit.s_length,q_beg,q_end,s_beg,s_end,'contig','gene','exon')
    # Merge contigs under sequence reference
    model_length = nil
    if model.instance_of?(Array)
      add = 0
      model.each_with_index do |contig,i|
        contig.modified_coordenates(add)
        add += contig.length
        if i<model.length-1
          add += 10 # ten-'n' gap between fragments
        end
      end
      # NOTE(review): add already includes the last fragment's length at this
      # point, so it appears to be counted twice here - confirm intent.
      model_length = add+model.last.length
    end

    final_seq = seq.join
  else # Do not build a model if at least one contig does not align with the reference
    model_length = nil
    final_seq = nil
  end

  return model, model_length, final_seq
end
|
802
|
+
|
803
|
+
# Builds contig object(s) from the supplied coordinates.
#
# q_beg/q_end/s_beg/s_end are parallel arrays with one entry per HSP. A zero
# in +q_beg+ acts as a separator between fragments (unless +single+ is set).
#
# contig_name   - name for the contig and its hit; when +contig_length+ is an
#                 Array each fragment is named "<contig_name>_<n>".
# contig_length - an Integer, or an Array with one length per fragment.
# s_length      - subject length passed to Contig#add_hit.
# contig_type, hit_type, hsp_type - values stored in the +type+ attribute of
#                 the contig, its hit and each HSP.
# single        - when true every entry is added to one contig, zeros included.
#
# Returns a single Contig when +single+ is set or +q_beg+ contains no zero;
# otherwise an Array of contigs (empty Array when +q_beg+ is empty).
def void_contig(contig_name,contig_length,s_length,q_beg,q_end,s_beg,s_end,contig_type,hit_type,hsp_type,single=false)
  contigs = []
  contig = nil
  n = 0 # index of the fragment being built
  last_index = q_beg.length - 1
  one_piece = single || !q_beg.include?(0)

  q_beg.each_with_index do |item, ind|
    if item > 0 || single
      if contig.nil?
        # First HSP of a fragment: create the contig and its hit.
        if contig_length.is_a?(Array)
          length = contig_length[n]
          name = "#{contig_name}_#{n}"
        else
          length = contig_length
          name = contig_name
        end
        contig = Contig.new(name)
        contig.length = length
        contig.type = contig_type
        hit_v = contig.add_hit(contig_name, s_length, 1, :prot)
        hit_v.type = hit_type
      end
      hsp_v = contig.first_hit.add_hsp(q_beg[ind], q_end[ind], s_beg[ind], s_end[ind], 0, 0, 0, 0)
      hsp_v.type = hsp_type
    end

    # Close the current fragment on a separator or on the last entry.
    if (item == 0 && !contig.nil? && !single) || ind == last_index
      if one_piece
        contigs = contig
      else
        contigs << contig
      end
      n += 1
      contig = nil
    end
  end
  return contigs
end
|
839
|
+
|
840
|
+
# Locates +contig+ on +reference+ and derives its overlap with the previous
# contig from the two reference positions.
#
# Returns [position_ref, position_overlap]:
# * position_ref     - first value returned by contig.compare(reference).
# * position_overlap - nil when +last_position_ref+ is nil; -1 when the contig
#                      starts past the last exon covered by +last_contig+
#                      (no overlap); otherwise the absolute distance between
#                      both reference positions.
def position_reference_guided(contig, last_contig, last_position_ref, reference)
  ref_index, _unused = contig.compare(reference)
  overlap_index = nil

  unless last_position_ref.nil?
    last_covered = last_position_ref + (last_contig.first_hit.hsp_count - 1)
    overlap_index = if ref_index <= last_covered
                      (last_position_ref - ref_index).abs # overlap
                    else
                      -1 # no overlap
                    end
  end

  return ref_index, overlap_index
end
|
851
|
+
|
852
|
+
# Collapses an array of contigs into one new Contig.
#
# The new contig takes the name of the first element, receives the hits of
# every element through Contig#transfer_contig_hits (in order) and takes the
# length of the last element.
def array_contigs_to_contig(array_contigs)
  merged = Contig.new(array_contigs.first.name)
  array_contigs.each { |part| merged.transfer_contig_hits(part) }
  merged.length = array_contigs.last.length
  merged
end
|
860
|
+
|
861
|
+
# Reports a failure while processing a gene.
#
# Prints "<gene_name> ERROR" to standard output and appends to +file_error+:
# the gene name, the exception message, its backtrace (one frame per line),
# a separator, the name of every contig in +cluster+ and a closing rule.
#
# e          - the Ruby exception object.
# file_error - an open, writable IO-like object.
# model      - accepted for interface compatibility; not used here.
def gene_error(e, gene_name, file_error, cluster, model)
  $stdout.puts(gene_name + ' ERROR')

  heading = "\n" + gene_name + "\n.............................."
  file_error.puts(heading)
  file_error.puts(e.message)
  e.backtrace.each { |frame| file_error.puts(frame) }

  file_error.puts(',,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,')
  cluster.each { |member| file_error.puts(member.name) }
  file_error.puts('----------------------------------------------------------------------------------------')
end
|
876
|
+
|
877
|
+
# Computes the percentage of the subject covered by overlaps in +model+.
#
# model  - object responding to +overlap+ (collection of overlap lengths;
#          the sign flip below suggests they are negative - TODO confirm)
#          and +first_hit.s_length+.
# output - when true (default) a warning listing each overlap (multiplied
#          by -3) and the total percentage is printed to standard output.
#
# Returns 0 when there are no overlaps, otherwise
# (sum of overlaps * -100.0 / s_length) rounded to two decimals.
def overlap_test(model,output=true)
  perc_overlap = 0
  overlap = model.overlap
  total = 0
  unless overlap.empty?
    print 'WARNING: overlap/s ' if output
    overlap.each do |length_overlap|
      print((length_overlap * -3).to_s + ', ') if output
      total += length_overlap
    end
    perc_overlap = (total * -100.0 / model.first_hit.s_length).round(2)
    puts 'nt. % Total overlap ' + perc_overlap.to_s if output
  end
  return perc_overlap
end
|
898
|
+
|
899
|
+
# Computes the percentage of the subject recovered by +model+.
#
# model  - object responding to +exones_s+ (collection of numeric values,
#          one per exon) and +first_hit+ (with +s_length+ and +name+).
# output - when true (default) prints "Recovered\t<hit name>\t<percentage>".
#
# Returns (sum of exones_s * 100.0 / s_length) rounded to two decimals.
def recover_test(model,output=true)
  recovered = 0
  model.exones_s.each do |exon|
    recovered += exon
  end
  recovered = (recovered * 100.0 / model.first_hit.s_length).round(2)
  puts "Recovered\t" + model.first_hit.name + "\t#{recovered}" if output
  return recovered
end
|
910
|
+
|
911
|
+
# Opens the HTML gene index and writes its header.
#
# web  - when falsy nothing is done and nil is returned.
# path - file to create (opened in 'w' mode).
#
# Returns the still-open File after html_header and html_table_header have
# been called on it; the caller is responsible for closing it.
def web_header(web, path)
  return nil unless web

  columns = ['Gene model name', 'Protein length', 'Num exon', '% recovered protein', '% overlapping sequence', 'Fragmentation']
  index_page = File.open(path, 'w')
  html_header(index_page, 'Gene index')
  html_table_header(index_page, 1, columns)
  index_page
end
|
920
|
+
|
921
|
+
# Finishes the HTML gene index: writes the table footer and the page footer,
# then closes the file. Does nothing when +file_web+ is nil.
def web_body(file_web)
  return if file_web.nil?

  html_table_footer(file_web)
  html_footer(file_web)
  file_web.close
end
|
928
|
+
|
929
|
+
# Writes every model sequence to +path+ in FASTA format.
#
# sequences_hash - name => sequence pairs; each record is written as
#                  ">" + name + "_model" followed by the sequence.
# path           - output file, truncated/created ('w' mode).
#
# The block form of File.open closes the file even when a record raises
# (e.g. a nil sequence), so no handle is leaked. Returns nil.
def write_model_fasta(sequences_hash, path)
  File.open(path, 'w') do |model_file|
    sequences_hash.each do |model|
      model_file.puts '>' + model[0] + "_model\n" + model[1]
    end
  end
  nil
end
|
936
|
+
|
937
|
+
# Writes the model dataset and then the contig dataset to the same GFF file
# through ReportGff (both built with mode 's' and created with 'a'; the
# second call also receives +name+).
#
# Returns whatever the second ReportGff#create call returns.
def write_gbrowse_gff(gff_dataset_model, gff_dataset, path, name)
  ReportGff.new(gff_dataset_model, path, 's').create('a')
  ReportGff.new(gff_dataset, path, 's').create('a', name)
end
|
943
|
+
|
944
|
+
# Renames the hits of +model+ so they read as genes.
#
# With more than one hit (model.n_hits? > 1) every hit name gets the suffix
# "_gene_<index>"; otherwise the first hit name gets the suffix "_gene".
def format_model(model)
  hit_total = model.n_hits?

  unless hit_total > 1
    return model.first_hit.name = model.first_hit.name + '_gene'
  end

  model.each_hit_with_index { |hit, idx|
    hit.name = hit.name + "_gene_#{idx}"
  }
end
|
953
|
+
|
954
|
+
# Normalises a freshly built model and aligns the dataset contigs to it.
#
# model          - a contig, or an Array of contigs (fragmented model). An
#                  Array is collapsed with array_contigs_to_contig, its
#                  trailing "_0" fragment suffix is removed from the name and
#                  its length is set to +length_model+.
# length_model   - total length to assign to a collapsed model.
# gff_dataset    - must respond to correct_left_side_contigs(model), whose
#                  result is used as a left-side offset, and to
#                  align_contigs(model).
# sequences_hash - sequences indexed by model name without the "_model"
#                  suffix; the entry for this model is prefixed in place
#                  with as many 'n' as the offset when the offset is positive.
#
# Only the trailing "_0" / "_model" suffixes are stripped: removing every
# occurrence would corrupt names that contain those substrings elsewhere
# (e.g. "gene_01_model_0") and make the sequences_hash lookup miss.
#
# Returns the (possibly collapsed) model.
def correct_model(model, length_model, gff_dataset, sequences_hash)
  if model.instance_of?(Array)
    model = array_contigs_to_contig(model)
    model.name = model.name.sub(/_0\z/, '')
    model.length = length_model
  end

  correct_add_Ns = gff_dataset.correct_left_side_contigs(model)
  model.modified_coordenates(correct_add_Ns)
  model.length += correct_add_Ns
  gff_dataset.align_contigs(model)

  # Fix the sequence so it lines up with the generated features.
  if correct_add_Ns > 0
    seq_key = model.name.sub(/_model\z/, '')
    sequences_hash[seq_key] = 'n' * correct_add_Ns + sequences_hash[seq_key]
  end

  return model
end
|
974
|
+
|
975
|
+
end
|