gene_assembler 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +22 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +2 -0
- data/bin/GeneAssembler +233 -0
- data/bin/phytozome_scan +60 -0
- data/gene_assembler.gemspec +25 -0
- data/lib/gene_assembler.rb +5 -0
- data/lib/gene_assembler/blast_type_parser.rb +41 -0
- data/lib/gene_assembler/contig.rb +643 -0
- data/lib/gene_assembler/dataset.rb +532 -0
- data/lib/gene_assembler/exonerate_result.rb +230 -0
- data/lib/gene_assembler/gff_contig.rb +67 -0
- data/lib/gene_assembler/gff_dataset.rb +152 -0
- data/lib/gene_assembler/gff_feature.rb +175 -0
- data/lib/gene_assembler/gff_frameshift.rb +6 -0
- data/lib/gene_assembler/gff_go.rb +13 -0
- data/lib/gene_assembler/gff_hit.rb +53 -0
- data/lib/gene_assembler/gff_hsp.rb +6 -0
- data/lib/gene_assembler/gff_localization.rb +6 -0
- data/lib/gene_assembler/gff_master_feature.rb +5 -0
- data/lib/gene_assembler/gff_parser.rb +35 -0
- data/lib/gene_assembler/gff_snp.rb +21 -0
- data/lib/gene_assembler/gff_stop.rb +6 -0
- data/lib/gene_assembler/go.rb +13 -0
- data/lib/gene_assembler/hit.rb +191 -0
- data/lib/gene_assembler/hsp.rb +100 -0
- data/lib/gene_assembler/other_functions.rb +228 -0
- data/lib/gene_assembler/parser.rb +25 -0
- data/lib/gene_assembler/parser_blast.rb +12 -0
- data/lib/gene_assembler/parser_exonerate.rb +16 -0
- data/lib/gene_assembler/rebuild.rb +975 -0
- data/lib/gene_assembler/report.rb +13 -0
- data/lib/gene_assembler/report_gff.rb +30 -0
- data/lib/gene_assembler/snp.rb +13 -0
- data/lib/gene_assembler/version.rb +3 -0
- metadata +149 -0
@@ -0,0 +1,230 @@
|
|
1
|
+
# Copyright (c) 2010 Dario Guerrero & Almudena Bocinos
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
# a copy of this software and associated documentation files (the
|
5
|
+
# 'Software'), to deal in the Software without restriction, including
|
6
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
# the following conditions:
|
10
|
+
#
|
11
|
+
# The above copyright notice and this permission notice shall be
|
12
|
+
# included in all copies or substantial portions of the Software.
|
13
|
+
#
|
14
|
+
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
17
|
+
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
18
|
+
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
19
|
+
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
20
|
+
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
21
|
+
|
22
|
+
|
23
|
+
require 'blast_query.rb'
|
24
|
+
require 'blast_hit.rb'
|
25
|
+
|
26
|
+
# BlastQuery variant used for exonerate alignments. On top of whatever
# BlastQuery provides, it carries two lists in which frameshift coordinates
# are collected; both start out empty.
class ExoBlastQuery < BlastQuery
  # q_frameshift / s_frameshift: Arrays filled by the code that builds the
  # query (ExonerateResult#hiting appends to them).
  attr_accessor :q_frameshift, :s_frameshift

  # query_id is forwarded unchanged to BlastQuery#initialize.
  def initialize(query_id)
    super(query_id)
    @q_frameshift = []
    @s_frameshift = []
  end
end
|
34
|
+
|
35
|
+
# Parses the "vulgar:" lines of exonerate output files and converts every
# retained alignment into an ExoBlastQuery holding one BlastHit per aligned
# block (blocks are delimited by the intron markers '5' and '3').
#
# Note the role swap performed by #hiting: the exonerate *target* becomes the
# blast-style query id, and the exonerate *query* becomes the hit subject.
class ExonerateResult
  # Array of ExoBlastQuery objects built so far.
  attr_accessor :querys

  # Parser initialization.
  #
  # input:: path of an exonerate output file, or an Array of such paths.
  # all::   when truthy, every vulgar line is converted. Otherwise only the
  #         best-scoring line per target id is kept; this selection is done
  #         independently for each input file.
  def initialize(input, all)
    @querys = []
    files = input.is_a?(Array) ? input : [input]
    # File.readlines closes the handle even when reading raises.
    files.each { |file| parse_file(File.readlines(file), all) }
  end

  # Extracts the fields of every line starting with "vulgar:" and hands the
  # selected records to #convert_parsed_lines.
  #
  # lines:: Array of Strings (the raw lines of one file).
  # all::   see #initialize.
  def parse_file(lines, all)
    lines_parsed = all ? [] : {}
    lines.each do |line|
      next unless line =~ /^vulgar:/

      fields = line.split
      features = {
        'query_id' => fields[1],
        'query_start_align' => fields[2],
        'query_end_align' => fields[3],
        'query_strand' => fields[4],
        'target_id' => fields[5],
        'target_start_align' => fields[6],
        'target_end_align' => fields[7],
        'target_strand' => fields[8],
        'score' => fields[9],
        'align_data' => fields[10..fields.length]
      }

      if all
        lines_parsed << features
      else
        target = features['target_id']
        best = lines_parsed[target]
        # Scores are read as text; compare them numerically, otherwise
        # "99" would outrank "100".
        if best.nil? || features['score'].to_i > best['score'].to_i
          lines_parsed[target] = features
        end
      end
    end
    convert_parsed_lines(lines_parsed)
  end

  # Groups the flat alignment data of each record into
  # [label, query_length, target_length] triples and passes them to #hiting.
  #
  # lines_parsed:: Array of feature Hashes, or Hash of target id => features.
  def convert_parsed_lines(lines_parsed)
    records = lines_parsed.is_a?(Hash) ? lines_parsed.values : lines_parsed
    records.each do |features|
      tags = features['align_data'].each_slice(3).map do |triple|
        # The first element is the operation label, the rest are lengths.
        [triple.first] + triple.drop(1).map(&:to_i)
      end
      hiting(features, tags)
    end
    lines_parsed
  end

  # Converts exonerate's relative block lengths into absolute, blast-like
  # coordinates and stores the resulting query (with its hits) in @querys.
  #
  # features:: Hash built by #parse_file.
  # tags::     Array of [label, query_length, target_length] triples.
  def hiting(features, tags)
    query = ExoBlastQuery.new(features['target_id'])

    reversed = features['target_strand'] != '+'
    strand = reversed ? -1 : 1

    # Start coordinates are kept as read from the file; one is added when the
    # hit is built (the original comment describes this as the 0-to-1 based
    # coordinate conversion).
    start_target = features['target_start_align'].to_i
    start_query = features['query_start_align'].to_i
    counter_target = start_target
    counter_query = start_query
    last_index = tags.length - 1

    tags.each_with_index do |tag, c|
      label = tag[0]
      is_last = (c == last_index)
      # For the final tag and for 'S'/'G' tags (presumably split codon / gap
      # in exonerate's vulgar format - confirm) the lengths must be added
      # BEFORE a hit can be saved; every other tag is added afterwards.
      advance_first = is_last || label == 'S' || label == 'G'

      if advance_first
        counter_query += tag[1]
        counter_target += tag[2] * strand
      elsif label == 'F'
        query.s_frameshift << counter_query
        query.q_frameshift << counter_target
      end

      # Start of an intron or end of the alignment: close the current block.
      if label == '5' || is_last
        query.add_hit(build_hit(features, start_target, counter_target,
                                start_query, counter_query, strand, reversed))
      end

      unless advance_first
        counter_query += tag[1]
        counter_target += tag[2] * strand
      end

      # End of an intron, hence the next block starts here.
      if label == '3'
        start_query = counter_query
        start_target = counter_target
      end
    end

    @querys << query
  end

  # inspect results
  def inspect
    header = "Exonerate results:\n" + ('-' * 20) + "\nQuerys: #{@querys.count}\n"
    header + @querys.map { |q| q.inspect + "\n" }.join
  end

  # find query by name; returns nil when no query has that id
  def find_query(querys, name_q)
    querys.find { |q| q.query_id == name_q }
  end

  # check if there are querys
  def empty?
    @querys.empty?
  end

  # get query count
  def size
    @querys.size
  end

  private

  # Builds the BlastHit for one aligned block. Fields exonerate does not
  # provide (identities, gaps, e-value, sequences...) are set to 0, '' or nil.
  def build_hit(features, start_target, ends_target, start_query, ends_query, strand, reversed)
    hit = BlastHit.new(start_target + 1, ends_target, start_query + 1, ends_query)
    hit.align_len = (ends_target - start_target) * strand
    hit.ident = 0
    hit.gaps = 0
    hit.mismatches = 0
    hit.e_val = 0
    hit.bit_score = 0
    hit.score = features['score'].to_i
    hit.q_frame = reversed ? -1 : 1
    hit.s_frame = nil
    hit.reversed = reversed
    hit.subject_id = features['query_id']
    hit.full_subject_length = 0
    hit.definition = features['query_id']
    hit.acc = features['query_id']
    hit.q_seq = ''
    hit.s_seq = ''
    hit
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'gff_hit'
|
2
|
+
require 'gff_snp'
|
3
|
+
require 'gff_go'
|
4
|
+
require 'gff_localization'
|
5
|
+
require 'gff_stop'
|
6
|
+
require 'gff_frameshift'
|
7
|
+
|
8
|
+
# Produces the GFF text lines describing one contig and everything attached
# to it (hits, frameshifts, stops, SNPs, GO terms and localizations).
class GffContig
  # Builds the report for +contig+.
  #
  # contig::    object exposing name, type, length, seq and the each_*
  #             iterators used below.
  # seqid::     value for the first GFF column; replaced by +parent+ when a
  #             parent is given.
  # parent::    nil when the contig is an independent unit (its own GFF line
  #             is emitted and its name is passed to the hit reporter as
  #             parent); otherwise the name of the contig acting as parent.
  # name_mode:: forwarded untouched to GffHit#report.
  #
  # Returns an Array with whatever each reporter returned, in emission order.
  def report(contig, seqid, parent, name_mode)
    features_parent = nil
    seq = nil        # sequence handed to the hit reporter (contig or hit)
    parent_hit = nil
    contig_text = []

    # Contig
    if parent.nil?
      # Independent contig: it gets its own line and parents its hits.
      parent_hit = contig.name
      contig_text << "#{seqid}\tunknown\t#{contig.type}\t1\t#{contig.length}\t.\t+\t.\tID=#{contig.name};Name=#{contig.name}"
    else
      # Child contig: seqid becomes the parent's name. The contig's own
      # sequence (possibly nil) is passed along because, per the original
      # note, GBrowse would otherwise use the parent's sequence.
      seqid = parent
      seq = contig.seq
    end

    # Hit. The second value returned by the hit reporter is remembered and
    # used as parent for all the features reported below (the value from the
    # last hit wins; it stays nil when the contig has no hits).
    gff_hit = GffHit.new
    contig.each_hit do |hit|
      text, features_parent = gff_hit.report(hit, parent_hit, seqid, contig.name, name_mode, seq)
      contig_text << text
    end

    # Frameshift
    gff_frameshift = GffFrameshift.new
    contig.each_q_frameshift do |fs|
      contig_text << gff_frameshift.report(fs, features_parent, seqid)
    end

    # Stop
    gff_stop = GffStop.new
    contig.each_stop do |stop|
      contig_text << gff_stop.report(stop, features_parent, seqid)
    end

    # SNP
    gff_snp = GffSNP.new
    contig.each_snp_with_index do |snp, n|
      contig_text << gff_snp.report(snp, features_parent, seqid, n)
    end

    # GO
    gff_go = GffGo.new
    contig.each_go do |go|
      contig_text << gff_go.report(go, features_parent, seqid)
    end

    # Localization
    # NOTE(review): every other reporter class is named Gff*; confirm that
    # gff_localization.rb really defines `Localization`.
    gff_localization = Localization.new
    contig.each_localization_with_index do |localization, n|
      contig_text << gff_localization.report(localization, features_parent, seqid, contig, n)
    end

    contig_text
  end
end
|
@@ -0,0 +1,152 @@
|
|
1
|
+
require 'gff_feature'
|
2
|
+
require 'gff_master_feature'
|
3
|
+
# In-memory collection of GFF features. Master features (top-level regions)
# are stored by sequence id in @master_features; @index maps every known id
# (masters and descendants) to its feature object.
class Gff_dataset
  # Counter used to build index keys for features that carry no 'ID'
  # attribute. It is a class variable, so it is shared by all datasets.
  @@undefined_features = 0

  # The reader for master_features is defined explicitly below.
  attr_writer :master_features
  attr_accessor :index

  def initialize
    @master_features = {}
    @index = {}
  end

  # Returns an Array with all master features when there is more than one,
  # the single master feature itself when there is exactly one, and nil when
  # the dataset is empty.
  def master_features
    return @master_features.values if @master_features.length > 1

    @master_features.values.first
  end

  # Registers a feature found on sequence +master_seq_id+.
  #
  # * If that id is already indexed, the feature is added as its child and
  #   the master's stop is extended when the child ends further right.
  # * If the feature's own 'ID' equals +master_seq_id+, it becomes the master.
  # * Otherwise a synthetic 'region' master (1..stop) is created first and
  #   the feature is attached to it.
  #
  # Returns the child feature in the first case and the master otherwise.
  # NOTE: +attribs+ is modified in place ('Parent' is set) in the first and
  # third cases.
  def add_master_feature(master_seq_id, source, type, start, stop, score, strand, phase, attribs)
    master = nil
    if @index.key?(master_seq_id)
      attribs['Parent'] = master_seq_id
      master = add_feature(source, type, start, stop, score, strand, phase, attribs)
      @index[attribs['ID']] = master
      # Redefine the master feature's extent with the new child.
      if stop.to_i > @master_features[master_seq_id].stop
        @master_features[master_seq_id].stop = stop.to_i
      end
    elsif attribs['ID'] == master_seq_id
      master = Master_feature.new(source, type, start, stop, score, strand, phase, attribs)
      @master_features[master_seq_id] = master
      @index[master_seq_id] = master
    else
      master = Master_feature.new(source, 'region', 1, stop, '.', '.', '.', { 'ID' => master_seq_id })
      @master_features[master_seq_id] = master
      @index[master_seq_id] = master
      attribs['Parent'] = master_seq_id
      child = master.add_child(source, type, start, stop, score, strand, phase, attribs)
      @index[attribs['ID']] = child
    end
    master
  end

  # Adds a feature as child of the indexed feature named by
  # attribs['Parent'] and indexes it under its 'ID', or under
  # "<Parent>_<counter>" when it has none. Returns the new feature.
  def add_feature(source, type, start, stop, score, strand, phase, attribs)
    feature = @index[attribs['Parent']].add_child(source, type, start, stop, score, strand, phase, attribs)
    if attribs['ID'].nil?
      @index[feature.attrib('Parent') + '_' + @@undefined_features.to_s] = feature
      @@undefined_features += 1
    else
      @index[attribs['ID']] = feature
    end
    feature
  end

  # Yields every indexed feature.
  def each_feature
    @index.each_value { |feature| yield feature }
  end

  # Yields every master feature.
  def each_master_feature
    @master_features.each_value { |master_feature| yield master_feature }
  end

  # Yields (id, feature) for every indexed feature.
  def each_id_feat
    @index.each { |id, feature| yield id, feature }
  end

  # Yields (id, master) for every master feature.
  def each_id_master
    @master_features.each { |id, master| yield id, master }
  end

  # Calls #inspects on every master feature.
  def inspects
    @master_features.each_value { |master| master.inspects }
  end

  # Returns the feature indexed under +hash_key+, or nil.
  def feature(hash_key)
    @index[hash_key]
  end

  # true when some indexed feature has exactly this type.
  def has_type?(type)
    @index.each_value.any? { |feature| feature.type == type }
  end

  # true when some indexed feature has exactly this source.
  def has_source?(source)
    @index.each_value.any? { |feature| feature.source == source }
  end

  # Returns the indexed features matching the given filters. A filter left
  # as false (the default) matches everything.
  def get(source = false, type = false)
    @index.values.select do |feature|
      source_ok = source ? feature.is_source?(source) : true
      type_ok = type ? feature.is_type?(type) : true
      source_ok && type_ok
    end
  end

  # Prints the tree of every master feature followed by a separator line.
  def tree
    @master_features.each_value do |master|
      master.tree
      puts "\n", '--------------------------------'
    end
  end

  # Wraps every feature of type +type_child+ in a newly created feature of
  # type +type_parent+ that takes over the child's id in the index and in
  # the former parent's child table. The statement order below matters and
  # is kept as in the original implementation.
  def add_parent_to(type_parent, type_child)
    each_id_feat do |id, feature|
      next unless feature.type == type_child

      new_feature = Feature.new(feature.source, type_parent, feature.start, feature.stop, '.', feature.strand, '.', feature.attribs.dup)
      feature.change_to_type_id_recursive
      new_feature.transfer_child(feature.attrib('ID'), feature)
      @index[feature.attrib('Parent')].child[id] = new_feature
      @index[id] = new_feature
      new_feature.each_child do |child|
        child.attribs['Parent'] = new_feature.attrib('ID') # Define new parent
      end
    end
  end
end
|