gene_assembler 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +22 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +2 -0
- data/bin/GeneAssembler +233 -0
- data/bin/phytozome_scan +60 -0
- data/gene_assembler.gemspec +25 -0
- data/lib/gene_assembler.rb +5 -0
- data/lib/gene_assembler/blast_type_parser.rb +41 -0
- data/lib/gene_assembler/contig.rb +643 -0
- data/lib/gene_assembler/dataset.rb +532 -0
- data/lib/gene_assembler/exonerate_result.rb +230 -0
- data/lib/gene_assembler/gff_contig.rb +67 -0
- data/lib/gene_assembler/gff_dataset.rb +152 -0
- data/lib/gene_assembler/gff_feature.rb +175 -0
- data/lib/gene_assembler/gff_frameshift.rb +6 -0
- data/lib/gene_assembler/gff_go.rb +13 -0
- data/lib/gene_assembler/gff_hit.rb +53 -0
- data/lib/gene_assembler/gff_hsp.rb +6 -0
- data/lib/gene_assembler/gff_localization.rb +6 -0
- data/lib/gene_assembler/gff_master_feature.rb +5 -0
- data/lib/gene_assembler/gff_parser.rb +35 -0
- data/lib/gene_assembler/gff_snp.rb +21 -0
- data/lib/gene_assembler/gff_stop.rb +6 -0
- data/lib/gene_assembler/go.rb +13 -0
- data/lib/gene_assembler/hit.rb +191 -0
- data/lib/gene_assembler/hsp.rb +100 -0
- data/lib/gene_assembler/other_functions.rb +228 -0
- data/lib/gene_assembler/parser.rb +25 -0
- data/lib/gene_assembler/parser_blast.rb +12 -0
- data/lib/gene_assembler/parser_exonerate.rb +16 -0
- data/lib/gene_assembler/rebuild.rb +975 -0
- data/lib/gene_assembler/report.rb +13 -0
- data/lib/gene_assembler/report_gff.rb +30 -0
- data/lib/gene_assembler/snp.rb +13 -0
- data/lib/gene_assembler/version.rb +3 -0
- metadata +149 -0
@@ -0,0 +1,175 @@
|
|
1
|
+
# A single GFF feature: the source/type/coordinate/score/strand/phase columns,
# its attribute hash, and the child features nested under it.
class Feature
  attr_accessor :source, :type, :start, :stop, :score, :strand, :phase, :attribs, :child

  # Counter shared by every feature; it builds unique keys for children that
  # carry no 'ID' attribute.
  @@undefined_features = 0

  # Builds a feature from GFF column values.
  #
  # start/stop are converted with to_i. score and phase are kept as '.' when
  # they are '.', otherwise converted with to_f / to_i respectively.
  # attribs is a Hash of tag => value and is stored (and modified) as given:
  # a 'Name' entry is added from 'ID' when missing, otherwise it is sanitized.
  def initialize(source, type, start, stop, score, strand, phase, attribs)
    @source = source
    @type = type
    @start = start.to_i
    @stop = stop.to_i
    @score = score == '.' ? '.' : score.to_f
    @strand = strand
    @phase = phase == '.' ? '.' : phase.to_i
    @attribs = attribs
    if attrib('Name').nil?
      # Automatically add the Name attribute, based on the ID.
      @attribs['Name'] = attrib('ID')
    else
      # ';' terminates each attribute when the feature is written, so it must
      # not appear inside the name. A copy is stored so the caller's string is
      # left untouched (and frozen strings are accepted).
      @attribs['Name'] = @attribs['Name'].gsub(';', '-')
    end
    @child = {}
  end

  # Creates a child Feature from the given column values, registers it and
  # returns it. Children are keyed by their 'ID'; a child without 'ID' is keyed
  # by "<Parent>_<n>", where n is the class-wide counter.
  def add_child(source, type, start, stop, score, strand, phase, attribs)
    child = Feature.new(source, type, start, stop, score, strand, phase, attribs)
    if attribs['ID'].nil?
      @child["#{child.attrib('Parent')}_#{@@undefined_features}"] = child
      @@undefined_features += 1
    else
      @child[attribs['ID']] = child
    end
    child
  end

  # Registers an already built child under the given id.
  def transfer_child(id, child)
    @child[id] = child
  end

  # Yields every direct child.
  def each_child
    @child.each_value { |child| yield child }
  end

  # Returns the value of the attribute +tag+, or nil when it is not set.
  def attrib(tag)
    @attribs[tag]
  end

  # Yields every (tag, value) attribute pair.
  def each_tag_attrib
    @attribs.each { |tag, value| yield tag, value }
  end

  # Prints a one-line summary: this feature's ID followed by each child key
  # and the number of children that child has.
  def inspects
    print "#{attrib('ID')} ---> "
    @child.each { |key, child| print "#{key} (#{child.count})\t" }
    print "\n"
  end

  # Number of direct children.
  def count
    @child.count
  end

  # Returns [start, stop] pairs for every direct child whose type is 'CDS'.
  def cds
    @child.each_value.select { |child| child.type == 'CDS' }
          .map { |child| [child.start, child.stop] }
  end

  # Prints this feature and, recursively, its children, indenting each level
  # with one more tab.
  def tree(level = 0)
    puts "\t" * level + "#{attrib('ID')}\t\t#{@type}\t#{@source}"
    each_child { |child| child.tree(level + 1) }
  end

  # True when this feature's type equals +type+ (or any element, when an
  # array of types is given).
  def is_type?(type)
    [type].flatten.any? { |candidate| candidate == @type }
  end

  # True when this feature's source equals +source+ (or any element, when an
  # array of sources is given).
  def is_source?(source)
    [source].flatten.any? { |candidate| candidate == @source }
  end

  # Overlap of +feature+ relative to this feature; the result ranges between
  # 0 and 1.
  # NOTE(review): the two partial-overlap branches do not add 1 to the
  # difference although #length treats coordinates as inclusive — confirm
  # whether that is intended before changing it.
  def compare(feature)
    overlap = 0
    if feature.start.between?(start, stop) && feature.stop.between?(start, stop)
      # +feature+ lies completely inside this feature.
      overlap = feature.length * 1.00 / length
    elsif start.between?(feature.start, feature.stop) && stop.between?(feature.start, feature.stop)
      # This feature lies completely inside +feature+.
      overlap = 1
    elsif feature.start > start && feature.start < stop
      # +feature+ begins inside this feature and runs past its end.
      overlap = (stop - feature.start) * 1.00 / length
    elsif feature.stop > start && feature.stop < stop
      # +feature+ begins before this feature and ends inside it.
      overlap = (feature.stop - start) * 1.00 / length
    end
    overlap
  end

  # Length of the feature, counting both start and stop positions.
  def length
    stop - (start - 1)
  end

  # Renames this feature's 'ID' and 'Name' to "<ID>_<type>" (only the ones that
  # are set), re-points 'Parent' to +parent+ when both are present, and repeats
  # the process on every child using the new id as their parent.
  def change_to_type_id_recursive(parent = false)
    # Computed before 'ID' is overwritten so it is based on the old id.
    new_parent = "#{attrib('ID')}_#{@type}"
    @attribs['ID'] = new_parent unless attrib('ID').nil?
    @attribs['Name'] = new_parent unless attrib('Name').nil?
    @attribs['Parent'] = parent if !attrib('Parent').nil? && parent
    each_child { |child| child.change_to_type_id_recursive(new_parent) }
  end

  # Writes this feature as one GFF line to +file+, using +id+ as the first
  # column, then writes every child with the same +id+.
  def write(file, id)
    file.print "#{id}\t#{@source}\t#{@type}\t#{@start}\t#{@stop}\t#{@score}\t#{@strand}\t#{@phase}\t"
    each_tag_attrib { |tag, value| file.print "#{tag}=#{value};" }
    file.puts # Terminates the line.
    each_child { |child| child.write(file, id) }
  end

  # Adds [tag, value] pairs to the attributes, skipping pairs whose value is nil.
  def add_attribs(array_attribs)
    array_attribs.each do |pair|
      @attribs[pair[0]] = pair[1] unless pair[1].nil?
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Formats a gene-ontology annotation as a single GFF line.
class GffGo
  # Returns the GFF line for +go+ as a child of +parent+ on sequence +seqid+.
  # A nil go.source is replaced by 'Unknown' on the object itself, and an
  # "Obsolete=True" attribute is appended when go.obsolete is truthy.
  def report(go, parent, seqid)
    go.source = 'Unknown' if go.source.nil?
    obsolete_tag = go.obsolete ? "Obsolete=True" : nil
    columns = "#{seqid}\t#{go.source}\tOntology\t#{go.beg}\t#{go.end}\t.\t+\t.\t"
    identity = "ID=#{parent}_#{go.name}#{go.code};Parent=#{parent};"
    labels = "Name=#{go.name}_#{go.code};Note=#{go.name}_#{go.code};"
    "#{columns}#{identity}#{labels}#{obsolete_tag}"
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'gff_hsp'
|
2
|
+
|
3
|
+
# Formats a hit (and, when it has several, its HSPs) as GFF lines.
class GffHit
  # Returns [lines, feature_parent].
  #
  # lines is an array whose first element is the GFF line of the hit itself;
  # when the hit has more than one HSP, one entry per HSP (as produced by
  # GffHsp#report) follows. feature_parent is always +id+.
  #
  # hit       - object exposing name, type, source, reversed, q_p_ident,
  #             q_p_conserved, description, first_hsp, last_hsp, hsp_count and
  #             each_hsp_with_index. A nil source is replaced by 'Unknown' on
  #             the object itself.
  # parent    - id of the parent feature, or nil.
  # seqid     - value of the first GFF column.
  # id        - identifier used for the hit when there is no parent.
  # name_mode - accepted for interface compatibility; it does not influence
  #             the output. NOTE(review): confirm whether it was meant to.
  # seq       - optional sequence added as a "seq=" attribute.
  def report(hit, parent, seqid, id, name_mode, seq)
    if parent.nil?
      # Without a parent the hit becomes a contig that is a child of another
      # contig already written, so it is identified by +id+.
      parent_tag = nil
      name = id
    else
      # With a parent the hit behaves as a protein-like or nucleotide-sequence
      # structure and keeps its own name.
      parent_tag = "Parent=#{parent};"
      name = hit.name
    end

    # Optional attributes are left as nil so they vanish from the line.
    hit_seq = seq.nil? ? nil : "seq=#{seq};"
    strand = hit.reversed ? '-' : '+'
    ident = hit.q_p_ident.nil? ? nil : "Perc_Qidentities=#{hit.q_p_ident};"
    conserved = hit.q_p_conserved.nil? ? nil : "Perc_Qconserved=#{hit.q_p_conserved};"
    # ';' would end the attribute early, so it is replaced inside the note.
    description = hit.description.nil? ? nil : "Note=#{hit.description.gsub(';', '_')};"
    hit.source = 'Unknown' if hit.source.nil?

    text = "#{seqid}\t#{hit.source}\t#{hit.type}\t#{hit.first_hsp.q_beg}\t#{hit.last_hsp.q_end}\t.\t#{strand}\t.\tID=#{name};#{parent_tag}Name=#{name};#{hit_seq}#{description}#{ident}#{conserved}"
    hit_text = [text]

    # Expand the HSPs only when the hit has several partial matches.
    if hit.hsp_count > 1
      gff_hsp = GffHsp.new
      hit.each_hsp_with_index do |hsp, n|
        hit_text << gff_hsp.report(hsp, name, seqid, n, hit.type, hit.source)
      end
    end

    return hit_text, id
  end
end
|
@@ -0,0 +1,6 @@
|
|
1
|
+
# Formats a localization annotation as a single GFF line.
class Localization
  # Returns the GFF line describing +localization+ as the n-th localization
  # child of +parent+. The feature spans positions 1 to contig.length on
  # +seqid+; the localization text is used for both Name and Note.
  def report(localization, parent, seqid, contig, n)
    columns = "#{seqid}\tunknown\tLocalization\t1\t#{contig.length}\t.\t+\t.\t"
    attributes = "ID=#{parent}_loc_#{n};Parent=#{parent};Name=#{localization};Note=#{localization};"
    "#{columns}#{attributes}"
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'gff_dataset'
|
2
|
+
# Reads a GFF file into a Gff_dataset.
class Gff_parser
  attr_accessor :dataset

  # Parses +file+ (a path) line by line and fills @dataset.
  #
  # Comment lines (starting with '#') and blank lines are skipped. Reading
  # stops at the first line starting with '>', so sequence data embedded after
  # the annotations is never parsed. Lines without a 'Parent' attribute become
  # master features; the others are added as child features.
  def initialize(file)
    @dataset = Gff_dataset.new
    # File.foreach closes the file when the block finishes, including on break.
    File.foreach(file) do |line|
      line.chomp!
      next if line =~ /^#/ || line == '' # Skip GFF comments and blank lines.
      break if line =~ /^>/ # Sequences start here; nothing else to parse.

      fields = line.split("\t")
      attribs = parse_attribs(fields.last)
      if attribs.key?('Parent')
        # The feature is a child: the seqid column is not passed along.
        # NOTE(review): presumably the dataset attaches it to its parent using
        # the 'Parent' attribute — confirm in Gff_dataset#add_feature.
        @dataset.add_feature(fields[1], fields[2], fields[3], fields[4], fields[5], fields[6], fields[7], attribs)
      else
        @dataset.add_master_feature(fields[0], fields[1], fields[2], fields[3], fields[4], fields[5], fields[6], fields[7], attribs)
      end
    end
  end

  # Converts the last GFF column (a "tag=value;tag=value" string) into a Hash.
  #
  # Only the first '=' separates tag from value, so values may contain '='.
  # Empty segments (e.g. from ";;") are ignored, and a tag with no value maps
  # to nil.
  def parse_attribs(attribs)
    attribs_hash = {}
    attribs.split(';').each do |attrib|
      tag, value = attrib.split('=', 2)
      next if tag.nil?

      # Keep "tag=" equivalent to a missing value.
      value = nil if value && value.empty?
      attribs_hash[tag] = value
    end
    attribs_hash
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# Formats a SNP as a single GFF line.
class GffSNP
  # Returns the GFF line for +snp+ as the n-th SNP child of +parent+ on
  # +seqid+. The feature covers the single position snp.position, and
  # "ref -> var" is used for both Name and Note. Depth, Zygosity,
  # Mapping_qual and Strand_bias are appended only when the SNP has a value
  # for them.
  def report(snp, parent, seqid, n)
    depth = optional_attribute('Depth', snp.depth)
    zygosity = optional_attribute('Zygosity', snp.zygosity)
    mapping_qual = optional_attribute('Mapping_qual', snp.mapping_qual)
    # A missing strand bias is omitted like the other optional tags instead of
    # being written as an empty "Strand_bias=;".
    strand_bias = optional_attribute('Strand_bias', snp.strand_bias)
    "#{seqid}\t#{snp.source}\tSNP\t#{snp.position}\t#{snp.position}\t.\t+\t.\tID=#{parent}_SNP_#{n};Parent=#{parent};Name=#{snp.ref} -> #{snp.var};Note=#{snp.ref} -> #{snp.var};#{depth}#{zygosity}#{mapping_qual}#{strand_bias}"
  end

  private

  # Returns "tag=value;" or nil when the value is missing.
  def optional_attribute(tag, value)
    value.nil? ? nil : "#{tag}=#{value};"
  end
end
|
@@ -0,0 +1,191 @@
|
|
1
|
+
require 'hsp'
|
2
|
+
# A hit of a query against one subject, holding the list of its HSPs.
class Hit
  attr_accessor :name, :hsps, :s_length, :reversed, :type, :source, :description, :e_value, :q_p_ident, :q_p_conserved

  # hit_name - name taken from the subject id.
  # s_length - total length of the subject.
  # q_frame  - the hit is flagged as reversed unless this is greater than 0.
  # type     - stored as given.
  def initialize(hit_name, s_length, q_frame, type)
    @name = hit_name
    @s_length = s_length
    @hsps = []
    @reversed = !(q_frame > 0)
    @type = type
    @source = nil
    @description = nil
    @e_value = nil
    @q_p_ident = nil
    @q_p_conserved = nil
  end

  # Builds an Hsp from the given values, appends it to the hit and returns it.
  def add_hsp(q_beg, q_end, s_beg, s_end, align_len, score, ident, gaps)
    hsp = Hsp.new(q_beg, q_end, s_beg, s_end, align_len, score, ident, gaps)
    @hsps << hsp
    hsp
  end

  # Yields every HSP in stored order.
  def each_hsp
    @hsps.each { |hsp| yield hsp }
  end

  # Returns the HSP whose index equals +n+, or nil when there is none.
  def hsp_at(n)
    each_hsp_with_index { |hsp, i| return hsp if n == i }
    nil
  end

  # Yields every HSP together with its index.
  def each_hsp_with_index
    @hsps.each_with_index { |hsp, i| yield hsp, i }
  end

  # Sorts the HSPs in place by their position on the subject.
  def hsps_sort!
    @hsps.sort! { |e1, e2| e1.s_beg <=> e2.s_beg }
  end

  # Number of HSPs in the hit.
  def hsp_count
    @hsps.length
  end

  # First HSP, or nil when the hit has none.
  def first_hsp
    @hsps.first
  end

  # Last HSP, or nil when the hit has none.
  def last_hsp
    @hsps.last
  end

  # Checks how the HSPs follow each other on the query: returns true as soon
  # as an HSP begins more than 3 positions away from the end of the previous
  # one, false otherwise.
  # NOTE(review): despite the name, true is returned when a gap is found (the
  # HSPs are NOT contiguous) — confirm against the callers before relying on it.
  def hsps_correlative?
    @hsps.each_cons(2).any? do |previous, current|
      (previous.q_end - current.q_beg).abs > 3
    end
  end

  # Forwards the coordinate shift +add+ to every HSP.
  def modified_coordenates(add)
    each_hsp { |hsp| hsp.modified_coordenates(add) }
  end

  # Forwards the coordinate reversal for +contig_length+ to every HSP.
  def rev_coord(contig_length)
    each_hsp { |hsp| hsp.rev_coord(contig_length) }
  end

  # Returns the negative values obtained when asking each HSP for its
  # overlap_with the HSP that precedes it.
  def hsp_overlap
    @hsps.each_cons(2)
         .map { |previous, current| current.overlap_with(previous) }
         .select { |diference| diference < 0 }
  end

  # Overlap between this hit and +last_hit+: when both have the same name,
  # the value of first_hsp.overlap_with(last_hit.last_hsp) if it is negative;
  # 0 in every other case.
  def overlap_with(last_hit)
    return 0 unless name == last_hit.name

    diference = first_hsp.overlap_with(last_hit.last_hsp)
    diference < 0 ? diference : 0
  end

  # True when any HSP has a length_q (in nt) smaller than +hsp_length+.
  def hsp_minor_than?(hsp_length)
    @hsps.any? { |hsp| hsp.length_q < hsp_length }
  end

  # Removes redundant HSPs. Every pair of HSPs is compared on subject
  # coordinates (blast_coor_type == 's') or on query coordinates (any other
  # value); when the comparison yields 0.9 or more, one HSP of the pair is
  # dropped (see #redundant_index).
  def correct_hsps(blast_coor_type)
    return if hsp_count <= 1

    delete_hsps = []
    each_hsp_with_index do |hsp, i|
      each_hsp_with_index do |hsp_second, j|
        next if i == j

        compare = blast_coor_type == 's' ? hsp.compare(hsp_second) : hsp.compare_q(hsp_second)
        next unless compare >= 0.9

        delete_hsps << redundant_index(hsp, i, hsp_second, j)
      end
    end
    # Delete from the highest index down so earlier deletions never shift the
    # position of an HSP that is still waiting to be removed.
    delete_hsps.uniq.sort.reverse_each { |position| drop_hsp(position) }
  end

  # Removes the HSP stored at +position+.
  def drop_hsp(position)
    hsps.delete_at(position)
  end

  private

  # Picks which HSP of a redundant pair must be deleted and returns its index.
  # The lower score loses; with equal scores the longer alignment loses (the
  # smaller one is kept); when both are exactly equal the one with the higher
  # index loses, so the same copy is chosen whichever way the pair is visited
  # and one copy always survives.
  def redundant_index(hsp, i, hsp_second, j)
    if hsp.score == hsp_second.score
      if hsp.align_len == hsp_second.align_len
        [i, j].max
      elsif hsp.align_len < hsp_second.align_len
        j
      else
        i
      end
    elsif hsp.score > hsp_second.score
      j
    else
      i
    end
  end
end
|