gene_assembler 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +22 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +2 -0
- data/bin/GeneAssembler +233 -0
- data/bin/phytozome_scan +60 -0
- data/gene_assembler.gemspec +25 -0
- data/lib/gene_assembler.rb +5 -0
- data/lib/gene_assembler/blast_type_parser.rb +41 -0
- data/lib/gene_assembler/contig.rb +643 -0
- data/lib/gene_assembler/dataset.rb +532 -0
- data/lib/gene_assembler/exonerate_result.rb +230 -0
- data/lib/gene_assembler/gff_contig.rb +67 -0
- data/lib/gene_assembler/gff_dataset.rb +152 -0
- data/lib/gene_assembler/gff_feature.rb +175 -0
- data/lib/gene_assembler/gff_frameshift.rb +6 -0
- data/lib/gene_assembler/gff_go.rb +13 -0
- data/lib/gene_assembler/gff_hit.rb +53 -0
- data/lib/gene_assembler/gff_hsp.rb +6 -0
- data/lib/gene_assembler/gff_localization.rb +6 -0
- data/lib/gene_assembler/gff_master_feature.rb +5 -0
- data/lib/gene_assembler/gff_parser.rb +35 -0
- data/lib/gene_assembler/gff_snp.rb +21 -0
- data/lib/gene_assembler/gff_stop.rb +6 -0
- data/lib/gene_assembler/go.rb +13 -0
- data/lib/gene_assembler/hit.rb +191 -0
- data/lib/gene_assembler/hsp.rb +100 -0
- data/lib/gene_assembler/other_functions.rb +228 -0
- data/lib/gene_assembler/parser.rb +25 -0
- data/lib/gene_assembler/parser_blast.rb +12 -0
- data/lib/gene_assembler/parser_exonerate.rb +16 -0
- data/lib/gene_assembler/rebuild.rb +975 -0
- data/lib/gene_assembler/report.rb +13 -0
- data/lib/gene_assembler/report_gff.rb +30 -0
- data/lib/gene_assembler/snp.rb +13 -0
- data/lib/gene_assembler/version.rb +3 -0
- metadata +149 -0
@@ -0,0 +1,175 @@
|
|
1
|
+
# One GFF feature (a single annotation row) plus the child features that
# hang from it.
#
# Coordinates are stored as integers. Score and phase keep the GFF
# placeholder '.' when undefined; otherwise they are converted to Float and
# Integer respectively. Attributes are kept in the hash handed to the
# constructor (the hash itself is stored, not a copy).
class Feature
  attr_accessor :source, :type, :start, :stop, :score, :strand, :phase, :attribs, :child

  # Running counter used to build unique keys for children without an 'ID'.
  @@undefined_features = 0

  # Builds a feature from the raw GFF column values.
  #
  # When the attributes carry no 'Name', it is filled in from 'ID'. An
  # existing 'Name' gets its ';' replaced by '-' because ';' is the GFF
  # attribute separator. The replacement is stored as a new string, so the
  # caller's original (possibly frozen) string is left untouched.
  def initialize(source, type, start, stop, score, strand, phase, attribs)
    @source = source
    @type = type
    @start = start.to_i
    @stop = stop.to_i
    @score = score == '.' ? '.' : score.to_f
    @strand = strand
    @phase = phase == '.' ? '.' : phase.to_i
    @attribs = attribs
    if attrib('Name').nil?
      @attribs['Name'] = attrib('ID')
    else
      @attribs['Name'] = @attribs['Name'].gsub(';', '-')
    end
    @child = {}
  end

  # Creates a child feature, registers it and returns it.
  #
  # Children are keyed by their 'ID'; a child without 'ID' is keyed by
  # "<Parent>_<counter>" using the class-wide counter.
  def add_child(source, type, start, stop, score, strand, phase, attribs)
    child = Feature.new(source, type, start, stop, score, strand, phase, attribs)
    if attribs['ID'].nil?
      # Interpolation keeps this working even when 'Parent' is missing.
      @child["#{child.attrib('Parent')}_#{@@undefined_features}"] = child
      @@undefined_features += 1
    else
      @child[attribs['ID']] = child
    end
    child
  end

  # Registers an already built feature as a child under the given key.
  def transfer_child(id, child)
    @child[id] = child
  end

  # Yields every direct child.
  def each_child
    @child.each_value do |child|
      yield child
    end
  end

  # Returns the value stored for an attribute tag (nil when absent).
  def attrib(tag)
    @attribs[tag]
  end

  # Yields every attribute as a tag/value pair.
  def each_tag_attrib
    @attribs.each do |tag, value|
      yield tag, value
    end
  end

  # Prints to stdout this feature's ID followed by each child key and the
  # number of children that child has.
  def inspects
    print "#{attrib('ID')} ---> "
    @child.each do |key, child|
      print "#{key} (#{child.count})\t"
    end
    print "\n"
  end

  # Number of direct children.
  def count
    @child.count
  end

  # Returns [start, stop] pairs for every direct child of type 'CDS'.
  def cds
    @child.values.select { |child| child.type == 'CDS' }.map { |child| [child.start, child.stop] }
  end

  # Prints to stdout the feature hierarchy, one tab of indentation per level.
  def tree(level = 0)
    puts "\t" * level + "#{attrib('ID')}\t\t#{@type}\t#{@source}"
    each_child { |child| child.tree(level + 1) }
  end

  # True when this feature's type equals the given type, or any of the given
  # types when an array is passed.
  def is_type?(type)
    [type].flatten.any? { |candidate| candidate == @type }
  end

  # True when this feature's source equals the given source, or any of the
  # given sources when an array is passed.
  def is_source?(source)
    [source].flatten.any? { |candidate| candidate == @source }
  end

  # Fraction of this feature that is covered by +feature+.
  #
  # * +feature+ lies inside self -> feature.length / self.length
  # * self lies inside +feature+ -> 1
  # * partial overlap            -> overlapping span / self.length
  # * otherwise                  -> 0
  #
  # NOTE(review): the partial-overlap branches use (stop - start) while
  # #length is inclusive (stop - start + 1), so partial overlaps look one
  # base short and features that only touch at a boundary score 0. Left
  # unchanged because callers may rely on the current values - confirm.
  def compare(feature)
    overlap = 0
    if feature.start.between?(start, stop) && feature.stop.between?(start, stop)
      overlap = feature.length * 1.00 / length
    elsif start.between?(feature.start, feature.stop) && stop.between?(feature.start, feature.stop)
      overlap = 1
    elsif feature.start > start && feature.start < stop
      overlap = (stop - feature.start) * 1.00 / length
    elsif feature.stop > start && feature.stop < stop
      overlap = (feature.stop - start) * 1.00 / length
    end
    overlap
  end

  # Feature length with inclusive coordinates.
  def length
    stop - (start - 1)
  end

  # Rewrites 'ID' and 'Name' as "<ID>_<type>" on this feature and all its
  # descendants, pointing each descendant's 'Parent' at the new parent ID.
  # Only attributes that are already present are rewritten. The keys of the
  # child hash are not renamed.
  def change_to_type_id_recursive(parent = false)
    new_parent = "#{attrib('ID')}_#{@type}"
    @attribs['ID'] = new_parent unless attrib('ID').nil?
    @attribs['Name'] = new_parent unless attrib('Name').nil?
    @attribs['Parent'] = parent if !attrib('Parent').nil? && parent

    each_child { |child| child.change_to_type_id_recursive(new_parent) }
  end

  # Writes this feature and, recursively, its children as GFF rows to +file+
  # (any object responding to print/puts), using +id+ as the first column.
  def write(file, id)
    file.print "#{id}\t#{@source}\t#{@type}\t#{@start}\t#{@stop}\t#{@score}\t#{@strand}\t#{@phase}\t"
    each_tag_attrib { |tag, value| file.print "#{tag}=#{value};" }
    file.puts # terminate the row
    each_child { |child| child.write(file, id) }
  end

  # Adds [tag, value] pairs to the attributes, skipping pairs whose value is nil.
  def add_attribs(array_attribs)
    array_attribs.each do |pair|
      @attribs[pair[0]] = pair[1] unless pair[1].nil?
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Formats a gene-ontology annotation as a single GFF 'Ontology' row.
class GffGo
  # Returns the GFF line for +go+ as a child of +parent+ on sequence +seqid+.
  #
  # Side effect: a nil go.source is replaced by 'Unknown' on the object itself.
  # The trailing "Obsolete=True" attribute is only emitted when go.obsolete is
  # truthy.
  def report(go, parent, seqid)
    go.source = 'Unknown' if go.source.nil?
    obsolete = go.obsolete ? "Obsolete=True" : nil
    columns = "#{seqid}\t#{go.source}\tOntology\t#{go.beg}\t#{go.end}\t.\t+\t."
    attributes = "ID=#{parent}_#{go.name}#{go.code};Parent=#{parent};Name=#{go.name}_#{go.code};Note=#{go.name}_#{go.code};#{obsolete}"
    "#{columns}\t#{attributes}"
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'gff_hsp'
|
2
|
+
|
3
|
+
# Formats a hit (and its HSPs, when it has more than one) as GFF rows.
class GffHit
  # Builds the GFF rows for +hit+.
  #
  # * +parent+    - parent feature ID. When given, the row is named after the
  #                 hit and carries a Parent attribute (the hit behaves like a
  #                 protein / nucleotide sequence structure). When nil, the
  #                 row is named after +id+ (the hit becomes a contig that is
  #                 a child of another, already written contig).
  # * +seqid+     - value for the first GFF column.
  # * +id+        - identifier returned as the parent for further features.
  # * +name_mode+ - accepted for interface compatibility; currently unused.
  # * +seq+       - optional sequence emitted as a 'seq' attribute.
  #
  # Side effect: a nil hit.source is replaced by 'Unknown' on the hit itself.
  #
  # Returns [rows, feature_parent], where rows is an array with the hit row
  # followed by one entry per HSP when the hit has more than one HSP.
  #
  # NOTE(review): the previous implementation derived a parent name from
  # name_mode == 'l' (hit name + '_' + hit type) but always overwrote it with
  # +id+ before use, so that dead computation was dropped here. Confirm
  # whether name_mode was ever meant to influence the returned parent.
  def report(hit, parent, seqid, id, name_mode, seq)
    feature_parent = id
    if parent.nil?
      parent_tag = nil
      name = id
    else
      parent_tag = "Parent=#{parent};"
      name = hit.name
    end

    # Optional attributes collapse to nil, which interpolates as an empty string.
    hit_seq = seq.nil? ? nil : "seq=#{seq};"
    strand = hit.reversed ? '-' : '+'
    ident = hit.q_p_ident.nil? ? nil : "Perc_Qidentities=#{hit.q_p_ident};"
    conserved = hit.q_p_conserved.nil? ? nil : "Perc_Qconserved=#{hit.q_p_conserved};"
    # ';' separates GFF attributes, so it must not appear inside the note.
    description = hit.description.nil? ? nil : "Note=#{hit.description.gsub(';', '_')};"
    hit.source = 'Unknown' if hit.source.nil?

    hit_text = []
    hit_text << "#{seqid}\t#{hit.source}\t#{hit.type}\t#{hit.first_hsp.q_beg}\t#{hit.last_hsp.q_end}\t.\t#{strand}\t.\tID=#{name};#{parent_tag}Name=#{name};#{hit_seq}#{description}#{ident}#{conserved}"

    # Only expand the individual HSPs when the hit has several partial matches.
    if hit.hsp_count > 1
      gff_hsp = GffHsp.new
      hit.each_hsp_with_index do |hsp, n|
        hit_text << gff_hsp.report(hsp, name, seqid, n, hit.type, hit.source)
      end
    end
    return hit_text, feature_parent
  end
end
|
@@ -0,0 +1,6 @@
|
|
1
|
+
# Formats a localization annotation as a single GFF 'Localization' row.
class Localization
  # Returns the GFF line for +localization+ as child number +n+ of +parent+.
  # The row spans positions 1..contig.length on +seqid+ and uses the
  # localization text for both the Name and the Note attributes.
  def report(localization, parent, seqid, contig, n)
    attributes = "ID=#{parent}_loc_#{n};Parent=#{parent};Name=#{localization};Note=#{localization};"
    "#{seqid}\tunknown\tLocalization\t1\t#{contig.length}\t.\t+\t.\t#{attributes}"
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'gff_dataset'
|
2
|
+
# Reads a GFF file into a Gff_dataset.
class Gff_parser
  attr_accessor :dataset

  # Parses +file+ (a path) and fills #dataset.
  #
  # Comment lines (starting with '#') and blank lines are skipped, and
  # parsing stops at the first line starting with '>' (embedded sequences).
  # Rows without a 'Parent' attribute are added as master features; the rest
  # are added as child features, without the first column.
  #
  # The file is opened in block form so the handle is closed even when
  # parsing stops early or raises.
  def initialize(file)
    @dataset = Gff_dataset.new
    File.open(file, 'r') do |handle|
      handle.each do |line|
        line.chomp!
        next if line =~ /^#/ || line == '' # skip gff comments and blank lines
        break if line =~ /^>/ # sequences follow; nothing else to parse

        fields = line.split("\t")
        attribs = parse_attribs(fields.last)
        if attribs.key?('Parent')
          # Child feature: it is attached to its parent, so no seqid is passed.
          @dataset.add_feature(fields[1], fields[2], fields[3], fields[4], fields[5], fields[6], fields[7], attribs)
        else
          @dataset.add_master_feature(fields[0], fields[1], fields[2], fields[3], fields[4], fields[5], fields[6], fields[7], attribs)
        end
      end
    end
  end

  # Turns the last GFF column ("tag=value;tag=value;...") into a hash.
  #
  # Only the first '=' splits tag from value, so values may contain '='.
  # A tag without a value maps to nil, and empty fragments (for example the
  # one produced by ';;') are ignored.
  def parse_attribs(attribs)
    attribs_hash = {}
    attribs.split(';').each do |attrib|
      tag, value = attrib.split('=', 2)
      next if tag.nil? || tag.empty?

      attribs_hash[tag] = (value.nil? || value.empty?) ? nil : value
    end
    attribs_hash
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# Formats a SNP as a single GFF 'SNP' row.
class GffSNP
  # Returns the GFF line for +snp+ as child number +n+ of +parent+ on +seqid+.
  #
  # The row starts and ends at snp.position and is named "<ref> -> <var>".
  # Depth, Zygosity, Mapping_qual and Strand_bias are optional: each one is
  # emitted only when the SNP provides a non-nil value. (Strand_bias used to
  # be written as an empty "Strand_bias=;" attribute when missing.)
  def report(snp, parent, seqid, n)
    depth = optional_attrib('Depth', snp.depth)
    zygosity = optional_attrib('Zygosity', snp.zygosity)
    mapping_qual = optional_attrib('Mapping_qual', snp.mapping_qual)
    strand_bias = optional_attrib('Strand_bias', snp.strand_bias)
    "#{seqid}\t#{snp.source}\tSNP\t#{snp.position}\t#{snp.position}\t.\t+\t.\tID=#{parent}_SNP_#{n};Parent=#{parent};Name=#{snp.ref} -> #{snp.var};Note=#{snp.ref} -> #{snp.var};#{depth}#{zygosity}#{mapping_qual}#{strand_bias}"
  end

  private

  # "tag=value;" for a present value; nil (rendered as nothing) otherwise.
  def optional_attrib(tag, value)
    value.nil? ? nil : "#{tag}=#{value};"
  end
end
|
@@ -0,0 +1,191 @@
|
|
1
|
+
require 'hsp'
|
2
|
+
# An alignment hit: a named subject plus the list of HSPs found for it.
class Hit
  attr_accessor :name, :hsps, :s_length, :reversed, :type, :source, :description, :e_value, :q_p_ident, :q_p_conserved

  # * +hit_name+ - name taken from the subject id
  # * +s_length+ - total length of the subject
  # * +q_frame+  - query frame; a value that is not greater than 0 marks the
  #                hit as reversed
  # * +type+     - hit type label
  def initialize(hit_name, s_length, q_frame, type)
    @name = hit_name
    @s_length = s_length
    @hsps = []
    @reversed = !(q_frame > 0)
    @type = type
    @source = nil
    @description = nil
    @e_value = nil
    @q_p_ident = nil
    @q_p_conserved = nil
  end

  # Builds an Hsp from the given values, appends it and returns it.
  def add_hsp(q_beg, q_end, s_beg, s_end, align_len, score, ident, gaps)
    hsp = Hsp.new(q_beg, q_end, s_beg, s_end, align_len, score, ident, gaps)
    @hsps << hsp
    hsp
  end

  # Yields every HSP in stored order.
  def each_hsp
    @hsps.each do |hsp|
      yield hsp
    end
  end

  # Returns the HSP at position +n+, or nil when there is none.
  def hsp_at(n)
    each_hsp_with_index { |hsp, i| return hsp if n == i }
    nil
  end

  # Yields every HSP together with its position.
  def each_hsp_with_index
    @hsps.each_with_index do |hsp, i|
      yield hsp, i
    end
  end

  # Sorts the HSPs in place by their start position on the subject.
  def hsps_sort!
    @hsps.sort! { |e1, e2| e1.s_beg <=> e2.s_beg }
  end

  # Number of HSPs.
  def hsp_count
    @hsps.length
  end

  # First stored HSP, or nil when the hit has none.
  def first_hsp
    @hsps.first
  end

  # Last stored HSP, or nil when the hit has none.
  def last_hsp
    @hsps.last
  end

  # Returns true as soon as two consecutive HSPs are more than 3 positions
  # apart on the query (previous q_end vs. next q_beg); false otherwise.
  #
  # NOTE(review): the original comment says this checks whether the HSPs are
  # contiguous on the query, yet true is returned when a gap is found. The
  # behaviour is kept as is because callers may depend on it - confirm.
  def hsps_correlative?
    @hsps.each_cons(2).any? { |previous, current| (previous.q_end - current.q_beg).abs > 3 }
  end

  # Forwards +add+ to modified_coordenates on every HSP.
  def modified_coordenates(add)
    each_hsp { |hsp| hsp.modified_coordenates(add) }
  end

  # Forwards +contig_length+ to rev_coord on every HSP.
  def rev_coord(contig_length)
    each_hsp { |hsp| hsp.rev_coord(contig_length) }
  end

  # Returns the negative values obtained from overlap_with between each HSP
  # and the one stored just before it.
  def hsp_overlap
    overlap = []
    @hsps.each_cons(2) do |previous, current|
      diference = current.overlap_with(previous)
      overlap << diference if diference < 0
    end
    overlap
  end

  # For hits sharing the same name, returns the (negative) overlap between
  # this hit's first HSP and +last_hit+'s last HSP; 0 in any other case.
  def overlap_with(last_hit)
    overlap = 0
    if name == last_hit.name
      diference = first_hsp.overlap_with(last_hit.last_hsp)
      overlap = diference if diference < 0
    end
    overlap
  end

  # True when any HSP has a length_q below +hsp_length+ (in nt).
  def hsp_minor_than?(hsp_length)
    @hsps.any? { |hsp| hsp.length_q < hsp_length }
  end

  # Removes redundant HSPs: for every ordered pair whose comparison value is
  # at least 0.9, one of the two is dropped.
  #
  # +blast_coor_type+ selects the comparison: 's' uses Hsp#compare, anything
  # else uses Hsp#compare_q ('s' => subject, 'q' => query).
  #
  # Positions are collected first and removed from the highest index down,
  # so removing one HSP never shifts the position of another pending removal.
  def correct_hsps(blast_coor_type)
    return unless hsp_count > 1

    delete_hsps = []
    each_hsp_with_index do |hsp, i|
      each_hsp_with_index do |hsp_second, j|
        next if i == j

        compare = blast_coor_type == 's' ? hsp.compare(hsp_second) : hsp.compare_q(hsp_second)
        next unless compare >= 0.9

        delete_hsps << redundant_position(hsp, i, hsp_second, j)
      end
    end
    delete_hsps.uniq.sort.reverse_each { |position| drop_hsp(position) }
  end

  # Removes the HSP stored at +position+.
  def drop_hsp(position)
    hsps.delete_at(position)
  end

  private

  # Picks which of two redundant HSPs (stored at positions i and j) goes away.
  def redundant_position(hsp, i, hsp_second, j)
    if hsp.score == hsp_second.score
      if hsp.align_len == hsp_second.align_len
        # Exact duplicates: always drop the later one, whichever order the
        # pair is visited in, so that one copy survives.
        [i, j].max
      elsif hsp.align_len < hsp_second.align_len
        j # equal scores: keep the shorter alignment
      else
        i
      end
    elsif hsp.score > hsp_second.score
      j
    else
      i
    end
  end
end
|