gene_assembler 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. data/.gitignore +22 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +29 -0
  5. data/Rakefile +2 -0
  6. data/bin/GeneAssembler +233 -0
  7. data/bin/phytozome_scan +60 -0
  8. data/gene_assembler.gemspec +25 -0
  9. data/lib/gene_assembler.rb +5 -0
  10. data/lib/gene_assembler/blast_type_parser.rb +41 -0
  11. data/lib/gene_assembler/contig.rb +643 -0
  12. data/lib/gene_assembler/dataset.rb +532 -0
  13. data/lib/gene_assembler/exonerate_result.rb +230 -0
  14. data/lib/gene_assembler/gff_contig.rb +67 -0
  15. data/lib/gene_assembler/gff_dataset.rb +152 -0
  16. data/lib/gene_assembler/gff_feature.rb +175 -0
  17. data/lib/gene_assembler/gff_frameshift.rb +6 -0
  18. data/lib/gene_assembler/gff_go.rb +13 -0
  19. data/lib/gene_assembler/gff_hit.rb +53 -0
  20. data/lib/gene_assembler/gff_hsp.rb +6 -0
  21. data/lib/gene_assembler/gff_localization.rb +6 -0
  22. data/lib/gene_assembler/gff_master_feature.rb +5 -0
  23. data/lib/gene_assembler/gff_parser.rb +35 -0
  24. data/lib/gene_assembler/gff_snp.rb +21 -0
  25. data/lib/gene_assembler/gff_stop.rb +6 -0
  26. data/lib/gene_assembler/go.rb +13 -0
  27. data/lib/gene_assembler/hit.rb +191 -0
  28. data/lib/gene_assembler/hsp.rb +100 -0
  29. data/lib/gene_assembler/other_functions.rb +228 -0
  30. data/lib/gene_assembler/parser.rb +25 -0
  31. data/lib/gene_assembler/parser_blast.rb +12 -0
  32. data/lib/gene_assembler/parser_exonerate.rb +16 -0
  33. data/lib/gene_assembler/rebuild.rb +975 -0
  34. data/lib/gene_assembler/report.rb +13 -0
  35. data/lib/gene_assembler/report_gff.rb +30 -0
  36. data/lib/gene_assembler/snp.rb +13 -0
  37. data/lib/gene_assembler/version.rb +3 -0
  38. metadata +149 -0
@@ -0,0 +1,175 @@
1
# One GFF feature (a single row of a GFF file) plus the child features that
# were attached to it. Children are stored in a hash keyed by their ID.
class Feature
  attr_accessor :source, :type, :start, :stop, :score, :strand, :phase, :attribs, :child

  # Counter used to build unique hash keys for children that have no ID.
  # It is a class variable, so it is shared by every Feature (and subclass).
  @@undefined_features = 0

  # source, type, strand: stored as given.
  # start, stop: converted with to_i.
  # score, phase: the GFF placeholder '.' is kept as-is, anything else is
  #               converted with to_f / to_i respectively.
  # attribs: hash of attribute tag => value; it is stored (not copied) and a
  #          'Name' entry is added or sanitised in place.
  def initialize(source, type, start, stop, score, strand, phase, attribs)
    @source = source
    @type = type
    @start = start.to_i
    @stop = stop.to_i
    @score = score == '.' ? '.' : score.to_f
    @strand = strand
    @phase = phase == '.' ? '.' : phase.to_i
    @attribs = attribs
    if attrib('Name').nil?
      # Automatically derive the Name attribute from the ID.
      @attribs['Name'] = attrib('ID')
    else
      # ';' separates attributes in GFF, so it cannot appear inside a value.
      # tr builds a new string, so a frozen or shared input string is safe.
      @attribs['Name'] = @attribs['Name'].tr(';', '-')
    end
    @child = {}
  end

  # Builds a new Feature from the given fields, registers it as a child and
  # returns it. Children without an ID are keyed as "<Parent>_<counter>".
  def add_child(source, type, start, stop, score, strand, phase, attribs)
    child = Feature.new(source, type, start, stop, score, strand, phase, attribs)
    if attribs['ID'].nil?
      # Interpolation (instead of String#+) keeps this working when the
      # Parent attribute is missing as well.
      @child["#{child.attrib('Parent')}_#{@@undefined_features}"] = child
      @@undefined_features += 1
    else
      @child[attribs['ID']] = child
    end
    child
  end

  # Registers an already built feature as a child under the given id.
  def transfer_child(id, child)
    @child[id] = child
  end

  # Yields every direct child feature.
  def each_child(&block)
    @child.each_value(&block)
  end

  # Returns the value of the attribute +tag+, or nil when it is not set.
  def attrib(tag)
    @attribs[tag]
  end

  # Yields every attribute as (tag, value).
  def each_tag_attrib
    @attribs.each do |tag, value|
      yield tag, value
    end
  end

  # Prints the feature ID followed by each child key and that child's own
  # number of children.
  def inspects
    print "#{attrib('ID')} ---> "
    @child.each do |key, child|
      print "#{key} (#{child.count})\t"
    end
    print "\n"
  end

  # Number of direct children.
  def count
    @child.count
  end

  # Returns [start, stop] pairs for every direct child whose type is 'CDS'.
  def cds
    @child.values.select { |child| child.type == 'CDS' }.map { |child| [child.start, child.stop] }
  end

  # Prints this feature and, recursively, its children; one tab per level.
  def tree(level = 0)
    puts "\t" * level + "#{attrib('ID')}\t\t#{@type}\t#{@source}"
    each_child { |child| child.tree(level + 1) }
  end

  # True when this feature's type equals +type+ or any element of it
  # (+type+ may be a single value or an array of values).
  def is_type?(type)
    [type].flatten.include?(@type)
  end

  # True when this feature's source equals +source+ or any element of it
  # (+source+ may be a single value or an array of values).
  def is_source?(source)
    [source].flatten.include?(@source)
  end

  # Overlap of +feature+ with self, relative to self's length (0 to 1).
  #   * feature lies inside self  -> feature.length / self.length
  #   * self lies inside feature  -> 1
  #   * feature starts inside self -> (self.stop - feature.start) / self.length
  #   * feature ends inside self   -> (feature.stop - self.start) / self.length
  #   * otherwise                  -> 0
  # NOTE(review): the two partial-overlap branches use a plain difference
  # while #length is inclusive (stop - start + 1), and an overlap of exactly
  # one base at a boundary yields 0. This looks like an off-by-one, but it is
  # left unchanged because callers may have thresholds tuned to these values.
  def compare(feature)
    overlap = 0
    if feature.start >= start && feature.start <= stop && feature.stop >= start && feature.stop <= stop
      overlap = feature.length * 1.00 / length
    elsif start >= feature.start && start <= feature.stop && stop >= feature.start && stop <= feature.stop
      overlap = 1
    elsif feature.start > start && feature.start < stop
      overlap = (stop - feature.start) * 1.00 / length
    elsif feature.stop > start && feature.stop < stop
      overlap = (feature.stop - start) * 1.00 / length
    end
    overlap
  end

  # Inclusive length of the feature (stop - start + 1).
  def length
    stop - (start - 1)
  end

  # Renames ID and Name to "<ID>_<type>" (only when they are set), rewrites
  # Parent to +parent+ when both are present, then recurses into the children
  # passing the new ID as their parent.
  # NOTE(review): keys of the child hash are not renamed by this method.
  def change_to_type_id_recursive(parent = false)
    new_parent = "#{attrib('ID')}_#{@type}"
    @attribs['ID'] = new_parent unless attrib('ID').nil?
    @attribs['Name'] = new_parent unless attrib('Name').nil?
    @attribs['Parent'] = parent if !attrib('Parent').nil? && parent

    each_child { |child| child.change_to_type_id_recursive(new_parent) }
  end

  # Writes this feature as one GFF line to +file+ (anything responding to
  # print/puts) using +id+ as the first column, followed by all children.
  def write(file, id)
    file.print "#{id}\t#{@source}\t#{@type}\t#{@start}\t#{@stop}\t#{@score}\t#{@strand}\t#{@phase}\t"
    each_tag_attrib { |tag, value| file.print "#{tag}=#{value};" }
    file.puts # Print \n
    each_child { |child| child.write(file, id) }
  end

  # Adds every [tag, value] pair whose value is not nil to the attributes.
  def add_attribs(array_attribs)
    array_attribs.each do |pair|
      @attribs[pair[0]] = pair[1] unless pair[1].nil?
    end
  end
end
@@ -0,0 +1,6 @@
1
# Formats a frameshift as a GFF line.
class GffFrameshift
  # fs:     position of the frameshift; used as both start and end column.
  # parent: value written to the Parent attribute.
  # seqid:  value written to the first (sequence id) column.
  # Returns the tab-separated GFF line (no trailing newline). Source is
  # always 'unknown' and strand is always '+'.
  def report(fs, parent, seqid)
    columns = [seqid, 'unknown', 'frameshift', fs, fs, '.', '+', '.', "Parent=#{parent}"]
    columns.map(&:to_s).join("\t")
  end
end
@@ -0,0 +1,13 @@
1
# Formats a GO annotation as a GFF 'Ontology' line.
class GffGo
  # go:     object exposing source, obsolete, beg, end, name and code.
  # parent: value of the Parent attribute; also prefixes the ID.
  # seqid:  value written to the first column.
  # Side effect: a nil go.source is replaced by 'Unknown' on the object.
  # Returns the tab-separated GFF line; "Obsolete=True" is appended after the
  # last ';' only when go.obsolete is truthy.
  def report(go, parent, seqid)
    go.source = 'Unknown' if go.source.nil?
    obsolete_tag = go.obsolete ? 'Obsolete=True' : nil

    fixed_columns = "#{seqid}\t#{go.source}\tOntology\t#{go.beg}\t#{go.end}\t.\t+\t.\t"
    identity = "ID=#{parent}_#{go.name}#{go.code};Parent=#{parent};"
    labels = "Name=#{go.name}_#{go.code};Note=#{go.name}_#{go.code};"
    "#{fixed_columns}#{identity}#{labels}#{obsolete_tag}"
  end
end
@@ -0,0 +1,53 @@
1
+ require 'gff_hsp'
2
+
3
# Formats a hit (and, when it has several, its HSPs) as GFF lines.
class GffHit
  # hit:       object exposing name, type, source, reversed, q_p_ident,
  #            q_p_conserved, description, first_hsp, last_hsp, hsp_count and
  #            each_hsp_with_index.
  # parent:    when not nil the hit is written as a child of +parent+ and is
  #            identified by hit.name; when nil it is identified by +id+.
  # seqid:     value of the first GFF column.
  # id:        identifier returned as the parent for subsequent features.
  # name_mode: kept for interface compatibility. The original code derived a
  #            value from it that was overwritten in every branch, so it never
  #            influenced the result; that dead computation was removed.
  # seq:       optional sequence, written as a seq= attribute when not nil.
  #
  # Side effect: a nil hit.source is replaced by 'Unknown' on the object.
  # Returns [array_of_gff_lines, feature_parent], where feature_parent is
  # always +id+.
  def report(hit, parent, seqid, id, name_mode, seq)
    if parent.nil?
      # No parent: the hit becomes a top-level entry named after +id+.
      parent_tag = nil
      name = id
    else
      # With a parent the hit behaves like a protein / nucleotide sequence
      # structure hanging from it.
      parent_tag = "Parent=#{parent};"
      name = hit.name
    end
    feature_parent = id

    hit_seq = seq.nil? ? nil : "seq=#{seq};"
    strand = hit.reversed ? '-' : '+'
    ident = hit.q_p_ident.nil? ? nil : "Perc_Qidentities=#{hit.q_p_ident};"
    conserved = hit.q_p_conserved.nil? ? nil : "Perc_Qconserved=#{hit.q_p_conserved};"
    # ';' would terminate the attribute, so it is replaced inside the note.
    description = hit.description.nil? ? nil : "Note=#{hit.description.gsub(';', '_')};"
    hit.source = 'Unknown' if hit.source.nil?

    hit_text = []
    hit_text << "#{seqid}\t#{hit.source}\t#{hit.type}\t#{hit.first_hsp.q_beg}\t#{hit.last_hsp.q_end}\t.\t#{strand}\t.\tID=#{name};#{parent_tag}Name=#{name};#{hit_seq}#{description}#{ident}#{conserved}"

    # Only expand the individual HSPs when the hit has more than one.
    if hit.hsp_count > 1
      gff_hsp = GffHsp.new
      hit.each_hsp_with_index do |hsp, n|
        hit_text << gff_hsp.report(hsp, name, seqid, n, hit.type, hit.source)
      end
    end
    return hit_text, feature_parent
  end
end
@@ -0,0 +1,6 @@
1
# Formats a single HSP as a GFF line that hangs from its hit.
class GffHsp
  # hsp:      object exposing type, q_beg and q_end.
  # parent:   value of the Parent attribute.
  # seqid:    value of the first column.
  # n:        index of the HSP, used to build the name "Hsp_<n>".
  # hit_type: accepted but not used by this method.
  # source:   value of the source column.
  # Returns the tab-separated GFF line; the strand column is always '+'.
  def report(hsp, parent, seqid, n, hit_type, source)
    columns = [
      seqid, source, hsp.type, hsp.q_beg, hsp.q_end,
      '.', '+', '.',
      "Parent=#{parent};Name=Hsp_#{n}"
    ]
    columns.map(&:to_s).join("\t")
  end
end
@@ -0,0 +1,6 @@
1
# Formats a localization annotation as a GFF line spanning a whole contig.
class Localization
  # localization: text written to both the Name and Note attributes.
  # parent:       value of the Parent attribute; also prefixes the ID.
  # seqid:        value of the first column.
  # contig:       object whose #length is used as the end coordinate.
  # n:            index used to build the ID "<parent>_loc_<n>".
  # Returns the tab-separated GFF line; start is always 1, source 'unknown'.
  def report(localization, parent, seqid, contig, n)
    coordinates = "1\t#{contig.length}"
    attributes = "ID=#{parent}_loc_#{n};Parent=#{parent};Name=#{localization};Note=#{localization};"
    "#{seqid}\tunknown\tLocalization\t#{coordinates}\t.\t+\t.\t#{attributes}"
  end
end
@@ -0,0 +1,5 @@
1
+ require 'gff_feature'
2
# Feature subclass that adds no behaviour of its own; it only gives a
# distinct class name to features created through it.
class Master_feature < Feature; end
@@ -0,0 +1,35 @@
1
+ require 'gff_dataset'
2
# Reads a GFF file and loads its features into a Gff_dataset.
class Gff_parser
  attr_accessor :dataset

  # file: path of the GFF file to read.
  # Rows without a Parent attribute are sent to Gff_dataset#add_master_feature
  # (including the sequence id column); rows with a Parent are sent to
  # Gff_dataset#add_feature (without the sequence id column).
  # The parsed data is available through #dataset.
  def initialize(file)
    @dataset = Gff_dataset.new
    # File.foreach closes the file when the block finishes or breaks; the
    # previous File.open(...).each left the handle open.
    File.foreach(file) do |line|
      line.chomp!
      next if line.empty? || line.start_with?('#') # Skip gff comments and blank lines
      break if line.start_with?('>')               # Stop at the embedded sequences

      fields = line.split("\t")
      attribs = parse_attribs(fields.last)
      if attribs.key?('Parent')
        # Feature is a child, so it is put on its parent.
        @dataset.add_feature(fields[1], fields[2], fields[3], fields[4], fields[5], fields[6], fields[7], attribs)
      else
        @dataset.add_master_feature(fields[0], fields[1], fields[2], fields[3], fields[4], fields[5], fields[6], fields[7], attribs)
      end
    end
  end

  # attribs: string holding the last column of a GFF row ("tag=value;...").
  # Returns a hash of tag => value. Only the first '=' separates tag from
  # value, so values that contain '=' are kept whole. A tag with no value or
  # an empty value maps to nil.
  def parse_attribs(attribs)
    attribs_hash = {}
    attribs.split(';').each do |attrib|
      tag, value = attrib.split('=', 2)
      value = nil if value && value.empty?
      attribs_hash[tag] = value
    end
    attribs_hash
  end
end
@@ -0,0 +1,21 @@
1
# Formats a SNP as a GFF line.
class GffSNP
  # snp:    object exposing source, position, ref, var, depth, zygosity,
  #         mapping_qual and strand_bias.
  # parent: value of the Parent attribute; also prefixes the ID.
  # seqid:  value of the first column.
  # n:      index used to build the ID "<parent>_SNP_<n>".
  # Returns the tab-separated GFF line. Depth, Zygosity, Mapping_qual and
  # Strand_bias are written only when the corresponding value is not nil
  # (previously a nil strand bias still produced an empty "Strand_bias=;").
  def report(snp, parent, seqid, n)
    depth = optional_tag('Depth', snp.depth)
    zygosity = optional_tag('Zygosity', snp.zygosity)
    mapping_qual = optional_tag('Mapping_qual', snp.mapping_qual)
    strand_bias = optional_tag('Strand_bias', snp.strand_bias)
    "#{seqid}\t#{snp.source}\tSNP\t#{snp.position}\t#{snp.position}\t.\t+\t.\tID=#{parent}_SNP_#{n};Parent=#{parent};Name=#{snp.ref} -> #{snp.var};Note=#{snp.ref} -> #{snp.var};#{depth}#{zygosity}#{mapping_qual}#{strand_bias}"
  end

  private

  # Returns "tag=value;" or nil when the value is nil.
  def optional_tag(tag, value)
    value.nil? ? nil : "#{tag}=#{value};"
  end
end
@@ -0,0 +1,6 @@
1
# Formats a gained stop codon as a GFF line.
class GffStop
  # stop:   start position of the stop; the end column is stop + 2.
  # parent: value of the Parent attribute.
  # seqid:  value of the first column.
  # Returns the tab-separated GFF line; source is 'unknown', strand is '+'.
  def report(stop, parent, seqid)
    last_base = stop + 2
    columns = [seqid, 'unknown', 'stop_gained', stop, last_base, '.', '+', '.', "Parent=#{parent}"]
    columns.map(&:to_s).join("\t")
  end
end
@@ -0,0 +1,13 @@
1
# Plain data holder for a GO term attached to a sequence.
class GO
  attr_accessor :code, :ontology, :beg, :end, :source, :name, :obsolete

  # code, name, obsolete: stored as given.
  # beg, end and source start as nil and are meant to be filled in through
  # their accessors; ontology is not assigned here.
  def initialize(code, name, obsolete)
    @beg, @end, @source = nil, nil, nil
    @code, @name, @obsolete = code, name, obsolete
  end
end
12
+
13
+
@@ -0,0 +1,191 @@
1
+ require 'hsp'
2
# A hit against one subject, made of one or more HSPs.
class Hit
  attr_accessor :name, :hsps, :s_length, :reversed, :type, :source, :description, :e_value, :q_p_ident, :q_p_conserved

  # hit_name: name of the hit (taken from the subject id).
  # s_length: total length of the subject.
  # q_frame:  query frame; the hit is flagged as reversed unless it is > 0.
  # type:     stored as given.
  # source, description, e_value, q_p_ident and q_p_conserved start as nil.
  def initialize(hit_name, s_length, q_frame, type)
    @name = hit_name
    @s_length = s_length
    @hsps = []
    @reversed = !(q_frame > 0)
    @type = type
    @source = nil
    @description = nil
    @e_value = nil
    @q_p_ident = nil
    @q_p_conserved = nil
  end

  # Builds an Hsp from the given values, appends it to the hit and returns it.
  def add_hsp(q_beg, q_end, s_beg, s_end, align_len, score, ident, gaps)
    hsp = Hsp.new(q_beg, q_end, s_beg, s_end, align_len, score, ident, gaps)
    @hsps << hsp
    hsp
  end

  # Yields every HSP in stored order.
  def each_hsp
    @hsps.each do |hsp|
      yield hsp
    end
  end

  # Returns the HSP stored at index +n+, or nil when there is none.
  # Negative indexes return nil (they never match an iteration index).
  def hsp_at(n)
    each_hsp_with_index { |hsp, i| return hsp if n == i }
    nil
  end

  # Yields every HSP together with its index.
  def each_hsp_with_index
    @hsps.each_with_index do |hsp, i|
      yield hsp, i
    end
  end

  # Sorts the HSPs in place by their start position on the subject.
  def hsps_sort!
    @hsps.sort! { |e1, e2| e1.s_beg <=> e2.s_beg }
  end

  # Number of HSPs in the hit.
  def hsp_count
    @hsps.length
  end

  # First stored HSP, or nil when the hit has none.
  def first_hsp
    @hsps.first
  end

  # Last stored HSP, or nil when the hit has none.
  def last_hsp
    @hsps.last
  end

  # Returns true when some HSP starts more than 3 positions away (on the
  # query) from the end of the HSP stored before it; false otherwise,
  # including for hits with fewer than two HSPs.
  # NOTE(review): given the method name one would expect true to mean
  # "contiguous", which is the opposite of what is computed. The behaviour is
  # preserved because callers outside this file may rely on it; confirm.
  def hsps_correlative?
    @hsps.each_cons(2).any? { |previous, current| (previous.q_end - current.q_beg).abs > 3 }
  end

  # Forwards +add+ to Hsp#modified_coordenates on every HSP.
  def modified_coordenates(add)
    each_hsp { |hsp| hsp.modified_coordenates(add) }
  end

  # Forwards +contig_length+ to Hsp#rev_coord on every HSP.
  def rev_coord(contig_length)
    each_hsp { |hsp| hsp.rev_coord(contig_length) }
  end

  # Returns the negative results of comparing each HSP with the one stored
  # before it through Hsp#overlap_with, in stored order.
  def hsp_overlap
    differences = @hsps.each_cons(2).map { |previous, current| current.overlap_with(previous) }
    differences.select { |difference| difference < 0 }
  end

  # Compares the first HSP of this hit with the last HSP of +last_hit+ when
  # both hits share the same name. Returns the (negative) result of
  # Hsp#overlap_with, or 0 when names differ or the result is not negative.
  def overlap_with(last_hit)
    return 0 unless name == last_hit.name

    difference = first_hsp.overlap_with(last_hit.last_hsp)
    difference < 0 ? difference : 0
  end

  # True when any HSP has length_q below +hsp_length+.
  def hsp_minor_than?(hsp_length)
    @hsps.any? { |hsp| hsp.length_q < hsp_length }
  end

  # Removes redundant HSPs. Two HSPs are redundant when comparing them gives
  # >= 0.9 (Hsp#compare when +blast_coor_type+ is 's', Hsp#compare_q
  # otherwise); of each such pair, one is dropped:
  #   * different scores          -> the one with the lower score,
  #   * same score                -> the one with the larger align_len,
  #   * same score and align_len  -> the one stored later.
  # Every ordered pair is evaluated, so the last rule must pick the same HSP
  # from both directions; previously both duplicates were deleted.
  # Indexes are removed from highest to lowest so earlier deletions cannot
  # shift the position of HSPs that still have to be removed.
  def correct_hsps(blast_coor_type)
    return if hsp_count <= 1

    delete_hsps = []
    each_hsp_with_index do |hsp, i|
      each_hsp_with_index do |hsp_second, j|
        next if i == j

        compare = blast_coor_type == 's' ? hsp.compare(hsp_second) : hsp.compare_q(hsp_second)
        delete_hsps << redundant_index(hsp, i, hsp_second, j) if compare >= 0.9
      end
    end
    delete_hsps.uniq.sort.reverse_each { |position| drop_hsp(position) }
  end

  # Deletes the HSP stored at index +position+.
  def drop_hsp(position)
    hsps.delete_at(position)
  end

  private

  # Chooses which index of a redundant pair must be deleted (see correct_hsps).
  def redundant_index(hsp, i, hsp_second, j)
    if hsp.score == hsp_second.score
      return [i, j].max if hsp.align_len == hsp_second.align_len

      hsp.align_len < hsp_second.align_len ? j : i
    elsif hsp.score > hsp_second.score
      j
    else
      i
    end
  end
end