gene_assembler 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. data/.gitignore +22 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +29 -0
  5. data/Rakefile +2 -0
  6. data/bin/GeneAssembler +233 -0
  7. data/bin/phytozome_scan +60 -0
  8. data/gene_assembler.gemspec +25 -0
  9. data/lib/gene_assembler.rb +5 -0
  10. data/lib/gene_assembler/blast_type_parser.rb +41 -0
  11. data/lib/gene_assembler/contig.rb +643 -0
  12. data/lib/gene_assembler/dataset.rb +532 -0
  13. data/lib/gene_assembler/exonerate_result.rb +230 -0
  14. data/lib/gene_assembler/gff_contig.rb +67 -0
  15. data/lib/gene_assembler/gff_dataset.rb +152 -0
  16. data/lib/gene_assembler/gff_feature.rb +175 -0
  17. data/lib/gene_assembler/gff_frameshift.rb +6 -0
  18. data/lib/gene_assembler/gff_go.rb +13 -0
  19. data/lib/gene_assembler/gff_hit.rb +53 -0
  20. data/lib/gene_assembler/gff_hsp.rb +6 -0
  21. data/lib/gene_assembler/gff_localization.rb +6 -0
  22. data/lib/gene_assembler/gff_master_feature.rb +5 -0
  23. data/lib/gene_assembler/gff_parser.rb +35 -0
  24. data/lib/gene_assembler/gff_snp.rb +21 -0
  25. data/lib/gene_assembler/gff_stop.rb +6 -0
  26. data/lib/gene_assembler/go.rb +13 -0
  27. data/lib/gene_assembler/hit.rb +191 -0
  28. data/lib/gene_assembler/hsp.rb +100 -0
  29. data/lib/gene_assembler/other_functions.rb +228 -0
  30. data/lib/gene_assembler/parser.rb +25 -0
  31. data/lib/gene_assembler/parser_blast.rb +12 -0
  32. data/lib/gene_assembler/parser_exonerate.rb +16 -0
  33. data/lib/gene_assembler/rebuild.rb +975 -0
  34. data/lib/gene_assembler/report.rb +13 -0
  35. data/lib/gene_assembler/report_gff.rb +30 -0
  36. data/lib/gene_assembler/snp.rb +13 -0
  37. data/lib/gene_assembler/version.rb +3 -0
  38. metadata +149 -0
@@ -0,0 +1,175 @@
1
# One GFF feature (source, type, coordinates, score, strand, phase and
# attributes) plus the child features nested under it.
#
# Children are stored in the +child+ hash, keyed by their 'ID' attribute or,
# when a child has no 'ID', by "<Parent>_<counter>".
class Feature
  attr_accessor :source, :type, :start, :stop, :score, :strand, :phase, :attribs, :child

  # Counter used to build unique keys for children without an 'ID' attribute.
  # Deliberately a class variable: it is shared by Feature and its subclasses.
  @@undefined_features = 0

  # start/stop are coerced with to_i; score and phase keep the GFF placeholder
  # '.' untouched and are otherwise coerced with to_f / to_i.
  # attribs is the attribute hash (tag => value); it is stored as given and
  # gets a 'Name' entry added or normalised.
  def initialize(source, type, start, stop, score, strand, phase, attribs)
    @source = source
    @type = type
    @start = start.to_i
    @stop = stop.to_i
    @score = score == '.' ? '.' : score.to_f
    @strand = strand
    @phase = phase == '.' ? '.' : phase.to_i
    @attribs = attribs
    if attrib('Name').nil?
      # Automatically derive the Name attribute from the ID.
      @attribs['Name'] = attrib('ID')
    else
      # ';' separates attributes when the feature is written, so it cannot
      # appear inside a value. A non-mutating call is used so frozen strings
      # work and the caller's string object is left alone.
      @attribs['Name'] = @attribs['Name'].tr(';', '-')
    end
    @child = {}
  end

  # Builds a new Feature from the arguments, stores it as a child and returns it.
  def add_child(source, type, start, stop, score, strand, phase, attribs)
    child = Feature.new(source, type, start, stop, score, strand, phase, attribs)
    key = attribs['ID']
    if key.nil?
      # Interpolation (instead of String#+) keeps this working when the child
      # has no 'Parent' attribute either.
      key = "#{child.attrib('Parent')}_#{@@undefined_features}"
      @@undefined_features += 1
    end
    @child[key] = child
    child
  end

  # Stores an already built child under the given key.
  def transfer_child(id, child)
    @child[id] = child
  end

  # Yields every direct child.
  def each_child
    @child.each_value do |child|
      yield child
    end
  end

  # Returns the value of attribute +tag+, or nil when it is absent.
  def attrib(tag)
    @attribs[tag]
  end

  # Yields every (tag, value) attribute pair.
  def each_tag_attrib
    @attribs.each do |tag, value|
      yield tag, value
    end
  end

  # Prints this feature's ID followed by each child key and that child's own
  # child count.
  def inspects
    print "#{attrib('ID')} ---> "
    @child.each do |key, child|
      print "#{key} (#{child.count})\t"
    end
    print "\n"
  end

  # Number of direct children.
  def count
    @child.count
  end

  # Returns [start, stop] pairs for every direct child whose type is 'CDS'.
  def cds
    @child.each_value.select { |child| child.type == 'CDS' }
          .map { |child| [child.start, child.stop] }
  end

  # Prints the feature tree, one feature per line, indented one tab per level.
  def tree(level = 0)
    puts "#{"\t" * level}#{attrib('ID')}\t\t#{@type}\t#{@source}"
    each_child { |child| child.tree(level + 1) }
  end

  # True when this feature's type equals +type+, or any element of +type+
  # when an array is given.
  def is_type?(type)
    [type].flatten.any? { |candidate| candidate == @type }
  end

  # True when this feature's source equals +source+, or any element of
  # +source+ when an array is given.
  def is_source?(source)
    [source].flatten.any? { |candidate| candidate == @source }
  end

  # Fraction of this feature covered by +feature+; the result ranges from 0 to 1.
  #
  # NOTE(review): the two partial-overlap branches use stop - start without
  # the +1 that #length applies, and the strict comparisons ignore a one-base
  # overlap at the boundary. The arithmetic is kept exactly as it was because
  # callers may depend on the current values - confirm before changing.
  def compare(feature)
    if feature.start >= start && feature.start <= stop && feature.stop >= start && feature.stop <= stop
      # +feature+ lies completely inside this feature.
      feature.length * 1.00 / length
    elsif start >= feature.start && start <= feature.stop && stop >= feature.start && stop <= feature.stop
      # This feature lies completely inside +feature+.
      1
    elsif feature.start > start && feature.start < stop
      (stop - feature.start) * 1.00 / length
    elsif feature.stop > start && feature.stop < stop
      (feature.stop - start) * 1.00 / length
    else
      0
    end
  end

  # Length in positions, both ends included.
  def length
    stop - (start - 1)
  end

  # Rewrites ID and Name as "<old ID>_<type>" (only where they are already
  # set) and, when +parent+ is given, points an existing Parent attribute at
  # it. Children are processed recursively with the new ID as their parent.
  # The keys of the +child+ hash are not updated.
  def change_to_type_id_recursive(parent = false)
    new_parent = "#{attrib('ID')}_#{@type}"
    @attribs['ID'] = new_parent unless attrib('ID').nil?
    @attribs['Name'] = new_parent unless attrib('Name').nil?
    @attribs['Parent'] = parent if parent && !attrib('Parent').nil?
    each_child { |child| child.change_to_type_id_recursive(new_parent) }
  end

  # Writes this feature and, recursively, its children to +file+ as
  # tab-separated GFF lines whose first column is +id+.
  def write(file, id)
    file.print "#{id}\t#{@source}\t#{@type}\t#{@start}\t#{@stop}\t#{@score}\t#{@strand}\t#{@phase}\t"
    each_tag_attrib { |tag, value| file.print "#{tag}=#{value};" }
    file.puts # Terminates the line
    each_child { |child| child.write(file, id) }
  end

  # Sets attributes from a list of [tag, value] pairs, skipping nil values.
  def add_attribs(array_attribs)
    array_attribs.each do |pair|
      @attribs[pair[0]] = pair[1] unless pair[1].nil?
    end
  end
end
@@ -0,0 +1,6 @@
1
# Formats a frameshift as a single GFF line.
class GffFrameshift
  # fs is used as both start and end coordinate, parent fills the Parent
  # attribute and seqid is the first column. Returns the line without a
  # trailing newline.
  def report(fs, parent, seqid)
    columns = "#{seqid}\tunknown\tframeshift\t#{fs}\t#{fs}\t.\t+\t."
    "#{columns}\tParent=#{parent}"
  end
end
@@ -0,0 +1,13 @@
1
# Formats a GO term as a single GFF 'Ontology' line.
class GffGo
  # go must respond to source, source=, obsolete, beg, end, name and code.
  # A nil source is replaced by 'Unknown' on the go object itself.
  # Returns the line without a trailing newline; 'Obsolete=True' is appended
  # only when go.obsolete is truthy.
  def report(go, parent, seqid)
    go.source = 'Unknown' if go.source.nil?
    obsolete_tag = go.obsolete ? "Obsolete=True" : nil
    label = "#{go.name}_#{go.code}"
    attributes = "ID=#{parent}_#{go.name}#{go.code};Parent=#{parent};Name=#{label};Note=#{label};#{obsolete_tag}"
    "#{seqid}\t#{go.source}\tOntology\t#{go.beg}\t#{go.end}\t.\t+\t.\t#{attributes}"
  end
end
@@ -0,0 +1,53 @@
1
+ require 'gff_hsp'
2
+
3
# Formats a hit, and its HSPs when it has more than one, as GFF lines.
class GffHit
  # hit       - object responding to name, type, reversed, q_p_ident,
  #             q_p_conserved, description, source, source=, first_hsp,
  #             last_hsp, hsp_count and each_hsp_with_index.
  # parent    - value for the Parent attribute, or nil.
  # seqid     - first GFF column.
  # id        - identifier used for the line when there is no parent.
  # name_mode - accepted for interface compatibility; it has no effect on the
  #             output (see the note below).
  # seq       - optional sequence, written as a 'seq' attribute.
  #
  # Returns a two-element array: the list of GFF lines (hit line first, then
  # one line per HSP when the hit has several) and the parent identifier to
  # be used by the features written afterwards.
  #
  # A nil hit.source is replaced by 'Unknown' on the hit object itself.
  def report(hit, parent, seqid, id, name_mode, seq)
    if parent.nil?
      # Without a parent the hit becomes a contig that is a child of another,
      # already written contig, and is identified by +id+.
      parent_tag = nil
      name = id
    else
      # With a parent the hit behaves as a protein-like or nucleotide-sequence
      # structure and keeps its own name.
      parent_tag = "Parent=#{parent};"
      name = hit.name
    end

    # NOTE(review): the previous code first derived this value from hit.name
    # (plus "_<type>" when name_mode == 'l') and then overwrote it with +id+
    # in both branches. That dead computation was dropped; it could only
    # raise on a nil hit.name or hit.type. Confirm name_mode is meant to be
    # ignored.
    feature_parent = id

    hit_seq = seq.nil? ? nil : "seq=#{seq};"
    strand = hit.reversed ? '-' : '+'
    ident = hit.q_p_ident.nil? ? nil : "Perc_Qidentities=#{hit.q_p_ident};"
    conserved = hit.q_p_conserved.nil? ? nil : "Perc_Qconserved=#{hit.q_p_conserved};"
    # ';' separates attributes, so it is replaced inside the description.
    description = hit.description.nil? ? nil : "Note=#{hit.description.gsub(';', '_')};"
    hit.source = 'Unknown' if hit.source.nil?

    hit_text = []
    hit_text << "#{seqid}\t#{hit.source}\t#{hit.type}\t#{hit.first_hsp.q_beg}\t#{hit.last_hsp.q_end}\t.\t#{strand}\t.\tID=#{name};#{parent_tag}Name=#{name};#{hit_seq}#{description}#{ident}#{conserved}"

    # Expand the individual HSPs only when the hit has several partial matches.
    if hit.hsp_count > 1
      gff_hsp = GffHsp.new
      hit.each_hsp_with_index do |hsp, n|
        hit_text << gff_hsp.report(hsp, name, seqid, n, hit.type, hit.source)
      end
    end

    return hit_text, feature_parent
  end
end
@@ -0,0 +1,6 @@
1
# Formats a single HSP as a GFF line.
class GffHsp
  # hsp must respond to type, q_beg and q_end. n is the HSP index used in the
  # Name attribute. hit_type is accepted but does not appear in the output.
  # Returns the line without a trailing newline.
  def report(hsp, parent, seqid, n, hit_type, source)
    attributes = "Parent=#{parent};Name=Hsp_#{n}"
    "#{seqid}\t#{source}\t#{hsp.type}\t#{hsp.q_beg}\t#{hsp.q_end}\t.\t+\t.\t#{attributes}"
  end
end
@@ -0,0 +1,6 @@
1
# Formats a localization annotation as a single GFF 'Localization' line.
class Localization
  # The line spans positions 1..contig.length. localization is written as
  # both the Name and the Note attribute; n makes the ID unique under parent.
  # Returns the line without a trailing newline.
  def report(localization, parent, seqid, contig, n)
    attributes = "ID=#{parent}_loc_#{n};Parent=#{parent};Name=#{localization};Note=#{localization};"
    "#{seqid}\tunknown\tLocalization\t1\t#{contig.length}\t.\t+\t.\t#{attributes}"
  end
end
@@ -0,0 +1,5 @@
1
+ require 'gff_feature'
2
# Feature subclass that adds no behaviour of its own; it only provides a
# distinct class name.
# NOTE(review): presumably used for top-level features (those without a
# Parent attribute) - confirm against Gff_dataset.
class Master_feature < Feature
end
@@ -0,0 +1,35 @@
1
+ require 'gff_dataset'
2
# Reads a GFF file into a Gff_dataset.
class Gff_parser
  attr_accessor :dataset

  # Parses +file+ (a path) line by line. Comment lines (starting with '#')
  # and empty lines are skipped, and parsing stops at the first line starting
  # with '>' (embedded sequences). Features without a 'Parent' attribute are
  # added as master features, the rest as child features.
  def initialize(file)
    @dataset = Gff_dataset.new
    # Block form so the file handle is closed even when parsing stops early.
    File.open(file, 'r') do |io|
      io.each_line do |raw_line|
        line = raw_line.chomp
        next if line =~ /^#/ || line == '' # Skip gff comments and blank lines
        break if line =~ /^>/ # Sequences follow; nothing else to parse

        fields = line.split("\t")
        attribs = parse_attribs(fields.last)
        if attribs.key?('Parent')
          # The feature is a child, so it is attached to its parent; the
          # sequence id column is not passed along.
          @dataset.add_feature(fields[1], fields[2], fields[3], fields[4], fields[5], fields[6], fields[7], attribs)
        else
          @dataset.add_master_feature(fields[0], fields[1], fields[2], fields[3], fields[4], fields[5], fields[6], fields[7], attribs)
        end
      end
    end
  end

  # Turns the last GFF column ("tag=value;tag=value") into a hash.
  # Only the first '=' of each pair separates tag from value, so values may
  # themselves contain '='. A tag with no value maps to nil.
  def parse_attribs(attribs)
    attribs.split(';').each_with_object({}) do |pair, parsed|
      tag, value = pair.split('=', 2)
      value = nil if value == '' # "tag=" keeps mapping to nil, as before
      parsed[tag] = value
    end
  end
end
@@ -0,0 +1,21 @@
1
# Formats a SNP as a single GFF 'SNP' line.
class GffSNP
  # snp must respond to source, position, ref, var, depth, zygosity,
  # mapping_qual and strand_bias. n makes the ID unique under parent.
  # Each optional attribute (Depth, Zygosity, Mapping_qual, Strand_bias) is
  # written only when its value is not nil.
  # Returns the line without a trailing newline.
  def report(snp, parent, seqid, n)
    optional = [
      ['Depth', snp.depth],
      ['Zygosity', snp.zygosity],
      ['Mapping_qual', snp.mapping_qual],
      ['Strand_bias', snp.strand_bias]
    ]
    # A nil strand_bias used to be written as an empty "Strand_bias=;"
    # attribute; it is now omitted like the other optional values.
    extras = optional.reject { |_tag, value| value.nil? }
                     .map { |tag, value| "#{tag}=#{value};" }
                     .join
    change = "#{snp.ref} -> #{snp.var}"
    "#{seqid}\t#{snp.source}\tSNP\t#{snp.position}\t#{snp.position}\t.\t+\t.\tID=#{parent}_SNP_#{n};Parent=#{parent};Name=#{change};Note=#{change};#{extras}"
  end
end
@@ -0,0 +1,6 @@
1
# Formats a gained stop codon as a single GFF 'stop_gained' line.
class GffStop
  # stop is the first position; the line spans stop..stop+2.
  # Returns the line without a trailing newline.
  def report(stop, parent, seqid)
    last_position = stop + 2
    "#{seqid}\tunknown\tstop_gained\t#{stop}\t#{last_position}\t.\t+\t.\tParent=#{parent}"
  end
end
@@ -0,0 +1,13 @@
1
# Plain record for a GO term: code, name and obsolete flag are given at
# construction; beg, end and source start as nil and are filled in later
# through the accessors. ontology is exposed but not set by the constructor.
class GO
  attr_accessor :code, :ontology, :beg, :end, :source, :name, :obsolete

  def initialize(code, name, obsolete)
    @beg = @end = @source = nil
    @code, @name, @obsolete = code, name, obsolete
  end
end
12
+
13
+
@@ -0,0 +1,191 @@
1
+ require 'hsp'
2
# A hit against one subject sequence, holding the list of its HSPs.
#
# HSP objects are expected to respond to q_beg, q_end, s_beg, score,
# align_len, length_q, compare, compare_q, overlap_with,
# modified_coordenates and rev_coord (the methods called below).
class Hit
  attr_accessor :name, :hsps, :s_length, :reversed, :type, :source, :description, :e_value, :q_p_ident, :q_p_conserved

  # hit_name - name taken from the subject id.
  # s_length - total length of the subject.
  # q_frame  - query frame; the hit is flagged as reversed unless it is > 0.
  # type     - feature type of the hit.
  def initialize(hit_name, s_length, q_frame, type)
    @name = hit_name
    @s_length = s_length
    @hsps = []
    @reversed = !(q_frame > 0)
    @type = type
    @source = nil
    @description = nil
    @e_value = nil
    @q_p_ident = nil
    @q_p_conserved = nil
  end

  # Builds an Hsp from the arguments, appends it to the hit and returns it.
  def add_hsp(q_beg, q_end, s_beg, s_end, align_len, score, ident, gaps)
    hsp = Hsp.new(q_beg, q_end, s_beg, s_end, align_len, score, ident, gaps)
    @hsps << hsp
    hsp
  end

  # Yields every HSP in stored order.
  def each_hsp
    @hsps.each do |hsp|
      yield hsp
    end
  end

  # Returns the HSP whose index equals +n+, or nil when there is none.
  def hsp_at(n)
    each_hsp_with_index { |hsp, i| return hsp if n == i }
    nil
  end

  # Yields every HSP together with its index.
  def each_hsp_with_index
    @hsps.each_with_index do |hsp, i|
      yield hsp, i
    end
  end

  # Sorts the HSPs in place by their start position on the subject.
  def hsps_sort!
    @hsps.sort! { |a, b| a.s_beg <=> b.s_beg }
  end

  # Number of HSPs.
  def hsp_count
    @hsps.length
  end

  # First stored HSP, or nil when the hit has none.
  def first_hsp
    @hsps.first
  end

  # Last stored HSP, or nil when the hit has none.
  def last_hsp
    @hsps.last
  end

  # Checks the distance on the query between consecutive HSPs.
  #
  # NOTE(review): this returns true as soon as one HSP starts more than 3
  # positions away from the end of the previous one, i.e. when the HSPs are
  # NOT contiguous, which looks inverted with respect to the method name.
  # The behaviour is kept because callers may rely on it - confirm.
  def hsps_correlative?
    @hsps.each_cons(2).any? { |previous, current| (previous.q_end - current.q_beg).abs > 3 }
  end

  # Forwards modified_coordenates(add) to every HSP.
  def modified_coordenates(add)
    each_hsp { |hsp| hsp.modified_coordenates(add) }
  end

  # Forwards rev_coord(contig_length) to every HSP.
  def rev_coord(contig_length)
    each_hsp { |hsp| hsp.rev_coord(contig_length) }
  end

  # Returns the negative values of overlap_with computed between each HSP and
  # the one stored before it.
  def hsp_overlap
    @hsps.each_cons(2)
         .map { |previous, current| current.overlap_with(previous) }
         .select { |difference| difference < 0 }
  end

  # For hits with the same name, returns overlap_with between this hit's
  # first HSP and +last_hit+'s last HSP when it is negative; 0 otherwise.
  def overlap_with(last_hit)
    return 0 unless name == last_hit.name

    difference = first_hsp.overlap_with(last_hit.last_hsp)
    difference < 0 ? difference : 0
  end

  # True when any HSP has length_q below +hsp_length+ (in nt).
  def hsp_minor_than?(hsp_length)
    @hsps.any? { |hsp| hsp.length_q < hsp_length }
  end

  # Removes redundant HSPs. Every HSP is compared with every other one using
  # compare (blast_coor_type == 's', subject) or compare_q (anything else,
  # query); when the result is >= 0.9 one of the two is dropped:
  #   * different scores   -> the one with the lower score;
  #   * equal scores       -> the one with the larger align_len;
  #   * identical in both  -> the later one, so exactly one copy survives.
  def correct_hsps(blast_coor_type) # 's' => subject, 'q' => query
    return if hsp_count <= 1

    doomed = []
    each_hsp_with_index do |hsp, i|
      each_hsp_with_index do |other, j|
        next if i == j

        similarity = blast_coor_type == 's' ? hsp.compare(other) : hsp.compare_q(other)
        doomed << redundant_hsp_index(hsp, i, other, j) if similarity >= 0.9
      end
    end
    # Delete from the highest index down so earlier deletions cannot shift
    # the positions still pending.
    doomed.uniq.sort.reverse_each { |position| drop_hsp(position) }
  end

  # Removes the HSP stored at +position+.
  def drop_hsp(position)
    hsps.delete_at(position)
  end

  private

  # Index (i or j) of the HSP to discard from a redundant pair.
  def redundant_hsp_index(hsp, i, other, j)
    if hsp.score == other.score
      if hsp.align_len == other.align_len
        # Exact duplicates are seen twice (as i,j and as j,i); always
        # discarding the later one keeps a single copy.
        [i, j].max
      elsif hsp.align_len < other.align_len
        j
      else
        i
      end
    elsif hsp.score > other.score
      j
    else
      i
    end
  end
end