gene_assembler 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. data/.gitignore +22 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +29 -0
  5. data/Rakefile +2 -0
  6. data/bin/GeneAssembler +233 -0
  7. data/bin/phytozome_scan +60 -0
  8. data/gene_assembler.gemspec +25 -0
  9. data/lib/gene_assembler.rb +5 -0
  10. data/lib/gene_assembler/blast_type_parser.rb +41 -0
  11. data/lib/gene_assembler/contig.rb +643 -0
  12. data/lib/gene_assembler/dataset.rb +532 -0
  13. data/lib/gene_assembler/exonerate_result.rb +230 -0
  14. data/lib/gene_assembler/gff_contig.rb +67 -0
  15. data/lib/gene_assembler/gff_dataset.rb +152 -0
  16. data/lib/gene_assembler/gff_feature.rb +175 -0
  17. data/lib/gene_assembler/gff_frameshift.rb +6 -0
  18. data/lib/gene_assembler/gff_go.rb +13 -0
  19. data/lib/gene_assembler/gff_hit.rb +53 -0
  20. data/lib/gene_assembler/gff_hsp.rb +6 -0
  21. data/lib/gene_assembler/gff_localization.rb +6 -0
  22. data/lib/gene_assembler/gff_master_feature.rb +5 -0
  23. data/lib/gene_assembler/gff_parser.rb +35 -0
  24. data/lib/gene_assembler/gff_snp.rb +21 -0
  25. data/lib/gene_assembler/gff_stop.rb +6 -0
  26. data/lib/gene_assembler/go.rb +13 -0
  27. data/lib/gene_assembler/hit.rb +191 -0
  28. data/lib/gene_assembler/hsp.rb +100 -0
  29. data/lib/gene_assembler/other_functions.rb +228 -0
  30. data/lib/gene_assembler/parser.rb +25 -0
  31. data/lib/gene_assembler/parser_blast.rb +12 -0
  32. data/lib/gene_assembler/parser_exonerate.rb +16 -0
  33. data/lib/gene_assembler/rebuild.rb +975 -0
  34. data/lib/gene_assembler/report.rb +13 -0
  35. data/lib/gene_assembler/report_gff.rb +30 -0
  36. data/lib/gene_assembler/snp.rb +13 -0
  37. data/lib/gene_assembler/version.rb +3 -0
  38. metadata +149 -0
@@ -0,0 +1,230 @@
1
+ # Copyright (c) 2010 Dario Guerrero & Almudena Bocinos
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining
4
+ # a copy of this software and associated documentation files (the
5
+ # 'Software'), to deal in the Software without restriction, including
6
+ # without limitation the rights to use, copy, modify, merge, publish,
7
+ # distribute, sublicense, and/or sell copies of the Software, and to
8
+ # permit persons to whom the Software is furnished to do so, subject to
9
+ # the following conditions:
10
+ #
11
+ # The above copyright notice and this permission notice shall be
12
+ # included in all copies or substantial portions of the Software.
13
+ #
14
+ # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
15
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17
+ # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18
+ # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19
+ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20
+ # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21
+
22
+
23
+ require 'blast_query.rb'
24
+ require 'blast_hit.rb'
25
+
26
# BlastQuery variant used for exonerate alignments. On top of whatever
# BlastQuery provides, it keeps two lists of frameshift positions that
# ExonerateResult#hiting fills while walking an alignment.
class ExoBlastQuery < BlastQuery
  # q_frameshift:: frameshift positions expressed in target coordinates.
  # s_frameshift:: frameshift positions expressed in query coordinates.
  attr_accessor :q_frameshift, :s_frameshift

  # query_id:: identifier forwarded untouched to BlastQuery#initialize.
  def initialize(query_id)
    super
    @q_frameshift = []
    @s_frameshift = []
  end
end
34
+
35
# Parses the "vulgar:" lines of exonerate output files and converts every
# alignment into an ExoBlastQuery (named after the exonerate *target*) that
# holds one BlastHit per aligned block, using blast-like 1-based coordinates.
class ExonerateResult
  # Queries built so far, one per converted vulgar line.
  attr_accessor :querys

  # Reads and parses one or several exonerate output files.
  #
  # input:: a path, or an Array of paths.
  # all::   when truthy every vulgar line is converted; otherwise only the
  #         best-scoring line of each target is kept (decided per file).
  def initialize(input, all)
    @querys = []
    paths = input.is_a?(Array) ? input : [input]
    paths.each do |path|
      # Block form guarantees the handle is closed even if reading fails.
      lines = File.open(path, 'r') { |file| file.readlines }
      parse_file(lines, all)
    end
  end

  # Extracts the vulgar lines found in +lines+ and converts them to queries.
  #
  # lines:: Array of Strings (the raw lines of one file).
  # all::   see #initialize.
  #
  # Returns the collection handed to #convert_parsed_lines: an Array of
  # feature hashes when +all+ is truthy, otherwise a Hash keyed by target id.
  def parse_file(lines, all)
    records = lines.grep(/^vulgar:/).map { |line| vulgar_features(line) }
    records = best_per_target(records) unless all
    convert_parsed_lines(records)
  end

  # Builds the operation list of every parsed line and hands it to #hiting.
  #
  # lines_parsed:: Array of feature hashes, or Hash of target id => features.
  def convert_parsed_lines(lines_parsed)
    lines_parsed.each do |entry|
      # Iterating a Hash yields [key, value] pairs; the features are the value.
      features = lines_parsed.is_a?(Array) ? entry : entry[1]
      hiting(features, vulgar_tags(features['align_data']))
    end
  end

  # Converts the relative coordinates of one exonerate alignment into
  # absolute, blast-like ones and stores the resulting query in @querys.
  # Only hits are defined: one BlastHit per aligned block.
  #
  # features:: feature hash built from one vulgar line.
  # tags::     Array of [label, query_length, target_length] operations.
  def hiting(features, tags)
    query = ExoBlastQuery.new(features['target_id'])
    reversed = features['target_strand'] != '+'
    strand = reversed ? -1 : 1

    start_target = features['target_start_align'].to_i
    start_query = features['query_start_align'].to_i
    counter_target = start_target
    counter_query = start_query
    last_index = tags.length - 1

    tags.each_with_index do |tag, index|
      label = tag[0]
      last = index == last_index
      # For the last operation and for 'S'/'G' operations the lengths must be
      # added BEFORE a hit is possibly stored; every other operation is added
      # afterwards.
      advance_first = last || label == 'S' || label == 'G'

      if advance_first
        counter_query += tag[1]
        counter_target += tag[2] * strand
      elsif label == 'F'
        # Frameshift: remember where it happens on both sequences.
        query.s_frameshift << counter_query
        query.q_frameshift << counter_target
      end

      # Start of an intron ('5') or end of the alignment closes the block.
      if label == '5' || last
        query.add_hit(build_hit(features, reversed, strand,
                                start_target, counter_target,
                                start_query, counter_query))
      end

      unless advance_first
        counter_query += tag[1]
        counter_target += tag[2] * strand
      end

      # End of an intron ('3'), hence the next block starts here.
      if label == '3'
        start_query = counter_query
        start_target = counter_target
      end
    end
    @querys << query
  end

  # Text summary: a header, the number of queries and each query's inspect.
  def inspect
    res = "Exonerate results:\n"
    res += '-' * 20
    res += "\nQuerys: #{@querys.count}\n"
    @querys.each { |q| res += q.inspect + "\n" }
    res
  end

  # Finds, in +querys+, the first query whose query_id equals +name_q+.
  # Returns nil when there is none.
  def find_query(querys, name_q)
    querys.find { |q| q.query_id == name_q }
  end

  # True when no query has been stored.
  def empty?
    @querys.empty?
  end

  # Number of stored queries.
  def size
    @querys.size
  end

  private

  # Splits one vulgar line on whitespace into a feature hash. Field 0 is the
  # "vulgar:" marker; everything from field 10 on is the operation list.
  def vulgar_features(line)
    fields = line.split
    {
      'query_id' => fields[1],
      'query_start_align' => fields[2],
      'query_end_align' => fields[3],
      'query_strand' => fields[4],
      'target_id' => fields[5],
      'target_start_align' => fields[6],
      'target_end_align' => fields[7],
      'target_strand' => fields[8],
      'score' => fields[9],
      'align_data' => fields[10..-1]
    }
  end

  # Keeps, for every target id, the record with the highest score; on a tie
  # the first one seen wins. Scores are compared as integers: compared as
  # Strings, "99" would beat "100".
  def best_per_target(records)
    records.each_with_object({}) do |features, best|
      target = features['target_id']
      held = best[target]
      if held.nil? || features['score'].to_i > held['score'].to_i
        best[target] = features
      end
    end
  end

  # Groups the flat operation list three by three into
  # [label, query_length, target_length], converting the lengths to integers.
  # A trailing incomplete group is kept as it is.
  def vulgar_tags(align_data)
    align_data.each_slice(3).map do |label, *lengths|
      [label, *lengths.map(&:to_i)]
    end
  end

  # Creates the BlastHit of one aligned block. Start coordinates are shifted
  # by one (0-based to 1-based); statistics exonerate does not report are
  # zeroed or left empty.
  def build_hit(features, reversed, strand, start_target, ends_target, start_query, ends_query)
    hit = BlastHit.new(start_target + 1, ends_target, start_query + 1, ends_query)
    hit.align_len = (ends_target - start_target) * strand
    hit.ident = 0
    hit.gaps = 0
    hit.mismatches = 0
    hit.e_val = 0
    hit.bit_score = 0
    hit.score = features['score'].to_i
    hit.q_frame = reversed ? -1 : 1
    hit.s_frame = nil
    hit.reversed = reversed
    hit.subject_id = features['query_id']
    hit.full_subject_length = 0
    hit.definition = features['query_id']
    hit.acc = features['query_id']
    hit.q_seq = ''
    hit.s_seq = ''
    hit
  end
end
@@ -0,0 +1,67 @@
1
+ require 'gff_hit'
2
+ require 'gff_snp'
3
+ require 'gff_go'
4
+ require 'gff_localization'
5
+ require 'gff_stop'
6
+ require 'gff_frameshift'
7
+
8
# Builds the GFF lines describing a contig and everything attached to it.
class GffContig
  # Returns an Array with, in this order: the contig line (only when the
  # contig is an independent unit), whatever GffHit#report gives for each hit,
  # and then the lines for frameshifts, stops, SNPs, GO terms and
  # localizations.
  #
  # contig::    object exposing name/type/length/seq and the each_* iterators.
  # seqid::     sequence id used in the GFF lines when +parent+ is nil.
  # parent::    name of the parent contig, or nil when the contig is an
  #             independent unit.
  # name_mode:: passed through to GffHit#report.
  def report(contig, seqid, parent, name_mode)
    rows = []
    hit_parent = nil
    sequence = nil      # sequence of the contig, only given for child contigs
    feature_owner = nil # second value returned by the last GffHit#report call

    if parent.nil?
      # Independent contig: it gets its own line and is the parent of its hits.
      hit_parent = contig.name
      rows << "#{seqid}\tunknown\t#{contig.type}\t1\t#{contig.length}\t.\t+\t.\tID=#{contig.name};Name=#{contig.name}"
    else
      # Child contig: the parent name replaces the seqid. Gbrowse takes the
      # parent's sequence as the sequence, so the contig's own one has to be
      # stated explicitly in the gff.
      seqid = parent
      sequence = contig.seq
    end

    # Hits
    hit_writer = GffHit.new
    contig.each_hit do |hit|
      line, feature_owner = hit_writer.report(hit, hit_parent, seqid, contig.name, name_mode, sequence)
      rows << line
    end

    # Frameshifts
    frameshift_writer = GffFrameshift.new
    contig.each_q_frameshift do |frameshift|
      rows << frameshift_writer.report(frameshift, feature_owner, seqid)
    end

    # Stops
    stop_writer = GffStop.new
    contig.each_stop do |stop|
      rows << stop_writer.report(stop, feature_owner, seqid)
    end

    # SNPs
    snp_writer = GffSNP.new
    contig.each_snp_with_index do |snp, position|
      rows << snp_writer.report(snp, feature_owner, seqid, position)
    end

    # GO terms
    go_writer = GffGo.new
    contig.each_go do |go|
      rows << go_writer.report(go, feature_owner, seqid)
    end

    # Localizations
    localization_writer = Localization.new
    contig.each_localization_with_index do |localization, position|
      rows << localization_writer.report(localization, feature_owner, seqid, contig, position)
    end

    rows
  end
end
@@ -0,0 +1,152 @@
1
+ require 'gff_feature'
2
+ require 'gff_master_feature'
3
# In-memory collection of GFF features. Top-level ("master") features are
# kept in @master_features keyed by sequence id, and every feature (masters
# included) is reachable through @index by its ID.
class Gff_dataset
  # Counter used to build index keys for features that carry no 'ID'
  # attribute. Shared by all datasets, as it always was.
  @@undefined_features = 0

  # The reader for master_features is the custom method below.
  attr_writer :master_features
  attr_accessor :index

  def initialize
    @master_features = {}
    @index = {}
  end

  # Returns nil when there is no master feature, the master feature itself
  # when there is exactly one, and an Array of all of them otherwise.
  def master_features
    return nil if @master_features.empty?
    return @master_features.values if @master_features.length > 1

    @master_features.values.first
  end

  # Adds a feature that belongs to the sequence +master_seq_id+.
  #
  # * If +master_seq_id+ is already indexed, the feature becomes a child of it
  #   and the master's stop is extended when the child ends beyond it. The
  #   child is returned.
  # * If the feature's own ID equals +master_seq_id+, it becomes the master
  #   feature and is returned.
  # * Otherwise a 'region' master spanning 1..stop is created, the feature is
  #   added as its child, and the new master is returned.
  #
  # +attribs+ is modified: its 'Parent' entry is set when the feature is added
  # as a child.
  def add_master_feature(master_seq_id, source, type, start, stop, score, strand, phase, attribs)
    if @index.key?(master_seq_id)
      attribs['Parent'] = master_seq_id
      # add_feature already indexes the child (by ID, or by a generated key
      # when it has no ID), so it must not be stored again under attribs['ID'].
      master = add_feature(source, type, start, stop, score, strand, phase, attribs)
      region = @master_features[master_seq_id]
      # region is nil when master_seq_id names an indexed non-master feature.
      region.stop = stop.to_i if region && stop.to_i > region.stop
    elsif attribs['ID'] == master_seq_id
      master = Master_feature.new(source, type, start, stop, score, strand, phase, attribs)
      @master_features[master_seq_id] = master
      @index[master_seq_id] = master
    else
      master = Master_feature.new(source, 'region', 1, stop, '.', '.', '.', { 'ID' => master_seq_id })
      @master_features[master_seq_id] = master
      @index[master_seq_id] = master
      attribs['Parent'] = master_seq_id
      add_feature(source, type, start, stop, score, strand, phase, attribs)
    end
    master
  end

  # Adds a feature as a child of the indexed feature named by
  # attribs['Parent'] and indexes it by its ID or, when it has none, by
  # "<parent>_<counter>". Returns the new feature.
  def add_feature(source, type, start, stop, score, strand, phase, attribs)
    feature = @index[attribs['Parent']].add_child(source, type, start, stop, score, strand, phase, attribs)
    if attribs['ID'].nil?
      @index[feature.attrib('Parent') + '_' + @@undefined_features.to_s] = feature
      @@undefined_features += 1
    else
      @index[attribs['ID']] = feature
    end
    feature
  end

  # Yields every indexed feature.
  def each_feature
    @index.each_value { |feature| yield feature }
  end

  # Yields every master feature.
  def each_master_feature
    @master_features.each_value { |master_feature| yield master_feature }
  end

  # Yields every index key together with its feature.
  def each_id_feat
    @index.each { |id, feature| yield id, feature }
  end

  # Yields every master sequence id together with its master feature.
  def each_id_master
    @master_features.each { |id, master| yield id, master }
  end

  # Calls #inspects on every master feature.
  def inspects
    @master_features.each_value { |master| master.inspects }
  end

  # Returns the feature stored under +hash_key+, or nil.
  def feature(hash_key)
    @index[hash_key]
  end

  # True when some indexed feature has exactly this type.
  def has_type?(type)
    @index.each_value.any? { |feature| feature.type == type }
  end

  # True when some indexed feature has exactly this source.
  def has_source?(source)
    @index.each_value.any? { |feature| feature.source == source }
  end

  # Returns the indexed features accepted by both filters. A filter left as
  # false (or nil) accepts every feature.
  # NOTE(review): assumes Feature#is_source? and Feature#is_type? are
  # side-effect-free predicates returning true/false - confirm in gff_feature.
  def get(source = false, type = false)
    @index.each_value.select do |feature|
      (!source || feature.is_source?(source)) && (!type || feature.is_type?(type))
    end
  end

  # Calls #tree on every master feature, printing a separator after each.
  def tree
    @master_features.each_value do |master|
      master.tree
      puts "\n", '--------------------------------'
    end
  end

  # Wraps every feature of type +type_child+ in a new feature of type
  # +type_parent+ that takes over its coordinates, attributes and index key.
  def add_parent_to(type_parent, type_child)
    each_id_feat do |id, feature|
      next unless feature.type == type_child

      new_feature = Feature.new(feature.source, type_parent, feature.start, feature.stop, '.', feature.strand, '.', feature.attribs.dup)
      feature.change_to_type_id_recursive
      new_feature.transfer_child(feature.attrib('ID'), feature)
      @index[feature.attrib('Parent')].child[id] = new_feature
      @index[id] = new_feature
      new_feature.each_child do |child|
        child.attribs['Parent'] = new_feature.attrib('ID') # define new parent
      end
    end
  end
end