gene_assembler 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. data/.gitignore +22 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +29 -0
  5. data/Rakefile +2 -0
  6. data/bin/GeneAssembler +233 -0
  7. data/bin/phytozome_scan +60 -0
  8. data/gene_assembler.gemspec +25 -0
  9. data/lib/gene_assembler.rb +5 -0
  10. data/lib/gene_assembler/blast_type_parser.rb +41 -0
  11. data/lib/gene_assembler/contig.rb +643 -0
  12. data/lib/gene_assembler/dataset.rb +532 -0
  13. data/lib/gene_assembler/exonerate_result.rb +230 -0
  14. data/lib/gene_assembler/gff_contig.rb +67 -0
  15. data/lib/gene_assembler/gff_dataset.rb +152 -0
  16. data/lib/gene_assembler/gff_feature.rb +175 -0
  17. data/lib/gene_assembler/gff_frameshift.rb +6 -0
  18. data/lib/gene_assembler/gff_go.rb +13 -0
  19. data/lib/gene_assembler/gff_hit.rb +53 -0
  20. data/lib/gene_assembler/gff_hsp.rb +6 -0
  21. data/lib/gene_assembler/gff_localization.rb +6 -0
  22. data/lib/gene_assembler/gff_master_feature.rb +5 -0
  23. data/lib/gene_assembler/gff_parser.rb +35 -0
  24. data/lib/gene_assembler/gff_snp.rb +21 -0
  25. data/lib/gene_assembler/gff_stop.rb +6 -0
  26. data/lib/gene_assembler/go.rb +13 -0
  27. data/lib/gene_assembler/hit.rb +191 -0
  28. data/lib/gene_assembler/hsp.rb +100 -0
  29. data/lib/gene_assembler/other_functions.rb +228 -0
  30. data/lib/gene_assembler/parser.rb +25 -0
  31. data/lib/gene_assembler/parser_blast.rb +12 -0
  32. data/lib/gene_assembler/parser_exonerate.rb +16 -0
  33. data/lib/gene_assembler/rebuild.rb +975 -0
  34. data/lib/gene_assembler/report.rb +13 -0
  35. data/lib/gene_assembler/report_gff.rb +30 -0
  36. data/lib/gene_assembler/snp.rb +13 -0
  37. data/lib/gene_assembler/version.rb +3 -0
  38. metadata +149 -0
@@ -0,0 +1,230 @@
1
+ # Copyright (c) 2010 Dario Guerrero & Almudena Bocinos
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining
4
+ # a copy of this software and associated documentation files (the
5
+ # 'Software'), to deal in the Software without restriction, including
6
+ # without limitation the rights to use, copy, modify, merge, publish,
7
+ # distribute, sublicense, and/or sell copies of the Software, and to
8
+ # permit persons to whom the Software is furnished to do so, subject to
9
+ # the following conditions:
10
+ #
11
+ # The above copyright notice and this permission notice shall be
12
+ # included in all copies or substantial portions of the Software.
13
+ #
14
+ # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
15
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17
+ # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18
+ # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19
+ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20
+ # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21
+
22
+
23
+ require 'blast_query.rb'
24
+ require 'blast_hit.rb'
25
+
26
# BlastQuery subclass that carries two extra list attributes, q_frameshift
# and s_frameshift. Both start out empty.
class ExoBlastQuery < BlastQuery
  attr_accessor :q_frameshift, :s_frameshift

  # query_id is handed unchanged to BlastQuery#initialize.
  def initialize(query_id)
    super
    @q_frameshift = []
    @s_frameshift = []
  end
end
34
+
35
# Parses exonerate output and turns every "vulgar:" alignment line into an
# ExoBlastQuery (named after the exonerate target id) that holds one BlastHit
# per aligned segment between introns.
class ExonerateResult
  # ExoBlastQuery objects built so far, in the order they were converted.
  attr_accessor :querys

  # Parser initialization.
  #
  # input - a single path, or an Array of paths, of exonerate output files.
  # all   - truthy: keep every vulgar line; falsy: keep only the best-scoring
  #         vulgar line for each target id (selection is done per file).
  def initialize(input, all)
    @querys = []
    files = input.is_a?(Array) ? input : [input]
    # File.readlines closes the handle even if reading raises.
    files.each { |file| parse_file(File.readlines(file), all) }
  end

  # Extracts the fields of every line starting with "vulgar:" and passes the
  # selected alignments on to convert_parsed_lines.
  #
  # lines - Array of text lines.
  # all   - see #initialize.
  def parse_file(lines, all)
    lines_parsed = all ? [] : {}
    lines.each do |line|
      next unless line =~ /^vulgar:/

      fields = line.split
      features = {
        'query_id' => fields[1],
        'query_start_align' => fields[2],
        'query_end_align' => fields[3],
        'query_strand' => fields[4],
        'target_id' => fields[5],
        'target_start_align' => fields[6],
        'target_end_align' => fields[7],
        'target_strand' => fields[8],
        'score' => fields[9],
        'align_data' => fields[10..-1]
      }
      if all
        lines_parsed << features
      else
        best = lines_parsed[features['target_id']]
        # Scores are still strings here; compare them numerically so that
        # "100" beats "99" (a plain String comparison gets this wrong).
        if best.nil? || features['score'].to_i > best['score'].to_i
          lines_parsed[features['target_id']] = features
        end
      end
    end
    convert_parsed_lines(lines_parsed)
  end

  # Groups the flat align_data list of each alignment into
  # [label, query_length, target_length] operations and calls hiting.
  #
  # lines_parsed - Array of feature hashes, or Hash of target id => features.
  def convert_parsed_lines(lines_parsed)
    lines_parsed.each do |line|
      # Iterating a Hash yields [key, value] pairs; the features are the value.
      features = lines_parsed.is_a?(Array) ? line : line[1]
      tags = features['align_data'].each_slice(3).map do |label, *lengths|
        [label, *lengths.map(&:to_i)]
      end
      hiting(features, tags)
    end
  end

  # Converts the relative exonerate operations into absolute, blast-like
  # coordinates. Only hits are defined: one per segment that ends at a 5'
  # splice site ('5') or at the last operation.
  #
  # features - Hash built by parse_file.
  # tags     - Array of [label, query_length, target_length].
  def hiting(features, tags)
    query = ExoBlastQuery.new(features['target_id'])
    reversed = features['target_strand'] != '+'
    strand = reversed ? -1 : 1

    start_target = features['target_start_align'].to_i
    start_query = features['query_start_align'].to_i
    counter_target = start_target
    counter_query = start_query
    last_index = tags.length - 1

    tags.each_with_index do |tag, c|
      label = tag[0]
      # For the last operation and for 'S' / 'G' operations the lengths are
      # added before a hit may be saved; every other operation is added
      # after the save step below.
      advance_first = c == last_index || label == 'S' || label == 'G'
      if advance_first
        counter_query += tag[1]
        counter_target += tag[2] * strand
      elsif label == 'F'
        query.s_frameshift << counter_query
        query.q_frameshift << counter_target
      end

      # Start of an intron or end of the alignment: close the current hit.
      if label == '5' || c == last_index
        query.add_hit(build_hit(features, reversed, strand,
                                start_target, counter_target,
                                start_query, counter_query))
      end

      unless advance_first
        counter_query += tag[1]
        counter_target += tag[2] * strand
      end

      # End of an intron, therefore the next segment starts here.
      if label == '3'
        start_query = counter_query
        start_target = counter_target
      end
    end
    @querys << query
  end

  # inspect results
  def inspect
    header = "Exonerate results:\n" + '-' * 20 + "\nQuerys: #{@querys.count}\n"
    header + @querys.map { |q| q.inspect + "\n" }.join
  end

  # find query by name; returns nil when no query has that query_id
  def find_query(querys, name_q)
    return nil if querys.empty?

    querys.find { |q| q.query_id == name_q }
  end

  # check if there are querys
  def empty?
    @querys.empty?
  end

  # get query count
  def size
    @querys.size
  end

  private

  # Builds the BlastHit for one aligned segment. Start coordinates get +1
  # (the original code notes this as the step from 0-based to 1-based
  # coordinates); all statistics exonerate does not report are zeroed.
  def build_hit(features, reversed, strand, start_target, end_target, start_query, end_query)
    hit = BlastHit.new(start_target + 1, end_target, start_query + 1, end_query)
    hit.align_len = (end_target - start_target) * strand
    hit.ident = 0
    hit.gaps = 0
    hit.mismatches = 0
    hit.e_val = 0
    hit.bit_score = 0
    hit.score = features['score'].to_i
    hit.q_frame = reversed ? -1 : 1
    hit.s_frame = nil
    hit.reversed = reversed
    hit.subject_id = features['query_id']
    hit.full_subject_length = 0
    hit.definition = features['query_id']
    hit.acc = features['query_id']
    hit.q_seq = ''
    hit.s_seq = ''
    hit
  end
end
@@ -0,0 +1,67 @@
1
+ require 'gff_hit'
2
+ require 'gff_snp'
3
+ require 'gff_go'
4
+ require 'gff_localization'
5
+ require 'gff_stop'
6
+ require 'gff_frameshift'
7
+
8
# Builds the GFF text of one contig together with all of its features.
class GffContig
  # Returns an Array with whatever each feature writer returns, in this
  # order: contig line (only without parent), hits, frameshifts, stops, SNPs,
  # GO terms, localizations.
  #
  # contig    - object exposing name, type, length, seq and the each_*
  #             iterators used below.
  # seqid     - sequence id for the GFF lines; replaced by parent when a
  #             parent is given.
  # parent    - nil when the contig is an independent unit, otherwise the
  #             name of the contig acting as parent.
  # name_mode - forwarded untouched to GffHit#report.
  def report(contig, seqid, parent, name_mode)
    independent = parent.nil?
    lines = []
    hit_parent = nil
    sequence = nil # contig sequence, only passed on for child contigs

    if independent
      # The contig itself is written as a feature and becomes parent of its hits.
      hit_parent = contig.name
      lines << "#{seqid}\tunknown\t#{contig.type}\t1\t#{contig.length}\t.\t+\t.\tID=#{contig.name};Name=#{contig.name}"
    else
      # The parent name takes over as seqid. The sequence is given explicitly
      # because (per the original note) GBrowse would otherwise use the
      # sequence of the parent.
      seqid = parent
      sequence = contig.seq unless contig.seq.nil?
    end

    # Second value returned by the last GffHit#report call; stays nil when
    # the contig has no hits.
    features_parent = nil
    hit_writer = GffHit.new
    contig.each_hit do |hit|
      hit_text, features_parent = hit_writer.report(hit, hit_parent, seqid, contig.name, name_mode, sequence)
      lines << hit_text
    end

    frameshift_writer = GffFrameshift.new
    contig.each_q_frameshift do |frameshift|
      lines << frameshift_writer.report(frameshift, features_parent, seqid)
    end

    stop_writer = GffStop.new
    contig.each_stop do |stop|
      lines << stop_writer.report(stop, features_parent, seqid)
    end

    snp_writer = GffSNP.new
    contig.each_snp_with_index do |snp, position|
      lines << snp_writer.report(snp, features_parent, seqid, position)
    end

    go_writer = GffGo.new
    contig.each_go do |go|
      lines << go_writer.report(go, features_parent, seqid)
    end

    # NOTE(review): the other writers are named Gff*; confirm that
    # Localization is really the class defined in gff_localization.
    localization_writer = Localization.new
    contig.each_localization_with_index do |localization, position|
      lines << localization_writer.report(localization, features_parent, seqid, contig, position)
    end

    lines
  end
end
@@ -0,0 +1,152 @@
1
+ require 'gff_feature'
2
+ require 'gff_master_feature'
3
# In-memory GFF feature collection: master features stored by sequence id,
# plus a flat index of every registered feature by id.
class Gff_dataset
  # Counter used to build index keys for features that carry no 'ID'
  # attribute. It is a class variable, so it is shared by all datasets.
  @@undefined_features = 0

  attr_accessor :index
  # Only the writer is generated; the reader is defined by hand below.
  attr_writer :master_features

  def initialize
    @master_features = {}
    @index = {}
  end

  # Returns an Array of master features when there are several, the single
  # master feature when there is exactly one, and nil when there is none.
  def master_features
    masters = @master_features.values
    masters.length > 1 ? masters : masters.first
  end

  # Registers a feature that hangs directly from the region master_seq_id.
  #
  # Three cases:
  # * the region is already indexed: the feature is added as its child and
  #   the region's stop is extended when the child ends later;
  # * attribs['ID'] equals master_seq_id: the feature itself becomes the
  #   master feature of the region;
  # * otherwise: a 'region' master spanning 1..stop is created and the
  #   feature is added as its child.
  #
  # Returns the created child in the first case and the master feature in
  # the other two.
  def add_master_feature(master_seq_id, source, type, start, stop, score, strand, phase, attribs)
    master = nil
    if @index.key?(master_seq_id)
      attribs['Parent'] = master_seq_id
      master = add_feature(source, type, start, stop, score, strand, phase, attribs)
      @index[attribs['ID']] = master
      if stop.to_i > @master_features[master_seq_id].stop
        @master_features[master_seq_id].stop = stop.to_i
      end
    elsif attribs['ID'] == master_seq_id
      master = Master_feature.new(source, type, start, stop, score, strand, phase, attribs)
      @master_features[master_seq_id] = master
      @index[master_seq_id] = master
    else
      master = Master_feature.new(source, 'region', 1, stop, '.', '.', '.', { 'ID' => master_seq_id })
      @master_features[master_seq_id] = master
      @index[master_seq_id] = master
      attribs['Parent'] = master_seq_id
      child = master.add_child(source, type, start, stop, score, strand, phase, attribs)
      @index[attribs['ID']] = child
    end
    master
  end

  # Adds a child to the indexed feature named by attribs['Parent'] and
  # indexes it under its 'ID', or under "<Parent>_<counter>" when it has no
  # 'ID'. Returns the new feature.
  def add_feature(source, type, start, stop, score, strand, phase, attribs)
    feature = @index[attribs['Parent']].add_child(source, type, start, stop, score, strand, phase, attribs)
    if attribs['ID'].nil?
      @index[feature.attrib('Parent') + '_' + @@undefined_features.to_s] = feature
      @@undefined_features += 1
    else
      @index[attribs['ID']] = feature
    end
    feature
  end

  # Yields every indexed feature.
  def each_feature
    @index.each_value do |feature|
      yield feature
    end
  end

  # Yields every master feature.
  def each_master_feature
    @master_features.each_value do |master_feature|
      yield master_feature
    end
  end

  # Yields index key and feature for every indexed feature.
  def each_id_feat
    @index.each do |id, feature|
      yield id, feature
    end
  end

  # Yields sequence id and master feature for every master feature.
  def each_id_master
    @master_features.each do |id, master|
      yield id, master
    end
  end

  # Calls inspects on every master feature.
  def inspects
    @master_features.each_value(&:inspects)
  end

  # Returns the feature indexed under hash_key, or nil.
  def feature(hash_key)
    @index[hash_key]
  end

  # true when at least one indexed feature has the given type.
  def has_type?(type)
    @index.each_value.any? { |feature| feature.type == type }
  end

  # true when at least one indexed feature has the given source.
  def has_source?(source)
    @index.each_value.any? { |feature| feature.source == source }
  end

  # Returns the indexed features matching source and type. A falsy filter
  # (the default) matches everything.
  def get(source = false, type = false)
    @index.each_value.select do |feature|
      (!source || feature.is_source?(source)) && (!type || feature.is_type?(type))
    end
  end

  # Calls tree on every master feature, printing a separator after each one.
  def tree
    @master_features.each_value do |master|
      master.tree
      puts "\n", '--------------------------------'
    end
  end

  # Wraps every indexed feature of type type_child in a new Feature of type
  # type_parent (same source, coordinates, strand and a copy of the
  # attributes). The wrapper takes over the child's slot in its parent and
  # in the index.
  def add_parent_to(type_parent, type_child)
    each_id_feat do |id, feature|
      next unless feature.type == type_child

      new_feature = Feature.new(feature.source, type_parent, feature.start, feature.stop, '.', feature.strand, '.', feature.attribs.dup)
      feature.change_to_type_id_recursive
      new_feature.transfer_child(feature.attrib('ID'), feature)
      @index[feature.attrib('Parent')].child[id] = new_feature
      @index[id] = new_feature
      new_feature.each_child do |child|
        child.attribs['Parent'] = new_feature.attrib('ID') # define new parent
      end
    end
  end
end