bio-ensembl 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (81) hide show
  1. data/.document +5 -0
  2. data/Gemfile +20 -0
  3. data/Gemfile.lock +40 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +71 -0
  7. data/VERSION +1 -0
  8. data/bin/ensembl +40 -0
  9. data/bin/variation_effect_predictor +106 -0
  10. data/bio-ensembl.gemspec +190 -0
  11. data/lib/bio-ensembl.rb +65 -0
  12. data/lib/bio-ensembl/core/activerecord.rb +1812 -0
  13. data/lib/bio-ensembl/core/collection.rb +64 -0
  14. data/lib/bio-ensembl/core/project.rb +262 -0
  15. data/lib/bio-ensembl/core/slice.rb +657 -0
  16. data/lib/bio-ensembl/core/transcript.rb +409 -0
  17. data/lib/bio-ensembl/core/transform.rb +95 -0
  18. data/lib/bio-ensembl/db_connection.rb +205 -0
  19. data/lib/bio-ensembl/variation/activerecord.rb +536 -0
  20. data/lib/bio-ensembl/variation/variation_feature.rb +376 -0
  21. data/lib/bio-ensembl/variation/variation_feature62.rb +444 -0
  22. data/samples/ensembl_genomes_example.rb +60 -0
  23. data/samples/examples_perl_tutorial.rb +125 -0
  24. data/samples/small_example_ruby_api.rb +34 -0
  25. data/samples/variation_effect_predictor_data.txt +4 -0
  26. data/samples/variation_example.rb +67 -0
  27. data/test/data/seq_c6qbl.fa +10 -0
  28. data/test/data/seq_cso19_coding.fa +16 -0
  29. data/test/data/seq_cso19_transcript.fa +28 -0
  30. data/test/data/seq_drd3_gene.fa +838 -0
  31. data/test/data/seq_drd3_transcript.fa +22 -0
  32. data/test/data/seq_drd4_transcript.fa +24 -0
  33. data/test/data/seq_forward_composite.fa +1669 -0
  34. data/test/data/seq_par_boundary.fa +169 -0
  35. data/test/data/seq_rnd3_transcript.fa +47 -0
  36. data/test/data/seq_ub2r1_coding.fa +13 -0
  37. data/test/data/seq_ub2r1_gene.fa +174 -0
  38. data/test/data/seq_ub2r1_transcript.fa +26 -0
  39. data/test/data/seq_y.fa +2 -0
  40. data/test/default/test_connection.rb +60 -0
  41. data/test/default/test_releases.rb +130 -0
  42. data/test/ensembl_genomes/test_collection.rb +122 -0
  43. data/test/ensembl_genomes/test_gene.rb +46 -0
  44. data/test/ensembl_genomes/test_slice.rb +65 -0
  45. data/test/ensembl_genomes/test_variation.rb +38 -0
  46. data/test/helper.rb +18 -0
  47. data/test/release_50/core/test_project.rb +210 -0
  48. data/test/release_50/core/test_project_human.rb +52 -0
  49. data/test/release_50/core/test_relationships.rb +72 -0
  50. data/test/release_50/core/test_sequence.rb +170 -0
  51. data/test/release_50/core/test_slice.rb +116 -0
  52. data/test/release_50/core/test_transcript.rb +125 -0
  53. data/test/release_50/core/test_transform.rb +217 -0
  54. data/test/release_50/variation/test_activerecord.rb +138 -0
  55. data/test/release_50/variation/test_variation.rb +79 -0
  56. data/test/release_53/core/test_gene.rb +61 -0
  57. data/test/release_53/core/test_project.rb +91 -0
  58. data/test/release_53/core/test_project_human.rb +61 -0
  59. data/test/release_53/core/test_slice.rb +42 -0
  60. data/test/release_53/core/test_transform.rb +57 -0
  61. data/test/release_53/variation/test_activerecord.rb +137 -0
  62. data/test/release_53/variation/test_variation.rb +66 -0
  63. data/test/release_56/core/test_gene.rb +61 -0
  64. data/test/release_56/core/test_project.rb +91 -0
  65. data/test/release_56/core/test_slice.rb +49 -0
  66. data/test/release_56/core/test_transform.rb +57 -0
  67. data/test/release_56/variation/test_activerecord.rb +141 -0
  68. data/test/release_56/variation/test_consequence.rb +131 -0
  69. data/test/release_56/variation/test_variation.rb +63 -0
  70. data/test/release_60/core/test_gene.rb +61 -0
  71. data/test/release_60/core/test_project_human.rb +34 -0
  72. data/test/release_60/core/test_slice.rb +42 -0
  73. data/test/release_60/core/test_transcript.rb +120 -0
  74. data/test/release_60/core/test_transform.rb +57 -0
  75. data/test/release_60/variation/test_activerecord.rb +216 -0
  76. data/test/release_60/variation/test_consequence.rb +153 -0
  77. data/test/release_60/variation/test_variation.rb +64 -0
  78. data/test/release_62/core/test_gene.rb +42 -0
  79. data/test/release_62/variation/test_activerecord.rb +86 -0
  80. data/test/release_62/variation/test_consequence.rb +191 -0
  81. metadata +287 -0
@@ -0,0 +1,376 @@
1
+ #
2
+ # = ensembl/variation/variation.rb - Extension of ActiveRecord classes for Ensembl variation features
3
+ #
4
+ # Copyright:: Copyright (C) 2008 Francesco Strozzi <francesco.strozzi@gmail.com>
5
+ # License:: The Ruby License
6
+ #
7
+ # @author Francesco Strozzi
8
+
9
+
10
+ module Ensembl
11
+
12
+ module Variation
13
+
14
+
15
+ # The VariationFeature class gives information about the genomic position of
16
+ # each Variation, including also validation status and consequence type.
17
+ #
18
+ # This class uses ActiveRecord to access data in the Ensembl database.
19
+ # See the general documentation of the Ensembl module for
20
+ # more information on what this means and what methods are available.
21
+ #
22
+ # @example
23
+ # # SLOWER QUERY
24
+ # vf = VariationFeature.find_by_variation_name('rs10111')
25
+ # # FASTER QUERY
26
+ # vf = Variation.find_by_name('rs10111').variation_feature
27
+ #
28
+ # puts vf.seq_region_start, vf.seq_region_end, vf.allele_string
29
+ # puts vf.variation.ancestral_allele
30
+ # genomic_region = vf.fetch_region (returns an Ensembl::Core::Slice)
31
+ # genomic_region.genes
32
+ # up_region,down_region = vf.flanking_seq (returns two Ensembl::Core::Slice)
33
+ #
34
+ class VariationFeature < DBConnection
35
# Primary key and associations of the variation_feature table.
set_primary_key "variation_feature_id"
belongs_to :variation
has_many :tagged_variation_features
has_many :samples, :through => :tagged_variation_features
belongs_to :seq_region

# Only the consequence terms listed here pass validation.
validates_inclusion_of :consequence_type,
                       :in => %w[
                         ESSENTIAL_SPLICE_SITE
                         STOP_GAINED
                         STOP_LOST
                         COMPLEX_INDEL
                         FRAMESHIFT_CODING
                         NON_SYNONYMOUS_CODING
                         SPLICE_SITE
                         PARTIAL_CODON
                         SYNONYMOUS_CODING
                         REGULATORY_REGION
                         WITHIN_MATURE_miRNA
                         5PRIME_UTR
                         3PRIME_UTR
                         INTRONIC
                         NMD_TRANSCRIPT
                         UPSTREAM
                         DOWNSTREAM
                         WITHIN_NON_CODING_GENE
                         HGMD_MUTATION
                       ],
                       :message => "Consequence type not allowed!"
60
+
61
# Workaround: ActiveRecord does not parse MySQL SET fields, so the raw
# (pre-typecast) column value is returned, rendered as a String.
#
# @return [String] the raw consequence_type value; "" when it is nil
def consequence_type
  raw_value = attributes_before_type_cast['consequence_type']
  "#{raw_value}"
end
64
+
65
+ # Based on Perl API 'get_all_Genes' method for Variation class. Get a genomic region
66
+ # starting from the Variation coordinates, expanding the region upstream and
67
+ # downstream.
68
+ #
69
+ # @param [Integer] up Length of upstream flanking region
70
+ # @param [Integer] down Length of downstream flanking region
71
+ # @return [Slice] Slice object containing the variation
72
def fetch_region(up = 5000, down = 5000)
  sr = core_connection(self.seq_region_id)
  coord_system_name = Ensembl::Core::CoordSystem.find(sr.coord_system_id).name
  # Widen the variation's own span by the requested flanks on each side.
  Ensembl::Core::Slice.fetch_by_region(coord_system_name,
                                       sr.name,
                                       self.seq_region_start - up,
                                       self.seq_region_end + down)
end
77
+
78
# Fetch the two flanking regions recorded for this variation as slices.
# Coordinates come from the variation's flanking_sequence record; both
# slices are requested on the strand of this variation feature.
#
# @return [Array] two elements: the "up" slice, then the "down" slice
def flanking_seq
  sr = core_connection(self.seq_region_id)
  flank = Variation.find(self.variation_id).flanking_sequence
  coord_system_name = Ensembl::Core::CoordSystem.find(sr.coord_system_id).name
  strand = self.seq_region_strand

  upstream_slice = Ensembl::Core::Slice.fetch_by_region(coord_system_name, sr.name,
                                                        flank.up_seq_region_start,
                                                        flank.up_seq_region_end,
                                                        strand)
  downstream_slice = Ensembl::Core::Slice.fetch_by_region(coord_system_name, sr.name,
                                                          flank.down_seq_region_start,
                                                          flank.down_seq_region_end,
                                                          strand)
  [upstream_slice, downstream_slice]
end
85
+
86
# Return the TranscriptVariation records for this variation feature.
# Records stored in the database are returned as-is; when there are none,
# the consequences are calculated on the fly.
#
# @return [Array<TranscriptVariation>]
def transcript_variations
  stored = TranscriptVariation.find_all_by_variation_feature_id(self.variation_feature_id)
  # the variation is already present in the database
  return stored unless stored[0].nil?

  # nothing stored for this variation: run the calculations
  custom_transcript_variation(self, core_connection(self.seq_region_id))
end
95
+
96
+ private
97
+
98
# Make sure a Core database connection exists, then return the SeqRegion
# with the given id, consulting the Ensembl::SESSION cache first.
#
# @param seq_region_id id of the seq_region to look up
# @return the SeqRegion record (cached in Ensembl::SESSION.seq_regions)
# @raise [NameError] when connecting to the Core database fails
def core_connection(seq_region_id)
  unless Ensembl::Core::DBConnection.connected?
    host, user, password, db_name, port, species, release = Ensembl::Variation::DBConnection.get_info
    begin
      Ensembl::Core::DBConnection.connect(species, release.to_i,
                                          :username => user, :password => password,
                                          :host => host, :port => port)
    rescue
      raise NameError, "Can't derive Core database name from #{db_name}. Are you using non conventional names?"
    end
  end

  # Serve the SeqRegion from the session cache when it is already there.
  if Ensembl::SESSION.seq_regions.has_key?(seq_region_id)
    return Ensembl::SESSION.seq_regions[seq_region_id]
  end

  fetched = Ensembl::Core::SeqRegion.find(seq_region_id)
  Ensembl::SESSION.seq_regions[fetched.id] = fetched
  fetched
end
117
+
118
+ # Calculate a consequence type for a user-defined variation
119
# Build TranscriptVariation records for a variation that has none stored.
# Each transcript of each gene returned for the region around the variation
# is run through the check_* methods in order; a later check only runs while
# the consequence is still empty.
#
# @param vf the variation feature being classified
# @param sr the seq_region record the variation lies on
# @return [Array<TranscriptVariation>] one unsaved record per transcript, or
#   a single "INTERGENIC" record when no transcript was found
def custom_transcript_variation(vf, sr)
  @variation_name = vf.variation_name
  @seq_region = sr

  downstream = 5000
  upstream = 5000
  results = [] # all the calculated TranscriptVariations

  # slice of the genomic region in which the variation is located
  coord_system_name = Ensembl::Core::CoordSystem.find(sr.coord_system_id).name
  region = Ensembl::Core::Slice.fetch_by_region(coord_system_name,
                                                sr.name,
                                                vf.seq_region_start - upstream,
                                                vf.seq_region_end + downstream - 1)
  genes = region.genes(true)

  unless genes[0].nil?
    genes.each do |gene|
      gene.transcripts.each do |t|
        @cache = {}
        tv = TranscriptVariation.new
        unresolved = lambda { tv.consequence_type == "" }

        # intergenic for this transcript (no effects)?
        tv.consequence_type = check_intergenic(vf, t)

        # upstream or downstream of the transcript?
        tv.consequence_type = check_upstream_downstream(vf, t) if unresolved.call

        # from here on the variation is inside the transcript: non coding gene?
        tv.consequence_type = check_non_coding(vf, t) if unresolved.call && t.biotype != 'protein_coding'

        # intron / exon boundaries
        tv.consequence_type = check_splice_site(vf, t) if unresolved.call

        # UTRs
        tv.consequence_type = check_utr(vf, t) if unresolved.call

        # still nothing: the variation is inside an exon, check the codon change
        (tv.consequence_type, tv.peptide_allele_string) = check_aa_change(vf, t) if unresolved.call

        begin # this changed from release 58
          tv.transcript_stable_id = t.stable_id
        rescue NoMethodError
          tv.transcript_id = t.id
        end

        tv.consequence_type = "INTERGENIC" if unresolved.call
        results << tv
      end
    end
  end

  # no transcripts/genes within 5000 bases upstream and downstream:
  # report the variation as INTERGENIC (no effects)
  if results.empty?
    fallback = TranscriptVariation.new
    fallback.consequence_type = "INTERGENIC"
    results << fallback
  end

  results
end
179
+
180
+ ## CONSEQUENCE CALCULATION FUNCTIONS ##
181
+
182
# Report "INTERGENIC" when the variation lies more than 5000 bases away
# from the transcript on either side.
#
# @param vf variation feature (responds to seq_region_start / seq_region_end)
# @param t transcript (responds to seq_region_start / seq_region_end)
# @return [String, nil] "INTERGENIC", or nil when the variation is closer
def check_intergenic(vf, t)
  gap_before = t.seq_region_start - vf.seq_region_end   # variation ends before the transcript
  gap_after = vf.seq_region_start - t.seq_region_end    # variation starts after the transcript
  return "INTERGENIC" if gap_before > 5000 || gap_after > 5000

  nil
end
190
+
191
# Classify a variation lying outside the transcript but within 5000 bases of
# it as UPSTREAM or DOWNSTREAM (depending on the transcript strand), or as
# COMPLEX_INDEL when a transcript boundary falls strictly inside the
# variation span.
#
# @param vf variation feature (seq_region_start / seq_region_end)
# @param t transcript (seq_region_start / seq_region_end / strand)
# @return [String, nil] the consequence, or nil when none of the cases apply
def check_upstream_downstream(vf, t)
  v_start, v_end = vf.seq_region_start, vf.seq_region_end
  t_start, t_end = t.seq_region_start, t.seq_region_end

  if v_end < t_start && (t_start - v_end) <= 5000
    return t.strand == 1 ? "UPSTREAM" : "DOWNSTREAM"
  end
  if v_start > t_end && (v_start - t_end) <= 5000
    return t.strand == 1 ? "DOWNSTREAM" : "UPSTREAM"
  end

  # an InDel overlapping the transcript start / end
  overlaps_edge = [t_start, t_end].any? { |edge| edge > v_start && edge < v_end }
  overlaps_edge ? "COMPLEX_INDEL" : nil
end
205
+
206
# Consequence for a variation inside a non protein-coding transcript.
# A variation spanning more than one position is always "COMPLEX_INDEL";
# otherwise the term depends on the transcript biotype.
#
# @param vf variation feature (seq_region_start / seq_region_end)
# @param t transcript (biotype)
# @return [String] the consequence term
def check_non_coding(vf, t)
  return "COMPLEX_INDEL" unless vf.seq_region_start == vf.seq_region_end

  case t.biotype
  when "miRNA" then "WITHIN_MATURE_miRNA"
  when "nonsense_mediated_decay" then "NMD_TRANSCRIPT"
  else "WITHIN_NON_CODING_GENE"
  end
end
216
+
217
# Detect a variation lying between a transcript end and the coding region.
# Which UTR is reported depends on the transcript strand. All comparisons
# are strict, so a variation touching a boundary is not reported here.
#
# @param vf variation feature (seq_region_start / seq_region_end)
# @param t transcript (seq_region_start / seq_region_end /
#   coding_region_genomic_start / coding_region_genomic_end / strand)
# @return [String, nil] "5PRIME_UTR", "3PRIME_UTR" or nil
def check_utr(vf, t)
  v_start, v_end = vf.seq_region_start, vf.seq_region_end
  forward = (t.strand == 1)

  if v_start > t.seq_region_start && v_end < t.coding_region_genomic_start
    forward ? "5PRIME_UTR" : "3PRIME_UTR"
  elsif v_start > t.coding_region_genomic_end && v_end < t.seq_region_end
    forward ? "3PRIME_UTR" : "5PRIME_UTR"
  end
end
225
+
226
# Classify a variation relative to the exon boundaries of a transcript.
#
# * Both variation ends outside every exon (intron): "ESSENTIAL_SPLICE_SITE"
#   when an exon lies within 2 bp, "SPLICE_SITE" when within 8 bp, otherwise
#   "INTRONIC".
# * Both ends inside exons: "SPLICE_SITE" when within 3 bp of the exon edge,
#   otherwise nil (no splice consequence found).
# * One end in an exon, the other not: "COMPLEX_INDEL".
#
# The exon ranges are also stored in @cache[:exons].
#
# @param vf variation feature (seq_region_start / seq_region_end / seq_region_strand)
# @param t transcript (exons, each with seq_region_start / seq_region_end)
# @return [String, nil] the consequence, or nil
def check_splice_site(vf, t)
  @cache ||= {}
  exons = t.exons.map { |ex| Range.new(ex.seq_region_start, ex.seq_region_end) }
  @cache[:exons] = exons

  # the two ends are taken in swapped order when the variation is not on strand 1
  var_start, var_end =
    if vf.seq_region_strand == 1
      [vf.seq_region_start, vf.seq_region_end]
    else
      [vf.seq_region_end, vf.seq_region_start]
    end

  exon_up = check_near_exons(var_start, exons)
  exon_down = check_near_exons(var_end, exons)

  if !exon_up && !exon_down # we are inside an intron: check the boundaries
    if check_near_exons((var_start - 2)..var_start, exons) ||
       check_near_exons(var_end..(var_end + 2), exons)
      return "ESSENTIAL_SPLICE_SITE"
    end
    # The upstream window must extend *below* var_start; the former
    # `var_start+8..var_start` was a reversed range and missed these exons.
    if check_near_exons((var_start - 8)..var_start, exons) ||
       check_near_exons(var_end..(var_end + 8), exons)
      return "SPLICE_SITE"
    end
    return "INTRONIC"
  end

  # a complex indel spanning an intron/exon boundary
  return "COMPLEX_INDEL" unless exon_up && exon_down

  # inside an exon: splice site when close to either exon edge
  return "SPLICE_SITE" if (var_start - exon_up.first) <= 3 || (exon_down.last - var_end) <= 3

  nil
end
258
+
259
# Work out the coding consequence of a variation located inside an exon.
#
# Any variation spanning more than one position, or having "-" among its
# alleles, is reported as a frameshift. Otherwise the second allele of the
# allele string replaces the base at the variation's CDS position and the
# affected codon is translated before and after the change.
#
# @param vf variation feature (allele_string / seq_region_start / seq_region_end)
# @param t transcript (genomic2cds / cds_seq / seq_region_strand)
# @return [Array] two elements: the consequence term and the peptide string
#   "ref/mut" (nil for a frameshift)
# @raise [RuntimeError] when a codon is not found in the codon table
def check_aa_change(vf, t)
  alleles = vf.allele_string.split('/') # the different alleles for this variation

  # an InDel produces a frameshift
  if vf.seq_region_start != vf.seq_region_end || alleles.include?("-")
    return "FRAMESHIFT_CODING", nil
  end

  # position inside the CDS
  mutation_position = t.genomic2cds(vf.seq_region_start)

  mutation_base = Bio::Sequence::NA.new(alleles[1])
  mutation_base.reverse_complement! if t.seq_region_strand == -1

  cds_sequence = t.cds_seq
  mut_sequence = cds_sequence.dup
  # replace the base with the variant allele
  mut_sequence[mutation_position] = mutation_base.seq

  # first and last index of the codon containing the mutated position
  codon_start = (mutation_position / 3) * 3
  codon_span = codon_start..(codon_start + 2)
  refcodon = cds_sequence[codon_span]
  mutcodon = mut_sequence[codon_span]

  codontable = Bio::CodonTable[1]
  # both codons are looked up lower-cased, as was already done for the mutated one
  refaa = codontable[refcodon.downcase]
  mutaa = codontable[mutcodon.downcase]
  # `raise RuntimeError "..."` (no comma) used to blow up with NoMethodError
  raise RuntimeError, "Codon #{mutcodon.downcase} wasn't recognized." if mutaa.nil?
  raise RuntimeError, "Codon #{refcodon.downcase} wasn't recognized." if refaa.nil?

  pep_string = refaa + "/" + mutaa
  consequence =
    if mutaa == "*" && refaa != "*"
      "STOP_GAINED"
    elsif mutaa != "*" && refaa == "*"
      "STOP_LOST"
    elsif mutaa != refaa
      "NON_SYNONYMOUS_CODING"
    else
      "SYNONYMOUS_CODING"
    end
  return consequence, pep_string
end
301
+
302
+
303
# Find the first exon range touched by a feature.
#
# @param feature a single position, or a Range of positions
# @param exons_ranges [Array<Range>] exon coordinate ranges
# @return [Range, false] the first exon range that contains the position
#   (or overlaps the range); false when there is none
def check_near_exons(feature, exons_ranges)
  match =
    if feature.is_a?(Range)
      exons_ranges.find { |exon| feature.first <= exon.last && exon.first <= feature.last }
    else
      exons_ranges.find { |exon| exon.include?(feature) }
    end
  match || false
end
313
+
314
+
315
+ end # VariationFeature
316
+
317
+ # The TranscriptVariation class gives information about the position of
318
+ # a VariationFeature, mapped on an annotated transcript.
319
+ #
320
+ # This class uses ActiveRecord to access data in the Ensembl database.
321
+ # See the general documentation of the Ensembl module for
322
+ # more information on what this means and what methods are available.
323
+ #
324
+ # @example
325
+ # vf = Variation.find_by_name('rs10111').variation_feature
326
+ # vf.transcript_variations.each do |tv|
327
+ # puts tv.peptide_allele_string, tv.transcript.stable_id
328
+ # end
329
+ #
330
class TranscriptVariation < DBConnection
  set_primary_key "transcript_variation_id"
  belongs_to :variation_feature

  # Only the consequence terms listed here pass validation.
  validates_inclusion_of :consequence_type,
                         :in => %w[
                           ESSENTIAL_SPLICE_SITE
                           STOP_GAINED
                           STOP_LOST
                           COMPLEX_INDEL
                           FRAMESHIFT_CODING
                           NON_SYNONYMOUS_CODING
                           SPLICE_SITE
                           PARTIAL_CODON
                           SYNONYMOUS_CODING
                           REGULATORY_REGION
                           WITHIN_MATURE_miRNA
                           5PRIME_UTR
                           3PRIME_UTR
                           INTRONIC
                           NMD_TRANSCRIPT
                           UPSTREAM
                           DOWNSTREAM
                           WITHIN_NON_CODING_GENE
                           HGMD_MUTATION
                         ],
                         :message => "Consequence type not allowed!"

  # Workaround: ActiveRecord does not parse MySQL SET fields, so the raw
  # (pre-typecast) column value is returned, rendered as a String.
  def consequence_type
    raw_value = attributes_before_type_cast['consequence_type']
    "#{raw_value}"
  end

  # Look up the Core transcript this record refers to, connecting to the
  # Core database first when no connection is open.
  #
  # @return the matching Ensembl::Core::Transcript
  def transcript
    host, user, password, db_name, port, species, release = Ensembl::Variation::DBConnection.get_info
    unless Ensembl::Core::DBConnection.connected?
      Ensembl::Core::DBConnection.connect(species, release.to_i,
                                          :username => user, :password => password,
                                          :host => host, :port => port)
    end

    begin # this changed from release 58
      Ensembl::Core::Transcript.find_by_stable_id(self.transcript_stable_id)
    rescue NoMethodError
      # fall back to the numeric transcript id when the stable id lookup is unavailable
      Ensembl::Core::Transcript.find(self.transcript_id)
    end
  end
end
373
+
374
+ end
375
+
376
+ end
@@ -0,0 +1,444 @@
1
+ #
2
+ # = ensembl/variation/variation.rb - Extension of ActiveRecord classes for Ensembl variation features
3
+ #
4
+ # Copyright:: Copyright (C) 2008 Francesco Strozzi <francesco.strozzi@gmail.com>
5
+ # License:: The Ruby License
6
+ #
7
+ # @author Francesco Strozzi
8
+
9
+
10
+ module Ensembl
11
+
12
+ module Variation
13
+
14
+
15
+ # The VariationFeature class gives information about the genomic position of
16
+ # each Variation, including also validation status and consequence type.
17
+ #
18
+ # This class uses ActiveRecord to access data in the Ensembl database.
19
+ # See the general documentation of the Ensembl module for
20
+ # more information on what this means and what methods are available.
21
+ #
22
+ # @example
23
+ # # SLOWER QUERY
24
+ # vf = VariationFeature.find_by_variation_name('rs10111')
25
+ # # FASTER QUERY
26
+ # vf = Variation.find_by_name('rs10111').variation_feature
27
+ #
28
+ # puts vf.seq_region_start, vf.seq_region_end, vf.allele_string
29
+ # puts vf.variation.ancestral_allele
30
+ # genomic_region = vf.fetch_region (returns an Ensembl::Core::Slice)
31
+ # genomic_region.genes
32
+ # up_region,down_region = vf.flanking_seq (returns two Ensembl::Core::Slice)
33
+ #
34
+ class VariationFeature < DBConnection
35
# Primary key and associations of the variation_feature table.
set_primary_key "variation_feature_id"
belongs_to :variation
has_many :tagged_variation_features
has_many :samples, :through => :tagged_variation_features
belongs_to :seq_region

# Only the consequence terms listed here pass validation.
validates_inclusion_of :consequence_types,
                       :in => %w[
                         intergenic_variant
                         splice_acceptor_variant
                         splice_donor_variant
                         complex_change_in_transcript
                         stop_lost
                         coding_sequence_variant
                         non_synonymous_codon
                         stop_gained
                         synonymous_codon
                         frameshift_variant
                         nc_transcript_variant
                         mature_miRNA_variant
                         NMD_transcript_variant
                         5_prime_UTR_variant
                         3_prime_UTR_variant
                         incomplete_terminal_codon_variant
                         intron_variant
                         splice_region_variant
                         5KB_downstream_variant
                         500B_downstream_variant
                         5KB_upstream_variant
                         2KB_upstream_variant
                         initiator_codon_change
                         stop_retained_variant
                         inframe_codon_gain
                         inframe_codon_loss
                         miRNA_target_site_variant
                         pre_miRNA_variant
                         regulatory_region_variant
                         increased_binding_affinity
                         decreased_binding_affinity
                         binding_site_variant
                       ],
                       :message => "Consequence type not allowed!"
73
+
74
# Workaround: ActiveRecord does not parse MySQL SET fields, so the raw
# (pre-typecast) column value is returned, rendered as a String.
#
# @return [String] the raw consequence_types value; "" when it is nil
def consequence_types
  raw_value = attributes_before_type_cast['consequence_types']
  "#{raw_value}"
end
77
+
78
+ # Based on Perl API 'get_all_Genes' method for Variation class. Get a genomic region
79
+ # starting from the Variation coordinates, expanding the region upstream and
80
+ # downstream.
81
+ #
82
+ # @param [Integer] up Length of upstream flanking region
83
+ # @param [Integer] down Length of downstream flanking region
84
+ # @return [Slice] Slice object containing the variation
85
def fetch_region(up = 5000, down = 5000)
  sr = core_connection(self.seq_region_id)
  coord_system_name = Ensembl::Core::CoordSystem.find(sr.coord_system_id).name
  # Widen the variation's own span by the requested flanks on each side.
  Ensembl::Core::Slice.fetch_by_region(coord_system_name,
                                       sr.name,
                                       self.seq_region_start - up,
                                       self.seq_region_end + down)
end
90
+
91
# Fetch the two flanking regions recorded for this variation as slices.
# Coordinates come from the variation's flanking_sequence record; both
# slices are requested on the strand of this variation feature.
#
# @return [Array] two elements: the "up" slice, then the "down" slice
def flanking_seq
  sr = core_connection(self.seq_region_id)
  flank = Variation.find(self.variation_id).flanking_sequence
  coord_system_name = Ensembl::Core::CoordSystem.find(sr.coord_system_id).name
  strand = self.seq_region_strand

  upstream_slice = Ensembl::Core::Slice.fetch_by_region(coord_system_name, sr.name,
                                                        flank.up_seq_region_start,
                                                        flank.up_seq_region_end,
                                                        strand)
  downstream_slice = Ensembl::Core::Slice.fetch_by_region(coord_system_name, sr.name,
                                                          flank.down_seq_region_start,
                                                          flank.down_seq_region_end,
                                                          strand)
  [upstream_slice, downstream_slice]
end
98
+
99
# Return the TranscriptVariation records for this variation feature.
# Records stored in the database are returned as-is; when there are none,
# the consequences are calculated on the fly.
#
# @return [Array<TranscriptVariation>]
def transcript_variations
  stored = TranscriptVariation.find_all_by_variation_feature_id(self.variation_feature_id)
  # the variation is already present in the database
  return stored unless stored[0].nil?

  # nothing stored for this variation: run the calculations
  custom_transcript_variation(self, core_connection(self.seq_region_id))
end
108
+
109
+ private
110
+
111
# Make sure a Core database connection exists, then return the SeqRegion
# with the given id, consulting the Ensembl::SESSION cache first.
#
# @param seq_region_id id of the seq_region to look up
# @return the SeqRegion record (cached in Ensembl::SESSION.seq_regions)
# @raise [NameError] when connecting to the Core database fails
def core_connection(seq_region_id)
  unless Ensembl::Core::DBConnection.connected?
    host, user, password, db_name, port, species, release = Ensembl::Variation::DBConnection.get_info
    begin
      Ensembl::Core::DBConnection.connect(species, release.to_i,
                                          :username => user, :password => password,
                                          :host => host, :port => port)
    rescue
      raise NameError, "Can't derive Core database name from #{db_name}. Are you using non conventional names?"
    end
  end

  # Serve the SeqRegion from the session cache when it is already there.
  if Ensembl::SESSION.seq_regions.has_key?(seq_region_id)
    return Ensembl::SESSION.seq_regions[seq_region_id]
  end

  fetched = Ensembl::Core::SeqRegion.find(seq_region_id)
  Ensembl::SESSION.seq_regions[fetched.id] = fetched
  fetched
end
130
+
131
+ # Calculate a consequence type for a user-defined variation
132
# Build TranscriptVariation records for a variation that has none stored.
# Each transcript of each gene returned for the region around the variation
# is run through the check_* methods in order; a later check only runs while
# the consequence is still empty.
#
# @param vf the variation feature being classified
# @param sr the seq_region record the variation lies on
# @return [Array<TranscriptVariation>] one unsaved record per transcript, or
#   a single "intergenic_variant" record when no transcript was found
def custom_transcript_variation(vf, sr)
  @variation_name = vf.variation_name
  @seq_region = sr

  downstream = 5000
  upstream = 5000
  results = [] # all the calculated TranscriptVariations

  # order the two coordinates so that the region is built from low to high
  if vf.seq_region_start > vf.seq_region_end
    var_start, var_end = vf.seq_region_end, vf.seq_region_start
  else
    var_start, var_end = vf.seq_region_start, vf.seq_region_end
  end

  # slice of the genomic region in which the variation is located
  coord_system_name = Ensembl::Core::CoordSystem.find(sr.coord_system_id).name
  region = Ensembl::Core::Slice.fetch_by_region(coord_system_name,
                                                sr.name,
                                                var_start - upstream,
                                                var_end + downstream)
  genes = region.genes(true)

  unless genes[0].nil?
    genes.each do |gene|
      gene.transcripts.each do |t|
        @cache = {}
        tv = TranscriptVariation.new
        unresolved = lambda { tv.consequence_types == "" }

        # intergenic for this transcript (no effects)?
        tv.consequence_types = check_intergenic(vf, t)

        # upstream or downstream of the transcript?
        tv.consequence_types = check_upstream_downstream(vf, t) if unresolved.call

        # partial codon
        tv.consequence_types = check_partial_codon(vf, t) if unresolved.call

        # from here on the variation is inside the transcript: non coding gene?
        tv.consequence_types = check_non_coding(vf, t) if unresolved.call && t.biotype != 'protein_coding'

        # intron / exon boundaries
        tv.consequence_types = check_splice_site(vf, t) if unresolved.call

        # UTRs
        tv.consequence_types = check_utr(vf, t) if unresolved.call

        # still nothing: the variation is inside an exon, check the codon change
        (tv.consequence_types, tv.pep_allele_string) = check_aa_change(vf, t) if unresolved.call

        tv.feature_stable_id = t.stable_id

        # a consequence that is still empty is left as-is (no per-transcript intergenic fallback)
        results << tv
      end
    end
  end

  # no transcripts/genes within 5000 bases upstream and downstream:
  # report the variation as intergenic (no effects)
  if results.empty?
    fallback = TranscriptVariation.new
    fallback.consequence_types = "intergenic_variant"
    results << fallback
  end

  results
end
198
+
199
+ ## CONSEQUENCE CALCULATION METHODS ##
200
+
201
# Report "intergenic_variant" when the variation lies more than 5000 bases
# away from the transcript on either side.
#
# @param vf variation feature (responds to seq_region_start / seq_region_end)
# @param t transcript (responds to seq_region_start / seq_region_end)
# @return [String, nil] "intergenic_variant", or nil when the variation is closer
def check_intergenic(vf, t)
  gap_before = t.seq_region_start - vf.seq_region_end   # variation ends before the transcript
  gap_after = vf.seq_region_start - t.seq_region_end    # variation starts after the transcript
  return "intergenic_variant" if gap_before > 5000 || gap_after > 5000

  nil
end
209
+
210
# Classify a variation lying entirely before or after the transcript.
# The distance to the transcript (computed with a +1 offset) selects the
# 2KB upstream / 500B downstream terms, with the 5KB terms as fallback; the
# transcript strand decides which side counts as upstream. A transcript
# boundary strictly inside the variation span yields
# "complex_change_in_transcript".
#
# @param vf variation feature (seq_region_start / seq_region_end)
# @param t transcript (seq_region_start / seq_region_end / strand)
# @return [String, nil] the consequence, or nil when none of the cases apply
def check_upstream_downstream(vf, t)
  v_start, v_end = vf.seq_region_start, vf.seq_region_end
  t_start, t_end = t.seq_region_start, t.seq_region_end

  if v_end < t_start
    gap = t_start - v_end + 1
    return "2KB_upstream_variant" if t.strand == 1 && gap <= 2000
    return "500B_downstream_variant" if t.strand == -1 && gap <= 500
    return t.strand == 1 ? "5KB_upstream_variant" : "5KB_downstream_variant"
  end

  if v_start > t_end
    gap = v_start - t_end + 1
    return "2KB_upstream_variant" if t.strand == -1 && gap <= 2000
    return "500B_downstream_variant" if t.strand == 1 && gap <= 500
    return t.strand == 1 ? "5KB_downstream_variant" : "5KB_upstream_variant"
  end

  # an InDel overlapping the transcript start / end
  overlaps_edge = [t_start, t_end].any? { |edge| edge > v_start && edge < v_end }
  overlaps_edge ? "complex_change_in_transcript" : nil
end
237
+
238
# Consequence for a variation inside a non protein-coding transcript,
# chosen from the transcript biotype alone.
#
# @param vf variation feature (not consulted by this check)
# @param t transcript (biotype)
# @return [String] the consequence term
def check_non_coding(vf, t)
  case t.biotype
  when "miRNA" then "mature_miRNA_variant"
  when "nonsense_mediated_decay" then "NMD_transcript_variant"
  else "nc_transcript_variant"
  end
end
247
+
248
# Detect a variation lying between a transcript end and the coding region.
# Which UTR is reported depends on the transcript strand. All comparisons
# are strict, so a variation touching a boundary is not reported here.
#
# @param vf variation feature (seq_region_start / seq_region_end)
# @param t transcript (seq_region_start / seq_region_end /
#   coding_region_genomic_start / coding_region_genomic_end / strand)
# @return [String, nil] "5_prime_UTR_variant", "3_prime_UTR_variant" or nil
def check_utr(vf, t)
  v_start, v_end = vf.seq_region_start, vf.seq_region_end
  forward = (t.strand == 1)

  if v_start > t.seq_region_start && v_end < t.coding_region_genomic_start
    forward ? "5_prime_UTR_variant" : "3_prime_UTR_variant"
  elsif v_start > t.coding_region_genomic_end && v_end < t.seq_region_end
    forward ? "3_prime_UTR_variant" : "5_prime_UTR_variant"
  end
end
256
+
257
# Classify a variation relative to the exon boundaries of a transcript.
#
# * Both variation ends outside every exon (intron): a splice donor /
#   acceptor variant when an exon lies within 2 bp (which one depends on the
#   side and the transcript strand), "splice_region_variant" when within
#   8 bp, otherwise "intron_variant".
# * Both ends inside exons: "splice_region_variant" when within 3 bp of the
#   exon edge, otherwise nil (no splice consequence found).
# * One end in an exon, the other not: "complex_change_in_transcript".
#
# The exon ranges are also stored in @cache[:exons].
#
# @param vf variation feature (seq_region_start / seq_region_end / seq_region_strand)
# @param t transcript (strand; exons, each with seq_region_start / seq_region_end)
# @return [String, nil] the consequence, or nil
def check_splice_site(vf, t)
  @cache ||= {}
  exons = t.exons.map { |ex| Range.new(ex.seq_region_start, ex.seq_region_end) }
  @cache[:exons] = exons

  # the two ends are taken in swapped order when the variation is not on strand 1
  var_start, var_end =
    if vf.seq_region_strand == 1
      [vf.seq_region_start, vf.seq_region_end]
    else
      [vf.seq_region_end, vf.seq_region_start]
    end

  exon_up = check_near_exons(var_start, exons)
  exon_down = check_near_exons(var_end, exons)

  if !exon_up && !exon_down # we are inside an intron: check the boundaries
    if check_near_exons((var_start - 2)..var_start, exons)
      return t.strand == 1 ? "splice_donor_variant" : "splice_acceptor_variant"
    end
    if check_near_exons(var_end..(var_end + 2), exons)
      return t.strand == 1 ? "splice_acceptor_variant" : "splice_donor_variant"
    end
    # The upstream window must extend *below* var_start; the former
    # `var_start+8..var_start` was a reversed range and missed these exons.
    if check_near_exons((var_start - 8)..var_start, exons) ||
       check_near_exons(var_end..(var_end + 8), exons)
      return "splice_region_variant"
    end
    return "intron_variant"
  end

  # a complex indel spanning an intron/exon boundary
  return "complex_change_in_transcript" unless exon_up && exon_down

  # inside an exon: splice region when close to either exon edge
  return "splice_region_variant" if (var_start - exon_up.first) <= 3 || (exon_down.last - var_end) <= 3

  nil
end
291
+
292
# Work out the coding consequence of a variation located inside an exon.
#
# The CDS position and CDS sequence are taken from @cache when present
# (keys :mutation_position and :cds_sequence), otherwise computed from the
# transcript. The affected codon is translated before and after applying
# the second allele of the allele string.
#
# @param vf variation feature (allele_string / seq_region_start / seq_region_end)
# @param t transcript (genomic2cds / cds_seq / strand / seq_region_strand /
#   coding_region_genomic_start / coding_region_genomic_end)
# @return [Array] two elements: the consequence term and the peptide string
#   "ref/mut" (nil where no peptide change is reported)
def check_aa_change(vf, t)
  alleles = vf.allele_string.split('/') # the different alleles for this variation

  # Position inside the CDS. The key used to be misspelled :mutation_positon
  # here, so the cached position was never found.
  cache = @cache || {}
  mutation_position = cache[:mutation_position] || t.genomic2cds(vf.seq_region_start)
  cds_sequence = cache[:cds_sequence] || t.cds_seq

  return "coding_sequence_variant", nil if vf.allele_string =~ /INSERTION|DELETION|MUTATION/

  mutation_base = Bio::Sequence::NA.new(alleles[1])
  mutation_base.reverse_complement! if t.seq_region_strand == -1

  # the rank of the codon
  target_codon = mutation_position / 3 + 1
  mut_sequence = cds_sequence.dup

  # replace the base with the variant allele
  if alleles[1] == "-" # a deletion
    # NOTE(review): this strips every match of the reference allele from the
    # whole CDS rather than only at mutation_position -- confirm intent.
    mut_sequence.gsub!(/#{alleles[0]}/, '')
  else # insertion or SNP
    mut_sequence[mutation_position] = mutation_base.seq
  end

  codon_start = target_codon * 3 - 3
  mutcodon = mut_sequence[codon_start..(codon_start + 2 + alleles[1].length - 1)]
  refcodon = cds_sequence[codon_start..(codon_start + 2 + alleles[0].length - 1)]
  codontable = Bio::CodonTable[1]
  refaa = codontable[refcodon]
  mutaa = codontable[mutcodon.downcase]

  pep_string = refaa.to_s + "/" + mutaa.to_s
  transcript_start = (t.strand == 1) ? t.coding_region_genomic_start : t.coding_region_genomic_end

  if (vf.seq_region_start - transcript_start).abs <= 3
    return "initiator_codon_change", pep_string
  elsif (mutcodon.length > refcodon.length) && (mutcodon =~ /^#{refcodon}/ || mutcodon =~ /#{refcodon}$/)
    return "inframe_codon_gain", pep_string
  elsif (mutcodon.length < refcodon.length) && (refcodon =~ /^#{mutcodon}/ || refcodon =~ /#{mutcodon}$/)
    return "inframe_codon_loss", pep_string
  elsif vf.seq_region_start != vf.seq_region_end
    # an InDel produces a frameshift
    return "frameshift_variant", nil
  elsif (mutaa == "*" && refaa == "*") && (refcodon != mutcodon.downcase)
    # returned as a pair like every other branch (was a bare string)
    return "stop_retained_variant", pep_string
  elsif mutaa == "*" && refaa != "*"
    return "stop_gained", pep_string
  elsif mutaa != "*" && refaa == "*"
    return "stop_lost", pep_string
  elsif mutaa != refaa
    return "non_synonymous_codon", pep_string
  elsif mutaa == refaa
    return "synonymous_codon", pep_string
  end
end
348
+
349
# Reports whether the variation falls on an incomplete terminal codon.
#
# As a side effect the CDS position of the variation and the CDS sequence
# are stored in @cache under :mutation_position and :cds_sequence.
#
# @param vf the variation feature; seq_region_start is read
# @param t the transcript; genomic2cds and cds_seq are called
# @return [String, nil] "incomplete_terminal_codon_variant" when the
#   variation lies within the last 3 positions of the CDS and the CDS length
#   is not a multiple of 3; nil otherwise, including when a lookup fails
def check_partial_codon(vf,t)
  # Clear previous entries first so that a failed lookup below cannot leave
  # values from an earlier call behind in the cache.
  @cache[:mutation_position] = nil
  @cache[:cds_sequence] = nil

  mutation_position = t.genomic2cds(vf.seq_region_start)
  cds_sequence = t.cds_seq
  @cache[:mutation_position] = mutation_position
  @cache[:cds_sequence] = cds_sequence

  # check if the mutation is on the last codon and if it's a partial codon
  return nil if (cds_sequence.length - mutation_position) > 3
  (cds_sequence.length % 3 == 0) ? nil : "incomplete_terminal_codon_variant"
rescue StandardError
  # Best effort: a failed lookup means "no consequence". Only StandardError
  # is handled so that Interrupt, SystemExit and similar still propagate.
  nil
end
363
+
364
# Looks up the first exon range that a feature touches.
#
# @param feature a Range (tested for overlap with each exon range) or any
#   other value (tested for membership with include?)
# @param exons_ranges the exon ranges to search, in order
# @return the first matching exon range, or false when none matches
def check_near_exons(feature,exons_ranges)
  range_query = feature.is_a? Range
  hit = exons_ranges.find do |exon|
    if range_query
      # two closed intervals overlap when each starts before the other ends
      (feature.first <= exon.last) && (exon.first <= feature.last)
    else
      exon.include? feature
    end
  end
  hit || false
end
374
+
375
+
376
+ end # VariationFeature
377
+
378
# A TranscriptVariation describes where a VariationFeature falls on an
# annotated transcript.
#
# Data is read from the Ensembl database through ActiveRecord. The general
# documentation of the Ensembl module explains what that implies and which
# methods are available.
#
# @example
#  vf = Variation.find_by_name('rs10111').variation_feature
#  vf.transcript_variations.each do |tv|
#    puts tv.peptide_allele_string, tv.transcript.stable_id
#  end
#
class TranscriptVariation < DBConnection
  set_primary_key "transcript_variation_id"
  belongs_to :variation_feature
  validates_inclusion_of :consequence_types,
                         :in => %w[
                           intergenic_variant
                           splice_acceptor_variant
                           splice_donor_variant
                           complex_change_in_transcript
                           stop_lost
                           coding_sequence_variant
                           non_synonymous_codon
                           stop_gained
                           synonymous_codon
                           frameshift_variant
                           nc_transcript_variant
                           mature_miRNA_variant
                           NMD_transcript_variant
                           5_prime_UTR_variant
                           3_prime_UTR_variant
                           incomplete_terminal_codon_variant
                           intron_variant
                           splice_region_variant
                           5KB_downstream_variant
                           500B_downstream_variant
                           5KB_upstream_variant
                           2KB_upstream_variant
                           initiator_codon_change
                           stop_retained_variant
                           inframe_codon_gain
                           inframe_codon_loss
                           miRNA_target_site_variant
                           pre_miRNA_variant
                           regulatory_region_variant
                           increased_binding_affinity
                           decreased_binding_affinity
                           binding_site_variant
                         ],
                         :message => "Consequence type not allowed!"

  # Returns the consequence types as a String built from the raw,
  # un-typecast column value (workaround: ActiveRecord does not parse
  # MySQL SET fields).
  def consequence_types
    raw_value = attributes_before_type_cast['consequence_types']
    "#{raw_value}"
  end

  # Returns the core Transcript whose stable id equals this record's
  # feature_stable_id. When the core database is not connected yet, a
  # connection is opened first using the settings reported by the
  # variation database connection.
  def transcript
    host, user, password, _db_name, port, species, release =
      Ensembl::Variation::DBConnection.get_info

    unless Ensembl::Core::DBConnection.connected?
      Ensembl::Core::DBConnection.connect(species, release.to_i,
                                          :username => user,
                                          :password => password,
                                          :host => host,
                                          :port => port)
    end

    Ensembl::Core::Transcript.find_by_stable_id(feature_stable_id)
  end

end
441
+
442
+ end
443
+
444
+ end