bio-ensembl 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. data/.document +5 -0
  2. data/Gemfile +20 -0
  3. data/Gemfile.lock +40 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +71 -0
  7. data/VERSION +1 -0
  8. data/bin/ensembl +40 -0
  9. data/bin/variation_effect_predictor +106 -0
  10. data/bio-ensembl.gemspec +190 -0
  11. data/lib/bio-ensembl.rb +65 -0
  12. data/lib/bio-ensembl/core/activerecord.rb +1812 -0
  13. data/lib/bio-ensembl/core/collection.rb +64 -0
  14. data/lib/bio-ensembl/core/project.rb +262 -0
  15. data/lib/bio-ensembl/core/slice.rb +657 -0
  16. data/lib/bio-ensembl/core/transcript.rb +409 -0
  17. data/lib/bio-ensembl/core/transform.rb +95 -0
  18. data/lib/bio-ensembl/db_connection.rb +205 -0
  19. data/lib/bio-ensembl/variation/activerecord.rb +536 -0
  20. data/lib/bio-ensembl/variation/variation_feature.rb +376 -0
  21. data/lib/bio-ensembl/variation/variation_feature62.rb +444 -0
  22. data/samples/ensembl_genomes_example.rb +60 -0
  23. data/samples/examples_perl_tutorial.rb +125 -0
  24. data/samples/small_example_ruby_api.rb +34 -0
  25. data/samples/variation_effect_predictor_data.txt +4 -0
  26. data/samples/variation_example.rb +67 -0
  27. data/test/data/seq_c6qbl.fa +10 -0
  28. data/test/data/seq_cso19_coding.fa +16 -0
  29. data/test/data/seq_cso19_transcript.fa +28 -0
  30. data/test/data/seq_drd3_gene.fa +838 -0
  31. data/test/data/seq_drd3_transcript.fa +22 -0
  32. data/test/data/seq_drd4_transcript.fa +24 -0
  33. data/test/data/seq_forward_composite.fa +1669 -0
  34. data/test/data/seq_par_boundary.fa +169 -0
  35. data/test/data/seq_rnd3_transcript.fa +47 -0
  36. data/test/data/seq_ub2r1_coding.fa +13 -0
  37. data/test/data/seq_ub2r1_gene.fa +174 -0
  38. data/test/data/seq_ub2r1_transcript.fa +26 -0
  39. data/test/data/seq_y.fa +2 -0
  40. data/test/default/test_connection.rb +60 -0
  41. data/test/default/test_releases.rb +130 -0
  42. data/test/ensembl_genomes/test_collection.rb +122 -0
  43. data/test/ensembl_genomes/test_gene.rb +46 -0
  44. data/test/ensembl_genomes/test_slice.rb +65 -0
  45. data/test/ensembl_genomes/test_variation.rb +38 -0
  46. data/test/helper.rb +18 -0
  47. data/test/release_50/core/test_project.rb +210 -0
  48. data/test/release_50/core/test_project_human.rb +52 -0
  49. data/test/release_50/core/test_relationships.rb +72 -0
  50. data/test/release_50/core/test_sequence.rb +170 -0
  51. data/test/release_50/core/test_slice.rb +116 -0
  52. data/test/release_50/core/test_transcript.rb +125 -0
  53. data/test/release_50/core/test_transform.rb +217 -0
  54. data/test/release_50/variation/test_activerecord.rb +138 -0
  55. data/test/release_50/variation/test_variation.rb +79 -0
  56. data/test/release_53/core/test_gene.rb +61 -0
  57. data/test/release_53/core/test_project.rb +91 -0
  58. data/test/release_53/core/test_project_human.rb +61 -0
  59. data/test/release_53/core/test_slice.rb +42 -0
  60. data/test/release_53/core/test_transform.rb +57 -0
  61. data/test/release_53/variation/test_activerecord.rb +137 -0
  62. data/test/release_53/variation/test_variation.rb +66 -0
  63. data/test/release_56/core/test_gene.rb +61 -0
  64. data/test/release_56/core/test_project.rb +91 -0
  65. data/test/release_56/core/test_slice.rb +49 -0
  66. data/test/release_56/core/test_transform.rb +57 -0
  67. data/test/release_56/variation/test_activerecord.rb +141 -0
  68. data/test/release_56/variation/test_consequence.rb +131 -0
  69. data/test/release_56/variation/test_variation.rb +63 -0
  70. data/test/release_60/core/test_gene.rb +61 -0
  71. data/test/release_60/core/test_project_human.rb +34 -0
  72. data/test/release_60/core/test_slice.rb +42 -0
  73. data/test/release_60/core/test_transcript.rb +120 -0
  74. data/test/release_60/core/test_transform.rb +57 -0
  75. data/test/release_60/variation/test_activerecord.rb +216 -0
  76. data/test/release_60/variation/test_consequence.rb +153 -0
  77. data/test/release_60/variation/test_variation.rb +64 -0
  78. data/test/release_62/core/test_gene.rb +42 -0
  79. data/test/release_62/variation/test_activerecord.rb +86 -0
  80. data/test/release_62/variation/test_consequence.rb +191 -0
  81. metadata +287 -0
@@ -0,0 +1,376 @@
1
+ #
2
+ # = ensembl/variation/variation_feature.rb - Extension of ActiveRecord classes for Ensembl variation features
3
+ #
4
+ # Copyright:: Copyright (C) 2008 Francesco Strozzi <francesco.strozzi@gmail.com>
5
+ # License:: The Ruby License
6
+ #
7
+ # @author Francesco Strozzi
8
+
9
+
10
+ module Ensembl
11
+
12
+ module Variation
13
+
14
+
15
+ # The VariationFeature class gives information about the genomic position of
16
+ # each Variation, including also validation status and consequence type.
17
+ #
18
+ # This class uses ActiveRecord to access data in the Ensembl database.
19
+ # See the general documentation of the Ensembl module for
20
+ # more information on what this means and what methods are available.
21
+ #
22
+ # @example
23
+ # # SLOWER QUERY
24
+ # vf = VariationFeature.find_by_variation_name('rs10111')
25
+ # # FASTER QUERY
26
+ # vf = Variation.find_by_name('rs10111').variation_feature
27
+ #
28
+ # puts vf.seq_region_start, vf.seq_region_end, vf.allele_string
29
+ # puts vf.variation.ancestral_allele
30
+ # genomic_region = vf.fetch_region (returns an Ensembl::Core::Slice)
31
+ # genomic_region.genes
32
+ # up_region,down_region = vf.flanking_seq (returns two Ensembl::Core::Slice)
33
+ #
34
+ class VariationFeature < DBConnection
35
# Table wiring: primary key and associations of a variation feature.
set_primary_key "variation_feature_id"
belongs_to :variation
has_many :tagged_variation_features
has_many :samples, :through => :tagged_variation_features
belongs_to :seq_region

# Consequence terms accepted for the +consequence_type+ column.
# 'INTERGENIC' is accepted as well: the consequence calculation in this
# file emits it when no transcript is affected.
# NOTE(review): presumably 'INTERGENIC' is also the schema default of this
# SET column - confirm against the Ensembl variation schema.
validates_inclusion_of :consequence_type,
                       :in => ['ESSENTIAL_SPLICE_SITE',
                               'STOP_GAINED',
                               'STOP_LOST',
                               'COMPLEX_INDEL',
                               'FRAMESHIFT_CODING',
                               'NON_SYNONYMOUS_CODING',
                               'SPLICE_SITE',
                               'PARTIAL_CODON',
                               'SYNONYMOUS_CODING',
                               'REGULATORY_REGION',
                               'WITHIN_MATURE_miRNA',
                               '5PRIME_UTR',
                               '3PRIME_UTR',
                               'INTRONIC',
                               'NMD_TRANSCRIPT',
                               'UPSTREAM',
                               'DOWNSTREAM',
                               'WITHIN_NON_CODING_GENE',
                               'INTERGENIC',
                               'HGMD_MUTATION'],
                       :message => "Consequence type not allowed!"
60
+
61
# Returns the +consequence_type+ column as a String.
#
# Workaround: ActiveRecord does not parse MySQL SET fields, so the value is
# read before type casting and interpolated into a fresh String (a nil value
# therefore becomes the empty string).
#
# @return [String] the uncast column value
def consequence_type
  raw_value = attributes_before_type_cast['consequence_type']
  "#{raw_value}"
end
64
+
65
# Based on the Perl API 'get_all_Genes' method of the Variation class.
# Builds a genomic slice around this variation by extending its coordinates
# on both sides.
#
# @param [Integer] up Length of upstream flanking region
# @param [Integer] down Length of downstream flanking region
# @return [Slice] Slice object containing the variation
def fetch_region(up = 5000, down = 5000)
  seq_region = core_connection(self.seq_region_id)
  coord_system_name = Ensembl::Core::CoordSystem.find(seq_region.coord_system_id).name
  Ensembl::Core::Slice.fetch_by_region(
    coord_system_name,
    seq_region.name,
    self.seq_region_start - up,
    self.seq_region_end + down
  )
end
77
+
78
# Fetch the two regions flanking this variation, using the coordinates
# stored in the flanking_sequence record of the parent Variation and the
# strand of this feature.
#
# @return [Array<Slice>] the upstream slice followed by the downstream slice
def flanking_seq
  seq_region = core_connection(self.seq_region_id)
  flank = Variation.find(self.variation_id).flanking_sequence

  upstream_slice = Ensembl::Core::Slice.fetch_by_region(
    Ensembl::Core::CoordSystem.find(seq_region.coord_system_id).name,
    seq_region.name,
    flank.up_seq_region_start,
    flank.up_seq_region_end,
    self.seq_region_strand
  )
  downstream_slice = Ensembl::Core::Slice.fetch_by_region(
    Ensembl::Core::CoordSystem.find(seq_region.coord_system_id).name,
    seq_region.name,
    flank.down_seq_region_start,
    flank.down_seq_region_end,
    self.seq_region_strand
  )

  [upstream_slice, downstream_slice]
end
85
+
86
# Transcript-level consequences of this variation.
#
# Records already stored for this feature are returned as they are; when
# none exist the consequences are calculated on the fly.
#
# @return [Array<TranscriptVariation>] stored or freshly calculated records
def transcript_variations
  stored = TranscriptVariation.find_all_by_variation_feature_id(self.variation_feature_id)
  # the variation is already present in the database
  return stored unless stored[0].nil?

  # the variation is not stored in the database, so run the calculations
  seq_region = core_connection(self.seq_region_id)
  custom_transcript_variation(self, seq_region)
end
95
+
96
+ private
97
+
98
# Make sure a Core database connection exists and return the SeqRegion with
# the given id, memoised in Ensembl::SESSION.
#
# @param [Integer] seq_region_id primary key of the wanted SeqRegion
# @return [Ensembl::Core::SeqRegion] the cached or freshly loaded record
# @raise [NameError] when connecting to the Core database fails
def core_connection(seq_region_id)
  unless Ensembl::Core::DBConnection.connected?
    # connection settings are taken over from the Variation connection
    host, user, password, db_name, port, species, release = Ensembl::Variation::DBConnection.get_info
    begin
      Ensembl::Core::DBConnection.connect(species, release.to_i,
                                          :username => user, :password => password,
                                          :host => host, :port => port)
    rescue
      raise NameError, "Can't derive Core database name from #{db_name}. Are you using non conventional names?"
    end
  end

  # Reuse the SeqRegion if it already exists in Ensembl::SESSION
  if Ensembl::SESSION.seq_regions.has_key?(seq_region_id)
    return Ensembl::SESSION.seq_regions[seq_region_id]
  end

  loaded = Ensembl::Core::SeqRegion.find(seq_region_id)
  Ensembl::SESSION.seq_regions[loaded.id] = loaded
  loaded
end
117
+
118
# Calculate a consequence type for a user-defined variation.
#
# Every transcript of every gene found in the region around the variation
# gets its own TranscriptVariation. The checks run in a fixed order and each
# one is applied only while the consequence is still empty.
#
# @param vf the variation feature to classify
# @param sr the SeqRegion the variation lies on
# @return [Array<TranscriptVariation>] one entry per transcript, or a single
#   "INTERGENIC" entry when no transcript was found
def custom_transcript_variation(vf, sr)
  @variation_name = vf.variation_name
  @seq_region = sr

  flank_upstream = 5000
  flank_downstream = 5000

  # retrieve the slice of the genomic region where the variation is located
  region = Ensembl::Core::Slice.fetch_by_region(
    Ensembl::Core::CoordSystem.find(sr.coord_system_id).name,
    sr.name,
    vf.seq_region_start - flank_upstream,
    vf.seq_region_end + flank_downstream - 1
  )
  genes = region.genes(true)

  calculated = [] # all the calculated TranscriptVariations
  unless genes[0].nil?
    genes.each do |gene|
      gene.transcripts.each do |transcript|
        @cache = {}
        # one TranscriptVariation per transcript present in the region
        tv = TranscriptVariation.new

        # intergenic for this transcript (no effects)?
        tv.consequence_type = check_intergenic(vf, transcript)

        # upstream or downstream of the transcript?
        if tv.consequence_type == ""
          tv.consequence_type = check_upstream_downstream(vf, transcript)
        end

        # from here on the variation is inside the transcript:
        # non coding gene?
        if tv.consequence_type == "" && transcript.biotype != 'protein_coding'
          tv.consequence_type = check_non_coding(vf, transcript)
        end

        # intron / exon boundaries
        if tv.consequence_type == ""
          tv.consequence_type = check_splice_site(vf, transcript)
        end

        # UTRs
        if tv.consequence_type == ""
          tv.consequence_type = check_utr(vf, transcript)
        end

        # still nothing: the variation is inside an exon, check the codon change
        if tv.consequence_type == ""
          tv.consequence_type, tv.peptide_allele_string = check_aa_change(vf, transcript)
        end

        begin # this changed from release 58
          tv.transcript_stable_id = transcript.stable_id
        rescue NoMethodError
          tv.transcript_id = transcript.id
        end

        tv.consequence_type = "INTERGENIC" if tv.consequence_type == ""
        calculated << tv
      end
    end
  end

  # no transcripts/genes within 5000 bases upstream and downstream:
  # the variation is INTERGENIC (no effects)
  if calculated.empty?
    tv = TranscriptVariation.new
    tv.consequence_type = "INTERGENIC"
    calculated << tv
  end

  calculated
end
179
+
180
+ ## CONSEQUENCE CALCULATION FUNCTIONS ##
181
+
182
# Tell whether the variation lies more than 5000 bases away from the
# transcript, on either side.
#
# @param vf object responding to seq_region_start / seq_region_end
# @param t transcript responding to seq_region_start / seq_region_end
# @return [String, nil] "INTERGENIC", or nil when the variation is closer
def check_intergenic(vf, t)
  # positive only when the variation ends before the transcript starts
  gap_before = t.seq_region_start - vf.seq_region_end
  # positive only when the variation starts after the transcript ends
  gap_after = vf.seq_region_start - t.seq_region_end

  (gap_before > 5000 || gap_after > 5000) ? "INTERGENIC" : nil
end
190
+
191
# Classify a variation that lies within 5000 bases outside the transcript,
# or an InDel that spans the transcript start or end.
#
# @param vf object responding to seq_region_start / seq_region_end
# @param t transcript responding to seq_region_start / seq_region_end / strand
# @return [String, nil] "UPSTREAM", "DOWNSTREAM", "COMPLEX_INDEL" or nil
def check_upstream_downstream(vf, t)
  v_start, v_end = vf.seq_region_start, vf.seq_region_end
  t_start, t_end = t.seq_region_start, t.seq_region_end
  forward = (t.strand == 1)

  if v_end < t_start && (t_start - v_end) <= 5000
    forward ? "UPSTREAM" : "DOWNSTREAM"
  elsif v_start > t_end && (v_start - t_end) <= 5000
    forward ? "DOWNSTREAM" : "UPSTREAM"
  elsif (t_start > v_start && t_start < v_end) || (t_end > v_start && t_end < v_end)
    # an InDel overlapping the transcript start / end
    "COMPLEX_INDEL"
  end
end
205
+
206
# Consequence of a variation inside a non protein-coding transcript.
#
# A variation spanning more than one position is always reported as
# "COMPLEX_INDEL"; otherwise the term depends on the transcript biotype.
#
# @param vf object responding to seq_region_start / seq_region_end
# @param t transcript responding to biotype
# @return [String] the consequence term
def check_non_coding(vf, t)
  return "COMPLEX_INDEL" unless vf.seq_region_start == vf.seq_region_end

  case t.biotype
  when "miRNA"
    "WITHIN_MATURE_miRNA"
  when "nonsense_mediated_decay"
    "NMD_TRANSCRIPT"
  else
    "WITHIN_NON_CODING_GENE"
  end
end
216
+
217
# Tell whether the variation lies in one of the untranslated regions, i.e.
# strictly between a transcript boundary and the matching boundary of the
# coding region.
#
# @param vf object responding to seq_region_start / seq_region_end
# @param t transcript responding to seq_region_start, seq_region_end,
#   coding_region_genomic_start, coding_region_genomic_end and strand
# @return [String, nil] "5PRIME_UTR", "3PRIME_UTR" or nil
def check_utr(vf, t)
  before_coding = vf.seq_region_start > t.seq_region_start &&
                  vf.seq_region_end < t.coding_region_genomic_start
  if before_coding
    return (t.strand == 1) ? "5PRIME_UTR" : "3PRIME_UTR"
  end

  after_coding = vf.seq_region_start > t.coding_region_genomic_end &&
                 vf.seq_region_end < t.seq_region_end
  if after_coding
    return (t.strand == 1) ? "3PRIME_UTR" : "5PRIME_UTR"
  end

  nil
end
225
+
226
# Classify a variation against the intron/exon structure of a transcript.
#
# Side effect: @cache[:exons] is rebuilt with one Range per exon of +t+
# (the caller is expected to have initialised @cache as a Hash).
#
# @param vf object responding to seq_region_strand, seq_region_start and
#   seq_region_end
# @param t transcript responding to exons (each with seq_region_start /
#   seq_region_end)
# @return [String, nil] "ESSENTIAL_SPLICE_SITE" (intronic, within 2 bases of
#   an exon), "SPLICE_SITE" (intronic within 8 bases of an exon, or exonic
#   within 3 bases of the exon boundaries), "INTRONIC", "COMPLEX_INDEL" (only
#   one end of the variation falls in an exon), or nil when the variation
#   sits inside an exon away from its boundaries
def check_splice_site(vf, t)
  @cache[:exons] = []
  if vf.seq_region_strand == 1
    var_start, var_end = vf.seq_region_start, vf.seq_region_end
  else
    var_start, var_end = vf.seq_region_end, vf.seq_region_start
  end
  t.exons.each { |ex| @cache[:exons] << Range.new(ex.seq_region_start, ex.seq_region_end) }
  exons = @cache[:exons]

  exon_up = check_near_exons(var_start, exons)
  exon_down = check_near_exons(var_end, exons)

  if !exon_up && !exon_down # we are inside an intron
    # checking boundaries
    near_exon_up_2bp = check_near_exons((var_start - 2)..var_start, exons)
    near_exon_down_2bp = check_near_exons(var_end..(var_end + 2), exons)
    return "ESSENTIAL_SPLICE_SITE" if near_exon_up_2bp || near_exon_down_2bp

    # The upstream window must reach 8 bases *before* the variation; the
    # previous (var_start + 8)..var_start span was inverted and could never
    # overlap an exon for an intronic position.
    near_exon_up_8bp = check_near_exons((var_start - 8)..var_start, exons)
    near_exon_down_8bp = check_near_exons(var_end..(var_end + 8), exons)
    return "SPLICE_SITE" if near_exon_up_8bp || near_exon_down_8bp

    return "INTRONIC"
  elsif exon_up && exon_down # the variation is inside an exon
    # check if it is a splice site
    if (var_start - exon_up.first) <= 3 || (exon_down.last - var_end) <= 3
      return "SPLICE_SITE"
    end
  else # a complex indel spanning intron/exon boundary
    return "COMPLEX_INDEL"
  end

  nil
end
258
+
259
# Work out the coding consequence of a variation located inside the CDS.
#
# InDels (start != end, or a "-" allele) are reported as frameshifts. For a
# single-base change the second allele of allele_string is placed into the
# CDS (reverse-complemented for transcripts on strand -1) and the reference
# and mutated codons are translated with codon table 1.
#
# @param vf object responding to allele_string, seq_region_start and
#   seq_region_end
# @param t transcript responding to genomic2cds, cds_seq and seq_region_strand
# @return [Array] consequence term and peptide allele string ("ref/mut");
#   the peptide string is nil for frameshifts
# @raise [RuntimeError] when the mutated codon is not in the codon table
def check_aa_change(vf, t)
  alleles = vf.allele_string.split('/') # the different alleles for this variation

  # if the variation is an InDel then it produces a frameshift
  if vf.seq_region_start != vf.seq_region_end || alleles.include?("-")
    return "FRAMESHIFT_CODING", nil
  end

  # Find the position inside the CDS
  mutation_position = t.genomic2cds(vf.seq_region_start)

  mutation_base = Bio::Sequence::NA.new(alleles[1])
  mutation_base.reverse_complement! if t.seq_region_strand == -1

  # The rank of the codon
  target_codon = mutation_position / 3 + 1
  codon_span = (target_codon * 3 - 3)..(target_codon * 3 - 1)

  cds_sequence = t.cds_seq
  mut_sequence = cds_sequence.dup
  # Replace base with the variant allele
  mut_sequence[mutation_position] = mutation_base.seq

  refcodon = cds_sequence[codon_span]
  mutcodon = mut_sequence[codon_span]
  codontable = Bio::CodonTable[1]
  refaa = codontable[refcodon]
  mutaa = codontable[mutcodon.downcase]
  if mutaa.nil?
    # the comma matters: without it Ruby looks for a method named RuntimeError
    raise RuntimeError, "Codon #{mutcodon.downcase} wasn't recognized."
  end

  pep_string = refaa + "/" + mutaa
  if mutaa == "*" && refaa != "*"
    return "STOP_GAINED", pep_string
  elsif mutaa != "*" && refaa == "*"
    return "STOP_LOST", pep_string
  elsif mutaa != refaa
    return "NON_SYNONYMOUS_CODING", pep_string
  else
    return "SYNONYMOUS_CODING", pep_string
  end
end
301
+
302
+
303
# Find the first exon range that contains a position or overlaps a span.
#
# @param [Integer, Range] feature a single position, or a span of positions
# @param [Array<Range>] exons_ranges the exon spans to search, in order
# @return [Range, false] the first matching exon range, false when none does
def check_near_exons(feature, exons_ranges)
  match = exons_ranges.find do |exon|
    if feature.is_a?(Range)
      # two spans overlap when each one starts before the other one ends
      feature.first <= exon.last && exon.first <= feature.last
    else
      exon.include?(feature)
    end
  end
  match || false
end
313
+
314
+
315
+ end # VariationFeature
316
+
317
# The TranscriptVariation class gives information about the position of
# a VariationFeature, mapped on an annotated transcript.
#
# This class uses ActiveRecord to access data in the Ensembl database.
# See the general documentation of the Ensembl module for
# more information on what this means and what methods are available.
#
# @example
#   vf = Variation.find_by_name('rs10111').variation_feature
#   vf.transcript_variations.each do |tv|
#     puts tv.peptide_allele_string, tv.transcript.stable_id
#   end
#
class TranscriptVariation < DBConnection
  set_primary_key "transcript_variation_id"
  belongs_to :variation_feature

  # Consequence terms accepted for +consequence_type+. 'INTERGENIC' is part
  # of the list because the on-the-fly consequence calculation in this file
  # assigns it to TranscriptVariation objects.
  validates_inclusion_of :consequence_type,
                         :in => ['ESSENTIAL_SPLICE_SITE',
                                 'STOP_GAINED',
                                 'STOP_LOST',
                                 'COMPLEX_INDEL',
                                 'FRAMESHIFT_CODING',
                                 'NON_SYNONYMOUS_CODING',
                                 'SPLICE_SITE',
                                 'PARTIAL_CODON',
                                 'SYNONYMOUS_CODING',
                                 'REGULATORY_REGION',
                                 'WITHIN_MATURE_miRNA',
                                 '5PRIME_UTR',
                                 '3PRIME_UTR',
                                 'INTRONIC',
                                 'NMD_TRANSCRIPT',
                                 'UPSTREAM',
                                 'DOWNSTREAM',
                                 'WITHIN_NON_CODING_GENE',
                                 'INTERGENIC',
                                 'HGMD_MUTATION'],
                         :message => "Consequence type not allowed!"

  # Returns the +consequence_type+ column as a String.
  #
  # Workaround: ActiveRecord does not parse MySQL SET fields, so the value
  # is read before type casting (nil becomes the empty string).
  #
  # @return [String] the uncast column value
  def consequence_type
    "#{attributes_before_type_cast['consequence_type']}"
  end

  # Fetch the Core transcript this record refers to, connecting to the Core
  # database first when no connection is open yet.
  #
  # @return [Ensembl::Core::Transcript] the transcript looked up by stable id,
  #   or by internal id when the stable-id path raises NoMethodError
  def transcript
    unless Ensembl::Core::DBConnection.connected?
      # connection settings are taken over from the Variation connection
      host, user, password, db_name, port, species, release = Ensembl::Variation::DBConnection.get_info
      Ensembl::Core::DBConnection.connect(species, release.to_i,
                                          :username => user, :password => password,
                                          :host => host, :port => port)
    end

    begin # this changed from release 58
      return Ensembl::Core::Transcript.find_by_stable_id(self.transcript_stable_id)
    rescue NoMethodError
      return Ensembl::Core::Transcript.find(self.transcript_id)
    end
  end
end
373
+
374
+ end
375
+
376
+ end
@@ -0,0 +1,444 @@
1
+ #
2
+ # = ensembl/variation/variation_feature62.rb - Extension of ActiveRecord classes for Ensembl variation features
3
+ #
4
+ # Copyright:: Copyright (C) 2008 Francesco Strozzi <francesco.strozzi@gmail.com>
5
+ # License:: The Ruby License
6
+ #
7
+ # @author Francesco Strozzi
8
+
9
+
10
+ module Ensembl
11
+
12
+ module Variation
13
+
14
+
15
+ # The VariationFeature class gives information about the genomic position of
16
+ # each Variation, including also validation status and consequence type.
17
+ #
18
+ # This class uses ActiveRecord to access data in the Ensembl database.
19
+ # See the general documentation of the Ensembl module for
20
+ # more information on what this means and what methods are available.
21
+ #
22
+ # @example
23
+ # # SLOWER QUERY
24
+ # vf = VariationFeature.find_by_variation_name('rs10111')
25
+ # # FASTER QUERY
26
+ # vf = Variation.find_by_name('rs10111').variation_feature
27
+ #
28
+ # puts vf.seq_region_start, vf.seq_region_end, vf.allele_string
29
+ # puts vf.variation.ancestral_allele
30
+ # genomic_region = vf.fetch_region (returns an Ensembl::Core::Slice)
31
+ # genomic_region.genes
32
+ # up_region,down_region = vf.flanking_seq (returns two Ensembl::Core::Slice)
33
+ #
34
+ class VariationFeature < DBConnection
35
# Table wiring: primary key and associations of a variation feature.
set_primary_key "variation_feature_id"
belongs_to :variation
has_many :tagged_variation_features
has_many :samples, :through => :tagged_variation_features
belongs_to :seq_region

# Consequence terms accepted for the +consequence_types+ column.
validates_inclusion_of :consequence_types,
                       :in => %w[
                         intergenic_variant
                         splice_acceptor_variant
                         splice_donor_variant
                         complex_change_in_transcript
                         stop_lost
                         coding_sequence_variant
                         non_synonymous_codon
                         stop_gained
                         synonymous_codon
                         frameshift_variant
                         nc_transcript_variant
                         mature_miRNA_variant
                         NMD_transcript_variant
                         5_prime_UTR_variant
                         3_prime_UTR_variant
                         incomplete_terminal_codon_variant
                         intron_variant
                         splice_region_variant
                         5KB_downstream_variant
                         500B_downstream_variant
                         5KB_upstream_variant
                         2KB_upstream_variant
                         initiator_codon_change
                         stop_retained_variant
                         inframe_codon_gain
                         inframe_codon_loss
                         miRNA_target_site_variant
                         pre_miRNA_variant
                         regulatory_region_variant
                         increased_binding_affinity
                         decreased_binding_affinity
                         binding_site_variant
                       ],
                       :message => "Consequence type not allowed!"
73
+
74
# Returns the +consequence_types+ column as a String.
#
# Workaround: ActiveRecord does not parse MySQL SET fields, so the value is
# read before type casting and interpolated into a fresh String (a nil value
# therefore becomes the empty string).
#
# @return [String] the uncast column value
def consequence_types
  raw_value = attributes_before_type_cast['consequence_types']
  "#{raw_value}"
end
77
+
78
# Based on the Perl API 'get_all_Genes' method of the Variation class.
# Builds a genomic slice around this variation by extending its coordinates
# on both sides.
#
# @param [Integer] up Length of upstream flanking region
# @param [Integer] down Length of downstream flanking region
# @return [Slice] Slice object containing the variation
def fetch_region(up = 5000, down = 5000)
  seq_region = core_connection(self.seq_region_id)
  coord_system_name = Ensembl::Core::CoordSystem.find(seq_region.coord_system_id).name
  Ensembl::Core::Slice.fetch_by_region(
    coord_system_name,
    seq_region.name,
    self.seq_region_start - up,
    self.seq_region_end + down
  )
end
90
+
91
# Fetch the two regions flanking this variation, using the coordinates
# stored in the flanking_sequence record of the parent Variation and the
# strand of this feature.
#
# @return [Array<Slice>] the upstream slice followed by the downstream slice
def flanking_seq
  seq_region = core_connection(self.seq_region_id)
  flank = Variation.find(self.variation_id).flanking_sequence

  upstream_slice = Ensembl::Core::Slice.fetch_by_region(
    Ensembl::Core::CoordSystem.find(seq_region.coord_system_id).name,
    seq_region.name,
    flank.up_seq_region_start,
    flank.up_seq_region_end,
    self.seq_region_strand
  )
  downstream_slice = Ensembl::Core::Slice.fetch_by_region(
    Ensembl::Core::CoordSystem.find(seq_region.coord_system_id).name,
    seq_region.name,
    flank.down_seq_region_start,
    flank.down_seq_region_end,
    self.seq_region_strand
  )

  [upstream_slice, downstream_slice]
end
98
+
99
# Transcript-level consequences of this variation.
#
# Records already stored for this feature are returned as they are; when
# none exist the consequences are calculated on the fly.
#
# @return [Array<TranscriptVariation>] stored or freshly calculated records
def transcript_variations
  stored = TranscriptVariation.find_all_by_variation_feature_id(self.variation_feature_id)
  # the variation is already present in the database
  return stored unless stored[0].nil?

  # the variation is not stored in the database, so run the calculations
  seq_region = core_connection(self.seq_region_id)
  custom_transcript_variation(self, seq_region)
end
108
+
109
+ private
110
+
111
# Make sure a Core database connection exists and return the SeqRegion with
# the given id, memoised in Ensembl::SESSION.
#
# @param [Integer] seq_region_id primary key of the wanted SeqRegion
# @return [Ensembl::Core::SeqRegion] the cached or freshly loaded record
# @raise [NameError] when connecting to the Core database fails
def core_connection(seq_region_id)
  unless Ensembl::Core::DBConnection.connected?
    # connection settings are taken over from the Variation connection
    host, user, password, db_name, port, species, release = Ensembl::Variation::DBConnection.get_info
    begin
      Ensembl::Core::DBConnection.connect(species, release.to_i,
                                          :username => user, :password => password,
                                          :host => host, :port => port)
    rescue
      raise NameError, "Can't derive Core database name from #{db_name}. Are you using non conventional names?"
    end
  end

  # Reuse the SeqRegion if it already exists in Ensembl::SESSION
  if Ensembl::SESSION.seq_regions.has_key?(seq_region_id)
    return Ensembl::SESSION.seq_regions[seq_region_id]
  end

  loaded = Ensembl::Core::SeqRegion.find(seq_region_id)
  Ensembl::SESSION.seq_regions[loaded.id] = loaded
  loaded
end
130
+
131
# Calculate a consequence type for a user-defined variation.
#
# Every transcript of every gene found in the region around the variation
# gets its own TranscriptVariation. The checks run in a fixed order and each
# one is applied only while the consequence is still empty. No default
# consequence is assigned to a transcript for which every check came back
# empty.
#
# @param vf the variation feature to classify
# @param sr the SeqRegion the variation lies on
# @return [Array<TranscriptVariation>] one entry per transcript, or a single
#   "intergenic_variant" entry when no transcript was found
def custom_transcript_variation(vf, sr)
  @variation_name = vf.variation_name
  @seq_region = sr

  flank_upstream = 5000
  flank_downstream = 5000

  # order the coordinates so that the region always runs low -> high
  var_start, var_end = vf.seq_region_start, vf.seq_region_end
  var_start, var_end = var_end, var_start if vf.seq_region_start > vf.seq_region_end

  # retrieve the slice of the genomic region where the variation is located
  region = Ensembl::Core::Slice.fetch_by_region(
    Ensembl::Core::CoordSystem.find(sr.coord_system_id).name,
    sr.name,
    var_start - flank_upstream,
    var_end + flank_downstream
  )
  genes = region.genes(true)

  calculated = [] # all the calculated TranscriptVariations
  unless genes[0].nil?
    genes.each do |gene|
      gene.transcripts.each do |transcript|
        @cache = {}
        # one TranscriptVariation per transcript present in the region
        tv = TranscriptVariation.new

        # intergenic for this transcript (no effects)?
        tv.consequence_types = check_intergenic(vf, transcript)

        # upstream or downstream of the transcript?
        if tv.consequence_types == ""
          tv.consequence_types = check_upstream_downstream(vf, transcript)
        end

        # partial codon
        if tv.consequence_types == ""
          tv.consequence_types = check_partial_codon(vf, transcript)
        end

        # from here on the variation is inside the transcript:
        # non coding gene?
        if tv.consequence_types == "" && transcript.biotype != 'protein_coding'
          tv.consequence_types = check_non_coding(vf, transcript)
        end

        # intron / exon boundaries
        if tv.consequence_types == ""
          tv.consequence_types = check_splice_site(vf, transcript)
        end

        # UTRs
        if tv.consequence_types == ""
          tv.consequence_types = check_utr(vf, transcript)
        end

        # still nothing: the variation is inside an exon, check the codon change
        if tv.consequence_types == ""
          tv.consequence_types, tv.pep_allele_string = check_aa_change(vf, transcript)
        end

        tv.feature_stable_id = transcript.stable_id
        calculated << tv
      end
    end
  end

  # no transcripts/genes within 5000 bases upstream and downstream:
  # the variation is INTERGENIC (no effects)
  if calculated.empty?
    tv = TranscriptVariation.new
    tv.consequence_types = "intergenic_variant"
    calculated << tv
  end

  calculated
end
198
+
199
+ ## CONSEQUENCE CALCULATION METHODS ##
200
+
201
# Tell whether the variation lies more than 5000 bases away from the
# transcript, on either side.
#
# @param vf object responding to seq_region_start / seq_region_end
# @param t transcript responding to seq_region_start / seq_region_end
# @return [String, nil] "intergenic_variant", or nil when the variation is closer
def check_intergenic(vf, t)
  # positive only when the variation ends before the transcript starts
  gap_before = t.seq_region_start - vf.seq_region_end
  # positive only when the variation starts after the transcript ends
  gap_after = vf.seq_region_start - t.seq_region_end

  (gap_before > 5000 || gap_after > 5000) ? "intergenic_variant" : nil
end
209
+
210
# Classify a variation lying outside the transcript, or an InDel that spans
# the transcript start or end.
#
# The distance is counted as (gap between the two features) + 1. On the 5'
# side of the transcript a distance up to 2000 gives "2KB_upstream_variant",
# on the 3' side a distance up to 500 gives "500B_downstream_variant";
# anything else outside the transcript gets the matching 5KB term.
#
# @param vf object responding to seq_region_start / seq_region_end
# @param t transcript responding to seq_region_start / seq_region_end / strand
# @return [String, nil] the consequence term, or nil when the variation is
#   inside the transcript without spanning one of its ends
def check_upstream_downstream(vf, t)
  v_start, v_end = vf.seq_region_start, vf.seq_region_end
  t_start, t_end = t.seq_region_start, t.seq_region_end
  forward = (t.strand == 1)
  reverse = (t.strand == -1)

  if v_end < t_start
    distance = t_start - v_end + 1
    return "2KB_upstream_variant" if forward && distance <= 2000
    return "500B_downstream_variant" if reverse && distance <= 500
    return forward ? "5KB_upstream_variant" : "5KB_downstream_variant"
  end

  if v_start > t_end
    distance = v_start - t_end + 1
    return "2KB_upstream_variant" if reverse && distance <= 2000
    return "500B_downstream_variant" if forward && distance <= 500
    return forward ? "5KB_downstream_variant" : "5KB_upstream_variant"
  end

  # check if it's an InDel and if overlaps the transcript start / end
  spans_start = t_start > v_start && t_start < v_end
  spans_end = t_end > v_start && t_end < v_end
  (spans_start || spans_end) ? "complex_change_in_transcript" : nil
end
237
+
238
# Consequence term of a variation inside a non protein-coding transcript,
# chosen from the transcript biotype alone.
#
# @param vf the variation feature (not inspected here)
# @param t transcript responding to biotype
# @return [String] the consequence term
def check_non_coding(vf, t)
  case t.biotype
  when "miRNA"
    "mature_miRNA_variant"
  when "nonsense_mediated_decay"
    "NMD_transcript_variant"
  else
    "nc_transcript_variant"
  end
end
247
+
248
# Tell whether the variation lies in one of the untranslated regions, i.e.
# strictly between a transcript boundary and the matching boundary of the
# coding region.
#
# @param vf object responding to seq_region_start / seq_region_end
# @param t transcript responding to seq_region_start, seq_region_end,
#   coding_region_genomic_start, coding_region_genomic_end and strand
# @return [String, nil] "5_prime_UTR_variant", "3_prime_UTR_variant" or nil
def check_utr(vf, t)
  before_coding = vf.seq_region_start > t.seq_region_start &&
                  vf.seq_region_end < t.coding_region_genomic_start
  if before_coding
    return (t.strand == 1) ? "5_prime_UTR_variant" : "3_prime_UTR_variant"
  end

  after_coding = vf.seq_region_start > t.coding_region_genomic_end &&
                 vf.seq_region_end < t.seq_region_end
  if after_coding
    return (t.strand == 1) ? "3_prime_UTR_variant" : "5_prime_UTR_variant"
  end

  nil
end
256
+
257
# Classify a variation against the intron/exon structure of a transcript.
#
# Side effect: @cache[:exons] is rebuilt with one Range per exon of +t+
# (the caller is expected to have initialised @cache as a Hash).
#
# @param vf object responding to seq_region_strand, seq_region_start and
#   seq_region_end
# @param t transcript responding to strand and exons (each with
#   seq_region_start / seq_region_end)
# @return [String, nil] "splice_donor_variant" / "splice_acceptor_variant"
#   (intronic, within 2 bases of an exon; which one depends on the side and
#   the transcript strand), "splice_region_variant" (intronic within 8 bases
#   of an exon, or exonic within 3 bases of the exon boundaries),
#   "intron_variant", "complex_change_in_transcript" (only one end of the
#   variation falls in an exon), or nil when the variation sits inside an
#   exon away from its boundaries
def check_splice_site(vf, t)
  @cache[:exons] = []
  if vf.seq_region_strand == 1
    var_start, var_end = vf.seq_region_start, vf.seq_region_end
  else
    var_start, var_end = vf.seq_region_end, vf.seq_region_start
  end
  t.exons.each { |ex| @cache[:exons] << Range.new(ex.seq_region_start, ex.seq_region_end) }
  exons = @cache[:exons]

  exon_up = check_near_exons(var_start, exons)
  exon_down = check_near_exons(var_end, exons)

  if !exon_up && !exon_down # we are inside an intron
    # checking boundaries
    near_exon_up_2bp = check_near_exons((var_start - 2)..var_start, exons)
    near_exon_down_2bp = check_near_exons(var_end..(var_end + 2), exons)
    if near_exon_up_2bp
      return (t.strand == 1) ? "splice_donor_variant" : "splice_acceptor_variant"
    elsif near_exon_down_2bp
      return (t.strand == 1) ? "splice_acceptor_variant" : "splice_donor_variant"
    end

    # The upstream window must reach 8 bases *before* the variation; the
    # previous (var_start + 8)..var_start span was inverted and could never
    # overlap an exon for an intronic position.
    near_exon_up_8bp = check_near_exons((var_start - 8)..var_start, exons)
    near_exon_down_8bp = check_near_exons(var_end..(var_end + 8), exons)
    return "splice_region_variant" if near_exon_up_8bp || near_exon_down_8bp

    return "intron_variant"
  elsif exon_up && exon_down # the variation is inside an exon
    # check if it is a splice site
    if (var_start - exon_up.first) <= 3 || (exon_down.last - var_end) <= 3
      return "splice_region_variant"
    end
  else # a complex indel spanning intron/exon boundary
    return "complex_change_in_transcript"
  end

  nil
end
291
+
292
# Work out the coding consequence of a variation located inside the CDS.
#
# The CDS position and CDS sequence are taken from @cache when an earlier
# check stored them under :mutation_position / :cds_sequence, and computed
# from the transcript otherwise. The reference and mutated codons are
# translated with codon table 1.
#
# @param vf object responding to allele_string, seq_region_start and
#   seq_region_end
# @param t transcript responding to genomic2cds, cds_seq, seq_region_strand,
#   strand, coding_region_genomic_start and coding_region_genomic_end
# @return [Array] consequence term and peptide allele string ("ref/mut");
#   the peptide string is nil for "coding_sequence_variant" and
#   "frameshift_variant"
def check_aa_change(vf, t)
  alleles = vf.allele_string.split('/') # the different alleles for this variation

  # Find the position inside the CDS (reusing the cached values when present)
  mutation_position = @cache[:mutation_position] || t.genomic2cds(vf.seq_region_start)
  cds_sequence = @cache[:cds_sequence] || t.cds_seq

  if vf.allele_string =~ /INSERTION|DELETION|MUTATION/
    return "coding_sequence_variant", nil
  end

  mutation_base = Bio::Sequence::NA.new(alleles[1])
  mutation_base.reverse_complement! if t.seq_region_strand == -1

  # The rank of the codon
  target_codon = mutation_position / 3 + 1
  mut_sequence = cds_sequence.dup

  # Replace base with the variant allele
  if alleles[1] == "-" # a deletion
    # NOTE(review): this strips every occurrence of the reference allele from
    # the whole CDS, not only the one at mutation_position - confirm intent.
    mut_sequence.gsub!(/#{alleles[0]}/, '')
  else # insertion or SNP
    mut_sequence[mutation_position] = mutation_base.seq
  end

  codon_first = target_codon * 3 - 3
  codon_last = target_codon * 3 - 1
  # the codon window is widened by the extra length of multi-base alleles
  mutcodon = mut_sequence[codon_first..(codon_last + alleles[1].length - 1)]
  refcodon = cds_sequence[codon_first..(codon_last + alleles[0].length - 1)]
  codontable = Bio::CodonTable[1]
  refaa = codontable[refcodon]
  mutaa = codontable[mutcodon.downcase]

  pep_string = refaa.to_s + "/" + mutaa.to_s
  transcript_start = (t.strand == 1) ? t.coding_region_genomic_start : t.coding_region_genomic_end
  if (vf.seq_region_start - transcript_start).abs <= 3
    return "initiator_codon_change", pep_string
  elsif (mutcodon.length > refcodon.length) && (mutcodon =~ /^#{refcodon}/ || mutcodon =~ /#{refcodon}$/)
    return "inframe_codon_gain", pep_string
  elsif (mutcodon.length < refcodon.length) && (refcodon =~ /^#{mutcodon}/ || refcodon =~ /#{mutcodon}$/)
    return "inframe_codon_loss", pep_string
  elsif vf.seq_region_start != vf.seq_region_end
    # if the variation is an InDel then it produces a frameshift
    return "frameshift_variant", nil
  elsif (mutaa == "*" && refaa == "*") && (refcodon != mutcodon.downcase)
    # returned as a pair like every other branch, so callers that
    # destructure the result also get the peptide string
    return "stop_retained_variant", pep_string
  elsif mutaa == "*" && refaa != "*"
    return "stop_gained", pep_string
  elsif mutaa != "*" && refaa == "*"
    return "stop_lost", pep_string
  elsif mutaa != refaa
    return "non_synonymous_codon", pep_string
  else
    return "synonymous_codon", pep_string
  end
end
348
+
349
# Reports whether the variation falls into an incomplete codon at the end of the CDS.
#
# As a side effect the CDS position of the variation and the CDS sequence are
# stored in @cache under :mutation_position and :cds_sequence.
#
# @param vf variation feature; must respond to seq_region_start
# @param t transcript; must respond to genomic2cds and cds_seq
# @return [String, nil] "incomplete_terminal_codon_variant" when the variation
#   lies in the trailing partial codon, nil otherwise (including when the
#   position or sequence cannot be obtained)
def check_partial_codon(vf,t)
  mutation_position = t.genomic2cds(vf.seq_region_start)
  cds_sequence = t.cds_seq
  @cache[:mutation_position] = mutation_position
  @cache[:cds_sequence] = cds_sequence

  # A CDS whose length is a multiple of three has no partial codon at all.
  trailing_bases = cds_sequence.length % 3
  return nil if trailing_bases.zero?

  # Only the last `trailing_bases` positions form the incomplete codon; the
  # bases before them belong to a complete codon.
  # Assumes mutation_position is a 0-based CDS index, as check_aa_change uses it
  # -- TODO confirm against Transcript#genomic2cds.
  (mutation_position >= cds_sequence.length - trailing_bases) ? "incomplete_terminal_codon_variant" : nil
rescue StandardError
  # Best effort: any ordinary failure while mapping or fetching means "no
  # verdict". Signals and exit requests are deliberately not swallowed.
  nil
end
363
+
364
# Finds the first exon range that contains a position or overlaps a span.
#
# @param feature a single position, or a Range of positions
# @param exons_ranges list of exon Ranges, searched in order
# @return the first matching exon Range, or false when none matches
def check_near_exons(feature,exons_ranges)
  spans_region = feature.is_a?(Range)

  hit = exons_ranges.find do |exon|
    if spans_region
      # two intervals overlap when each one starts before the other ends
      feature.first <= exon.last && exon.first <= feature.last
    else
      exon.include?(feature)
    end
  end

  hit || false
end
374
+
375
+
376
+ end # VariationFeature
377
+
378
+ # The TranscriptVariation class gives information about the position of
379
+ # a VariationFeature, mapped on an annotated transcript.
380
+ #
381
+ # This class uses ActiveRecord to access data in the Ensembl database.
382
+ # See the general documentation of the Ensembl module for
383
+ # more information on what this means and what methods are available.
384
+ #
385
+ # @example
386
+ # vf = Variation.find_by_name('rs10111').variation_feature
387
+ # vf.transcript_variations.each do |tv|
388
+ # puts tv.peptide_allele_string, tv.transcript.stable_id
389
+ # end
390
+ #
391
class TranscriptVariation < DBConnection
  set_primary_key "transcript_variation_id"
  belongs_to :variation_feature
  validates_inclusion_of :consequence_types,
                         :in => %w[
                           intergenic_variant
                           splice_acceptor_variant
                           splice_donor_variant
                           complex_change_in_transcript
                           stop_lost
                           coding_sequence_variant
                           non_synonymous_codon
                           stop_gained
                           synonymous_codon
                           frameshift_variant
                           nc_transcript_variant
                           mature_miRNA_variant
                           NMD_transcript_variant
                           5_prime_UTR_variant
                           3_prime_UTR_variant
                           incomplete_terminal_codon_variant
                           intron_variant
                           splice_region_variant
                           5KB_downstream_variant
                           500B_downstream_variant
                           5KB_upstream_variant
                           2KB_upstream_variant
                           initiator_codon_change
                           stop_retained_variant
                           inframe_codon_gain
                           inframe_codon_loss
                           miRNA_target_site_variant
                           pre_miRNA_variant
                           regulatory_region_variant
                           increased_binding_affinity
                           decreased_binding_affinity
                           binding_site_variant
                         ],
                         :message => "Consequence type not allowed!"

  # Returns the consequence_types column as a String built from the value
  # before type casting. Workaround: ActiveRecord does not parse the MySQL
  # SET field.
  def consequence_types
    raw_value = attributes_before_type_cast['consequence_types']
    "#{raw_value}"
  end

  # Looks up the core Transcript whose stable id equals this record's
  # feature_stable_id. When the core database is not connected yet, it is
  # connected first using the settings reported by the variation connection.
  def transcript
    host, user, password, _db_name, port, species, release = Ensembl::Variation::DBConnection.get_info

    unless Ensembl::Core::DBConnection.connected?
      Ensembl::Core::DBConnection.connect(species, release.to_i,
                                          :username => user, :password => password,
                                          :host => host, :port => port)
    end

    Ensembl::Core::Transcript.find_by_stable_id(feature_stable_id)
  end

end
441
+
442
+ end
443
+
444
+ end