ruby-ensembl-api 0.9.6 → 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. data/TUTORIAL.rdoc +1 -1
  2. data/bin/variation_effect_predictor +106 -0
  3. data/lib/ensembl.rb +2 -2
  4. data/lib/ensembl/core/activerecord.rb +119 -225
  5. data/lib/ensembl/core/collection.rb +14 -10
  6. data/lib/ensembl/core/project.rb +6 -8
  7. data/lib/ensembl/core/slice.rb +87 -123
  8. data/lib/ensembl/core/transcript.rb +49 -65
  9. data/lib/ensembl/core/transform.rb +6 -8
  10. data/lib/ensembl/db_connection.rb +56 -72
  11. data/lib/ensembl/variation/activerecord.rb +138 -8
  12. data/lib/ensembl/variation/variation.rb +284 -46
  13. data/samples/ensembl_genomes_example.rb +60 -0
  14. data/samples/examples_perl_tutorial.rb +125 -0
  15. data/samples/small_example_ruby_api.rb +34 -0
  16. data/samples/variation_example.rb +67 -0
  17. data/test/unit/{release_56 → release_60}/core/test_gene.rb +6 -6
  18. data/test/unit/release_60/core/test_project_human.rb +38 -0
  19. data/test/unit/{release_56 → release_60}/core/test_slice.rb +1 -8
  20. data/test/unit/release_60/core/test_transcript.rb +126 -0
  21. data/test/unit/{release_53 → release_60}/core/test_transform.rb +21 -21
  22. data/test/unit/release_60/variation/test_activerecord.rb +213 -0
  23. data/test/unit/release_60/variation/test_consequence.rb +158 -0
  24. data/test/unit/{release_56 → release_60}/variation/test_variation.rb +18 -17
  25. data/test/unit/test_connection.rb +2 -2
  26. data/test/unit/test_releases.rb +8 -8
  27. metadata +27 -43
  28. data/test/unit/data/seq_c6qbl.fa +0 -10
  29. data/test/unit/data/seq_cso19_coding.fa +0 -16
  30. data/test/unit/data/seq_cso19_transcript.fa +0 -28
  31. data/test/unit/data/seq_drd3_gene.fa +0 -838
  32. data/test/unit/data/seq_drd3_transcript.fa +0 -22
  33. data/test/unit/data/seq_drd4_transcript.fa +0 -24
  34. data/test/unit/data/seq_forward_composite.fa +0 -1669
  35. data/test/unit/data/seq_par_boundary.fa +0 -169
  36. data/test/unit/data/seq_rnd3_transcript.fa +0 -47
  37. data/test/unit/data/seq_ub2r1_coding.fa +0 -13
  38. data/test/unit/data/seq_ub2r1_gene.fa +0 -174
  39. data/test/unit/data/seq_ub2r1_transcript.fa +0 -26
  40. data/test/unit/data/seq_y.fa +0 -2
  41. data/test/unit/ensembl_genomes/test_collection.rb +0 -51
  42. data/test/unit/ensembl_genomes/test_gene.rb +0 -52
  43. data/test/unit/ensembl_genomes/test_slice.rb +0 -71
  44. data/test/unit/ensembl_genomes/test_variation.rb +0 -17
  45. data/test/unit/release_50/core/test_project.rb +0 -215
  46. data/test/unit/release_50/core/test_project_human.rb +0 -58
  47. data/test/unit/release_50/core/test_relationships.rb +0 -66
  48. data/test/unit/release_50/core/test_sequence.rb +0 -175
  49. data/test/unit/release_50/core/test_slice.rb +0 -121
  50. data/test/unit/release_50/core/test_transcript.rb +0 -108
  51. data/test/unit/release_50/core/test_transform.rb +0 -223
  52. data/test/unit/release_50/variation/test_activerecord.rb +0 -143
  53. data/test/unit/release_50/variation/test_variation.rb +0 -84
  54. data/test/unit/release_53/core/test_gene.rb +0 -66
  55. data/test/unit/release_53/core/test_project.rb +0 -96
  56. data/test/unit/release_53/core/test_project_human.rb +0 -65
  57. data/test/unit/release_53/core/test_slice.rb +0 -47
  58. data/test/unit/release_53/variation/test_activerecord.rb +0 -145
  59. data/test/unit/release_53/variation/test_variation.rb +0 -71
  60. data/test/unit/release_56/core/test_project.rb +0 -96
  61. data/test/unit/release_56/core/test_transform.rb +0 -63
  62. data/test/unit/release_56/variation/test_activerecord.rb +0 -142
@@ -4,14 +4,13 @@
4
4
  # Copyright:: Copyright (C) 2008 Francesco Strozzi <francesco.strozzi@gmail.com>
5
5
  # License:: The Ruby License
6
6
  #
7
+ # @author Francesco Strozzi
7
8
 
8
9
  nil
9
10
  module Ensembl
10
- # = DESCRIPTION
11
11
  # The Ensembl::Variation module covers the variation databases from
12
12
  # ensembldb.ensembl.org.
13
13
  module Variation
14
- # = DESCRIPTION
15
14
  # The Allele class describes a single allele of a variation. In addition to
16
15
  # the nucleotide(s) (or absence of) representing the allele, frequency
17
16
  # and population information may be present.
@@ -28,6 +27,7 @@ module Ensembl
28
27
  belongs_to :sample
29
28
  belongs_to :variation
30
29
  belongs_to :population
30
+ belongs_to :subsnp_handle
31
31
  end
32
32
 
33
33
  # = DESCRIPTION
@@ -62,6 +62,71 @@ module Ensembl
62
62
  belongs_to :allele_group
63
63
  end
64
64
 
65
+ # = DESCRIPTION
66
+ # Store information on attributes types
67
+ #
68
+ # This class uses ActiveRecord to access data in the Ensembl database.
69
+ # See the general documentation of the Ensembl module for
70
+ # more information on what this means and what methods are available.
71
+ class AttribType < DBConnection
72
+ set_primary_key "attrib_type_id"
73
+ end
74
+
75
+
76
+ # = DESCRIPTION
77
+ #
78
+ # This class uses ActiveRecord to access data in the Ensembl database.
79
+ # See the general documentation of the Ensembl module for
80
+ # more information on what this means and what methods are available.
81
+ class ConsequenceMapping < DBConnection
82
+
83
+ end
84
+
85
+ # = DESCRIPTION
86
+ #
87
+ # This class uses ActiveRecord to access data in the Ensembl database.
88
+ # See the general documentation of the Ensembl module for
89
+ # more information on what this means and what methods are available.
90
+ class FailedDescription < DBConnection
91
+ set_primary_key "failed_description_id"
92
+ has_many :failed_variations
93
+ end
94
+
95
+ # = DESCRIPTION
96
+ #
97
+ # This class uses ActiveRecord to access data in the Ensembl database.
98
+ # See the general documentation of the Ensembl module for
99
+ # more information on what this means and what methods are available.
100
+ class FailedVariation < DBConnection
101
+ set_primary_key "failed_variation_id"
102
+ belongs_to :failed_description
103
+ belongs_to :variation
104
+ end
105
+
106
+ # = DESCRIPTION
107
+ #
108
+ # This class uses ActiveRecord to access data in the Ensembl database.
109
+ # See the general documentation of the Ensembl module for
110
+ # more information on what this means and what methods are available.
111
+ class FeatureType < DBConnection
112
+ set_primary_key "feature_type_id"
113
+ end
114
+
115
+ class Meta < DBConnection
116
+ set_primary_key "meta_id"
117
+ end
118
+
119
+ class MetaCoord < DBConnection
120
+
121
+ end
122
+
123
+ class Phenotype < DBConnection
124
+ set_primary_key "phenotype_id"
125
+ has_many :variation_annotations
126
+ end
127
+
128
+
129
+
65
130
  # = DESCRIPTION
66
131
  # The Sample class gives information about the biological samples stored in the database.
67
132
  #
@@ -79,7 +144,6 @@ module Ensembl
79
144
  has_many :tagged_variation_features
80
145
  end
81
146
 
82
- # = DESCRIPTION
83
147
  # The IndividualPopulation class is used to connect Individual and Population classes.
84
148
  # Should not be used directly.
85
149
  #
@@ -87,8 +151,8 @@ module Ensembl
87
151
  # See the general documentation of the Ensembl module for
88
152
  # more information on what this means and what methods are available.
89
153
  class IndividualPopulation < DBConnection
90
- belongs_to :individual
91
- belongs_to :population
154
+ belongs_to :individual, :foreign_key => "individual_sample_id"
155
+ belongs_to :population, :foreign_key => "population_sample_id"
92
156
  end
93
157
 
94
158
  # = DESCRIPTION
@@ -99,17 +163,27 @@ module Ensembl
99
163
  # See the general documentation of the Ensembl module for
100
164
  # more information on what this means and what methods are available.
101
165
  class Individual < DBConnection
166
+ set_primary_key "sample_id"
102
167
  belongs_to :sample
103
- # FIXME
168
+ has_one :individual_type
169
+ has_many :individual_populations, :foreign_key => "individual_sample_id"
170
+ has_many :populations, :through => :individual_populations
104
171
  end
105
172
 
106
173
  class IndividualGenotypeMultipleBp < DBConnection
107
174
  belongs_to :sample
108
175
  belongs_to :variation
176
+ belongs_to :subsnp_handle
109
177
  end
178
+
179
+ class IndividualType < DBConnection
180
+ set_primary_key "invidual_type_id"
181
+ belongs_to :individual
182
+ end
183
+
110
184
 
111
185
  class CompressedGenotypeSingleBp < DBConnection
112
- belongs_to :sample
186
+ belongs_to :population_genotype, :foreign_key => "sample_id"
113
187
  end
114
188
 
115
189
  class ReadCoverage < DBConnection
@@ -118,10 +192,19 @@ module Ensembl
118
192
 
119
193
  class Population < DBConnection
120
194
  belongs_to :sample
195
+ set_primary_key "sample_id"
196
+ has_many :population_genotypes, :foreign_key => "sample_id"
197
+ has_many :individual_populations, :foreign_key => "population_sample_id"
198
+ has_many :individuals, :through => :individual_populations
199
+ has_many :sample_synonyms
200
+ has_one :population_structure
201
+ has_many :tagged_variation_features
202
+ has_many :alleles
203
+ has_many :allele_groups
121
204
  end
122
205
 
123
206
  class PopulationStructure < DBConnection
124
- # FIXME
207
+
125
208
  end
126
209
 
127
210
  # = DESCRIPTION
@@ -135,6 +218,8 @@ module Ensembl
135
218
  set_primary_key "population_genotype_id"
136
219
  belongs_to :variation
137
220
  belongs_to :population
221
+ belongs_to :subsnp_handle
222
+ has_many :compressed_genotype_single_bps, :foreign_key => "sample_id"
138
223
  end
139
224
 
140
225
  # = DESCRIPTION
@@ -166,6 +251,41 @@ module Ensembl
166
251
  has_many :variation_groups
167
252
  has_many :httags
168
253
  has_many :variation_synonyms
254
+ has_many :variation_annotations
255
+ has_many :structural_variations
256
+ end
257
+
258
+ class StructuralVariation < DBConnection
259
+ set_primary_key "structural_variation_id"
260
+ belongs_to :source
261
+ belongs_to :seq_region
262
+
263
+ class << self # Workaround for 'class' field, otherwise it creates a mess for AR
264
+ def instance_method_already_implemented?(method_name)
265
+ return true if method_name == 'class'
266
+ super
267
+ end
268
+ end
269
+
270
+ def sv_class
271
+ self.attributes["class"]
272
+ end
273
+
274
+ end
275
+
276
+
277
+ class SeqRegion < DBConnection
278
+ set_primary_key "seq_region_id"
279
+ has_many :variation_features
280
+ has_many :structural_variations
281
+ end
282
+
283
+ class SubsnpHandle < DBConnection
284
+ set_primary_key "subsnp_id"
285
+ has_many :individual_genotype_multiple_bps, :foreign_key => "subsnp_id"
286
+ has_many :population_genotypes, :foreign_key => "subsnp_id"
287
+ has_many :alleles, :foreign_key => "subsnp_id"
288
+ has_many :variation_synonyms,:foreign_key => "subsnp_id"
169
289
  end
170
290
 
171
291
  # = DESCRIPTION
@@ -179,6 +299,7 @@ module Ensembl
179
299
  set_primary_key "variation_synonym_id"
180
300
  belongs_to :variation
181
301
  belongs_to :source
302
+ belongs_to :subsnp_handle
182
303
  end
183
304
 
184
305
  # = DESCRIPTION
@@ -221,6 +342,14 @@ module Ensembl
221
342
  belongs_to :variation_group
222
343
  end
223
344
 
345
+ class VariationAnnotation < DBConnection
346
+ set_primary_key "variation_annotation_id"
347
+ belongs_to :variation
348
+ belongs_to :phenotype
349
+ belongs_to :source
350
+ end
351
+
352
+
224
353
  # = DESCRIPTION
225
354
  # The FlankingSequence class gives information about the genomic coordinates
226
355
  # of the flanking sequences, for a single VariationFeature.
@@ -249,5 +378,6 @@ module Ensembl
249
378
  belongs_to :variation_group
250
379
  belongs_to :source
251
380
  end
381
+
252
382
  end
253
383
  end
@@ -4,13 +4,13 @@
4
4
  # Copyright:: Copyright (C) 2008 Francesco Strozzi <francesco.strozzi@gmail.com>
5
5
  # License:: The Ruby License
6
6
  #
7
+ # @author Francesco Strozzi
8
+
7
9
 
8
- nil
9
10
  module Ensembl
10
11
 
11
12
  module Variation
12
13
 
13
- # = DESCRIPTION
14
14
  # The Variation class represents single nucleotide polymorphisms (SNP) or variations
15
15
  # and provides information like the names (IDs), the validation status and
16
16
  # the allele information.
@@ -23,16 +23,16 @@ module Ensembl
23
23
  # See the general documentation of the Ensembl module for
24
24
  # more information on what this means and what methods are available.
25
25
  #
26
- #= USAGE
27
- # v = Variation.find_by_name('rs10111')
28
- # v.alleles.each do |a|
29
- # puts a.allele, a.frequency
30
- # end
26
+ # @example
27
+ # v = Variation.find_by_name('rs10111')
28
+ # v.alleles.each do |a|
29
+ # puts a.allele, a.frequency
30
+ # end
31
31
  #
32
- # variations = Variation.fetch_all_by_source('dbSNP') (many records)
33
- # variations.each do |v|
34
- # puts v.name
35
- # end
32
+ # variations = Variation.fetch_all_by_source('dbSNP') # many records
33
+ # variations.each do |v|
34
+ # puts v.name
35
+ # end
36
36
  #
37
37
  class Variation < DBConnection
38
38
  set_primary_key "variation_id"
@@ -47,6 +47,8 @@ module Ensembl
47
47
  has_many :variation_group_variations
48
48
  has_many :variation_groups, :through => :variation_group_variations
49
49
  has_many :individual_genotype_multiple_bps
50
+ has_many :failed_variations
51
+ has_many :failed_descriptions, :through => :failed_variations
50
52
 
51
53
  def self.fetch_all_by_source(source)
52
54
  variations = Source.find_by_name(source).variations
@@ -54,7 +56,6 @@ module Ensembl
54
56
  end
55
57
 
56
58
 
57
- # = DESCRIPTION
58
59
  # The VariationFeature class gives information about the genomic position of
59
60
  # each Variation, including also validation status and consequence type.
60
61
  #
@@ -62,30 +63,56 @@ module Ensembl
62
63
  # See the general documentation of the Ensembl module for
63
64
  # more information on what this means and what methods are available.
64
65
  #
65
- #= USAGE
66
- # * SLOWER QUERY*
67
- # vf = VariationFeature.find_by_variation_name('rs10111')
68
- # * FASTER QUERY*
69
- # vf = Variation.find_by_name('rs10111').variation_feature
70
- #
71
- # puts vf.seq_region_start, vf.seq_region_end, vf.allele_string
72
- # puts vf.variation.ancestral_allele
73
- # genomic_region = vf.fetch_region (returns an Ensembl::Core::Slice)
74
- # genomic_region.genes
75
- # up_region,down_region = vf.flanking_seq (returns two Ensembl::Core::Slice)
66
+ # @example
67
+ # # SLOWER QUERY
68
+ # vf = VariationFeature.find_by_variation_name('rs10111')
69
+ # # FASTER QUERY
70
+ # vf = Variation.find_by_name('rs10111').variation_feature
71
+ #
72
+ # puts vf.seq_region_start, vf.seq_region_end, vf.allele_string
73
+ # puts vf.variation.ancestral_allele
74
+ # genomic_region = vf.fetch_region (returns an Ensembl::Core::Slice)
75
+ # genomic_region.genes
76
+ # up_region,down_region = vf.flanking_seq (returns two Ensembl::Core::Slice)
76
77
  #
77
78
  class VariationFeature < DBConnection
78
79
  set_primary_key "variation_feature_id"
79
80
  belongs_to :variation
80
81
  has_many :tagged_variation_features
81
82
  has_many :samples, :through => :tagged_variation_features
82
- has_many :transcript_variations
83
+ belongs_to :seq_region
84
+ validates_inclusion_of :consequence_type, :in => ['ESSENTIAL_SPLICE_SITE',
85
+ 'STOP_GAINED',
86
+ 'STOP_LOST',
87
+ 'COMPLEX_INDEL',
88
+ 'FRAMESHIFT_CODING',
89
+ 'NON_SYNONYMOUS_CODING',
90
+ 'SPLICE_SITE',
91
+ 'PARTIAL_CODON',
92
+ 'SYNONYMOUS_CODING',
93
+ 'REGULATORY_REGION',
94
+ 'WITHIN_MATURE_miRNA',
95
+ '5PRIME_UTR',
96
+ '3PRIME_UTR',
97
+ 'INTRONIC',
98
+ 'NMD_TRANSCRIPT',
99
+ 'UPSTREAM',
100
+ 'DOWNSTREAM',
101
+ 'WITHIN_NON_CODING_GENE',
102
+ 'HGMD_MUTATION'
103
+ ], :message => "Consequence type not allowed!"
83
104
 
105
+ def consequence_type # workaround as ActiveRecord do not parse SET field in MySQL
106
+ "#{attributes_before_type_cast['consequence_type']}"
107
+ end
84
108
 
85
- #=DESCRIPTION
86
109
  # Based on Perl API 'get_all_Genes' method for Variation class. Get a genomic region
87
110
  # starting from the Variation coordinates, expanding the region upstream and
88
- # downstream. Default values are -5000 and +5000.
111
+ # downstream.
112
+ #
113
+ # @param [Integer] up Length of upstream flanking region
114
+ # @param [Integer] down Length of downstream flanking region
115
+ # @return [Slice] Slice object containing the variation
89
116
  def fetch_region(up = 5000, down = 5000)
90
117
  sr = core_connection(self.seq_region_id)
91
118
  slice = Ensembl::Core::Slice.fetch_by_region(Ensembl::Core::CoordSystem.find(sr.coord_system_id).name,sr.name,self.seq_region_start-up,self.seq_region_end+down)
@@ -100,15 +127,24 @@ module Ensembl
100
127
  return slice_up,slice_down
101
128
  end
102
129
 
130
+ def transcript_variations
131
+ tvs = TranscriptVariation.find_all_by_variation_feature_id(self.variation_feature_id)
132
+ if tvs[0].nil? then # the variation is not stored in the database, so run the calculations
133
+ sr = core_connection(self.seq_region_id)
134
+ return custom_transcript_variation(self,sr)
135
+ else
136
+ return tvs # the variation is already present in the database
137
+ end
138
+ end
139
+
103
140
  private
104
141
 
105
142
  def core_connection(seq_region_id)
106
143
  if !Ensembl::Core::DBConnection.connected? then
107
- host,user,password,db_name,port = Ensembl::Variation::DBConnection.get_info
108
- if db_name =~/(\w+_\w+)_\w+_(\d+)_\S+/ then
109
- species,release = $1,$2
144
+ host,user,password,db_name,port,species,release = Ensembl::Variation::DBConnection.get_info
145
+ begin
110
146
  Ensembl::Core::DBConnection.connect(species,release.to_i,:username => user, :password => password,:host => host, :port => port)
111
- else
147
+ rescue
112
148
  raise NameError, "Can't derive Core database name from #{db_name}. Are you using non conventional names?"
113
149
  end
114
150
  end
@@ -123,9 +159,186 @@ module Ensembl
123
159
  return seq_region
124
160
  end
125
161
 
162
+ # Calculate a consequence type for a user-defined variation
163
+ def custom_transcript_variation(vf,sr)
164
+
165
+ @variation_name = vf.variation_name
166
+ @seq_region = sr
167
+
168
+ downstream = 5000
169
+ upstream = 5000
170
+ tvs = [] # store all the calculated TranscriptVariations
171
+ # retrieve the slice of the genomic region where the variation is located
172
+ region = Ensembl::Core::Slice.fetch_by_region(Ensembl::Core::CoordSystem.find(sr.coord_system_id).name,sr.name,vf.seq_region_start-upstream,vf.seq_region_end+downstream-1)
173
+ # iterate through all the transcripts present in the region
174
+ genes = region.genes(inclusive = true)
175
+ if genes[0] != nil
176
+ genes.each do |g|
177
+ g.transcripts.each do |t|
178
+ tv = TranscriptVariation.new() # create a new TranscriptVariation object for every transcript present
179
+ # do the calculations
180
+
181
+ # check if the variation is intergenic for this transcript (no effects)
182
+ tv.consequence_type = check_intergenic(vf,t)
183
+
184
+ # check if the variation is upstream or downstream of the transcript
185
+ tv.consequence_type = check_upstream_downstream(vf,t) if tv.consequence_type == ""
186
+
187
+ # if no consequence type is found, then the variation is inside the transcript
188
+ # check for non coding gene
189
+ tv.consequence_type = check_non_coding(vf,t) if tv.consequence_type == "" and t.biotype != 'protein_coding'
190
+
191
+ # if no consequence type is found, then check intron / exon boundaries
192
+ tv.consequence_type = check_splice_site(vf,t) if tv.consequence_type == ""
193
+
194
+ # if no consequence type is found, check if the variation is inside UTRs
195
+ tv.consequence_type = check_utr(vf,t) if tv.consequence_type == ""
196
+
197
+ # if no consequence type is found, then variation is inside an exon.
198
+ # Check the codon change
199
+ (tv.consequence_type,tv.peptide_allele_string) = check_aa_change(vf,t) if tv.consequence_type == ""
200
+
201
+
202
+ begin # this changed from release 58
203
+ tv.transcript_stable_id = t.stable_id
204
+ rescue NoMethodError
205
+ tv.transcript_id = t.id
206
+ end
207
+
208
+ tv.consequence_type = "INTERGENIC" if tv.consequence_type == ""
209
+ tvs << tv
210
+ end
211
+ end
212
+ end
213
+ # if there are no transcripts/genes within 5000 bases upstream and downstream set the variation as INTERGENIC (no effects)
214
+ if tvs.size == 0 then
215
+ tv = TranscriptVariation.new()
216
+ tv.consequence_type = "INTERGENIC"
217
+ tvs << tv
218
+ end
219
+
220
+ return tvs
221
+ end
222
+
223
+ ## CONSEQUENCE CALCULATION FUNCTIONS ##
224
+
225
+ def check_intergenic(vf,t)
226
+ if vf.seq_region_end < t.seq_region_start and ((t.seq_region_start - vf.seq_region_end) +1) > 5000 then
227
+ return "INTERGENIC"
228
+ elsif vf.seq_region_start > t.seq_region_end and ((vf.seq_region_start - t.seq_region_end) +1) > 5000 then
229
+ return "INTERGENIC"
230
+ end
231
+ return nil
232
+ end
233
+
234
+ def check_upstream_downstream(vf,t)
235
+ if vf.seq_region_end < t.seq_region_start and ((t.seq_region_start - vf.seq_region_end) +1) <= 5000 then
236
+ return (t.strand == 1) ? "UPSTREAM" : "DOWNSTREAM"
237
+ elsif vf.seq_region_start > t.seq_region_end and ((vf.seq_region_start - t.seq_region_end)+1) <= 5000 then
238
+ return (t.strand == 1) ? "DOWNSTREAM" : "UPSTREAM"
239
+
240
+ # check if it's an InDel and if overlaps the transcript start / end
241
+ elsif t.seq_region_start > vf.seq_region_start and t.seq_region_start < vf.seq_region_end then
242
+ return "COMPLEX_INDEL"
243
+ elsif t.seq_region_end > vf.seq_region_start and t.seq_region_end < vf.seq_region_end then
244
+ return "COMPLEX_INDEL"
245
+ end
246
+ return nil
247
+ end
248
+
249
+ def check_non_coding(vf,t)
250
+ if t.biotype == "miRNA" then
251
+ return (vf.seq_region_start == vf.seq_region_end) ? "WITHIN_MATURE_miRNA" : "COMPLEX_INDEL"
252
+ elsif t.biotype == "nonsense_mediated_decay"
253
+ return (vf.seq_region_start == vf.seq_region_end) ? "NMD_TRANSCRIPT" : "COMPLEX_INDEL"
254
+ else
255
+ return (vf.seq_region_start == vf.seq_region_end) ? "WITHIN_NON_CODING_GENE" : "COMPLEX_INDEL"
256
+ end
257
+ return nil
258
+ end
259
+
260
+ def check_utr(vf,t)
261
+ if vf.seq_region_start > t.seq_region_start and vf.seq_region_end < t.coding_region_genomic_start then
262
+ return (t.strand == 1) ? "5PRIME_UTR" : "3PRIME_UTR"
263
+ elsif vf.seq_region_start > t.coding_region_genomic_end and vf.seq_region_end < t.seq_region_end then
264
+ return (t.strand == 1) ? "3PRIME_UTR" : "5PRIME_UTR"
265
+ end
266
+ return nil
267
+ end
268
+
269
+ def check_splice_site(vf,t)
270
+ exon_up = t.exon_for_genomic_position(vf.seq_region_start)
271
+ exon_down = t.exon_for_genomic_position(vf.seq_region_end)
272
+ if exon_up.nil? and exon_down.nil? # we are inside an intron
273
+ # checking boundaries
274
+ near_exon_up_2bp = t.exon_for_genomic_position(vf.seq_region_start-2)
275
+ near_exon_down_2bp = t.exon_for_genomic_position(vf.seq_region_end+2)
276
+ near_exon_up_8bp = t.exon_for_genomic_position(vf.seq_region_start-8)
277
+ near_exon_down_8bp = t.exon_for_genomic_position(vf.seq_region_end+8)
278
+ if near_exon_up_2bp or near_exon_down_2bp then
279
+ return "ESSENTIAL_SPLICE_SITE"
280
+ elsif near_exon_up_8bp or near_exon_down_8bp then
281
+ return "SPLICE_SITE"
282
+ else
283
+ return "INTRONIC"
284
+ end
285
+ elsif exon_up and exon_down # the variation is inside an exon
286
+ # check if it is a splice site
287
+ if (vf.seq_region_start-exon_up.seq_region_start) <= 3 or (exon_down.seq_region_end-vf.seq_region_end) <= 3 then
288
+ return "SPLICE_SITE"
289
+ end
290
+ else # a complex indel spanning intron/exon boundary
291
+ return "COMPLEX_INDEL"
292
+ end
293
+ return nil
294
+ end
295
+
296
+ def check_aa_change(vf,t)
297
+ alleles = vf.allele_string.split('/') # get the different alleles for this variation
298
+ # if the variation is an InDel then it produces a frameshift
299
+ if vf.seq_region_start != vf.seq_region_end or alleles.include?("-") then
300
+ return "FRAMESHIFT_CODING",nil
301
+ end
302
+
303
+ # Find the position inside the CDS
304
+
305
+ mutation_position = t.genomic2cds(vf.seq_region_start)
306
+
307
+ mutation_base = Bio::Sequence::NA.new(alleles[1])
308
+ if t.seq_region_strand == -1
309
+ mutation_base.reverse_complement!
310
+ end
311
+ # The rank of the codon
312
+ target_codon = (mutation_position)/3 + 1
313
+ cds_sequence = nil
314
+ cds_sequence = t.cds_seq
315
+ mut_sequence = cds_sequence.dup
316
+ # Replace base with the variant allele
317
+ mut_sequence[mutation_position] = mutation_base.seq
318
+ refcodon = cds_sequence[(target_codon*3 -3)..(target_codon*3-1)]
319
+ mutcodon = mut_sequence[(target_codon*3 -3)..(target_codon*3-1)]
320
+ codontable = Bio::CodonTable[1]
321
+ refaa = codontable[refcodon]
322
+ mutaa = codontable[mutcodon.downcase]
323
+ if mutaa == nil
324
+ raise RuntimeError "Codon #{mutcodon.downcase} wasn't recognized."
325
+ end
326
+ pep_string = refaa+"/"+mutaa
327
+ if mutaa == "*" and refaa != "*"
328
+ return "STOP_GAINED",pep_string
329
+ elsif mutaa != "*" and refaa == "*"
330
+ return "STOP_LOST",pep_string
331
+ elsif mutaa != refaa
332
+ return "NON_SYNONYMOUS_CODING",pep_string
333
+ elsif mutaa == refaa
334
+ return "SYNONYMOUS_CODING",pep_string
335
+ end
336
+
337
+ end
338
+
339
+
126
340
  end # VariationFeature
127
341
 
128
- #= DESCRIPTION
129
342
  # The TranscriptVariation class gives information about the position of
130
343
  # a VariationFeature, mapped on an annotated transcript.
131
344
  #
@@ -133,31 +346,56 @@ module Ensembl
133
346
  # See the general documentation of the Ensembl module for
134
347
  # more information on what this means and what methods are available.
135
348
  #
136
- #= USAGE
137
- # vf = Variation.find_by_name('rs10111').variation_feature
138
- # vf.transcript_variations.each do |tv|
139
- # puts tv.peptide_allele_string, tv.transcript.stable_id
140
- # end
349
+ # @example
350
+ # vf = Variation.find_by_name('rs10111').variation_feature
351
+ # vf.transcript_variations.each do |tv|
352
+ # puts tv.peptide_allele_string, tv.transcript.stable_id
353
+ # end
141
354
  #
142
355
  class TranscriptVariation < DBConnection
143
356
  set_primary_key "transcript_variation_id"
144
357
  belongs_to :variation_feature
358
+ validates_inclusion_of :consequence_type, :in => ['ESSENTIAL_SPLICE_SITE',
359
+ 'STOP_GAINED',
360
+ 'STOP_LOST',
361
+ 'COMPLEX_INDEL',
362
+ 'FRAMESHIFT_CODING',
363
+ 'NON_SYNONYMOUS_CODING',
364
+ 'SPLICE_SITE',
365
+ 'PARTIAL_CODON',
366
+ 'SYNONYMOUS_CODING',
367
+ 'REGULATORY_REGION',
368
+ 'WITHIN_MATURE_miRNA',
369
+ '5PRIME_UTR',
370
+ '3PRIME_UTR',
371
+ 'INTRONIC',
372
+ 'NMD_TRANSCRIPT',
373
+ 'UPSTREAM',
374
+ 'DOWNSTREAM',
375
+ 'WITHIN_NON_CODING_GENE',
376
+ 'HGMD_MUTATION'
377
+ ], :message => "Consequence type not allowed!"
378
+
379
+ def consequence_type # workaround as ActiveRecord do not parse SET field in MySQL
380
+ "#{attributes_before_type_cast['consequence_type']}"
381
+ end
145
382
 
146
383
  def transcript
147
- if !Ensembl::Core::DBConnection.connected? then
148
- host,user,password,db_name,port = Ensembl::Variation::DBConnection.get_info
149
- if db_name =~/(\w+_\w+)_\w+_(\d+)_\S+/ then
150
- species,release = $1,$2
151
- Ensembl::Core::DBConnection.connect(species,release.to_i,:username => user, :password => password,:host => host, :port => port)
152
- else
153
- raise NameError, "Can't get Core database name from #{db_name}. Pheraps you are using non conventional names"
154
- end
384
+ host,user,password,db_name,port,species,release = Ensembl::Variation::DBConnection.get_info
385
+ if !Ensembl::Core::DBConnection.connected? then
386
+ Ensembl::Core::DBConnection.connect(species,release.to_i,:username => user, :password => password,:host => host, :port => port)
387
+ end
388
+
389
+ begin # this changed from release 58
390
+ return Ensembl::Core::Transcript.find_by_stable_id(self.transcript_stable_id)
391
+ rescue NoMethodError
392
+ return Ensembl::Core::Transcript.find(self.transcript_id)
155
393
  end
156
- Ensembl::Core::Transcript.find(self.transcript_id)
394
+
157
395
  end
158
396
 
159
397
  end
160
398
 
161
399
  end
162
400
 
163
- end
401
+ end