ruby-ensembl-api 0.9.6 → 1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data/TUTORIAL.rdoc +1 -1
  2. data/bin/variation_effect_predictor +106 -0
  3. data/lib/ensembl.rb +2 -2
  4. data/lib/ensembl/core/activerecord.rb +119 -225
  5. data/lib/ensembl/core/collection.rb +14 -10
  6. data/lib/ensembl/core/project.rb +6 -8
  7. data/lib/ensembl/core/slice.rb +87 -123
  8. data/lib/ensembl/core/transcript.rb +49 -65
  9. data/lib/ensembl/core/transform.rb +6 -8
  10. data/lib/ensembl/db_connection.rb +56 -72
  11. data/lib/ensembl/variation/activerecord.rb +138 -8
  12. data/lib/ensembl/variation/variation.rb +284 -46
  13. data/samples/ensembl_genomes_example.rb +60 -0
  14. data/samples/examples_perl_tutorial.rb +125 -0
  15. data/samples/small_example_ruby_api.rb +34 -0
  16. data/samples/variation_example.rb +67 -0
  17. data/test/unit/{release_56 → release_60}/core/test_gene.rb +6 -6
  18. data/test/unit/release_60/core/test_project_human.rb +38 -0
  19. data/test/unit/{release_56 → release_60}/core/test_slice.rb +1 -8
  20. data/test/unit/release_60/core/test_transcript.rb +126 -0
  21. data/test/unit/{release_53 → release_60}/core/test_transform.rb +21 -21
  22. data/test/unit/release_60/variation/test_activerecord.rb +213 -0
  23. data/test/unit/release_60/variation/test_consequence.rb +158 -0
  24. data/test/unit/{release_56 → release_60}/variation/test_variation.rb +18 -17
  25. data/test/unit/test_connection.rb +2 -2
  26. data/test/unit/test_releases.rb +8 -8
  27. metadata +27 -43
  28. data/test/unit/data/seq_c6qbl.fa +0 -10
  29. data/test/unit/data/seq_cso19_coding.fa +0 -16
  30. data/test/unit/data/seq_cso19_transcript.fa +0 -28
  31. data/test/unit/data/seq_drd3_gene.fa +0 -838
  32. data/test/unit/data/seq_drd3_transcript.fa +0 -22
  33. data/test/unit/data/seq_drd4_transcript.fa +0 -24
  34. data/test/unit/data/seq_forward_composite.fa +0 -1669
  35. data/test/unit/data/seq_par_boundary.fa +0 -169
  36. data/test/unit/data/seq_rnd3_transcript.fa +0 -47
  37. data/test/unit/data/seq_ub2r1_coding.fa +0 -13
  38. data/test/unit/data/seq_ub2r1_gene.fa +0 -174
  39. data/test/unit/data/seq_ub2r1_transcript.fa +0 -26
  40. data/test/unit/data/seq_y.fa +0 -2
  41. data/test/unit/ensembl_genomes/test_collection.rb +0 -51
  42. data/test/unit/ensembl_genomes/test_gene.rb +0 -52
  43. data/test/unit/ensembl_genomes/test_slice.rb +0 -71
  44. data/test/unit/ensembl_genomes/test_variation.rb +0 -17
  45. data/test/unit/release_50/core/test_project.rb +0 -215
  46. data/test/unit/release_50/core/test_project_human.rb +0 -58
  47. data/test/unit/release_50/core/test_relationships.rb +0 -66
  48. data/test/unit/release_50/core/test_sequence.rb +0 -175
  49. data/test/unit/release_50/core/test_slice.rb +0 -121
  50. data/test/unit/release_50/core/test_transcript.rb +0 -108
  51. data/test/unit/release_50/core/test_transform.rb +0 -223
  52. data/test/unit/release_50/variation/test_activerecord.rb +0 -143
  53. data/test/unit/release_50/variation/test_variation.rb +0 -84
  54. data/test/unit/release_53/core/test_gene.rb +0 -66
  55. data/test/unit/release_53/core/test_project.rb +0 -96
  56. data/test/unit/release_53/core/test_project_human.rb +0 -65
  57. data/test/unit/release_53/core/test_slice.rb +0 -47
  58. data/test/unit/release_53/variation/test_activerecord.rb +0 -145
  59. data/test/unit/release_53/variation/test_variation.rb +0 -71
  60. data/test/unit/release_56/core/test_project.rb +0 -96
  61. data/test/unit/release_56/core/test_transform.rb +0 -63
  62. data/test/unit/release_56/variation/test_activerecord.rb +0 -142
@@ -4,14 +4,13 @@
4
4
  # Copyright:: Copyright (C) 2008 Francesco Strozzi <francesco.strozzi@gmail.com>
5
5
  # License:: The Ruby License
6
6
  #
7
+ # @author Francesco Strozzi
7
8
 
8
9
  nil
9
10
  module Ensembl
10
- # = DESCRIPTION
11
11
  # The Ensembl::Variation module covers the variation databases from
12
12
  # ensembldb.ensembl.org.
13
13
  module Variation
14
- # = DESCRIPTION
15
14
  # The Allele class describes a single allele of a variation. In addition to
16
15
  # the nucleotide(s) (or absence of) representing the allele, frequency
17
16
  # and population information may be present.
@@ -28,6 +27,7 @@ module Ensembl
28
27
  belongs_to :sample
29
28
  belongs_to :variation
30
29
  belongs_to :population
30
+ belongs_to :subsnp_handle
31
31
  end
32
32
 
33
33
  # = DESCRIPTION
@@ -62,6 +62,71 @@ module Ensembl
62
62
  belongs_to :allele_group
63
63
  end
64
64
 
65
+ # = DESCRIPTION
66
+ # Stores information on attribute types
67
+ #
68
+ # This class uses ActiveRecord to access data in the Ensembl database.
69
+ # See the general documentation of the Ensembl module for
70
+ # more information on what this means and what methods are available.
71
+ class AttribType < DBConnection
72
+ set_primary_key "attrib_type_id"
73
+ end
74
+
75
+
76
+ # = DESCRIPTION
77
+ #
78
+ # This class uses ActiveRecord to access data in the Ensembl database.
79
+ # See the general documentation of the Ensembl module for
80
+ # more information on what this means and what methods are available.
81
+ class ConsequenceMapping < DBConnection
82
+
83
+ end
84
+
85
+ # = DESCRIPTION
86
+ #
87
+ # This class uses ActiveRecord to access data in the Ensembl database.
88
+ # See the general documentation of the Ensembl module for
89
+ # more information on what this means and what methods are available.
90
+ class FailedDescription < DBConnection
91
+ set_primary_key "failed_description_id"
92
+ has_many :failed_variations
93
+ end
94
+
95
+ # = DESCRIPTION
96
+ #
97
+ # This class uses ActiveRecord to access data in the Ensembl database.
98
+ # See the general documentation of the Ensembl module for
99
+ # more information on what this means and what methods are available.
100
+ class FailedVariation < DBConnection
101
+ set_primary_key "failed_variation_id"
102
+ belongs_to :failed_description
103
+ belongs_to :variation
104
+ end
105
+
106
+ # = DESCRIPTION
107
+ #
108
+ # This class uses ActiveRecord to access data in the Ensembl database.
109
+ # See the general documentation of the Ensembl module for
110
+ # more information on what this means and what methods are available.
111
+ class FeatureType < DBConnection
112
+ set_primary_key "feature_type_id"
113
+ end
114
+
115
+ class Meta < DBConnection
116
+ set_primary_key "meta_id"
117
+ end
118
+
119
+ class MetaCoord < DBConnection
120
+
121
+ end
122
+
123
+ class Phenotype < DBConnection
124
+ set_primary_key "phenotype_id"
125
+ has_many :variation_annotations
126
+ end
127
+
128
+
129
+
65
130
  # = DESCRIPTION
66
131
  # The Sample class gives information about the biological samples stored in the database.
67
132
  #
@@ -79,7 +144,6 @@ module Ensembl
79
144
  has_many :tagged_variation_features
80
145
  end
81
146
 
82
- # = DESCRIPTION
83
147
  # The IndividualPopulation class is used to connect Individual and Population classes.
84
148
  # Should not be used directly.
85
149
  #
@@ -87,8 +151,8 @@ module Ensembl
87
151
  # See the general documentation of the Ensembl module for
88
152
  # more information on what this means and what methods are available.
89
153
  class IndividualPopulation < DBConnection
90
- belongs_to :individual
91
- belongs_to :population
154
+ belongs_to :individual, :foreign_key => "individual_sample_id"
155
+ belongs_to :population, :foreign_key => "population_sample_id"
92
156
  end
93
157
 
94
158
  # = DESCRIPTION
@@ -99,17 +163,27 @@ module Ensembl
99
163
  # See the general documentation of the Ensembl module for
100
164
  # more information on what this means and what methods are available.
101
165
  class Individual < DBConnection
166
+ set_primary_key "sample_id"
102
167
  belongs_to :sample
103
- # FIXME
168
+ has_one :individual_type
169
+ has_many :individual_populations, :foreign_key => "individual_sample_id"
170
+ has_many :populations, :through => :individual_populations
104
171
  end
105
172
 
106
173
  class IndividualGenotypeMultipleBp < DBConnection
107
174
  belongs_to :sample
108
175
  belongs_to :variation
176
+ belongs_to :subsnp_handle
109
177
  end
178
+
179
+ class IndividualType < DBConnection
180
+ set_primary_key "invidual_type_id"
181
+ belongs_to :individual
182
+ end
183
+
110
184
 
111
185
  class CompressedGenotypeSingleBp < DBConnection
112
- belongs_to :sample
186
+ belongs_to :population_genotype, :foreign_key => "sample_id"
113
187
  end
114
188
 
115
189
  class ReadCoverage < DBConnection
@@ -118,10 +192,19 @@ module Ensembl
118
192
 
119
193
  class Population < DBConnection
120
194
  belongs_to :sample
195
+ set_primary_key "sample_id"
196
+ has_many :population_genotypes, :foreign_key => "sample_id"
197
+ has_many :individual_populations, :foreign_key => "population_sample_id"
198
+ has_many :individuals, :through => :individual_populations
199
+ has_many :sample_synonyms
200
+ has_one :population_structure
201
+ has_many :tagged_variation_features
202
+ has_many :alleles
203
+ has_many :allele_groups
121
204
  end
122
205
 
123
206
  class PopulationStructure < DBConnection
124
- # FIXME
207
+
125
208
  end
126
209
 
127
210
  # = DESCRIPTION
@@ -135,6 +218,8 @@ module Ensembl
135
218
  set_primary_key "population_genotype_id"
136
219
  belongs_to :variation
137
220
  belongs_to :population
221
+ belongs_to :subsnp_handle
222
+ has_many :compressed_genotype_single_bps, :foreign_key => "sample_id"
138
223
  end
139
224
 
140
225
  # = DESCRIPTION
@@ -166,6 +251,41 @@ module Ensembl
166
251
  has_many :variation_groups
167
252
  has_many :httags
168
253
  has_many :variation_synonyms
254
+ has_many :variation_annotations
255
+ has_many :structural_variations
256
+ end
257
+
258
+ class StructuralVariation < DBConnection
259
+ set_primary_key "structural_variation_id"
260
+ belongs_to :source
261
+ belongs_to :seq_region
262
+
263
+ class << self # Workaround for 'class' field, otherwise it creates a mess for AR
264
+ def instance_method_already_implemented?(method_name)
265
+ return true if method_name == 'class'
266
+ super
267
+ end
268
+ end
269
+
270
+ def sv_class
271
+ self.attributes["class"]
272
+ end
273
+
274
+ end
275
+
276
+
277
+ class SeqRegion < DBConnection
278
+ set_primary_key "seq_region_id"
279
+ has_many :variation_features
280
+ has_many :structural_variations
281
+ end
282
+
283
+ class SubsnpHandle < DBConnection
284
+ set_primary_key "subsnp_id"
285
+ has_many :individual_genotype_multiple_bps, :foreign_key => "subsnp_id"
286
+ has_many :population_genotypes, :foreign_key => "subsnp_id"
287
+ has_many :alleles, :foreign_key => "subsnp_id"
288
+ has_many :variation_synonyms,:foreign_key => "subsnp_id"
169
289
  end
170
290
 
171
291
  # = DESCRIPTION
@@ -179,6 +299,7 @@ module Ensembl
179
299
  set_primary_key "variation_synonym_id"
180
300
  belongs_to :variation
181
301
  belongs_to :source
302
+ belongs_to :subsnp_handle
182
303
  end
183
304
 
184
305
  # = DESCRIPTION
@@ -221,6 +342,14 @@ module Ensembl
221
342
  belongs_to :variation_group
222
343
  end
223
344
 
345
+ class VariationAnnotation < DBConnection
346
+ set_primary_key "variation_annotation_id"
347
+ belongs_to :variation
348
+ belongs_to :phenotype
349
+ belongs_to :source
350
+ end
351
+
352
+
224
353
  # = DESCRIPTION
225
354
  # The FlankingSequence class gives information about the genomic coordinates
226
355
  # of the flanking sequences, for a single VariationFeature.
@@ -249,5 +378,6 @@ module Ensembl
249
378
  belongs_to :variation_group
250
379
  belongs_to :source
251
380
  end
381
+
252
382
  end
253
383
  end
@@ -4,13 +4,13 @@
4
4
  # Copyright:: Copyright (C) 2008 Francesco Strozzi <francesco.strozzi@gmail.com>
5
5
  # License:: The Ruby License
6
6
  #
7
+ # @author Francesco Strozzi
8
+
7
9
 
8
- nil
9
10
  module Ensembl
10
11
 
11
12
  module Variation
12
13
 
13
- # = DESCRIPTION
14
14
  # The Variation class represents single nucleotide polymorphisms (SNP) or variations
15
15
  # and provides information like the names (IDs), the validation status and
16
16
  # the allele information.
@@ -23,16 +23,16 @@ module Ensembl
23
23
  # See the general documentation of the Ensembl module for
24
24
  # more information on what this means and what methods are available.
25
25
  #
26
- #= USAGE
27
- # v = Variation.find_by_name('rs10111')
28
- # v.alleles.each do |a|
29
- # puts a.allele, a.frequency
30
- # end
26
+ # @example
27
+ # v = Variation.find_by_name('rs10111')
28
+ # v.alleles.each do |a|
29
+ # puts a.allele, a.frequency
30
+ # end
31
31
  #
32
- # variations = Variation.fetch_all_by_source('dbSNP') (many records)
33
- # variations.each do |v|
34
- # puts v.name
35
- # end
32
+ # variations = Variation.fetch_all_by_source('dbSNP') # many records
33
+ # variations.each do |v|
34
+ # puts v.name
35
+ # end
36
36
  #
37
37
  class Variation < DBConnection
38
38
  set_primary_key "variation_id"
@@ -47,6 +47,8 @@ module Ensembl
47
47
  has_many :variation_group_variations
48
48
  has_many :variation_groups, :through => :variation_group_variations
49
49
  has_many :individual_genotype_multiple_bps
50
+ has_many :failed_variations
51
+ has_many :failed_descriptions, :through => :failed_variations
50
52
 
51
53
  def self.fetch_all_by_source(source)
52
54
  variations = Source.find_by_name(source).variations
@@ -54,7 +56,6 @@ module Ensembl
54
56
  end
55
57
 
56
58
 
57
- # = DESCRIPTION
58
59
  # The VariationFeature class gives information about the genomic position of
59
60
  # each Variation, including also validation status and consequence type.
60
61
  #
@@ -62,30 +63,56 @@ module Ensembl
62
63
  # See the general documentation of the Ensembl module for
63
64
  # more information on what this means and what methods are available.
64
65
  #
65
- #= USAGE
66
- # * SLOWER QUERY*
67
- # vf = VariationFeature.find_by_variation_name('rs10111')
68
- # * FASTER QUERY*
69
- # vf = Variation.find_by_name('rs10111').variation_feature
70
- #
71
- # puts vf.seq_region_start, vf.seq_region_end, vf.allele_string
72
- # puts vf.variation.ancestral_allele
73
- # genomic_region = vf.fetch_region (returns an Ensembl::Core::Slice)
74
- # genomic_region.genes
75
- # up_region,down_region = vf.flanking_seq (returns two Ensembl::Core::Slice)
66
+ # @example
67
+ # # SLOWER QUERY
68
+ # vf = VariationFeature.find_by_variation_name('rs10111')
69
+ # # FASTER QUERY
70
+ # vf = Variation.find_by_name('rs10111').variation_feature
71
+ #
72
+ # puts vf.seq_region_start, vf.seq_region_end, vf.allele_string
73
+ # puts vf.variation.ancestral_allele
74
+ # genomic_region = vf.fetch_region # returns an Ensembl::Core::Slice
75
+ # genomic_region.genes
76
+ # up_region,down_region = vf.flanking_seq # returns two Ensembl::Core::Slice objects
76
77
  #
77
78
  class VariationFeature < DBConnection
78
79
  set_primary_key "variation_feature_id"
79
80
  belongs_to :variation
80
81
  has_many :tagged_variation_features
81
82
  has_many :samples, :through => :tagged_variation_features
82
- has_many :transcript_variations
83
+ belongs_to :seq_region
84
+ validates_inclusion_of :consequence_type, :in => ['ESSENTIAL_SPLICE_SITE',
85
+ 'STOP_GAINED',
86
+ 'STOP_LOST',
87
+ 'COMPLEX_INDEL',
88
+ 'FRAMESHIFT_CODING',
89
+ 'NON_SYNONYMOUS_CODING',
90
+ 'SPLICE_SITE',
91
+ 'PARTIAL_CODON',
92
+ 'SYNONYMOUS_CODING',
93
+ 'REGULATORY_REGION',
94
+ 'WITHIN_MATURE_miRNA',
95
+ '5PRIME_UTR',
96
+ '3PRIME_UTR',
97
+ 'INTRONIC',
98
+ 'NMD_TRANSCRIPT',
99
+ 'UPSTREAM',
100
+ 'DOWNSTREAM',
101
+ 'WITHIN_NON_CODING_GENE',
102
+ 'HGMD_MUTATION'
103
+ ], :message => "Consequence type not allowed!"
83
104
 
105
+ def consequence_type # workaround as ActiveRecord does not parse SET fields in MySQL
106
+ "#{attributes_before_type_cast['consequence_type']}"
107
+ end
84
108
 
85
- #=DESCRIPTION
86
109
  # Based on Perl API 'get_all_Genes' method for Variation class. Get a genomic region
87
110
  # starting from the Variation coordinates, expanding the region upstream and
88
- # downstream. Default values are -5000 and +5000.
111
+ # downstream.
112
+ #
113
+ # @param [Integer] up Length of upstream flanking region
114
+ # @param [Integer] down Length of downstream flanking region
115
+ # @return [Slice] Slice object containing the variation
89
116
  def fetch_region(up = 5000, down = 5000)
90
117
  sr = core_connection(self.seq_region_id)
91
118
  slice = Ensembl::Core::Slice.fetch_by_region(Ensembl::Core::CoordSystem.find(sr.coord_system_id).name,sr.name,self.seq_region_start-up,self.seq_region_end+down)
@@ -100,15 +127,24 @@ module Ensembl
100
127
  return slice_up,slice_down
101
128
  end
102
129
 
130
+ def transcript_variations
131
+ tvs = TranscriptVariation.find_all_by_variation_feature_id(self.variation_feature_id)
132
+ if tvs[0].nil? then # the variation is not stored in the database, so run the calculations
133
+ sr = core_connection(self.seq_region_id)
134
+ return custom_transcript_variation(self,sr)
135
+ else
136
+ return tvs # the variation is already present in the database
137
+ end
138
+ end
139
+
103
140
  private
104
141
 
105
142
  def core_connection(seq_region_id)
106
143
  if !Ensembl::Core::DBConnection.connected? then
107
- host,user,password,db_name,port = Ensembl::Variation::DBConnection.get_info
108
- if db_name =~/(\w+_\w+)_\w+_(\d+)_\S+/ then
109
- species,release = $1,$2
144
+ host,user,password,db_name,port,species,release = Ensembl::Variation::DBConnection.get_info
145
+ begin
110
146
  Ensembl::Core::DBConnection.connect(species,release.to_i,:username => user, :password => password,:host => host, :port => port)
111
- else
147
+ rescue
112
148
  raise NameError, "Can't derive Core database name from #{db_name}. Are you using non conventional names?"
113
149
  end
114
150
  end
@@ -123,9 +159,186 @@ module Ensembl
123
159
  return seq_region
124
160
  end
125
161
 
162
+ # Calculate a consequence type for a user-defined variation
163
+ def custom_transcript_variation(vf,sr)
164
+
165
+ @variation_name = vf.variation_name
166
+ @seq_region = sr
167
+
168
+ downstream = 5000
169
+ upstream = 5000
170
+ tvs = [] # store all the calculated TranscriptVariations
171
+ # retrieve the slice of the genomic region where the variation is located
172
+ region = Ensembl::Core::Slice.fetch_by_region(Ensembl::Core::CoordSystem.find(sr.coord_system_id).name,sr.name,vf.seq_region_start-upstream,vf.seq_region_end+downstream-1)
173
+ # iterate through all the transcripts present in the region
174
+ genes = region.genes(inclusive = true)
175
+ if genes[0] != nil
176
+ genes.each do |g|
177
+ g.transcripts.each do |t|
178
+ tv = TranscriptVariation.new() # create a new TranscriptVariation object for every transcript present
179
+ # do the calculations
180
+
181
+ # check if the variation is intergenic for this transcript (no effects)
182
+ tv.consequence_type = check_intergenic(vf,t)
183
+
184
+ # check if the variation is upstream or downstream of the transcript
185
+ tv.consequence_type = check_upstream_downstream(vf,t) if tv.consequence_type == ""
186
+
187
+ # if no consequence type is found, then the variation is inside the transcript
188
+ # check for non coding gene
189
+ tv.consequence_type = check_non_coding(vf,t) if tv.consequence_type == "" and t.biotype != 'protein_coding'
190
+
191
+ # if no consequence type is found, then check intron / exon boundaries
192
+ tv.consequence_type = check_splice_site(vf,t) if tv.consequence_type == ""
193
+
194
+ # if no consequence type is found, check if the variation is inside UTRs
195
+ tv.consequence_type = check_utr(vf,t) if tv.consequence_type == ""
196
+
197
+ # if no consequence type is found, then variation is inside an exon.
198
+ # Check the codon change
199
+ (tv.consequence_type,tv.peptide_allele_string) = check_aa_change(vf,t) if tv.consequence_type == ""
200
+
201
+
202
+ begin # this changed from release 58
203
+ tv.transcript_stable_id = t.stable_id
204
+ rescue NoMethodError
205
+ tv.transcript_id = t.id
206
+ end
207
+
208
+ tv.consequence_type = "INTERGENIC" if tv.consequence_type == ""
209
+ tvs << tv
210
+ end
211
+ end
212
+ end
213
+ # if there are no transcripts/genes within 5000 bases upstream and downstream set the variation as INTERGENIC (no effects)
214
+ if tvs.size == 0 then
215
+ tv = TranscriptVariation.new()
216
+ tv.consequence_type = "INTERGENIC"
217
+ tvs << tv
218
+ end
219
+
220
+ return tvs
221
+ end
222
+
223
+ ## CONSEQUENCE CALCULATION FUNCTIONS ##
224
+
225
+ def check_intergenic(vf,t)
226
+ if vf.seq_region_end < t.seq_region_start and ((t.seq_region_start - vf.seq_region_end) +1) > 5000 then
227
+ return "INTERGENIC"
228
+ elsif vf.seq_region_start > t.seq_region_end and ((vf.seq_region_start - t.seq_region_end) +1) > 5000 then
229
+ return "INTERGENIC"
230
+ end
231
+ return nil
232
+ end
233
+
234
+ def check_upstream_downstream(vf,t)
235
+ if vf.seq_region_end < t.seq_region_start and ((t.seq_region_start - vf.seq_region_end) +1) <= 5000 then
236
+ return (t.strand == 1) ? "UPSTREAM" : "DOWNSTREAM"
237
+ elsif vf.seq_region_start > t.seq_region_end and ((vf.seq_region_start - t.seq_region_end)+1) <= 5000 then
238
+ return (t.strand == 1) ? "DOWNSTREAM" : "UPSTREAM"
239
+
240
+ # check if it's an InDel and if overlaps the transcript start / end
241
+ elsif t.seq_region_start > vf.seq_region_start and t.seq_region_start < vf.seq_region_end then
242
+ return "COMPLEX_INDEL"
243
+ elsif t.seq_region_end > vf.seq_region_start and t.seq_region_end < vf.seq_region_end then
244
+ return "COMPLEX_INDEL"
245
+ end
246
+ return nil
247
+ end
248
+
249
+ def check_non_coding(vf,t)
250
+ if t.biotype == "miRNA" then
251
+ return (vf.seq_region_start == vf.seq_region_end) ? "WITHIN_MATURE_miRNA" : "COMPLEX_INDEL"
252
+ elsif t.biotype == "nonsense_mediated_decay"
253
+ return (vf.seq_region_start == vf.seq_region_end) ? "NMD_TRANSCRIPT" : "COMPLEX_INDEL"
254
+ else
255
+ return (vf.seq_region_start == vf.seq_region_end) ? "WITHIN_NON_CODING_GENE" : "COMPLEX_INDEL"
256
+ end
257
+ return nil
258
+ end
259
+
260
+ def check_utr(vf,t)
261
+ if vf.seq_region_start > t.seq_region_start and vf.seq_region_end < t.coding_region_genomic_start then
262
+ return (t.strand == 1) ? "5PRIME_UTR" : "3PRIME_UTR"
263
+ elsif vf.seq_region_start > t.coding_region_genomic_end and vf.seq_region_end < t.seq_region_end then
264
+ return (t.strand == 1) ? "3PRIME_UTR" : "5PRIME_UTR"
265
+ end
266
+ return nil
267
+ end
268
+
269
+ def check_splice_site(vf,t)
270
+ exon_up = t.exon_for_genomic_position(vf.seq_region_start)
271
+ exon_down = t.exon_for_genomic_position(vf.seq_region_end)
272
+ if exon_up.nil? and exon_down.nil? # we are inside an intron
273
+ # checking boundaries
274
+ near_exon_up_2bp = t.exon_for_genomic_position(vf.seq_region_start-2)
275
+ near_exon_down_2bp = t.exon_for_genomic_position(vf.seq_region_end+2)
276
+ near_exon_up_8bp = t.exon_for_genomic_position(vf.seq_region_start-8)
277
+ near_exon_down_8bp = t.exon_for_genomic_position(vf.seq_region_end+8)
278
+ if near_exon_up_2bp or near_exon_down_2bp then
279
+ return "ESSENTIAL_SPLICE_SITE"
280
+ elsif near_exon_up_8bp or near_exon_down_8bp then
281
+ return "SPLICE_SITE"
282
+ else
283
+ return "INTRONIC"
284
+ end
285
+ elsif exon_up and exon_down # the variation is inside an exon
286
+ # check if it is a splice site
287
+ if (vf.seq_region_start-exon_up.seq_region_start) <= 3 or (exon_down.seq_region_end-vf.seq_region_end) <= 3 then
288
+ return "SPLICE_SITE"
289
+ end
290
+ else # a complex indel spanning intron/exon boundary
291
+ return "COMPLEX_INDEL"
292
+ end
293
+ return nil
294
+ end
295
+
296
+ def check_aa_change(vf,t)
297
+ alleles = vf.allele_string.split('/') # get the different alleles for this variation
298
+ # if the variation is an InDel then it produces a frameshift
299
+ if vf.seq_region_start != vf.seq_region_end or alleles.include?("-") then
300
+ return "FRAMESHIFT_CODING",nil
301
+ end
302
+
303
+ # Find the position inside the CDS
304
+
305
+ mutation_position = t.genomic2cds(vf.seq_region_start)
306
+
307
+ mutation_base = Bio::Sequence::NA.new(alleles[1])
308
+ if t.seq_region_strand == -1
309
+ mutation_base.reverse_complement!
310
+ end
311
+ # The rank of the codon
312
+ target_codon = (mutation_position)/3 + 1
313
+ cds_sequence = nil
314
+ cds_sequence = t.cds_seq
315
+ mut_sequence = cds_sequence.dup
316
+ # Replace base with the variant allele
317
+ mut_sequence[mutation_position] = mutation_base.seq
318
+ refcodon = cds_sequence[(target_codon*3 -3)..(target_codon*3-1)]
319
+ mutcodon = mut_sequence[(target_codon*3 -3)..(target_codon*3-1)]
320
+ codontable = Bio::CodonTable[1]
321
+ refaa = codontable[refcodon]
322
+ mutaa = codontable[mutcodon.downcase]
323
+ if mutaa == nil
324
+ raise RuntimeError "Codon #{mutcodon.downcase} wasn't recognized."
325
+ end
326
+ pep_string = refaa+"/"+mutaa
327
+ if mutaa == "*" and refaa != "*"
328
+ return "STOP_GAINED",pep_string
329
+ elsif mutaa != "*" and refaa == "*"
330
+ return "STOP_LOST",pep_string
331
+ elsif mutaa != refaa
332
+ return "NON_SYNONYMOUS_CODING",pep_string
333
+ elsif mutaa == refaa
334
+ return "SYNONYMOUS_CODING",pep_string
335
+ end
336
+
337
+ end
338
+
339
+
126
340
  end # VariationFeature
127
341
 
128
- #= DESCRIPTION
129
342
  # The TranscriptVariation class gives information about the position of
130
343
  # a VariationFeature, mapped on an annotated transcript.
131
344
  #
@@ -133,31 +346,56 @@ module Ensembl
133
346
  # See the general documentation of the Ensembl module for
134
347
  # more information on what this means and what methods are available.
135
348
  #
136
- #= USAGE
137
- # vf = Variation.find_by_name('rs10111').variation_feature
138
- # vf.transcript_variations.each do |tv|
139
- # puts tv.peptide_allele_string, tv.transcript.stable_id
140
- # end
349
+ # @example
350
+ # vf = Variation.find_by_name('rs10111').variation_feature
351
+ # vf.transcript_variations.each do |tv|
352
+ # puts tv.peptide_allele_string, tv.transcript.stable_id
353
+ # end
141
354
  #
142
355
  class TranscriptVariation < DBConnection
143
356
  set_primary_key "transcript_variation_id"
144
357
  belongs_to :variation_feature
358
+ validates_inclusion_of :consequence_type, :in => ['ESSENTIAL_SPLICE_SITE',
359
+ 'STOP_GAINED',
360
+ 'STOP_LOST',
361
+ 'COMPLEX_INDEL',
362
+ 'FRAMESHIFT_CODING',
363
+ 'NON_SYNONYMOUS_CODING',
364
+ 'SPLICE_SITE',
365
+ 'PARTIAL_CODON',
366
+ 'SYNONYMOUS_CODING',
367
+ 'REGULATORY_REGION',
368
+ 'WITHIN_MATURE_miRNA',
369
+ '5PRIME_UTR',
370
+ '3PRIME_UTR',
371
+ 'INTRONIC',
372
+ 'NMD_TRANSCRIPT',
373
+ 'UPSTREAM',
374
+ 'DOWNSTREAM',
375
+ 'WITHIN_NON_CODING_GENE',
376
+ 'HGMD_MUTATION'
377
+ ], :message => "Consequence type not allowed!"
378
+
379
+ def consequence_type # workaround as ActiveRecord does not parse SET fields in MySQL
380
+ "#{attributes_before_type_cast['consequence_type']}"
381
+ end
145
382
 
146
383
  def transcript
147
- if !Ensembl::Core::DBConnection.connected? then
148
- host,user,password,db_name,port = Ensembl::Variation::DBConnection.get_info
149
- if db_name =~/(\w+_\w+)_\w+_(\d+)_\S+/ then
150
- species,release = $1,$2
151
- Ensembl::Core::DBConnection.connect(species,release.to_i,:username => user, :password => password,:host => host, :port => port)
152
- else
153
- raise NameError, "Can't get Core database name from #{db_name}. Pheraps you are using non conventional names"
154
- end
384
+ host,user,password,db_name,port,species,release = Ensembl::Variation::DBConnection.get_info
385
+ if !Ensembl::Core::DBConnection.connected? then
386
+ Ensembl::Core::DBConnection.connect(species,release.to_i,:username => user, :password => password,:host => host, :port => port)
387
+ end
388
+
389
+ begin # this changed from release 58
390
+ return Ensembl::Core::Transcript.find_by_stable_id(self.transcript_stable_id)
391
+ rescue NoMethodError
392
+ return Ensembl::Core::Transcript.find(self.transcript_id)
155
393
  end
156
- Ensembl::Core::Transcript.find(self.transcript_id)
394
+
157
395
  end
158
396
 
159
397
  end
160
398
 
161
399
  end
162
400
 
163
- end
401
+ end