ruby-ensembl-api 0.9.6 → 1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/TUTORIAL.rdoc +1 -1
- data/bin/variation_effect_predictor +106 -0
- data/lib/ensembl.rb +2 -2
- data/lib/ensembl/core/activerecord.rb +119 -225
- data/lib/ensembl/core/collection.rb +14 -10
- data/lib/ensembl/core/project.rb +6 -8
- data/lib/ensembl/core/slice.rb +87 -123
- data/lib/ensembl/core/transcript.rb +49 -65
- data/lib/ensembl/core/transform.rb +6 -8
- data/lib/ensembl/db_connection.rb +56 -72
- data/lib/ensembl/variation/activerecord.rb +138 -8
- data/lib/ensembl/variation/variation.rb +284 -46
- data/samples/ensembl_genomes_example.rb +60 -0
- data/samples/examples_perl_tutorial.rb +125 -0
- data/samples/small_example_ruby_api.rb +34 -0
- data/samples/variation_example.rb +67 -0
- data/test/unit/{release_56 → release_60}/core/test_gene.rb +6 -6
- data/test/unit/release_60/core/test_project_human.rb +38 -0
- data/test/unit/{release_56 → release_60}/core/test_slice.rb +1 -8
- data/test/unit/release_60/core/test_transcript.rb +126 -0
- data/test/unit/{release_53 → release_60}/core/test_transform.rb +21 -21
- data/test/unit/release_60/variation/test_activerecord.rb +213 -0
- data/test/unit/release_60/variation/test_consequence.rb +158 -0
- data/test/unit/{release_56 → release_60}/variation/test_variation.rb +18 -17
- data/test/unit/test_connection.rb +2 -2
- data/test/unit/test_releases.rb +8 -8
- metadata +27 -43
- data/test/unit/data/seq_c6qbl.fa +0 -10
- data/test/unit/data/seq_cso19_coding.fa +0 -16
- data/test/unit/data/seq_cso19_transcript.fa +0 -28
- data/test/unit/data/seq_drd3_gene.fa +0 -838
- data/test/unit/data/seq_drd3_transcript.fa +0 -22
- data/test/unit/data/seq_drd4_transcript.fa +0 -24
- data/test/unit/data/seq_forward_composite.fa +0 -1669
- data/test/unit/data/seq_par_boundary.fa +0 -169
- data/test/unit/data/seq_rnd3_transcript.fa +0 -47
- data/test/unit/data/seq_ub2r1_coding.fa +0 -13
- data/test/unit/data/seq_ub2r1_gene.fa +0 -174
- data/test/unit/data/seq_ub2r1_transcript.fa +0 -26
- data/test/unit/data/seq_y.fa +0 -2
- data/test/unit/ensembl_genomes/test_collection.rb +0 -51
- data/test/unit/ensembl_genomes/test_gene.rb +0 -52
- data/test/unit/ensembl_genomes/test_slice.rb +0 -71
- data/test/unit/ensembl_genomes/test_variation.rb +0 -17
- data/test/unit/release_50/core/test_project.rb +0 -215
- data/test/unit/release_50/core/test_project_human.rb +0 -58
- data/test/unit/release_50/core/test_relationships.rb +0 -66
- data/test/unit/release_50/core/test_sequence.rb +0 -175
- data/test/unit/release_50/core/test_slice.rb +0 -121
- data/test/unit/release_50/core/test_transcript.rb +0 -108
- data/test/unit/release_50/core/test_transform.rb +0 -223
- data/test/unit/release_50/variation/test_activerecord.rb +0 -143
- data/test/unit/release_50/variation/test_variation.rb +0 -84
- data/test/unit/release_53/core/test_gene.rb +0 -66
- data/test/unit/release_53/core/test_project.rb +0 -96
- data/test/unit/release_53/core/test_project_human.rb +0 -65
- data/test/unit/release_53/core/test_slice.rb +0 -47
- data/test/unit/release_53/variation/test_activerecord.rb +0 -145
- data/test/unit/release_53/variation/test_variation.rb +0 -71
- data/test/unit/release_56/core/test_project.rb +0 -96
- data/test/unit/release_56/core/test_transform.rb +0 -63
- data/test/unit/release_56/variation/test_activerecord.rb +0 -142
@@ -4,14 +4,13 @@
|
|
4
4
|
# Copyright:: Copyright (C) 2008 Francesco Strozzi <francesco.strozzi@gmail.com>
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
+
# @author Francesco Strozzi
|
7
8
|
|
8
9
|
nil
|
9
10
|
module Ensembl
|
10
|
-
# = DESCRIPTION
|
11
11
|
# The Ensembl::Variation module covers the variation databases from
|
12
12
|
# ensembldb.ensembl.org.
|
13
13
|
module Variation
|
14
|
-
# = DESCRIPTION
|
15
14
|
# The Allele class describes a single allele of a variation. In addition to
|
16
15
|
# the nucleotide(s) (or absence of) that represent the allele, frequency
|
17
16
|
# and population information may be present.
|
@@ -28,6 +27,7 @@ module Ensembl
|
|
28
27
|
belongs_to :sample
|
29
28
|
belongs_to :variation
|
30
29
|
belongs_to :population
|
30
|
+
belongs_to :subsnp_handle
|
31
31
|
end
|
32
32
|
|
33
33
|
# = DESCRIPTION
|
@@ -62,6 +62,71 @@ module Ensembl
|
|
62
62
|
belongs_to :allele_group
|
63
63
|
end
|
64
64
|
|
65
|
+
# = DESCRIPTION
|
66
|
+
# Stores information on attribute types
|
67
|
+
#
|
68
|
+
# This class uses ActiveRecord to access data in the Ensembl database.
|
69
|
+
# See the general documentation of the Ensembl module for
|
70
|
+
# more information on what this means and what methods are available.
|
71
|
+
class AttribType < DBConnection
|
72
|
+
set_primary_key "attrib_type_id"
|
73
|
+
end
|
74
|
+
|
75
|
+
|
76
|
+
# = DESCRIPTION
|
77
|
+
#
|
78
|
+
# This class uses ActiveRecord to access data in the Ensembl database.
|
79
|
+
# See the general documentation of the Ensembl module for
|
80
|
+
# more information on what this means and what methods are available.
|
81
|
+
class ConsequenceMapping < DBConnection
|
82
|
+
|
83
|
+
end
|
84
|
+
|
85
|
+
# = DESCRIPTION
|
86
|
+
#
|
87
|
+
# This class uses ActiveRecord to access data in the Ensembl database.
|
88
|
+
# See the general documentation of the Ensembl module for
|
89
|
+
# more information on what this means and what methods are available.
|
90
|
+
class FailedDescription < DBConnection
|
91
|
+
set_primary_key "failed_description_id"
|
92
|
+
has_many :failed_variations
|
93
|
+
end
|
94
|
+
|
95
|
+
# = DESCRIPTION
|
96
|
+
#
|
97
|
+
# This class uses ActiveRecord to access data in the Ensembl database.
|
98
|
+
# See the general documentation of the Ensembl module for
|
99
|
+
# more information on what this means and what methods are available.
|
100
|
+
class FailedVariation < DBConnection
|
101
|
+
set_primary_key "failed_variation_id"
|
102
|
+
belongs_to :failed_description
|
103
|
+
belongs_to :variation
|
104
|
+
end
|
105
|
+
|
106
|
+
# = DESCRIPTION
|
107
|
+
#
|
108
|
+
# This class uses ActiveRecord to access data in the Ensembl database.
|
109
|
+
# See the general documentation of the Ensembl module for
|
110
|
+
# more information on what this means and what methods are available.
|
111
|
+
class FeatureType < DBConnection
|
112
|
+
set_primary_key "feature_type_id"
|
113
|
+
end
|
114
|
+
|
115
|
+
class Meta < DBConnection
|
116
|
+
set_primary_key "meta_id"
|
117
|
+
end
|
118
|
+
|
119
|
+
class MetaCoord < DBConnection
|
120
|
+
|
121
|
+
end
|
122
|
+
|
123
|
+
class Phenotype < DBConnection
|
124
|
+
set_primary_key "phenotype_id"
|
125
|
+
has_many :variation_annotations
|
126
|
+
end
|
127
|
+
|
128
|
+
|
129
|
+
|
65
130
|
# = DESCRIPTION
|
66
131
|
# The Sample class gives information about the biological samples stored in the database.
|
67
132
|
#
|
@@ -79,7 +144,6 @@ module Ensembl
|
|
79
144
|
has_many :tagged_variation_features
|
80
145
|
end
|
81
146
|
|
82
|
-
# = DESCRIPTION
|
83
147
|
# The IndividualPopulation class is used to connect Individual and Population classes.
|
84
148
|
# Should not be used directly.
|
85
149
|
#
|
@@ -87,8 +151,8 @@ module Ensembl
|
|
87
151
|
# See the general documentation of the Ensembl module for
|
88
152
|
# more information on what this means and what methods are available.
|
89
153
|
class IndividualPopulation < DBConnection
|
90
|
-
belongs_to :individual
|
91
|
-
belongs_to :population
|
154
|
+
belongs_to :individual, :foreign_key => "individual_sample_id"
|
155
|
+
belongs_to :population, :foreign_key => "population_sample_id"
|
92
156
|
end
|
93
157
|
|
94
158
|
# = DESCRIPTION
|
@@ -99,17 +163,27 @@ module Ensembl
|
|
99
163
|
# See the general documentation of the Ensembl module for
|
100
164
|
# more information on what this means and what methods are available.
|
101
165
|
class Individual < DBConnection
|
166
|
+
set_primary_key "sample_id"
|
102
167
|
belongs_to :sample
|
103
|
-
|
168
|
+
has_one :individual_type
|
169
|
+
has_many :individual_populations, :foreign_key => "individual_sample_id"
|
170
|
+
has_many :populations, :through => :individual_populations
|
104
171
|
end
|
105
172
|
|
106
173
|
class IndividualGenotypeMultipleBp < DBConnection
|
107
174
|
belongs_to :sample
|
108
175
|
belongs_to :variation
|
176
|
+
belongs_to :subsnp_handle
|
109
177
|
end
|
178
|
+
|
179
|
+
class IndividualType < DBConnection
|
180
|
+
set_primary_key "individual_type_id"
|
181
|
+
belongs_to :individual
|
182
|
+
end
|
183
|
+
|
110
184
|
|
111
185
|
class CompressedGenotypeSingleBp < DBConnection
|
112
|
-
belongs_to :
|
186
|
+
belongs_to :population_genotype, :foreign_key => "sample_id"
|
113
187
|
end
|
114
188
|
|
115
189
|
class ReadCoverage < DBConnection
|
@@ -118,10 +192,19 @@ module Ensembl
|
|
118
192
|
|
119
193
|
class Population < DBConnection
|
120
194
|
belongs_to :sample
|
195
|
+
set_primary_key "sample_id"
|
196
|
+
has_many :population_genotypes, :foreign_key => "sample_id"
|
197
|
+
has_many :individual_populations, :foreign_key => "population_sample_id"
|
198
|
+
has_many :individuals, :through => :individual_populations
|
199
|
+
has_many :sample_synonyms
|
200
|
+
has_one :population_structure
|
201
|
+
has_many :tagged_variation_features
|
202
|
+
has_many :alleles
|
203
|
+
has_many :allele_groups
|
121
204
|
end
|
122
205
|
|
123
206
|
class PopulationStructure < DBConnection
|
124
|
-
|
207
|
+
|
125
208
|
end
|
126
209
|
|
127
210
|
# = DESCRIPTION
|
@@ -135,6 +218,8 @@ module Ensembl
|
|
135
218
|
set_primary_key "population_genotype_id"
|
136
219
|
belongs_to :variation
|
137
220
|
belongs_to :population
|
221
|
+
belongs_to :subsnp_handle
|
222
|
+
has_many :compressed_genotype_single_bps, :foreign_key => "sample_id"
|
138
223
|
end
|
139
224
|
|
140
225
|
# = DESCRIPTION
|
@@ -166,6 +251,41 @@ module Ensembl
|
|
166
251
|
has_many :variation_groups
|
167
252
|
has_many :httags
|
168
253
|
has_many :variation_synonyms
|
254
|
+
has_many :variation_annotations
|
255
|
+
has_many :structural_variations
|
256
|
+
end
|
257
|
+
|
258
|
+
class StructuralVariation < DBConnection
|
259
|
+
set_primary_key "structural_variation_id"
|
260
|
+
belongs_to :source
|
261
|
+
belongs_to :seq_region
|
262
|
+
|
263
|
+
class << self # Workaround for 'class' field, otherwise it creates a mess for AR
|
264
|
+
def instance_method_already_implemented?(method_name)
|
265
|
+
return true if method_name == 'class'
|
266
|
+
super
|
267
|
+
end
|
268
|
+
end
|
269
|
+
|
270
|
+
def sv_class
|
271
|
+
self.attributes["class"]
|
272
|
+
end
|
273
|
+
|
274
|
+
end
|
275
|
+
|
276
|
+
|
277
|
+
class SeqRegion < DBConnection
|
278
|
+
set_primary_key "seq_region_id"
|
279
|
+
has_many :variation_features
|
280
|
+
has_many :structural_variations
|
281
|
+
end
|
282
|
+
|
283
|
+
class SubsnpHandle < DBConnection
|
284
|
+
set_primary_key "subsnp_id"
|
285
|
+
has_many :individual_genotype_multiple_bps, :foreign_key => "subsnp_id"
|
286
|
+
has_many :population_genotypes, :foreign_key => "subsnp_id"
|
287
|
+
has_many :alleles, :foreign_key => "subsnp_id"
|
288
|
+
has_many :variation_synonyms,:foreign_key => "subsnp_id"
|
169
289
|
end
|
170
290
|
|
171
291
|
# = DESCRIPTION
|
@@ -179,6 +299,7 @@ module Ensembl
|
|
179
299
|
set_primary_key "variation_synonym_id"
|
180
300
|
belongs_to :variation
|
181
301
|
belongs_to :source
|
302
|
+
belongs_to :subsnp_handle
|
182
303
|
end
|
183
304
|
|
184
305
|
# = DESCRIPTION
|
@@ -221,6 +342,14 @@ module Ensembl
|
|
221
342
|
belongs_to :variation_group
|
222
343
|
end
|
223
344
|
|
345
|
+
class VariationAnnotation < DBConnection
|
346
|
+
set_primary_key "variation_annotation_id"
|
347
|
+
belongs_to :variation
|
348
|
+
belongs_to :phenotype
|
349
|
+
belongs_to :source
|
350
|
+
end
|
351
|
+
|
352
|
+
|
224
353
|
# = DESCRIPTION
|
225
354
|
# The FlankingSequence class gives information about the genomic coordinates
|
226
355
|
# of the flanking sequences, for a single VariationFeature.
|
@@ -249,5 +378,6 @@ module Ensembl
|
|
249
378
|
belongs_to :variation_group
|
250
379
|
belongs_to :source
|
251
380
|
end
|
381
|
+
|
252
382
|
end
|
253
383
|
end
|
@@ -4,13 +4,13 @@
|
|
4
4
|
# Copyright:: Copyright (C) 2008 Francesco Strozzi <francesco.strozzi@gmail.com>
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
+
# @author Francesco Strozzi
|
8
|
+
|
7
9
|
|
8
|
-
nil
|
9
10
|
module Ensembl
|
10
11
|
|
11
12
|
module Variation
|
12
13
|
|
13
|
-
# = DESCRIPTION
|
14
14
|
# The Variation class represents single nucleotide polymorphisms (SNPs) or variations
|
15
15
|
# and provides information like the names (IDs), the validation status and
|
16
16
|
# the allele information.
|
@@ -23,16 +23,16 @@ module Ensembl
|
|
23
23
|
# See the general documentation of the Ensembl module for
|
24
24
|
# more information on what this means and what methods are available.
|
25
25
|
#
|
26
|
-
|
27
|
-
#
|
28
|
-
#
|
29
|
-
#
|
30
|
-
#
|
26
|
+
# @example
|
27
|
+
# v = Variation.find_by_name('rs10111')
|
28
|
+
# v.alleles.each do |a|
|
29
|
+
# puts a.allele, a.frequency
|
30
|
+
# end
|
31
31
|
#
|
32
|
-
#
|
33
|
-
#
|
34
|
-
#
|
35
|
-
#
|
32
|
+
# variations = Variation.fetch_all_by_source('dbSNP') # many records
|
33
|
+
# variations.each do |v|
|
34
|
+
# puts v.name
|
35
|
+
# end
|
36
36
|
#
|
37
37
|
class Variation < DBConnection
|
38
38
|
set_primary_key "variation_id"
|
@@ -47,6 +47,8 @@ module Ensembl
|
|
47
47
|
has_many :variation_group_variations
|
48
48
|
has_many :variation_groups, :through => :variation_group_variations
|
49
49
|
has_many :individual_genotype_multiple_bps
|
50
|
+
has_many :failed_variations
|
51
|
+
has_many :failed_descriptions, :through => :failed_variations
|
50
52
|
|
51
53
|
def self.fetch_all_by_source(source)
|
52
54
|
variations = Source.find_by_name(source).variations
|
@@ -54,7 +56,6 @@ module Ensembl
|
|
54
56
|
end
|
55
57
|
|
56
58
|
|
57
|
-
# = DESCRIPTION
|
58
59
|
# The VariationFeature class gives information about the genomic position of
|
59
60
|
# each Variation, including also validation status and consequence type.
|
60
61
|
#
|
@@ -62,30 +63,56 @@ module Ensembl
|
|
62
63
|
# See the general documentation of the Ensembl module for
|
63
64
|
# more information on what this means and what methods are available.
|
64
65
|
#
|
65
|
-
|
66
|
-
#
|
67
|
-
#
|
68
|
-
#
|
69
|
-
#
|
70
|
-
#
|
71
|
-
#
|
72
|
-
#
|
73
|
-
#
|
74
|
-
#
|
75
|
-
#
|
66
|
+
# @example
|
67
|
+
# # SLOWER QUERY
|
68
|
+
# vf = VariationFeature.find_by_variation_name('rs10111')
|
69
|
+
# # FASTER QUERY
|
70
|
+
# vf = Variation.find_by_name('rs10111').variation_feature
|
71
|
+
#
|
72
|
+
# puts vf.seq_region_start, vf.seq_region_end, vf.allele_string
|
73
|
+
# puts vf.variation.ancestral_allele
|
74
|
+
# genomic_region = vf.fetch_region (returns an Ensembl::Core::Slice)
|
75
|
+
# genomic_region.genes
|
76
|
+
# up_region,down_region = vf.flanking_seq (returns two Ensembl::Core::Slice)
|
76
77
|
#
|
77
78
|
class VariationFeature < DBConnection
|
78
79
|
set_primary_key "variation_feature_id"
|
79
80
|
belongs_to :variation
|
80
81
|
has_many :tagged_variation_features
|
81
82
|
has_many :samples, :through => :tagged_variation_features
|
82
|
-
|
83
|
+
belongs_to :seq_region
|
84
|
+
validates_inclusion_of :consequence_type, :in => ['ESSENTIAL_SPLICE_SITE',
|
85
|
+
'STOP_GAINED',
|
86
|
+
'STOP_LOST',
|
87
|
+
'COMPLEX_INDEL',
|
88
|
+
'FRAMESHIFT_CODING',
|
89
|
+
'NON_SYNONYMOUS_CODING',
|
90
|
+
'SPLICE_SITE',
|
91
|
+
'PARTIAL_CODON',
|
92
|
+
'SYNONYMOUS_CODING',
|
93
|
+
'REGULATORY_REGION',
|
94
|
+
'WITHIN_MATURE_miRNA',
|
95
|
+
'5PRIME_UTR',
|
96
|
+
'3PRIME_UTR',
|
97
|
+
'INTRONIC',
|
98
|
+
'NMD_TRANSCRIPT',
|
99
|
+
'UPSTREAM',
|
100
|
+
'DOWNSTREAM',
|
101
|
+
'WITHIN_NON_CODING_GENE',
|
102
|
+
'HGMD_MUTATION'
|
103
|
+
], :message => "Consequence type not allowed!"
|
83
104
|
|
105
|
+
def consequence_type # workaround as ActiveRecord do not parse SET field in MySQL
|
106
|
+
"#{attributes_before_type_cast['consequence_type']}"
|
107
|
+
end
|
84
108
|
|
85
|
-
#=DESCRIPTION
|
86
109
|
# Based on Perl API 'get_all_Genes' method for Variation class. Get a genomic region
|
87
110
|
# starting from the Variation coordinates, expanding the region upstream and
|
88
|
-
# downstream.
|
111
|
+
# downstream.
|
112
|
+
#
|
113
|
+
# @param [Integer] up Length of upstream flanking region
|
114
|
+
# @param [Integer] down Length of downstream flanking region
|
115
|
+
# @return [Slice] Slice object containing the variation
|
89
116
|
def fetch_region(up = 5000, down = 5000)
|
90
117
|
sr = core_connection(self.seq_region_id)
|
91
118
|
slice = Ensembl::Core::Slice.fetch_by_region(Ensembl::Core::CoordSystem.find(sr.coord_system_id).name,sr.name,self.seq_region_start-up,self.seq_region_end+down)
|
@@ -100,15 +127,24 @@ module Ensembl
|
|
100
127
|
return slice_up,slice_down
|
101
128
|
end
|
102
129
|
|
130
|
+
def transcript_variations
|
131
|
+
tvs = TranscriptVariation.find_all_by_variation_feature_id(self.variation_feature_id)
|
132
|
+
if tvs[0].nil? then # the variation is not stored in the database, so run the calculations
|
133
|
+
sr = core_connection(self.seq_region_id)
|
134
|
+
return custom_transcript_variation(self,sr)
|
135
|
+
else
|
136
|
+
return tvs # the variation is already present in the database
|
137
|
+
end
|
138
|
+
end
|
139
|
+
|
103
140
|
private
|
104
141
|
|
105
142
|
def core_connection(seq_region_id)
|
106
143
|
if !Ensembl::Core::DBConnection.connected? then
|
107
|
-
host,user,password,db_name,port = Ensembl::Variation::DBConnection.get_info
|
108
|
-
|
109
|
-
species,release = $1,$2
|
144
|
+
host,user,password,db_name,port,species,release = Ensembl::Variation::DBConnection.get_info
|
145
|
+
begin
|
110
146
|
Ensembl::Core::DBConnection.connect(species,release.to_i,:username => user, :password => password,:host => host, :port => port)
|
111
|
-
|
147
|
+
rescue
|
112
148
|
raise NameError, "Can't derive Core database name from #{db_name}. Are you using non conventional names?"
|
113
149
|
end
|
114
150
|
end
|
@@ -123,9 +159,186 @@ module Ensembl
|
|
123
159
|
return seq_region
|
124
160
|
end
|
125
161
|
|
162
|
+
# Calculate a consequence type for a user-defined variation
|
163
|
+
def custom_transcript_variation(vf,sr)
|
164
|
+
|
165
|
+
@variation_name = vf.variation_name
|
166
|
+
@seq_region = sr
|
167
|
+
|
168
|
+
downstream = 5000
|
169
|
+
upstream = 5000
|
170
|
+
tvs = [] # store all the calculated TranscriptVariations
|
171
|
+
# retrieve the slice of the genomic region where the variation is located
|
172
|
+
region = Ensembl::Core::Slice.fetch_by_region(Ensembl::Core::CoordSystem.find(sr.coord_system_id).name,sr.name,vf.seq_region_start-upstream,vf.seq_region_end+downstream-1)
|
173
|
+
# iterate through all the transcripts present in the region
|
174
|
+
genes = region.genes(inclusive = true)
|
175
|
+
if genes[0] != nil
|
176
|
+
genes.each do |g|
|
177
|
+
g.transcripts.each do |t|
|
178
|
+
tv = TranscriptVariation.new() # create a new TranscriptVariation object for every transcript present
|
179
|
+
# do the calculations
|
180
|
+
|
181
|
+
# check if the variation is intergenic for this transcript (no effects)
|
182
|
+
tv.consequence_type = check_intergenic(vf,t)
|
183
|
+
|
184
|
+
# check if the variation is upstream or downstream of the transcript
|
185
|
+
tv.consequence_type = check_upstream_downstream(vf,t) if tv.consequence_type == ""
|
186
|
+
|
187
|
+
# if no consequence type is found, then the variation is inside the transcript
|
188
|
+
# check for non coding gene
|
189
|
+
tv.consequence_type = check_non_coding(vf,t) if tv.consequence_type == "" and t.biotype != 'protein_coding'
|
190
|
+
|
191
|
+
# if no consequence type is found, then check intron / exon boundaries
|
192
|
+
tv.consequence_type = check_splice_site(vf,t) if tv.consequence_type == ""
|
193
|
+
|
194
|
+
# if no consequence type is found, check if the variation is inside UTRs
|
195
|
+
tv.consequence_type = check_utr(vf,t) if tv.consequence_type == ""
|
196
|
+
|
197
|
+
# if no consequence type is found, then variation is inside an exon.
|
198
|
+
# Check the codon change
|
199
|
+
(tv.consequence_type,tv.peptide_allele_string) = check_aa_change(vf,t) if tv.consequence_type == ""
|
200
|
+
|
201
|
+
|
202
|
+
begin # this changed from release 58
|
203
|
+
tv.transcript_stable_id = t.stable_id
|
204
|
+
rescue NoMethodError
|
205
|
+
tv.transcript_id = t.id
|
206
|
+
end
|
207
|
+
|
208
|
+
tv.consequence_type = "INTERGENIC" if tv.consequence_type == ""
|
209
|
+
tvs << tv
|
210
|
+
end
|
211
|
+
end
|
212
|
+
end
|
213
|
+
# if there are no transcripts/genes within 5000 bases upstream and downstream set the variation as INTERGENIC (no effects)
|
214
|
+
if tvs.size == 0 then
|
215
|
+
tv = TranscriptVariation.new()
|
216
|
+
tv.consequence_type = "INTERGENIC"
|
217
|
+
tvs << tv
|
218
|
+
end
|
219
|
+
|
220
|
+
return tvs
|
221
|
+
end
|
222
|
+
|
223
|
+
## CONSEQUENCE CALCULATION FUNCTIONS ##
|
224
|
+
|
225
|
+
def check_intergenic(vf,t)
|
226
|
+
if vf.seq_region_end < t.seq_region_start and ((t.seq_region_start - vf.seq_region_end) +1) > 5000 then
|
227
|
+
return "INTERGENIC"
|
228
|
+
elsif vf.seq_region_start > t.seq_region_end and ((vf.seq_region_start - t.seq_region_end) +1) > 5000 then
|
229
|
+
return "INTERGENIC"
|
230
|
+
end
|
231
|
+
return nil
|
232
|
+
end
|
233
|
+
|
234
|
+
def check_upstream_downstream(vf,t)
|
235
|
+
if vf.seq_region_end < t.seq_region_start and ((t.seq_region_start - vf.seq_region_end) +1) <= 5000 then
|
236
|
+
return (t.strand == 1) ? "UPSTREAM" : "DOWNSTREAM"
|
237
|
+
elsif vf.seq_region_start > t.seq_region_end and ((vf.seq_region_start - t.seq_region_end)+1) <= 5000 then
|
238
|
+
return (t.strand == 1) ? "DOWNSTREAM" : "UPSTREAM"
|
239
|
+
|
240
|
+
# check if it's an InDel and if it overlaps the transcript start / end
|
241
|
+
elsif t.seq_region_start > vf.seq_region_start and t.seq_region_start < vf.seq_region_end then
|
242
|
+
return "COMPLEX_INDEL"
|
243
|
+
elsif t.seq_region_end > vf.seq_region_start and t.seq_region_end < vf.seq_region_end then
|
244
|
+
return "COMPLEX_INDEL"
|
245
|
+
end
|
246
|
+
return nil
|
247
|
+
end
|
248
|
+
|
249
|
+
def check_non_coding(vf,t)
|
250
|
+
if t.biotype == "miRNA" then
|
251
|
+
return (vf.seq_region_start == vf.seq_region_end) ? "WITHIN_MATURE_miRNA" : "COMPLEX_INDEL"
|
252
|
+
elsif t.biotype == "nonsense_mediated_decay"
|
253
|
+
return (vf.seq_region_start == vf.seq_region_end) ? "NMD_TRANSCRIPT" : "COMPLEX_INDEL"
|
254
|
+
else
|
255
|
+
return (vf.seq_region_start == vf.seq_region_end) ? "WITHIN_NON_CODING_GENE" : "COMPLEX_INDEL"
|
256
|
+
end
|
257
|
+
return nil
|
258
|
+
end
|
259
|
+
|
260
|
+
def check_utr(vf,t)
|
261
|
+
if vf.seq_region_start > t.seq_region_start and vf.seq_region_end < t.coding_region_genomic_start then
|
262
|
+
return (t.strand == 1) ? "5PRIME_UTR" : "3PRIME_UTR"
|
263
|
+
elsif vf.seq_region_start > t.coding_region_genomic_end and vf.seq_region_end < t.seq_region_end then
|
264
|
+
return (t.strand == 1) ? "3PRIME_UTR" : "5PRIME_UTR"
|
265
|
+
end
|
266
|
+
return nil
|
267
|
+
end
|
268
|
+
|
269
|
+
def check_splice_site(vf,t)
|
270
|
+
exon_up = t.exon_for_genomic_position(vf.seq_region_start)
|
271
|
+
exon_down = t.exon_for_genomic_position(vf.seq_region_end)
|
272
|
+
if exon_up.nil? and exon_down.nil? # we are inside an intron
|
273
|
+
# checking boundaries
|
274
|
+
near_exon_up_2bp = t.exon_for_genomic_position(vf.seq_region_start-2)
|
275
|
+
near_exon_down_2bp = t.exon_for_genomic_position(vf.seq_region_end+2)
|
276
|
+
near_exon_up_8bp = t.exon_for_genomic_position(vf.seq_region_start-8)
|
277
|
+
near_exon_down_8bp = t.exon_for_genomic_position(vf.seq_region_end+8)
|
278
|
+
if near_exon_up_2bp or near_exon_down_2bp then
|
279
|
+
return "ESSENTIAL_SPLICE_SITE"
|
280
|
+
elsif near_exon_up_8bp or near_exon_down_8bp then
|
281
|
+
return "SPLICE_SITE"
|
282
|
+
else
|
283
|
+
return "INTRONIC"
|
284
|
+
end
|
285
|
+
elsif exon_up and exon_down # the variation is inside an exon
|
286
|
+
# check if it is a splice site
|
287
|
+
if (vf.seq_region_start-exon_up.seq_region_start) <= 3 or (exon_down.seq_region_end-vf.seq_region_end) <= 3 then
|
288
|
+
return "SPLICE_SITE"
|
289
|
+
end
|
290
|
+
else # a complex indel spanning intron/exon boundary
|
291
|
+
return "COMPLEX_INDEL"
|
292
|
+
end
|
293
|
+
return nil
|
294
|
+
end
|
295
|
+
|
296
|
+
def check_aa_change(vf,t)
|
297
|
+
alleles = vf.allele_string.split('/') # get the different alleles for this variation
|
298
|
+
# if the variation is an InDel then it produces a frameshift
|
299
|
+
if vf.seq_region_start != vf.seq_region_end or alleles.include?("-") then
|
300
|
+
return "FRAMESHIFT_CODING",nil
|
301
|
+
end
|
302
|
+
|
303
|
+
# Find the position inside the CDS
|
304
|
+
|
305
|
+
mutation_position = t.genomic2cds(vf.seq_region_start)
|
306
|
+
|
307
|
+
mutation_base = Bio::Sequence::NA.new(alleles[1])
|
308
|
+
if t.seq_region_strand == -1
|
309
|
+
mutation_base.reverse_complement!
|
310
|
+
end
|
311
|
+
# The rank of the codon
|
312
|
+
target_codon = (mutation_position)/3 + 1
|
313
|
+
cds_sequence = nil
|
314
|
+
cds_sequence = t.cds_seq
|
315
|
+
mut_sequence = cds_sequence.dup
|
316
|
+
# Replace base with the variant allele
|
317
|
+
mut_sequence[mutation_position] = mutation_base.seq
|
318
|
+
refcodon = cds_sequence[(target_codon*3 -3)..(target_codon*3-1)]
|
319
|
+
mutcodon = mut_sequence[(target_codon*3 -3)..(target_codon*3-1)]
|
320
|
+
codontable = Bio::CodonTable[1]
|
321
|
+
refaa = codontable[refcodon]
|
322
|
+
mutaa = codontable[mutcodon.downcase]
|
323
|
+
if mutaa == nil
|
324
|
+
raise RuntimeError, "Codon #{mutcodon.downcase} wasn't recognized." # comma needed: without it Ruby calls a method named RuntimeError
|
325
|
+
end
|
326
|
+
pep_string = refaa+"/"+mutaa
|
327
|
+
if mutaa == "*" and refaa != "*"
|
328
|
+
return "STOP_GAINED",pep_string
|
329
|
+
elsif mutaa != "*" and refaa == "*"
|
330
|
+
return "STOP_LOST",pep_string
|
331
|
+
elsif mutaa != refaa
|
332
|
+
return "NON_SYNONYMOUS_CODING",pep_string
|
333
|
+
elsif mutaa == refaa
|
334
|
+
return "SYNONYMOUS_CODING",pep_string
|
335
|
+
end
|
336
|
+
|
337
|
+
end
|
338
|
+
|
339
|
+
|
126
340
|
end # VariationFeature
|
127
341
|
|
128
|
-
#= DESCRIPTION
|
129
342
|
# The TranscriptVariation class gives information about the position of
|
130
343
|
# a VariationFeature, mapped on an annotated transcript.
|
131
344
|
#
|
@@ -133,31 +346,56 @@ module Ensembl
|
|
133
346
|
# See the general documentation of the Ensembl module for
|
134
347
|
# more information on what this means and what methods are available.
|
135
348
|
#
|
136
|
-
|
137
|
-
#
|
138
|
-
#
|
139
|
-
#
|
140
|
-
#
|
349
|
+
# @example
|
350
|
+
# vf = Variation.find_by_name('rs10111').variation_feature
|
351
|
+
# vf.transcript_variations.each do |tv|
|
352
|
+
# puts tv.peptide_allele_string, tv.transcript.stable_id
|
353
|
+
# end
|
141
354
|
#
|
142
355
|
class TranscriptVariation < DBConnection
|
143
356
|
set_primary_key "transcript_variation_id"
|
144
357
|
belongs_to :variation_feature
|
358
|
+
validates_inclusion_of :consequence_type, :in => ['ESSENTIAL_SPLICE_SITE',
|
359
|
+
'STOP_GAINED',
|
360
|
+
'STOP_LOST',
|
361
|
+
'COMPLEX_INDEL',
|
362
|
+
'FRAMESHIFT_CODING',
|
363
|
+
'NON_SYNONYMOUS_CODING',
|
364
|
+
'SPLICE_SITE',
|
365
|
+
'PARTIAL_CODON',
|
366
|
+
'SYNONYMOUS_CODING',
|
367
|
+
'REGULATORY_REGION',
|
368
|
+
'WITHIN_MATURE_miRNA',
|
369
|
+
'5PRIME_UTR',
|
370
|
+
'3PRIME_UTR',
|
371
|
+
'INTRONIC',
|
372
|
+
'NMD_TRANSCRIPT',
|
373
|
+
'UPSTREAM',
|
374
|
+
'DOWNSTREAM',
|
375
|
+
'WITHIN_NON_CODING_GENE',
|
376
|
+
'HGMD_MUTATION'
|
377
|
+
], :message => "Consequence type not allowed!"
|
378
|
+
|
379
|
+
def consequence_type # workaround as ActiveRecord do not parse SET field in MySQL
|
380
|
+
"#{attributes_before_type_cast['consequence_type']}"
|
381
|
+
end
|
145
382
|
|
146
383
|
def transcript
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
384
|
+
host,user,password,db_name,port,species,release = Ensembl::Variation::DBConnection.get_info
|
385
|
+
if !Ensembl::Core::DBConnection.connected? then
|
386
|
+
Ensembl::Core::DBConnection.connect(species,release.to_i,:username => user, :password => password,:host => host, :port => port)
|
387
|
+
end
|
388
|
+
|
389
|
+
begin # this changed from release 58
|
390
|
+
return Ensembl::Core::Transcript.find_by_stable_id(self.transcript_stable_id)
|
391
|
+
rescue NoMethodError
|
392
|
+
return Ensembl::Core::Transcript.find(self.transcript_id)
|
155
393
|
end
|
156
|
-
|
394
|
+
|
157
395
|
end
|
158
396
|
|
159
397
|
end
|
160
398
|
|
161
399
|
end
|
162
400
|
|
163
|
-
end
|
401
|
+
end
|