ruby-ensembl-api 0.9.6 → 1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/TUTORIAL.rdoc +1 -1
- data/bin/variation_effect_predictor +106 -0
- data/lib/ensembl.rb +2 -2
- data/lib/ensembl/core/activerecord.rb +119 -225
- data/lib/ensembl/core/collection.rb +14 -10
- data/lib/ensembl/core/project.rb +6 -8
- data/lib/ensembl/core/slice.rb +87 -123
- data/lib/ensembl/core/transcript.rb +49 -65
- data/lib/ensembl/core/transform.rb +6 -8
- data/lib/ensembl/db_connection.rb +56 -72
- data/lib/ensembl/variation/activerecord.rb +138 -8
- data/lib/ensembl/variation/variation.rb +284 -46
- data/samples/ensembl_genomes_example.rb +60 -0
- data/samples/examples_perl_tutorial.rb +125 -0
- data/samples/small_example_ruby_api.rb +34 -0
- data/samples/variation_example.rb +67 -0
- data/test/unit/{release_56 → release_60}/core/test_gene.rb +6 -6
- data/test/unit/release_60/core/test_project_human.rb +38 -0
- data/test/unit/{release_56 → release_60}/core/test_slice.rb +1 -8
- data/test/unit/release_60/core/test_transcript.rb +126 -0
- data/test/unit/{release_53 → release_60}/core/test_transform.rb +21 -21
- data/test/unit/release_60/variation/test_activerecord.rb +213 -0
- data/test/unit/release_60/variation/test_consequence.rb +158 -0
- data/test/unit/{release_56 → release_60}/variation/test_variation.rb +18 -17
- data/test/unit/test_connection.rb +2 -2
- data/test/unit/test_releases.rb +8 -8
- metadata +27 -43
- data/test/unit/data/seq_c6qbl.fa +0 -10
- data/test/unit/data/seq_cso19_coding.fa +0 -16
- data/test/unit/data/seq_cso19_transcript.fa +0 -28
- data/test/unit/data/seq_drd3_gene.fa +0 -838
- data/test/unit/data/seq_drd3_transcript.fa +0 -22
- data/test/unit/data/seq_drd4_transcript.fa +0 -24
- data/test/unit/data/seq_forward_composite.fa +0 -1669
- data/test/unit/data/seq_par_boundary.fa +0 -169
- data/test/unit/data/seq_rnd3_transcript.fa +0 -47
- data/test/unit/data/seq_ub2r1_coding.fa +0 -13
- data/test/unit/data/seq_ub2r1_gene.fa +0 -174
- data/test/unit/data/seq_ub2r1_transcript.fa +0 -26
- data/test/unit/data/seq_y.fa +0 -2
- data/test/unit/ensembl_genomes/test_collection.rb +0 -51
- data/test/unit/ensembl_genomes/test_gene.rb +0 -52
- data/test/unit/ensembl_genomes/test_slice.rb +0 -71
- data/test/unit/ensembl_genomes/test_variation.rb +0 -17
- data/test/unit/release_50/core/test_project.rb +0 -215
- data/test/unit/release_50/core/test_project_human.rb +0 -58
- data/test/unit/release_50/core/test_relationships.rb +0 -66
- data/test/unit/release_50/core/test_sequence.rb +0 -175
- data/test/unit/release_50/core/test_slice.rb +0 -121
- data/test/unit/release_50/core/test_transcript.rb +0 -108
- data/test/unit/release_50/core/test_transform.rb +0 -223
- data/test/unit/release_50/variation/test_activerecord.rb +0 -143
- data/test/unit/release_50/variation/test_variation.rb +0 -84
- data/test/unit/release_53/core/test_gene.rb +0 -66
- data/test/unit/release_53/core/test_project.rb +0 -96
- data/test/unit/release_53/core/test_project_human.rb +0 -65
- data/test/unit/release_53/core/test_slice.rb +0 -47
- data/test/unit/release_53/variation/test_activerecord.rb +0 -145
- data/test/unit/release_53/variation/test_variation.rb +0 -71
- data/test/unit/release_56/core/test_project.rb +0 -96
- data/test/unit/release_56/core/test_transform.rb +0 -63
- data/test/unit/release_56/variation/test_activerecord.rb +0 -142
@@ -4,14 +4,13 @@
|
|
4
4
|
# Copyright:: Copyright (C) 2008 Francesco Strozzi <francesco.strozzi@gmail.com>
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
+
# @author Francesco Strozzi
|
7
8
|
|
8
9
|
nil
|
9
10
|
module Ensembl
|
10
|
-
# = DESCRIPTION
|
11
11
|
# The Ensembl::Variation module covers the variation databases from
|
12
12
|
# ensembldb.ensembl.org.
|
13
13
|
module Variation
|
14
|
-
# = DESCRIPTION
|
15
14
|
# The Allele class describes a single allele of a variation. In addition to
|
16
15
|
# the nucleotide(s) (or absence of) representing the allele, frequency
|
17
16
|
# and population information may be present.
|
@@ -28,6 +27,7 @@ module Ensembl
|
|
28
27
|
belongs_to :sample
|
29
28
|
belongs_to :variation
|
30
29
|
belongs_to :population
|
30
|
+
belongs_to :subsnp_handle
|
31
31
|
end
|
32
32
|
|
33
33
|
# = DESCRIPTION
|
@@ -62,6 +62,71 @@ module Ensembl
|
|
62
62
|
belongs_to :allele_group
|
63
63
|
end
|
64
64
|
|
65
|
+
# = DESCRIPTION
|
66
|
+
# Stores information on attribute types.
|
67
|
+
#
|
68
|
+
# This class uses ActiveRecord to access data in the Ensembl database.
|
69
|
+
# See the general documentation of the Ensembl module for
|
70
|
+
# more information on what this means and what methods are available.
|
71
|
+
class AttribType < DBConnection
|
72
|
+
set_primary_key "attrib_type_id"
|
73
|
+
end
|
74
|
+
|
75
|
+
|
76
|
+
# = DESCRIPTION
|
77
|
+
#
|
78
|
+
# This class uses ActiveRecord to access data in the Ensembl database.
|
79
|
+
# See the general documentation of the Ensembl module for
|
80
|
+
# more information on what this means and what methods are available.
|
81
|
+
class ConsequenceMapping < DBConnection
|
82
|
+
|
83
|
+
end
|
84
|
+
|
85
|
+
# = DESCRIPTION
|
86
|
+
#
|
87
|
+
# This class uses ActiveRecord to access data in the Ensembl database.
|
88
|
+
# See the general documentation of the Ensembl module for
|
89
|
+
# more information on what this means and what methods are available.
|
90
|
+
class FailedDescription < DBConnection
|
91
|
+
set_primary_key "failed_description_id"
|
92
|
+
has_many :failed_variations
|
93
|
+
end
|
94
|
+
|
95
|
+
# = DESCRIPTION
|
96
|
+
#
|
97
|
+
# This class uses ActiveRecord to access data in the Ensembl database.
|
98
|
+
# See the general documentation of the Ensembl module for
|
99
|
+
# more information on what this means and what methods are available.
|
100
|
+
class FailedVariation < DBConnection
|
101
|
+
set_primary_key "failed_variation_id"
|
102
|
+
belongs_to :failed_description
|
103
|
+
belongs_to :variation
|
104
|
+
end
|
105
|
+
|
106
|
+
# = DESCRIPTION
|
107
|
+
#
|
108
|
+
# This class uses ActiveRecord to access data in the Ensembl database.
|
109
|
+
# See the general documentation of the Ensembl module for
|
110
|
+
# more information on what this means and what methods are available.
|
111
|
+
class FeatureType < DBConnection
|
112
|
+
set_primary_key "feature_type_id"
|
113
|
+
end
|
114
|
+
|
115
|
+
class Meta < DBConnection
|
116
|
+
set_primary_key "meta_id"
|
117
|
+
end
|
118
|
+
|
119
|
+
class MetaCoord < DBConnection
|
120
|
+
|
121
|
+
end
|
122
|
+
|
123
|
+
class Phenotype < DBConnection
|
124
|
+
set_primary_key "phenotype_id"
|
125
|
+
has_many :variation_annotations
|
126
|
+
end
|
127
|
+
|
128
|
+
|
129
|
+
|
65
130
|
# = DESCRIPTION
|
66
131
|
# The Sample class gives information about the biological samples stored in the database.
|
67
132
|
#
|
@@ -79,7 +144,6 @@ module Ensembl
|
|
79
144
|
has_many :tagged_variation_features
|
80
145
|
end
|
81
146
|
|
82
|
-
# = DESCRIPTION
|
83
147
|
# The IndividualPopulation class is used to connect Individual and Population classes.
|
84
148
|
# Should not be used directly.
|
85
149
|
#
|
@@ -87,8 +151,8 @@ module Ensembl
|
|
87
151
|
# See the general documentation of the Ensembl module for
|
88
152
|
# more information on what this means and what methods are available.
|
89
153
|
class IndividualPopulation < DBConnection
|
90
|
-
belongs_to :individual
|
91
|
-
belongs_to :population
|
154
|
+
belongs_to :individual, :foreign_key => "individual_sample_id"
|
155
|
+
belongs_to :population, :foreign_key => "population_sample_id"
|
92
156
|
end
|
93
157
|
|
94
158
|
# = DESCRIPTION
|
@@ -99,17 +163,27 @@ module Ensembl
|
|
99
163
|
# See the general documentation of the Ensembl module for
|
100
164
|
# more information on what this means and what methods are available.
|
101
165
|
class Individual < DBConnection
|
166
|
+
set_primary_key "sample_id"
|
102
167
|
belongs_to :sample
|
103
|
-
|
168
|
+
has_one :individual_type
|
169
|
+
has_many :individual_populations, :foreign_key => "individual_sample_id"
|
170
|
+
has_many :populations, :through => :individual_populations
|
104
171
|
end
|
105
172
|
|
106
173
|
class IndividualGenotypeMultipleBp < DBConnection
|
107
174
|
belongs_to :sample
|
108
175
|
belongs_to :variation
|
176
|
+
belongs_to :subsnp_handle
|
109
177
|
end
|
178
|
+
|
179
|
+
# ActiveRecord model for individual types.
#
# The primary key was previously misspelled as "invidual_type_id", so
# primary-key lookups on this model targeted a column name that does not match
# the model's own naming.
#
# NOTE(review): the association direction (this model belongs_to :individual,
# while Individual declares has_one :individual_type) should be confirmed
# against the variation schema; it is kept unchanged here so existing callers
# keep working.
class IndividualType < DBConnection
  set_primary_key "individual_type_id"
  belongs_to :individual
end
|
183
|
+
|
110
184
|
|
111
185
|
class CompressedGenotypeSingleBp < DBConnection
|
112
|
-
belongs_to :
|
186
|
+
belongs_to :population_genotype, :foreign_key => "sample_id"
|
113
187
|
end
|
114
188
|
|
115
189
|
class ReadCoverage < DBConnection
|
@@ -118,10 +192,19 @@ module Ensembl
|
|
118
192
|
|
119
193
|
class Population < DBConnection
|
120
194
|
belongs_to :sample
|
195
|
+
set_primary_key "sample_id"
|
196
|
+
has_many :population_genotypes, :foreign_key => "sample_id"
|
197
|
+
has_many :individual_populations, :foreign_key => "population_sample_id"
|
198
|
+
has_many :individuals, :through => :individual_populations
|
199
|
+
has_many :sample_synonyms
|
200
|
+
has_one :population_structure
|
201
|
+
has_many :tagged_variation_features
|
202
|
+
has_many :alleles
|
203
|
+
has_many :allele_groups
|
121
204
|
end
|
122
205
|
|
123
206
|
class PopulationStructure < DBConnection
|
124
|
-
|
207
|
+
|
125
208
|
end
|
126
209
|
|
127
210
|
# = DESCRIPTION
|
@@ -135,6 +218,8 @@ module Ensembl
|
|
135
218
|
set_primary_key "population_genotype_id"
|
136
219
|
belongs_to :variation
|
137
220
|
belongs_to :population
|
221
|
+
belongs_to :subsnp_handle
|
222
|
+
has_many :compressed_genotype_single_bps, :foreign_key => "sample_id"
|
138
223
|
end
|
139
224
|
|
140
225
|
# = DESCRIPTION
|
@@ -166,6 +251,41 @@ module Ensembl
|
|
166
251
|
has_many :variation_groups
|
167
252
|
has_many :httags
|
168
253
|
has_many :variation_synonyms
|
254
|
+
has_many :variation_annotations
|
255
|
+
has_many :structural_variations
|
256
|
+
end
|
257
|
+
|
258
|
+
class StructuralVariation < DBConnection
|
259
|
+
set_primary_key "structural_variation_id"
|
260
|
+
belongs_to :source
|
261
|
+
belongs_to :seq_region
|
262
|
+
|
263
|
+
class << self # Workaround for 'class' field, otherwise it creates a mess for AR
|
264
|
+
def instance_method_already_implemented?(method_name)
|
265
|
+
return true if method_name == 'class'
|
266
|
+
super
|
267
|
+
end
|
268
|
+
end
|
269
|
+
|
270
|
+
def sv_class
|
271
|
+
self.attributes["class"]
|
272
|
+
end
|
273
|
+
|
274
|
+
end
|
275
|
+
|
276
|
+
|
277
|
+
class SeqRegion < DBConnection
|
278
|
+
set_primary_key "seq_region_id"
|
279
|
+
has_many :variation_features
|
280
|
+
has_many :structural_variations
|
281
|
+
end
|
282
|
+
|
283
|
+
class SubsnpHandle < DBConnection
|
284
|
+
set_primary_key "subsnp_id"
|
285
|
+
has_many :individual_genotype_multiple_bps, :foreign_key => "subsnp_id"
|
286
|
+
has_many :population_genotypes, :foreign_key => "subsnp_id"
|
287
|
+
has_many :alleles, :foreign_key => "subsnp_id"
|
288
|
+
has_many :variation_synonyms,:foreign_key => "subsnp_id"
|
169
289
|
end
|
170
290
|
|
171
291
|
# = DESCRIPTION
|
@@ -179,6 +299,7 @@ module Ensembl
|
|
179
299
|
set_primary_key "variation_synonym_id"
|
180
300
|
belongs_to :variation
|
181
301
|
belongs_to :source
|
302
|
+
belongs_to :subsnp_handle
|
182
303
|
end
|
183
304
|
|
184
305
|
# = DESCRIPTION
|
@@ -221,6 +342,14 @@ module Ensembl
|
|
221
342
|
belongs_to :variation_group
|
222
343
|
end
|
223
344
|
|
345
|
+
class VariationAnnotation < DBConnection
|
346
|
+
set_primary_key "variation_annotation_id"
|
347
|
+
belongs_to :variation
|
348
|
+
belongs_to :phenotype
|
349
|
+
belongs_to :source
|
350
|
+
end
|
351
|
+
|
352
|
+
|
224
353
|
# = DESCRIPTION
|
225
354
|
# The FlankingSequence class gives information about the genomic coordinates
|
226
355
|
# of the flanking sequences, for a single VariationFeature.
|
@@ -249,5 +378,6 @@ module Ensembl
|
|
249
378
|
belongs_to :variation_group
|
250
379
|
belongs_to :source
|
251
380
|
end
|
381
|
+
|
252
382
|
end
|
253
383
|
end
|
@@ -4,13 +4,13 @@
|
|
4
4
|
# Copyright:: Copyright (C) 2008 Francesco Strozzi <francesco.strozzi@gmail.com>
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
+
# @author Francesco Strozzi
|
8
|
+
|
7
9
|
|
8
|
-
nil
|
9
10
|
module Ensembl
|
10
11
|
|
11
12
|
module Variation
|
12
13
|
|
13
|
-
# = DESCRIPTION
|
14
14
|
# The Variation class represents single nucleotide polymorphisms (SNPs) or variations
|
15
15
|
# and provides information like the names (IDs), the validation status and
|
16
16
|
# the allele information.
|
@@ -23,16 +23,16 @@ module Ensembl
|
|
23
23
|
# See the general documentation of the Ensembl module for
|
24
24
|
# more information on what this means and what methods are available.
|
25
25
|
#
|
26
|
-
|
27
|
-
#
|
28
|
-
#
|
29
|
-
#
|
30
|
-
#
|
26
|
+
# @example
|
27
|
+
# v = Variation.find_by_name('rs10111')
|
28
|
+
# v.alleles.each do |a|
|
29
|
+
# puts a.allele, a.frequency
|
30
|
+
# end
|
31
31
|
#
|
32
|
-
#
|
33
|
-
#
|
34
|
-
#
|
35
|
-
#
|
32
|
+
# variations = Variation.fetch_all_by_source('dbSNP') # many records
|
33
|
+
# variations.each do |v|
|
34
|
+
# puts v.name
|
35
|
+
# end
|
36
36
|
#
|
37
37
|
class Variation < DBConnection
|
38
38
|
set_primary_key "variation_id"
|
@@ -47,6 +47,8 @@ module Ensembl
|
|
47
47
|
has_many :variation_group_variations
|
48
48
|
has_many :variation_groups, :through => :variation_group_variations
|
49
49
|
has_many :individual_genotype_multiple_bps
|
50
|
+
has_many :failed_variations
|
51
|
+
has_many :failed_descriptions, :through => :failed_variations
|
50
52
|
|
51
53
|
def self.fetch_all_by_source(source)
|
52
54
|
variations = Source.find_by_name(source).variations
|
@@ -54,7 +56,6 @@ module Ensembl
|
|
54
56
|
end
|
55
57
|
|
56
58
|
|
57
|
-
# = DESCRIPTION
|
58
59
|
# The VariationFeature class gives information about the genomic position of
|
59
60
|
# each Variation, including also validation status and consequence type.
|
60
61
|
#
|
@@ -62,30 +63,56 @@ module Ensembl
|
|
62
63
|
# See the general documentation of the Ensembl module for
|
63
64
|
# more information on what this means and what methods are available.
|
64
65
|
#
|
65
|
-
|
66
|
-
#
|
67
|
-
#
|
68
|
-
#
|
69
|
-
#
|
70
|
-
#
|
71
|
-
#
|
72
|
-
#
|
73
|
-
#
|
74
|
-
#
|
75
|
-
#
|
66
|
+
# @example
|
67
|
+
# # SLOWER QUERY
|
68
|
+
# vf = VariationFeature.find_by_variation_name('rs10111')
|
69
|
+
# # FASTER QUERY
|
70
|
+
# vf = Variation.find_by_name('rs10111').variation_feature
|
71
|
+
#
|
72
|
+
# puts vf.seq_region_start, vf.seq_region_end, vf.allele_string
|
73
|
+
# puts vf.variation.ancestral_allele
|
74
|
+
# genomic_region = vf.fetch_region (returns an Ensembl::Core::Slice)
|
75
|
+
# genomic_region.genes
|
76
|
+
# up_region,down_region = vf.flanking_seq (returns two Ensembl::Core::Slice)
|
76
77
|
#
|
77
78
|
class VariationFeature < DBConnection
|
78
79
|
set_primary_key "variation_feature_id"
|
79
80
|
belongs_to :variation
|
80
81
|
has_many :tagged_variation_features
|
81
82
|
has_many :samples, :through => :tagged_variation_features
|
82
|
-
|
83
|
+
belongs_to :seq_region
|
84
|
+
validates_inclusion_of :consequence_type, :in => ['ESSENTIAL_SPLICE_SITE',
|
85
|
+
'STOP_GAINED',
|
86
|
+
'STOP_LOST',
|
87
|
+
'COMPLEX_INDEL',
|
88
|
+
'FRAMESHIFT_CODING',
|
89
|
+
'NON_SYNONYMOUS_CODING',
|
90
|
+
'SPLICE_SITE',
|
91
|
+
'PARTIAL_CODON',
|
92
|
+
'SYNONYMOUS_CODING',
|
93
|
+
'REGULATORY_REGION',
|
94
|
+
'WITHIN_MATURE_miRNA',
|
95
|
+
'5PRIME_UTR',
|
96
|
+
'3PRIME_UTR',
|
97
|
+
'INTRONIC',
|
98
|
+
'NMD_TRANSCRIPT',
|
99
|
+
'UPSTREAM',
|
100
|
+
'DOWNSTREAM',
|
101
|
+
'WITHIN_NON_CODING_GENE',
|
102
|
+
'HGMD_MUTATION'
|
103
|
+
], :message => "Consequence type not allowed!"
|
83
104
|
|
105
|
+
# Returns the consequence type as a String, read from the attribute value
# before type casting. This is a workaround: ActiveRecord does not parse the
# MySQL SET column, so the uncast value is used instead.
#
# @return [String] the raw consequence type; an empty String when the value is nil
def consequence_type
  attributes_before_type_cast['consequence_type'].to_s
end
|
84
108
|
|
85
|
-
#=DESCRIPTION
|
86
109
|
# Based on Perl API 'get_all_Genes' method for Variation class. Get a genomic region
|
87
110
|
# starting from the Variation coordinates, expanding the region upstream and
|
88
|
-
# downstream.
|
111
|
+
# downstream.
|
112
|
+
#
|
113
|
+
# @param [Integer] up Length of upstream flanking region
|
114
|
+
# @param [Integer] down Length of downstream flanking region
|
115
|
+
# @return [Slice] Slice object containing the variation
|
89
116
|
def fetch_region(up = 5000, down = 5000)
|
90
117
|
sr = core_connection(self.seq_region_id)
|
91
118
|
slice = Ensembl::Core::Slice.fetch_by_region(Ensembl::Core::CoordSystem.find(sr.coord_system_id).name,sr.name,self.seq_region_start-up,self.seq_region_end+down)
|
@@ -100,15 +127,24 @@ module Ensembl
|
|
100
127
|
return slice_up,slice_down
|
101
128
|
end
|
102
129
|
|
130
|
+
# Returns the TranscriptVariation objects for this variation feature.
#
# Records found through TranscriptVariation.find_all_by_variation_feature_id
# are returned as they are. When none are found, the consequences are
# calculated with custom_transcript_variation, using the seq_region obtained
# from core_connection.
#
# @return [Array<TranscriptVariation>] stored or freshly calculated objects
def transcript_variations
  tvs = TranscriptVariation.find_all_by_variation_feature_id(self.variation_feature_id)
  # the variation is already present in the database
  return tvs unless tvs.empty?

  # the variation is not stored in the database, so run the calculations
  sr = core_connection(self.seq_region_id)
  custom_transcript_variation(self, sr)
end
|
139
|
+
|
103
140
|
private
|
104
141
|
|
105
142
|
def core_connection(seq_region_id)
|
106
143
|
if !Ensembl::Core::DBConnection.connected? then
|
107
|
-
host,user,password,db_name,port = Ensembl::Variation::DBConnection.get_info
|
108
|
-
|
109
|
-
species,release = $1,$2
|
144
|
+
host,user,password,db_name,port,species,release = Ensembl::Variation::DBConnection.get_info
|
145
|
+
begin
|
110
146
|
Ensembl::Core::DBConnection.connect(species,release.to_i,:username => user, :password => password,:host => host, :port => port)
|
111
|
-
|
147
|
+
rescue
|
112
148
|
raise NameError, "Can't derive Core database name from #{db_name}. Are you using non conventional names?"
|
113
149
|
end
|
114
150
|
end
|
@@ -123,9 +159,186 @@ module Ensembl
|
|
123
159
|
return seq_region
|
124
160
|
end
|
125
161
|
|
162
|
+
# Calculate a consequence type for a user-defined variation
|
163
|
+
# Builds one TranscriptVariation per transcript found around a variation that
# has no stored consequences.
#
# The genomic slice spanning 5000 bases either side of the variation is
# fetched, and for every transcript of every gene on it the check_* helpers are
# tried in order (intergenic, upstream/downstream, non-coding gene, splice
# site, UTR, amino-acid change); the first one that yields a value decides the
# consequence type.
#
# @param vf the variation feature being evaluated
# @param sr the seq_region the variation is located on
# @return [Array<TranscriptVariation>] never empty: a single "INTERGENIC"
#   object is returned when no transcript is found
def custom_transcript_variation(vf, sr)
  @variation_name = vf.variation_name
  @seq_region = sr

  downstream = 5000
  upstream = 5000
  tvs = [] # store all the calculated TranscriptVariations

  # retrieve the slice of the genomic region where the variation is located
  coord_system_name = Ensembl::Core::CoordSystem.find(sr.coord_system_id).name
  region = Ensembl::Core::Slice.fetch_by_region(coord_system_name, sr.name,
                                                vf.seq_region_start - upstream,
                                                vf.seq_region_end + downstream - 1)

  # iterate through all the transcripts present in the region
  region.genes(true).each do |g|
    g.transcripts.each do |t|
      tv = TranscriptVariation.new # one TranscriptVariation for every transcript present
      peptide_alleles = nil
      peptide_checked = false

      # check if the variation is intergenic for this transcript (no effects)
      consequence = check_intergenic(vf, t)
      # check if the variation is upstream or downstream of the transcript
      consequence ||= check_upstream_downstream(vf, t)
      # from here on the variation is inside the transcript: check for non coding gene
      consequence ||= check_non_coding(vf, t) unless t.biotype == 'protein_coding'
      # check intron / exon boundaries
      consequence ||= check_splice_site(vf, t)
      # check if the variation is inside UTRs
      consequence ||= check_utr(vf, t)
      # otherwise the variation is inside an exon: check the codon change
      unless consequence
        consequence, peptide_alleles = check_aa_change(vf, t)
        peptide_checked = true
      end

      tv.consequence_type = consequence || "INTERGENIC"
      tv.peptide_allele_string = peptide_alleles if peptide_checked

      begin # this changed from release 58
        tv.transcript_stable_id = t.stable_id
      rescue NoMethodError
        tv.transcript_id = t.id
      end

      tvs << tv
    end
  end

  # if there are no transcripts/genes within 5000 bases upstream and downstream
  # set the variation as INTERGENIC (no effects)
  if tvs.empty?
    tv = TranscriptVariation.new
    tv.consequence_type = "INTERGENIC"
    tvs << tv
  end

  tvs
end
|
222
|
+
|
223
|
+
## CONSEQUENCE CALCULATION FUNCTIONS ##
|
224
|
+
|
225
|
+
# Tells whether the variation lies more than 5000 bases away from the
# transcript, on either side.
#
# The distance is counted as (difference of coordinates) + 1, so a value above
# 5000 on one side also implies the variation does not overlap the transcript
# on that side.
#
# @param vf object responding to seq_region_start / seq_region_end
# @param t transcript responding to seq_region_start / seq_region_end
# @return [String, nil] "INTERGENIC", or nil when the variation is closer
def check_intergenic(vf, t)
  distance_before = (t.seq_region_start - vf.seq_region_end) + 1
  distance_after = (vf.seq_region_start - t.seq_region_end) + 1
  "INTERGENIC" if distance_before > 5000 || distance_after > 5000
end
|
233
|
+
|
234
|
+
# Classifies a variation located within 5000 bases outside the transcript, or
# spanning one of the transcript ends.
#
# A variation before the transcript start is "UPSTREAM" when t.strand == 1 and
# "DOWNSTREAM" otherwise; after the transcript end the labels are swapped.
# A variation whose start and end strictly enclose the transcript start or end
# coordinate is reported as "COMPLEX_INDEL".
#
# @param vf object responding to seq_region_start / seq_region_end
# @param t transcript responding to seq_region_start / seq_region_end / strand
# @return [String, nil] the consequence, or nil when none of the cases applies
def check_upstream_downstream(vf, t)
  if vf.seq_region_end < t.seq_region_start && ((t.seq_region_start - vf.seq_region_end) + 1) <= 5000
    t.strand == 1 ? "UPSTREAM" : "DOWNSTREAM"
  elsif vf.seq_region_start > t.seq_region_end && ((vf.seq_region_start - t.seq_region_end) + 1) <= 5000
    t.strand == 1 ? "DOWNSTREAM" : "UPSTREAM"
  elsif [t.seq_region_start, t.seq_region_end].any? { |edge| edge > vf.seq_region_start && edge < vf.seq_region_end }
    # an InDel overlapping the transcript start / end
    "COMPLEX_INDEL"
  end
end
|
248
|
+
|
249
|
+
# Consequence of a variation located in a transcript that is not protein coding.
#
# A variation spanning more than one position (seq_region_start differs from
# seq_region_end) is always "COMPLEX_INDEL"; otherwise the label depends on
# the transcript biotype.
#
# @param vf object responding to seq_region_start / seq_region_end
# @param t transcript responding to biotype
# @return [String] the consequence type (never nil)
def check_non_coding(vf, t)
  return "COMPLEX_INDEL" unless vf.seq_region_start == vf.seq_region_end

  case t.biotype
  when "miRNA" then "WITHIN_MATURE_miRNA"
  when "nonsense_mediated_decay" then "NMD_TRANSCRIPT"
  else "WITHIN_NON_CODING_GENE"
  end
end
|
259
|
+
|
260
|
+
# Tells whether the variation falls in an untranslated region of the transcript.
#
# A variation strictly between the transcript start and
# coding_region_genomic_start is "5PRIME_UTR" when t.strand == 1 and
# "3PRIME_UTR" otherwise; a variation strictly between
# coding_region_genomic_end and the transcript end gets the opposite labels.
#
# @param vf object responding to seq_region_start / seq_region_end
# @param t transcript responding to seq_region_start, seq_region_end, strand,
#   coding_region_genomic_start and coding_region_genomic_end
# @return [String, nil] the UTR consequence, or nil when outside both regions
def check_utr(vf, t)
  if vf.seq_region_start > t.seq_region_start && vf.seq_region_end < t.coding_region_genomic_start
    t.strand == 1 ? "5PRIME_UTR" : "3PRIME_UTR"
  elsif vf.seq_region_start > t.coding_region_genomic_end && vf.seq_region_end < t.seq_region_end
    t.strand == 1 ? "3PRIME_UTR" : "5PRIME_UTR"
  end
end
|
268
|
+
|
269
|
+
# Classifies a variation with respect to the intron / exon boundaries of the
# transcript, using t.exon_for_genomic_position to look up exons.
#
# * Both ends outside any exon (intron): "ESSENTIAL_SPLICE_SITE" when an exon
#   is found 2 bases before the start or 2 bases after the end, "SPLICE_SITE"
#   when one is found at 8 bases, "INTRONIC" otherwise.
# * Both ends inside an exon: "SPLICE_SITE" when the start is within 3 bases
#   of the exon start or the end within 3 bases of the exon end, nil otherwise.
# * Only one end inside an exon: "COMPLEX_INDEL".
#
# @param vf object responding to seq_region_start / seq_region_end
# @param t transcript responding to exon_for_genomic_position(position)
# @return [String, nil] the consequence, or nil for a variation inside an
#   exon and away from its edges
def check_splice_site(vf, t)
  exon_up = t.exon_for_genomic_position(vf.seq_region_start)
  exon_down = t.exon_for_genomic_position(vf.seq_region_end)

  if exon_up.nil? && exon_down.nil? # we are inside an intron
    # Looks for an exon at the given distance on either side; the 8bp lookups
    # are only performed when the 2bp ones find nothing.
    exon_within = lambda do |offset|
      t.exon_for_genomic_position(vf.seq_region_start - offset) ||
        t.exon_for_genomic_position(vf.seq_region_end + offset)
    end

    if exon_within.call(2)
      "ESSENTIAL_SPLICE_SITE"
    elsif exon_within.call(8)
      "SPLICE_SITE"
    else
      "INTRONIC"
    end
  elsif exon_up && exon_down # the variation is inside an exon
    near_exon_start = (vf.seq_region_start - exon_up.seq_region_start) <= 3
    near_exon_end = (exon_down.seq_region_end - vf.seq_region_end) <= 3
    "SPLICE_SITE" if near_exon_start || near_exon_end
  else # a complex indel spanning intron/exon boundary
    "COMPLEX_INDEL"
  end
end
|
295
|
+
|
296
|
+
# Determines the coding consequence of a variation located inside the CDS.
#
# A variation spanning more than one position, or having "-" among its
# alleles, is reported as a frameshift. Otherwise the second allele of
# allele_string replaces the base at the CDS position given by
# t.genomic2cds, and the reference and mutated codons are translated with
# Bio::CodonTable[1] and compared.
#
# NOTE(review): only the second allele (alleles[1]) is evaluated; variations
# listing more alternative alleles are not fully covered - confirm intent.
#
# @param vf object responding to allele_string, seq_region_start, seq_region_end
# @param t transcript responding to genomic2cds, seq_region_strand and cds_seq
# @return [Array] two elements: the consequence type and the peptide allele
#   string "ref/mut" (nil for a frameshift)
# @raise [RuntimeError] when the reference or the mutated codon is not found
#   in the codon table
def check_aa_change(vf, t)
  alleles = vf.allele_string.split('/') # get the different alleles for this variation
  # if the variation is an InDel then it produces a frameshift
  if vf.seq_region_start != vf.seq_region_end || alleles.include?("-")
    return ["FRAMESHIFT_CODING", nil]
  end

  # Find the position inside the CDS
  mutation_position = t.genomic2cds(vf.seq_region_start)

  mutation_base = Bio::Sequence::NA.new(alleles[1])
  mutation_base.reverse_complement! if t.seq_region_strand == -1

  cds_sequence = t.cds_seq
  mut_sequence = cds_sequence.dup
  # Replace base with the variant allele
  mut_sequence[mutation_position] = mutation_base.seq

  # First base of the codon containing the mutated position
  codon_start = (mutation_position / 3) * 3
  codon_range = codon_start..(codon_start + 2)
  # Both codons are downcased so the lookup does not depend on the CDS case
  refcodon = cds_sequence[codon_range].downcase
  mutcodon = mut_sequence[codon_range].downcase

  codontable = Bio::CodonTable[1]
  refaa = codontable[refcodon]
  mutaa = codontable[mutcodon]
  raise RuntimeError, "Codon #{refcodon} wasn't recognized." if refaa.nil?
  raise RuntimeError, "Codon #{mutcodon} wasn't recognized." if mutaa.nil?

  pep_string = "#{refaa}/#{mutaa}"
  consequence =
    if mutaa == "*" && refaa != "*"
      "STOP_GAINED"
    elsif mutaa != "*" && refaa == "*"
      "STOP_LOST"
    elsif mutaa != refaa
      "NON_SYNONYMOUS_CODING"
    else
      "SYNONYMOUS_CODING"
    end

  [consequence, pep_string]
end
|
338
|
+
|
339
|
+
|
126
340
|
end # VariationFeature
|
127
341
|
|
128
|
-
#= DESCRIPTION
|
129
342
|
# The TranscriptVariation class gives information about the position of
|
130
343
|
# a VariationFeature, mapped on an annotated transcript.
|
131
344
|
#
|
@@ -133,31 +346,56 @@ module Ensembl
|
|
133
346
|
# See the general documentation of the Ensembl module for
|
134
347
|
# more information on what this means and what methods are available.
|
135
348
|
#
|
136
|
-
|
137
|
-
#
|
138
|
-
#
|
139
|
-
#
|
140
|
-
#
|
349
|
+
# @example
|
350
|
+
# vf = Variation.find_by_name('rs10111').variation_feature
|
351
|
+
# vf.transcript_variations.each do |tv|
|
352
|
+
# puts tv.peptide_allele_string, tv.transcript.stable_id
|
353
|
+
# end
|
141
354
|
#
|
142
355
|
class TranscriptVariation < DBConnection
|
143
356
|
set_primary_key "transcript_variation_id"
|
144
357
|
belongs_to :variation_feature
|
358
|
+
validates_inclusion_of :consequence_type, :in => ['ESSENTIAL_SPLICE_SITE',
|
359
|
+
'STOP_GAINED',
|
360
|
+
'STOP_LOST',
|
361
|
+
'COMPLEX_INDEL',
|
362
|
+
'FRAMESHIFT_CODING',
|
363
|
+
'NON_SYNONYMOUS_CODING',
|
364
|
+
'SPLICE_SITE',
|
365
|
+
'PARTIAL_CODON',
|
366
|
+
'SYNONYMOUS_CODING',
|
367
|
+
'REGULATORY_REGION',
|
368
|
+
'WITHIN_MATURE_miRNA',
|
369
|
+
'5PRIME_UTR',
|
370
|
+
'3PRIME_UTR',
|
371
|
+
'INTRONIC',
|
372
|
+
'NMD_TRANSCRIPT',
|
373
|
+
'UPSTREAM',
|
374
|
+
'DOWNSTREAM',
|
375
|
+
'WITHIN_NON_CODING_GENE',
|
376
|
+
'HGMD_MUTATION'
|
377
|
+
], :message => "Consequence type not allowed!"
|
378
|
+
|
379
|
+
# Returns the consequence type as a String, read from the attribute value
# before type casting. This is a workaround: ActiveRecord does not parse the
# MySQL SET column, so the uncast value is used instead.
#
# @return [String] the raw consequence type; an empty String when the value is nil
def consequence_type
  attributes_before_type_cast['consequence_type'].to_s
end
|
145
382
|
|
146
383
|
# Fetches the Ensembl::Core::Transcript this TranscriptVariation refers to.
#
# When the Core database is not connected yet, a connection is opened with
# the values reported by Ensembl::Variation::DBConnection.get_info. The
# connection details are only requested when they are needed, as done in
# VariationFeature#core_connection.
#
# The transcript is looked up through transcript_stable_id; when that raises
# NoMethodError the lookup falls back to transcript_id (the original code
# notes that this changed from release 58).
#
# @return the matching Ensembl::Core::Transcript
def transcript
  unless Ensembl::Core::DBConnection.connected?
    host, user, password, _db_name, port, species, release = Ensembl::Variation::DBConnection.get_info
    Ensembl::Core::DBConnection.connect(species, release.to_i, :username => user, :password => password, :host => host, :port => port)
  end

  begin # this changed from release 58
    # explicit self receiver on purpose: a missing attribute must raise
    # NoMethodError (not NameError) for the fallback below to trigger
    Ensembl::Core::Transcript.find_by_stable_id(self.transcript_stable_id)
  rescue NoMethodError
    Ensembl::Core::Transcript.find(self.transcript_id)
  end
end
|
158
396
|
|
159
397
|
end
|
160
398
|
|
161
399
|
end
|
162
400
|
|
163
|
-
end
|
401
|
+
end
|