ruby-ensembl-api 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. data/TUTORIAL.rdoc +623 -0
  2. data/bin/ensembl +40 -0
  3. data/lib/ensembl.rb +64 -0
  4. data/lib/ensembl/core/activerecord.rb +1914 -0
  5. data/lib/ensembl/core/collection.rb +60 -0
  6. data/lib/ensembl/core/project.rb +264 -0
  7. data/lib/ensembl/core/slice.rb +693 -0
  8. data/lib/ensembl/core/transcript.rb +425 -0
  9. data/lib/ensembl/core/transform.rb +97 -0
  10. data/lib/ensembl/db_connection.rb +216 -0
  11. data/lib/ensembl/variation/activerecord.rb +253 -0
  12. data/lib/ensembl/variation/variation.rb +163 -0
  13. data/test/unit/data/seq_c6qbl.fa +10 -0
  14. data/test/unit/data/seq_cso19_coding.fa +16 -0
  15. data/test/unit/data/seq_cso19_transcript.fa +28 -0
  16. data/test/unit/data/seq_drd3_gene.fa +838 -0
  17. data/test/unit/data/seq_drd3_transcript.fa +22 -0
  18. data/test/unit/data/seq_drd4_transcript.fa +24 -0
  19. data/test/unit/data/seq_forward_composite.fa +1669 -0
  20. data/test/unit/data/seq_par_boundary.fa +169 -0
  21. data/test/unit/data/seq_rnd3_transcript.fa +47 -0
  22. data/test/unit/data/seq_ub2r1_coding.fa +13 -0
  23. data/test/unit/data/seq_ub2r1_gene.fa +174 -0
  24. data/test/unit/data/seq_ub2r1_transcript.fa +26 -0
  25. data/test/unit/data/seq_y.fa +2 -0
  26. data/test/unit/ensembl_genomes/test_collection.rb +51 -0
  27. data/test/unit/ensembl_genomes/test_gene.rb +52 -0
  28. data/test/unit/ensembl_genomes/test_slice.rb +71 -0
  29. data/test/unit/ensembl_genomes/test_variation.rb +17 -0
  30. data/test/unit/release_50/core/test_project.rb +215 -0
  31. data/test/unit/release_50/core/test_project_human.rb +58 -0
  32. data/test/unit/release_50/core/test_relationships.rb +66 -0
  33. data/test/unit/release_50/core/test_sequence.rb +175 -0
  34. data/test/unit/release_50/core/test_slice.rb +121 -0
  35. data/test/unit/release_50/core/test_transcript.rb +108 -0
  36. data/test/unit/release_50/core/test_transform.rb +223 -0
  37. data/test/unit/release_50/variation/test_activerecord.rb +143 -0
  38. data/test/unit/release_50/variation/test_variation.rb +84 -0
  39. data/test/unit/release_53/core/test_gene.rb +66 -0
  40. data/test/unit/release_53/core/test_project.rb +96 -0
  41. data/test/unit/release_53/core/test_project_human.rb +65 -0
  42. data/test/unit/release_53/core/test_slice.rb +47 -0
  43. data/test/unit/release_53/core/test_transform.rb +63 -0
  44. data/test/unit/release_53/variation/test_activerecord.rb +145 -0
  45. data/test/unit/release_53/variation/test_variation.rb +71 -0
  46. data/test/unit/release_56/core/test_gene.rb +66 -0
  47. data/test/unit/release_56/core/test_project.rb +96 -0
  48. data/test/unit/release_56/core/test_slice.rb +54 -0
  49. data/test/unit/release_56/core/test_transform.rb +63 -0
  50. data/test/unit/release_56/variation/test_activerecord.rb +142 -0
  51. data/test/unit/release_56/variation/test_variation.rb +68 -0
  52. data/test/unit/test_connection.rb +66 -0
  53. data/test/unit/test_releases.rb +136 -0
  54. metadata +128 -0
@@ -0,0 +1,216 @@
1
+ #
2
+ # = ensembl/db_connection.rb - Connection classes for Ensembl databases
3
+ #
4
+ # Copyright:: Copyright (C) 2009 Jan Aerts <http://jandot.myopenid.com>
5
+ # Francesco Strozzi <francesco.strozzi@gmail.com>
6
+ #
7
+ # License:: The Ruby License
8
+ #
9
+
10
+
11
+ require 'rubygems'
12
+ require 'activerecord'
13
+
14
+ module Ensembl
15
+ DB_ADAPTER = 'mysql' # default :adapter used by the connect methods when args[:adapter] is not given
16
+ DB_HOST = 'ensembldb.ensembl.org' # default :host used when args[:host] is not given
17
+ DB_USERNAME = 'anonymous' # default :username used when args[:username] is not given
18
+ DB_PASSWORD = '' # default :password used when args[:password] is not given
19
+ EG_HOST = 'mysql.ebi.ac.uk' # host forced by Core::DBConnection.connect when args[:ensembl_genomes] is set
20
+ EG_PORT = 4157 # port forced by Core::DBConnection.connect when args[:ensembl_genomes] is set
21
+
22
+
23
+ # = DESCRIPTION
24
+ # Generic class to perform dynamic connections to the Ensembl database and retrieve database names.
25
+ # DummyDBConnection.connect(args) takes a Hash with optional :adapter, :host, :username, :password, :port and :database keys.
26
+ class DummyDBConnection < ActiveRecord::Base
27
+ self.abstract_class = true
28
+ def self.connect(args)
29
+ self.establish_connection(
30
+ :adapter => args[:adapter] ||= Ensembl::DB_ADAPTER, # NOTE: ||= also stores the default back into the caller's args hash
31
+ :host => args[:host] ||= Ensembl::DB_HOST, # same write-back behaviour for every ||= below
32
+ :username => args[:username] ||= Ensembl::DB_USERNAME,
33
+ :password => args[:password] ||= Ensembl::DB_PASSWORD,
34
+ :port => args[:port], # no default here: passed through as-is (nil when the caller set none)
35
+ :database => args[:database] ||= '' # defaults to an empty name; presumably a server-level connection with no database selected - TODO confirm
36
+ )
37
+ end
38
+ end
39
+
40
+ module DBRegistry
41
+ # = DESCRIPTION
42
+ # The Ensembl::DBRegistry::Base is a generic super class providing general methods
43
+ # to get database and connection info.
44
+ #
45
+ class Base < ActiveRecord::Base
46
+ self.abstract_class = true
47
+ self.pluralize_table_names = false
48
+ def self.get_info
49
+ host,user,password,db_name,port = self.retrieve_connection.instance_values["connection_options"] # the locals are not used afterwards; the method's value is the connection_options value itself
50
+ end
51
+ # = DESCRIPTION
52
+ # Class method to retrieve the name of a database, using species, release and connection parameters
53
+ # passed by the user.
54
+ # match is the database-type fragment of the name ('core' or 'variation' for the callers in this file). Returns the database name, or nil (after printing a warning) when nothing matches.
55
+ def self.get_name_from_db(match,species,release,args)
56
+ species = species.underscore # Always in lowercase. This keeps things simple when dealing with complex species names like in Ensembl Genomes database
57
+ dummy_db = DummyDBConnection.connect(args)
58
+ dummy_connection = dummy_db.connection
59
+
60
+ # check if a database exists with exactly the species name passed (regular way). NOTE(review): species, match and release are interpolated straight into the SQL text, and '_' is itself a single-character wildcard in a LIKE pattern
61
+ db_name = dummy_connection.select_values("SHOW DATABASES LIKE '%#{species}_#{match}_#{release.to_s}%'")[0]
62
+
63
+ # if a database is not found and we are working on Ensembl Genomes database...
64
+ if db_name.nil? and args[:ensembl_genomes] then
65
+ words = species.split(/_/)
66
+ first = words.shift
67
+ # ...try to find a collection database using the first name of the species passed (convention used for collection databases)
68
+ db_name = dummy_connection.select_values("SHOW DATABASES").select {|d| d=~/#{first}.*_collection_#{match}_#{release.to_s}/}[0]
69
+ # if a collection database match is found, then look inside to find the species
70
+ if db_name != nil then
71
+ dummy_db.disconnect! # close the generic connection with the host
72
+ args[:database] = db_name # NOTE: this writes the collection database name into the caller's args hash
73
+ dummy_db = DummyDBConnection.connect(args) # open a new connection directly with the collection database
74
+ others = ''
75
+ words.each do |w|
76
+ others << " #{w}"
77
+ end
78
+ species_name = "#{first}#{others}" # transform the species name, so it can match the species names stored in the collection database
79
+ Ensembl::SESSION.collection_species = species_name # set the species used for this session, so it's easier to fetch slices from the genome of that species
80
+
81
+ # check that the species passed is present in the collection database, otherwise returns a warning (only a warning: db_name is still returned). NOTE(review): species_name is interpolated straight into the SQL text
82
+ exists = dummy_db.connection.select_values("SELECT species_id FROM meta WHERE LOWER(meta_value) = '#{species_name}' AND meta_key = 'species.db_name'")[0]
83
+ warn "WARNING: No species '#{species}' found in the database. Please check that the name is correct." if !exists
84
+ end
85
+ end
86
+ warn "WARNING: No connection to database established. Check that the species is in snake_case (was: #{species})." if db_name.nil?
87
+ return db_name
88
+ end
89
+
90
+ end
91
+
92
+ end
93
+
94
+
95
+ module Core
96
+ # = DESCRIPTION
97
+ # The Ensembl::Core::DBConnection is the actual connection established
98
+ # with the Ensembl server.
99
+ class DBConnection < Ensembl::DBRegistry::Base
100
+ self.abstract_class = true
101
+ self.pluralize_table_names = false
102
+ # = DESCRIPTION
103
+ # The Ensembl::Core::DBConnection#connect method makes the connection
104
+ # to the Ensembl core database for a given species. By default, it connects
105
+ # to release Ensembl::ENSEMBL_RELEASE for that species. You _could_ use a lower number, but
106
+ # some parts of the API might not work, or worse: give the wrong results.
107
+ #
108
+ # = USAGE
109
+ # # Connect to the default release of human
110
+ # Ensembl::Core::DBConnection.connect('homo_sapiens')
111
+ #
112
+ # # Connect to release 42 of chicken
113
+ # Ensembl::Core::DBConnection.connect('gallus_gallus', 42)
114
+ #
115
+ # ---
116
+ # *Arguments*:
117
+ # * species:: species to connect to. Arguments should be in snake_case
118
+ # * release:: the release of the database to connect to
119
+ # (default = Ensembl::ENSEMBL_RELEASE)
120
+ def self.connect(species, release = Ensembl::ENSEMBL_RELEASE, args = {}) # args: optional :adapter, :host, :port, :username, :password, :database, :ensembl_genomes
121
+ Ensembl::SESSION.reset
122
+ db_name = nil
123
+ # if the connection is established with Ensembl Genomes, set the default port and host (this overwrites any :port / :host supplied by the caller)
124
+ if args[:ensembl_genomes]
125
+ args[:port] = EG_PORT
126
+ args[:host] = EG_HOST
127
+ end
128
+ if args[:port].nil? then
129
+ args[:port] = ( release > 47 ) ? 5306 : 3306 # default port depends on the release: 5306 above release 47, 3306 otherwise
130
+ end
131
+ if args[:database]
132
+ db_name = args[:database] # an explicit :database skips the name lookup entirely
133
+ else
134
+ db_name = self.get_name_from_db('core',species,release,args) # try to find the corresponding core database
135
+ end
136
+ establish_connection(
137
+ :adapter => args[:adapter] || Ensembl::DB_ADAPTER,
138
+ :host => args[:host] || Ensembl::DB_HOST,
139
+ :database => db_name, # nil when get_name_from_db found no matching database
140
+ :username => args[:username] || Ensembl::DB_USERNAME,
141
+ :password => args[:password] || Ensembl::DB_PASSWORD,
142
+ :port => args[:port]
143
+ )
144
+
145
+ self.retrieve_connection # Checkout that the connection is working
146
+ end
147
+
148
+
149
+ # = DESCRIPTION
150
+ # Simple wrapper for the normal DBConnection.connect() method. This is used to set the connection directly
151
+ # with the Ensembl Genomes database host
152
+ #
153
+ def self.ensemblgenomes_connect(species, release = Ensembl::ENSEMBL_RELEASE, args = {})
154
+ args[:ensembl_genomes] = true # NOTE: sets the flag on the caller's args hash before delegating
155
+ self.connect(species,release,args)
156
+ end
157
+
158
+ end # Core::DBConnection
159
+
160
+ end # Core
161
+
162
+ module Variation
163
+ # = DESCRIPTION
164
+ # The Ensembl::Variation::DBConnection is the actual connection established
165
+ # with the Ensembl server.
166
+ class DBConnection < Ensembl::DBRegistry::Base
167
+ self.abstract_class = true
168
+ self.pluralize_table_names = false
169
+ # = DESCRIPTION
170
+ # The Ensembl::Variation::DBConnection#connect method makes the connection
171
+ # to the Ensembl variation database for a given species. By default, it connects
172
+ # to release Ensembl::ENSEMBL_RELEASE for that species. You _could_ use a lower number, but
173
+ # some parts of the API might not work, or worse: give the wrong results.
174
+ #
175
+ # = USAGE
176
+ # # Connect to the default release of human
177
+ # Ensembl::Variation::DBConnection.connect('homo_sapiens')
178
+ #
179
+ # # Connect to release 42 of chicken
180
+ # Ensembl::Variation::DBConnection.connect('gallus_gallus', 42)
181
+ #
182
+ # ---
183
+ # *Arguments*:
184
+ # * species:: species to connect to. Arguments should be in snake_case
185
+ # * release:: the release of the database to connect to
186
+ # (default = Ensembl::ENSEMBL_RELEASE)
187
+ def self.connect(species, release = Ensembl::ENSEMBL_RELEASE, args = {}) # args: optional :adapter, :host, :port, :username, :password, :database
188
+ Ensembl::SESSION.reset
189
+ args[:species] = species # stored in args; not read again by the code visible in this file
190
+ if args[:port].nil? then
191
+ args[:port] = ( release > 47 ) ? 5306 : 3306 # default port depends on the release: 5306 above release 47, 3306 otherwise
192
+ end
193
+ db_name = nil
194
+ if args[:database]
195
+ db_name = args[:database] # an explicit :database skips the name lookup entirely
196
+ else
197
+ db_name = self.get_name_from_db('variation',species,release,args) # try to find the corresponding variation database
198
+ end
199
+ establish_connection(
200
+ :adapter => args[:adapter] || Ensembl::DB_ADAPTER,
201
+ :host => args[:host] || Ensembl::DB_HOST,
202
+ :database => db_name, # nil when get_name_from_db found no matching database
203
+ :username => args[:username] || Ensembl::DB_USERNAME,
204
+ :password => args[:password] || Ensembl::DB_PASSWORD,
205
+ :port => args[:port]
206
+ )
207
+
208
+ self.retrieve_connection # Checkout that the connection is working
209
+ # NOTE(review): unlike Ensembl::Core::DBConnection.connect, this method sets no Ensembl Genomes host/port when args[:ensembl_genomes] is given - confirm whether that is intended
210
+ end
211
+
212
+ end # Variation::DBConnection
213
+
214
+ end # Variation
215
+
216
+ end # Ensembl
@@ -0,0 +1,253 @@
1
+ #
2
+ # = ensembl/variation/activerecord.rb - ActiveRecord mappings to Ensembl Variation
3
+ #
4
+ # Copyright:: Copyright (C) 2008 Francesco Strozzi <francesco.strozzi@gmail.com>
5
+ # License:: The Ruby License
6
+ #
7
+
8
+ nil
9
+ module Ensembl
10
+ # = DESCRIPTION
11
+ # The Ensembl::Variation module covers the variation databases from
12
+ # ensembldb.ensembl.org.
13
+ module Variation
14
+ # = DESCRIPTION
15
+ # The Allele class describes a single allele of a variation. In addition to
16
+ # the nucleotide(s) (or absence thereof) representing the allele, frequency
17
+ # and population information may be present.
18
+ #
19
+ # This class uses ActiveRecord to access data in the Ensembl database.
20
+ # See the general documentation of the Ensembl module for
21
+ # more information on what this means and what methods are available.
22
+ #
23
+ # = USAGE
24
+ # allele = Allele.find(1)
25
+ # puts allele.to_yaml
26
+ class Allele < DBConnection
27
+ set_primary_key 'allele_id'
28
+ belongs_to :sample
29
+ belongs_to :variation
30
+ belongs_to :population
31
+ end
32
+
33
+ # = DESCRIPTION
34
+ # The AlleleGroup class represents a grouping of alleles that have tight
35
+ # linkage and are usually present together. This is commonly known as a
36
+ # Haplotype or Haplotype Block.
37
+ #
38
+ # This class uses ActiveRecord to access data in the Ensembl database.
39
+ # See the general documentation of the Ensembl module for
40
+ # more information on what this means and what methods are available.
41
+ #
42
+ # = USAGE
43
+ # allele_group = AlleleGroup.find(1)
44
+ # puts allele_group.to_yaml
45
+ class AlleleGroup < DBConnection
46
+ set_primary_key 'allele_group_id'
47
+ belongs_to :variation_group
48
+ belongs_to :source
49
+ belongs_to :sample
50
+ belongs_to :allele_group_allele
51
+ end
52
+
53
+ # = DESCRIPTION
54
+ # The AlleleGroupAllele class represents a connection class between Allele and AlleleGroup.
55
+ # Should not be used directly.
56
+ #
57
+ # This class uses ActiveRecord to access data in the Ensembl database.
58
+ # See the general documentation of the Ensembl module for
59
+ # more information on what this means and what methods are available.
60
+ class AlleleGroupAllele < DBConnection
61
+ belongs_to :variation
62
+ belongs_to :allele_group
63
+ end
64
+
65
+ # = DESCRIPTION
66
+ # The Sample class gives information about the biological samples stored in the database.
67
+ #
68
+ # This class uses ActiveRecord to access data in the Ensembl database.
69
+ # See the general documentation of the Ensembl module for
70
+ # more information on what this means and what methods are available.
71
+ class Sample < DBConnection
72
+ set_primary_key "sample_id"
73
+ has_one :individual
74
+ has_one :sample_synonym
75
+ has_many :individual_genotype_multiple_bp
76
+ has_many :compressed_genotype_single_bp
77
+ has_many :read_coverage
78
+ has_one :population
79
+ has_many :tagged_variation_features
80
+ end
81
+
82
+ # = DESCRIPTION
83
+ # The IndividualPopulation class is used to connect Individual and Population classes.
84
+ # Should not be used directly.
85
+ #
86
+ # This class uses ActiveRecord to access data in the Ensembl database.
87
+ # See the general documentation of the Ensembl module for
88
+ # more information on what this means and what methods are available.
89
+ class IndividualPopulation < DBConnection
90
+ belongs_to :individual
91
+ belongs_to :population
92
+ end
93
+
94
+ # = DESCRIPTION
95
+ # The Individual class gives information on the single individuals used
96
+ # to retrieve one or more biological samples.
97
+ #
98
+ # This class uses ActiveRecord to access data in the Ensembl database.
99
+ # See the general documentation of the Ensembl module for
100
+ # more information on what this means and what methods are available.
101
+ class Individual < DBConnection
102
+ belongs_to :sample
103
+ # FIXME
104
+ end
105
+
106
+ class IndividualGenotypeMultipleBp < DBConnection
107
+ belongs_to :sample
108
+ belongs_to :variation
109
+ end
110
+
111
+ class CompressedGenotypeSingleBp < DBConnection
112
+ belongs_to :sample
113
+ end
114
+
115
+ class ReadCoverage < DBConnection
116
+ belongs_to :sample
117
+ end
118
+
119
+ class Population < DBConnection
120
+ belongs_to :sample
121
+ end
122
+
123
+ class PopulationStructure < DBConnection
124
+ # FIXME
125
+ end
126
+
127
+ # = DESCRIPTION
128
+ # The PopulationGenotype class gives information about alleles and allele
129
+ # frequencies for a SNP observed within a population or a group of samples.
130
+ #
131
+ # This class uses ActiveRecord to access data in the Ensembl database.
132
+ # See the general documentation of the Ensembl module for
133
+ # more information on what this means and what methods are available.
134
+ class PopulationGenotype < DBConnection
135
+ set_primary_key "population_genotype_id"
136
+ belongs_to :variation
137
+ belongs_to :population
138
+ end
139
+
140
+ # = DESCRIPTION
141
+ # The SampleSynonym class represents information about alternative names
142
+ # for sample entries.
143
+ #
144
+ # This class uses ActiveRecord to access data in the Ensembl database.
145
+ # See the general documentation of the Ensembl module for
146
+ # more information on what this means and what methods are available.
147
+ class SampleSynonym < DBConnection
148
+ set_primary_key "sample_synonym_id"
149
+ belongs_to :source
150
+ belongs_to :sample
151
+ belongs_to :population
152
+ end
153
+
154
+ # = DESCRIPTION
155
+ # The Source class gives information on the different databases and SNP
156
+ # panels used to retrieve the data
157
+ #
158
+ # This class uses ActiveRecord to access data in the Ensembl database.
159
+ # See the general documentation of the Ensembl module for
160
+ # more information on what this means and what methods are available.
161
+ class Source < DBConnection
162
+ set_primary_key "source_id"
163
+ has_many :sample_synonyms
164
+ has_many :allele_groups
165
+ has_many :variations
166
+ has_many :variation_groups
167
+ has_many :httags
168
+ has_many :variation_synonyms
169
+ end
170
+
171
+ # = DESCRIPTION
172
+ # The VariationSynonym class gives information on alternative names used
173
+ # for Variation entries.
174
+ #
175
+ # This class uses ActiveRecord to access data in the Ensembl database.
176
+ # See the general documentation of the Ensembl module for
177
+ # more information on what this means and what methods are available.
178
+ class VariationSynonym < DBConnection
179
+ set_primary_key "variation_synonym_id"
180
+ belongs_to :variation
181
+ belongs_to :source
182
+ end
183
+
184
+ # = DESCRIPTION
185
+ # The VariationGroup class represents a group of variations (SNPs) that are
186
+ # linked and present together.
187
+ #
188
+ # This class uses ActiveRecord to access data in the Ensembl database.
189
+ # See the general documentation of the Ensembl module for
190
+ # more information on what this means and what methods are available.
191
+ class VariationGroup < DBConnection
192
+ set_primary_key "variation_group_id"
193
+ belongs_to :source
194
+ has_one :variation_group_variation
195
+ has_one :httag
196
+ has_one :variation_group_feature
197
+ has_one :allele_group
198
+ end
199
+
200
+ # = DESCRIPTION
201
+ # The VariationGroupVariation class is a connection class.
202
+ # Should not be used directly.
203
+ #
204
+ # This class uses ActiveRecord to access data in the Ensembl database.
205
+ # See the general documentation of the Ensembl module for
206
+ # more information on what this means and what methods are available.
207
+ class VariationGroupVariation < DBConnection
208
+ belongs_to :variation
209
+ belongs_to :variation_group
210
+ end
211
+
212
+ # = DESCRIPTION
213
+ # The VariationGroupFeature class gives information on the genomic position
214
+ # of each VariationGroup.
215
+ #
216
+ # This class uses ActiveRecord to access data in the Ensembl database.
217
+ # See the general documentation of the Ensembl module for
218
+ # more information on what this means and what methods are available.
219
+ class VariationGroupFeature < DBConnection
220
+ set_primary_key "variation_group_feature_id"
221
+ belongs_to :variation_group
222
+ end
223
+
224
+ # = DESCRIPTION
225
+ # The FlankingSequence class gives information about the genomic coordinates
226
+ # of the flanking sequences, for a single VariationFeature.
227
+ #
228
+ # This class uses ActiveRecord to access data in the Ensembl database.
229
+ # See the general documentation of the Ensembl module for
230
+ # more information on what this means and what methods are available.
231
+ class FlankingSequence < DBConnection
232
+ belongs_to :variation
233
+ end
234
+
235
+ # = DESCRIPTION
236
+ # The TaggedVariationFeature class is a connection class.
237
+ # Should not be used directly.
238
+ #
239
+ # This class uses ActiveRecord to access data in the Ensembl database.
240
+ # See the general documentation of the Ensembl module for
241
+ # more information on what this means and what methods are available.
242
+ class TaggedVariationFeature < DBConnection
243
+ belongs_to :variation_feature
244
+ belongs_to :sample
245
+ end
246
+
247
+ class Httag < DBConnection
248
+ set_primary_key "httag_id"
249
+ belongs_to :variation_group
250
+ belongs_to :source
251
+ end
252
+ end
253
+ end