ruby-ensembl-api 0.9.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. data/TUTORIAL.rdoc +623 -0
  2. data/bin/ensembl +40 -0
  3. data/lib/ensembl.rb +64 -0
  4. data/lib/ensembl/core/activerecord.rb +1914 -0
  5. data/lib/ensembl/core/collection.rb +60 -0
  6. data/lib/ensembl/core/project.rb +264 -0
  7. data/lib/ensembl/core/slice.rb +693 -0
  8. data/lib/ensembl/core/transcript.rb +425 -0
  9. data/lib/ensembl/core/transform.rb +97 -0
  10. data/lib/ensembl/db_connection.rb +216 -0
  11. data/lib/ensembl/variation/activerecord.rb +253 -0
  12. data/lib/ensembl/variation/variation.rb +163 -0
  13. data/test/unit/data/seq_c6qbl.fa +10 -0
  14. data/test/unit/data/seq_cso19_coding.fa +16 -0
  15. data/test/unit/data/seq_cso19_transcript.fa +28 -0
  16. data/test/unit/data/seq_drd3_gene.fa +838 -0
  17. data/test/unit/data/seq_drd3_transcript.fa +22 -0
  18. data/test/unit/data/seq_drd4_transcript.fa +24 -0
  19. data/test/unit/data/seq_forward_composite.fa +1669 -0
  20. data/test/unit/data/seq_par_boundary.fa +169 -0
  21. data/test/unit/data/seq_rnd3_transcript.fa +47 -0
  22. data/test/unit/data/seq_ub2r1_coding.fa +13 -0
  23. data/test/unit/data/seq_ub2r1_gene.fa +174 -0
  24. data/test/unit/data/seq_ub2r1_transcript.fa +26 -0
  25. data/test/unit/data/seq_y.fa +2 -0
  26. data/test/unit/ensembl_genomes/test_collection.rb +51 -0
  27. data/test/unit/ensembl_genomes/test_gene.rb +52 -0
  28. data/test/unit/ensembl_genomes/test_slice.rb +71 -0
  29. data/test/unit/ensembl_genomes/test_variation.rb +17 -0
  30. data/test/unit/release_50/core/test_project.rb +215 -0
  31. data/test/unit/release_50/core/test_project_human.rb +58 -0
  32. data/test/unit/release_50/core/test_relationships.rb +66 -0
  33. data/test/unit/release_50/core/test_sequence.rb +175 -0
  34. data/test/unit/release_50/core/test_slice.rb +121 -0
  35. data/test/unit/release_50/core/test_transcript.rb +108 -0
  36. data/test/unit/release_50/core/test_transform.rb +223 -0
  37. data/test/unit/release_50/variation/test_activerecord.rb +143 -0
  38. data/test/unit/release_50/variation/test_variation.rb +84 -0
  39. data/test/unit/release_53/core/test_gene.rb +66 -0
  40. data/test/unit/release_53/core/test_project.rb +96 -0
  41. data/test/unit/release_53/core/test_project_human.rb +65 -0
  42. data/test/unit/release_53/core/test_slice.rb +47 -0
  43. data/test/unit/release_53/core/test_transform.rb +63 -0
  44. data/test/unit/release_53/variation/test_activerecord.rb +145 -0
  45. data/test/unit/release_53/variation/test_variation.rb +71 -0
  46. data/test/unit/release_56/core/test_gene.rb +66 -0
  47. data/test/unit/release_56/core/test_project.rb +96 -0
  48. data/test/unit/release_56/core/test_slice.rb +54 -0
  49. data/test/unit/release_56/core/test_transform.rb +63 -0
  50. data/test/unit/release_56/variation/test_activerecord.rb +142 -0
  51. data/test/unit/release_56/variation/test_variation.rb +68 -0
  52. data/test/unit/test_connection.rb +66 -0
  53. data/test/unit/test_releases.rb +136 -0
  54. metadata +128 -0
@@ -0,0 +1,216 @@
1
+ #
2
+ # = ensembl/db_connection.rb - Connection classes for Ensembl databases
3
+ #
4
+ # Copyright:: Copyright (C) 2009 Jan Aerts <http://jandot.myopenid.com>
5
+ # Francesco Strozzi <francesco.strozzi@gmail.com>
6
+ #
7
+ # License:: The Ruby License
8
+ #
9
+
10
+
11
+ require 'rubygems'
12
+ require 'activerecord'
13
+
14
+ module Ensembl
15
# Default connection settings, used whenever the caller does not supply
# the corresponding option. The strings are frozen so that these shared
# defaults cannot be modified in place by accident.
DB_ADAPTER = 'mysql'.freeze
DB_HOST = 'ensembldb.ensembl.org'.freeze
DB_USERNAME = 'anonymous'.freeze
DB_PASSWORD = ''.freeze
# Host and port used for connections made with the :ensembl_genomes option.
EG_HOST = 'mysql.ebi.ac.uk'.freeze
EG_PORT = 4157
21
+
22
+
23
# = DESCRIPTION
# Generic class to perform dynamic connections to the Ensembl database
# server and retrieve database names. It is abstract, so no table is
# mapped to it.
#
class DummyDBConnection < ActiveRecord::Base
  self.abstract_class = true

  # = DESCRIPTION
  # Establishes a connection with the given options, falling back to the
  # Ensembl defaults for every option that is missing.
  #
  # The +args+ hash is only read, never modified. Writing the defaults back
  # into it (in particular :database => '', which is truthy in Ruby) would
  # make a later connect call that reuses the same hash believe that a
  # database had been chosen explicitly.
  #
  # ---
  # *Arguments*:
  # * args:: hash of connection options; recognised keys are :adapter,
  #   :host, :username, :password, :port and :database
  # *Returns*:: the result of ActiveRecord's establish_connection
  def self.connect(args)
    establish_connection(
      :adapter  => args[:adapter]  || Ensembl::DB_ADAPTER,
      :host     => args[:host]     || Ensembl::DB_HOST,
      :username => args[:username] || Ensembl::DB_USERNAME,
      :password => args[:password] || Ensembl::DB_PASSWORD,
      :port     => args[:port],
      :database => args[:database] || ''
    )
  end
end
39
+
40
+ module DBRegistry
41
# = DESCRIPTION
# The Ensembl::DBRegistry::Base is a generic super class providing general methods
# to get database and connection info.
#
class Base < ActiveRecord::Base
  self.abstract_class = true
  self.pluralize_table_names = false

  # = DESCRIPTION
  # Returns the "connection_options" instance value of the current connection.
  #
  # NOTE(review): presumably an array of host, user, password, database name
  # and port (that is how the previous implementation destructured it) -
  # confirm against the adapter in use.
  def self.get_info
    retrieve_connection.instance_values["connection_options"]
  end

  # = DESCRIPTION
  # Class method to retrieve the name of a database, using species, release and connection parameters
  # passed by the user.
  #
  # The server is first searched for a database whose name contains
  # "<species>_<match>_<release>". If none is found and args[:ensembl_genomes]
  # is set, a collection database named after the first word of the species
  # is searched instead; when one is found, the species is recorded in
  # Ensembl::SESSION.collection_species and a warning is printed if the
  # species is not listed in the collection's meta table.
  #
  # All user-supplied values are quoted (SQL) or escaped (Regexp) before
  # being used, and +args+ is not modified by this method itself.
  #
  # ---
  # *Arguments*:
  # * match:: database type that is part of the database name (e.g. 'core', 'variation')
  # * species:: species name; it is converted with String#underscore
  # * release:: release number that is part of the database name
  # * args:: connection options handed to DummyDBConnection.connect
  # *Returns*:: the database name, or nil (after printing a warning) if no database matches
  def self.get_name_from_db(match, species, release, args)
    species = species.underscore # Always in lowercase. This keeps things simple when dealing with complex species names like in Ensembl Genomes database
    dummy_db = DummyDBConnection.connect(args)
    dummy_connection = dummy_db.connection

    # check if a database exists with exactly the species name passed (regular way)
    like_pattern = dummy_connection.quote("%#{species}_#{match}_#{release}%")
    db_name = dummy_connection.select_values("SHOW DATABASES LIKE #{like_pattern}")[0]

    # if a database is not found and we are working on Ensembl Genomes database...
    if db_name.nil? && args[:ensembl_genomes]
      first, *words = species.split(/_/)
      # ...try to find a collection database using the first name of the species passed (convention used for collection databases)
      collection = /#{Regexp.escape(first.to_s)}.*_collection_#{Regexp.escape(match.to_s)}_#{Regexp.escape(release.to_s)}/
      db_name = dummy_connection.select_values("SHOW DATABASES").find { |d| d =~ collection }

      # if a collection database match is found, then look inside to find the species
      if db_name
        dummy_db.disconnect! # close the generic connection with the host
        # open a new connection directly with the collection database, without touching the caller's hash
        dummy_db = DummyDBConnection.connect(args.merge(:database => db_name))
        # transform the species name (underscores to spaces), so it can match the species names stored in the collection database
        species_name = [first, *words].join(' ')
        Ensembl::SESSION.collection_species = species_name # set the species used for this session, so it's easier to fetch slices from the genome of that species

        # check that the species passed is present in the collection database, otherwise returns a warning
        collection_connection = dummy_db.connection
        quoted_name = collection_connection.quote(species_name)
        exists = collection_connection.select_values("SELECT species_id FROM meta WHERE LOWER(meta_value) = #{quoted_name} AND meta_key = 'species.db_name'")[0]
        warn "WARNING: No species '#{species}' found in the database. Please check that the name is correct." unless exists
      end
    end

    warn "WARNING: No connection to database established. Check that the species is in snake_case (was: #{species})." if db_name.nil?
    db_name
  end
end
91
+
92
+ end
93
+
94
+
95
+ module Core
96
# = DESCRIPTION
# The Ensembl::Core::DBConnection is the actual connection established
# with the Ensembl server.
class DBConnection < Ensembl::DBRegistry::Base
  self.abstract_class = true
  self.pluralize_table_names = false

  # = DESCRIPTION
  # The Ensembl::Core::DBConnection#connect method makes the connection
  # to the Ensembl core database for a given species. By default, it connects
  # to release Ensembl::ENSEMBL_RELEASE for that species. You _could_ use a
  # lower number, but some parts of the API might not work, or worse: give
  # the wrong results.
  #
  # The +args+ hash passed by the caller is copied and never modified, so
  # the same hash can safely be reused for several connect calls.
  #
  # = USAGE
  #  # Connect to the default release of human
  #  Ensembl::Core::DBConnection.connect('homo_sapiens')
  #
  #  # Connect to release 42 of chicken
  #  Ensembl::Core::DBConnection.connect('gallus_gallus', 42)
  #
  # ---
  # *Arguments*:
  # * species:: species to connect to. Arguments should be in snake_case
  # * release:: the release of the database to connect to
  #   (default = Ensembl::ENSEMBL_RELEASE)
  # * args:: optional hash of connection options (:adapter, :host, :username,
  #   :password, :port, :database, :ensembl_genomes). With :ensembl_genomes
  #   set, host and port are replaced by EG_HOST and EG_PORT. If :port is
  #   still nil afterwards, 5306 is used for releases above 47 and 3306
  #   otherwise. If :database is given, no database name lookup is done.
  # *Returns*:: the connection returned by retrieve_connection
  def self.connect(species, release = Ensembl::ENSEMBL_RELEASE, args = {})
    Ensembl::SESSION.reset
    args = args.dup # never write the defaults below into the caller's hash

    # if the connection is established with Ensembl Genomes, set the default port and host
    if args[:ensembl_genomes]
      args[:port] = EG_PORT
      args[:host] = EG_HOST
    end
    args[:port] = (release > 47) ? 5306 : 3306 if args[:port].nil?

    # use the database given explicitly, otherwise try to find the corresponding core database
    db_name = args[:database] || get_name_from_db('core', species, release, args)

    establish_connection(
      :adapter  => args[:adapter]  || Ensembl::DB_ADAPTER,
      :host     => args[:host]     || Ensembl::DB_HOST,
      :database => db_name,
      :username => args[:username] || Ensembl::DB_USERNAME,
      :password => args[:password] || Ensembl::DB_PASSWORD,
      :port     => args[:port]
    )

    retrieve_connection # Check that the connection is working
  end

  # = DESCRIPTION
  # Simple wrapper for the normal DBConnection.connect() method. This is used to set the connection directly
  # with the Ensembl Genomes database host. It takes the same arguments as
  # connect and adds :ensembl_genomes => true to a copy of +args+.
  #
  def self.ensemblgenomes_connect(species, release = Ensembl::ENSEMBL_RELEASE, args = {})
    connect(species, release, args.merge(:ensembl_genomes => true))
  end

end # Core::DBConnection
159
+
160
+ end # Core
161
+
162
+ module Variation
163
# = DESCRIPTION
# The Ensembl::Variation::DBConnection is the actual connection established
# with the Ensembl server.
class DBConnection < Ensembl::DBRegistry::Base
  self.abstract_class = true
  self.pluralize_table_names = false

  # = DESCRIPTION
  # The Ensembl::Variation::DBConnection#connect method makes the connection
  # to the Ensembl variation database for a given species. By default, it connects
  # to release Ensembl::ENSEMBL_RELEASE for that species. You _could_ use a
  # lower number, but some parts of the API might not work, or worse: give
  # the wrong results.
  #
  # The +args+ hash passed by the caller is copied and never modified, so
  # the same hash can safely be reused for several connect calls.
  #
  # = USAGE
  #  # Connect to the default release of human
  #  Ensembl::Variation::DBConnection.connect('homo_sapiens')
  #
  #  # Connect to release 42 of chicken
  #  Ensembl::Variation::DBConnection.connect('gallus_gallus', 42)
  #
  # ---
  # *Arguments*:
  # * species:: species to connect to. Arguments should be in snake_case
  # * release:: the release of the database to connect to
  #   (default = Ensembl::ENSEMBL_RELEASE)
  # * args:: optional hash of connection options (:adapter, :host, :username,
  #   :password, :port, :database). If :port is nil, 5306 is used for
  #   releases above 47 and 3306 otherwise. If :database is given, no
  #   database name lookup is done.
  # *Returns*:: the connection returned by retrieve_connection
  def self.connect(species, release = Ensembl::ENSEMBL_RELEASE, args = {})
    Ensembl::SESSION.reset
    # work on a copy, so that :species and :port are not written into the caller's hash
    args = args.merge(:species => species)
    args[:port] = (release > 47) ? 5306 : 3306 if args[:port].nil?

    # use the database given explicitly, otherwise try to find the corresponding variation database
    db_name = args[:database] || get_name_from_db('variation', species, release, args)

    establish_connection(
      :adapter  => args[:adapter]  || Ensembl::DB_ADAPTER,
      :host     => args[:host]     || Ensembl::DB_HOST,
      :database => db_name,
      :username => args[:username] || Ensembl::DB_USERNAME,
      :password => args[:password] || Ensembl::DB_PASSWORD,
      :port     => args[:port]
    )

    retrieve_connection # Check that the connection is working
  end

end # Variation::DBConnection
213
+
214
+ end # Variation
215
+
216
+ end # Ensembl
@@ -0,0 +1,253 @@
1
+ #
2
+ # = ensembl/variation/activerecord.rb - ActiveRecord mappings to Ensembl Variation
3
+ #
4
+ # Copyright:: Copyright (C) 2008 Francesco Strozzi <francesco.strozzi@gmail.com>
5
+ # License:: The Ruby License
6
+ #
7
+
8
+ nil
9
module Ensembl
  # = DESCRIPTION
  # The Ensembl::Variation module covers the variation databases from
  # ensembldb.ensembl.org.
  #
  # Every class below is an ActiveRecord mapping that inherits its connection
  # from DBConnection. See the general documentation of the Ensembl module
  # for more information on what this means and what methods are available.
  module Variation
    # = DESCRIPTION
    # The Allele class describes a single allele of a variation. In addition
    # to the nucleotide(s) (or absence of) that represent the allele,
    # frequency and population information may be present.
    #
    # = USAGE
    #  allele = Allele.find(1)
    #  puts allele.to_yaml
    class Allele < DBConnection
      set_primary_key 'allele_id'

      belongs_to :sample
      belongs_to :variation
      belongs_to :population
    end

    # = DESCRIPTION
    # The AlleleGroup class represents a grouping of alleles that have tight
    # linkage and are usually present together. This is commonly known as a
    # Haplotype or Haplotype Block.
    #
    # = USAGE
    #  allele_group = AlleleGroup.find(1)
    #  puts allele_group.to_yaml
    class AlleleGroup < DBConnection
      set_primary_key 'allele_group_id'

      belongs_to :variation_group
      belongs_to :source
      belongs_to :sample
      belongs_to :allele_group_allele
    end

    # = DESCRIPTION
    # The AlleleGroupAllele class represents a connection class between
    # Allele and AlleleGroup. Should not be used directly.
    class AlleleGroupAllele < DBConnection
      belongs_to :variation
      belongs_to :allele_group
    end

    # = DESCRIPTION
    # The Sample class gives information about the biological samples stored
    # in the database.
    class Sample < DBConnection
      set_primary_key 'sample_id'

      has_one :individual
      has_one :sample_synonym
      has_many :individual_genotype_multiple_bp
      has_many :compressed_genotype_single_bp
      has_many :read_coverage
      has_one :population
      has_many :tagged_variation_features
    end

    # = DESCRIPTION
    # The IndividualPopulation class is used to connect the Individual and
    # Population classes. Should not be used directly.
    class IndividualPopulation < DBConnection
      belongs_to :individual
      belongs_to :population
    end

    # = DESCRIPTION
    # The Individual class gives information on the single individuals used
    # to retrieve one or more biological samples.
    class Individual < DBConnection
      belongs_to :sample
      # FIXME
    end

    # = DESCRIPTION
    # Mapping whose records each belong to a sample and to a variation.
    class IndividualGenotypeMultipleBp < DBConnection
      belongs_to :sample
      belongs_to :variation
    end

    # = DESCRIPTION
    # Mapping whose records each belong to a sample.
    class CompressedGenotypeSingleBp < DBConnection
      belongs_to :sample
    end

    # = DESCRIPTION
    # Mapping whose records each belong to a sample.
    class ReadCoverage < DBConnection
      belongs_to :sample
    end

    # = DESCRIPTION
    # Mapping whose records each belong to a sample.
    class Population < DBConnection
      belongs_to :sample
    end

    # = DESCRIPTION
    # Mapping without any association declared yet.
    class PopulationStructure < DBConnection
      # FIXME
    end

    # = DESCRIPTION
    # The PopulationGenotype class gives information about alleles and allele
    # frequencies for a SNP observed within a population or a group of samples.
    class PopulationGenotype < DBConnection
      set_primary_key 'population_genotype_id'

      belongs_to :variation
      belongs_to :population
    end

    # = DESCRIPTION
    # The SampleSynonym class represents information about alternative names
    # for sample entries.
    class SampleSynonym < DBConnection
      set_primary_key 'sample_synonym_id'

      belongs_to :source
      belongs_to :sample
      belongs_to :population
    end

    # = DESCRIPTION
    # The Source class gives information on the different databases and SNP
    # panels used to retrieve the data.
    class Source < DBConnection
      set_primary_key 'source_id'

      has_many :sample_synonyms
      has_many :allele_groups
      has_many :variations
      has_many :variation_groups
      has_many :httags
      has_many :variation_synonyms
    end

    # = DESCRIPTION
    # The VariationSynonym class gives information on alternative names used
    # for Variation entries.
    class VariationSynonym < DBConnection
      set_primary_key 'variation_synonym_id'

      belongs_to :variation
      belongs_to :source
    end

    # = DESCRIPTION
    # The VariationGroup class represents a group of variations (SNPs) that
    # are linked and present together.
    class VariationGroup < DBConnection
      set_primary_key 'variation_group_id'

      belongs_to :source
      has_one :variation_group_variation
      has_one :httag
      has_one :variation_group_feature
      has_one :allele_group
    end

    # = DESCRIPTION
    # The VariationGroupVariation class is a connection class.
    # Should not be used directly.
    class VariationGroupVariation < DBConnection
      belongs_to :variation
      belongs_to :variation_group
    end

    # = DESCRIPTION
    # The VariationGroupFeature class gives information on the genomic
    # position of each VariationGroup.
    class VariationGroupFeature < DBConnection
      set_primary_key 'variation_group_feature_id'

      belongs_to :variation_group
    end

    # = DESCRIPTION
    # The FlankingSequence class gives information about the genomic
    # coordinates of the flanking sequences, for a single VariationFeature.
    class FlankingSequence < DBConnection
      belongs_to :variation
    end

    # = DESCRIPTION
    # The TaggedVariationFeature class is a connection class.
    # Should not be used directly.
    class TaggedVariationFeature < DBConnection
      belongs_to :variation_feature
      belongs_to :sample
    end

    # = DESCRIPTION
    # Mapping whose records each belong to a variation group and to a source.
    class Httag < DBConnection
      set_primary_key 'httag_id'

      belongs_to :variation_group
      belongs_to :source
    end
  end
end