ruby-ensembl-api 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. data/TUTORIAL.rdoc +623 -0
  2. data/bin/ensembl +40 -0
  3. data/lib/ensembl.rb +64 -0
  4. data/lib/ensembl/core/activerecord.rb +1914 -0
  5. data/lib/ensembl/core/collection.rb +60 -0
  6. data/lib/ensembl/core/project.rb +264 -0
  7. data/lib/ensembl/core/slice.rb +693 -0
  8. data/lib/ensembl/core/transcript.rb +425 -0
  9. data/lib/ensembl/core/transform.rb +97 -0
  10. data/lib/ensembl/db_connection.rb +216 -0
  11. data/lib/ensembl/variation/activerecord.rb +253 -0
  12. data/lib/ensembl/variation/variation.rb +163 -0
  13. data/test/unit/data/seq_c6qbl.fa +10 -0
  14. data/test/unit/data/seq_cso19_coding.fa +16 -0
  15. data/test/unit/data/seq_cso19_transcript.fa +28 -0
  16. data/test/unit/data/seq_drd3_gene.fa +838 -0
  17. data/test/unit/data/seq_drd3_transcript.fa +22 -0
  18. data/test/unit/data/seq_drd4_transcript.fa +24 -0
  19. data/test/unit/data/seq_forward_composite.fa +1669 -0
  20. data/test/unit/data/seq_par_boundary.fa +169 -0
  21. data/test/unit/data/seq_rnd3_transcript.fa +47 -0
  22. data/test/unit/data/seq_ub2r1_coding.fa +13 -0
  23. data/test/unit/data/seq_ub2r1_gene.fa +174 -0
  24. data/test/unit/data/seq_ub2r1_transcript.fa +26 -0
  25. data/test/unit/data/seq_y.fa +2 -0
  26. data/test/unit/ensembl_genomes/test_collection.rb +51 -0
  27. data/test/unit/ensembl_genomes/test_gene.rb +52 -0
  28. data/test/unit/ensembl_genomes/test_slice.rb +71 -0
  29. data/test/unit/ensembl_genomes/test_variation.rb +17 -0
  30. data/test/unit/release_50/core/test_project.rb +215 -0
  31. data/test/unit/release_50/core/test_project_human.rb +58 -0
  32. data/test/unit/release_50/core/test_relationships.rb +66 -0
  33. data/test/unit/release_50/core/test_sequence.rb +175 -0
  34. data/test/unit/release_50/core/test_slice.rb +121 -0
  35. data/test/unit/release_50/core/test_transcript.rb +108 -0
  36. data/test/unit/release_50/core/test_transform.rb +223 -0
  37. data/test/unit/release_50/variation/test_activerecord.rb +143 -0
  38. data/test/unit/release_50/variation/test_variation.rb +84 -0
  39. data/test/unit/release_53/core/test_gene.rb +66 -0
  40. data/test/unit/release_53/core/test_project.rb +96 -0
  41. data/test/unit/release_53/core/test_project_human.rb +65 -0
  42. data/test/unit/release_53/core/test_slice.rb +47 -0
  43. data/test/unit/release_53/core/test_transform.rb +63 -0
  44. data/test/unit/release_53/variation/test_activerecord.rb +145 -0
  45. data/test/unit/release_53/variation/test_variation.rb +71 -0
  46. data/test/unit/release_56/core/test_gene.rb +66 -0
  47. data/test/unit/release_56/core/test_project.rb +96 -0
  48. data/test/unit/release_56/core/test_slice.rb +54 -0
  49. data/test/unit/release_56/core/test_transform.rb +63 -0
  50. data/test/unit/release_56/variation/test_activerecord.rb +142 -0
  51. data/test/unit/release_56/variation/test_variation.rb +68 -0
  52. data/test/unit/test_connection.rb +66 -0
  53. data/test/unit/test_releases.rb +136 -0
  54. metadata +128 -0
@@ -0,0 +1,425 @@
1
+ #
2
+ # = ensembl/core/transcript.rb - ActiveRecord mapping to Ensembl core for transcript
3
+ #
4
+ # Copyright:: Copyright (C) 2007 Jan Aerts <http://jandot.myopenid.com>
5
+ # License:: The Ruby License
6
+ #
7
+ nil
8
+ module Ensembl
9
+ nil
10
+ module Core
11
+ # = DESCRIPTION
12
+ # The Intron class describes an intron.
13
+ #
14
+ # This class does _not_ use ActiveRecord and is only defined within the API.
15
+ # There is no _introns_ table in the Ensembl database.
16
+ #
17
+ # This class includes the mixin Sliceable, which means that it is mapped
18
+ # to a SeqRegion object and a Slice can be created for objects of this
19
+ # class. See Sliceable and Slice for more information.
20
+ #
21
+ # = USAGE
22
+ # exon1 = Ensembl::Core::Exon.find(292811)
23
+ # exon2 = Ensembl::Core::Exon.find(292894)
24
+ # intron = Ensembl::Core::Intron.new(exon1,exon2)
25
+ # puts intron.to_yaml
26
+ #
27
+ # transcript = Ensembl::Core::Transcript.find(58972)
28
+ # puts transcript.introns.to_yaml
29
class Intron
  include Sliceable

  attr_accessor :seq_region, :seq_region_start, :seq_region_end, :seq_region_strand
  attr_accessor :previous_exon, :next_exon, :transcript

  # Builds the intron that lies between two exons.
  # ---
  # *Arguments*:
  # * exon_1:: one of the two exons flanking the intron (required)
  # * exon_2:: the other flanking exon (required)
  # *Raises*:: ArgumentError if the exons share no transcript, or if their
  #            ranks within the shared transcript do not differ by exactly one
  def initialize(exon_1, exon_2)
    # When the exons share several transcripts, the last one listed for
    # exon_1 is used.
    shared_transcript = exon_1.transcripts.select { |t| exon_2.transcripts.include?(t) }.last
    raise ArgumentError, "Arguments should be adjacent exons of same transcript" if shared_transcript.nil?

    rank_1 = ExonTranscript.find_by_transcript_id_and_exon_id(shared_transcript.id, exon_1.id).rank
    rank_2 = ExonTranscript.find_by_transcript_id_and_exon_id(shared_transcript.id, exon_2.id).rank
    # Adjacent means the ranks differ by exactly one. Testing for "> 1" also
    # accepted the same exon passed twice (difference 0), which produced an
    # intron whose end lies before its start.
    raise ArgumentError, "Arguments should be adjacent exons of same transcript" unless (rank_2 - rank_1).abs == 1

    # previous/next are assigned by genomic position, not by rank.
    @previous_exon, @next_exon = [exon_1, exon_2].sort_by { |e| e.seq_region_start }
    @transcript = shared_transcript
    @seq_region = @previous_exon.seq_region
    @seq_region_start = @previous_exon.seq_region_end + 1
    @seq_region_end = @next_exon.seq_region_start - 1
    @seq_region_strand = @previous_exon.seq_region_strand
  end
end
57
+
58
+ # = DESCRIPTION
59
+ # The Transcript class provides an interface to the transcript
60
+ # table. This table contains mappings of transcripts for a Gene to a
61
+ # SeqRegion.
62
+ #
63
+ # This class uses ActiveRecord to access data in the Ensembl database.
64
+ # See the general documentation of the Ensembl module for
65
+ # more information on what this means and what methods are available.
66
+ #
67
+ # This class includes the mixin Sliceable, which means that it is mapped
68
+ # to a SeqRegion object and a Slice can be created for objects of this
69
+ # class. See Sliceable and Slice for more information.
70
+ #
71
+ # = USAGE
72
+ # #TODO
73
+ class Transcript < DBConnection
74
+ include Sliceable
75
+
76
+ set_table_name 'transcript' # this model reads the 'transcript' table
77
+ set_primary_key 'transcript_id' # primary key column is transcript_id
78
+
79
+ belongs_to :gene
80
+ belongs_to :seq_region
81
+ has_one :transcript_stable_id # the stable_id method of this class reads its value through this association
82
+ has_many :transcript_attribs
83
+
84
+ has_many :exon_transcripts # join rows; the exons method of this class sorts them by rank
85
+ # has_many :exons, :through => :exon_transcripts -- left disabled; this class defines its own exons method instead
86
+
87
+ has_one :translation
88
+
89
+ has_many :object_xrefs, :foreign_key => 'ensembl_id', :conditions => "ensembl_object_type = 'Transcript'" # keyed on ensembl_id and limited to rows typed 'Transcript'
90
+ has_many :xrefs, :through => :object_xrefs
91
+
92
+ has_many :transcript_supporting_features
93
+ has_many :dna_align_features, :through => :transcript_supporting_features, :conditions => ["feature_type = 'dna_align_feature'"]
94
+ has_many :protein_align_features, :through => :transcript_supporting_features, :conditions => ["feature_type = 'protein_align_feature'"]
95
+
96
+ alias attribs transcript_attribs # shorter name for the transcript_attribs association reader
97
+
98
+ # The Transcript#exons method returns the exons for this transcript in
99
+ # the order of their ranks in the exon_transcript table.
100
+ # ---
101
+ # *Arguments*:: none
102
+ # *Returns*:: sorted array of Exon objects
103
def exons
  # Built once and cached in @exons: the exon_transcript rows are ordered by
  # their integer rank and each row is swapped for its exon.
  @exons ||= begin
    ranked_links = self.exon_transcripts(:include => [:exons])
    ranked_links.sort_by { |link| link.rank.to_i }.map { |link| link.exon }
  end
end
109
+
110
+ # The Transcript#introns method returns the introns for this transcript
111
+ # ---
112
+ # *Arguments*:: none
113
+ # *Returns*:: sorted array of Intron objects
114
def introns
  # One Intron per pair of consecutive exons, in exon order; transcripts
  # with fewer than two exons yield an empty array.
  #
  # The list is only cached once it has been built completely. Assigning
  # @introns before the loop meant that an exception raised by Intron.new
  # left a partial list cached, which every later call then returned.
  @introns ||= self.exons.each_cons(2).map do |upstream_exon, downstream_exon|
    Intron.new(upstream_exon, downstream_exon)
  end
end
126
+
127
+ # The Transcript#stable_id method returns the stable ID of the transcript.
128
+ # ---
129
+ # *Arguments*:: none
130
+ # *Returns*:: String
131
def stable_id
  # Read through the associated transcript_stable_id record.
  stable_id_record = transcript_stable_id
  stable_id_record.stable_id
end
134
+
135
+ # = DESCRIPTION
136
+ # The Transcript#display_label method returns the default name of the transcript.
137
def display_label
  # Look up the Xref named by display_xref_id and use its label.
  display_xref = Xref.find(display_xref_id)
  display_xref.display_label
end
# Alternative names for display_label.
alias display_name display_label
alias label display_label
alias name display_label
143
+
144
+ # = DESCRIPTION
145
+ # The Transcript#find_all_by_stable_id class method returns an array of
146
+ # transcripts with the given stable_id. If none were found, an empty
147
+ # array is returned.
148
def self.find_all_by_stable_id(stable_id)
  # Each matching stable-id row is resolved to its Transcript with a
  # separate find; no matches gives an empty array.
  stable_id_rows = Ensembl::Core::TranscriptStableId.find_all_by_stable_id(stable_id)
  stable_id_rows.map do |stable_id_row|
    Ensembl::Core::Transcript.find(stable_id_row.transcript_id)
  end
end
157
+
158
# = DESCRIPTION
# The Transcript#find_by_stable_id class method fetches a Transcript object based on
# its stable ID (i.e. the "ENST" accession number). If the stable ID is
# not found, it returns nil.
# ---
# *Arguments*:
# * stable_id:: stable ID of the transcript (required)
# *Returns*:: Transcript object or nil
def self.find_by_stable_id(stable_id)
  # This method used to be defined twice in a row. Ruby keeps only the last
  # definition, so the earlier one (built on find_all_by_stable_id) never
  # ran; this is the definition that was in effect.
  transcript_stable_id = TranscriptStableId.find_by_stable_id(stable_id)
  return nil if transcript_stable_id.nil?

  transcript_stable_id.transcript
end
182
+
183
+ # = DESCRIPTION
184
+ # The Transcript#seq method returns the full sequence of all concatenated
185
+ # exons.
186
def seq
  # Exon sequences are appended in exon order, starting from an empty
  # string; the result is cached in @seq.
  @seq ||= self.exons.inject('') { |joined, exon| joined + exon.seq }
end
195
+
196
+ # = DESCRIPTION
197
+ # The Transcript#cds_seq method returns the coding sequence of the transcript,
198
+ # i.e. the concatenated sequence of all exons minus the UTRs.
199
def cds_seq
  # The cDNA start/end values are 1-based and inclusive, hence the "- 1"
  # when indexing into the string and the "+ 1" in the length.
  cds_first = coding_region_cdna_start
  cds_last = coding_region_cdna_end
  seq[cds_first - 1, cds_last - cds_first + 1]
end
204
+
205
+ # = DESCRIPTION
206
+ # The Transcript#five_prime_utr_seq method returns the sequence of the
207
+ # 5'UTR of the transcript.
208
def five_prime_utr_seq
  # Everything before the first coding base (cDNA start is 1-based).
  utr_length = coding_region_cdna_start - 1
  seq[0, utr_length]
end
211
+
212
+ # = DESCRIPTION
213
+ # The Transcript#three_prime_utr_seq method returns the sequence of the
214
+ # 3'UTR of the transcript.
215
def three_prime_utr_seq
  # The 1-based position of the last coding base equals the 0-based index
  # of the first base after it.
  full_sequence = seq
  first_utr_index = coding_region_cdna_end
  full_sequence[first_utr_index..-1]
end
218
+
219
+ # = DESCRIPTION
220
+ # The Transcript#protein_seq method returns the sequence of the
221
+ # protein of the transcript.
222
def protein_seq
  # Wrap the coding sequence in a Bio::Sequence::NA, translate it and
  # return the seq of the result.
  coding_dna = Bio::Sequence::NA.new(cds_seq)
  coding_dna.translate.seq
end
225
+
226
+
227
+ # = DESCRIPTION
228
+ # The Transcript#coding_region_genomic_start returns the start position
229
+ # of the CDS in genomic coordinates. Note that, in contrast to
230
+ # Transcript#coding_region_cdna_start, the CDS start position is _always_
231
+ # ''left'' of the end position. So for transcripts on the reverse strand,
232
+ # the CDS start position is at the border of the 3'UTR instead of the
233
+ # 5'UTR.
234
def coding_region_genomic_start
  cds = self.translation
  # Strand 1: offset seq_start into the start exon, counted from its left
  # edge.
  if cds.start_exon.seq_region_strand == 1
    return cds.start_exon.seq_region_start + (cds.seq_start - 1)
  end

  # Any other strand value: offset seq_end into the end exon, counted back
  # from its right edge.
  cds.end_exon.seq_region_end - (cds.seq_end - 1)
end
242
+
243
+ # = DESCRIPTION
244
+ # The Transcript#coding_region_genomic_end returns the stop position
245
+ # of the CDS in genomic coordinates. Note that, in contrast to
246
+ # Transcript#coding_region_cdna_end, the CDS stop position is _always_
247
+ # ''right'' of the start position. So for transcripts on the reverse strand,
248
+ # the CDS stop position is at the border of the 5'UTR instead of the
249
+ # 3'UTR.
250
def coding_region_genomic_end
  cds = self.translation
  # Strand 1: offset seq_end into the end exon, counted from its left edge.
  if cds.start_exon.seq_region_strand == 1
    return cds.end_exon.seq_region_start + (cds.seq_end - 1)
  end

  # Any other strand value: offset seq_start into the start exon, counted
  # back from its right edge.
  cds.start_exon.seq_region_end - (cds.seq_start - 1)
end
258
+
259
+ # = DESCRIPTION
260
+ # The Transcript#coding_region_cdna_start returns the start position
261
+ # of the CDS in cDNA coordinates. Note that, in contrast to the
262
+ # Transcript#coding_region_genomic_start, the CDS start position is
263
+ # _always_ at the border of the 5'UTR. So for genes on the reverse
264
+ # strand, the CDS start position in genomic coordinates will be ''right''
265
+ # of the CDS stop position.
266
def coding_region_cdna_start
  # Returns the 1-based cDNA position of the first coding base: the summed
  # length of every exon before the translation's start exon, plus seq_start
  # within that exon. Returns nil when the start exon is not among this
  # transcript's exons (falling out of the loop used to return the exon
  # array itself).
  cds = self.translation
  start_exon = cds.start_exon # looked up once rather than per exon
  upstream_length = 0

  self.exons.each do |exon|
    return upstream_length + cds.seq_start if exon == start_exon

    upstream_length += exon.length
  end

  nil
end
279
+
280
+ # = DESCRIPTION
281
+ # The Transcript#coding_region_cdna_end returns the stop position
282
+ # of the CDS in cDNA coordinates. Note that, in contrast to the
283
+ # Transcript#coding_region_genomic_end, the CDS stop position is
284
+ # _always_ at the border of the 3'UTR. So for genes on the reverse
285
+ # strand, the CDS start position in genomic coordinates will be ''right''
286
+ # of the CDS stop position.
287
def coding_region_cdna_end
  # Returns the 1-based cDNA position of the last coding base: the summed
  # length of every exon before the translation's end exon, plus seq_end
  # within that exon. Returns nil when the end exon is not among this
  # transcript's exons (falling out of the loop used to return the exon
  # array itself).
  cds = self.translation
  end_exon = cds.end_exon # looked up once rather than per exon
  upstream_length = 0

  self.exons.each do |exon|
    return upstream_length + cds.seq_end if exon == end_exon

    upstream_length += exon.length
  end

  nil
end
299
+
300
+
301
+ # = DESCRIPTION
302
+ # The Transcript#exon_for_genomic_position identifies the exon that covers a given
303
+ # genomic position. Returns the exon object, or nil if in intron.
304
def exon_for_genomic_position(pos)
  # Returns the exon whose start..stop range contains pos, or nil when pos
  # falls between exons. Raises RuntimeError when pos lies outside the
  # transcript.
  #
  # The bounds are the transcript's own extent. They used to be the coding
  # region, which contradicted the error message, rejected positions in
  # exonic UTR sequence and needed a translation just to run the check.
  if pos < self.seq_region_start || pos > self.seq_region_end
    raise RuntimeError, "Position has to be within transcript"
  end

  self.exons.find { |exon| exon.start <= pos && exon.stop >= pos }
end
315
+
316
+ # = DESCRIPTION
317
+ # The Transcript#exon_for_cdna_position identifies the exon that covers a given
318
+ # position of the cDNA.
319
def exon_for_cdna_position(pos)
  # Walk the exons while summing their lengths; the first exon that pushes
  # the running total strictly above pos is the answer. If the total never
  # exceeds pos, the position lies beyond the cDNA.
  running_length = 0
  covering_exon = self.exons.find do |exon|
    running_length += exon.length
    running_length > pos
  end
  raise RuntimeError, "Position outside of cDNA scope" if covering_exon.nil?

  covering_exon
end
331
+
332
+ # = DESCRIPTION
333
+ # The Transcript#cdna2genomic method converts cDNA coordinates to
334
+ # genomic coordinates for this transcript.
335
+ # ---
336
+ # *Arguments*:
337
+ # * position:: position on the cDNA (required)
338
+ # *Returns*:: integer
339
def cdna2genomic(pos)
  # exon_for_cdna_position picks the exon holding pos; the summed length of
  # the exons before it is then subtracted from pos and the remainder is
  # added to that exon's start.
  target_exon = self.exon_for_cdna_position(pos)

  preceding_length = 0
  self.exons.each do |exon|
    return exon.start + (pos - preceding_length) if exon == target_exon

    preceding_length += exon.length
  end
end
354
+
355
+ # = DESCRIPTION
356
+ # The Transcript#cds2genomic method converts CDS coordinates to
357
+ # genomic coordinates for this transcript.
358
+ # ---
359
+ # *Arguments*:
360
+ # * pos:: position on the CDS (required)
361
+ # *Returns*::
362
def cds2genomic(pos)
  # Shift the CDS position by the cDNA start of the coding region, then
  # convert the resulting cDNA position.
  cdna_position = pos + coding_region_cdna_start
  cdna2genomic(cdna_position)
end
365
+
366
+ # = DESCRIPTION
367
+ # The Transcript#pep2genomic method converts peptide coordinates to
368
+ # genomic coordinates for this transcript.
369
+ # ---
370
+ # *Arguments*:
371
+ # * pos:: position on the peptide (required)
372
+ # *Returns*::
373
+ def pep2genomic(pos) # pos is accepted but never read
374
+ raise NotImplementedError # placeholder: always raises, no conversion is performed
375
+ end
376
+
377
+ # = DESCRIPTION
378
+ # The Transcript#genomic2cdna method converts genomic coordinates to
379
+ # cDNA coordinates for this transcript.
380
+ # ---
381
+ # *Arguments*:
382
+ # * pos:: position on the chromosome (required)
383
+ # *Returns*::
384
def genomic2cdna(pos)
  # Returns the summed length of the exons before the one covering pos,
  # plus the offset of pos from that exon's start.
  #
  # Raises RuntimeError when no exon covers pos (exon_for_genomic_position
  # returned nil, e.g. for an intronic position). This used to be written
  # as `return RuntimeError, "..."`, which handed callers a two-element
  # array instead of raising.
  exon_with_target = self.exon_for_genomic_position(pos)

  preceding_length = 0
  self.exons.each do |exon|
    return preceding_length + (pos - exon.start) if exon == exon_with_target

    preceding_length += exon.length
  end

  raise RuntimeError, "Position outside of cDNA scope"
end
400
+
401
+ # = DESCRIPTION
402
+ # The Transcript#genomic2cds method converts genomic coordinates to
403
+ # CDS coordinates for this transcript.
404
+ # ---
405
+ # *Arguments*:
406
+ # * pos:: position on the chromosome (required)
407
+ # *Returns*::
408
def genomic2cds(pos)
  # Convert to a cDNA position first, then subtract the cDNA start of the
  # coding region.
  cdna_position = genomic2cdna(pos)
  cdna_position - coding_region_cdna_start
end
411
+
412
+ # = DESCRIPTION
413
+ # The Transcript#genomic2pep method converts genomic coordinates to
414
+ # peptide coordinates for this transcript.
415
+ # ---
416
+ # *Arguments*:
417
+ # * pos:: position on the chromosome (required)
418
+ # *Returns*::
419
+ def genomic2pep(pos) # pos is accepted but never read
420
+ raise NotImplementedError # placeholder: always raises, no conversion is performed
421
+ end
422
+
423
+ end
424
+ end
425
+ end
@@ -0,0 +1,97 @@
1
+ #
2
+ # = bio/api/ensembl/core/transform.rb - transform positions for Ensembl Slice
3
+ #
4
+ # Copyright:: Copyright (C) 2007 Jan Aerts <http://jandot.myopenid.com>
5
+ # License:: The Ruby License
6
+ #
7
+ nil
8
+ module Ensembl
9
+ nil
10
+ module Core
11
+ nil
12
+ module Sliceable
13
+ # = DESCRIPTION
14
+ # The #transform method is used to transfer coordinates for a feature
15
+ # from one coordinate system to another. It basically creates a clone of
16
+ # the original feature and changes the seq_region, start position, stop
17
+ # position and strand.
18
+ #
19
+ # Suppose you have a feature on a
20
+ # contig in human (let's say on contig AC000031.6.1.38703) and you
21
+ # want to know the coordinates on the chromosome. This is a
22
+ # transformation of coordinates from a higher ranked coordinate system to
23
+ # a lower ranked coordinate system. Transformations can also be done
24
+ # from a chromosome to the contig level.
25
+ #
26
+ # In contrast to the #project method of Sliceables, the
27
+ # coordinates of a feature can only be transformed to the target
28
+ # coordinate system if there is no ambiguity to which SeqRegion.
29
+ #
30
+ # For example, gene A can be transferred from the chromosome system to
31
+ # the clone coordinate system, whereas gene B can not.
32
+ #
33
+ # gene A gene B
34
+ # |---<=====>--------------------<=====>----------------| chromosome
35
+ #
36
+ # |-----------| |-------| |---------| clones
37
+ # |-----------| |-------| |--------|
38
+ #
39
+ # gene_a.transform('clone') --> gene
40
+ # gene_b.transform('clone') --> nil
41
+ #
42
+ # At the moment, transformations can only be done if the two coordinate
43
+ # systems are linked directly in the 'assembly' table.
44
+ #
45
+ # = USAGE
46
+ #
47
+ # # Get a gene in cow and transform to scaffold level
48
+ # # (i.e. going from a high rank coord system to a lower rank coord
49
+ # # system)
50
+ # # Cow scaffold Chr4.10 lies on Chr4 from 8030345 to 10087277 on the
51
+ # # reverse strand
52
+ # source_gene = Gene.find(2408)
53
+ # target_gene = source_gene.transform('scaffold')
54
+ # puts source_gene.seq_region.name #--> 4
55
+ # puts source_gene.seq_region_start #--> 8104409
56
+ # puts source_gene.seq_region_end #--> 8496477
57
+ # puts source_gene.seq_region_strand #--> -1
58
+ # puts target_gene.seq_region.name #--> Chr4.003.10
59
+ # puts target_gene.seq_region_start #--> 1590800
60
+ # puts target_gene.seq_region_end #--> 1982868
61
+ # puts target_gene.seq_region_strand #--> 1
62
+ #
63
+ # ---
64
+ # *Arguments*:
65
+ # * coord_system_name:: name of coordinate system to transform the
66
+ # coordinates to
67
+ # *Returns*:: nil or an object of the same class as self
68
def transform(coord_system_name)
  #-
  # Implemented on top of Slice#project: the feature's slice is projected
  # onto the target coordinate system, and the transformation succeeds only
  # if that gives exactly one target slice.
  #+
  source_slice = self.slice

  # Already in the requested coordinate system: self is returned, not a copy.
  return self if source_slice.seq_region.coord_system.name == coord_system_name

  target_slices = source_slice.project(coord_system_name)
  # More than one target slice is ambiguous; none at all means there is
  # nothing to map onto. Only the first case used to be guarded, so an empty
  # projection crashed with NoMethodError on target_slices[0].
  return nil unless target_slices.length == 1

  target = target_slices[0]
  transformed = self.clone
  transformed.seq_region_id = target.seq_region.id
  transformed.seq_region_start = target.start
  transformed.seq_region_end = target.stop
  # The target slice's strand is multiplied by the feature's own strand.
  transformed.seq_region_strand = target.strand * self.strand

  transformed
end
95
+ end
96
+ end
97
+ end