ruby-ensembl-api 0.9.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. data/TUTORIAL.rdoc +623 -0
  2. data/bin/ensembl +40 -0
  3. data/lib/ensembl.rb +64 -0
  4. data/lib/ensembl/core/activerecord.rb +1914 -0
  5. data/lib/ensembl/core/collection.rb +60 -0
  6. data/lib/ensembl/core/project.rb +264 -0
  7. data/lib/ensembl/core/slice.rb +693 -0
  8. data/lib/ensembl/core/transcript.rb +425 -0
  9. data/lib/ensembl/core/transform.rb +97 -0
  10. data/lib/ensembl/db_connection.rb +216 -0
  11. data/lib/ensembl/variation/activerecord.rb +253 -0
  12. data/lib/ensembl/variation/variation.rb +163 -0
  13. data/test/unit/data/seq_c6qbl.fa +10 -0
  14. data/test/unit/data/seq_cso19_coding.fa +16 -0
  15. data/test/unit/data/seq_cso19_transcript.fa +28 -0
  16. data/test/unit/data/seq_drd3_gene.fa +838 -0
  17. data/test/unit/data/seq_drd3_transcript.fa +22 -0
  18. data/test/unit/data/seq_drd4_transcript.fa +24 -0
  19. data/test/unit/data/seq_forward_composite.fa +1669 -0
  20. data/test/unit/data/seq_par_boundary.fa +169 -0
  21. data/test/unit/data/seq_rnd3_transcript.fa +47 -0
  22. data/test/unit/data/seq_ub2r1_coding.fa +13 -0
  23. data/test/unit/data/seq_ub2r1_gene.fa +174 -0
  24. data/test/unit/data/seq_ub2r1_transcript.fa +26 -0
  25. data/test/unit/data/seq_y.fa +2 -0
  26. data/test/unit/ensembl_genomes/test_collection.rb +51 -0
  27. data/test/unit/ensembl_genomes/test_gene.rb +52 -0
  28. data/test/unit/ensembl_genomes/test_slice.rb +71 -0
  29. data/test/unit/ensembl_genomes/test_variation.rb +17 -0
  30. data/test/unit/release_50/core/test_project.rb +215 -0
  31. data/test/unit/release_50/core/test_project_human.rb +58 -0
  32. data/test/unit/release_50/core/test_relationships.rb +66 -0
  33. data/test/unit/release_50/core/test_sequence.rb +175 -0
  34. data/test/unit/release_50/core/test_slice.rb +121 -0
  35. data/test/unit/release_50/core/test_transcript.rb +108 -0
  36. data/test/unit/release_50/core/test_transform.rb +223 -0
  37. data/test/unit/release_50/variation/test_activerecord.rb +143 -0
  38. data/test/unit/release_50/variation/test_variation.rb +84 -0
  39. data/test/unit/release_53/core/test_gene.rb +66 -0
  40. data/test/unit/release_53/core/test_project.rb +96 -0
  41. data/test/unit/release_53/core/test_project_human.rb +65 -0
  42. data/test/unit/release_53/core/test_slice.rb +47 -0
  43. data/test/unit/release_53/core/test_transform.rb +63 -0
  44. data/test/unit/release_53/variation/test_activerecord.rb +145 -0
  45. data/test/unit/release_53/variation/test_variation.rb +71 -0
  46. data/test/unit/release_56/core/test_gene.rb +66 -0
  47. data/test/unit/release_56/core/test_project.rb +96 -0
  48. data/test/unit/release_56/core/test_slice.rb +54 -0
  49. data/test/unit/release_56/core/test_transform.rb +63 -0
  50. data/test/unit/release_56/variation/test_activerecord.rb +142 -0
  51. data/test/unit/release_56/variation/test_variation.rb +68 -0
  52. data/test/unit/test_connection.rb +66 -0
  53. data/test/unit/test_releases.rb +136 -0
  54. metadata +128 -0
@@ -0,0 +1,425 @@
1
+ #
2
+ # = ensembl/core/transcript.rb - ActiveRecord mapping to Ensembl core for transcript
3
+ #
4
+ # Copyright:: Copyright (C) 2007 Jan Aerts <http://jandot.myopenid.com>
5
+ # License:: The Ruby License
6
+ #
7
+ nil
8
+ module Ensembl
9
+ nil
10
+ module Core
11
+ # = DESCRIPTION
12
+ # The Intron class describes an intron.
13
+ #
14
+ # This class does _not_ use ActiveRecord and is only defined within the API.
15
+ # There is no _introns_ table in the Ensembl database.
16
+ #
17
+ # This class includes the mixin Sliceable, which means that it is mapped
18
+ # to a SeqRegion object and a Slice can be created for objects of this
19
+ # class. See Sliceable and Slice for more information.
20
+ #
21
+ # = USAGE
22
+ # exon1 = Ensembl::Core::Exon.find(292811)
23
+ # exon2 = Ensembl::Core::Exon.find(292894)
24
+ # intron = Ensembl::Core::Intron.new(exon1,exon2)
25
+ # puts intron.to_yaml
26
+ #
27
+ # transcript = Ensembl::Core::Transcript.find(58972)
28
+ # puts transcript.introns.to_yaml
29
class Intron
  include Sliceable
  attr_accessor :seq_region, :seq_region_start, :seq_region_end, :seq_region_strand
  attr_accessor :previous_exon, :next_exon, :transcript

  # = DESCRIPTION
  # Builds the intron that lies between two neighbouring exons of one
  # transcript.
  # ---
  # *Arguments*:
  # * exon_1:: first exon (required)
  # * exon_2:: second exon (required); the two may be given in either order
  # *Returns*:: Intron object
  # *Raises*:: ArgumentError if the exons share no transcript, or if their
  #            ranks within the shared transcript do not differ by exactly one
  def initialize(exon_1, exon_2)
    # Look for a transcript containing both exons. When several transcripts
    # are shared, the last one listed for exon_1 is used.
    candidates = exon_2.transcripts
    shared_transcript = exon_1.transcripts.select { |t| candidates.include?(t) }.last
    raise ArgumentError, "Arguments should be adjacent exons of same transcript" if shared_transcript.nil?

    rank_1 = ExonTranscript.find_by_transcript_id_and_exon_id(shared_transcript.id, exon_1.id).rank
    rank_2 = ExonTranscript.find_by_transcript_id_and_exon_id(shared_transcript.id, exon_2.id).rank
    # Adjacent means the ranks differ by exactly one. A difference of zero
    # (the same exon passed twice) must be rejected as well, otherwise an
    # "intron" with end < start would be created.
    raise ArgumentError, "Arguments should be adjacent exons of same transcript" unless (rank_2 - rank_1).abs == 1

    # Order the exons by genomic start so the intron coordinates are the gap
    # between the end of the left exon and the start of the right exon.
    @previous_exon, @next_exon = [exon_1, exon_2].sort_by { |e| e.seq_region_start }
    @transcript = shared_transcript
    @seq_region = @previous_exon.seq_region
    @seq_region_start = @previous_exon.seq_region_end + 1
    @seq_region_end = @next_exon.seq_region_start - 1
    @seq_region_strand = @previous_exon.seq_region_strand
  end
end
57
+
58
+ # = DESCRIPTION
59
+ # The Transcript class provides an interface to the transcript
60
+ # table. This table contains mappings of transcripts for a Gene to a
61
+ # SeqRegion.
62
+ #
63
+ # This class uses ActiveRecord to access data in the Ensembl database.
64
+ # See the general documentation of the Ensembl module for
65
+ # more information on what this means and what methods are available.
66
+ #
67
+ # This class includes the mixin Sliceable, which means that it is mapped
68
+ # to a SeqRegion object and a Slice can be created for objects of this
69
+ # class. See Sliceable and Slice for more information.
70
+ #
71
+ # = USAGE
72
+ # #TODO
73
+ class Transcript < DBConnection
74
+ include Sliceable
75
+
76
+ set_table_name 'transcript'
77
+ set_primary_key 'transcript_id'
78
+
79
+ belongs_to :gene
80
+ belongs_to :seq_region
81
+ has_one :transcript_stable_id
82
+ has_many :transcript_attribs
83
+
84
+ has_many :exon_transcripts
85
+ # has_many :exons, :through => :exon_transcripts
86
+
87
+ has_one :translation
88
+
89
+ has_many :object_xrefs, :foreign_key => 'ensembl_id', :conditions => "ensembl_object_type = 'Transcript'"
90
+ has_many :xrefs, :through => :object_xrefs
91
+
92
+ has_many :transcript_supporting_features
93
+ has_many :dna_align_features, :through => :transcript_supporting_features, :conditions => ["feature_type = 'dna_align_feature'"]
94
+ has_many :protein_align_features, :through => :transcript_supporting_features, :conditions => ["feature_type = 'protein_align_feature'"]
95
+
96
+ alias attribs transcript_attribs
97
+
98
# The Transcript#exons method returns the exons of this transcript, ordered
# by their rank in the exon_transcript table. The list is built on the first
# call and cached in @exons.
# ---
# *Arguments*:: none
# *Returns*:: sorted array of Exon objects
def exons
  @exons ||= begin
    ranked_links = self.exon_transcripts(:include => [:exons]).sort_by { |link| link.rank.to_i }
    ranked_links.map { |link| link.exon }
  end
end
109
+
110
# The Transcript#introns method returns the introns for this transcript,
# one for every pair of consecutive exons (in the order given by
# Transcript#exons). The list is built on the first call and cached.
# ---
# *Arguments*:: none
# *Returns*:: array of Intron objects (empty if the transcript has fewer
#             than two exons)
def introns
  if @introns.nil?
    # Collect into a local first: if Intron.new raises half-way through,
    # nothing is cached and the next call starts again instead of silently
    # returning a truncated list.
    built = []
    self.exons.each_cons(2) do |upstream_exon, downstream_exon|
      built.push(Intron.new(upstream_exon, downstream_exon))
    end
    @introns = built
  end
  @introns
end
126
+
127
# The Transcript#stable_id method returns the stable ID of the transcript,
# read from the associated transcript_stable_id record.
# ---
# *Arguments*:: none
# *Returns*:: String
def stable_id
  stable_id_record = self.transcript_stable_id
  stable_id_record.stable_id
end
134
+
135
# = DESCRIPTION
# The Transcript#display_label method returns the default name of the
# transcript: the display_label of the Xref referenced by display_xref_id.
# Also available as #display_name, #label and #name.
#
# NOTE(review): Xref.find is called even when display_xref_id is nil;
# confirm how transcripts without a display xref should behave.
def display_label
  display_xref = Xref.find(self.display_xref_id)
  display_xref.display_label
end
alias :display_name :display_label
alias :label :display_label
alias :name :display_label
143
+
144
# = DESCRIPTION
# The Transcript#find_all_by_stable_id class method returns an array with
# the transcript belonging to every TranscriptStableId record that has the
# given stable_id. If none were found, an empty array is returned.
def self.find_all_by_stable_id(stable_id)
  matching_rows = Ensembl::Core::TranscriptStableId.find_all_by_stable_id(stable_id)
  matching_rows.map { |row| Ensembl::Core::Transcript.find(row.transcript_id) }
end
157
+
158
# = DESCRIPTION
# The Transcript#find_by_stable_id class method fetches a Transcript object
# based on its stable ID (i.e. the "ENST" accession number). If the stable
# ID is not found, it returns nil.
#
# This method used to be defined twice in a row; only the second definition
# ever took effect, so that is the one kept here.
def self.find_by_stable_id(stable_id)
  stable_id_record = TranscriptStableId.find_by_stable_id(stable_id)
  return nil if stable_id_record.nil?

  stable_id_record.transcript
end
182
+
183
# = DESCRIPTION
# The Transcript#seq method returns the full sequence of all concatenated
# exons, in the order given by Transcript#exons. The result is cached.
# ---
# *Arguments*:: none
# *Returns*:: String (empty if the transcript has no exons)
def seq
  if @seq.nil?
    # Assemble in a local buffer and cache only once complete: if fetching
    # an exon sequence raises, no truncated sequence is left behind in @seq.
    buffer = ''
    self.exons.each do |exon|
      buffer << exon.seq
    end
    @seq = buffer
  end
  @seq
end
195
+
196
# = DESCRIPTION
# The Transcript#cds_seq method returns the coding sequence of the
# transcript, i.e. the part of Transcript#seq that runs from
# coding_region_cdna_start up to and including coding_region_cdna_end
# (both treated as 1-based positions).
def cds_seq
  first_base = self.coding_region_cdna_start
  last_base = self.coding_region_cdna_end
  self.seq[first_base - 1, last_base - first_base + 1]
end
204
+
205
# = DESCRIPTION
# The Transcript#five_prime_utr_seq method returns the sequence of the
# 5'UTR of the transcript: everything in Transcript#seq that precedes
# coding_region_cdna_start.
def five_prime_utr_seq
  utr_length = self.coding_region_cdna_start - 1
  self.seq[0, utr_length]
end
211
+
212
# = DESCRIPTION
# The Transcript#three_prime_utr_seq method returns the sequence of the
# 3'UTR of the transcript: everything in Transcript#seq that follows
# coding_region_cdna_end.
def three_prime_utr_seq
  cds_end = self.coding_region_cdna_end
  self.seq[cds_end..-1]
end
218
+
219
# = DESCRIPTION
# The Transcript#protein_seq method returns the sequence of the protein of
# the transcript, obtained by wrapping Transcript#cds_seq in a
# Bio::Sequence::NA and translating it.
def protein_seq
  coding_dna = Bio::Sequence::NA.new(self.cds_seq)
  coding_dna.translate.seq
end
225
+
226
+
227
# = DESCRIPTION
# The Transcript#coding_region_genomic_start returns the start position
# of the CDS in genomic coordinates. Note that, in contrast to
# Transcript#coding_region_cdna_start, the CDS start position is _always_
# ''left'' of the end position. So for transcripts on the reverse strand,
# the CDS start position is at the border of the 3'UTR instead of the
# 5'UTR.
#
# The strand is taken from the start exon of the translation.
def coding_region_genomic_start
  cds = self.translation
  on_forward_strand = (cds.start_exon.seq_region_strand == 1)
  return cds.start_exon.seq_region_start + (cds.seq_start - 1) if on_forward_strand

  cds.end_exon.seq_region_end - (cds.seq_end - 1)
end
242
+
243
# = DESCRIPTION
# The Transcript#coding_region_genomic_end returns the stop position
# of the CDS in genomic coordinates. Note that, in contrast to
# Transcript#coding_region_cdna_end, the CDS stop position is _always_
# ''right'' of the start position. So for transcripts on the reverse strand,
# the CDS stop position is at the border of the 5'UTR instead of the
# 3'UTR.
#
# The strand is taken from the start exon of the translation.
def coding_region_genomic_end
  cds = self.translation
  on_forward_strand = (cds.start_exon.seq_region_strand == 1)
  return cds.end_exon.seq_region_start + (cds.seq_end - 1) if on_forward_strand

  cds.start_exon.seq_region_end - (cds.seq_start - 1)
end
258
+
259
# = DESCRIPTION
# The Transcript#coding_region_cdna_start returns the start position
# of the CDS in cDNA coordinates: the combined length of all exons that
# precede the translation's start exon (in the order of Transcript#exons)
# plus the translation's seq_start within that exon. Strand is not
# consulted, so, in contrast to Transcript#coding_region_genomic_start,
# the value always marks the border with the 5'UTR.
# ---
# *Arguments*:: none
# *Returns*:: integer, or nil if the start exon of the translation is not
#             among the exons of this transcript
def coding_region_cdna_start
  preceding_length = 0

  self.exons.each do |exon|
    return preceding_length + self.translation.seq_start if exon == self.translation.start_exon

    preceding_length += exon.length
  end

  # Start exon not found. Return nil explicitly; without this the method
  # would return the value of #each, i.e. the exon array itself.
  nil
end
279
+
280
# = DESCRIPTION
# The Transcript#coding_region_cdna_end returns the stop position
# of the CDS in cDNA coordinates: the combined length of all exons that
# precede the translation's end exon (in the order of Transcript#exons)
# plus the translation's seq_end within that exon. Strand is not
# consulted, so, in contrast to Transcript#coding_region_genomic_end,
# the value always marks the border with the 3'UTR.
# ---
# *Arguments*:: none
# *Returns*:: integer, or nil if the end exon of the translation is not
#             among the exons of this transcript
def coding_region_cdna_end
  preceding_length = 0

  self.exons.each do |exon|
    return preceding_length + self.translation.seq_end if exon == self.translation.end_exon

    preceding_length += exon.length
  end

  # End exon not found. Return nil explicitly; without this the method
  # would return the value of #each, i.e. the exon array itself.
  nil
end
299
+
300
+
301
# = DESCRIPTION
# The Transcript#exon_for_genomic_position identifies the exon that covers
# a given genomic position. Returns the exon object, or nil if the position
# falls in an intron.
#
# NOTE(review): the range check uses coding_region_genomic_start/end, so
# positions in UTR exons raise even though the message speaks of the
# transcript as a whole; confirm which bounds are intended.
# ---
# *Arguments*:
# * pos:: position on the chromosome (required)
# *Returns*:: Exon object or nil
# *Raises*:: RuntimeError if pos lies outside the coding region
def exon_for_genomic_position(pos)
  unless pos >= coding_region_genomic_start && pos <= coding_region_genomic_end
    raise RuntimeError, "Position has to be within transcript"
  end

  self.exons.find { |exon| exon.start <= pos && exon.stop >= pos }
end
315
+
316
# = DESCRIPTION
# The Transcript#exon_for_cdna_position identifies the exon that covers a
# given position of the cDNA: the first exon (in the order of
# Transcript#exons) at which the accumulated exon length exceeds pos.
# ---
# *Arguments*:
# * pos:: position on the cDNA (required)
# *Returns*:: Exon object
# *Raises*:: RuntimeError if pos is not smaller than the total exon length
def exon_for_cdna_position(pos)
  # FIXME: Still have to check for when pos is outside of scope of cDNA.
  length_so_far = 0

  covering_exon = self.exons.find do |exon|
    length_so_far += exon.length
    length_so_far > pos
  end
  raise RuntimeError, "Position outside of cDNA scope" if covering_exon.nil?

  covering_exon
end
331
+
332
# = DESCRIPTION
# The Transcript#cdna2genomic method converts cDNA coordinates to
# genomic coordinates for this transcript. The exon covering the position
# is looked up with Transcript#exon_for_cdna_position; the result is that
# exon's start plus the offset of pos beyond the exons preceding it.
#
# NOTE(review): the offset is always added to exon.start and the strand is
# not consulted; confirm this is correct for reverse-strand transcripts.
# ---
# *Arguments*:
# * position:: position on the cDNA (required)
# *Returns*:: integer
def cdna2genomic(pos)
  #FIXME: Still have to check for when pos is outside of scope of cDNA.
  target_exon = self.exon_for_cdna_position(pos)

  preceding_length = 0
  self.exons.each do |exon|
    return exon.start + (pos - preceding_length) if exon == target_exon

    preceding_length += exon.length
  end
end
354
+
355
# = DESCRIPTION
# The Transcript#cds2genomic method converts CDS coordinates to
# genomic coordinates for this transcript, by shifting pos by
# coding_region_cdna_start and handing the result to
# Transcript#cdna2genomic.
#
# NOTE(review): coding_region_cdna_start is used as a 1-based value by
# Transcript#cds_seq; confirm the offset applied here matches the
# coordinate convention callers expect.
# ---
# *Arguments*:
# * pos:: position on the CDS (required)
# *Returns*:: whatever Transcript#cdna2genomic returns for the shifted position
def cds2genomic(pos)
  cdna_position = pos + self.coding_region_cdna_start
  self.cdna2genomic(cdna_position)
end
365
+
366
# = DESCRIPTION
# The Transcript#pep2genomic method is meant to convert peptide coordinates
# to genomic coordinates for this transcript. The conversion has not been
# written yet.
# ---
# *Arguments*:
# * pos:: position on the peptide (required)
# *Returns*:: nothing; always raises NotImplementedError
def pep2genomic(pos)
  raise(NotImplementedError)
end
376
+
377
# = DESCRIPTION
# The Transcript#genomic2cdna method converts genomic coordinates to
# cDNA coordinates for this transcript. The exon covering the position is
# looked up with Transcript#exon_for_genomic_position; the result is the
# combined length of the exons preceding it plus the offset of pos from
# that exon's start.
# ---
# *Arguments*:
# * pos:: position on the chromosome (required)
# *Returns*:: integer
# *Raises*:: RuntimeError if no exon of the transcript covers pos (e.g. the
#            position lies in an intron), in addition to whatever
#            Transcript#exon_for_genomic_position raises
def genomic2cdna(pos)
  #FIXME: Still have to check for when pos is outside of scope of cDNA.
  # Identify the exon we're looking at.
  target_exon = self.exon_for_genomic_position(pos)

  preceding_length = 0
  self.exons.each do |exon|
    return preceding_length + (pos - exon.start) if exon == target_exon

    preceding_length += exon.length
  end

  # This has to be raised, not returned: "return RuntimeError, msg" hands
  # the caller a two-element array instead of signalling the error.
  raise RuntimeError, "Position outside of cDNA scope"
end
400
+
401
# = DESCRIPTION
# The Transcript#genomic2cds method converts genomic coordinates to
# CDS coordinates for this transcript: the cDNA position given by
# Transcript#genomic2cdna minus coding_region_cdna_start.
# ---
# *Arguments*:
# * pos:: position on the chromosome (required)
# *Returns*:: integer
def genomic2cds(pos)
  cdna_position = self.genomic2cdna(pos)
  cdna_position - self.coding_region_cdna_start
end
411
+
412
# = DESCRIPTION
# The Transcript#genomic2pep method is meant to convert genomic coordinates
# to peptide coordinates for this transcript. The conversion has not been
# written yet.
# ---
# *Arguments*:
# * pos:: position on the chromosome (required)
# *Returns*:: nothing; always raises NotImplementedError
def genomic2pep(pos)
  raise(NotImplementedError)
end
422
+
423
+ end
424
+ end
425
+ end
@@ -0,0 +1,97 @@
1
+ #
2
+ # = ensembl/core/transform.rb - transform positions for Ensembl Slice
3
+ #
4
+ # Copyright:: Copyright (C) 2007 Jan Aerts <http://jandot.myopenid.com>
5
+ # License:: The Ruby License
6
+ #
7
+ nil
8
+ module Ensembl
9
+ nil
10
+ module Core
11
+ nil
12
+ module Sliceable
13
+ # = DESCRIPTION
14
+ # The #transform method is used to transfer coordinates for a feature
15
+ # from one coordinate system to another. It basically creates a clone of
16
+ # the original feature and changes the seq_region, start position, stop
17
+ # position and strand.
18
+ #
19
+ # Suppose you have a feature on a
20
+ # contig in human (let's say on contig AC000031.6.1.38703) and you
21
+ # want to know the coordinates on the chromosome. This is a
22
+ # transformation of coordinates from a higher ranked coordinate system to
23
+ # a lower ranked coordinate system. Transformations can also be done
24
+ # from a chromosome to the contig level.
25
+ #
26
+ # In contrast to the #project method of Sliceables, the
27
+ # coordinates of a feature can only be transformed to the target
28
+ # coordinate system if there is no ambiguity to which SeqRegion.
29
+ #
30
+ # For example, gene A can be transferred from the chromosome system to
31
+ # the clone coordinate system, whereas gene B can not.
32
+ #
33
+ # gene A gene B
34
+ # |---<=====>--------------------<=====>----------------| chromosome
35
+ #
36
+ # |-----------| |-------| |---------| clones
37
+ # |-----------| |-------| |--------|
38
+ #
39
+ # gene_a.transform('clone') --> gene
40
+ # gene_b.transform('clone') --> nil
41
+ #
42
+ # At the moment, transformations can only be done if the two coordinate
43
+ # systems are linked directly in the 'assembly' table.
44
+ #
45
+ # = USAGE
46
+ #
47
+ # # Get a gene in cow and transform to scaffold level
48
+ # # (i.e. going from a high rank coord system to a lower rank coord
49
+ # # system)
50
+ # # Cow scaffold Chr4.10 lies on Chr4 from 8030345 to 10087277 on the
51
+ # # reverse strand
52
+ # source_gene = Gene.find(2408)
53
+ # target_gene = source_gene.transform('scaffold')
54
+ # puts source_gene.seq_region.name #--> 4
55
+ # puts source_gene.seq_region_start #--> 8104409
56
+ # puts source_gene.seq_region_end #--> 8496477
57
+ # puts source_gene.seq_region_strand #--> -1
58
+ # puts target_gene.seq_region.name #--> Chr4.003.10
59
+ # puts target_gene.seq_region_start #--> 1590800
60
+ # puts target_gene.seq_region_end #--> 1982868
61
+ # puts target_gene.seq_region_strand #--> 1
62
+ #
63
+ # ---
64
+ # *Arguments*:
65
+ # * coord_system_name:: name of coordinate system to transform
66
+ # coordinates to
67
+ # *Returns*:: nil or an object of the same class as self
68
def transform(coord_system_name)
  #-
  # There are two things I can do:
  # (1) just use project
  # (2) avoid doing all the calculations in project if the source slice
  # covers multiple target slices, and _then_ go for project.
  # Let's go for nr 1 for the moment and optimize later.
  #+

  source_slice = self.slice

  # Already on the requested coordinate system: hand back the feature itself.
  return self if source_slice.seq_region.coord_system.name == coord_system_name

  target_slices = source_slice.project(coord_system_name)

  # Only an unambiguous projection onto exactly one target slice can be
  # transformed. Several slices mean ambiguity; an empty projection means
  # there is nothing to map onto (and target_slices[0] would be nil).
  return nil unless target_slices.length == 1

  target = target_slices[0]
  transformed = self.clone
  transformed.seq_region_id = target.seq_region.id
  transformed.seq_region_start = target.start
  transformed.seq_region_end = target.stop

  # Strand of the copy is the target slice's strand multiplied by the
  # feature's own strand.
  transformed.seq_region_strand = target.strand * self.strand

  transformed
end
95
+ end
96
+ end
97
+ end