jandot-ruby-ensembl-api 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. data/TUTORIAL +623 -0
  2. data/bin/ensembl +39 -0
  3. data/lib/ensembl/core/activerecord.rb +1847 -0
  4. data/lib/ensembl/core/project.rb +248 -0
  5. data/lib/ensembl/core/slice.rb +627 -0
  6. data/lib/ensembl/core/transcript.rb +425 -0
  7. data/lib/ensembl/core/transform.rb +97 -0
  8. data/lib/ensembl/db_connection.rb +148 -0
  9. data/lib/ensembl/variation/activerecord.rb +308 -0
  10. data/lib/ensembl.rb +23 -0
  11. data/samples/examples_perl_tutorial.rb +120 -0
  12. data/samples/small_example_ruby_api.rb +34 -0
  13. data/test/unit/release_45/core/run_tests.rb +12 -0
  14. data/test/unit/release_45/core/test_project.rb +235 -0
  15. data/test/unit/release_45/core/test_project_human.rb +58 -0
  16. data/test/unit/release_45/core/test_relationships.rb +61 -0
  17. data/test/unit/release_45/core/test_sequence.rb +175 -0
  18. data/test/unit/release_45/core/test_slice.rb +56 -0
  19. data/test/unit/release_45/core/test_transcript.rb +94 -0
  20. data/test/unit/release_45/core/test_transform.rb +223 -0
  21. data/test/unit/release_45/variation/test_activerecord.rb +32 -0
  22. data/test/unit/release_50/core/run_tests.rb +12 -0
  23. data/test/unit/release_50/core/test_project.rb +215 -0
  24. data/test/unit/release_50/core/test_project_human.rb +58 -0
  25. data/test/unit/release_50/core/test_relationships.rb +66 -0
  26. data/test/unit/release_50/core/test_sequence.rb +175 -0
  27. data/test/unit/release_50/core/test_slice.rb +121 -0
  28. data/test/unit/release_50/core/test_transcript.rb +108 -0
  29. data/test/unit/release_50/core/test_transform.rb +223 -0
  30. data/test/unit/release_50/variation/test_activerecord.rb +136 -0
  31. data/test/unit/test_connection.rb +58 -0
  32. data/test/unit/test_releases.rb +40 -0
  33. metadata +243 -0
@@ -0,0 +1,248 @@
1
+ #
2
+ # = ensembl/core/project.rb - project calculations for Ensembl Slice
3
+ #
4
+ # Copyright:: Copyright (C) 2007 Jan Aerts <http://jandot.myopenid.com>
5
+ # License:: The Ruby License
6
+ #
7
+ module Ensembl
8
+ module Core
9
+ class Slice
10
+ # = DESCRIPTION
11
+ # The Slice#project method is used to transfer coordinates from one
12
+ # coordinate system to another. Suppose you have a slice on a
13
+ # contig in human (let's say on contig AC000031.6.1.38703) and you
14
+ # want to know the coordinates on the chromosome. This is a
15
+ # projection of coordinates from a higher ranked coordinate system to
16
+ # a lower ranked coordinate system. Projections can also be done
17
+ # from a chromosome to the contig level. However, it might be possible
18
+ # that more than one contig has to be included and that there exist
19
+ # gaps between the contigs. The output of this method therefore is
20
+ # an _array_ of Slice and Gap objects.
21
+ #
22
+ # At the moment, projections can only be done if the two coordinate
23
+ # systems are linked directly in the 'assembly' table.
24
+ #
25
+ # = USAGE
26
+ #
27
+ # # Get a contig slice in cow and project to scaffold level
28
+ # # (i.e. going from a high rank coord system to a lower rank coord
29
+ # # system)
30
+ # source_slice = Slice.fetch_by_region('contig', 'AAFC03020247', 42, 2007)
31
+ # target_slices = source_slice.project('scaffold')
32
+ # puts target_slices.length #--> 1
33
+ # puts target_slices[0].display_name #--> scaffold:ChrUn.003.3522:6570:8535:1
34
+ #
35
+ # # Get a chromosome slice in cow and project to scaffold level
36
+ # # (i.e. going from a low rank coord system to a higher rank coord
37
+ # # system)
38
+ # # The region 96652152..98000000 on BTA4 is covered by 2 scaffolds
39
+ # # that are separated by a gap.
40
+ # source_slice = Slice.fetch_by_region('chromosome','4', 96652152, 98000000)
41
+ # target_slices = source_slice.project('scaffold')
42
+ # puts target_slices.length #--> 3
43
+ # first_bit, second_bit, third_bit = target_slices
44
+ # puts first_bit.display_name #--> scaffold:Btau_3.1:Chr4.003.105:42:599579:1
45
+ # puts second_bit.class #--> Gap
46
+ # puts third_bit.display_name #--> scaffold:Btau_3.1:Chr4.003.106:1:738311:1
47
+ #
48
+ # ---
49
+ # *Arguments*:
50
+ # * coord_system_name:: name of coordinate system to project
51
+ # coordinates to
52
+ # *Returns*:: an array consisting of Slices and, if necessary, Gaps
53
def project(coord_system_name)
  # Projects this slice onto the coordinate system called coord_system_name
  # ('toplevel' and 'seqlevel' are accepted as aliases) and returns an array
  # of Slice objects, interspersed with Gap objects where the target
  # coordinate system does not cover the source.
  # Raises a RuntimeError if no coordinate system has the given name, and a
  # NotImplementedError if the slice overlaps a haplotype ('HAP') exception.
  answer = Array.new # an array of slices
  source_coord_system = self.seq_region.coord_system

  target_coord_system = nil
  if coord_system_name == 'toplevel'
    target_coord_system = CoordSystem.find_toplevel
    coord_system_name = target_coord_system.name
  elsif coord_system_name == 'seqlevel'
    target_coord_system = CoordSystem.find_seqlevel
    coord_system_name = target_coord_system.name
  else
    target_coord_system = CoordSystem.find_by_name(coord_system_name)
    if target_coord_system.nil?
      # Fail with a readable message instead of a NoMethodError on nil.rank
      raise "Couldn't find a Ensembl::Core::CoordSystem object with name '" + coord_system_name.to_s + "'"
    end
  end

  if target_coord_system.rank < source_coord_system.rank
    # We're going from component to assembly, which is easy: every assembly
    # link of this seq_region yields one slice, shifted by the offset between
    # the assembly and component start positions.
    # NOTE(review): the same offset is used whatever assembly_link.ori is;
    # confirm that this is right for components in reverse orientation.
    assembly_links = self.seq_region.assembly_links_as_component(coord_system_name)
    assembly_links.each do |assembly_link|
      offset = assembly_link.asm_start - assembly_link.cmp_start
      target_strand = self.strand * assembly_link.ori # 1x1=>1, 1x-1=>-1, -1x-1=>1
      answer.push(Slice.new(assembly_link.asm_seq_region, self.start + offset, self.stop + offset, target_strand))
    end
    return answer
  end

  # From here on we're going from assembly to component: the answer is an
  # array consisting of Slices intermitted with Gaps.

  # ASSEMBLY_EXCEPTIONS
  # CAUTION: there are exceptions to the assembly (stored in the
  # assembly_exception table) which make things a little bit more difficult.
  # For example, in human, the assembly data for the pseudo-autosomal region
  # (PAR) of Y is *not* stored in the assembly table. Instead, there is a
  # record in the assembly_exception table that says: "For chr Y positions 1
  # to 2709520, use chr X:1-2709520 for the assembly data."
  # As a solution, the slice is split up in blocks: the parts that are not
  # covered by an exception stay on this seq_region, the parts that are
  # covered are replaced by a slice on the seq_region the exception points
  # to. All blocks are then projected independently.

  # Returns the slice on the exception's target seq_region that corresponds
  # to positions from..to on this slice's seq_region.
  slice_on_exception = lambda do |exception, from, to|
    Slice.fetch_by_region(
      self.seq_region.coord_system.name,
      SeqRegion.find(exception.exc_seq_region_id).name,
      exception.exc_seq_region_start + (from - exception.seq_region_start),
      exception.exc_seq_region_start + (to - exception.seq_region_start),
      self.strand * exception.ori
    )
  end

  assembly_exceptions = AssemblyException.find_all_by_seq_region_id(self.seq_region.id)
  if assembly_exceptions.length > 0
    # Check if this bit of the original slice is covered in the
    # assembly_exception table.
    overlapping_exceptions = Array.new
    assembly_exceptions.each do |ae|
      if Slice.new(self.seq_region, ae.seq_region_start, ae.seq_region_end).overlaps?(self)
        if ae.exc_type == 'HAP'
          raise NotImplementedError, "The haplotype exceptions are not implemented (yet). You can't project this slice."
        end
        overlapping_exceptions.push(ae)
      end
    end

    if overlapping_exceptions.length > 0
      # The blocks that stay on this seq_region (e.g. chromosome Y), in
      # ascending order.
      # NOTE(review): if the exceptions cover the whole slice, excise returns
      # an empty array and first_block below is nil; that case is not handled.
      excised_ranges = overlapping_exceptions.collect{|e| (e.seq_region_start .. e.seq_region_end)}
      source_assembly_blocks = self.excise(excised_ranges).sort_by{|b| b.start}

      # Blocks of both seq_regions (e.g. chromosomes X and Y), in order
      all_assembly_blocks = Array.new

      # First do all exceptions between the first and last block
      previous_block = nil
      source_assembly_blocks.each do |b|
        unless previous_block.nil?
          # The exception that fills the hole right before this block
          exception = assembly_exceptions.detect{|ae| ae.seq_region_end == b.start - 1}
          # The hole runs from the base after the previous block up to the
          # base before this block (both inclusive).
          all_assembly_blocks.push(slice_on_exception.call(exception, previous_block.stop + 1, b.start - 1))
        end
        all_assembly_blocks.push(b)
        previous_block = b
      end

      # And then see if we have to add an additional one at the start or end
      first_block = source_assembly_blocks[0]
      if first_block.start > self.start
        exception = assembly_exceptions.sort_by{|ae| ae.seq_region_start}[0]
        all_assembly_blocks.unshift(slice_on_exception.call(exception, self.start, first_block.start - 1))
      end

      last_block = source_assembly_blocks[-1]
      if last_block.stop < self.stop
        exception = assembly_exceptions.sort_by{|ae| ae.seq_region_start}[-1]
        # Append (not Array#shift, which removes elements and only accepts
        # an Integer argument).
        all_assembly_blocks.push(slice_on_exception.call(exception, last_block.stop + 1, self.stop))
      end

      answer = Array.new
      all_assembly_blocks.each do |b|
        answer.concat(b.project(coord_system_name))
      end
      return answer
    end
  end
  # END OF ASSEMBLY_EXCEPTIONS

  # Get all AssemblyLinks starting from this assembly and for which
  # the cmp_seq_region.coord_system is what we want.
  assembly_links = self.seq_region.assembly_links_as_assembly(coord_system_name)

  # Now reject all the components that lie _before_ the source, then
  # reject all the components that lie _after_ the source.
  # Then sort based on their positions.
  sorted_overlapping_assembly_links = assembly_links.reject{|al| al.asm_end < self.start}.reject{|al| al.asm_start > self.stop}.sort_by{|al| al.asm_start}
  if sorted_overlapping_assembly_links.length == 0
    return []
  end

  # Create slices for all the underlying components, including the first and
  # the last one. At first, the first and last components are added in their
  # entirety; they are only cropped afterwards.
  previous_link = nil
  sorted_overlapping_assembly_links.each do |this_link|
    # If there is a gap with the previous link: add a gap
    if !previous_link.nil? && this_link.asm_start > (previous_link.asm_end + 1)
      gap_size = this_link.asm_start - previous_link.asm_end - 1
      answer.push(Gap.new(CoordSystem.find_by_name(coord_system_name), gap_size))
    end

    # And add the component itself as a Slice
    answer.push(Slice.new(this_link.cmp_seq_region, this_link.cmp_start, this_link.cmp_end, this_link.ori))
    previous_link = this_link
  end

  # Now see if we have to crop the first and/or last slice. For a component
  # in reverse orientation the assembly start corresponds to the component
  # end, so the other side of the component slice is cropped.
  first_link = sorted_overlapping_assembly_links[0]
  if self.start > first_link.asm_start
    if first_link.ori == -1
      answer[0].stop = first_link.cmp_start + ( first_link.asm_end - self.start )
    else
      answer[0].start = first_link.cmp_start + ( self.start - first_link.asm_start )
    end
  end

  last_link = sorted_overlapping_assembly_links[-1]
  if self.stop < last_link.asm_end
    if last_link.ori == -1
      answer[-1].start = last_link.cmp_start + ( last_link.asm_end - self.stop )
    else
      answer[-1].stop = last_link.cmp_start + ( self.stop - last_link.asm_start )
    end
  end

  # And check if we have to add Ns (a Gap) at the front and/or back
  if self.start < first_link.asm_start
    gap_size = first_link.asm_start - self.start
    answer.unshift(Gap.new(CoordSystem.find_by_name(coord_system_name), gap_size))
  end
  if self.stop > last_link.asm_end
    gap_size = self.stop - last_link.asm_end
    answer.push(Gap.new(CoordSystem.find_by_name(coord_system_name), gap_size))
  end

  return answer
end
246
+ end
247
+ end
248
+ end
@@ -0,0 +1,627 @@
1
+ #
2
+ # = ensembl/core/slice.rb - Slice object for Ensembl core
3
+ #
4
+ # Copyright:: Copyright (C) 2007 Jan Aerts <http://jandot.myopenid.com>
5
+ # License:: The Ruby License
6
+ #
7
+ nil
8
+ module Ensembl
9
+ nil
10
+ module Core
11
+ # = DESCRIPTION
12
+ # From the perl API tutorial
13
+ # (http://www.ensembl.org/info/software/core/core_tutorial.html): "A
14
+ # Slice object represents a continuous region of a genome. Slices can be
15
+ # used to obtain sequence, features or other information from a
16
+ # particular region of interest."
17
+ #
18
+ # In contrast to almost all other classes of Ensembl::Core,
19
+ # the Slice class is not based on ActiveRecord.
20
+ #
21
+ # = USAGE
22
+ # chr4 = SeqRegion.find_by_name('4')
23
+ # my_slice = Slice.new(chr4, 95000, 98000, -1)
24
+ # puts my_slice.display_name #--> 'chromosome:Btau_3.1:4:95000:98000:-1'
25
+ class Slice
26
+ attr_accessor :seq_region, :start, :stop, :strand, :seq
27
+
28
+ #################
29
+ ## CREATE A SLICE
30
+ #################
31
+
32
+ # = DESCRIPTION
33
+ # Create a new Slice object from scratch.
34
+ #
35
+ # = USAGE
36
+ # chr4 = SeqRegion.find_by_name('4')
37
+ # my_slice = Slice.new(chr4, 95000, 98000, -1)
38
+ # ---
39
+ # *Arguments*:
40
+ # * seq_region: SeqRegion object
41
+ # * start: start position of the Slice on the SeqRegion (default = 1)
42
+ # * stop: stop position of the Slice on the SeqRegion (default: end of
43
+ # SeqRegion)
44
+ # * strand: strand of the Slice relative to the SeqRegion (default = 1)
45
+ # *Returns*:: Slice object
46
def initialize(seq_region, start = 1, stop = seq_region.length, strand = 1)
  # An explicit nil (as forwarded by Slice.fetch_by_region) means "use the
  # default": position 1 for the start, the SeqRegion length for the stop.
  start = 1 if start.nil?
  stop = seq_region.length if stop.nil?

  # Only objects whose class is exactly Ensembl::Core::SeqRegion are accepted.
  raise 'First argument has to be a Ensembl::Core::SeqRegion object' unless seq_region.class == Ensembl::Core::SeqRegion

  @seq_region = seq_region
  @start = start
  @stop = stop
  @strand = strand
  @seq = nil # filled in by Slice#seq on first access
end
59
+
60
+ # = DESCRIPTION
61
+ # Create a Slice without first creating the SeqRegion object.
62
+ #
63
+ # = USAGE
64
+ # my_slice_1 = Slice.fetch_by_region('chromosome','4',95000,98000,1)
65
+ #
66
+ # ---
67
+ # *Arguments*:
68
+ # * coord_system: name of CoordSystem (required)
69
+ # * seq_region: name of SeqRegion (required)
70
+ # * start: start of Slice on SeqRegion (default = 1)
71
+ # * stop: stop of Slice on SeqRegion (default = end of SeqRegion)
72
+ # * strand: strand of Slice on SeqRegion
73
+ # *Returns*:: Ensembl::Core::Slice object
74
def self.fetch_by_region(coord_system_name, seq_region_name, start = nil, stop = nil, strand = 1, version = nil)
  candidates = Ensembl::Core::CoordSystem.find_all_by_name(coord_system_name)
  coord_system =
    if version.nil?
      # No version requested: use the candidate whose version sorts last
      candidates.sort_by{|cs| cs.version}.last
    else
      candidates.select{|cs| cs.version == version}.first
    end

  unless coord_system.class == Ensembl::Core::CoordSystem
    message = "Couldn't find a Ensembl::Core::CoordSystem object with name '" + coord_system_name + "'"
    message += " and version '" + version + "'" unless version.nil?
    raise message
  end

  seq_region = Ensembl::Core::SeqRegion.find_by_name_and_coord_system_id(seq_region_name, coord_system.id)
  unless seq_region.class == Ensembl::Core::SeqRegion
    raise "Couldn't find a Ensembl::Core::SeqRegion object with the name '" + seq_region_name + "'"
  end

  # nil start/stop are turned into the defaults by Slice#initialize
  Ensembl::Core::Slice.new(seq_region, start, stop, strand)
end
98
+
99
+ # = DESCRIPTION
100
+ # Create a Slice based on a Gene
101
+ #
102
+ # = USAGE
103
+ # my_slice = Slice.fetch_by_gene_stable_id('ENSG00000184895')
104
+ #
105
+ # ---
106
+ # *Arguments*:
107
+ # * gene_stable_id: Ensembl gene stable_id (required)
108
+ # *Returns*:: Ensembl::Core::Slice object
109
def self.fetch_by_gene_stable_id(gene_stable_id, flanking_seq_length = 0)
  # flanking_seq_length: number of positions by which the slice is extended
  # on both sides of the gene (default = 0).
  # Raises a RuntimeError if no GeneStableId record has the given stable_id.
  stable_id_record = Ensembl::Core::GeneStableId.find_by_stable_id(gene_stable_id)
  if stable_id_record.nil?
    raise "Couldn't find a Ensembl::Core::GeneStableId object with stable_id '" + gene_stable_id.to_s + "'"
  end
  gene = stable_id_record.gene

  return Ensembl::Core::Slice.new(gene.seq_region, gene.seq_region_start - flanking_seq_length, gene.seq_region_end + flanking_seq_length, gene.seq_region_strand)
end
116
+
117
+ # = DESCRIPTION
118
+ # Create a Slice based on a Transcript
119
+ #
120
+ # = USAGE
121
+ # my_slice = Slice.fetch_by_transcript_stable_id('ENST00000383673')
122
+ #
123
+ # ---
124
+ # *Arguments*:
125
+ # * transcript_stable_id: Ensembl transcript stable_id (required)
126
+ # *Returns*:: Ensembl::Core::Slice object
127
def self.fetch_by_transcript_stable_id(transcript_stable_id, flanking_seq_length = 0)
  # flanking_seq_length: number of positions by which the slice is extended
  # on both sides of the transcript (default = 0).
  # Raises a RuntimeError if no TranscriptStableId record has the given
  # stable_id.
  stable_id_record = Ensembl::Core::TranscriptStableId.find_by_stable_id(transcript_stable_id)
  if stable_id_record.nil?
    raise "Couldn't find a Ensembl::Core::TranscriptStableId object with stable_id '" + transcript_stable_id.to_s + "'"
  end
  transcript = stable_id_record.transcript

  return Ensembl::Core::Slice.new(transcript.seq_region, transcript.seq_region_start - flanking_seq_length, transcript.seq_region_end + flanking_seq_length, transcript.seq_region_strand)
end
134
+
135
+ # = DESCRIPTION
136
+ # Create an array of all Slices for a given coordinate system.
137
+ #
138
+ # = USAGE
139
+ # slices = Slice.fetch_all('chromosome')
140
+ #
141
+ # ---
142
+ # *Arguments*:
143
+ # * coord_system_name:: name of coordinate system (default = chromosome)
144
+ # * version:: version of coordinate system (default = nil)
145
+ # *Returns*:: an array of Ensembl::Core::Slice objects
146
def self.fetch_all(coord_system_name = 'chromosome', version = nil)
  # Look the coordinate system up by name only, or by name and version
  coord_system =
    if version.nil?
      Ensembl::Core::CoordSystem.find_by_name(coord_system_name)
    else
      Ensembl::Core::CoordSystem.find_by_name_and_version(coord_system_name, version)
    end

  # One full-length Slice for every SeqRegion of that coordinate system
  return coord_system.seq_regions.collect{|seq_region| Ensembl::Core::Slice.new(seq_region)}
end
160
+
161
+ ##################
162
+ ## GENERAL METHODS
163
+ ##################
164
+
165
+ # = DESCRIPTION
166
+ # Get the length of a slice
167
+ #
168
+ # = USAGE
169
+ # chr4 = SeqRegion.find_by_name('4')
170
+ # my_slice = Slice.new(chr4, 95000, 98000, -1)
171
+ # puts my_slice.length
172
+ # ---
173
+ # *Arguments*:: none
174
+ # *Returns*:: Integer
175
def length
  # Both ends are inclusive, hence the + 1
  span = self.stop - self.start
  span + 1
end
178
+
179
+ # = DESCRIPTION
180
+ # The display_name method returns a full name of this slice, containing
181
+ # the name of the coordinate system, the sequence region, start and
182
+ # stop positions on that sequence region and the strand. E.g. for a slice
183
+ # of bovine chromosome 4 from position 95000 to 98000 on the reverse strand,
184
+ # the display_name would look like: chromosome:Btau_3.1:4:95000:98000:-1
185
+ #
186
+ # = USAGE
187
+ # puts my_slice.display_name
188
+ # ---
189
+ # *Arguments*:: none
190
+ # *Result*:: String
191
def display_name
  # Field order: coordinate system name, coordinate system version,
  # seq_region name, start, stop, strand -- separated by colons.
  coord_system = self.seq_region.coord_system
  fields = [coord_system.name, coord_system.version, self.seq_region.name]
  fields += [self.start.to_s, self.stop.to_s, self.strand.to_s]
  fields.join(':')
end
alias to_s display_name
195
+
196
+ # = DESCRIPTION
197
+ # The Slice#overlaps? method checks if this slice overlaps another one.
198
+ # The other slice has to be on the same coordinate system
199
+ #
200
+ # = USAGE
201
+ # slice_a = Slice.fetch_by_region('chromosome','X',1,1000)
202
+ # slice_b = Slice.fetch_by_region('chromosome','X',900,1500)
203
+ # if slice_a.overlaps?(slice_b)
204
+ # puts "These slices overlap"
205
+ # end
206
+ # ---
207
+ # *Arguments*:: another slice
208
+ # *Returns*:: true or false
209
def overlaps?(other_slice)
  # Raises a RuntimeError if the argument is not a Slice or lies on another
  # coordinate system. Note that only the coordinate systems are compared,
  # not the seq_regions themselves.
  # (The check used to read "! other_slice.class == Slice", which negates the
  # class first and therefore never fired.)
  unless other_slice.is_a?(Slice)
    raise RuntimeError, "The Slice#overlaps? method takes a Slice object as its arguments."
  end
  if self.seq_region.coord_system != other_slice.seq_region.coord_system
    raise RuntimeError, "The argument slice of Slice#overlaps? has to be in the same coordinate system, but were " + self.seq_region.coord_system.name + " and " + other_slice.seq_region.coord_system.name
  end

  self_range = self.start .. self.stop
  other_range = other_slice.start .. other_slice.stop

  # Two closed intervals overlap if either one contains the start of the other
  return (self_range.include?(other_slice.start) || other_range.include?(self.start))
end
226
+
227
+ # = DESCRIPTION
228
+ # The Slice#within? method checks if this slice is contained within another one.
229
+ # The other slice has to be on the same coordinate system
230
+ #
231
+ # = USAGE
232
+ # slice_a = Slice.fetch_by_region('chromosome','X',1,1000)
233
+ # slice_b = Slice.fetch_by_region('chromosome','X',900,950)
234
+ # if slice_b.within?(slice_a)
235
+ # puts "Slice b is within slice a"
236
+ # end
237
+ # ---
238
+ # *Arguments*:: another slice
239
+ # *Returns*:: true or false
240
def within?(other_slice)
  # Raises a RuntimeError if the argument is not a Slice or lies on another
  # coordinate system. Note that only the coordinate systems are compared,
  # not the seq_regions themselves.
  # (The check used to read "! other_slice.class == Slice", which negates the
  # class first and therefore never fired.)
  unless other_slice.is_a?(Slice)
    raise RuntimeError, "The Slice#within? method takes a Slice object as its arguments."
  end
  if self.seq_region.coord_system != other_slice.seq_region.coord_system
    raise RuntimeError, "The argument slice of Slice#within? has to be in the same coordinate system, but were " + self.seq_region.coord_system.name + " and " + other_slice.seq_region.coord_system.name
  end

  other_range = other_slice.start .. other_slice.stop

  # This slice is within the other one if both of its ends are
  return (other_range.include?(self.start) && other_range.include?(self.stop))
end
257
+
258
+ # = DESCRIPTION
259
+ # The Slice#excise method removes a bit of a slice and returns the
260
+ # remainder as separate slices.
261
+ #
262
+ # = USAGE
263
+ # original_slice = Slice.fetch_by_region('chromosome','X',1,10000)
264
+ # new_slices = original_slice.excise([500..750, 1050..1075])
265
+ # new_slices.each do |s|
266
+ # puts s.display_name
267
+ # end
268
+ #
269
+ # # result:
270
+ # # chromosome:X:1:499:1
271
+ # # chromosome:X:751:1049:1
272
+ # # chromosome:X:1076:10000:1
273
+ # ---
274
+ # *Arguments*:
275
+ # * ranges: array of ranges (required)
276
+ # *Returns*:: array of Slice objects
277
def excise(ranges)
  # Raises a RuntimeError unless the argument is an Array of Range objects.
  # The ranges may be given in any order, may overlap each other and may
  # stick out of the slice; the returned slices always stay within
  # start..stop of this slice, are in ascending order and are never empty.
  # The returned slices are created with the default strand.
  if ranges.class != Array
    raise RuntimeError, "Argument should be an array of ranges"
  end
  ranges.each do |r|
    if r.class != Range
      raise RuntimeError, "Argument should be an array of ranges"
    end
  end

  answer = Array.new
  next_start = self.start # first position that is neither emitted nor excised yet
  ranges.sort_by{|r| r.first}.each do |r|
    break if next_start > self.stop # the rest of the slice is gone already

    # Keep what lies before this range, but never beyond the slice itself
    kept_stop = [r.first - 1, self.stop].min
    if next_start <= kept_stop
      answer.push(Slice.new(self.seq_region, next_start, kept_stop))
    end

    # Never move backwards: a range may end before the slice starts or
    # before a previous (overlapping) range ended.
    next_start = [next_start, r.last + 1].max
  end

  # Whatever is left after the last range
  if next_start <= self.stop
    answer.push(Slice.new(self.seq_region, next_start, self.stop))
  end
  return answer
end
303
+
304
+ # = DESCRIPTION
305
+ # Get the sequence of the Slice as a Bio::Sequence::NA object.
306
+ #
307
+ # If the Slice is on a CoordSystem that is not seq_level, it will try
308
+ # to project its coordinates to the CoordSystem that is. At this
309
+ # moment, this is only done if there is a direct link between the
310
+ # two coordinate systems. (The perl API allows for following an
311
+ # indirect link as well.)
312
+ #
313
+ # Caution: Bio::Sequence::NA makes the sequence
314
+ # downcase!!
315
+ #
316
+ # = USAGE
317
+ # my_slice.seq.seq.to_s
318
+ #
319
+ # ---
320
+ # *Arguments*:: none
321
+ # *Returns*:: Bio::Sequence::NA object
322
def seq
  # The sequence is cached in @seq: a second call returns the stored
  # Bio::Sequence::NA object without fetching anything again.
  return @seq unless @seq.nil?

  if self.seq_region.coord_system.seqlevel?
    # The sequence is stored for this coordinate system: read it directly
    @seq = Bio::Sequence::NA.new(self.seq_region.subseq(self.start, self.stop))
  else
    # Otherwise project to the seqlevel coordinate system and stitch the
    # pieces together; every Gap contributes a run of Ns of its own length.
    seq_string = String.new
    @target_slices = self.project('seqlevel')
    @target_slices.each do |component|
      piece = nil
      if component.class == Slice
        piece = component.seq # recursive call, this time on a seqlevel slice
      else
        piece = 'N' * (component.length)
      end
      seq_string += piece
    end
    @seq = Bio::Sequence::NA.new(seq_string)
  end

  # Slices on the reverse strand get the reverse complement
  @seq.reverse_complement! if self.strand == -1

  return @seq
end
356
+
357
# Placeholder: no repeat-masked sequence is available yet, so calling this
# method always raises NotImplementedError.
def repeatmasked_seq
  raise(NotImplementedError)
end
360
+
361
+ # = DESCRIPTION
362
+ # Take a sub_slice from an existing one.
363
+ #
364
+ # = USAGE
365
+ # my_sub_slice = my_slice.sub_slice(400,500)
366
+ #
367
+ # ---
368
+ # *Arguments*:
369
+ # * start: start of subslice on the SeqRegion (default: start of slice)
370
+ # * stop: stop of subslice on the SeqRegion (default: stop of slice)
371
+ # *Returns*:: Ensembl::Core::Slice object
372
def sub_slice(start = self.start, stop = self.stop)
  # The new slice lives on the same seq_region and strand as this one and is
  # of the same class; start and stop are passed on unchanged.
  slice_class = self.class
  slice_class.new(self.seq_region, start, stop, self.strand)
end
375
+
376
+ # = DESCRIPTION
377
+ # Creates overlapping subslices for a given Slice.
378
+ #
379
+ # = USAGE
380
+ # my_slice.split(50000, 250).each do |sub_slice|
381
+ # puts sub_slice.display_name
382
+ # end
383
+ #
384
+ # ---
385
+ # *Arguments*:
386
+ # * max_size: maximal size of subslices (default: 100000)
387
+ # * overlap: overlap in bp between consecutive subslices (default: 0)
388
+ # *Returns*:: array of Ensembl::Core::Slice objects
389
def split(max_size = 100000, overlap = 0)
  # Walks over the slice from start to stop and cuts it into sub-slices of
  # at most max_size positions; consecutive sub-slices share overlap
  # positions. The last sub-slice ends exactly at the stop of this slice.
  # Raises an ArgumentError if max_size is not positive, if overlap is
  # negative or if overlap is not smaller than max_size (the walk would
  # never advance).
  stride = max_size - overlap
  if max_size < 1 || overlap < 0 || stride < 1
    raise ArgumentError, "max_size has to be positive and larger than overlap, and overlap can't be negative"
  end

  sub_slices = Array.new
  sub_start = self.start
  loop do
    # Never run past the end of the slice
    sub_stop = [sub_start + max_size - 1, self.stop].min
    sub_slices.push(self.sub_slice(sub_start, sub_stop))
    break if sub_stop >= self.stop
    sub_start += stride
  end
  return sub_slices
end
399
+
400
+ ############################
401
+ ## GET ELEMENTS WITHIN SLICE
402
+ ############################
403
+
404
+ #--
405
+ # As there should be 'getters' for a lot of classes, we'll implement
406
+ # this with method_missing. For some of the original methods, see the end
407
+ # of this file.
408
+ #
409
+ # The optional argument is either 'true' or 'false' (default = false).
410
+ # False if the features have to be completely contained within the slice;
411
+ # true if just a partly overlap is sufficient.
412
+ #++
413
+ # Don't use this method yourself.
414
+ def method_missing(method_name, *args)
415
+ table_name = method_name.to_s.singularize
416
+ class_name = table_name.camelcase
417
+
418
+ # Convert to the class object
419
+ target_class = nil
420
+ ObjectSpace.each_object(Class) do |o|
421
+ if o.name =~ /^Ensembl::Core::#{class_name}$/
422
+ target_class = o
423
+ end
424
+ end
425
+
426
+ # If it exists, see if it implements Sliceable
427
+ if ! target_class.nil? and target_class.include?(Sliceable)
428
+ inclusive = false
429
+ if [TrueClass, FalseClass].include?(args[0].class)
430
+ inclusive = args[0]
431
+ end
432
+ return self.get_objects(target_class, table_name, inclusive)
433
+ end
434
+
435
+ raise NoMethodError
436
+
437
+ end
438
+
439
+ # Don't use this method yourself.
440
def get_objects(target_class, table_name, inclusive = false)
  # Builds the query that selects the rows of table_name located on the
  # given slice: with inclusive set, every feature that overlaps the slice,
  # otherwise only the features that lie completely within it.
  feature_query = lambda do |slice|
    if inclusive
      <<SQL
SELECT * FROM #{table_name}
WHERE seq_region_id = #{slice.seq_region.id.to_s}
AND (( seq_region_start BETWEEN #{slice.start.to_s} AND #{slice.stop.to_s} )
OR ( seq_region_end BETWEEN #{slice.start.to_s} AND #{slice.stop.to_s} )
OR ( seq_region_start <= #{slice.start.to_s} AND seq_region_end >= #{slice.stop.to_s} )
)
SQL
    else
      <<SQL
SELECT * FROM #{table_name}
WHERE seq_region_id = #{slice.seq_region.id.to_s}
AND seq_region_start >= #{slice.start.to_s}
AND seq_region_end <= #{slice.stop.to_s}
SQL
    end
  end

  answer = Array.new

  # Get all the coord_systems with this type of features on them
  coord_system_ids_with_features = MetaCoord.find_all_by_table_name(table_name).collect{|mc| mc.coord_system_id}

  # Features annotated on the coordinate system of the slice itself are
  # fetched directly; that coordinate system is then taken off the list.
  own_coord_system_id = self.seq_region.coord_system_id
  if coord_system_ids_with_features.include?(own_coord_system_id)
    answer.push(target_class.find_by_sql(feature_query.call(self)))
    coord_system_ids_with_features.delete(own_coord_system_id)
  end

  # For the remaining coordinate systems, project the slice and collect the
  # features of every resulting Slice. Only 'direct' projections are
  # possible at the moment (see Slice#project).
  coord_system_ids_with_features.each do |target_coord_system_id|
    target_slices = self.project(CoordSystem.find(target_coord_system_id).name)
    target_slices.each do |slice|
      next unless slice.class == Slice # anything else (a Gap) carries no features
      answer.push(target_class.find_by_sql(feature_query.call(slice)))
    end
  end

  # Every query returned an array of its own: merge and remove duplicates
  answer.flatten!
  answer.uniq!

  return answer
end
507
+
508
+
509
+ # = DESCRIPTION
510
+ # Get all MiscFeatures that are located on a Slice for a given MiscSet.
511
+ #
512
+ # Pitfall: just looks at the CoordSystem that the Slice is located on.
513
+ # For example, if a Slice is located on a SeqRegion on the 'chromosome'
514
+ # CoordSystem, but all misc_features are annotated on SeqRegions of
515
+ # the 'scaffold' CoordSystem, this method will return an empty array.
516
+ #
517
+ # = USAGE
518
+ # my_slice.misc_features('encode').each do |feature|
519
+ # puts feature.to_yaml
520
+ # end
521
+ # ---
522
+ # *Arguments*:
523
+ # * code: code of the MiscSet to filter on (nil returns the features of every MiscSet)
524
+ # *Returns*:: array of MiscFeature objects
525
def misc_features(code = nil)
  # Returns an Array of the MiscFeatures of this slice's seq_region that lie
  # completely within the slice, optionally restricted to one MiscSet.
  #
  # * code: code of the MiscSet to filter on; nil (the default) keeps the
  #   features of every MiscSet.
  #
  # Bounds are inclusive, like the "seq_region_start >=" / "seq_region_end <="
  # conditions used by dna_align_features and protein_align_features, so a
  # feature that starts or ends exactly on a slice boundary is returned.
  first_base = self.start
  last_base = self.stop

  self.seq_region.misc_features.select do |mf|
    next false unless mf.seq_region_start >= first_base && mf.seq_region_end <= last_base

    # misc_sets is a collection: check every entry instead of only the first
    # one, which also avoids calling #code on nil when the collection is empty.
    code.nil? || mf.misc_sets.any? { |misc_set| misc_set.code == code }
  end
end
544
+
545
+ # = DESCRIPTION
546
+ # Get all DnaAlignFeatures that are located on a Slice for a given Analysis.
547
+ #
548
+ # Pitfall: just looks at the CoordSystem that the Slice is located on.
549
+ # For example, if a Slice is located on a SeqRegion on the 'chromosome'
550
+ # CoordSystem, but all dna_align_features are annotated on SeqRegions of
551
+ # the 'scaffold' CoordSystem, this method will return an empty array.
552
+ #
553
+ # = USAGE
554
+ # my_slice.dna_align_features('Vertrna').each do |feature|
555
+ # puts feature.to_yaml
556
+ # end
557
+ # ---
558
+ # *Arguments*:
559
+ # * analysis_name: logic name of the Analysis to filter on (optional; features of every analysis are returned when omitted)
560
+ # *Returns*:: array of DnaAlignFeature objects
561
def dna_align_features(analysis_name = nil)
  # Returns an Array of the DnaAlignFeatures stored on this slice's seq_region
  # whose start and end both fall within the slice (inclusive bounds).
  #
  # * analysis_name: logic name of the Analysis to restrict the result to;
  #   nil (the default) returns the features of every analysis.
  sql = "SELECT * FROM dna_align_feature WHERE seq_region_id = #{self.seq_region.id}" \
        " AND seq_region_start >= #{self.start} AND seq_region_end <= #{self.stop}"

  unless analysis_name.nil?
    analysis = Analysis.find_by_logic_name(analysis_name)
    # The finder yields nil when no Analysis has this logic name. No feature
    # can belong to it, so answer with an empty result instead of calling
    # #id on nil.
    return [] if analysis.nil?

    sql += " AND analysis_id = #{analysis.id}"
  end

  DnaAlignFeature.find_by_sql(sql)
end
569
+
570
+ # = DESCRIPTION
571
+ # Get all ProteinAlignFeatures that are located on a Slice for a given Analysis.
572
+ #
573
+ # Pitfall: just looks at the CoordSystem that the Slice is located on.
574
+ # For example, if a Slice is located on a SeqRegion on the 'chromosome'
575
+ # CoordSystem, but all protein_align_features are annotated on SeqRegions of
576
+ # the 'scaffold' CoordSystem, this method will return an empty array.
577
+ #
578
+ # = USAGE
579
+ # my_slice.protein_align_features('Uniprot').each do |feature|
580
+ # puts feature.to_yaml
581
+ # end
582
+ # ---
583
+ # *Arguments*:
584
+ # * analysis_name: logic name of the Analysis to filter on (nil returns the features of every analysis)
585
+ # *Returns*:: array of ProteinAlignFeature objects
586
def protein_align_features(analysis_name = nil)
  # Returns an Array of the ProteinAlignFeatures stored on this slice's
  # seq_region whose start and end both fall within the slice (inclusive
  # bounds).
  #
  # * analysis_name: logic name of the Analysis to restrict the result to;
  #   nil (the default, as in dna_align_features) returns the features of
  #   every analysis.
  sql = "SELECT * FROM protein_align_feature WHERE seq_region_id = #{self.seq_region.id}" \
        " AND seq_region_start >= #{self.start} AND seq_region_end <= #{self.stop}"

  unless analysis_name.nil?
    analysis = Analysis.find_by_logic_name(analysis_name)
    # The finder yields nil when no Analysis has this logic name. No feature
    # can belong to it, so answer with an empty result instead of calling
    # #id on nil.
    return [] if analysis.nil?

    sql += " AND analysis_id = #{analysis.id}"
  end

  ProteinAlignFeature.find_by_sql(sql)
end
594
+ end #Slice
595
+
596
+
597
+ # = DESCRIPTION
598
+ # The Gap class is similar to the Slice object, but describes a gap and
599
+ # therefore can easily be described by coordinate system and size.
600
+ #
601
class Gap
  # coord_system:: the object passed to Gap.new as coordinate system
  # size:: the size passed to Gap.new (also readable as #length)
  attr_accessor :coord_system, :size

  # = DESCRIPTION
  # Build a Gap from a coordinate system and a size.
  #
  # = USAGE
  #  my_coord_system = CoordSystem.find_by_name('chromosome')
  #  # Create a gap of 10kb.
  #  gap = Gap.new(my_coord_system, 10000)
  # ---
  # *Arguments*:
  # * coord_system: CoordSystem object (required)
  # * size: length of the gap (required)
  # *Returns*:: Gap object
  def initialize(coord_system, size)
    @coord_system = coord_system
    @size = size
  end

  # #length is a second name for the #size reader.
  alias_method :length, :size

  # = DESCRIPTION
  # Text label for the gap: the name of the coordinate system, the literal
  # ":gap:" and the size, e.g. "chromosome:gap:10000".
  # ---
  # *Returns*:: String
  def display_name
    system_name = @coord_system.name
    system_name + ":gap:#{@size}"
  end
end #Gap
625
+
626
+ end #Core
627
+ end #Ensembl