jandot-ruby-ensembl-api 0.9.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. data/TUTORIAL +623 -0
  2. data/bin/ensembl +39 -0
  3. data/lib/ensembl/core/activerecord.rb +1847 -0
  4. data/lib/ensembl/core/project.rb +248 -0
  5. data/lib/ensembl/core/slice.rb +627 -0
  6. data/lib/ensembl/core/transcript.rb +425 -0
  7. data/lib/ensembl/core/transform.rb +97 -0
  8. data/lib/ensembl/db_connection.rb +148 -0
  9. data/lib/ensembl/variation/activerecord.rb +308 -0
  10. data/lib/ensembl.rb +23 -0
  11. data/samples/examples_perl_tutorial.rb +120 -0
  12. data/samples/small_example_ruby_api.rb +34 -0
  13. data/test/unit/release_45/core/run_tests.rb +12 -0
  14. data/test/unit/release_45/core/test_project.rb +235 -0
  15. data/test/unit/release_45/core/test_project_human.rb +58 -0
  16. data/test/unit/release_45/core/test_relationships.rb +61 -0
  17. data/test/unit/release_45/core/test_sequence.rb +175 -0
  18. data/test/unit/release_45/core/test_slice.rb +56 -0
  19. data/test/unit/release_45/core/test_transcript.rb +94 -0
  20. data/test/unit/release_45/core/test_transform.rb +223 -0
  21. data/test/unit/release_45/variation/test_activerecord.rb +32 -0
  22. data/test/unit/release_50/core/run_tests.rb +12 -0
  23. data/test/unit/release_50/core/test_project.rb +215 -0
  24. data/test/unit/release_50/core/test_project_human.rb +58 -0
  25. data/test/unit/release_50/core/test_relationships.rb +66 -0
  26. data/test/unit/release_50/core/test_sequence.rb +175 -0
  27. data/test/unit/release_50/core/test_slice.rb +121 -0
  28. data/test/unit/release_50/core/test_transcript.rb +108 -0
  29. data/test/unit/release_50/core/test_transform.rb +223 -0
  30. data/test/unit/release_50/variation/test_activerecord.rb +136 -0
  31. data/test/unit/test_connection.rb +58 -0
  32. data/test/unit/test_releases.rb +40 -0
  33. metadata +243 -0
@@ -0,0 +1,248 @@
1
+ #
2
+ # = ensembl/core/project.rb - project calculations for Ensembl Slice
3
+ #
4
+ # Copyright:: Copyright (C) 2007 Jan Aerts <http://jandot.myopenid.com>
5
+ # License:: The Ruby License
6
+ #
7
module Ensembl
  module Core
    class Slice
      # = DESCRIPTION
      # The Slice#project method is used to transfer coordinates from one
      # coordinate system to another. Suppose you have a slice on a
      # contig in human (let's say on contig AC000031.6.1.38703) and you
      # want to know the coordinates on the chromosome. Projections can also
      # be done from a chromosome to the contig level. However, it might be
      # possible that more than one contig has to be included and that there
      # exist gaps between the contigs. The output of this method therefore is
      # an _array_ of Slice and Gap objects.
      #
      # At the moment, projections can only be done if the two coordinate
      # systems are linked directly in the 'assembly' table.
      #
      # = USAGE
      #
      #  # Get a contig slice in cow and project to scaffold level
      #  source_slice = Slice.fetch_by_region('contig', 'AAFC03020247', 42, 2007)
      #  target_slices = source_slice.project('scaffold')
      #  puts target_slices.length           #--> 1
      #  puts target_slices[0].display_name  #--> scaffold:ChrUn.003.3522:6570:8535:1
      #
      #  # Get a chromosome slice in cow and project to scaffold level.
      #  # The region 96652152..98000000 on BTA4 is covered by 2 scaffolds
      #  # that are separated by a gap.
      #  source_slice = Slice.fetch_by_region('chromosome','4', 96652152, 98000000)
      #  target_slices = source_slice.project('scaffold')
      #  puts target_slices.length        #--> 3
      #  first_bit, second_bit, third_bit = target_slices
      #  puts first_bit.display_name      #--> scaffold:Btau_3.1:Chr4.003.105:42:599579:1
      #  puts second_bit.class            #--> Gap
      #  puts third_bit.display_name      #--> scaffold:Btau_3.1:Chr4.003.106:1:738311:1
      #
      # ---
      # *Arguments*:
      # * coord_system_name:: name of coordinate system to project
      #   coordinates to ('toplevel' and 'seqlevel' are resolved through
      #   CoordSystem.find_toplevel / CoordSystem.find_seqlevel)
      # *Returns*:: an array consisting of Slices and, if necessary, Gaps
      # *Raises*:: NotImplementedError if the slice overlaps an assembly
      #   exception of type 'HAP'
      def project(coord_system_name)
        answer = Array.new # slices (and gaps) on the target coordinate system
        source_coord_system = self.seq_region.coord_system
        target_coord_system = nil
        if coord_system_name == 'toplevel'
          target_coord_system = CoordSystem.find_toplevel
          coord_system_name = target_coord_system.name
        elsif coord_system_name == 'seqlevel'
          target_coord_system = CoordSystem.find_seqlevel
          coord_system_name = target_coord_system.name
        else
          target_coord_system = CoordSystem.find_by_name(coord_system_name)
        end

        if target_coord_system.rank < source_coord_system.rank
          # We're going from component to assembly: shift the coordinates by
          # the offset between the component and its place in the assembly.
          # NOTE(review): the same offset is applied whatever assembly_link.ori
          # is, and the link is not checked against start/stop of this slice --
          # confirm that this is intended for components in reverse orientation.
          assembly_links = self.seq_region.assembly_links_as_component(coord_system_name)
          assembly_links.each do |assembly_link|
            offset = assembly_link.asm_start - assembly_link.cmp_start
            target_strand = self.strand * assembly_link.ori # 1x1=>1, 1x-1=>-1, -1x-1=>1
            answer.push(Slice.new(assembly_link.asm_seq_region, self.start + offset, self.stop + offset, target_strand))
          end
          return answer
        end

        # From here on we're going from assembly to component: the answer is
        # an array of Slices intermitted with Gaps.

        # ASSEMBLY_EXCEPTIONS
        # Some regions are not described in the assembly table itself. For
        # example, in human, the assembly data for the pseudo-autosomal region
        # (PAR) of Y is stored as an assembly_exception record that says: "For
        # chr Y positions 1 to 2709520, use chr X:1-2709520".
        # The slice is therefore cut into blocks: the parts that are not
        # covered by an exception stay on this seq_region, the parts that are
        # covered are replaced by the matching slice on the exception's
        # seq_region. Each block is then projected independently.
        assembly_exceptions = AssemblyException.find_all_by_seq_region_id(self.seq_region.id)
        overlapping_exceptions = assembly_exceptions.select do |ae|
          Slice.new(self.seq_region, ae.seq_region_start, ae.seq_region_end).overlaps?(self)
        end
        if overlapping_exceptions.any? { |ae| ae.exc_type == 'HAP' }
          raise NotImplementedError, "The haplotype exceptions are not implemented (yet). You can't project this slice."
        end

        if overlapping_exceptions.length > 0
          # Blocks of this slice that are NOT covered by an exception. Blocks
          # with start > stop are dropped: they carry no sequence.
          source_blocks = self.excise(overlapping_exceptions.collect{|e| e.seq_region_start .. e.seq_region_end})
          positioned_blocks = source_blocks.reject{|b| b.start > b.stop}.collect{|b| [b.start, b]}

          # Blocks that ARE covered by an exception, clipped to this slice and
          # moved to the exception's seq_region. Each block is remembered
          # together with its start on the source so that all blocks can be
          # put back in source order.
          overlapping_exceptions.each do |ae|
            covered_start = [ae.seq_region_start, self.start].max
            covered_stop = [ae.seq_region_end, self.stop].min
            positioned_blocks.push([covered_start, exception_slice(ae, covered_start, covered_stop)])
          end

          all_assembly_blocks = positioned_blocks.sort_by{|pair| pair[0]}.collect{|pair| pair[1]}
          all_assembly_blocks.each do |b|
            answer.concat(b.project(coord_system_name))
          end
          return answer
        end
        # END OF ASSEMBLY_EXCEPTIONS

        # Get all AssemblyLinks starting from this assembly and for which
        # the cmp_seq_region.coord_system is what we want.
        assembly_links = self.seq_region.assembly_links_as_assembly(coord_system_name)

        # Reject all the components that lie _before_ or _after_ the source,
        # then sort the remaining ones on their position.
        sorted_overlapping_assembly_links = assembly_links.reject{|al| al.asm_end < self.start}.reject{|al| al.asm_start > self.stop}.sort_by{|al| al.asm_start}
        if sorted_overlapping_assembly_links.length == 0
          return []
        end

        # Create slices for all the underlying components. The first and last
        # components are added in their entirety and are only cropped afterwards.
        previous_link = nil
        sorted_overlapping_assembly_links.each do |this_link|
          # If there is a gap with the previous link: add a gap
          if ! previous_link.nil? and this_link.asm_start > ( previous_link.asm_end + 1 )
            gap_size = this_link.asm_start - previous_link.asm_end - 1
            answer.push(Gap.new(CoordSystem.find_by_name(coord_system_name), gap_size))
          end

          # And add the component itself as a Slice
          answer.push(Slice.new(this_link.cmp_seq_region, this_link.cmp_start, this_link.cmp_end, this_link.ori))
          previous_link = this_link
        end

        # Now see if we have to crop the first and/or last slice. For a
        # component in reverse orientation the other end has to be cropped.
        first_link = sorted_overlapping_assembly_links[0]
        if self.start > first_link.asm_start
          if first_link.ori == -1
            answer[0].stop = first_link.cmp_start + ( first_link.asm_end - self.start )
          else
            answer[0].start = first_link.cmp_start + ( self.start - first_link.asm_start )
          end
        end

        last_link = sorted_overlapping_assembly_links[-1]
        if self.stop < last_link.asm_end
          if last_link.ori == -1
            answer[-1].start = last_link.cmp_start + ( last_link.asm_end - self.stop)
          else
            answer[-1].stop = last_link.cmp_start + ( self.stop - last_link.asm_start )
          end
        end

        # And check if we have to add Ns at the front and/or back
        if self.start < first_link.asm_start
          gap_size = first_link.asm_start - self.start
          answer.unshift(Gap.new(CoordSystem.find_by_name(coord_system_name), gap_size))
        end
        if self.stop > last_link.asm_end
          gap_size = self.stop - last_link.asm_end
          answer.push(Gap.new(CoordSystem.find_by_name(coord_system_name), gap_size))
        end

        return answer
      end

      # Returns the slice on the seq_region an assembly exception points to
      # that corresponds to source_start..source_stop on this slice's
      # seq_region. Helper for Slice#project.
      def exception_slice(exception, source_start, source_stop)
        offset = exception.exc_seq_region_start - exception.seq_region_start
        return Slice.fetch_by_region(self.seq_region.coord_system.name, SeqRegion.find(exception.exc_seq_region_id).name, source_start + offset, source_stop + offset, self.strand * exception.ori)
      end
      private :exception_slice
    end
  end
end
@@ -0,0 +1,627 @@
1
+ #
2
+ # = ensembl/core/slice.rb - Slice object for Ensembl core
3
+ #
4
+ # Copyright:: Copyright (C) 2007 Jan Aerts <http://jandot.myopenid.com>
5
+ # License:: The Ruby License
6
+ #
7
+ nil
8
+ module Ensembl
9
+ nil
10
+ module Core
11
+ # = DESCRIPTION
12
+ # From the perl API tutorial
13
+ # (http://www.ensembl.org/info/software/core/core_tutorial.html): "A
14
+ # Slice object represents a continuous region of a genome. Slices can be
15
+ # used to obtain sequence, features or other information from a
16
+ # particular region of interest."
17
+ #
18
+ # In contrast to almost all other classes of Ensembl::Core,
19
+ # the Slice class is not based on ActiveRecord.
20
+ #
21
+ # = USAGE
22
+ # chr4 = SeqRegion.find_by_name('4')
23
+ # my_slice = Slice.new(chr4, 95000, 98000, -1)
24
+ # puts my_slice.display_name #--> 'chromosome:Btau_3.1:4:95000:98000:-1'
25
+ class Slice
26
+ attr_accessor :seq_region, :start, :stop, :strand, :seq
27
+
28
+ #################
29
+ ## CREATE A SLICE
30
+ #################
31
+
32
+ # = DESCRIPTION
33
+ # Create a new Slice object from scratch.
34
+ #
35
+ # = USAGE
36
+ # chr4 = SeqRegion.find_by_name('4')
37
+ # my_slice = Slice.new(chr4, 95000, 98000, -1)
38
+ # ---
39
+ # *Arguments*:
40
+ # * seq_region: SeqRegion object
41
+ # * start: start position of the Slice on the SeqRegion (default = 1)
42
+ # * stop: stop position of the Slice on the SeqRegion (default: end of
43
+ # SeqRegion)
44
+ # * strand: strand of the Slice relative to the SeqRegion (default = 1)
45
+ # *Returns*:: Slice object
46
# Creates a new Slice on +seq_region+.
#
# *Arguments*:
# * seq_region: Ensembl::Core::SeqRegion object (required)
# * start: start position on the SeqRegion (default or nil: 1)
# * stop: stop position on the SeqRegion (default or nil: length of the
#   SeqRegion)
# * strand: strand relative to the SeqRegion (default = 1)
# *Raises*:: RuntimeError if seq_region is not a SeqRegion
def initialize(seq_region, start = 1, stop = nil, strand = 1)
  # Check the type before anything is asked from seq_region, so that a wrong
  # argument gives this message rather than a NoMethodError on #length.
  unless seq_region.is_a?(Ensembl::Core::SeqRegion)
    raise 'First argument has to be a Ensembl::Core::SeqRegion object'
  end
  start = 1 if start.nil?
  stop = seq_region.length if stop.nil?
  @seq_region, @start, @stop, @strand = seq_region, start, stop, strand
  @seq = nil # filled in the first time Slice#seq is called
end
59
+
60
+ # = DESCRIPTION
61
+ # Create a Slice without first creating the SeqRegion object.
62
+ #
63
+ # = USAGE
64
+ # my_slice_1 = Slice.fetch_by_region('chromosome','4',95000,98000,1)
65
+ #
66
+ # ---
67
+ # *Arguments*:
68
+ # * coord_system: name of CoordSystem (required)
69
+ # * seq_region: name of SeqRegion (required)
70
+ # * start: start of Slice on SeqRegion (default = 1)
71
+ # * stop: stop of Slice on SeqRegion (default = end of SeqRegion)
72
+ # * strand: strand of Slice on SeqRegion
73
+ # *Returns*:: Ensembl::Core::Slice object
74
# Creates a Slice from the names of a coordinate system and a seq_region.
#
# *Arguments*:
# * coord_system_name: name of CoordSystem (required)
# * seq_region_name: name of SeqRegion (required)
# * start: start of Slice on SeqRegion (default = 1)
# * stop: stop of Slice on SeqRegion (default = end of SeqRegion)
# * strand: strand of Slice on SeqRegion (default = 1)
# * version: version of the CoordSystem (default: the one that sorts last)
# *Returns*:: Ensembl::Core::Slice object
# *Raises*:: RuntimeError if the CoordSystem or SeqRegion cannot be found
def self.fetch_by_region(coord_system_name, seq_region_name, start = nil, stop = nil, strand = 1, version = nil)
  all_coord_systems = Ensembl::Core::CoordSystem.find_all_by_name(coord_system_name)
  coord_system = nil
  if version.nil?
    # No version asked for: take the coord_system whose version string sorts
    # last. A missing version is compared as the empty string, so it sorts
    # first instead of breaking the sort.
    coord_system = all_coord_systems.sort_by{|cs| cs.version.to_s}.last
  else
    coord_system = all_coord_systems.find{|cs| cs.version == version}
  end
  unless coord_system.is_a?(Ensembl::Core::CoordSystem)
    message = "Couldn't find a Ensembl::Core::CoordSystem object with name '#{coord_system_name}'"
    message += " and version '#{version}'" unless version.nil?
    raise message
  end

  seq_region = Ensembl::Core::SeqRegion.find_by_name_and_coord_system_id(seq_region_name, coord_system.id)
  unless seq_region.is_a?(Ensembl::Core::SeqRegion)
    raise "Couldn't find a Ensembl::Core::SeqRegion object with the name '#{seq_region_name}'"
  end

  return Ensembl::Core::Slice.new(seq_region, start, stop, strand)
end
98
+
99
+ # = DESCRIPTION
100
+ # Create a Slice based on a Gene
101
+ #
102
+ # = USAGE
103
+ # my_slice = Slice.fetch_by_gene_stable_id('ENSG00000184895')
104
+ #
105
+ # ---
106
+ # *Arguments*:
107
+ # * gene_stable_id: Ensembl gene stable_id (required)
108
+ # *Returns*:: Ensembl::Core::Slice object
109
# Creates a Slice that covers a gene, optionally extended on both sides.
#
# *Arguments*:
# * gene_stable_id: Ensembl gene stable_id (required)
# * flanking_seq_length: number of bp added before the start and after the
#   end of the gene (default = 0)
# *Returns*:: Ensembl::Core::Slice object
# *Raises*:: RuntimeError if no gene with that stable_id is found
def self.fetch_by_gene_stable_id(gene_stable_id, flanking_seq_length = 0)
  stable_id_record = Ensembl::Core::GeneStableId.find_by_stable_id(gene_stable_id)
  if stable_id_record.nil?
    raise "Couldn't find a Ensembl::Core::GeneStableId object with stable_id '#{gene_stable_id}'"
  end
  gene = stable_id_record.gene

  return Ensembl::Core::Slice.new(gene.seq_region, gene.seq_region_start - flanking_seq_length, gene.seq_region_end + flanking_seq_length, gene.seq_region_strand)
end
116
+
117
+ # = DESCRIPTION
118
+ # Create a Slice based on a Transcript
119
+ #
120
+ # = USAGE
121
+ # my_slice = Slice.fetch_by_transcript_stable_id('ENST00000383673')
122
+ #
123
+ # ---
124
+ # *Arguments*:
125
+ # * transcript_stable_id: Ensembl transcript stable_id (required)
126
+ # *Returns*:: Ensembl::Core::Slice object
127
# Creates a Slice that covers a transcript, optionally extended on both sides.
#
# *Arguments*:
# * transcript_stable_id: Ensembl transcript stable_id (required)
# * flanking_seq_length: number of bp added before the start and after the
#   end of the transcript (default = 0)
# *Returns*:: Ensembl::Core::Slice object
# *Raises*:: RuntimeError if no transcript with that stable_id is found
def self.fetch_by_transcript_stable_id(transcript_stable_id, flanking_seq_length = 0)
  stable_id_record = Ensembl::Core::TranscriptStableId.find_by_stable_id(transcript_stable_id)
  if stable_id_record.nil?
    raise "Couldn't find a Ensembl::Core::TranscriptStableId object with stable_id '#{transcript_stable_id}'"
  end
  transcript = stable_id_record.transcript

  return Ensembl::Core::Slice.new(transcript.seq_region, transcript.seq_region_start - flanking_seq_length, transcript.seq_region_end + flanking_seq_length, transcript.seq_region_strand)
end
134
+
135
+ # = DESCRIPTION
136
+ # Create an array of all Slices for a given coordinate system.
137
+ #
138
+ # = USAGE
139
+ # slices = Slice.fetch_all('chromosome')
140
+ #
141
+ # ---
142
+ # *Arguments*:
143
+ # * coord_system_name:: name of coordinate system (default = chromosome)
144
+ # * coord_system_version:: version of coordinate system (default = nil)
145
+ # *Returns*:: an array of Ensembl::Core::Slice objects
146
# Builds one full-length Slice for every seq_region of a coordinate system.
#
# *Arguments*:
# * coord_system_name:: name of coordinate system (default = chromosome)
# * version:: version of coordinate system (default = nil: look up by name only)
# *Returns*:: an array of Ensembl::Core::Slice objects
def self.fetch_all(coord_system_name = 'chromosome', version = nil)
  finder = Ensembl::Core::CoordSystem
  coord_system = version.nil? ? finder.find_by_name(coord_system_name) : finder.find_by_name_and_version(coord_system_name, version)

  slices = Array.new
  coord_system.seq_regions.each{|region| slices << Ensembl::Core::Slice.new(region)}
  slices
end
160
+
161
+ ##################
162
+ ## GENERAL METHODS
163
+ ##################
164
+
165
+ # = DESCRIPTION
166
+ # Get the length of a slice
167
+ #
168
+ # = USAGE
169
+ # chr4 = SeqRegion.find_by_name('4')
170
+ # my_slice = Slice.new(chr4, 95000, 98000, -1)
171
+ # puts my_slice.length
172
+ # ---
173
+ # *Arguments*:: none
174
+ # *Returns*:: Integer
175
# Number of positions covered by the slice; start and stop are both counted.
#
# *Arguments*:: none
# *Returns*:: Integer
def length
  span = self.stop - self.start
  span + 1
end
178
+
179
+ # = DESCRIPTION
180
+ # The display_name method returns a full name of this slice, containing
181
+ # the name of the coordinate system, the sequence region, start and
182
+ # stop positions on that sequence region and the strand. E.g. for a slice
183
+ # of bovine chromosome 4 from position 95000 to 98000 on the reverse strand,
184
+ # the display_name would look like: chromosome:Btau_3.1:4:95000:98000:-1
185
+ #
186
+ # = USAGE
187
+ # puts my_slice.display_name
188
+ # ---
189
+ # *Arguments*:: none
190
+ # *Result*:: String
191
# Full name of the slice: coordinate system name, coordinate system
# version, seq_region name, start, stop and strand, joined with colons.
#
# *Arguments*:: none
# *Result*:: String
def display_name
  region = self.seq_region
  coord_system = region.coord_system
  fields = [coord_system.name, coord_system.version, region.name, self.start, self.stop, self.strand]
  return fields.collect{|field| field.to_s}.join(':')
end
alias to_s display_name
195
+
196
+ # = DESCRIPTION
197
+ # The Slice#overlaps? method checks if this slice overlaps another one.
198
+ # The other slice has to be on the same coordinate system
199
+ #
200
+ # = USAGE
201
+ # slice_a = Slice.fetch_by_region('chromosome','X',1,1000)
202
+ # slice_b = Slice.fetch_by_region('chromosome','X',900,1500)
203
+ # if slice_a.overlaps?(slice_b)
204
+ # puts "These slices overlap"
205
+ # end
206
+ # ---
207
+ # *Arguments*:: another slice
208
+ # *Returns*:: true or false
209
# Checks if this slice and +other_slice+ share at least one position.
#
# *Arguments*:: another slice
# *Returns*:: true or false
# *Raises*:: RuntimeError if the argument is not a Slice or if the two
#   slices are on different coordinate systems
#
# NOTE(review): only the coordinate systems are compared, not the
# seq_regions themselves -- confirm that callers only pass slices on the
# same seq_region.
def overlaps?(other_slice)
  # "! other_slice.class == Slice" negates the class first and so is never
  # true; test the type directly.
  unless other_slice.is_a?(Slice)
    raise RuntimeError, "The Slice#overlaps? method takes a Slice object as its arguments."
  end
  if self.seq_region.coord_system != other_slice.seq_region.coord_system
    raise RuntimeError, "The argument slice of Slice#overlaps? has to be in the same coordinate system, but were " + self.seq_region.coord_system.name + " and " + other_slice.seq_region.coord_system.name
  end

  self_range = self.start .. self.stop
  other_range = other_slice.start .. other_slice.stop

  # Two ranges overlap when one of them contains the start of the other.
  return (self_range.include?(other_slice.start) || other_range.include?(self.start))
end
226
+
227
+ # = DESCRIPTION
228
+ # The Slice#within? method checks if this slice is contained within another one.
229
+ # The other slice has to be on the same coordinate system
230
+ #
231
+ # = USAGE
232
+ # slice_a = Slice.fetch_by_region('chromosome','X',1,1000)
233
+ # slice_b = Slice.fetch_by_region('chromosome','X',900,950)
234
+ # if slice_b.within?(slice_a)
235
+ # puts "Slice b is within slice a"
236
+ # end
237
+ # ---
238
+ # *Arguments*:: another slice
239
+ # *Returns*:: true or false
240
# Checks if this slice lies completely inside +other_slice+, i.e. if both
# its start and its stop fall within the other slice.
#
# *Arguments*:: another slice
# *Returns*:: true or false
# *Raises*:: RuntimeError if the argument is not a Slice or if the two
#   slices are on different coordinate systems
def within?(other_slice)
  # "! other_slice.class == Slice" negates the class first and so is never
  # true; test the type directly.
  unless other_slice.is_a?(Slice)
    raise RuntimeError, "The Slice#within? method takes a Slice object as its arguments."
  end
  if self.seq_region.coord_system != other_slice.seq_region.coord_system
    raise RuntimeError, "The argument slice of Slice#within? has to be in the same coordinate system, but were " + self.seq_region.coord_system.name + " and " + other_slice.seq_region.coord_system.name
  end

  other_range = other_slice.start .. other_slice.stop

  return (other_range.include?(self.start) && other_range.include?(self.stop))
end
257
+
258
+ # = DESCRIPTION
259
+ # The Slice#excise method removes a bit of a slice and returns the
260
+ # remainder as separate slices.
261
+ #
262
+ # = USAGE
263
+ # original_slice = Slice.fetch_by_region('chromosome','X',1,10000)
264
+ # new_slices = original_slice.excise([500..750, 1050..1075])
265
+ # new_slices.each do |s|
266
+ # puts s.display_name
267
+ # end
268
+ #
269
+ # # result:
270
+ # # chromosome:X:1:499:1
271
+ # # chromosome:X:751:1049:1
272
+ # # chromosome:X:1076:10000:1
273
+ # ---
274
+ # *Arguments*:
275
+ # * ranges: array of ranges (required)
276
+ # *Returns*:: array of Slice objects
277
# Removes the given ranges from the slice and returns what is left as
# separate slices, ordered by position.
#
# *Arguments*:
# * ranges: array of ranges on the seq_region (required). Ranges that reach
#   outside of the slice are allowed; only the part inside matters.
# *Returns*:: array of Slice objects (empty if nothing is left)
# *Raises*:: RuntimeError if the argument is not an array of ranges
#
# NOTE(review): the returned slices get the default strand of Slice.new,
# not the strand of this slice -- confirm that this is intended.
def excise(ranges)
  unless ranges.is_a?(Array) and ranges.all?{|r| r.is_a?(Range)}
    raise RuntimeError, "Argument should be an array of ranges"
  end

  answer = Array.new
  next_start = self.start # first position not yet handled
  ranges.sort_by{|r| r.first}.each do |r|
    # Ranges are sorted: once one starts after the slice, so do all others.
    break if r.first > self.stop

    if next_start <= r.first - 1
      answer.push(Slice.new(self.seq_region, next_start, r.first - 1))
    end
    # Never move backwards: a range that ends before next_start (e.g. one
    # that lies before the slice) must not pull the next subslice back.
    next_start = [next_start, r.last + 1].max
    return answer if next_start > self.stop
  end

  answer.push(Slice.new(self.seq_region, next_start, self.stop))
  return answer
end
303
+
304
+ # = DESCRIPTION
305
+ # Get the sequence of the Slice as a Bio::Sequence::NA object.
306
+ #
307
+ # If the Slice is on a CoordSystem that is not seq_level, it will try
308
+ # to project its coordinates to the CoordSystem that is. At this
309
+ # moment, this is only done if there is a direct link between the
310
+ # two coordinate systems. (The perl API allows for following an
311
+ # indirect link as well.)
312
+ #
313
+ # Caution: Bio::Sequence::NA makes the sequence
314
+ # downcase!!
315
+ #
316
+ # = USAGE
317
+ # my_slice.seq.seq.to_s
318
+ #
319
+ # ---
320
+ # *Arguments*:: none
321
+ # *Returns*:: Bio::Sequence::NA object
322
# Returns the sequence of the slice as a Bio::Sequence::NA object. The
# sequence is built the first time it is asked for and kept in @seq.
#
# *Arguments*:: none
# *Returns*:: Bio::Sequence::NA object
def seq
  # Already fetched before: hand out the stored object.
  return @seq unless @seq.nil?

  if self.seq_region.coord_system.seqlevel?
    # The sequence is stored at this level: take it straight from the seq_region.
    @seq = Bio::Sequence::NA.new(self.seq_region.subseq(self.start, self.stop))
  else
    # Project to the seqlevel coordinate system and glue the pieces together.
    seq_string = String.new
    @target_slices = self.project('seqlevel')
    @target_slices.each do |component|
      # Slices give their own sequence (fetched through this same method);
      # anything else is a Gap and is written as Ns.
      piece = (component.class == Slice) ? component.seq : 'N' * (component.length)
      seq_string += piece
    end
    @seq = Bio::Sequence::NA.new(seq_string)
  end

  @seq.reverse_complement! if self.strand == -1

  @seq
end
356
+
357
# Placeholder for the repeatmasked sequence of the slice: not available yet.
#
# *Raises*:: NotImplementedError, always
def repeatmasked_seq
  raise NotImplementedError.new
end
360
+
361
+ # = DESCRIPTION
362
+ # Take a sub_slice from an existing one.
363
+ #
364
+ # = USAGE
365
+ # my_sub_slice = my_slice.sub_slice(400,500)
366
+ #
367
+ # ---
368
+ # *Arguments*:
369
+ # * start: start of subslice on the SeqRegion (default: start of slice)
370
+ # * stop: stop of subslice on the SeqRegion (default: stop of slice)
371
+ # *Returns*:: Ensembl::Core::Slice object
372
# Makes a new slice on the same seq_region and strand as this one, with the
# given start and stop. Both are passed on as they are, so they are
# positions on the seq_region.
#
# *Arguments*:
# * start: start of the new slice (default: start of this slice)
# * stop: stop of the new slice (default: stop of this slice)
# *Returns*:: object of the same class as this slice
def sub_slice(start = self.start, stop = self.stop)
  slice_class = self.class
  region, orientation = self.seq_region, self.strand
  slice_class.new(region, start, stop, orientation)
end
375
+
376
+ # = DESCRIPTION
377
+ # Creates overlapping subslices for a given Slice.
378
+ #
379
+ # = USAGE
380
+ # my_slice.split(50000, 250).each do |sub_slice|
381
+ # puts sub_slice.display_name
382
+ # end
383
+ #
384
+ # ---
385
+ # *Arguments*:
386
+ # * max_size: maximal size of subslices (default: 100000)
387
+ # * overlap: overlap in bp between consecutive subslices (default: 0)
388
+ # *Returns*:: array of Ensembl::Core::Slice objects
389
# Cuts the slice into consecutive subslices of at most +max_size+ bp that
# together cover the whole slice. Consecutive subslices share +overlap+ bp;
# the last subslice stops at the stop of the slice and can be shorter.
#
# = USAGE
#  my_slice.split(50000, 250).each do |sub_slice|
#    puts sub_slice.display_name
#  end
#
# *Arguments*:
# * max_size: maximal size of subslices (default: 100000)
# * overlap: overlap in bp between consecutive subslices (default: 0)
# *Returns*:: array of Ensembl::Core::Slice objects
# *Raises*:: ArgumentError if max_size is not larger than overlap
def split(max_size = 100000, overlap = 0)
  # Distance between the starts of two consecutive subslices. It has to be
  # positive, otherwise the window would never move forward.
  step = max_size - overlap
  if step <= 0
    raise ArgumentError, "max_size (#{max_size}) has to be larger than overlap (#{overlap})"
  end

  sub_slices = Array.new
  window_start = self.start
  while window_start <= self.stop
    window_stop = [window_start + max_size - 1, self.stop].min
    sub_slices.push(self.sub_slice(window_start, window_stop))
    break if window_stop == self.stop # the end of the slice is covered
    window_start += step
  end
  return sub_slices
end
399
+
400
+ ############################
401
+ ## GET ELEMENTS WITHIN SLICE
402
+ ############################
403
+
404
+ #--
405
+ # As there should be 'getters' for a lot of classes, we'll implement
406
+ # this with method_missing. For some of the original methods, see the end
407
+ # of this file.
408
+ #
409
+ # The optional argument is either 'true' or 'false' (default = false).
410
+ # False if the features have to be completely contained within the slice;
411
+ # true if just a partly overlap is sufficient.
412
+ #++
413
+ # Don't use this method yourself.
414
+ def method_missing(method_name, *args)
415
+ table_name = method_name.to_s.singularize
416
+ class_name = table_name.camelcase
417
+
418
+ # Convert to the class object
419
+ target_class = nil
420
+ ObjectSpace.each_object(Class) do |o|
421
+ if o.name =~ /^Ensembl::Core::#{class_name}$/
422
+ target_class = o
423
+ end
424
+ end
425
+
426
+ # If it exists, see if it implements Sliceable
427
+ if ! target_class.nil? and target_class.include?(Sliceable)
428
+ inclusive = false
429
+ if [TrueClass, FalseClass].include?(args[0].class)
430
+ inclusive = args[0]
431
+ end
432
+ return self.get_objects(target_class, table_name, inclusive)
433
+ end
434
+
435
+ raise NoMethodError
436
+
437
+ end
438
+
439
+ # Don't use this method yourself.
440
# Collects all records of +target_class+ (stored in +table_name+) that lie
# on this slice, both on the coordinate system of the slice itself and on
# every other coordinate system that has this type of feature and that the
# slice can be projected to.
#
# At the moment, only 'direct' projections can be made (see Slice#project).
#
# *Arguments*:
# * target_class: class that is asked for the records through find_by_sql
# * table_name: name of the table that holds the features
# * inclusive: false if features have to lie completely within the slice,
#   true if a partial overlap is sufficient (default = false)
# *Returns*:: array of target_class objects, without duplicates
#
# Don't use this method yourself.
def get_objects(target_class, table_name, inclusive = false)
  answer = Array.new

  # Get all the coord_systems with this type of features on them
  coord_system_ids_with_features = MetaCoord.find_all_by_table_name(table_name).collect{|mc| mc.coord_system_id}

  # Get the features of the original slice
  if coord_system_ids_with_features.include?(self.seq_region.coord_system_id)
    answer.concat(target_class.find_by_sql(slice_features_sql(table_name, self, inclusive)))
    # Done with this coord_system: don't project to it below.
    coord_system_ids_with_features.delete(self.seq_region.coord_system_id)
  end

  # Project the original slice to the other coord systems and get those
  # features as well.
  coord_system_ids_with_features.each do |target_coord_system_id|
    target_slices = self.project(CoordSystem.find(target_coord_system_id).name)
    target_slices.each do |slice|
      next unless slice.class == Slice # a projection can contain Gaps as well
      answer.concat(target_class.find_by_sql(slice_features_sql(table_name, slice, inclusive)))
    end
  end

  answer.uniq!

  return answer
end

# Builds the SQL query that gets the features of +table_name+ on +slice+.
# Helper for Slice#get_objects.
def slice_features_sql(table_name, slice, inclusive)
  start, stop = slice.start.to_s, slice.stop.to_s
  if inclusive
    # Overlap: the feature starts in the slice, ends in the slice, or spans it.
    position_clause = "(( seq_region_start BETWEEN #{start} AND #{stop} )" +
                      " OR ( seq_region_end BETWEEN #{start} AND #{stop} )" +
                      " OR ( seq_region_start <= #{start} AND seq_region_end >= #{stop} ) )"
  else
    position_clause = "seq_region_start >= #{start} AND seq_region_end <= #{stop}"
  end
  return "SELECT * FROM #{table_name} WHERE seq_region_id = #{slice.seq_region.id.to_s} AND #{position_clause}"
end
private :slice_features_sql
507
+
508
+
509
+ # = DESCRIPTION
510
+ # Get all MiscFeatures that are located on a Slice for a given MiscSet.
511
+ #
512
+ # Pitfall: just looks at the CoordSystem that the Slice is located on.
513
+ # For example, if a Slice is located on a SeqRegion on the 'chromosome'
514
+ # CoordSystem, but all misc_features are annotated on SeqRegions of
515
+ # the 'scaffold' CoordSystem, this method will return an empty array.
516
+ #
517
+ # = USAGE
518
+ # my_slice.misc_features('encode').each do |feature|
519
+ # puts feature.to_yaml
520
+ # end
521
+ # ---
522
+ # *Arguments*:
523
+ # * code: code of MiscSet; when nil, MiscFeatures of all MiscSets are returned
524
+ # *Returns*:: array of MiscFeature objects
525
# Collects the MiscFeatures of this slice's SeqRegion that lie within the
# slice (boundaries included), optionally restricted to a single MiscSet.
#
# *Arguments*:
# * code: code of a MiscSet; nil (the default) disables the MiscSet filter
# *Returns*:: array of MiscFeature objects
def misc_features(code = nil)
  self.seq_region.misc_features.select do |mf|
    # Inclusive bounds, matching the >= / <= comparisons used by the
    # dna_align_features and protein_align_features queries.
    next false unless mf.seq_region_start >= self.start && mf.seq_region_end <= self.stop

    # misc_sets is a collection, so test every set rather than only the
    # first one; an empty collection simply does not match.
    code.nil? || mf.misc_sets.any? { |misc_set| misc_set.code == code }
  end
end
544
+
545
+ # = DESCRIPTION
546
+ # Get all DnaAlignFeatures that are located on a Slice for a given Analysis.
547
+ #
548
+ # Pitfall: just looks at the CoordSystem that the Slice is located on.
549
+ # For example, if a Slice is located on a SeqRegion on the 'chromosome'
550
+ # CoordSystem, but all dna_align_features are annotated on SeqRegions of
551
+ # the 'scaffold' CoordSystem, this method will return an empty array.
552
+ #
553
+ # = USAGE
554
+ # my_slice.dna_align_features('Vertrna').each do |feature|
555
+ # puts feature.to_yaml
556
+ # end
557
+ # ---
558
+ # *Arguments*:
559
+ # * analysis_name: logic name of the Analysis (optional; when omitted, features of all analyses are returned)
560
+ # *Returns*:: array of DnaAlignFeature objects
561
# Fetches the DnaAlignFeatures lying completely within this slice,
# optionally restricted to the Analysis with the given logic name.
#
# *Arguments*:
# * analysis_name: logic name of an Analysis; nil (the default) means no
#   analysis filter
# *Returns*:: array of DnaAlignFeature objects; empty when analysis_name
# does not match any Analysis
def dna_align_features(analysis_name = nil)
  conditions = [
    "seq_region_id = #{self.seq_region.id}",
    "seq_region_start >= #{self.start}",
    "seq_region_end <= #{self.stop}"
  ]

  unless analysis_name.nil?
    analysis = Analysis.find_by_logic_name(analysis_name)
    # An unknown analysis cannot have features; bail out instead of calling
    # #id on nil.
    return [] if analysis.nil?

    conditions << "analysis_id = #{analysis.id}"
  end

  DnaAlignFeature.find_by_sql("SELECT * FROM dna_align_feature WHERE #{conditions.join(' AND ')}")
end
569
+
570
+ # = DESCRIPTION
571
+ # Get all ProteinAlignFeatures that are located on a Slice for a given Analysis.
572
+ #
573
+ # Pitfall: just looks at the CoordSystem that the Slice is located on.
574
+ # For example, if a Slice is located on a SeqRegion on the 'chromosome'
575
+ # CoordSystem, but all protein_align_features are annotated on SeqRegions of
576
+ # the 'scaffold' CoordSystem, this method will return an empty array.
577
+ #
578
+ # = USAGE
579
+ # my_slice.protein_align_features('Uniprot').each do |feature|
580
+ # puts feature.to_yaml
581
+ # end
582
+ # ---
583
+ # *Arguments*:
584
+ # * analysis_name: logic name of the Analysis; when nil, features of all analyses are returned
585
+ # *Returns*:: array of ProteinAlignFeature objects
586
# Fetches the ProteinAlignFeatures lying completely within this slice,
# optionally restricted to the Analysis with the given logic name.
#
# *Arguments*:
# * analysis_name: logic name of an Analysis; nil (the default) means no
#   analysis filter
# *Returns*:: array of ProteinAlignFeature objects; empty when
# analysis_name does not match any Analysis
def protein_align_features(analysis_name = nil)
  conditions = [
    "seq_region_id = #{self.seq_region.id}",
    "seq_region_start >= #{self.start}",
    "seq_region_end <= #{self.stop}"
  ]

  unless analysis_name.nil?
    analysis = Analysis.find_by_logic_name(analysis_name)
    # An unknown analysis cannot have features; bail out instead of calling
    # #id on nil.
    return [] if analysis.nil?

    conditions << "analysis_id = #{analysis.id}"
  end

  ProteinAlignFeature.find_by_sql("SELECT * FROM protein_align_feature WHERE #{conditions.join(' AND ')}")
end
594
+ end #Slice
595
+
596
+
597
+ # = DESCRIPTION
598
+ # The Gap class is similar to the Slice object, but describes a gap and
599
+ # therefore can easily be described by coordinate system and size.
600
+ #
601
class Gap
  # The CoordSystem the gap is described on, and the size of the gap.
  attr_accessor :coord_system, :size

  # = DESCRIPTION
  # Create a new Gap object from scratch.
  #
  # = USAGE
  #  my_coord_system = CoordSystem.find_by_name('chromosome')
  #  # Create a gap of 10kb.
  #  gap = Gap.new(my_coord_system, 10000)
  # ---
  # *Arguments*:
  # * coord_system: CoordSystem object (required)
  # * size: length of the gap (required)
  # *Returns*:: Gap object
  def initialize(coord_system, size)
    @coord_system = coord_system
    @size = size
  end

  # +length+ is a synonym for +size+, for reading as well as for writing.
  alias length size
  alias length= size=

  # = DESCRIPTION
  # Builds a label of the form "<coord_system name>:gap:<size>".
  # ---
  # *Returns*:: String
  def display_name
    "#{@coord_system.name}:gap:#{@size}"
  end
end #Gap
625
+
626
+ end #Core
627
+ end #Ensembl