ruby-ensembl-api 0.9.6
Sign up to get free protection for your applications and to get access to all the features.
- data/TUTORIAL.rdoc +623 -0
- data/bin/ensembl +40 -0
- data/lib/ensembl.rb +64 -0
- data/lib/ensembl/core/activerecord.rb +1914 -0
- data/lib/ensembl/core/collection.rb +60 -0
- data/lib/ensembl/core/project.rb +264 -0
- data/lib/ensembl/core/slice.rb +693 -0
- data/lib/ensembl/core/transcript.rb +425 -0
- data/lib/ensembl/core/transform.rb +97 -0
- data/lib/ensembl/db_connection.rb +216 -0
- data/lib/ensembl/variation/activerecord.rb +253 -0
- data/lib/ensembl/variation/variation.rb +163 -0
- data/test/unit/data/seq_c6qbl.fa +10 -0
- data/test/unit/data/seq_cso19_coding.fa +16 -0
- data/test/unit/data/seq_cso19_transcript.fa +28 -0
- data/test/unit/data/seq_drd3_gene.fa +838 -0
- data/test/unit/data/seq_drd3_transcript.fa +22 -0
- data/test/unit/data/seq_drd4_transcript.fa +24 -0
- data/test/unit/data/seq_forward_composite.fa +1669 -0
- data/test/unit/data/seq_par_boundary.fa +169 -0
- data/test/unit/data/seq_rnd3_transcript.fa +47 -0
- data/test/unit/data/seq_ub2r1_coding.fa +13 -0
- data/test/unit/data/seq_ub2r1_gene.fa +174 -0
- data/test/unit/data/seq_ub2r1_transcript.fa +26 -0
- data/test/unit/data/seq_y.fa +2 -0
- data/test/unit/ensembl_genomes/test_collection.rb +51 -0
- data/test/unit/ensembl_genomes/test_gene.rb +52 -0
- data/test/unit/ensembl_genomes/test_slice.rb +71 -0
- data/test/unit/ensembl_genomes/test_variation.rb +17 -0
- data/test/unit/release_50/core/test_project.rb +215 -0
- data/test/unit/release_50/core/test_project_human.rb +58 -0
- data/test/unit/release_50/core/test_relationships.rb +66 -0
- data/test/unit/release_50/core/test_sequence.rb +175 -0
- data/test/unit/release_50/core/test_slice.rb +121 -0
- data/test/unit/release_50/core/test_transcript.rb +108 -0
- data/test/unit/release_50/core/test_transform.rb +223 -0
- data/test/unit/release_50/variation/test_activerecord.rb +143 -0
- data/test/unit/release_50/variation/test_variation.rb +84 -0
- data/test/unit/release_53/core/test_gene.rb +66 -0
- data/test/unit/release_53/core/test_project.rb +96 -0
- data/test/unit/release_53/core/test_project_human.rb +65 -0
- data/test/unit/release_53/core/test_slice.rb +47 -0
- data/test/unit/release_53/core/test_transform.rb +63 -0
- data/test/unit/release_53/variation/test_activerecord.rb +145 -0
- data/test/unit/release_53/variation/test_variation.rb +71 -0
- data/test/unit/release_56/core/test_gene.rb +66 -0
- data/test/unit/release_56/core/test_project.rb +96 -0
- data/test/unit/release_56/core/test_slice.rb +54 -0
- data/test/unit/release_56/core/test_transform.rb +63 -0
- data/test/unit/release_56/variation/test_activerecord.rb +142 -0
- data/test/unit/release_56/variation/test_variation.rb +68 -0
- data/test/unit/test_connection.rb +66 -0
- data/test/unit/test_releases.rb +136 -0
- metadata +128 -0
@@ -0,0 +1,60 @@
|
|
1
|
+
#
|
2
|
+
# = ensembl/core/collection.rb
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2009 Francesco Strozzi <francesco.strozzi@gmail.com>
|
5
|
+
#
|
6
|
+
# License:: The Ruby License
|
7
|
+
|
8
|
+
module Ensembl
|
9
|
+
nil
|
10
|
+
module Core
|
11
|
+
# = DESCRIPTION
|
12
|
+
# Class to describe and handle multi-species databases
|
13
|
+
#
|
14
|
+
class Collection
  # = DESCRIPTION
  # Checks whether the core database currently connected to is a
  # multi-species ("collection") database. The decision is based only on
  # the database name reported by DBConnection.get_info: it has to contain
  # "<something>_collection_core_".
  #
  # ---
  # *Arguments*:: none
  # *Returns*:: true or false
  def self.check
    # get_info yields host, user, password, db_name and port; only the
    # database name is needed here.
    _host, _user, _password, db_name, _port = Ensembl::Core::DBConnection.get_info
    (db_name =~ /(\w+)_collection_core_.*/) ? true : false
  end

  # = DESCRIPTION
  # Returns an array with the names of all the species present in a
  # collection database, i.e. the meta_value of every meta record whose
  # meta_key is "species.db_name".
  #
  # ---
  # *Arguments*:: none
  # *Returns*:: array of meta values
  def self.species
    Meta.find_all_by_meta_key("species.db_name").collect { |meta| meta.meta_value }
  end

  # = DESCRIPTION
  # Returns the species_id of a particular species present in the database.
  # The name is compared case-insensitively against meta.meta_value.
  #
  # ---
  # *Arguments*:
  # * species: name of the species (required)
  # *Returns*:: the species_id of the first matching meta record, or nil
  # if no record matches
  def self.get_species_id(species)
    # The name is bound as a query parameter instead of being interpolated
    # into the statement, so quotes in the name cannot change the SQL.
    meta = Meta.find_by_sql(["SELECT * FROM meta WHERE LOWER(meta_value) = ?", species.downcase])[0]
    meta.nil? ? nil : meta.species_id
  end

  # = DESCRIPTION
  # Returns an array with all the MetaCoord records associated with a
  # particular species and a table_name.
  # Used inside Slice#method_missing to filter the coord_system_id using a
  # particular species_id.
  #
  # ---
  # *Arguments*:
  # * table_name: name of the feature table (required)
  # * species_id: id of the species (required)
  # *Returns*:: array of MetaCoord objects
  def self.find_all_coord_by_table_name(table_name, species_id)
    # The CoordSystem records themselves are handed to the finder as the
    # coord_system_id condition.
    coord_systems = CoordSystem.find_all_by_species_id(species_id)
    MetaCoord.find_all_by_coord_system_id_and_table_name(coord_systems, table_name)
  end
end
|
57
|
+
|
58
|
+
|
59
|
+
end
|
60
|
+
end
|
@@ -0,0 +1,264 @@
|
|
1
|
+
#
|
2
|
+
# = ensembl/core/project.rb - project calculations for Ensembl Slice
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2009 Jan Aerts <http://jandot.myopenid.com>
|
5
|
+
# Francesco Strozzi <francesco.strozzi@gmail.com>
|
6
|
+
#
|
7
|
+
# License:: The Ruby License
|
8
|
+
#
|
9
|
+
module Ensembl
|
10
|
+
module Core
|
11
|
+
class Slice
|
12
|
+
# = DESCRIPTION
|
13
|
+
# The Slice#project method is used to transfer coordinates from one
|
14
|
+
# coordinate system to another. Suppose you have a slice on a
|
15
|
+
# contig in human (let's say on contig AC000031.6.1.38703) and you
|
16
|
+
# want to know the coordinates on the chromosome. This is a
|
17
|
+
# projection of coordinates from a higher ranked coordinate system to
|
18
|
+
# a lower ranked coordinate system. Projections can also be done
|
19
|
+
# from a chromosome to the contig level. However, it might be possible
|
20
|
+
# that more than one contig has to be included and that there exist
|
21
|
+
# gaps between the contigs. The output of this method therefore is
|
22
|
+
# an _array_ of Slice and Gap objects.
|
23
|
+
#
|
24
|
+
# At the moment, projections can only be done if the two coordinate
|
25
|
+
# systems are linked directly in the 'assembly' table.
|
26
|
+
#
|
27
|
+
# = USAGE
|
28
|
+
#
|
29
|
+
# # Get a contig slice in cow and project to scaffold level
|
30
|
+
# # (i.e. going from a high rank coord system to a lower rank coord
|
31
|
+
# # system)
|
32
|
+
# source_slice = Slice.fetch_by_region('contig', 'AAFC03020247', 42, 2007)
|
33
|
+
# target_slices = source_slice.project('scaffold')
|
34
|
+
# puts target_slices.length #--> 1
|
35
|
+
# puts target_slices[0].display_name #--> scaffold:ChrUn.003.3522:6570:8535:1
|
36
|
+
#
|
37
|
+
# # Get a chromosome slice in cow and project to scaffold level
|
38
|
+
# # (i.e. going from a low rank coord system to a higher rank coord
|
39
|
+
# # system)
|
40
|
+
# # The region 96652152..98000000 on BTA4 is covered by 2 scaffolds
|
41
|
+
# # that are separated by a gap.
|
42
|
+
# source_slice = Slice.fetch_by_region('chromosome','4', 96652152, 98000000)
|
43
|
+
# target_slices = source_slice.project('scaffold')
|
44
|
+
# puts target_slices.length #--> 3
|
45
|
+
# first_bit, second_bit, third_bit = target_slices
|
46
|
+
# puts first_bit.display_name #--> scaffold:Btau_3.1:Chr4.003.105:42:599579:1
|
47
|
+
# puts second_bit.class #--> Gap
|
48
|
+
# puts third_bit.display_name #--> scaffold:Btau_3.1:Chr4.003.106:1:738311:1
|
49
|
+
#
|
50
|
+
# ---
|
51
|
+
# *Arguments*:
|
52
|
+
# * coord_system_name:: name of coordinate system to project
|
53
|
+
# coordinates to
|
54
|
+
# *Returns*:: an array consisting of Slices and, if necessary, Gaps
|
55
|
+
def project(coord_system_name)
  # Projects this slice onto the coordinate system called coord_system_name
  # ('toplevel' and 'seqlevel' are resolved through the source coordinate
  # system) and returns an array of Slice and Gap objects.
  answer = Array.new # an array of slices

  # Make sure the coordinate system of this slice is cached in the session.
  unless Ensembl::SESSION.coord_systems.has_key?(self.seq_region.coord_system_id)
    Ensembl::SESSION.coord_systems[self.seq_region.coord_system_id] = self.seq_region.coord_system
    Ensembl::SESSION.coord_system_ids[Ensembl::SESSION.coord_systems[self.seq_region.coord_system_id].name] = self.seq_region.coord_system_id
  end
  source_coord_system = Ensembl::SESSION.coord_systems[self.seq_region.coord_system_id]

  target_coord_system = nil
  if coord_system_name == 'toplevel'
    target_coord_system = source_coord_system.find_toplevel
  elsif coord_system_name == 'seqlevel'
    target_coord_system = source_coord_system.find_seqlevel
  else
    # Look the target up once and cache it by id and by name.
    unless Ensembl::SESSION.coord_system_ids.has_key?(coord_system_name)
      cs = source_coord_system.find_level(coord_system_name)
      Ensembl::SESSION.coord_systems[cs.id] = cs
      Ensembl::SESSION.coord_system_ids[cs.name] = cs.id
    end
    target_coord_system = Ensembl::SESSION.coord_systems[Ensembl::SESSION.coord_system_ids[coord_system_name]]
  end

  if target_coord_system.rank < source_coord_system.rank
    # We're going from component to assembly, which is easy.
    assembly_links = self.seq_region.assembly_links_as_component(source_coord_system)
    return [] if assembly_links.length == 0

    assembly_links.each do |assembly_link|
      target_seq_region = assembly_link.asm_seq_region
      offset = assembly_link.asm_start - assembly_link.cmp_start
      target_start = self.start + offset
      target_stop = self.stop + offset
      target_strand = self.strand * assembly_link.ori # 1x1=>1, 1x-1=>-1, -1x-1=>1

      answer.push(Slice.new(target_seq_region, target_start, target_stop, target_strand))
    end
  else
    # If we're going from assembly to component, the answer of the target method
    # is an array consisting of Slices intermitted with Gaps.

    # ASSEMBLY_EXCEPTIONS
    # CAUTION: there are exceptions to the assembly (stored in the assembly_exception)
    # table which make things a little bit more difficult... For example,
    # in human, the assembly data for the pseudo-autosomal region (PAR) of
    # Y is *not* stored in the assembly table. Instead, there is a record
    # in the assembly_exception table that says: "For chr Y positions 1
    # to 2709520, use chr X:1-2709520 for the assembly data."
    # As a solution, what we'll do here, is split the assembly up in blocks:
    # if a slice covers both the PAR and the allosomal region, we'll make
    # two subslices (let's call them blocks not to intercede with the
    # Slice#subslices method) and project these independently.
    assembly_exceptions = AssemblyException.find_all_by_seq_region_id(self.seq_region.id)
    if assembly_exceptions.length > 0
      # Check if this bit of the original slice is covered in the
      # assembly_exception table.
      overlapping_exceptions = Array.new
      assembly_exceptions.each do |ae|
        if Slice.new(self.seq_region, ae.seq_region_start, ae.seq_region_end).overlaps?(self)
          if ae.exc_type == 'HAP'
            raise NotImplementedError, "The haplotype exceptions are not implemented (yet). You can't project this slice."
          end
          overlapping_exceptions.push(ae)
        end
      end

      if overlapping_exceptions.length > 0
        # First get all assembly blocks from chromosome Y
        source_assembly_blocks = self.excise(overlapping_exceptions.collect{|e| e.seq_region_start .. e.seq_region_end})
        sorted_source_blocks = source_assembly_blocks.sort_by{|b| b.start}
        # And insert the blocks of chromosome X
        all_assembly_blocks = Array.new #both for chr X and Y
        # First do all exceptions between the first and last block
        previous_block = nil
        sorted_source_blocks.each do |b|
          if previous_block.nil?
            all_assembly_blocks.push(b)
            previous_block = b
            next
          end
          # Find the exception record that ends right before this block
          exception = assembly_exceptions.detect{|ae| ae.seq_region_end == b.start - 1}

          # NOTE(review): this inserted slice runs from the position mapped
          # from previous_block.stop to the one mapped from b.start, whereas
          # the leading/trailing cases below use first_block.start - 1 and
          # last_block.stop + 1 -- confirm these boundaries are intended.
          new_slice_start = exception.exc_seq_region_start + ( previous_block.stop - exception.seq_region_start )
          new_slice_stop = exception.exc_seq_region_start + ( b.start - exception.seq_region_start )
          new_slice_strand = self.strand * exception.ori
          new_slice = Slice.fetch_by_region(self.seq_region.coord_system.name, SeqRegion.find(exception.exc_seq_region_id).name, new_slice_start, new_slice_stop, new_slice_strand)

          all_assembly_blocks.push(new_slice)
          all_assembly_blocks.push(b)
          previous_block = b
        end

        # And then see if we have to add an additional one at the start or end
        # NOTE(review): sorted_source_blocks is empty when the slice lies
        # entirely inside an exception; first_block is then nil -- confirm
        # how that case should be handled.
        first_block = sorted_source_blocks[0]
        if first_block.start > self.start
          exception = assembly_exceptions.sort_by{|ae| ae.seq_region_start}[0]
          new_slice_start = exception.exc_seq_region_start + ( self.start - exception.seq_region_start )
          new_slice_stop = exception.exc_seq_region_start + ( first_block.start - 1 - exception.seq_region_start )
          new_slice_strand = self.strand * exception.ori
          new_slice = Slice.fetch_by_region(self.seq_region.coord_system.name, SeqRegion.find(exception.exc_seq_region_id).name, new_slice_start, new_slice_stop, new_slice_strand)

          all_assembly_blocks.unshift(new_slice)
        end

        last_block = sorted_source_blocks[-1]
        if last_block.stop < self.stop
          exception = assembly_exceptions.sort_by{|ae| ae.seq_region_start}[-1]
          new_slice_start = exception.exc_seq_region_start + ( last_block.stop + 1 - exception.seq_region_start )
          new_slice_stop = exception.exc_seq_region_start + ( self.stop - exception.seq_region_start )
          new_slice_strand = self.strand * exception.ori
          new_slice = Slice.fetch_by_region(self.seq_region.coord_system.name, SeqRegion.find(exception.exc_seq_region_id).name, new_slice_start, new_slice_stop, new_slice_strand)

          # The trailing block goes at the END of the list. (Array#shift
          # removes elements and expects an Integer count, so it cannot be
          # used to add the slice.)
          all_assembly_blocks.push(new_slice)
        end

        # Project every block on its own and concatenate the results.
        answer = Array.new
        all_assembly_blocks.each do |b|
          answer.push(b.project(coord_system_name))
        end
        answer.flatten!

        return answer
      end

    end
    # END OF ASSEMBLY_EXCEPTIONS

    # Get all AssemblyLinks starting from this assembly and for which
    # the cmp_seq_region.coord_system is what we want.
    assembly_links = self.seq_region.assembly_links_as_assembly(target_coord_system)

    # Now reject all the components that lie _before_ the source, then
    # reject all the components that lie _after_ the source.
    # Then sort based on their positions.
    sorted_overlapping_assembly_links = assembly_links.reject{|al| al.asm_end < self.start}.reject{|al| al.asm_start > self.stop}.sort_by{|al| al.asm_start}
    if sorted_overlapping_assembly_links.length == 0
      return []
    end

    # What we'll do, is create slices for all the underlying components,
    # including the first and the last one. At first, the first and last
    # components are added in their entirety and will only be cropped afterwards.
    sorted_overlapping_assembly_links.each_index do |i|
      this_link = sorted_overlapping_assembly_links[i]
      if i == 0
        # The seq_region of the first component goes through the session cache.
        cmp_seq_region = nil
        if Ensembl::SESSION.seq_regions.has_key?(this_link.cmp_seq_region_id)
          cmp_seq_region = Ensembl::SESSION.seq_regions[this_link.cmp_seq_region_id]
        else
          cmp_seq_region = this_link.cmp_seq_region
          Ensembl::SESSION.seq_regions[cmp_seq_region.id] = cmp_seq_region
        end
        answer.push(Slice.new(cmp_seq_region, this_link.cmp_start, this_link.cmp_end, this_link.ori))
        next
      end
      previous_link = sorted_overlapping_assembly_links[i-1]

      # If there is a gap with the previous link: add a gap
      if this_link.asm_start > ( previous_link.asm_end + 1 )
        gap_size = this_link.asm_start - previous_link.asm_end - 1
        answer.push(Gap.new(target_coord_system, gap_size))
      end

      # And add the component itself as a Slice
      answer.push(Slice.new(this_link.cmp_seq_region, this_link.cmp_start, this_link.cmp_end, this_link.ori))
    end

    # Now see if we have to crop the first and/or last slice
    first_link = sorted_overlapping_assembly_links[0]
    if self.start > first_link.asm_start
      if first_link.ori == -1
        answer[0].stop = first_link.cmp_start + ( first_link.asm_end - self.start )
      else
        answer[0].start = first_link.cmp_start + ( self.start - first_link.asm_start )
      end
    end

    last_link = sorted_overlapping_assembly_links[-1]
    if self.stop < last_link.asm_end
      if last_link.ori == -1
        answer[-1].start = last_link.cmp_start + ( last_link.asm_end - self.stop)
      else
        answer[-1].stop = last_link.cmp_start + ( self.stop - last_link.asm_start )
      end
    end

    # And check if we have to add Ns at the front and/or back
    if self.start < first_link.asm_start
      gap_size = first_link.asm_start - self.start
      answer.unshift(Gap.new(target_coord_system, gap_size))
    end
    if self.stop > last_link.asm_end
      gap_size = self.stop - last_link.asm_end
      answer.push(Gap.new(target_coord_system, gap_size))
    end
  end
  return answer

end
|
262
|
+
end
|
263
|
+
end
|
264
|
+
end
|
@@ -0,0 +1,693 @@
|
|
1
|
+
#
|
2
|
+
# = ensembl/core/slice.rb - General methods for Ensembl Slice
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2009 Jan Aerts <http://jandot.myopenid.com>
|
5
|
+
# Francesco Strozzi <francesco.strozzi@gmail.com>
|
6
|
+
#
|
7
|
+
# License:: The Ruby License
|
8
|
+
#
|
9
|
+
nil
|
10
|
+
module Ensembl
|
11
|
+
nil
|
12
|
+
module Core
|
13
|
+
|
14
|
+
# = DESCRIPTION
|
15
|
+
# From the perl API tutorial
|
16
|
+
# (http://www.ensembl.org/info/software/core/core_tutorial.html): "A
|
17
|
+
# Slice object represents a continuous region of a genome. Slices can be
|
18
|
+
# used to obtain sequence, features or other information from a
|
19
|
+
# particular region of interest."
|
20
|
+
#
|
21
|
+
# In contrast to almost all other classes of Ensembl::Core,
|
22
|
+
# the Slice class is not based on ActiveRecord.
|
23
|
+
#
|
24
|
+
# = USAGE
|
25
|
+
# chr4 = SeqRegion.find_by_name('4')
|
26
|
+
# my_slice = Slice.new(chr4, 95000, 98000, -1)
|
27
|
+
# puts my_slice.display_name #--> 'chromosome:Btau_3.1:4:95000:98000:-1'
|
28
|
+
class Slice
|
29
|
+
attr_accessor :seq_region, :start, :stop, :strand, :seq
|
30
|
+
|
31
|
+
#################
|
32
|
+
## CREATE A SLICE
|
33
|
+
#################
|
34
|
+
|
35
|
+
# = DESCRIPTION
|
36
|
+
# Create a new Slice object from scratch.
|
37
|
+
#
|
38
|
+
# = USAGE
|
39
|
+
# chr4 = SeqRegion.find_by_name('4')
|
40
|
+
# my_slice = Slice.new(chr4, 95000, 98000, -1)
|
41
|
+
# ---
|
42
|
+
# *Arguments*:
|
43
|
+
# * seq_region: SeqRegion object
|
44
|
+
# * start: start position of the Slice on the SeqRegion (default = 1)
|
45
|
+
# * stop: stop position of the Slice on the SeqRegion (default: end of
|
46
|
+
# SeqRegion)
|
47
|
+
# * strand: strand of the Slice relative to the SeqRegion (default = 1)
|
48
|
+
# *Returns*:: Slice object
|
49
|
+
def initialize(seq_region, start = 1, stop = nil, strand = 1)
  # Validate the region first: the stop default needs seq_region.length, so
  # it is resolved below instead of in the parameter list. That way a wrong
  # first argument always produces the message below.
  unless seq_region.is_a?(Ensembl::Core::SeqRegion)
    raise 'First argument has to be a Ensembl::Core::SeqRegion object'
  end

  # nil means "use the default" for both boundaries.
  start = 1 if start.nil?
  stop = seq_region.length if stop.nil?

  @seq_region, @start, @stop, @strand = seq_region, start, stop, strand
  @seq = nil # filled lazily by #seq
end
|
62
|
+
|
63
|
+
# = DESCRIPTION
|
64
|
+
# Create a Slice without first creating the SeqRegion object.
|
65
|
+
#
|
66
|
+
# = USAGE
|
67
|
+
# my_slice_1 = Slice.fetch_by_region('chromosome','4',95000,98000,1)
|
68
|
+
#
|
69
|
+
# ---
|
70
|
+
# *Arguments*:
|
71
|
+
# * coord_system: name of CoordSystem (required)
|
72
|
+
# * seq_region: name of SeqRegion (required)
|
73
|
+
# * start: start of Slice on SeqRegion (default = 1)
|
74
|
+
# * stop: stop of Slice on SeqRegion (default = end of SeqRegion)
|
75
|
+
# * strand: strand of Slice on SeqRegion
|
76
|
+
# *Returns*:: Ensembl::Core::Slice object
|
77
|
+
def self.fetch_by_region(coord_system_name, seq_region_name, start = nil, stop = nil, strand = 1, species = Ensembl::SESSION.collection_species, version = nil)
  # Builds a Slice from the names of a coordinate system and a sequence
  # region. Raises ArgumentError for a missing or unknown species on a
  # multi-species database, and a RuntimeError when the coordinate system
  # or the sequence region cannot be found.
  all_coord_systems = nil
  if Collection.check
    # The nil check has to come before the downcase call, otherwise a
    # missing species fails with NoMethodError instead of this message.
    if species.nil?
      raise ArgumentError, "When using multi-species db, you must pass a specie name to get the correct Slice"
    end
    species = species.downcase
    species_id = Collection.get_species_id(species)
    raise ArgumentError, "No specie found in the database with this name: #{species}" if species_id.nil?
    all_coord_systems = Ensembl::Core::CoordSystem.find_all_by_name_and_species_id(coord_system_name, species_id)
  else
    all_coord_systems = Ensembl::Core::CoordSystem.find_all_by_name(coord_system_name)
  end

  coord_system = nil
  if version.nil? # Take the version with the lower rank
    coord_system = all_coord_systems.sort_by { |cs| cs.rank }.first
  else
    coord_system = all_coord_systems.select { |cs| cs.version == version }[0]
  end
  unless coord_system.is_a?(Ensembl::Core::CoordSystem)
    message = "Couldn't find a Ensembl::Core::CoordSystem object with name '#{coord_system_name}'"
    message += " and version '#{version}'" unless version.nil?
    raise message
  end

  seq_region = Ensembl::Core::SeqRegion.find_by_name_and_coord_system_id(seq_region_name, coord_system.id)
  unless seq_region.is_a?(Ensembl::Core::SeqRegion)
    raise "Couldn't find a Ensembl::Core::SeqRegion object with the name '#{seq_region_name}'"
  end

  return Ensembl::Core::Slice.new(seq_region, start, stop, strand)
end
|
113
|
+
|
114
|
+
# = DESCRIPTION
|
115
|
+
# Create a Slice based on a Gene
|
116
|
+
#
|
117
|
+
# = USAGE
|
118
|
+
# my_slice = Slice.fetch_by_gene_stable_id('ENSG00000184895')
|
119
|
+
#
|
120
|
+
# ---
|
121
|
+
# *Arguments*:
|
122
|
+
# * gene_stable_id: Ensembl gene stable_id (required)
|
123
|
+
# *Returns*:: Ensembl::Core::Slice object
|
124
|
+
def self.fetch_by_gene_stable_id(gene_stable_id, flanking_seq_length = 0)
  # Resolve the stable id to its gene, then build a slice over the gene's
  # region, widened by flanking_seq_length on both sides.
  gene = Ensembl::Core::GeneStableId.find_by_stable_id(gene_stable_id).gene
  region = gene.seq_region
  slice_start = gene.seq_region_start - flanking_seq_length
  slice_stop = gene.seq_region_end + flanking_seq_length

  Ensembl::Core::Slice.new(region, slice_start, slice_stop, gene.seq_region_strand)
end
|
131
|
+
|
132
|
+
# = DESCRIPTION
|
133
|
+
# Create a Slice based on a Transcript
|
134
|
+
#
|
135
|
+
# = USAGE
|
136
|
+
# my_slice = Slice.fetch_by_transcript_stable_id('ENST00000383673')
|
137
|
+
#
|
138
|
+
# ---
|
139
|
+
# *Arguments*:
|
140
|
+
# * transcript_stable_id: Ensembl transcript stable_id (required)
|
141
|
+
# *Returns*:: Ensembl::Core::Slice object
|
142
|
+
def self.fetch_by_transcript_stable_id(transcript_stable_id, flanking_seq_length = 0)
  # Resolve the stable id to its transcript, then build a slice over the
  # transcript's region, widened by flanking_seq_length on both sides.
  transcript = Ensembl::Core::TranscriptStableId.find_by_stable_id(transcript_stable_id).transcript
  region = transcript.seq_region
  slice_start = transcript.seq_region_start - flanking_seq_length
  slice_stop = transcript.seq_region_end + flanking_seq_length

  Ensembl::Core::Slice.new(region, slice_start, slice_stop, transcript.seq_region_strand)
end
|
149
|
+
|
150
|
+
# = DESCRIPTION
|
151
|
+
# Create an array of all Slices for a given coordinate system.
|
152
|
+
#
|
153
|
+
# = USAGE
|
154
|
+
# slices = Slice.fetch_all('chromosome')
|
155
|
+
#
|
156
|
+
# ---
|
157
|
+
# *Arguments*:
|
158
|
+
# * coord_system_name:: name of coordinate system (default = chromosome)
|
159
|
+
# * coord_system_version:: version of coordinate system (default = nil)
|
160
|
+
# *Returns*:: an array of Ensembl::Core::Slice objects
|
161
|
+
def self.fetch_all(coord_system_name = 'chromosome', species = Ensembl::SESSION.collection_species, version = nil)
  # Returns one full-length Slice per sequence region of the requested
  # coordinate system. On a multi-species database the species selects the
  # coordinate system; a missing or unknown species raises ArgumentError.
  # An unknown coordinate system raises a RuntimeError.
  coord_system = nil
  if Collection.check
    # Same guard and message as Slice.fetch_by_region: without it a nil
    # species fails on the downcase call with NoMethodError.
    if species.nil?
      raise ArgumentError, "When using multi-species db, you must pass a specie name to get the correct Slice"
    end
    species = species.downcase
    species_id = Collection.get_species_id(species)
    raise ArgumentError, "No specie found in the database with this name: #{species}" if species_id.nil?
    if version.nil?
      coord_system = Ensembl::Core::CoordSystem.find_by_name_and_species_id(coord_system_name, species_id)
    else
      coord_system = Ensembl::Core::CoordSystem.find_by_name_and_species_id_and_version(coord_system_name, species_id, version)
    end
  else
    if version.nil?
      coord_system = Ensembl::Core::CoordSystem.find_by_name(coord_system_name)
    else
      coord_system = Ensembl::Core::CoordSystem.find_by_name_and_version(coord_system_name, version)
    end
  end

  # Fail with a readable message instead of calling seq_regions on nil.
  if coord_system.nil?
    message = "Couldn't find a Ensembl::Core::CoordSystem object with name '#{coord_system_name}'"
    message += " and version '#{version}'" unless version.nil?
    raise message
  end

  answer = Array.new
  coord_system.seq_regions.each do |seq_region|
    answer.push(Ensembl::Core::Slice.new(seq_region))
  end
  return answer
end
|
185
|
+
|
186
|
+
##################
|
187
|
+
## GENERAL METHODS
|
188
|
+
##################
|
189
|
+
|
190
|
+
# = DESCRIPTION
|
191
|
+
# Get the length of a slice
|
192
|
+
#
|
193
|
+
# = USAGE
|
194
|
+
# chr4 = SeqRegion.find_by_name('4')
|
195
|
+
# my_slice = Slice.new(chr4, 95000, 98000, -1)
|
196
|
+
# puts my_slice.length
|
197
|
+
# ---
|
198
|
+
# *Arguments*:: none
|
199
|
+
# *Returns*:: Integer
|
200
|
+
def length
  # Both boundaries are inclusive, hence the + 1.
  span = self.stop - self.start
  span + 1
end
|
203
|
+
|
204
|
+
# = DESCRIPTION
|
205
|
+
# The display_name method returns a full name of this slice, containing
|
206
|
+
# the name of the coordinate system, the sequence region, start and
|
207
|
+
# stop positions on that sequence region and the strand. E.g. for a slice
|
208
|
+
# of bovine chromosome 4 from position 95000 to 98000 on the reverse strand,
|
209
|
+
# the display_name would look like: chromosome:Btau_3.1:4:95000:98000:-1
|
210
|
+
#
|
211
|
+
# = USAGE
|
212
|
+
# puts my_slice.display_name
|
213
|
+
# ---
|
214
|
+
# *Arguments*:: none
|
215
|
+
# *Result*:: String
|
216
|
+
def display_name
  # Fields, in order: coordinate system name, coordinate system version,
  # sequence region name, start, stop, strand -- joined with colons.
  fields = [
    self.seq_region.coord_system.name,
    self.seq_region.coord_system.version,
    self.seq_region.name,
    self.start.to_s,
    self.stop.to_s,
    self.strand.to_s
  ]
  fields.join(':')
end
|
219
|
+
alias to_s display_name
|
220
|
+
|
221
|
+
# = DESCRIPTION
|
222
|
+
# The Slice#overlaps? method checks if this slice overlaps another one.
|
223
|
+
# The other slice has to be on the same coordinate system
|
224
|
+
#
|
225
|
+
# = USAGE
|
226
|
+
# slice_a = Slice.fetch_by_region('chromosome','X',1,1000)
|
227
|
+
# slice_b = Slice.fetch_by_region('chromosome','X',900,1500)
|
228
|
+
# if slice_a.overlaps?(slice_b)
|
229
|
+
# puts "These slices overlap"
|
230
|
+
# end
|
231
|
+
# ---
|
232
|
+
# *Arguments*:: another slice
|
233
|
+
# *Returns*:: true or false
|
234
|
+
def overlaps?(other_slice)
  # True when this slice and other_slice share at least one position.
  # Raises a RuntimeError when the argument is not a Slice or lives on
  # another coordinate system.
  #
  # The previous guard, "! other_slice.class == Slice", negated the class
  # first and then compared false with Slice, so it could never fire.
  unless other_slice.is_a?(Slice)
    raise RuntimeError, "The Slice#overlaps? method takes a Slice object as its arguments."
  end
  if self.seq_region.coord_system != other_slice.seq_region.coord_system
    raise RuntimeError, "The argument slice of Slice#overlaps? has to be in the same coordinate system, but were " + self.seq_region.coord_system.name + " and " + other_slice.seq_region.coord_system.name
  end

  self_range = self.start .. self.stop
  other_range = other_slice.start .. other_slice.stop

  # Two ranges overlap exactly when one of them contains the start of the
  # other one.
  self_range.include?(other_slice.start) || other_range.include?(self.start)
end
|
251
|
+
|
252
|
+
# = DESCRIPTION
|
253
|
+
# The Slice#within? method checks if this slice is contained within another one.
|
254
|
+
# The other slice has to be on the same coordinate system
|
255
|
+
#
|
256
|
+
# = USAGE
|
257
|
+
# slice_a = Slice.fetch_by_region('chromosome','X',1,1000)
|
258
|
+
# slice_b = Slice.fetch_by_region('chromosome','X',900,950)
|
259
|
+
# if slice_b.within?(slice_a)
|
260
|
+
# puts "Slice b is within slice a"
|
261
|
+
# end
|
262
|
+
# ---
|
263
|
+
# *Arguments*:: another slice
|
264
|
+
# *Returns*:: true or false
|
265
|
+
def within?(other_slice)
  # True when both the start and the stop of this slice lie inside
  # other_slice. Raises a RuntimeError when the argument is not a Slice or
  # lives on another coordinate system.
  #
  # The previous guard, "! other_slice.class == Slice", could never fire,
  # and both messages named Slice#overlaps? instead of this method.
  unless other_slice.is_a?(Slice)
    raise RuntimeError, "The Slice#within? method takes a Slice object as its arguments."
  end
  if self.seq_region.coord_system != other_slice.seq_region.coord_system
    raise RuntimeError, "The argument slice of Slice#within? has to be in the same coordinate system, but were " + self.seq_region.coord_system.name + " and " + other_slice.seq_region.coord_system.name
  end

  other_range = other_slice.start .. other_slice.stop

  other_range.include?(self.start) && other_range.include?(self.stop)
end
|
282
|
+
|
283
|
+
# = DESCRIPTION
|
284
|
+
# The Slice#excise method removes a bit of a slice and returns the
|
285
|
+
# remainder as separate slices.
|
286
|
+
#
|
287
|
+
# = USAGE
|
288
|
+
# original_slice = Slice.fetch_by_region('chromosome','X',1,10000)
|
289
|
+
# new_slices = original_slice.excise([500..750, 1050..1075])
|
290
|
+
# new_slices.each do |s|
|
291
|
+
# puts s.display_name
|
292
|
+
# end
|
293
|
+
#
|
294
|
+
# # result:
|
295
|
+
# # chromosome:X:1:499:1
|
296
|
+
# # chromosome:X:751:1049:1
|
297
|
+
# # chromosome:X:1076:10000:1
|
298
|
+
# ---
|
299
|
+
# *Arguments*:
|
300
|
+
# * ranges: array of ranges (required)
|
301
|
+
# *Returns*:: array of Slice objects
|
302
|
+
def excise(ranges)
  # Cuts the given ranges out of this slice and returns what is left as new
  # slices on the same seq_region. The new slices are created without a
  # strand argument.
  unless ranges.class == Array && ranges.all? { |r| r.class == Range }
    raise RuntimeError, "Argument should be an array of ranges"
  end

  remainder = Array.new
  cursor = self.start # first position that has not been handled yet
  ranges.sort_by { |r| r.first }.each do |cut|
    # Keep the stretch in front of this cut, if there is one.
    if cursor <= cut.first - 1
      remainder.push(Slice.new(self.seq_region, cursor, cut.first - 1))
    end
    cursor = cut.last + 1
    # A cut that runs past the end of the slice leaves nothing behind it.
    return remainder if cut.last > self.stop
  end

  # NOTE(review): when the last cut ends exactly at self.stop, this final
  # slice has start == stop + 1 -- confirm callers expect that.
  remainder.push(Slice.new(self.seq_region, cursor, self.stop))
  remainder
end
|
328
|
+
|
329
|
+
# = DESCRIPTION
|
330
|
+
# Get the sequence of the Slice as a Bio::Sequence::NA object.
|
331
|
+
#
|
332
|
+
# If the Slice is on a CoordSystem that is not seq_level, it will try
|
333
|
+
# to project its coordinates to the CoordSystem that is. At this
|
334
|
+
# moment, this is only done if there is a direct link between the
|
335
|
+
# two coordinate systems. (The perl API allows for following an
|
336
|
+
# indirect link as well.)
|
337
|
+
#
|
338
|
+
# Caution: Bio::Sequence::NA makes the sequence
|
339
|
+
# downcase!!
|
340
|
+
#
|
341
|
+
# = USAGE
|
342
|
+
# my_slice.seq.seq.to_s
|
343
|
+
#
|
344
|
+
# ---
|
345
|
+
# *Arguments*:: none
|
346
|
+
# *Returns*:: Bio::Sequence::NA object
|
347
|
+
# Returns the sequence of this slice as a Bio::Sequence::NA object and
# memoizes it in @seq, so the lookup below runs only once per slice.
def seq
  return @seq unless @seq.nil?

  seqlevel_id = Ensembl::SESSION.seqlevel_id
  if !seqlevel_id.nil? && self.seq_region.coord_system_id == seqlevel_id
    # The slice already sits on the sequence-level coordinate system.
    bases = self.seq_region.subseq(self.start, self.stop)
  else
    # Project onto 'seqlevel' and stitch the pieces together; anything
    # that is not a Slice is a gap and is padded with N.
    @target_slices = self.project('seqlevel')
    bases = @target_slices.inject(String.new) do |joined, component|
      joined + (component.class == Slice ? component.seq : 'N' * (component.length))
    end
  end

  @seq = Bio::Sequence::NA.new(bases)
  @seq.reverse_complement! if self.strand == -1
  @seq
end
|
379
|
+
|
380
|
+
# Placeholder: repeat-masked sequence retrieval is not available, so every
# call raises NotImplementedError.
def repeatmasked_seq
  raise(NotImplementedError)
end
|
383
|
+
|
384
|
+
# = DESCRIPTION
|
385
|
+
# Take a sub_slice from an existing one.
|
386
|
+
#
|
387
|
+
# = USAGE
|
388
|
+
# my_sub_slice = my_slice.sub_slice(400,500)
|
389
|
+
#
|
390
|
+
# ---
|
391
|
+
# *Arguments*:
|
392
|
+
# * start: start of subslice, in seq_region coordinates (default: start of slice)
|
393
|
+
# * stop: stop of subslice, in seq_region coordinates (default: stop of slice)
|
394
|
+
# *Returns*:: Ensembl::Core::Slice object
|
395
|
+
# Builds a new slice of the same class on the same seq_region and strand.
# start and stop are handed to the constructor unchanged and default to
# this slice's own boundaries.
def sub_slice(start = self.start, stop = self.stop)
  slice_class = self.class
  slice_class.new(self.seq_region, start, stop, self.strand)
end
|
398
|
+
|
399
|
+
# = DESCRIPTION
|
400
|
+
# Creates overlapping subslices for a given Slice.
|
401
|
+
#
|
402
|
+
# = USAGE
|
403
|
+
# my_slice.split(50000, 250).each do |sub_slice|
|
404
|
+
# puts sub_slice.display_name
|
405
|
+
# end
|
406
|
+
#
|
407
|
+
# ---
|
408
|
+
# *Arguments*:
|
409
|
+
# * max_size: maximal size of subslices (default: 100000)
|
410
|
+
# * overlap: overlap in bp between consecutive subslices (default: 0)
|
411
|
+
# *Returns*:: array of Ensembl::Core::Slice objects
|
412
|
+
# Cuts this slice into consecutive windows.
#
# max_size:: maximal number of positions per sub-slice (default: 100000)
# overlap::  number of positions shared by two consecutive sub-slices
#            (default: 0)
# Returns an Array of sub-slices (built with sub_slice) that starts at
# self.start, ends exactly at self.stop and never reaches beyond it; the
# last window may be shorter than max_size.
# Raises ArgumentError when overlap is negative or not smaller than
# max_size, because the windows could then never advance.
def split(max_size = 100000, overlap = 0)
  advance = max_size - overlap
  if overlap < 0 || advance <= 0
    raise ArgumentError, "max_size must be larger than overlap, and overlap must not be negative"
  end

  sub_slices = []
  window_start = self.start
  loop do
    # Clip the window so the final piece stops at the end of the slice.
    window_stop = [window_start + max_size - 1, self.stop].min
    sub_slices.push(self.sub_slice(window_start, window_stop))
    break if window_stop >= self.stop
    window_start += advance
  end
  sub_slices
end
|
422
|
+
|
423
|
+
############################
|
424
|
+
## GET ELEMENTS WITHIN SLICE
|
425
|
+
############################
|
426
|
+
|
427
|
+
#--
|
428
|
+
# As there should be 'getters' for a lot of classes, we'll implement
|
429
|
+
# this with method_missing. For some of the original methods, see the end
|
430
|
+
# of this file.
|
431
|
+
#
|
432
|
+
# The optional argument is either 'true' or 'false' (default = false).
|
433
|
+
# False if the features have to be completely contained within the slice;
|
434
|
+
# true if a partial overlap is sufficient.
|
435
|
+
#++
|
436
|
+
# Don't use this method yourself.
|
437
|
+
def method_missing(method_name, *args)
|
438
|
+
table_name = method_name.to_s.singularize
|
439
|
+
class_name = table_name.camelcase
|
440
|
+
|
441
|
+
# Convert to the class object
|
442
|
+
target_class = nil
|
443
|
+
ObjectSpace.each_object(Class) do |o|
|
444
|
+
if o.name =~ /^Ensembl::Core::#{class_name}$/
|
445
|
+
target_class = o
|
446
|
+
end
|
447
|
+
end
|
448
|
+
|
449
|
+
# If it exists, see if it implements Sliceable
|
450
|
+
if ! target_class.nil? and target_class.include?(Sliceable)
|
451
|
+
inclusive = false
|
452
|
+
if [TrueClass, FalseClass].include?(args[0].class)
|
453
|
+
inclusive = args[0]
|
454
|
+
end
|
455
|
+
return self.get_objects(target_class, table_name, inclusive)
|
456
|
+
end
|
457
|
+
|
458
|
+
raise NoMethodError
|
459
|
+
|
460
|
+
end
|
461
|
+
|
462
|
+
# Don't use this method yourself.
|
463
|
+
# Internal helper behind the dynamic feature getters.
#
# target_class:: class whose find_by_sql is used to load the features
# table_name::   name of the feature table to query
# inclusive::    false (default) keeps only features lying completely
#                inside the slice; true also keeps features that overlap it
# Returns a flat Array without duplicates holding the features found on
# this slice's own coordinate system plus those found after projecting the
# slice onto every other coordinate system that carries such features.
def get_objects(target_class, table_name, inclusive = false)
  # One query builder for both the native and the projected lookups.
  sql_for = lambda do |region_id, region_start, region_stop|
    if inclusive
      <<-SQL
        SELECT * FROM #{table_name}
        WHERE seq_region_id = #{region_id}
        AND (( seq_region_start BETWEEN #{region_start} AND #{region_stop} )
        OR ( seq_region_end BETWEEN #{region_start} AND #{region_stop} )
        OR ( seq_region_start <= #{region_start} AND seq_region_end >= #{region_stop} )
        )
      SQL
    else
      <<-SQL
        SELECT * FROM #{table_name}
        WHERE seq_region_id = #{region_id}
        AND seq_region_start >= #{region_start}
        AND seq_region_end <= #{region_stop}
      SQL
    end
  end

  # Get all the coord_systems with this type of features on them.
  meta_coords =
    if Collection.check
      Collection.find_all_coord_by_table_name(table_name, self.seq_region.coord_system.species_id)
    else
      MetaCoord.find_all_by_table_name(table_name)
    end
  coord_system_ids_with_features = meta_coords.collect { |mc| mc.coord_system_id }

  answer = []

  # Features annotated directly on the coordinate system of this slice.
  own_coord_system_id = self.seq_region.coord_system_id
  if coord_system_ids_with_features.include?(own_coord_system_id)
    answer.push(target_class.find_by_sql(sql_for.call(self.seq_region.id, self.start, self.stop)))
    coord_system_ids_with_features.delete(own_coord_system_id)
  end

  # Features on the remaining coordinate systems: project the slice there
  # and query every projected piece that is a Slice (gaps are skipped).
  coord_system_ids_with_features.each do |target_coord_system_id|
    target_slices = self.project(CoordSystem.find(target_coord_system_id).name)
    target_slices.each do |slice|
      next unless slice.class == Slice
      answer.push(target_class.find_by_sql(sql_for.call(slice.seq_region.id, slice.start, slice.stop)))
    end
  end

  answer.flatten.uniq
end
|
533
|
+
|
534
|
+
|
535
|
+
# = DESCRIPTION
|
536
|
+
# Get all MiscFeatures that are located on a Slice for a given MiscSet.
|
537
|
+
#
|
538
|
+
# Pitfall: just looks at the CoordSystem that the Slice is located on.
|
539
|
+
# For example, if a Slice is located on a SeqRegion on the 'chromosome'
|
540
|
+
# CoordSystem, but all misc_features are annotated on SeqRegions of
|
541
|
+
# the 'scaffold' CoordSystem, this method will return an empty array.
|
542
|
+
#
|
543
|
+
# = USAGE
|
544
|
+
# my_slice.misc_features('encode').each do |feature|
|
545
|
+
# puts feature.to_yaml
|
546
|
+
# end
|
547
|
+
# ---
|
548
|
+
# *Arguments*:
|
549
|
+
# * code: code of MiscSet
|
550
|
+
# *Returns*:: array of MiscFeature objects
|
551
|
+
# Collects the MiscFeatures of this slice's seq_region that lie within the
# slice, optionally restricted to one MiscSet.
#
# code:: code of the MiscSet to filter on; nil (default) keeps the
#        features of every set
# Returns an Array of the matching features. A feature matches when any of
# its misc_sets carries the code, and when its start and end fall inside
# the slice, boundaries included.
def misc_features(code = nil)
  answer = []
  self.seq_region.misc_features.each do |mf|
    # Look at every set the feature belongs to, not only the first one.
    next unless code.nil? || mf.misc_sets.any? { |set| set.code == code }
    # Inclusive bounds, like the other feature getters of this class.
    next unless mf.seq_region_start >= self.start && mf.seq_region_end <= self.stop
    answer.push(mf)
  end
  answer
end
|
570
|
+
|
571
|
+
# = DESCRIPTION
|
572
|
+
# Get all DnaAlignFeatures that are located on a Slice for a given Analysis.
|
573
|
+
#
|
574
|
+
# Pitfall: just looks at the CoordSystem that the Slice is located on.
|
575
|
+
# For example, if a Slice is located on a SeqRegion on the 'chromosome'
|
576
|
+
# CoordSystem, but all dna_align_features are annotated on SeqRegions of
|
577
|
+
# the 'scaffold' CoordSystem, this method will return an empty array.
|
578
|
+
#
|
579
|
+
# = USAGE
|
580
|
+
# my_slice.dna_align_features('Vertrna').each do |feature|
|
581
|
+
# puts feature.to_yaml
|
582
|
+
# end
|
583
|
+
# ---
|
584
|
+
# *Arguments*:
|
585
|
+
# * analysis_name: logic name of the analysis (default: nil, i.e. features of all analyses)
|
586
|
+
# *Returns*:: array of DnaAlignFeature objects
|
587
|
+
# Fetches the DnaAlignFeatures that lie completely within this slice.
#
# analysis_name:: logic name of an Analysis to restrict the result to;
#                 nil (default) returns the features of every analysis
# Returns the result of DnaAlignFeature.find_by_sql, or an empty Array when
# no Analysis with that logic name exists.
def dna_align_features(analysis_name = nil)
  sql = "SELECT * FROM dna_align_feature WHERE seq_region_id = #{self.seq_region.id}"
  sql += " AND seq_region_start >= #{self.start} AND seq_region_end <= #{self.stop}"
  return DnaAlignFeature.find_by_sql(sql) if analysis_name.nil?

  analysis = Analysis.find_by_logic_name(analysis_name)
  # An unknown analysis cannot have features; do not dereference nil.
  return [] if analysis.nil?

  DnaAlignFeature.find_by_sql(sql + " AND analysis_id = #{analysis.id}")
end
|
595
|
+
|
596
|
+
# = DESCRIPTION
|
597
|
+
# Get all ProteinAlignFeatures that are located on a Slice for a given Analysis.
|
598
|
+
#
|
599
|
+
# Pitfall: just looks at the CoordSystem that the Slice is located on.
|
600
|
+
# For example, if a Slice is located on a SeqRegion on the 'chromosome'
|
601
|
+
# CoordSystem, but all protein_align_features are annotated on SeqRegions of
|
602
|
+
# the 'scaffold' CoordSystem, this method will return an empty array.
|
603
|
+
#
|
604
|
+
# = USAGE
|
605
|
+
# my_slice.protein_align_features('Uniprot').each do |feature|
|
606
|
+
# puts feature.to_yaml
|
607
|
+
# end
|
608
|
+
# ---
|
609
|
+
# *Arguments*:
|
610
|
+
# * analysis_name: logic name of the analysis
|
611
|
+
# *Returns*:: array of ProteinAlignFeature objects
|
612
|
+
# Fetches the ProteinAlignFeatures that lie completely within this slice.
#
# analysis_name:: logic name of an Analysis to restrict the result to;
#                 nil (default) returns the features of every analysis
# Returns the result of ProteinAlignFeature.find_by_sql, or an empty Array
# when no Analysis with that logic name exists.
def protein_align_features(analysis_name = nil)
  sql = "SELECT * FROM protein_align_feature WHERE seq_region_id = #{self.seq_region.id}"
  sql += " AND seq_region_start >= #{self.start} AND seq_region_end <= #{self.stop}"
  return ProteinAlignFeature.find_by_sql(sql) if analysis_name.nil?

  analysis = Analysis.find_by_logic_name(analysis_name)
  # An unknown analysis cannot have features; do not dereference nil.
  return [] if analysis.nil?

  ProteinAlignFeature.find_by_sql(sql + " AND analysis_id = #{analysis.id}")
end
|
620
|
+
|
621
|
+
############################
|
622
|
+
## VARIATION METHODS
|
623
|
+
############################
|
624
|
+
|
625
|
+
|
626
|
+
#= DESCRIPTION
|
627
|
+
# Method to retrieve Variation features from Ensembl::Core::Slice objects
|
628
|
+
#= USAGE
|
629
|
+
# slice = Slice.fetch_by_region('chromosome',1,50000,51000)
|
630
|
+
# variations = slice.get_variation_features
|
631
|
+
# variations.each do |vf|
|
632
|
+
# puts vf.variation_name, vf.allele_string
|
633
|
+
# puts vf.variation.ancestral_allele
|
634
|
+
# end
|
635
|
+
# Calls variation_connection first, then returns the VariationFeatures of
# this slice's seq_region that start and end within the slice.
def get_variation_features
  variation_connection()
  within_slice = [
    "seq_region_id = ? AND seq_region_start >= ? AND seq_region_end <= ?",
    self.seq_region.seq_region_id,
    self.start,
    self.stop
  ]
  Ensembl::Variation::VariationFeature.find(:all, :conditions => within_slice)
end
|
639
|
+
|
640
|
+
# Like get_variation_features, but restricted to rows whose flags column
# equals 'genotyped'.
def get_genotyped_variation_features
  variation_connection()
  genotyped_within_slice = [
    "flags = 'genotyped' AND seq_region_id = ? AND seq_region_start >= ? AND seq_region_end <= ?",
    self.seq_region.seq_region_id,
    self.start,
    self.stop
  ]
  Ensembl::Variation::VariationFeature.find(:all, :conditions => genotyped_within_slice)
end
|
644
|
+
|
645
|
+
private
|
646
|
+
|
647
|
+
# Opens the variation database connection unless one is already open.
# Species and release are taken from the name of the core database as
# reported by Ensembl::Core::DBConnection.get_info, and the core
# connection's credentials, host and port are reused.
# Raises NameError when the core database name does not match the pattern.
def variation_connection()
  return nil if Ensembl::Variation::DBConnection.connected?

  host, user, password, db_name, port = Ensembl::Core::DBConnection.get_info
  unless db_name =~/(\w+_\w+)_\w+_(\d+)_\S+/
    raise NameError, "Can't get Variation Database name from #{db_name}. Are you using non conventional names?"
  end

  species = Regexp.last_match(1)
  release = Regexp.last_match(2)
  Ensembl::Variation::DBConnection.connect(species, release.to_i, :username => user, :password => password, :host => host, :port => port)
end
|
659
|
+
|
660
|
+
|
661
|
+
end #Slice
|
662
|
+
|
663
|
+
# = DESCRIPTION
|
664
|
+
# The Gap class is similar to the Slice object, but describes a gap and
|
665
|
+
# therefore can easily be described by coordinate system and size.
|
666
|
+
#
|
667
|
+
class Gap
  # coord_system:: object the gap belongs to; display_name calls its name method
  # size::         size of the gap
  attr_accessor :coord_system, :size

  # Reading length gives the same value as reading size.
  alias_method :length, :size

  # = DESCRIPTION
  # Create a new Gap object from scratch.
  #
  # = USAGE
  #  my_coord_system = CoordSystem.find_by_name('chromosome')
  #  # Create a gap of 10kb.
  #  gap = Gap.new(my_coord_system, 10000)
  # ---
  # *Arguments*:
  # * coord_system: CoordSystem object (required)
  # * size: size of the gap (required)
  # *Returns*:: Gap object
  def initialize(coord_system, size)
    @coord_system = coord_system
    @size = size
  end

  # Text label of the form "<coord system name>:gap:<size>".
  def display_name
    @coord_system.name + ":gap:#{@size}"
  end
end #Gap
|
691
|
+
|
692
|
+
end #Core
|
693
|
+
end #Ensembl
|