ruby-ensembl-api 0.9.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/TUTORIAL.rdoc +623 -0
- data/bin/ensembl +40 -0
- data/lib/ensembl.rb +64 -0
- data/lib/ensembl/core/activerecord.rb +1914 -0
- data/lib/ensembl/core/collection.rb +60 -0
- data/lib/ensembl/core/project.rb +264 -0
- data/lib/ensembl/core/slice.rb +693 -0
- data/lib/ensembl/core/transcript.rb +425 -0
- data/lib/ensembl/core/transform.rb +97 -0
- data/lib/ensembl/db_connection.rb +216 -0
- data/lib/ensembl/variation/activerecord.rb +253 -0
- data/lib/ensembl/variation/variation.rb +163 -0
- data/test/unit/data/seq_c6qbl.fa +10 -0
- data/test/unit/data/seq_cso19_coding.fa +16 -0
- data/test/unit/data/seq_cso19_transcript.fa +28 -0
- data/test/unit/data/seq_drd3_gene.fa +838 -0
- data/test/unit/data/seq_drd3_transcript.fa +22 -0
- data/test/unit/data/seq_drd4_transcript.fa +24 -0
- data/test/unit/data/seq_forward_composite.fa +1669 -0
- data/test/unit/data/seq_par_boundary.fa +169 -0
- data/test/unit/data/seq_rnd3_transcript.fa +47 -0
- data/test/unit/data/seq_ub2r1_coding.fa +13 -0
- data/test/unit/data/seq_ub2r1_gene.fa +174 -0
- data/test/unit/data/seq_ub2r1_transcript.fa +26 -0
- data/test/unit/data/seq_y.fa +2 -0
- data/test/unit/ensembl_genomes/test_collection.rb +51 -0
- data/test/unit/ensembl_genomes/test_gene.rb +52 -0
- data/test/unit/ensembl_genomes/test_slice.rb +71 -0
- data/test/unit/ensembl_genomes/test_variation.rb +17 -0
- data/test/unit/release_50/core/test_project.rb +215 -0
- data/test/unit/release_50/core/test_project_human.rb +58 -0
- data/test/unit/release_50/core/test_relationships.rb +66 -0
- data/test/unit/release_50/core/test_sequence.rb +175 -0
- data/test/unit/release_50/core/test_slice.rb +121 -0
- data/test/unit/release_50/core/test_transcript.rb +108 -0
- data/test/unit/release_50/core/test_transform.rb +223 -0
- data/test/unit/release_50/variation/test_activerecord.rb +143 -0
- data/test/unit/release_50/variation/test_variation.rb +84 -0
- data/test/unit/release_53/core/test_gene.rb +66 -0
- data/test/unit/release_53/core/test_project.rb +96 -0
- data/test/unit/release_53/core/test_project_human.rb +65 -0
- data/test/unit/release_53/core/test_slice.rb +47 -0
- data/test/unit/release_53/core/test_transform.rb +63 -0
- data/test/unit/release_53/variation/test_activerecord.rb +145 -0
- data/test/unit/release_53/variation/test_variation.rb +71 -0
- data/test/unit/release_56/core/test_gene.rb +66 -0
- data/test/unit/release_56/core/test_project.rb +96 -0
- data/test/unit/release_56/core/test_slice.rb +54 -0
- data/test/unit/release_56/core/test_transform.rb +63 -0
- data/test/unit/release_56/variation/test_activerecord.rb +142 -0
- data/test/unit/release_56/variation/test_variation.rb +68 -0
- data/test/unit/test_connection.rb +66 -0
- data/test/unit/test_releases.rb +136 -0
- metadata +128 -0
@@ -0,0 +1,60 @@
|
|
1
|
+
#
|
2
|
+
# = ensembl/core/collection.rb
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2009 Francesco Strozzi <francesco.strozzi@gmail.com>
|
5
|
+
#
|
6
|
+
# License:: The Ruby License
|
7
|
+
|
8
|
+
module Ensembl
|
9
|
+
nil
|
10
|
+
module Core
|
11
|
+
# = DESCRIPTION
|
12
|
+
# Class to describe and handle multi-species databases
|
13
|
+
#
|
14
|
+
class Collection
  # = DESCRIPTION
  # Method to check if the current core database is a multi-species db.
  # The decision is taken from the database name returned by
  # DBConnection.get_info: any name matching "<word>_collection_core_"
  # counts as a collection database.
  #
  # ---
  # *Arguments*:: none
  # *Returns*:: true or false
  def self.check()
    # Only the database name is needed; the other connection details are ignored.
    _host, _user, _password, db_name, _port = Ensembl::Core::DBConnection.get_info
    if db_name =~ /(\w+)_collection_core_.*/
      return true
    end
    return false
  end

  # = DESCRIPTION
  # Returns an array with all the Species present in a collection database,
  # i.e. the meta_value of every meta record whose meta_key is
  # "species.db_name".
  #
  # ---
  # *Arguments*:: none
  # *Returns*:: array of meta values
  def self.species()
    return Meta.find_all_by_meta_key("species.db_name").collect { |m| m.meta_value }
  end

  # = DESCRIPTION
  # Returns the species_id of a particular species present in the database.
  # The name is compared case-insensitively against meta_value.
  #
  # ---
  # *Arguments*:
  # * species:: name of the species (required)
  # *Returns*:: the species_id of the first matching meta record, or nil
  # when no record matches
  def self.get_species_id(species)
    species = species.downcase
    # The name is passed as a bind variable so that quotes in the species
    # name cannot change the SQL statement.
    meta = Meta.find_by_sql(["SELECT * FROM meta WHERE LOWER(meta_value) = ?", species])[0]
    if meta.nil?
      return nil
    else
      return meta.species_id
    end
  end

  # = DESCRIPTION
  # Returns an array with all the MetaCoord records associated with a
  # particular species and a table_name.
  # Used inside Slice#method_missing to filter the coord_system_id using a particular species_id.
  #
  # ---
  # *Arguments*:
  # * table_name:: name of the table to look up in meta_coord (required)
  # * species_id:: id of the species (required)
  # *Returns*:: the result of MetaCoord.find_all_by_coord_system_id_and_table_name
  def self.find_all_coord_by_table_name(table_name,species_id)
    # Despite its name, all_ids holds whatever CoordSystem.find_all_by_species_id
    # returns; it is handed to the MetaCoord finder unchanged.
    all_ids = CoordSystem.find_all_by_species_id(species_id)
    return MetaCoord.find_all_by_coord_system_id_and_table_name(all_ids,table_name)
  end

end
|
57
|
+
|
58
|
+
|
59
|
+
end
|
60
|
+
end
|
@@ -0,0 +1,264 @@
|
|
1
|
+
#
|
2
|
+
# = ensembl/core/project.rb - project calculations for Ensembl Slice
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2009 Jan Aerts <http://jandot.myopenid.com>
|
5
|
+
# Francesco Strozzi <francesco.strozzi@gmail.com>
|
6
|
+
#
|
7
|
+
# License:: The Ruby License
|
8
|
+
#
|
9
|
+
module Ensembl
|
10
|
+
module Core
|
11
|
+
class Slice
  # = DESCRIPTION
  # The Slice#project method is used to transfer coordinates from one
  # coordinate system to another. Suppose you have a slice on a
  # contig in human (let's say on contig AC000031.6.1.38703) and you
  # want to know the coordinates on the chromosome. This is a
  # projection of coordinates from a higher ranked coordinate system to
  # a lower ranked coordinate system. Projections can also be done
  # from a chromosome to the contig level. However, it might be possible
  # that more than one contig has to be included and that there exist
  # gaps between the contigs. The output of this method therefore is
  # an _array_ of Slice and Gap objects.
  #
  # At the moment, projections can only be done if the two coordinate
  # systems are linked directly in the 'assembly' table.
  #
  # = USAGE
  #
  #  # Get a contig slice in cow and project to scaffold level
  #  # (i.e. going from a high rank coord system to a lower rank coord
  #  # system)
  #  source_slice = Slice.fetch_by_region('contig', 'AAFC03020247', 42, 2007)
  #  target_slices = source_slice.project('scaffold')
  #  puts target_slices.length       #--> 1
  #  puts target_slices[0].display_name #--> scaffold:ChrUn.003.3522:6570:8535:1
  #
  #  # Get a chromosome slice in cow and project to scaffold level
  #  # (i.e. going from a low rank coord system to a higher rank coord
  #  # system)
  #  # The region 96652152..98000000 on BTA4 is covered by 2 scaffolds
  #  # that are separated by a gap.
  #  source_slice = Slice.fetch_by_region('chromosome','4', 96652152, 98000000)
  #  target_slices = source_slice.project('scaffold')
  #  puts target_slices.length   #--> 3
  #  first_bit, second_bit, third_bit = target_slices
  #  puts first_bit.display_name #--> scaffold:Btau_3.1:Chr4.003.105:42:599579:1
  #  puts second_bit.class       #--> Gap
  #  puts third_bit.display_name #--> scaffold:Btau_3.1:Chr4.003.106:1:738311:1
  #
  # ---
  # *Arguments*:
  # * coord_system_name:: name of coordinate system to project
  #   coordinates to ('toplevel' and 'seqlevel' are also accepted)
  # *Returns*:: an array consisting of Slices and, if necessary, Gaps;
  # an empty array if no assembly link covers this slice
  # *Raises*:: NotImplementedError if the slice overlaps a 'HAP' assembly
  # exception
  def project(coord_system_name)
    answer = Array.new # an array of slices

    # Coordinate systems are cached in the session, keyed by id and by name.
    unless Ensembl::SESSION.coord_systems.has_key?(self.seq_region.coord_system_id)
      Ensembl::SESSION.coord_systems[self.seq_region.coord_system_id] = self.seq_region.coord_system
      Ensembl::SESSION.coord_system_ids[Ensembl::SESSION.coord_systems[self.seq_region.coord_system_id].name] = self.seq_region.coord_system_id
    end
    source_coord_system = Ensembl::SESSION.coord_systems[self.seq_region.coord_system_id]
    target_coord_system = nil
    if coord_system_name == 'toplevel'
      target_coord_system = source_coord_system.find_toplevel
    elsif coord_system_name == 'seqlevel'
      target_coord_system = source_coord_system.find_seqlevel
    else
      unless Ensembl::SESSION.coord_system_ids.has_key?(coord_system_name)
        cs = source_coord_system.find_level(coord_system_name)
        Ensembl::SESSION.coord_systems[cs.id] = cs
        Ensembl::SESSION.coord_system_ids[cs.name] = cs.id
      end
      target_coord_system = Ensembl::SESSION.coord_systems[Ensembl::SESSION.coord_system_ids[coord_system_name]]
    end

    if target_coord_system.rank < source_coord_system.rank
      # We're going from component to assembly, which is easy.
      assembly_links = self.seq_region.assembly_links_as_component(source_coord_system)

      if assembly_links.length == 0
        return []
      else
        assembly_links.each do |assembly_link|
          target_seq_region = assembly_link.asm_seq_region
          target_start = self.start + assembly_link.asm_start - assembly_link.cmp_start
          target_stop = self.stop + assembly_link.asm_start - assembly_link.cmp_start
          target_strand = self.strand * assembly_link.ori # 1x1=>1, 1x-1=>-1, -1x-1=>1

          answer.push(Slice.new(target_seq_region, target_start, target_stop, target_strand))
        end
      end

    else
      # If we're going from assembly to component, the answer of the target method
      # is an array consisting of Slices intermitted with Gaps.

      # ASSEMBLY_EXCEPTIONS
      # CAUTION: there are exceptions to the assembly (stored in the assembly_exception)
      # table which make things a little bit more difficult... For example,
      # in human, the assembly data for the pseudo-autosomal region (PAR) of
      # Y is *not* stored in the assembly table. Instead, there is a record
      # in the assembly_exception table that says: "For chr Y positions 1
      # to 2709520, use chr X:1-2709520 for the assembly data."
      # As a solution, what we'll do here, is split the assembly up in blocks:
      # if a slice covers both the PAR and the allosomal region, we'll make
      # two subslices (let's call them blocks not to intercede with the
      # Slice#subslices method) and project these independently.
      assembly_exceptions = AssemblyException.find_all_by_seq_region_id(self.seq_region.id)
      if assembly_exceptions.length > 0
        # Check if this bit of the original slice is covered in the
        # assembly_exception table.
        overlapping_exceptions = Array.new
        assembly_exceptions.each do |ae|
          if Slice.new(self.seq_region, ae.seq_region_start, ae.seq_region_end).overlaps?(self)
            if ae.exc_type == 'HAP'
              raise NotImplementedError, "The haplotype exceptions are not implemented (yet). You can't project this slice."
            end
            overlapping_exceptions.push(ae)
          end
        end

        if overlapping_exceptions.length > 0
          # First get all assembly blocks from chromosome Y
          source_assembly_blocks = self.excise(overlapping_exceptions.collect{|e| e.seq_region_start .. e.seq_region_end})
          # And insert the blocks of chromosome X
          all_assembly_blocks = Array.new #both for chr X and Y
          # First do all exceptions between the first and last block
          previous_block = nil
          source_assembly_blocks.sort_by{|b| b.start}.each do |b|
            if previous_block.nil?
              all_assembly_blocks.push(b)
              previous_block = b
              next
            end
            # Find the exception record: the one ending right before this block
            exception = nil
            assembly_exceptions.each do |ae|
              if ae.seq_region_end == b.start - 1
                exception = ae
                break
              end
            end

            # NOTE(review): unlike the head/tail cases below, these bounds use
            # previous_block.stop and b.start without the +1/-1 adjustment, so
            # the inserted slice appears to overlap its neighbours by one
            # position on each side. Left unchanged; confirm intended bounds.
            new_slice_start = exception.exc_seq_region_start + ( previous_block.stop - exception.seq_region_start )
            new_slice_stop = exception.exc_seq_region_start + ( b.start - exception.seq_region_start )
            new_slice_strand = self.strand * exception.ori
            new_slice = Slice.fetch_by_region(self.seq_region.coord_system.name, SeqRegion.find(exception.exc_seq_region_id).name, new_slice_start, new_slice_stop, new_slice_strand)

            all_assembly_blocks.push(new_slice)
            all_assembly_blocks.push(b)
            previous_block = b
          end

          # And then see if we have to add an additional one at the start or end
          # NOTE(review): if the slice lies completely inside an exception,
          # source_assembly_blocks is empty and first_block is nil here.
          first_block = source_assembly_blocks.sort_by{|b| b.start}[0]
          if first_block.start > self.start
            exception = assembly_exceptions.sort_by{|ae| ae.seq_region_start}[0]
            new_slice_start = exception.exc_seq_region_start + ( self.start - exception.seq_region_start )
            new_slice_stop = exception.exc_seq_region_start + ( first_block.start - 1 - exception.seq_region_start )
            new_slice_strand = self.strand * exception.ori
            new_slice = Slice.fetch_by_region(self.seq_region.coord_system.name, SeqRegion.find(exception.exc_seq_region_id).name, new_slice_start, new_slice_stop, new_slice_strand)

            all_assembly_blocks.unshift(new_slice)
          end

          last_block = source_assembly_blocks.sort_by{|b| b.start}[-1]
          if last_block.stop < self.stop
            exception = assembly_exceptions.sort_by{|ae| ae.seq_region_start}[-1]
            new_slice_start = exception.exc_seq_region_start + ( last_block.stop + 1 - exception.seq_region_start )
            new_slice_stop = exception.exc_seq_region_start + ( self.stop - exception.seq_region_start )
            new_slice_strand = self.strand * exception.ori
            new_slice = Slice.fetch_by_region(self.seq_region.coord_system.name, SeqRegion.find(exception.exc_seq_region_id).name, new_slice_start, new_slice_stop, new_slice_strand)

            # The trailing block belongs at the end of the list. (Array#shift
            # removes elements and does not accept a Slice argument.)
            all_assembly_blocks.push(new_slice)
          end

          # Project every block on its own and concatenate the results.
          answer = Array.new
          all_assembly_blocks.each do |b|
            answer.push(b.project(coord_system_name))
          end
          answer.flatten!

          return answer
        end

      end
      # END OF ASSEMBLY_EXCEPTIONS

      # Get all AssemblyLinks starting from this assembly and for which
      # the cmp_seq_region.coord_system is what we want.
      assembly_links = self.seq_region.assembly_links_as_assembly(target_coord_system)

      # Now reject all the components that lie _before_ the source, then
      # reject all the components that lie _after_ the source.
      # Then sort based on their positions.
      sorted_overlapping_assembly_links = assembly_links.reject{|al| al.asm_end < self.start}.reject{|al| al.asm_start > self.stop}.sort_by{|al| al.asm_start}
      if sorted_overlapping_assembly_links.length == 0
        return []
      end

      # What we'll do, is create slices for all the underlying components,
      # including the first and the last one. At first, the first and last
      # components are added in their entirety and will only be cropped afterwards.
      sorted_overlapping_assembly_links.each_index do |i|
        this_link = sorted_overlapping_assembly_links[i]
        if i == 0
          # The first component's seq_region goes through the session cache.
          cmp_seq_region = nil
          if Ensembl::SESSION.seq_regions.has_key?(this_link.cmp_seq_region_id)
            cmp_seq_region = Ensembl::SESSION.seq_regions[this_link.cmp_seq_region_id]
          else
            cmp_seq_region = this_link.cmp_seq_region
            Ensembl::SESSION.seq_regions[cmp_seq_region.id] = cmp_seq_region
          end
          answer.push(Slice.new(cmp_seq_region, this_link.cmp_start, this_link.cmp_end, this_link.ori))
          next
        end
        previous_link = sorted_overlapping_assembly_links[i-1]

        # If there is a gap with the previous link: add a gap
        if this_link.asm_start > ( previous_link.asm_end + 1 )
          gap_size = this_link.asm_start - previous_link.asm_end - 1
          answer.push(Gap.new(target_coord_system, gap_size))
        end

        # And add the component itself as a Slice
        answer.push(Slice.new(this_link.cmp_seq_region, this_link.cmp_start, this_link.cmp_end, this_link.ori))
      end

      # Now see if we have to crop the first and/or last slice
      first_link = sorted_overlapping_assembly_links[0]
      if self.start > first_link.asm_start
        if first_link.ori == -1
          answer[0].stop = first_link.cmp_start + ( first_link.asm_end - self.start )
        else
          answer[0].start = first_link.cmp_start + ( self.start - first_link.asm_start )
        end
      end

      last_link = sorted_overlapping_assembly_links[-1]
      if self.stop < last_link.asm_end
        if last_link.ori == -1
          answer[-1].start = last_link.cmp_start + ( last_link.asm_end - self.stop)
        else
          answer[-1].stop = last_link.cmp_start + ( self.stop - last_link.asm_start )
        end
      end

      # And check if we have to add Ns at the front and/or back
      if self.start < first_link.asm_start
        gap_size = first_link.asm_start - self.start
        answer.unshift(Gap.new(target_coord_system, gap_size))
      end
      if self.stop > last_link.asm_end
        gap_size = self.stop - last_link.asm_end
        answer.push(Gap.new(target_coord_system, gap_size))
      end
    end
    return answer

  end
end
|
263
|
+
end
|
264
|
+
end
|
@@ -0,0 +1,693 @@
|
|
1
|
+
#
|
2
|
+
# = ensembl/core/slice.rb - General methods for Ensembl Slice
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2009 Jan Aerts <http://jandot.myopenid.com>
|
5
|
+
# Francesco Strozzi <francesco.strozzi@gmail.com>
|
6
|
+
#
|
7
|
+
# License:: The Ruby License
|
8
|
+
#
|
9
|
+
nil
|
10
|
+
module Ensembl
|
11
|
+
nil
|
12
|
+
module Core
|
13
|
+
|
14
|
+
# = DESCRIPTION
|
15
|
+
# From the perl API tutorial
|
16
|
+
# (http://www.ensembl.org/info/software/core/core_tutorial.html): "A
|
17
|
+
# Slice object represents a continuous region of a genome. Slices can be
|
18
|
+
# used to obtain sequence, features or other information from a
|
19
|
+
# particular region of interest."
|
20
|
+
#
|
21
|
+
# In contrast to almost all other classes of Ensembl::Core,
|
22
|
+
# the Slice class is not based on ActiveRecord.
|
23
|
+
#
|
24
|
+
# = USAGE
|
25
|
+
# chr4 = SeqRegion.find_by_name('4')
|
26
|
+
# my_slice = Slice.new(chr4, 95000, 98000, -1)
|
27
|
+
# puts my_slice.display_name #--> 'chromosome:Btau_3.1:4:95000:98000:-1'
|
28
|
+
class Slice
|
29
|
+
attr_accessor :seq_region, :start, :stop, :strand, :seq
|
30
|
+
|
31
|
+
#################
|
32
|
+
## CREATE A SLICE
|
33
|
+
#################
|
34
|
+
|
35
|
+
# = DESCRIPTION
|
36
|
+
# Create a new Slice object from scratch.
|
37
|
+
#
|
38
|
+
# = USAGE
|
39
|
+
# chr4 = SeqRegion.find_by_name('4')
|
40
|
+
# my_slice = Slice.new(chr4, 95000, 98000, -1)
|
41
|
+
# ---
|
42
|
+
# *Arguments*:
|
43
|
+
# * seq_region: SeqRegion object
|
44
|
+
# * start: start position of the Slice on the SeqRegion (default = 1)
|
45
|
+
# * stop: stop position of the Slice on the SeqRegion (default: end of
|
46
|
+
# SeqRegion)
|
47
|
+
# * strand: strand of the Slice relative to the SeqRegion (default = 1)
|
48
|
+
# *Returns*:: Slice object
|
49
|
+
def initialize(seq_region, start = 1, stop = nil, strand = 1)
  # Validate the region first: the stop default below calls
  # seq_region.length, so checking afterwards would turn a wrong argument
  # into a NoMethodError instead of the message below.
  unless seq_region.is_a?(Ensembl::Core::SeqRegion)
    raise 'First argument has to be a Ensembl::Core::SeqRegion object'
  end
  # nil is accepted for start and stop and means "use the default".
  start = 1 if start.nil?
  stop = seq_region.length if stop.nil?
  @seq_region, @start, @stop, @strand = seq_region, start, stop, strand
  # The sequence is left unset at construction time.
  @seq = nil
end
|
62
|
+
|
63
|
+
# = DESCRIPTION
|
64
|
+
# Create a Slice without first creating the SeqRegion object.
|
65
|
+
#
|
66
|
+
# = USAGE
|
67
|
+
# my_slice_1 = Slice.fetch_by_region('chromosome','4',95000,98000,1)
|
68
|
+
#
|
69
|
+
# ---
|
70
|
+
# *Arguments*:
|
71
|
+
# * coord_system: name of CoordSystem (required)
|
72
|
+
# * seq_region: name of SeqRegion (required)
|
73
|
+
# * start: start of Slice on SeqRegion (default = 1)
|
74
|
+
# * stop: stop of Slice on SeqRegion (default = end of SeqRegion)
|
75
|
+
# * strand: strand of Slice on SeqRegion
|
76
|
+
# *Returns*:: Ensembl::Core::Slice object
|
77
|
+
def self.fetch_by_region(coord_system_name, seq_region_name, start = nil, stop = nil, strand = 1, species = Ensembl::SESSION.collection_species, version = nil)
  # Looks up the coordinate system (restricted to one species on a
  # multi-species database), then the sequence region by name, and wraps
  # the result in a Slice.
  #
  # *Raises*:: ArgumentError when a multi-species database is used without
  # a (known) species name; RuntimeError when the coordinate system or the
  # sequence region cannot be found.
  all_coord_systems = nil
  if Collection.check
    # The nil check has to precede species.downcase, otherwise a missing
    # species surfaces as a NoMethodError instead of this message.
    if species.nil?
      raise ArgumentError, "When using multi-species db, you must pass a specie name to get the correct Slice"
    end
    species = species.downcase
    species_id = Collection.get_species_id(species)
    raise ArgumentError, "No specie found in the database with this name: #{species}" if species_id.nil?
    all_coord_systems = Ensembl::Core::CoordSystem.find_all_by_name_and_species_id(coord_system_name, species_id)
  else
    all_coord_systems = Ensembl::Core::CoordSystem.find_all_by_name(coord_system_name)
  end

  coord_system = nil
  if version.nil? # Take the version with the lower rank
    coord_system = all_coord_systems.sort_by { |cs| cs.rank }.first
  else
    coord_system = all_coord_systems.select { |cs| cs.version == version }[0]
  end
  unless coord_system.is_a?(Ensembl::Core::CoordSystem)
    message = "Couldn't find a Ensembl::Core::CoordSystem object with name '#{coord_system_name}'"
    unless version.nil?
      message += " and version '#{version}'"
    end
    raise message
  end

  seq_region = Ensembl::Core::SeqRegion.find_by_name_and_coord_system_id(seq_region_name, coord_system.id)
  unless seq_region.is_a?(Ensembl::Core::SeqRegion)
    raise "Couldn't find a Ensembl::Core::SeqRegion object with the name '#{seq_region_name}'"
  end

  return Ensembl::Core::Slice.new(seq_region, start, stop, strand)
end
|
113
|
+
|
114
|
+
# = DESCRIPTION
|
115
|
+
# Create a Slice based on a Gene
|
116
|
+
#
|
117
|
+
# = USAGE
|
118
|
+
# my_slice = Slice.fetch_by_gene_stable_id('ENSG00000184895')
|
119
|
+
#
|
120
|
+
# ---
|
121
|
+
# *Arguments*:
|
122
|
+
# * gene_stable_id: Ensembl gene stable_id (required)
|
123
|
+
# *Returns*:: Ensembl::Core::Slice object
|
124
|
+
def self.fetch_by_gene_stable_id(gene_stable_id, flanking_seq_length = 0)
  # Builds a Slice spanning the gene with the given stable id, extended by
  # flanking_seq_length positions on both sides (default 0), on the gene's
  # own strand.
  #
  # *Raises*:: RuntimeError when no record exists for the stable id.
  stable_id_record = Ensembl::Core::GeneStableId.find_by_stable_id(gene_stable_id)
  if stable_id_record.nil?
    # Fail with a readable message rather than a NoMethodError on nil.
    raise "Couldn't find a Ensembl::Core::GeneStableId object with the stable_id '#{gene_stable_id}'"
  end
  gene = stable_id_record.gene
  seq_region = gene.seq_region

  return Ensembl::Core::Slice.new(seq_region, gene.seq_region_start - flanking_seq_length, gene.seq_region_end + flanking_seq_length, gene.seq_region_strand)
end
|
131
|
+
|
132
|
+
# = DESCRIPTION
|
133
|
+
# Create a Slice based on a Transcript
|
134
|
+
#
|
135
|
+
# = USAGE
|
136
|
+
# my_slice = Slice.fetch_by_transcript_stable_id('ENST00000383673')
|
137
|
+
#
|
138
|
+
# ---
|
139
|
+
# *Arguments*:
|
140
|
+
# * transcript_stable_id: Ensembl transcript stable_id (required)
|
141
|
+
# *Returns*:: Ensembl::Core::Slice object
|
142
|
+
def self.fetch_by_transcript_stable_id(transcript_stable_id, flanking_seq_length = 0)
  # Builds a Slice spanning the transcript with the given stable id,
  # extended by flanking_seq_length positions on both sides (default 0),
  # on the transcript's own strand.
  #
  # *Raises*:: RuntimeError when no record exists for the stable id.
  stable_id_record = Ensembl::Core::TranscriptStableId.find_by_stable_id(transcript_stable_id)
  if stable_id_record.nil?
    # Fail with a readable message rather than a NoMethodError on nil.
    raise "Couldn't find a Ensembl::Core::TranscriptStableId object with the stable_id '#{transcript_stable_id}'"
  end
  transcript = stable_id_record.transcript
  seq_region = transcript.seq_region

  return Ensembl::Core::Slice.new(seq_region, transcript.seq_region_start - flanking_seq_length, transcript.seq_region_end + flanking_seq_length, transcript.seq_region_strand)
end
|
149
|
+
|
150
|
+
# = DESCRIPTION
|
151
|
+
# Create an array of all Slices for a given coordinate system.
|
152
|
+
#
|
153
|
+
# = USAGE
|
154
|
+
# slices = Slice.fetch_all('chromosome')
|
155
|
+
#
|
156
|
+
# ---
|
157
|
+
# *Arguments*:
|
158
|
+
# * coord_system_name:: name of coordinate system (default = chromosome)
|
159
|
+
# * coord_system_version:: version of coordinate system (default = nil)
|
160
|
+
# *Returns*:: an array of Ensembl::Core::Slice objects
|
161
|
+
def self.fetch_all(coord_system_name = 'chromosome', species = Ensembl::SESSION.collection_species, version = nil)
  # Returns one full-length Slice for every seq_region of the requested
  # coordinate system. On a multi-species database the coordinate system
  # is looked up for the given species only.
  #
  # *Raises*:: ArgumentError when a multi-species database is used without
  # a (known) species name; RuntimeError when the coordinate system cannot
  # be found.
  coord_system = nil
  if Collection.check
    # Same contract as Slice.fetch_by_region: a species name is mandatory.
    if species.nil?
      raise ArgumentError, "When using multi-species db, you must pass a specie name to get the correct Slice"
    end
    species = species.downcase
    species_id = Collection.get_species_id(species)
    raise ArgumentError, "No specie found in the database with this name: #{species}" if species_id.nil?
    if version.nil?
      coord_system = Ensembl::Core::CoordSystem.find_by_name_and_species_id(coord_system_name, species_id)
    else
      coord_system = Ensembl::Core::CoordSystem.find_by_name_and_species_id_and_version(coord_system_name, species_id, version)
    end
  else
    if version.nil?
      coord_system = Ensembl::Core::CoordSystem.find_by_name(coord_system_name)
    else
      coord_system = Ensembl::Core::CoordSystem.find_by_name_and_version(coord_system_name, version)
    end
  end

  # Fail with a readable message rather than a NoMethodError on nil.
  if coord_system.nil?
    message = "Couldn't find a Ensembl::Core::CoordSystem object with name '#{coord_system_name}'"
    message += " and version '#{version}'" unless version.nil?
    raise message
  end

  answer = Array.new
  coord_system.seq_regions.each do |seq_region|
    answer.push(Ensembl::Core::Slice.new(seq_region))
  end
  return answer
end
|
185
|
+
|
186
|
+
##################
|
187
|
+
## GENERAL METHODS
|
188
|
+
##################
|
189
|
+
|
190
|
+
# = DESCRIPTION
|
191
|
+
# Get the length of a slice
|
192
|
+
#
|
193
|
+
# = USAGE
|
194
|
+
# chr4 = SeqRegion.find_by_name('4')
|
195
|
+
# my_slice = Slice.new(chr4, 95000, 98000, -1)
|
196
|
+
# puts my_slice.length
|
197
|
+
# ---
|
198
|
+
# *Arguments*:: none
|
199
|
+
# *Returns*:: Integer
|
200
|
+
def length
  # Start and stop are both part of the slice, hence the +1.
  span = self.stop - self.start
  return span + 1
end
|
203
|
+
|
204
|
+
# = DESCRIPTION
|
205
|
+
# The display_name method returns a full name of this slice, containing
|
206
|
+
# the name of the coordinate system, the sequence region, start and
|
207
|
+
# stop positions on that sequence region and the strand. E.g. for a slice
|
208
|
+
# of bovine chromosome 4 from position 95000 to 98000 on the reverse strand,
|
209
|
+
# the display_name would look like: chromosome:Btau_3.1:4:95000:98000:-1
|
210
|
+
#
|
211
|
+
# = USAGE
|
212
|
+
# puts my_slice.display_name
|
213
|
+
# ---
|
214
|
+
# *Arguments*:: none
|
215
|
+
# *Result*:: String
|
216
|
+
def display_name
  # Fields, in order: coordinate system name, coordinate system version,
  # sequence region name, start, stop, strand.
  region = self.seq_region
  coord_system = region.coord_system
  fields = [coord_system.name, coord_system.version, region.name, self.start, self.stop, self.strand]
  return fields.map { |field| field.to_s }.join(':')
end
|
219
|
+
alias to_s display_name
|
220
|
+
|
221
|
+
# = DESCRIPTION
|
222
|
+
# The Slice#overlaps? method checks if this slice overlaps another one.
|
223
|
+
# The other slice has to be on the same coordinate system
|
224
|
+
#
|
225
|
+
# = USAGE
|
226
|
+
# slice_a = Slice.fetch_by_region('chromosome','X',1,1000)
|
227
|
+
# slice_b = Slice.fetch_by_region('chromosome','X',900,1500)
|
228
|
+
# if slice_a.overlaps?(slice_b)
|
229
|
+
# puts "These slices overlap"
|
230
|
+
# end
|
231
|
+
# ---
|
232
|
+
# *Arguments*:: another slice
|
233
|
+
# *Returns*:: true or false
|
234
|
+
def overlaps?(other_slice)
  # Returns true when the two slices share at least one position.
  #
  # *Raises*:: RuntimeError when the argument is not a Slice or when the
  # two slices are on different coordinate systems.

  # Note: "! other_slice.class == Slice" parses as
  # "(!other_slice.class) == Slice" and is therefore never true, so the
  # type is tested explicitly here.
  unless other_slice.is_a?(Slice)
    raise RuntimeError, "The Slice#overlaps? method takes a Slice object as its arguments."
  end
  if self.seq_region.coord_system != other_slice.seq_region.coord_system
    raise RuntimeError, "The argument slice of Slice#overlaps? has to be in the same coordinate system, but were " + self.seq_region.coord_system.name + " and " + other_slice.seq_region.coord_system.name
  end

  self_range = self.start .. self.stop
  other_range = other_slice.start .. other_slice.stop

  # Two ranges overlap exactly when one of them contains the start of the other.
  if self_range.include?(other_slice.start) || other_range.include?(self.start)
    return true
  else
    return false
  end
end
|
251
|
+
|
252
|
+
# = DESCRIPTION
|
253
|
+
# The Slice#within? method checks if this slice is contained within another one.
|
254
|
+
# The other slice has to be on the same coordinate system
|
255
|
+
#
|
256
|
+
# = USAGE
|
257
|
+
# slice_a = Slice.fetch_by_region('chromosome','X',1,1000)
|
258
|
+
# slice_b = Slice.fetch_by_region('chromosome','X',900,950)
|
259
|
+
# if slice_b.within?(slice_a)
|
260
|
+
# puts "Slice b is within slice a"
|
261
|
+
# end
|
262
|
+
# ---
|
263
|
+
# *Arguments*:: another slice
|
264
|
+
# *Returns*:: true or false
|
265
|
+
def within?(other_slice)
  # Returns true when both the start and the stop of this slice lie inside
  # the other slice.
  #
  # *Raises*:: RuntimeError when the argument is not a Slice or when the
  # two slices are on different coordinate systems.

  # Note: "! other_slice.class == Slice" parses as
  # "(!other_slice.class) == Slice" and is therefore never true, so the
  # type is tested explicitly here.
  unless other_slice.is_a?(Slice)
    raise RuntimeError, "The Slice#within? method takes a Slice object as its arguments."
  end
  if self.seq_region.coord_system != other_slice.seq_region.coord_system
    raise RuntimeError, "The argument slice of Slice#within? has to be in the same coordinate system, but were " + self.seq_region.coord_system.name + " and " + other_slice.seq_region.coord_system.name
  end

  other_range = other_slice.start .. other_slice.stop

  if other_range.include?(self.start) && other_range.include?(self.stop)
    return true
  else
    return false
  end
end
|
282
|
+
|
283
|
+
# = DESCRIPTION
|
284
|
+
# The Slice#excise method removes a bit of a slice and returns the
|
285
|
+
# remainder as separate slices.
|
286
|
+
#
|
287
|
+
# = USAGE
|
288
|
+
# original_slice = Slice.fetch_by_region('chromosome','X',1,10000)
|
289
|
+
# new_slices = original_slice.excise([500..750, 1050..1075])
|
290
|
+
# new_slices.each do |s|
|
291
|
+
# puts s.display_name
|
292
|
+
# end
|
293
|
+
#
|
294
|
+
# # result:
|
295
|
+
# # chromosome:X:1:499:1
|
296
|
+
# # chromosome:X:751:1049:1
|
297
|
+
# # chromosome:X:1076:10000:1
|
298
|
+
# ---
|
299
|
+
# *Arguments*:
|
300
|
+
# * ranges: array of ranges (required)
|
301
|
+
# *Returns*:: array of Slice objects
|
302
|
+
# Cuts the given ranges out of this slice and returns what is left as new
# Slice objects on the same seq_region, ordered by start position.
#
# The ranges may be given in any order and may overlap or nest. Parts of
# a range that fall outside this slice are ignored, so no returned slice
# ever extends beyond start..stop of this slice and no empty slice is
# returned.
#
# *Arguments*:
# * ranges: array of Range objects (required)
# *Returns*:: array of Slice objects (empty if everything was excised)
# *Raises*:: RuntimeError if ranges is not an array of ranges
def excise(ranges)
  unless ranges.is_a?(Array) && ranges.all? { |r| r.is_a?(Range) }
    raise RuntimeError, "Argument should be an array of ranges"
  end

  answer = Array.new
  # First position that has not been excised or emitted yet.
  next_start = self.start
  ranges.sort_by { |r| r.first }.each do |r|
    break if next_start > self.stop

    # Never let a remainder run past the end of this slice.
    kept_stop = [r.first - 1, self.stop].min
    if next_start <= kept_stop
      answer.push(Slice.new(self.seq_region, next_start, kept_stop))
    end
    # max() keeps the cursor from moving backwards on nested ranges or on
    # ranges that end before this slice starts.
    next_start = [next_start, r.last + 1].max
  end

  if next_start <= self.stop
    answer.push(Slice.new(self.seq_region, next_start, self.stop))
  end
  return answer
end
|
328
|
+
|
329
|
+
# = DESCRIPTION
|
330
|
+
# Get the sequence of the Slice as a Bio::Sequence::NA object.
|
331
|
+
#
|
332
|
+
# If the Slice is on a CoordSystem that is not seq_level, it will try
|
333
|
+
# to project its coordinates to the CoordSystem that is. At this
|
334
|
+
# moment, this is only done if there is a direct link between the
|
335
|
+
# two coordinate systems. (The perl API allows for following an
|
336
|
+
# indirect link as well.)
|
337
|
+
#
|
338
|
+
# Caution: Bio::Sequence::NA makes the sequence
|
339
|
+
# downcase!!
|
340
|
+
#
|
341
|
+
# = USAGE
|
342
|
+
# my_slice.seq.seq.to_s
|
343
|
+
#
|
344
|
+
# ---
|
345
|
+
# *Arguments*:: none
|
346
|
+
# *Returns*:: Bio::Sequence::NA object
|
347
|
+
# Returns the sequence of this slice as a Bio::Sequence::NA object and
# caches it in @seq, so later calls return the same object.
#
# A slice whose coordinate system is the one recorded in
# Ensembl::SESSION.seqlevel_id is read straight from its seq_region.
# Any other slice is projected to 'seqlevel' and the sequence is stitched
# together from the pieces, with one 'N' per position of every non-Slice
# component. Slices on strand -1 are reverse-complemented.
#
# *Arguments*:: none
# *Returns*:: Bio::Sequence::NA object
def seq
  # Sequence already built on an earlier call.
  return @seq unless @seq.nil?

  seqlevel_id = Ensembl::SESSION.seqlevel_id
  if !seqlevel_id.nil? && self.seq_region.coord_system_id == seqlevel_id
    @seq = Bio::Sequence::NA.new(self.seq_region.subseq(self.start, self.stop))
  else
    # Not on the sequence level: project and concatenate the components.
    @target_slices = self.project('seqlevel')
    assembled = @target_slices.inject(String.new) do |so_far, component|
      # Slice components fetch their own sequence through this same method.
      piece = component.class == Slice ? component.seq : 'N' * (component.length)
      so_far + piece
    end
    @seq = Bio::Sequence::NA.new(assembled)
  end

  @seq.reverse_complement! if self.strand == -1
  return @seq
end
|
379
|
+
|
380
|
+
# Placeholder for fetching the repeat-masked sequence of a slice; it has
# no implementation yet and always raises.
#
# *Arguments*:: none
# *Raises*:: NotImplementedError (a ScriptError, so a bare +rescue+ will
#            not catch it)
def repeatmasked_seq
  raise NotImplementedError, "Slice#repeatmasked_seq is not implemented yet"
end
|
383
|
+
|
384
|
+
# = DESCRIPTION
|
385
|
+
# Take a sub_slice from an existing one.
|
386
|
+
#
|
387
|
+
# = USAGE
|
388
|
+
# my_sub_slice = my_slice.sub_slice(400,500)
|
389
|
+
#
|
390
|
+
# ---
|
391
|
+
# *Arguments*:
|
392
|
+
# * start: start of subslice, in the same coordinates as Slice#start (default: start of slice)
|
393
|
+
# * stop: stop of subslice, in the same coordinates as Slice#stop (default: stop of slice)
|
394
|
+
# *Returns*:: Ensembl::Core::Slice object
|
395
|
+
# Builds a new object of the receiver's own class on the same seq_region
# and strand, spanning start..stop. Both bounds are handed to the
# constructor unchanged and default to the bounds of this slice.
#
# *Arguments*:
# * start: start of the new slice (default: start of this slice)
# * stop: stop of the new slice (default: stop of this slice)
# *Returns*:: new object of the same class as the receiver
def sub_slice(start = self.start, stop = self.stop)
  region = self.seq_region
  orientation = self.strand
  self.class.new(region, start, stop, orientation)
end
|
398
|
+
|
399
|
+
# = DESCRIPTION
|
400
|
+
# Creates overlapping subslices for a given Slice.
|
401
|
+
#
|
402
|
+
# = USAGE
|
403
|
+
# my_slice.split(50000, 250).each do |sub_slice|
|
404
|
+
# puts sub_slice.display_name
|
405
|
+
# end
|
406
|
+
#
|
407
|
+
# ---
|
408
|
+
# *Arguments*:
|
409
|
+
# * max_size: maximal size of subslices (default: 100000)
|
410
|
+
# * overlap: overlap in bp between consecutive subslices (default: 0)
|
411
|
+
# *Returns*:: array of Ensembl::Core::Slice objects
|
412
|
+
# Cuts the slice into consecutive subslices of at most max_size bp that
# together cover start..stop. Each subslice shares exactly +overlap+ bp
# with the one before it, and the last one is shortened so that it ends
# at the stop of this slice.
#
# The earlier implementation relied on the block parameter overwriting
# an outer local variable, which Ruby stopped doing in 1.9; it also
# stepped up to the slice length rather than its stop position.
#
# *Arguments*:
# * max_size: maximal size of subslices (default: 100000)
# * overlap: overlap in bp between consecutive subslices (default: 0)
# *Returns*:: array of Ensembl::Core::Slice objects
# *Raises*:: ArgumentError if max_size is not larger than overlap
def split(max_size = 100000, overlap = 0)
  step = max_size - overlap
  unless step > 0
    # A non-positive step would never reach the end of the slice.
    raise ArgumentError, "max_size (#{max_size}) has to be larger than overlap (#{overlap})"
  end

  sub_slices = Array.new
  sub_start = self.start
  loop do
    sub_stop = [sub_start + max_size - 1, self.stop].min
    sub_slices.push(self.sub_slice(sub_start, sub_stop))
    break if sub_stop >= self.stop
    sub_start += step
  end
  return sub_slices
end
|
422
|
+
|
423
|
+
############################
|
424
|
+
## GET ELEMENTS WITHIN SLICE
|
425
|
+
############################
|
426
|
+
|
427
|
+
#--
|
428
|
+
# As there should be 'getters' for a lot of classes, we'll implement
|
429
|
+
# this with method_missing. For some of the original methods, see the end
|
430
|
+
# of this file.
|
431
|
+
#
|
432
|
+
# The optional argument is either 'true' or 'false' (default = false).
|
433
|
+
# False if the features have to be completely contained within the slice;
|
434
|
+
# true if just a partial overlap is sufficient.
|
435
|
+
#++
|
436
|
+
# Don't use this method yourself.
|
437
|
+
def method_missing(method_name, *args)
|
438
|
+
table_name = method_name.to_s.singularize
|
439
|
+
class_name = table_name.camelcase
|
440
|
+
|
441
|
+
# Convert to the class object
|
442
|
+
target_class = nil
|
443
|
+
ObjectSpace.each_object(Class) do |o|
|
444
|
+
if o.name =~ /^Ensembl::Core::#{class_name}$/
|
445
|
+
target_class = o
|
446
|
+
end
|
447
|
+
end
|
448
|
+
|
449
|
+
# If it exists, see if it implements Sliceable
|
450
|
+
if ! target_class.nil? and target_class.include?(Sliceable)
|
451
|
+
inclusive = false
|
452
|
+
if [TrueClass, FalseClass].include?(args[0].class)
|
453
|
+
inclusive = args[0]
|
454
|
+
end
|
455
|
+
return self.get_objects(target_class, table_name, inclusive)
|
456
|
+
end
|
457
|
+
|
458
|
+
raise NoMethodError
|
459
|
+
|
460
|
+
end
|
461
|
+
|
462
|
+
# Don't use this method yourself.
|
463
|
+
# Don't use this method yourself.
#
# Fetches the rows of table_name, as target_class objects, that lie on
# this slice. The coordinate systems carrying that feature type are read
# through Collection (when Collection.check is true) or MetaCoord. The
# slice's own coordinate system is queried directly; for each of the
# others the slice is projected and every projected Slice is queried
# (non-Slice components are skipped).
#
# *Arguments*:
# * target_class: class answering find_by_sql
# * table_name: name of the feature table
# * inclusive: true if features only have to overlap the slice, false if
#   they must be completely contained in it (default: false)
# *Returns*:: flat array of target_class objects without duplicates
def get_objects(target_class, table_name, inclusive = false)
  # One place for the SQL that used to be written out four times.
  sql_for = lambda do |slice|
    region_id = slice.seq_region.id.to_s
    first = slice.start.to_s
    last = slice.stop.to_s
    if inclusive
      <<-SQL
        SELECT * FROM #{table_name}
        WHERE seq_region_id = #{region_id}
        AND (( seq_region_start BETWEEN #{first} AND #{last} )
        OR ( seq_region_end BETWEEN #{first} AND #{last} )
        OR ( seq_region_start <= #{first} AND seq_region_end >= #{last} )
        )
      SQL
    else
      <<-SQL
        SELECT * FROM #{table_name}
        WHERE seq_region_id = #{region_id}
        AND seq_region_start >= #{first}
        AND seq_region_end <= #{last}
      SQL
    end
  end

  # Get all the coord_systems with this type of features on them
  if Collection.check
    meta_coords = Collection.find_all_coord_by_table_name(table_name, self.seq_region.coord_system.species_id)
  else
    meta_coords = MetaCoord.find_all_by_table_name(table_name)
  end
  coord_system_ids_with_features = meta_coords.collect { |mc| mc.coord_system_id }

  answer = Array.new

  # Get the features of the original slice
  own_coord_system_id = self.seq_region.coord_system_id
  if coord_system_ids_with_features.include?(own_coord_system_id)
    answer.push(target_class.find_by_sql(sql_for.call(self)))
    coord_system_ids_with_features.delete(own_coord_system_id)
  end

  # Transform the original slice to other coord systems and get those
  # features as well. At the moment, only 'direct' projections can be made.
  coord_system_ids_with_features.each do |target_coord_system_id|
    target_slices = self.project(CoordSystem.find(target_coord_system_id).name)
    target_slices.each do |slice|
      next unless slice.class == Slice
      answer.push(target_class.find_by_sql(sql_for.call(slice)))
    end
  end

  answer.flatten!
  answer.uniq!

  return answer
end
|
533
|
+
|
534
|
+
|
535
|
+
# = DESCRIPTION
|
536
|
+
# Get all MiscFeatures that are located on a Slice for a given MiscSet.
|
537
|
+
#
|
538
|
+
# Pitfall: just looks at the CoordSystem that the Slice is located on.
|
539
|
+
# For example, if a Slice is located on a SeqRegion on the 'chromosome'
|
540
|
+
# CoordSystem, but all misc_features are annotated on SeqRegions of
|
541
|
+
# the 'scaffold' CoordSystem, this method will return an empty array.
|
542
|
+
#
|
543
|
+
# = USAGE
|
544
|
+
# my_slice.misc_features('encode').each do |feature|
|
545
|
+
# puts feature.to_yaml
|
546
|
+
# end
|
547
|
+
# ---
|
548
|
+
# *Arguments*:
|
549
|
+
# * code: code of MiscSet
|
550
|
+
# *Returns*:: array of MiscFeature objects
|
551
|
+
# Collects the MiscFeatures of this slice's seq_region that lie inside
# the slice, optionally keeping only those that belong to the MiscSet
# with the given code.
#
# A feature counts as inside when its start and end are within
# start..stop of the slice, bounds included (the same rule the
# SQL-based feature getters of this class use). A feature matches a code
# when any of its misc_sets carries it; features without misc_sets never
# match a code.
#
# *Arguments*:
# * code: code of MiscSet (default: nil, meaning no filtering on MiscSet)
# *Returns*:: array of MiscFeature objects
def misc_features(code = nil)
  answer = Array.new
  self.seq_region.misc_features.each do |mf|
    next unless code.nil? || mf.misc_sets.any? { |set| set.code == code }
    if mf.seq_region_start >= self.start && mf.seq_region_end <= self.stop
      answer.push(mf)
    end
  end
  return answer
end
|
570
|
+
|
571
|
+
# = DESCRIPTION
|
572
|
+
# Get all DnaAlignFeatures that are located on a Slice for a given Analysis.
|
573
|
+
#
|
574
|
+
# Pitfall: just looks at the CoordSystem that the Slice is located on.
|
575
|
+
# For example, if a Slice is located on a SeqRegion on the 'chromosome'
|
576
|
+
# CoordSystem, but all dna_align_features are annotated on SeqRegions of
|
577
|
+
# the 'scaffold' CoordSystem, this method will return an empty array.
|
578
|
+
#
|
579
|
+
# = USAGE
|
580
|
+
# my_slice.dna_align_features('Vertrna').each do |feature|
|
581
|
+
# puts feature.to_yaml
|
582
|
+
# end
|
583
|
+
# ---
|
584
|
+
# *Arguments*:
|
585
|
+
# * analysis_name: name of analysis (default: nil, meaning all analyses)
|
586
|
+
# *Returns*:: array of DnaAlignFeature objects
|
587
|
+
# Fetches the DnaAlignFeatures whose start and end lie inside this slice
# on the slice's own seq_region, optionally restricted to one analysis.
# The query values are passed to find_by_sql as bind parameters.
#
# *Arguments*:
# * analysis_name: logic name of the analysis (default: nil, meaning
#   features of all analyses)
# *Returns*:: array of DnaAlignFeature objects; empty if no analysis with
#             that logic name exists
def dna_align_features(analysis_name = nil)
  sql = 'SELECT * FROM dna_align_feature WHERE seq_region_id = ? AND seq_region_start >= ? AND seq_region_end <= ?'
  binds = [self.seq_region.id, self.start, self.stop]

  unless analysis_name.nil?
    analysis = Analysis.find_by_logic_name(analysis_name)
    # An unknown analysis has no features; previously this crashed on nil.id.
    return [] if analysis.nil?
    sql += ' AND analysis_id = ?'
    binds.push(analysis.id)
  end

  return DnaAlignFeature.find_by_sql([sql] + binds)
end
|
595
|
+
|
596
|
+
# = DESCRIPTION
|
597
|
+
# Get all ProteinAlignFeatures that are located on a Slice for a given Analysis.
|
598
|
+
#
|
599
|
+
# Pitfall: just looks at the CoordSystem that the Slice is located on.
|
600
|
+
# For example, if a Slice is located on a SeqRegion on the 'chromosome'
|
601
|
+
# CoordSystem, but all protein_align_features are annotated on SeqRegions of
|
602
|
+
# the 'scaffold' CoordSystem, this method will return an empty array.
|
603
|
+
#
|
604
|
+
# = USAGE
|
605
|
+
# my_slice.protein_align_features('Uniprot').each do |feature|
|
606
|
+
# puts feature.to_yaml
|
607
|
+
# end
|
608
|
+
# ---
|
609
|
+
# *Arguments*:
|
610
|
+
# * analysis_name: name of analysis (nil for all analyses)
|
611
|
+
# *Returns*:: array of ProteinAlignFeature objects
|
612
|
+
# Fetches the ProteinAlignFeatures whose start and end lie inside this
# slice on the slice's own seq_region, optionally restricted to one
# analysis. The query values are passed to find_by_sql as bind
# parameters.
#
# *Arguments*:
# * analysis_name: logic name of the analysis (default: nil, meaning
#   features of all analyses)
# *Returns*:: array of ProteinAlignFeature objects; empty if no analysis
#             with that logic name exists
def protein_align_features(analysis_name = nil)
  sql = 'SELECT * FROM protein_align_feature WHERE seq_region_id = ? AND seq_region_start >= ? AND seq_region_end <= ?'
  binds = [self.seq_region.id, self.start, self.stop]

  unless analysis_name.nil?
    analysis = Analysis.find_by_logic_name(analysis_name)
    # An unknown analysis has no features; previously this crashed on nil.id.
    return [] if analysis.nil?
    sql += ' AND analysis_id = ?'
    binds.push(analysis.id)
  end

  return ProteinAlignFeature.find_by_sql([sql] + binds)
end
|
620
|
+
|
621
|
+
############################
|
622
|
+
## VARIATION METHODS
|
623
|
+
############################
|
624
|
+
|
625
|
+
|
626
|
+
#= DESCRIPTION
|
627
|
+
# Method to retrieve Variation features from Ensembl::Core::Slice objects
|
628
|
+
#= USAGE
|
629
|
+
# slice = Slice.fetch_by_region('chromosome',1,50000,51000)
|
630
|
+
# variations = slice.get_variation_features
|
631
|
+
# variations.each do |vf|
|
632
|
+
# puts vf.variation_name, vf.allele_string
|
633
|
+
# puts vf.variation.ancestral_allele
|
634
|
+
# end
|
635
|
+
# Calls variation_connection first, then looks up the
# Ensembl::Variation::VariationFeature records on this slice's seq_region
# whose start and end both fall inside the slice.
#
# *Arguments*:: none
# *Returns*:: result of VariationFeature.find(:all, ...)
def get_variation_features
  variation_connection()
  conditions = [
    "seq_region_id = ? AND seq_region_start >= ? AND seq_region_end <= ?",
    self.seq_region.seq_region_id,
    self.start,
    self.stop
  ]
  Ensembl::Variation::VariationFeature.find(:all, :conditions => conditions)
end
|
639
|
+
|
640
|
+
# Same lookup as get_variation_features, with the extra condition
# flags = 'genotyped'. Calls variation_connection before querying.
#
# *Arguments*:: none
# *Returns*:: result of VariationFeature.find(:all, ...)
def get_genotyped_variation_features
  variation_connection()
  conditions = [
    "flags = 'genotyped' AND seq_region_id = ? AND seq_region_start >= ? AND seq_region_end <= ?",
    self.seq_region.seq_region_id,
    self.start,
    self.stop
  ]
  Ensembl::Variation::VariationFeature.find(:all, :conditions => conditions)
end
|
644
|
+
|
645
|
+
private
|
646
|
+
|
647
|
+
# Connects Ensembl::Variation::DBConnection unless it reports being
# connected already. Host, user, password and port are taken from
# Ensembl::Core::DBConnection.get_info; species and release number are
# parsed out of the core database name it returns.
#
# *Arguments*:: none
# *Returns*:: nil if already connected, otherwise the result of connect
# *Raises*:: NameError if the core database name does not match the
#            expected naming pattern
def variation_connection()
  return if Ensembl::Variation::DBConnection.connected?

  host, user, password, db_name, port = Ensembl::Core::DBConnection.get_info
  unless db_name =~ /(\w+_\w+)_\w+_(\d+)_\S+/
    raise NameError, "Can't get Variation Database name from #{db_name}. Are you using non conventional names?"
  end

  # Capture groups of the match above: species name and release number.
  species = $1
  release = $2
  Ensembl::Variation::DBConnection.connect(species, release.to_i, :username => user, :password => password, :host => host, :port => port)
end
|
659
|
+
|
660
|
+
|
661
|
+
end #Slice
|
662
|
+
|
663
|
+
# = DESCRIPTION
|
664
|
+
# The Gap class is similar to the Slice object, but describes a gap and
|
665
|
+
# therefore can easily be described by coordinate system and size.
|
666
|
+
#
|
667
|
+
class Gap
  # CoordSystem the gap is described on.
  attr_accessor :coord_system
  # Size of the gap; also readable as #length.
  attr_accessor :size

  # = DESCRIPTION
  # Builds a Gap from a coordinate system and a size.
  #
  # = USAGE
  #  my_coord_system = CoordSystem.find_by_name('chromosome')
  #  # Create a gap of 10kb.
  #  gap = Gap.new(my_coord_system, 10000)
  # ---
  # *Arguments*:
  # * coord_system: CoordSystem object (required)
  # * size: size of the gap (required)
  # *Returns*:: Gap object
  def initialize(coord_system, size)
    @coord_system = coord_system
    @size = size
  end

  # Gap#length is another name for the Gap#size reader.
  alias_method :length, :size

  # Name of the coordinate system, followed by ":gap:" and the size.
  def display_name
    return @coord_system.name + ":gap:#{@size}"
  end
end #Gap
|
691
|
+
|
692
|
+
end #Core
|
693
|
+
end #Ensembl
|