exodb 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/exodb.rb +23 -0
- data/lib/exodb/addon/string.rb +139 -0
- data/lib/exodb/constant.rb +64 -0
- data/lib/exodb/datamodel.rb +4 -1
- data/lib/exodb/datamodel/genelocfield.rb +177 -0
- data/lib/exodb/datamodel/generef.rb +193 -0
- data/lib/exodb/datamodel/isoform.rb +237 -0
- data/lib/exodb/datamodel/reference.rb +23 -327
- data/lib/exodb/datamodel/region.rb +7 -5
- data/lib/exodb/datamodel/source.rb +1 -10
- data/lib/exodb/datamodel/variant.rb +14 -81
- data/lib/exodb/datamodel/varlocfield.rb +106 -0
- data/lib/exodb/datamodel/xrefsfield.rb +4 -0
- data/lib/exodb/extra.rb +17 -0
- data/lib/exodb/extra/upload.rb +43 -0
- data/lib/exodb/{utils → extra}/upload_generef.rb +35 -21
- data/lib/exodb/rositza/load.rb +56 -42
- data/lib/exodb/utils.rb +1 -2
- data/lib/exodb/utils/ensemblrest.rb +31 -3
- data/lib/exodb/utils/miriamrest.rb +23 -0
- data/lib/exodb/version.rb +1 -1
- metadata +10 -3
- data/lib/exodb/datamodel/locationfield.rb +0 -116
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 65c5be33ab0ce6072c4a277fd63004603f7ad0e6
|
4
|
+
data.tar.gz: 75641b1dd0443b9424274f45376a0ac90db3c524
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cb6dc455d74085166dc9bb0f8b34e8d658a729110ec2f151215f085a7e59e11152564159404202001b3e8b8c42985525321e8a910f5d2cba964fbbfbc2e68565
|
7
|
+
data.tar.gz: d5daaf36ae983c48994160ac413ca03059348e9dedeca2e36be510d8a02cfdc73e8282d992323a6bf7c743bb59bce95639360421429e0abd9196ba2e65a56c7f
|
data/lib/exodb.rb
CHANGED
@@ -36,12 +36,35 @@ require 'exodb/dbconnection.rb'
|
|
36
36
|
require 'exodb/usermanage.rb'
|
37
37
|
require 'exodb/datamodel.rb'
|
38
38
|
require 'exodb/exception.rb'
|
39
|
+
require 'exodb/constant.rb'
|
39
40
|
require 'exodb/utils.rb'
|
40
41
|
require 'exodb/addon.rb'
|
41
42
|
|
43
|
+
require 'exodb/extra.rb'
|
42
44
|
|
43
45
|
module Exodb

  # Whether status messages sent through putstv are printed. On by default.
  @@verbose = true

  module_function

  # Turn verbose status output on.
  #
  # @return [true]
  def verbose()
    @@verbose = true
  end

  # Turn verbose status output off.
  #
  # @return [false]
  def noverbose()
    @@verbose = false
  end

  # Print a status message only when verbose output is enabled.
  #
  # @param [String] str message to print
  def putstv(str)
    putst(str) if @@verbose
  end

  # Print a status message unconditionally, prefixed with "Exodb:STATUS ".
  #
  # @param [String] str message to print
  def putst(str)
    puts "Exodb:STATUS #{str}"
  end

  # Look up the canonical assembly name for an alias (case-insensitive).
  #
  # @param [String, nil] str assembly alias such as "hg19" or "GRCh37"
  # @return [String, nil] the value stored in Exodb::ASSEMBLY, or nil when
  #   the alias is unknown or nil
  def assembly(str)
    # to_s keeps a nil alias from raising NoMethodError; it simply misses.
    return Exodb::ASSEMBLY[str.to_s.downcase]
  end

end
|
data/lib/exodb/addon/string.rb
CHANGED
@@ -11,6 +11,8 @@
|
|
11
11
|
|
12
12
|
class String
|
13
13
|
|
14
|
+
# For miriam
|
15
|
+
|
14
16
|
# Check whether the string contains a line beginning with "urn:miriam:".
#
# @return [Integer, nil] offset of the match, or nil when there is none
def is_miriam?
  miriam_prefix = /^urn:miriam:/
  miriam_prefix =~ self
end
|
@@ -23,4 +25,141 @@ class String
|
|
23
25
|
return self.is_miriam? ? self.split(':', 4)[2] : ''
|
24
26
|
end
|
25
27
|
|
28
|
+
# Placeholder with no implementation yet; always returns nil.
def resolve; end
|
31
|
+
|
32
|
+
# For HGVS variant strings
|
33
|
+
|
34
|
+
# Check the string against Exodb::HGVPATTERN.
#
# @return [Integer, nil] offset of the match, or nil when the string does not match
def is_hgvs?
  Exodb::HGVPATTERN =~ self
end
|
37
|
+
|
38
|
+
# Parse a genomic variant string matching Exodb::HGVPATTERN
# (e.g. "chr1:g.99_100delAT") into its components.
#
# The reference part (before ":g.") is interpreted as follows:
# * starts with "chr"                         -> :chr is the text before the
#   first ".", :assembly is resolved from the text after it
# * "<up to two digits | M | X | Y>.<rest>"    -> same, with :chr prefixed by "chr"
# * anything else                              -> stored verbatim as :chrrefseq
# When no assembly is given, Exodb::DEFAULTASSEMBLY is used.
#
# @return [Hash] with symbol keys :chr/:assembly or :chrrefseq, :pos, :start,
#   :stop, :type ('ins', 'del' or 'sub') and :alt; an empty hash when the
#   string does not match Exodb::HGVPATTERN
def parse_hgvs

  result = {}

  Exodb::HGVPATTERN.match(self) do |m|

    reference = m[1]
    change = m[3]

    if reference =~ /^chr/
      ref = reference.split(/\./)
      result[:chr] = ref[0]
      # to_s.strip.empty? stands in for ActiveSupport's blank?, which this file does not require
      result[:assembly] = ref[1].to_s.strip.empty? ? Exodb::DEFAULTASSEMBLY : Exodb.assembly(ref[1])
    elsif reference =~ /^(\d{0,2}|[MXY])\./
      ref = reference.split(/\./)
      result[:chr] = "chr#{ref[0]}"
      result[:assembly] = ref[1].to_s.strip.empty? ? Exodb::DEFAULTASSEMBLY : Exodb.assembly(ref[1])
    else
      result[:chrrefseq] = reference
    end

    # Convert to integers BEFORE sorting: sorting the raw strings orders
    # "100" before "99" and would swap start and stop.
    pos = m[2].split(/_/).map(&:to_i).sort
    result[:pos] = pos[0].to_i
    result[:start] = pos[0].to_i
    result[:stop] = (pos[1] || pos[0]).to_i

    case change
    when /^ins/
      result[:type] = 'ins'
      result[:alt] = change[3..-1]
    when /^del/
      result[:type] = 'del'
      result[:alt] = change[3..-1]
    else
      result[:type] = 'sub'
      result[:alt] = change.split(/\>/)[1]
    end

  end

  return result

end
|
78
|
+
|
79
|
+
# For Pileup string
|
80
|
+
# Count the alleles in a pileup-style read-base string.
#
# Indel annotations of the form <sign><length><bases> (e.g. "+2AG", "-1t")
# are tallied under an upper-cased key such as "+2AG". Only the first
# <length> bases belong to the indel; any further bases matched after them
# are ordinary base calls. After the indels are removed, the remaining
# characters are counted case-insensitively as 'A', 'T', 'C', 'G',
# '.' (both "." and ",") and '*'.
#
# @return [Hash{String => Integer}] allele => count; alleles with a zero
#   count are omitted
def count_allele

  allellset = {
    'A' => "aA",
    'T' => "tT",
    'C' => "cC",
    'G' => "gG",
    '.' => "\\.\\,",
    '*' => "\\*"
  }

  allele = {}

  # Remove each indel exactly where it occurs, whatever its letter case, so
  # that inserted/deleted bases are never counted again as base calls.
  remaining = self.gsub(/([+-])(\d+)([ATCGatcg]+)/) do
    m = Regexp.last_match
    length = m[2].to_i
    pattern = "#{m[1]}#{m[2]}#{m[3][0, length]}".upcase
    allele[pattern] = allele.fetch(pattern, 0) + 1
    # Bases beyond the declared length are regular calls: keep them.
    m[3][length..-1].to_s
  end

  allellset.each_pair do |k, v|
    hits = remaining.count(v)
    allele[k] = hits if hits > 0
  end

  return allele

end
|
111
|
+
|
112
|
+
# For pileup var
|
113
|
+
# Check whether the string looks like "<location>/<allele>", where the part
# before the first "/" satisfies is_loc? and the part after it contains an
# optionally signed run of A/T/C/G.
#
# @return [Integer, nil, false] truthy (match offset in the allele part)
#   when the string qualifies, otherwise nil or false
def is_pileup_var?
  location, allele_part = self.split(/\//)
  # An empty string splits into no parts at all; that is simply "not a variant".
  return false if location.nil?
  return location.is_loc? && allele_part =~ /[\+\-]?[ATCG]+/
end
|
117
|
+
|
118
|
+
# Break a "<location>/<allele>" string into a hash.
#
# The location part is handed to parse_loc and its result is extended with
# :type ('ins' for a leading "+", 'del' for a leading "-", otherwise 'sub')
# and :alt (the whole allele part, sign included).
#
# @return [Hash] parsed variant, or an empty hash when is_pileup_var? is not satisfied
def parse_pileup_var
  return {} unless self.is_pileup_var?

  location, allele_part = self.split(/\//)
  parsed = location.parse_loc

  sign = allele_part[/([\+\-]?)([ATCG]+)/, 1]
  parsed[:type] = { '+' => 'ins', '-' => 'del' }.fetch(sign, 'sub')
  parsed[:alt] = allele_part

  parsed
end
|
139
|
+
|
140
|
+
# Location String
|
141
|
+
|
142
|
+
# Check whether the string is a location of the form
# "name:pos", "name:start..stop" or "name:start-stop", optionally followed
# by ":assembly".
#
# @return [Integer, nil] offset of the match, or nil when the string is not
#   a location
def is_loc?
  # The pattern has to be applied to the string: returning the bare Regexp
  # literal would be truthy for every input.
  return self =~ /^\w+:(\d+|\d+\.\.\d+|\d+-\d+)(:\w+)?$/
end
|
145
|
+
# For quality string
|
146
|
+
|
147
|
+
# Parse a location string of the form "chr:pos", "chr:start..stop" or
# "chr:start-stop", optionally followed by ":assembly".
#
# The two coordinates may be given in either order; the smaller one becomes
# 'start'. A single position is used for both 'start' and 'stop'. Without
# an assembly suffix, Exodb::DEFAULTASSEMBLY is used; otherwise the suffix
# is resolved through Exodb.assembly.
#
# @return [Hash] with string keys 'chr', 'start', 'pos', 'stop' and 'assembly'
# @raise [RuntimeError] when the string is not a valid location
def parse_loc

  unless self =~ /^[^:]+:(\d+|\d+\.\.\d+|\d+-\d+)(:\w+)?$/
    # Carry the offending input in the message instead of a bare,
    # message-less raise.
    raise "Invalid location string: #{self.inspect}"
  end

  dat = self.split(/:/)
  pos = dat[1].split(/\.\.|-/).map(&:to_i).sort

  return {
    'chr' => dat[0],
    'start' => pos[0],
    'pos' => pos[0],
    'stop' => pos[1] ? pos[1] : pos[0],
    'assembly' => dat[2] ? Exodb::assembly(dat[2]) : Exodb::DEFAULTASSEMBLY
  }

end
|
164
|
+
|
26
165
|
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
#
|
2
|
+
# Exodus
|
3
|
+
# Copyright (C) 2014
|
4
|
+
#
|
5
|
+
# author: Natapol Pornputtapong <natapol.por@gmail.com>
|
6
|
+
#
|
7
|
+
# Documentation: Natapol Pornputtapong (RDoc'd and embellished by William Webber)
|
8
|
+
#
|
9
|
+
|
10
|
+
# raise "Please, use ruby 1.9.0 or later." if RUBY_VERSION < "1.9.0"
|
11
|
+
|
12
|
+
module Exodb

  # Nucleic-acid IUPAC codes, usable in both directions: a code maps to the
  # bases it stands for (e.g. 'Y' => 'CT') and an alphabetically sorted base
  # set maps back to its code (e.g. 'CT' => 'Y'). Frozen so the shared
  # table cannot be modified by accident.
  NAIUPAC = {
    'Y' => 'CT',
    'R' => 'AG',
    'W' => 'AT',
    'S' => 'CG',
    'K' => 'GT',
    'M' => 'AC',

    'B' => 'CGT',
    'D' => 'AGT',
    'H' => 'ACT',
    'V' => 'ACG',

    'N' => 'ACGT',

    'A' => 'A',
    'T' => 'T',
    'G' => 'G',
    'C' => 'C',
    'U' => 'U',

    'CT' => 'Y',
    'AG' => 'R',
    'AT' => 'W',
    'CG' => 'S',
    'GT' => 'K',
    'AC' => 'M',

    'CGT' => 'B',
    'AGT' => 'D',
    'ACT' => 'H',
    'ACG' => 'V',

    'ACGT' => 'N'
  }.freeze

  # Assembly aliases mapped to the assembly name used by Exodb.
  ASSEMBLY = {
    'hg19' => 'GRCh37',
    'hg38' => 'GRCh38',
    'GRCh37' => 'GRCh37',
    'GRCh38' => 'GRCh38',
    'grch37' => 'GRCh37',
    'grch38' => 'GRCh38'
  }.freeze

  # Assembly assumed when a location or variant string names none.
  DEFAULTASSEMBLY = 'GRCh37'.freeze
  LATESTASSEMBLY = 'GRCh38'.freeze

  # Genomic variant notation "<reference>:g.<position(s)><change>".
  # Captures: 1 = reference, 2 = position or "start_stop",
  # 3 = change ("A>T", "del..." or "ins...").
  HGVPATTERN = /^([^:]+):g\.([\-_\d]+)([ATGC]>[ATCG]|del[ATCG]*|ins[ATCG]*)$/

end
|
data/lib/exodb/datamodel.rb
CHANGED
@@ -11,9 +11,12 @@
|
|
11
11
|
|
12
12
|
require 'mongoid'
|
13
13
|
|
14
|
-
require 'exodb/datamodel/
|
14
|
+
require 'exodb/datamodel/genelocfield.rb'
|
15
|
+
require 'exodb/datamodel/varlocfield.rb'
|
15
16
|
require 'exodb/datamodel/xrefsfield.rb'
|
16
17
|
require 'exodb/datamodel/variant.rb'
|
17
18
|
require 'exodb/datamodel/reference.rb'
|
19
|
+
require 'exodb/datamodel/generef.rb'
|
20
|
+
require 'exodb/datamodel/isoform.rb'
|
18
21
|
require 'exodb/datamodel/region.rb'
|
19
22
|
require 'exodb/datamodel/source.rb'
|
@@ -0,0 +1,177 @@
|
|
1
|
+
#
|
2
|
+
# Exodb
|
3
|
+
# Copyright (C) 2014
|
4
|
+
#
|
5
|
+
# author: Natapol Pornputtapong <natapol.por@gmail.com>
|
6
|
+
#
|
7
|
+
# Documentation: Natapol Pornputtapong (RDoc'd and embellished by William Webber)
|
8
|
+
#
|
9
|
+
|
10
|
+
# raise "Please, use ruby 1.9.0 or later." if RUBY_VERSION < "1.9.0"
|
11
|
+
|
12
|
+
module Exodb

  # Concern that gives a document genomic-location fields (chromosome,
  # start/stop, strand, assembly, stored sequence) together with location
  # based query helpers and sequence accessors.
  module GeneLocationField

    extend ActiveSupport::Concern

    included do
      field :start, type: Integer
      field :stop, type: Integer
      field :chr, type: String
      field :seqstart, type: Integer
      field :seqstop, type: Integer
      field :strand, type: String
      field :assembly, type: String
      field :sequence, type: String
      field :chrrefseq, type: String # refseq id of chromosome

      validates_presence_of :start, message: "start field missing"
      validates_presence_of :stop, message: "stop field missing"
      validates_presence_of :chr, message: "chr field missing"
      validates_presence_of :assembly, message: "assembly field missing"

      index({start: 1, stop: 1, chr: 1, assembly: 1}, background: true)
    end

    # NOTE(review): the query helpers below call parse_locstr, which is not
    # defined in this module; it must be provided elsewhere — confirm. The
    # hash keys they read ('chr', 'assembly', 'start', 'stop') are the ones
    # String#parse_loc returns.
    module ClassMethods

      # Select documents on the same chromosome and assembly whose region
      # fully covers the given location (start <= query start and
      # stop >= query stop).
      #
      # @param [String] loc_str location string
      def where_cover(loc_str)

        dat = parse_locstr(loc_str)

        return self.where({chr: dat['chr'], assembly: dat['assembly']}).lte(start: dat['start']).gte(stop: dat['stop'])

      end

      # Select documents on the same chromosome and assembly whose region
      # overlaps the given location in any way.
      #
      # @param [String] loc_str location string
      def where_intersect(loc_str)
        dat = parse_locstr(loc_str)

        # Two closed intervals overlap exactly when each one starts no later
        # than the other one stops. Testing only whether the query's start or
        # stop falls inside a region would miss regions lying entirely
        # within the query range.
        return self.where({chr: dat['chr'], assembly: dat['assembly']}).lte(start: dat['stop']).gte(stop: dat['start'])
      end

      # Select documents on the same chromosome and assembly whose region
      # lies entirely inside the given location (start >= query start and
      # stop <= query stop).
      #
      # @param [String] loc_str location string
      def where_in(loc_str)
        dat = parse_locstr(loc_str)
        return self.where({chr: dat['chr'], assembly: dat['assembly']}).gte(start: dat['start']).lte(stop: dat['stop'])
      end

      # Not implemented yet; returns nil.
      def where_ups_cover(loc_str)
        #code
      end

    end

    # Download the sequence from a web service, store it in the sequence
    # field and save the document. Use with caution: according to the
    # original author, NCBI blocks clients that spam it with requests.
    #
    def dl_seq!

      case self.chrrefseq
      when /\Aurn:miriam:refseq:/
        # "urn:miriam:refseq:<id>" -> only the fourth part is the accession;
        # passing the whole split array would hand efetch four ids.
        refseq_id = self.chrrefseq.split(':', 4)[3]
        # seq_stop uses stop rather than the strand-dependent #end, which
        # equals start unless the strand is '+'.
        self.sequence = Bio::FastaFormat.new(Bio::NCBI::REST.efetch(refseq_id, {"db"=>"nucleotide", "rettype"=>"fasta", "retmode"=>"text", "seq_start"=>self.start, "seq_stop"=>self.stop})).seq
      else
        # NOTE(review): called without any region arguments — confirm the
        # expected parameters of Exodb::Ensembl::REST.sequence_region.
        self.sequence = Exodb::Ensembl::REST.sequence_region()
      end

      self.save!

    end

    # get the start position of gene rely on the genome
    #
    # @return [Integer] start position of gene
    def start
      self[:start]
    end

    # get the end position of gene rely on the genome
    #
    # @return [Integer] end position of gene
    def stop
      self[:stop]
    end

    # get the position where the gene begins with respect to its strand:
    # the start field when strand is '+', otherwise the stop field
    #
    # @return [Integer] strand-aware begin position of gene
    def begin
      self[:strand] == '+' ? self[:start] : self[:stop]
    end

    # get the position where the gene ends with respect to its strand:
    # the stop field when strand is '+', otherwise the start field
    #
    # @return [Integer] strand-aware end position of gene
    def end
      self[:strand] == '+' ? self[:stop] : self[:start]
    end

    # get the chromosome
    #
    # @return [String] value of the chr field
    def chromosome
      self[:chr]
    end

    # Assign location
    #
    # Only String input is handled; any other value is ignored. A string
    # that cannot be parsed is ignored as well (best effort), leaving the
    # fields untouched.
    #
    # @param [String] location string in chromosome:start..stop or chromosome:start-stop format
    def location=(loc)
      if loc.is_a?(String)

        begin
          # 'pos' is part of the parse result but is not a field of this concern
          loc.parse_loc.reject {|k, v| k == 'pos'}.each_pair do |k, v|
            self[k.to_sym] = v
          end
        rescue
          # deliberately ignored: an invalid location string assigns nothing
        end

      end
    end

    # Return location
    #
    # @return [String] location string in chromosome:start..stop, or
    #   chromosome:position when start and stop are equal
    def location_str
      return "#{self.chromosome}:#{[self.start, self.stop].uniq.join('..')}"
    end

    alias_method :locstr, :location_str

    # Return gene sequence, cut out of the stored sequence using the
    # start/stop fields relative to seqstart
    #
    # @return [Bio::Sequence] gene sequence
    def to_seq
      whole_seq.splice("#{self[:start] - self[:seqstart] + 1}..#{self[:stop] - self[:seqstart] + 1}")
    end

    # Return whole deposited sequence
    #
    # @return [Bio::Sequence] gene sequence
    def whole_seq
      Bio::Sequence::NA.new(self[:sequence])
    end

    # join exon or cds position into a string and splice the gene sequence with it
    #
    # @param [Array] input array exon or cds, each element a [start, stop] pair
    # @param [String] strand '+' for forward read, anything else for complement;
    #   defaults to the strand field
    #
    # @return the result of splicing to_seq with "join(start..end,start..end,...)",
    #   wrapped in "complement(...)" unless strand is '+'
    def get_splice(arr, strand = nil)

      strand = strand || self[:strand]

      # NOTE(review): no :seqlocation field is declared in this module (only
      # seqstart/seqstop) — confirm where this value is expected to come from.
      reducer = self[:seqlocation]['start'] - 1

      str = []

      arr.each do |e|
        str.push("#{e[0] - reducer}..#{e[1] - reducer}")
      end

      return strand == '+' ? self.to_seq.splicing("join(#{str.join(',')})") : self.to_seq.splicing("complement(join(#{str.join(',')}))")

    end
  end

end
|