exodb 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/exodb.rb +23 -0
- data/lib/exodb/addon/string.rb +139 -0
- data/lib/exodb/constant.rb +64 -0
- data/lib/exodb/datamodel.rb +4 -1
- data/lib/exodb/datamodel/genelocfield.rb +177 -0
- data/lib/exodb/datamodel/generef.rb +193 -0
- data/lib/exodb/datamodel/isoform.rb +237 -0
- data/lib/exodb/datamodel/reference.rb +23 -327
- data/lib/exodb/datamodel/region.rb +7 -5
- data/lib/exodb/datamodel/source.rb +1 -10
- data/lib/exodb/datamodel/variant.rb +14 -81
- data/lib/exodb/datamodel/varlocfield.rb +106 -0
- data/lib/exodb/datamodel/xrefsfield.rb +4 -0
- data/lib/exodb/extra.rb +17 -0
- data/lib/exodb/extra/upload.rb +43 -0
- data/lib/exodb/{utils → extra}/upload_generef.rb +35 -21
- data/lib/exodb/rositza/load.rb +56 -42
- data/lib/exodb/utils.rb +1 -2
- data/lib/exodb/utils/ensemblrest.rb +31 -3
- data/lib/exodb/utils/miriamrest.rb +23 -0
- data/lib/exodb/version.rb +1 -1
- metadata +10 -3
- data/lib/exodb/datamodel/locationfield.rb +0 -116
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 65c5be33ab0ce6072c4a277fd63004603f7ad0e6
|
4
|
+
data.tar.gz: 75641b1dd0443b9424274f45376a0ac90db3c524
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cb6dc455d74085166dc9bb0f8b34e8d658a729110ec2f151215f085a7e59e11152564159404202001b3e8b8c42985525321e8a910f5d2cba964fbbfbc2e68565
|
7
|
+
data.tar.gz: d5daaf36ae983c48994160ac413ca03059348e9dedeca2e36be510d8a02cfdc73e8282d992323a6bf7c743bb59bce95639360421429e0abd9196ba2e65a56c7f
|
data/lib/exodb.rb
CHANGED
@@ -36,12 +36,35 @@ require 'exodb/dbconnection.rb'
|
|
36
36
|
require 'exodb/usermanage.rb'
|
37
37
|
require 'exodb/datamodel.rb'
|
38
38
|
require 'exodb/exception.rb'
|
39
|
+
require 'exodb/constant.rb'
|
39
40
|
require 'exodb/utils.rb'
|
40
41
|
require 'exodb/addon.rb'
|
41
42
|
|
43
|
+
require 'exodb/extra.rb'
|
42
44
|
|
43
45
|
# Top-level Exodb namespace. Besides hosting the library's constants and
# classes, it exposes a few module functions for status output and for
# normalising genome assembly names.
module Exodb

  # Verbosity flag consulted by putstv; status output starts enabled.
  @@verbose = true

  module_function

  # Enable status messages printed through putstv.
  #
  # @return [Boolean] true
  def verbose
    @@verbose = true
  end

  # Disable status messages printed through putstv.
  #
  # @return [Boolean] false
  def noverbose
    @@verbose = false
  end

  # Print a status message only when verbose mode is on.
  #
  # @param [String] str message text
  def putstv(str)
    putst(str) if @@verbose
  end

  # Print a status message unconditionally, prefixed with "Exodb:STATUS".
  #
  # @param [String] str message text
  def putst(str)
    puts "Exodb:STATUS #{str}"
  end

  # Look up the canonical assembly name for an assembly alias.
  # The lookup is done on the lower-cased name.
  #
  # @param [String, nil] str assembly alias such as "hg19" or "GRCh37"
  # @return [String, nil] the Exodb::ASSEMBLY entry, or nil when the alias is
  #   unknown or nil
  def assembly(str)
    # nil used to raise NoMethodError on #downcase; treat it as "unknown".
    return nil if str.nil?

    Exodb::ASSEMBLY[str.to_s.downcase]
  end

end
|
data/lib/exodb/addon/string.rb
CHANGED
@@ -11,6 +11,8 @@
|
|
11
11
|
|
12
12
|
class String
|
13
13
|
|
14
|
+
# For miriam
|
15
|
+
|
14
16
|
# Tell whether the string starts with the "urn:miriam:" prefix.
#
# @return [Integer, nil] 0 when the string starts with "urn:miriam:",
#   nil otherwise
def is_miriam?
  # \A anchors at the start of the whole string; ^ would also accept a
  # multi-line string in which only a later line starts with the prefix.
  self =~ /\Aurn:miriam:/
end
|
@@ -23,4 +25,141 @@ class String
|
|
23
25
|
return self.is_miriam? ? self.split(':', 4)[2] : ''
|
24
26
|
end
|
25
27
|
|
28
|
+
# Placeholder with an empty body; always returns nil.
def resolve; end
|
31
|
+
|
32
|
+
# For HGVS
|
33
|
+
|
34
|
+
# Match the string against Exodb::HGVPATTERN.
#
# @return [Integer, nil] index of the match, or nil when the string does
#   not match the pattern
def is_hgvs?
  Exodb::HGVPATTERN =~ self
end
|
37
|
+
|
38
|
+
# Parse a genomic HGVS-style string (as accepted by Exodb::HGVPATTERN) into
# its components.
#
# The reference part before ":g." is interpreted as
# * "chrN" or "chrN.assembly"   -> :chr and :assembly
# * "N.assembly" (N = up to two digits, M, X or Y) -> :chr ("chrN") and :assembly
# * anything else               -> :chrrefseq (kept verbatim)
# A missing assembly falls back to Exodb::DEFAULTASSEMBLY.
#
# @return [Hash] symbol-keyed hash with :chr/:assembly or :chrrefseq, plus
#   :pos, :start, :stop, :type ('ins', 'del' or 'sub') and :alt; an empty
#   hash when the string does not match Exodb::HGVPATTERN
def parse_hgvs

  result = {}

  # Same outcome as ActiveSupport's blank? for nil and ordinary strings,
  # without needing ActiveSupport to be loaded.
  assembly_for = lambda do |name|
    name.to_s.strip.empty? ? Exodb::DEFAULTASSEMBLY : Exodb.assembly(name)
  end

  Exodb::HGVPATTERN.match(self) do |m|

    reference = m[1]

    if reference =~ /\Achr/
      chr, assembly = reference.split('.')
      result[:chr] = chr
      result[:assembly] = assembly_for.call(assembly)
    elsif reference =~ /\A(\d{0,2}|[MXY])\./
      chr, assembly = reference.split('.')
      result[:chr] = "chr#{chr}"
      result[:assembly] = assembly_for.call(assembly)
    else
      result[:chrrefseq] = reference
    end

    # Convert to integers BEFORE sorting: sorting the raw strings orders
    # "100" before "99" and would swap start and stop.
    positions = m[2].split('_').map(&:to_i).sort
    first = positions[0].to_i
    result[:pos] = first
    result[:start] = first
    result[:stop] = positions[1] || first

    change = m[3]
    case change
    when /\Ains/
      result[:type] = 'ins'
      result[:alt] = change[3..-1]
    when /\Adel/
      result[:type] = 'del'
      result[:alt] = change[3..-1]
    else
      result[:type] = 'sub'
      result[:alt] = change.split('>')[1]
    end

  end

  result

end
|
78
|
+
|
79
|
+
# For Pileup string
|
80
|
+
# Count the alleles found in a pileup read-bases string.
#
# Indels written as sign, length and bases (e.g. "+2AT", "-1g") are counted
# under their upper-cased form. Everything left once the indels are removed
# is counted per base, folding case and folding "." with ",":
# 'A', 'T', 'C', 'G', '.' and '*'. Keys whose count is zero are omitted.
#
# @return [Hash{String => Integer}] allele => number of occurrences
def count_allele

  base_sets = {
    'A' => "aA",
    'T' => "tT",
    'C' => "cC",
    'G' => "gG",
    '.' => "\\.\\,",
    '*' => "\\*"
  }

  allele = {}

  # Strip each indel exactly where it occurs (sign included, any case) while
  # tallying it. Removing by a global, unsigned, case-sensitive substitution
  # left lower-case repeats behind to be miscounted as bases and could eat
  # part of an unrelated, longer indel.
  remainder = gsub(/([+-])(\d+)([ATCGatcg]+)/) do
    m = Regexp.last_match
    size = m[2].to_i
    key = "#{m[1]}#{m[2]}#{m[3][0, size]}".upcase
    allele[key] = allele.fetch(key, 0) + 1
    # Letters beyond the declared indel length are ordinary read bases.
    m[3][size..-1].to_s
  end

  base_sets.each_pair do |base, chars|
    n = remainder.count(chars)
    allele[base] = n if n > 0
  end

  allele

end
|
111
|
+
|
112
|
+
# For pileup var
|
113
|
+
# Tell whether the string looks like a pileup variant, i.e.
# "<location>/<allele>" where the allele part contains an optional +/- sign
# followed by one or more of A, T, C, G.
#
# @return [Integer, nil, false] truthy (match index) when both parts are
#   acceptable, falsy otherwise
def is_pileup_var?
  loc, allele = split('/')
  # An empty string splits into nothing; bail out instead of calling
  # is_loc? on nil.
  return false if loc.nil?

  loc.is_loc? && allele =~ /[\+\-]?[ATCG]+/
end
|
117
|
+
|
118
|
+
# Parse a pileup variant string "<location>/<allele>".
#
# The location part is handed to parse_loc; the allele's leading sign picks
# the variant type ('+' => 'ins', '-' => 'del', no sign => 'sub').
#
# @return [Hash] the hash produced by parse_loc with :type and :alt added
#   (:alt is the whole allele part), or an empty hash when the string is
#   not a pileup variant
def parse_pileup_var
  return {} unless self.is_pileup_var?

  loc, allele = self.split(/\//)
  parsed = loc.parse_loc

  sign = allele[/([\+\-]?)([ATCG]+)/, 1]
  parsed[:type] = { '+' => 'ins', '-' => 'del' }.fetch(sign, 'sub')
  parsed[:alt] = allele

  parsed
end
|
139
|
+
|
140
|
+
# Location String
|
141
|
+
|
142
|
+
# Tell whether the string is a location of the form "chr:pos",
# "chr:start..stop" or "chr:start-stop", optionally followed by ":assembly".
#
# @return [Boolean] true when the whole string has that form
def is_loc?
  # The pattern has to be applied to the string; returning the Regexp
  # literal itself is always truthy.
  !(self =~ /\A\w+:(\d+|\d+\.\.\d+|\d+-\d+)(:\w+)?\z/).nil?
end
|
145
|
+
# For quality string
|
146
|
+
|
147
|
+
# Parse a location string into its components.
#
# Accepted forms are "chr:pos", "chr:start..stop" and "chr:start-stop", each
# optionally followed by ":assembly". Range ends may be given in either
# order. A missing assembly falls back to Exodb::DEFAULTASSEMBLY; a given
# one is translated through Exodb.assembly.
#
# @return [Hash] string-keyed hash with 'chr', 'start', 'pos' (same value as
#   'start'), 'stop' and 'assembly'
# @raise [RuntimeError] when the string is not in one of the accepted forms
def parse_loc

  # \A ... \Z validates the string as a whole (one trailing newline is
  # tolerated), and the captures are used directly, so a match on one line of
  # a multi-line string can no longer feed unrelated text into the fields.
  m = /\A([^:]+):(\d+)(?:(?:\.\.|-)(\d+))?(?::(\w+))?\Z/.match(self)
  raise "Invalid location string: #{inspect}" unless m

  first, last = [m[2], m[3]].compact.map(&:to_i).sort

  {
    'chr' => m[1],
    'start' => first,
    'pos' => first,
    'stop' => last || first,
    'assembly' => m[4] ? Exodb.assembly(m[4]) : Exodb::DEFAULTASSEMBLY
  }

end
|
164
|
+
|
26
165
|
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
#
|
2
|
+
# Exodus
|
3
|
+
# Copyright (C) 2014
|
4
|
+
#
|
5
|
+
# author: Natapol Pornputtapong <natapol.por@gmail.com>
|
6
|
+
#
|
7
|
+
# Documentation: Natapol Pornputtapong (RDoc'd and embellished by William Webber)
|
8
|
+
#
|
9
|
+
|
10
|
+
# raise "Please, use ruby 1.9.0 or later." if RUBY_VERSION < "1.9.0"
|
11
|
+
|
12
|
+
# Shared lookup tables and patterns for Exodb.
module Exodb

  # IUPAC nucleotide codes in both directions: ambiguity code => the bases
  # it stands for, and base set => ambiguity code.
  NAIUPAC = {
    'Y' => 'CT',
    'R' => 'AG',
    'W' => 'AT',
    'S' => 'CG',
    'K' => 'GT',
    'M' => 'AC',

    'B' => 'CGT',
    'D' => 'AGT',
    'H' => 'ACT',
    'V' => 'ACG',

    'N' => 'ACGT',

    'A' => 'A',
    'T' => 'T',
    'G' => 'G',
    'C' => 'C',
    'U' => 'U',

    'CT' => 'Y',
    'AG' => 'R',
    'AT' => 'W',
    'CG' => 'S',
    'GT' => 'K',
    'AC' => 'M',

    'CGT' => 'B',
    'AGT' => 'D',
    'ACT' => 'H',
    'ACG' => 'V',

    'ACGT' => 'N'
  }.freeze

  # Assembly alias => canonical assembly name.
  ASSEMBLY = {
    'hg19' => 'GRCh37',
    'hg38' => 'GRCh38',
    'GRCh37' => 'GRCh37',
    'GRCh38' => 'GRCh38',
    'grch37' => 'GRCh37',
    'grch38' => 'GRCh38'
  }.freeze

  DEFAULTASSEMBLY = 'GRCh37'.freeze
  LATESTASSEMBLY = 'GRCh38'.freeze

  # "<reference>:g.<position(s)><change>" where the change is a substitution
  # (X>Y), "del" or "ins" followed by optional bases.
  # Anchored on the whole string (\A ... \Z, one trailing newline tolerated)
  # so that a single matching line inside a multi-line string is not accepted.
  HGVPATTERN = /\A([^:]+):g\.([\-_\d]+)([ATGC]>[ATCG]|del[ATCG]*|ins[ATCG]*)\Z/

end
|
data/lib/exodb/datamodel.rb
CHANGED
@@ -11,9 +11,12 @@
|
|
11
11
|
|
12
12
|
require 'mongoid'
|
13
13
|
|
14
|
-
require 'exodb/datamodel/
|
14
|
+
require 'exodb/datamodel/genelocfield.rb'
|
15
|
+
require 'exodb/datamodel/varlocfield.rb'
|
15
16
|
require 'exodb/datamodel/xrefsfield.rb'
|
16
17
|
require 'exodb/datamodel/variant.rb'
|
17
18
|
require 'exodb/datamodel/reference.rb'
|
19
|
+
require 'exodb/datamodel/generef.rb'
|
20
|
+
require 'exodb/datamodel/isoform.rb'
|
18
21
|
require 'exodb/datamodel/region.rb'
|
19
22
|
require 'exodb/datamodel/source.rb'
|
@@ -0,0 +1,177 @@
|
|
1
|
+
#
|
2
|
+
# Exodb
|
3
|
+
# Copyright (C) 2014
|
4
|
+
#
|
5
|
+
# author: Natapol Pornputtapong <natapol.por@gmail.com>
|
6
|
+
#
|
7
|
+
# Documentation: Natapol Pornputtapong (RDoc'd and embellished by William Webber)
|
8
|
+
#
|
9
|
+
|
10
|
+
# raise "Please, use ruby 1.9.0 or later." if RUBY_VERSION < "1.9.0"
|
11
|
+
|
12
|
+
module Exodb

  # Concern that gives a Mongoid document a genomic location (chromosome,
  # start/stop, strand, assembly), an optional deposited sequence, and
  # location-based query helpers.
  module GeneLocationField

    extend ActiveSupport::Concern

    included do
      field :start, type: Integer
      field :stop, type: Integer
      field :chr, type: String
      field :seqstart, type: Integer
      field :seqstop, type: Integer
      field :strand, type: String
      field :assembly, type: String
      field :sequence, type: String
      field :chrrefseq, type: String # refseq id of chromosome

      validates_presence_of :start, message: "start field missing"
      validates_presence_of :stop, message: "stop field missing"
      validates_presence_of :chr, message: "chr field missing"
      validates_presence_of :assembly, message: "assembly field missing"

      index({start: 1, stop: 1, chr: 1, assembly: 1}, background: true)
    end

    module ClassMethods

      # Documents whose region entirely covers the given location.
      #
      # @param [String] loc_str location string understood by String#parse_loc
      # @return [Mongoid::Criteria]
      def where_cover(loc_str)
        dat = loc_str.parse_loc

        self.where({chr: dat['chr'], assembly: dat['assembly']}).lte(start: dat['start']).gte(stop: dat['stop'])
      end

      # Documents whose region overlaps the given location in any way.
      #
      # @param [String] loc_str location string understood by String#parse_loc
      # @return [Mongoid::Criteria]
      def where_intersect(loc_str)
        dat = loc_str.parse_loc

        # Two intervals overlap when each one starts no later than the other
        # one ends. Testing only whether the query's start or stop falls
        # inside the region misses regions lying strictly inside the query.
        self.where({chr: dat['chr'], assembly: dat['assembly']}).lte(start: dat['stop']).gte(stop: dat['start'])
      end

      # Documents whose region lies entirely inside the given location.
      #
      # @param [String] loc_str location string understood by String#parse_loc
      # @return [Mongoid::Criteria]
      def where_in(loc_str)
        dat = loc_str.parse_loc

        self.where({chr: dat['chr'], assembly: dat['assembly']}).gte(start: dat['start']).lte(stop: dat['stop'])
      end

      # Not implemented yet; returns nil.
      def where_ups_cover(loc_str)
        #code
      end

    end

    # Download the sequence from a web service and save the document.
    # Use with caution: NCBI blocks clients that send too many requests.
    #
    def dl_seq!

      case self.chrrefseq
      when /\Aurn:miriam:refseq:/
        # The accession is the 4th component of "urn:miriam:refseq:<accession>".
        accession = self.chrrefseq.split(':', 4)[3]
        # seq_stop must be the genomic stop coordinate; #end is strand-relative
        # and equals the start coordinate unless the strand is '+'.
        fasta = Bio::NCBI::REST.efetch(accession, {"db"=>"nucleotide", "rettype"=>"fasta", "retmode"=>"text", "seq_start"=>self.start, "seq_stop"=>self.stop})
        self.sequence = Bio::FastaFormat.new(fasta).seq
      else
        # NOTE(review): called without any location arguments; confirm the
        # expected parameters against Exodb::Ensembl::REST.sequence_region.
        self.sequence = Exodb::Ensembl::REST.sequence_region()
      end

      self.save!

    end

    # Start coordinate as stored in the :start field.
    #
    # @return [Integer] start position
    def start
      self[:start]
    end

    # Stop coordinate as stored in the :stop field.
    #
    # @return [Integer] stop position
    def stop
      self[:stop]
    end

    # Strand-aware first coordinate: :start when the strand is '+',
    # :stop otherwise.
    #
    # @return [Integer] beginning position relative to the strand
    def begin
      self[:strand] == '+' ? self[:start] : self[:stop]
    end

    # Strand-aware last coordinate: :stop when the strand is '+',
    # :start otherwise.
    #
    # @return [Integer] end position relative to the strand
    def end
      self[:strand] == '+' ? self[:stop] : self[:start]
    end

    # Chromosome as stored in the :chr field.
    #
    # @return [String] chromosome
    def chromosome
      self[:chr]
    end

    # Assign the location from a location string.
    #
    # Every key returned by String#parse_loc except 'pos' is written to the
    # attribute of the same name. Values that are not a String, and strings
    # that fail to parse, are ignored.
    #
    # @param [String] loc location string in chromosome:start..stop or chromosome:start-stop format
    def location=(loc)
      if loc.is_a?(String)

        begin
          loc.parse_loc.delete_if {|k, v| k == 'pos'}.each_pair do |k, v|
            self[k.to_sym] = v
          end
        rescue
          # Best effort: an unparsable string leaves the attributes untouched.
        end

      end
    end

    # Return the location as a string.
    #
    # @return [String] "chromosome:start..stop", or "chromosome:position"
    #   when start and stop are equal
    def location_str
      return "#{self.chromosome}:#{[self.start, self.stop].uniq.join('..')}"
    end

    alias_method :locstr, :location_str

    # Return the sequence between :start and :stop, cut out of the deposited
    # sequence, whose first base is taken to be at coordinate :seqstart.
    #
    # @return [Bio::Sequence] gene sequence
    def to_seq
      whole_seq.splice("#{self[:start] - self[:seqstart] + 1}..#{self[:stop] - self[:seqstart] + 1}")
    end

    # Return the whole deposited sequence.
    #
    # @return [Bio::Sequence] deposited sequence
    def whole_seq
      Bio::Sequence::NA.new(self[:sequence])
    end

    # Join exon or CDS regions and splice them out of the sequence.
    #
    # @param [Array] arr array of [start, stop] pairs (exons or CDS)
    # @param [String] strand '+' joins the regions as given, any other value
    #   takes the complement; defaults to the document's :strand
    #
    # @return [Bio::Sequence] the spliced sequence
    def get_splice(arr, strand = nil)

      strand = strand || self[:strand]

      # NOTE(review): no :seqlocation field is declared in this module
      # (only :seqstart / :seqstop); confirm where this attribute comes from.
      reducer = self[:seqlocation]['start'] - 1

      str = []

      arr.each do |e|
        str.push("#{e[0] - reducer}..#{e[1] - reducer}")
      end

      return strand == '+' ? self.to_seq.splicing("join(#{str.join(',')})") : self.to_seq.splicing("complement(join(#{str.join(',')}))")

    end
  end

end
|