exodb 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: bd0cec2b7ef4791ab9686827bf8d31409152af2a
4
+ data.tar.gz: 117c7eb770d5473a76909af778017f2557288982
5
+ SHA512:
6
+ metadata.gz: 6b7cd7602e7bda8da25c1799e5fbc0fd3221e04397b52a54cb4c1b5b2691b6cfea96d1e114bfc1f218529d747614544876da7eb78d8258ef3ee41a082d22eee2
7
+ data.tar.gz: 5699679e8fb8834af7c4e7f42ca2f399c549c015b1fa07781240a9a9f76d01a56951ba72af04645254232d588a6c1c9424034f23ec03791098bd57a0e3d5bf3c
data/contributors.txt ADDED
@@ -0,0 +1 @@
1
+ Natapol Pornputtapong
data/exodb.gemspec ADDED
@@ -0,0 +1,35 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "exodb/version"
4
+
5
# Gem specification for exodb. The version is read from Exodb::VERSION
# (loaded by the require of "exodb/version" earlier in this file).
Gem::Specification.new do |s|
  s.name        = 'exodb'
  s.version     = Exodb::VERSION
  s.date        = '2014-10-31'
  s.platform    = Gem::Platform::RUBY

  s.summary     = "A library for exome sequencing data management development"
  s.description = "A library for exome sequencing data management"
  s.authors     = ["Natapol Pornputtapong"]
  s.email       = 'natapol.por@gmail.com'

  # The homepage previously pointed at a different gem (gems/dactyls).
  s.homepage    = 'http://rubygems.org/gems/exodb'
  # NOTE(review): 'GPL' is not an SPDX license identifier; the intended GPL
  # version cannot be determined from this file, so the value is unchanged.
  s.license     = 'GPL'

  # s.rubyforge_project = "neography"

  # The packaged file lists are taken from git, so the gem must be built
  # from inside a git checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency "mongoid", "~> 3.1"
  s.add_dependency "bio", "~> 1.4"
  s.add_dependency "highline", "~> 1.6"
  s.add_dependency "pry", "~> 0.10"

  # s.add_development_dependency "rspec", ">= 2.11"
  # s.add_dependency "httpclient", ">= 2.3.3"
end
@@ -0,0 +1,22 @@
1
+ #
2
+ # Exodus
3
+ # Copyright (C) 2014
4
+ #
5
+ # author: Natapol Pornputtapong <natapol.por@gmail.com>
6
+ #
7
+ # Documentation: Natapol Pornputtapong (RDoc'd and embellished by William Webber)
8
+ #
9
+
10
+ # raise "Please, use ruby 1.9.0 or later." if RUBY_VERSION < "1.9.0"
11
+
12
+ require 'bio'
13
+
14
+
15
# Split every FASTA file matching hs_ref_GRCh37*.fa in the current directory
# into one file per record, named "<acc_version>.fa" after the record.
Dir.glob('hs_ref_GRCh37*.fa').each do |file|
  # Block forms are used for both handles so the input stream and each output
  # file are closed even when reading or writing a record raises.
  Bio::FlatFile.open(Bio::FastaFormat, file) do |flatfile|
    flatfile.each do |e|
      File.open("#{e.acc_version}.fa", 'w') do |outfile|
        outfile.write(e.to_s)
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
+ #
2
+ # Exodus
3
+ # Copyright (C) 2014
4
+ #
5
+ # author: Natapol Pornputtapong <natapol.por@gmail.com>
6
+ #
7
+ # Documentation: Natapol Pornputtapong (RDoc'd and embellished by William Webber)
8
+ #
9
+
10
+ # raise "Please, use ruby 1.9.0 or later." if RUBY_VERSION < "1.9.0"
11
+
12
# Helpers added to String for MIRIAM URNs of the form
# "urn:miriam:<namespace>:<id>".
class String

  # Tell whether the string is a MIRIAM URN.
  #
  # The check is anchored to the very beginning of the string, so a
  # multi-line string that merely contains a later line starting with
  # "urn:miriam:" is not accepted.
  #
  # @return [Boolean] true if the string starts with "urn:miriam:"
  def is_miriam?
    start_with?('urn:miriam:')
  end

  # Identifier part of a MIRIAM URN.
  #
  # The string is split on ':' into at most four fields, so an identifier
  # that itself contains colons is returned whole. For a URN with no
  # identifier field the last available field is returned.
  #
  # @return [String] the identifier, or '' if the string is not a MIRIAM URN
  def id
    is_miriam? ? split(':', 4)[-1] : ''
  end

  # Namespace part of a MIRIAM URN (the third ':'-separated field).
  #
  # @return [String] the namespace, or '' if the string is not a MIRIAM URN
  def namespace
    is_miriam? ? split(':', 4)[2] : ''
  end

end
@@ -0,0 +1,13 @@
1
+ #
2
+ # Exodus
3
+ # Copyright (C) 2014
4
+ #
5
+ # author: Natapol Pornputtapong <natapol.por@gmail.com>
6
+ #
7
+ # Documentation: Natapol Pornputtapong (RDoc'd and embellished by William Webber)
8
+ #
9
+
10
+ # raise "Please, use ruby 1.9.0 or later." if RUBY_VERSION < "1.9.0"
11
+
12
+ require 'exodb/addon/string.rb'
13
+
@@ -0,0 +1,103 @@
1
+ #
2
+ # Exodb
3
+ # Copyright (C) 2014
4
+ #
5
+ # author: Natapol Pornputtapong <natapol.por@gmail.com>
6
+ #
7
+ # Documentation: Natapol Pornputtapong (RDoc'd and embellished by William Webber)
8
+ #
9
+
10
+ # raise "Please, use ruby 1.9.0 or later." if RUBY_VERSION < "1.9.0"
11
+
12
+ module Exodb
13
+
14
# Mixin that adds a `location` Hash field (chromosome, start, stop) to a
# Mongoid document, together with class-level range queries and instance
# accessors. Location strings have the form "chromosome:start..stop" or
# "chromosome:position".
module LocationField

  extend ActiveSupport::Concern

  included do
    field :location, type: Hash # {chromosome: '', start: x, stop: x}
    index({location: 1}, background: true)
  end

  # Break a location string into its parts.
  #
  # When the string carries no stop ("chromosome:position"), the stop is set
  # to the start so that the location describes a single position.
  #
  # @param [String] loc_str location in format chromosome:start..stop
  # @return [Array] [chromosome (String), start (Integer), stop (Integer)]
  def self.split_location(loc_str)
    chromosome, _colon, start, _dots, stop = loc_str.split(/(:|\.\.)/)
    start = start.to_i
    [chromosome, start, stop ? stop.to_i : start]
  end

  # Query helpers added to the including class. Despite the trailing "?",
  # each of them returns the result of `where`, not a boolean.
  module ClassMethods

    # Find documents whose location completely covers the given range.
    #
    # @param [String] loc_str location in format chromosome:start..stop
    # @return the result of `where` for the built selector
    def cover?(loc_str)
      chromosome, start, stop = LocationField.split_location(loc_str)
      querystr = {:'location.chromosome' => chromosome, :'location.start'.lte => start, :'location.stop'.gte => stop}
      return self.where(querystr)
    end

    # Find documents whose location overlaps the given range in any way.
    #
    # Two ranges overlap when each one starts no later than the other one
    # stops. This also matches locations that lie entirely inside the query
    # range, which testing only the two query end points does not.
    #
    # @param [String] loc_str location in format chromosome:start..stop
    # @return the result of `where` for the built selector
    def intersect?(loc_str)
      chromosome, start, stop = LocationField.split_location(loc_str)
      querystr = {:'location.chromosome' => chromosome, :'location.start'.lte => stop, :'location.stop'.gte => start}
      return self.where(querystr)
    end

    # Find documents whose location lies completely inside the given range.
    #
    # @param [String] loc_str location in format chromosome:start..stop
    # @return the result of `where` for the built selector
    def in?(loc_str)
      chromosome, start, stop = LocationField.split_location(loc_str)
      querystr = {:'location.chromosome' => chromosome, :'location.start'.gte => start, :'location.stop'.lte => stop}
      return self.where(querystr)
    end

    #def converse
    #  self.where({}).each do |e|
    #    if e[:location][:coordinates]
    #      oldlocation = e[:location]
    #      if oldlocation[:coordinates][0].is_a?(Array)
    #        e[:location] = {chromosome: oldlocation[:coordinates][0][0], start: oldlocation[:coordinates][0][1], stop: oldlocation[:coordinates][1][1]}
    #      else
    #        e[:location] = {chromosome: oldlocation[:coordinates][0], start: oldlocation[:coordinates][1], stop: oldlocation[:coordinates][1]}
    #      end
    #      p e.save!
    #    end
    #  end
    #end
  end

  # Get the start position stored in the location.
  #
  # @return [Integer] start position
  def start
    location_part('start')
  end

  # Get the stop position stored in the location.
  #
  # @return [Integer] stop position
  def stop
    location_part('stop')
  end

  alias_method :end, :stop

  # Get the chromosome stored in the location.
  #
  # @return [String] chromosome, as given in the location string
  def chromosome
    location_part('chromosome')
  end

  # Assign the location from a string in format chromosome:start..stop.
  # A string without a stop sets stop equal to start.
  #
  # @param [String] loc_str location in format chromosome:start..stop
  def parse_location(loc_str)
    chromosome, start, stop = LocationField.split_location(loc_str)
    self[:location] = {chromosome: chromosome, start: start, stop: stop}
  end

  # Render the location back into a string; a single-position location
  # (start equal to stop) is written without the "..stop" part.
  #
  # @return [String] location in format chromosome:start..stop
  def location_str
    return "#{self.chromosome}:#{[self.start, self.stop].uniq.join('..')}"
  end

  private

  # Read one entry of the location hash. parse_location writes symbol keys,
  # while the readers historically used string keys, so both are accepted.
  def location_part(key)
    loc = self[:location]
    loc.key?(key) ? loc[key] : loc[key.to_sym]
  end
end
102
+
103
+ end
@@ -0,0 +1,387 @@
1
+ #
2
+ # Exodb
3
+ # Copyright (C) 2014
4
+ #
5
+ # author: Natapol Pornputtapong <natapol.por@gmail.com>
6
+ #
7
+ # Documentation: Natapol Pornputtapong (RDoc'd and embellished by William Webber)
8
+ #
9
+
10
+ # raise "Please, use ruby 1.9.0 or later." if RUBY_VERSION < "1.9.0"
11
+
12
+ module Exodb
13
+
14
# Base Mongoid document for reference records. It mixes in versioning,
# timestamps and the Exodb xrefs field, and stores an `oid` string.
class Reference
  include Mongoid::Document
  include Mongoid::Versioning
  include Mongoid::Timestamps
  include Exodb::XrefsField

  field :oid, type: String
end
25
+
26
# Reference record for a variant: a Reference with a location plus
# `reference` and `alternate` string fields.
class Variantref < Reference
  # Pattern with named groups `gene`, `position` and `to`.
  PATTERN = /(?<gene>[A-Z0-9]+)-?(?<position>[0-9,]*|[is]?)(?<to>[A-Z=]*)/
  # NOTE(review): presumably the marker for a silent change in the `to`
  # group of PATTERN - confirm against callers.
  SILENTSIGN = '='

  include Exodb::LocationField

  field :reference, type: String
  field :alternate, type: String
end
37
+
38
# Embedded document of a Generef holding one incidence record:
# a cancer type, a position, a list of occurrences and a case number.
class Incidence
  include Mongoid::Document

  field :cancertype, type: String
  # NOTE(review): Generef#dl_incidence! fills this with the first run of
  # digits taken from a mutation-table column - confirm the exact meaning.
  field :position, type: String
  field :occur, type: Array
  field :casenumber, type: Integer

  embedded_in :generef
end
50
+
51
# Reference gene record: a Reference with a location, a stored nucleotide
# sequence, embedded splices and incidences, and helpers that download
# data from external web services.
class Generef < Reference

  include Exodb::LocationField

  field :sequence, type: String
  field :chrrefseq, type: String # refseq id of chromosome, as urn:miriam:refseq:<accession>
  field :strand, type: String # Splice treats '+' as forward and anything else as complement
  field :psuedo, type: Boolean
  field :genomeref, type: String

  index({sequence: 'text'}, background: true)

  has_many :genes
  embeds_many :splices
  embeds_many :incidences

  validates_format_of :chrrefseq, with: /\A(urn:miriam:refseq)/


  # Download the sequence from the NCBI web service and save the document.
  # Use with caution: NCBI may block clients that send too many requests.
  #
  # Only the accession (the fourth ':'-separated field of chrrefseq) is sent
  # as the record id. Passing the whole split array would make efetch
  # request "urn", "miriam" and "refseq" as ids as well.
  #
  def dl_seq!

    case self.chrrefseq
    when /\Aurn:miriam:refseq:/
      accession = self.chrrefseq.split(':', 4)[-1]
      fasta = Bio::NCBI::REST.efetch(accession, {"db"=>"nucleotide", "rettype"=>"fasta", "retmode"=>"text", "seq_start"=>self.start, "seq_stop"=>self.end})
      self.sequence = Bio::FastaFormat.new(fasta).seq
    end

    self.save!

  end

  # Download gene symbol from HGNC service and add the returned HGNC id and
  # symbol to the xrefs. The document is saved only if the service returned
  # at least one entry.
  #
  def dl_symbol!

    baseuri = "http://rest.genenames.org/search"

    query = ""

    if self.get_xref('urn:miriam:refseq')
      # NOTE(review): the branch is guarded by the refseq xref, but the
      # query is built from chrrefseq (the chromosome accession) - confirm
      # which accession the HGNC refseq_accession search expects.
      query = "#{baseuri}/refseq_accession/#{self.chrrefseq.id.split('.')[0]}"
    elsif self.get_xref('urn:miriam:ncbigene')
      # No query is implemented for ncbigene xrefs yet.
      query = ""
    end

    if !query.empty?
      response = JSON.parse(open(query, 'Accept' => 'application/json').read)['response']
      if !response['docs'].empty?

        response['docs'].each do |e|
          self.add_to_set(:xrefs, "urn:miriam:hgnc:#{e["hgnc_id"]}")
          self.add_to_set(:xrefs, "urn:miriam:hgnc.symbol:#{e["symbol"]}")
        end

        self.save!

      end

    end

  end

  # Download incidence data from the cBioPortal web service, replace the
  # embedded incidences with the result and save the document. Does nothing
  # when the gene has no hgnc.symbol xref.
  #
  def dl_incidence!

    symbol_xref = self.get_xref('urn:miriam:hgnc.symbol')
    return unless symbol_xref

    # First tab-separated column of every line of the study list.
    cancerstudies = []
    open("http://www.cbioportal.org/public-portal/webservice.do?cmd=getCancerStudies") do |f|
      f.each_line { |line| cancerstudies.push(line.chomp.split("\t")[0]) }
    end

    incidents = {}
    totalcase = {}

    cancerstudies.each do |study|

      totalcase[study] ||= 0

      # Count the space-separated entries in the fifth column of every
      # "Sequenced Tumors" case list of the study.
      open("http://www.cbioportal.org/public-portal/webservice.do?cmd=getCaseLists&cancer_study_id=#{study}") do |f|
        f.each_line do |line|
          totalcase[study] += line.chomp.split(/\t/)[4].split(' ').length if line =~ /\tSequenced Tumors\t/
        end
      end

      incidents[study] ||= {}

      open("http://www.cbioportal.org/public-portal/webservice.do?cmd=getMutationData&genetic_profile_id=#{study}_mutations&gene_list=#{symbol_xref.id}") do |f|
        f.each_line do |line|
          dat = line.chomp.split(/\t/)
          next unless dat[5] == 'Missense_Mutation'

          # Group column 2 values by the first run of digits in column 7.
          position = dat[7].split(/(\d+)/)[1]
          (incidents[study][position] ||= []).push(dat[2])
        end
      end
    end

    self.incidences.clear if self.incidences

    incidents.each_pair do |cancertype, v|
      v.each_pair do |position, occur|
        self.incidences << Incidence.new({cancertype: cancertype, position: position, occur: occur.uniq.sort, casenumber: totalcase[cancertype]})
      end
    end

    self.save!

  end

  # Return the stored sequence as a Bio::Sequence object; an empty sequence
  # is used when none is stored.
  #
  # @return [Bio::Sequence] the sequence of this gene
  def to_seq
    return Bio::Sequence.auto(self.sequence || "")
  end

  # Return the splice with the longest protein product. On a tie the first
  # one wins; splices with an empty protein are never chosen.
  #
  # @return [Splice, nil] the longest splice, or nil if there is none
  def longest_splice()
    length = 0
    longest = nil
    self.splices.each do |e|
      # prot_len splices and translates the sequence, so compute it once.
      len = e.prot_len
      if len > length
        length = len
        longest = e
      end
    end

    return longest
  end

  # Check that this gene has any splice variant
  #
  # @return [Boolean] true if has any splices
  def has_splices?
    return self.splices.exists?
  end

  # Check if Generef has sequence
  #
  # @return [Boolean] Return true if there is a sequence
  def has_sequence?()
    return self[:sequence] ? true : false
  end

  # Check if Generef can be translated: it needs a sequence, at least one
  # splice, and a longest splice.
  #
  # @return [Boolean] Return true if this can be translated
  def can_translated?()
    return (self.has_sequence? && self.has_splices? && !self.longest_splice.nil?) ? true : false
  end

  # Get gene symbol
  #
  # @return [String] Return gene symbol or any id from xrefs or 'nosymbol'
  def symbol

    if self.get_xref('urn:miriam:hgnc.symbol')
      return self.get_xref('urn:miriam:hgnc.symbol').id
    elsif self.xrefs && !self.xrefs.empty?
      return self.xrefs.sort[0].id
    else
      return 'nosymbol'
    end

  end

end
233
+
234
# Splice variant embedded in a Generef. It stores exon and CDS segments as
# arrays of [start, stop] pairs and derives spliced sequences from the
# parent's sequence.
class Splice

  include Mongoid::Document

  include Exodb::XrefsField

  field :exon, type: Array
  field :cds, type: Array

  embedded_in :generef

  # Join exon or cds positions into a string, shifted so that the parent's
  # start position becomes 1.
  #
  # With a positive position the segments up to and including that position
  # are kept; with a negative position the segments from its absolute value
  # onwards are kept. If the position lies in no segment, the result is
  # empty. The segments are scanned in the order given.
  #
  # @param [Array] arr input array of exon or cds [start, stop] pairs
  # @param [Integer] position where to cut: positive for forward read,
  #   negative for complement, 0 for no cut
  #
  # @return [String] a string in start..end,start..end,...
  def get_join_str(arr, position = 0)

    reducer = self.generef.start - 1
    segments = arr

    if position != 0
      forward = position > 0
      target = position.abs
      segments = []
      found = false

      arr.each do |e|
        if e[0] <= target && target <= e[1]
          # The segment holding the cut point is trimmed at that point.
          segments.push(forward ? [e[0], target] : [target, e[1]])
          found = true
        elsif forward ? !found : found
          # Forward keeps segments before the cut, complement those after.
          segments.push(e)
        end
      end

      segments = [] unless found
    end

    return segments.map { |e| "#{e[0] - reducer}..#{e[1] - reducer}" }.join(',')

  end

  # Join string of the exon segments, see get_join_str.
  def get_exon_join(position = 0)
    get_join_str(self[:exon], position)
  end

  # Join string of the cds segments, see get_join_str.
  def get_cds_join(position = 0)
    get_join_str(self[:cds], position)
  end

  # Get spliced DNA sequence
  #
  # @return [Bio::Sequence] a DNA sequence
  def get_dna_seq
    return splice_parent(self.get_exon_join)
  end

  # Get spliced RNA sequence
  #
  # @return [Bio::Sequence] an RNA sequence
  def get_mrna_seq
    return splice_parent(self.get_exon_join).rna
  end

  # Get spliced coding region sequence, optionally cut at a position.
  #
  # On the '+' strand the sequence runs up to the position. On the other
  # strand it runs from the position onwards and is complemented, like the
  # other sequence getters, so it reads in coding direction.
  #
  # @param [Integer] position end position to get sequence, 0 for all
  #
  # @return [Bio::Sequence, String] a coding region sequence, or "" when
  #   the position lies in no cds segment
  def get_cds_seq(position = 0)

    forward = self.generef.strand == '+'
    join = self.get_cds_join(forward ? position : -position)
    return "" if join.empty?

    return splice_parent(join)

  end

  # Get spliced protein sequence
  #
  # @return [Bio::Sequence] a protein sequence
  def get_prot_seq
    return splice_parent(self.get_cds_join).translate
  end

  # get length of spliced RNA
  #
  # @return [Integer] length of spliced RNA
  def rna_len
    return self.get_mrna_seq.length
  end

  # get length of protein product
  #
  # @return [Integer] length of protein product
  def prot_len
    return self.get_prot_seq.length
  end

  # Get the codon sequence at the given amino acid position (1-based).
  #
  # @param [Integer] codon_pos codon position
  # @return [Bio::Sequence] the codon at given position
  def get_codon(codon_pos)
    first_base = ((codon_pos - 1) * 3) + 1
    return self.get_cds_seq().subseq(first_base, first_base + 2)
  end

  # Convert genomic position to codon position.
  #
  # @param [Integer] pos genomic position
  # @return [Array] [zero-based codon index, zero-based offset in the codon],
  #   or [] when the position lies in no cds segment
  def get_prot_pos(pos)

    seqlen = self.get_cds_seq(pos).length
    return [] if seqlen == 0

    return [(seqlen - 1) / 3, (seqlen - 1) % 3]

  end

  private

  # Splice the parent sequence over the given join string, complemented
  # unless the parent's strand is '+'.
  def splice_parent(join)
    parent = self.generef
    location = parent.strand == '+' ? "join(#{join})" : "complement(join(#{join}))"
    parent.to_seq.splicing(location)
  end

end
386
+
387
+ end
@@ -0,0 +1,51 @@
1
+ #
2
+ # Exodb
3
+ # Copyright (C) 2014
4
+ #
5
+ # author: Natapol Pornputtapong <natapol.por@gmail.com>
6
+ #
7
+ # Documentation: Natapol Pornputtapong (RDoc'd and embellished by William Webber)
8
+ #
9
+
10
+ # raise "Please, use ruby 1.9.0 or later." if RUBY_VERSION < "1.9.0"
11
+
12
+
13
+ module Exodb
14
+
15
# Base Mongoid document with timestamps, used as the parent class of
# Gene and Aacid.
class Region
  include Mongoid::Document
  include Mongoid::Timestamps
end
21
+
22
# Versioned Region with a location, a symbol and an `loh` flag. It embeds
# amino-acid records and belongs to a generef and a cell.
class Gene < Region
  include Mongoid::Versioning
  include Exodb::LocationField

  field :symbol, type: String
  field :loh, type: Boolean

  embeds_many :aacids
  belongs_to :generef
  belongs_to :cell

  # Compound index over the symbol and the positions of the embedded aacids.
  index({'symbol' => 1, 'aacids.position' => 1}, background: true)
end
36
+
37
# Amino-acid record embedded in a Gene.
class Aacid < Region
  # position referenced to the first codon from the longest splice variant
  field :position, type: Integer
  field :refcodon, type: String
  field :refaa, type: String
  field :altcodon, type: Hash
  field :inhcodon, type: Hash
  field :isoform, type: Array

  embedded_in :gene
  has_many :variants
end
50
+
51
+ end