exodb 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: bd0cec2b7ef4791ab9686827bf8d31409152af2a
4
+ data.tar.gz: 117c7eb770d5473a76909af778017f2557288982
5
+ SHA512:
6
+ metadata.gz: 6b7cd7602e7bda8da25c1799e5fbc0fd3221e04397b52a54cb4c1b5b2691b6cfea96d1e114bfc1f218529d747614544876da7eb78d8258ef3ee41a082d22eee2
7
+ data.tar.gz: 5699679e8fb8834af7c4e7f42ca2f399c549c015b1fa07781240a9a9f76d01a56951ba72af04645254232d588a6c1c9424034f23ec03791098bd57a0e3d5bf3c
data/contributors.txt ADDED
@@ -0,0 +1 @@
1
+ Natapol Pornputtapong
data/exodb.gemspec ADDED
@@ -0,0 +1,35 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "exodb/version"
4
+
5
# Gem specification for exodb. File lists are taken from git, so the gem
# must be built from inside a git checkout.
Gem::Specification.new do |s|
  s.name = 'exodb'
  s.version = Exodb::VERSION
  s.date = '2014-10-31'
  s.platform = Gem::Platform::RUBY

  s.summary = "A library for exome sequencing data management development"
  s.description = "A library for exome sequencing data management"
  s.authors = ["Natapol Pornputtapong"]
  s.email = 'natapol.por@gmail.com'

  # The homepage previously pointed at a different gem ("dactyls").
  s.homepage = 'http://rubygems.org/gems/exodb'
  # TODO(review): 'GPL' does not name a GPL version; RubyGems recommends an
  # SPDX identifier (e.g. GPL-2.0 / GPL-3.0) -- confirm the intended version.
  s.license = 'GPL'

  # s.rubyforge_project = "neography"

  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency "mongoid", "~> 3.1"
  s.add_dependency "bio", "~> 1.4"
  s.add_dependency "highline", "~> 1.6"
  s.add_dependency "pry", "~> 0.10"

  # s.add_development_dependency "rspec", ">= 2.11"
  # s.add_dependency "httpclient", ">= 2.3.3"
end
@@ -0,0 +1,22 @@
1
+ #
2
+ # Exodus
3
+ # Copyright (C) 2014
4
+ #
5
+ # author: Natapol Pornputtapong <natapol.por@gmail.com>
6
+ #
7
+ # Documentation: Natapol Pornputtapong (RDoc'd and embellished by William Webber)
8
+ #
9
+
10
+ # raise "Please, use ruby 1.9.0 or later." if RUBY_VERSION < "1.9.0"
11
+
12
+ require 'bio'
13
+
14
+
15
# Split every file matching hs_ref_GRCh37*.fa in the working directory into
# one FASTA file per entry, each named "<acc_version>.fa".
Dir.glob('hs_ref_GRCh37*.fa').each do |file|
  flatfile = Bio::FlatFile.open(Bio::FastaFormat, file)
  begin
    flatfile.each do |e|
      # Block form closes the output file even when the write raises.
      File.open("#{e.acc_version}.fa", 'w') { |outfile| outfile.write(e.to_s) }
    end
  ensure
    # Release the input handle as well; it was previously never closed.
    flatfile.close
  end
end
@@ -0,0 +1,26 @@
1
+ #
2
+ # Exodus
3
+ # Copyright (C) 2014
4
+ #
5
+ # author: Natapol Pornputtapong <natapol.por@gmail.com>
6
+ #
7
+ # Documentation: Natapol Pornputtapong (RDoc'd and embellished by William Webber)
8
+ #
9
+
10
+ # raise "Please, use ruby 1.9.0 or later." if RUBY_VERSION < "1.9.0"
11
+
12
# MIRIAM URN helpers added to String.
#
# A MIRIAM URN has the form "urn:miriam:<namespace>:<id>"; the id part may
# itself contain colons (e.g. "urn:miriam:obo.go:GO:0006915").
class String

  # Check whether the string is a MIRIAM URN.
  #
  # The prefix must be at the very start of the string; a URN that only
  # appears on a later line of a multi-line string does not count.
  #
  # @return [Boolean] true if the string starts with "urn:miriam:"
  def is_miriam?
    start_with?('urn:miriam:')
  end

  # Identifier part of a MIRIAM URN.
  #
  # @return [String] the text after the namespace, or '' when the string is
  #   not a MIRIAM URN or carries no identifier part
  def id
    return '' unless is_miriam?

    # Index 3 (not the last element): a URN without an id part has only
    # three fields, and its last field is the namespace, not an id.
    split(':', 4)[3] || ''
  end

  # Namespace part of a MIRIAM URN.
  #
  # @return [String] the namespace, or '' when the string is not a MIRIAM URN
  def namespace
    return '' unless is_miriam?

    split(':', 4)[2]
  end

end
@@ -0,0 +1,13 @@
1
+ #
2
+ # Exodus
3
+ # Copyright (C) 2014
4
+ #
5
+ # author: Natapol Pornputtapong <natapol.por@gmail.com>
6
+ #
7
+ # Documentation: Natapol Pornputtapong (RDoc'd and embellished by William Webber)
8
+ #
9
+
10
+ # raise "Please, use ruby 1.9.0 or later." if RUBY_VERSION < "1.9.0"
11
+
12
+ require 'exodb/addon/string.rb'
13
+
@@ -0,0 +1,103 @@
1
+ #
2
+ # Exodb
3
+ # Copyright (C) 2014
4
+ #
5
+ # author: Natapol Pornputtapong <natapol.por@gmail.com>
6
+ #
7
+ # Documentation: Natapol Pornputtapong (RDoc'd and embellished by William Webber)
8
+ #
9
+
10
+ # raise "Please, use ruby 1.9.0 or later." if RUBY_VERSION < "1.9.0"
11
+
12
+ module Exodb
13
+
14
# Mixin that adds a genomic +location+ field to a Mongoid document, together
# with class-level range queries and instance-level accessors.
#
# Location strings have the form "chromosome:start..stop" or, for a single
# position, "chromosome:position".
module LocationField

  extend ActiveSupport::Concern

  included do
    field :location, type: Hash # {chromosome: '', start: x, stop: x}
    index({location: 1}, background: true)
  end

  # Split a location string into its parts.
  #
  # @param [String] loc_str location in format chromosome:start..stop
  #   or chromosome:position
  # @return [Array] [chromosome (String), start (Integer), stop (Integer)];
  #   for a single position stop equals start
  def self.split_location(loc_str)
    # The capturing group keeps the separators, so the values sit at the
    # even indices: [chromosome, ":", start, "..", stop].
    dat = loc_str.split(/(:|\.\.)/)
    start = dat[2].to_i
    stop = dat[4] ? dat[4].to_i : start
    [dat[0], start, stop]
  end

  module ClassMethods

    # Documents whose location fully covers the given range.
    #
    # @param [String] loc_str location string
    # @return the criteria returned by +where+
    def cover?(loc_str)
      chromosome, start, stop = LocationField.split_location(loc_str)
      querystr = {:'location.chromosome' => chromosome, :'location.start'.lte => start, :'location.stop'.gte => stop}
      return self.where(querystr)
    end

    # Documents whose location overlaps the given range in any way.
    #
    # Two closed intervals overlap exactly when each one starts no later
    # than the other one ends. This also finds locations lying strictly
    # inside the queried range, which a test of the two query end points
    # alone would miss.
    #
    # @param [String] loc_str location string
    # @return the criteria returned by +where+
    def intersect?(loc_str)
      chromosome, start, stop = LocationField.split_location(loc_str)
      querystr = {:'location.chromosome' => chromosome, :'location.start'.lte => stop, :'location.stop'.gte => start}
      return self.where(querystr)
    end

    # Documents whose location lies completely inside the given range.
    #
    # @param [String] loc_str location string
    # @return the criteria returned by +where+
    def in?(loc_str)
      chromosome, start, stop = LocationField.split_location(loc_str)
      querystr = {:'location.chromosome' => chromosome, :'location.start'.gte => start, :'location.stop'.lte => stop}
      return self.where(querystr)
    end

    # Disabled one-off conversion from an older coordinates-based layout.
    #def converse
    #  self.where({}).each do |e|
    #    if e[:location][:coordinates]
    #      oldlocation = e[:location]
    #      if oldlocation[:coordinates][0].is_a?(Array)
    #        e[:location] = {chromosome: oldlocation[:coordinates][0][0], start: oldlocation[:coordinates][0][1], stop: oldlocation[:coordinates][1][1]}
    #      else
    #        e[:location] = {chromosome: oldlocation[:coordinates][0], start: oldlocation[:coordinates][1], stop: oldlocation[:coordinates][1]}
    #      end
    #      p e.save!
    #    end
    #  end
    #end
  end

  # Start position taken from the stored location.
  #
  # @return [Integer] start position
  def start
    self[:location]['start']
  end

  # Stop position taken from the stored location.
  #
  # @return [Integer] stop position
  def stop
    self[:location]['stop']
  end

  alias_method :end, :stop

  # Chromosome taken from the stored location.
  #
  # @return [String] chromosome, as it was written in the location string
  def chromosome
    self[:location]['chromosome']
  end

  # Assign the location from a string in format chromosome:start..stop
  # (or chromosome:position, in which case stop equals start).
  #
  # @param [String] loc_str location string
  def parse_location(loc_str)
    chromosome, start, stop = LocationField.split_location(loc_str)
    self[:location] = {chromosome: chromosome, start: start, stop: stop}
  end

  # Render the location back into its string form.
  #
  # @return [String] "chromosome:start..stop", or "chromosome:position"
  #   when start and stop are equal
  def location_str
    return "#{self.chromosome}:#{[self.start, self.stop].uniq.join('..')}"
  end
end
102
+
103
+ end
@@ -0,0 +1,387 @@
1
+ #
2
+ # Exodb
3
+ # Copyright (C) 2014
4
+ #
5
+ # author: Natapol Pornputtapong <natapol.por@gmail.com>
6
+ #
7
+ # Documentation: Natapol Pornputtapong (RDoc'd and embellished by William Webber)
8
+ #
9
+
10
+ # raise "Please, use ruby 1.9.0 or later." if RUBY_VERSION < "1.9.0"
11
+
12
+ module Exodb
13
+
14
# Versioned, timestamped base document for reference records; carries the
# cross-reference behaviour of Exodb::XrefsField plus an +oid+ string.
class Reference
  include(Mongoid::Document)
  include(Mongoid::Versioning)
  include(Mongoid::Timestamps)

  include(Exodb::XrefsField)

  field(:oid, :type => String)
end
25
+
26
# Reference record for a variant: a location plus the reference and
# alternate values, both stored as strings.
class Variantref < Reference
  # Named captures: +gene+, +position+ and +to+.
  PATTERN = /(?<gene>[A-Z0-9]+)-?(?<position>[0-9,]*|[is]?)(?<to>[A-Z=]*)/
  SILENTSIGN = '='

  include(Exodb::LocationField)

  field(:reference, :type => String)
  field(:alternate, :type => String)
end
37
+
38
# Incidence record embedded in a Generef: for one cancer type and one
# position, the list of occurrences and the case number.
class Incidence
  include(Mongoid::Document)

  field(:cancertype, :type => String)
  field(:position, :type => String)
  field(:occur, :type => Array)
  field(:casenumber, :type => Integer)

  embedded_in(:generef)
end
50
+
51
# Reference gene record: location, strand, raw sequence, embedded splice
# variants and embedded incidence data, plus helpers that download the
# sequence, gene symbol and incidence data from remote web services.
class Generef < Reference

  include Exodb::LocationField

  field :sequence, type: String
  field :chrrefseq, type: String # refseq id of chromosome, as a MIRIAM URN
  field :strand, type: String
  field :psuedo, type: Boolean
  field :genomeref, type: String

  index({sequence: 'text'}, background: true)

  has_many :genes
  embeds_many :splices
  embeds_many :incidences

  validates_format_of :chrrefseq, with: /\A(urn:miriam:refseq)/

  # Download the sequence from the NCBI web service and save the document.
  # Use with caution: NCBI blocks clients that send too many requests.
  #
  def dl_seq!

    case self.chrrefseq
    when /\Aurn:miriam:refseq:/
      # Only the accession (the last URN field) is sent as the id; the
      # whole split array used to be passed, so the URN prefix fields were
      # submitted as ids too.
      accession = self.chrrefseq.split(':', 4)[-1]
      fasta = Bio::NCBI::REST.efetch(accession, {"db"=>"nucleotide", "rettype"=>"fasta", "retmode"=>"text", "seq_start"=>self.start, "seq_stop"=>self.stop})
      self.sequence = Bio::FastaFormat.new(fasta).seq
    end

    self.save!

  end

  # Download the gene symbol from the HGNC service, add the HGNC id and
  # symbol to the xrefs and save. Does nothing without a refseq xref.
  #
  def dl_symbol!

    baseuri = "http://rest.genenames.org/search"

    # The lookup uses the accession of the refseq xref that is tested here.
    # It used to test this xref but query with the chromosome accession
    # (chrrefseq) instead.
    # NOTE: a lookup through 'urn:miriam:ncbigene' is not implemented.
    refseq = self.get_xref('urn:miriam:refseq')
    return unless refseq

    query = "#{baseuri}/refseq_accession/#{refseq.id.split('.')[0]}"
    response = JSON.parse(open(query, 'Accept' => 'application/json').read)['response']
    return if response['docs'].empty?

    response['docs'].each do |e|
      self.add_to_set(:xrefs, "urn:miriam:hgnc:#{e["hgnc_id"]}")
      self.add_to_set(:xrefs, "urn:miriam:hgnc.symbol:#{e["symbol"]}")
    end

    self.save!

  end

  # Download missense-mutation incidence data from the cBioPortal web
  # service, replace the embedded incidences and save. Does nothing
  # without an hgnc.symbol xref.
  #
  def dl_incidence!

    symbol_xref = self.get_xref('urn:miriam:hgnc.symbol')
    return unless symbol_xref

    symbol = symbol_xref.id
    totalcase = Hash.new(0)
    incidents = Hash.new { |hash, study| hash[study] = {} }

    fetch_cancer_studies.each do |study|
      totalcase[study] += sequenced_case_count(study)
      collect_missense_incidents(study, symbol, incidents[study])
    end

    self.incidences.clear if self.incidences

    incidents.each_pair do |cancertype, positions|
      positions.each_pair do |position, occur|
        self.incidences << Incidence.new({cancertype: cancertype, position: position, occur: occur.uniq.sort, casenumber: totalcase[cancertype]})
      end
    end

    self.save!

  end

  # Return the stored sequence as a Bio::Sequence object.
  #
  # @return [Bio::Sequence] the sequence, built from an empty string when
  #   no sequence is stored
  def to_seq
    return Bio::Sequence.auto(self.sequence || "")
  end

  # Return the splice with the longest protein product.
  #
  # @return [Splice, nil] the first splice with the greatest non-zero
  #   protein length, or nil when there is none
  def longest_splice()
    length = 0
    longest = nil
    self.splices.each do |e|

      # prot_len splices and translates the sequence, so compute it once.
      candidate = e.prot_len
      if candidate > length
        length = candidate
        longest = e
      end

    end

    return longest
  end

  # Check that this gene has any splice variant
  #
  # @return [Boolean] true if has any splices
  def has_splices?
    return self.splices.exists?
  end

  # Check if Generef has sequence
  #
  # @return [Boolean] Return true if there is a sequence
  def has_sequence?()
    return self[:sequence] ? true : false
  end

  # Check if Generef can be translated
  #
  # @return [Boolean] true if it has a sequence, has splices and has a
  #   longest splice
  def can_translated?()
    return self.has_sequence? && self.has_splices? && !self.longest_splice.nil? ? true : false
  end

  # Get gene symbol
  #
  # @return [String] the HGNC symbol, else the id of the first xref in
  #   sorted order, else 'nosymbol'
  def symbol

    hgnc = self.get_xref('urn:miriam:hgnc.symbol')
    return hgnc.id if hgnc
    return self.xrefs.sort[0].id if self.xrefs && !self.xrefs.empty?

    return 'nosymbol'

  end

  private

  # List the first column of every line returned by getCancerStudies.
  def fetch_cancer_studies
    studies = []
    open("http://www.cbioportal.org/public-portal/webservice.do?cmd=getCancerStudies") do |f|
      f.each_line do |line|
        study = line.chomp.split("\t")[0]
        # A blank line has no first column; skip it rather than query with
        # an empty study id.
        studies.push(study) unless study.nil? || study.empty?
      end
    end
    studies
  end

  # Count the ids in the fifth column of the "Sequenced Tumors" case lists
  # of one study.
  def sequenced_case_count(study)
    total = 0
    open("http://www.cbioportal.org/public-portal/webservice.do?cmd=getCaseLists&cancer_study_id=#{study}") do |f|
      f.each_line do |line|
        next unless line =~ /\tSequenced Tumors\t/

        # String#split drops an empty trailing field, so the column may be
        # missing; to_s turns that into "no cases" instead of a crash.
        total += line.chomp.split(/\t/)[4].to_s.split(' ').length
      end
    end
    total
  end

  # Append, per position, the third column of every Missense_Mutation line
  # of one study to +positions+ (position => Array). The position is the
  # first run of digits in the eighth column.
  def collect_missense_incidents(study, symbol, positions)
    open("http://www.cbioportal.org/public-portal/webservice.do?cmd=getMutationData&genetic_profile_id=#{study}_mutations&gene_list=#{symbol}") do |f|
      f.each_line do |line|
        dat = line.chomp.split(/\t/)
        next unless dat[5] == 'Missense_Mutation'

        position = dat[7].to_s[/\d+/]
        next unless position # no position present; nothing to record

        (positions[position] ||= []).push(dat[2])
      end
    end
  end

end
233
+
234
# Splice variant embedded in a Generef: exon and cds segment lists, with
# helpers that cut the matching DNA, RNA, coding and protein sequences out
# of the parent's sequence.
class Splice

  include Mongoid::Document

  include Exodb::XrefsField

  field :exon, type: Array
  field :cds, type: Array

  embedded_in :generef

  # Join exon or cds segments into a location string, with coordinates
  # shifted so that the parent's start position becomes 1.
  #
  # @param [Array] arr array of [start, stop] pairs (exon or cds)
  # @param [Integer] position where to cut: a positive value keeps the
  #   segments up to and including the position, a negative value keeps
  #   the segments from the (absolute) position onwards, 0 keeps all
  #
  # @return [String] a string in start..end,start..end,... form; empty
  #   when a non-zero position lies in none of the segments
  def get_join_str(arr, position = 0)

    reducer = self.generef.start - 1
    tmparr = []
    found = false

    if position > 0
      add = true
      arr.each do |e|

        if e[0] <= position && position <= e[1]
          tmparr.push([e[0], position])
          add = false
          found = true
        else
          tmparr.push(e) if add
        end

      end
    elsif position < 0
      position = position.abs
      add = false
      arr.each do |e|

        if e[0] <= position && position <= e[1]
          tmparr.push([position, e[1]])
          add = true
          found = true
        else
          tmparr.push(e) if add
        end

      end
    else
      tmparr = arr
    end

    tmparr = [] if !found && position != 0

    return tmparr.map { |e| "#{e[0] - reducer}..#{e[1] - reducer}" }.join(',')

  end

  # Location string of the exon segments, see get_join_str.
  def get_exon_join(position = 0)
    get_join_str(self[:exon], position)
  end

  # Location string of the cds segments, see get_join_str.
  def get_cds_join(position = 0)
    get_join_str(self[:cds], position)
  end

  # Get spliced DNA sequence
  #
  # @return [Bio::Sequence] a DNA sequence
  def get_dna_seq
    return spliced_seq(self.get_exon_join)
  end

  # Get spliced RNA sequence
  #
  # @return [Bio::Sequence] an RNA sequence
  def get_mrna_seq
    return spliced_seq(self.get_exon_join).rna
  end

  # Get spliced coding region sequence
  #
  # @param [Integer] position end position to get sequence; 0 for the
  #   whole coding region
  #
  # @return [Bio::Sequence, String] the coding region sequence, or "" when
  #   the position lies in none of the cds segments
  def get_cds_seq(position = 0)

    # On the '+' strand the coding region is cut after the position; on
    # the other strand the cut is mirrored (negative position).
    signed = self.generef.strand == '+' ? position : -position
    join = self.get_cds_join(signed)

    # spliced_seq adds complement(...) for non-'+' strands, in line with
    # get_prot_seq; the complement was previously left out here.
    return join.empty? ? "" : spliced_seq(join)

  end

  # Get spliced protein sequence
  #
  # @return [Bio::Sequence] a protein sequence
  def get_prot_seq
    return spliced_seq(self.get_cds_join).translate
  end

  # get length of spliced RNA
  #
  # @return [Integer] length of spliced RNA
  def rna_len
    return self.get_mrna_seq.length
  end

  # get length of protein product
  #
  # @return [Integer] length of protein product
  def prot_len
    return self.get_prot_seq.length
  end

  # Get the codon of the coding sequence at the given amino acid position.
  #
  # @param [Integer] codon_pos codon position, counted from 1
  # @return [Bio::Sequence] the codon at given position
  def get_codon(codon_pos)
    first_base = ((codon_pos - 1) * 3) + 1
    return self.get_cds_seq().subseq(first_base, first_base + 2)
  end

  # Convert a genomic position to a codon position.
  #
  # @param [Integer] pos genomic position
  # @return [Array] [codon index, offset inside the codon], both counted
  #   from 0; empty when the position is not inside the coding region
  def get_prot_pos(pos)

    seqlen = self.get_cds_seq(pos).length
    return [] if seqlen == 0

    return [(seqlen - 1) / 3, (seqlen - 1) % 3]

  end

  private

  # Cut the joined segments out of the parent sequence; every strand other
  # than '+' is taken as the complement.
  def spliced_seq(join)
    parent = self.generef
    location = parent.strand == '+' ? "join(#{join})" : "complement(join(#{join}))"
    parent.to_seq.splicing(location)
  end

end
386
+
387
+ end
@@ -0,0 +1,51 @@
1
+ #
2
+ # Exodb
3
+ # Copyright (C) 2014
4
+ #
5
+ # author: Natapol Pornputtapong <natapol.por@gmail.com>
6
+ #
7
+ # Documentation: Natapol Pornputtapong (RDoc'd and embellished by William Webber)
8
+ #
9
+
10
+ # raise "Please, use ruby 1.9.0 or later." if RUBY_VERSION < "1.9.0"
11
+
12
+
13
+ module Exodb
14
+
15
# Timestamped base document for the region classes.
class Region
  include(Mongoid::Document)
  include(Mongoid::Timestamps)
end
21
+
22
# Versioned gene document with a location, a symbol and an +loh+ flag; it
# embeds amino acid records and belongs to a generef and a cell.
class Gene < Region
  include(Mongoid::Versioning)
  include(Exodb::LocationField)

  field(:symbol, :type => String)
  field(:loh, :type => Boolean)

  embeds_many(:aacids)
  belongs_to(:generef)
  belongs_to(:cell)

  index({'symbol' => 1, 'aacids.position' => 1}, :background => true)
end
36
+
37
# Amino acid record embedded in a Gene: position, reference codon and amino
# acid, alternative and inherited codons, and isoforms.
class Aacid < Region
  # position referenced to the first codon from the longest splice variant
  field(:position, :type => Integer)
  field(:refcodon, :type => String)
  field(:refaa, :type => String)
  field(:altcodon, :type => Hash)
  field(:inhcodon, :type => Hash)
  field(:isoform, :type => Array)

  embedded_in(:gene)
  has_many(:variants)
end
50
+
51
+ end