bio-ucsc-api 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
1
+ #
2
+ # = hg19/description.rb
3
+ # Copyright::
4
+ # Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
5
+ # License:: The Ruby licence (Ruby's / GPLv2 dual)
6
+ #
7
+ # = Table description in UCSC Table Browser
8
+ # Description of a genbank sequence
9
+
10
module Bio
  module Ucsc
    module Hg19
      # ActiveRecord model for the 'description' table (description of a
      # GenBank sequence), linked to GbCdnaInfo through the shared :id key.
      class Description < DBConnection
        # Assignment forms replace set_table_name / set_primary_key, which
        # newer ActiveRecord releases deprecate and then remove.
        self.table_name = 'description'
        self.primary_key = :id

        # Bare constant reference kept from the original; presumably it
        # forces GbCdnaInfo to be loaded before the association is declared
        # -- TODO confirm against the library's autoload setup.
        Bio::Ucsc::Hg19::GbCdnaInfo

        belongs_to(:gbCdnaInfo,
                   :class_name => "Bio::Ucsc::Hg19::GbCdnaInfo",
                   :foreign_key => :id)
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+ #
2
+ # = hg19/gbcdnainfo.rb
3
+ # Copyright::
4
+ # Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
5
+ # License:: The Ruby licence (Ruby's / GPLv2 dual)
6
+ #
7
+ # = Table description in UCSC Table Browser
8
+ # Links together various info associated with a GenBank mRNA or EST
9
+
10
module Bio
  module Ucsc
    module Hg19
      # ActiveRecord model for the 'gbCdnaInfo' table, which links together
      # various info associated with a GenBank mRNA or EST.
      class GbCdnaInfo < DBConnection
        # Assignment forms replace set_table_name / set_primary_key, which
        # newer ActiveRecord releases deprecate and then remove.
        self.table_name = 'gbCdnaInfo'
        self.primary_key = :id

        # Bare constant reference kept from the original; presumably it
        # forces Description to be loaded before the association is declared
        # -- TODO confirm against the library's autoload setup.
        Bio::Ucsc::Hg19::Description

        has_one(:description,
                :class_name => "Bio::Ucsc::Hg19::Description",
                :foreign_key => :id)

        # Drops the "type" column from the cached column metadata.
        # NOTE(review): presumably this keeps ActiveRecord from treating
        # "type" as its single-table-inheritance column -- confirm.
        columns_hash.delete("type")
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
+ #
2
+ # = hg19/kgxref.rb
3
+ # Copyright::
4
+ # Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
5
+ # License:: The Ruby licence (Ruby's / GPLv2 dual)
6
+ #
7
+ # = Table description in UCSC Table Browser
8
+ # Link together a Known Gene ID and a gene alias
9
+
10
module Bio
  module Ucsc
    module Hg19
      # ActiveRecord model for the 'kgXref' table, which links a Known Gene
      # ID to a gene alias. The model is declared without a primary key.
      class KgXref < DBConnection
        # Assignment forms replace set_table_name / set_primary_key, which
        # newer ActiveRecord releases deprecate and then remove.
        self.table_name = 'kgXref'
        self.primary_key = nil
      end
    end
  end
end
@@ -0,0 +1,171 @@
1
+ #
2
+ # = reference.rb
3
+ # handle UCSC's 2bit file (locally stored) to retrieve the reference sequence
4
+ #
5
+ # Copyright:: Copyright (C) 2011
6
+ # MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
7
+ # License:: Ruby license (Ruby's / GPLv2 dual)
8
+
9
+ # require 'bio'
10
+
11
module Bio
  module Ucsc
    module Hg19
      # Fixed-size header read from the start of a 2bit file.
      TwoBitHeader =
        Struct.new(:signature, :version, :sequence_count, :reserved)

      # Per-sequence record. The two *_intervals members hold
      # Bio::GenomicInterval objects; packed_dna_offset is the position of
      # the first packed base inside the loaded file contents.
      TwoBitRecord =
        Struct.new(:dna_size,
                   :n_block_intervals, :mask_block_intervals,
                   :reserved, :packed_dna_offset)

      # Sequential reader over an in-memory string with a movable cursor.
      class ByteQueue
        def initialize(str)
          @str = str
          @index = 0
        end

        # Current read position; may be assigned to seek.
        attr_accessor :index

        # Returns the +n+ elements starting at the cursor (bytes when the
        # string is binary) and moves the cursor forward by +n+. Near the
        # end of the data the result is shorter than +n+, or nil once the
        # cursor is past the end.
        def next(n)
          chunk = @str[@index, n]
          @index += n
          chunk
        end
      end # class ByteQueue

      # Reads reference sequence out of a locally stored UCSC 2bit file.
      #
      # All state lives at class level: call Reference.load once, then query
      # with find_by_interval / find_by_interval_raw.
      class Reference
        # Two-bit code of each base.
        BINCODE = {0b00 => "T", 0b01 => "C", 0b10 => "A", 0b11 => "G"}.freeze
        # Magic number that opens every 2bit file.
        SIGNATURE = 0x1A412743

        @@filename = nil
        @@header = nil
        @@offsets = nil
        @@records = {}
        @@tbq = nil
        @@uint32_format = 'V'

        # Explicit readers (class and instance level) stand in for the
        # former cattr_reader call, so this file loads without ActiveSupport.
        # Each returns nil until Reference.load has been called.
        def self.filename
          @@filename
        end

        def self.header
          @@header
        end

        def self.offsets
          @@offsets
        end

        def filename
          @@filename
        end

        def header
          @@header
        end

        def offsets
          @@offsets
        end

        # Reads +filename+ into memory and parses its header and sequence
        # index. Per-sequence records are parsed lazily by Reference.records.
        #
        # Raises ArgumentError when the file does not start with the 2bit
        # signature in either byte order, and EOFError when the header or
        # index is truncated. Returns the (empty) record cache.
        def self.load(filename)
          # File.open rather than Kernel#open: a name starting with "|" must
          # be treated as a path, never run as a command.
          two_bit = File.open(filename, 'rb') { |f| f.read }
          uint32_format = detect_uint32_format(two_bit, filename)

          @@uint32_format = uint32_format
          @@tbq = ByteQueue.new(two_bit)
          @@filename = filename
          @@records = {}

          @@header = TwoBitHeader.new(*read_uint32_array(4))

          @@offsets = {}
          @@header.sequence_count.times do
            name_length = read_bytes(1).unpack('C').first
            name = read_bytes(name_length).unpack('a*').first
            @@offsets[name] = read_uint32
          end
          @@records
        end

        # Returns the TwoBitRecord of sequence +chrom+, parsing it on first
        # use and caching it afterwards.
        #
        # Raises ArgumentError when +chrom+ is not in the loaded index.
        def self.records(chrom)
          cached = @@records[chrom]
          return cached if cached

          offset = @@offsets && @@offsets[chrom]
          unless offset
            raise ArgumentError,
                  "unknown sequence #{chrom.inspect} (was Reference.load called?)"
          end

          @@tbq.index = offset
          record = TwoBitRecord.new
          record.dna_size = read_uint32
          record.n_block_intervals = read_block_intervals(chrom)
          record.mask_block_intervals = read_block_intervals(chrom)
          record.reserved = read_uint32
          record.packed_dna_offset = @@tbq.index

          # Cached only once complete, so a failed parse leaves no partial
          # record behind.
          @@records[chrom] = record
        end

        # Returns the bases of +interval+ as a String, with every position
        # covered by an N-block replaced by "N".
        #
        # Uses interval.chrom, chr_start and chr_end; the latter two are
        # assumed to be 1-based inclusive, as the +1 arithmetic of the
        # previous implementation implied.
        def self.find_by_interval(interval)
          seq = find_by_interval_raw(interval)
          records(interval.chrom).n_block_intervals.each do |nb|
            # Clip the N-block to the requested interval; this one rule
            # covers equal, nested and partially overlapping blocks.
            from = [nb.chr_start, interval.chr_start].max
            to = [nb.chr_end, interval.chr_end].min
            next if from > to

            span = to - from + 1
            seq[from - interval.chr_start, span] = 'N' * span
          end
          seq
        end

        # Returns the bases of +interval+ exactly as packed in the file,
        # ignoring N-blocks (those positions decode to ordinary letters).
        #
        # Uses interval.chrom, zero_start and zero_end (zero-based, end
        # exclusive).
        def self.find_by_interval_raw(interval)
          dna_offset = records(interval.chrom).packed_dna_offset
          zero_start = interval.zero_start
          zero_end = interval.zero_end

          first_byte, skip = zero_start.divmod(4)
          # Byte holding the last requested base. Deriving it from
          # zero_end - 1 avoids reading one byte too many when the interval
          # ends on a 4-base boundary, which mattered at the end of the file.
          last_byte = (zero_end - 1) / 4

          @@tbq.index = dna_offset + first_byte
          bytes = @@tbq.next(last_byte - first_byte + 1).unpack('C*')
          bytes_to_nucleotides(bytes)[skip, zero_end - zero_start]
        end

        # Decodes an Array of byte values into a String, four bases per byte.
        def self.bytes_to_nucleotides(bytes)
          bytes.map { |byte| byte_to_nucleotides(byte) }.join
        end

        # Decodes one byte value into four bases, highest bit pair first.
        def self.byte_to_nucleotides(byte)
          BINCODE[byte >> 6] +
            BINCODE[(byte >> 4) & 0b11] +
            BINCODE[(byte >> 2) & 0b11] +
            BINCODE[byte & 0b11]
        end

        # Picks the unpack directive ('V' little-endian, 'N' big-endian)
        # under which the first four bytes equal SIGNATURE.
        def self.detect_uint32_format(two_bit, filename)
          magic = two_bit[0, 4].to_s
          format = %w[V N].find { |fmt| magic.unpack(fmt).first == SIGNATURE }
          unless format
            raise ArgumentError, "#{filename}: not a 2bit file (bad signature)"
          end
          format
        end

        # Reads exactly +count+ bytes at the cursor or raises EOFError.
        def self.read_bytes(count)
          bytes = @@tbq.next(count)
          unless bytes && bytes.bytesize == count
            raise EOFError, "unexpected end of 2bit data in #{@@filename}"
          end
          bytes
        end

        # Reads +count+ consecutive 32-bit unsigned integers in the byte
        # order detected by load.
        def self.read_uint32_array(count)
          read_bytes(4 * count).unpack("#{@@uint32_format}*")
        end

        # Reads a single 32-bit unsigned integer.
        def self.read_uint32
          read_uint32_array(1).first
        end

        # Reads one block list (count, then all starts, then all sizes) and
        # turns it into zero-based Bio::GenomicInterval objects on +chrom+.
        def self.read_block_intervals(chrom)
          count = read_uint32
          starts = read_uint32_array(count)
          sizes = read_uint32_array(count)
          starts.zip(sizes).map do |start, size|
            Bio::GenomicInterval.zero_based(chrom, start, start + size)
          end
        end

        private_class_method :detect_uint32_format, :read_bytes,
                             :read_uint32_array, :read_uint32,
                             :read_block_intervals
      end # class Reference

    end # module Hg19
  end # module Ucsc
end # module Bio
@@ -0,0 +1,23 @@
1
+ #
2
+ # = hg19/refseqsummary.rb
3
+ # Copyright::
4
+ # Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
5
+ # License:: The Ruby licence (Ruby's / GPLv2 dual)
6
+ #
7
+ # = Table description in UCSC Table Browser
8
+ # Summary or completeness info for RefSeqs (when given in comments)
9
+ #
10
+ # mrnaAcc column: see http://www.ncbi.nlm.nih.gov/RefSeq/key.html#accessions
11
+ # examples - NM_000546 (mRNA), NR_029476 (small RNA, unaligned)
12
+ #
13
+
14
module Bio
  module Ucsc
    module Hg19
      # ActiveRecord model for the 'refSeqSummary' table (summary or
      # completeness info for RefSeqs). Declared without a primary key.
      class RefSeqSummary < DBConnection
        # Assignment forms replace set_table_name / set_primary_key, which
        # newer ActiveRecord releases deprecate and then remove.
        self.table_name = 'refSeqSummary'
        self.primary_key = nil
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ #
2
+ # = hg19/trnas.rb
3
+ # Copyright::
4
+ # Copyright (C) 2011 MISHIMA, Hiroyuki <missy at be.to / hmishima at nagasaki-u.ac.jp>
5
+ # License:: The Ruby licence (Ruby's / GPLv2 dual)
6
+ #
7
+ # = Table description in UCSC Table Browser
8
+ # This track displays tRNA genes predicted by using tRNAscan-SE v.1.23.
9
+
10
module Bio
  module Ucsc
    module Hg19
      # ActiveRecord model for the 'tRNAs' table (tRNA genes predicted by
      # tRNAscan-SE). Declared without a primary key; class-level query
      # helpers come from QueryUsingChromBin.
      class TRNAs < DBConnection
        extend Bio::Ucsc::Hg19::QueryUsingChromBin

        # Assignment forms replace set_table_name / set_primary_key, which
        # newer ActiveRecord releases deprecate and then remove.
        self.table_name = 'tRNAs'
        self.primary_key = nil
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ #!/usr/local/bin/ruby-1.9
2
+ #
3
+ # Usage:: hg19-2bit-retrieve.rb <genomic interval>
4
+ # (default is "chr1:9,500-10,999")
5
+ #
6
+ # Copyright::
7
+ # Copyright (C) 2011 MISHIMA, Hiroyuki
8
+ # <missy at be.to / hmishima at nagasaki-u.ac.jp>
9
+ # License:: The Ruby licence (Ruby's / GPLv2 dual)
10
+ #
11
+
12
+ require 'nkf'
13
+
14
+ HG19_2BIT_FILE = "hg19.2bit"
15
+
16
+ require File.dirname(__FILE__) + '/../lib/bio-ucsc'
17
+
18
# Prints a genomic interval and the reference sequence it covers, read from
# a local 2bit file.
class Hg19Ref
  include Bio::Ucsc::Hg19

  # interval::     interval string understood by Bio::GenomicInterval.parse,
  #                e.g. "chr1:9,500-10,999"
  # two_bit_file:: path of the 2bit file; defaults to HG19_2BIT_FILE so
  #                existing one-argument callers behave as before
  def run(interval, two_bit_file = HG19_2BIT_FILE)
    DBConnection.connect
    Reference.load(two_bit_file)
    itv = Bio::GenomicInterval.parse(interval)

    puts itv.to_s
    # nkf "-f50-0" folds the sequence into lines of 50 columns; "-w" emits
    # UTF-8.
    puts NKF.nkf("-wf50-0", Reference.find_by_interval(itv))
  end
end

# Script entry: optional first argument is the interval, optional second
# argument is the 2bit file path.
if $0 == __FILE__
  Hg19Ref.new.run(ARGV[0] || "chr1:9,500-10,999",
                  ARGV[1] || HG19_2BIT_FILE)
end
36
+
37
+
38
+
39
+
@@ -1,5 +1,7 @@
1
1
  #!/usr/local/bin/ruby-1.9
2
2
  #
3
+ # hg19-sample.rb: chunks of codes handling hg19 tables
4
+ #
3
5
  # Copyright::
4
6
  # Copyright (C) 2011 MISHIMA, Hiroyuki
5
7
  # <missy at be.to / hmishima at nagasaki-u.ac.jp>
@@ -7,10 +9,10 @@
7
9
  #
8
10
 
9
11
  require File.dirname(__FILE__) + '/../lib/bio-ucsc'
12
+ require 'nkf'
10
13
 
11
14
  include Bio::Ucsc
12
15
 
13
- Hg19::DBConnection.default
14
16
  Hg19::DBConnection.connect
15
17
 
16
18
  itvs_a =
@@ -64,3 +66,22 @@ names.each do |n|
64
66
  i = Bio::GenomicInterval.zero_based(r.chrom, r.chromStart, r.chromEnd)
65
67
  puts "Query: #{n}\t#{i.chrom}\t#{i.chr_start}\t#{i.chr_end}\t#{r[:class]}"
66
68
  end
69
+
70
+ #
71
+ #
72
+
73
+ results = GbCdnaInfo.find([1,2,3,4,5], :include => :description)
74
+ results.each{|e| puts "#{e.acc}\t#{e.description.name}"}
75
+
76
+ p GbCdnaInfo.find_by_acc("AA411542", :include => :description)
77
+
78
+ results = KgXref.find_all_by_geneSymbol("TP53")
79
+ results.each{|e| puts "#{e.mRNA}\t#{e.description}"}
80
+
81
+ #
82
+ #
83
+
84
+ puts
85
+ puts NKF.nkf("-wF72", RefSeqSummary.find_by_mrnaAcc("NM_000546").summary)
86
+ puts
87
+ puts NKF.nkf("-wF72", RefSeqSummary.find_by_mrnaAcc("NR_029476").summary)
@@ -0,0 +1,47 @@
1
+ #!/usr/local/bin/ruby-1.9
2
+ #
3
+ # Usage:: symbol2summary.rb <Gene_Symbol> (default is "TP53")
4
+ #
5
+ # Copyright::
6
+ # Copyright (C) 2011 MISHIMA, Hiroyuki
7
+ # <missy at be.to / hmishima at nagasaki-u.ac.jp>
8
+ # License:: The Ruby licence (Ruby's / GPLv2 dual)
9
+ #
10
+
11
+ require File.dirname(__FILE__) + '/../lib/bio-ucsc'
12
+ require 'nkf'
13
+
14
# Prints the Known Gene description and the RefSeq summary for a gene symbol.
class Sym2Sum
  include Bio::Ucsc::Hg19

  # genesym:: gene symbol to look up, e.g. "TP53"
  #
  # Each section is printed only when its lookup found something; a symbol
  # that is missing from a table no longer raises NoMethodError.
  def run(genesym)
    DBConnection.connect
    known_gene = KgXref.find_by_geneSymbol(genesym)
    ref_gene = RefGene.find_by_name2(genesym)
    # Either finder may return nil, so each step is guarded.
    ref_summary = ref_gene && RefSeqSummary.find_by_mrnaAcc(ref_gene.name)
    summary = ref_summary && ref_summary.summary

    puts "---"
    if known_gene
      puts "Gene symbol: #{genesym}"
      puts "Description: #{known_gene.description}"
    end
    if summary
      puts "Summary:"
      # nkf "-F72" folds the text at 72 columns; "-w" emits UTF-8.
      puts NKF.nkf("-wF72", summary)
    end
  end
end

# Script entry: optional first argument is the gene symbol.
if $0 == __FILE__
  Sym2Sum.new.run(ARGV[0] || "TP53")
end
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
@@ -0,0 +1,144 @@
1
+ require 'bio-ucsc'
2
+
3
describe "Bio::Ucsc::Hg18::Reference" do

  # Class under test.
  def reference
    Bio::Ucsc::Hg18::Reference
  end

  # Loads the sample 2bit file; the path is relative to the directory the
  # specs are run from.
  def load_sample
    reference.load("samples/hg18.2bit")
  end

  describe ".load" do
    context 'given "samples/hg18.2bit"' do
      it "loads the file without raising" do
        lambda { load_sample }.should_not raise_error
      end
    end
  end

  describe ".header.signature" do
    context 'given "samples/hg18.2bit"' do
      it "returns 0x1A412743" do
        load_sample
        reference.header.signature.should == 0x1A412743
      end
    end
  end

  describe ".header.version" do
    context 'given "samples/hg18.2bit"' do
      it "returns 0" do
        load_sample
        reference.header.version.should == 0
      end
    end
  end

  describe ".header.sequence_count" do
    context 'given "samples/hg18.2bit"' do
      it "returns 49" do
        load_sample
        reference.header.sequence_count.should == 49
      end
    end
  end

  describe '.offsets["chr1"]' do
    context 'given "samples/hg18.2bit"' do
      it "returns 0x100bbd2b" do
        load_sample
        reference.offsets["chr1"].should == 0x100bbd2b
      end
    end
  end

  describe ".records" do
    context 'given "chr1"' do
      it 'returns (TwoBitRecord.reserved == 0)' do
        load_sample
        # The original compared with == but never asserted the result.
        reference.records("chr1").reserved.should == 0
      end

      it 'returns (TwoBitRecord.dna_size == 247_249_719)' do
        load_sample
        reference.records("chr1").dna_size.should == 247_249_719
      end
    end
  end

  describe ".byte_to_nucleotides" do
    context 'given 0b00011011' do
      it 'returns "TCAG"' do
        reference.byte_to_nucleotides(0b00011011).should == "TCAG"
      end
    end
  end

  describe ".bytes_to_nucleotides" do
    context 'given [0b00011011, 0b11100100]' do
      it 'returns "TCAGGACT"' do
        ary = [0b00011011, 0b11100100]
        reference.bytes_to_nucleotides(ary).should == "TCAGGACT"
      end
    end
  end

  describe ".find_by_interval_raw" do
    context "given range chr1:1,000,000-1,000,030" do
      it 'returns "TACGTGGCTGCTCTCACACATGGGCCATGTG"' do
        load_sample
        itv = Bio::GenomicInterval.parse("chr1:1,000,000-1,000,030")
        r = reference.find_by_interval_raw(itv)
        r.should == "TACGTGGCTGCTCTCACACATGGGCCATGTG"
      end
    end

    context "given range chr2:1,123,456-1,123,499" do
      it 'returns "TAATGGCATACATGTAGAAAATGCAACTCATGAAGAAGTGGTAA"' do
        load_sample
        itv = Bio::GenomicInterval.parse("chr2:1,123,456-1,123,499")
        r = reference.find_by_interval_raw(itv)
        r.should == "TAATGGCATACATGTAGAAAATGCAACTCATGAAGAAGTGGTAA"
      end
    end

    context "given range chr2:1,123,456-1,123,456" do
      it 'returns "T"' do
        load_sample
        itv = Bio::GenomicInterval.parse("chr2:1,123,456-1,123,456")
        r = reference.find_by_interval_raw(itv)
        r.should == "T"
      end
    end
  end

  describe ".find_by_interval" do
    # N-block => chr1:167,281-217,280 (covers the left end of the range)
    context "given range chr1:217,260-217,299" do
      it 'returns "NNNNNNNNNNNNNNNNNNNNNGATTCATGGCTGAAATCGT"' do
        load_sample
        itv = Bio::GenomicInterval.parse("chr1:217,260-217,299")
        r = reference.find_by_interval(itv)
        r.should == "NNNNNNNNNNNNNNNNNNNNNGATTCATGGCTGAAATCGT"
      end
    end

    # N-block => chr1:257,583-307,582 (covers the right end of the range)
    context "given range chr1:257,560-257,600" do
      it 'returns "CAGGCGCCCGCATCCAGCTGGATNNNNNNNNNNNNNNNNNN"' do
        load_sample
        itv = Bio::GenomicInterval.parse("chr1:257,560-257,600")
        r = reference.find_by_interval(itv)
        r.should == "CAGGCGCCCGCATCCAGCTGGATNNNNNNNNNNNNNNNNNN"
      end
    end
  end

end
138
+
139
+ # N-Blocks
140
+ # chr1:167281-217280,
141
+ # chr1:257583-307582,
142
+ # chr1:461232-511231,
143
+ # chr1:2624081-2674080,
144
+