rbbt-sources 2.0.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/lib/rbbt/sources/COSMIC.rb +100 -4
- data/lib/rbbt/sources/NCI.rb +1 -1
- data/lib/rbbt/sources/STITCH.rb +5 -5
- data/lib/rbbt/sources/dbSNP.rb +141 -48
- data/lib/rbbt/sources/ensembl.rb +13 -0
- data/lib/rbbt/sources/ensembl_ftp.rb +17 -6
- data/lib/rbbt/sources/entrez.rb +23 -21
- data/lib/rbbt/sources/genomes1000.rb +57 -0
- data/lib/rbbt/sources/go.rb +8 -8
- data/lib/rbbt/sources/organism.rb +5 -1
- data/lib/rbbt/sources/pfam.rb +24 -23
- data/lib/rbbt/sources/pubmed.rb +5 -2
- data/lib/rbbt/sources/tfacts.rb +0 -3
- data/lib/rbbt/sources/uniprot.rb +58 -1
- data/share/Ensembl/release_dates +2 -1
- data/share/install/Organism/organism_helpers.rb +33 -6
- data/test/rbbt/sources/test_gscholar.rb +14 -0
- data/test/rbbt/sources/test_organism.rb +5 -0
- metadata +8 -17
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
NjczYWU0NDMyM2IwZDBlYWFjNGVlNWU4NTg5ODFhMGEzYmEwZGJiYw==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
MjUzNGFjZDJjYzk1ZGJiMjIwNzllMjA4ZDMyODI2YTQzYzhhNzU0Yg==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
NGZiMjgxYzQ0OGY2MzgxYmUzMzEzN2E1NzBjNDc4MjU3YjRmZjM0OTMwMTcz
|
10
|
+
YzFmMTU4Y2FkMzI4OTljZTA2MTJhNmVhZDQzNzA2NDAwNGM4ODc0ZTAwYzEx
|
11
|
+
MDZjYzAzODEyZjc1OTlmODJhYWE5YjE3ZjI3ODNlYWZlODZmYzc=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
NWExMTU0MGMyZWExY2U5NWI2YWJhODYzZDcxMDFkYTc0NWZjN2M3ZDAzZTRh
|
14
|
+
Njk4NTgwMDgwZWJkNjhiNWM3OTA0MDE5Y2IwZjI1OTFhYzU3YmJkZWFhN2M4
|
15
|
+
ZGY2ZTA3NGNjOTM4MDBmZWY4NmQ0ZTMzODc3NmIwMzE1MTM1YjY=
|
data/lib/rbbt/sources/COSMIC.rb
CHANGED
@@ -1,21 +1,31 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rbbt/resource'
|
3
|
+
|
3
4
|
module COSMIC
|
4
5
|
extend Resource
|
5
6
|
self.subdir = "share/databases/COSMIC"
|
6
7
|
|
7
|
-
COSMIC.claim COSMIC.
|
8
|
-
url = "ftp://ftp.sanger.ac.uk/pub/CGP/
|
8
|
+
COSMIC.claim COSMIC.mutations, :proc do
|
9
|
+
url = "ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/CosmicCompleteExport_v64_260313.tsv.gz"
|
9
10
|
|
10
|
-
|
11
|
+
stream = CMD.cmd('awk \'BEGIN{FS="\t"} { if ($12 != "" && $12 != "Mutation ID") { sub($12, "COSM" $12 ":" $4)}; print}\'', :in => Open.open(url), :pipe => true)
|
12
|
+
tsv = TSV.open(stream, :type => :list, :header_hash => "", :key_field => "Mutation ID", :namespace => "Hsa/jun2011")
|
11
13
|
tsv.fields = tsv.fields.collect{|f| f == "Gene name" ? "Associated Gene Name" : f}
|
12
14
|
tsv.add_field "Genomic Mutation" do |mid, values|
|
13
15
|
position = values["Mutation GRCh37 genome position"]
|
14
16
|
cds = values["Mutation CDS"]
|
17
|
+
|
15
18
|
if position.nil? or position.empty?
|
16
19
|
nil
|
17
20
|
else
|
18
21
|
position = position.split("-").first
|
22
|
+
|
23
|
+
chr, pos = position.split(":")
|
24
|
+
chr = "X" if chr == "23"
|
25
|
+
chr = "Y" if chr == "24"
|
26
|
+
chr = "M" if chr == "25"
|
27
|
+
position = [chr, pos ] * ":"
|
28
|
+
|
19
29
|
if cds.nil?
|
20
30
|
position
|
21
31
|
else
|
@@ -52,6 +62,92 @@ module COSMIC
|
|
52
62
|
end
|
53
63
|
end
|
54
64
|
end
|
55
|
-
|
65
|
+
|
66
|
+
tsv.to_s.gsub(/(\d)-(\d)/,'\1:\2')
|
67
|
+
end
|
68
|
+
|
69
|
+
COSMIC.claim COSMIC.mutations_hg18, :proc do |filename|
|
70
|
+
require 'rbbt/sources/organism'
|
71
|
+
file = COSMIC.mutations.open
|
72
|
+
begin
|
73
|
+
|
74
|
+
while (line = file.gets) !~ /Genomic Mutation/; end
|
75
|
+
fields = line[1..-2].split("\t")
|
76
|
+
mutation_pos = fields.index "Genomic Mutation"
|
77
|
+
|
78
|
+
mutations = CMD.cmd("grep -v '^#'|cut -f #{mutation_pos + 1}|sort -u", :in => COSMIC.mutations.open).read.split("\n").select{|m| m.include? ":" }
|
79
|
+
|
80
|
+
translations = Misc.process_to_hash(mutations){|mutations| Organism.liftOver(mutations, "Hsa/jun2011", "Hsa/may2009")}
|
81
|
+
|
82
|
+
File.open(filename, 'w') do |f|
|
83
|
+
f.puts "#: :type=:list#:namespace=Hsa/may2009"
|
84
|
+
f.puts "#" + fields * "\t"
|
85
|
+
while line = file.gets do
|
86
|
+
next if line[0] == "#"[0]
|
87
|
+
line.strip!
|
88
|
+
parts = line.split("\t")
|
89
|
+
parts[mutation_pos] = translations[parts[mutation_pos]]
|
90
|
+
f.puts parts * "\t"
|
91
|
+
end
|
92
|
+
end
|
93
|
+
rescue Exception
|
94
|
+
FileUtils.rm filename if File.exists? filename
|
95
|
+
raise $!
|
96
|
+
ensure
|
97
|
+
file.close
|
98
|
+
end
|
99
|
+
|
100
|
+
nil
|
101
|
+
end
|
102
|
+
|
103
|
+
|
104
|
+
def self.rsid_index(organism, chromosome = nil)
|
105
|
+
build = Organism.hg_build(organism)
|
106
|
+
|
107
|
+
tag = [build, chromosome] * ":"
|
108
|
+
fwt = nil
|
109
|
+
Persist.persist("StaticPosIndex for COSMIC [#{ tag }]", :fwt, :persist => true) do
|
110
|
+
value_size = 0
|
111
|
+
file = COSMIC[build == "hg19" ? "mutations" : "mutations_hg18"]
|
112
|
+
chr_positions = []
|
113
|
+
begin
|
114
|
+
Open.read(CMD.cmd("grep '\t#{chromosome}:'", :in => file.open, :pipe => true)) do |line|
|
115
|
+
next if line[0] == "#"[0]
|
116
|
+
rsid, mutation = line.split("\t").values_at 0, 25
|
117
|
+
next if mutation.nil? or mutation.empty?
|
118
|
+
chr, pos = mutation.split(":")
|
119
|
+
next if chr != chromosome or pos.nil? or pos.empty?
|
120
|
+
chr_positions << [rsid, pos.to_i]
|
121
|
+
value_size = rsid.length if rsid.length > value_size
|
122
|
+
end
|
123
|
+
rescue
|
124
|
+
end
|
125
|
+
fwt = FixWidthTable.new :memory, value_size
|
126
|
+
fwt.add_point(chr_positions)
|
127
|
+
fwt
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
def self.mutation_index(organism)
|
132
|
+
build = Organism.hg_build(organism)
|
133
|
+
file = COSMIC[build == "hg19" ? "mutations" : "mutations_hg18"]
|
134
|
+
@mutation_index ||= {}
|
135
|
+
@mutation_index[build] ||= file.tsv :persist => true, :fields => ["Genomic Mutation"], :type => :single, :persist => true
|
136
|
+
end
|
137
|
+
|
138
|
+
|
139
|
+
end
|
140
|
+
|
141
|
+
if defined? Entity
|
142
|
+
if defined? Gene and Entity === Gene
|
143
|
+
module Gene
|
144
|
+
property :COSMIC_rsids => :single2array do
|
145
|
+
COSMIC.rsid_index(organism, chromosome)[self.chr_range]
|
146
|
+
end
|
147
|
+
|
148
|
+
property :COSMIC_mutations => :single2array do
|
149
|
+
GenomicMutation.setup(COSMIC.mutation_index(organism).values_at(*self.COSMIC_rsids).uniq, "COSMIC mutations over #{self.name || self}", organism, false)
|
150
|
+
end
|
151
|
+
end
|
56
152
|
end
|
57
153
|
end
|
data/lib/rbbt/sources/NCI.rb
CHANGED
data/lib/rbbt/sources/STITCH.rb
CHANGED
@@ -5,11 +5,11 @@ module STITCH
|
|
5
5
|
extend Resource
|
6
6
|
self.subdir = "share/databases/STITCH"
|
7
7
|
|
8
|
-
STITCH.claim STITCH.source.chemical_chemical
|
9
|
-
STITCH.claim STITCH.source.protein_chemical
|
10
|
-
STITCH.claim STITCH.source.actions
|
11
|
-
STITCH.claim STITCH.source.aliases
|
12
|
-
STITCH.claim STITCH.source.sources
|
8
|
+
STITCH.claim STITCH.source.chemical_chemical, :url, "http://stitch.embl.de/download/chemical_chemical.links.detailed.v3.1.tsv.gz"
|
9
|
+
STITCH.claim STITCH.source.protein_chemical, :url, "http://stitch.embl.de/download/protein_chemical.links.detailed.v3.1.tsv.gz"
|
10
|
+
STITCH.claim STITCH.source.actions, :url, "http://stitch.embl.de/download/actions.v3.1.tsv.gz"
|
11
|
+
STITCH.claim STITCH.source.aliases, :url, "http://stitch.embl.de/download/chemical.aliases.v3.1.tsv.gz"
|
12
|
+
STITCH.claim STITCH.source.sources, :url, "http://stitch.embl.de/download/chemical.sources.v3.1.tsv.gz"
|
13
13
|
|
14
14
|
Organism.installable_organisms.each do |organism|
|
15
15
|
STITCH.claim STITCH.chemical_protein(organism), :proc do
|
data/lib/rbbt/sources/dbSNP.rb
CHANGED
@@ -10,25 +10,27 @@ module DbSNP
|
|
10
10
|
URL = "ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/VCF/common_all.vcf.gz"
|
11
11
|
|
12
12
|
DbSNP.claim DbSNP.mutations_ncbi, :proc do
|
13
|
-
tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation"], :type => :
|
13
|
+
tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation"], :type => :flat)
|
14
14
|
file = Open.open(URL, :nocache => true)
|
15
15
|
while line = file.gets do
|
16
16
|
next if line[0] == "#"[0]
|
17
17
|
chr, position, id, ref, alt = line.split "\t"
|
18
|
-
|
19
|
-
|
20
|
-
alt[0]
|
18
|
+
|
19
|
+
mutations = alt.split(",").collect do |a|
|
20
|
+
if alt[0] == ref[0]
|
21
|
+
alt[0] = '+'[0]
|
22
|
+
end
|
23
|
+
[chr, position, alt] * ":"
|
21
24
|
end
|
22
|
-
mutation = [chr, position, alt] * ":"
|
23
25
|
|
24
26
|
tsv.namespace = "Hsa/may2012"
|
25
|
-
tsv[id] =
|
27
|
+
tsv[id] = mutations
|
26
28
|
end
|
27
29
|
|
28
30
|
tsv.to_s
|
29
31
|
end
|
30
32
|
|
31
|
-
DbSNP.claim DbSNP.
|
33
|
+
DbSNP.claim DbSNP.rsids, :proc do |filename|
|
32
34
|
ftp = Net::FTP.new('ftp.broadinstitute.org')
|
33
35
|
ftp.passive = true
|
34
36
|
ftp.login('gsapubftp-anonymous', 'devnull@nomail.org')
|
@@ -37,65 +39,156 @@ module DbSNP
|
|
37
39
|
tmpfile = TmpFile.tmp_file + '.gz'
|
38
40
|
ftp.getbinaryfile('dbsnp_137.hg19.vcf.gz', tmpfile, 1024)
|
39
41
|
|
40
|
-
tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation", "GMAF", "G5", "G5A", "dbSNP Build ID"], :type => :list)
|
41
42
|
file = Open.open(tmpfile, :nocache => true)
|
42
|
-
|
43
|
-
|
43
|
+
begin
|
44
|
+
File.open(filename, 'w') do |f|
|
45
|
+
f.puts "#: :type=:list#:namespace=Hsa/may2012"
|
46
|
+
f.puts "#" + ["RS ID", "GMAF", "G5", "G5A", "dbSNP Build ID"] * "\t"
|
47
|
+
while line = file.gets do
|
48
|
+
next if line[0] == "#"[0]
|
49
|
+
|
50
|
+
chr, position, id, ref, muts, qual, filter, info = line.split "\t"
|
51
|
+
|
52
|
+
g5 = g5a = dbsnp_build_id = gmaf = nil
|
53
|
+
|
54
|
+
gmaf = $1 if info =~ /GMAF=([0-9.]+)/
|
55
|
+
g5 = true if info =~ /\bG5\b/
|
56
|
+
g5a = true if info =~ /\bG5A\b/
|
57
|
+
dbsnp_build_id = $1 if info =~ /dbSNPBuildID=(\d+)/
|
44
58
|
|
45
|
-
|
46
|
-
|
47
|
-
chr.sub!('chr', '')
|
48
|
-
|
49
|
-
mut = mut.split(",").first
|
50
|
-
case
|
51
|
-
when ref == '-'
|
52
|
-
mut = "+" << mut
|
53
|
-
when mut == '-'
|
54
|
-
mut = "-" * ref.length
|
55
|
-
when (mut.length > 1 and ref.length > 1)
|
56
|
-
mut = '-' * ref.length << mut
|
57
|
-
when (mut.length > 1 and ref.length == 1 and mut.index(ref) == 0)
|
58
|
-
mut = '+' << mut[1..-1]
|
59
|
-
when (mut.length == 1 and ref.length > 1 and ref.index(mut) == 0)
|
60
|
-
mut = '-' * (ref.length - 1)
|
61
|
-
else
|
62
|
-
mut = mut
|
59
|
+
f.puts [id, gmaf, g5, g5a, dbsnp_build_id] * "\t"
|
60
|
+
end
|
63
61
|
end
|
62
|
+
rescue Exception
|
63
|
+
FileUtils.rm filename if File.exists? filename
|
64
|
+
raise $!
|
65
|
+
ensure
|
66
|
+
file.close
|
67
|
+
FileUtils.rm tmpfile
|
68
|
+
end
|
64
69
|
|
65
|
-
|
66
|
-
|
67
|
-
gmaf = $1 if info =~ /GMAF=([0-9.]+)/
|
68
|
-
g5 = true if info =~ /\bG5\b/
|
69
|
-
g5a = true if info =~ /\bG5A\b/
|
70
|
-
dbsnp_build_id = $1 if info =~ /dbSNPBuildID=(\d+)/
|
70
|
+
nil
|
71
|
+
end
|
71
72
|
|
72
|
-
|
73
|
+
DbSNP.claim DbSNP.mutations, :proc do |filename|
|
74
|
+
ftp = Net::FTP.new('ftp.broadinstitute.org')
|
75
|
+
ftp.passive = true
|
76
|
+
ftp.login('gsapubftp-anonymous', 'devnull@nomail.org')
|
77
|
+
ftp.chdir('/bundle/2.3/hg19')
|
73
78
|
|
74
|
-
|
79
|
+
tmpfile = TmpFile.tmp_file + '.gz'
|
80
|
+
ftp.getbinaryfile('dbsnp_137.hg19.vcf.gz', tmpfile, 1024)
|
75
81
|
|
76
|
-
|
77
|
-
|
82
|
+
file = Open.open(tmpfile, :nocache => true)
|
83
|
+
begin
|
84
|
+
File.open(filename, 'w') do |f|
|
85
|
+
f.puts "#: :type=:flat#:namespace=Hsa/may2012"
|
86
|
+
f.puts "#" + ["RS ID", "Genomic Mutation"] * "\t"
|
87
|
+
while line = file.gets do
|
88
|
+
next if line[0] == "#"[0]
|
78
89
|
|
79
|
-
|
90
|
+
chr, position, id, ref, muts, qual, filter, info = line.split "\t"
|
80
91
|
|
81
|
-
|
92
|
+
chr.sub!('chr', '')
|
93
|
+
|
94
|
+
position, muts = Misc.correct_vcf_mutation(position.to_i, ref, muts)
|
95
|
+
|
96
|
+
mutations = muts.collect{|mut| [chr, position, mut] * ":" }
|
97
|
+
|
98
|
+
f.puts ([id] + mutations) * "\t"
|
99
|
+
end
|
100
|
+
end
|
101
|
+
rescue Exception
|
102
|
+
FileUtils.rm filename if File.exists? filename
|
103
|
+
raise $!
|
104
|
+
ensure
|
105
|
+
file.close
|
106
|
+
FileUtils.rm tmpfile
|
107
|
+
end
|
108
|
+
|
109
|
+
nil
|
82
110
|
end
|
83
111
|
|
84
|
-
DbSNP.claim DbSNP.mutations_hg18, :proc do
|
112
|
+
DbSNP.claim DbSNP.mutations_hg18, :proc do |filename|
|
85
113
|
require 'rbbt/sources/organism'
|
86
114
|
|
87
|
-
|
88
|
-
|
89
|
-
mutations = hg19_tsv.values
|
115
|
+
mutations = CMD.cmd("grep -v '^#'|cut -f 2|sort -u", :in => DbSNP.mutations.open).read.split("\n").collect{|l| l.split("|")}.flatten
|
90
116
|
|
91
117
|
translations = Misc.process_to_hash(mutations){|mutations| Organism.liftOver(mutations, "Hsa/jun2011", "Hsa/may2009")}
|
118
|
+
begin
|
119
|
+
file = Open.open(DbSNP.mutations.find, :nocache => true)
|
120
|
+
File.open(filename, 'w') do |f|
|
121
|
+
f.puts "#: :type=:flat#:namespace=Hsa/may2009"
|
122
|
+
f.puts "#" + ["RS ID", "Genomic Mutation"] * "\t"
|
123
|
+
while line = file.gets do
|
124
|
+
next if line[0] == "#"[0]
|
125
|
+
parts = line.split("\t")
|
126
|
+
parts[1..-1] = parts[1..-1].collect{|p| translations[p]} * "|"
|
127
|
+
f.puts parts * "\t"
|
128
|
+
end
|
129
|
+
end
|
130
|
+
rescue Exception
|
131
|
+
FileUtils.rm filename if File.exists? filename
|
132
|
+
raise $!
|
133
|
+
ensure
|
134
|
+
file.close
|
135
|
+
end
|
136
|
+
|
137
|
+
nil
|
138
|
+
end
|
92
139
|
|
93
|
-
|
94
|
-
|
140
|
+
def self.rsid_index(organism, chromosome = nil)
|
141
|
+
build = Organism.hg_build(organism)
|
142
|
+
|
143
|
+
tag = [build, chromosome] * ":"
|
144
|
+
Persist.persist("StaticPosIndex for dbSNP [#{ tag }]", :fwt, :persist => true) do
|
145
|
+
value_size = 0
|
146
|
+
file = DbSNP[build == "hg19" ? "mutations" : "mutations_hg18"]
|
147
|
+
chr_positions = []
|
148
|
+
Open.read(CMD.cmd("grep '\t#{chromosome}:'", :in => file.open, :pipe => true)) do |line|
|
149
|
+
next if line[0] == "#"[0]
|
150
|
+
rsid, mutation = line.split("\t")
|
151
|
+
next if mutation.nil? or mutation.empty?
|
152
|
+
chr, pos = mutation.split(":")
|
153
|
+
next if chr != chromosome or pos.nil? or pos.empty?
|
154
|
+
chr_positions << [rsid, pos.to_i]
|
155
|
+
value_size = rsid.length if rsid.length > value_size
|
156
|
+
end
|
157
|
+
fwt = FixWidthTable.new :memory, value_size
|
158
|
+
fwt.add_point(chr_positions)
|
159
|
+
fwt
|
95
160
|
end
|
161
|
+
end
|
96
162
|
|
97
|
-
|
163
|
+
def self.mutation_index(organism)
|
164
|
+
build = Organism.hg_build(organism)
|
165
|
+
file = DbSNP[build == "hg19" ? "mutations" : "mutations_hg18"]
|
166
|
+
@mutation_index ||= {}
|
167
|
+
@mutation_index[build] ||= file.tsv :persist => true, :fields => ["Genomic Mutation"], :type => :single, :persist => true
|
168
|
+
end
|
169
|
+
|
170
|
+
end
|
171
|
+
|
172
|
+
if defined? Entity
|
173
|
+
if defined? Gene and Entity === Gene
|
174
|
+
module Gene
|
175
|
+
property :dbSNP_rsids => :single2array do
|
176
|
+
DbSNP.rsid_index(organism, chromosome)[self.chr_range]
|
177
|
+
end
|
178
|
+
|
179
|
+
property :dbSNP_mutations => :single2array do
|
180
|
+
GenomicMutation.setup(DbSNP.mutation_index(organism).values_at(*self.dbSNP_rsids).compact.flatten.uniq, "dbSNP mutations over #{self.name || self}", organism, true)
|
181
|
+
end
|
182
|
+
end
|
183
|
+
end
|
184
|
+
|
185
|
+
if defined? GenomicMutation and Entity === GenomicMutation
|
186
|
+
module GenomicMutation
|
187
|
+
property :dbSNP => :array2single do
|
188
|
+
dbSNP.mutations.tsv(:persist => true, :key_field => "Genomic Mutation", :fields => ["RS ID"], :type => :single).values_at *self
|
189
|
+
end
|
190
|
+
end
|
98
191
|
|
99
|
-
tsv.to_s
|
100
192
|
end
|
101
193
|
end
|
194
|
+
|
@@ -1,24 +1,31 @@
|
|
1
1
|
require 'rbbt/util/open'
|
2
2
|
require 'rbbt/sources/organism'
|
3
3
|
require 'rbbt/tsv'
|
4
|
+
require 'rbbt/sources/ensembl'
|
4
5
|
require 'net/ftp'
|
5
6
|
|
6
7
|
module Ensembl
|
7
|
-
|
8
8
|
|
9
|
-
def self.releases
|
10
|
-
@releases ||= Rbbt.share.Ensembl.release_dates.find.tsv :key_field => "build"
|
11
|
-
end
|
12
|
-
|
13
9
|
module FTP
|
14
10
|
|
15
11
|
SERVER = "ftp.ensembl.org"
|
16
12
|
|
13
|
+
def self.mysql_path(release)
|
14
|
+
end
|
15
|
+
|
17
16
|
def self.ftp_name_for(organism)
|
18
17
|
code, build = organism.split "/"
|
19
18
|
build ||= "current"
|
20
19
|
|
21
20
|
if build.to_s == "current"
|
21
|
+
release = 'current'
|
22
|
+
name = Organism.scientific_name(organism)
|
23
|
+
ftp = Net::FTP.new(Ensembl::FTP::SERVER)
|
24
|
+
ftp.passive = true
|
25
|
+
ftp.login
|
26
|
+
ftp.chdir(File.join('pub', 'current_mysql'))
|
27
|
+
file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").collect{|l| l.split(" ").last}.last
|
28
|
+
ftp.close
|
22
29
|
else
|
23
30
|
release = Ensembl.releases[build]
|
24
31
|
name = Organism.scientific_name(organism)
|
@@ -34,7 +41,11 @@ module Ensembl
|
|
34
41
|
|
35
42
|
def self.ftp_directory_for(organism)
|
36
43
|
release, ftp_name = ftp_name_for(organism)
|
37
|
-
|
44
|
+
if release == 'current'
|
45
|
+
File.join('/pub/', 'current_mysql', ftp_name)
|
46
|
+
else
|
47
|
+
File.join('/pub/', release, 'mysql', ftp_name)
|
48
|
+
end
|
38
49
|
end
|
39
50
|
|
40
51
|
def self.base_url(organism)
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -139,10 +139,12 @@ module Entrez
|
|
139
139
|
else
|
140
140
|
filename = gene_filename geneid
|
141
141
|
|
142
|
+
|
142
143
|
if FileCache.found(filename)
|
143
144
|
return Gene.new(Open.read(FileCache.path(filename)))
|
144
145
|
else
|
145
146
|
xml = get_online(geneid)
|
147
|
+
|
146
148
|
FileCache.add(filename, xml) unless FileCache.found(filename)
|
147
149
|
|
148
150
|
return Gene.new(xml)
|
@@ -150,30 +152,30 @@ module Entrez
|
|
150
152
|
end
|
151
153
|
end
|
152
154
|
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
return 0
|
166
|
-
end
|
167
|
-
else
|
155
|
+
# Counts the words in common between a chunk of text and the text
|
156
|
+
# found in Entrez Gene for that particular gene. The +gene+ may be a
|
157
|
+
# gene identifier or a Gene class instance.
|
158
|
+
def self.gene_text_similarity(gene, text)
|
159
|
+
|
160
|
+
case
|
161
|
+
when Entrez::Gene === gene
|
162
|
+
gene_text = gene.text
|
163
|
+
when String === gene || Fixnum === gene
|
164
|
+
begin
|
165
|
+
gene_text = get_gene(gene).text
|
166
|
+
rescue CMD::CMDError
|
168
167
|
return 0
|
169
168
|
end
|
169
|
+
else
|
170
|
+
return 0
|
171
|
+
end
|
170
172
|
|
171
|
-
|
172
|
-
|
173
|
+
gene_words = gene_text.words.to_set
|
174
|
+
text_words = text.words.to_set
|
173
175
|
|
174
|
-
|
176
|
+
return 0 if gene_words.empty? || text_words.empty?
|
175
177
|
|
176
|
-
|
177
|
-
|
178
|
-
|
178
|
+
common = gene_words.intersection(text_words)
|
179
|
+
common.length / (gene_words.length + text_words.length).to_f
|
180
|
+
end
|
179
181
|
end
|
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rbbt/util/open'
|
3
3
|
require 'rbbt/resource'
|
4
|
+
require 'rbbt/entity/gene'
|
4
5
|
|
5
6
|
module Genomes1000
|
6
7
|
extend Resource
|
@@ -49,4 +50,60 @@ module Genomes1000
|
|
49
50
|
tsv.to_s
|
50
51
|
end
|
51
52
|
|
53
|
+
def self.rsid_index(organism, chromosome = nil)
|
54
|
+
build = Organism.hg_build(organism)
|
55
|
+
|
56
|
+
tag = [build, chromosome] * ":"
|
57
|
+
Persist.persist("StaticPosIndex for Genomes1000 [#{ tag }]", :fwt, :persist => true) do
|
58
|
+
value_size = 0
|
59
|
+
file = Genomes1000[build == "hg19" ? "mutations" : "mutations_hg18"]
|
60
|
+
chr_positions = []
|
61
|
+
Open.read(CMD.cmd("grep '\t#{chromosome}:'", :in => file.open, :pipe => true)) do |line|
|
62
|
+
next if line[0] == "#"[0]
|
63
|
+
rsid, mutation = line.split("\t")
|
64
|
+
next if mutation.nil? or mutation.empty?
|
65
|
+
chr, pos = mutation.split(":")
|
66
|
+
next if chr != chromosome or pos.nil? or pos.empty?
|
67
|
+
chr_positions << [rsid, pos.to_i]
|
68
|
+
value_size = rsid.length if rsid.length > value_size
|
69
|
+
end
|
70
|
+
fwt = FixWidthTable.new :memory, value_size
|
71
|
+
fwt.add_point(chr_positions)
|
72
|
+
fwt
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
def self.mutation_index(organism)
|
77
|
+
build = Organism.hg_build(organism)
|
78
|
+
file = Genomes1000[build == "hg19" ? "mutations" : "mutations_hg18"]
|
79
|
+
@mutation_index ||= {}
|
80
|
+
@mutation_index[build] ||= file.tsv :persist => true, :fields => ["Genomic Mutation"], :type => :single, :persist => true
|
81
|
+
end
|
82
|
+
|
83
|
+
|
52
84
|
end
|
85
|
+
|
86
|
+
|
87
|
+
if defined? Entity
|
88
|
+
if defined? Gene and Entity === Gene
|
89
|
+
module Gene
|
90
|
+
property :genomes_1000_rsids => :single2array do
|
91
|
+
Genomes1000.rsid_index(organism, chromosome)[self.chr_range]
|
92
|
+
end
|
93
|
+
|
94
|
+
property :genomes_1000_mutations => :single2array do
|
95
|
+
GenomicMutation.setup(Genomes1000.mutation_index(organism).values_at(*self.genomes_1000_rsids).uniq, "1000 Genomes mutations over #{self.name || self}", organism, true)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
if defined? GenomicMutation and Entity === GenomicMutation
|
101
|
+
module GenomicMutation
|
102
|
+
property :genomes_1000 => :array2single do
|
103
|
+
Genomes1000.mutations.tsv(:persist => true, :key_field => "Genomic Mutation", :fields => ["Variant ID"], :type => :single).values_at *self
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -18,7 +18,7 @@ module GO
|
|
18
18
|
# only the name field is used.
|
19
19
|
def self.init
|
20
20
|
Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
|
21
|
-
info.serializer = :marshal if info.respond_to? :serializer
|
21
|
+
info.serializer = :marshal if info.respond_to? :serializer
|
22
22
|
Rbbt.share.databases.GO.gene_ontology.read.split(/\[Term\]/).each{|term|
|
23
23
|
term_info = {}
|
24
24
|
|
@@ -37,11 +37,11 @@ module GO
|
|
37
37
|
}
|
38
38
|
|
39
39
|
info
|
40
|
-
end
|
40
|
+
end.tap{|o| o.unnamed = true}
|
41
41
|
end
|
42
42
|
|
43
43
|
def self.info
|
44
|
-
|
44
|
+
@@info ||= self.init
|
45
45
|
end
|
46
46
|
|
47
47
|
def self.goterms
|
@@ -94,7 +94,7 @@ if defined? Entity
|
|
94
94
|
self.annotation :organism
|
95
95
|
|
96
96
|
property :name => :array2single do
|
97
|
-
|
97
|
+
GO.id2name(self)
|
98
98
|
end
|
99
99
|
|
100
100
|
property :genes => :array2single do |*args|
|
@@ -117,19 +117,19 @@ if defined? Entity
|
|
117
117
|
if defined? Gene and Entity === Gene
|
118
118
|
module Gene
|
119
119
|
property :go_terms => :array2single do
|
120
|
-
@go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).
|
120
|
+
@go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).chunked_values_at self.ensembl
|
121
121
|
end
|
122
122
|
|
123
123
|
property :go_bp_terms => :array2single do
|
124
|
-
@go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).
|
124
|
+
@go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).chunked_values_at self.ensembl
|
125
125
|
end
|
126
126
|
|
127
127
|
property :go_cc_terms => :array2single do
|
128
|
-
@go_cc_terms ||= Organism.gene_go_cc(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).
|
128
|
+
@go_cc_terms ||= Organism.gene_go_cc(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).chunked_values_at self.ensembl
|
129
129
|
end
|
130
130
|
|
131
131
|
property :go_mf_terms => :array2single do
|
132
|
-
@go_mf_terms ||= Organism.gene_go_mf(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).
|
132
|
+
@go_mf_terms ||= Organism.gene_go_mf(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).chunked_values_at self.ensembl
|
133
133
|
end
|
134
134
|
|
135
135
|
end
|
@@ -46,7 +46,11 @@ module Organism
|
|
46
46
|
return positions
|
47
47
|
end
|
48
48
|
|
49
|
-
positions_bed = positions.collect{|position|
|
49
|
+
positions_bed = positions.collect{|position|
|
50
|
+
chr, pos = position.split(":").values_at(0,1)
|
51
|
+
["chr" << chr, pos.to_i-1, pos, position] * "\t"
|
52
|
+
} * "\n" + "\n"
|
53
|
+
|
50
54
|
new_positions = {}
|
51
55
|
|
52
56
|
TmpFile.with_file(positions_bed) do |source_bed|
|
data/lib/rbbt/sources/pfam.rb
CHANGED
@@ -4,6 +4,29 @@ require 'rbbt/resource'
|
|
4
4
|
require 'rbbt/entity'
|
5
5
|
require 'rbbt/sources/InterPro'
|
6
6
|
|
7
|
+
InterPro.claim InterPro.pfam_names, :proc do
|
8
|
+
pfam_domains = Pfam.domains.read.split("\n").collect{|l| l.split("\t").first}.compact.flatten
|
9
|
+
tsv = nil
|
10
|
+
TmpFile.with_file(pfam_domains * "\n") do |tmpfile|
|
11
|
+
tsv = TSV.open(CMD.cmd("cut -f 4,3 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true), :key_field => 1, :fields => [0], :type => :single)
|
12
|
+
end
|
13
|
+
tsv.key_field = "InterPro ID"
|
14
|
+
tsv.fields = ["Domain Name"]
|
15
|
+
tsv.to_s
|
16
|
+
end
|
17
|
+
|
18
|
+
InterPro.claim InterPro.pfam_equivalences, :proc do
|
19
|
+
pfam_domains = Pfam.domains.read.split("\n").collect{|l| l.split("\t").first}.compact.flatten
|
20
|
+
tsv = nil
|
21
|
+
TmpFile.with_file(pfam_domains * "\n") do |tmpfile|
|
22
|
+
tsv = TSV.open(CMD.cmd("cut -f 2,4 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true), :key_field => 0, :fields => [1], :type => :single)
|
23
|
+
end
|
24
|
+
tsv.key_field = "InterPro ID"
|
25
|
+
tsv.fields = ["Pfam Domain"]
|
26
|
+
tsv.to_s
|
27
|
+
end
|
28
|
+
|
29
|
+
|
7
30
|
module Pfam
|
8
31
|
extend Resource
|
9
32
|
self.subdir = "share/databases/Pfam"
|
@@ -14,7 +37,7 @@ module Pfam
|
|
14
37
|
tsv.to_s
|
15
38
|
end
|
16
39
|
|
17
|
-
NAMES_FILE = InterPro.pfam_names.
|
40
|
+
NAMES_FILE = InterPro.pfam_names.produce
|
18
41
|
|
19
42
|
def self.name_index
|
20
43
|
@name_index ||= TSV.open NAMES_FILE, :single, :unnamed => true
|
@@ -31,28 +54,6 @@ module InterPro
|
|
31
54
|
end
|
32
55
|
end
|
33
56
|
|
34
|
-
InterPro.claim InterPro.pfam_names, :proc do
|
35
|
-
pfam_domains = Pfam.domains.read.split("\n").collect{|l| l.split("\t").first}.compact.flatten
|
36
|
-
tsv = nil
|
37
|
-
TmpFile.with_file(pfam_domains * "\n") do |tmpfile|
|
38
|
-
tsv = TSV.open(CMD.cmd("cut -f 4,3 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true), :key_field => 1, :fields => [0], :type => :single)
|
39
|
-
end
|
40
|
-
tsv.key_field = "InterPro ID"
|
41
|
-
tsv.fields = ["Domain Name"]
|
42
|
-
tsv.to_s
|
43
|
-
end
|
44
|
-
|
45
|
-
InterPro.claim InterPro.pfam_equivalences, :proc do
|
46
|
-
pfam_domains = Pfam.domains.read.split("\n").collect{|l| l.split("\t").first}.compact.flatten
|
47
|
-
tsv = nil
|
48
|
-
TmpFile.with_file(pfam_domains * "\n") do |tmpfile|
|
49
|
-
tsv = TSV.open(CMD.cmd("cut -f 2,4 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true), :key_field => 0, :fields => [1], :type => :single)
|
50
|
-
end
|
51
|
-
tsv.key_field = "InterPro ID"
|
52
|
-
tsv.fields = ["Pfam Domain"]
|
53
|
-
tsv.to_s
|
54
|
-
end
|
55
|
-
|
56
57
|
|
57
58
|
if defined? Entity
|
58
59
|
module PfamDomain
|
data/lib/rbbt/sources/pubmed.rb
CHANGED
@@ -54,6 +54,7 @@ module PubMed
|
|
54
54
|
[:year , "Journal/JournalIssue/PubDate/Year"],
|
55
55
|
[:month , "Journal/JournalIssue/PubDate/Month"],
|
56
56
|
[:pages , "Pagination/MedlinePgn"],
|
57
|
+
[:author , "AuthorList/Author"],
|
57
58
|
[:abstract , "Abstract/AbstractText"],
|
58
59
|
]
|
59
60
|
|
@@ -154,7 +155,7 @@ module PubMed
|
|
154
155
|
end
|
155
156
|
end
|
156
157
|
|
157
|
-
text
|
158
|
+
Misc.fixutf8(text)
|
158
159
|
end
|
159
160
|
|
160
161
|
def bibtex
|
@@ -187,7 +188,9 @@ module PubMed
|
|
187
188
|
|
188
189
|
# Join the text from title and abstract
|
189
190
|
def text
|
190
|
-
[title, abstract].join("\n")
|
191
|
+
text = [title, abstract].join("\n")
|
192
|
+
|
193
|
+
Misc.fixutf8(text)
|
191
194
|
end
|
192
195
|
end
|
193
196
|
|
data/lib/rbbt/sources/tfacts.rb
CHANGED
@@ -50,16 +50,13 @@ if defined? Entity and defined? Gene and Entity === Gene
|
|
50
50
|
tfs = TFacts.targets.keys
|
51
51
|
self.name.collect{|gene| tfs.include? gene}
|
52
52
|
end
|
53
|
-
persist :_ary_is_transcription_factor?
|
54
53
|
|
55
54
|
property :transcription_regulators => :array2single do
|
56
55
|
Gene.setup(TFacts.regulators.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
|
57
56
|
end
|
58
|
-
persist :_ary_transcription_regulators
|
59
57
|
|
60
58
|
property :transcription_targets => :array2single do
|
61
59
|
Gene.setup(TFacts.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
|
62
60
|
end
|
63
|
-
persist :_ary_transcription_targets
|
64
61
|
end
|
65
62
|
end
|
data/lib/rbbt/sources/uniprot.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'rbbt'
|
1
2
|
require 'rbbt/util/open'
|
2
3
|
require 'rbbt/resource'
|
3
4
|
require 'rbbt/sources/cath'
|
@@ -33,6 +34,7 @@ module UniProt
|
|
33
34
|
|
34
35
|
|
35
36
|
UNIPROT_TEXT="http://www.uniprot.org/uniprot/[PROTEIN].txt"
|
37
|
+
UNIPROT_FASTA="http://www.uniprot.org/uniprot/[PROTEIN].fasta"
|
36
38
|
def self.pdbs(protein)
|
37
39
|
url = UNIPROT_TEXT.sub "[PROTEIN]", protein
|
38
40
|
text = Open.read(url)
|
@@ -44,15 +46,70 @@ module UniProt
|
|
44
46
|
id, method, resolution, region = $1.split(";").collect{|v| v.strip}
|
45
47
|
begin
|
46
48
|
chains, start, eend = region.match(/(\w+)=(\d+)-(\d+)/).values_at(1,2,3)
|
49
|
+
start = start.to_i
|
50
|
+
eend = eend.to_i
|
51
|
+
start, eend = eend, start if start > eend
|
47
52
|
rescue
|
48
53
|
Log.warn("Error process Uniprot PDB line: #{line}")
|
49
54
|
next
|
50
55
|
end
|
51
|
-
pdb[id.downcase] = {:method => method, :resolution => resolution, :region => (start
|
56
|
+
pdb[id.downcase] = {:method => method, :resolution => resolution, :region => (start..eend), :chains => chains}
|
52
57
|
}
|
53
58
|
pdb
|
54
59
|
end
|
55
60
|
|
61
|
+
def self.sequence(protein)
|
62
|
+
url = UNIPROT_FASTA.sub "[PROTEIN]", protein
|
63
|
+
text = Open.read(url)
|
64
|
+
|
65
|
+
text.split(/\n/).select{|line| line !~ /^>/} * ""
|
66
|
+
end
|
67
|
+
|
68
|
+
def self.features(protein)
|
69
|
+
url = UNIPROT_TEXT.sub "[PROTEIN]", protein
|
70
|
+
text = Open.read(url)
|
71
|
+
|
72
|
+
text = text.split(/\n/).select{|line| line =~ /^FT/} * "\n"
|
73
|
+
|
74
|
+
parts = text.split(/^(FT \w+)/)
|
75
|
+
parts.shift
|
76
|
+
|
77
|
+
features = []
|
78
|
+
|
79
|
+
type = nil
|
80
|
+
parts.each do |part|
|
81
|
+
parts
|
82
|
+
if part[0..1] == "FT"
|
83
|
+
type = part.gsub(/FT\s+/,'')
|
84
|
+
next
|
85
|
+
end
|
86
|
+
value = part.gsub("\nFT", '').gsub(/\s+/, ' ')
|
87
|
+
case
|
88
|
+
when value.match(/(\d+) (\d+) (.*)/)
|
89
|
+
start, eend, description = $1, $2, $3
|
90
|
+
description.gsub(/^FT\s+/m, '')
|
91
|
+
when value.match(/(\d+) (\d+)/)
|
92
|
+
start, eend = $1, $2
|
93
|
+
description = nil
|
94
|
+
else
|
95
|
+
Log.debug "Value not understood: #{ value }"
|
96
|
+
end
|
97
|
+
|
98
|
+
|
99
|
+
feature = {
|
100
|
+
:type => type,
|
101
|
+
:start => start.to_i,
|
102
|
+
:end => eend.to_i,
|
103
|
+
:description => description,
|
104
|
+
}
|
105
|
+
|
106
|
+
features << feature
|
107
|
+
end
|
108
|
+
|
109
|
+
features
|
110
|
+
end
|
111
|
+
|
112
|
+
|
56
113
|
def self.variants(protein)
|
57
114
|
url = UNIPROT_TEXT.sub "[PROTEIN]", protein
|
58
115
|
text = Open.read(url)
|
data/share/Ensembl/release_dates
CHANGED
@@ -290,7 +290,8 @@ def coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
|
|
290
290
|
[]
|
291
291
|
end
|
292
292
|
|
293
|
-
transcripts.reject{|transcript| transcript_info[transcript].first.empty?}
|
293
|
+
#transcripts.reject{|transcript| transcript_info[transcript].first.empty?}
|
294
|
+
transcripts
|
294
295
|
end
|
295
296
|
|
296
297
|
def exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
|
@@ -440,6 +441,23 @@ file 'chromosomes' do |t|
|
|
440
441
|
File.open(t.name, 'w') do |f| f.puts goterms end
|
441
442
|
end
|
442
443
|
|
444
|
+
file 'blacklist_chromosomes' => 'chromosomes' do |t|
|
445
|
+
list = TSV.open(t.prerequisites.first).keys.select{|c| c.index('_') or c.index('.')}
|
446
|
+
File.open(t.name, 'w') do |f| f.puts list * "\n" end
|
447
|
+
end
|
448
|
+
|
449
|
+
file 'blacklist_genes' => ['blacklist_chromosomes', 'gene_positions'] do |t|
|
450
|
+
Open.read(t.prerequisites.first)
|
451
|
+
genes = CMD.cmd("grep -f '#{t.prerequisites.first}' | cut -f 1", :in => Open.open(t.prerequisites.last)).read.split("\n").uniq
|
452
|
+
File.open(t.name, 'w') do |f| f.puts genes * "\n" end
|
453
|
+
end
|
454
|
+
|
455
|
+
file 'sanctioned_genes' => ['blacklist_genes', 'gene_positions'] do |t|
|
456
|
+
genes = CMD.cmd("cut -f 1", :in => Open.open(t.prerequisites.last)).read.split("\n").uniq - Open.read(t.prerequisites.first).split("\n")
|
457
|
+
File.open(t.name, 'w') do |f| f.puts genes * "\n" end
|
458
|
+
end
|
459
|
+
|
460
|
+
|
443
461
|
rule /^chromosome_.*/ do |t|
|
444
462
|
chr = t.name.match(/chromosome_(.*)/)[1]
|
445
463
|
|
@@ -450,7 +468,11 @@ rule /^chromosome_.*/ do |t|
|
|
450
468
|
ftp = Net::FTP.new("ftp.ensembl.org")
|
451
469
|
ftp.passive = true
|
452
470
|
ftp.login
|
453
|
-
|
471
|
+
if release.nil? or release == 'current'
|
472
|
+
ftp.chdir("pub/current_fasta/")
|
473
|
+
else
|
474
|
+
ftp.chdir("pub/#{ release }/fasta/")
|
475
|
+
end
|
454
476
|
ftp.chdir($scientific_name.downcase.sub(" ",'_'))
|
455
477
|
ftp.chdir('dna')
|
456
478
|
file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
|
@@ -530,7 +552,7 @@ file 'transcript_sequence' => ["exons", "transcript_exons"] do |t|
|
|
530
552
|
begin
|
531
553
|
p = Organism.root
|
532
554
|
p.replace File.expand_path("./chromosome_#{chr}")
|
533
|
-
p.sub!(
|
555
|
+
p.sub!(%r{.*/organisms/},'share/organisms/')
|
534
556
|
p = Path.setup(p, 'rbbt', Organism)
|
535
557
|
chr_str = p.produce.read
|
536
558
|
rescue Exception
|
@@ -624,10 +646,10 @@ end
|
|
624
646
|
|
625
647
|
file 'transcript_3utr' => ["transcript_5utr"] do |t|
|
626
648
|
end
|
627
|
-
|
628
|
-
file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_sequence"] do |t|
|
649
|
+
file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_phase", "transcript_sequence"] do |t|
|
629
650
|
transcript_5utr = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true)
|
630
651
|
transcript_3utr = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true)
|
652
|
+
transcript_phase = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
|
631
653
|
transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true)
|
632
654
|
transcript_protein = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
|
633
655
|
|
@@ -638,7 +660,12 @@ file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr",
|
|
638
660
|
next if protein.nil? or protein.empty?
|
639
661
|
utr5 = transcript_5utr[transcript]
|
640
662
|
utr3 = transcript_3utr[transcript]
|
641
|
-
|
663
|
+
phase = transcript_phase[transcript] || 0
|
664
|
+
if phase < 0
|
665
|
+
utr5 = - phase if utr5 == 0
|
666
|
+
phase = 0
|
667
|
+
end
|
668
|
+
psequence = Bio::Sequence::NA.new(("N" * phase) << sequence[utr5..sequence.length-utr3-1]).translate
|
642
669
|
protein_sequence[protein]=psequence
|
643
670
|
end
|
644
671
|
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
|
2
|
+
|
3
|
+
require 'rbbt/sources/gscholar'
|
4
|
+
require 'test/unit'
|
5
|
+
|
6
|
+
class TestGScholar < Test::Unit::TestCase
|
7
|
+
def test_citation
|
8
|
+
assert_match GoogleScholar.citation_link("Ten Years of Pathway Analysis: Current Approaches and Outstanding Challenges").to_s, /cites/
|
9
|
+
assert_match GoogleScholar.number_cites("Ten Years of Pathway Analysis: Current Approaches and Outstanding Challenges").to_s, /\d+/
|
10
|
+
end
|
11
|
+
|
12
|
+
end
|
13
|
+
|
14
|
+
|
@@ -61,6 +61,11 @@ class TestOrganism < Test::Unit::TestCase
|
|
61
61
|
assert_equal mutation_19, Organism.liftOver([mutation_18], target_build, source_build).first
|
62
62
|
end
|
63
63
|
|
64
|
+
def test_orhtolog
|
65
|
+
require 'rbbt/entity/gene'
|
66
|
+
assert_equal ["ENSG00000133703"], Gene.setup("Kras", "Associated Gene Name", "Mmu/jun2011").ensembl.ortholog("Hsa/jun2011")
|
67
|
+
end
|
68
|
+
|
64
69
|
#def test_genes_at_chromosome
|
65
70
|
# pos = [12, 117799500]
|
66
71
|
# assert_equal "ENSG00000089250", Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
|
metadata
CHANGED
@@ -1,20 +1,18 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0
|
5
|
-
prerelease:
|
4
|
+
version: 2.1.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Miguel Vazquez
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2013-
|
11
|
+
date: 2013-10-21 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: rbbt-util
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
17
|
- - ! '>='
|
20
18
|
- !ruby/object:Gem::Version
|
@@ -22,7 +20,6 @@ dependencies:
|
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
24
|
- - ! '>='
|
28
25
|
- !ruby/object:Gem::Version
|
@@ -30,7 +27,6 @@ dependencies:
|
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: rbbt-text
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
31
|
- - ! '>='
|
36
32
|
- !ruby/object:Gem::Version
|
@@ -38,7 +34,6 @@ dependencies:
|
|
38
34
|
type: :runtime
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
38
|
- - ! '>='
|
44
39
|
- !ruby/object:Gem::Version
|
@@ -46,7 +41,6 @@ dependencies:
|
|
46
41
|
- !ruby/object:Gem::Dependency
|
47
42
|
name: libxml-ruby
|
48
43
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
44
|
requirements:
|
51
45
|
- - ! '>='
|
52
46
|
- !ruby/object:Gem::Version
|
@@ -54,7 +48,6 @@ dependencies:
|
|
54
48
|
type: :runtime
|
55
49
|
prerelease: false
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
51
|
requirements:
|
59
52
|
- - ! '>='
|
60
53
|
- !ruby/object:Gem::Version
|
@@ -62,7 +55,6 @@ dependencies:
|
|
62
55
|
- !ruby/object:Gem::Dependency
|
63
56
|
name: bio
|
64
57
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
58
|
requirements:
|
67
59
|
- - ! '>='
|
68
60
|
- !ruby/object:Gem::Version
|
@@ -70,7 +62,6 @@ dependencies:
|
|
70
62
|
type: :runtime
|
71
63
|
prerelease: false
|
72
64
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
65
|
requirements:
|
75
66
|
- - ! '>='
|
76
67
|
- !ruby/object:Gem::Version
|
@@ -78,7 +69,6 @@ dependencies:
|
|
78
69
|
- !ruby/object:Gem::Dependency
|
79
70
|
name: mechanize
|
80
71
|
requirement: !ruby/object:Gem::Requirement
|
81
|
-
none: false
|
82
72
|
requirements:
|
83
73
|
- - ! '>='
|
84
74
|
- !ruby/object:Gem::Version
|
@@ -86,7 +76,6 @@ dependencies:
|
|
86
76
|
type: :runtime
|
87
77
|
prerelease: false
|
88
78
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
79
|
requirements:
|
91
80
|
- - ! '>='
|
92
81
|
- !ruby/object:Gem::Version
|
@@ -111,6 +100,7 @@ files:
|
|
111
100
|
- lib/rbbt/sources/biomart.rb
|
112
101
|
- lib/rbbt/sources/cath.rb
|
113
102
|
- lib/rbbt/sources/dbSNP.rb
|
103
|
+
- lib/rbbt/sources/ensembl.rb
|
114
104
|
- lib/rbbt/sources/ensembl_ftp.rb
|
115
105
|
- lib/rbbt/sources/entrez.rb
|
116
106
|
- lib/rbbt/sources/genomes1000.rb
|
@@ -139,37 +129,38 @@ files:
|
|
139
129
|
- test/rbbt/sources/test_go.rb
|
140
130
|
- test/rbbt/sources/test_entrez.rb
|
141
131
|
- test/rbbt/sources/test_biomart.rb
|
132
|
+
- test/rbbt/sources/test_gscholar.rb
|
142
133
|
- test/rbbt/sources/test_organism.rb
|
143
134
|
- test/rbbt/sources/test_pubmed.rb
|
144
135
|
- test/test_helper.rb
|
145
136
|
homepage: http://github.com/mikisvaz/rbbt-sources
|
146
137
|
licenses: []
|
138
|
+
metadata: {}
|
147
139
|
post_install_message:
|
148
140
|
rdoc_options: []
|
149
141
|
require_paths:
|
150
142
|
- lib
|
151
143
|
required_ruby_version: !ruby/object:Gem::Requirement
|
152
|
-
none: false
|
153
144
|
requirements:
|
154
145
|
- - ! '>='
|
155
146
|
- !ruby/object:Gem::Version
|
156
147
|
version: '0'
|
157
148
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
158
|
-
none: false
|
159
149
|
requirements:
|
160
150
|
- - ! '>='
|
161
151
|
- !ruby/object:Gem::Version
|
162
152
|
version: '0'
|
163
153
|
requirements: []
|
164
154
|
rubyforge_project:
|
165
|
-
rubygems_version:
|
155
|
+
rubygems_version: 2.0.3
|
166
156
|
signing_key:
|
167
|
-
specification_version:
|
157
|
+
specification_version: 4
|
168
158
|
summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)
|
169
159
|
test_files:
|
170
160
|
- test/rbbt/sources/test_go.rb
|
171
161
|
- test/rbbt/sources/test_entrez.rb
|
172
162
|
- test/rbbt/sources/test_biomart.rb
|
163
|
+
- test/rbbt/sources/test_gscholar.rb
|
173
164
|
- test/rbbt/sources/test_organism.rb
|
174
165
|
- test/rbbt/sources/test_pubmed.rb
|
175
166
|
- test/test_helper.rb
|