rbbt-sources 2.0.2 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/lib/rbbt/sources/COSMIC.rb +100 -4
- data/lib/rbbt/sources/NCI.rb +1 -1
- data/lib/rbbt/sources/STITCH.rb +5 -5
- data/lib/rbbt/sources/dbSNP.rb +141 -48
- data/lib/rbbt/sources/ensembl.rb +13 -0
- data/lib/rbbt/sources/ensembl_ftp.rb +17 -6
- data/lib/rbbt/sources/entrez.rb +23 -21
- data/lib/rbbt/sources/genomes1000.rb +57 -0
- data/lib/rbbt/sources/go.rb +8 -8
- data/lib/rbbt/sources/organism.rb +5 -1
- data/lib/rbbt/sources/pfam.rb +24 -23
- data/lib/rbbt/sources/pubmed.rb +5 -2
- data/lib/rbbt/sources/tfacts.rb +0 -3
- data/lib/rbbt/sources/uniprot.rb +58 -1
- data/share/Ensembl/release_dates +2 -1
- data/share/install/Organism/organism_helpers.rb +33 -6
- data/test/rbbt/sources/test_gscholar.rb +14 -0
- data/test/rbbt/sources/test_organism.rb +5 -0
- metadata +8 -17
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
NjczYWU0NDMyM2IwZDBlYWFjNGVlNWU4NTg5ODFhMGEzYmEwZGJiYw==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
MjUzNGFjZDJjYzk1ZGJiMjIwNzllMjA4ZDMyODI2YTQzYzhhNzU0Yg==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
NGZiMjgxYzQ0OGY2MzgxYmUzMzEzN2E1NzBjNDc4MjU3YjRmZjM0OTMwMTcz
|
10
|
+
YzFmMTU4Y2FkMzI4OTljZTA2MTJhNmVhZDQzNzA2NDAwNGM4ODc0ZTAwYzEx
|
11
|
+
MDZjYzAzODEyZjc1OTlmODJhYWE5YjE3ZjI3ODNlYWZlODZmYzc=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
NWExMTU0MGMyZWExY2U5NWI2YWJhODYzZDcxMDFkYTc0NWZjN2M3ZDAzZTRh
|
14
|
+
Njk4NTgwMDgwZWJkNjhiNWM3OTA0MDE5Y2IwZjI1OTFhYzU3YmJkZWFhN2M4
|
15
|
+
ZGY2ZTA3NGNjOTM4MDBmZWY4NmQ0ZTMzODc3NmIwMzE1MTM1YjY=
|
data/lib/rbbt/sources/COSMIC.rb
CHANGED
@@ -1,21 +1,31 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rbbt/resource'
|
3
|
+
|
3
4
|
module COSMIC
|
4
5
|
extend Resource
|
5
6
|
self.subdir = "share/databases/COSMIC"
|
6
7
|
|
7
|
-
COSMIC.claim COSMIC.
|
8
|
-
url = "ftp://ftp.sanger.ac.uk/pub/CGP/
|
8
|
+
COSMIC.claim COSMIC.mutations, :proc do
|
9
|
+
url = "ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/CosmicCompleteExport_v64_260313.tsv.gz"
|
9
10
|
|
10
|
-
|
11
|
+
stream = CMD.cmd('awk \'BEGIN{FS="\t"} { if ($12 != "" && $12 != "Mutation ID") { sub($12, "COSM" $12 ":" $4)}; print}\'', :in => Open.open(url), :pipe => true)
|
12
|
+
tsv = TSV.open(stream, :type => :list, :header_hash => "", :key_field => "Mutation ID", :namespace => "Hsa/jun2011")
|
11
13
|
tsv.fields = tsv.fields.collect{|f| f == "Gene name" ? "Associated Gene Name" : f}
|
12
14
|
tsv.add_field "Genomic Mutation" do |mid, values|
|
13
15
|
position = values["Mutation GRCh37 genome position"]
|
14
16
|
cds = values["Mutation CDS"]
|
17
|
+
|
15
18
|
if position.nil? or position.empty?
|
16
19
|
nil
|
17
20
|
else
|
18
21
|
position = position.split("-").first
|
22
|
+
|
23
|
+
chr, pos = position.split(":")
|
24
|
+
chr = "X" if chr == "23"
|
25
|
+
chr = "Y" if chr == "24"
|
26
|
+
chr = "M" if chr == "25"
|
27
|
+
position = [chr, pos ] * ":"
|
28
|
+
|
19
29
|
if cds.nil?
|
20
30
|
position
|
21
31
|
else
|
@@ -52,6 +62,92 @@ module COSMIC
|
|
52
62
|
end
|
53
63
|
end
|
54
64
|
end
|
55
|
-
|
65
|
+
|
66
|
+
tsv.to_s.gsub(/(\d)-(\d)/,'\1:\2')
|
67
|
+
end
|
68
|
+
|
69
|
+
COSMIC.claim COSMIC.mutations_hg18, :proc do |filename|
|
70
|
+
require 'rbbt/sources/organism'
|
71
|
+
file = COSMIC.mutations.open
|
72
|
+
begin
|
73
|
+
|
74
|
+
while (line = file.gets) !~ /Genomic Mutation/; end
|
75
|
+
fields = line[1..-2].split("\t")
|
76
|
+
mutation_pos = fields.index "Genomic Mutation"
|
77
|
+
|
78
|
+
mutations = CMD.cmd("grep -v '^#'|cut -f #{mutation_pos + 1}|sort -u", :in => COSMIC.mutations.open).read.split("\n").select{|m| m.include? ":" }
|
79
|
+
|
80
|
+
translations = Misc.process_to_hash(mutations){|mutations| Organism.liftOver(mutations, "Hsa/jun2011", "Hsa/may2009")}
|
81
|
+
|
82
|
+
File.open(filename, 'w') do |f|
|
83
|
+
f.puts "#: :type=:list#:namespace=Hsa/may2009"
|
84
|
+
f.puts "#" + fields * "\t"
|
85
|
+
while line = file.gets do
|
86
|
+
next if line[0] == "#"[0]
|
87
|
+
line.strip!
|
88
|
+
parts = line.split("\t")
|
89
|
+
parts[mutation_pos] = translations[parts[mutation_pos]]
|
90
|
+
f.puts parts * "\t"
|
91
|
+
end
|
92
|
+
end
|
93
|
+
rescue Exception
|
94
|
+
FileUtils.rm filename if File.exists? filename
|
95
|
+
raise $!
|
96
|
+
ensure
|
97
|
+
file.close
|
98
|
+
end
|
99
|
+
|
100
|
+
nil
|
101
|
+
end
|
102
|
+
|
103
|
+
|
104
|
+
def self.rsid_index(organism, chromosome = nil)
|
105
|
+
build = Organism.hg_build(organism)
|
106
|
+
|
107
|
+
tag = [build, chromosome] * ":"
|
108
|
+
fwt = nil
|
109
|
+
Persist.persist("StaticPosIndex for COSMIC [#{ tag }]", :fwt, :persist => true) do
|
110
|
+
value_size = 0
|
111
|
+
file = COSMIC[build == "hg19" ? "mutations" : "mutations_hg18"]
|
112
|
+
chr_positions = []
|
113
|
+
begin
|
114
|
+
Open.read(CMD.cmd("grep '\t#{chromosome}:'", :in => file.open, :pipe => true)) do |line|
|
115
|
+
next if line[0] == "#"[0]
|
116
|
+
rsid, mutation = line.split("\t").values_at 0, 25
|
117
|
+
next if mutation.nil? or mutation.empty?
|
118
|
+
chr, pos = mutation.split(":")
|
119
|
+
next if chr != chromosome or pos.nil? or pos.empty?
|
120
|
+
chr_positions << [rsid, pos.to_i]
|
121
|
+
value_size = rsid.length if rsid.length > value_size
|
122
|
+
end
|
123
|
+
rescue
|
124
|
+
end
|
125
|
+
fwt = FixWidthTable.new :memory, value_size
|
126
|
+
fwt.add_point(chr_positions)
|
127
|
+
fwt
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
def self.mutation_index(organism)
|
132
|
+
build = Organism.hg_build(organism)
|
133
|
+
file = COSMIC[build == "hg19" ? "mutations" : "mutations_hg18"]
|
134
|
+
@mutation_index ||= {}
|
135
|
+
@mutation_index[build] ||= file.tsv :persist => true, :fields => ["Genomic Mutation"], :type => :single, :persist => true
|
136
|
+
end
|
137
|
+
|
138
|
+
|
139
|
+
end
|
140
|
+
|
141
|
+
if defined? Entity
|
142
|
+
if defined? Gene and Entity === Gene
|
143
|
+
module Gene
|
144
|
+
property :COSMIC_rsids => :single2array do
|
145
|
+
COSMIC.rsid_index(organism, chromosome)[self.chr_range]
|
146
|
+
end
|
147
|
+
|
148
|
+
property :COSMIC_mutations => :single2array do
|
149
|
+
GenomicMutation.setup(COSMIC.mutation_index(organism).values_at(*self.COSMIC_rsids).uniq, "COSMIC mutations over #{self.name || self}", organism, false)
|
150
|
+
end
|
151
|
+
end
|
56
152
|
end
|
57
153
|
end
|
data/lib/rbbt/sources/NCI.rb
CHANGED
data/lib/rbbt/sources/STITCH.rb
CHANGED
@@ -5,11 +5,11 @@ module STITCH
|
|
5
5
|
extend Resource
|
6
6
|
self.subdir = "share/databases/STITCH"
|
7
7
|
|
8
|
-
STITCH.claim STITCH.source.chemical_chemical
|
9
|
-
STITCH.claim STITCH.source.protein_chemical
|
10
|
-
STITCH.claim STITCH.source.actions
|
11
|
-
STITCH.claim STITCH.source.aliases
|
12
|
-
STITCH.claim STITCH.source.sources
|
8
|
+
STITCH.claim STITCH.source.chemical_chemical, :url, "http://stitch.embl.de/download/chemical_chemical.links.detailed.v3.1.tsv.gz"
|
9
|
+
STITCH.claim STITCH.source.protein_chemical, :url, "http://stitch.embl.de/download/protein_chemical.links.detailed.v3.1.tsv.gz"
|
10
|
+
STITCH.claim STITCH.source.actions, :url, "http://stitch.embl.de/download/actions.v3.1.tsv.gz"
|
11
|
+
STITCH.claim STITCH.source.aliases, :url, "http://stitch.embl.de/download/chemical.aliases.v3.1.tsv.gz"
|
12
|
+
STITCH.claim STITCH.source.sources, :url, "http://stitch.embl.de/download/chemical.sources.v3.1.tsv.gz"
|
13
13
|
|
14
14
|
Organism.installable_organisms.each do |organism|
|
15
15
|
STITCH.claim STITCH.chemical_protein(organism), :proc do
|
data/lib/rbbt/sources/dbSNP.rb
CHANGED
@@ -10,25 +10,27 @@ module DbSNP
|
|
10
10
|
URL = "ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/VCF/common_all.vcf.gz"
|
11
11
|
|
12
12
|
DbSNP.claim DbSNP.mutations_ncbi, :proc do
|
13
|
-
tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation"], :type => :
|
13
|
+
tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation"], :type => :flat)
|
14
14
|
file = Open.open(URL, :nocache => true)
|
15
15
|
while line = file.gets do
|
16
16
|
next if line[0] == "#"[0]
|
17
17
|
chr, position, id, ref, alt = line.split "\t"
|
18
|
-
|
19
|
-
|
20
|
-
alt[0]
|
18
|
+
|
19
|
+
mutations = alt.split(",").collect do |a|
|
20
|
+
if alt[0] == ref[0]
|
21
|
+
alt[0] = '+'[0]
|
22
|
+
end
|
23
|
+
[chr, position, alt] * ":"
|
21
24
|
end
|
22
|
-
mutation = [chr, position, alt] * ":"
|
23
25
|
|
24
26
|
tsv.namespace = "Hsa/may2012"
|
25
|
-
tsv[id] =
|
27
|
+
tsv[id] = mutations
|
26
28
|
end
|
27
29
|
|
28
30
|
tsv.to_s
|
29
31
|
end
|
30
32
|
|
31
|
-
DbSNP.claim DbSNP.
|
33
|
+
DbSNP.claim DbSNP.rsids, :proc do |filename|
|
32
34
|
ftp = Net::FTP.new('ftp.broadinstitute.org')
|
33
35
|
ftp.passive = true
|
34
36
|
ftp.login('gsapubftp-anonymous', 'devnull@nomail.org')
|
@@ -37,65 +39,156 @@ module DbSNP
|
|
37
39
|
tmpfile = TmpFile.tmp_file + '.gz'
|
38
40
|
ftp.getbinaryfile('dbsnp_137.hg19.vcf.gz', tmpfile, 1024)
|
39
41
|
|
40
|
-
tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation", "GMAF", "G5", "G5A", "dbSNP Build ID"], :type => :list)
|
41
42
|
file = Open.open(tmpfile, :nocache => true)
|
42
|
-
|
43
|
-
|
43
|
+
begin
|
44
|
+
File.open(filename, 'w') do |f|
|
45
|
+
f.puts "#: :type=:list#:namespace=Hsa/may2012"
|
46
|
+
f.puts "#" + ["RS ID", "GMAF", "G5", "G5A", "dbSNP Build ID"] * "\t"
|
47
|
+
while line = file.gets do
|
48
|
+
next if line[0] == "#"[0]
|
49
|
+
|
50
|
+
chr, position, id, ref, muts, qual, filter, info = line.split "\t"
|
51
|
+
|
52
|
+
g5 = g5a = dbsnp_build_id = gmaf = nil
|
53
|
+
|
54
|
+
gmaf = $1 if info =~ /GMAF=([0-9.]+)/
|
55
|
+
g5 = true if info =~ /\bG5\b/
|
56
|
+
g5a = true if info =~ /\bG5A\b/
|
57
|
+
dbsnp_build_id = $1 if info =~ /dbSNPBuildID=(\d+)/
|
44
58
|
|
45
|
-
|
46
|
-
|
47
|
-
chr.sub!('chr', '')
|
48
|
-
|
49
|
-
mut = mut.split(",").first
|
50
|
-
case
|
51
|
-
when ref == '-'
|
52
|
-
mut = "+" << mut
|
53
|
-
when mut == '-'
|
54
|
-
mut = "-" * ref.length
|
55
|
-
when (mut.length > 1 and ref.length > 1)
|
56
|
-
mut = '-' * ref.length << mut
|
57
|
-
when (mut.length > 1 and ref.length == 1 and mut.index(ref) == 0)
|
58
|
-
mut = '+' << mut[1..-1]
|
59
|
-
when (mut.length == 1 and ref.length > 1 and ref.index(mut) == 0)
|
60
|
-
mut = '-' * (ref.length - 1)
|
61
|
-
else
|
62
|
-
mut = mut
|
59
|
+
f.puts [id, gmaf, g5, g5a, dbsnp_build_id] * "\t"
|
60
|
+
end
|
63
61
|
end
|
62
|
+
rescue Exception
|
63
|
+
FileUtils.rm filename if File.exists? filename
|
64
|
+
raise $!
|
65
|
+
ensure
|
66
|
+
file.close
|
67
|
+
FileUtils.rm tmpfile
|
68
|
+
end
|
64
69
|
|
65
|
-
|
66
|
-
|
67
|
-
gmaf = $1 if info =~ /GMAF=([0-9.]+)/
|
68
|
-
g5 = true if info =~ /\bG5\b/
|
69
|
-
g5a = true if info =~ /\bG5A\b/
|
70
|
-
dbsnp_build_id = $1 if info =~ /dbSNPBuildID=(\d+)/
|
70
|
+
nil
|
71
|
+
end
|
71
72
|
|
72
|
-
|
73
|
+
DbSNP.claim DbSNP.mutations, :proc do |filename|
|
74
|
+
ftp = Net::FTP.new('ftp.broadinstitute.org')
|
75
|
+
ftp.passive = true
|
76
|
+
ftp.login('gsapubftp-anonymous', 'devnull@nomail.org')
|
77
|
+
ftp.chdir('/bundle/2.3/hg19')
|
73
78
|
|
74
|
-
|
79
|
+
tmpfile = TmpFile.tmp_file + '.gz'
|
80
|
+
ftp.getbinaryfile('dbsnp_137.hg19.vcf.gz', tmpfile, 1024)
|
75
81
|
|
76
|
-
|
77
|
-
|
82
|
+
file = Open.open(tmpfile, :nocache => true)
|
83
|
+
begin
|
84
|
+
File.open(filename, 'w') do |f|
|
85
|
+
f.puts "#: :type=:flat#:namespace=Hsa/may2012"
|
86
|
+
f.puts "#" + ["RS ID", "Genomic Mutation"] * "\t"
|
87
|
+
while line = file.gets do
|
88
|
+
next if line[0] == "#"[0]
|
78
89
|
|
79
|
-
|
90
|
+
chr, position, id, ref, muts, qual, filter, info = line.split "\t"
|
80
91
|
|
81
|
-
|
92
|
+
chr.sub!('chr', '')
|
93
|
+
|
94
|
+
position, muts = Misc.correct_vcf_mutation(position.to_i, ref, muts)
|
95
|
+
|
96
|
+
mutations = muts.collect{|mut| [chr, position, mut] * ":" }
|
97
|
+
|
98
|
+
f.puts ([id] + mutations) * "\t"
|
99
|
+
end
|
100
|
+
end
|
101
|
+
rescue Exception
|
102
|
+
FileUtils.rm filename if File.exists? filename
|
103
|
+
raise $!
|
104
|
+
ensure
|
105
|
+
file.close
|
106
|
+
FileUtils.rm tmpfile
|
107
|
+
end
|
108
|
+
|
109
|
+
nil
|
82
110
|
end
|
83
111
|
|
84
|
-
DbSNP.claim DbSNP.mutations_hg18, :proc do
|
112
|
+
DbSNP.claim DbSNP.mutations_hg18, :proc do |filename|
|
85
113
|
require 'rbbt/sources/organism'
|
86
114
|
|
87
|
-
|
88
|
-
|
89
|
-
mutations = hg19_tsv.values
|
115
|
+
mutations = CMD.cmd("grep -v '^#'|cut -f 2|sort -u", :in => DbSNP.mutations.open).read.split("\n").collect{|l| l.split("|")}.flatten
|
90
116
|
|
91
117
|
translations = Misc.process_to_hash(mutations){|mutations| Organism.liftOver(mutations, "Hsa/jun2011", "Hsa/may2009")}
|
118
|
+
begin
|
119
|
+
file = Open.open(DbSNP.mutations.find, :nocache => true)
|
120
|
+
File.open(filename, 'w') do |f|
|
121
|
+
f.puts "#: :type=:flat#:namespace=Hsa/may2009"
|
122
|
+
f.puts "#" + ["RS ID", "Genomic Mutation"] * "\t"
|
123
|
+
while line = file.gets do
|
124
|
+
next if line[0] == "#"[0]
|
125
|
+
parts = line.split("\t")
|
126
|
+
parts[1..-1] = parts[1..-1].collect{|p| translations[p]} * "|"
|
127
|
+
f.puts parts * "\t"
|
128
|
+
end
|
129
|
+
end
|
130
|
+
rescue Exception
|
131
|
+
FileUtils.rm filename if File.exists? filename
|
132
|
+
raise $!
|
133
|
+
ensure
|
134
|
+
file.close
|
135
|
+
end
|
136
|
+
|
137
|
+
nil
|
138
|
+
end
|
92
139
|
|
93
|
-
|
94
|
-
|
140
|
+
def self.rsid_index(organism, chromosome = nil)
|
141
|
+
build = Organism.hg_build(organism)
|
142
|
+
|
143
|
+
tag = [build, chromosome] * ":"
|
144
|
+
Persist.persist("StaticPosIndex for dbSNP [#{ tag }]", :fwt, :persist => true) do
|
145
|
+
value_size = 0
|
146
|
+
file = DbSNP[build == "hg19" ? "mutations" : "mutations_hg18"]
|
147
|
+
chr_positions = []
|
148
|
+
Open.read(CMD.cmd("grep '\t#{chromosome}:'", :in => file.open, :pipe => true)) do |line|
|
149
|
+
next if line[0] == "#"[0]
|
150
|
+
rsid, mutation = line.split("\t")
|
151
|
+
next if mutation.nil? or mutation.empty?
|
152
|
+
chr, pos = mutation.split(":")
|
153
|
+
next if chr != chromosome or pos.nil? or pos.empty?
|
154
|
+
chr_positions << [rsid, pos.to_i]
|
155
|
+
value_size = rsid.length if rsid.length > value_size
|
156
|
+
end
|
157
|
+
fwt = FixWidthTable.new :memory, value_size
|
158
|
+
fwt.add_point(chr_positions)
|
159
|
+
fwt
|
95
160
|
end
|
161
|
+
end
|
96
162
|
|
97
|
-
|
163
|
+
def self.mutation_index(organism)
|
164
|
+
build = Organism.hg_build(organism)
|
165
|
+
file = DbSNP[build == "hg19" ? "mutations" : "mutations_hg18"]
|
166
|
+
@mutation_index ||= {}
|
167
|
+
@mutation_index[build] ||= file.tsv :persist => true, :fields => ["Genomic Mutation"], :type => :single, :persist => true
|
168
|
+
end
|
169
|
+
|
170
|
+
end
|
171
|
+
|
172
|
+
if defined? Entity
|
173
|
+
if defined? Gene and Entity === Gene
|
174
|
+
module Gene
|
175
|
+
property :dbSNP_rsids => :single2array do
|
176
|
+
DbSNP.rsid_index(organism, chromosome)[self.chr_range]
|
177
|
+
end
|
178
|
+
|
179
|
+
property :dbSNP_mutations => :single2array do
|
180
|
+
GenomicMutation.setup(DbSNP.mutation_index(organism).values_at(*self.dbSNP_rsids).compact.flatten.uniq, "dbSNP mutations over #{self.name || self}", organism, true)
|
181
|
+
end
|
182
|
+
end
|
183
|
+
end
|
184
|
+
|
185
|
+
if defined? GenomicMutation and Entity === GenomicMutation
|
186
|
+
module GenomicMutation
|
187
|
+
property :dbSNP => :array2single do
|
188
|
+
dbSNP.mutations.tsv(:persist => true, :key_field => "Genomic Mutation", :fields => ["RS ID"], :type => :single).values_at *self
|
189
|
+
end
|
190
|
+
end
|
98
191
|
|
99
|
-
tsv.to_s
|
100
192
|
end
|
101
193
|
end
|
194
|
+
|
@@ -1,24 +1,31 @@
|
|
1
1
|
require 'rbbt/util/open'
|
2
2
|
require 'rbbt/sources/organism'
|
3
3
|
require 'rbbt/tsv'
|
4
|
+
require 'rbbt/sources/ensembl'
|
4
5
|
require 'net/ftp'
|
5
6
|
|
6
7
|
module Ensembl
|
7
|
-
|
8
8
|
|
9
|
-
def self.releases
|
10
|
-
@releases ||= Rbbt.share.Ensembl.release_dates.find.tsv :key_field => "build"
|
11
|
-
end
|
12
|
-
|
13
9
|
module FTP
|
14
10
|
|
15
11
|
SERVER = "ftp.ensembl.org"
|
16
12
|
|
13
|
+
def self.mysql_path(release)
|
14
|
+
end
|
15
|
+
|
17
16
|
def self.ftp_name_for(organism)
|
18
17
|
code, build = organism.split "/"
|
19
18
|
build ||= "current"
|
20
19
|
|
21
20
|
if build.to_s == "current"
|
21
|
+
release = 'current'
|
22
|
+
name = Organism.scientific_name(organism)
|
23
|
+
ftp = Net::FTP.new(Ensembl::FTP::SERVER)
|
24
|
+
ftp.passive = true
|
25
|
+
ftp.login
|
26
|
+
ftp.chdir(File.join('pub', 'current_mysql'))
|
27
|
+
file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").collect{|l| l.split(" ").last}.last
|
28
|
+
ftp.close
|
22
29
|
else
|
23
30
|
release = Ensembl.releases[build]
|
24
31
|
name = Organism.scientific_name(organism)
|
@@ -34,7 +41,11 @@ module Ensembl
|
|
34
41
|
|
35
42
|
def self.ftp_directory_for(organism)
|
36
43
|
release, ftp_name = ftp_name_for(organism)
|
37
|
-
|
44
|
+
if release == 'current'
|
45
|
+
File.join('/pub/', 'current_mysql', ftp_name)
|
46
|
+
else
|
47
|
+
File.join('/pub/', release, 'mysql', ftp_name)
|
48
|
+
end
|
38
49
|
end
|
39
50
|
|
40
51
|
def self.base_url(organism)
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -139,10 +139,12 @@ module Entrez
|
|
139
139
|
else
|
140
140
|
filename = gene_filename geneid
|
141
141
|
|
142
|
+
|
142
143
|
if FileCache.found(filename)
|
143
144
|
return Gene.new(Open.read(FileCache.path(filename)))
|
144
145
|
else
|
145
146
|
xml = get_online(geneid)
|
147
|
+
|
146
148
|
FileCache.add(filename, xml) unless FileCache.found(filename)
|
147
149
|
|
148
150
|
return Gene.new(xml)
|
@@ -150,30 +152,30 @@ module Entrez
|
|
150
152
|
end
|
151
153
|
end
|
152
154
|
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
return 0
|
166
|
-
end
|
167
|
-
else
|
155
|
+
# Counts the words in common between a chunk of text and the text
|
156
|
+
# found in Entrez Gene for that particular gene. The +gene+ may be a
|
157
|
+
# gene identifier or a Gene class instance.
|
158
|
+
def self.gene_text_similarity(gene, text)
|
159
|
+
|
160
|
+
case
|
161
|
+
when Entrez::Gene === gene
|
162
|
+
gene_text = gene.text
|
163
|
+
when String === gene || Fixnum === gene
|
164
|
+
begin
|
165
|
+
gene_text = get_gene(gene).text
|
166
|
+
rescue CMD::CMDError
|
168
167
|
return 0
|
169
168
|
end
|
169
|
+
else
|
170
|
+
return 0
|
171
|
+
end
|
170
172
|
|
171
|
-
|
172
|
-
|
173
|
+
gene_words = gene_text.words.to_set
|
174
|
+
text_words = text.words.to_set
|
173
175
|
|
174
|
-
|
176
|
+
return 0 if gene_words.empty? || text_words.empty?
|
175
177
|
|
176
|
-
|
177
|
-
|
178
|
-
|
178
|
+
common = gene_words.intersection(text_words)
|
179
|
+
common.length / (gene_words.length + text_words.length).to_f
|
180
|
+
end
|
179
181
|
end
|
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rbbt/util/open'
|
3
3
|
require 'rbbt/resource'
|
4
|
+
require 'rbbt/entity/gene'
|
4
5
|
|
5
6
|
module Genomes1000
|
6
7
|
extend Resource
|
@@ -49,4 +50,60 @@ module Genomes1000
|
|
49
50
|
tsv.to_s
|
50
51
|
end
|
51
52
|
|
53
|
+
def self.rsid_index(organism, chromosome = nil)
|
54
|
+
build = Organism.hg_build(organism)
|
55
|
+
|
56
|
+
tag = [build, chromosome] * ":"
|
57
|
+
Persist.persist("StaticPosIndex for Genomes1000 [#{ tag }]", :fwt, :persist => true) do
|
58
|
+
value_size = 0
|
59
|
+
file = Genomes1000[build == "hg19" ? "mutations" : "mutations_hg18"]
|
60
|
+
chr_positions = []
|
61
|
+
Open.read(CMD.cmd("grep '\t#{chromosome}:'", :in => file.open, :pipe => true)) do |line|
|
62
|
+
next if line[0] == "#"[0]
|
63
|
+
rsid, mutation = line.split("\t")
|
64
|
+
next if mutation.nil? or mutation.empty?
|
65
|
+
chr, pos = mutation.split(":")
|
66
|
+
next if chr != chromosome or pos.nil? or pos.empty?
|
67
|
+
chr_positions << [rsid, pos.to_i]
|
68
|
+
value_size = rsid.length if rsid.length > value_size
|
69
|
+
end
|
70
|
+
fwt = FixWidthTable.new :memory, value_size
|
71
|
+
fwt.add_point(chr_positions)
|
72
|
+
fwt
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
def self.mutation_index(organism)
|
77
|
+
build = Organism.hg_build(organism)
|
78
|
+
file = Genomes1000[build == "hg19" ? "mutations" : "mutations_hg18"]
|
79
|
+
@mutation_index ||= {}
|
80
|
+
@mutation_index[build] ||= file.tsv :persist => true, :fields => ["Genomic Mutation"], :type => :single, :persist => true
|
81
|
+
end
|
82
|
+
|
83
|
+
|
52
84
|
end
|
85
|
+
|
86
|
+
|
87
|
+
if defined? Entity
|
88
|
+
if defined? Gene and Entity === Gene
|
89
|
+
module Gene
|
90
|
+
property :genomes_1000_rsids => :single2array do
|
91
|
+
Genomes1000.rsid_index(organism, chromosome)[self.chr_range]
|
92
|
+
end
|
93
|
+
|
94
|
+
property :genomes_1000_mutations => :single2array do
|
95
|
+
GenomicMutation.setup(Genomes1000.mutation_index(organism).values_at(*self.genomes_1000_rsids).uniq, "1000 Genomes mutations over #{self.name || self}", organism, true)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
if defined? GenomicMutation and Entity === GenomicMutation
|
101
|
+
module GenomicMutation
|
102
|
+
property :genomes_1000 => :array2single do
|
103
|
+
Genomes1000.mutations.tsv(:persist => true, :key_field => "Genomic Mutation", :fields => ["Variant ID"], :type => :single).values_at *self
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -18,7 +18,7 @@ module GO
|
|
18
18
|
# only the name field is used.
|
19
19
|
def self.init
|
20
20
|
Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
|
21
|
-
info.serializer = :marshal if info.respond_to? :serializer
|
21
|
+
info.serializer = :marshal if info.respond_to? :serializer
|
22
22
|
Rbbt.share.databases.GO.gene_ontology.read.split(/\[Term\]/).each{|term|
|
23
23
|
term_info = {}
|
24
24
|
|
@@ -37,11 +37,11 @@ module GO
|
|
37
37
|
}
|
38
38
|
|
39
39
|
info
|
40
|
-
end
|
40
|
+
end.tap{|o| o.unnamed = true}
|
41
41
|
end
|
42
42
|
|
43
43
|
def self.info
|
44
|
-
|
44
|
+
@@info ||= self.init
|
45
45
|
end
|
46
46
|
|
47
47
|
def self.goterms
|
@@ -94,7 +94,7 @@ if defined? Entity
|
|
94
94
|
self.annotation :organism
|
95
95
|
|
96
96
|
property :name => :array2single do
|
97
|
-
|
97
|
+
GO.id2name(self)
|
98
98
|
end
|
99
99
|
|
100
100
|
property :genes => :array2single do |*args|
|
@@ -117,19 +117,19 @@ if defined? Entity
|
|
117
117
|
if defined? Gene and Entity === Gene
|
118
118
|
module Gene
|
119
119
|
property :go_terms => :array2single do
|
120
|
-
@go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).
|
120
|
+
@go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).chunked_values_at self.ensembl
|
121
121
|
end
|
122
122
|
|
123
123
|
property :go_bp_terms => :array2single do
|
124
|
-
@go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).
|
124
|
+
@go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).chunked_values_at self.ensembl
|
125
125
|
end
|
126
126
|
|
127
127
|
property :go_cc_terms => :array2single do
|
128
|
-
@go_cc_terms ||= Organism.gene_go_cc(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).
|
128
|
+
@go_cc_terms ||= Organism.gene_go_cc(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).chunked_values_at self.ensembl
|
129
129
|
end
|
130
130
|
|
131
131
|
property :go_mf_terms => :array2single do
|
132
|
-
@go_mf_terms ||= Organism.gene_go_mf(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).
|
132
|
+
@go_mf_terms ||= Organism.gene_go_mf(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).chunked_values_at self.ensembl
|
133
133
|
end
|
134
134
|
|
135
135
|
end
|
@@ -46,7 +46,11 @@ module Organism
|
|
46
46
|
return positions
|
47
47
|
end
|
48
48
|
|
49
|
-
positions_bed = positions.collect{|position|
|
49
|
+
positions_bed = positions.collect{|position|
|
50
|
+
chr, pos = position.split(":").values_at(0,1)
|
51
|
+
["chr" << chr, pos.to_i-1, pos, position] * "\t"
|
52
|
+
} * "\n" + "\n"
|
53
|
+
|
50
54
|
new_positions = {}
|
51
55
|
|
52
56
|
TmpFile.with_file(positions_bed) do |source_bed|
|
data/lib/rbbt/sources/pfam.rb
CHANGED
@@ -4,6 +4,29 @@ require 'rbbt/resource'
|
|
4
4
|
require 'rbbt/entity'
|
5
5
|
require 'rbbt/sources/InterPro'
|
6
6
|
|
7
|
+
InterPro.claim InterPro.pfam_names, :proc do
|
8
|
+
pfam_domains = Pfam.domains.read.split("\n").collect{|l| l.split("\t").first}.compact.flatten
|
9
|
+
tsv = nil
|
10
|
+
TmpFile.with_file(pfam_domains * "\n") do |tmpfile|
|
11
|
+
tsv = TSV.open(CMD.cmd("cut -f 4,3 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true), :key_field => 1, :fields => [0], :type => :single)
|
12
|
+
end
|
13
|
+
tsv.key_field = "InterPro ID"
|
14
|
+
tsv.fields = ["Domain Name"]
|
15
|
+
tsv.to_s
|
16
|
+
end
|
17
|
+
|
18
|
+
InterPro.claim InterPro.pfam_equivalences, :proc do
|
19
|
+
pfam_domains = Pfam.domains.read.split("\n").collect{|l| l.split("\t").first}.compact.flatten
|
20
|
+
tsv = nil
|
21
|
+
TmpFile.with_file(pfam_domains * "\n") do |tmpfile|
|
22
|
+
tsv = TSV.open(CMD.cmd("cut -f 2,4 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true), :key_field => 0, :fields => [1], :type => :single)
|
23
|
+
end
|
24
|
+
tsv.key_field = "InterPro ID"
|
25
|
+
tsv.fields = ["Pfam Domain"]
|
26
|
+
tsv.to_s
|
27
|
+
end
|
28
|
+
|
29
|
+
|
7
30
|
module Pfam
|
8
31
|
extend Resource
|
9
32
|
self.subdir = "share/databases/Pfam"
|
@@ -14,7 +37,7 @@ module Pfam
|
|
14
37
|
tsv.to_s
|
15
38
|
end
|
16
39
|
|
17
|
-
NAMES_FILE = InterPro.pfam_names.
|
40
|
+
NAMES_FILE = InterPro.pfam_names.produce
|
18
41
|
|
19
42
|
def self.name_index
|
20
43
|
@name_index ||= TSV.open NAMES_FILE, :single, :unnamed => true
|
@@ -31,28 +54,6 @@ module InterPro
|
|
31
54
|
end
|
32
55
|
end
|
33
56
|
|
34
|
-
InterPro.claim InterPro.pfam_names, :proc do
|
35
|
-
pfam_domains = Pfam.domains.read.split("\n").collect{|l| l.split("\t").first}.compact.flatten
|
36
|
-
tsv = nil
|
37
|
-
TmpFile.with_file(pfam_domains * "\n") do |tmpfile|
|
38
|
-
tsv = TSV.open(CMD.cmd("cut -f 4,3 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true), :key_field => 1, :fields => [0], :type => :single)
|
39
|
-
end
|
40
|
-
tsv.key_field = "InterPro ID"
|
41
|
-
tsv.fields = ["Domain Name"]
|
42
|
-
tsv.to_s
|
43
|
-
end
|
44
|
-
|
45
|
-
InterPro.claim InterPro.pfam_equivalences, :proc do
|
46
|
-
pfam_domains = Pfam.domains.read.split("\n").collect{|l| l.split("\t").first}.compact.flatten
|
47
|
-
tsv = nil
|
48
|
-
TmpFile.with_file(pfam_domains * "\n") do |tmpfile|
|
49
|
-
tsv = TSV.open(CMD.cmd("cut -f 2,4 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true), :key_field => 0, :fields => [1], :type => :single)
|
50
|
-
end
|
51
|
-
tsv.key_field = "InterPro ID"
|
52
|
-
tsv.fields = ["Pfam Domain"]
|
53
|
-
tsv.to_s
|
54
|
-
end
|
55
|
-
|
56
57
|
|
57
58
|
if defined? Entity
|
58
59
|
module PfamDomain
|
data/lib/rbbt/sources/pubmed.rb
CHANGED
@@ -54,6 +54,7 @@ module PubMed
|
|
54
54
|
[:year , "Journal/JournalIssue/PubDate/Year"],
|
55
55
|
[:month , "Journal/JournalIssue/PubDate/Month"],
|
56
56
|
[:pages , "Pagination/MedlinePgn"],
|
57
|
+
[:author , "AuthorList/Author"],
|
57
58
|
[:abstract , "Abstract/AbstractText"],
|
58
59
|
]
|
59
60
|
|
@@ -154,7 +155,7 @@ module PubMed
|
|
154
155
|
end
|
155
156
|
end
|
156
157
|
|
157
|
-
text
|
158
|
+
Misc.fixutf8(text)
|
158
159
|
end
|
159
160
|
|
160
161
|
def bibtex
|
@@ -187,7 +188,9 @@ module PubMed
|
|
187
188
|
|
188
189
|
# Join the text from title and abstract
|
189
190
|
def text
|
190
|
-
[title, abstract].join("\n")
|
191
|
+
text = [title, abstract].join("\n")
|
192
|
+
|
193
|
+
Misc.fixutf8(text)
|
191
194
|
end
|
192
195
|
end
|
193
196
|
|
data/lib/rbbt/sources/tfacts.rb
CHANGED
@@ -50,16 +50,13 @@ if defined? Entity and defined? Gene and Entity === Gene
|
|
50
50
|
tfs = TFacts.targets.keys
|
51
51
|
self.name.collect{|gene| tfs.include? gene}
|
52
52
|
end
|
53
|
-
persist :_ary_is_transcription_factor?
|
54
53
|
|
55
54
|
property :transcription_regulators => :array2single do
|
56
55
|
Gene.setup(TFacts.regulators.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
|
57
56
|
end
|
58
|
-
persist :_ary_transcription_regulators
|
59
57
|
|
60
58
|
property :transcription_targets => :array2single do
|
61
59
|
Gene.setup(TFacts.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
|
62
60
|
end
|
63
|
-
persist :_ary_transcription_targets
|
64
61
|
end
|
65
62
|
end
|
data/lib/rbbt/sources/uniprot.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'rbbt'
|
1
2
|
require 'rbbt/util/open'
|
2
3
|
require 'rbbt/resource'
|
3
4
|
require 'rbbt/sources/cath'
|
@@ -33,6 +34,7 @@ module UniProt
|
|
33
34
|
|
34
35
|
|
35
36
|
UNIPROT_TEXT="http://www.uniprot.org/uniprot/[PROTEIN].txt"
|
37
|
+
UNIPROT_FASTA="http://www.uniprot.org/uniprot/[PROTEIN].fasta"
|
36
38
|
def self.pdbs(protein)
|
37
39
|
url = UNIPROT_TEXT.sub "[PROTEIN]", protein
|
38
40
|
text = Open.read(url)
|
@@ -44,15 +46,70 @@ module UniProt
|
|
44
46
|
id, method, resolution, region = $1.split(";").collect{|v| v.strip}
|
45
47
|
begin
|
46
48
|
chains, start, eend = region.match(/(\w+)=(\d+)-(\d+)/).values_at(1,2,3)
|
49
|
+
start = start.to_i
|
50
|
+
eend = eend.to_i
|
51
|
+
start, eend = eend, start if start > eend
|
47
52
|
rescue
|
48
53
|
Log.warn("Error process Uniprot PDB line: #{line}")
|
49
54
|
next
|
50
55
|
end
|
51
|
-
pdb[id.downcase] = {:method => method, :resolution => resolution, :region => (start
|
56
|
+
pdb[id.downcase] = {:method => method, :resolution => resolution, :region => (start..eend), :chains => chains}
|
52
57
|
}
|
53
58
|
pdb
|
54
59
|
end
|
55
60
|
|
61
|
+
def self.sequence(protein)
|
62
|
+
url = UNIPROT_FASTA.sub "[PROTEIN]", protein
|
63
|
+
text = Open.read(url)
|
64
|
+
|
65
|
+
text.split(/\n/).select{|line| line !~ /^>/} * ""
|
66
|
+
end
|
67
|
+
|
68
|
+
def self.features(protein)
|
69
|
+
url = UNIPROT_TEXT.sub "[PROTEIN]", protein
|
70
|
+
text = Open.read(url)
|
71
|
+
|
72
|
+
text = text.split(/\n/).select{|line| line =~ /^FT/} * "\n"
|
73
|
+
|
74
|
+
parts = text.split(/^(FT \w+)/)
|
75
|
+
parts.shift
|
76
|
+
|
77
|
+
features = []
|
78
|
+
|
79
|
+
type = nil
|
80
|
+
parts.each do |part|
|
81
|
+
parts
|
82
|
+
if part[0..1] == "FT"
|
83
|
+
type = part.gsub(/FT\s+/,'')
|
84
|
+
next
|
85
|
+
end
|
86
|
+
value = part.gsub("\nFT", '').gsub(/\s+/, ' ')
|
87
|
+
case
|
88
|
+
when value.match(/(\d+) (\d+) (.*)/)
|
89
|
+
start, eend, description = $1, $2, $3
|
90
|
+
description.gsub(/^FT\s+/m, '')
|
91
|
+
when value.match(/(\d+) (\d+)/)
|
92
|
+
start, eend = $1, $2
|
93
|
+
description = nil
|
94
|
+
else
|
95
|
+
Log.debug "Value not understood: #{ value }"
|
96
|
+
end
|
97
|
+
|
98
|
+
|
99
|
+
feature = {
|
100
|
+
:type => type,
|
101
|
+
:start => start.to_i,
|
102
|
+
:end => eend.to_i,
|
103
|
+
:description => description,
|
104
|
+
}
|
105
|
+
|
106
|
+
features << feature
|
107
|
+
end
|
108
|
+
|
109
|
+
features
|
110
|
+
end
|
111
|
+
|
112
|
+
|
56
113
|
def self.variants(protein)
|
57
114
|
url = UNIPROT_TEXT.sub "[PROTEIN]", protein
|
58
115
|
text = Open.read(url)
|
data/share/Ensembl/release_dates
CHANGED
@@ -290,7 +290,8 @@ def coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
|
|
290
290
|
[]
|
291
291
|
end
|
292
292
|
|
293
|
-
transcripts.reject{|transcript| transcript_info[transcript].first.empty?}
|
293
|
+
#transcripts.reject{|transcript| transcript_info[transcript].first.empty?}
|
294
|
+
transcripts
|
294
295
|
end
|
295
296
|
|
296
297
|
def exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
|
@@ -440,6 +441,23 @@ file 'chromosomes' do |t|
|
|
440
441
|
File.open(t.name, 'w') do |f| f.puts goterms end
|
441
442
|
end
|
442
443
|
|
444
|
+
file 'blacklist_chromosomes' => 'chromosomes' do |t|
|
445
|
+
list = TSV.open(t.prerequisites.first).keys.select{|c| c.index('_') or c.index('.')}
|
446
|
+
File.open(t.name, 'w') do |f| f.puts list * "\n" end
|
447
|
+
end
|
448
|
+
|
449
|
+
file 'blacklist_genes' => ['blacklist_chromosomes', 'gene_positions'] do |t|
|
450
|
+
Open.read(t.prerequisites.first)
|
451
|
+
genes = CMD.cmd("grep -f '#{t.prerequisites.first}' | cut -f 1", :in => Open.open(t.prerequisites.last)).read.split("\n").uniq
|
452
|
+
File.open(t.name, 'w') do |f| f.puts genes * "\n" end
|
453
|
+
end
|
454
|
+
|
455
|
+
file 'sanctioned_genes' => ['blacklist_genes', 'gene_positions'] do |t|
|
456
|
+
genes = CMD.cmd("cut -f 1", :in => Open.open(t.prerequisites.last)).read.split("\n").uniq - Open.read(t.prerequisites.first).split("\n")
|
457
|
+
File.open(t.name, 'w') do |f| f.puts genes * "\n" end
|
458
|
+
end
|
459
|
+
|
460
|
+
|
443
461
|
rule /^chromosome_.*/ do |t|
|
444
462
|
chr = t.name.match(/chromosome_(.*)/)[1]
|
445
463
|
|
@@ -450,7 +468,11 @@ rule /^chromosome_.*/ do |t|
|
|
450
468
|
ftp = Net::FTP.new("ftp.ensembl.org")
|
451
469
|
ftp.passive = true
|
452
470
|
ftp.login
|
453
|
-
|
471
|
+
if release.nil? or release == 'current'
|
472
|
+
ftp.chdir("pub/current_fasta/")
|
473
|
+
else
|
474
|
+
ftp.chdir("pub/#{ release }/fasta/")
|
475
|
+
end
|
454
476
|
ftp.chdir($scientific_name.downcase.sub(" ",'_'))
|
455
477
|
ftp.chdir('dna')
|
456
478
|
file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
|
@@ -530,7 +552,7 @@ file 'transcript_sequence' => ["exons", "transcript_exons"] do |t|
|
|
530
552
|
begin
|
531
553
|
p = Organism.root
|
532
554
|
p.replace File.expand_path("./chromosome_#{chr}")
|
533
|
-
p.sub!(
|
555
|
+
p.sub!(%r{.*/organisms/},'share/organisms/')
|
534
556
|
p = Path.setup(p, 'rbbt', Organism)
|
535
557
|
chr_str = p.produce.read
|
536
558
|
rescue Exception
|
@@ -624,10 +646,10 @@ end
|
|
624
646
|
|
625
647
|
file 'transcript_3utr' => ["transcript_5utr"] do |t|
|
626
648
|
end
|
627
|
-
|
628
|
-
file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_sequence"] do |t|
|
649
|
+
file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_phase", "transcript_sequence"] do |t|
|
629
650
|
transcript_5utr = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true)
|
630
651
|
transcript_3utr = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true)
|
652
|
+
transcript_phase = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
|
631
653
|
transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true)
|
632
654
|
transcript_protein = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
|
633
655
|
|
@@ -638,7 +660,12 @@ file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr",
|
|
638
660
|
next if protein.nil? or protein.empty?
|
639
661
|
utr5 = transcript_5utr[transcript]
|
640
662
|
utr3 = transcript_3utr[transcript]
|
641
|
-
|
663
|
+
phase = transcript_phase[transcript] || 0
|
664
|
+
if phase < 0
|
665
|
+
utr5 = - phase if utr5 == 0
|
666
|
+
phase = 0
|
667
|
+
end
|
668
|
+
psequence = Bio::Sequence::NA.new(("N" * phase) << sequence[utr5..sequence.length-utr3-1]).translate
|
642
669
|
protein_sequence[protein]=psequence
|
643
670
|
end
|
644
671
|
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
|
2
|
+
|
3
|
+
require 'rbbt/sources/gscholar'
|
4
|
+
require 'test/unit'
|
5
|
+
|
6
|
+
class TestGScholar < Test::Unit::TestCase
|
7
|
+
def test_citation
|
8
|
+
assert_match GoogleScholar.citation_link("Ten Years of Pathway Analysis: Current Approaches and Outstanding Challenges").to_s, /cites/
|
9
|
+
assert_match GoogleScholar.number_cites("Ten Years of Pathway Analysis: Current Approaches and Outstanding Challenges").to_s, /\d+/
|
10
|
+
end
|
11
|
+
|
12
|
+
end
|
13
|
+
|
14
|
+
|
@@ -61,6 +61,11 @@ class TestOrganism < Test::Unit::TestCase
|
|
61
61
|
assert_equal mutation_19, Organism.liftOver([mutation_18], target_build, source_build).first
|
62
62
|
end
|
63
63
|
|
64
|
+
def test_orhtolog
|
65
|
+
require 'rbbt/entity/gene'
|
66
|
+
assert_equal ["ENSG00000133703"], Gene.setup("Kras", "Associated Gene Name", "Mmu/jun2011").ensembl.ortholog("Hsa/jun2011")
|
67
|
+
end
|
68
|
+
|
64
69
|
#def test_genes_at_chromosome
|
65
70
|
# pos = [12, 117799500]
|
66
71
|
# assert_equal "ENSG00000089250", Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
|
metadata
CHANGED
@@ -1,20 +1,18 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0
|
5
|
-
prerelease:
|
4
|
+
version: 2.1.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Miguel Vazquez
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2013-
|
11
|
+
date: 2013-10-21 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: rbbt-util
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
17
|
- - ! '>='
|
20
18
|
- !ruby/object:Gem::Version
|
@@ -22,7 +20,6 @@ dependencies:
|
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
24
|
- - ! '>='
|
28
25
|
- !ruby/object:Gem::Version
|
@@ -30,7 +27,6 @@ dependencies:
|
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: rbbt-text
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
31
|
- - ! '>='
|
36
32
|
- !ruby/object:Gem::Version
|
@@ -38,7 +34,6 @@ dependencies:
|
|
38
34
|
type: :runtime
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
38
|
- - ! '>='
|
44
39
|
- !ruby/object:Gem::Version
|
@@ -46,7 +41,6 @@ dependencies:
|
|
46
41
|
- !ruby/object:Gem::Dependency
|
47
42
|
name: libxml-ruby
|
48
43
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
44
|
requirements:
|
51
45
|
- - ! '>='
|
52
46
|
- !ruby/object:Gem::Version
|
@@ -54,7 +48,6 @@ dependencies:
|
|
54
48
|
type: :runtime
|
55
49
|
prerelease: false
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
51
|
requirements:
|
59
52
|
- - ! '>='
|
60
53
|
- !ruby/object:Gem::Version
|
@@ -62,7 +55,6 @@ dependencies:
|
|
62
55
|
- !ruby/object:Gem::Dependency
|
63
56
|
name: bio
|
64
57
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
58
|
requirements:
|
67
59
|
- - ! '>='
|
68
60
|
- !ruby/object:Gem::Version
|
@@ -70,7 +62,6 @@ dependencies:
|
|
70
62
|
type: :runtime
|
71
63
|
prerelease: false
|
72
64
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
65
|
requirements:
|
75
66
|
- - ! '>='
|
76
67
|
- !ruby/object:Gem::Version
|
@@ -78,7 +69,6 @@ dependencies:
|
|
78
69
|
- !ruby/object:Gem::Dependency
|
79
70
|
name: mechanize
|
80
71
|
requirement: !ruby/object:Gem::Requirement
|
81
|
-
none: false
|
82
72
|
requirements:
|
83
73
|
- - ! '>='
|
84
74
|
- !ruby/object:Gem::Version
|
@@ -86,7 +76,6 @@ dependencies:
|
|
86
76
|
type: :runtime
|
87
77
|
prerelease: false
|
88
78
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
79
|
requirements:
|
91
80
|
- - ! '>='
|
92
81
|
- !ruby/object:Gem::Version
|
@@ -111,6 +100,7 @@ files:
|
|
111
100
|
- lib/rbbt/sources/biomart.rb
|
112
101
|
- lib/rbbt/sources/cath.rb
|
113
102
|
- lib/rbbt/sources/dbSNP.rb
|
103
|
+
- lib/rbbt/sources/ensembl.rb
|
114
104
|
- lib/rbbt/sources/ensembl_ftp.rb
|
115
105
|
- lib/rbbt/sources/entrez.rb
|
116
106
|
- lib/rbbt/sources/genomes1000.rb
|
@@ -139,37 +129,38 @@ files:
|
|
139
129
|
- test/rbbt/sources/test_go.rb
|
140
130
|
- test/rbbt/sources/test_entrez.rb
|
141
131
|
- test/rbbt/sources/test_biomart.rb
|
132
|
+
- test/rbbt/sources/test_gscholar.rb
|
142
133
|
- test/rbbt/sources/test_organism.rb
|
143
134
|
- test/rbbt/sources/test_pubmed.rb
|
144
135
|
- test/test_helper.rb
|
145
136
|
homepage: http://github.com/mikisvaz/rbbt-sources
|
146
137
|
licenses: []
|
138
|
+
metadata: {}
|
147
139
|
post_install_message:
|
148
140
|
rdoc_options: []
|
149
141
|
require_paths:
|
150
142
|
- lib
|
151
143
|
required_ruby_version: !ruby/object:Gem::Requirement
|
152
|
-
none: false
|
153
144
|
requirements:
|
154
145
|
- - ! '>='
|
155
146
|
- !ruby/object:Gem::Version
|
156
147
|
version: '0'
|
157
148
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
158
|
-
none: false
|
159
149
|
requirements:
|
160
150
|
- - ! '>='
|
161
151
|
- !ruby/object:Gem::Version
|
162
152
|
version: '0'
|
163
153
|
requirements: []
|
164
154
|
rubyforge_project:
|
165
|
-
rubygems_version:
|
155
|
+
rubygems_version: 2.0.3
|
166
156
|
signing_key:
|
167
|
-
specification_version:
|
157
|
+
specification_version: 4
|
168
158
|
summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)
|
169
159
|
test_files:
|
170
160
|
- test/rbbt/sources/test_go.rb
|
171
161
|
- test/rbbt/sources/test_entrez.rb
|
172
162
|
- test/rbbt/sources/test_biomart.rb
|
163
|
+
- test/rbbt/sources/test_gscholar.rb
|
173
164
|
- test/rbbt/sources/test_organism.rb
|
174
165
|
- test/rbbt/sources/test_pubmed.rb
|
175
166
|
- test/test_helper.rb
|