rbbt-sources 1.2.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/etc/biomart/missing_in_archive +11 -0
- data/lib/rbbt/sources/COSMIC.rb +47 -4
- data/lib/rbbt/sources/HPRD.rb +23 -0
- data/lib/rbbt/sources/InterPro.rb +98 -8
- data/lib/rbbt/sources/NCI.rb +7 -5
- data/lib/rbbt/sources/PSI_MI.rb +41 -0
- data/lib/rbbt/sources/STITCH.rb +92 -0
- data/lib/rbbt/sources/barcode.rb +0 -3
- data/lib/rbbt/sources/biomart.rb +3 -3
- data/lib/rbbt/sources/dbSNP.rb +100 -0
- data/lib/rbbt/sources/ensembl_ftp.rb +79 -0
- data/lib/rbbt/sources/entrez.rb +2 -2
- data/lib/rbbt/sources/genomes1000.rb +45 -0
- data/lib/rbbt/sources/go.rb +16 -4
- data/lib/rbbt/sources/organism.rb +80 -12
- data/lib/rbbt/sources/pfam.rb +63 -3
- data/lib/rbbt/sources/pubmed.rb +10 -3
- data/lib/rbbt/sources/reactome.rb +82 -0
- data/lib/rbbt/sources/tfacts.rb +37 -36
- data/lib/rbbt/sources/uniprot.rb +25 -23
- data/share/Ensembl/release_dates +18 -0
- data/share/install/Genomes1000/Rakefile +15 -0
- data/share/install/JoChem/Rakefile +11 -3
- data/share/install/NCI/Rakefile +54 -16
- data/share/install/Organism/Hsa/Rakefile +3 -2
- data/share/install/Organism/Rno/Rakefile +1 -2
- data/share/install/Organism/Sce/Rakefile +43 -45
- data/share/install/Organism/organism_helpers.rb +360 -96
- data/share/install/STITCH/Rakefile +0 -0
- data/test/rbbt/sources/test_organism.rb +26 -7
- data/test/rbbt/sources/test_pubmed.rb +5 -0
- metadata +94 -97
- data/share/install/InterPro/Rakefile +0 -29
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'rbbt/util/open'
|
2
|
+
require 'rbbt/sources/organism'
|
3
|
+
require 'rbbt/tsv'
|
4
|
+
require 'net/ftp'
|
5
|
+
|
6
|
+
# Helpers for locating and reading the MySQL table dumps that Ensembl
# publishes on its FTP server.
module Ensembl

  # Table loaded from the shared `release_dates` resource, keyed by the
  # "build" field.  Loaded once and memoized on the module.
  def self.releases
    @releases ||= Rbbt.share.Ensembl.release_dates.find.tsv :key_field => "build"
  end

  module FTP

    SERVER = "ftp.ensembl.org"

    # Resolves the release and the core database directory for an organism.
    #
    # organism - "<code>" or "<code>/<build>", e.g. "Hsa/may2012".
    #
    # Returns [release, database_name].
    #
    # NOTE(review): when no build is given (or the build is "current") nothing
    # is resolved and [nil, nil] is returned, exactly as before; callers such
    # as ftp_directory_for cannot use that result.  Confirm the intended FTP
    # layout for "current" before implementing it.
    def self.ftp_name_for(organism)
      _code, build = organism.split "/"
      build ||= "current"

      return [nil, nil] if build.to_s == "current"

      release = Ensembl.releases[build]
      name = Organism.scientific_name(organism)
      ftp = Net::FTP.new(Ensembl::FTP::SERVER)
      begin
        ftp.login
        ftp.chdir(File.join('pub', release, 'mysql'))
        # Keep the last listed entry matching "<scientific_name>_core_*"
        file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").collect{|l| l.split(" ").last}.last
      ensure
        # Always release the connection, also when chdir/list fail
        ftp.close
      end

      [release, file]
    end

    # Absolute directory on the FTP server holding the organism's table dumps.
    def self.ftp_directory_for(organism)
      release, ftp_name = ftp_name_for(organism)
      File.join('/pub/', release, 'mysql', ftp_name)
    end

    # ftp:// URL of the directory returned by ftp_directory_for.
    def self.base_url(organism)
      File.join("ftp://" + SERVER, ftp_directory_for(organism) )
    end

    # URL of the gzipped dump of +table+ for +organism+.
    def self.url_for(organism, table)
      "#{base_url(organism)}/#{table}.txt.gz"
    end

    # Matches the CREATE TABLE statement of +table+ in the organism's schema
    # file (<database>.sql.gz).  Returns the MatchData (group 1 is the body of
    # the statement) or nil when the table is not declared.
    #
    # base_url is resolved only once because every resolution opens an FTP
    # connection.  The table name is escaped so that it is matched literally,
    # and only the `m` flag is used: in Ruby `s` selects the Windows-31J
    # encoding and makes the match fail on non-ASCII schema text.
    def self.table_definition(organism, table)
      base = base_url(organism)
      sql_file = Open.read("#{base}/#{File.basename(base)}.sql.gz")
      sql_file.match(/^CREATE TABLE .#{Regexp.escape(table.to_s)}. \((.*?)^\)/m)
    end
    private_class_method :table_definition

    # True when the organism's schema declares +table+.
    def self.has_table?(organism, table)
      ! table_definition(organism, table).nil?
    end

    # Column names of +table+, in the order they are declared in the schema.
    #
    # Raises RuntimeError when the schema does not declare the table.
    def self.fields_for(organism, table)
      definition = table_definition(organism, table)
      raise "Table '#{table}' not found in the Ensembl schema for #{organism}" if definition.nil?

      # Column declarations are the lines starting with a back-quoted name
      definition[1].scan(/^\s+`(.*?)`/).flatten
    end

    # Opens the dump of +table+ as a TSV.
    #
    # key_field, fields - column names; when both are given they are translated
    #                     into column positions using the schema.
    # options           - extra options for TSV.open; the hash passed by the
    #                     caller is not modified.
    #
    # Returns the TSV with key_field and fields set to the given names.
    def self.ensembl_tsv(organism, table, key_field = nil, fields = nil, options = {})
      url = url_for(organism, table)
      options = options.dup

      if key_field and fields
        all_fields = fields_for(organism, table)
        options[:key_field] = all_fields.index key_field
        options[:fields] = fields.collect{|f| all_fields.index f}
      end

      tsv = TSV.open(url, options)
      tsv.key_field = key_field
      tsv.fields = fields
      tsv
    end
  end
end
|
76
|
+
|
77
|
+
# Manual check: when this file is run directly, dump the 'exon' table of the
# Hsa/may2012 build through the `ddd` debugging helper.
ddd Ensembl::FTP.ensembl_tsv("Hsa/may2012", 'exon') if __FILE__ == $0
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -10,7 +10,7 @@ module Entrez
|
|
10
10
|
Rbbt.claim Rbbt.share.databases.entrez.gene2pubmed, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
|
11
11
|
|
12
12
|
def self.entrez2native(taxs, options = {})
|
13
|
-
options = Misc.add_defaults options, :key_field => 1, :fields => 5, :persist => true, :merge => true
|
13
|
+
options = Misc.add_defaults options, :key_field => 1, :fields => [5], :persist => true, :merge => true
|
14
14
|
|
15
15
|
taxs = [taxs] unless Array === taxs
|
16
16
|
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
|
@@ -22,7 +22,7 @@ module Entrez
|
|
22
22
|
end
|
23
23
|
|
24
24
|
def self.entrez2name(taxs, options = {})
|
25
|
-
options = Misc.add_defaults options, :key_field => 1, :fields => 2, :persist => true, :merge => true
|
25
|
+
options = Misc.add_defaults options, :key_field => 1, :fields => [2], :persist => true, :merge => true
|
26
26
|
|
27
27
|
taxs = [taxs] unless Array === taxs
|
28
28
|
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/util/open'
|
3
|
+
require 'rbbt/resource'
|
4
|
+
|
5
|
+
# Resources derived from the 1000 Genomes site list referenced by RELEASE_URL.
module Genomes1000
  extend Resource
  self.subdir = "share/databases/genomes_1000"

  RELEASE_URL = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.wgs.phase1_release_v3.20101123.snps_indels_sv.sites.vcf.gz"

  # "Variant ID" => "chromosome:position:alternative", built from the data
  # lines of the release file (lines starting with '#' are skipped).
  Genomes1000.claim Genomes1000.mutations, :proc do

    variants = TSV.setup({}, :key_field => "Variant ID", :fields => ["Genomic Mutation"], :type => :single)

    Open.read(RELEASE_URL) do |vcf_line|
      next if vcf_line[0] == "#"[0]

      # Columns: chromosome, position, id, reference, alternative, ...
      columns = vcf_line.split("\t")
      id = columns[2]

      variants[id] = columns.values_at(0, 1, 4) * ":"
    end

    variants.namespace = "Hsa"

    variants.to_s
  end

  # Same table as `mutations`, with every genomic mutation translated from
  # "Hsa/jun2011" to "Hsa/may2009" through Organism.liftOver.
  Genomes1000.claim Genomes1000.mutations_hg18, :proc do
    require 'rbbt/sources/organism'

    hg19_tsv = Genomes1000.mutations.tsv :unnamed => true

    hg19_mutations = hg19_tsv.values

    translations = Misc.process_to_hash(hg19_mutations) { |batch| Organism.liftOver(batch, "Hsa/jun2011", "Hsa/may2009") }

    hg18_tsv = hg19_tsv.process("Genomic Mutation") { |mutation| translations[mutation] }

    hg18_tsv.namespace = "Hsa/may2009"

    hg18_tsv.to_s
  end

end
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -97,9 +97,12 @@ if defined? Entity
|
|
97
97
|
@name ||= GO.id2name(self)
|
98
98
|
end
|
99
99
|
|
100
|
-
# Ensembl gene IDs associated with each GO term.  The organism may be passed
# as the first argument; otherwise the entity's own organism is used.
property :genes => :array2single do |*args|
  organism = args.first || self.organism
  gene_index = Organism.gene_go(organism).tsv(:persist => true, :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :merge => true)
  gene_lists = gene_index.values_at(*self)
  # Propagate the organism to every result that can carry one
  gene_lists.each do |gene_list|
    gene_list.organism = organism if gene_list and gene_list.respond_to? :organism
  end
  gene_lists
end
|
104
107
|
|
105
108
|
property :description => :single2array do
|
@@ -114,12 +117,21 @@ if defined? Entity
|
|
114
117
|
if defined? Gene and Entity === Gene
  # GO annotations for genes, looked up by Ensembl Gene ID; one property per
  # Organism resource (gene_go, gene_go_bp, gene_go_cc, gene_go_mf).
  module Gene
    property :go_terms => :array2single do
      @go_terms ||= begin
                      go_index = Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism)
                      go_index.values_at(*self.ensembl)
                    end
    end

    property :go_bp_terms => :array2single do
      @go_bp_terms ||= begin
                         go_index = Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism)
                         go_index.values_at(*self.ensembl)
                       end
    end

    property :go_cc_terms => :array2single do
      @go_cc_terms ||= begin
                         go_index = Organism.gene_go_cc(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism)
                         go_index.values_at(*self.ensembl)
                       end
    end

    property :go_mf_terms => :array2single do
      @go_mf_terms ||= begin
                         go_index = Organism.gene_go_mf(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism)
                         go_index.values_at(*self.ensembl)
                       end
    end

  end
end
|
125
137
|
end
|
@@ -1,18 +1,80 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rbbt/resource'
|
3
|
-
require 'rbbt/resource/with_key'
|
4
3
|
|
5
4
|
module Organism
|
6
5
|
extend Resource
|
7
6
|
self.pkgdir = "rbbt"
|
8
7
|
self.subdir = "share/organisms"
|
9
8
|
|
10
|
-
|
9
|
+
# Names of the three-character entries found in the shared
# install/Organism directory.
def self.installable_organisms
  install_entries = Rbbt.share.install.Organism.find.glob('???')
  install_entries.collect { |entry| File.basename(entry) }
end
|
12
|
+
|
13
|
+
|
14
|
+
# For every installable organism: claim its directory as a :rake resource built
# from the organism's Rakefile, and define a module constant named after the
# organism code holding `with_key '<code>'`.
Organism.installable_organisms.each do |code|
  claim Organism[code], :rake, Rbbt.share.install.Organism[code].Rakefile.find

  module_eval "#{ code } = with_key '#{code}'"
end
|
15
19
|
|
20
|
+
# Registers the UCSC liftOver executable as a :url resource of Rbbt.
Rbbt.claim(Rbbt.software.opt.bin.liftOver, :url, "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver")
|
21
|
+
|
22
|
+
# Maps a human organism identifier to the UCSC genome build it corresponds to.
#
# organism - "Hsa" or "Hsa/<build date>" (e.g. "Hsa/may2009").
#
# Returns 'hg19' when no build date is given or when the Ensembl release
# number for that date is greater than 54, and 'hg18' otherwise.
#
# Raises RuntimeError for organisms other than Hsa and for build dates that
# are not listed in Ensembl.releases.
def self.hg_build(organism)
  # Loaded lazily: Ensembl.releases is only needed here
  require 'rbbt/sources/ensembl_ftp'

  raise "Only organism 'Hsa' (Homo sapiens) supported" unless organism =~ /^Hsa/

  return 'hg19' unless organism =~ /\//
  date = organism.split("/")[1]

  release = Ensembl.releases[date]
  # Fail with a clear message instead of calling `sub` on nil
  raise "Unknown Ensembl build '#{date}' for organism #{organism}" if release.nil?

  # The release number is whatever follows the last '-' in the release name
  release.sub(/.*-/,'').to_i > 54 ? 'hg19' : 'hg18'
end
|
34
|
+
|
35
|
+
# Translates genomic positions between human genome builds with the UCSC
# liftOver tool.
#
# positions - Array of "chromosome:position[:more...]" strings.
# source    - organism/build the positions refer to (see hg_build).
# target    - organism/build to translate the positions to.
#
# Returns an Array parallel to +positions+.  When source and target are not an
# hg18/hg19 pair the positions are returned untouched; otherwise each entry is
# the translated "chromosome:position[:more...]" string, or nil when the
# liftOver output has no line for that position.
def self.liftOver(positions, source, target)

  source_hg = hg_build(source)
  target_hg = hg_build(target)

  case
  when (source_hg == 'hg19' and target_hg == 'hg18')
    map_url = "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg18.over.chain.gz"
  when (source_hg == 'hg18' and target_hg == 'hg19')
    map_url = "http://hgdownload.cse.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz"
  else
    return positions
  end

  # Nothing to translate: skip the chain download and the external command
  return [] if positions.empty?

  # One BED line per position: chrN, start (position - 1), end, original string
  positions_bed = positions.collect{|position|
    chr, pos = position.split(":").values_at(0,1)
    ["chr#{chr}", pos.to_i - 1, pos, position] * "\t"
  } * "\n" + "\n"

  new_positions = {}

  TmpFile.with_file(positions_bed) do |source_bed|
    TmpFile.with_file() do |unmapped_file|
      TmpFile.with_file() do |map_file|

        Open.write(map_file, Open.read(map_url))

        TmpFile.with_file() do |target_bed|
          liftover_bin = Rbbt.software.opt.bin.liftOver.produce.find
          # The mode must be an octal literal: decimal 755 is 01363
          FileUtils.chmod(0755, liftover_bin)
          CMD.cmd("#{liftover_bin} '#{source_bed}' '#{map_file}' '#{target_bed}' '#{unmapped_file}'").read

          Open.read(target_bed) do |line|
            chr, _start, position, name = line.chomp.split("\t")
            chr = chr.sub(/chr/, '')

            # Keep whatever followed chromosome and position in the original
            _old_chr, _old_position, *rest = name.split(":")
            new_positions[name] = ([chr, position].concat rest) * ":"
          end
        end
      end
    end
  end

  positions.collect do |position|
    new_positions[position]
  end
end
|
77
|
+
|
16
78
|
class OrganismNotProcessedError < StandardError; end
|
17
79
|
|
18
80
|
def self.attach_translations(org, tsv, target = nil, fields = nil, options = {})
|
@@ -53,30 +115,36 @@ module Organism
|
|
53
115
|
end
|
54
116
|
end
|
55
117
|
|
56
|
-
def self.guess_id(org, values, identifiers = nil)
|
57
|
-
identifiers ||= TSV.setup(Organism.identifiers(org), :persist => true)
|
58
|
-
field_matches = identifiers.field_matches(values)
|
59
|
-
field_matches.sort_by{|field, matches| matches.uniq.length}.last
|
60
|
-
end
|
61
|
-
|
62
118
|
# Picks the field of the organism's identifiers file that has the highest match
# count for +values+, as reported by TSV.field_match_counts.
#
# Returns the last [field, count] pair after sorting by count.
def self.guess_id(org, values)
  identifier_file = Organism.identifiers(org).find
  match_counts = TSV.field_match_counts(identifier_file, values)
  ranked = match_counts.sort_by { |_field, count| count.to_i }
  ranked.last
end
|
66
122
|
|
67
|
-
|
68
123
|
# Names of the entries found directly under the Organism root directory.
def self.organisms
  pattern = File.join(Organism.root.find, '*')
  Dir.glob(pattern).collect { |entry| File.basename(entry) }
end
|
71
126
|
|
72
|
-
def self.
|
73
|
-
Organism
|
127
|
+
# Contents of the organism's "scientific_name" resource (produced on demand),
# stripped of surrounding whitespace.
def self.scientific_name(organism)
  name_resource = Organism[organism]["scientific_name"]
  name_resource.produce.read.strip
end
|
75
130
|
|
76
131
|
# Finds the first known organism whose code equals +name+ or whose scientific
# name matches +name+ (used as a case-insensitive pattern).
#
# Returns the organism code, or nil when nothing matches.
def self.organism(name)
  matching = organisms.select do |code|
    code == name || Organism.scientific_name(code) =~ /#{ name }/i
  end
  matching.first
end
|
81
136
|
|
137
|
+
# All field names reported by a TSV::Parser opened on the organism's
# identifiers file.
def self.known_ids(name)
  identifier_stream = Organism.identifiers(name).open
  TSV::Parser.new(identifier_stream).all_fields
end
|
140
|
+
|
141
|
+
# Finds the installable organism whose entrez_taxids file lists +taxid+ on one
# of its lines.
#
# Returns the organism code.
# Raises RuntimeError when no installable organism lists the taxid.
def self.entrez_taxid_organism(taxid)
  all_organisms = Organism.installable_organisms

  found = all_organisms.find do |code|
    Organism.entrez_taxids(code).read.split("\n").include?(taxid.to_s)
  end
  return found if found

  raise "No organism identified for taxid #{taxid}. Supported organism are: #{all_organisms * ", "}"
end
|
82
150
|
end
|
data/lib/rbbt/sources/pfam.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rbbt/tsv'
|
3
3
|
require 'rbbt/resource'
|
4
|
+
require 'rbbt/entity'
|
5
|
+
require 'rbbt/sources/InterPro'
|
4
6
|
|
5
7
|
module Pfam
|
6
8
|
extend Resource
|
@@ -12,24 +14,82 @@ module Pfam
|
|
12
14
|
tsv.to_s
|
13
15
|
end
|
14
16
|
|
15
|
-
NAMES_FILE =
|
17
|
+
NAMES_FILE = InterPro.pfam_names.find
|
16
18
|
|
17
19
|
# NAMES_FILE opened as an unnamed :single TSV; opened once and memoized.
def self.name_index
  return @name_index if @name_index

  @name_index = TSV.open(NAMES_FILE, :single, :unnamed => true)
end
|
20
22
|
|
21
23
|
# Name registered for +id+ in name_index; falls back to the id itself when the
# index has no entry for it.
def self.name(id)
  known_name = name_index[id]
  known_name || id
end
|
24
26
|
end
|
25
27
|
|
28
|
+
module InterPro
  # "InterPro ID" => "Pfam Domain" table read from the pfam_equivalences
  # resource; loaded once and kept in a class variable.
  def self.pfam_index
    @@pfam_index ||= begin
                       equivalences = InterPro.pfam_equivalences
                       equivalences.tsv(:persist => true, :key_field => "InterPro ID", :fields => ["Pfam Domain"])
                     end
  end
end
|
33
|
+
|
34
|
+
# Builds the pfam_names table from columns 3 and 4 of the protein2ipr source,
# keeping only the rows that mention one of the Pfam domain identifiers.
InterPro.claim InterPro.pfam_names.find, :proc do
  domain_ids = Pfam.domains.read.split("\n").collect { |row| row.split("\t").first }.compact.flatten
  names = nil
  # The identifiers are written to a temporary file so grep can read them (-f)
  TmpFile.with_file(domain_ids * "\n") do |id_file|
    filtered = CMD.cmd("cut -f 4,3 | sort -u |grep -w -f #{ id_file }", :in => InterPro.source.protein2ipr.open, :pipe => true)
    names = TSV.open(filtered, :key_field => 1, :fields => [0], :type => :single)
  end
  names.key_field = "InterPro ID"
  names.fields = ["Domain Name"]
  names.to_s
end
|
44
|
+
|
45
|
+
# Builds the pfam_equivalences table from columns 2 and 4 of the protein2ipr
# source, keeping only the rows that mention one of the Pfam domain identifiers.
InterPro.claim InterPro.pfam_equivalences.find, :proc do
  domain_ids = Pfam.domains.read.split("\n").collect { |row| row.split("\t").first }.compact.flatten
  equivalences = nil
  # The identifiers are written to a temporary file so grep can read them (-f)
  TmpFile.with_file(domain_ids * "\n") do |id_file|
    filtered = CMD.cmd("cut -f 2,4 | sort -u |grep -w -f #{ id_file }", :in => InterPro.source.protein2ipr.open, :pipe => true)
    equivalences = TSV.open(filtered, :key_field => 0, :fields => [1], :type => :single)
  end
  equivalences.key_field = "InterPro ID"
  equivalences.fields = ["Pfam Domain"]
  equivalences.to_s
end
|
55
|
+
|
56
|
+
|
26
57
|
if defined? Entity
|
27
58
|
# Entity for Pfam domain identifiers, registered under the "Pfam Domain" and
# "Pfam Domain ID" formats and annotated with an organism.
module PfamDomain
  extend Entity
  self.format = "Pfam Domain"
  self.format = "Pfam Domain ID"

  self.annotation :organism

  # Name of every domain as given by Pfam.name
  property :name => :array2single do
    self.collect { |domain_id| Pfam.name(domain_id) }
  end

  # Ensembl Gene IDs listed for every domain in the organism's gene_pfam file
  property :genes => :array2single do
    @genes ||= begin
                 domain_index = Organism.gene_pfam(organism).tsv(:key_field => "Pfam Domain", :fields => ["Ensembl Gene ID"], :persist => true, :merge => true, :type => :flat, :namespace => organism)
                 domain_index.values_at(*self)
               end
  end
end
|
73
|
+
|
74
|
+
|
75
|
+
module InterProDomain
  # Pfam domain for every InterPro domain, looked up in InterPro.pfam_index;
  # results that accept an organism inherit the one of this entity.
  property :pfam => :array2single do
    pfam_domains = InterPro.pfam_index.values_at(*self)
    pfam_domains.each do |domain|
      domain.organism = organism if domain.respond_to? :organism
    end
  end
end
|
81
|
+
|
82
|
+
if defined? Gene and Entity === Gene
  module Gene
    # One gene => Pfam domain index per organism, shared by all gene entities
    INDEX_CACHE = {}

    # Pfam domains of every gene, looked up by Ensembl Gene ID
    property :pfam_domains => :array2single do
      domain_index = INDEX_CACHE[organism] ||= Organism.gene_pfam(organism).tsv(:persist => true, :type => :flat, :fields => ["Pfam Domain"], :key_field => "Ensembl Gene ID", :namespace => organism)
      @pfam_domains ||= domain_index.values_at(*self.ensembl)
    end

  end
end
|
35
93
|
end
|
94
|
+
|
95
|
+
|
data/lib/rbbt/sources/pubmed.rb
CHANGED
@@ -15,7 +15,8 @@ module PubMed
|
|
15
15
|
|
16
16
|
url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
|
17
17
|
articles = []
|
18
|
-
|
18
|
+
|
19
|
+
Misc.divide(pmids.sort_by{|v| v.nil? ? 0 : v.to_i}, (pmids.length / 1000) + 1).each do |pmid_list|
|
19
20
|
postdata = "db=pubmed&retmode=xml&id=#{pmid_list* ","}"
|
20
21
|
xml = TmpFile.with_file(postdata) do |postfile|
|
21
22
|
Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed", "--post-file=" => postfile)
|
@@ -130,7 +131,13 @@ module PubMed
|
|
130
131
|
|
131
132
|
# URL of the article's PDF: the PMC one when available, otherwise the result of
# GoogleScholar.full_text_url for the title (memoized once it is found).
#
# A failing GoogleScholar lookup is logged and yields nil.
def pdf_url
  return pmc_pdf if pmc_pdf
  return @gscholar_pdf if @gscholar_pdf

  @gscholar_pdf = begin
                    GoogleScholar.full_text_url(title)
                  rescue
                    Log.medium "GoogleScholar#full_text failed: #{title}"
                    sleep 0.1
                    nil
                  end
end
|
135
142
|
|
136
143
|
def full_text
|
@@ -140,7 +147,7 @@ module PubMed
|
|
140
147
|
TmpFile.with_file do |pdf|
|
141
148
|
|
142
149
|
# Change user-agent, oh well...
|
143
|
-
`wget --user-agent=firefox #{ pdf_url } -O #{ pdf }`
|
150
|
+
`wget --user-agent=firefox #{ pdf_url } -O #{ pdf } -t 3`
|
144
151
|
TmpFile.with_file do |txt|
|
145
152
|
`pdftotext #{ pdf } #{ txt }`
|
146
153
|
text = Open.read(txt) if File.exists? txt
|
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
|
4
|
+
# Resources built from the files Reactome publishes under
# http://www.reactome.org/download/current/.
module Reactome
  extend Resource
  self.subdir = "share/databases/Reactome"

  # Column 0 => column 1 of uniprot_2_pathways.stid.txt, merged per key
  Reactome.claim Reactome.protein_pathways, :proc do
    url = "http://www.reactome.org/download/current/uniprot_2_pathways.stid.txt"
    pathways = TSV.open(Open.open(url), :key_field => 0, :fields => [1], :merge => true, :type => :double)
    pathways.key_field = "UniProt/SwissProt Accession"
    pathways.fields = ["Reactome Pathway ID"]
    pathways.namespace = "Hsa"
    pathways.to_s
  end

  # Column 1 => column 2 of the same file
  Reactome.claim Reactome.pathway_names, :proc do
    url = "http://www.reactome.org/download/current/uniprot_2_pathways.stid.txt"
    names = TSV.open(Open.open(url), :key_field => 1, :fields => [2], :type => :single)
    names.key_field = "Reactome Pathway ID"
    names.fields = ["Pathway Name"]
    names.namespace = "Hsa"
    names.to_s
  end

  # Columns 1, 4, 7, 8 and 9 of homo_sapiens.interactions.txt.gz, with the
  # "UniProt:" prefix removed and commas turned into semicolons
  Reactome.claim Reactome.protein_protein, :proc do
    url = "http://www.reactome.org/download/current/homo_sapiens.interactions.txt.gz"
    selected_columns = CMD.cmd('cut -f 1,4,7,8,9|sed "s/UniProt://g;s/,/;/g"', :in => Open.open(url), :pipe => true)
    interactions = TSV.open(selected_columns, :type => :double, :merge => true)
    interactions.key_field = "UniProt/SwissProt Accession"
    interactions.fields = ["Interactor UniProt/SwissProt Accession", "Interaction type", "Reactions", "PMID"]
    interactions.namespace = "Hsa"
    interactions.to_s
  end

end
|
36
|
+
|
37
|
+
if defined? Entity
|
38
|
+
# Entity for Reactome pathway identifiers, annotated with an organism.
module ReactomePathway
  extend Entity
  self.format = "Reactome Pathway ID"

  self.annotation :organism

  # "Reactome Pathway ID" => "Pathway Name" index, loaded once.
  def self.name_index
    @name_index ||= Reactome.pathway_names.tsv(:persist => true, :key_field => "Reactome Pathway ID", :fields => ["Pathway Name"], :type => :single)
  end

  # "Reactome Pathway ID" => UniProt/SwissProt accessions index, loaded once.
  def self.gene_index
    @gene_index ||= Reactome.protein_pathways.tsv(:persist => true, :key_field => "Reactome Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true)
  end

  # Tells whether +entity+ satisfies +query+: either it equals the query or
  # the name of the pathway contains it.
  #
  # options - annotation values used to set up the entity; may be omitted
  #           (nil is treated as an empty Hash instead of raising).
  def self.filter(query, field = nil, options = nil, entity = nil)
    return true if query == entity

    setup_options = (options || {}).merge(:format => field)
    return true if self.setup(entity.dup, setup_options).name.index query

    false
  end

  # Pathway name for every identifier
  property :name => :array2single do
    @name ||= ReactomePathway.name_index.values_at *self
  end

  # Accessions of every pathway; results that accept an organism inherit the
  # one of this entity
  property :genes => :array2single do
    @genes ||= ReactomePathway.gene_index.values_at(*self).
      each{|gene| gene.organism = organism if gene.respond_to? :organism }
  end

  # Link to the pathway in the Reactome event browser
  property :url => :single do
    "http://www.reactome.org/cgi-bin/eventbrowser_st_id?ST_ID=#{ self }"
  end
end
|
73
|
+
|
74
|
+
if defined? Gene and Entity === Gene
  module Gene
    # Reactome pathways of every gene, looked up through its
    # UniProt/SwissProt accession and set up as ReactomePathway entities.
    property :reactome_pathways => :array2single do
      @reactome_pathways ||= begin
                               pathway_index = Reactome.protein_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["Reactome Pathway ID"], :type => :flat, :merge => true)
                               pathways = pathway_index.values_at(*self.to("UniProt/SwissProt Accession"))
                               pathways.each { |pth| pth.organism = organism if pth.respond_to? :organism }
                               ReactomePathway.setup(pathways, organism)
                               pathways
                             end
    end
  end
end
|
82
|
+
end
|