rbbt-sources 1.2.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/etc/biomart/missing_in_archive +11 -0
- data/lib/rbbt/sources/COSMIC.rb +47 -4
- data/lib/rbbt/sources/HPRD.rb +23 -0
- data/lib/rbbt/sources/InterPro.rb +98 -8
- data/lib/rbbt/sources/NCI.rb +7 -5
- data/lib/rbbt/sources/PSI_MI.rb +41 -0
- data/lib/rbbt/sources/STITCH.rb +92 -0
- data/lib/rbbt/sources/barcode.rb +0 -3
- data/lib/rbbt/sources/biomart.rb +3 -3
- data/lib/rbbt/sources/dbSNP.rb +100 -0
- data/lib/rbbt/sources/ensembl_ftp.rb +79 -0
- data/lib/rbbt/sources/entrez.rb +2 -2
- data/lib/rbbt/sources/genomes1000.rb +45 -0
- data/lib/rbbt/sources/go.rb +16 -4
- data/lib/rbbt/sources/organism.rb +80 -12
- data/lib/rbbt/sources/pfam.rb +63 -3
- data/lib/rbbt/sources/pubmed.rb +10 -3
- data/lib/rbbt/sources/reactome.rb +82 -0
- data/lib/rbbt/sources/tfacts.rb +37 -36
- data/lib/rbbt/sources/uniprot.rb +25 -23
- data/share/Ensembl/release_dates +18 -0
- data/share/install/Genomes1000/Rakefile +15 -0
- data/share/install/JoChem/Rakefile +11 -3
- data/share/install/NCI/Rakefile +54 -16
- data/share/install/Organism/Hsa/Rakefile +3 -2
- data/share/install/Organism/Rno/Rakefile +1 -2
- data/share/install/Organism/Sce/Rakefile +43 -45
- data/share/install/Organism/organism_helpers.rb +360 -96
- data/share/install/STITCH/Rakefile +0 -0
- data/test/rbbt/sources/test_organism.rb +26 -7
- data/test/rbbt/sources/test_pubmed.rb +5 -0
- metadata +94 -97
- data/share/install/InterPro/Rakefile +0 -29
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'rbbt/util/open'
|
2
|
+
require 'rbbt/sources/organism'
|
3
|
+
require 'rbbt/tsv'
|
4
|
+
require 'net/ftp'
|
5
|
+
|
6
|
+
module Ensembl
|
7
|
+
|
8
|
+
|
9
|
+
# Memoized TSV loaded from the shared Ensembl release_dates file, keyed by
# the "build" field.
def self.releases
  return @releases if @releases

  release_dates = Rbbt.share.Ensembl.release_dates.find
  @releases = release_dates.tsv(:key_field => "build")
end
|
12
|
+
|
13
|
+
module FTP
|
14
|
+
|
15
|
+
SERVER = "ftp.ensembl.org"
|
16
|
+
|
17
|
+
# Resolves the Ensembl release and core-database directory name for an
# organism.
#
# organism - organism code, optionally followed by "/" and a build name
#            (e.g. "Hsa" or "Hsa/may2012"). A missing build is treated as
#            "current".
#
# Returns a two-element array [release, core database name]. For the
# "current" build nothing is looked up and both elements are nil.
def self.ftp_name_for(organism)
  _code, build = organism.split "/"
  build ||= "current"

  release = nil
  file = nil

  unless build.to_s == "current"
    release = Ensembl.releases[build]
    name = Organism.scientific_name(organism)
    ftp = Net::FTP.new(Ensembl::FTP::SERVER)
    begin
      ftp.login
      ftp.chdir(File.join('pub', release, 'mysql'))
      pattern = name.downcase.gsub(" ", '_') + "_core_*"
      # Keep the last whitespace-separated token of the last listing entry
      file = ftp.list(pattern).collect { |l| l.split(" ").last }.last
    ensure
      # Close the connection even when login, chdir or list raises
      ftp.close
    end
  end

  [release, file]
end
|
33
|
+
|
34
|
+
# Path on the FTP server of the MySQL dump directory for the organism,
# built from the release and database name given by ftp_name_for.
def self.ftp_directory_for(organism)
  release, core_db = ftp_name_for(organism)
  path_parts = ['/pub/', release, 'mysql', core_db]
  File.join(*path_parts)
end
|
38
|
+
|
39
|
+
# Full ftp:// URL of the organism's MySQL dump directory on SERVER.
def self.base_url(organism)
  server_root = "ftp://" + SERVER
  directory = ftp_directory_for(organism)
  File.join(server_root, directory)
end
|
42
|
+
|
43
|
+
# URL of the gzipped text dump ("<table>.txt.gz") of a table inside the
# organism's dump directory.
def self.url_for(organism, table)
  dump_name = "#{table}.txt.gz"
  [base_url(organism), dump_name] * "/"
end
|
46
|
+
|
47
|
+
# Tells whether the organism's schema dump ("<database>.sql.gz" in its
# dump directory) contains a CREATE TABLE statement for the given table.
#
# Returns true or false.
def self.has_table?(organism, table)
  # base_url may perform an FTP lookup, so resolve it only once
  url = base_url(organism)
  schema = Open.read("#{url}/#{File.basename(url)}.sql.gz")

  # Only the /m flag is wanted (dot matches newline). In Ruby /s selects
  # the Windows-31J encoding and breaks matching against UTF-8 text.
  !schema.match(/^CREATE TABLE .#{Regexp.escape(table.to_s)}. \(.*?^\)/m).nil?
end
|
51
|
+
|
52
|
+
# Lists the column names of a table, read from the CREATE TABLE statement
# in the organism's schema dump ("<database>.sql.gz").
#
# Returns an array with the backtick-quoted names that start each line of
# the table definition, in order of appearance.
# Raises RuntimeError when the schema has no definition for the table.
def self.fields_for(organism, table)
  # base_url may perform an FTP lookup, so resolve it only once
  url = base_url(organism)
  schema = Open.read("#{url}/#{File.basename(url)}.sql.gz")

  # Only the /m flag is wanted (dot matches newline). In Ruby /s selects
  # the Windows-31J encoding and breaks matching against UTF-8 text.
  definition = schema.match(/^CREATE TABLE .#{Regexp.escape(table.to_s)}. \((.*?)^\)/m)
  raise "Table '#{table}' not found in Ensembl schema for #{organism}" if definition.nil?

  definition[1].scan(/^\s+`(.*?)`/).flatten
end
|
58
|
+
|
59
|
+
# Opens the dump of an Ensembl table as a TSV.
#
# organism  - organism code, optionally with build ("Hsa/may2012")
# table     - table name
# key_field - name of the column to use as key (optional)
# fields    - array of column names to load (optional)
# options   - extra options handed to TSV.open; the hash is not modified
#
# When both key_field and fields are given, their names are translated to
# column positions using fields_for and passed as :key_field and :fields.
# The resulting TSV always gets key_field and fields assigned as given,
# which may be nil.
def self.ensembl_tsv(organism, table, key_field = nil, fields = nil, options = {})
  url = url_for(organism, table)

  # Work on a copy so the caller's hash is left untouched
  options = options.dup

  if key_field and fields
    all_fields = fields_for(organism, table)
    options[:key_field] = all_fields.index key_field
    options[:fields] = fields.collect { |f| all_fields.index f }
  end

  tsv = TSV.open(url, options)
  tsv.key_field = key_field
  tsv.fields = fields
  tsv
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
if __FILE__ == $0
|
78
|
+
ddd Ensembl::FTP.ensembl_tsv("Hsa/may2012", 'exon')
|
79
|
+
end
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -10,7 +10,7 @@ module Entrez
|
|
10
10
|
Rbbt.claim Rbbt.share.databases.entrez.gene2pubmed, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
|
11
11
|
|
12
12
|
def self.entrez2native(taxs, options = {})
|
13
|
-
options = Misc.add_defaults options, :key_field => 1, :fields => 5, :persist => true, :merge => true
|
13
|
+
options = Misc.add_defaults options, :key_field => 1, :fields => [5], :persist => true, :merge => true
|
14
14
|
|
15
15
|
taxs = [taxs] unless Array === taxs
|
16
16
|
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
|
@@ -22,7 +22,7 @@ module Entrez
|
|
22
22
|
end
|
23
23
|
|
24
24
|
def self.entrez2name(taxs, options = {})
|
25
|
-
options = Misc.add_defaults options, :key_field => 1, :fields => 2, :persist => true, :merge => true
|
25
|
+
options = Misc.add_defaults options, :key_field => 1, :fields => [2], :persist => true, :merge => true
|
26
26
|
|
27
27
|
taxs = [taxs] unless Array === taxs
|
28
28
|
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/util/open'
|
3
|
+
require 'rbbt/resource'
|
4
|
+
|
5
|
+
# Resource module for the variant sites file referenced by RELEASE_URL.
module Genomes1000
  extend Resource
  self.subdir = "share/databases/genomes_1000"

  RELEASE_URL = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.wgs.phase1_release_v3.20101123.snps_indels_sv.sites.vcf.gz"

  # Single-valued TSV mapping each "Variant ID" to a "Genomic Mutation"
  # string of the form chromosome:position:alternative.
  Genomes1000.claim Genomes1000.mutations, :proc do
    variants = TSV.setup({}, :key_field => "Variant ID", :fields => ["Genomic Mutation"], :type => :single)

    Open.read(RELEASE_URL) do |line|
      # Lines starting with "#" are skipped
      next if line[0] == "#"[0]

      # Only the first, second, third and fifth tab-separated columns are used
      chromosome, position, id, _reference, alternative = line.split("\t")

      variants[id] = [chromosome, position, alternative].join(":")
    end

    variants.namespace = "Hsa"

    variants.to_s
  end

  # Same table with every mutation passed through Organism.liftOver from
  # "Hsa/jun2011" to "Hsa/may2009".
  Genomes1000.claim Genomes1000.mutations_hg18, :proc do
    require 'rbbt/sources/organism'

    hg19_tsv = Genomes1000.mutations.tsv :unnamed => true

    hg19_mutations = hg19_tsv.values

    translations = Misc.process_to_hash(hg19_mutations) do |list|
      Organism.liftOver(list, "Hsa/jun2011", "Hsa/may2009")
    end

    hg18_tsv = hg19_tsv.process("Genomic Mutation") { |mutation| translations[mutation] }

    hg18_tsv.namespace = "Hsa/may2009"

    hg18_tsv.to_s
  end

end
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -97,9 +97,12 @@ if defined? Entity
|
|
97
97
|
@name ||= GO.id2name(self)
|
98
98
|
end
|
99
99
|
|
100
|
-
property :genes => :array2single do |
|
100
|
+
property :genes => :array2single do |*args|
|
101
|
+
organism = args.first
|
101
102
|
organism ||= self.organism
|
102
|
-
|
103
|
+
res = Organism.gene_go(organism).tsv(:persist => true, :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :merge => true).values_at *self
|
104
|
+
res.collect{|r| r.organism = organism if r and r.respond_to? :organism}
|
105
|
+
res
|
103
106
|
end
|
104
107
|
|
105
108
|
property :description => :single2array do
|
@@ -114,12 +117,21 @@ if defined? Entity
|
|
114
117
|
if defined? Gene and Entity === Gene
|
115
118
|
module Gene
|
116
119
|
property :go_terms => :array2single do
|
117
|
-
@go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
|
120
|
+
@go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).values_at *self.ensembl
|
118
121
|
end
|
119
122
|
|
120
123
|
property :go_bp_terms => :array2single do
|
121
|
-
@go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
|
124
|
+
@go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).values_at *self.ensembl
|
122
125
|
end
|
126
|
+
|
127
|
+
property :go_cc_terms => :array2single do
|
128
|
+
@go_cc_terms ||= Organism.gene_go_cc(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).values_at *self.ensembl
|
129
|
+
end
|
130
|
+
|
131
|
+
property :go_mf_terms => :array2single do
|
132
|
+
@go_mf_terms ||= Organism.gene_go_mf(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).values_at *self.ensembl
|
133
|
+
end
|
134
|
+
|
123
135
|
end
|
124
136
|
end
|
125
137
|
end
|
@@ -1,18 +1,80 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rbbt/resource'
|
3
|
-
require 'rbbt/resource/with_key'
|
4
3
|
|
5
4
|
module Organism
|
6
5
|
extend Resource
|
7
6
|
self.pkgdir = "rbbt"
|
8
7
|
self.subdir = "share/organisms"
|
9
8
|
|
10
|
-
|
9
|
+
# Names of the entries matching '???' (three characters) under the shared
# Organism install directory.
def self.installable_organisms
  install_dir = Rbbt.share.install.Organism.find
  install_dir.glob('???').map { |path| File.basename(path) }
end
|
12
|
+
|
13
|
+
|
14
|
+
Organism.installable_organisms.each do |organism|
|
11
15
|
claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
|
12
16
|
|
13
17
|
module_eval "#{ organism } = with_key '#{organism}'"
|
14
18
|
end
|
15
19
|
|
20
|
+
Rbbt.claim Rbbt.software.opt.bin.liftOver, :url, "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
|
21
|
+
|
22
|
+
# Maps an organism/build string to a human genome assembly name.
#
# organism - "Hsa", optionally followed by "/" and a build name that is a
#            key of Ensembl.releases (e.g. "Hsa/may2009").
#
# Returns 'hg19' when no build is given or when the number after the last
# "-" in the release name is above 54, and 'hg18' otherwise.
# Raises RuntimeError for organisms not starting with "Hsa" and for builds
# missing from Ensembl.releases.
def self.hg_build(organism)
  # Required here rather than at file level (NOTE(review): presumably
  # because ensembl_ftp itself requires rbbt/sources/organism; confirm)
  require 'rbbt/sources/ensembl_ftp'

  raise "Only organism 'Hsa' (Homo sapiens) supported" unless organism =~ /^Hsa/

  return 'hg19' unless organism =~ /\//
  date = organism.split("/")[1]

  release = Ensembl.releases[date]
  # Fail with a readable message instead of a NoMethodError on nil
  raise "Unknown Ensembl build '#{date}' for organism #{organism}" if release.nil?

  release.sub(/.*-/, '').to_i > 54 ? 'hg19' : 'hg18'
end
|
34
|
+
|
35
|
+
# Translates genomic positions between the hg18 and hg19 assemblies using
# the UCSC liftOver executable and chain files.
#
# positions - array of strings "chromosome:position[:rest...]"
# source    - organism/build of the given positions (see hg_build)
# target    - organism/build to translate to (see hg_build)
#
# Returns an array parallel to positions holding the translated strings;
# entries that do not appear in the liftOver output are nil. When source
# and target are not an hg18/hg19 pair, positions is returned unchanged.
def self.liftOver(positions, source, target)
  source_hg = hg_build(source)
  target_hg = hg_build(target)

  map_url =
    if source_hg == 'hg19' and target_hg == 'hg18'
      "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg18.over.chain.gz"
    elsif source_hg == 'hg18' and target_hg == 'hg19'
      "http://hgdownload.cse.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz"
    end

  # Nothing to translate between identical (or unsupported) assemblies
  return positions if map_url.nil?

  # One BED line per position: chrN, position - 1, position, and the
  # original string as the name column so results can be matched back
  bed_lines = positions.collect do |position|
    chr, pos = position.split(":").values_at(0, 1)
    ["chr#{chr}", pos.to_i - 1, pos, position] * "\t"
  end
  positions_bed = bed_lines * "\n" + "\n"
  new_positions = {}

  TmpFile.with_file(positions_bed) do |source_bed|
    TmpFile.with_file() do |unmapped_file|
      TmpFile.with_file() do |map_file|

        Open.write(map_file, Open.read(map_url))
        TmpFile.with_file() do |target_bed|
          # The mode must be an octal literal; decimal 755 sets the wrong bits
          FileUtils.chmod(0755, Rbbt.software.opt.bin.liftOver.produce.find)
          CMD.cmd("#{Rbbt.software.opt.bin.liftOver.find} '#{source_bed}' '#{map_file}' '#{target_bed}' '#{unmapped_file}'").read
          Open.read(target_bed) do |line|
            chr, _start, position, name = line.chomp.split("\t")
            chr = chr.sub(/chr/, '')

            # Keep whatever followed chromosome and position in the original
            _old_chr, _old_position, *rest = name.split(":")
            new_positions[name] = ([chr, position].concat rest) * ":"
          end
        end
      end
    end
  end

  positions.collect { |position| new_positions[position] }
end
|
77
|
+
|
16
78
|
class OrganismNotProcessedError < StandardError; end
|
17
79
|
|
18
80
|
def self.attach_translations(org, tsv, target = nil, fields = nil, options = {})
|
@@ -53,30 +115,36 @@ module Organism
|
|
53
115
|
end
|
54
116
|
end
|
55
117
|
|
56
|
-
def self.guess_id(org, values, identifiers = nil)
|
57
|
-
identifiers ||= TSV.setup(Organism.identifiers(org), :persist => true)
|
58
|
-
field_matches = identifiers.field_matches(values)
|
59
|
-
field_matches.sort_by{|field, matches| matches.uniq.length}.last
|
60
|
-
end
|
61
|
-
|
62
118
|
def self.guess_id(org, values)
|
63
119
|
field_matches = TSV.field_match_counts(Organism.identifiers(org).find, values)
|
64
120
|
field_matches.sort_by{|field, count| count.to_i}.last
|
65
121
|
end
|
66
122
|
|
67
|
-
|
68
123
|
def self.organisms
|
69
124
|
Dir.glob(File.join(Organism.root.find, '*')).collect{|f| File.basename(f)}
|
70
125
|
end
|
71
126
|
|
72
|
-
def self.
|
73
|
-
Organism
|
127
|
+
def self.scientific_name(organism)
|
128
|
+
Organism[organism]["scientific_name"].produce.read.strip
|
74
129
|
end
|
75
130
|
|
76
131
|
def self.organism(name)
|
77
132
|
organisms.select{|organism|
|
78
|
-
organism == name or Organism.
|
133
|
+
organism == name or Organism.scientific_name(organism) =~ /#{ name }/i
|
79
134
|
}.first
|
80
135
|
end
|
81
136
|
|
137
|
+
# All field names reported by a TSV::Parser opened on the organism's
# identifiers file.
def self.known_ids(name)
  identifiers_stream = Organism.identifiers(name).open
  parser = TSV::Parser.new(identifiers_stream)
  parser.all_fields
end
|
140
|
+
|
141
|
+
# Finds the first installable organism whose entrez_taxids file has a line
# equal to the given taxid.
#
# Returns the organism name.
# Raises RuntimeError listing the installable organisms when none matches.
def self.entrez_taxid_organism(taxid)
  candidates = Organism.installable_organisms
  wanted = taxid.to_s

  match = candidates.find do |code|
    Organism.entrez_taxids(code).read.split("\n").include?(wanted)
  end
  return match if match

  raise "No organism identified for taxid #{taxid}. Supported organism are: #{candidates * ", "}"
end
|
82
150
|
end
|
data/lib/rbbt/sources/pfam.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rbbt/tsv'
|
3
3
|
require 'rbbt/resource'
|
4
|
+
require 'rbbt/entity'
|
5
|
+
require 'rbbt/sources/InterPro'
|
4
6
|
|
5
7
|
module Pfam
|
6
8
|
extend Resource
|
@@ -12,24 +14,82 @@ module Pfam
|
|
12
14
|
tsv.to_s
|
13
15
|
end
|
14
16
|
|
15
|
-
NAMES_FILE =
|
17
|
+
NAMES_FILE = InterPro.pfam_names.find
|
16
18
|
|
17
19
|
def self.name_index
|
18
|
-
@name_index ||= TSV.open NAMES_FILE, :single
|
20
|
+
@name_index ||= TSV.open NAMES_FILE, :single, :unnamed => true
|
19
21
|
end
|
20
22
|
|
21
23
|
def self.name(id)
|
22
|
-
name_index[id]
|
24
|
+
name_index[id] || id
|
23
25
|
end
|
24
26
|
end
|
25
27
|
|
28
|
+
module InterPro
|
29
|
+
def self.pfam_index
|
30
|
+
@@pfam_index ||= InterPro.pfam_equivalences.tsv(:persist => true, :key_field => "InterPro ID", :fields => ["Pfam Domain"])
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
InterPro.claim InterPro.pfam_names.find, :proc do
|
35
|
+
pfam_domains = Pfam.domains.read.split("\n").collect{|l| l.split("\t").first}.compact.flatten
|
36
|
+
tsv = nil
|
37
|
+
TmpFile.with_file(pfam_domains * "\n") do |tmpfile|
|
38
|
+
tsv = TSV.open(CMD.cmd("cut -f 4,3 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true), :key_field => 1, :fields => [0], :type => :single)
|
39
|
+
end
|
40
|
+
tsv.key_field = "InterPro ID"
|
41
|
+
tsv.fields = ["Domain Name"]
|
42
|
+
tsv.to_s
|
43
|
+
end
|
44
|
+
|
45
|
+
InterPro.claim InterPro.pfam_equivalences.find, :proc do
|
46
|
+
pfam_domains = Pfam.domains.read.split("\n").collect{|l| l.split("\t").first}.compact.flatten
|
47
|
+
tsv = nil
|
48
|
+
TmpFile.with_file(pfam_domains * "\n") do |tmpfile|
|
49
|
+
tsv = TSV.open(CMD.cmd("cut -f 2,4 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true), :key_field => 0, :fields => [1], :type => :single)
|
50
|
+
end
|
51
|
+
tsv.key_field = "InterPro ID"
|
52
|
+
tsv.fields = ["Pfam Domain"]
|
53
|
+
tsv.to_s
|
54
|
+
end
|
55
|
+
|
56
|
+
|
26
57
|
if defined? Entity
|
27
58
|
module PfamDomain
|
28
59
|
extend Entity
|
29
60
|
self.format = "Pfam Domain"
|
61
|
+
self.format = "Pfam Domain ID"
|
62
|
+
|
63
|
+
self.annotation :organism
|
30
64
|
|
31
65
|
property :name => :array2single do
|
32
66
|
self.collect{|id| Pfam.name(id)}
|
33
67
|
end
|
68
|
+
|
69
|
+
property :genes => :array2single do
|
70
|
+
@genes ||= Organism.gene_pfam(organism).tsv(:key_field => "Pfam Domain", :fields => ["Ensembl Gene ID"], :persist => true, :merge => true, :type => :flat, :namespace => organism).values_at *self
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
|
75
|
+
module InterProDomain
|
76
|
+
property :pfam => :array2single do
|
77
|
+
InterPro.pfam_index.values_at(*self).
|
78
|
+
each{|domain| domain.organism = organism if domain.respond_to? :organism }
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
if defined? Gene and Entity === Gene
|
83
|
+
module Gene
|
84
|
+
INDEX_CACHE = {}
|
85
|
+
|
86
|
+
property :pfam_domains => :array2single do
|
87
|
+
index = INDEX_CACHE[organism] ||= Organism.gene_pfam(organism).tsv(:persist => true, :type => :flat, :fields => ["Pfam Domain"], :key_field => "Ensembl Gene ID", :namespace => organism)
|
88
|
+
@pfam_domains ||= index.values_at *self.ensembl
|
89
|
+
end
|
90
|
+
|
91
|
+
end
|
34
92
|
end
|
35
93
|
end
|
94
|
+
|
95
|
+
|
data/lib/rbbt/sources/pubmed.rb
CHANGED
@@ -15,7 +15,8 @@ module PubMed
|
|
15
15
|
|
16
16
|
url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
|
17
17
|
articles = []
|
18
|
-
|
18
|
+
|
19
|
+
Misc.divide(pmids.sort_by{|v| v.nil? ? 0 : v.to_i}, (pmids.length / 1000) + 1).each do |pmid_list|
|
19
20
|
postdata = "db=pubmed&retmode=xml&id=#{pmid_list* ","}"
|
20
21
|
xml = TmpFile.with_file(postdata) do |postfile|
|
21
22
|
Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed", "--post-file=" => postfile)
|
@@ -130,7 +131,13 @@ module PubMed
|
|
130
131
|
|
131
132
|
def pdf_url
|
132
133
|
return pmc_pdf if pmc_pdf
|
133
|
-
@gscholar_pdf ||=
|
134
|
+
@gscholar_pdf ||= begin
|
135
|
+
GoogleScholar::full_text_url title
|
136
|
+
rescue
|
137
|
+
Log.medium "GoogleScholar#full_text failed: #{title}"
|
138
|
+
sleep 0.1
|
139
|
+
nil
|
140
|
+
end
|
134
141
|
end
|
135
142
|
|
136
143
|
def full_text
|
@@ -140,7 +147,7 @@ module PubMed
|
|
140
147
|
TmpFile.with_file do |pdf|
|
141
148
|
|
142
149
|
# Change user-agent, oh well...
|
143
|
-
`wget --user-agent=firefox #{ pdf_url } -O #{ pdf }`
|
150
|
+
`wget --user-agent=firefox #{ pdf_url } -O #{ pdf } -t 3`
|
144
151
|
TmpFile.with_file do |txt|
|
145
152
|
`pdftotext #{ pdf } #{ txt }`
|
146
153
|
text = Open.read(txt) if File.exists? txt
|
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
|
4
|
+
# Resource module claiming three TSV files built from Reactome downloads.
module Reactome
  extend Resource
  self.subdir = "share/databases/Reactome"

  # UniProt accession => Reactome pathway IDs (columns 0 and 1 of the download)
  Reactome.claim Reactome.protein_pathways, :proc do
    source = Open.open("http://www.reactome.org/download/current/uniprot_2_pathways.stid.txt")
    pathways = TSV.open(source, :key_field => 0, :fields => [1], :merge => true, :type => :double)
    pathways.key_field = "UniProt/SwissProt Accession"
    pathways.fields = ["Reactome Pathway ID"]
    pathways.namespace = "Hsa"
    pathways.to_s
  end

  # Reactome pathway ID => pathway name (columns 1 and 2 of the same download)
  Reactome.claim Reactome.pathway_names, :proc do
    source = Open.open("http://www.reactome.org/download/current/uniprot_2_pathways.stid.txt")
    names = TSV.open(source, :key_field => 1, :fields => [2], :type => :single)
    names.key_field = "Reactome Pathway ID"
    names.fields = ["Pathway Name"]
    names.namespace = "Hsa"
    names.to_s
  end

  # Interaction table; the download is piped through cut and sed before parsing
  Reactome.claim Reactome.protein_protein, :proc do
    source = Open.open("http://www.reactome.org/download/current/homo_sapiens.interactions.txt.gz")
    cleaned = CMD.cmd('cut -f 1,4,7,8,9|sed "s/UniProt://g;s/,/;/g"', :in => source, :pipe => true)
    interactions = TSV.open(cleaned, :type => :double, :merge => true)
    interactions.key_field = "UniProt/SwissProt Accession"
    interactions.fields = ["Interactor UniProt/SwissProt Accession", "Interaction type", "Reactions", "PMID"]
    interactions.namespace = "Hsa"
    interactions.to_s
  end

end
|
36
|
+
|
37
|
+
if defined? Entity
|
38
|
+
module ReactomePathway
|
39
|
+
extend Entity
|
40
|
+
self.format = "Reactome Pathway ID"
|
41
|
+
|
42
|
+
self.annotation :organism
|
43
|
+
|
44
|
+
def self.name_index
|
45
|
+
@name_index ||= Reactome.pathway_names.tsv(:persist => true, :key_field => "Reactome Pathway ID", :fields => ["Pathway Name"], :type => :single)
|
46
|
+
end
|
47
|
+
|
48
|
+
def self.gene_index
|
49
|
+
@gene_index ||= Reactome.protein_pathways.tsv(:persist => true, :key_field => "Reactome Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true)
|
50
|
+
end
|
51
|
+
|
52
|
+
# Tells whether an entity matches a query.
#
# query   - value compared against the entity and searched for in its name
# field   - passed to setup as the :format option
# options - extra options for setup; may be nil
# entity  - the entity to test
#
# Returns true when query equals entity, or when the name of the entity
# (after setting up a copy of it) contains query; false otherwise.
def self.filter(query, field = nil, options = nil, entity = nil)
  return true if query == entity

  # options defaults to nil, so fall back to an empty hash before merging
  setup_options = (options || {}).merge(:format => field)
  return true if self.setup(entity.dup, setup_options).name.index query

  false
end
|
59
|
+
|
60
|
+
property :name => :array2single do
|
61
|
+
@name ||= ReactomePathway.name_index.values_at *self
|
62
|
+
end
|
63
|
+
|
64
|
+
property :genes => :array2single do
|
65
|
+
@genes ||= ReactomePathway.gene_index.values_at(*self).
|
66
|
+
each{|gene| gene.organism = organism if gene.respond_to? :organism }
|
67
|
+
end
|
68
|
+
|
69
|
+
property :url => :single do
|
70
|
+
"http://www.reactome.org/cgi-bin/eventbrowser_st_id?ST_ID=#{ self }"
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
if defined? Gene and Entity === Gene
|
75
|
+
module Gene
|
76
|
+
property :reactome_pathways => :array2single do
|
77
|
+
@reactome_pathways ||= Reactome.protein_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["Reactome Pathway ID"], :type => :flat, :merge => true).values_at(*self.to("UniProt/SwissProt Accession")).
|
78
|
+
each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| ReactomePathway.setup(o, organism)}
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|