rbbt-sources 3.3.0 → 3.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/etc/allowed_biomart_archives +2 -4
- data/etc/biomart/missing_in_archive +2 -0
- data/etc/build_organism +4 -4
- data/etc/organisms +1 -0
- data/lib/rbbt/sources/biomart.rb +48 -13
- data/lib/rbbt/sources/ensembl_ftp.rb +31 -15
- data/lib/rbbt/sources/entrez.rb +13 -0
- data/lib/rbbt/sources/go.rb +2 -2
- data/lib/rbbt/sources/mesh.rb +26 -0
- data/lib/rbbt/sources/organism.rb +45 -24
- data/lib/rbbt/sources/pubmed.rb +13 -2
- data/share/install/Organism/{Hsa/Rakefile → Hsa.rake} +23 -15
- data/share/install/Organism/{Mmu/Rakefile → Mmu.rake} +3 -20
- data/share/install/Organism/{Rno/Rakefile → Rno.rake} +3 -8
- data/share/install/Organism/Sce.rake +38 -0
- data/share/install/Organism/organism_helpers.rb +126 -53
- data/share/install/lib/rake_helper.rb +2 -2
- data/test/rbbt/sources/test_biomart.rb +44 -6
- data/test/rbbt/sources/test_ensembl_ftp.rb +11 -0
- data/test/rbbt/sources/test_entrez.rb +5 -0
- data/test/rbbt/sources/test_mesh.rb +10 -0
- data/test/rbbt/sources/test_organism.rb +15 -15
- data/test/rbbt/sources/test_pubmed.rb +18 -8
- metadata +12 -7
- data/share/install/Organism/Sce/Rakefile +0 -52
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3de2796d78be0d34330313646a9885e147eafff0358471450bfe4f2120358aa0
|
4
|
+
data.tar.gz: 54c04d6c10cf6a5e9a442b5151c89951f644154c2144f9bbcd36cfbc7ab939a9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 489a161942fbd6ab46217446c321ccd7d2e72f1e0484f87f1adecb2291fde8ccdf51f7829a48ebf814805096bdc8e4c50d8ce6e33ec5749385b9def84f638198
|
7
|
+
data.tar.gz: 6b517de298e5b72667a6a08cda86662c4dbae379215806689a126bb0ab34b7c2d0cb74d63c43e3a23106a438bf3e632484e0416d1ed79741e589a53d503868f0
|
data/etc/build_organism
CHANGED
data/etc/organisms
CHANGED
data/lib/rbbt/sources/biomart.rb
CHANGED
@@ -3,6 +3,7 @@ require 'rbbt/tsv'
|
|
3
3
|
require 'rbbt/tsv/attach'
|
4
4
|
require 'rbbt/util/log'
|
5
5
|
require 'cgi'
|
6
|
+
require 'rbbt/sources/organism'
|
6
7
|
|
7
8
|
# This module interacts with BioMart. It performs queries to BioMart and
|
8
9
|
# synthesises a hash with the results. Note that this module connects to the
|
@@ -13,7 +14,7 @@ module BioMart
|
|
13
14
|
|
14
15
|
class BioMart::QueryError < StandardError; end
|
15
16
|
|
16
|
-
BIOMART_URL = '
|
17
|
+
BIOMART_URL = 'ensembl.org/biomart/martservice'
|
17
18
|
|
18
19
|
MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.exists? ? Rbbt.etc.biomart.missing_in_archive.find.yaml : {}
|
19
20
|
|
@@ -22,7 +23,7 @@ module BioMart
|
|
22
23
|
@@biomart_query_xml = <<-EOT
|
23
24
|
<?xml version="1.0" encoding="UTF-8"?>
|
24
25
|
<!DOCTYPE Query>
|
25
|
-
<Query completionStamp="1" virtualSchemaName = "
|
26
|
+
<Query completionStamp="1" virtualSchemaName = "<!--VIRTUALSCHEMANAME-->" formatter = "TSV" header = "0" uniqueRows = "1" datasetConfigVersion = "0.6" >
|
26
27
|
<Dataset name = "<!--DATABASE-->" interface = "default" >
|
27
28
|
<!--FILTERS-->
|
28
29
|
<!--MAIN-->
|
@@ -36,14 +37,10 @@ module BioMart
|
|
36
37
|
raise "Biomart archive #{ date } is not allowed in this installation" unless Rbbt.etc.allowed_biomart_archives.find.read.split("\n").include? date
|
37
38
|
end
|
38
39
|
Thread.current['archive'] = date
|
39
|
-
Thread.current['archive_url'] = BIOMART_URL.sub(/www/, date + '.archive')
|
40
|
-
Log.debug "Using Archive URL #{ Thread.current['archive_url'] }"
|
41
40
|
end
|
42
41
|
|
43
42
|
def self.unset_archive
|
44
|
-
Log.debug "Restoring current version URL #{BIOMART_URL}"
|
45
43
|
Thread.current['archive'] = nil
|
46
|
-
Thread.current['archive_url'] = nil
|
47
44
|
end
|
48
45
|
|
49
46
|
def self.with_archive(data)
|
@@ -55,6 +52,21 @@ module BioMart
|
|
55
52
|
end
|
56
53
|
end
|
57
54
|
|
55
|
+
def self.final_url(query, archive = nil, ensembl_domain = nil)
|
56
|
+
url_domain = if archive.nil?
|
57
|
+
if ensembl_domain.nil?
|
58
|
+
'www'
|
59
|
+
else
|
60
|
+
ensembl_domain
|
61
|
+
end
|
62
|
+
elsif ensembl_domain
|
63
|
+
[archive, ensembl_domain] * "-"
|
64
|
+
else
|
65
|
+
[archive, 'archive'] * "."
|
66
|
+
end
|
67
|
+
"http://" + url_domain + "." + BIOMART_URL + "?query=#{query}"
|
68
|
+
end
|
69
|
+
|
58
70
|
def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
59
71
|
open_options = Misc.add_defaults open_options, :wget_options => {"--read-timeout=" => 9000, "--tries=" => 1}
|
60
72
|
repeats = true
|
@@ -75,11 +87,17 @@ module BioMart
|
|
75
87
|
|
76
88
|
query = @@biomart_query_xml.dup
|
77
89
|
query.sub!(/<!--DATABASE-->/,database)
|
90
|
+
if Thread.current["ensembl_domain"]
|
91
|
+
query.sub!(/<!--VIRTUALSCHEMANAME-->/, Thread.current["ensembl_domain"] + "_mart")
|
92
|
+
else
|
93
|
+
query.sub!(/<!--VIRTUALSCHEMANAME-->/,'default')
|
94
|
+
end
|
78
95
|
query.sub!(/<!--FILTERS-->/, filters.collect{|name, v| v.nil? ? "<Filter name = \"#{ name }\" excluded = \"0\"/>" : "<Filter name = \"#{ name }\" value = \"#{Array === v ? v * "," : v}\"/>" }.join("\n") )
|
79
96
|
query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
|
80
97
|
query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
|
81
98
|
|
82
|
-
url = Thread.current[
|
99
|
+
url = final_url(query, Thread.current["archive"], Thread.current["ensembl_domain"])
|
100
|
+
|
83
101
|
|
84
102
|
begin
|
85
103
|
response = Open.read(url, open_options.dup)
|
@@ -105,10 +123,17 @@ module BioMart
|
|
105
123
|
|
106
124
|
new_datafile = TmpFile.tmp_file
|
107
125
|
if data.nil?
|
108
|
-
|
126
|
+
Open.open(result_file) do |file|
|
127
|
+
Open.write(new_datafile, Open.collapse_stream(file))
|
128
|
+
end
|
109
129
|
data = new_datafile
|
110
130
|
else
|
111
|
-
|
131
|
+
Open.open(result_file) do |stream_result|
|
132
|
+
Open.open(data) do |stream_data|
|
133
|
+
Open.write(new_datafile, Open.collapse_stream(TSV.paste_streams([stream_data, stream_result], sort: true, sort_cmd_args: '-s -k1,1'), compact: true))
|
134
|
+
end
|
135
|
+
end
|
136
|
+
#TSV.merge_different_fields Open.open(data), Open.open(result_file), new_datafile, one2one: false, sort: :first
|
112
137
|
FileUtils.rm data
|
113
138
|
data = new_datafile
|
114
139
|
end
|
@@ -142,9 +167,9 @@ module BioMart
|
|
142
167
|
|
143
168
|
IndiferentHash.setup(open_options)
|
144
169
|
|
145
|
-
Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{
|
170
|
+
Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{Log.fingerprint filters}] #{open_options.inspect}"
|
146
171
|
|
147
|
-
max_items =
|
172
|
+
max_items = 1
|
148
173
|
chunks = []
|
149
174
|
chunk = []
|
150
175
|
attrs.each{|a|
|
@@ -178,7 +203,7 @@ module BioMart
|
|
178
203
|
results
|
179
204
|
else
|
180
205
|
Open.write(filename) do |f|
|
181
|
-
f.puts "#: " << Misc.hash2string(TSV
|
206
|
+
f.puts "#: " << Misc.hash2string(TSV.annotations{|key| [key, open_options[key]]})
|
182
207
|
if field_names.nil?
|
183
208
|
f.puts "#" << [main, attrs].flatten * "\t"
|
184
209
|
else
|
@@ -211,7 +236,17 @@ module BioMart
|
|
211
236
|
changes = {}
|
212
237
|
missing.select{|m| m.include? "~" }.each do |str|
|
213
238
|
orig,_sep, new = str.partition "~"
|
214
|
-
|
239
|
+
if orig.include?(":")
|
240
|
+
target_db, _sep, orig = orig.partition(":")
|
241
|
+
if target_db[0] == "-"
|
242
|
+
next if database == target_db[1..-1]
|
243
|
+
else
|
244
|
+
next unless database == target_db
|
245
|
+
end
|
246
|
+
changes[orig] = new
|
247
|
+
else
|
248
|
+
changes[orig] = new
|
249
|
+
end
|
215
250
|
end
|
216
251
|
changed = true
|
217
252
|
while changed
|
@@ -9,11 +9,29 @@ module Ensembl
|
|
9
9
|
module FTP
|
10
10
|
|
11
11
|
SERVER = "ftp.ensembl.org"
|
12
|
+
DOMAIN_SERVER = "ftp.ensemblgenomes.org"
|
12
13
|
|
13
|
-
def self.
|
14
|
+
def self.ftp_name_for_domain(domain, organism, subdir='mysql')
|
15
|
+
code, build = organism.split "/"
|
16
|
+
build ||= "current"
|
17
|
+
|
18
|
+
release = build == "current" ? 'current' : Ensembl.releases[build]
|
19
|
+
name = Organism.scientific_name(organism)
|
20
|
+
ftp = Net::FTP.new(Ensembl::FTP::DOMAIN_SERVER)
|
21
|
+
ftp.passive = true
|
22
|
+
ftp.login
|
23
|
+
dir = File.join('pub', domain, 'current', subdir)
|
24
|
+
ftp.chdir(dir)
|
25
|
+
file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.reject{|f| f =~ /\.gz$/}.collect{|l| l.split(" ").last}.last
|
26
|
+
ftp.close
|
27
|
+
[release, File.join(Ensembl::FTP::DOMAIN_SERVER, dir, file)]
|
14
28
|
end
|
15
29
|
|
16
|
-
def self.ftp_name_for(organism)
|
30
|
+
def self.ftp_name_for(organism, subdir='mysql')
|
31
|
+
if domain = Thread.current["ensembl_domain"]
|
32
|
+
return ftp_name_for_domain(domain, organism,subdir)
|
33
|
+
end
|
34
|
+
|
17
35
|
code, build = organism.split "/"
|
18
36
|
build ||= "current"
|
19
37
|
|
@@ -23,8 +41,9 @@ module Ensembl
|
|
23
41
|
ftp = Net::FTP.new(Ensembl::FTP::SERVER)
|
24
42
|
ftp.passive = true
|
25
43
|
ftp.login
|
26
|
-
|
27
|
-
|
44
|
+
dir = File.join('pub', "current_#{subdir}")
|
45
|
+
ftp.chdir(dir)
|
46
|
+
file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.collect{|l| l.split(" ").last}.last
|
28
47
|
ftp.close
|
29
48
|
else
|
30
49
|
release = Ensembl.releases[build]
|
@@ -32,24 +51,21 @@ module Ensembl
|
|
32
51
|
ftp = Net::FTP.new(Ensembl::FTP::SERVER)
|
33
52
|
ftp.passive = true
|
34
53
|
ftp.login
|
35
|
-
|
36
|
-
|
54
|
+
dir = File.join('pub', release, subdir)
|
55
|
+
ftp.chdir(dir)
|
56
|
+
file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.collect{|l| l.split(" ").last}.last
|
37
57
|
ftp.close
|
38
58
|
end
|
39
|
-
[release, file]
|
59
|
+
[release, File.join(Ensembl::FTP::SERVER, dir, file)]
|
40
60
|
end
|
41
61
|
|
42
|
-
def self.
|
43
|
-
release,
|
44
|
-
|
45
|
-
File.join('/pub/', 'current_mysql', ftp_name)
|
46
|
-
else
|
47
|
-
File.join('/pub/', release, 'mysql', ftp_name)
|
48
|
-
end
|
62
|
+
def self.ftp_url_for(organism)
|
63
|
+
release, ftp_url = ftp_name_for(organism)
|
64
|
+
ftp_url
|
49
65
|
end
|
50
66
|
|
51
67
|
def self.base_url(organism)
|
52
|
-
File.join("ftp://"
|
68
|
+
File.join("ftp://", ftp_url_for(organism) )
|
53
69
|
end
|
54
70
|
|
55
71
|
def self.url_for(organism, table, extension)
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -8,6 +8,19 @@ module Entrez
|
|
8
8
|
|
9
9
|
Rbbt.claim Rbbt.share.databases.entrez.gene_info, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
|
10
10
|
Rbbt.claim Rbbt.share.databases.entrez.gene2pubmed, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
|
11
|
+
Rbbt.claim Rbbt.share.databases.entrez.tax_ids, :proc do |filename|
|
12
|
+
TmpFile.with_dir do |dir|
|
13
|
+
Misc.in_dir dir do
|
14
|
+
CMD.cmd("wget 'https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'")
|
15
|
+
CMD.cmd("tar xvfz taxdump.tar.gz")
|
16
|
+
CMD.cmd("grep 'scientific name' names.dmp |cut -f 1,3 > tmp.tsv")
|
17
|
+
tsv = TSV.open('tmp.tsv', type: :single)
|
18
|
+
tsv.key_field = "Entrez Tax ID"
|
19
|
+
tsv.fields = ["Scientific Name"]
|
20
|
+
Open.write(filename, tsv.to_s)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
11
24
|
|
12
25
|
def self.entrez2native(taxs, options = {})
|
13
26
|
options = Misc.add_defaults options, :key_field => 1, :fields => [5], :persist => true, :merge => true
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -25,8 +25,8 @@ module GO
|
|
25
25
|
# the gene_ontology.obo file and extracts all the fields, although right now,
|
26
26
|
# only the name field is used.
|
27
27
|
def self.init
|
28
|
-
Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
|
29
|
-
info.serializer = :marshal if info.respond_to? :serializer
|
28
|
+
Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true, serializer: :marshal) do |info|
|
29
|
+
#info.serializer = :marshal if info.respond_to? :serializer
|
30
30
|
Rbbt.share.databases.GO.gene_ontology.produce.read.split(/\[Term\]/).each{|term|
|
31
31
|
term_info = {}
|
32
32
|
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
|
4
|
+
module MeSH
|
5
|
+
extend Resource
|
6
|
+
|
7
|
+
self.subdir = "share/databases/MeSH"
|
8
|
+
|
9
|
+
MeSH.claim MeSH["data.gz"], :url, "https://nlmpubs.nlm.nih.gov/projects/mesh/rdf/mesh.nt.gz"
|
10
|
+
|
11
|
+
MeSH.claim MeSH.vocabulary, :proc do
|
12
|
+
dumper = TSV::Dumper.new :key_field => "MeSH ID", :fields => ["Label"], :type => :single
|
13
|
+
dumper.init
|
14
|
+
TSV.traverse MeSH.data, :type => :array, :into => dumper, :bar => "Processing MeSH vocab" do |line|
|
15
|
+
sub, verb, obj = line.split("\t")
|
16
|
+
|
17
|
+
next unless verb && verb.include?("rdf-schema#label")
|
18
|
+
|
19
|
+
id = sub.split("/").last[0..-2]
|
20
|
+
label = obj.split('"')[1]
|
21
|
+
|
22
|
+
[id, label]
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
@@ -9,6 +9,10 @@ module Organism
|
|
9
9
|
ARCHIVE_MONTH_INDEX = {}
|
10
10
|
%w(jan feb mar apr may jun jul aug sep oct nov dec).each_with_index{|d,i| ARCHIVE_MONTH_INDEX[d] = i }
|
11
11
|
|
12
|
+
def self.rake_organism_helper
|
13
|
+
Rbbt.share.install.Organism["organism_helpers.rb"].find
|
14
|
+
end
|
15
|
+
|
12
16
|
def self.compare_archives(a1, a2)
|
13
17
|
a1 = a1.partition("/").last if a1 and a1.include? "/"
|
14
18
|
a2 = a2.partition("/").last if a2 and a2.include? "/"
|
@@ -29,7 +33,8 @@ module Organism
|
|
29
33
|
end
|
30
34
|
|
31
35
|
def self.default_code(organism = "Hsa")
|
32
|
-
|
36
|
+
latest = Rbbt.etc.allowed_biomart_archives.list.sort{|a,b| compare_archives(a, b)}.last
|
37
|
+
organism.split("/").first << "/" << latest
|
33
38
|
end
|
34
39
|
|
35
40
|
def self.organism_codes(organism = nil)
|
@@ -43,7 +48,8 @@ module Organism
|
|
43
48
|
end
|
44
49
|
|
45
50
|
def self.installed_organisms
|
46
|
-
Rbbt.share.install.Organism.find.glob('???').collect{|f| File.basename(f)}
|
51
|
+
Rbbt.share.install.Organism.find.glob('???').collect{|f| File.basename(f) } +
|
52
|
+
Rbbt.share.install.Organism.find.glob('*.rake').collect{|f| File.basename(f).sub(/\.rake/, '') }
|
47
53
|
end
|
48
54
|
|
49
55
|
def self.prepared_organisms
|
@@ -62,25 +68,6 @@ module Organism
|
|
62
68
|
nil
|
63
69
|
end
|
64
70
|
|
65
|
-
Organism.installable_organisms.each do |organism|
|
66
|
-
claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
|
67
|
-
|
68
|
-
module_eval "#{ organism } = with_key '#{organism}'"
|
69
|
-
end
|
70
|
-
|
71
|
-
Rbbt.claim Rbbt.software.opt.bin.liftOver, :proc do |file|
|
72
|
-
Open.mkdir File.dirname(file) unless File.directory?(file)
|
73
|
-
url = "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
|
74
|
-
CMD.cmd_log("wget '#{url}' -O '#{file}'")
|
75
|
-
CMD.cmd("chmod 0755 '#{file}'")
|
76
|
-
Rbbt.set_software_env
|
77
|
-
nil
|
78
|
-
end
|
79
|
-
|
80
|
-
CMD.tool :liftOver, Rbbt.software.opt.bin.liftOver
|
81
|
-
|
82
|
-
Rbbt.set_software_env
|
83
|
-
|
84
71
|
def self.hg_build(organism)
|
85
72
|
require 'rbbt/sources/ensembl_ftp'
|
86
73
|
organism = organism.strip
|
@@ -257,7 +244,16 @@ module Organism
|
|
257
244
|
end
|
258
245
|
|
259
246
|
def self.scientific_name(organism)
|
260
|
-
Organism[organism]
|
247
|
+
Organism[organism].scientific_name.read.strip
|
248
|
+
end
|
249
|
+
|
250
|
+
def self.make_organism(name, long = false)
|
251
|
+
first, _, second = name.partition(/[ _]/)
|
252
|
+
if long
|
253
|
+
first[0].upcase + second.downcase.gsub(/[^a-z]/,'')
|
254
|
+
else
|
255
|
+
first[0].upcase + second[0..1].downcase
|
256
|
+
end
|
261
257
|
end
|
262
258
|
|
263
259
|
def self.organism(name)
|
@@ -295,7 +291,7 @@ module Organism
|
|
295
291
|
organism ||= "Hsa"
|
296
292
|
|
297
293
|
@@gene_start_end ||= {}
|
298
|
-
gene_start_end = @@gene_start_end[organism] ||= Organism.gene_positions(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["Gene Start", "Gene End"], :type => :list, :cast => :to_i, :
|
294
|
+
gene_start_end = @@gene_start_end[organism] ||= Organism.gene_positions(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["Gene Start", "Gene End"], :type => :list, :cast => :to_i, :unnamed => true)
|
299
295
|
|
300
296
|
ranges = genes.collect{|gene|
|
301
297
|
start, eend = gene_start_end[gene]
|
@@ -339,7 +335,8 @@ module Organism
|
|
339
335
|
def self.chromosome_sizes(organism = Organism.default_code("Hsa"))
|
340
336
|
chromosome_sizes = {}
|
341
337
|
|
342
|
-
Organism
|
338
|
+
Organism.chromosomes(organism).produce.tsv.each do |chr|
|
339
|
+
file = Organism[organism]["chromosome_#{chr}"].produce.find
|
343
340
|
chromosome = file.split("_").last.split(".").first
|
344
341
|
size = if Open.gzip?(file) || Open.bgzip?(file)
|
345
342
|
CMD.cmd("zcat '#{ file }' | wc -c ").read
|
@@ -352,4 +349,28 @@ module Organism
|
|
352
349
|
chromosome_sizes
|
353
350
|
end
|
354
351
|
|
352
|
+
Rbbt.claim Rbbt.software.opt.bin.liftOver, :proc do |file|
|
353
|
+
Open.mkdir File.dirname(file) unless File.directory?(file)
|
354
|
+
url = "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
|
355
|
+
CMD.cmd_log("wget '#{url}' -O '#{file}'")
|
356
|
+
CMD.cmd("chmod 0755 '#{file}'")
|
357
|
+
Rbbt.set_software_env
|
358
|
+
nil
|
359
|
+
end
|
360
|
+
|
361
|
+
CMD.tool :liftOver, Rbbt.software.opt.bin.liftOver
|
362
|
+
|
363
|
+
Rbbt.set_software_env
|
364
|
+
|
365
|
+
Organism.installable_organisms.each do |organism|
|
366
|
+
if Rbbt.share.install.Organism[organism].Rakefile.exists?
|
367
|
+
rakefile = Rbbt.share.install.Organism[organism].Rakefile.find
|
368
|
+
else
|
369
|
+
rakefile = Rbbt.share.install.Organism[organism + '.rake'].find
|
370
|
+
end
|
371
|
+
|
372
|
+
claim Organism[organism], :rake, rakefile
|
373
|
+
|
374
|
+
module_eval "#{ organism } = with_key '#{organism}'"
|
375
|
+
end
|
355
376
|
end
|
data/lib/rbbt/sources/pubmed.rb
CHANGED
@@ -51,6 +51,7 @@ module PubMed
|
|
51
51
|
end
|
52
52
|
[lastname.gsub(/\s/,'_'), year || "NOYEAR", abrev] * ""
|
53
53
|
end
|
54
|
+
|
54
55
|
def self.parse_xml(xml)
|
55
56
|
require 'nokogiri'
|
56
57
|
|
@@ -91,6 +92,16 @@ module PubMed
|
|
91
92
|
[lastname, forename] * ", "
|
92
93
|
end * " and "
|
93
94
|
|
95
|
+
info[:mesh] = parser.search("MeshHeadingList/MeshHeading").collect do |mesh|
|
96
|
+
descriptor = mesh.search("DescriptorName").first.attr('UI')
|
97
|
+
qualifiers = mesh.search("QualifierName").collect{|q| q.attr('UI')}
|
98
|
+
[descriptor] + qualifiers.collect{|q| descriptor + q }
|
99
|
+
end.compact.flatten
|
100
|
+
|
101
|
+
info[:substance] = parser.search("NameOfSubstance").collect do |substance|
|
102
|
+
substance.attr('UI')
|
103
|
+
end
|
104
|
+
|
94
105
|
info[:bibentry] = bibentry.downcase if bibentry
|
95
106
|
|
96
107
|
info[:pmc_pdf] = parser.search("PubmedData/ArticleIdList/ArticleId").select{|id| id[:IdType] == "pmc"}.first
|
@@ -102,7 +113,7 @@ module PubMed
|
|
102
113
|
info
|
103
114
|
end
|
104
115
|
|
105
|
-
attr_accessor :title, :abstract, :journal, :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf, :pdf_url
|
116
|
+
attr_accessor :title, :abstract, :journal, :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf, :pdf_url, :mesh, :substance
|
106
117
|
attr_accessor *XML_KEYS.collect{|p| p.first }
|
107
118
|
|
108
119
|
def initialize(xml)
|
@@ -141,7 +152,7 @@ module PubMed
|
|
141
152
|
`wget --user-agent=firefox #{ pdf_url } -O #{ pdf } -t 3`
|
142
153
|
TmpFile.with_file do |txt|
|
143
154
|
`pdftotext #{ pdf } #{ txt }`
|
144
|
-
text = Open.read(txt) if File.
|
155
|
+
text = Open.read(txt) if File.exist?(txt)
|
145
156
|
end
|
146
157
|
end
|
147
158
|
text
|
@@ -1,8 +1,3 @@
|
|
1
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
-
require 'rbbt/sources/biomart'
|
3
|
-
require 'rbbt/sources/entrez'
|
4
|
-
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
|
-
|
6
1
|
$taxs = [9606]
|
7
2
|
$scientific_name = "Homo sapiens"
|
8
3
|
$ortholog_key = "hsapiens_homolog_ensembl_gene"
|
@@ -95,17 +90,30 @@ $biomart_identifiers = [
|
|
95
90
|
[ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
|
96
91
|
]
|
97
92
|
|
98
|
-
$
|
99
|
-
|
100
|
-
|
93
|
+
$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
|
94
|
+
Thread.current["namespace"] = $namespace
|
95
|
+
load Organism.rake_organism_helper
|
96
|
+
|
97
|
+
file 'regulators' do |t|
|
98
|
+
regulatory_id = ['Regulatory stable ID', 'regulatory_stable_id']
|
99
|
+
regulatory_fields = [
|
100
|
+
['Chromosome Name','chromosome_name'],
|
101
|
+
['Region Start', 'chromosome_start'],
|
102
|
+
['Region End', 'chromosome_end'],
|
103
|
+
['Feature type', 'feature_type_name'],
|
101
104
|
]
|
105
|
+
regulators = BioMart.tsv('hsapiens_regulatory_feature', regulatory_id, regulatory_fields, [], nil, :type => :list, :namespace => Thread.current['namespace'])
|
106
|
+
|
107
|
+
Misc.sensiblewrite(t.name, regulators.to_s)
|
108
|
+
end
|
102
109
|
|
103
|
-
|
104
|
-
[
|
105
|
-
|
106
|
-
|
110
|
+
file 'regulator_activity' do |t|
|
111
|
+
regulatory_id = ['Regulatory stable ID', 'regulatory_stable_id']
|
112
|
+
regulatory_fields = [
|
113
|
+
['Epigenome name','epigenome_name'],
|
114
|
+
['Activity', 'activity'],
|
107
115
|
]
|
116
|
+
regulators = BioMart.tsv('hsapiens_regulatory_feature', regulatory_id, regulatory_fields, [], nil, :type => :double, :namespace => Thread.current['namespace'])
|
108
117
|
|
109
|
-
|
110
|
-
|
111
|
-
load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
|
118
|
+
Misc.sensiblewrite(t.name, regulators.to_s)
|
119
|
+
end
|
@@ -1,8 +1,3 @@
|
|
1
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
-
require 'rbbt/sources/biomart'
|
3
|
-
require 'rbbt/sources/entrez'
|
4
|
-
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
|
-
|
6
1
|
$taxs = [10090]
|
7
2
|
$scientific_name = "Mus musculus"
|
8
3
|
$ortholog_key = "mmusculus_homolog_ensembl_gene"
|
@@ -43,18 +38,6 @@ $biomart_identifiers = [
|
|
43
38
|
[ 'EMBL (Genbank) ID' , "embl"] ,
|
44
39
|
]
|
45
40
|
|
46
|
-
$
|
47
|
-
|
48
|
-
|
49
|
-
]
|
50
|
-
|
51
|
-
$biomart_go_2009= [
|
52
|
-
["GO BP ID", 'go_biological_process_id'],
|
53
|
-
["GO MF ID", 'go_molecular_function_id'],
|
54
|
-
["GO CC ID", 'go_cellular_component_id'],
|
55
|
-
]
|
56
|
-
|
57
|
-
$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
|
58
|
-
Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
|
59
|
-
load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
|
60
|
-
|
41
|
+
$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
|
42
|
+
Thread.current["namespace"] = $namespace
|
43
|
+
load Organism.rake_organism_helper
|
@@ -1,8 +1,3 @@
|
|
1
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
-
require 'rbbt/sources/biomart'
|
3
|
-
require 'rbbt/sources/entrez'
|
4
|
-
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
|
-
|
6
1
|
$taxs = [10116]
|
7
2
|
$scientific_name = "Rattus norvegicus"
|
8
3
|
|
@@ -50,6 +45,6 @@ $biomart_protein_identifiers = [
|
|
50
45
|
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"],
|
51
46
|
]
|
52
47
|
|
53
|
-
$namespace = File.basename(
|
54
|
-
Thread.current["namespace"] =
|
55
|
-
load
|
48
|
+
$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
|
49
|
+
Thread.current["namespace"] = $namespace
|
50
|
+
load Organism.rake_organism_helper
|
@@ -0,0 +1,38 @@
|
|
1
|
+
$taxs = [559292,4932]
|
2
|
+
$scientific_name = "Saccharomyces cerevisiae"
|
3
|
+
$ensembl_domain = 'fungi'
|
4
|
+
#$ortholog_key = "yeast_ensembl_gene"
|
5
|
+
|
6
|
+
$biomart_db = 'scerevisiae_eg_gene'
|
7
|
+
|
8
|
+
$biomart_lexicon = [
|
9
|
+
[ 'Associated Gene Name' , "external_gene_name"],
|
10
|
+
]
|
11
|
+
|
12
|
+
$biomart_protein_identifiers = [
|
13
|
+
[ 'Protein ID', "protein_id" ],
|
14
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
15
|
+
[ 'Unigene ID', "unigene" ],
|
16
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
17
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
18
|
+
]
|
19
|
+
|
20
|
+
$biomart_probe_identifiers = [
|
21
|
+
]
|
22
|
+
|
23
|
+
$biomart_identifiers = [
|
24
|
+
[ 'Entrez Gene ID', "entrezgene"],
|
25
|
+
[ 'Ensembl Protein ID', "ensembl_peptide_id" ],
|
26
|
+
[ 'Associated Gene Name', "external_gene_name" ],
|
27
|
+
[ 'Protein ID', "protein_id" ],
|
28
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
29
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
30
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
31
|
+
[ 'EMBL (Genbank) ID' , "embl"] ,
|
32
|
+
[ 'RefSeq DNA' , "refseq_dna"] ,
|
33
|
+
]
|
34
|
+
|
35
|
+
$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
|
36
|
+
Thread.current["namespace"] = $namespace
|
37
|
+
Thread.current["ensembl_domain"] = $ensembl_domain
|
38
|
+
load Organism.rake_organism_helper
|