rbbt-sources 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/sources/COSTART.rb +16 -0
- data/lib/rbbt/sources/CTCAE.rb +6 -0
- data/lib/rbbt/sources/Reactome.rb +16 -0
- data/lib/rbbt/sources/biomart.rb +16 -3
- data/lib/rbbt/sources/entrez.rb +6 -6
- data/lib/rbbt/sources/go.rb +2 -3
- data/lib/rbbt/sources/organism.rb +41 -0
- data/lib/rbbt/sources/polysearch.rb +10 -0
- data/share/install/Organism/Hsa/Rakefile +36 -2
- data/share/install/Organism/Sce/Rakefile +2 -2
- data/test/rbbt/sources/test_biomart.rb +2 -2
- data/test/rbbt/sources/test_entrez.rb +1 -1
- data/test/rbbt/sources/test_organism.rb +13 -1
- metadata +10 -6
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
|
3
|
+
module COSTART

  # Producer for the "COSTART" resource: reads the COSTART HTML listing line by
  # line, keeps the text captured by /^'(.*)',/ from each matching line, and
  # returns the terms joined by newlines under a "#COSTART Terms" header line.
  #
  # The Proc is assigned to a local first: written inline as
  # `Rbbt.claim "COSTART", Proc.new do ... end, 'COSTART'`, the do...end block
  # would attach to the paren-less Rbbt.claim call instead of to Proc.new.
  producer = Proc.new do
    terms = ["#COSTART Terms"]
    Open.open('http://hedwig.mgh.harvard.edu/biostatistics/files/costart.html').lines.each do |line|
      next unless line =~ /^'(.*)',/
      terms << $1
    end

    terms * "\n"
  end

  Rbbt.claim "COSTART", producer, 'COSTART'
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
|
3
|
+
module Reactome

  # Producer for the "Reactome" resource: loads the human interactions dump
  # into a TSV, stripping every "<word characters/spaces>:" prefix from each
  # line via the :fix Proc, then labels the key column and the remaining
  # fields before serialising the table.
  #
  # The Proc is assigned to a local first so that its do...end block attaches
  # to Proc.new rather than to the paren-less Rbbt.claim call.
  producer = Proc.new do
    # The first header names the key column; the rest name the fields. Each
    # identifier type appears once per interactor (#1 / #2).
    headers = ["Uniprot ID#1", "Ensembl Gene ID#1", "Entrez Gene ID#1", "Uniprot ID#2", "Ensembl Gene ID#2", "Entrez Gene ID#2", "Type", "Reaction", "PMID"]

    tsv = TSV.new(Open.open("http://www.reactome.org/download/current/homo_sapiens.interactions.txt.gz"), :fix => Proc.new {|l| l.gsub(/[\w ]+:/, "")})
    tsv.key_field = headers.shift
    tsv.fields = headers

    tsv.to_s
  end

  Rbbt.claim "Reactome", producer, 'Reactome'
end
|
data/lib/rbbt/sources/biomart.rb
CHANGED
@@ -10,6 +10,8 @@ module BioMart
|
|
10
10
|
|
11
11
|
class BioMart::QueryError < StandardError; end
|
12
12
|
|
13
|
+
BIOMART_URL = 'http://biomart.org/biomart/martservice?query='
|
14
|
+
|
13
15
|
private
|
14
16
|
|
15
17
|
@@biomart_query_xml = <<-EOT
|
@@ -23,8 +25,14 @@ module BioMart
|
|
23
25
|
</Dataset>
|
24
26
|
</Query>
|
25
27
|
EOT
|
26
|
-
|
27
28
|
|
29
|
+
# Redirects subsequent queries to an Ensembl archive BioMart.
#
# date - archive label used as the host prefix, e.g. 'may2009'.
#
# The archive URL is derived from BIOMART_URL by replacing the leading
# "biomart." host label with "<date>.archive.ensembl.", so
# 'http://biomart.org/...' becomes 'http://may2009.archive.ensembl.org/...'.
# BioMart.get uses @archive_url instead of BIOMART_URL whenever it is set.
def self.set_archive(date)
  @archive_url = BIOMART_URL.sub(%r{//biomart\.}, "//#{date}.archive.ensembl.")
end

# Clears the archive override so queries go to BIOMART_URL again.
def self.unset_archive
  @archive_url = nil
end
|
28
36
|
|
29
37
|
def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
30
38
|
attrs ||= []
|
@@ -37,8 +45,13 @@ module BioMart
|
|
37
45
|
query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
|
38
46
|
query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
|
39
47
|
|
40
|
-
|
41
|
-
|
48
|
+
if @archive_url
|
49
|
+
response = Open.read(@archive_url + query.gsub(/\n/,' '), open_options)
|
50
|
+
else
|
51
|
+
response = Open.read(BIOMART_URL + query.gsub(/\n/,' '), open_options)
|
52
|
+
end
|
53
|
+
|
54
|
+
if response.empty? or response =~ /Query ERROR:/
|
42
55
|
raise BioMart::QueryError, response
|
43
56
|
end
|
44
57
|
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -5,29 +5,29 @@ require 'set'
|
|
5
5
|
|
6
6
|
module Entrez
|
7
7
|
|
8
|
-
Rbbt.
|
9
|
-
|
8
|
+
Rbbt.claim "gene_info", 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz', 'databases/entrez'
|
9
|
+
Rbbt.claim "gene2pubmed", 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz', 'databases/entrez'
|
10
10
|
|
11
11
|
def self.entrez2native(taxs, options = {})
|
12
|
-
options = Misc.add_defaults options, :
|
12
|
+
options = Misc.add_defaults options, :key => 1, :others => 5, :persistence => true, :merge => true
|
13
13
|
|
14
14
|
taxs = [taxs] unless Array === taxs
|
15
15
|
options.merge! :grep => taxs
|
16
16
|
|
17
|
-
tsv = TSV.new(Rbbt.
|
17
|
+
tsv = TSV.new(Rbbt.files.databases.entrez.gene_info, :flat, options)
|
18
18
|
tsv.key_field = "Entrez Gene ID"
|
19
19
|
tsv.fields = ["Native ID"]
|
20
20
|
tsv
|
21
21
|
end
|
22
22
|
|
23
23
|
def self.entrez2pubmed(taxs)
|
24
|
-
options = {:
|
24
|
+
options = {:key => 1, :others => 2, :persistence => true, :merge => true}
|
25
25
|
|
26
26
|
taxs = [taxs] unless taxs.is_a?(Array)
|
27
27
|
taxs = taxs.collect{|t| t.to_s}
|
28
28
|
options.merge! :grep => taxs
|
29
29
|
|
30
|
-
TSV.new(Rbbt.
|
30
|
+
TSV.new(Rbbt.files.databases.entrez.gene2pubmed, :flat, options)
|
31
31
|
end
|
32
32
|
|
33
33
|
class Gene
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -4,9 +4,8 @@ require 'rbbt-util'
|
|
4
4
|
# now all it does is provide a translation form id to the actual names.
|
5
5
|
module GO
|
6
6
|
|
7
|
-
Rbbt.
|
8
|
-
|
9
|
-
|
7
|
+
Rbbt.claim :gene_ontology, 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo', 'databases/GO'
|
8
|
+
Rbbt.claim :goslim_generic, 'http://www.geneontology.org/GO_slims/goslim_generic.obo', 'databases/GO'
|
10
9
|
|
11
10
|
MULTIPLE_VALUE_FIELDS = %w(is_a)
|
12
11
|
TSV_GENE_ONTOLOGY = File.join(TSV.cachedir, 'gene_ontology')
|
@@ -1,12 +1,53 @@
|
|
1
1
|
require 'rbbt-util'
|
2
2
|
require 'rbbt/util/data_module'
|
3
3
|
|
4
|
+
|
4
5
|
module Organism
|
5
6
|
class OrganismNotProcessedError < StandardError; end
|
6
7
|
|
7
8
|
# Directory for +org+: <Rbbt.datadir>/organisms/<org>.
def self.datadir(org)
  base = Rbbt.datadir
  File.join(base, 'organisms', org)
end
|
11
|
+
|
12
|
+
# Looks identifiers up in an index, for either a list or a single value.
#
# list    - Array of identifiers, or a single identifier. nil or empty
#           returns [].
# options - :double => true returns the index entries as they are; otherwise
#           only the first element of each entry is returned. :persistence
#           and :case_insensitive defaults are filled in but not read here.
#
# org, field and others are accepted but not referenced in this body.
#
# NOTE(review): `index` is never assigned in this method, so it resolves as a
# method call on Organism. Confirm such a method exists (possibly through the
# DataModule extension); otherwise every non-empty call raises NameError.
def self.normalize(org, list, field = nil, others = nil, options = {})
  return [] if list.nil? or list.empty?

  options = Misc.add_defaults options, :persistence => true, :case_insensitive => true, :double => false
  keep_all = Misc.process_options options, :double

  case list
  when Array
    found = index.values_at(*list)
    keep_all ? found : found.collect { |entry| Misc.first entry }
  else
    found = index[list]
    keep_all ? found : found.first
  end
end
|
31
|
+
|
32
|
+
# Guesses which identifier field of +org+ the given +values+ belong to.
#
# Loads the organism's identifiers table, asks it for the matches of +values+
# per field, orders those by the number of distinct matches and returns the
# last (highest) entry.
def self.guess_id(org, values)
  identifier_table = TSV.new(Organism.identifiers(org), :persistence => true)
  ranked = identifier_table.field_matches(values).sort_by do |field, hits|
    hits.uniq.length
  end
  ranked.last
end
|
37
|
+
|
38
|
+
# Lists organism codes: the name of every directory under install/Organism/
# in the package share dir that contains a Rakefile.
def self.organisms
  pattern = File.join(PKGData.sharedir_for_file(__FILE__), 'install/Organism/*/Rakefile')
  Dir.glob(pattern).collect do |rakefile|
    File.basename(File.dirname(rakefile))
  end
end
|
41
|
+
|
42
|
+
# Scientific name of +organism+: the content of the file that
# Organism.scientific_name(organism) points to, with surrounding whitespace
# stripped.
#
# A singleton +name+ on the module shadows Module#name, so a bare
# Organism.name would raise ArgumentError with a mandatory parameter. When no
# organism is given the call is forwarded to Module#name instead.
def self.name(organism = nil)
  return super() if organism.nil?

  Open.read(Organism.scientific_name(organism)).strip
end
|
45
|
+
|
46
|
+
# Finds the organism code for +name+.
#
# A code qualifies when it equals +name+ or when its scientific name matches
# +name+ used as a case-insensitive pattern. Returns the first qualifying
# code in the order given by Organism.organisms.
def self.organism(name)
  candidates = organisms.select do |code|
    code == name or Organism.name(code) =~ /#{ name }/i
  end
  candidates.first
end
|
10
51
|
|
11
52
|
extend DataModule
|
12
53
|
|
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
|
3
|
+
module Polysearch
  # Resource name paired with the URL it is claimed from; every resource is
  # registered under the 'Polysearch' location, in this order.
  [
    ["organ",    'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt'],
    ["tissue",   'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt'],
    ["location", 'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt'],
    ["disease",  'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt'],
    ["drug",     'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt'],
  ].each do |resource, url|
    Rbbt.claim resource, url, 'Polysearch'
  end
end
|
10
|
+
|
@@ -52,9 +52,18 @@ $biomart_identifiers = [
|
|
52
52
|
[ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
|
53
53
|
]
|
54
54
|
|
55
|
+
# [label, attribute name] pairs; passed to BioMart.tsv by the
# 'gene_positions' task in this Rakefile.
$biomart_positions = [
  ['Chromosome Name',  'chromosome_name'],
  ['Strand',           'strand'],
  ['Gene Start',       'start_position'],
  ['Gene End',         'end_position'],
  ['Transcript Start', 'transcript_start'],
  ['Transcript End',   'transcript_end'],
]
|
63
|
+
|
55
64
|
|
56
65
|
|
57
|
-
file '
|
66
|
+
# Writes the organism's scientific name into the task's target file.
file 'scientific_name' do |t|
  File.open(t.name, 'w') { |out| out.puts "Homo sapiens" }
end
|
60
69
|
|
@@ -77,7 +86,32 @@ file 'identifiers' do |t|
|
|
77
86
|
File.open(t.name, 'w') do |f| f.puts identifiers end
|
78
87
|
end
|
79
88
|
|
89
|
+
# Builds the 'gene_go' file: reads the human GOA gene-association dump,
# re-keys every row through an index over the Hsa identifiers, and writes the
# result as a TSV. Rows whose key is not in the index are dropped.
file 'gene_go' do |t|
  goa_url = "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD"
  associations = TSV.new(Open.open(goa_url, :gzip => true), :native => 2, :extra => 4)

  translation = TSV.index(Organism::Hsa.identifiers, :persistence => true)
  translated = TSV.new({})
  associations.through do |key, values|
    hits = translation[key]
    next if hits.nil?
    translated[hits.first] = values
  end

  translated.key_field = "Associated Gene Name"
  translated.fields = ["GO Term"]
  Open.write(t.name, translated.to_s)
end
|
106
|
+
|
107
|
+
# Builds the 'gene_positions' file from the 'may2009' BioMart archive using
# the attributes listed in $biomart_positions.
file 'gene_positions' do |t|
  BioMart.set_archive('may2009')
  begin
    positions = BioMart.tsv($biomart_db, $biomart_main, $biomart_positions)
  ensure
    # Always drop the archive override, even when the query raises, so later
    # BioMart calls in the same process are not silently sent to the archive.
    BioMart.unset_archive
  end

  Open.write(t.name, positions.to_s)
end
|
80
114
|
|
81
|
-
task :default => ['name', 'lexicon', 'identifiers']
|
115
|
+
# Prerequisites must name file tasks defined in this Rakefile; the name file
# task is 'scientific_name'.
task :default => ['scientific_name', 'lexicon', 'identifiers', 'gene_positions']
|
82
116
|
|
83
117
|
|
@@ -10,7 +10,7 @@ $biomart_db = 'scerevisiae_gene_ensembl'
|
|
10
10
|
$biomart_main = ['Entrez Gene ID', 'entrezgene']
|
11
11
|
|
12
12
|
|
13
|
-
file '
|
13
|
+
# Writes the organism's scientific name into the task's target file.
file 'scientific_name' do |t|
  File.open(t.name, 'w') { |out| out.puts "Saccharomyces cerevisiae" }
end
|
16
16
|
|
@@ -27,7 +27,7 @@ file 'lexicon' do |t|
|
|
27
27
|
end
|
28
28
|
|
29
29
|
file 'identifiers' do |t|
|
30
|
-
identifiers = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)
|
30
|
+
identifiers = tsv_file($url, [$native, 0], [["Ensembl Gene ID", 3], ["Associated Gene Name",4], ["Associated Gene Name Alias", 5]], :keep_empty => true)
|
31
31
|
|
32
32
|
merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tS0/)})
|
33
33
|
|
@@ -4,7 +4,7 @@ require 'test/unit'
|
|
4
4
|
|
5
5
|
class TestBioMart < Test::Unit::TestCase
|
6
6
|
|
7
|
-
def
|
7
|
+
def _test_get
|
8
8
|
assert_raise BioMart::QueryError do
|
9
9
|
BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
|
10
10
|
end
|
@@ -23,7 +23,7 @@ class TestBioMart < Test::Unit::TestCase
|
|
23
23
|
assert(data['852236']['external_gene_id'].include? 'YBL044W')
|
24
24
|
end
|
25
25
|
|
26
|
-
def
|
26
|
+
def _test_tsv
|
27
27
|
data = BioMart.tsv('scerevisiae_gene_ensembl',['Entrez Gene', 'entrezgene'], [['Protein ID', 'protein_id'],['RefSeq Peptide','refseq_peptide']], [], nil, :nocache => false, :wget_options => { :quiet => false})
|
28
28
|
|
29
29
|
assert(data['852236']['Protein ID'].include? 'CAA84864')
|
@@ -6,12 +6,24 @@ class TestEntrez < Test::Unit::TestCase
|
|
6
6
|
def test_identifiers
|
7
7
|
assert TSV.new(Organism.identifiers('Sce'))['S000006120']["Ensembl Gene ID"].include?('YPL199C')
|
8
8
|
assert TSV.new(Organism::Sce.identifiers)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
|
9
|
-
|
9
|
+
assert TSV.new(Organism.identifiers('Hsa'))['1020']["Associated Gene Name"].include?('CDK5')
|
10
10
|
end
|
11
11
|
|
12
12
|
def test_lexicon
|
13
13
|
assert TSV.new(Organism.lexicon('Sce'))['S000006120'].flatten.include?('YPL199C')
|
14
14
|
end
|
15
|
+
|
16
|
+
# guess_id should report, as the first element of its result, the identifier
# field that each sample list belongs to.
def test_guess_id
  samples = [
    ["Ensembl Gene ID",      %w(YOL044W YDR289C YAL034C YGR246C ARS519 tH(GUG)E2 YDR218C YLR002C YGL224C)],
    ["Associated Gene Name", %w(SNR64 MIP1 MRPS18 TFB2 JEN1 IVY1 TRS33 GAS3)],
  ]

  samples.each do |expected_field, ids|
    assert_equal expected_field, Organism::Sce.guess_id(ids).first
  end
end
|
22
|
+
|
23
|
+
# "Hsa" must be listed, and must be found from its scientific name.
def test_organisms
  known = Organism.organisms
  assert known.include?("Hsa")
  assert_equal "Hsa", Organism.organism("Homo sapiens")
end
|
15
27
|
end
|
16
28
|
|
17
29
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
5
|
-
prerelease:
|
4
|
+
hash: 19
|
5
|
+
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
-
|
10
|
-
version: 0.2.
|
9
|
+
- 2
|
10
|
+
version: 0.2.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Miguel Vazquez
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-01-30 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -83,12 +83,16 @@ extensions: []
|
|
83
83
|
extra_rdoc_files: []
|
84
84
|
|
85
85
|
files:
|
86
|
+
- lib/rbbt/sources/COSTART.rb
|
87
|
+
- lib/rbbt/sources/CTCAE.rb
|
88
|
+
- lib/rbbt/sources/Reactome.rb
|
86
89
|
- lib/rbbt/sources/bibtex.rb
|
87
90
|
- lib/rbbt/sources/biomart.rb
|
88
91
|
- lib/rbbt/sources/entrez.rb
|
89
92
|
- lib/rbbt/sources/go.rb
|
90
93
|
- lib/rbbt/sources/gscholar.rb
|
91
94
|
- lib/rbbt/sources/organism.rb
|
95
|
+
- lib/rbbt/sources/polysearch.rb
|
92
96
|
- lib/rbbt/sources/pubmed.rb
|
93
97
|
- share/install/Organism/Hsa/Rakefile
|
94
98
|
- share/install/Organism/Sce/Rakefile
|
@@ -129,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
129
133
|
requirements: []
|
130
134
|
|
131
135
|
rubyforge_project:
|
132
|
-
rubygems_version: 1.
|
136
|
+
rubygems_version: 1.4.2
|
133
137
|
signing_key:
|
134
138
|
specification_version: 3
|
135
139
|
summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)
|