rbbt-sources 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/sources/COSTART.rb +16 -0
- data/lib/rbbt/sources/CTCAE.rb +6 -0
- data/lib/rbbt/sources/Reactome.rb +16 -0
- data/lib/rbbt/sources/biomart.rb +16 -3
- data/lib/rbbt/sources/entrez.rb +6 -6
- data/lib/rbbt/sources/go.rb +2 -3
- data/lib/rbbt/sources/organism.rb +41 -0
- data/lib/rbbt/sources/polysearch.rb +10 -0
- data/share/install/Organism/Hsa/Rakefile +36 -2
- data/share/install/Organism/Sce/Rakefile +2 -2
- data/test/rbbt/sources/test_biomart.rb +2 -2
- data/test/rbbt/sources/test_entrez.rb +1 -1
- data/test/rbbt/sources/test_organism.rb +13 -1
- metadata +10 -6
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
|
3
|
+
# Registers the COSTART term list as an Rbbt resource.
#
# The producer downloads the COSTART HTML page and collects the text captured
# from every line shaped like `'TERM',`. It returns the terms joined by
# newlines, preceded by the "#COSTART Terms" header line.
module COSTART

  # The proc is bound to a local first so the do...end block unambiguously
  # belongs to Proc.new rather than to the Rbbt.claim command call.
  producer = Proc.new do
    terms = ["#COSTART Terms"]

    Open.open('http://hedwig.mgh.harvard.edu/biostatistics/files/costart.html').each_line do |line|
      # Only lines of the form 'TERM', carry a term; skip everything else.
      next unless line =~ /^'(.*)',/
      terms << $1
    end

    terms * "\n"
  end

  # NOTE(review): the third argument looks like the resource subdirectory,
  # mirroring the other Rbbt.claim calls -- confirm against Rbbt.claim.
  Rbbt.claim "COSTART", producer, 'COSTART'
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
|
3
|
+
# Registers the Reactome human interaction table as an Rbbt resource.
#
# The producer reads the gzipped interactions file into a TSV, strips the
# "prefix:" labels from every line, names the columns, and returns the table
# serialised with to_s.
module Reactome

  # The proc is bound to a local first so the do...end block unambiguously
  # belongs to Proc.new rather than to the Rbbt.claim command call.
  producer = Proc.new do
    # Column names for the two interaction partners (#1 and #2) followed by
    # the interaction attributes. The first entry becomes the key field.
    headers = ["Uniprot ID#1", "Ensembl Gene ID#1", "Entrez Gene ID#1",
               "Uniprot ID#2", "Ensembl Gene ID#2", "Entrez Gene ID#2",
               "Type", "Reaction", "PMID"]

    # :fix removes every "label:" prefix (word characters and spaces followed
    # by a colon) from each line before it is parsed.
    tsv = TSV.new(Open.open("http://www.reactome.org/download/current/homo_sapiens.interactions.txt.gz"), :fix => Proc.new {|l| l.gsub(/[\w ]+:/, "")})
    tsv.key_field = headers.shift
    tsv.fields = headers

    tsv.to_s
  end

  Rbbt.claim "Reactome", producer, 'Reactome'
end
|
data/lib/rbbt/sources/biomart.rb
CHANGED
@@ -10,6 +10,8 @@ module BioMart
|
|
10
10
|
|
11
11
|
class BioMart::QueryError < StandardError; end
|
12
12
|
|
13
|
+
BIOMART_URL = 'http://biomart.org/biomart/martservice?query='
|
14
|
+
|
13
15
|
private
|
14
16
|
|
15
17
|
@@biomart_query_xml = <<-EOT
|
@@ -23,8 +25,14 @@ module BioMart
|
|
23
25
|
</Dataset>
|
24
26
|
</Query>
|
25
27
|
EOT
|
26
|
-
|
27
28
|
|
29
|
+
# Points subsequent queries at an Ensembl archive BioMart instead of the
# live service.
#
# date - archive label used as the host prefix, e.g. 'may2009', which yields
#        'http://may2009.archive.ensembl.org/biomart/martservice?query='.
#
# Returns the archive URL that was stored in @archive_url.
def self.set_archive(date)
  # Swap the "biomart." host prefix (with or without "www.") for
  # "<date>.archive.ensembl."; the rest of the URL is kept as is.
  @archive_url = BIOMART_URL.sub(%r{\Ahttp://(?:www\.)?biomart\.}, "http://#{ date }.archive.ensembl.")
end

# Clears the archive override so queries use BIOMART_URL again.
def self.unset_archive
  @archive_url = nil
end
|
28
36
|
|
29
37
|
def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
30
38
|
attrs ||= []
|
@@ -37,8 +45,13 @@ module BioMart
|
|
37
45
|
query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
|
38
46
|
query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
|
39
47
|
|
40
|
-
|
41
|
-
|
48
|
+
if @archive_url
|
49
|
+
response = Open.read(@archive_url + query.gsub(/\n/,' '), open_options)
|
50
|
+
else
|
51
|
+
response = Open.read(BIOMART_URL + query.gsub(/\n/,' '), open_options)
|
52
|
+
end
|
53
|
+
|
54
|
+
if response.empty? or response =~ /Query ERROR:/
|
42
55
|
raise BioMart::QueryError, response
|
43
56
|
end
|
44
57
|
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -5,29 +5,29 @@ require 'set'
|
|
5
5
|
|
6
6
|
module Entrez
|
7
7
|
|
8
|
-
Rbbt.
|
9
|
-
|
8
|
+
Rbbt.claim "gene_info", 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz', 'databases/entrez'
|
9
|
+
Rbbt.claim "gene2pubmed", 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz', 'databases/entrez'
|
10
10
|
|
11
11
|
def self.entrez2native(taxs, options = {})
|
12
|
-
options = Misc.add_defaults options, :
|
12
|
+
options = Misc.add_defaults options, :key => 1, :others => 5, :persistence => true, :merge => true
|
13
13
|
|
14
14
|
taxs = [taxs] unless Array === taxs
|
15
15
|
options.merge! :grep => taxs
|
16
16
|
|
17
|
-
tsv = TSV.new(Rbbt.
|
17
|
+
tsv = TSV.new(Rbbt.files.databases.entrez.gene_info, :flat, options)
|
18
18
|
tsv.key_field = "Entrez Gene ID"
|
19
19
|
tsv.fields = ["Native ID"]
|
20
20
|
tsv
|
21
21
|
end
|
22
22
|
|
23
23
|
def self.entrez2pubmed(taxs)
|
24
|
-
options = {:
|
24
|
+
options = {:key => 1, :others => 2, :persistence => true, :merge => true}
|
25
25
|
|
26
26
|
taxs = [taxs] unless taxs.is_a?(Array)
|
27
27
|
taxs = taxs.collect{|t| t.to_s}
|
28
28
|
options.merge! :grep => taxs
|
29
29
|
|
30
|
-
TSV.new(Rbbt.
|
30
|
+
TSV.new(Rbbt.files.databases.entrez.gene2pubmed, :flat, options)
|
31
31
|
end
|
32
32
|
|
33
33
|
class Gene
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -4,9 +4,8 @@ require 'rbbt-util'
|
|
4
4
|
# now all it does is provide a translation from id to the actual names.
|
5
5
|
module GO
|
6
6
|
|
7
|
-
Rbbt.
|
8
|
-
|
9
|
-
|
7
|
+
Rbbt.claim :gene_ontology, 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo', 'databases/GO'
|
8
|
+
Rbbt.claim :goslim_generic, 'http://www.geneontology.org/GO_slims/goslim_generic.obo', 'databases/GO'
|
10
9
|
|
11
10
|
MULTIPLE_VALUE_FIELDS = %w(is_a)
|
12
11
|
TSV_GENE_ONTOLOGY = File.join(TSV.cachedir, 'gene_ontology')
|
@@ -1,12 +1,53 @@
|
|
1
1
|
require 'rbbt-util'
|
2
2
|
require 'rbbt/util/data_module'
|
3
3
|
|
4
|
+
|
4
5
|
module Organism
|
5
6
|
class OrganismNotProcessedError < StandardError; end
|
6
7
|
|
7
8
|
def self.datadir(org)
|
8
9
|
File.join(Rbbt.datadir, 'organisms', org)
|
9
10
|
end
|
11
|
+
|
12
|
+
# Looks up identifiers in an index and returns the matching entries.
#
# list    - a single identifier or an Array of identifiers; nil or empty
#           input returns [] immediately.
# options - :double => true returns the raw index entries; otherwise only
#           the first element of each entry is returned.
#
# NOTE(review): `index` is not defined anywhere in the visible code, and
# `org`, `field` and `others` are never used -- presumably an index built
# from the organism's identifiers file was intended; confirm before relying
# on this method.
def self.normalize(org, list, field = nil, others = nil, options = {})
  return [] if list.nil? or list.empty?

  options = Misc.add_defaults options, :persistence => true, :case_insensitive => true, :double => false
  double = Misc.process_options options, :double

  case list
  when Array
    entries = index.values_at(*list)
    double ? entries : entries.collect{|entry| Misc.first entry}
  else
    entry = index[list]
    double ? entry : entry.first
  end
end
|
31
|
+
|
32
|
+
# Guesses which identifier field a set of values belongs to.
#
# Loads the organism's identifiers table, asks it for the per-field matches
# of `values`, and returns the [field, matches] pair with the largest number
# of distinct matches (the last pair after sorting by that count).
def self.guess_id(org, values)
  id_table = TSV.new(Organism.identifiers(org), :persistence => true)

  ranked = id_table.field_matches(values).sort_by do |field, matches|
    matches.uniq.length
  end

  ranked.last
end
|
37
|
+
|
38
|
+
# Lists the organism codes that have an install Rakefile.
#
# Every directory matching install/Organism/*/Rakefile under the share
# directory resolved for this file contributes its directory name.
def self.organisms
  rakefile_pattern = File.join(PKGData.sharedir_for_file(__FILE__), 'install/Organism/*/Rakefile')

  Dir.glob(rakefile_pattern).collect do |rakefile|
    File.basename(File.dirname(rakefile))
  end
end
|
41
|
+
|
42
|
+
# Returns the content of the organism's scientific_name file with
# surrounding whitespace stripped.
#
# NOTE(review): defining `self.name` with a required argument replaces the
# built-in Module#name for Organism -- confirm nothing calls Organism.name
# without arguments.
def self.name(organism)
  name_file = Organism.scientific_name(organism)
  Open.read(name_file).strip
end
|
45
|
+
|
46
|
+
# Finds the organism code for `name`.
#
# A code matches when it equals `name` exactly or when its scientific name
# matches `name` as a case-insensitive regular expression. Returns the first
# matching code, or nil when none match.
def self.organism(name)
  candidates = organisms.select do |code|
    code == name or Organism.name(code) =~ /#{ name }/i
  end

  candidates.first
end
|
10
51
|
|
11
52
|
extend DataModule
|
12
53
|
|
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
|
3
|
+
# Registers the Polysearch dictionaries as Rbbt resources, one claim per
# dictionary, all under 'Polysearch'.
module Polysearch
  # Resource name paired with the URL it is claimed from, in claim order.
  [
    ["organ",    'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt'],
    ["tissue",   'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt'],
    ["location", 'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt'],
    ["disease",  'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt'],
    ["drug",     'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt'],
  ].each do |resource, url|
    Rbbt.claim resource, url, 'Polysearch'
  end
end
|
10
|
+
|
@@ -52,9 +52,18 @@ $biomart_identifiers = [
|
|
52
52
|
[ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
|
53
53
|
]
|
54
54
|
|
55
|
+
# Position attributes requested from BioMart by the 'gene_positions' task.
# Each entry pairs a column title with a BioMart attribute name.
$biomart_positions = [
  ['Chromosome Name',  'chromosome_name'],
  ['Strand',           'strand'],
  ['Gene Start',       'start_position'],
  ['Gene End',         'end_position'],
  ['Transcript Start', 'transcript_start'],
  ['Transcript End',   'transcript_end'],
]
|
63
|
+
|
55
64
|
|
56
65
|
|
57
|
-
file '
|
66
|
+
file 'scientific_name' do |t|
|
58
67
|
File.open(t.name, 'w') do |f| f.puts "Homo sapiens" end
|
59
68
|
end
|
60
69
|
|
@@ -77,7 +86,32 @@ file 'identifiers' do |t|
|
|
77
86
|
File.open(t.name, 'w') do |f| f.puts identifiers end
|
78
87
|
end
|
79
88
|
|
89
|
+
# Builds the 'gene_go' file: GO associations re-keyed through the Hsa
# identifiers index.
#
# Entries whose key is missing from the index are dropped; the rest are
# stored under the first value the index returns for that key.
file 'gene_go' do |t|
  url = "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD"
  associations = TSV.new(Open.open(url, :gzip => true), :native => 2, :extra => 4)

  id_index = TSV.index(Organism::Hsa.identifiers, :persistence => true)

  translated = TSV.new({})
  associations.through do |key, values|
    ids = id_index[key]
    next if ids.nil?
    translated[ids.first] = values
  end

  translated.key_field = "Associated Gene Name"
  translated.fields = ["GO Term"]
  Open.write(t.name, translated.to_s)
end
|
106
|
+
|
107
|
+
# Builds the 'gene_positions' file from the 'may2009' BioMart archive.
#
# The archive override is process-wide state on BioMart, so it is cleared in
# an ensure clause: a failed query must not leave later BioMart calls
# pointing at the archive.
file 'gene_positions' do |t|
  BioMart.set_archive('may2009')
  begin
    positions = BioMart.tsv($biomart_db, $biomart_main, $biomart_positions)
  ensure
    BioMart.unset_archive
  end

  Open.write(t.name, positions.to_s)
end
|
80
114
|
|
81
|
-
task :default => ['name', 'lexicon', 'identifiers']
|
115
|
+
# Default build. The organism name file task is declared as
# 'scientific_name', so that is the prerequisite listed here.
task :default => ['scientific_name', 'lexicon', 'identifiers', 'gene_positions']
|
82
116
|
|
83
117
|
|
@@ -10,7 +10,7 @@ $biomart_db = 'scerevisiae_gene_ensembl'
|
|
10
10
|
$biomart_main = ['Entrez Gene ID', 'entrezgene']
|
11
11
|
|
12
12
|
|
13
|
-
file '
|
13
|
+
file 'scientific_name' do |t|
|
14
14
|
File.open(t.name, 'w') do |f| f.puts "Saccharomyces cerevisiae" end
|
15
15
|
end
|
16
16
|
|
@@ -27,7 +27,7 @@ file 'lexicon' do |t|
|
|
27
27
|
end
|
28
28
|
|
29
29
|
file 'identifiers' do |t|
|
30
|
-
identifiers = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)
|
30
|
+
identifiers = tsv_file($url, [$native, 0], [["Ensembl Gene ID", 3], ["Associated Gene Name",4], ["Associated Gene Name Alias", 5]], :keep_empty => true)
|
31
31
|
|
32
32
|
merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tS0/)})
|
33
33
|
|
@@ -4,7 +4,7 @@ require 'test/unit'
|
|
4
4
|
|
5
5
|
class TestBioMart < Test::Unit::TestCase
|
6
6
|
|
7
|
-
def
|
7
|
+
def _test_get
|
8
8
|
assert_raise BioMart::QueryError do
|
9
9
|
BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
|
10
10
|
end
|
@@ -23,7 +23,7 @@ class TestBioMart < Test::Unit::TestCase
|
|
23
23
|
assert(data['852236']['external_gene_id'].include? 'YBL044W')
|
24
24
|
end
|
25
25
|
|
26
|
-
def
|
26
|
+
def _test_tsv
|
27
27
|
data = BioMart.tsv('scerevisiae_gene_ensembl',['Entrez Gene', 'entrezgene'], [['Protein ID', 'protein_id'],['RefSeq Peptide','refseq_peptide']], [], nil, :nocache => false, :wget_options => { :quiet => false})
|
28
28
|
|
29
29
|
assert(data['852236']['Protein ID'].include? 'CAA84864')
|
@@ -6,12 +6,24 @@ class TestEntrez < Test::Unit::TestCase
|
|
6
6
|
def test_identifiers
|
7
7
|
assert TSV.new(Organism.identifiers('Sce'))['S000006120']["Ensembl Gene ID"].include?('YPL199C')
|
8
8
|
assert TSV.new(Organism::Sce.identifiers)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
|
9
|
-
|
9
|
+
assert TSV.new(Organism.identifiers('Hsa'))['1020']["Associated Gene Name"].include?('CDK5')
|
10
10
|
end
|
11
11
|
|
12
12
|
def test_lexicon
|
13
13
|
assert TSV.new(Organism.lexicon('Sce'))['S000006120'].flatten.include?('YPL199C')
|
14
14
|
end
|
15
|
+
|
16
|
+
# guess_id should report the identifier field that best matches each sample:
# systematic names map to "Ensembl Gene ID", symbols to
# "Associated Gene Name".
def test_guess_id
  samples = {
    "Ensembl Gene ID"      => %w(YOL044W YDR289C YAL034C YGR246C ARS519 tH(GUG)E2 YDR218C YLR002C YGL224C),
    "Associated Gene Name" => %w(SNR64 MIP1 MRPS18 TFB2 JEN1 IVY1 TRS33 GAS3),
  }

  assert_equal "Ensembl Gene ID", Organism::Sce.guess_id(samples["Ensembl Gene ID"]).first
  assert_equal "Associated Gene Name", Organism::Sce.guess_id(samples["Associated Gene Name"]).first
end
|
22
|
+
|
23
|
+
# "Hsa" must be among the known organisms and must be found from its
# scientific name.
def test_organisms
  known = Organism.organisms
  assert known.include?("Hsa")

  assert_equal "Hsa", Organism.organism("Homo sapiens")
end
|
15
27
|
end
|
16
28
|
|
17
29
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
5
|
-
prerelease:
|
4
|
+
hash: 19
|
5
|
+
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
-
|
10
|
-
version: 0.2.
|
9
|
+
- 2
|
10
|
+
version: 0.2.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Miguel Vazquez
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-01-30 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -83,12 +83,16 @@ extensions: []
|
|
83
83
|
extra_rdoc_files: []
|
84
84
|
|
85
85
|
files:
|
86
|
+
- lib/rbbt/sources/COSTART.rb
|
87
|
+
- lib/rbbt/sources/CTCAE.rb
|
88
|
+
- lib/rbbt/sources/Reactome.rb
|
86
89
|
- lib/rbbt/sources/bibtex.rb
|
87
90
|
- lib/rbbt/sources/biomart.rb
|
88
91
|
- lib/rbbt/sources/entrez.rb
|
89
92
|
- lib/rbbt/sources/go.rb
|
90
93
|
- lib/rbbt/sources/gscholar.rb
|
91
94
|
- lib/rbbt/sources/organism.rb
|
95
|
+
- lib/rbbt/sources/polysearch.rb
|
92
96
|
- lib/rbbt/sources/pubmed.rb
|
93
97
|
- share/install/Organism/Hsa/Rakefile
|
94
98
|
- share/install/Organism/Sce/Rakefile
|
@@ -129,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
129
133
|
requirements: []
|
130
134
|
|
131
135
|
rubyforge_project:
|
132
|
-
rubygems_version: 1.
|
136
|
+
rubygems_version: 1.4.2
|
133
137
|
signing_key:
|
134
138
|
specification_version: 3
|
135
139
|
summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)
|