rbbt-sources 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/sources/biomart.rb +34 -12
- data/lib/rbbt/sources/entrez.rb +4 -1
- data/lib/rbbt/sources/go.rb +37 -38
- data/lib/rbbt/sources/organism.rb +7 -1
- data/share/install/Organism/Hsa/Rakefile +83 -0
- data/share/install/Organism/Sce/Rakefile +118 -0
- data/share/install/lib/helpers.rb +47 -0
- data/test/rbbt/sources/test_biomart.rb +15 -10
- data/test/rbbt/sources/test_entrez.rb +2 -2
- data/test/rbbt/sources/test_go.rb +0 -3
- data/test/rbbt/sources/test_organism.rb +17 -0
- data/test/rbbt/sources/test_pubmed.rb +1 -1
- metadata +25 -6
data/lib/rbbt/sources/biomart.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
require 'rbbt/util/
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/util/log'
|
3
3
|
|
4
4
|
# This module interacts with BioMart. It performs queries to BioMart and
|
5
5
|
# synthesises a hash with the results. Note that this module connects to the
|
@@ -9,6 +9,7 @@ require 'rbbt/util/open'
|
|
9
9
|
module BioMart
|
10
10
|
|
11
11
|
class BioMart::QueryError < StandardError; end
|
12
|
+
|
12
13
|
private
|
13
14
|
|
14
15
|
@@biomart_query_xml = <<-EOT
|
@@ -25,8 +26,7 @@ module BioMart
|
|
25
26
|
|
26
27
|
|
27
28
|
|
28
|
-
|
29
|
-
def self.get(database, main, attrs = nil, filters = nil, data = nil, options = {})
|
29
|
+
def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
30
30
|
attrs ||= []
|
31
31
|
filters ||= ["with_#{main}"]
|
32
32
|
data ||= {}
|
@@ -37,7 +37,7 @@ module BioMart
|
|
37
37
|
query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
|
38
38
|
query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
|
39
39
|
|
40
|
-
response = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/,' '),
|
40
|
+
response = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/,' '), open_options)
|
41
41
|
if response =~ /Query ERROR:/
|
42
42
|
raise BioMart::QueryError, response
|
43
43
|
end
|
@@ -51,8 +51,12 @@ module BioMart
|
|
51
51
|
attrs.each{|name|
|
52
52
|
value = parts.shift
|
53
53
|
data[main][name] ||= []
|
54
|
-
next if value.nil?
|
55
|
-
data[main][name]
|
54
|
+
next if value.nil? or value.empty?
|
55
|
+
if data[main][name]
|
56
|
+
data[main][name] = [value]
|
57
|
+
else
|
58
|
+
data[main][name] << value unless data[main][name].include? value
|
59
|
+
end
|
56
60
|
}
|
57
61
|
}
|
58
62
|
|
@@ -75,30 +79,48 @@ module BioMart
|
|
75
79
|
# the BioMart query to remove results with the main attribute empty, this may
|
76
80
|
# cause an error if the BioMart WS does not allow filtering with that
|
77
81
|
# attribute.
|
78
|
-
def self.query(database, main, attrs = nil, filters = nil, data = nil,
|
82
|
+
def self.query(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
83
|
+
open_options = Misc.add_defaults open_options, :nocache => false
|
79
84
|
attrs ||= []
|
80
85
|
data ||= {}
|
81
86
|
|
87
|
+
Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"
|
88
|
+
|
89
|
+
max_items = 2
|
82
90
|
chunks = []
|
83
91
|
chunk = []
|
84
92
|
attrs.each{|a|
|
85
93
|
chunk << a
|
86
|
-
if chunk.length ==
|
94
|
+
if chunk.length == max_items
|
87
95
|
chunks << chunk
|
88
96
|
chunk = []
|
89
97
|
end
|
90
98
|
}
|
91
99
|
|
92
100
|
chunks << chunk if chunk.any?
|
101
|
+
|
93
102
|
|
94
|
-
chunks.
|
95
|
-
|
103
|
+
Log.low "Chunks: #{chunks.length}"
|
104
|
+
chunks.each_with_index{|chunk,i|
|
105
|
+
Log.low "Chunk #{ i }: [#{chunk * ", "}]"
|
106
|
+
data = get(database, main, chunk, filters, data, open_options)
|
96
107
|
}
|
97
108
|
|
98
109
|
data
|
99
110
|
end
|
100
111
|
|
101
|
-
|
112
|
+
def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
113
|
+
codes = attrs.collect{|attr| attr[1]}
|
114
|
+
data = query(database, main.last, codes, filters, data, open_options)
|
115
|
+
tsv = TSV.new({})
|
116
|
+
|
117
|
+
data.each do |key, info|
|
118
|
+
tsv[key] = info.values_at(*codes)
|
119
|
+
end
|
102
120
|
|
121
|
+
tsv.key_field = main.first
|
122
|
+
tsv.fields = attrs.collect{|attr| attr.first}
|
123
|
+
tsv
|
124
|
+
end
|
103
125
|
end
|
104
126
|
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -14,7 +14,10 @@ module Entrez
|
|
14
14
|
taxs = [taxs] unless Array === taxs
|
15
15
|
options.merge! :grep => taxs
|
16
16
|
|
17
|
-
TSV.new(Rbbt.find_datafile('gene_info'), options)
|
17
|
+
tsv = TSV.new(Rbbt.find_datafile('gene_info'), options)
|
18
|
+
tsv.key_field = "Entrez Gene ID"
|
19
|
+
tsv.fields = ["Native ID"]
|
20
|
+
tsv
|
18
21
|
end
|
19
22
|
|
20
23
|
def self.entrez2pubmed(taxs)
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -4,66 +4,67 @@ require 'rbbt-util'
|
|
4
4
|
# now all it does is provide a translation form id to the actual names.
|
5
5
|
module GO
|
6
6
|
|
7
|
-
|
7
|
+
Rbbt.add_datafiles :gene_ontology => ['databases/GO', 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo'],
|
8
|
+
:goslim_generic => ['databases/GO', 'http://www.geneontology.org/GO_slims/goslim_generic.obo']
|
9
|
+
|
10
|
+
|
8
11
|
MULTIPLE_VALUE_FIELDS = %w(is_a)
|
12
|
+
TSV_GENE_ONTOLOGY = File.join(TSV.cachedir, 'gene_ontology')
|
9
13
|
|
10
14
|
# This method needs to be called before any translations can be made, it is
|
11
15
|
# called automatically the first time the id2name method is called. It loads
|
12
16
|
# the gene_ontology.obo file and extracts all the fields, although right now,
|
13
17
|
# only the name field is used.
|
14
18
|
def self.init
|
15
|
-
|
16
|
-
File.open(
|
17
|
-
split(/\[Term\]/).
|
18
|
-
each{|term|
|
19
|
+
info = TCHash.new(TSV_GENE_ONTOLOGY, true)
|
20
|
+
File.open(Rbbt.find_datafile('gene_ontology')).read.split(/\[Term\]/).each{|term|
|
19
21
|
term_info = {}
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
22
|
+
|
23
|
+
term.split(/\n/). select{|l| l =~ /:/}.each{|l|
|
24
|
+
key, value = l.chomp.match(/(.*?):(.*)/).values_at(1,2)
|
25
|
+
if MULTIPLE_VALUE_FIELDS.include? key.strip
|
26
|
+
term_info[key.strip] ||= []
|
27
|
+
term_info[key.strip] << value.strip
|
28
|
+
else
|
29
|
+
term_info[key.strip] = value.strip
|
30
|
+
end
|
31
|
+
}
|
32
|
+
|
33
|
+
next if term_info["id"].nil?
|
34
|
+
info[term_info["id"]] = term_info
|
35
|
+
}
|
36
|
+
info.close
|
33
37
|
end
|
34
38
|
|
35
39
|
def self.info
|
36
|
-
self.init unless
|
37
|
-
|
40
|
+
self.init unless File.exists? TSV_GENE_ONTOLOGY
|
41
|
+
TCHash.get(TSV_GENE_ONTOLOGY)
|
38
42
|
end
|
39
43
|
|
40
44
|
def self.goterms
|
41
|
-
|
42
|
-
@@info.keys
|
45
|
+
info.keys
|
43
46
|
end
|
44
47
|
|
45
48
|
def self.id2name(id)
|
46
|
-
self.init unless @@info
|
47
49
|
if id.kind_of? Array
|
48
|
-
|
50
|
+
info.values_at(*id).collect{|i| i['name'] if i}
|
49
51
|
else
|
50
|
-
return nil if
|
51
|
-
|
52
|
+
return nil if info[id].nil?
|
53
|
+
info[id]['name']
|
52
54
|
end
|
53
55
|
end
|
54
56
|
|
55
57
|
def self.id2ancestors(id)
|
56
|
-
self.init unless @@info
|
57
58
|
if id.kind_of? Array
|
58
|
-
|
59
|
+
info.values_at(*id).
|
59
60
|
select{|i| ! i['is_a'].nil?}.
|
60
61
|
collect{|i| i['is_a'].collect{|id|
|
61
|
-
|
62
|
-
|
62
|
+
id.match(/(GO:\d+)/)[1] if id.match(/(GO:\d+)/)
|
63
|
+
}.compact
|
63
64
|
}
|
64
65
|
else
|
65
|
-
return [] if
|
66
|
-
|
66
|
+
return [] if id.nil? or info[id].nil? or info[id]['is_a'].nil?
|
67
|
+
info[id]['is_a'].
|
67
68
|
collect{|id|
|
68
69
|
id.match(/(GO:\d+)/)[1] if id.match(/(GO:\d+)/)
|
69
70
|
}.compact
|
@@ -71,14 +72,12 @@ module GO
|
|
71
72
|
end
|
72
73
|
|
73
74
|
def self.id2namespace(id)
|
74
|
-
self.init unless
|
75
|
+
self.init unless info
|
75
76
|
if id.kind_of? Array
|
76
|
-
|
77
|
+
info.values_at(*id).collect{|i| i['namespace'] if i}
|
77
78
|
else
|
78
|
-
return nil if
|
79
|
-
|
79
|
+
return nil if info[id].nil?
|
80
|
+
info[id]['namespace']
|
80
81
|
end
|
81
82
|
end
|
82
|
-
|
83
|
-
|
84
83
|
end
|
@@ -1,9 +1,15 @@
|
|
1
1
|
require 'rbbt-util'
|
2
|
+
require 'rbbt/util/data_module'
|
3
|
+
|
2
4
|
module Organism
|
3
5
|
class OrganismNotProcessedError < StandardError; end
|
4
6
|
|
5
7
|
def self.datadir(org)
|
6
8
|
File.join(Rbbt.datadir, 'organisms', org)
|
7
9
|
end
|
8
|
-
|
10
|
+
|
11
|
+
extend DataModule
|
12
|
+
|
13
|
+
Hsa = with_key('Hsa')
|
14
|
+
Sce = with_key('Sce')
|
9
15
|
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
+
require 'rbbt/sources/biomart'
|
3
|
+
require 'rbbt/sources/entrez'
|
4
|
+
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
|
+
|
6
|
+
$taxs = [559292,4932]
|
7
|
+
$native = "SGD ID"
|
8
|
+
$url = "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab"
|
9
|
+
$biomart_db = 'hsapiens_gene_ensembl'
|
10
|
+
$biomart_main = ['Entrez Gene ID', 'entrezgene']
|
11
|
+
$biomart_lexicon = [
|
12
|
+
[ 'Associated Gene Name' , "external_gene_id"],
|
13
|
+
[ 'HGNC symbol', "hgnc_symbol" ],
|
14
|
+
[ 'HGNC automatic gene name', "hgnc_automatic_gene_name" ],
|
15
|
+
[ 'HGNC curated gene name ', "hgnc_curated_gene_name" ],
|
16
|
+
]
|
17
|
+
|
18
|
+
$biomart_identifiers = [
|
19
|
+
[ 'Ensembl Gene ID', "ensembl_gene_id" ],
|
20
|
+
[ 'Ensembl Protein ID', "ensembl_peptide_id" ],
|
21
|
+
[ 'Associated Gene Name', "external_gene_id" ],
|
22
|
+
[ 'CCDS ID', "ccds" ],
|
23
|
+
[ 'Protein ID', "protein_id" ],
|
24
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
25
|
+
[ 'Unigene ID', "unigene" ],
|
26
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
27
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
28
|
+
[ 'HGNC ID', "hgnc_id", 'HGNC'],
|
29
|
+
[ 'EMBL (Genbank) ID' , "embl"] ,
|
30
|
+
|
31
|
+
# Affymetrix
|
32
|
+
[ 'AFFY HC G110', 'affy_hc_g110' ],
|
33
|
+
[ 'AFFY HG FOCUS', 'affy_hg_focus' ],
|
34
|
+
[ 'AFFY HG U133-PLUS-2', 'affy_hg_u133_plus_2' ],
|
35
|
+
[ 'AFFY HG U133A_2', 'affy_hg_u133a_2' ],
|
36
|
+
[ 'AFFY HG U133A', 'affy_hg_u133a' ],
|
37
|
+
[ 'AFFY HG U133B', 'affy_hg_u133b' ],
|
38
|
+
[ 'AFFY HG U95AV2', 'affy_hg_u95av2' ],
|
39
|
+
[ 'AFFY HG U95B', 'affy_hg_u95b' ],
|
40
|
+
[ 'AFFY HG U95C', 'affy_hg_u95c' ],
|
41
|
+
[ 'AFFY HG U95D', 'affy_hg_u95d' ],
|
42
|
+
[ 'AFFY HG U95E', 'affy_hg_u95e' ],
|
43
|
+
[ 'AFFY HG U95A', 'affy_hg_u95a' ],
|
44
|
+
[ 'AFFY HUGENEFL', 'affy_hugenefl' ],
|
45
|
+
[ 'AFFY HuEx', 'affy_huex_1_0_st_v2' ],
|
46
|
+
[ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
|
47
|
+
[ 'AFFY U133 X3P', 'affy_u133_x3p' ],
|
48
|
+
[ 'Agilent WholeGenome',"agilent_wholegenome" ],
|
49
|
+
[ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
|
50
|
+
[ 'Codelink ID', 'codelink' ],
|
51
|
+
[ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],
|
52
|
+
[ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
|
53
|
+
]
|
54
|
+
|
55
|
+
|
56
|
+
|
57
|
+
file 'name' do |t|
|
58
|
+
File.open(t.name, 'w') do |f| f.puts "Homo sapiens" end
|
59
|
+
end
|
60
|
+
|
61
|
+
file 'lexicon' do |t|
|
62
|
+
lexicon = tsv_file('http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag',
|
63
|
+
"HGNC ID", nil, :flatten => true, :header_hash => '')
|
64
|
+
merge_biomart lexicon, $biomart_db, $biomart_main, $biomart_lexicon, "HGNC ID"
|
65
|
+
|
66
|
+
File.open(t.name, 'w') do |f| f.puts lexicon end
|
67
|
+
end
|
68
|
+
|
69
|
+
file 'identifiers' do |t|
|
70
|
+
identifiers = BioMart.tsv($biomart_db, $biomart_main, $biomart_identifiers)
|
71
|
+
$biomart_identifiers.each do |name, key, prefix|
|
72
|
+
if prefix
|
73
|
+
identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
File.open(t.name, 'w') do |f| f.puts identifiers end
|
78
|
+
end
|
79
|
+
|
80
|
+
|
81
|
+
task :default => ['name', 'lexicon', 'identifiers']
|
82
|
+
|
83
|
+
|
@@ -0,0 +1,118 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
+
require 'rbbt/sources/biomart'
|
3
|
+
require 'rbbt/sources/entrez'
|
4
|
+
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
|
+
|
6
|
+
$taxs = [559292,4932]
|
7
|
+
$native = "SGD ID"
|
8
|
+
$url = "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab"
|
9
|
+
$biomart_db = 'scerevisiae_gene_ensembl'
|
10
|
+
$biomart_main = ['Entrez Gene ID', 'entrezgene']
|
11
|
+
|
12
|
+
|
13
|
+
file 'name' do |t|
|
14
|
+
File.open(t.name, 'w') do |f| f.puts "Saccharomyces cerevisiae" end
|
15
|
+
end
|
16
|
+
|
17
|
+
file 'lexicon' do |t|
|
18
|
+
lexicon = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)
|
19
|
+
|
20
|
+
merge_entrez(lexicon, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tS0/)})
|
21
|
+
|
22
|
+
merge_biomart(lexicon, $biomart_db, $biomart_main, [['Interpro Description' , "interpro_description"]])
|
23
|
+
|
24
|
+
lexicon = lexicon.slice(lexicon.fields - ["Entrez Gene ID"])
|
25
|
+
|
26
|
+
File.open(t.name, 'w') do |f| f.puts lexicon end
|
27
|
+
end
|
28
|
+
|
29
|
+
file 'identifiers' do |t|
|
30
|
+
identifiers = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)
|
31
|
+
|
32
|
+
merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tS0/)})
|
33
|
+
|
34
|
+
merge_biomart(identifiers, $biomart_db, $biomart_main,
|
35
|
+
[['Associated Gene Name' , "external_gene_id"],
|
36
|
+
['Ensembl Gene ID', "ensembl_gene_id" ],
|
37
|
+
['Ensembl Protein ID', "ensembl_peptide_id" ],
|
38
|
+
['RefSeq Protein ID' , "refseq_peptide"] ,
|
39
|
+
['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
40
|
+
['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
41
|
+
['Protein ID' , "protein_id"] ,
|
42
|
+
['EMBL (Genbank) ID' , "embl"] ,
|
43
|
+
# Affymetrix
|
44
|
+
['Affy yeast 2',"affy_yeast_2"],
|
45
|
+
['Affy yg s98', "affy_yg_s98"]])
|
46
|
+
|
47
|
+
File.open(t.name, 'w') do |f| f.puts identifiers end
|
48
|
+
end
|
49
|
+
|
50
|
+
|
51
|
+
task :default => ['name', 'lexicon', 'identifiers']
|
52
|
+
|
53
|
+
#require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
54
|
+
#
|
55
|
+
#$name = "Saccharomyces cerevisiae"
|
56
|
+
#
|
57
|
+
#
|
58
|
+
#$native_id = "SGD DB Id"
|
59
|
+
#
|
60
|
+
#$entrez2native = {
|
61
|
+
# :tax => 559292,
|
62
|
+
# :fix => proc{|code| code.sub(/SGD:S0/,'S0') },
|
63
|
+
# :check => proc{|code| code.match(/^S0/)},
|
64
|
+
#}
|
65
|
+
#
|
66
|
+
#$lexicon = {
|
67
|
+
# :file => {
|
68
|
+
# :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
|
69
|
+
# :native => 0,
|
70
|
+
# :extra => [4,3,5]
|
71
|
+
# },
|
72
|
+
# :biomart => {
|
73
|
+
# :database => 'scerevisiae_gene_ensembl',
|
74
|
+
# :main => ['Entrez Gene ID', 'entrezgene'],
|
75
|
+
# :extra => [
|
76
|
+
# ['Interpro Description' , "interpro_description"],
|
77
|
+
# ],
|
78
|
+
# :filter => [],
|
79
|
+
# }
|
80
|
+
#
|
81
|
+
#}
|
82
|
+
#
|
83
|
+
#$identifiers = {
|
84
|
+
# :file => {
|
85
|
+
# :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
|
86
|
+
# :native => 0,
|
87
|
+
# :extra => [],
|
88
|
+
# },
|
89
|
+
# :biomart => {
|
90
|
+
# :database => 'scerevisiae_gene_ensembl',
|
91
|
+
# :main => ['Entrez Gene ID', 'entrezgene'],
|
92
|
+
# :extra => [
|
93
|
+
# ['Associated Gene Name' , "external_gene_id"],
|
94
|
+
# ['Ensembl Gene ID', "ensembl_gene_id" ],
|
95
|
+
# ['Ensembl Protein ID', "ensembl_peptide_id" ],
|
96
|
+
# ['RefSeq Protein ID' , "refseq_peptide"] ,
|
97
|
+
# ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
98
|
+
# ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
99
|
+
# ['Protein ID' , "protein_id"] ,
|
100
|
+
# ['EMBL (Genbank) ID' , "embl"] ,
|
101
|
+
# # Affymetrix
|
102
|
+
# ['Affy yeast 2',"affy_yeast_2"],
|
103
|
+
# ['Affy yg s98', "affy_yg_s98"],
|
104
|
+
# ],
|
105
|
+
# :filter => [],
|
106
|
+
# }
|
107
|
+
#}
|
108
|
+
#
|
109
|
+
#$go = {
|
110
|
+
# :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation/gene_association.sgd.gz",
|
111
|
+
# :code => 1,
|
112
|
+
# :go => 4,
|
113
|
+
# :pmid => 5,
|
114
|
+
#}
|
115
|
+
#
|
116
|
+
#$query = '"saccharomyces cerevisiae"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
|
117
|
+
#
|
118
|
+
#
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/sources/biomart'
|
3
|
+
require 'rbbt/sources/entrez'
|
4
|
+
|
5
|
+
def tsv_file(url, native, extra, options = {})
|
6
|
+
options = Misc.add_defaults options, :persistence => false, :keep_empty => true
|
7
|
+
|
8
|
+
case
|
9
|
+
when Array === native
|
10
|
+
options = Misc.add_defaults options, :native => native.last
|
11
|
+
key_field = native.first
|
12
|
+
when (String === native or Integer === native)
|
13
|
+
options = Misc.add_defaults options, :native => native
|
14
|
+
key_field = nil
|
15
|
+
else
|
16
|
+
key_field = nil
|
17
|
+
end
|
18
|
+
|
19
|
+
case
|
20
|
+
when (Array === extra and Array === extra.first)
|
21
|
+
options = Misc.add_defaults options, :extra => extra.collect{|e| e.last}
|
22
|
+
fields = extra.collect{|e| e.first}
|
23
|
+
when (Array === extra and not Array === extra.first)
|
24
|
+
options = Misc.add_defaults options, :extra => extra
|
25
|
+
fields = (1..extra.length).to_a.collect{|i| "Field#{i}"}
|
26
|
+
else
|
27
|
+
fields = nil
|
28
|
+
end
|
29
|
+
|
30
|
+
tsv = TSV.new(Open.open(url), options)
|
31
|
+
tsv.key_field ||= key_field
|
32
|
+
tsv.fields ||= fields
|
33
|
+
tsv
|
34
|
+
end
|
35
|
+
|
36
|
+
def merge_entrez(data, taxs, native, fix = nil, select = nil)
|
37
|
+
entrez = Entrez.entrez2native(taxs, :fix => fix, :select => select)
|
38
|
+
entrez.fields = [native]
|
39
|
+
entrez
|
40
|
+
|
41
|
+
data.smart_merge entrez, native
|
42
|
+
end
|
43
|
+
|
44
|
+
def merge_biomart(lexicon, db, native, other, match = nil)
|
45
|
+
match ||= native.first
|
46
|
+
lexicon.smart_merge BioMart.tsv(db, native, other), match
|
47
|
+
end
|
@@ -9,22 +9,27 @@ class TestBioMart < Test::Unit::TestCase
|
|
9
9
|
BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
|
10
10
|
end
|
11
11
|
|
12
|
-
data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache =>
|
13
|
-
assert(data['
|
14
|
-
|
15
|
-
data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['external_gene_id'],[], data, :nocache => true, :wget_options => { :quiet => false} )
|
16
|
-
assert(data['856452']['protein_id'].include? 'AAB68382')
|
17
|
-
assert(data['856452']['external_gene_id'].include? 'CUP1-2')
|
12
|
+
data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache => false, :wget_options => { :quiet => false})
|
13
|
+
assert(data['852236']['protein_id'].include? 'CAA84864')
|
18
14
|
|
15
|
+
data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['external_gene_id'],[], data, :nocache => false, :wget_options => { :quiet => false} )
|
16
|
+
assert(data['852236']['protein_id'].include? 'CAA84864')
|
17
|
+
assert(data['852236']['external_gene_id'].include? 'YBL044W')
|
19
18
|
end
|
20
19
|
|
21
20
|
def test_query
|
22
|
-
data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache =>
|
21
|
+
data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => false, :wget_options => { :quiet => false})
|
22
|
+
|
23
|
+
assert(data['852236']['external_gene_id'].include? 'YBL044W')
|
24
|
+
end
|
23
25
|
|
24
|
-
|
25
|
-
|
26
|
-
end
|
26
|
+
def test_tsv
|
27
|
+
data = BioMart.tsv('scerevisiae_gene_ensembl',['Entrez Gene', 'entrezgene'], [['Protein ID', 'protein_id'],['RefSeq Peptide','refseq_peptide']], [], nil, :nocache => false, :wget_options => { :quiet => false})
|
27
28
|
|
29
|
+
assert(data['852236']['Protein ID'].include? 'CAA84864')
|
30
|
+
assert_equal 'Entrez Gene', data.key_field
|
31
|
+
assert_equal ['Protein ID', 'RefSeq Peptide'], data.fields
|
32
|
+
end
|
28
33
|
end
|
29
34
|
|
30
35
|
|
@@ -3,12 +3,12 @@ require 'rbbt/sources/entrez'
|
|
3
3
|
require 'test/unit'
|
4
4
|
|
5
5
|
class TestEntrez < Test::Unit::TestCase
|
6
|
-
$yeast_tax = 559292
|
6
|
+
$yeast_tax = [559292,4932]
|
7
7
|
|
8
8
|
def test_entrez2native
|
9
9
|
tax = $yeast_tax
|
10
10
|
fix = proc{|line| line.sub(/SGD:S0/,'S0') }
|
11
|
-
select = proc{|line| line.match(/\
|
11
|
+
select = proc{|line| line.match(/\tS0/)}
|
12
12
|
lexicon = Entrez.entrez2native(tax, :fix => fix, :select => select)
|
13
13
|
|
14
14
|
assert(lexicon['855611'].include? 'S000005056')
|
@@ -4,7 +4,6 @@ require 'rbbt/sources/go'
|
|
4
4
|
require 'test/unit'
|
5
5
|
|
6
6
|
class TestGo < Test::Unit::TestCase
|
7
|
-
|
8
7
|
def test_go
|
9
8
|
assert_match('vacuole inheritance',GO::id2name('GO:0000011'))
|
10
9
|
assert_equal(['vacuole inheritance','alpha-glucoside transport'], GO::id2name(['GO:0000011','GO:0000017']))
|
@@ -17,8 +16,6 @@ class TestGo < Test::Unit::TestCase
|
|
17
16
|
def test_namespace
|
18
17
|
assert_equal 'biological_process', GO.id2namespace('GO:0000001')
|
19
18
|
end
|
20
|
-
|
21
|
-
|
22
19
|
end
|
23
20
|
|
24
21
|
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
|
2
|
+
require 'rbbt/sources/organism'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
class TestEntrez < Test::Unit::TestCase
|
6
|
+
def test_identifiers
|
7
|
+
assert TSV.new(Organism.identifiers('Sce'))['S000006120']["Ensembl Gene ID"].include?('YPL199C')
|
8
|
+
assert TSV.new(Organism::Sce.identifiers)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
|
9
|
+
#assert Organism.identifiers('Hsa')['1020']["Associated Gene Name"].include?('CDK5')
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_lexicon
|
13
|
+
assert TSV.new(Organism.lexicon('Sce'))['S000006120'].flatten.include?('YPL199C')
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
|
@@ -21,7 +21,7 @@ class TestPubMed < Test::Unit::TestCase
|
|
21
21
|
assert(PubMed.get_article(pmids)[pmid].title == "Discovering semantic features in the literature: a foundation for building functional associations.")
|
22
22
|
end
|
23
23
|
|
24
|
-
def
|
24
|
+
def _test_full_text
|
25
25
|
pmid = '16438716'
|
26
26
|
assert(PubMed.get_article(pmid).full_text =~ /Discovering/)
|
27
27
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 23
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
8
|
+
- 2
|
9
9
|
- 0
|
10
|
-
version: 0.
|
10
|
+
version: 0.2.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Miguel Vazquez
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-12-
|
18
|
+
date: 2010-12-10 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -33,7 +33,7 @@ dependencies:
|
|
33
33
|
type: :runtime
|
34
34
|
version_requirements: *id001
|
35
35
|
- !ruby/object:Gem::Dependency
|
36
|
-
name:
|
36
|
+
name: rbbt-text
|
37
37
|
prerelease: false
|
38
38
|
requirement: &id002 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
@@ -47,7 +47,7 @@ dependencies:
|
|
47
47
|
type: :runtime
|
48
48
|
version_requirements: *id002
|
49
49
|
- !ruby/object:Gem::Dependency
|
50
|
-
name:
|
50
|
+
name: mechanize
|
51
51
|
prerelease: false
|
52
52
|
requirement: &id003 !ruby/object:Gem::Requirement
|
53
53
|
none: false
|
@@ -60,6 +60,20 @@ dependencies:
|
|
60
60
|
version: "0"
|
61
61
|
type: :runtime
|
62
62
|
version_requirements: *id003
|
63
|
+
- !ruby/object:Gem::Dependency
|
64
|
+
name: libxml-ruby
|
65
|
+
prerelease: false
|
66
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
hash: 3
|
72
|
+
segments:
|
73
|
+
- 0
|
74
|
+
version: "0"
|
75
|
+
type: :runtime
|
76
|
+
version_requirements: *id004
|
63
77
|
description: Data sources like PubMed, Entrez Gene, or Gene Ontology
|
64
78
|
email: miguel.vazquez@fdi.ucm.es
|
65
79
|
executables: []
|
@@ -76,9 +90,13 @@ files:
|
|
76
90
|
- lib/rbbt/sources/gscholar.rb
|
77
91
|
- lib/rbbt/sources/organism.rb
|
78
92
|
- lib/rbbt/sources/pubmed.rb
|
93
|
+
- share/install/Organism/Hsa/Rakefile
|
94
|
+
- share/install/Organism/Sce/Rakefile
|
95
|
+
- share/install/lib/helpers.rb
|
79
96
|
- test/rbbt/sources/test_biomart.rb
|
80
97
|
- test/rbbt/sources/test_entrez.rb
|
81
98
|
- test/rbbt/sources/test_go.rb
|
99
|
+
- test/rbbt/sources/test_organism.rb
|
82
100
|
- test/rbbt/sources/test_pubmed.rb
|
83
101
|
- test/test_helper.rb
|
84
102
|
has_rdoc: true
|
@@ -119,5 +137,6 @@ test_files:
|
|
119
137
|
- test/rbbt/sources/test_biomart.rb
|
120
138
|
- test/rbbt/sources/test_entrez.rb
|
121
139
|
- test/rbbt/sources/test_go.rb
|
140
|
+
- test/rbbt/sources/test_organism.rb
|
122
141
|
- test/rbbt/sources/test_pubmed.rb
|
123
142
|
- test/test_helper.rb
|