rbbt-sources 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/sources/biomart.rb +34 -12
- data/lib/rbbt/sources/entrez.rb +4 -1
- data/lib/rbbt/sources/go.rb +37 -38
- data/lib/rbbt/sources/organism.rb +7 -1
- data/share/install/Organism/Hsa/Rakefile +83 -0
- data/share/install/Organism/Sce/Rakefile +118 -0
- data/share/install/lib/helpers.rb +47 -0
- data/test/rbbt/sources/test_biomart.rb +15 -10
- data/test/rbbt/sources/test_entrez.rb +2 -2
- data/test/rbbt/sources/test_go.rb +0 -3
- data/test/rbbt/sources/test_organism.rb +17 -0
- data/test/rbbt/sources/test_pubmed.rb +1 -1
- metadata +25 -6
data/lib/rbbt/sources/biomart.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
require 'rbbt/util/
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/util/log'
|
3
3
|
|
4
4
|
# This module interacts with BioMart. It performs queries to BioMart and
|
5
5
|
# synthesises a hash with the results. Note that this module connects to the
|
@@ -9,6 +9,7 @@ require 'rbbt/util/open'
|
|
9
9
|
module BioMart
|
10
10
|
|
11
11
|
class BioMart::QueryError < StandardError; end
|
12
|
+
|
12
13
|
private
|
13
14
|
|
14
15
|
@@biomart_query_xml = <<-EOT
|
@@ -25,8 +26,7 @@ module BioMart
|
|
25
26
|
|
26
27
|
|
27
28
|
|
28
|
-
|
29
|
-
def self.get(database, main, attrs = nil, filters = nil, data = nil, options = {})
|
29
|
+
def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
30
30
|
attrs ||= []
|
31
31
|
filters ||= ["with_#{main}"]
|
32
32
|
data ||= {}
|
@@ -37,7 +37,7 @@ module BioMart
|
|
37
37
|
query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
|
38
38
|
query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
|
39
39
|
|
40
|
-
response = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/,' '),
|
40
|
+
response = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/,' '), open_options)
|
41
41
|
if response =~ /Query ERROR:/
|
42
42
|
raise BioMart::QueryError, response
|
43
43
|
end
|
@@ -51,8 +51,12 @@ module BioMart
|
|
51
51
|
attrs.each{|name|
|
52
52
|
value = parts.shift
|
53
53
|
data[main][name] ||= []
|
54
|
-
next if value.nil?
|
55
|
-
data[main][name]
|
54
|
+
next if value.nil? or value.empty?
|
55
|
+
if data[main][name]
|
56
|
+
data[main][name] = [value]
|
57
|
+
else
|
58
|
+
data[main][name] << value unless data[main][name].include? value
|
59
|
+
end
|
56
60
|
}
|
57
61
|
}
|
58
62
|
|
@@ -75,30 +79,48 @@ module BioMart
|
|
75
79
|
# the BioMart query to remove results with the main attribute empty, this may
|
76
80
|
# cause an error if the BioMart WS does not allow filtering with that
|
77
81
|
# attribute.
|
78
|
-
def self.query(database, main, attrs = nil, filters = nil, data = nil,
|
82
|
+
def self.query(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
83
|
+
open_options = Misc.add_defaults open_options, :nocache => false
|
79
84
|
attrs ||= []
|
80
85
|
data ||= {}
|
81
86
|
|
87
|
+
Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"
|
88
|
+
|
89
|
+
max_items = 2
|
82
90
|
chunks = []
|
83
91
|
chunk = []
|
84
92
|
attrs.each{|a|
|
85
93
|
chunk << a
|
86
|
-
if chunk.length ==
|
94
|
+
if chunk.length == max_items
|
87
95
|
chunks << chunk
|
88
96
|
chunk = []
|
89
97
|
end
|
90
98
|
}
|
91
99
|
|
92
100
|
chunks << chunk if chunk.any?
|
101
|
+
|
93
102
|
|
94
|
-
chunks.
|
95
|
-
|
103
|
+
Log.low "Chunks: #{chunks.length}"
|
104
|
+
chunks.each_with_index{|chunk,i|
|
105
|
+
Log.low "Chunk #{ i }: [#{chunk * ", "}]"
|
106
|
+
data = get(database, main, chunk, filters, data, open_options)
|
96
107
|
}
|
97
108
|
|
98
109
|
data
|
99
110
|
end
|
100
111
|
|
101
|
-
|
112
|
+
def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
113
|
+
codes = attrs.collect{|attr| attr[1]}
|
114
|
+
data = query(database, main.last, codes, filters, data, open_options)
|
115
|
+
tsv = TSV.new({})
|
116
|
+
|
117
|
+
data.each do |key, info|
|
118
|
+
tsv[key] = info.values_at(*codes)
|
119
|
+
end
|
102
120
|
|
121
|
+
tsv.key_field = main.first
|
122
|
+
tsv.fields = attrs.collect{|attr| attr.first}
|
123
|
+
tsv
|
124
|
+
end
|
103
125
|
end
|
104
126
|
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -14,7 +14,10 @@ module Entrez
|
|
14
14
|
taxs = [taxs] unless Array === taxs
|
15
15
|
options.merge! :grep => taxs
|
16
16
|
|
17
|
-
TSV.new(Rbbt.find_datafile('gene_info'), options)
|
17
|
+
tsv = TSV.new(Rbbt.find_datafile('gene_info'), options)
|
18
|
+
tsv.key_field = "Entrez Gene ID"
|
19
|
+
tsv.fields = ["Native ID"]
|
20
|
+
tsv
|
18
21
|
end
|
19
22
|
|
20
23
|
def self.entrez2pubmed(taxs)
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -4,66 +4,67 @@ require 'rbbt-util'
|
|
4
4
|
# now all it does is provide a translation form id to the actual names.
|
5
5
|
module GO
|
6
6
|
|
7
|
-
|
7
|
+
Rbbt.add_datafiles :gene_ontology => ['databases/GO', 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo'],
|
8
|
+
:goslim_generic => ['databases/GO', 'http://www.geneontology.org/GO_slims/goslim_generic.obo']
|
9
|
+
|
10
|
+
|
8
11
|
MULTIPLE_VALUE_FIELDS = %w(is_a)
|
12
|
+
TSV_GENE_ONTOLOGY = File.join(TSV.cachedir, 'gene_ontology')
|
9
13
|
|
10
14
|
# This method needs to be called before any translations can be made, it is
|
11
15
|
# called automatically the first time the id2name method is called. It loads
|
12
16
|
# the gene_ontology.obo file and extracts all the fields, although right now,
|
13
17
|
# only the name field is used.
|
14
18
|
def self.init
|
15
|
-
|
16
|
-
File.open(
|
17
|
-
split(/\[Term\]/).
|
18
|
-
each{|term|
|
19
|
+
info = TCHash.new(TSV_GENE_ONTOLOGY, true)
|
20
|
+
File.open(Rbbt.find_datafile('gene_ontology')).read.split(/\[Term\]/).each{|term|
|
19
21
|
term_info = {}
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
22
|
+
|
23
|
+
term.split(/\n/). select{|l| l =~ /:/}.each{|l|
|
24
|
+
key, value = l.chomp.match(/(.*?):(.*)/).values_at(1,2)
|
25
|
+
if MULTIPLE_VALUE_FIELDS.include? key.strip
|
26
|
+
term_info[key.strip] ||= []
|
27
|
+
term_info[key.strip] << value.strip
|
28
|
+
else
|
29
|
+
term_info[key.strip] = value.strip
|
30
|
+
end
|
31
|
+
}
|
32
|
+
|
33
|
+
next if term_info["id"].nil?
|
34
|
+
info[term_info["id"]] = term_info
|
35
|
+
}
|
36
|
+
info.close
|
33
37
|
end
|
34
38
|
|
35
39
|
def self.info
|
36
|
-
self.init unless
|
37
|
-
|
40
|
+
self.init unless File.exists? TSV_GENE_ONTOLOGY
|
41
|
+
TCHash.get(TSV_GENE_ONTOLOGY)
|
38
42
|
end
|
39
43
|
|
40
44
|
def self.goterms
|
41
|
-
|
42
|
-
@@info.keys
|
45
|
+
info.keys
|
43
46
|
end
|
44
47
|
|
45
48
|
def self.id2name(id)
|
46
|
-
self.init unless @@info
|
47
49
|
if id.kind_of? Array
|
48
|
-
|
50
|
+
info.values_at(*id).collect{|i| i['name'] if i}
|
49
51
|
else
|
50
|
-
return nil if
|
51
|
-
|
52
|
+
return nil if info[id].nil?
|
53
|
+
info[id]['name']
|
52
54
|
end
|
53
55
|
end
|
54
56
|
|
55
57
|
def self.id2ancestors(id)
|
56
|
-
self.init unless @@info
|
57
58
|
if id.kind_of? Array
|
58
|
-
|
59
|
+
info.values_at(*id).
|
59
60
|
select{|i| ! i['is_a'].nil?}.
|
60
61
|
collect{|i| i['is_a'].collect{|id|
|
61
|
-
|
62
|
-
|
62
|
+
id.match(/(GO:\d+)/)[1] if id.match(/(GO:\d+)/)
|
63
|
+
}.compact
|
63
64
|
}
|
64
65
|
else
|
65
|
-
return [] if
|
66
|
-
|
66
|
+
return [] if id.nil? or info[id].nil? or info[id]['is_a'].nil?
|
67
|
+
info[id]['is_a'].
|
67
68
|
collect{|id|
|
68
69
|
id.match(/(GO:\d+)/)[1] if id.match(/(GO:\d+)/)
|
69
70
|
}.compact
|
@@ -71,14 +72,12 @@ module GO
|
|
71
72
|
end
|
72
73
|
|
73
74
|
def self.id2namespace(id)
|
74
|
-
self.init unless
|
75
|
+
self.init unless info
|
75
76
|
if id.kind_of? Array
|
76
|
-
|
77
|
+
info.values_at(*id).collect{|i| i['namespace'] if i}
|
77
78
|
else
|
78
|
-
return nil if
|
79
|
-
|
79
|
+
return nil if info[id].nil?
|
80
|
+
info[id]['namespace']
|
80
81
|
end
|
81
82
|
end
|
82
|
-
|
83
|
-
|
84
83
|
end
|
@@ -1,9 +1,15 @@
|
|
1
1
|
require 'rbbt-util'
|
2
|
+
require 'rbbt/util/data_module'
|
3
|
+
|
2
4
|
module Organism
|
3
5
|
class OrganismNotProcessedError < StandardError; end
|
4
6
|
|
5
7
|
def self.datadir(org)
|
6
8
|
File.join(Rbbt.datadir, 'organisms', org)
|
7
9
|
end
|
8
|
-
|
10
|
+
|
11
|
+
extend DataModule
|
12
|
+
|
13
|
+
Hsa = with_key('Hsa')
|
14
|
+
Sce = with_key('Sce')
|
9
15
|
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
+
require 'rbbt/sources/biomart'
|
3
|
+
require 'rbbt/sources/entrez'
|
4
|
+
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
|
+
|
6
|
+
$taxs = [559292,4932]
|
7
|
+
$native = "SGD ID"
|
8
|
+
$url = "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab"
|
9
|
+
$biomart_db = 'hsapiens_gene_ensembl'
|
10
|
+
$biomart_main = ['Entrez Gene ID', 'entrezgene']
|
11
|
+
$biomart_lexicon = [
|
12
|
+
[ 'Associated Gene Name' , "external_gene_id"],
|
13
|
+
[ 'HGNC symbol', "hgnc_symbol" ],
|
14
|
+
[ 'HGNC automatic gene name', "hgnc_automatic_gene_name" ],
|
15
|
+
[ 'HGNC curated gene name ', "hgnc_curated_gene_name" ],
|
16
|
+
]
|
17
|
+
|
18
|
+
$biomart_identifiers = [
|
19
|
+
[ 'Ensembl Gene ID', "ensembl_gene_id" ],
|
20
|
+
[ 'Ensembl Protein ID', "ensembl_peptide_id" ],
|
21
|
+
[ 'Associated Gene Name', "external_gene_id" ],
|
22
|
+
[ 'CCDS ID', "ccds" ],
|
23
|
+
[ 'Protein ID', "protein_id" ],
|
24
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
25
|
+
[ 'Unigene ID', "unigene" ],
|
26
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
27
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
28
|
+
[ 'HGNC ID', "hgnc_id", 'HGNC'],
|
29
|
+
[ 'EMBL (Genbank) ID' , "embl"] ,
|
30
|
+
|
31
|
+
# Affymetrix
|
32
|
+
[ 'AFFY HC G110', 'affy_hc_g110' ],
|
33
|
+
[ 'AFFY HG FOCUS', 'affy_hg_focus' ],
|
34
|
+
[ 'AFFY HG U133-PLUS-2', 'affy_hg_u133_plus_2' ],
|
35
|
+
[ 'AFFY HG U133A_2', 'affy_hg_u133a_2' ],
|
36
|
+
[ 'AFFY HG U133A', 'affy_hg_u133a' ],
|
37
|
+
[ 'AFFY HG U133B', 'affy_hg_u133b' ],
|
38
|
+
[ 'AFFY HG U95AV2', 'affy_hg_u95av2' ],
|
39
|
+
[ 'AFFY HG U95B', 'affy_hg_u95b' ],
|
40
|
+
[ 'AFFY HG U95C', 'affy_hg_u95c' ],
|
41
|
+
[ 'AFFY HG U95D', 'affy_hg_u95d' ],
|
42
|
+
[ 'AFFY HG U95E', 'affy_hg_u95e' ],
|
43
|
+
[ 'AFFY HG U95A', 'affy_hg_u95a' ],
|
44
|
+
[ 'AFFY HUGENEFL', 'affy_hugenefl' ],
|
45
|
+
[ 'AFFY HuEx', 'affy_huex_1_0_st_v2' ],
|
46
|
+
[ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
|
47
|
+
[ 'AFFY U133 X3P', 'affy_u133_x3p' ],
|
48
|
+
[ 'Agilent WholeGenome',"agilent_wholegenome" ],
|
49
|
+
[ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
|
50
|
+
[ 'Codelink ID', 'codelink' ],
|
51
|
+
[ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],
|
52
|
+
[ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
|
53
|
+
]
|
54
|
+
|
55
|
+
|
56
|
+
|
57
|
+
file 'name' do |t|
|
58
|
+
File.open(t.name, 'w') do |f| f.puts "Homo sapiens" end
|
59
|
+
end
|
60
|
+
|
61
|
+
file 'lexicon' do |t|
|
62
|
+
lexicon = tsv_file('http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag',
|
63
|
+
"HGNC ID", nil, :flatten => true, :header_hash => '')
|
64
|
+
merge_biomart lexicon, $biomart_db, $biomart_main, $biomart_lexicon, "HGNC ID"
|
65
|
+
|
66
|
+
File.open(t.name, 'w') do |f| f.puts lexicon end
|
67
|
+
end
|
68
|
+
|
69
|
+
file 'identifiers' do |t|
|
70
|
+
identifiers = BioMart.tsv($biomart_db, $biomart_main, $biomart_identifiers)
|
71
|
+
$biomart_identifiers.each do |name, key, prefix|
|
72
|
+
if prefix
|
73
|
+
identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
File.open(t.name, 'w') do |f| f.puts identifiers end
|
78
|
+
end
|
79
|
+
|
80
|
+
|
81
|
+
task :default => ['name', 'lexicon', 'identifiers']
|
82
|
+
|
83
|
+
|
@@ -0,0 +1,118 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
+
require 'rbbt/sources/biomart'
|
3
|
+
require 'rbbt/sources/entrez'
|
4
|
+
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
|
+
|
6
|
+
$taxs = [559292,4932]
|
7
|
+
$native = "SGD ID"
|
8
|
+
$url = "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab"
|
9
|
+
$biomart_db = 'scerevisiae_gene_ensembl'
|
10
|
+
$biomart_main = ['Entrez Gene ID', 'entrezgene']
|
11
|
+
|
12
|
+
|
13
|
+
file 'name' do |t|
|
14
|
+
File.open(t.name, 'w') do |f| f.puts "Saccharomyces cerevisiae" end
|
15
|
+
end
|
16
|
+
|
17
|
+
file 'lexicon' do |t|
|
18
|
+
lexicon = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)
|
19
|
+
|
20
|
+
merge_entrez(lexicon, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tS0/)})
|
21
|
+
|
22
|
+
merge_biomart(lexicon, $biomart_db, $biomart_main, [['Interpro Description' , "interpro_description"]])
|
23
|
+
|
24
|
+
lexicon = lexicon.slice(lexicon.fields - ["Entrez Gene ID"])
|
25
|
+
|
26
|
+
File.open(t.name, 'w') do |f| f.puts lexicon end
|
27
|
+
end
|
28
|
+
|
29
|
+
file 'identifiers' do |t|
|
30
|
+
identifiers = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)
|
31
|
+
|
32
|
+
merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tS0/)})
|
33
|
+
|
34
|
+
merge_biomart(identifiers, $biomart_db, $biomart_main,
|
35
|
+
[['Associated Gene Name' , "external_gene_id"],
|
36
|
+
['Ensembl Gene ID', "ensembl_gene_id" ],
|
37
|
+
['Ensembl Protein ID', "ensembl_peptide_id" ],
|
38
|
+
['RefSeq Protein ID' , "refseq_peptide"] ,
|
39
|
+
['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
40
|
+
['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
41
|
+
['Protein ID' , "protein_id"] ,
|
42
|
+
['EMBL (Genbank) ID' , "embl"] ,
|
43
|
+
# Affymetrix
|
44
|
+
['Affy yeast 2',"affy_yeast_2"],
|
45
|
+
['Affy yg s98', "affy_yg_s98"]])
|
46
|
+
|
47
|
+
File.open(t.name, 'w') do |f| f.puts identifiers end
|
48
|
+
end
|
49
|
+
|
50
|
+
|
51
|
+
task :default => ['name', 'lexicon', 'identifiers']
|
52
|
+
|
53
|
+
#require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
54
|
+
#
|
55
|
+
#$name = "Saccharomyces cerevisiae"
|
56
|
+
#
|
57
|
+
#
|
58
|
+
#$native_id = "SGD DB Id"
|
59
|
+
#
|
60
|
+
#$entrez2native = {
|
61
|
+
# :tax => 559292,
|
62
|
+
# :fix => proc{|code| code.sub(/SGD:S0/,'S0') },
|
63
|
+
# :check => proc{|code| code.match(/^S0/)},
|
64
|
+
#}
|
65
|
+
#
|
66
|
+
#$lexicon = {
|
67
|
+
# :file => {
|
68
|
+
# :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
|
69
|
+
# :native => 0,
|
70
|
+
# :extra => [4,3,5]
|
71
|
+
# },
|
72
|
+
# :biomart => {
|
73
|
+
# :database => 'scerevisiae_gene_ensembl',
|
74
|
+
# :main => ['Entrez Gene ID', 'entrezgene'],
|
75
|
+
# :extra => [
|
76
|
+
# ['Interpro Description' , "interpro_description"],
|
77
|
+
# ],
|
78
|
+
# :filter => [],
|
79
|
+
# }
|
80
|
+
#
|
81
|
+
#}
|
82
|
+
#
|
83
|
+
#$identifiers = {
|
84
|
+
# :file => {
|
85
|
+
# :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
|
86
|
+
# :native => 0,
|
87
|
+
# :extra => [],
|
88
|
+
# },
|
89
|
+
# :biomart => {
|
90
|
+
# :database => 'scerevisiae_gene_ensembl',
|
91
|
+
# :main => ['Entrez Gene ID', 'entrezgene'],
|
92
|
+
# :extra => [
|
93
|
+
# ['Associated Gene Name' , "external_gene_id"],
|
94
|
+
# ['Ensembl Gene ID', "ensembl_gene_id" ],
|
95
|
+
# ['Ensembl Protein ID', "ensembl_peptide_id" ],
|
96
|
+
# ['RefSeq Protein ID' , "refseq_peptide"] ,
|
97
|
+
# ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
98
|
+
# ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
99
|
+
# ['Protein ID' , "protein_id"] ,
|
100
|
+
# ['EMBL (Genbank) ID' , "embl"] ,
|
101
|
+
# # Affymetrix
|
102
|
+
# ['Affy yeast 2',"affy_yeast_2"],
|
103
|
+
# ['Affy yg s98', "affy_yg_s98"],
|
104
|
+
# ],
|
105
|
+
# :filter => [],
|
106
|
+
# }
|
107
|
+
#}
|
108
|
+
#
|
109
|
+
#$go = {
|
110
|
+
# :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation/gene_association.sgd.gz",
|
111
|
+
# :code => 1,
|
112
|
+
# :go => 4,
|
113
|
+
# :pmid => 5,
|
114
|
+
#}
|
115
|
+
#
|
116
|
+
#$query = '"saccharomyces cerevisiae"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
|
117
|
+
#
|
118
|
+
#
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/sources/biomart'
|
3
|
+
require 'rbbt/sources/entrez'
|
4
|
+
|
5
|
+
def tsv_file(url, native, extra, options = {})
|
6
|
+
options = Misc.add_defaults options, :persistence => false, :keep_empty => true
|
7
|
+
|
8
|
+
case
|
9
|
+
when Array === native
|
10
|
+
options = Misc.add_defaults options, :native => native.last
|
11
|
+
key_field = native.first
|
12
|
+
when (String === native or Integer === native)
|
13
|
+
options = Misc.add_defaults options, :native => native
|
14
|
+
key_field = nil
|
15
|
+
else
|
16
|
+
key_field = nil
|
17
|
+
end
|
18
|
+
|
19
|
+
case
|
20
|
+
when (Array === extra and Array === extra.first)
|
21
|
+
options = Misc.add_defaults options, :extra => extra.collect{|e| e.last}
|
22
|
+
fields = extra.collect{|e| e.first}
|
23
|
+
when (Array === extra and not Array === extra.first)
|
24
|
+
options = Misc.add_defaults options, :extra => extra
|
25
|
+
fields = (1..extra.length).to_a.collect{|i| "Field#{i}"}
|
26
|
+
else
|
27
|
+
fields = nil
|
28
|
+
end
|
29
|
+
|
30
|
+
tsv = TSV.new(Open.open(url), options)
|
31
|
+
tsv.key_field ||= key_field
|
32
|
+
tsv.fields ||= fields
|
33
|
+
tsv
|
34
|
+
end
|
35
|
+
|
36
|
+
def merge_entrez(data, taxs, native, fix = nil, select = nil)
|
37
|
+
entrez = Entrez.entrez2native(taxs, :fix => fix, :select => select)
|
38
|
+
entrez.fields = [native]
|
39
|
+
entrez
|
40
|
+
|
41
|
+
data.smart_merge entrez, native
|
42
|
+
end
|
43
|
+
|
44
|
+
def merge_biomart(lexicon, db, native, other, match = nil)
|
45
|
+
match ||= native.first
|
46
|
+
lexicon.smart_merge BioMart.tsv(db, native, other), match
|
47
|
+
end
|
@@ -9,22 +9,27 @@ class TestBioMart < Test::Unit::TestCase
|
|
9
9
|
BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
|
10
10
|
end
|
11
11
|
|
12
|
-
data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache =>
|
13
|
-
assert(data['
|
14
|
-
|
15
|
-
data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['external_gene_id'],[], data, :nocache => true, :wget_options => { :quiet => false} )
|
16
|
-
assert(data['856452']['protein_id'].include? 'AAB68382')
|
17
|
-
assert(data['856452']['external_gene_id'].include? 'CUP1-2')
|
12
|
+
data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache => false, :wget_options => { :quiet => false})
|
13
|
+
assert(data['852236']['protein_id'].include? 'CAA84864')
|
18
14
|
|
15
|
+
data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['external_gene_id'],[], data, :nocache => false, :wget_options => { :quiet => false} )
|
16
|
+
assert(data['852236']['protein_id'].include? 'CAA84864')
|
17
|
+
assert(data['852236']['external_gene_id'].include? 'YBL044W')
|
19
18
|
end
|
20
19
|
|
21
20
|
def test_query
|
22
|
-
data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache =>
|
21
|
+
data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => false, :wget_options => { :quiet => false})
|
22
|
+
|
23
|
+
assert(data['852236']['external_gene_id'].include? 'YBL044W')
|
24
|
+
end
|
23
25
|
|
24
|
-
|
25
|
-
|
26
|
-
end
|
26
|
+
def test_tsv
|
27
|
+
data = BioMart.tsv('scerevisiae_gene_ensembl',['Entrez Gene', 'entrezgene'], [['Protein ID', 'protein_id'],['RefSeq Peptide','refseq_peptide']], [], nil, :nocache => false, :wget_options => { :quiet => false})
|
27
28
|
|
29
|
+
assert(data['852236']['Protein ID'].include? 'CAA84864')
|
30
|
+
assert_equal 'Entrez Gene', data.key_field
|
31
|
+
assert_equal ['Protein ID', 'RefSeq Peptide'], data.fields
|
32
|
+
end
|
28
33
|
end
|
29
34
|
|
30
35
|
|
@@ -3,12 +3,12 @@ require 'rbbt/sources/entrez'
|
|
3
3
|
require 'test/unit'
|
4
4
|
|
5
5
|
class TestEntrez < Test::Unit::TestCase
|
6
|
-
$yeast_tax = 559292
|
6
|
+
$yeast_tax = [559292,4932]
|
7
7
|
|
8
8
|
def test_entrez2native
|
9
9
|
tax = $yeast_tax
|
10
10
|
fix = proc{|line| line.sub(/SGD:S0/,'S0') }
|
11
|
-
select = proc{|line| line.match(/\
|
11
|
+
select = proc{|line| line.match(/\tS0/)}
|
12
12
|
lexicon = Entrez.entrez2native(tax, :fix => fix, :select => select)
|
13
13
|
|
14
14
|
assert(lexicon['855611'].include? 'S000005056')
|
@@ -4,7 +4,6 @@ require 'rbbt/sources/go'
|
|
4
4
|
require 'test/unit'
|
5
5
|
|
6
6
|
class TestGo < Test::Unit::TestCase
|
7
|
-
|
8
7
|
def test_go
|
9
8
|
assert_match('vacuole inheritance',GO::id2name('GO:0000011'))
|
10
9
|
assert_equal(['vacuole inheritance','alpha-glucoside transport'], GO::id2name(['GO:0000011','GO:0000017']))
|
@@ -17,8 +16,6 @@ class TestGo < Test::Unit::TestCase
|
|
17
16
|
def test_namespace
|
18
17
|
assert_equal 'biological_process', GO.id2namespace('GO:0000001')
|
19
18
|
end
|
20
|
-
|
21
|
-
|
22
19
|
end
|
23
20
|
|
24
21
|
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
|
2
|
+
require 'rbbt/sources/organism'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
class TestEntrez < Test::Unit::TestCase
|
6
|
+
def test_identifiers
|
7
|
+
assert TSV.new(Organism.identifiers('Sce'))['S000006120']["Ensembl Gene ID"].include?('YPL199C')
|
8
|
+
assert TSV.new(Organism::Sce.identifiers)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
|
9
|
+
#assert Organism.identifiers('Hsa')['1020']["Associated Gene Name"].include?('CDK5')
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_lexicon
|
13
|
+
assert TSV.new(Organism.lexicon('Sce'))['S000006120'].flatten.include?('YPL199C')
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
|
@@ -21,7 +21,7 @@ class TestPubMed < Test::Unit::TestCase
|
|
21
21
|
assert(PubMed.get_article(pmids)[pmid].title == "Discovering semantic features in the literature: a foundation for building functional associations.")
|
22
22
|
end
|
23
23
|
|
24
|
-
def
|
24
|
+
def _test_full_text
|
25
25
|
pmid = '16438716'
|
26
26
|
assert(PubMed.get_article(pmid).full_text =~ /Discovering/)
|
27
27
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 23
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
8
|
+
- 2
|
9
9
|
- 0
|
10
|
-
version: 0.
|
10
|
+
version: 0.2.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Miguel Vazquez
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-12-
|
18
|
+
date: 2010-12-10 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -33,7 +33,7 @@ dependencies:
|
|
33
33
|
type: :runtime
|
34
34
|
version_requirements: *id001
|
35
35
|
- !ruby/object:Gem::Dependency
|
36
|
-
name:
|
36
|
+
name: rbbt-text
|
37
37
|
prerelease: false
|
38
38
|
requirement: &id002 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
@@ -47,7 +47,7 @@ dependencies:
|
|
47
47
|
type: :runtime
|
48
48
|
version_requirements: *id002
|
49
49
|
- !ruby/object:Gem::Dependency
|
50
|
-
name:
|
50
|
+
name: mechanize
|
51
51
|
prerelease: false
|
52
52
|
requirement: &id003 !ruby/object:Gem::Requirement
|
53
53
|
none: false
|
@@ -60,6 +60,20 @@ dependencies:
|
|
60
60
|
version: "0"
|
61
61
|
type: :runtime
|
62
62
|
version_requirements: *id003
|
63
|
+
- !ruby/object:Gem::Dependency
|
64
|
+
name: libxml-ruby
|
65
|
+
prerelease: false
|
66
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
hash: 3
|
72
|
+
segments:
|
73
|
+
- 0
|
74
|
+
version: "0"
|
75
|
+
type: :runtime
|
76
|
+
version_requirements: *id004
|
63
77
|
description: Data sources like PubMed, Entrez Gene, or Gene Ontology
|
64
78
|
email: miguel.vazquez@fdi.ucm.es
|
65
79
|
executables: []
|
@@ -76,9 +90,13 @@ files:
|
|
76
90
|
- lib/rbbt/sources/gscholar.rb
|
77
91
|
- lib/rbbt/sources/organism.rb
|
78
92
|
- lib/rbbt/sources/pubmed.rb
|
93
|
+
- share/install/Organism/Hsa/Rakefile
|
94
|
+
- share/install/Organism/Sce/Rakefile
|
95
|
+
- share/install/lib/helpers.rb
|
79
96
|
- test/rbbt/sources/test_biomart.rb
|
80
97
|
- test/rbbt/sources/test_entrez.rb
|
81
98
|
- test/rbbt/sources/test_go.rb
|
99
|
+
- test/rbbt/sources/test_organism.rb
|
82
100
|
- test/rbbt/sources/test_pubmed.rb
|
83
101
|
- test/test_helper.rb
|
84
102
|
has_rdoc: true
|
@@ -119,5 +137,6 @@ test_files:
|
|
119
137
|
- test/rbbt/sources/test_biomart.rb
|
120
138
|
- test/rbbt/sources/test_entrez.rb
|
121
139
|
- test/rbbt/sources/test_go.rb
|
140
|
+
- test/rbbt/sources/test_organism.rb
|
122
141
|
- test/rbbt/sources/test_pubmed.rb
|
123
142
|
- test/test_helper.rb
|