rbbt-sources 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
1
+ require 'rbbt-util'
2
+ require 'rbbt/util/log'
3
3
 
4
4
  # This module interacts with BioMart. It performs queries to BioMart and
5
5
  # synthesises a hash with the results. Note that this module connects to the
@@ -9,6 +9,7 @@ require 'rbbt/util/open'
9
9
  module BioMart
10
10
 
11
11
  class BioMart::QueryError < StandardError; end
12
+
12
13
  private
13
14
 
14
15
  @@biomart_query_xml = <<-EOT
@@ -25,8 +26,7 @@ module BioMart
25
26
 
26
27
 
27
28
 
28
-
29
- def self.get(database, main, attrs = nil, filters = nil, data = nil, options = {})
29
+ def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
30
30
  attrs ||= []
31
31
  filters ||= ["with_#{main}"]
32
32
  data ||= {}
@@ -37,7 +37,7 @@ module BioMart
37
37
  query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
38
38
  query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
39
39
 
40
- response = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/,' '), options)
40
+ response = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/,' '), open_options)
41
41
  if response =~ /Query ERROR:/
42
42
  raise BioMart::QueryError, response
43
43
  end
@@ -51,8 +51,12 @@ module BioMart
51
51
  attrs.each{|name|
52
52
  value = parts.shift
53
53
  data[main][name] ||= []
54
- next if value.nil?
55
- data[main][name] << value
54
+ next if value.nil? or value.empty?
55
+ if data[main][name]
56
+ data[main][name] = [value]
57
+ else
58
+ data[main][name] << value unless data[main][name].include? value
59
+ end
56
60
  }
57
61
  }
58
62
 
@@ -75,30 +79,48 @@ module BioMart
75
79
  # the BioMart query to remove results with the main attribute empty, this may
76
80
  # cause an error if the BioMart WS does not allow filtering with that
77
81
  # attribute.
78
# Runs a BioMart query for +main+ plus every attribute in +attrs+ and returns
# the accumulated +data+ hash.
#
# The attributes are requested in groups of +max_items+: one call to +get+
# per group, each one folding its results into the same +data+ hash.
#
# database     - BioMart dataset name, handed to +get+ unchanged.
# main         - main attribute, handed to +get+ unchanged.
# attrs        - attribute names to retrieve (nil is treated as []).
# filters      - filters handed to +get+ unchanged.
# data         - hash to accumulate into (nil starts from an empty hash).
# open_options - options forwarded to +get+; <tt>:nocache => false</tt> is
#                supplied as a default through Misc.add_defaults.
def self.query(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
  open_options = Misc.add_defaults open_options, :nocache => false
  attrs ||= []
  data  ||= {}

  Log.low "BioMart query: '#{main}' [#{attrs * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"

  # each_slice yields the same groups as a manual accumulate-and-flush loop
  # and leaves no outer local for the block parameter below to shadow.
  max_items = 2
  chunks = attrs.each_slice(max_items).to_a

  Log.low "Chunks: #{chunks.length}"
  chunks.each_with_index do |chunk, i|
    Log.low "Chunk #{ i }: [#{chunk * ", "}]"
    data = get(database, main, chunk, filters, data, open_options)
  end

  data
end
100
111
 
101
-
112
# Runs a BioMart query and returns the result as a TSV.
#
# database     - BioMart dataset name, handed to +query+.
# main         - [field name, BioMart code] pair for the key column; the code
#                (last element) is queried, the name (first element) becomes
#                the TSV key field.
# attrs        - list of entries whose first element is the field name and
#                whose second element is the BioMart code; extra elements are
#                ignored here. nil is treated as an empty list.
# filters, data, open_options - handed to +query+ unchanged.
#
# Each row holds the values for the requested codes, in the order of +attrs+.
def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
  # attrs defaults to nil in the signature, so normalize before iterating.
  attrs ||= []

  # The code is always the second element; entries may carry a third one.
  codes = attrs.collect { |attr| attr[1] }
  data  = query(database, main.last, codes, filters, data, open_options)

  tsv = TSV.new({})
  data.each do |key, info|
    tsv[key] = info.values_at(*codes)
  end

  tsv.key_field = main.first
  tsv.fields    = attrs.collect { |attr| attr.first }
  tsv
end
103
125
  end
104
126
 
@@ -14,7 +14,10 @@ module Entrez
14
14
  taxs = [taxs] unless Array === taxs
15
15
  options.merge! :grep => taxs
16
16
 
17
- TSV.new(Rbbt.find_datafile('gene_info'), options)
17
+ tsv = TSV.new(Rbbt.find_datafile('gene_info'), options)
18
+ tsv.key_field = "Entrez Gene ID"
19
+ tsv.fields = ["Native ID"]
20
+ tsv
18
21
  end
19
22
 
20
23
  def self.entrez2pubmed(taxs)
@@ -4,66 +4,67 @@ require 'rbbt-util'
4
4
  # now all it does is provide a translation from id to the actual names.
5
5
  module GO
6
6
 
7
- @@info = nil
7
+ Rbbt.add_datafiles :gene_ontology => ['databases/GO', 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo'],
8
+ :goslim_generic => ['databases/GO', 'http://www.geneontology.org/GO_slims/goslim_generic.obo']
9
+
10
+
8
11
  MULTIPLE_VALUE_FIELDS = %w(is_a)
12
+ TSV_GENE_ONTOLOGY = File.join(TSV.cachedir, 'gene_ontology')
9
13
 
10
14
  # This method needs to be called before any translations can be made, it is
11
15
  # called automatically the first time the id2name method is called. It loads
12
16
  # the gene_ontology.obo file and extracts all the fields, although right now,
13
17
  # only the name field is used.
14
18
  def self.init
15
- @@info = {}
16
- File.open(File.join(Rbbt.datadir, 'dbs/go/gene_ontology.obo')).read.
17
- split(/\[Term\]/).
18
- each{|term|
19
+ info = TCHash.new(TSV_GENE_ONTOLOGY, true)
20
+ File.open(Rbbt.find_datafile('gene_ontology')).read.split(/\[Term\]/).each{|term|
19
21
  term_info = {}
20
- term.split(/\n/).
21
- select{|l| l =~ /:/}.
22
- each{|l|
23
- key, value = l.chomp.match(/(.*?):(.*)/).values_at(1,2)
24
- if MULTIPLE_VALUE_FIELDS.include? key.strip
25
- term_info[key.strip] ||= []
26
- term_info[key.strip] << value.strip
27
- else
28
- term_info[key.strip] = value.strip
29
- end
30
- }
31
- @@info[term_info["id"]] = term_info
32
- }
22
+
23
+ term.split(/\n/). select{|l| l =~ /:/}.each{|l|
24
+ key, value = l.chomp.match(/(.*?):(.*)/).values_at(1,2)
25
+ if MULTIPLE_VALUE_FIELDS.include? key.strip
26
+ term_info[key.strip] ||= []
27
+ term_info[key.strip] << value.strip
28
+ else
29
+ term_info[key.strip] = value.strip
30
+ end
31
+ }
32
+
33
+ next if term_info["id"].nil?
34
+ info[term_info["id"]] = term_info
35
+ }
36
+ info.close
33
37
  end
34
38
 
35
39
  def self.info
36
- self.init unless @@info
37
- @@info
40
+ self.init unless File.exists? TSV_GENE_ONTOLOGY
41
+ TCHash.get(TSV_GENE_ONTOLOGY)
38
42
  end
39
43
 
40
44
  def self.goterms
41
- self.init unless @@info
42
- @@info.keys
45
+ info.keys
43
46
  end
44
47
 
45
48
  def self.id2name(id)
46
- self.init unless @@info
47
49
  if id.kind_of? Array
48
- @@info.values_at(*id).collect{|i| i['name'] if i}
50
+ info.values_at(*id).collect{|i| i['name'] if i}
49
51
  else
50
- return nil if @@info[id].nil?
51
- @@info[id]['name']
52
+ return nil if info[id].nil?
53
+ info[id]['name']
52
54
  end
53
55
  end
54
56
 
55
57
  def self.id2ancestors(id)
56
- self.init unless @@info
57
58
  if id.kind_of? Array
58
- @@info.values_at(*id).
59
+ info.values_at(*id).
59
60
  select{|i| ! i['is_a'].nil?}.
60
61
  collect{|i| i['is_a'].collect{|id|
61
- id.match(/(GO:\d+)/)[1] if id.match(/(GO:\d+)/)
62
- }.compact
62
+ id.match(/(GO:\d+)/)[1] if id.match(/(GO:\d+)/)
63
+ }.compact
63
64
  }
64
65
  else
65
- return [] if @@info[id].nil? || @@info[id]['is_a'].nil?
66
- @@info[id]['is_a'].
66
+ return [] if id.nil? or info[id].nil? or info[id]['is_a'].nil?
67
+ info[id]['is_a'].
67
68
  collect{|id|
68
69
  id.match(/(GO:\d+)/)[1] if id.match(/(GO:\d+)/)
69
70
  }.compact
@@ -71,14 +72,12 @@ module GO
71
72
  end
72
73
 
73
74
  def self.id2namespace(id)
74
- self.init unless @@info
75
+ self.init unless info
75
76
  if id.kind_of? Array
76
- @@info.values_at(*id).collect{|i| i['namespace'] if i}
77
+ info.values_at(*id).collect{|i| i['namespace'] if i}
77
78
  else
78
- return nil if @@info[id].nil?
79
- @@info[id]['namespace']
79
+ return nil if info[id].nil?
80
+ info[id]['namespace']
80
81
  end
81
82
  end
82
-
83
-
84
83
  end
@@ -1,9 +1,15 @@
1
1
  require 'rbbt-util'
2
+ require 'rbbt/util/data_module'
3
+
2
4
  module Organism
3
5
  class OrganismNotProcessedError < StandardError; end
4
6
 
5
7
  def self.datadir(org)
6
8
  File.join(Rbbt.datadir, 'organisms', org)
7
9
  end
8
-
10
+
11
+ extend DataModule
12
+
13
+ Hsa = with_key('Hsa')
14
+ Sce = with_key('Sce')
9
15
  end
@@ -0,0 +1,83 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
2
+ require 'rbbt/sources/biomart'
3
+ require 'rbbt/sources/entrez'
4
+ require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
+
6
# NOTE(review): $taxs, $native and $url hold the yeast (SGD) values, and none
# of the tasks in this Rakefile read them. They look like copy-paste leftovers
# from the Sce Rakefile - confirm before relying on them for Homo sapiens.
$taxs = [559292,4932]
$native = "SGD ID"
$url = "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab"

# BioMart dataset and key column, as a [field name, BioMart code] pair.
$biomart_db = 'hsapiens_gene_ensembl'
$biomart_main = ['Entrez Gene ID', 'entrezgene']

# [field name, BioMart code] pairs merged into the lexicon.
$biomart_lexicon = [
  [ 'Associated Gene Name' , "external_gene_id"],
  [ 'HGNC symbol', "hgnc_symbol" ],
  [ 'HGNC automatic gene name', "hgnc_automatic_gene_name" ],
  # The label used to end in a stray space, which would leak into the field name.
  [ 'HGNC curated gene name', "hgnc_curated_gene_name" ],
]

# [field name, BioMart code, optional value prefix] entries for the
# identifiers file. The 'identifiers' task prepends "<prefix>:" to every value
# of a field that has a prefix.
$biomart_identifiers = [
  [ 'Ensembl Gene ID', "ensembl_gene_id" ],
  [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
  [ 'Associated Gene Name', "external_gene_id" ],
  [ 'CCDS ID', "ccds" ],
  [ 'Protein ID', "protein_id" ],
  [ 'RefSeq Protein ID', "refseq_peptide" ],
  [ 'Unigene ID', "unigene" ],
  [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
  [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
  [ 'HGNC ID', "hgnc_id", 'HGNC'],
  [ 'EMBL (Genbank) ID' , "embl"] ,

  # Affymetrix
  [ 'AFFY HC G110', 'affy_hc_g110' ],
  [ 'AFFY HG FOCUS', 'affy_hg_focus' ],
  [ 'AFFY HG U133-PLUS-2', 'affy_hg_u133_plus_2' ],
  [ 'AFFY HG U133A_2', 'affy_hg_u133a_2' ],
  [ 'AFFY HG U133A', 'affy_hg_u133a' ],
  [ 'AFFY HG U133B', 'affy_hg_u133b' ],
  [ 'AFFY HG U95AV2', 'affy_hg_u95av2' ],
  [ 'AFFY HG U95B', 'affy_hg_u95b' ],
  [ 'AFFY HG U95C', 'affy_hg_u95c' ],
  [ 'AFFY HG U95D', 'affy_hg_u95d' ],
  [ 'AFFY HG U95E', 'affy_hg_u95e' ],
  [ 'AFFY HG U95A', 'affy_hg_u95a' ],
  [ 'AFFY HUGENEFL', 'affy_hugenefl' ],
  [ 'AFFY HuEx', 'affy_huex_1_0_st_v2' ],
  [ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
  [ 'AFFY U133 X3P', 'affy_u133_x3p' ],
  [ 'Agilent WholeGenome',"agilent_wholegenome" ],
  [ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
  [ 'Codelink ID', 'codelink' ],
  [ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],
  [ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
]
54
+
55
+
56
+
57
# Writes the organism's scientific name.
file 'name' do |t|
  File.open(t.name, 'w') do |out|
    out.puts "Homo sapiens"
  end
end

# Builds the lexicon from the HGNC download and merges in the BioMart fields
# listed in $biomart_lexicon, matching on "HGNC ID".
file 'lexicon' do |t|
  hgnc_url = 'http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag'

  lexicon = tsv_file(hgnc_url, "HGNC ID", nil, :flatten => true, :header_hash => '')
  merge_biomart lexicon, $biomart_db, $biomart_main, $biomart_lexicon, "HGNC ID"

  File.open(t.name, 'w') do |out|
    out.puts lexicon
  end
end

# Builds the identifiers table from BioMart, then prepends "<prefix>:" to the
# values of every field whose $biomart_identifiers entry has a third element.
file 'identifiers' do |t|
  identifiers = BioMart.tsv($biomart_db, $biomart_main, $biomart_identifiers)

  $biomart_identifiers.each do |name, _code, prefix|
    next unless prefix

    identifiers.process(name) do |field, _key, _values|
      field.each { |v| v.replace "#{prefix}:#{v}" }
    end
  end

  File.open(t.name, 'w') do |out|
    out.puts identifiers
  end
end


task :default => ['name', 'lexicon', 'identifiers']
82
+
83
+
@@ -0,0 +1,118 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
2
+ require 'rbbt/sources/biomart'
3
+ require 'rbbt/sources/entrez'
4
+ require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
+
6
# Entrez taxonomy ids handed to merge_entrez.
$taxs = [559292,4932]
# Name of the native key field and location of the SGD features table.
$native = "SGD ID"
$url = "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab"
# BioMart dataset and key column, as a [field name, BioMart code] pair.
$biomart_db = 'scerevisiae_gene_ensembl'
$biomart_main = ['Entrez Gene ID', 'entrezgene']

# Loads the SGD features table (key column 0; columns 3, 4 and 5 as fields)
# and merges the Entrez mapping into it. Both the 'lexicon' and 'identifiers'
# tasks start from this table.
def sgd_features_with_entrez
  table = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)

  # Like the tasks did originally, this relies on merge_entrez updating
  # +table+ in place; its return value is not used.
  merge_entrez(table, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tS0/)})

  table
end

# Writes the organism's scientific name.
file 'name' do |t|
  File.open(t.name, 'w') do |f| f.puts "Saccharomyces cerevisiae" end
end

# SGD features + Entrez + Interpro descriptions, with the "Entrez Gene ID"
# field dropped from the final table.
file 'lexicon' do |t|
  lexicon = sgd_features_with_entrez

  merge_biomart(lexicon, $biomart_db, $biomart_main, [['Interpro Description' , "interpro_description"]])

  lexicon = lexicon.slice(lexicon.fields - ["Entrez Gene ID"])

  File.open(t.name, 'w') do |f| f.puts lexicon end
end

# SGD features + Entrez + the BioMart identifier fields listed below.
file 'identifiers' do |t|
  identifiers = sgd_features_with_entrez

  merge_biomart(identifiers, $biomart_db, $biomart_main,
                [['Associated Gene Name' , "external_gene_id"],
                 ['Ensembl Gene ID', "ensembl_gene_id" ],
                 ['Ensembl Protein ID', "ensembl_peptide_id" ],
                 ['RefSeq Protein ID' , "refseq_peptide"] ,
                 ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
                 ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
                 ['Protein ID' , "protein_id"] ,
                 ['EMBL (Genbank) ID' , "embl"] ,
                 # Affymetrix
                 ['Affy yeast 2',"affy_yeast_2"],
                 ['Affy yg s98', "affy_yg_s98"]])

  File.open(t.name, 'w') do |f| f.puts identifiers end
end


task :default => ['name', 'lexicon', 'identifiers']
52
+
53
+ #require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
54
+ #
55
+ #$name = "Saccharomyces cerevisiae"
56
+ #
57
+ #
58
+ #$native_id = "SGD DB Id"
59
+ #
60
+ #$entrez2native = {
61
+ # :tax => 559292,
62
+ # :fix => proc{|code| code.sub(/SGD:S0/,'S0') },
63
+ # :check => proc{|code| code.match(/^S0/)},
64
+ #}
65
+ #
66
+ #$lexicon = {
67
+ # :file => {
68
+ # :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
69
+ # :native => 0,
70
+ # :extra => [4,3,5]
71
+ # },
72
+ # :biomart => {
73
+ # :database => 'scerevisiae_gene_ensembl',
74
+ # :main => ['Entrez Gene ID', 'entrezgene'],
75
+ # :extra => [
76
+ # ['Interpro Description' , "interpro_description"],
77
+ # ],
78
+ # :filter => [],
79
+ # }
80
+ #
81
+ #}
82
+ #
83
+ #$identifiers = {
84
+ # :file => {
85
+ # :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
86
+ # :native => 0,
87
+ # :extra => [],
88
+ # },
89
+ # :biomart => {
90
+ # :database => 'scerevisiae_gene_ensembl',
91
+ # :main => ['Entrez Gene ID', 'entrezgene'],
92
+ # :extra => [
93
+ # ['Associated Gene Name' , "external_gene_id"],
94
+ # ['Ensembl Gene ID', "ensembl_gene_id" ],
95
+ # ['Ensembl Protein ID', "ensembl_peptide_id" ],
96
+ # ['RefSeq Protein ID' , "refseq_peptide"] ,
97
+ # ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
98
+ # ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
99
+ # ['Protein ID' , "protein_id"] ,
100
+ # ['EMBL (Genbank) ID' , "embl"] ,
101
+ # # Affymetrix
102
+ # ['Affy yeast 2',"affy_yeast_2"],
103
+ # ['Affy yg s98', "affy_yg_s98"],
104
+ # ],
105
+ # :filter => [],
106
+ # }
107
+ #}
108
+ #
109
+ #$go = {
110
+ # :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation/gene_association.sgd.gz",
111
+ # :code => 1,
112
+ # :go => 4,
113
+ # :pmid => 5,
114
+ #}
115
+ #
116
+ #$query = '"saccharomyces cerevisiae"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
117
+ #
118
+ #
@@ -0,0 +1,47 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/sources/biomart'
3
+ require 'rbbt/sources/entrez'
4
+
5
# Opens +url+ and loads it as a TSV.
#
# url     - location handed to Open.open.
# native  - key column: either [field name, column] or a bare column given as
#           a String or Integer. Anything else leaves the :native option and
#           the key field name unset.
# extra   - field columns: a list of [field name, column] pairs, or a list of
#           bare columns (named "Field1", "Field2", ...). Anything else leaves
#           the :extra option and the field names unset.
# options - TSV options; :persistence => false and :keep_empty => true are
#           supplied as defaults through Misc.add_defaults.
#
# The key field and field names computed here are only assigned when the
# loaded TSV does not already have them.
def tsv_file(url, native, extra, options = {})
  options = Misc.add_defaults options, :persistence => false, :keep_empty => true

  key_field = nil
  if Array === native
    options   = Misc.add_defaults options, :native => native.last
    key_field = native.first
  elsif String === native or Integer === native
    options = Misc.add_defaults options, :native => native
  end

  fields = nil
  if Array === extra
    if Array === extra.first
      options = Misc.add_defaults options, :extra => extra.collect { |pair| pair.last }
      fields  = extra.collect { |pair| pair.first }
    else
      options = Misc.add_defaults options, :extra => extra
      fields  = (1..extra.length).collect { |i| "Field#{i}" }
    end
  end

  table = TSV.new(Open.open(url), options)
  table.key_field ||= key_field
  table.fields    ||= fields
  table
end
35
+
36
# Merges the Entrez mapping for +taxs+ into +data+.
#
# data   - table to merge into; must respond to smart_merge.
# taxs   - taxonomy ids handed to Entrez.entrez2native.
# native - name given to the single field of the Entrez table, and the field
#          handed to smart_merge as the one to match on.
# fix    - forwarded to Entrez.entrez2native as :fix.
# select - forwarded to Entrez.entrez2native as :select.
#
# Returns whatever data.smart_merge returns.
def merge_entrez(data, taxs, native, fix = nil, select = nil)
  entrez = Entrez.entrez2native(taxs, :fix => fix, :select => select)
  entrez.fields = [native]

  data.smart_merge entrez, native
end
43
+
44
# Merges a BioMart table into +lexicon+.
#
# lexicon - table to merge into; must respond to smart_merge.
# db      - BioMart dataset name.
# native  - [field name, BioMart code] pair for the key column.
# other   - attribute entries handed to BioMart.tsv.
# match   - field to match on; defaults to the field name in +native+.
def merge_biomart(lexicon, db, native, other, match = nil)
  match_field   = match || native.first
  biomart_table = BioMart.tsv(db, native, other)
  lexicon.smart_merge biomart_table, match_field
end
@@ -9,22 +9,27 @@ class TestBioMart < Test::Unit::TestCase
9
9
  BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
10
10
  end
11
11
 
12
- data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache => true, :wget_options => { :quiet => false})
13
- assert(data['856452']['protein_id'].include? 'AAB68382')
14
-
15
- data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['external_gene_id'],[], data, :nocache => true, :wget_options => { :quiet => false} )
16
- assert(data['856452']['protein_id'].include? 'AAB68382')
17
- assert(data['856452']['external_gene_id'].include? 'CUP1-2')
12
+ data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache => false, :wget_options => { :quiet => false})
13
+ assert(data['852236']['protein_id'].include? 'CAA84864')
18
14
 
15
+ data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['external_gene_id'],[], data, :nocache => false, :wget_options => { :quiet => false} )
16
+ assert(data['852236']['protein_id'].include? 'CAA84864')
17
+ assert(data['852236']['external_gene_id'].include? 'YBL044W')
19
18
  end
20
19
 
21
20
  def test_query
22
- data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => true, :wget_options => { :quiet => false})
21
+ data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => false, :wget_options => { :quiet => false})
22
+
23
+ assert(data['852236']['external_gene_id'].include? 'YBL044W')
24
+ end
23
25
 
24
- assert(data['856452']['protein_id'].include? 'AAB68382')
25
- assert(data['856452']['external_gene_id'].include? 'CUP1-2')
26
- end
26
# Checks that BioMart.tsv names the key field and the fields after the first
# element of each [field name, BioMart code] pair.
def test_tsv
  main    = ['Entrez Gene', 'entrezgene']
  attrs   = [['Protein ID', 'protein_id'],['RefSeq Peptide','refseq_peptide']]
  options = { :nocache => false, :wget_options => { :quiet => false} }

  data = BioMart.tsv('scerevisiae_gene_ensembl', main, attrs, [], nil, options)

  assert(data['852236']['Protein ID'].include?('CAA84864'))
  assert_equal 'Entrez Gene', data.key_field
  assert_equal ['Protein ID', 'RefSeq Peptide'], data.fields
end
28
33
  end
29
34
 
30
35
 
@@ -3,12 +3,12 @@ require 'rbbt/sources/entrez'
3
3
  require 'test/unit'
4
4
 
5
5
  class TestEntrez < Test::Unit::TestCase
6
- $yeast_tax = 559292
6
+ $yeast_tax = [559292,4932]
7
7
 
8
8
  def test_entrez2native
9
9
  tax = $yeast_tax
10
10
  fix = proc{|line| line.sub(/SGD:S0/,'S0') }
11
- select = proc{|line| line.match(/\tSGD:S0/)}
11
+ select = proc{|line| line.match(/\tS0/)}
12
12
  lexicon = Entrez.entrez2native(tax, :fix => fix, :select => select)
13
13
 
14
14
  assert(lexicon['855611'].include? 'S000005056')
@@ -4,7 +4,6 @@ require 'rbbt/sources/go'
4
4
  require 'test/unit'
5
5
 
6
6
  class TestGo < Test::Unit::TestCase
7
-
8
7
  def test_go
9
8
  assert_match('vacuole inheritance',GO::id2name('GO:0000011'))
10
9
  assert_equal(['vacuole inheritance','alpha-glucoside transport'], GO::id2name(['GO:0000011','GO:0000017']))
@@ -17,8 +16,6 @@ class TestGo < Test::Unit::TestCase
17
16
  def test_namespace
18
17
  assert_equal 'biological_process', GO.id2namespace('GO:0000001')
19
18
  end
20
-
21
-
22
19
  end
23
20
 
24
21
 
@@ -0,0 +1,17 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
2
+ require 'rbbt/sources/organism'
3
+ require 'test/unit'
4
+
5
# Tests for the Organism data module. The class was named TestEntrez, the
# same name as the Entrez test case, so loading both files reopened one class
# and the results were reported under the wrong name.
class TestOrganism < Test::Unit::TestCase
  # The identifiers file can be reached both through the organism key and
  # through the Organism::Sce shortcut.
  def test_identifiers
    assert TSV.new(Organism.identifiers('Sce'))['S000006120']["Ensembl Gene ID"].include?('YPL199C')
    assert TSV.new(Organism::Sce.identifiers)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
    #assert Organism.identifiers('Hsa')['1020']["Associated Gene Name"].include?('CDK5')
  end

  # The lexicon entry for a gene contains its systematic name.
  def test_lexicon
    assert TSV.new(Organism.lexicon('Sce'))['S000006120'].flatten.include?('YPL199C')
  end
end
16
+
17
+
@@ -21,7 +21,7 @@ class TestPubMed < Test::Unit::TestCase
21
21
  assert(PubMed.get_article(pmids)[pmid].title == "Discovering semantic features in the literature: a foundation for building functional associations.")
22
22
  end
23
23
 
24
# Disabled: without the test_ prefix Test::Unit does not collect this method.
# NOTE(review): the reason for disabling it is not recorded here.
def _test_full_text
  article = PubMed.get_article('16438716')
  assert(article.full_text =~ /Discovering/)
end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-sources
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 23
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 1
8
+ - 2
9
9
  - 0
10
- version: 0.1.0
10
+ version: 0.2.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Miguel Vazquez
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-12-01 00:00:00 +01:00
18
+ date: 2010-12-10 00:00:00 +01:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -33,7 +33,7 @@ dependencies:
33
33
  type: :runtime
34
34
  version_requirements: *id001
35
35
  - !ruby/object:Gem::Dependency
36
- name: mechanize
36
+ name: rbbt-text
37
37
  prerelease: false
38
38
  requirement: &id002 !ruby/object:Gem::Requirement
39
39
  none: false
@@ -47,7 +47,7 @@ dependencies:
47
47
  type: :runtime
48
48
  version_requirements: *id002
49
49
  - !ruby/object:Gem::Dependency
50
- name: libxml-ruby
50
+ name: mechanize
51
51
  prerelease: false
52
52
  requirement: &id003 !ruby/object:Gem::Requirement
53
53
  none: false
@@ -60,6 +60,20 @@ dependencies:
60
60
  version: "0"
61
61
  type: :runtime
62
62
  version_requirements: *id003
63
+ - !ruby/object:Gem::Dependency
64
+ name: libxml-ruby
65
+ prerelease: false
66
+ requirement: &id004 !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ hash: 3
72
+ segments:
73
+ - 0
74
+ version: "0"
75
+ type: :runtime
76
+ version_requirements: *id004
63
77
  description: Data sources like PubMed, Entrez Gene, or Gene Ontology
64
78
  email: miguel.vazquez@fdi.ucm.es
65
79
  executables: []
@@ -76,9 +90,13 @@ files:
76
90
  - lib/rbbt/sources/gscholar.rb
77
91
  - lib/rbbt/sources/organism.rb
78
92
  - lib/rbbt/sources/pubmed.rb
93
+ - share/install/Organism/Hsa/Rakefile
94
+ - share/install/Organism/Sce/Rakefile
95
+ - share/install/lib/helpers.rb
79
96
  - test/rbbt/sources/test_biomart.rb
80
97
  - test/rbbt/sources/test_entrez.rb
81
98
  - test/rbbt/sources/test_go.rb
99
+ - test/rbbt/sources/test_organism.rb
82
100
  - test/rbbt/sources/test_pubmed.rb
83
101
  - test/test_helper.rb
84
102
  has_rdoc: true
@@ -119,5 +137,6 @@ test_files:
119
137
  - test/rbbt/sources/test_biomart.rb
120
138
  - test/rbbt/sources/test_entrez.rb
121
139
  - test/rbbt/sources/test_go.rb
140
+ - test/rbbt/sources/test_organism.rb
122
141
  - test/rbbt/sources/test_pubmed.rb
123
142
  - test/test_helper.rb