rbbt-sources 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
+ require 'rbbt-util'
2
+
3
+ module COSTART
4
+
5
+ Rbbt.claim "COSTART",
6
+ Proc.new do
7
+ terms = ["#COSTART Terms"]
8
+ Open.open('http://hedwig.mgh.harvard.edu/biostatistics/files/costart.html').lines.each do |line|
9
+ puts line
10
+ next unless line =~ /^'(.*)',/
11
+ terms << $1
12
+ end
13
+
14
+ terms * "\n"
15
+ end, 'COSTART'
16
+ end
@@ -0,0 +1,6 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/util/excel2tsv'
3
+
4
+ module CTCAE
5
+ Rbb.claim "CTCAE", TSV.excel2tsv('http://evs.nci.nih.gov/ftp1/CTCAE/CTCAE_4.03_2010-06-14.xls'), 'CTCAE'
6
+ end
@@ -0,0 +1,16 @@
1
+ require 'rbbt-util'
2
+
3
+ module Reactome
4
+
5
+ Rbbt.claim "Reactome",
6
+ Proc.new do
7
+ headers = ["Uniprot ID#1", "Ensembl Gene ID#2","Entrez Gene ID#1", "Uniprot ID#2", "Ensembl Gene ID#2", "Entrez Gene ID#2" , "Type", "Reaction", "PMID"]
8
+
9
+ tsv = TSV.new(Open.open("http://www.reactome.org/download/current/homo_sapiens.interactions.txt.gz"), :fix => Proc.new {|l| l.gsub(/[\w ]+:/, "")})
10
+ tsv.key_field = headers.shift
11
+ tsv.fields = headers
12
+
13
+ tsv.to_s
14
+ end, 'Reactome'
15
+ ]
16
+ end
@@ -10,6 +10,8 @@ module BioMart
10
10
 
11
11
  class BioMart::QueryError < StandardError; end
12
12
 
13
+ BIOMART_URL = 'http://biomart.org/biomart/martservice?query='
14
+
13
15
  private
14
16
 
15
17
  @@biomart_query_xml = <<-EOT
@@ -23,8 +25,14 @@ module BioMart
23
25
  </Dataset>
24
26
  </Query>
25
27
  EOT
26
-
27
28
 
29
+ def self.set_archive(date)
30
+ @archive_url = BIOMART_URL.sub(/www\.biomar\./, date + '.archive.ensemble')
31
+ end
32
+
33
+ def self.unset_archive
34
+ @archive_url = nil
35
+ end
28
36
 
29
37
  def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
30
38
  attrs ||= []
@@ -37,8 +45,13 @@ module BioMart
37
45
  query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
38
46
  query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
39
47
 
40
- response = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/,' '), open_options)
41
- if response =~ /Query ERROR:/
48
+ if @archive_url
49
+ response = Open.read(@archive_url + query.gsub(/\n/,' '), open_options)
50
+ else
51
+ response = Open.read(BIOMART_URL + query.gsub(/\n/,' '), open_options)
52
+ end
53
+
54
+ if response.empty? or response =~ /Query ERROR:/
42
55
  raise BioMart::QueryError, response
43
56
  end
44
57
 
@@ -5,29 +5,29 @@ require 'set'
5
5
 
6
6
  module Entrez
7
7
 
8
- Rbbt.add_datafiles "gene_info" => ['databases/entrez', 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'],
9
- "gene2pubmed" => ["databases/entrez", "ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz" ]
8
+ Rbbt.claim "gene_info", 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz', 'databases/entrez'
9
+ Rbbt.claim "gene2pubmed", 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz', 'databases/entrez'
10
10
 
11
11
  def self.entrez2native(taxs, options = {})
12
- options = Misc.add_defaults options, :native => 1, :extra => 5, :flatten => true, :persistence => true
12
+ options = Misc.add_defaults options, :key => 1, :others => 5, :persistence => true, :merge => true
13
13
 
14
14
  taxs = [taxs] unless Array === taxs
15
15
  options.merge! :grep => taxs
16
16
 
17
- tsv = TSV.new(Rbbt.find_datafile('gene_info'), options)
17
+ tsv = TSV.new(Rbbt.files.databases.entrez.gene_info, :flat, options)
18
18
  tsv.key_field = "Entrez Gene ID"
19
19
  tsv.fields = ["Native ID"]
20
20
  tsv
21
21
  end
22
22
 
23
23
  def self.entrez2pubmed(taxs)
24
- options = {:native => 1, :extra => 2, :flatten => true, :persistence => true}
24
+ options = {:key => 1, :others => 2, :persistence => true, :merge => true}
25
25
 
26
26
  taxs = [taxs] unless taxs.is_a?(Array)
27
27
  taxs = taxs.collect{|t| t.to_s}
28
28
  options.merge! :grep => taxs
29
29
 
30
- TSV.new(Rbbt.find_datafile('gene2pubmed'), options)
30
+ TSV.new(Rbbt.files.databases.entrez.gene2pubmed, :flat, options)
31
31
  end
32
32
 
33
33
  class Gene
@@ -4,9 +4,8 @@ require 'rbbt-util'
4
4
  # now all it does is provide a translation form id to the actual names.
5
5
  module GO
6
6
 
7
- Rbbt.add_datafiles :gene_ontology => ['databases/GO', 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo'],
8
- :goslim_generic => ['databases/GO', 'http://www.geneontology.org/GO_slims/goslim_generic.obo']
9
-
7
+ Rbbt.claim :gene_ontology, 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo', 'databases/GO'
8
+ Rbbt.claim :goslim_generic, 'http://www.geneontology.org/GO_slims/goslim_generic.obo', 'databases/GO'
10
9
 
11
10
  MULTIPLE_VALUE_FIELDS = %w(is_a)
12
11
  TSV_GENE_ONTOLOGY = File.join(TSV.cachedir, 'gene_ontology')
@@ -1,12 +1,53 @@
1
1
  require 'rbbt-util'
2
2
  require 'rbbt/util/data_module'
3
3
 
4
+
4
5
  module Organism
5
6
  class OrganismNotProcessedError < StandardError; end
6
7
 
7
8
  def self.datadir(org)
8
9
  File.join(Rbbt.datadir, 'organisms', org)
9
10
  end
11
+
12
+ def self.normalize(org, list, field = nil, others = nil, options = {}) # NOTE(review): `index` is never assigned in this method and org/field/others are never read — confirm where the index is meant to come from
13
+ return [] if list.nil? or list.empty?
14
+ options = Misc.add_defaults options, :persistence => true, :case_insensitive => true, :double => false
15
+ double = Misc.process_options options, :double
16
+
17
+ if Array === list
18
+ if double
19
+ index.values_at *list
20
+ else
21
+ index.values_at(*list).collect{|e| Misc.first e}
22
+ end
23
+ else
24
+ if double
25
+ index[list]
26
+ else
27
+ index[list].first
28
+ end
29
+ end
30
+ end
31
+
32
+ def self.guess_id(org, values)
33
+ identifiers = TSV.new(Organism.identifiers(org), :persistence => true)
34
+ field_matches = identifiers.field_matches(values)
35
+ field_matches.sort_by{|field, matches| matches.uniq.length}.last
36
+ end
37
+
38
+ def self.organisms
39
+ Dir.glob(File.join(PKGData.sharedir_for_file(__FILE__), 'install/Organism/*/Rakefile')).collect{|f| File.basename(File.dirname(f))}
40
+ end
41
+
42
+ def self.name(organism)
43
+ Open.read(Organism.scientific_name(organism)).strip
44
+ end
45
+
46
+ def self.organism(name)
47
+ organisms.select{|organism|
48
+ organism == name or Organism.name(organism) =~ /#{ name }/i
49
+ }.first
50
+ end
10
51
 
11
52
  extend DataModule
12
53
 
@@ -0,0 +1,10 @@
1
+ require 'rbbt'
2
+
3
+ module Polysearch
4
+ Rbbt.claim "organ" ,'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt', 'Polysearch'
5
+ Rbbt.claim "tissue" ,'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt', 'Polysearch'
6
+ Rbbt.claim "location" ,'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt', 'Polysearch'
7
+ Rbbt.claim "disease" ,'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt', 'Polysearch'
8
+ Rbbt.claim "drug" ,'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt', 'Polysearch'
9
+ end
10
+
@@ -52,9 +52,18 @@ $biomart_identifiers = [
52
52
  [ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
53
53
  ]
54
54
 
55
+ $biomart_positions = [
56
+ ['Chromosome Name','chromosome_name'],
57
+ ['Strand','strand'],
58
+ ['Gene Start','start_position'],
59
+ ['Gene End','end_position'],
60
+ ['Transcript Start','transcript_start'],
61
+ ['Transcript End','transcript_end'],
62
+ ]
63
+
55
64
 
56
65
 
57
- file 'name' do |t|
66
+ file 'scientific_name' do |t|
58
67
  File.open(t.name, 'w') do |f| f.puts "Homo sapiens" end
59
68
  end
60
69
 
@@ -77,7 +86,32 @@ file 'identifiers' do |t|
77
86
  File.open(t.name, 'w') do |f| f.puts identifiers end
78
87
  end
79
88
 
89
+ file 'gene_go' do |t|
90
+ url = "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD"
91
+ tsv = TSV.new(Open.open(url, :gzip => true), :native => 2, :extra => 4)
92
+
93
+ index = TSV.index(Organism::Hsa.identifiers, :persistence => true)
94
+ new = TSV.new({})
95
+ tsv.through do |key, values|
96
+ next if index[key].nil?
97
+ new_key = index[key].first
98
+ new[new_key] = values
99
+ end
100
+
101
+
102
+ new.key_field = "Associated Gene Name"
103
+ new.fields = ["GO Term"]
104
+ Open.write(t.name, new.to_s)
105
+ end
106
+
107
+ file 'gene_positions' do |t|
108
+ BioMart.set_archive('may2009')
109
+ positions = BioMart.tsv($biomart_db, $biomart_main, $biomart_positions)
110
+ BioMart.unset_archive
111
+
112
+ Open.write(t.name, positions.to_s)
113
+ end
80
114
 
81
- task :default => ['name', 'lexicon', 'identifiers']
115
+ task :default => ['scientific_name', 'lexicon', 'identifiers', 'gene_positions']
82
116
 
83
117
 
@@ -10,7 +10,7 @@ $biomart_db = 'scerevisiae_gene_ensembl'
10
10
  $biomart_main = ['Entrez Gene ID', 'entrezgene']
11
11
 
12
12
 
13
- file 'name' do |t|
13
+ file 'scientific_name' do |t|
14
14
  File.open(t.name, 'w') do |f| f.puts "Saccharomyces cerevisiae" end
15
15
  end
16
16
 
@@ -27,7 +27,7 @@ file 'lexicon' do |t|
27
27
  end
28
28
 
29
29
  file 'identifiers' do |t|
30
- identifiers = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)
30
+ identifiers = tsv_file($url, [$native, 0], [["Ensembl Gene ID", 3], ["Associated Gene Name",4], ["Associated Gene Name Alias", 5]], :keep_empty => true)
31
31
 
32
32
  merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tS0/)})
33
33
 
@@ -4,7 +4,7 @@ require 'test/unit'
4
4
 
5
5
  class TestBioMart < Test::Unit::TestCase
6
6
 
7
- def test_get
7
+ def _test_get
8
8
  assert_raise BioMart::QueryError do
9
9
  BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
10
10
  end
@@ -23,7 +23,7 @@ class TestBioMart < Test::Unit::TestCase
23
23
  assert(data['852236']['external_gene_id'].include? 'YBL044W')
24
24
  end
25
25
 
26
- def test_tsv
26
+ def _test_tsv
27
27
  data = BioMart.tsv('scerevisiae_gene_ensembl',['Entrez Gene', 'entrezgene'], [['Protein ID', 'protein_id'],['RefSeq Peptide','refseq_peptide']], [], nil, :nocache => false, :wget_options => { :quiet => false})
28
28
 
29
29
  assert(data['852236']['Protein ID'].include? 'CAA84864')
@@ -18,7 +18,7 @@ class TestEntrez < Test::Unit::TestCase
18
18
  tax = $yeast_tax
19
19
 
20
20
  data = Entrez.entrez2pubmed(tax)
21
- assert(data['850320'].include? '15102838')
21
+ assert(data['850320'].include? '1574125')
22
22
  end
23
23
 
24
24
  def test_getonline
@@ -6,12 +6,24 @@ class TestEntrez < Test::Unit::TestCase
6
6
  def test_identifiers
7
7
  assert TSV.new(Organism.identifiers('Sce'))['S000006120']["Ensembl Gene ID"].include?('YPL199C')
8
8
  assert TSV.new(Organism::Sce.identifiers)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
9
- #assert Organism.identifiers('Hsa')['1020']["Associated Gene Name"].include?('CDK5')
9
+ assert TSV.new(Organism.identifiers('Hsa'))['1020']["Associated Gene Name"].include?('CDK5')
10
10
  end
11
11
 
12
12
  def test_lexicon
13
13
  assert TSV.new(Organism.lexicon('Sce'))['S000006120'].flatten.include?('YPL199C')
14
14
  end
15
+
16
+ def test_guess_id
17
+ ensembl = %w(YOL044W YDR289C YAL034C YGR246C ARS519 tH(GUG)E2 YDR218C YLR002C YGL224C)
18
+ gene_name = %w(SNR64 MIP1 MRPS18 TFB2 JEN1 IVY1 TRS33 GAS3)
19
+ assert_equal "Ensembl Gene ID", Organism::Sce.guess_id(ensembl).first
20
+ assert_equal "Associated Gene Name", Organism::Sce.guess_id(gene_name).first
21
+ end
22
+
23
+ def test_organisms
24
+ assert Organism.organisms.include? "Hsa"
25
+ assert_equal "Hsa", Organism.organism("Homo sapiens")
26
+ end
15
27
  end
16
28
 
17
29
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-sources
3
3
  version: !ruby/object:Gem::Version
4
- hash: 21
5
- prerelease: false
4
+ hash: 19
5
+ prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 1
10
- version: 0.2.1
9
+ - 2
10
+ version: 0.2.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Miguel Vazquez
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-12-14 00:00:00 +01:00
18
+ date: 2011-01-30 00:00:00 +01:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -83,12 +83,16 @@ extensions: []
83
83
  extra_rdoc_files: []
84
84
 
85
85
  files:
86
+ - lib/rbbt/sources/COSTART.rb
87
+ - lib/rbbt/sources/CTCAE.rb
88
+ - lib/rbbt/sources/Reactome.rb
86
89
  - lib/rbbt/sources/bibtex.rb
87
90
  - lib/rbbt/sources/biomart.rb
88
91
  - lib/rbbt/sources/entrez.rb
89
92
  - lib/rbbt/sources/go.rb
90
93
  - lib/rbbt/sources/gscholar.rb
91
94
  - lib/rbbt/sources/organism.rb
95
+ - lib/rbbt/sources/polysearch.rb
92
96
  - lib/rbbt/sources/pubmed.rb
93
97
  - share/install/Organism/Hsa/Rakefile
94
98
  - share/install/Organism/Sce/Rakefile
@@ -129,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
129
133
  requirements: []
130
134
 
131
135
  rubyforge_project:
132
- rubygems_version: 1.3.7
136
+ rubygems_version: 1.4.2
133
137
  signing_key:
134
138
  specification_version: 3
135
139
  summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)