rbbt-sources 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,16 @@
1
+ require 'rbbt-util'
2
+
3
+ module COSTART
4
+
5
+ Rbbt.claim "COSTART",
6
+ Proc.new do
7
+ terms = ["#COSTART Terms"]
8
+ Open.open('http://hedwig.mgh.harvard.edu/biostatistics/files/costart.html').lines.each do |line|
9
+ puts line
10
+ next unless line =~ /^'(.*)',/
11
+ terms << $1
12
+ end
13
+
14
+ terms * "\n"
15
+ end, 'COSTART'
16
+ end
@@ -0,0 +1,6 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/util/excel2tsv'
3
+
4
+ module CTCAE
5
+ Rbb.claim "CTCAE", TSV.excel2tsv('http://evs.nci.nih.gov/ftp1/CTCAE/CTCAE_4.03_2010-06-14.xls'), 'CTCAE'
6
+ end
@@ -0,0 +1,16 @@
1
+ require 'rbbt-util'
2
+
3
+ module Reactome
4
+
5
+ Rbbt.claim "Reactome",
6
+ Proc.new do
7
+ headers = ["Uniprot ID#1", "Ensembl Gene ID#2","Entrez Gene ID#1", "Uniprot ID#2", "Ensembl Gene ID#2", "Entrez Gene ID#2" , "Type", "Reaction", "PMID"]
8
+
9
+ tsv = TSV.new(Open.open("http://www.reactome.org/download/current/homo_sapiens.interactions.txt.gz"), :fix => Proc.new {|l| l.gsub(/[\w ]+:/, "")})
10
+ tsv.key_field = headers.shift
11
+ tsv.fields = headers
12
+
13
+ tsv.to_s
14
+ end, 'Reactome'
15
+ ]
16
+ end
@@ -10,6 +10,8 @@ module BioMart
10
10
 
11
11
  class BioMart::QueryError < StandardError; end
12
12
 
13
+ BIOMART_URL = 'http://biomart.org/biomart/martservice?query='
14
+
13
15
  private
14
16
 
15
17
  @@biomart_query_xml = <<-EOT
@@ -23,8 +25,14 @@ module BioMart
23
25
  </Dataset>
24
26
  </Query>
25
27
  EOT
26
-
27
28
 
29
+ def self.set_archive(date)
30
+ @archive_url = BIOMART_URL.sub(/www\.biomar\./, date + '.archive.ensemble')
31
+ end
32
+
33
+ def self.unset_archive
34
+ @archive_url = nil
35
+ end
28
36
 
29
37
  def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
30
38
  attrs ||= []
@@ -37,8 +45,13 @@ module BioMart
37
45
  query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
38
46
  query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
39
47
 
40
- response = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/,' '), open_options)
41
- if response =~ /Query ERROR:/
48
+ if @archive_url
49
+ response = Open.read(@archive_url + query.gsub(/\n/,' '), open_options)
50
+ else
51
+ response = Open.read(BIOMART_URL + query.gsub(/\n/,' '), open_options)
52
+ end
53
+
54
+ if response.empty? or response =~ /Query ERROR:/
42
55
  raise BioMart::QueryError, response
43
56
  end
44
57
 
@@ -5,29 +5,29 @@ require 'set'
5
5
 
6
6
  module Entrez
7
7
 
8
- Rbbt.add_datafiles "gene_info" => ['databases/entrez', 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'],
9
- "gene2pubmed" => ["databases/entrez", "ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz" ]
8
+ Rbbt.claim "gene_info", 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz', 'databases/entrez'
9
+ Rbbt.claim "gene2pubmed", 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz', 'databases/entrez'
10
10
 
11
11
  def self.entrez2native(taxs, options = {})
12
- options = Misc.add_defaults options, :native => 1, :extra => 5, :flatten => true, :persistence => true
12
+ options = Misc.add_defaults options, :key => 1, :others => 5, :persistence => true, :merge => true
13
13
 
14
14
  taxs = [taxs] unless Array === taxs
15
15
  options.merge! :grep => taxs
16
16
 
17
- tsv = TSV.new(Rbbt.find_datafile('gene_info'), options)
17
+ tsv = TSV.new(Rbbt.files.databases.entrez.gene_info, :flat, options)
18
18
  tsv.key_field = "Entrez Gene ID"
19
19
  tsv.fields = ["Native ID"]
20
20
  tsv
21
21
  end
22
22
 
23
23
  def self.entrez2pubmed(taxs)
24
- options = {:native => 1, :extra => 2, :flatten => true, :persistence => true}
24
+ options = {:key => 1, :others => 2, :persistence => true, :merge => true}
25
25
 
26
26
  taxs = [taxs] unless taxs.is_a?(Array)
27
27
  taxs = taxs.collect{|t| t.to_s}
28
28
  options.merge! :grep => taxs
29
29
 
30
- TSV.new(Rbbt.find_datafile('gene2pubmed'), options)
30
+ TSV.new(Rbbt.files.databases.entrez.gene2pubmed, :flat, options)
31
31
  end
32
32
 
33
33
  class Gene
@@ -4,9 +4,8 @@ require 'rbbt-util'
4
4
  # now all it does is provide a translation form id to the actual names.
5
5
  module GO
6
6
 
7
- Rbbt.add_datafiles :gene_ontology => ['databases/GO', 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo'],
8
- :goslim_generic => ['databases/GO', 'http://www.geneontology.org/GO_slims/goslim_generic.obo']
9
-
7
+ Rbbt.claim :gene_ontology, 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo', 'databases/GO'
8
+ Rbbt.claim :goslim_generic, 'http://www.geneontology.org/GO_slims/goslim_generic.obo', 'databases/GO'
10
9
 
11
10
  MULTIPLE_VALUE_FIELDS = %w(is_a)
12
11
  TSV_GENE_ONTOLOGY = File.join(TSV.cachedir, 'gene_ontology')
@@ -1,12 +1,53 @@
1
1
  require 'rbbt-util'
2
2
  require 'rbbt/util/data_module'
3
3
 
4
+
4
5
  module Organism
5
6
  class OrganismNotProcessedError < StandardError; end
6
7
 
7
8
  def self.datadir(org)
8
9
  File.join(Rbbt.datadir, 'organisms', org)
9
10
  end
11
+
12
+ def self.normalize(org, list, field = nil, others = nil, options = {}) # Looks the entries of +list+ up in an identifier index
13
+ return [] if list.nil? or list.empty? # nothing to translate
14
+ options = Misc.add_defaults options, :persistence => true, :case_insensitive => true, :double => false
15
+ double = Misc.process_options options, :double # :double selects the whole index entry instead of its first element
16
+ # NOTE(review): `index` is never assigned in this method, and org, field, others and the remaining options are unused -- presumably an index built from Organism.identifiers(org) was intended; confirm.
17
+ if Array === list
18
+ if double
19
+ index.values_at *list
20
+ else
21
+ index.values_at(*list).collect{|e| Misc.first e}
22
+ end
23
+ else
24
+ if double
25
+ index[list]
26
+ else
27
+ index[list].first # NOTE(review): raises NoMethodError if the lookup returns nil -- confirm whether misses are possible
28
+ end
29
+ end
30
+ end
31
+
32
+ def self.guess_id(org, values)
33
+ identifiers = TSV.new(Organism.identifiers(org), :persistence => true)
34
+ field_matches = identifiers.field_matches(values)
35
+ field_matches.sort_by{|field, matches| matches.uniq.length}.last
36
+ end
37
+
38
+ def self.organisms
39
+ Dir.glob(File.join(PKGData.sharedir_for_file(__FILE__), 'install/Organism/*/Rakefile')).collect{|f| File.basename(File.dirname(f))}
40
+ end
41
+
42
+ def self.name(organism)
43
+ Open.read(Organism.scientific_name(organism)).strip
44
+ end
45
+
46
+ def self.organism(name)
47
+ organisms.select{|organism|
48
+ organism == name or Organism.name(organism) =~ /#{ name }/i
49
+ }.first
50
+ end
10
51
 
11
52
  extend DataModule
12
53
 
@@ -0,0 +1,10 @@
1
+ require 'rbbt'
2
+
3
+ module Polysearch
4
+ Rbbt.claim "organ" ,'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt', 'Polysearch'
5
+ Rbbt.claim "tissue" ,'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt', 'Polysearch'
6
+ Rbbt.claim "location" ,'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt', 'Polysearch'
7
+ Rbbt.claim "disease" ,'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt', 'Polysearch'
8
+ Rbbt.claim "drug" ,'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt', 'Polysearch'
9
+ end
10
+
@@ -52,9 +52,18 @@ $biomart_identifiers = [
52
52
  [ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
53
53
  ]
54
54
 
55
+ $biomart_positions = [
56
+ ['Chromosome Name','chromosome_name'],
57
+ ['Strand','strand'],
58
+ ['Gene Start','start_position'],
59
+ ['Gene End','end_position'],
60
+ ['Transcript Start','transcript_start'],
61
+ ['Transcript End','transcript_end'],
62
+ ]
63
+
55
64
 
56
65
 
57
- file 'name' do |t|
66
+ file 'scientific_name' do |t|
58
67
  File.open(t.name, 'w') do |f| f.puts "Homo sapiens" end
59
68
  end
60
69
 
@@ -77,7 +86,32 @@ file 'identifiers' do |t|
77
86
  File.open(t.name, 'w') do |f| f.puts identifiers end
78
87
  end
79
88
 
89
+ file 'gene_go' do |t|
90
+ url = "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD"
91
+ tsv = TSV.new(Open.open(url, :gzip => true), :native => 2, :extra => 4)
92
+
93
+ index = TSV.index(Organism::Hsa.identifiers, :persistence => true)
94
+ new = TSV.new({})
95
+ tsv.through do |key, values|
96
+ next if index[key].nil?
97
+ new_key = index[key].first
98
+ new[new_key] = values
99
+ end
100
+
101
+
102
+ new.key_field = "Associated Gene Name"
103
+ new.fields = ["GO Term"]
104
+ Open.write(t.name, new.to_s)
105
+ end
106
+
107
+ file 'gene_positions' do |t|
108
+ BioMart.set_archive('may2009')
109
+ positions = BioMart.tsv($biomart_db, $biomart_main, $biomart_positions)
110
+ BioMart.unset_archive
111
+
112
+ Open.write(t.name, positions.to_s)
113
+ end
80
114
 
81
- task :default => ['name', 'lexicon', 'identifiers']
115
+ task :default => ['scientific_name', 'lexicon', 'identifiers', 'gene_positions'] # 'scientific_name' is the file task formerly called 'name'
82
116
 
83
117
 
@@ -10,7 +10,7 @@ $biomart_db = 'scerevisiae_gene_ensembl'
10
10
  $biomart_main = ['Entrez Gene ID', 'entrezgene']
11
11
 
12
12
 
13
- file 'name' do |t|
13
+ file 'scientific_name' do |t|
14
14
  File.open(t.name, 'w') do |f| f.puts "Saccharomyces cerevisiae" end
15
15
  end
16
16
 
@@ -27,7 +27,7 @@ file 'lexicon' do |t|
27
27
  end
28
28
 
29
29
  file 'identifiers' do |t|
30
- identifiers = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)
30
+ identifiers = tsv_file($url, [$native, 0], [["Ensembl Gene ID", 3], ["Associated Gene Name",4], ["Associated Gene Name Alias", 5]], :keep_empty => true)
31
31
 
32
32
  merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tS0/)})
33
33
 
@@ -4,7 +4,7 @@ require 'test/unit'
4
4
 
5
5
  class TestBioMart < Test::Unit::TestCase
6
6
 
7
- def test_get
7
+ def _test_get
8
8
  assert_raise BioMart::QueryError do
9
9
  BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
10
10
  end
@@ -23,7 +23,7 @@ class TestBioMart < Test::Unit::TestCase
23
23
  assert(data['852236']['external_gene_id'].include? 'YBL044W')
24
24
  end
25
25
 
26
- def test_tsv
26
+ def _test_tsv
27
27
  data = BioMart.tsv('scerevisiae_gene_ensembl',['Entrez Gene', 'entrezgene'], [['Protein ID', 'protein_id'],['RefSeq Peptide','refseq_peptide']], [], nil, :nocache => false, :wget_options => { :quiet => false})
28
28
 
29
29
  assert(data['852236']['Protein ID'].include? 'CAA84864')
@@ -18,7 +18,7 @@ class TestEntrez < Test::Unit::TestCase
18
18
  tax = $yeast_tax
19
19
 
20
20
  data = Entrez.entrez2pubmed(tax)
21
- assert(data['850320'].include? '15102838')
21
+ assert(data['850320'].include? '1574125')
22
22
  end
23
23
 
24
24
  def test_getonline
@@ -6,12 +6,24 @@ class TestEntrez < Test::Unit::TestCase
6
6
  def test_identifiers
7
7
  assert TSV.new(Organism.identifiers('Sce'))['S000006120']["Ensembl Gene ID"].include?('YPL199C')
8
8
  assert TSV.new(Organism::Sce.identifiers)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
9
- #assert Organism.identifiers('Hsa')['1020']["Associated Gene Name"].include?('CDK5')
9
+ assert TSV.new(Organism.identifiers('Hsa'))['1020']["Associated Gene Name"].include?('CDK5')
10
10
  end
11
11
 
12
12
  def test_lexicon
13
13
  assert TSV.new(Organism.lexicon('Sce'))['S000006120'].flatten.include?('YPL199C')
14
14
  end
15
+
16
+ def test_guess_id
17
+ ensembl = %w(YOL044W YDR289C YAL034C YGR246C ARS519 tH(GUG)E2 YDR218C YLR002C YGL224C)
18
+ gene_name = %w(SNR64 MIP1 MRPS18 TFB2 JEN1 IVY1 TRS33 GAS3)
19
+ assert_equal "Ensembl Gene ID", Organism::Sce.guess_id(ensembl).first
20
+ assert_equal "Associated Gene Name", Organism::Sce.guess_id(gene_name).first
21
+ end
22
+
23
+ def test_organisms
24
+ assert Organism.organisms.include? "Hsa"
25
+ assert_equal "Hsa", Organism.organism("Homo sapiens")
26
+ end
15
27
  end
16
28
 
17
29
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-sources
3
3
  version: !ruby/object:Gem::Version
4
- hash: 21
5
- prerelease: false
4
+ hash: 19
5
+ prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 1
10
- version: 0.2.1
9
+ - 2
10
+ version: 0.2.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Miguel Vazquez
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-12-14 00:00:00 +01:00
18
+ date: 2011-01-30 00:00:00 +01:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -83,12 +83,16 @@ extensions: []
83
83
  extra_rdoc_files: []
84
84
 
85
85
  files:
86
+ - lib/rbbt/sources/COSTART.rb
87
+ - lib/rbbt/sources/CTCAE.rb
88
+ - lib/rbbt/sources/Reactome.rb
86
89
  - lib/rbbt/sources/bibtex.rb
87
90
  - lib/rbbt/sources/biomart.rb
88
91
  - lib/rbbt/sources/entrez.rb
89
92
  - lib/rbbt/sources/go.rb
90
93
  - lib/rbbt/sources/gscholar.rb
91
94
  - lib/rbbt/sources/organism.rb
95
+ - lib/rbbt/sources/polysearch.rb
92
96
  - lib/rbbt/sources/pubmed.rb
93
97
  - share/install/Organism/Hsa/Rakefile
94
98
  - share/install/Organism/Sce/Rakefile
@@ -129,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
129
133
  requirements: []
130
134
 
131
135
  rubyforge_project:
132
- rubygems_version: 1.3.7
136
+ rubygems_version: 1.4.2
133
137
  signing_key:
134
138
  specification_version: 3
135
139
  summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)