rbbt-sources 1.2.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,79 @@
1
+ require 'rbbt/util/open'
2
+ require 'rbbt/sources/organism'
3
+ require 'rbbt/tsv'
4
+ require 'net/ftp'
5
+
6
# Helpers for locating and loading tables from the Ensembl MySQL dump files.
module Ensembl

  # Release table found under Rbbt.share.Ensembl.release_dates, keyed by its
  # "build" column. Loaded on first use and memoized.
  def self.releases
    @releases ||= Rbbt.share.Ensembl.release_dates.find.tsv :key_field => "build"
  end

  # Access to the dump files published on the Ensembl FTP server.
  module FTP

    SERVER = "ftp.ensembl.org"

    # Resolves an organism code such as "Hsa/may2012" into the pair
    # [release, core database directory name].
    #
    # The build part of the code is looked up in Ensembl.releases; the directory
    # name is the last entry listed on the FTP server for
    # "<scientific name>_core_*" under pub/<release>/mysql.
    #
    # When the code has no build part (or the build is "current") nothing is
    # resolved and [nil, nil] is returned; callers that build paths from that
    # pair will fail.
    def self.ftp_name_for(organism)
      _code, build = organism.split "/"
      build ||= "current"
      return [nil, nil] if build.to_s == "current"

      release = Ensembl.releases[build]
      name = Organism.scientific_name(organism)
      pattern = name.downcase.gsub(" ", '_') + "_core_*"

      ftp = Net::FTP.new(Ensembl::FTP::SERVER)
      begin
        ftp.login
        ftp.chdir(File.join('pub', release, 'mysql'))
        file = ftp.list(pattern).collect { |l| l.split(" ").last }.last
      ensure
        # Release the connection even when login/chdir/list raises.
        ftp.close
      end

      [release, file]
    end

    # Absolute directory on the FTP server holding the dump of +organism+.
    # Performs one FTP lookup through ftp_name_for.
    def self.ftp_directory_for(organism)
      release, ftp_name = ftp_name_for(organism)
      File.join('/pub/', release, 'mysql', ftp_name)
    end

    # ftp:// URL of the dump directory of +organism+.
    def self.base_url(organism)
      File.join("ftp://" + SERVER, ftp_directory_for(organism))
    end

    # URL of the compressed data file of +table+ for +organism+.
    def self.url_for(organism, table)
      "#{base_url(organism)}/#{table}.txt.gz"
    end

    # Reads the schema file of +organism+ and returns the MatchData for the
    # CREATE TABLE statement of +table+ (capture 1 is the column section), or
    # nil when the schema has no such table.
    #
    # base_url is resolved once per call because every resolution opens an FTP
    # session. The table name is escaped so it is matched literally, and only
    # the /m flag is used: in Ruby /s selects an encoding rather than making
    # "." match newlines.
    def self.table_definition(organism, table)
      base = base_url(organism)
      sql_file = Open.read("#{base}/#{File.basename(base)}.sql.gz")
      sql_file.match(/^CREATE TABLE .#{Regexp.escape(table.to_s)}. \((.*?)^\)/m)
    end
    private_class_method :table_definition

    # True when the schema of +organism+ defines +table+.
    def self.has_table?(organism, table)
      !table_definition(organism, table).nil?
    end

    # Column names of +table+, in the order they appear in the schema.
    # Raises RuntimeError when the schema does not define the table.
    def self.fields_for(organism, table)
      definition = table_definition(organism, table)
      raise "Table '#{table}' not found in the schema of #{organism}" if definition.nil?

      definition[1].scan(/^\s+`(.*?)`/).flatten
    end

    # Opens the data file of +table+ for +organism+ with TSV.open.
    #
    # When both +key_field+ and +fields+ are given they are translated into
    # column positions using the table schema and passed to TSV.open as
    # :key_field and :fields. The names are then assigned to the resulting
    # object (nil when they were not given). The +options+ hash supplied by the
    # caller is left untouched.
    def self.ensembl_tsv(organism, table, key_field = nil, fields = nil, options = {})
      url = url_for(organism, table)

      if key_field and fields
        all_fields = fields_for(organism, table)
        options = options.merge(
          :key_field => all_fields.index(key_field),
          :fields => fields.collect { |f| all_fields.index f }
        )
      end

      tsv = TSV.open(url, options)
      tsv.key_field = key_field
      tsv.fields = fields
      tsv
    end
  end
end
76
+
77
+ if __FILE__ == $0
78
+ ddd Ensembl::FTP.ensembl_tsv("Hsa/may2012", 'exon')
79
+ end
@@ -10,7 +10,7 @@ module Entrez
10
10
  Rbbt.claim Rbbt.share.databases.entrez.gene2pubmed, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
11
11
 
12
12
  def self.entrez2native(taxs, options = {})
13
- options = Misc.add_defaults options, :key_field => 1, :fields => 5, :persist => true, :merge => true
13
+ options = Misc.add_defaults options, :key_field => 1, :fields => [5], :persist => true, :merge => true
14
14
 
15
15
  taxs = [taxs] unless Array === taxs
16
16
  options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
@@ -22,7 +22,7 @@ module Entrez
22
22
  end
23
23
 
24
24
  def self.entrez2name(taxs, options = {})
25
- options = Misc.add_defaults options, :key_field => 1, :fields => 2, :persist => true, :merge => true
25
+ options = Misc.add_defaults options, :key_field => 1, :fields => [2], :persist => true, :merge => true
26
26
 
27
27
  taxs = [taxs] unless Array === taxs
28
28
  options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
@@ -0,0 +1,45 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+ require 'rbbt/resource'
4
+
5
# Resource that builds TSV files from the 1000 Genomes site list at RELEASE_URL.
module Genomes1000
  extend Resource
  self.subdir = "share/databases/genomes_1000"

  RELEASE_URL = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.wgs.phase1_release_v3.20101123.snps_indels_sv.sites.vcf.gz"

  # "Variant ID" => "chromosome:position:alternative", one entry per
  # non-comment line of the file at RELEASE_URL.
  Genomes1000.claim Genomes1000.mutations, :proc do
    variants = TSV.setup({}, :key_field => "Variant ID", :fields => ["Genomic Mutation"], :type => :single)

    Open.read(RELEASE_URL) do |line|
      # Lines starting with '#' are skipped.
      next if line[0] == "#"[0]

      # Only the chromosome, position, id and alternative columns are used.
      chromosome, position, id, _reference, alternative = line.split("\t")
      variants[id] = [chromosome, position, alternative].join(":")
    end

    variants.namespace = "Hsa"
    variants.to_s
  end

  # Same table as +mutations+ with every "Genomic Mutation" passed through
  # Organism.liftOver from "Hsa/jun2011" to "Hsa/may2009".
  Genomes1000.claim Genomes1000.mutations_hg18, :proc do
    require 'rbbt/sources/organism'

    hg19_tsv = Genomes1000.mutations.tsv :unnamed => true
    hg19_mutations = hg19_tsv.values

    lifted = Misc.process_to_hash(hg19_mutations) { |list| Organism.liftOver(list, "Hsa/jun2011", "Hsa/may2009") }

    hg18_tsv = hg19_tsv.process("Genomic Mutation") { |mutation| lifted[mutation] }
    hg18_tsv.namespace = "Hsa/may2009"
    hg18_tsv.to_s
  end

end
@@ -97,9 +97,12 @@ if defined? Entity
97
97
  @name ||= GO.id2name(self)
98
98
  end
99
99
 
100
- property :genes => :array2single do |organism|
100
+ property :genes => :array2single do |*args|
101
+ organism = args.first
101
102
  organism ||= self.organism
102
- @genes ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :merge => true).values_at *self
103
+ res = Organism.gene_go(organism).tsv(:persist => true, :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :merge => true).values_at *self
104
+ res.collect{|r| r.organism = organism if r and r.respond_to? :organism}
105
+ res
103
106
  end
104
107
 
105
108
  property :description => :single2array do
@@ -114,12 +117,21 @@ if defined? Entity
114
117
  if defined? Gene and Entity === Gene
115
118
  module Gene
116
119
  property :go_terms => :array2single do
117
- @go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
120
+ @go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).values_at *self.ensembl
118
121
  end
119
122
 
120
123
  property :go_bp_terms => :array2single do
121
- @go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
124
+ @go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).values_at *self.ensembl
122
125
  end
126
+
127
+ property :go_cc_terms => :array2single do
128
+ @go_cc_terms ||= Organism.gene_go_cc(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).values_at *self.ensembl
129
+ end
130
+
131
+ property :go_mf_terms => :array2single do
132
+ @go_mf_terms ||= Organism.gene_go_mf(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).values_at *self.ensembl
133
+ end
134
+
123
135
  end
124
136
  end
125
137
  end
@@ -1,18 +1,80 @@
1
1
  require 'rbbt'
2
2
  require 'rbbt/resource'
3
- require 'rbbt/resource/with_key'
4
3
 
5
4
  module Organism
6
5
  extend Resource
7
6
  self.pkgdir = "rbbt"
8
7
  self.subdir = "share/organisms"
9
8
 
10
- ["Hsa", "Mmu", "Rno", "Sce"].each do |organism|
9
# Base names of the entries under Rbbt.share.install.Organism whose name is
# exactly three characters long (glob pattern '???').
def self.installable_organisms
  install_dirs = Rbbt.share.install.Organism.find.glob('???')
  install_dirs.collect { |dir| File.basename(dir) }
end
12
+
13
+
14
+ Organism.installable_organisms.each do |organism|
11
15
  claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
12
16
 
13
17
  module_eval "#{ organism } = with_key '#{organism}'"
14
18
  end
15
19
 
20
+ Rbbt.claim Rbbt.software.opt.bin.liftOver, :url, "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
21
+
22
# Maps a human organism code to the matching UCSC genome build name.
#
# organism - "Hsa" or "Hsa/<build date>" (for example "Hsa/may2009").
#
# Returns 'hg19' when the code has no build date or when the number at the end
# of the Ensembl release name is greater than 54, and 'hg18' otherwise.
# Raises RuntimeError when the organism is not "Hsa" or when the build date has
# no entry in Ensembl.releases.
def self.hg_build(organism)
  # Loaded lazily; expected to make Ensembl.releases available.
  require 'rbbt/sources/ensembl_ftp'

  raise "Only organism 'Hsa' (Homo sapiens) supported" unless organism =~ /^Hsa/

  return 'hg19' unless organism =~ /\//
  date = organism.split("/")[1]

  release = Ensembl.releases[date]
  # Fail with a clear message rather than a NoMethodError on nil below.
  raise "Unknown Ensembl build '#{date}' for organism #{organism}" if release.nil?

  # Everything up to the last '-' is dropped, leaving the release number.
  release.sub(/.*-/, '').to_i > 54 ? 'hg19' : 'hg18'
end
34
+
35
# Translates genomic positions between the hg18 and hg19 builds by running the
# UCSC liftOver binary claimed at Rbbt.software.opt.bin.liftOver.
#
# positions - list of strings "chromosome:position[:extra...]".
# source    - organism code of the build the positions refer to.
# target    - organism code of the build to translate to.
#
# Returns a list parallel to +positions+ with the translated strings (extra
# fields are preserved); entries missing from the liftOver output are nil.
# When source and target resolve to the same build (per hg_build) +positions+
# is returned as is and nothing is downloaded or executed.
def self.liftOver(positions, source, target)
  source_hg = hg_build(source)
  target_hg = hg_build(target)

  map_url = case [source_hg, target_hg]
            when ['hg19', 'hg18']
              "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg18.over.chain.gz"
            when ['hg18', 'hg19']
              "http://hgdownload.cse.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz"
            end
  return positions if map_url.nil?

  # One BED line per position: chr<chromosome>, position - 1, position, and the
  # original string in the name column so results can be matched back.
  positions_bed = positions.collect { |position|
    chr, pos = position.split(":").values_at(0, 1)
    ["chr" + chr, pos.to_i - 1, pos, position] * "\t"
  } * "\n" + "\n"

  new_positions = {}

  TmpFile.with_file(positions_bed) do |source_bed|
    TmpFile.with_file() do |unmapped_file|
      TmpFile.with_file() do |map_file|
        Open.write(map_file, Open.read(map_url))

        TmpFile.with_file() do |target_bed|
          # Octal 0755 (rwxr-xr-x); the decimal literal 755 sets unrelated bits.
          FileUtils.chmod(0755, Rbbt.software.opt.bin.liftOver.produce.find)
          CMD.cmd("#{Rbbt.software.opt.bin.liftOver.find} '#{source_bed}' '#{map_file}' '#{target_bed}' '#{unmapped_file}'").read

          Open.read(target_bed) do |line|
            chr, _position_alt, position, name = line.chomp.split("\t")
            chr = chr.sub(/^chr/, '')

            # Keep whatever followed chromosome and position in the input.
            _old_chr, _old_position, *rest = name.split(":")
            new_positions[name] = ([chr, position].concat rest) * ":"
          end
        end
      end
    end
  end

  positions.collect do |position|
    new_positions[position]
  end
end
77
+
16
78
  class OrganismNotProcessedError < StandardError; end
17
79
 
18
80
  def self.attach_translations(org, tsv, target = nil, fields = nil, options = {})
@@ -53,30 +115,36 @@ module Organism
53
115
  end
54
116
  end
55
117
 
56
- def self.guess_id(org, values, identifiers = nil)
57
- identifiers ||= TSV.setup(Organism.identifiers(org), :persist => true)
58
- field_matches = identifiers.field_matches(values)
59
- field_matches.sort_by{|field, matches| matches.uniq.length}.last
60
- end
61
-
62
118
# Returns the [field, count] pair that comes last when the result of
# TSV.field_match_counts (identifiers file of +org+ against +values+) is
# ordered by count.
def self.guess_id(org, values)
  identifiers_file = Organism.identifiers(org).find
  match_counts = TSV.field_match_counts(identifiers_file, values)
  ranked = match_counts.sort_by { |_field, count| count.to_i }
  ranked.last
end
66
122
 
67
-
68
123
# Base names of every entry found directly under Organism.root.
def self.organisms
  pattern = File.join(Organism.root.find, '*')
  Dir.glob(pattern).collect { |path| File.basename(path) }
end
71
126
 
72
- def self.name(organism)
73
- Organism.scientific_name(organism).read.strip
127
# Contents of the "scientific_name" file of +organism+ (produced on demand),
# with surrounding whitespace stripped.
def self.scientific_name(organism)
  name_file = Organism[organism]["scientific_name"]
  name_file.produce.read.strip
end
75
130
 
76
131
  def self.organism(name)
77
132
  organisms.select{|organism|
78
- organism == name or Organism.name(organism) =~ /#{ name }/i
133
+ organism == name or Organism.scientific_name(organism) =~ /#{ name }/i
79
134
  }.first
80
135
  end
81
136
 
137
# All field names reported by a TSV::Parser over the identifiers file of the
# organism +name+.
def self.known_ids(name)
  stream = Organism.identifiers(name).open
  parser = TSV::Parser.new(stream)
  parser.all_fields
end
140
+
141
# Finds the first installable organism whose entrez_taxids file has a line
# equal to +taxid+ (compared as a string).
#
# Raises RuntimeError, listing the organisms that were checked, when none match.
def self.entrez_taxid_organism(taxid)
  candidates = Organism.installable_organisms
  wanted = taxid.to_s

  match = candidates.find do |organism|
    Organism.entrez_taxids(organism).read.split("\n").include? wanted
  end
  return match if match

  raise "No organism identified for taxid #{taxid}. Supported organism are: #{candidates * ", "}"
end
82
150
  end
@@ -1,6 +1,8 @@
1
1
  require 'rbbt'
2
2
  require 'rbbt/tsv'
3
3
  require 'rbbt/resource'
4
+ require 'rbbt/entity'
5
+ require 'rbbt/sources/InterPro'
4
6
 
5
7
  module Pfam
6
8
  extend Resource
@@ -12,24 +14,82 @@ module Pfam
12
14
  tsv.to_s
13
15
  end
14
16
 
15
- NAMES_FILE = Rbbt.share.databases.InterPro.pfam_names.find
17
+ NAMES_FILE = InterPro.pfam_names.find
16
18
 
17
19
  def self.name_index
18
- @name_index ||= TSV.open NAMES_FILE, :single
20
+ @name_index ||= TSV.open NAMES_FILE, :single, :unnamed => true
19
21
  end
20
22
 
21
23
  def self.name(id)
22
- name_index[id]
24
+ name_index[id] || id
23
25
  end
24
26
  end
25
27
 
28
module InterPro
  # Memoized table built from InterPro.pfam_equivalences, keyed by
  # "InterPro ID" with the "Pfam Domain" field.
  def self.pfam_index
    @@pfam_index ||= InterPro.pfam_equivalences.tsv(
      :persist => true,
      :key_field => "InterPro ID",
      :fields => ["Pfam Domain"]
    )
  end
end
33
+
34
# Builds the pfam_names file: columns 3 and 4 of InterPro.source.protein2ipr,
# deduplicated and restricted to lines containing one of the ids listed in the
# first column of Pfam.domains.
InterPro.claim InterPro.pfam_names.find, :proc do
  domain_ids = Pfam.domains.read.split("\n").collect { |line| line.split("\t").first }.compact.flatten

  names = nil
  TmpFile.with_file(domain_ids * "\n") do |ids_file|
    filtered = CMD.cmd("cut -f 4,3 | sort -u |grep -w -f #{ ids_file }", :in => InterPro.source.protein2ipr.open, :pipe => true)
    names = TSV.open(filtered, :key_field => 1, :fields => [0], :type => :single)
  end

  names.key_field = "InterPro ID"
  names.fields = ["Domain Name"]
  names.to_s
end
44
+
45
# Builds the pfam_equivalences file: columns 2 and 4 of
# InterPro.source.protein2ipr, deduplicated and restricted to lines containing
# one of the ids listed in the first column of Pfam.domains.
InterPro.claim InterPro.pfam_equivalences.find, :proc do
  domain_ids = Pfam.domains.read.split("\n").collect { |line| line.split("\t").first }.compact.flatten

  equivalences = nil
  TmpFile.with_file(domain_ids * "\n") do |ids_file|
    filtered = CMD.cmd("cut -f 2,4 | sort -u |grep -w -f #{ ids_file }", :in => InterPro.source.protein2ipr.open, :pipe => true)
    equivalences = TSV.open(filtered, :key_field => 0, :fields => [1], :type => :single)
  end

  equivalences.key_field = "InterPro ID"
  equivalences.fields = ["Pfam Domain"]
  equivalences.to_s
end
55
+
56
+
26
57
# Entity integration; only set up when the Entity framework is loaded.
if defined? Entity
  # Entity for Pfam domain identifiers.
  module PfamDomain
    extend Entity
    self.format = "Pfam Domain"
    self.format = "Pfam Domain ID"

    self.annotation :organism

    # Result of Pfam.name for every domain id in the array.
    property :name => :array2single do
      self.collect { |domain_id| Pfam.name(domain_id) }
    end

    # "Ensembl Gene ID" values associated with each domain in the gene_pfam
    # table of the entity's organism; memoized.
    property :genes => :array2single do
      @genes ||= begin
        domain_genes = Organism.gene_pfam(organism).tsv(
          :key_field => "Pfam Domain",
          :fields => ["Ensembl Gene ID"],
          :persist => true,
          :merge => true,
          :type => :flat,
          :namespace => organism
        )
        domain_genes.values_at *self
      end
    end
  end


  module InterProDomain
    # Entries of InterPro.pfam_index for each id, with the organism copied
    # onto every entry that accepts one.
    property :pfam => :array2single do
      domains = InterPro.pfam_index.values_at(*self)
      domains.each { |domain| domain.organism = organism if domain.respond_to? :organism }
    end
  end

  if defined? Gene and Entity === Gene
    module Gene
      # Per-organism cache of the gene => Pfam domain table.
      INDEX_CACHE = {}

      # "Pfam Domain" values for each gene, looked up by Ensembl id; memoized.
      property :pfam_domains => :array2single do
        index = INDEX_CACHE[organism] ||= Organism.gene_pfam(organism).tsv(
          :persist => true,
          :type => :flat,
          :fields => ["Pfam Domain"],
          :key_field => "Ensembl Gene ID",
          :namespace => organism
        )
        @pfam_domains ||= index.values_at *self.ensembl
      end

    end
  end
end
94
+
95
+
@@ -15,7 +15,8 @@ module PubMed
15
15
 
16
16
  url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
17
17
  articles = []
18
- Misc.divide(pmids.sort, (pmids.length / 1000) + 1) do |pmid_list|
18
+
19
+ Misc.divide(pmids.sort_by{|v| v.nil? ? 0 : v.to_i}, (pmids.length / 1000) + 1).each do |pmid_list|
19
20
  postdata = "db=pubmed&retmode=xml&id=#{pmid_list* ","}"
20
21
  xml = TmpFile.with_file(postdata) do |postfile|
21
22
  Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed", "--post-file=" => postfile)
@@ -130,7 +131,13 @@ module PubMed
130
131
 
131
132
  def pdf_url
132
133
  return pmc_pdf if pmc_pdf
133
- @gscholar_pdf ||= GoogleScholar::full_text_url title
134
+ @gscholar_pdf ||= begin
135
+ GoogleScholar::full_text_url title
136
+ rescue
137
+ Log.medium "GoogleScholar#full_text failed: #{title}"
138
+ sleep 0.1
139
+ nil
140
+ end
134
141
  end
135
142
 
136
143
  def full_text
@@ -140,7 +147,7 @@ module PubMed
140
147
  TmpFile.with_file do |pdf|
141
148
 
142
149
  # Change user-agent, oh well...
143
- `wget --user-agent=firefox #{ pdf_url } -O #{ pdf }`
150
+ `wget --user-agent=firefox #{ pdf_url } -O #{ pdf } -t 3`
144
151
  TmpFile.with_file do |txt|
145
152
  `pdftotext #{ pdf } #{ txt }`
146
153
  text = Open.read(txt) if File.exists? txt
@@ -0,0 +1,82 @@
1
+ require 'rbbt'
2
+ require 'rbbt/resource'
3
+
4
# Resource that builds TSV files from the Reactome download area.
module Reactome
  extend Resource
  self.subdir = "share/databases/Reactome"

  # First column of the downloaded file as key, second column as merged values.
  Reactome.claim Reactome.protein_pathways, :proc do
    source = "http://www.reactome.org/download/current/uniprot_2_pathways.stid.txt"
    pathways = TSV.open(Open.open(source), :key_field => 0, :fields => [1], :merge => true, :type => :double)

    pathways.key_field = "UniProt/SwissProt Accession"
    pathways.fields = ["Reactome Pathway ID"]
    pathways.namespace = "Hsa"
    pathways.to_s
  end

  # Second column of the same file as key, third column as the single value.
  Reactome.claim Reactome.pathway_names, :proc do
    source = "http://www.reactome.org/download/current/uniprot_2_pathways.stid.txt"
    names = TSV.open(Open.open(source), :key_field => 1, :fields => [2], :type => :single)

    names.key_field = "Reactome Pathway ID"
    names.fields = ["Pathway Name"]
    names.namespace = "Hsa"
    names.to_s
  end

  # Columns 1, 4, 7, 8 and 9 of the interactions file, with "UniProt:" removed
  # and commas turned into semicolons before parsing.
  Reactome.claim Reactome.protein_protein, :proc do
    source = "http://www.reactome.org/download/current/homo_sapiens.interactions.txt.gz"
    cleaned = CMD.cmd('cut -f 1,4,7,8,9|sed "s/UniProt://g;s/,/;/g"', :in => Open.open(source), :pipe => true)
    interactions = TSV.open(cleaned, :type => :double, :merge => true)

    interactions.key_field = "UniProt/SwissProt Accession"
    interactions.fields = ["Interactor UniProt/SwissProt Accession", "Interaction type", "Reactions", "PMID"]
    interactions.namespace = "Hsa"
    interactions.to_s
  end

end
36
+
37
# Entity integration; only set up when the Entity framework is loaded.
if defined? Entity
  # Entity for Reactome pathway identifiers.
  module ReactomePathway
    extend Entity
    self.format = "Reactome Pathway ID"

    self.annotation :organism

    # Memoized pathway id => pathway name table.
    def self.name_index
      @name_index ||= Reactome.pathway_names.tsv(
        :persist => true,
        :key_field => "Reactome Pathway ID",
        :fields => ["Pathway Name"],
        :type => :single
      )
    end

    # Memoized pathway id => UniProt accessions table.
    def self.gene_index
      @gene_index ||= Reactome.protein_pathways.tsv(
        :persist => true,
        :key_field => "Reactome Pathway ID",
        :fields => ["UniProt/SwissProt Accession"],
        :type => :flat,
        :merge => true
      )
    end

    # True when +query+ equals +entity+, or when the name of +entity+ (set up
    # as a ReactomePathway with +options+ and +field+ as format) contains
    # +query+; false otherwise.
    def self.filter(query, field = nil, options = nil, entity = nil)
      return true if query == entity

      pathway = self.setup(entity.dup, options.merge(:format => field))
      pathway.name.index(query) ? true : false
    end

    # Names of the pathways, from name_index; memoized.
    property :name => :array2single do
      @name ||= ReactomePathway.name_index.values_at *self
    end

    # gene_index entries of the pathways, with the organism copied onto every
    # entry that accepts one; memoized.
    property :genes => :array2single do
      @genes ||= begin
        members = ReactomePathway.gene_index.values_at(*self)
        members.each { |gene| gene.organism = organism if gene.respond_to? :organism }
      end
    end

    # Reactome event browser link for the pathway.
    property :url => :single do
      "http://www.reactome.org/cgi-bin/eventbrowser_st_id?ST_ID=#{ self }"
    end
  end

  if defined? Gene and Entity === Gene
    module Gene
      # Reactome pathway ids of each gene, looked up by UniProt accession and
      # set up as ReactomePathway entities; memoized.
      property :reactome_pathways => :array2single do
        @reactome_pathways ||= begin
          index = Reactome.protein_pathways.tsv(
            :persist => true,
            :key_field => "UniProt/SwissProt Accession",
            :fields => ["Reactome Pathway ID"],
            :type => :flat,
            :merge => true
          )
          index.values_at(*self.to("UniProt/SwissProt Accession")).
            each { |pth| pth.organism = organism if pth.respond_to? :organism }.
            tap { |o| ReactomePathway.setup(o, organism) }
        end
      end
    end
  end
end