rbbt-sources 1.2.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,79 @@
1
+ require 'rbbt/util/open'
2
+ require 'rbbt/sources/organism'
3
+ require 'rbbt/tsv'
4
+ require 'net/ftp'
5
+
6
# Helpers for locating and loading Ensembl MySQL dump tables from the public
# Ensembl FTP server.
module Ensembl

  # Release table found at Rbbt.share.Ensembl.release_dates, opened as a TSV
  # keyed by its "build" field. The result is memoized in @releases.
  def self.releases
    @releases ||= Rbbt.share.Ensembl.release_dates.find.tsv :key_field => "build"
  end

  module FTP

    SERVER = "ftp.ensembl.org"

    # Resolves the release directory and core database name for an organism
    # code of the form "Code/build" (e.g. "Hsa/may2012").
    #
    # Returns the pair [release, file]. When the code carries no build, or the
    # build is "current", nothing is looked up and both elements are nil.
    def self.ftp_name_for(organism)
      _code, build = organism.split "/"
      build ||= "current"

      release = file = nil

      unless build.to_s == "current"
        release = Ensembl.releases[build]
        name = Organism.scientific_name(organism)
        pattern = name.downcase.gsub(" ", '_') + "_core_*"

        # Block form: the connection is closed even when chdir or list raises.
        Net::FTP.open(Ensembl::FTP::SERVER) do |ftp|
          ftp.login
          ftp.chdir(File.join('pub', release, 'mysql'))
          # Keep the last whitespace-separated token of every listing line
          # and pick the last matching entry.
          file = ftp.list(pattern).collect{|l| l.split(" ").last}.last
        end
      end

      [release, file]
    end

    # Absolute directory on the FTP server holding the organism's MySQL dump.
    def self.ftp_directory_for(organism)
      release, ftp_name = ftp_name_for(organism)
      File.join('/pub/', release, 'mysql', ftp_name)
    end

    # ftp:// URL of the organism's MySQL dump directory.
    def self.base_url(organism)
      File.join("ftp://" + SERVER, ftp_directory_for(organism))
    end

    # URL of the gzipped text dump of +table+ for +organism+.
    def self.url_for(organism, table)
      "#{base_url(organism)}/#{table}.txt.gz"
    end

    # Reads the schema file (<database name>.sql.gz) of the organism's dump.
    # base_url is resolved once, since every resolution may open an FTP
    # session.
    def self.schema_sql(organism)
      base = base_url(organism)
      Open.read("#{base}/#{File.basename(base)}.sql.gz")
    end

    # MatchData for the CREATE TABLE statement of +table+ (capture 1 is the
    # column definition chunk), or nil when the schema has no such table.
    #
    # Only the /m flag is used: in Ruby /s selects the Windows-31J encoding
    # (it is not "dot matches newline") and makes matching fail on non-ASCII
    # text. The table name is escaped so it is always matched literally.
    def self.table_definition(organism, table)
      schema_sql(organism).match(/^CREATE TABLE .#{Regexp.escape(table.to_s)}. \((.*?)^\)/m)
    end

    private_class_method :schema_sql, :table_definition

    # True when the organism's schema file declares +table+.
    def self.has_table?(organism, table)
      !table_definition(organism, table).nil?
    end

    # Column names of +table+, in the order they appear in the schema file.
    #
    # Raises an error naming the table when the schema does not declare it.
    def self.fields_for(organism, table)
      definition = table_definition(organism, table)
      raise "Table '#{table}' not found in the Ensembl schema for #{organism}" if definition.nil?

      # Column lines are indented and start with a backquoted name; key and
      # constraint lines do not start with a backquote and are skipped.
      definition[1].scan(/^\s+`(.*?)`/).flatten
    end

    # Opens the dump of +table+ as a TSV.
    #
    # When both +key_field+ and +fields+ are given, their names are translated
    # to column positions using the schema and stored into +options+ (the
    # hash passed by the caller is written to). The resulting TSV is then
    # labelled with +key_field+ and +fields+ exactly as received, including
    # nil.
    def self.ensembl_tsv(organism, table, key_field = nil, fields = nil, options = {})
      url = url_for(organism, table)
      if key_field and fields
        all_fields = fields_for(organism, table)
        key_pos = all_fields.index key_field
        field_pos = fields.collect{|f| all_fields.index f}

        options[:key_field] = key_pos
        options[:fields] = field_pos
      end
      tsv = TSV.open(url, options)
      tsv.key_field = key_field
      tsv.fields = fields
      tsv
    end
  end
end
76
+
77
# Manual smoke test: runs only when this file is executed directly rather than
# required. It loads the 'exon' table for "Hsa/may2012" and hands the result
# to `ddd` (presumably a debug-print helper defined elsewhere — confirm).
+ if __FILE__ == $0
78
+ ddd Ensembl::FTP.ensembl_tsv("Hsa/may2012", 'exon')
79
+ end
@@ -10,7 +10,7 @@ module Entrez
10
10
  Rbbt.claim Rbbt.share.databases.entrez.gene2pubmed, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
11
11
 
12
12
  def self.entrez2native(taxs, options = {})
13
- options = Misc.add_defaults options, :key_field => 1, :fields => 5, :persist => true, :merge => true
13
+ options = Misc.add_defaults options, :key_field => 1, :fields => [5], :persist => true, :merge => true
14
14
 
15
15
  taxs = [taxs] unless Array === taxs
16
16
  options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
@@ -22,7 +22,7 @@ module Entrez
22
22
  end
23
23
 
24
24
  def self.entrez2name(taxs, options = {})
25
- options = Misc.add_defaults options, :key_field => 1, :fields => 2, :persist => true, :merge => true
25
+ options = Misc.add_defaults options, :key_field => 1, :fields => [2], :persist => true, :merge => true
26
26
 
27
27
  taxs = [taxs] unless Array === taxs
28
28
  options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
@@ -0,0 +1,45 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+ require 'rbbt/resource'
4
+
5
# Resource module exposing variant sites taken from a 1000 Genomes release
# VCF, plus a copy of them lifted over to another human build.
module Genomes1000
  extend Resource
  self.subdir = "share/databases/genomes_1000"

  RELEASE_URL = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.wgs.phase1_release_v3.20101123.snps_indels_sv.sites.vcf.gz"

  # Variant ID => "chromosome:position:alternative", one entry per data line
  # of the release file; lines starting with "#" are skipped.
  Genomes1000.claim Genomes1000.mutations, :proc do
    variants = TSV.setup({}, :key_field => "Variant ID", :fields => ["Genomic Mutation"], :type => :single)

    Open.read(RELEASE_URL) do |line|
      next if line.start_with?("#")

      columns = line.split("\t")
      chromosome, position, id = columns.values_at(0, 1, 2)
      alternative = columns[4]

      variants[id] = [chromosome, position, alternative].join(":")
    end

    variants.namespace = "Hsa"
    variants.to_s
  end

  # Same table with every mutation passed through Organism.liftOver from
  # "Hsa/jun2011" to "Hsa/may2009".
  Genomes1000.claim Genomes1000.mutations_hg18, :proc do
    require 'rbbt/sources/organism'

    hg19_tsv = Genomes1000.mutations.tsv :unnamed => true

    translations = Misc.process_to_hash(hg19_tsv.values) { |chunk|
      Organism.liftOver(chunk, "Hsa/jun2011", "Hsa/may2009")
    }

    lifted = hg19_tsv.process("Genomic Mutation") { |mutation| translations[mutation] }
    lifted.namespace = "Hsa/may2009"
    lifted.to_s
  end

end
@@ -97,9 +97,12 @@ if defined? Entity
97
97
  @name ||= GO.id2name(self)
98
98
  end
99
99
 
100
- property :genes => :array2single do |organism|
100
+ property :genes => :array2single do |*args|
101
+ organism = args.first
101
102
  organism ||= self.organism
102
- @genes ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :merge => true).values_at *self
103
+ res = Organism.gene_go(organism).tsv(:persist => true, :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :merge => true).values_at *self
104
+ res.collect{|r| r.organism = organism if r and r.respond_to? :organism}
105
+ res
103
106
  end
104
107
 
105
108
  property :description => :single2array do
@@ -114,12 +117,21 @@ if defined? Entity
114
117
  if defined? Gene and Entity === Gene
115
118
  module Gene
116
119
  property :go_terms => :array2single do
117
- @go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
120
+ @go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).values_at *self.ensembl
118
121
  end
119
122
 
120
123
  property :go_bp_terms => :array2single do
121
- @go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
124
+ @go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).values_at *self.ensembl
122
125
  end
126
+
127
+ property :go_cc_terms => :array2single do
128
+ @go_cc_terms ||= Organism.gene_go_cc(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).values_at *self.ensembl
129
+ end
130
+
131
+ property :go_mf_terms => :array2single do
132
+ @go_mf_terms ||= Organism.gene_go_mf(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).values_at *self.ensembl
133
+ end
134
+
123
135
  end
124
136
  end
125
137
  end
@@ -1,18 +1,80 @@
1
1
  require 'rbbt'
2
2
  require 'rbbt/resource'
3
- require 'rbbt/resource/with_key'
4
3
 
5
4
  module Organism
6
5
  extend Resource
7
6
  self.pkgdir = "rbbt"
8
7
  self.subdir = "share/organisms"
9
8
 
10
- ["Hsa", "Mmu", "Rno", "Sce"].each do |organism|
9
# Base names of the three-character entries found under the Organism
# install directory.
def self.installable_organisms
  install_entries = Rbbt.share.install.Organism.find.glob('???')
  install_entries.collect { |entry| File.basename(entry) }
end
12
+
13
+
14
+ Organism.installable_organisms.each do |organism|
11
15
  claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
12
16
 
13
17
  module_eval "#{ organism } = with_key '#{organism}'"
14
18
  end
15
19
 
20
+ Rbbt.claim Rbbt.software.opt.bin.liftOver, :url, "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
21
+
22
# Maps an organism code to the human genome build name used by liftOver.
#
# organism - "Hsa" or "Hsa/<build date>" (e.g. "Hsa/may2009")
#
# Returns 'hg19' when no build date is given or when the release number
# recorded for that date is above 54, and 'hg18' otherwise.
# Raises when the organism is not Hsa, or when the build date has no entry
# in Ensembl.releases.
def self.hg_build(organism)
  require 'rbbt/sources/ensembl_ftp'

  raise "Only organism 'Hsa' (Homo sapiens) supported" unless organism =~ /^Hsa/

  return 'hg19' unless organism =~ /\//
  date = organism.split("/")[1]

  release = Ensembl.releases[date]
  # Fail with a readable message instead of a NoMethodError on nil below.
  raise "Unknown Ensembl build date '#{date}' for organism '#{organism}'" if release.nil?

  # The release number is whatever follows the last "-" in the release name.
  release.sub(/.*-/, '').to_i > 54 ? 'hg19' : 'hg18'
end
34
+
35
# Translates genomic positions between the hg18 and hg19 human builds by
# running the UCSC liftOver binary over a temporary BED file.
#
# positions - Array of "chromosome:position[:rest...]" strings
# source    - organism code of the build the positions refer to
# target    - organism code of the build to translate to
#
# Returns +positions+ itself when hg_build reports no hg18/hg19 change.
# Otherwise returns a new Array aligned with +positions+; entries that do not
# appear in the liftOver output are nil.
def self.liftOver(positions, source, target)
  source_hg = hg_build(source)
  target_hg = hg_build(target)

  map_url =
    if source_hg == 'hg19' and target_hg == 'hg18'
      "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg18.over.chain.gz"
    elsif source_hg == 'hg18' and target_hg == 'hg19'
      "http://hgdownload.cse.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz"
    end

  return positions if map_url.nil?

  # One BED line per position: chrN, start (position - 1), end, and the
  # original position string as the name column so results can be matched
  # back to their input.
  bed_lines = positions.collect do |position|
    chr, pos = position.split(":").values_at(0, 1)
    ["chr#{chr}", pos.to_i - 1, pos, position] * "\t"
  end
  positions_bed = bed_lines * "\n" + "\n"

  new_positions = {}

  TmpFile.with_file(positions_bed) do |source_bed|
    TmpFile.with_file() do |unmapped_file|
      TmpFile.with_file() do |map_file|
        Open.write(map_file, Open.read(map_url))

        TmpFile.with_file() do |target_bed|
          # The mode must be an octal literal: decimal 755 is octal 1363,
          # which sets the sticky bit and drops the owner's read permission.
          FileUtils.chmod(0755, Rbbt.software.opt.bin.liftOver.produce.find)
          CMD.cmd("#{Rbbt.software.opt.bin.liftOver.find} '#{source_bed}' '#{map_file}' '#{target_bed}' '#{unmapped_file}'").read

          Open.read(target_bed) do |line|
            chr, _start, position, name = line.chomp.split("\t")
            chr = chr.sub(/chr/, '')

            # Everything after chromosome and position in the original
            # string is carried over unchanged.
            rest = name.split(":").drop(2)
            new_positions[name] = ([chr, position].concat rest) * ":"
          end
        end
      end
    end
  end

  positions.collect do |position|
    new_positions[position]
  end
end
77
+
16
78
  class OrganismNotProcessedError < StandardError; end
17
79
 
18
80
  def self.attach_translations(org, tsv, target = nil, fields = nil, options = {})
@@ -53,30 +115,36 @@ module Organism
53
115
  end
54
116
  end
55
117
 
56
- def self.guess_id(org, values, identifiers = nil)
57
- identifiers ||= TSV.setup(Organism.identifiers(org), :persist => true)
58
- field_matches = identifiers.field_matches(values)
59
- field_matches.sort_by{|field, matches| matches.uniq.length}.last
60
- end
61
-
62
118
  def self.guess_id(org, values)
63
119
  field_matches = TSV.field_match_counts(Organism.identifiers(org).find, values)
64
120
  field_matches.sort_by{|field, count| count.to_i}.last
65
121
  end
66
122
 
67
-
68
123
  def self.organisms
69
124
  Dir.glob(File.join(Organism.root.find, '*')).collect{|f| File.basename(f)}
70
125
  end
71
126
 
72
- def self.name(organism)
73
- Organism.scientific_name(organism).read.strip
127
+ def self.scientific_name(organism)
128
+ Organism[organism]["scientific_name"].produce.read.strip
74
129
  end
75
130
 
76
131
  def self.organism(name)
77
132
  organisms.select{|organism|
78
- organism == name or Organism.name(organism) =~ /#{ name }/i
133
+ organism == name or Organism.scientific_name(organism) =~ /#{ name }/i
79
134
  }.first
80
135
  end
81
136
 
137
+ def self.known_ids(name)
138
+ TSV::Parser.new(Organism.identifiers(name).open).all_fields
139
+ end
140
+
141
# Finds the installable organism whose entrez_taxids resource lists +taxid+
# on one of its lines. Raises, naming the organisms that were checked, when
# none does.
def self.entrez_taxid_organism(taxid)
  candidates = Organism.installable_organisms

  candidates.each do |organism|
    known_taxids = Organism.entrez_taxids(organism).read.split("\n")
    return organism if known_taxids.include?(taxid.to_s)
  end

  raise "No organism identified for taxid #{taxid}. Supported organism are: #{candidates * ", "}"
end
82
150
  end
@@ -1,6 +1,8 @@
1
1
  require 'rbbt'
2
2
  require 'rbbt/tsv'
3
3
  require 'rbbt/resource'
4
+ require 'rbbt/entity'
5
+ require 'rbbt/sources/InterPro'
4
6
 
5
7
  module Pfam
6
8
  extend Resource
@@ -12,24 +14,82 @@ module Pfam
12
14
  tsv.to_s
13
15
  end
14
16
 
15
- NAMES_FILE = Rbbt.share.databases.InterPro.pfam_names.find
17
+ NAMES_FILE = InterPro.pfam_names.find
16
18
 
17
19
  def self.name_index
18
- @name_index ||= TSV.open NAMES_FILE, :single
20
+ @name_index ||= TSV.open NAMES_FILE, :single, :unnamed => true
19
21
  end
20
22
 
21
23
  def self.name(id)
22
- name_index[id]
24
+ name_index[id] || id
23
25
  end
24
26
  end
25
27
 
28
+ module InterPro
29
+ def self.pfam_index
30
+ @@pfam_index ||= InterPro.pfam_equivalences.tsv(:persist => true, :key_field => "InterPro ID", :fields => ["Pfam Domain"])
31
+ end
32
+ end
33
+
34
+ InterPro.claim InterPro.pfam_names.find, :proc do
35
+ pfam_domains = Pfam.domains.read.split("\n").collect{|l| l.split("\t").first}.compact.flatten
36
+ tsv = nil
37
+ TmpFile.with_file(pfam_domains * "\n") do |tmpfile|
38
+ tsv = TSV.open(CMD.cmd("cut -f 4,3 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true), :key_field => 1, :fields => [0], :type => :single)
39
+ end
40
+ tsv.key_field = "InterPro ID"
41
+ tsv.fields = ["Domain Name"]
42
+ tsv.to_s
43
+ end
44
+
45
+ InterPro.claim InterPro.pfam_equivalences.find, :proc do
46
+ pfam_domains = Pfam.domains.read.split("\n").collect{|l| l.split("\t").first}.compact.flatten
47
+ tsv = nil
48
+ TmpFile.with_file(pfam_domains * "\n") do |tmpfile|
49
+ tsv = TSV.open(CMD.cmd("cut -f 2,4 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true), :key_field => 0, :fields => [1], :type => :single)
50
+ end
51
+ tsv.key_field = "InterPro ID"
52
+ tsv.fields = ["Pfam Domain"]
53
+ tsv.to_s
54
+ end
55
+
56
+
26
57
  if defined? Entity
27
58
  module PfamDomain
28
59
  extend Entity
29
60
  self.format = "Pfam Domain"
61
+ self.format = "Pfam Domain ID"
62
+
63
+ self.annotation :organism
30
64
 
31
65
  property :name => :array2single do
32
66
  self.collect{|id| Pfam.name(id)}
33
67
  end
68
+
69
+ property :genes => :array2single do
70
+ @genes ||= Organism.gene_pfam(organism).tsv(:key_field => "Pfam Domain", :fields => ["Ensembl Gene ID"], :persist => true, :merge => true, :type => :flat, :namespace => organism).values_at *self
71
+ end
72
+ end
73
+
74
+
75
+ module InterProDomain
76
+ property :pfam => :array2single do
77
+ InterPro.pfam_index.values_at(*self).
78
+ each{|domain| domain.organism = organism if domain.respond_to? :organism }
79
+ end
80
+ end
81
+
82
+ if defined? Gene and Entity === Gene
83
+ module Gene
84
+ INDEX_CACHE = {}
85
+
86
+ property :pfam_domains => :array2single do
87
+ index = INDEX_CACHE[organism] ||= Organism.gene_pfam(organism).tsv(:persist => true, :type => :flat, :fields => ["Pfam Domain"], :key_field => "Ensembl Gene ID", :namespace => organism)
88
+ @pfam_domains ||= index.values_at *self.ensembl
89
+ end
90
+
91
+ end
34
92
  end
35
93
  end
94
+
95
+
@@ -15,7 +15,8 @@ module PubMed
15
15
 
16
16
  url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
17
17
  articles = []
18
- Misc.divide(pmids.sort, (pmids.length / 1000) + 1) do |pmid_list|
18
+
19
+ Misc.divide(pmids.sort_by{|v| v.nil? ? 0 : v.to_i}, (pmids.length / 1000) + 1).each do |pmid_list|
19
20
  postdata = "db=pubmed&retmode=xml&id=#{pmid_list* ","}"
20
21
  xml = TmpFile.with_file(postdata) do |postfile|
21
22
  Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed", "--post-file=" => postfile)
@@ -130,7 +131,13 @@ module PubMed
130
131
 
131
132
  def pdf_url
132
133
  return pmc_pdf if pmc_pdf
133
- @gscholar_pdf ||= GoogleScholar::full_text_url title
134
+ @gscholar_pdf ||= begin
135
+ GoogleScholar::full_text_url title
136
+ rescue
137
+ Log.medium "GoogleScholar#full_text failed: #{title}"
138
+ sleep 0.1
139
+ nil
140
+ end
134
141
  end
135
142
 
136
143
  def full_text
@@ -140,7 +147,7 @@ module PubMed
140
147
  TmpFile.with_file do |pdf|
141
148
 
142
149
  # Change user-agent, oh well...
143
- `wget --user-agent=firefox #{ pdf_url } -O #{ pdf }`
150
+ `wget --user-agent=firefox #{ pdf_url } -O #{ pdf } -t 3`
144
151
  TmpFile.with_file do |txt|
145
152
  `pdftotext #{ pdf } #{ txt }`
146
153
  text = Open.read(txt) if File.exists? txt
@@ -0,0 +1,82 @@
1
+ require 'rbbt'
2
+ require 'rbbt/resource'
3
+
4
# Resource module with tables built from Reactome download files.
module Reactome
  extend Resource
  self.subdir = "share/databases/Reactome"

  # First column keyed against the merged values of the second column.
  Reactome.claim Reactome.protein_pathways, :proc do
    url = "http://www.reactome.org/download/current/uniprot_2_pathways.stid.txt"
    pathways = TSV.open(Open.open(url), :key_field => 0, :fields => [1], :merge => true, :type => :double)
    pathways.tap do |tsv|
      tsv.key_field = "UniProt/SwissProt Accession"
      tsv.fields = ["Reactome Pathway ID"]
      tsv.namespace = "Hsa"
    end.to_s
  end

  # Second column keyed against the third column of the same file.
  Reactome.claim Reactome.pathway_names, :proc do
    url = "http://www.reactome.org/download/current/uniprot_2_pathways.stid.txt"
    names = TSV.open(Open.open(url), :key_field => 1, :fields => [2], :type => :single)
    names.tap do |tsv|
      tsv.key_field = "Reactome Pathway ID"
      tsv.fields = ["Pathway Name"]
      tsv.namespace = "Hsa"
    end.to_s
  end

  # Interaction file reduced to columns 1, 4, 7, 8 and 9, with the
  # "UniProt:" prefix removed and commas turned into semicolons.
  Reactome.claim Reactome.protein_protein, :proc do
    url = "http://www.reactome.org/download/current/homo_sapiens.interactions.txt.gz"
    columns = CMD.cmd('cut -f 1,4,7,8,9|sed "s/UniProt://g;s/,/;/g"', :in => Open.open(url), :pipe => true)
    interactions = TSV.open(columns, :type => :double, :merge => true)
    interactions.tap do |tsv|
      tsv.key_field = "UniProt/SwissProt Accession"
      tsv.fields = ["Interactor UniProt/SwissProt Accession", "Interaction type", "Reactions", "PMID"]
      tsv.namespace = "Hsa"
    end.to_s
  end

end
36
+
37
+ if defined? Entity
38
+ module ReactomePathway
39
+ extend Entity
40
+ self.format = "Reactome Pathway ID"
41
+
42
+ self.annotation :organism
43
+
44
+ def self.name_index
45
+ @name_index ||= Reactome.pathway_names.tsv(:persist => true, :key_field => "Reactome Pathway ID", :fields => ["Pathway Name"], :type => :single)
46
+ end
47
+
48
+ def self.gene_index
49
+ @gene_index ||= Reactome.protein_pathways.tsv(:persist => true, :key_field => "Reactome Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true)
50
+ end
51
+
52
# Tells whether +entity+ matches +query+.
#
# query   - value compared with the entity and searched for in its name
# field   - passed to setup as the :format option
# options - extra options for setup; may be nil (the default)
# entity  - the value to test
#
# Returns true when +query+ equals +entity+, or when the name of a copy of
# +entity+ set up with these options contains +query+; false otherwise.
def self.filter(query, field = nil, options = nil, entity = nil)
  return true if query == entity

  # options defaults to nil, so fall back to an empty hash before merging.
  setup_options = (options || {}).merge(:format => field)
  return true if self.setup(entity.dup, setup_options).name.index query

  false
end
59
+
60
+ property :name => :array2single do
61
+ @name ||= ReactomePathway.name_index.values_at *self
62
+ end
63
+
64
+ property :genes => :array2single do
65
+ @genes ||= ReactomePathway.gene_index.values_at(*self).
66
+ each{|gene| gene.organism = organism if gene.respond_to? :organism }
67
+ end
68
+
69
+ property :url => :single do
70
+ "http://www.reactome.org/cgi-bin/eventbrowser_st_id?ST_ID=#{ self }"
71
+ end
72
+ end
73
+
74
+ if defined? Gene and Entity === Gene
75
+ module Gene
76
+ property :reactome_pathways => :array2single do
77
+ @reactome_pathways ||= Reactome.protein_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["Reactome Pathway ID"], :type => :flat, :merge => true).values_at(*self.to("UniProt/SwissProt Accession")).
78
+ each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| ReactomePathway.setup(o, organism)}
79
+ end
80
+ end
81
+ end
82
+ end