rbbt-sources 1.2.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,19 @@
1
+ jun2011:
2
+ - refseq_mrna
3
+ may2010:
4
+ - refseq_mrna
5
+ - agilent_wholegenome
6
+ - agilent_cgh_44b
7
+ - illumina_humanwg_6_v2
8
+ - illumina_humanwg_6_v3
1
9
  may2009:
10
+ - refseq_mrna
2
11
  - agilent_wholegenome
3
12
  - agilent_cgh_44b
4
13
  - illumina_humanwg_6_v2
5
14
  - illumina_humanwg_6_v3
6
15
  dec2007:
16
+ - refseq_mrna
7
17
  - protein_id
8
18
  - affy_hc_g110
9
19
  - affy_hg_u133a_2
@@ -14,6 +24,7 @@ dec2007:
14
24
  - illumina_humanwg_6_v2
15
25
  - illumina_humanwg_6_v3
16
26
  aug2007:
27
+ - refseq_mrna
17
28
  - protein_id
18
29
  - affy_hc_g110
19
30
  - affy_hg_u133a_2
@@ -5,10 +5,53 @@ module COSMIC
5
5
  self.subdir = "share/databases/COSMIC"
6
6
 
7
7
  # Producer for COSMIC.Mutations (new side of this hunk): opens the export
  # at url, keys the table by Mutation ID, renames the Gene name column, adds
  # a derived Genomic Mutation column and post-processes the serialized text.
  COSMIC.claim COSMIC.Mutations, :proc do
8
- url = "ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/CosmicMutantExport_v54_120711.tsv"
8
+ url = "ftp://ftp.sanger.ac.uk/pub/CGP/wgs/data_export/CosmicWGS_MutantExport_v61_260912.tsv.gz"
9
9
 
10
- TSV.open(Open.open(url), :header_hash => "", :key_field => "Mutation GRCh37 genome position", :merge => true).to_s
10
+ tsv = TSV.open(Open.open(url), :type => :list, :header_hash => "", :key_field => "Mutation ID", :namespace => "Hsa/jun2011")
11
+ tsv.fields = tsv.fields.collect{|f| f == "Gene name" ? "Associated Gene Name" : f}
12
# Genomic Mutation is the position text before the first hyphen, then a
# colon, then a change derived from Mutation CDS. Rows whose position is nil
# or empty get nil.
+ tsv.add_field "Genomic Mutation" do |mid, values|
13
+ position = values["Mutation GRCh37 genome position"]
14
+ cds = values["Mutation CDS"]
15
+ if position.nil? or position.empty?
16
+ nil
17
+ else
18
+ position = position.split("-").first
19
# NOTE(review): only nil is tested here, so an empty CDS string falls
# through to the unknown-change branch below - confirm that is intended.
+ if cds.nil?
20
+ position
21
+ else
22
# Change encoding: substitutions keep the text after the last > sign,
# deletions become one hyphen per deleted base, insertions become a plus
# sign followed by the inserted bases (N repeated when only a count is
# given). Anything unrecognised is logged and passed through.
+ change = case
23
+ when cds =~ />/
24
+ cds.split(">").last
25
# NOTE(review): del is tested before ins, so a combined delins description
# is handled here and ends up logged as an unknown deletion - confirm.
+ when cds =~ /del/
26
+ deletion = cds.split("del").last
27
+ case
28
+ when deletion =~ /^\d+$/
29
+ "-" * deletion.to_i
30
+ when deletion =~ /^[ACTG]+$/i
31
+ "-" * deletion.length
32
+ else
33
+ Log.debug "Unknown deletion: #{ deletion }"
34
+ deletion
35
+ end
36
+ when cds =~ /ins/
37
+ insertion = cds.split("ins").last
38
+ case
39
+ when insertion =~ /^\d+$/
40
+ "+" + "N" * insertion.to_i
41
+ when insertion =~ /^[NACTG]+$/i
42
+ "+" + insertion
43
+ else
44
+ Log.debug "Unknown insertion: #{insertion }"
45
+ insertion
46
+ end
47
+ else
48
+ Log.debug "Unknown change: #{cds}"
49
+ "?(" << cds << ")"
50
+ end
51
+ position + ":" + change
52
+ end
53
+ end
54
+ end
55
# The first gsub prefixes every line that starts with a digit with COSM; the
# second replaces a hyphen between two digits with a colon.
# NOTE(review): the second gsub runs over the whole serialized table, not
# only the position column - confirm no other column holds
# digit-hyphen-digit text.
+ tsv.to_s.gsub(/^(\d)/m,'COSM\1').gsub(/(\d)-(\d)/,'\1:\2')
11
56
  end
12
57
  end
13
-
14
- puts COSMIC.Mutations.produce
@@ -0,0 +1,23 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/entity/gene'
3
+ require 'rbbt/tsv'
4
+ require 'rbbt/sources/organism'
5
+
6
+ module HPRD
7
+ extend Resource
8
+ self.subdir = "share/databases/HPRD"
9
+
10
# Claim for the raw HPRD interaction file. The producer proc builds nothing:
# it only raises, reporting the path that was looked up and asking for a
# manual download from hprd.org.
+ HPRD.claim HPRD["BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt"], :proc do
11
+ raise "File BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt not found in '#{HPRD["BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt"].find}', download manually from http://www.hprd.org/"
12
+ end
13
+
14
# Builds the protein_protein table from the raw HPRD file: loads it as a
# TSV, names the key and the seven value columns, tags the table with the
# Hsa namespace and returns its text serialization.
+ HPRD.claim HPRD.protein_protein, :proc do
15
+ tsv = HPRD["BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt"].tsv
16
+
17
# NOTE(review): column names are assigned purely by position; presumably
# the raw file has no header row and exactly eight columns - confirm.
+ tsv.key_field = "Associated Gene Name 1"
18
+ tsv.fields = ["HPRD id 1","RefSeq Protein ID 1","Associated Gene Name 2","HPRD id 2","RefSeq Protein ID 2", "Experiment type", "PMID"]
19
+ tsv.namespace = "Hsa"
20
+
21
+ tsv.to_s
22
+ end
23
+ end
@@ -1,17 +1,107 @@
1
1
  require 'rbbt-util'
2
+ require 'rbbt/entity/gene'
3
+ require 'rbbt/tsv'
4
+ require 'rbbt/sources/organism'
5
+
2
6
  module InterPro
3
7
  extend Resource
4
8
  self.subdir = "share/databases/InterPro"
5
9
 
6
- InterPro.claim InterPro.root.find, :rake, Rbbt.share.install.InterPro.Rakefile.find(:lib)
10
# Raw protein2ipr dump, claimed by URL from the EBI FTP site; the two procs
# that build protein_domains and domain_names read from this file.
+ InterPro.claim InterPro.source.protein2ipr.find, :url, "ftp://ftp.ebi.ac.uk/pub/databases/interpro/protein2ipr.dat.gz"
11
+
12
# Builds protein_domains for Hsa: collects the UniProt/SwissProt accessions
# listed in the organism protein identifier file, filters protein2ipr down
# to lines mentioning them, and labels the result as accession to
# InterPro ID, domain start and domain end.
+ InterPro.claim InterPro.protein_domains.find, :proc do
13
+ organism = "Hsa"
14
# NOTE(review): index gives nil when the field is absent, and the +1 on the
# next line would then raise NoMethodError - confirm the field always exists.
# The local name is spelled uniprot_colum (sic).
+ uniprot_colum = TSV::Parser.new(Organism.protein_identifiers(organism).open).all_fields.index("UniProt/SwissProt Accession")
15
# Drops comment lines, cuts the accession column, splits multi-valued cells
# on the pipe character and removes duplicates and empty strings.
+ uniprots = CMD.cmd("grep -v '^#'|cut -f #{uniprot_colum+1}", :in => Organism.protein_identifiers(organism).open).read.split("\n").collect{|l| l.split("|")}.flatten.uniq.reject{|l| l.empty?}
16
+
17
+ tsv = nil
18
# The accession list is written to a temp file and used as a grep pattern
# file over columns 1, 2, 5 and 6 of protein2ipr.
# NOTE(review): grep -w matches an accession as a whole word anywhere in the
# kept columns, not only in the first one - confirm that is acceptable.
+ TmpFile.with_file(uniprots * "\n") do |tmpfile|
19
+ tsv = TSV.open(CMD.cmd("cut -f 1,2,5,6 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true), :merge => true, :type => :double)
20
+ end
21
+
22
+ tsv.key_field = "UniProt/SwissProt Accession"
23
+ tsv.fields = ["InterPro ID", "Domain Start AA", "Domain End AA"]
24
+ tsv.to_s
25
+ end
26
+
27
# Builds domain_names: takes the unique pairs found in columns 2 and 3 of
# protein2ipr and labels them as InterPro ID to Domain Name.
+ InterPro.claim InterPro.domain_names.find, :proc do
28
+ #tsv = InterPro.source.protein2ipr.tsv :key_field => 1, :fields => [2], :type => :single
29
+ tsv = TSV.open(CMD.cmd("cut -f 2,3 | sort -u", :in => InterPro.source.protein2ipr.open, :pipe => true), :merge => true, :type => :single)
30
+
31
+ tsv.key_field = "InterPro ID"
32
+ tsv.fields = ["Domain Name"]
33
+ tsv.to_s
34
+ end
35
+
36
# Memoized (class variable) persisted TSV of domain_names, loaded unnamed.
+ def self.name_index
37
+ @@name_index ||= InterPro.domain_names.tsv(:persist => true, :unnamed => true)
38
+ end
39
+
40
# Memoized flat index over protein_domains keyed by InterPro ID, with the
# UniProt/SwissProt accessions as values.
# NOTE(review): despite the name, the values are protein accessions.
+ def self.gene_index
41
+ @@gene_index ||= InterPro.protein_domains.tsv(:persist => true, :key_field => "InterPro ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true, :unnamed => true)
42
+ end
43
+
44
# Memoized index over protein_domains keyed by UniProt/SwissProt accession,
# keeping only the InterPro ID field.
+ def self.domain_index
45
+ @@domain_index ||= InterPro.protein_domains.tsv(:persist => true, :unnamed => true, :key_field => "UniProt/SwissProt Accession", :fields => ["InterPro ID"], :merge => true)
46
+ end
47
+
48
# Memoized double-type index over protein_domains keyed by UniProt/SwissProt
# accession, keeping InterPro ID together with domain start and end.
+ def self.domain_position_index
49
+ @@domain_position_index ||= InterPro.protein_domains.tsv(:persist => true, :unnamed => true, :key_field => "UniProt/SwissProt Accession", :fields => ["InterPro ID", "Domain Start AA", "Domain End AA"], :type => :double, :merge => true)
50
+ end
51
+
52
# Per-organism memoized index from Ensembl Protein ID to UniProt/SwissProt
# accessions, read from the organism protein identifier file. The cache is a
# hash keyed by the organism argument.
+ def self.ens2uniprot(organism)
53
+ @@ens2uniprot_index ||= {}
54
+ @@ens2uniprot_index[organism] ||= Organism.protein_identifiers(organism).tsv(:persist => true, :unnamed => true, :fields => ["UniProt/SwissProt Accession"], :key_field => "Ensembl Protein ID", :type => :double, :merge => true)
55
+ end
56
+
57
+ end
58
+
59
+ if defined? Entity
60
# Entity for InterPro domain identifiers (format InterPro ID) carrying an
# organism annotation. Properties look the ids up in the InterPro indices.
+ module InterProDomain
61
+ extend Entity
62
+ self.format = "InterPro ID"
63
+
64
+ self.annotation :organism
65
# NOTE(review): description and name have identical bodies; both return the
# name_index values for the ids.
+ property :description => :array2single do
66
+ InterPro.name_index.values_at *self
67
+ end
68
+
69
+ property :name => :array2single do
70
+ InterPro.name_index.values_at *self
71
+ end
72
+
73
# For each id, the unique accessions from gene_index; the outer array is
# then set up as Protein entities in UniProt/SwissProt Accession format.
+ property :proteins => :array2single do
74
+ InterPro.gene_index.values_at(*self).
75
+ collect{|genes| genes = genes.uniq; genes.organism = organism if genes.respond_to? :organism; genes }.tap{|o| Protein.setup(o, "UniProt/SwissProt Accession", organism)}
76
+ end
77
+
78
# Same lookup as proteins, but the result is set up as Gene entities.
+ property :genes => :array2single do
79
+ InterPro.gene_index.values_at(*self).
80
+ collect{|genes| genes = genes.uniq; genes.organism = organism if genes.respond_to? :organism; genes }.tap{|o| Gene.setup(o, "UniProt/SwissProt Accession", organism)}
81
+ end
82
+ end
83
+
84
+ if defined? Protein and Entity === Protein
85
+ module Protein
86
+ property :interpro_domains => :array2single do
87
+ self.collect do |protein|
88
+ uniprot = (InterPro.ens2uniprot(protein.organism)[protein] || []).flatten
89
+ uniprot.empty? ? nil :
90
+ InterPro.domain_index.values_at(*uniprot).compact.flatten. each{|pth| pth.organism = organism if pth.respond_to? :organism }.uniq.tap{|o| InterProDomain.setup(o, organism)}
91
+ end
92
+ end
7
93
 
8
- def self.tsv(*args)
9
- old_url = BioMart::BIOMART_URL
10
- begin
11
- BioMart::BIOMART_URL.replace "http://www.ebi.ac.uk/interpro/biomart/martservice?query="
12
- BioMart.tsv(*args)
13
- ensure
14
- BioMart::BIOMART_URL.replace old_url
94
+ property :interpro_domain_positions => :array2single do
95
+ self.collect do |protein|
96
+ if protein.nil?
97
+ [].tap{|o| InterProDomain.setup(o, organism)}
98
+ else
99
+ uniprot = (InterPro.ens2uniprot(protein.organism)[protein] || []).flatten
100
+ uniprot.empty? ? nil :
101
+ InterPro.domain_position_index.values_at(*uniprot).compact.flatten(1).tap{|o| InterProDomain.setup(o, organism)}
102
+ end
103
+ end
104
+ end
15
105
  end
16
106
  end
17
107
  end
@@ -34,7 +34,8 @@ if defined? Entity
34
34
  end
35
35
 
36
36
  property :genes => :array2single do
37
- @genes ||= NCINaturePathway.gene_index.values_at *self
37
+ @genes ||= NCINaturePathway.gene_index.values_at(*self).
38
+ each{|gene| gene.organism = organism if gene.respond_to? :organism }
38
39
  end
39
40
  end
40
41
 
@@ -65,7 +66,8 @@ if defined? Entity
65
66
  end
66
67
 
67
68
  property :genes => :array2single do
68
- @genes ||= NCIReactomePathway.gene_index.values_at *self
69
+ @genes ||= NCIReactomePathway.gene_index.values_at(*self).
70
+ each{|gene| gene.organism = organism if gene.respond_to? :organism }
69
71
  end
70
72
  end
71
73
 
@@ -97,7 +99,7 @@ if defined? Entity
97
99
 
98
100
  property :genes => :array2single do
99
101
  @genes ||= NCIBioCartaPathway.gene_index.values_at(*self).
100
- each{|pth| pth.organism = organism if pth.respond_to? :organism }
102
+ each{|gene| gene.organism = organism if gene.respond_to? :organism }
101
103
  end
102
104
  end
103
105
 
@@ -109,8 +111,8 @@ if defined? Entity
109
111
  each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCINaturePathway.setup(o, organism)}
110
112
  end
111
113
 
112
- property :reactome_pathways => :array2single do
113
- @reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at(*self.to("UniProt/SwissProt Accession")).
114
+ property :nci_reactome_pathways => :array2single do
115
+ @nci_reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at(*self.to("UniProt/SwissProt Accession")).
114
116
  each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCIReactomePathway.setup(o, organism)}
115
117
  end
116
118
 
@@ -0,0 +1,41 @@
1
+ require 'rbbt'
2
+ require 'rbbt/resource'
3
+ require 'rbbt/tsv'
4
+
5
+ module PSI_MI
6
+ extend Resource
7
+ self.subdir = Rbbt.share.databases.PSI_MI
8
+
9
+ URL="http://psidev.cvs.sourceforge.net/viewvc/psidev/psi/mi/rel25/data/psi-mi25.obo"
10
+
11
# Builds the identifiers table from the PSI-MI OBO file at URL: the text is
# split on the [Term] marker and, for every chunk, the first id, name and
# def matches become the key, Name and Description.
+ PSI_MI.claim PSI_MI.identifiers, :proc do
12
+ tsv = TSV.setup({}, :type => :list, :key_field => "PSI-MI Term", :fields => ["Name", "Description"])
13
# NOTE(review): the text before the first [Term] marker is also a chunk; if
# it has no id line, a row keyed by nil is stored - confirm and skip it.
+ Open.open(URL).read.split("[Term]").each do |chunk|
14
# NOTE(review): scan with one capture group returns an array of capture
# arrays, so [0] is a one-element Array (or nil), not a String. Ruby 1.8
# printed such arrays as plain text; later versions print the inspect form.
# Taking capture 1 directly, e.g. chunk[regexp, 1], would give a String.
+ id = chunk.scan(/id: ([^\n]*)/)[0]
15
+ name = chunk.scan(/name: ([^\n]*)/)[0]
16
+ description = chunk.scan(/def: "([^\n]*)"/)[0]
17
+ tsv[id] = [name, description]
18
+ end
19
+ tsv.to_s
20
+ end
21
+ end
22
+
23
+
24
+ if defined? Entity
25
+ require 'rbbt/entity/gene'
26
+ require 'rbbt/entity/interactor'
27
+ require 'rbbt/sources/PSI_MI'
28
+
29
+ module PSI_MITerm
30
+ extend Entity
31
+
32
+ self.format = "PSI-MI Term"
33
+
34
+ property :name => :array2single do
35
+ @@index ||= PSI_MI.identifiers.tsv(:persist => true, :fields => ["Name"], :type => :single)
36
+ @@index.values_at(*self)
37
+ end
38
+
39
+ end
40
+ end
41
+
@@ -0,0 +1,92 @@
1
+ require 'rbbt/resource'
2
+ require 'rbbt/sources/organism'
3
+
4
+ module STITCH
5
+ extend Resource
6
+ self.subdir = "share/databases/STITCH"
7
+
8
# Raw STITCH v3.1 downloads, each claimed by URL under the source directory:
# chemical-chemical links, protein-chemical links, actions, chemical aliases
# and chemical sources.
+ STITCH.claim STITCH.source.chemical_chemical.find, :url, "http://stitch.embl.de/download/chemical_chemical.links.detailed.v3.1.tsv.gz"
9
+ STITCH.claim STITCH.source.protein_chemical.find, :url, "http://stitch.embl.de/download/protein_chemical.links.detailed.v3.1.tsv.gz"
10
+ STITCH.claim STITCH.source.actions.find, :url, "http://stitch.embl.de/download/actions.v3.1.tsv.gz"
11
+ STITCH.claim STITCH.source.aliases.find, :url, "http://stitch.embl.de/download/chemical.aliases.v3.1.tsv.gz"
12
+ STITCH.claim STITCH.source.sources.find, :url, "http://stitch.embl.de/download/chemical.sources.v3.1.tsv.gz"
13
+
14
# Registers one chemical_protein claim per installable organism. Each proc
# keeps the protein_chemical lines whose protein id starts with one of the
# organism taxids, strips that taxid prefix, and labels the columns.
+ Organism.installable_organisms.each do |organism|
15
+ STITCH.claim STITCH.chemical_protein(organism), :proc do
16
+ taxids = Organism.entrez_taxids(organism).read.split("\n")
17
# NOTE(review): the grep pattern joins taxids with backslash-pipe (basic
# regex alternation in GNU grep) while the sed pattern joins them with a
# bare pipe, which basic regex treats as a literal character. With more
# than one taxid the sed step would not strip the prefix - confirm, and
# join with backslash-pipe in both places if so.
+ tsv = TSV.open(CMD.cmd("grep '\t\\(#{ taxids * '\|' }\\)\\.' | sed 's/\\(#{taxids * "|"}\\)\\.//'", :in => STITCH.source.protein_chemical.open, :pipe => true), :type => :double, :merge => true)
18
+ tsv.key_field = "Chemical CID"
19
+ tsv.fields = ["Ensembl Gene ID", "experimental", "database", "textmining", "combined_score"]
20
+ tsv.to_s
21
+ end
22
+ end
23
+
24
+
25
# Builds the identifiers table from the aliases file: one row per chemical
# id and one column per distinct source, in order of first appearance. A
# later code for the same chemical and source overwrites the earlier one.
+ STITCH.claim STITCH.identifiers, :proc do
26
+ identifiers = {}
27
+ fields = []
28
# NOTE(review): first_line is assigned but never read in this block.
+ first_line = true
29
+ i = 0
30
# NOTE(review): assumes read yields the file line by line to the block -
# confirm against the Resource API.
+ STITCH.source.aliases.read do |line|
31
+ i += 1
32
# The first line is skipped (presumably a header - confirm).
+ next if i == 1
33
# NOTE(review): prints the counter for every line; looks like leftover
# debugging output.
+ puts i
34
# NOTE(review): if lines keep their newline, source (the last column) ends
# with it - confirm whether a chomp is needed.
+ cid, code, source = line.split("\t")
35
+
36
+ pos = fields.index source
37
+ if pos.nil?
38
+ fields << source
39
+ pos = fields.length - 1
40
+ end
41
+ identifiers[cid] ||= []
42
+ identifiers[cid][pos] = code
43
+ end
44
+
45
# NOTE(review): key_field is given as an Array here while the other STITCH
# claims use a plain String; type is double although the cells are single
# strings with nil gaps - confirm both.
+ TSV.setup(identifiers, :key_field => ["Chemical CID"], :fields => fields, :type => :double)
46
+
47
+ identifiers.to_s
48
+ end
49
+
50
# Builds the iupac table: aliases lines matching IUPAC, keeping field 1,
# labelled as Chemical CID to IUPAC.
+ STITCH.claim STITCH.iupac, :proc do
51
+ tsv = STITCH.source.aliases.tsv :type => :double, :merge => true, :grep => "IUPAC", :fields => [1]
52
+ tsv.key_field = "Chemical CID"
53
+ tsv.fields = ["IUPAC"]
54
+
55
+ tsv.to_s
56
+ end
57
+
58
# Builds the drug_bank table: aliases lines matching DrugBank, keeping
# field 1, labelled as Chemical CID to DrugBank ID.
+ STITCH.claim STITCH.drug_bank, :proc do
59
+ tsv = STITCH.source.aliases.tsv :type => :double, :merge => true, :grep => "DrugBank", :fields => [1]
60
+ tsv.key_field = "Chemical CID"
61
+ tsv.fields = ["DrugBank ID"]
62
+
63
+ tsv.to_s
64
+ end
65
+
66
+ end
67
+
68
+ if defined? Entity
69
# Entity for chemical compounds, with a format annotation set to
# Chemical CID.
+ module Compound
70
+ extend Entity
71
+
72
+ self.annotation :format
73
+ self.format = "Chemical CID"
74
+
75
# NOTE(review): the property body is empty, so the block evaluates to nil;
# looks like an unfinished stub.
+ property :iupac => :array2single do
76
+ end
77
+
78
+
79
+ end
80
+ end
81
+
82
# When the file is run directly as a script, produce every raw source file
# and then the derived tables (chemical_protein is produced for Hsa only).
+ if __FILE__ == $0
83
+ STITCH.source.chemical_chemical.produce
84
+ STITCH.source.protein_chemical.produce
85
+ STITCH.source.actions.produce
86
+ STITCH.source.aliases.produce
87
+ STITCH.source.sources.produce
88
+ STITCH.chemical_protein("Hsa").produce
89
+ STITCH.iupac.produce
90
+ STITCH.drug_bank.produce
91
+ STITCH.identifiers.produce
92
+ end
@@ -18,6 +18,3 @@ module Barcode
18
18
  end
19
19
 
20
20
  end
21
-
22
-
23
- Barcode.transcriptome.produce
@@ -21,12 +21,12 @@ module BioMart
21
21
 
22
22
  @@biomart_query_xml = <<-EOT
23
23
  <?xml version="1.0" encoding="UTF-8"?>
24
- <!DOCTYPE Query>
24
+ <!DOCTYPE Query>
25
25
  <Query completionStamp="1" virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
26
26
  <Dataset name = "<!--DATABASE-->" interface = "default" >
27
27
  <!--FILTERS-->
28
- <!--MAIN-->
29
- <!--ATTRIBUTES-->
28
+ <!--MAIN-->
29
+ <!--ATTRIBUTES-->
30
30
  </Dataset>
31
31
  </Query>
32
32
  EOT
@@ -0,0 +1,100 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+ require 'rbbt/resource'
4
+ require 'net/ftp'
5
+
6
+ module DbSNP
7
+ extend Resource
8
+ self.subdir = "share/databases/dbSNP"
9
+
10
+ URL = "ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/VCF/common_all.vcf.gz"
11
+
12
# Builds the mutations table from the VCF at URL: one row per record, keyed
# by the id column, whose value is chromosome, position and the first
# alternative allele joined with colons.
+ DbSNP.claim DbSNP.mutations, :proc do
13
+ tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation"], :type => :single)
14
# NOTE(review): the handle is never closed in this block.
+ file = Open.open(URL, :nocache => true)
15
+ while line = file.gets do
16
# Skips header lines. Both sides are indexed with [0], so the comparison
# holds whether indexing a String gives an Integer (Ruby 1.8) or a String.
+ next if line[0] == "#"[0]
17
+ chr, position, id, ref, alt = line.split "\t"
18
# Only the first of several comma-separated alternative alleles is kept.
+ alt = alt.split(",").first
19
# When alt starts with the same base as ref, that base is replaced by a
# plus sign (insertion notation).
# NOTE(review): a deletion whose alt is only the shared first base would
# collapse to a bare plus sign here - confirm how deletions should be coded.
+ if alt[0] == ref[0]
20
+ alt[0] = '+'[0]
21
+ end
22
+ mutation = [chr, position, alt] * ":"
23
+
24
# NOTE(review): loop-invariant; the namespace is reassigned on every record
# and is never set when the file has no data lines.
+ tsv.namespace = "Hsa/may2012"
25
+ tsv[id] = mutation
26
+ end
27
+
28
+ tsv.to_s
29
+ end
30
+
31
# Builds mutations_gatk from the dbSNP VCF in the Broad Institute bundle:
# downloads it over FTP to a temp file, then stores for every record the
# mutation string plus the GMAF, G5, G5A and dbSNPBuildID info values.
+ DbSNP.claim DbSNP.mutations_gatk, :proc do
32
# NOTE(review): no ftp.close is visible in this block.
+ ftp = Net::FTP.new('ftp.broadinstitute.org')
33
+ ftp.login('gsapubftp-anonymous', 'devnull@nomail.org')
34
+ ftp.chdir('/bundle/2.3/hg19')
35
+
36
+ tmpfile = TmpFile.tmp_file + '.gz'
37
+ ftp.getbinaryfile('dbsnp_137.hg19.vcf.gz', tmpfile, 1024)
38
+
39
+ tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation", "GMAF", "G5", "G5A", "dbSNP Build ID"], :type => :list)
40
+ file = Open.open(tmpfile, :nocache => true)
41
+ while line = file.gets do
42
+ next if line[0] == "#"[0]
43
+
44
+ chr, position, id, ref, mut, qual, filter, info = line.split "\t"
45
+
46
+ chr.sub!('chr', '')
47
+
48
# Only the first of several comma-separated alternative alleles is kept.
+ mut = mut.split(",").first
49
# Allele encoding, first matching branch wins: a hyphen ref means an
# insertion (plus sign then bases); a hyphen alt means a deletion (one
# hyphen per ref base); both longer than one base gives the ref length in
# hyphens followed by the alt; an alt that extends a single-base ref is an
# insertion of the remaining bases; a single-base alt that starts a longer
# ref is a deletion of the remaining bases; anything else is kept as is.
+ case
50
+ when ref == '-'
51
+ mut = "+" << mut
52
+ when mut == '-'
53
+ mut = "-" * ref.length
54
+ when (mut.length > 1 and ref.length > 1)
55
# The multiplication binds tighter than the append, so the hyphens are
# built first and the alt is appended to them.
+ mut = '-' * ref.length << mut
56
+ when (mut.length > 1 and ref.length == 1 and mut.index(ref) == 0)
57
+ mut = '+' << mut[1..-1]
58
+ when (mut.length == 1 and ref.length > 1 and ref.index(mut) == 0)
59
+ mut = '-' * (ref.length - 1)
60
+ else
61
# NOTE(review): self-assignment, no effect.
+ mut = mut
62
+ end
63
+
64
+ g5 = g5a = dbsnp_build_id = gmaf = nil
65
+
66
+ gmaf = $1 if info =~ /GMAF=([0-9.]+)/
67
+ g5 = true if info =~ /\bG5\b/
68
+ g5a = true if info =~ /\bG5A\b/
69
+ dbsnp_build_id = $1 if info =~ /dbSNPBuildID=(\d+)/
70
+
71
+ mutation = [chr, position, mut] * ":"
72
+
73
# NOTE(review): loop-invariant; reassigned on every record.
+ tsv.namespace = "Hsa/may2012"
74
+
75
+ tsv[id] = [mutation, gmaf, g5, g5a, dbsnp_build_id]
76
+ end
77
+
78
# NOTE(review): the temp file is removed only when the loop completes; an
# exception above would leave it behind.
+ FileUtils.rm tmpfile
79
+
80
+ tsv.to_s
81
+ end
82
+
83
# Builds mutations_hg18 by lifting the Genomic Mutation values of
# DbSNP.mutations over to the Hsa/may2009 build and relabelling the table
# with that namespace.
+ DbSNP.claim DbSNP.mutations_hg18, :proc do
84
+ require 'rbbt/sources/organism'
85
+
86
+ hg19_tsv = DbSNP.mutations.tsv :unnamed => true
87
+
88
+ mutations = hg19_tsv.values
89
+
90
# NOTE(review): the liftOver source build is Hsa/jun2011 while the
# DbSNP.mutations proc labels its table Hsa/may2012 - confirm which build
# the coordinates are in. The block parameter shadows the outer mutations.
+ translations = Misc.process_to_hash(mutations){|mutations| Organism.liftOver(mutations, "Hsa/jun2011", "Hsa/may2009")}
91
+
92
+ tsv = hg19_tsv.process "Genomic Mutation" do |mutation|
93
+ translations[mutation]
94
+ end
95
+
96
+ tsv.namespace = "Hsa/may2009"
97
+
98
+ tsv.to_s
99
+ end
100
+ end