rbbt-sources 1.2.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,9 +1,19 @@
1
+ jun2011:
2
+ - refseq_mrna
3
+ may2010:
4
+ - refseq_mrna
5
+ - agilent_wholegenome
6
+ - agilent_cgh_44b
7
+ - illumina_humanwg_6_v2
8
+ - illumina_humanwg_6_v3
1
9
  may2009:
10
+ - refseq_mrna
2
11
  - agilent_wholegenome
3
12
  - agilent_cgh_44b
4
13
  - illumina_humanwg_6_v2
5
14
  - illumina_humanwg_6_v3
6
15
  dec2007:
16
+ - refseq_mrna
7
17
  - protein_id
8
18
  - affy_hc_g110
9
19
  - affy_hg_u133a_2
@@ -14,6 +24,7 @@ dec2007:
14
24
  - illumina_humanwg_6_v2
15
25
  - illumina_humanwg_6_v3
16
26
  aug2007:
27
+ - refseq_mrna
17
28
  - protein_id
18
29
  - affy_hc_g110
19
30
  - affy_hg_u133a_2
@@ -5,10 +5,53 @@ module COSMIC
5
5
  self.subdir = "share/databases/COSMIC"
6
6
 
7
7
  COSMIC.claim COSMIC.Mutations, :proc do
8
- url = "ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/CosmicMutantExport_v54_120711.tsv"
8
+ url = "ftp://ftp.sanger.ac.uk/pub/CGP/wgs/data_export/CosmicWGS_MutantExport_v61_260912.tsv.gz"
9
9
 
10
- TSV.open(Open.open(url), :header_hash => "", :key_field => "Mutation GRCh37 genome position", :merge => true).to_s
10
+ tsv = TSV.open(Open.open(url), :type => :list, :header_hash => "", :key_field => "Mutation ID", :namespace => "Hsa/jun2011")
11
+ tsv.fields = tsv.fields.collect{|f| f == "Gene name" ? "Associated Gene Name" : f}
12
+ tsv.add_field "Genomic Mutation" do |mid, values|
13
+ position = values["Mutation GRCh37 genome position"]
14
+ cds = values["Mutation CDS"]
15
+ if position.nil? or position.empty?
16
+ nil
17
+ else
18
+ position = position.split("-").first
19
+ if cds.nil?
20
+ position
21
+ else
22
+ change = case
23
+ when cds =~ />/
24
+ cds.split(">").last
25
+ when cds =~ /del/
26
+ deletion = cds.split("del").last
27
+ case
28
+ when deletion =~ /^\d+$/
29
+ "-" * deletion.to_i
30
+ when deletion =~ /^[ACTG]+$/i
31
+ "-" * deletion.length
32
+ else
33
+ Log.debug "Unknown deletion: #{ deletion }"
34
+ deletion
35
+ end
36
+ when cds =~ /ins/
37
+ insertion = cds.split("ins").last
38
+ case
39
+ when insertion =~ /^\d+$/
40
+ "+" + "N" * insertion.to_i
41
+ when insertion =~ /^[NACTG]+$/i
42
+ "+" + insertion
43
+ else
44
+ Log.debug "Unknown insertion: #{insertion }"
45
+ insertion
46
+ end
47
+ else
48
+ Log.debug "Unknown change: #{cds}"
49
+ "?(" << cds << ")"
50
+ end
51
+ position + ":" + change
52
+ end
53
+ end
54
+ end
55
+ tsv.to_s.gsub(/^(\d)/m,'COSM\1').gsub(/(\d)-(\d)/,'\1:\2')
11
56
  end
12
57
  end
13
-
14
- puts COSMIC.Mutations.produce
@@ -0,0 +1,23 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/entity/gene'
3
+ require 'rbbt/tsv'
4
+ require 'rbbt/sources/organism'
5
+
6
# Resource definitions for the HPRD protein-protein interaction database.
module HPRD
  extend Resource
  self.subdir = "share/databases/HPRD"

  # Name of the raw interaction dump both claims below refer to. Kept in a
  # local (captured by the claim blocks) so the literal is written only once.
  interactions_file = "BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt"

  # The raw dump is claimed with a proc that only raises: as the message
  # says, the file has to be downloaded manually, so producing it is an error
  # that tells the user where the file was expected.
  HPRD.claim HPRD[interactions_file], :proc do
    expected_location = HPRD[interactions_file].find
    raise "File #{interactions_file} not found in '#{expected_location}', download manually from http://www.hprd.org/"
  end

  # Re-labels the raw dump: sets the key field, the seven remaining column
  # names and the "Hsa" namespace, then returns the table as text.
  HPRD.claim HPRD.protein_protein, :proc do
    interactions = HPRD[interactions_file].tsv

    interactions.key_field = "Associated Gene Name 1"
    interactions.fields = [
      "HPRD id 1",
      "RefSeq Protein ID 1",
      "Associated Gene Name 2",
      "HPRD id 2",
      "RefSeq Protein ID 2",
      "Experiment type",
      "PMID"
    ]
    interactions.namespace = "Hsa"

    interactions.to_s
  end
end
@@ -1,17 +1,107 @@
1
1
  require 'rbbt-util'
2
+ require 'rbbt/entity/gene'
3
+ require 'rbbt/tsv'
4
+ require 'rbbt/sources/organism'
5
+
2
6
  module InterPro
3
7
  extend Resource
4
8
  self.subdir = "share/databases/InterPro"
5
9
 
6
- InterPro.claim InterPro.root.find, :rake, Rbbt.share.install.InterPro.Rakefile.find(:lib)
10
+ InterPro.claim InterPro.source.protein2ipr.find, :url, "ftp://ftp.ebi.ac.uk/pub/databases/interpro/protein2ipr.dat.gz"
11
+
12
+ InterPro.claim InterPro.protein_domains.find, :proc do
13
+ organism = "Hsa"
14
+ uniprot_colum = TSV::Parser.new(Organism.protein_identifiers(organism).open).all_fields.index("UniProt/SwissProt Accession")
15
+ uniprots = CMD.cmd("grep -v '^#'|cut -f #{uniprot_colum+1}", :in => Organism.protein_identifiers(organism).open).read.split("\n").collect{|l| l.split("|")}.flatten.uniq.reject{|l| l.empty?}
16
+
17
+ tsv = nil
18
+ TmpFile.with_file(uniprots * "\n") do |tmpfile|
19
+ tsv = TSV.open(CMD.cmd("cut -f 1,2,5,6 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true), :merge => true, :type => :double)
20
+ end
21
+
22
+ tsv.key_field = "UniProt/SwissProt Accession"
23
+ tsv.fields = ["InterPro ID", "Domain Start AA", "Domain End AA"]
24
+ tsv.to_s
25
+ end
26
+
27
+ InterPro.claim InterPro.domain_names.find, :proc do
28
+ #tsv = InterPro.source.protein2ipr.tsv :key_field => 1, :fields => [2], :type => :single
29
+ tsv = TSV.open(CMD.cmd("cut -f 2,3 | sort -u", :in => InterPro.source.protein2ipr.open, :pipe => true), :merge => true, :type => :single)
30
+
31
+ tsv.key_field = "InterPro ID"
32
+ tsv.fields = ["Domain Name"]
33
+ tsv.to_s
34
+ end
35
+
36
+ def self.name_index
37
+ @@name_index ||= InterPro.domain_names.tsv(:persist => true, :unnamed => true)
38
+ end
39
+
40
+ def self.gene_index
41
+ @@gene_index ||= InterPro.protein_domains.tsv(:persist => true, :key_field => "InterPro ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true, :unnamed => true)
42
+ end
43
+
44
+ def self.domain_index
45
+ @@domain_index ||= InterPro.protein_domains.tsv(:persist => true, :unnamed => true, :key_field => "UniProt/SwissProt Accession", :fields => ["InterPro ID"], :merge => true)
46
+ end
47
+
48
+ def self.domain_position_index
49
+ @@domain_position_index ||= InterPro.protein_domains.tsv(:persist => true, :unnamed => true, :key_field => "UniProt/SwissProt Accession", :fields => ["InterPro ID", "Domain Start AA", "Domain End AA"], :type => :double, :merge => true)
50
+ end
51
+
52
+ def self.ens2uniprot(organism)
53
+ @@ens2uniprot_index ||= {}
54
+ @@ens2uniprot_index[organism] ||= Organism.protein_identifiers(organism).tsv(:persist => true, :unnamed => true, :fields => ["UniProt/SwissProt Accession"], :key_field => "Ensembl Protein ID", :type => :double, :merge => true)
55
+ end
56
+
57
+ end
58
+
59
+ if defined? Entity
60
+ module InterProDomain
61
+ extend Entity
62
+ self.format = "InterPro ID"
63
+
64
+ self.annotation :organism
65
+ property :description => :array2single do
66
+ InterPro.name_index.values_at *self
67
+ end
68
+
69
+ property :name => :array2single do
70
+ InterPro.name_index.values_at *self
71
+ end
72
+
73
+ property :proteins => :array2single do
74
+ InterPro.gene_index.values_at(*self).
75
+ collect{|genes| genes = genes.uniq; genes.organism = organism if genes.respond_to? :organism; genes }.tap{|o| Protein.setup(o, "UniProt/SwissProt Accession", organism)}
76
+ end
77
+
78
+ property :genes => :array2single do
79
+ InterPro.gene_index.values_at(*self).
80
+ collect{|genes| genes = genes.uniq; genes.organism = organism if genes.respond_to? :organism; genes }.tap{|o| Gene.setup(o, "UniProt/SwissProt Accession", organism)}
81
+ end
82
+ end
83
+
84
+ if defined? Protein and Entity === Protein
85
+ module Protein
86
+ property :interpro_domains => :array2single do
87
+ self.collect do |protein|
88
+ uniprot = (InterPro.ens2uniprot(protein.organism)[protein] || []).flatten
89
+ uniprot.empty? ? nil :
90
+ InterPro.domain_index.values_at(*uniprot).compact.flatten. each{|pth| pth.organism = organism if pth.respond_to? :organism }.uniq.tap{|o| InterProDomain.setup(o, organism)}
91
+ end
92
+ end
7
93
 
8
- def self.tsv(*args)
9
- old_url = BioMart::BIOMART_URL
10
- begin
11
- BioMart::BIOMART_URL.replace "http://www.ebi.ac.uk/interpro/biomart/martservice?query="
12
- BioMart.tsv(*args)
13
- ensure
14
- BioMart::BIOMART_URL.replace old_url
94
+ property :interpro_domain_positions => :array2single do
95
+ self.collect do |protein|
96
+ if protein.nil?
97
+ [].tap{|o| InterProDomain.setup(o, organism)}
98
+ else
99
+ uniprot = (InterPro.ens2uniprot(protein.organism)[protein] || []).flatten
100
+ uniprot.empty? ? nil :
101
+ InterPro.domain_position_index.values_at(*uniprot).compact.flatten(1).tap{|o| InterProDomain.setup(o, organism)}
102
+ end
103
+ end
104
+ end
15
105
  end
16
106
  end
17
107
  end
@@ -34,7 +34,8 @@ if defined? Entity
34
34
  end
35
35
 
36
36
  property :genes => :array2single do
37
- @genes ||= NCINaturePathway.gene_index.values_at *self
37
+ @genes ||= NCINaturePathway.gene_index.values_at(*self).
38
+ each{|gene| gene.organism = organism if gene.respond_to? :organism }
38
39
  end
39
40
  end
40
41
 
@@ -65,7 +66,8 @@ if defined? Entity
65
66
  end
66
67
 
67
68
  property :genes => :array2single do
68
- @genes ||= NCIReactomePathway.gene_index.values_at *self
69
+ @genes ||= NCIReactomePathway.gene_index.values_at(*self).
70
+ each{|gene| gene.organism = organism if gene.respond_to? :organism }
69
71
  end
70
72
  end
71
73
 
@@ -97,7 +99,7 @@ if defined? Entity
97
99
 
98
100
  property :genes => :array2single do
99
101
  @genes ||= NCIBioCartaPathway.gene_index.values_at(*self).
100
- each{|pth| pth.organism = organism if pth.respond_to? :organism }
102
+ each{|gene| gene.organism = organism if gene.respond_to? :organism }
101
103
  end
102
104
  end
103
105
 
@@ -109,8 +111,8 @@ if defined? Entity
109
111
  each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCINaturePathway.setup(o, organism)}
110
112
  end
111
113
 
112
- property :reactome_pathways => :array2single do
113
- @reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at(*self.to("UniProt/SwissProt Accession")).
114
+ property :nci_reactome_pathways => :array2single do
115
+ @nci_reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at(*self.to("UniProt/SwissProt Accession")).
114
116
  each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCIReactomePathway.setup(o, organism)}
115
117
  end
116
118
 
@@ -0,0 +1,41 @@
1
+ require 'rbbt'
2
+ require 'rbbt/resource'
3
+ require 'rbbt/tsv'
4
+
5
# Resource definitions for the PSI-MI controlled vocabulary.
module PSI_MI
  extend Resource
  self.subdir = Rbbt.share.databases.PSI_MI

  # Remote OBO file the identifiers table is built from.
  URL="http://psidev.cvs.sourceforge.net/viewvc/psidev/psi/mi/rel25/data/psi-mi25.obo"

  # Builds a list-type table keyed by term id with "Name" and "Description"
  # columns, one row per "[Term]" stanza of the OBO file, and returns it as
  # text.
  PSI_MI.claim PSI_MI.identifiers, :proc do
    tsv = TSV.setup({}, :type => :list, :key_field => "PSI-MI Term", :fields => ["Name", "Description"])
    Open.open(URL).read.split("[Term]").each do |chunk|
      # String#[] with a capture index yields the captured String (or nil).
      # The previous `scan(...)[0]` returned a one-element Array because the
      # pattern has a capture group, so keys and values ended up as Arrays.
      id = chunk[/id: ([^\n]*)/, 1]

      # The text before the first "[Term]" marker (the file header) has no
      # id line; skip it instead of storing a row under a nil key.
      next if id.nil?

      name = chunk[/name: ([^\n]*)/, 1]
      description = chunk[/def: "([^\n]*)"/, 1]
      tsv[id] = [name, description]
    end
    tsv.to_s
  end
end
22
+
23
+
24
# Entity support is optional: the term entity is only declared when the
# Entity framework has already been loaded.
if defined? Entity
  require 'rbbt/entity/gene'
  require 'rbbt/entity/interactor'
  require 'rbbt/sources/PSI_MI'

  # Entity wrapper for identifiers in the "PSI-MI Term" format.
  module PSI_MITerm
    extend Entity

    self.format = "PSI-MI Term"

    # Looks up the "Name" column of the identifiers table for every term in
    # the collection. The table is loaded once and memoized in @@index.
    property :name => :array2single do
      @@index ||= PSI_MI.identifiers.tsv(
        :persist => true,
        :fields => ["Name"],
        :type => :single
      )

      terms = self
      @@index.values_at(*terms)
    end

  end
end
41
+
@@ -0,0 +1,92 @@
1
+ require 'rbbt/resource'
2
+ require 'rbbt/sources/organism'
3
+
4
# Resource definitions for the STITCH chemical-protein interaction database
# (v3.1 download files).
module STITCH
  extend Resource
  self.subdir = "share/databases/STITCH"

  # Raw download files, fetched by URL.
  STITCH.claim STITCH.source.chemical_chemical.find, :url, "http://stitch.embl.de/download/chemical_chemical.links.detailed.v3.1.tsv.gz"
  STITCH.claim STITCH.source.protein_chemical.find, :url, "http://stitch.embl.de/download/protein_chemical.links.detailed.v3.1.tsv.gz"
  STITCH.claim STITCH.source.actions.find, :url, "http://stitch.embl.de/download/actions.v3.1.tsv.gz"
  STITCH.claim STITCH.source.aliases.find, :url, "http://stitch.embl.de/download/chemical.aliases.v3.1.tsv.gz"
  STITCH.claim STITCH.source.sources.find, :url, "http://stitch.embl.de/download/chemical.sources.v3.1.tsv.gz"

  # One chemical-protein table per installable organism: keeps the rows of
  # the protein_chemical file whose second column starts with one of the
  # organism's taxonomy ids and strips that "<taxid>." prefix.
  Organism.installable_organisms.each do |organism|
    STITCH.claim STITCH.chemical_protein(organism), :proc do
      taxids = Organism.entrez_taxids(organism).read.split("\n")

      # Basic-regex group matching "<taxid>." for any of the taxids. Both
      # grep and sed now share this pattern: the sed expression used to join
      # the ids with a bare "|", which is a literal character in a basic
      # regex, so the prefix was not removed when there were several taxids.
      # NOTE(review): "\|" alternation is a GNU extension to basic regexes.
      taxid_prefix = "\\(" + taxids * '\|' + "\\)\\."

      tsv = TSV.open(CMD.cmd("grep '\t#{taxid_prefix}' | sed 's/#{taxid_prefix}//'", :in => STITCH.source.protein_chemical.open, :pipe => true), :type => :double, :merge => true)
      tsv.key_field = "Chemical CID"
      tsv.fields = ["Ensembl Gene ID", "experimental", "database", "textmining", "combined_score"]
      tsv.to_s
    end
  end


  # Pivots the aliases file (chemical, alias, source) into one row per
  # chemical with one column per source; columns are created in order of
  # first appearance of each source.
  STITCH.claim STITCH.identifiers, :proc do
    identifiers = {}
    fields = []
    header_skipped = false
    STITCH.source.aliases.read do |line|
      # The first line yielded is skipped (presumably the column header).
      unless header_skipped
        header_skipped = true
        next
      end

      # chomp so a trailing newline never becomes part of the source name.
      cid, code, source = line.chomp.split("\t")

      pos = fields.index source
      if pos.nil?
        fields << source
        pos = fields.length - 1
      end

      # The table is declared :double below, so every cell holds a list of
      # codes; repeated aliases from one source accumulate rather than
      # overwrite each other.
      identifiers[cid] ||= []
      (identifiers[cid][pos] ||= []) << code
    end

    # Give every row one (possibly empty) list per column.
    identifiers.keys.each do |cid|
      codes = identifiers[cid]
      identifiers[cid] = Array.new(fields.length) { |i| codes[i] || [] }
    end

    # key_field is a plain String, as in every other table of this module.
    TSV.setup(identifiers, :key_field => "Chemical CID", :fields => fields, :type => :double)

    identifiers.to_s
  end

  # Aliases restricted with :grep => "IUPAC", keeping field 1, relabelled as
  # a "Chemical CID" => "IUPAC" table.
  STITCH.claim STITCH.iupac, :proc do
    tsv = STITCH.source.aliases.tsv :type => :double, :merge => true, :grep => "IUPAC", :fields => [1]
    tsv.key_field = "Chemical CID"
    tsv.fields = ["IUPAC"]

    tsv.to_s
  end

  # Same as the IUPAC table but restricted with :grep => "DrugBank".
  STITCH.claim STITCH.drug_bank, :proc do
    tsv = STITCH.source.aliases.tsv :type => :double, :merge => true, :grep => "DrugBank", :fields => [1]
    tsv.key_field = "Chemical CID"
    tsv.fields = ["DrugBank ID"]

    tsv.to_s
  end

end
67
+
68
# The Compound entity is only declared when the Entity framework is loaded.
if defined? Entity
  # Entity wrapper for chemical identifiers; default format "Chemical CID".
  module Compound
    extend Entity

    self.annotation :format
    self.format = "Chemical CID"

    # Placeholder property: the body computes nothing and evaluates to nil.
    # TODO: implement the IUPAC lookup.
    property(:iupac => :array2single) do
      nil
    end

  end
end
81
+
82
# When the file is run directly, produce every STITCH resource: the five raw
# downloads first, then the derived tables, in the same order as before.
if $0 == __FILE__
  raw_downloads = [:chemical_chemical, :protein_chemical, :actions, :aliases, :sources]
  raw_downloads.each do |download|
    STITCH.source.send(download).produce
  end

  derived_tables = [
    STITCH.chemical_protein("Hsa"),
    STITCH.iupac,
    STITCH.drug_bank,
    STITCH.identifiers
  ]
  derived_tables.each do |table|
    table.produce
  end
end
@@ -18,6 +18,3 @@ module Barcode
18
18
  end
19
19
 
20
20
  end
21
-
22
-
23
- Barcode.transcriptome.produce
@@ -21,12 +21,12 @@ module BioMart
21
21
 
22
22
  @@biomart_query_xml = <<-EOT
23
23
  <?xml version="1.0" encoding="UTF-8"?>
24
- <!DOCTYPE Query>
24
+ <!DOCTYPE Query>
25
25
  <Query completionStamp="1" virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
26
26
  <Dataset name = "<!--DATABASE-->" interface = "default" >
27
27
  <!--FILTERS-->
28
- <!--MAIN-->
29
- <!--ATTRIBUTES-->
28
+ <!--MAIN-->
29
+ <!--ATTRIBUTES-->
30
30
  </Dataset>
31
31
  </Query>
32
32
  EOT
@@ -0,0 +1,100 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+ require 'rbbt/resource'
4
+ require 'net/ftp'
5
+
6
# Resource definitions for dbSNP variant tables built from VCF files.
module DbSNP
  extend Resource
  self.subdir = "share/databases/dbSNP"

  # VCF file the DbSNP.mutations table is built from.
  URL = "ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/VCF/common_all.vcf.gz"

  # Single-value table "RS ID" => "chr:position:allele" read from URL.
  # Only the first alternative allele of each record is used.
  DbSNP.claim DbSNP.mutations, :proc do
    tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation"], :type => :single)
    # The namespace is the same for every record, so it is set once here
    # rather than on each loop iteration.
    tsv.namespace = "Hsa/may2012"

    file = Open.open(URL, :nocache => true)
    while line = file.gets do
      # `"#"[0]` keeps the comparison valid whether String#[] returns a
      # character code or a one-character string.
      next if line[0] == "#"[0]
      chr, position, id, ref, alt = line.split "\t"
      alt = alt.split(",").first
      # When the alternative allele starts with the same base as the
      # reference, that first base is replaced with '+'.
      # NOTE(review): this also fires when ref is longer than alt, which
      # looks like a deletion rather than an insertion — confirm intent.
      if alt[0] == ref[0]
        alt[0] = '+'[0]
      end

      tsv[id] = [chr, position, alt] * ":"
    end

    tsv.to_s
  end

  # List-type table built from the dbsnp_137 hg19 VCF of the Broad bundle:
  # genomic mutation plus the GMAF, G5, G5A and dbSNPBuildID INFO entries.
  DbSNP.claim DbSNP.mutations_gatk, :proc do
    tmpfile = TmpFile.tmp_file + '.gz'

    begin
      # The block form closes the FTP connection when the download finishes
      # or fails; it was previously left open.
      Net::FTP.open('ftp.broadinstitute.org') do |ftp|
        ftp.login('gsapubftp-anonymous', 'devnull@nomail.org')
        ftp.chdir('/bundle/2.3/hg19')
        ftp.getbinaryfile('dbsnp_137.hg19.vcf.gz', tmpfile, 1024)
      end

      tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation", "GMAF", "G5", "G5A", "dbSNP Build ID"], :type => :list)
      tsv.namespace = "Hsa/may2012"

      file = Open.open(tmpfile, :nocache => true)
      while line = file.gets do
        next if line[0] == "#"[0]

        chr, position, id, ref, mut, qual, filter, info = line.split "\t"

        chr.sub!('chr', '')

        # Encode the first alternative allele: '+' prefixes inserted bases,
        # one '-' stands for each deleted base. Alleles matching none of the
        # cases are kept as they are.
        mut = mut.split(",").first
        case
        when ref == '-'
          mut = "+" + mut
        when mut == '-'
          mut = "-" * ref.length
        when (mut.length > 1 and ref.length > 1)
          mut = '-' * ref.length + mut
        when (mut.length > 1 and ref.length == 1 and mut.index(ref) == 0)
          mut = '+' + mut[1..-1]
        when (mut.length == 1 and ref.length > 1 and ref.index(mut) == 0)
          mut = '-' * (ref.length - 1)
        end

        g5 = g5a = dbsnp_build_id = gmaf = nil

        gmaf = $1 if info =~ /GMAF=([0-9.]+)/
        g5 = true if info =~ /\bG5\b/
        g5a = true if info =~ /\bG5A\b/
        dbsnp_build_id = $1 if info =~ /dbSNPBuildID=(\d+)/

        mutation = [chr, position, mut] * ":"

        tsv[id] = [mutation, gmaf, g5, g5a, dbsnp_build_id]
      end

      tsv.to_s
    ensure
      # Remove the downloaded copy even when the download or parsing raised.
      FileUtils.rm tmpfile if File.exist?(tmpfile)
    end
  end

  # Copy of DbSNP.mutations with every genomic mutation translated by
  # Organism.liftOver to the "Hsa/may2009" build.
  DbSNP.claim DbSNP.mutations_hg18, :proc do
    require 'rbbt/sources/organism'

    hg19_tsv = DbSNP.mutations.tsv :unnamed => true

    mutations = hg19_tsv.values

    # NOTE(review): the source table is tagged "Hsa/may2012" but the lift
    # over starts from "Hsa/jun2011" — presumably the same assembly; confirm.
    translations = Misc.process_to_hash(mutations){|list| Organism.liftOver(list, "Hsa/jun2011", "Hsa/may2009")}

    tsv = hg19_tsv.process "Genomic Mutation" do |mutation|
      translations[mutation]
    end

    tsv.namespace = "Hsa/may2009"

    tsv.to_s
  end
end