rbbt-sources 1.2.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/etc/biomart/missing_in_archive +11 -0
- data/lib/rbbt/sources/COSMIC.rb +47 -4
- data/lib/rbbt/sources/HPRD.rb +23 -0
- data/lib/rbbt/sources/InterPro.rb +98 -8
- data/lib/rbbt/sources/NCI.rb +7 -5
- data/lib/rbbt/sources/PSI_MI.rb +41 -0
- data/lib/rbbt/sources/STITCH.rb +92 -0
- data/lib/rbbt/sources/barcode.rb +0 -3
- data/lib/rbbt/sources/biomart.rb +3 -3
- data/lib/rbbt/sources/dbSNP.rb +100 -0
- data/lib/rbbt/sources/ensembl_ftp.rb +79 -0
- data/lib/rbbt/sources/entrez.rb +2 -2
- data/lib/rbbt/sources/genomes1000.rb +45 -0
- data/lib/rbbt/sources/go.rb +16 -4
- data/lib/rbbt/sources/organism.rb +80 -12
- data/lib/rbbt/sources/pfam.rb +63 -3
- data/lib/rbbt/sources/pubmed.rb +10 -3
- data/lib/rbbt/sources/reactome.rb +82 -0
- data/lib/rbbt/sources/tfacts.rb +37 -36
- data/lib/rbbt/sources/uniprot.rb +25 -23
- data/share/Ensembl/release_dates +18 -0
- data/share/install/Genomes1000/Rakefile +15 -0
- data/share/install/JoChem/Rakefile +11 -3
- data/share/install/NCI/Rakefile +54 -16
- data/share/install/Organism/Hsa/Rakefile +3 -2
- data/share/install/Organism/Rno/Rakefile +1 -2
- data/share/install/Organism/Sce/Rakefile +43 -45
- data/share/install/Organism/organism_helpers.rb +360 -96
- data/share/install/STITCH/Rakefile +0 -0
- data/test/rbbt/sources/test_organism.rb +26 -7
- data/test/rbbt/sources/test_pubmed.rb +5 -0
- metadata +94 -97
- data/share/install/InterPro/Rakefile +0 -29
@@ -1,9 +1,19 @@
|
|
1
|
+
jun2011:
|
2
|
+
- refseq_mrna
|
3
|
+
may2010:
|
4
|
+
- refseq_mrna
|
5
|
+
- agilent_wholegenome
|
6
|
+
- agilent_cgh_44b
|
7
|
+
- illumina_humanwg_6_v2
|
8
|
+
- illumina_humanwg_6_v3
|
1
9
|
may2009:
|
10
|
+
- refseq_mrna
|
2
11
|
- agilent_wholegenome
|
3
12
|
- agilent_cgh_44b
|
4
13
|
- illumina_humanwg_6_v2
|
5
14
|
- illumina_humanwg_6_v3
|
6
15
|
dec2007:
|
16
|
+
- refseq_mrna
|
7
17
|
- protein_id
|
8
18
|
- affy_hc_g110
|
9
19
|
- affy_hg_u133a_2
|
@@ -14,6 +24,7 @@ dec2007:
|
|
14
24
|
- illumina_humanwg_6_v2
|
15
25
|
- illumina_humanwg_6_v3
|
16
26
|
aug2007:
|
27
|
+
- refseq_mrna
|
17
28
|
- protein_id
|
18
29
|
- affy_hc_g110
|
19
30
|
- affy_hg_u133a_2
|
data/lib/rbbt/sources/COSMIC.rb
CHANGED
@@ -5,10 +5,53 @@ module COSMIC
|
|
5
5
|
self.subdir = "share/databases/COSMIC"
|
6
6
|
|
7
7
|
COSMIC.claim COSMIC.Mutations, :proc do
  # Produces the mutations table from the COSMIC whole-genome export and adds
  # a "Genomic Mutation" column derived from the GRCh37 position and the CDS
  # change description of each row.
  export_url = "ftp://ftp.sanger.ac.uk/pub/CGP/wgs/data_export/CosmicWGS_MutantExport_v61_260912.tsv.gz"

  tsv = TSV.open(
    Open.open(export_url),
    :type => :list,
    :header_hash => "",
    :key_field => "Mutation ID",
    :namespace => "Hsa/jun2011"
  )

  # Rename the gene column; every other field name is kept.
  tsv.fields = tsv.fields.collect do |field|
    field == "Gene name" ? "Associated Gene Name" : field
  end

  tsv.add_field "Genomic Mutation" do |mid, values|
    position = values["Mutation GRCh37 genome position"]
    cds = values["Mutation CDS"]

    # No genomic position: the new field is left empty.
    next nil if position.nil? or position.empty?

    # Only the part before the first "-" of the position is used.
    start = position.split("-").first

    # Without a CDS description only the position is reported.
    next start if cds.nil?

    change = case cds
             when />/
               # Substitution: keep what follows the last ">".
               cds.split(">").last
             when /del/
               deleted = cds.split("del").last
               case deleted
               when /^\d+$/
                 # Deletion given as a count: one "-" per deleted base.
                 "-" * deleted.to_i
               when /^[ACTG]+$/i
                 # Deletion given as bases: one "-" per listed base.
                 "-" * deleted.length
               else
                 Log.debug "Unknown deletion: #{deleted}"
                 deleted
               end
             when /ins/
               inserted = cds.split("ins").last
               case inserted
               when /^\d+$/
                 # Insertion given as a count: "+" followed by that many "N".
                 "+" + "N" * inserted.to_i
               when /^[NACTG]+$/i
                 # Insertion given as bases: "+" followed by the bases.
                 "+" + inserted
               else
                 Log.debug "Unknown insertion: #{inserted}"
                 inserted
               end
             else
               Log.debug "Unknown change: #{cds}"
               "?(" + cds + ")"
             end

    start + ":" + change
  end

  # Prefix lines that start with a digit with "COSM" and turn digit-dash-digit
  # into digit-colon-digit across the whole serialized table.
  serialized = tsv.to_s
  serialized = serialized.gsub(/^(\d)/m,'COSM\1')
  serialized.gsub(/(\d)-(\d)/,'\1:\2')
end
|
12
57
|
end
|
13
|
-
|
14
|
-
puts COSMIC.Mutations.produce
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/entity/gene'
|
3
|
+
require 'rbbt/tsv'
|
4
|
+
require 'rbbt/sources/organism'
|
5
|
+
|
6
|
+
# Resource definitions for the HPRD protein-protein interaction database.
module HPRD
  extend Resource
  self.subdir = "share/databases/HPRD"

  # Name of the raw HPRD export both claims below rely on.
  interactions_file = "BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt"

  # The raw file is not fetched here: producing it raises with instructions
  # to download it manually.
  HPRD.claim HPRD[interactions_file], :proc do
    raise "File #{interactions_file} not found in '#{HPRD[interactions_file].find}', download manually from http://www.hprd.org/"
  end

  # Re-labels the raw interaction file with key, field names and namespace,
  # and returns it serialized.
  HPRD.claim HPRD.protein_protein, :proc do
    HPRD[interactions_file].tsv.tap do |tsv|
      tsv.key_field = "Associated Gene Name 1"
      tsv.fields = [
        "HPRD id 1",
        "RefSeq Protein ID 1",
        "Associated Gene Name 2",
        "HPRD id 2",
        "RefSeq Protein ID 2",
        "Experiment type",
        "PMID"
      ]
      tsv.namespace = "Hsa"
    end.to_s
  end
end
|
@@ -1,17 +1,107 @@
|
|
1
1
|
require 'rbbt-util'
|
2
|
+
require 'rbbt/entity/gene'
|
3
|
+
require 'rbbt/tsv'
|
4
|
+
require 'rbbt/sources/organism'
|
5
|
+
|
2
6
|
# Resource definitions and cached lookup tables for InterPro protein domains.
module InterPro
  extend Resource
  self.subdir = "share/databases/InterPro"

  InterPro.claim InterPro.source.protein2ipr.find, :url, "ftp://ftp.ebi.ac.uk/pub/databases/interpro/protein2ipr.dat.gz"

  # Domain assignments (InterPro ID, start, end) keyed by UniProt accession,
  # restricted to the accessions listed in the organism's protein identifiers.
  InterPro.claim InterPro.protein_domains.find, :proc do
    organism = "Hsa"
    accession_field = "UniProt/SwissProt Accession"

    # Locate the accession column in the identifier file header.
    header_fields = TSV::Parser.new(Organism.protein_identifiers(organism).open).all_fields
    uniprot_column = header_fields.index(accession_field)

    # Extract that column (cut is 1-based) and split multi-valued cells.
    column_text = CMD.cmd("grep -v '^#'|cut -f #{uniprot_column + 1}", :in => Organism.protein_identifiers(organism).open).read
    uniprots = column_text.split("\n").
      collect{|cell| cell.split("|")}.
      flatten.
      uniq.
      reject{|accession| accession.empty?}

    tsv = nil
    TmpFile.with_file(uniprots * "\n") do |tmpfile|
      # Keep only protein2ipr rows that match one of the known accessions.
      filtered = CMD.cmd("cut -f 1,2,5,6 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true)
      tsv = TSV.open(filtered, :merge => true, :type => :double)
    end

    tsv.key_field = accession_field
    tsv.fields = ["InterPro ID", "Domain Start AA", "Domain End AA"]
    tsv.to_s
  end

  # InterPro ID => domain name, taken from columns 2 and 3 of protein2ipr.
  InterPro.claim InterPro.domain_names.find, :proc do
    #tsv = InterPro.source.protein2ipr.tsv :key_field => 1, :fields => [2], :type => :single
    id_and_name = CMD.cmd("cut -f 2,3 | sort -u", :in => InterPro.source.protein2ipr.open, :pipe => true)
    tsv = TSV.open(id_and_name, :merge => true, :type => :single)

    tsv.key_field = "InterPro ID"
    tsv.fields = ["Domain Name"]
    tsv.to_s
  end

  # Memoized table of domain names.
  def self.name_index
    @@name_index ||= InterPro.domain_names.tsv(:persist => true, :unnamed => true)
  end

  # Memoized table: InterPro ID => UniProt accessions.
  def self.gene_index
    @@gene_index ||= InterPro.protein_domains.tsv(
      :persist => true,
      :key_field => "InterPro ID",
      :fields => ["UniProt/SwissProt Accession"],
      :type => :flat,
      :merge => true,
      :unnamed => true
    )
  end

  # Memoized table: UniProt accession => InterPro IDs.
  def self.domain_index
    @@domain_index ||= InterPro.protein_domains.tsv(
      :persist => true,
      :unnamed => true,
      :key_field => "UniProt/SwissProt Accession",
      :fields => ["InterPro ID"],
      :merge => true
    )
  end

  # Memoized table: UniProt accession => InterPro IDs with start/end positions.
  def self.domain_position_index
    @@domain_position_index ||= InterPro.protein_domains.tsv(
      :persist => true,
      :unnamed => true,
      :key_field => "UniProt/SwissProt Accession",
      :fields => ["InterPro ID", "Domain Start AA", "Domain End AA"],
      :type => :double,
      :merge => true
    )
  end

  # Per-organism memoized table: Ensembl Protein ID => UniProt accessions.
  def self.ens2uniprot(organism)
    @@ens2uniprot_index ||= {}
    @@ens2uniprot_index[organism] ||= Organism.protein_identifiers(organism).tsv(
      :persist => true,
      :unnamed => true,
      :fields => ["UniProt/SwissProt Accession"],
      :key_field => "Ensembl Protein ID",
      :type => :double,
      :merge => true
    )
  end

end
|
58
|
+
|
59
|
+
if defined? Entity
|
60
|
+
# Entity wrapper for InterPro domain identifiers.
module InterProDomain
  extend Entity
  self.format = "InterPro ID"

  self.annotation :organism

  # Looked up in InterPro.name_index (same lookup as :name).
  property :description => :array2single do
    InterPro.name_index.values_at(*self)
  end

  # Domain name for each InterPro ID.
  property :name => :array2single do
    InterPro.name_index.values_at(*self)
  end

  # UniProt accessions annotated with each domain, set up as Protein entities.
  property :proteins => :array2single do
    accession_lists = InterPro.gene_index.values_at(*self).collect do |accessions|
      unique = accessions.uniq
      unique.organism = organism if unique.respond_to? :organism
      unique
    end
    Protein.setup(accession_lists, "UniProt/SwissProt Accession", organism)
    accession_lists
  end

  # Same accessions as :proteins, but set up as Gene entities.
  property :genes => :array2single do
    accession_lists = InterPro.gene_index.values_at(*self).collect do |accessions|
      unique = accessions.uniq
      unique.organism = organism if unique.respond_to? :organism
      unique
    end
    Gene.setup(accession_lists, "UniProt/SwissProt Accession", organism)
    accession_lists
  end
end
|
83
|
+
|
84
|
+
if defined? Protein and Entity === Protein
  # Adds InterPro lookups to Protein entities when that entity is loaded.
  module Protein
    # InterPro domain IDs for each protein, set up as InterProDomain entities.
    #
    # Each protein is translated to UniProt accessions through
    # InterPro.ens2uniprot. A protein with no accession yields nil; a nil
    # protein yields an empty InterProDomain list.
    property :interpro_domains => :array2single do
      self.collect do |protein|
        if protein.nil?
          # Same nil handling as interpro_domain_positions below; without it
          # the call to protein.organism raises NoMethodError.
          [].tap{|o| InterProDomain.setup(o, organism)}
        else
          uniprot = (InterPro.ens2uniprot(protein.organism)[protein] || []).flatten
          if uniprot.empty?
            nil
          else
            InterPro.domain_index.values_at(*uniprot).compact.flatten.
              each{|pth| pth.organism = organism if pth.respond_to? :organism }.
              uniq.
              tap{|o| InterProDomain.setup(o, organism)}
          end
        end
      end
    end

    # Entries from InterPro.domain_position_index for each protein, flattened
    # one level and set up as InterProDomain.
    #
    # A protein with no accession yields nil; a nil protein yields an empty
    # InterProDomain list.
    property :interpro_domain_positions => :array2single do
      self.collect do |protein|
        if protein.nil?
          [].tap{|o| InterProDomain.setup(o, organism)}
        else
          uniprot = (InterPro.ens2uniprot(protein.organism)[protein] || []).flatten
          if uniprot.empty?
            nil
          else
            InterPro.domain_position_index.values_at(*uniprot).compact.flatten(1).
              tap{|o| InterProDomain.setup(o, organism)}
          end
        end
      end
    end
  end
end
|
17
107
|
end
|
data/lib/rbbt/sources/NCI.rb
CHANGED
@@ -34,7 +34,8 @@ if defined? Entity
|
|
34
34
|
end
|
35
35
|
|
36
36
|
# Entries of NCINaturePathway.gene_index for each pathway, tagged with the
# organism when they accept one. Memoized in @genes.
property :genes => :array2single do
  @genes ||= begin
    gene_lists = NCINaturePathway.gene_index.values_at(*self)
    gene_lists.each do |gene_list|
      gene_list.organism = organism if gene_list.respond_to? :organism
    end
  end
end
|
39
40
|
end
|
40
41
|
|
@@ -65,7 +66,8 @@ if defined? Entity
|
|
65
66
|
end
|
66
67
|
|
67
68
|
# Entries of NCIReactomePathway.gene_index for each pathway, tagged with the
# organism when they accept one. Memoized in @genes.
property :genes => :array2single do
  @genes ||= begin
    gene_lists = NCIReactomePathway.gene_index.values_at(*self)
    gene_lists.each do |gene_list|
      gene_list.organism = organism if gene_list.respond_to? :organism
    end
  end
end
|
70
72
|
end
|
71
73
|
|
@@ -97,7 +99,7 @@ if defined? Entity
|
|
97
99
|
|
98
100
|
# Entries of NCIBioCartaPathway.gene_index for each pathway, tagged with the
# organism when they accept one. Memoized in @genes.
property :genes => :array2single do
  @genes ||= begin
    gene_lists = NCIBioCartaPathway.gene_index.values_at(*self)
    gene_lists.each do |gene_list|
      gene_list.organism = organism if gene_list.respond_to? :organism
    end
  end
end
|
102
104
|
end
|
103
105
|
|
@@ -109,8 +111,8 @@ if defined? Entity
|
|
109
111
|
each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCINaturePathway.setup(o, organism)}
|
110
112
|
end
|
111
113
|
|
112
|
-
property :
|
113
|
-
@
|
114
|
+
# NCI Reactome pathway IDs for each gene, looked up through its
# UniProt/SwissProt accession and set up as NCIReactomePathway entities.
# Memoized in @nci_reactome_pathways.
property :nci_reactome_pathways => :array2single do
  @nci_reactome_pathways ||= begin
    pathway_index = NCI.reactome_pathways.tsv(
      :persist => true,
      :key_field => "UniProt/SwissProt Accession",
      :fields => ["NCI Reactome Pathway ID"],
      :type => :flat,
      :merge => true
    )

    pathway_lists = pathway_index.values_at(*self.to("UniProt/SwissProt Accession"))
    pathway_lists.each do |pathways|
      pathways.organism = organism if pathways.respond_to? :organism
    end

    NCIReactomePathway.setup(pathway_lists, organism)
    pathway_lists
  end
end
|
116
118
|
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
require 'rbbt/tsv'
|
4
|
+
|
5
|
+
# Resource definitions for the PSI-MI molecular interaction ontology.
module PSI_MI
  extend Resource
  self.subdir = Rbbt.share.databases.PSI_MI

  URL="http://psidev.cvs.sourceforge.net/viewvc/psidev/psi/mi/rel25/data/psi-mi25.obo"

  # Builds a table of PSI-MI terms (id => name, description) from the OBO
  # file at URL. The text is split on "[Term]" and each chunk is scanned for
  # its "id:", "name:" and "def:" lines.
  PSI_MI.claim PSI_MI.identifiers, :proc do
    tsv = TSV.setup({}, :type => :list, :key_field => "PSI-MI Term", :fields => ["Name", "Description"])

    Open.open(URL).read.split("[Term]").each do |chunk|
      # String#[] with a capture index yields the captured String (or nil).
      # scan(...)[0] yields a one-element Array instead, which made every key
      # and cell an Array rather than a String.
      id = chunk[/^id: ([^\n]*)/, 1]

      # The text before the first "[Term]" is the file header and has no id;
      # skip it instead of storing an entry under a nil key.
      next if id.nil?

      name = chunk[/^name: ([^\n]*)/, 1]
      description = chunk[/^def: "([^\n]*)"/, 1]

      # strip removes surrounding whitespace, including any carriage return.
      tsv[id.strip] = [name && name.strip, description]
    end

    tsv.to_s
  end
end
|
22
|
+
|
23
|
+
|
24
|
+
if defined? Entity
|
25
|
+
require 'rbbt/entity/gene'
|
26
|
+
require 'rbbt/entity/interactor'
|
27
|
+
require 'rbbt/sources/PSI_MI'
|
28
|
+
|
29
|
+
# Entity wrapper for PSI-MI term identifiers.
module PSI_MITerm
  extend Entity

  self.format = "PSI-MI Term"

  # Name of each term, read from a table that is loaded once and memoized in
  # @@index.
  property :name => :array2single do
    @@index ||= PSI_MI.identifiers.tsv(
      :persist => true,
      :fields => ["Name"],
      :type => :single
    )
    names = @@index.values_at(*self)
    names
  end

end
|
40
|
+
end
|
41
|
+
|
@@ -0,0 +1,92 @@
|
|
1
|
+
require 'rbbt/resource'
|
2
|
+
require 'rbbt/sources/organism'
|
3
|
+
|
4
|
+
# Resource definitions for the STITCH chemical-protein interaction database.
module STITCH
  extend Resource
  self.subdir = "share/databases/STITCH"

  # Raw STITCH v3.1 downloads.
  STITCH.claim STITCH.source.chemical_chemical.find, :url, "http://stitch.embl.de/download/chemical_chemical.links.detailed.v3.1.tsv.gz"
  STITCH.claim STITCH.source.protein_chemical.find, :url, "http://stitch.embl.de/download/protein_chemical.links.detailed.v3.1.tsv.gz"
  STITCH.claim STITCH.source.actions.find, :url, "http://stitch.embl.de/download/actions.v3.1.tsv.gz"
  STITCH.claim STITCH.source.aliases.find, :url, "http://stitch.embl.de/download/chemical.aliases.v3.1.tsv.gz"
  STITCH.claim STITCH.source.sources.find, :url, "http://stitch.embl.de/download/chemical.sources.v3.1.tsv.gz"

  # One chemical => protein table per installable organism, keeping only the
  # rows whose protein column starts with one of the organism's taxids.
  Organism.installable_organisms.each do |organism|
    STITCH.claim STITCH.chemical_protein(organism), :proc do
      taxids = Organism.entrez_taxids(organism).read.split("\n")

      # Both grep and sed use basic regular expressions, where alternation is
      # written "\|". The sed expression used to join the taxids with a bare
      # "|", which is a literal character there, so the "<taxid>." prefix was
      # not stripped when an organism had more than one taxid.
      taxid_alternation = taxids * '\|'

      filter = "grep '\t\\(#{ taxid_alternation }\\)\\.' | sed 's/\\(#{ taxid_alternation }\\)\\.//'"
      tsv = TSV.open(CMD.cmd(filter, :in => STITCH.source.protein_chemical.open, :pipe => true), :type => :double, :merge => true)
      tsv.key_field = "Chemical CID"
      tsv.fields = ["Ensembl Gene ID", "experimental", "database", "textmining", "combined_score"]
      tsv.to_s
    end
  end


  # Pivots the alias file (cid, code, source per line) into one row per
  # chemical with one column per source, in order of first appearance.
  STITCH.claim STITCH.identifiers, :proc do
    identifiers = {}
    fields = []
    line_number = 0

    STITCH.source.aliases.read do |line|
      line_number += 1
      next if line_number == 1 # first line is skipped

      # chomp so a trailing newline never becomes part of the source name.
      cid, code, source = line.chomp.split("\t")

      pos = fields.index source
      if pos.nil?
        fields << source
        pos = fields.length - 1
      end

      identifiers[cid] ||= []
      identifiers[cid][pos] = code
    end

    # NOTE(review): the table is declared :double, but each cell holds a
    # single code and a later code for the same cid/source overwrites the
    # earlier one -- confirm whether cells should accumulate codes instead.
    TSV.setup(identifiers, :key_field => "Chemical CID", :fields => fields, :type => :double)

    identifiers.to_s
  end

  # Chemical CID => IUPAC names, from alias lines matching "IUPAC".
  STITCH.claim STITCH.iupac, :proc do
    tsv = STITCH.source.aliases.tsv :type => :double, :merge => true, :grep => "IUPAC", :fields => [1]
    tsv.key_field = "Chemical CID"
    tsv.fields = ["IUPAC"]

    tsv.to_s
  end

  # Chemical CID => DrugBank IDs, from alias lines matching "DrugBank".
  STITCH.claim STITCH.drug_bank, :proc do
    tsv = STITCH.source.aliases.tsv :type => :double, :merge => true, :grep => "DrugBank", :fields => [1]
    tsv.key_field = "Chemical CID"
    tsv.fields = ["DrugBank ID"]

    tsv.to_s
  end

end
|
67
|
+
|
68
|
+
if defined? Entity
|
69
|
+
# Entity wrapper for chemical compounds identified by a STITCH Chemical CID.
module Compound
  extend Entity

  self.annotation(:format)
  self.format = "Chemical CID"

  # Declared but not implemented: the block is empty.
  property(:iupac => :array2single) { }

end
|
80
|
+
end
|
81
|
+
|
82
|
+
# When this file is executed directly, produce every STITCH resource: the raw
# downloads first, then the derived tables.
if __FILE__ == $0
  [
    STITCH.source.chemical_chemical,
    STITCH.source.protein_chemical,
    STITCH.source.actions,
    STITCH.source.aliases,
    STITCH.source.sources,
    STITCH.chemical_protein("Hsa"),
    STITCH.iupac,
    STITCH.drug_bank,
    STITCH.identifiers
  ].each do |resource|
    resource.produce
  end
end
|
data/lib/rbbt/sources/barcode.rb
CHANGED
data/lib/rbbt/sources/biomart.rb
CHANGED
@@ -21,12 +21,12 @@ module BioMart
|
|
21
21
|
|
22
22
|
@@biomart_query_xml = <<-EOT
|
23
23
|
<?xml version="1.0" encoding="UTF-8"?>
|
24
|
-
<!DOCTYPE Query>
|
24
|
+
<!DOCTYPE Query>
|
25
25
|
<Query completionStamp="1" virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
|
26
26
|
<Dataset name = "<!--DATABASE-->" interface = "default" >
|
27
27
|
<!--FILTERS-->
|
28
|
-
<!--MAIN-->
|
29
|
-
<!--ATTRIBUTES-->
|
28
|
+
<!--MAIN-->
|
29
|
+
<!--ATTRIBUTES-->
|
30
30
|
</Dataset>
|
31
31
|
</Query>
|
32
32
|
EOT
|
@@ -0,0 +1,100 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/util/open'
|
3
|
+
require 'rbbt/resource'
|
4
|
+
require 'net/ftp'
|
5
|
+
|
6
|
+
# Resource definitions for dbSNP variant tables built from VCF files.
module DbSNP
  extend Resource
  self.subdir = "share/databases/dbSNP"

  URL = "ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/VCF/common_all.vcf.gz"

  # RS ID => "chr:position:allele" built from the NCBI VCF at URL.
  # Only the first alternative allele of each record is used.
  DbSNP.claim DbSNP.mutations, :proc do
    tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation"], :type => :single)
    # Set once; this used to be re-assigned for every line of the VCF.
    tsv.namespace = "Hsa/may2012"

    file = Open.open(URL, :nocache => true)
    while line = file.gets do
      next if line[0] == "#"[0] # header / metadata lines

      chr, position, id, ref, alt = line.split "\t"
      alt = alt.split(",").first

      # When the allele starts with the same character as the reference, that
      # first character is replaced by "+".
      # NOTE(review): this also applies when alt is shorter than ref, which
      # mutations_gatk writes with "-"; confirm this is intended.
      if alt[0] == ref[0]
        alt[0] = '+'[0]
      end

      tsv[id] = [chr, position, alt] * ":"
    end

    tsv.to_s
  end

  # RS ID => genomic mutation plus GMAF, G5, G5A and dbSNP build, built from
  # the hg19 dbSNP 137 VCF in the GATK bundle on the Broad Institute FTP.
  DbSNP.claim DbSNP.mutations_gatk, :proc do
    tmpfile = TmpFile.tmp_file + '.gz'

    begin
      ftp = Net::FTP.new('ftp.broadinstitute.org')
      begin
        ftp.login('gsapubftp-anonymous', 'devnull@nomail.org')
        ftp.chdir('/bundle/2.3/hg19')
        ftp.getbinaryfile('dbsnp_137.hg19.vcf.gz', tmpfile, 1024)
      ensure
        # Close the connection even when the download fails.
        ftp.close
      end

      tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation", "GMAF", "G5", "G5A", "dbSNP Build ID"], :type => :list)
      # Set once; this used to be re-assigned for every line of the VCF.
      tsv.namespace = "Hsa/may2012"

      file = Open.open(tmpfile, :nocache => true)
      while line = file.gets do
        next if line[0] == "#"[0] # header / metadata lines

        chr, position, id, ref, mut, qual, filter, info = line.split "\t"

        chr.sub!('chr', '')

        # Only the first alternative allele is used.
        mut = mut.split(",").first
        mut = case
              when ref == '-'
                # Reference is "-": "+" followed by the allele.
                "+" << mut
              when mut == '-'
                # Allele is "-": one "-" per reference base.
                "-" * ref.length
              when (mut.length > 1 and ref.length > 1)
                # Both multi-base: one "-" per reference base, then the allele.
                '-' * ref.length << mut
              when (mut.length > 1 and ref.length == 1 and mut.index(ref) == 0)
                # Allele starts with the single reference base: "+" followed
                # by the rest of the allele.
                '+' << mut[1..-1]
              when (mut.length == 1 and ref.length > 1 and ref.index(mut) == 0)
                # Reference starts with the single allele base: one "-" per
                # remaining reference base.
                '-' * (ref.length - 1)
              else
                mut
              end

        g5 = g5a = dbsnp_build_id = gmaf = nil

        gmaf = $1 if info =~ /GMAF=([0-9.]+)/
        g5 = true if info =~ /\bG5\b/
        g5a = true if info =~ /\bG5A\b/
        dbsnp_build_id = $1 if info =~ /dbSNPBuildID=(\d+)/

        mutation = [chr, position, mut] * ":"

        tsv[id] = [mutation, gmaf, g5, g5a, dbsnp_build_id]
      end

      tsv.to_s
    ensure
      # Remove the downloaded file whether or not parsing succeeded.
      FileUtils.rm tmpfile if File.exist? tmpfile
    end
  end

  # Same table as DbSNP.mutations with every genomic mutation translated
  # through Organism.liftOver to the "Hsa/may2009" build.
  DbSNP.claim DbSNP.mutations_hg18, :proc do
    require 'rbbt/sources/organism'

    hg19_tsv = DbSNP.mutations.tsv :unnamed => true

    mutations = hg19_tsv.values

    # NOTE(review): DbSNP.mutations is labelled "Hsa/may2012" but the lift
    # over is requested from "Hsa/jun2011" -- confirm the source build.
    translations = Misc.process_to_hash(mutations){|batch| Organism.liftOver(batch, "Hsa/jun2011", "Hsa/may2009")}

    tsv = hg19_tsv.process "Genomic Mutation" do |mutation|
      translations[mutation]
    end

    tsv.namespace = "Hsa/may2009"

    tsv.to_s
  end
end
|