rbbt-sources 1.2.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/etc/biomart/missing_in_archive +11 -0
- data/lib/rbbt/sources/COSMIC.rb +47 -4
- data/lib/rbbt/sources/HPRD.rb +23 -0
- data/lib/rbbt/sources/InterPro.rb +98 -8
- data/lib/rbbt/sources/NCI.rb +7 -5
- data/lib/rbbt/sources/PSI_MI.rb +41 -0
- data/lib/rbbt/sources/STITCH.rb +92 -0
- data/lib/rbbt/sources/barcode.rb +0 -3
- data/lib/rbbt/sources/biomart.rb +3 -3
- data/lib/rbbt/sources/dbSNP.rb +100 -0
- data/lib/rbbt/sources/ensembl_ftp.rb +79 -0
- data/lib/rbbt/sources/entrez.rb +2 -2
- data/lib/rbbt/sources/genomes1000.rb +45 -0
- data/lib/rbbt/sources/go.rb +16 -4
- data/lib/rbbt/sources/organism.rb +80 -12
- data/lib/rbbt/sources/pfam.rb +63 -3
- data/lib/rbbt/sources/pubmed.rb +10 -3
- data/lib/rbbt/sources/reactome.rb +82 -0
- data/lib/rbbt/sources/tfacts.rb +37 -36
- data/lib/rbbt/sources/uniprot.rb +25 -23
- data/share/Ensembl/release_dates +18 -0
- data/share/install/Genomes1000/Rakefile +15 -0
- data/share/install/JoChem/Rakefile +11 -3
- data/share/install/NCI/Rakefile +54 -16
- data/share/install/Organism/Hsa/Rakefile +3 -2
- data/share/install/Organism/Rno/Rakefile +1 -2
- data/share/install/Organism/Sce/Rakefile +43 -45
- data/share/install/Organism/organism_helpers.rb +360 -96
- data/share/install/STITCH/Rakefile +0 -0
- data/test/rbbt/sources/test_organism.rb +26 -7
- data/test/rbbt/sources/test_pubmed.rb +5 -0
- metadata +94 -97
- data/share/install/InterPro/Rakefile +0 -29
@@ -1,9 +1,19 @@
|
|
1
|
+
jun2011:
|
2
|
+
- refseq_mrna
|
3
|
+
may2010:
|
4
|
+
- refseq_mrna
|
5
|
+
- agilent_wholegenome
|
6
|
+
- agilent_cgh_44b
|
7
|
+
- illumina_humanwg_6_v2
|
8
|
+
- illumina_humanwg_6_v3
|
1
9
|
may2009:
|
10
|
+
- refseq_mrna
|
2
11
|
- agilent_wholegenome
|
3
12
|
- agilent_cgh_44b
|
4
13
|
- illumina_humanwg_6_v2
|
5
14
|
- illumina_humanwg_6_v3
|
6
15
|
dec2007:
|
16
|
+
- refseq_mrna
|
7
17
|
- protein_id
|
8
18
|
- affy_hc_g110
|
9
19
|
- affy_hg_u133a_2
|
@@ -14,6 +24,7 @@ dec2007:
|
|
14
24
|
- illumina_humanwg_6_v2
|
15
25
|
- illumina_humanwg_6_v3
|
16
26
|
aug2007:
|
27
|
+
- refseq_mrna
|
17
28
|
- protein_id
|
18
29
|
- affy_hc_g110
|
19
30
|
- affy_hg_u133a_2
|
data/lib/rbbt/sources/COSMIC.rb
CHANGED
@@ -5,10 +5,53 @@ module COSMIC
|
|
5
5
|
self.subdir = "share/databases/COSMIC"
|
6
6
|
|
7
7
|
# Producer for COSMIC.Mutations.
#
# Opens the COSMIC whole-genome mutant export, renames the "Gene name" column
# to "Associated Gene Name", and appends a "Genomic Mutation" column computed
# from the "Mutation GRCh37 genome position" and "Mutation CDS" columns.
# Returns the table as TSV text.
COSMIC.claim COSMIC.Mutations, :proc do
  url = "ftp://ftp.sanger.ac.uk/pub/CGP/wgs/data_export/CosmicWGS_MutantExport_v61_260912.tsv.gz"

  mutations = TSV.open(Open.open(url), :type => :list, :header_hash => "", :key_field => "Mutation ID", :namespace => "Hsa/jun2011")

  mutations.fields = mutations.fields.collect { |field| field == "Gene name" ? "Associated Gene Name" : field }

  mutations.add_field "Genomic Mutation" do |mid, values|
    location = values["Mutation GRCh37 genome position"]
    cds = values["Mutation CDS"]

    # No genomic position: the new column stays empty for this row.
    next nil if location.nil? or location.empty?

    # Keep only the text before the first "-" of the position value.
    start = location.split("-").first

    # Position without a CDS description: report the position alone.
    next start if cds.nil?

    change = case cds
             when />/
               # Substitution: keep what follows the ">".
               cds.split(">").last
             when /del/
               deleted = cds.split("del").last
               if deleted =~ /^\d+$/
                 # Deletion given as a count.
                 "-" * deleted.to_i
               elsif deleted =~ /^[ACTG]+$/i
                 # Deletion given as the deleted bases.
                 "-" * deleted.length
               else
                 Log.debug "Unknown deletion: #{ deleted }"
                 deleted
               end
             when /ins/
               inserted = cds.split("ins").last
               if inserted =~ /^\d+$/
                 # Insertion given as a count: one "N" per inserted base.
                 "+" + "N" * inserted.to_i
               elsif inserted =~ /^[NACTG]+$/i
                 "+" + inserted
               else
                 Log.debug "Unknown insertion: #{ inserted }"
                 inserted
               end
             else
               Log.debug "Unknown change: #{cds}"
               "?(" + cds + ")"
             end

    start + ":" + change
  end

  # Prefix lines that start with a digit with "COSM", then turn every
  # digit-dash-digit into digit-colon-digit in the serialized table.
  mutations.to_s.gsub(/^(\d)/m,'COSM\1').gsub(/(\d)-(\d)/,'\1:\2')
end
|
12
57
|
end
|
13
|
-
|
14
|
-
puts COSMIC.Mutations.produce
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/entity/gene'
|
3
|
+
require 'rbbt/tsv'
|
4
|
+
require 'rbbt/sources/organism'
|
5
|
+
|
6
|
+
# Resource wrapper for the HPRD binary protein-protein interaction file.
module HPRD
  extend Resource
  self.subdir = "share/databases/HPRD"

  # The raw file is not downloaded here: producing it raises with a message
  # asking for a manual download.
  HPRD.claim HPRD["BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt"], :proc do
    raise "File BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt not found in '#{HPRD["BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt"].find}', download manually from http://www.hprd.org/"
  end

  # Loads the raw file as a TSV, assigns the key field, field names and
  # namespace, and returns the table as text.
  HPRD.claim HPRD.protein_protein, :proc do
    interactions = HPRD["BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt"].tsv

    interactions.key_field = "Associated Gene Name 1"
    interactions.fields = [
      "HPRD id 1",
      "RefSeq Protein ID 1",
      "Associated Gene Name 2",
      "HPRD id 2",
      "RefSeq Protein ID 2",
      "Experiment type",
      "PMID"
    ]
    interactions.namespace = "Hsa"

    interactions.to_s
  end
end
|
@@ -1,17 +1,107 @@
|
|
1
1
|
require 'rbbt-util'
|
2
|
+
require 'rbbt/entity/gene'
|
3
|
+
require 'rbbt/tsv'
|
4
|
+
require 'rbbt/sources/organism'
|
5
|
+
|
2
6
|
# Resource wrapper for InterPro protein-domain assignments, plus memoized
# lookup tables built from the produced files.
module InterPro
  extend Resource
  self.subdir = "share/databases/InterPro"

  # Raw protein-to-InterPro file, claimed by URL.
  InterPro.claim InterPro.source.protein2ipr.find, :url, "ftp://ftp.ebi.ac.uk/pub/databases/interpro/protein2ipr.dat.gz"

  # Domain assignments restricted to the UniProt accessions listed in the
  # "Hsa" protein identifier file.
  InterPro.claim InterPro.protein_domains.find, :proc do
    organism = "Hsa"

    # Locate the UniProt column among all fields of the identifier file.
    identifier_fields = TSV::Parser.new(Organism.protein_identifiers(organism).open).all_fields
    uniprot_column = identifier_fields.index("UniProt/SwissProt Accession")

    # Extract that column (cut is 1-based), split "|"-separated cells and
    # keep each non-empty accession once.
    column_text = CMD.cmd("grep -v '^#'|cut -f #{uniprot_column + 1}", :in => Organism.protein_identifiers(organism).open).read
    accessions = column_text.split("\n").collect{|cell| cell.split("|") }.flatten.uniq.reject{|accession| accession.empty? }

    domains = nil
    TmpFile.with_file(accessions * "\n") do |accession_file|
      # Keep columns 1, 2, 5 and 6 of the raw file and only the rows that
      # match one of the accessions as a whole word.
      filtered = CMD.cmd("cut -f 1,2,5,6 | sort -u |grep -w -f #{ accession_file }", :in => InterPro.source.protein2ipr.open, :pipe => true)
      domains = TSV.open(filtered, :merge => true, :type => :double)
    end

    domains.key_field = "UniProt/SwissProt Accession"
    domains.fields = ["InterPro ID", "Domain Start AA", "Domain End AA"]
    domains.to_s
  end

  # InterPro ID to domain name, taken from columns 2 and 3 of the raw file.
  InterPro.claim InterPro.domain_names.find, :proc do
    name_pairs = CMD.cmd("cut -f 2,3 | sort -u", :in => InterPro.source.protein2ipr.open, :pipe => true)
    names = TSV.open(name_pairs, :merge => true, :type => :single)

    names.key_field = "InterPro ID"
    names.fields = ["Domain Name"]
    names.to_s
  end

  # Memoized table loaded from InterPro.domain_names.
  def self.name_index
    @@name_index ||= InterPro.domain_names.tsv(:persist => true, :unnamed => true)
  end

  # Memoized flat table: InterPro ID => UniProt accessions.
  def self.gene_index
    @@gene_index ||= InterPro.protein_domains.tsv(
      :persist => true,
      :key_field => "InterPro ID",
      :fields => ["UniProt/SwissProt Accession"],
      :type => :flat,
      :merge => true,
      :unnamed => true
    )
  end

  # Memoized table: UniProt accession => InterPro IDs.
  def self.domain_index
    @@domain_index ||= InterPro.protein_domains.tsv(
      :persist => true,
      :unnamed => true,
      :key_field => "UniProt/SwissProt Accession",
      :fields => ["InterPro ID"],
      :merge => true
    )
  end

  # Memoized table: UniProt accession => InterPro IDs with start/end positions.
  def self.domain_position_index
    @@domain_position_index ||= InterPro.protein_domains.tsv(
      :persist => true,
      :unnamed => true,
      :key_field => "UniProt/SwissProt Accession",
      :fields => ["InterPro ID", "Domain Start AA", "Domain End AA"],
      :type => :double,
      :merge => true
    )
  end

  # Per-organism memoized table: Ensembl Protein ID => UniProt accessions.
  def self.ens2uniprot(organism)
    @@ens2uniprot_index ||= {}
    @@ens2uniprot_index[organism] ||= Organism.protein_identifiers(organism).tsv(
      :persist => true,
      :unnamed => true,
      :fields => ["UniProt/SwissProt Accession"],
      :key_field => "Ensembl Protein ID",
      :type => :double,
      :merge => true
    )
  end

end
|
58
|
+
|
59
|
+
# Entity bindings; only defined when the Entity framework is already loaded.
if defined? Entity
  module InterProDomain
    extend Entity
    self.format = "InterPro ID"

    self.annotation :organism

    # Looks the domains up in InterPro.name_index.
    property :description => :array2single do
      InterPro.name_index.values_at(*self)
    end

    # Same lookup as :description.
    property :name => :array2single do
      InterPro.name_index.values_at(*self)
    end

    # For each domain, the de-duplicated UniProt accessions from
    # InterPro.gene_index, set up as Protein.
    property :proteins => :array2single do
      accession_lists = InterPro.gene_index.values_at(*self).collect do |accessions|
        accessions = accessions.uniq
        accessions.organism = organism if accessions.respond_to? :organism
        accessions
      end
      Protein.setup(accession_lists, "UniProt/SwissProt Accession", organism)
      accession_lists
    end

    # Same accession lists as :proteins, set up as Gene.
    property :genes => :array2single do
      accession_lists = InterPro.gene_index.values_at(*self).collect do |accessions|
        accessions = accessions.uniq
        accessions.organism = organism if accessions.respond_to? :organism
        accessions
      end
      Gene.setup(accession_lists, "UniProt/SwissProt Accession", organism)
      accession_lists
    end
  end

  if defined? Protein and Entity === Protein
    module Protein
      # For each protein, the unique InterPro IDs of its UniProt accessions;
      # nil when the protein maps to no accession.
      property :interpro_domains => :array2single do
        self.collect do |protein|
          uniprot = (InterPro.ens2uniprot(protein.organism)[protein] || []).flatten
          if uniprot.empty?
            nil
          else
            domains = InterPro.domain_index.values_at(*uniprot).compact.flatten
            domains.each{|domain| domain.organism = organism if domain.respond_to? :organism }
            domains = domains.uniq
            InterProDomain.setup(domains, organism)
            domains
          end
        end
      end

      # For each protein, the entries of InterPro.domain_position_index for
      # its UniProt accessions, flattened one level. A nil protein yields an
      # empty list; a protein with no accession yields nil.
      property :interpro_domain_positions => :array2single do
        self.collect do |protein|
          if protein.nil?
            empty = []
            InterProDomain.setup(empty, organism)
            empty
          else
            uniprot = (InterPro.ens2uniprot(protein.organism)[protein] || []).flatten
            if uniprot.empty?
              nil
            else
              positions = InterPro.domain_position_index.values_at(*uniprot).compact.flatten(1)
              InterProDomain.setup(positions, organism)
              positions
            end
          end
        end
      end
    end
  end
end
|
data/lib/rbbt/sources/NCI.rb
CHANGED
@@ -34,7 +34,8 @@ if defined? Entity
|
|
34
34
|
end
|
35
35
|
|
36
36
|
property :genes => :array2single do
|
37
|
-
@genes ||= NCINaturePathway.gene_index.values_at
|
37
|
+
@genes ||= NCINaturePathway.gene_index.values_at(*self).
|
38
|
+
each{|gene| gene.organism = organism if gene.respond_to? :organism }
|
38
39
|
end
|
39
40
|
end
|
40
41
|
|
@@ -65,7 +66,8 @@ if defined? Entity
|
|
65
66
|
end
|
66
67
|
|
67
68
|
property :genes => :array2single do
|
68
|
-
@genes ||= NCIReactomePathway.gene_index.values_at
|
69
|
+
@genes ||= NCIReactomePathway.gene_index.values_at(*self).
|
70
|
+
each{|gene| gene.organism = organism if gene.respond_to? :organism }
|
69
71
|
end
|
70
72
|
end
|
71
73
|
|
@@ -97,7 +99,7 @@ if defined? Entity
|
|
97
99
|
|
98
100
|
property :genes => :array2single do
|
99
101
|
@genes ||= NCIBioCartaPathway.gene_index.values_at(*self).
|
100
|
-
each{|
|
102
|
+
each{|gene| gene.organism = organism if gene.respond_to? :organism }
|
101
103
|
end
|
102
104
|
end
|
103
105
|
|
@@ -109,8 +111,8 @@ if defined? Entity
|
|
109
111
|
each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCINaturePathway.setup(o, organism)}
|
110
112
|
end
|
111
113
|
|
112
|
-
property :
|
113
|
-
@
|
114
|
+
property :nci_reactome_pathways => :array2single do
|
115
|
+
@nci_reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at(*self.to("UniProt/SwissProt Accession")).
|
114
116
|
each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCIReactomePathway.setup(o, organism)}
|
115
117
|
end
|
116
118
|
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
require 'rbbt/tsv'
|
4
|
+
|
5
|
+
# Resource wrapper for the PSI-MI controlled vocabulary (OBO format).
module PSI_MI
  extend Resource
  self.subdir = Rbbt.share.databases.PSI_MI

  URL="http://psidev.cvs.sourceforge.net/viewvc/psidev/psi/mi/rel25/data/psi-mi25.obo"

  # Builds a "PSI-MI Term" => [Name, Description] table from the OBO file and
  # returns it as TSV text.
  PSI_MI.claim PSI_MI.identifiers, :proc do
    tsv = TSV.setup({}, :type => :list, :key_field => "PSI-MI Term", :fields => ["Name", "Description"])

    Open.open(URL).read.split("[Term]").each do |chunk|
      # String#[regexp, 1] yields the captured text itself (or nil), whereas
      # scan(...)[0] yields a one-element Array, which made keys and values
      # arrays instead of strings.
      id = chunk[/^id: ([^\n]*)/, 1]

      # The text before the first "[Term]" marker has no id line; skip it
      # rather than storing a row under a nil key.
      next if id.nil?

      name = chunk[/^name: ([^\n]*)/, 1]
      description = chunk[/^def: "([^\n]*)"/, 1]

      tsv[id] = [name, description]
    end

    tsv.to_s
  end
end
|
22
|
+
|
23
|
+
|
24
|
+
# Entity binding for PSI-MI terms; only defined when Entity is already loaded.
if defined? Entity
  require 'rbbt/entity/gene'
  require 'rbbt/entity/interactor'
  require 'rbbt/sources/PSI_MI'

  module PSI_MITerm
    extend Entity

    self.format = "PSI-MI Term"

    # Looks the terms up in a memoized single-value table holding the "Name"
    # field of PSI_MI.identifiers.
    property :name => :array2single do
      @@index ||= PSI_MI.identifiers.tsv(
        :persist => true,
        :fields => ["Name"],
        :type => :single
      )
      @@index.values_at(*self)
    end

  end
end
|
41
|
+
|
@@ -0,0 +1,92 @@
|
|
1
|
+
require 'rbbt/resource'
|
2
|
+
require 'rbbt/sources/organism'
|
3
|
+
|
4
|
+
# Resource wrapper for the STITCH chemical-protein interaction database
# (v3.1 download files).
module STITCH
  extend Resource
  self.subdir = "share/databases/STITCH"

  # Raw download files, claimed by URL.
  STITCH.claim STITCH.source.chemical_chemical.find, :url, "http://stitch.embl.de/download/chemical_chemical.links.detailed.v3.1.tsv.gz"
  STITCH.claim STITCH.source.protein_chemical.find, :url, "http://stitch.embl.de/download/protein_chemical.links.detailed.v3.1.tsv.gz"
  STITCH.claim STITCH.source.actions.find, :url, "http://stitch.embl.de/download/actions.v3.1.tsv.gz"
  STITCH.claim STITCH.source.aliases.find, :url, "http://stitch.embl.de/download/chemical.aliases.v3.1.tsv.gz"
  STITCH.claim STITCH.source.sources.find, :url, "http://stitch.embl.de/download/chemical.sources.v3.1.tsv.gz"

  # One chemical-protein table per installable organism.
  Organism.installable_organisms.each do |organism|
    STITCH.claim STITCH.chemical_protein(organism), :proc do
      taxids = Organism.entrez_taxids(organism).read.split("\n")

      # Alternation for a basic regular expression. Both grep and sed need the
      # escaped form "\|"; a bare "|" is a literal pipe, so the sed expression
      # previously stripped nothing when there was more than one taxid.
      taxid_alternatives = taxids * '\|'

      # Keep rows whose protein column starts with "<taxid>." and then remove
      # that prefix.
      tsv = TSV.open(CMD.cmd("grep '\t\\(#{ taxid_alternatives }\\)\\.' | sed 's/\\(#{ taxid_alternatives }\\)\\.//'", :in => STITCH.source.protein_chemical.open, :pipe => true), :type => :double, :merge => true)
      tsv.key_field = "Chemical CID"
      tsv.fields = ["Ensembl Gene ID", "experimental", "database", "textmining", "combined_score"]
      tsv.to_s
    end
  end


  # Pivots the aliases file into one row per chemical, with one column per
  # distinct source name (columns are created in order of first appearance).
  STITCH.claim STITCH.identifiers, :proc do
    identifiers = {}
    fields = []
    header = true

    STITCH.source.aliases.read do |line|
      # Skip the first line of the file.
      if header
        header = false
        next
      end

      cid, code, source = line.split("\t")

      pos = fields.index source
      if pos.nil?
        fields << source
        pos = fields.length - 1
      end

      # NOTE(review): a second code from the same source overwrites the first
      # one for that chemical - confirm that this is intended for a :double table.
      identifiers[cid] ||= []
      identifiers[cid][pos] = code
    end

    # The key field is a plain String, as in the other tables of this module.
    TSV.setup(identifiers, :key_field => "Chemical CID", :fields => fields, :type => :double)

    identifiers.to_s
  end

  # Chemical CID => IUPAC names, from the alias rows matching "IUPAC".
  STITCH.claim STITCH.iupac, :proc do
    tsv = STITCH.source.aliases.tsv :type => :double, :merge => true, :grep => "IUPAC", :fields => [1]
    tsv.key_field = "Chemical CID"
    tsv.fields = ["IUPAC"]

    tsv.to_s
  end

  # Chemical CID => DrugBank IDs, from the alias rows matching "DrugBank".
  STITCH.claim STITCH.drug_bank, :proc do
    tsv = STITCH.source.aliases.tsv :type => :double, :merge => true, :grep => "DrugBank", :fields => [1]
    tsv.key_field = "Chemical CID"
    tsv.fields = ["DrugBank ID"]

    tsv.to_s
  end

end
|
67
|
+
|
68
|
+
# Entity binding for chemical compounds; only defined when Entity is already
# loaded.
if defined? Entity
  module Compound
    extend Entity

    self.annotation :format
    self.format = "Chemical CID"

    # Stub: the property is declared with an empty body.
    property :iupac => :array2single do
    end

  end
end
|
81
|
+
|
82
|
+
# When the file is run directly, produce every STITCH resource in turn: the
# raw download files first, then the derived tables.
if __FILE__ == $0
  resources = [
    STITCH.source.chemical_chemical,
    STITCH.source.protein_chemical,
    STITCH.source.actions,
    STITCH.source.aliases,
    STITCH.source.sources,
    STITCH.chemical_protein("Hsa"),
    STITCH.iupac,
    STITCH.drug_bank,
    STITCH.identifiers
  ]

  resources.each { |resource| resource.produce }
end
|
data/lib/rbbt/sources/barcode.rb
CHANGED
data/lib/rbbt/sources/biomart.rb
CHANGED
@@ -21,12 +21,12 @@ module BioMart
|
|
21
21
|
|
22
22
|
@@biomart_query_xml = <<-EOT
|
23
23
|
<?xml version="1.0" encoding="UTF-8"?>
|
24
|
-
<!DOCTYPE Query>
|
24
|
+
<!DOCTYPE Query>
|
25
25
|
<Query completionStamp="1" virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
|
26
26
|
<Dataset name = "<!--DATABASE-->" interface = "default" >
|
27
27
|
<!--FILTERS-->
|
28
|
-
<!--MAIN-->
|
29
|
-
<!--ATTRIBUTES-->
|
28
|
+
<!--MAIN-->
|
29
|
+
<!--ATTRIBUTES-->
|
30
30
|
</Dataset>
|
31
31
|
</Query>
|
32
32
|
EOT
|
@@ -0,0 +1,100 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/util/open'
|
3
|
+
require 'rbbt/resource'
|
4
|
+
require 'net/ftp'
|
5
|
+
|
6
|
+
# Resource wrapper for dbSNP variant tables built from VCF files.
module DbSNP
  extend Resource
  self.subdir = "share/databases/dbSNP"

  URL = "ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/VCF/common_all.vcf.gz"

  # "RS ID" => "chr:position:allele" table read from the NCBI VCF at URL.
  DbSNP.claim DbSNP.mutations, :proc do
    tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation"], :type => :single)
    # Set once, before reading, so the namespace is present even when the
    # file has no data rows.
    tsv.namespace = "Hsa/may2012"

    file = Open.open(URL, :nocache => true)
    while line = file.gets do
      # Skip lines starting with "#" (the [0] comparison works whether
      # String#[] returns a character code or a one-character string).
      next if line[0] == "#"[0]

      chr, position, id, ref, alt = line.split "\t"

      # Only the first alternative allele is used.
      alt = alt.split(",").first

      # When the alternative starts with the same character as the reference,
      # that first character is replaced by "+".
      # NOTE(review): an anchored deletion (e.g. ref "AT", alt "A") becomes a
      # bare "+" here, unlike in mutations_gatk below - confirm the intent.
      if alt[0] == ref[0]
        alt[0] = '+'[0]
      end

      tsv[id] = [chr, position, alt] * ":"
    end

    tsv.to_s
  end

  # "RS ID" => [mutation, GMAF, G5, G5A, dbSNP build] table read from the
  # dbsnp_137 hg19 VCF of the Broad Institute bundle.
  DbSNP.claim DbSNP.mutations_gatk, :proc do
    tmpfile = TmpFile.tmp_file + '.gz'

    begin
      ftp = Net::FTP.new('ftp.broadinstitute.org')
      begin
        ftp.login('gsapubftp-anonymous', 'devnull@nomail.org')
        ftp.chdir('/bundle/2.3/hg19')
        ftp.getbinaryfile('dbsnp_137.hg19.vcf.gz', tmpfile, 1024)
      ensure
        # Release the connection whether or not the download succeeded.
        ftp.close
      end

      tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation", "GMAF", "G5", "G5A", "dbSNP Build ID"], :type => :list)
      tsv.namespace = "Hsa/may2012"

      file = Open.open(tmpfile, :nocache => true)
      while line = file.gets do
        next if line[0] == "#"[0]

        chr, position, id, ref, mut, qual, filter, info = line.split "\t"

        chr.sub!('chr', '')

        # Only the first alternative allele is used.
        mut = mut.split(",").first
        case
        when ref == '-'
          # Insertion written with "-" as the reference.
          mut = "+" << mut
        when mut == '-'
          # Deletion written with "-" as the alternative.
          mut = "-" * ref.length
        when (mut.length > 1 and ref.length > 1)
          # Both alleles longer than one base: one "-" per reference base,
          # followed by the alternative.
          mut = '-' * ref.length << mut
        when (mut.length > 1 and ref.length == 1 and mut.index(ref) == 0)
          # Insertion anchored on the reference base: drop the anchor.
          mut = '+' << mut[1..-1]
        when (mut.length == 1 and ref.length > 1 and ref.index(mut) == 0)
          # Deletion anchored on the first base: one "-" per removed base.
          mut = '-' * (ref.length - 1)
        end

        # Optional annotations taken from the INFO column.
        g5 = g5a = dbsnp_build_id = gmaf = nil

        gmaf = $1 if info =~ /GMAF=([0-9.]+)/
        g5 = true if info =~ /\bG5\b/
        g5a = true if info =~ /\bG5A\b/
        dbsnp_build_id = $1 if info =~ /dbSNPBuildID=(\d+)/

        mutation = [chr, position, mut] * ":"

        tsv[id] = [mutation, gmaf, g5, g5a, dbsnp_build_id]
      end

      tsv.to_s
    ensure
      # Remove the downloaded copy even when the download or parsing fails.
      FileUtils.rm tmpfile if File.exist?(tmpfile)
    end
  end

  # Same table as DbSNP.mutations with every mutation translated through
  # Organism.liftOver from "Hsa/jun2011" to "Hsa/may2009".
  DbSNP.claim DbSNP.mutations_hg18, :proc do
    # Loaded only when this resource is produced.
    require 'rbbt/sources/organism'

    hg19_tsv = DbSNP.mutations.tsv :unnamed => true

    mutations = hg19_tsv.values

    translations = Misc.process_to_hash(mutations){|list| Organism.liftOver(list, "Hsa/jun2011", "Hsa/may2009")}

    tsv = hg19_tsv.process "Genomic Mutation" do |mutation|
      translations[mutation]
    end

    tsv.namespace = "Hsa/may2009"

    tsv.to_s
  end
end
|