rbbt-sources 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/sources/NCI.rb +70 -12
- data/lib/rbbt/sources/cath.rb +142 -0
- data/lib/rbbt/sources/go.rb +14 -3
- data/lib/rbbt/sources/organism.rb +1 -1
- data/lib/rbbt/sources/pfam.rb +35 -0
- data/lib/rbbt/sources/pubmed.rb +7 -11
- data/lib/rbbt/sources/tfacts.rb +0 -1
- data/lib/rbbt/sources/uniprot.rb +125 -0
- data/share/install/Organism/Hsa/Rakefile +1 -4
- data/share/install/Organism/Mmu/Rakefile +57 -0
- data/share/install/Organism/Rno/Rakefile +1 -0
- data/share/install/Organism/Sce/Rakefile +1 -0
- data/share/install/Organism/organism_helpers.rb +54 -1
- metadata +8 -5
- data/lib/rbbt/sources/organism/sequence.rb +0 -612
data/lib/rbbt/sources/NCI.rb
CHANGED
@@ -8,57 +8,115 @@ end
|
|
8
8
|
|
9
9
|
if defined? Entity
|
10
10
|
|
11
|
-
module
|
11
|
+
module NCINaturePathway
|
12
12
|
extend Entity
|
13
13
|
self.format = "NCI Nature Pathway ID"
|
14
14
|
|
15
|
+
self.annotation :organism
|
16
|
+
|
17
|
+
def self.name_index
|
18
|
+
@name_index ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["Pathway Name"], :type => :single)
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.gene_index
|
22
|
+
@gene_index ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true)
|
23
|
+
end
|
24
|
+
|
25
|
+
def self.filter(query, field = nil, options = nil, entity = nil)
|
26
|
+
return true if query == entity
|
27
|
+
|
28
|
+
return true if self.setup(entity.dup, options.merge(:format => field)).name.index query
|
29
|
+
|
30
|
+
false
|
31
|
+
end
|
15
32
|
property :name => :array2single do
|
16
|
-
@name ||=
|
33
|
+
@name ||= NCINaturePathway.name_index.values_at *self
|
17
34
|
end
|
18
35
|
|
19
36
|
property :genes => :array2single do
|
20
|
-
@genes ||=
|
37
|
+
@genes ||= NCINaturePathway.gene_index.values_at *self
|
21
38
|
end
|
22
39
|
end
|
23
40
|
|
24
|
-
module
|
41
|
+
module NCIReactomePathway
|
25
42
|
extend Entity
|
26
43
|
self.format = "NCI Reactome Pathway ID"
|
44
|
+
|
45
|
+
self.annotation :organism
|
46
|
+
|
47
|
+
def self.name_index
|
48
|
+
@name_index ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name"], :type => :single)
|
49
|
+
end
|
50
|
+
|
51
|
+
def self.gene_index
|
52
|
+
@gene_index ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true)
|
53
|
+
end
|
54
|
+
|
55
|
+
def self.filter(query, field = nil, options = nil, entity = nil)
|
56
|
+
return true if query == entity
|
57
|
+
|
58
|
+
return true if self.setup(entity.dup, options.merge(:format => field)).name.index query
|
59
|
+
|
60
|
+
false
|
61
|
+
end
|
27
62
|
|
28
63
|
property :name => :array2single do
|
29
|
-
@name ||=
|
64
|
+
@name ||= NCIReactomePathway.name_index.values_at *self
|
30
65
|
end
|
31
66
|
|
32
67
|
property :genes => :array2single do
|
33
|
-
@genes ||=
|
68
|
+
@genes ||= NCIReactomePathway.gene_index.values_at *self
|
34
69
|
end
|
35
70
|
end
|
36
71
|
|
37
|
-
module
|
72
|
+
module NCIBioCartaPathway
|
38
73
|
extend Entity
|
39
74
|
self.format = "NCI BioCarta Pathway ID"
|
40
75
|
|
76
|
+
self.annotation :organism
|
77
|
+
|
78
|
+
def self.name_index
|
79
|
+
@name_index ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name"], :type => :single)
|
80
|
+
end
|
81
|
+
|
82
|
+
def self.gene_index
|
83
|
+
@gene_index ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Entrez Gene ID"], :type => :flat, :merge => true)
|
84
|
+
end
|
85
|
+
|
86
|
+
def self.filter(query, field = nil, options = nil, entity = nil)
|
87
|
+
return true if query == entity
|
88
|
+
|
89
|
+
return true if self.setup(entity.dup, options.merge(:format => field)).name.index query
|
90
|
+
|
91
|
+
false
|
92
|
+
end
|
93
|
+
|
41
94
|
property :name => :array2single do
|
42
|
-
@name ||=
|
95
|
+
@name ||= NCIBioCartaPathway.name_index.values_at *self
|
43
96
|
end
|
44
97
|
|
45
98
|
property :genes => :array2single do
|
46
|
-
@genes ||=
|
99
|
+
@genes ||= NCIBioCartaPathway.gene_index.values_at(*self).
|
100
|
+
each{|pth| pth.organism = organism if pth.respond_to? :organism }
|
47
101
|
end
|
48
102
|
end
|
49
103
|
|
50
104
|
if defined? Gene and Entity === Gene
|
51
105
|
module Gene
|
52
106
|
property :nature_pathways => :array2single do
|
53
|
-
@nature_pathways ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Nature Pathway ID"], :type => :flat, :merge => true).
|
107
|
+
@nature_pathways ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Nature Pathway ID"], :type => :flat, :merge => true).
|
108
|
+
values_at(*self.to("UniProt/SwissProt Accession")).
|
109
|
+
each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCINaturePathway.setup(o, organism)}
|
54
110
|
end
|
55
111
|
|
56
112
|
property :reactome_pathways => :array2single do
|
57
|
-
@reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at
|
113
|
+
@reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at(*self.to("UniProt/SwissProt Accession")).
|
114
|
+
each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCIReactomePathway.setup(o, organism)}
|
58
115
|
end
|
59
116
|
|
60
117
|
property :biocarta_pathways => :array2single do
|
61
|
-
@biocarta_pathways ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "Entrez Gene ID", :fields => ["NCI BioCarta Pathway ID"], :type => :flat, :merge => true).values_at
|
118
|
+
@biocarta_pathways ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "Entrez Gene ID", :fields => ["NCI BioCarta Pathway ID"], :type => :flat, :merge => true).values_at(*self.entrez).
|
119
|
+
each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCIBioCartaPathway.setup(o, organism)}
|
62
120
|
end
|
63
121
|
end
|
64
122
|
end
|
data/lib/rbbt/sources/cath.rb
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
module Cath
|
4
|
+
extend Resource
|
5
|
+
|
6
|
+
Rbbt.claim Rbbt.share.databases.CATH.CathNames, :proc do
|
7
|
+
tsv = TSV.setup({}, :key_field => "CATH Code", :type => :list, :fields => ["PDB ID", "CATH Domain", "CATH Description"])
|
8
|
+
Open.read("http://release.cathdb.info/v3.4.0/CathNames").split(/\n/).each do |line|
|
9
|
+
next if line =~ /^#/
|
10
|
+
code, pdb, domain, name = line.match(/([\d\.]+)\s+(\w\w\w\w)(\w\w\w)\s+:(.*)/).values_at 1,2,3,4
|
11
|
+
tsv[code] = [pdb.downcase, domain, name]
|
12
|
+
end
|
13
|
+
|
14
|
+
tsv.to_s
|
15
|
+
end
|
16
|
+
|
17
|
+
Rbbt.claim Rbbt.share.databases.CATH.CathUnclassifiedList , :proc do
|
18
|
+
Open.read("http://release.cathdb.info/v3.4.0/CathUnclassifiedList").split(/\n/).collect do |line|
|
19
|
+
next if line =~ /^#/
|
20
|
+
line.split(/\s/).first
|
21
|
+
end * "\n"
|
22
|
+
end
|
23
|
+
|
24
|
+
|
25
|
+
Rbbt.claim Rbbt.share.databases.CATH.CathDomainSeqs, :proc do
|
26
|
+
tsv = TSV.setup({}, :key_field => "CATH Domain", :type => :single, :fields => ["Cath Domain Sequence"])
|
27
|
+
|
28
|
+
Open.read("http://release.cathdb.info/v3.4.0/CathDomainSeqs.ATOM").split(/>pdb\|/).each do |chunk|
|
29
|
+
next if chunk.empty?
|
30
|
+
domain, sequence = chunk.strip.match(/(.*)\n(.*)/).values_at 1, 2
|
31
|
+
tsv[domain] = sequence
|
32
|
+
end
|
33
|
+
|
34
|
+
tsv.to_s
|
35
|
+
end
|
36
|
+
|
37
|
+
|
38
|
+
Rbbt.claim Rbbt.share.databases.CATH.CathRegions, :proc do
|
39
|
+
domains = TSV.setup({}, :key_field => "Cath Domain", :type => :double, :fields => ["Start", "End"])
|
40
|
+
Open.read("http://release.cathdb.info/v3.4.0/CathDomall").split(/\n/).each do |line|
|
41
|
+
next if line =~ /^#/
|
42
|
+
chain, ndomains, nfragments, rest = line.match(/(\w\w\w\w\w)\s+D(\d+)\s+F(\d+)\s+(.*)/).values_at 1,2,3,4
|
43
|
+
|
44
|
+
ndomains.to_i.times do |dn|
|
45
|
+
nsegments, rest = rest.match(/^\s*(\d+)\s+(.*)/).values_at 1, 2
|
46
|
+
segments = []
|
47
|
+
nsegments.to_i.times do |sn|
|
48
|
+
start, eend, rest = rest.match(/\w\s+(-?\d+)\s+.\s+\w\s+(-?\d+)\s+.(.*)/).values_at 1, 2, 3
|
49
|
+
segments << [start, eend]
|
50
|
+
end
|
51
|
+
|
52
|
+
domain = chain + "%02d" % dn.to_i
|
53
|
+
segments = segments[0].zip(*segments[1..-1])
|
54
|
+
domains[domain] = segments
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
domains.to_s
|
59
|
+
end
|
60
|
+
|
61
|
+
Rbbt.claim Rbbt.share.databases.CATH.CathDomainList, :proc do
|
62
|
+
domains = TSV.setup({}, :key_field => "Cath Domain", :type => :double, :fields => ["CATH domain name (seven characters)",
|
63
|
+
"Class number", "Architecture number", "Topology number", "Homologous superfamily number", "S35 sequence cluster number",
|
64
|
+
"S60 sequence cluster number", "S95 sequence cluster number", "S100 sequence cluster number", "S100 sequence count number",
|
65
|
+
"Domain length", "Structure resolution (Angstroms)"], :type => :list)
|
66
|
+
|
67
|
+
Open.read("http://release.cathdb.info/v3.4.0/CathDomainList").split(/\n/).each do |line|
|
68
|
+
next if line =~ /^#/
|
69
|
+
parts = line.chomp.split /\s+/
|
70
|
+
domain = parts.shift
|
71
|
+
domains[domain] = parts
|
72
|
+
end
|
73
|
+
|
74
|
+
domains.to_s
|
75
|
+
end
|
76
|
+
|
77
|
+
|
78
|
+
def self.cath_index
|
79
|
+
@@cath ||= Rbbt.share.databases.CATH.CathNames.tsv :persist => true, :case_insensitive => true
|
80
|
+
end
|
81
|
+
|
82
|
+
def self.pdb_index
|
83
|
+
if not defined? @@pdb or @@pdb.nil?
|
84
|
+
@@pdb = {}
|
85
|
+
Rbbt.share.databases.CATH.CathDomainSeqs.read.split("\n").each do |line|
|
86
|
+
domain = line.split(/\t/).first
|
87
|
+
pdb = domain[0..3]
|
88
|
+
@@pdb[pdb] ||= []
|
89
|
+
@@pdb[pdb] << domain
|
90
|
+
end
|
91
|
+
end
|
92
|
+
@@pdb
|
93
|
+
end
|
94
|
+
|
95
|
+
def self.unclassified
|
96
|
+
@@unclassified = {}
|
97
|
+
Rbbt.share.databases.CATH.CathUnclassifiedList.read.split("\n").each do |domain|
|
98
|
+
pdb = domain[0..3]
|
99
|
+
@@unclassified[pdb] ||= []
|
100
|
+
@@unclassified[pdb] << domain
|
101
|
+
end
|
102
|
+
@@unclassified
|
103
|
+
end
|
104
|
+
|
105
|
+
def self.domain_sequences
|
106
|
+
@@domain_sequences ||= Rbbt.share.databases.CATH.CathDomainSeqs.tsv(:persist => true)
|
107
|
+
end
|
108
|
+
|
109
|
+
def self.pdbs(cath_code)
|
110
|
+
cath = cath_index
|
111
|
+
if cath.include? cath_code
|
112
|
+
cath[cath_code]["PDB ID"]
|
113
|
+
else
|
114
|
+
nil
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
def self.domains_for_pdb(pdb)
|
119
|
+
pdb2cath = pdb_index
|
120
|
+
(pdb2cath[pdb] || []) + (unclassified[pdb] || [])
|
121
|
+
end
|
122
|
+
|
123
|
+
def self.align(domain, sequence)
|
124
|
+
require 'bio'
|
125
|
+
|
126
|
+
return nil if not domain_sequences.include? domain
|
127
|
+
|
128
|
+
TmpFile.with_file(">target\n" << sequence) do |target|
|
129
|
+
TmpFile.with_file(">domain\n" << domain_sequences[domain]) do |domain|
|
130
|
+
|
131
|
+
result = CMD.cmd("fasta35 #{ target } #{ domain }").read
|
132
|
+
|
133
|
+
if result.match(/([\d\.]+)% identity.*overlap \((\d+)-(\d+):/s)
|
134
|
+
{:identity => $1.to_f, :range => ($2.to_i..$3.to_i)}
|
135
|
+
else
|
136
|
+
false
|
137
|
+
end
|
138
|
+
end
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -91,22 +91,33 @@ if defined? Entity
|
|
91
91
|
extend Entity
|
92
92
|
self.format = "GO ID"
|
93
93
|
|
94
|
+
self.annotation :organism
|
95
|
+
|
94
96
|
property :name => :array2single do
|
95
97
|
@name ||= GO.id2name(self)
|
96
98
|
end
|
97
99
|
|
98
100
|
property :genes => :array2single do |organism|
|
101
|
+
organism ||= self.organism
|
99
102
|
@genes ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :merge => true).values_at *self
|
100
103
|
end
|
104
|
+
|
105
|
+
property :description => :single2array do
|
106
|
+
description = GO.info[self]['def']
|
107
|
+
description.gsub!(/"|\[.*\]/,'') if description
|
108
|
+
|
109
|
+
description
|
110
|
+
end
|
111
|
+
|
101
112
|
end
|
102
113
|
|
103
114
|
if defined? Gene and Entity === Gene
|
104
115
|
module Gene
|
105
|
-
property :go_terms => :array2single do
|
116
|
+
property :go_terms => :array2single do
|
106
117
|
@go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
|
107
118
|
end
|
108
|
-
|
109
|
-
property :go_bp_terms => :array2single do
|
119
|
+
|
120
|
+
property :go_bp_terms => :array2single do
|
110
121
|
@go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
|
111
122
|
end
|
112
123
|
end
|
data/lib/rbbt/sources/organism.rb
CHANGED
@@ -7,7 +7,7 @@ module Organism
|
|
7
7
|
self.pkgdir = "rbbt"
|
8
8
|
self.subdir = "share/organisms"
|
9
9
|
|
10
|
-
["Hsa", "Rno", "Sce"].each do |organism|
|
10
|
+
["Hsa", "Mmu", "Rno", "Sce"].each do |organism|
|
11
11
|
claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
|
12
12
|
|
13
13
|
module_eval "#{ organism } = with_key '#{organism}'"
|
data/lib/rbbt/sources/pfam.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/tsv'
|
3
|
+
require 'rbbt/resource'
|
4
|
+
|
5
|
+
module Pfam
|
6
|
+
extend Resource
|
7
|
+
self.subdir = "share/databases/Pfam"
|
8
|
+
|
9
|
+
Pfam.claim Pfam.domains, :proc do
|
10
|
+
url = "ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz"
|
11
|
+
tsv = TSV.open(Open.open(url), :key_field => "Pfam Domain ID", :fields => ["Pfam Clan ID", "Code Name", "Name", "Description"])
|
12
|
+
tsv.to_s
|
13
|
+
end
|
14
|
+
|
15
|
+
NAMES_FILE = Rbbt.share.databases.InterPro.pfam_names.find
|
16
|
+
|
17
|
+
def self.name_index
|
18
|
+
@name_index ||= TSV.open NAMES_FILE, :single
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.name(id)
|
22
|
+
name_index[id]
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
if defined? Entity
|
27
|
+
module PfamDomain
|
28
|
+
extend Entity
|
29
|
+
self.format = "Pfam Domain"
|
30
|
+
|
31
|
+
property :name => :array2single do
|
32
|
+
self.collect{|id| Pfam.name(id)}
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
data/lib/rbbt/sources/pubmed.rb
CHANGED
@@ -13,11 +13,13 @@ module PubMed
|
|
13
13
|
|
14
14
|
pmids_complete = pmids.is_a?(Array) ? pmids : [pmids]
|
15
15
|
|
16
|
+
url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
|
16
17
|
articles = []
|
17
|
-
Misc.divide(
|
18
|
-
|
19
|
-
|
20
|
-
|
18
|
+
Misc.divide(pmids.sort, (pmids.length / 1000) + 1) do |pmid_list|
|
19
|
+
postdata = "db=pubmed&retmode=xml&id=#{pmid_list* ","}"
|
20
|
+
xml = TmpFile.with_file(postdata) do |postfile|
|
21
|
+
Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed", "--post-file=" => postfile)
|
22
|
+
end
|
21
23
|
|
22
24
|
articles += xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten
|
23
25
|
end
|
@@ -202,14 +204,8 @@ module PubMed
|
|
202
204
|
}
|
203
205
|
|
204
206
|
return list unless missing.any?
|
205
|
-
chunk_size = [100, missing.length].min
|
206
|
-
chunks = (missing.length.to_f / chunk_size).ceil
|
207
207
|
|
208
|
-
articles =
|
209
|
-
chunks.times do |chunk|
|
210
|
-
pmids = missing[(chunk * chunk_size)..((chunk + 1) *chunk_size)]
|
211
|
-
articles.merge!(get_online(pmids))
|
212
|
-
end
|
208
|
+
articles = get_online(missing)
|
213
209
|
|
214
210
|
articles.each{|p, xml|
|
215
211
|
filename = p + '.xml'
|
data/lib/rbbt/sources/tfacts.rb
CHANGED
data/lib/rbbt/sources/uniprot.rb
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
require 'rbbt/util/open'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
require 'rbbt/sources/cath'
|
4
|
+
require 'rbbt/sources/uniprot'
|
5
|
+
|
6
|
+
module Uniprot
|
7
|
+
extend Resource
|
8
|
+
self.subdir = "share/databases/Uniprot"
|
9
|
+
|
10
|
+
Uniprot.claim Uniprot.annotated_variants, :proc do
|
11
|
+
url = "http://www.uniprot.org/docs/humsavar.txt"
|
12
|
+
tsv = TSV.open(CMD.cmd('tail -n +31 | head -n -4|grep "[[:alpha:]]"', :in => Open.open(url), :pipe => true),
|
13
|
+
:fix => Proc.new{|line| parts = line.split(/\s+/); (parts[0..5] + [(parts[6..-1] || []) * " "]) * "\t"}, :type => :list,:key_field => "Associated Gene Name",
|
14
|
+
:fields => ["Uniprot/SwissProt Accession", "Uniprot Variant ID", "Amino Acid Mutation", "Type of Variant", "SNP ID", "Disease"])
|
15
|
+
|
16
|
+
tsv.unnamed = true
|
17
|
+
tsv.process "Amino Acid Mutation" do |mutation|
|
18
|
+
if mutation.match(/p\.(\w{3})(\d+)(\w{3})/)
|
19
|
+
wt = Misc::THREE_TO_ONE_AA_CODE[$1.downcase]
|
20
|
+
mut = Misc::THREE_TO_ONE_AA_CODE[$3.downcase]
|
21
|
+
[wt, $2, mut] * ""
|
22
|
+
else
|
23
|
+
mutation
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
uniprot_pos = tsv.identify_field "Uniprot/SwissProt Accession"
|
28
|
+
mutation_pos = tsv.identify_field "Amino Acid Mutation"
|
29
|
+
tsv.add_field "Mutated Isoform" do |key, values|
|
30
|
+
[values[uniprot_pos], values[mutation_pos]] * ":"
|
31
|
+
end
|
32
|
+
|
33
|
+
tsv.reorder("Mutated Isoform").to_s
|
34
|
+
end
|
35
|
+
|
36
|
+
|
37
|
+
UNIPROT_TEXT="http://www.uniprot.org/uniprot/[PROTEIN].txt"
|
38
|
+
def self.pdbs(protein)
|
39
|
+
url = UNIPROT_TEXT.sub "[PROTEIN]", protein
|
40
|
+
text = Open.read(url)
|
41
|
+
|
42
|
+
pdb = {}
|
43
|
+
|
44
|
+
text.split(/\n/).each{|l|
|
45
|
+
next unless l =~ /^DR\s+PDB; (.*)\./
|
46
|
+
id, method, resolution, region = $1.split(";").collect{|v| v.strip}
|
47
|
+
chains, start, eend = region.match(/(\w+)=(\d+)-(\d+)/).values_at(1,2,3)
|
48
|
+
pdb[id.downcase] = {:method => method, :resolution => resolution, :region => (start.to_i..eend.to_i), :chains => chains}
|
49
|
+
}
|
50
|
+
pdb
|
51
|
+
end
|
52
|
+
|
53
|
+
def self.variants(protein)
|
54
|
+
url = UNIPROT_TEXT.sub "[PROTEIN]", protein
|
55
|
+
text = Open.read(url)
|
56
|
+
|
57
|
+
text = text.split(/\n/).select{|line| line =~ /^FT/} * "\n"
|
58
|
+
|
59
|
+
parts = text.split(/^(FT \w+)/)
|
60
|
+
parts.shift
|
61
|
+
|
62
|
+
variants = []
|
63
|
+
|
64
|
+
type = nil
|
65
|
+
parts.each do |part|
|
66
|
+
if type.nil?
|
67
|
+
type = part
|
68
|
+
else
|
69
|
+
if type !~ /VARIANT/
|
70
|
+
type = nil
|
71
|
+
next
|
72
|
+
end
|
73
|
+
type = nil
|
74
|
+
|
75
|
+
value = part.gsub("\nFT", '').gsub(/\s+/, ' ')
|
76
|
+
# 291 291 K -> E (in sporadic cancers; somatic mutation). /FTId=VAR_045413.
|
77
|
+
case
|
78
|
+
when value.match(/(\d+) (\d+) ([A-Z])\s*\-\>\s*([A-Z]) (.*)\. \/FTId=(.*)/)
|
79
|
+
start, eend, ref, mut, desc, id = $1, $2, $3, $4, $5, $6
|
80
|
+
when value.match(/(\d+) (\d+) (.*)\. \/FTId=(.*)/)
|
81
|
+
start, eend, ref, mut, desc, id = $1, $2, nil, nil, $3, $4
|
82
|
+
else
|
83
|
+
Log.debug "Value not understood: #{ value }"
|
84
|
+
end
|
85
|
+
variants << {
|
86
|
+
:start => start,
|
87
|
+
:end => eend,
|
88
|
+
:ref => ref,
|
89
|
+
:mut => mut,
|
90
|
+
:desc => desc,
|
91
|
+
:id => id,
|
92
|
+
}
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
variants
|
97
|
+
end
|
98
|
+
|
99
|
+
|
100
|
+
def self.cath(protein)
|
101
|
+
url = UNIPROT_TEXT.sub "[PROTEIN]", protein
|
102
|
+
text = Open.read(url)
|
103
|
+
|
104
|
+
cath = {}
|
105
|
+
text.split(/\n/).each{|l|
|
106
|
+
next unless l =~ /^DR\s+Gene3D; G3DSA:(.*)\./
|
107
|
+
id, description, cuantity = $1.split(";").collect{|v| v.strip}
|
108
|
+
cath[id] = {:description => description, :cuantity => cuantity}
|
109
|
+
}
|
110
|
+
cath
|
111
|
+
end
|
112
|
+
|
113
|
+
def self.cath_domains(protein)
|
114
|
+
pdbs = pdbs(protein).keys.uniq
|
115
|
+
pdbs.collect do |pdb|
|
116
|
+
Cath.domains_for_pdb(pdb)
|
117
|
+
end.flatten.compact
|
118
|
+
end
|
119
|
+
|
120
|
+
def self.pdbs_covering_aa_position(protein, aa_position)
|
121
|
+
Uniprot.pdbs(protein).select do |pdb, info|
|
122
|
+
info[:region].include? aa_position
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
data/share/install/Organism/Hsa/Rakefile
CHANGED
@@ -5,6 +5,7 @@ require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
|
5
5
|
|
6
6
|
$taxs = [9606]
|
7
7
|
$scientific_name = "Homo sapiens"
|
8
|
+
$ortholog_key = "human_ensembl_gene"
|
8
9
|
|
9
10
|
$biomart_db = 'hsapiens_gene_ensembl'
|
10
11
|
$biomart_db_germline_variation = 'hsapiens_snp'
|
@@ -97,9 +98,5 @@ $biomart_go_2009= [
|
|
97
98
|
["GO CC ID", 'go_cellular_component_id'],
|
98
99
|
]
|
99
100
|
|
100
|
-
$biomart_pfam= [
|
101
|
-
["Pfam Domain", 'pfam'],
|
102
|
-
]
|
103
|
-
|
104
101
|
$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
|
105
102
|
load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
|
data/share/install/Organism/Mmu/Rakefile
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
+
require 'rbbt/sources/biomart'
|
3
|
+
require 'rbbt/sources/entrez'
|
4
|
+
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
|
+
|
6
|
+
$taxs = [10090]
|
7
|
+
$scientific_name = "Mus musculus"
|
8
|
+
$ortholog_key = "mouse_ensembl_gene"
|
9
|
+
|
10
|
+
$biomart_db = 'mmusculus_gene_ensembl'
|
11
|
+
$biomart_db_germline_variation = 'mmusculus_snp'
|
12
|
+
$biomart_db_somatic_variation = 'mmusculus_snp_som'
|
13
|
+
|
14
|
+
$biomart_lexicon = [
|
15
|
+
[ 'Associated Gene Name' , "external_gene_id"],
|
16
|
+
[ 'HGNC symbol', "hgnc_symbol" ],
|
17
|
+
[ 'HGNC automatic gene name', "hgnc_automatic_gene_name" ],
|
18
|
+
[ 'HGNC curated gene name ', "hgnc_curated_gene_name" ],
|
19
|
+
]
|
20
|
+
|
21
|
+
$biomart_protein_identifiers = [
|
22
|
+
[ 'Protein ID', "protein_id" ],
|
23
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
24
|
+
[ 'Unigene ID', "unigene" ],
|
25
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
26
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
27
|
+
]
|
28
|
+
|
29
|
+
$biomart_probe_identifiers = [
|
30
|
+
]
|
31
|
+
|
32
|
+
$biomart_identifiers = [
|
33
|
+
[ 'Entrez Gene ID', "entrezgene"],
|
34
|
+
[ 'Ensembl Protein ID', "ensembl_peptide_id" ],
|
35
|
+
[ 'Associated Gene Name', "external_gene_id" ],
|
36
|
+
[ 'CCDS ID', "ccds" ],
|
37
|
+
[ 'Protein ID', "protein_id" ],
|
38
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
39
|
+
[ 'Unigene ID', "unigene" ],
|
40
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
41
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
42
|
+
[ 'EMBL (Genbank) ID' , "embl"] ,
|
43
|
+
]
|
44
|
+
|
45
|
+
$biomart_go= [
|
46
|
+
["GO ID", 'go_id'],
|
47
|
+
["GO Namespace", 'namespace_1003'],
|
48
|
+
]
|
49
|
+
|
50
|
+
$biomart_go_2009= [
|
51
|
+
["GO BP ID", 'go_biological_process_id'],
|
52
|
+
["GO MF ID", 'go_molecular_function_id'],
|
53
|
+
["GO CC ID", 'go_cellular_component_id'],
|
54
|
+
]
|
55
|
+
|
56
|
+
$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
|
57
|
+
load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
|
data/share/install/Organism/Rno/Rakefile
CHANGED
@@ -9,6 +9,7 @@ $scientific_name = "Rattus norvegicus"
|
|
9
9
|
$biomart_db = 'rnorvegicus_gene_ensembl'
|
10
10
|
$biomart_db_germline_variation = 'rnorvegicus_snp'
|
11
11
|
$biomart_db_somatic_variation = 'rnorvegicus_snp_som'
|
12
|
+
$ortholog_key = "rat_ensembl_gene"
|
12
13
|
|
13
14
|
$biomart_lexicon = [
|
14
15
|
[ 'Associated Gene Name' , "external_gene_id"],
|
data/share/install/Organism/Sce/Rakefile
CHANGED
@@ -8,6 +8,7 @@ $native = "SGD ID"
|
|
8
8
|
$url = "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab"
|
9
9
|
$biomart_db = 'scerevisiae_gene_ensembl'
|
10
10
|
$biomart_main = ['Entrez Gene ID', 'entrezgene']
|
11
|
+
$ortholog_key = "yeast_ensembl_gene"
|
11
12
|
|
12
13
|
|
13
14
|
file 'scientific_name' do |t|
|
data/share/install/Organism/organism_helpers.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'net/ftp'
|
2
|
+
|
1
3
|
$biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
|
2
4
|
$biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
|
3
5
|
$biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
|
@@ -56,7 +58,9 @@ $biomart_exon_phase = [
|
|
56
58
|
['Phase','phase'],
|
57
59
|
]
|
58
60
|
|
59
|
-
|
61
|
+
$biomart_pfam= [
|
62
|
+
["Pfam Domain", 'pfam'],
|
63
|
+
]
|
60
64
|
|
61
65
|
$biomart_exons = [
|
62
66
|
$biomart_ensembl_gene,
|
@@ -71,6 +75,12 @@ file 'scientific_name' do |t|
|
|
71
75
|
File.open(t.name, 'w') do |f| f.write $scientific_name end
|
72
76
|
end
|
73
77
|
|
78
|
+
file 'ortholog_key' do |t|
|
79
|
+
raise "Ortholog key not defined. Set up $ortholog_key in the organism specific Rakefile; example $ortholog_key = 'human_ensembl_gene'" unless defined? $ortholog_key and not $ortholog_key.nil?
|
80
|
+
|
81
|
+
File.open(t.name, 'w') do |f| f.write $ortholog_key end
|
82
|
+
end
|
83
|
+
|
74
84
|
file 'identifiers' do |t|
|
75
85
|
identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => $namespace)
|
76
86
|
identifiers.unnamed = true
|
@@ -456,6 +466,49 @@ file 'chromosomes' do |t|
|
|
456
466
|
File.open(t.name, 'w') do |f| f.puts goterms end
|
457
467
|
end
|
458
468
|
|
469
|
+
rule /^chromosome_.*/ do |t|
|
470
|
+
chr = t.name.match(/chromosome_(.*)/)[1]
|
471
|
+
|
472
|
+
archive = File.basename(FileUtils.pwd) =~ /^([a-z]{3}[0-9]{4})$/i ? $1 : nil
|
473
|
+
|
474
|
+
release = case archive
|
475
|
+
when "may2009"
|
476
|
+
"release-54"
|
477
|
+
when "jun2011"
|
478
|
+
"release-64"
|
479
|
+
when nil
|
480
|
+
Open.read("http://www.ensembl.org/info/data/ftp/index.html", :nocache => true).match(/pub\/(\w+-\d+)\/fasta/)[1]
|
481
|
+
end
|
482
|
+
|
483
|
+
|
484
|
+
ftp = Net::FTP.new("ftp.ensembl.org")
|
485
|
+
ftp.login
|
486
|
+
ftp.chdir("pub/#{ release }/fasta/")
|
487
|
+
ftp.chdir($scientific_name.downcase.sub(" ",'_'))
|
488
|
+
ftp.chdir('dna')
|
489
|
+
file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
|
490
|
+
|
491
|
+
raise "Fasta file for chromosome not found: #{ chr } - #{ archive }, #{ release }" if file.nil?
|
492
|
+
|
493
|
+
Log.debug("Downloading chromosome sequence: #{ file }")
|
494
|
+
TmpFile.with_file do |tmpfile|
|
495
|
+
ftp.getbinaryfile(file, tmpfile)
|
496
|
+
Open.write(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
|
497
|
+
ftp.close
|
498
|
+
end
|
499
|
+
end
|
500
|
+
|
501
|
+
rule /^possible_ortholog_(.*)/ do |t|
|
502
|
+
other = t.name.match(/ortholog_(.*)/)[1]
|
503
|
+
other_key = Organism.ortholog_key(other).produce.read
|
504
|
+
BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", "inter_paralog_" + other_key]], [], nil, :keep_empty => false, :type => :flat, :filename => t.name, :namespace => $namespace)
|
505
|
+
end
|
506
|
+
|
507
|
+
rule /^ortholog_(.*)/ do |t|
|
508
|
+
other = t.name.match(/ortholog_(.*)/)[1]
|
509
|
+
other_key = Organism.ortholog_key(other).produce.read
|
510
|
+
BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", other_key]], [], nil, :keep_empty => false, :type => :flat, :filename => t.name, :namespace => $namespace)
|
511
|
+
end
|
459
512
|
|
460
513
|
rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
|
461
514
|
t.name =~ /([a-z]{3}[0-9]{4})\/(.*)/i
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 31
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
-
|
8
|
+
- 2
|
9
9
|
- 0
|
10
|
-
version: 1.
|
10
|
+
version: 1.2.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Miguel Vazquez
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2012-01-13 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -108,20 +108,23 @@ files:
|
|
108
108
|
- lib/rbbt/sources/barcode.rb
|
109
109
|
- lib/rbbt/sources/bibtex.rb
|
110
110
|
- lib/rbbt/sources/biomart.rb
|
111
|
+
- lib/rbbt/sources/cath.rb
|
111
112
|
- lib/rbbt/sources/entrez.rb
|
112
113
|
- lib/rbbt/sources/go.rb
|
113
114
|
- lib/rbbt/sources/gscholar.rb
|
114
115
|
- lib/rbbt/sources/jochem.rb
|
115
116
|
- lib/rbbt/sources/organism.rb
|
116
|
-
- lib/rbbt/sources/
|
117
|
+
- lib/rbbt/sources/pfam.rb
|
117
118
|
- lib/rbbt/sources/polysearch.rb
|
118
119
|
- lib/rbbt/sources/pubmed.rb
|
119
120
|
- lib/rbbt/sources/tfacts.rb
|
121
|
+
- lib/rbbt/sources/uniprot.rb
|
120
122
|
- lib/rbbt/sources/wgEncodeBroadHmm.rb
|
121
123
|
- share/install/InterPro/Rakefile
|
122
124
|
- share/install/JoChem/Rakefile
|
123
125
|
- share/install/NCI/Rakefile
|
124
126
|
- share/install/Organism/Hsa/Rakefile
|
127
|
+
- share/install/Organism/Mmu/Rakefile
|
125
128
|
- share/install/Organism/Rno/Rakefile
|
126
129
|
- share/install/Organism/Sce/Rakefile
|
127
130
|
- share/install/Organism/organism_helpers.rb
|
data/lib/rbbt/sources/organism/sequence.rb
REMOVED
@@ -1,612 +0,0 @@
|
|
1
|
-
require 'rbbt/sources/organism'
|
2
|
-
require 'rbbt/util/workflow'
|
3
|
-
require 'bio'
|
4
|
-
# Sequence analyses
|
5
|
-
module Organism
|
6
|
-
extend WorkFlow
|
7
|
-
relative_to Rbbt, "share/organisms"
|
8
|
-
self.jobdir = Rbbt.var.organism.find
|
9
|
-
|
10
|
-
def self.coding_transcripts_for_exon(org, exon, exon_transcripts, transcript_info)
|
11
|
-
exon_transcripts ||= Organism.transcript_exons(org).tsv(:double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
|
12
|
-
transcript_info ||= Organism.transcripts.tsv(org).tsv(:list, :persistence => true )
|
13
|
-
|
14
|
-
transcripts = begin
|
15
|
-
exon_transcripts[exon].first
|
16
|
-
rescue
|
17
|
-
[]
|
18
|
-
end
|
19
|
-
|
20
|
-
transcripts.select{|transcript| transcript_info[transcript]["Ensembl Protein ID"].any?}
|
21
|
-
end
|
22
|
-
|
23
|
-
def self.codon_at_transcript_position(org, transcript, offset, transcript_sequence = nil, transcript_5utr = nil)
|
24
|
-
transcript_sequence ||= Organism.transcript_sequence(org).tsv(:single, :persistence => true)
|
25
|
-
transcript_5utr ||= Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
|
26
|
-
|
27
|
-
utr5 = transcript_5utr[transcript]
|
28
|
-
|
29
|
-
raise "UTR5 for transcript #{ transcript } was missing" if utr5.nil?
|
30
|
-
|
31
|
-
return nil if utr5 > offset
|
32
|
-
|
33
|
-
sequence = transcript_sequence[transcript]
|
34
|
-
raise "Sequence for transcript #{ transcript } was missing" if sequence.nil? if sequence.nil?
|
35
|
-
|
36
|
-
ccds_offset = offset - utr5
|
37
|
-
return nil if ccds_offset > sequence.length
|
38
|
-
|
39
|
-
range = (utr5..-1)
|
40
|
-
sequence = sequence[range]
|
41
|
-
|
42
|
-
codon = ccds_offset / 3
|
43
|
-
codon_offset = ccds_offset % 3
|
44
|
-
|
45
|
-
[sequence[(codon * 3)..((codon + 1) * 3 - 1)], codon_offset, codon]
|
46
|
-
end
|
47
|
-
|
48
|
-
def self.codon_change(allele, codon, offset)
|
49
|
-
original = Bio::Sequence::NA .new(codon).translate
|
50
|
-
codon = codon.dup
|
51
|
-
codon[offset] = allele
|
52
|
-
new = Bio::Sequence::NA .new(codon).translate
|
53
|
-
[original, new]
|
54
|
-
end
|
55
|
-
|
56
|
-
# Looks up the gene found at one or several positions of a chromosome.
#
# org        - organism code used to open Organism.gene_positions.
# chromosome - chromosome name; converted with to_s.
# positions  - a single position or an Array of positions.
#
# The per-chromosome index is obtained through Persistence.persist (:fwt,
# :range => true) from [gene, [start, end]] rows of that chromosome.
#
# Returns, for each position, the first entry the index reports (nil when it
# reports nothing); an Array when positions is an Array, a single value
# otherwise.
def self.genes_at_chromosome_positions(org, chromosome, positions)
  chromosome = chromosome.to_s
  chromosome_bed = Persistence.persist(Organism.gene_positions(org), "Gene_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => true) do |file, options|
    gene_table = file.tsv(:persistence => false, :type => :list)
    gene_table.select("Chromosome Name" => chromosome).collect do |gene, values|
      gene_start, gene_end = values.values_at("Gene Start", "Gene End")
      [gene, [gene_start.to_i, gene_end.to_i]]
    end
  end

  first_hit = lambda do |position|
    hits = chromosome_bed[position]
    hits.nil? ? nil : hits.first
  end

  return first_hit.call(positions) unless Array === positions

  positions.collect{|position| first_hit.call(position) }
end
|
72
|
-
|
73
|
-
# Resolves genes for [chromosome, position] pairs spread over any number of
# chromosomes.
#
# org       - organism code forwarded to genes_at_chromosome_positions.
# positions - an Array of [chromosome, position] pairs, or one such pair.
#
# Positions are grouped by chromosome so that each chromosome is queried once;
# the answers are placed back at the index of the originating pair.
#
# Returns an Array aligned with the input pairs.
def self.genes_at_genomic_positions(org, positions)
  positions = [positions] unless Array === positions.first

  # chromosome => [[position, index in the input], ...]
  grouped = {}
  positions.each_with_index do |(chr, pos), i|
    (grouped[chr] ||= []) << [pos, i]
  end

  genes = []
  grouped.each do |chr, entries|
    chr_genes = genes_at_chromosome_positions(org, chr, entries.collect{|entry| entry.first })
    chr_genes.zip(entries.collect{|entry| entry.last }).each{|gene, index| genes[index] = gene }
  end

  genes
end
|
93
|
-
|
94
|
-
# Looks up the exons found at one or several positions of a chromosome.
#
# org        - organism code used to open Organism.exons.
# chromosome - chromosome name; converted with to_s.
# positions  - a single position or an Array of positions.
#
# The per-chromosome index is obtained through Persistence.persist (:fwt,
# :range => true) from [exon, [start, end]] rows of that chromosome.
#
# Returns whatever the index reports for each position: an Array of answers
# when positions is an Array, a single answer otherwise.
def self.exons_at_chromosome_positions(org, chromosome, positions)
  chromosome = chromosome.to_s
  chromosome_bed = Persistence.persist(Organism.exons(org), "Exon_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => true) do |file, options|
    exon_table = file.tsv(:persistence => true, :type => :list)
    exon_table.select("Chromosome Name" => chromosome).collect do |exon, values|
      exon_start, exon_end = values.values_at("Exon Chr Start", "Exon Chr End")
      [exon, [exon_start.to_i, exon_end.to_i]]
    end
  end

  return chromosome_bed[positions] unless Array === positions

  positions.collect{|position| chromosome_bed[position] }
end
|
111
|
-
|
112
|
-
|
113
|
-
# Resolves exons for [chromosome, position] pairs spread over any number of
# chromosomes.
#
# org       - organism code forwarded to exons_at_chromosome_positions.
# positions - an Array of [chromosome, position] pairs, or one such pair.
#
# Positions are grouped by chromosome so that each chromosome is queried once;
# the answers are placed back at the index of the originating pair.
#
# Returns an Array aligned with the input pairs.
def self.exons_at_genomic_positions(org, positions)
  positions = [positions] unless Array === positions.first

  # chromosome => [[position, index in the input], ...]
  grouped = {}
  positions.each_with_index do |(chr, pos), i|
    (grouped[chr] ||= []) << [pos, i]
  end

  exons = []
  grouped.each do |chr, entries|
    chr_exons = exons_at_chromosome_positions(org, chr, entries.collect{|entry| entry.first })
    chr_exons.zip(entries.collect{|entry| entry.last }).each{|exon, index| exons[index] = exon }
  end

  exons
end
|
134
|
-
|
135
|
-
# Computes how many bases precede an exon inside a transcript.
#
# org              - organism code, used only to load missing tables.
# exon             - exon whose offset is wanted.
# transcript       - transcript that contains the exon.
# exons            - optional exon table; rows answer values_at("Start", "End").
# transcript_exons - optional transcript table holding the exon list and the
#                    matching rank list (paired with zip_fields).
#
# Returns the summed length of the exons ranked before +exon+, or nil when the
# exon is not part of the transcript.
#
# NOTE(review): other methods of this file read "Exon Chr Start" and
# "Exon Chr End" from Organism.exons; confirm "Start"/"End" resolve to the
# same columns.
def self.exon_offset_in_transcript(org, exon, transcript, exons = nil, transcript_exons = nil)
  exons ||= Organism.exons(org).tsv(:persistence => true)
  transcript_exons ||= Organism.transcript_exons(org).tsv(:double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"], :persistence => true)

  # lengths[rank] is the size of the exon with that rank; slot 0 stays at 0
  lengths = [0]
  target_rank = nil
  transcript_exons[transcript].zip_fields.each do |current, position|
    position = position.to_i
    first_base, last_base = exons[current].values_at("Start", "End")
    lengths[position] = last_base.to_i - first_base.to_i + 1
    target_rank = position if current == exon
  end

  return nil if target_rank.nil?

  lengths[0..target_rank - 1].inject(0){|total, length| length + total }
end
|
155
|
-
|
156
|
-
# Collects, for each exon, the offset it has inside every transcript.
#
# org          - organism code, used only to load exon_offsets when missing.
# exons        - one exon or an Array of exons.
# exon_offsets - optional exon table holding a transcript list and the matching
#                offset list (paired with zip_fields).
# exon_info    - accepted for backward compatibility only; this method never
#                reads it, so no table is loaded on its behalf any more.
#
# Returns { exon => { transcript => Integer offset } }. Exons absent from
# exon_offsets map to an empty Hash; blank transcript names are skipped.
def self.exon_transcript_offsets(org, exons, exon_offsets = nil, exon_info = nil)
  exon_offsets ||= Organism.exon_offsets(org).tsv(:double, :persistence => true)

  exons = [exons] unless Array === exons
  transcript_offsets = {}
  exons.each do |exon|
    transcript_offsets[exon] ||= {}
    next unless exon_offsets.include? exon

    exon_offsets[exon].zip_fields.each do |transcript, offset|
      next if transcript.nil? or transcript.empty?
      transcript_offsets[exon][transcript] = offset.to_i
    end
  end

  transcript_offsets
end
|
176
|
-
|
177
|
-
# Maps genomic positions to offsets inside the transcripts that cover them.
#
# org          - organism code, used to load missing tables and forwarded to
#                the exon lookups.
# positions    - an Array of [chromosome, position] pairs, or one such pair.
# exon_offsets - optional table forwarded to Organism.exon_transcript_offsets.
# exon_start   - optional exon => start coordinate table.
# exon_end     - optional exon => end coordinate table.
# exon_strand  - optional exon => strand table (1 is treated as forward,
#                anything else as reverse).
#
# Returns { [chromosome, position] => { transcript => [offset, strand] } }.
# Positions that hit no exon are left out.
def self.genomic_position_transcript_offsets(org, positions, exon_offsets = nil, exon_start = nil, exon_end = nil, exon_strand = nil)
  exon_offsets ||= Organism.exon_offsets(org).tsv(:double, :persistence => true)
  exon_start ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
  exon_end ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
  exon_strand ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)

  # Same normalization exons_at_genomic_positions applies, so that the zip
  # below pairs whole [chromosome, position] entries with their exons.
  positions = [positions] unless Array === positions.first

  exons = exons_at_genomic_positions(org, positions)
  # The fourth (exon_info) argument is optional; there is no such value in
  # this method, so it is simply not passed.
  offsets = Organism.exon_transcript_offsets(org, exons.flatten.uniq, exon_offsets)

  position_exons = {}
  positions.zip(exons).each do |position,pos_exons| position_exons[position] = pos_exons end

  position_offsets = {}
  position_exons.each do |position,pos_exons|
    chr, pos = position
    next if pos_exons.nil? or pos_exons.empty?
    pos_exons.each do |exon|
      next unless offsets.include? exon

      # Offsets are counted from the exon start on the forward strand and
      # from the exon end on the reverse strand.
      if exon_strand[exon] == 1
        offset_in_exon = (pos.to_i - exon_start[exon].to_i)
      else
        offset_in_exon = (exon_end[exon] - pos.to_i)
      end

      position_offsets[position] ||= {}
      offsets[exon].each do |transcript, offset|
        next if offset.nil?
        position_offsets[position][transcript] = [offset + offset_in_exon, exon_strand[exon]]
      end
    end
  end

  position_offsets
end
|
212
|
-
|
213
|
-
# Finds exon boundaries lying within two bases of chromosome positions.
#
# org        - organism code used to open Organism.exons.
# chromosome - chromosome name; converted with to_s.
# positions  - a single position or an Array of positions (converted with
#              to_i).
#
# Two point indexes are obtained through Persistence.persist (:fwt,
# :range => false): one keyed on "Exon Chr Start", one on "Exon Chr End".
#
# Returns, per position, the start-index hits followed by the end-index hits
# for the window (position - 2)..(position + 2): an Array of such answers when
# positions is an Array, a single answer otherwise.
def self.exon_junctures_at_chromosome_positions(org, chromosome, positions)
  chromosome = chromosome.to_s

  boundary_sources = [["Exon_start", "Exon Chr Start"], ["Exon_end", "Exon Chr End"]]
  chromosome_start, chromosome_end = boundary_sources.collect do |label, field|
    Persistence.persist(Organism.exons(org), "#{label}[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
      exon_table = file.tsv(:persistence => true, :type => :list)
      exon_table.select("Chromosome Name" => chromosome).collect do |exon, values|
        [exon, values[field].to_i]
      end
    end
  end

  nearby = lambda do |position|
    center = position.to_i
    window = (center - 2)..(center + 2)
    chromosome_start[window] + chromosome_end[window]
  end

  return nearby.call(positions) unless Array === positions

  positions.collect{|position| nearby.call(position) }
end
|
239
|
-
|
240
|
-
# Resolves nearby exon boundaries for [chromosome, position] pairs spread over
# any number of chromosomes.
#
# org       - organism code forwarded to exon_junctures_at_chromosome_positions.
# positions - an Array of [chromosome, position] pairs, or one such pair.
#
# Positions are grouped by chromosome so that each chromosome is queried once;
# the answers are placed back at the index of the originating pair.
#
# Returns an Array aligned with the input pairs.
def self.exon_junctures_at_genomic_positions(org, positions)
  positions = [positions] unless Array === positions.first

  # chromosome => [[position, index in the input], ...]
  grouped = {}
  positions.each_with_index do |(chr, pos), i|
    (grouped[chr] ||= []) << [pos, i]
  end

  exons = []
  grouped.each do |chr, entries|
    chr_exons = exon_junctures_at_chromosome_positions(org, chr, entries.collect{|entry| entry.first })
    chr_exons.zip(entries.collect{|entry| entry.last }).each{|exon, index| exons[index] = exon }
  end

  exons
end
|
261
|
-
|
262
|
-
# Looks up known variations at one or several positions of a chromosome.
#
# org        - organism code (not used by this method itself).
# chromosome - chromosome name; converted with to_s.
# positions  - a single position or an Array of positions.
# variations - variations file handed to Persistence.persist; its lines are
#              read as tab separated "id, chromosome, position".
#
# The per-chromosome index is obtained through Persistence.persist (:fwt,
# :range => false) from [id, position] rows selected with grep.
#
# Returns whatever the index reports for each position: an Array of answers
# when positions is an Array, a single answer otherwise.
def self.identify_variations_at_chromosome_positions(org, chromosome, positions, variations)
  require 'shellwords' # standard library; needed only to quote the grep call

  chromosome = chromosome.to_s

  chromosome_bed = Persistence.persist(variations, "Variation_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
    rows = []
    chromosome = options[:chromosome]

    # The chromosome comes from user supplied "chr:position" keys, so both the
    # pattern and the path are shell-escaped instead of being pasted verbatim.
    pattern = "[[:space:]]#{chromosome}[[:space:]]"
    f = CMD.cmd("grep #{Shellwords.escape(pattern)} #{Shellwords.escape(file.to_s)}", :pipe => true)
    while not f.eof?
      line = f.gets.chomp
      id, chr, pos = line.split "\t"
      rows << [id, pos.to_i]
    end

    rows
  end

  if Array === positions
    positions.collect{|position| chromosome_bed[position] }
  else
    chromosome_bed[positions]
  end
end
|
286
|
-
|
287
|
-
|
288
|
-
# Resolves known variations for [chromosome, position] pairs spread over any
# number of chromosomes.
#
# org             - organism code forwarded to the per-chromosome lookup.
# positions       - an Array of [chromosome, position] pairs, or one such pair.
# variations_file - variations file forwarded to
#                   identify_variations_at_chromosome_positions.
#
# Positions are grouped by chromosome so that each chromosome is queried once;
# the answers are placed back at the index of the originating pair.
#
# Returns an Array aligned with the input pairs.
def self.identify_variations_at_genomic_positions(org, positions, variations_file)
  positions = [positions] unless Array === positions.first

  # chromosome => [[position, index in the input], ...]
  grouped = {}
  positions.each_with_index do |(chr, pos), i|
    (grouped[chr] ||= []) << [pos, i]
  end

  variations = []
  grouped.each do |chr, entries|
    chr_variations = identify_variations_at_chromosome_positions(org, chr, entries.collect{|entry| entry.first }, variations_file)
    chr_variations.zip(entries.collect{|entry| entry.last }).each{|variation, index| variations[index] = variation }
  end

  variations
end
|
309
|
-
|
310
|
-
# Task: adds an "Exon Junctions" column listing the exon boundaries reported
# by Organism.exon_junctures_at_genomic_positions for each mutation, joined
# with "|". The input is a TSV (or TSV text) keyed by "chr:position".
task_option :organism, "Organism", :string, "Hsa"
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
task_dependencies nil
task :genomic_mutations_in_exon_junctions => :tsv do |org,genomic_mutations|
  # Plain text input is parsed into a :list TSV first
  genomic_mutations = TSV.new(StringIO.new(genomic_mutations), :list) unless TSV === genomic_mutations
  genomic_mutations.key_field ||= "Position"
  genomic_mutations.fields ||= ["Mutation"]

  positions = genomic_mutations.keys.collect{|l| l.split(":")}

  step(:resources, "Load Resources")

  # key ("chr:position") => junctures found for it
  exon_junctures = {}
  junctures = Organism.exon_junctures_at_genomic_positions(org, positions)
  genomic_mutations.keys.each_with_index do |position, i|
    exon_junctures[position] = junctures[i]
  end

  genomic_mutations.add_field "Exon Junctions" do |position, values|
    exon_junctures[position] * "|"
  end

  genomic_mutations.to_s :sort, true
end
|
338
|
-
|
339
|
-
|
340
|
-
# Task: adds a "<organism>:Ensembl Gene ID" column holding the gene reported
# by Organism.genes_at_genomic_positions for each mutation. The input is a TSV
# (or TSV text) keyed by "chr:position".
task_option :organism, "Organism", :string, "Hsa"
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
task_dependencies nil
task :genomic_mutations_to_genes => :tsv do |org,genomic_mutations|
  # Plain text input is parsed into a :list TSV first
  genomic_mutations = TSV.new(StringIO.new(genomic_mutations), :list) unless TSV === genomic_mutations
  genomic_mutations.key_field ||= "Position"
  genomic_mutations.fields ||= ["Mutation"]

  positions = genomic_mutations.keys.collect{|l| l.split(":")}

  step(:resources, "Load Resources")

  # key ("chr:position") => gene found at it
  genes_at_positions = {}
  genomic_mutations.keys.zip(Organism.genes_at_genomic_positions(org, positions)).each do |position, gene|
    genes_at_positions[position] = gene
  end

  genomic_mutations.add_field "#{org.sub(/\/.*/,'')}:Ensembl Gene ID" do |position, values|
    genes_at_positions[position]
  end

  genomic_mutations
end
|
364
|
-
|
365
|
-
|
366
|
-
# Task: for every mutation, finds the transcripts covering its position, the
# codon it falls in, and the amino acid change each allele causes. The result
# is a :double TSV keyed by "chr:position" with the transcript, the protein
# mutation (original, 1-based codon number, new) and the protein identifier.
task_description <<-EOF
Translates a collection of mutations in genomic coordinates into mutations in aminoacids for the
protein products of transcripts including those positions.
EOF
task_option :organism, "Organism", :string, "Hsa"
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
task_dependencies nil
task :genomic_mutations_to_protein_mutations => :tsv do |org,genomic_mutations|
  genomic_mutations = case
                      when TSV === genomic_mutations
                        genomic_mutations
                      else
                        TSV.new StringIO.new(genomic_mutations), :list
                      end

  genomic_mutations.key_field ||= "Position"
  genomic_mutations.fields ||= ["Mutation"]

  positions = genomic_mutations.keys.collect{|l| l.split(":")}

  step(:prepare, "Prepare Results")
  results = TSV.new({})
  results.key_field = "Position"
  results.fields = ["#{org.sub(/\/.*/,'')}:Ensembl Transcript ID", "Protein Mutation"]
  results.type = :double
  results.filename = path

  step(:resources, "Load Resources")
  transcript_sequence = Organism.transcript_sequence(org).tsv(:single, :persistence => true)
  transcript_5utr = Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
  exon_offsets = Organism.exon_offsets(org).tsv(:double, :persistence => true)
  exon_start = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
  exon_end = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
  exon_strand = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
  transcript_to_protein = Organism.transcripts(org).tsv(:single, :fields => "Ensembl Protein ID", :persistence => true)

  step(:offsets, "Find transcripts and offsets for mutations")
  offsets = Organism.genomic_position_transcript_offsets(org, positions, exon_offsets, exon_start, exon_end, exon_strand)

  step(:aminoacid, "Translate mutation to amino acid substitutions")
  offsets.each do |position, transcripts|
    pos_code = position * ":"

    if genomic_mutations.type === :double
      alleles = genomic_mutations[pos_code]["Mutation"].collect{|mutation| Misc.IUPAC_to_base(mutation)}.compact.flatten
    else
      alleles = Misc.IUPAC_to_base(genomic_mutations[pos_code]["Mutation"]) || []
    end

    transcripts.each do |transcript, offset_info|
      offset, strand = offset_info
      codon = begin
                Organism.codon_at_transcript_position(org, transcript, offset, transcript_sequence, transcript_5utr)
              rescue
                Log.medium $!.message
                next
              end

      next if codon.nil? or codon.empty?

      codon_sequence, codon_offset, codon_index = codon
      # Offsets past the end of the sequence give no bases to substitute
      next if codon_sequence.nil? or codon_offset >= codon_sequence.length

      alleles.each do |allele|
        # The strand table is cast with :to_i, so compare through to_s: this
        # matches both -1 and "-1". Comparing with the String alone never
        # matched and reverse-strand alleles were left uncomplemented.
        allele = Misc::BASE2COMPLEMENT[allele] if strand.to_s == "-1"
        change = Organism.codon_change(allele, codon_sequence, codon_offset)
        mutation = [change.first, codon_index + 1, change.last] * ""
        if results.include? pos_code
          results[pos_code] = results[pos_code].merge [transcript, mutation]
        else
          results[pos_code] = [[transcript], [mutation]]
        end
      end
    end
  end

  step(:identify_proteins, "Identify Proteins for Transcripts")
  transcript_field = results.identify_field "Ensembl Transcript ID"
  results.add_field "#{org.sub(/\/.*/,'')}:Ensembl Protein ID" do |key,values|
    values[transcript_field].collect do |transcript| transcript_to_protein[transcript] end
  end

  results
end
|
448
|
-
|
449
|
-
|
450
|
-
# Task: adds a "Germline SNP Id" column with the identifiers that
# Organism.identify_variations_at_genomic_positions reports, against
# Organism.germline_variations(org), for each mutation; several identifiers
# are joined with "|". The input is a TSV (or TSV text) keyed by
# "chr:position".
task_option :organism, "Organism", :string, "Hsa"
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
task_dependencies nil
task :identify_germline_variations => :tsv do |org,genomic_mutations|
  # Plain text input is parsed into a :list TSV first
  genomic_mutations = TSV.new(StringIO.new(genomic_mutations), :list) unless TSV === genomic_mutations

  genomic_mutations.key_field ||= "Position"
  genomic_mutations.fields ||= ["Mutation"]

  positions = genomic_mutations.keys.collect{|l| l.split(":")}

  step(:prepare, "Prepare Results")
  # NOTE(review): this table is set up but never filled or returned; the
  # annotated input is what the task hands back.
  results = TSV.new({})
  results.key_field = "Position"
  results.fields = ["SNP Id"]
  results.type = :double
  results.filename = path

  step(:resources, "Load Resources")

  variations_file = Organism.germline_variations(org).produce
  snp_ids = Organism.identify_variations_at_genomic_positions(org, positions, variations_file).collect{|ids| ids * "|"}

  # key ("chr:position") => joined identifiers
  snps_for_positions = {}
  genomic_mutations.keys.zip(snp_ids).each do |position, ids|
    snps_for_positions[position] = ids
  end

  genomic_mutations.add_field "Germline SNP Id" do |position, values|
    snps_for_positions[position]
  end

  genomic_mutations
end
|
486
|
-
|
487
|
-
|
488
|
-
# Task: adds a column with the identifiers that
# Organism.identify_variations_at_genomic_positions reports, against
# Organism.somatic_variations(org), for each mutation; several identifiers are
# joined with "|". The input is a TSV (or TSV text) keyed by "chr:position".
task_option :organism, "Organism", :string, "Hsa"
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
task_dependencies nil
task :identify_somatic_variations => :tsv do |org,genomic_mutations|
  # Plain text input is parsed into a :list TSV first
  genomic_mutations = TSV.new(StringIO.new(genomic_mutations), :list) unless TSV === genomic_mutations

  genomic_mutations.key_field ||= "Position"
  genomic_mutations.fields ||= ["Mutation"]

  positions = genomic_mutations.keys.collect{|l| l.split(":")}

  step(:prepare, "Prepare Results")
  # NOTE(review): this table is set up but never filled or returned; the
  # annotated input is what the task hands back.
  results = TSV.new({})
  results.key_field = "Position"
  results.fields = ["SNP Id"]
  results.type = :double
  results.filename = path

  step(:resources, "Load Resources")

  variations_file = Organism.somatic_variations(org).produce
  snp_ids = Organism.identify_variations_at_genomic_positions(org, positions, variations_file).collect{|ids| ids * "|"}

  # key ("chr:position") => joined identifiers
  snps_for_positions = {}
  genomic_mutations.keys.zip(snp_ids).each do |position, ids|
    snps_for_positions[position] = ids
  end

  # NOTE(review): the column is labelled "Germline" although the identifiers
  # come from the somatic variations file; the name is kept because consumers
  # of this task may rely on it. Confirm and rename if nothing does.
  genomic_mutations.add_field "Germline SNP Id" do |position, values|
    snps_for_positions[position]
  end

  genomic_mutations
end
|
524
|
-
|
525
|
-
|
526
|
-
end
|
527
|
-
|
528
|
-
# Ad-hoc driver, executed only when this file is run directly. It runs the
# exon junction task on a one-line sample and then the protein mutation task
# on a local Metastasis.tsv file.
if __FILE__ == $0
  require 'rbbt/util/log'
  require 'benchmark'

  select = <<-EOF
3:64581875
  EOF
  select = select.split("\n").collect{|l| l.split(":")}

  picmi_test = <<-EOF
#Chromosome Name Position Reference Tumor
1 100382265 C G
1 100380997 A G
22 30163533 A C
X 10094215 G A
X 10085674 C T
20 50071099 G T
21 19638426 G T
2 230633386 C T
2 230312220 C T
1 100624830 T A
4 30723053 G T
  EOF

  # Build 37
  picmi_test = <<-EOF
#Chromosome Name Position Reference Tumor
1 100624830 T A
21 19638426 G T
  EOF

  exon_juncture_test = <<-EOF
#Position Mutation
7:150753996 T
  EOF


  # The task is declared as :genomic_mutations_in_exon_junctions; the former
  # "..._junctures" spelling named a task that is not defined in this file.
  job = Organism.job :genomic_mutations_in_exon_junctions, "Test1", TSV.new(StringIO.new(exon_juncture_test), :list, :sep => " "), :organism => "Hsa"
  job.run
  job.clean if job.error?
  puts job.messages
  puts job.read

  #  # Build 36
  #  picmi_test = <<-EOF
  ##Chromosome Name Position Reference Tumor
  #3 81780820 T C
  #2 43881517 A T
  #2 43857514 T C
  #6 88375602 G A
  #16 69875502 G T
  #16 69876078 T C
  #16 69877147 G A
  #17 8101874 C T
  #  EOF


  Log.severity = 2
  org = 'Hsa/may2009'
  file = File.join(ENV["HOME"], 'git/rbbt-util/integration_test/data/Metastasis.tsv')

  #positions = TSV.new(StringIO.new(picmi_test), :list, :sep => /\s+/, :fix => Proc.new{|l| l.sub(/\s+/,':')})
  positions = TSV.new(file, :list, :fix => Proc.new{|l| l.sub(/\t/,':')})
  positions.key_field = "Position"
  positions.fields = %w(Reference Control Tumor)
  #positions.fields = %w(Reference Tumor)

  #puts positions.slice(["Reference", "Tumor"]).to_s.split(/\n/).collect{|line| next if line =~ /#/; parts = line.split(/\t|:/); parts[3] = Misc.IUPAC_to_base(parts[3]).first; parts * ","}.compact * "\n"


  #positions = positions.select ["10:98099540"]

  Organism.basedir = Rbbt.tmp.organism.sequence.jobs.find :user
  job = Organism.job :genomic_mutations_to_protein_mutations, "Metastasis", org, positions.slice("Tumor")
  job.run

  # Report the current step every two seconds until the job finishes
  while not job.done?
    puts job.step
    sleep 2
  end

  raise job.messages.last if job.error?
  mutations = job.load

end
|