rbbt-sources 1.1.0 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/sources/NCI.rb +70 -12
- data/lib/rbbt/sources/cath.rb +142 -0
- data/lib/rbbt/sources/go.rb +14 -3
- data/lib/rbbt/sources/organism.rb +1 -1
- data/lib/rbbt/sources/pfam.rb +35 -0
- data/lib/rbbt/sources/pubmed.rb +7 -11
- data/lib/rbbt/sources/tfacts.rb +0 -1
- data/lib/rbbt/sources/uniprot.rb +125 -0
- data/share/install/Organism/Hsa/Rakefile +1 -4
- data/share/install/Organism/Mmu/Rakefile +57 -0
- data/share/install/Organism/Rno/Rakefile +1 -0
- data/share/install/Organism/Sce/Rakefile +1 -0
- data/share/install/Organism/organism_helpers.rb +54 -1
- metadata +8 -5
- data/lib/rbbt/sources/organism/sequence.rb +0 -612
data/lib/rbbt/sources/NCI.rb
CHANGED
@@ -8,57 +8,115 @@ end
|
|
8
8
|
|
9
9
|
if defined? Entity
|
10
10
|
|
11
|
-
module
|
11
|
+
module NCINaturePathway
|
12
12
|
extend Entity
|
13
13
|
self.format = "NCI Nature Pathway ID"
|
14
14
|
|
15
|
+
self.annotation :organism
|
16
|
+
|
17
|
+
def self.name_index
|
18
|
+
@name_index ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["Pathway Name"], :type => :single)
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.gene_index
|
22
|
+
@gene_index ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true)
|
23
|
+
end
|
24
|
+
|
25
|
+
# Decide whether +entity+ (an NCI Nature pathway ID) matches +query+.
#
# The entity matches when it is equal to the query, or when the query is a
# substring of the pathway name obtained by setting up a copy of the entity.
#
# query   - text to look for.
# field   - value passed to setup as the :format option (may be nil).
# options - Hash of extra setup options; nil is treated as an empty Hash.
# entity  - the pathway ID under test.
#
# Returns true or false.
def self.filter(query, field = nil, options = nil, entity = nil)
  return true if query == entity
  # Nothing to look up (or to look for) when either side is missing.
  return false if query.nil? or entity.nil?

  # options defaults to nil, so it must be normalised before merging.
  pathway = self.setup(entity.dup, (options || {}).merge(:format => field))
  name = pathway.name
  # IDs missing from the name index have no name to search in.
  return false if name.nil?

  name.index(query) ? true : false
end
|
15
32
|
property :name => :array2single do
|
16
|
-
@name ||=
|
33
|
+
@name ||= NCINaturePathway.name_index.values_at *self
|
17
34
|
end
|
18
35
|
|
19
36
|
property :genes => :array2single do
|
20
|
-
@genes ||=
|
37
|
+
@genes ||= NCINaturePathway.gene_index.values_at *self
|
21
38
|
end
|
22
39
|
end
|
23
40
|
|
24
|
-
module
|
41
|
+
module NCIReactomePathway
|
25
42
|
extend Entity
|
26
43
|
self.format = "NCI Reactome Pathway ID"
|
44
|
+
|
45
|
+
self.annotation :organism
|
46
|
+
|
47
|
+
def self.name_index
|
48
|
+
@name_index ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name"], :type => :single)
|
49
|
+
end
|
50
|
+
|
51
|
+
def self.gene_index
|
52
|
+
@gene_index ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true)
|
53
|
+
end
|
54
|
+
|
55
|
+
# Decide whether +entity+ (an NCI Reactome pathway ID) matches +query+.
#
# The entity matches when it is equal to the query, or when the query is a
# substring of the pathway name obtained by setting up a copy of the entity.
#
# query   - text to look for.
# field   - value passed to setup as the :format option (may be nil).
# options - Hash of extra setup options; nil is treated as an empty Hash.
# entity  - the pathway ID under test.
#
# Returns true or false.
def self.filter(query, field = nil, options = nil, entity = nil)
  return true if query == entity
  # Nothing to look up (or to look for) when either side is missing.
  return false if query.nil? or entity.nil?

  # options defaults to nil, so it must be normalised before merging.
  pathway = self.setup(entity.dup, (options || {}).merge(:format => field))
  name = pathway.name
  # IDs missing from the name index have no name to search in.
  return false if name.nil?

  name.index(query) ? true : false
end
|
27
62
|
|
28
63
|
property :name => :array2single do
|
29
|
-
@name ||=
|
64
|
+
@name ||= NCIReactomePathway.name_index.values_at *self
|
30
65
|
end
|
31
66
|
|
32
67
|
property :genes => :array2single do
|
33
|
-
@genes ||=
|
68
|
+
@genes ||= NCIReactomePathway.gene_index.values_at *self
|
34
69
|
end
|
35
70
|
end
|
36
71
|
|
37
|
-
module
|
72
|
+
module NCIBioCartaPathway
|
38
73
|
extend Entity
|
39
74
|
self.format = "NCI BioCarta Pathway ID"
|
40
75
|
|
76
|
+
self.annotation :organism
|
77
|
+
|
78
|
+
def self.name_index
|
79
|
+
@name_index ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name"], :type => :single)
|
80
|
+
end
|
81
|
+
|
82
|
+
def self.gene_index
|
83
|
+
@gene_index ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Entrez Gene ID"], :type => :flat, :merge => true)
|
84
|
+
end
|
85
|
+
|
86
|
+
# Decide whether +entity+ (an NCI BioCarta pathway ID) matches +query+.
#
# The entity matches when it is equal to the query, or when the query is a
# substring of the pathway name obtained by setting up a copy of the entity.
#
# query   - text to look for.
# field   - value passed to setup as the :format option (may be nil).
# options - Hash of extra setup options; nil is treated as an empty Hash.
# entity  - the pathway ID under test.
#
# Returns true or false.
def self.filter(query, field = nil, options = nil, entity = nil)
  return true if query == entity
  # Nothing to look up (or to look for) when either side is missing.
  return false if query.nil? or entity.nil?

  # options defaults to nil, so it must be normalised before merging.
  pathway = self.setup(entity.dup, (options || {}).merge(:format => field))
  name = pathway.name
  # IDs missing from the name index have no name to search in.
  return false if name.nil?

  name.index(query) ? true : false
end
|
93
|
+
|
41
94
|
property :name => :array2single do
|
42
|
-
@name ||=
|
95
|
+
@name ||= NCIBioCartaPathway.name_index.values_at *self
|
43
96
|
end
|
44
97
|
|
45
98
|
property :genes => :array2single do
|
46
|
-
@genes ||=
|
99
|
+
@genes ||= NCIBioCartaPathway.gene_index.values_at(*self).
|
100
|
+
each{|pth| pth.organism = organism if pth.respond_to? :organism }
|
47
101
|
end
|
48
102
|
end
|
49
103
|
|
50
104
|
if defined? Gene and Entity === Gene
|
51
105
|
module Gene
|
52
106
|
property :nature_pathways => :array2single do
|
53
|
-
@nature_pathways ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Nature Pathway ID"], :type => :flat, :merge => true).
|
107
|
+
@nature_pathways ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Nature Pathway ID"], :type => :flat, :merge => true).
|
108
|
+
values_at(*self.to("UniProt/SwissProt Accession")).
|
109
|
+
each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCINaturePathway.setup(o, organism)}
|
54
110
|
end
|
55
111
|
|
56
112
|
property :reactome_pathways => :array2single do
|
57
|
-
@reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at
|
113
|
+
@reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at(*self.to("UniProt/SwissProt Accession")).
|
114
|
+
each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCIReactomePathway.setup(o, organism)}
|
58
115
|
end
|
59
116
|
|
60
117
|
property :biocarta_pathways => :array2single do
|
61
|
-
@biocarta_pathways ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "Entrez Gene ID", :fields => ["NCI BioCarta Pathway ID"], :type => :flat, :merge => true).values_at
|
118
|
+
@biocarta_pathways ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "Entrez Gene ID", :fields => ["NCI BioCarta Pathway ID"], :type => :flat, :merge => true).values_at(*self.entrez).
|
119
|
+
each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCIBioCartaPathway.setup(o, organism)}
|
62
120
|
end
|
63
121
|
end
|
64
122
|
end
|
@@ -0,0 +1,142 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
module Cath
|
4
|
+
extend Resource
|
5
|
+
|
6
|
+
# Producer for the CathNames resource: downloads the CATH v3.4.0 names file
# and turns it into a TSV keyed by CATH code, holding the representative PDB
# ID (lower-cased), the three characters that follow it, and the description.
Rbbt.claim Rbbt.share.databases.CATH.CathNames, :proc do
  tsv = TSV.setup({}, :key_field => "CATH Code", :type => :list, :fields => ["PDB ID", "CATH Domain", "CATH Description"])
  Open.read("http://release.cathdb.info/v3.4.0/CathNames").split(/\n/).each do |line|
    next if line =~ /^#/

    match = line.match(/([\d\.]+)\s+(\w\w\w\w)(\w\w\w)\s+:(.*)/)
    # Blank or otherwise unexpected lines carry no record; skip them
    # instead of failing on a nil match.
    next if match.nil?

    code, pdb, domain, name = match.values_at 1,2,3,4
    tsv[code] = [pdb.downcase, domain, name]
  end

  tsv.to_s
end
|
16
|
+
|
17
|
+
# Producer for the CathUnclassifiedList resource: downloads the CATH v3.4.0
# list of unclassified domains and keeps the first whitespace-separated
# token of every data line, one per output line.
Rbbt.claim Rbbt.share.databases.CATH.CathUnclassifiedList , :proc do
  lines = Open.read("http://release.cathdb.info/v3.4.0/CathUnclassifiedList").split(/\n/)

  # Drop comment lines up front: using `next` inside collect would leave a
  # nil per comment, which the join turns into blank output lines.
  domains = lines.reject{|line| line =~ /^#/ }.collect{|line| line.split(/\s/).first }

  # Blank lines give nil and lines with leading whitespace give ""; neither
  # is a domain name.
  domains.compact.reject{|domain| domain.empty? } * "\n"
end
|
23
|
+
|
24
|
+
|
25
|
+
Rbbt.claim Rbbt.share.databases.CATH.CathDomainSeqs, :proc do
|
26
|
+
tsv = TSV.setup({}, :key_field => "CATH Domain", :type => :single, :fields => ["Cath Domain Sequence"])
|
27
|
+
|
28
|
+
Open.read("http://release.cathdb.info/v3.4.0/CathDomainSeqs.ATOM").split(/>pdb\|/).each do |chunk|
|
29
|
+
next if chunk.empty?
|
30
|
+
domain, sequence = chunk.strip.match(/(.*)\n(.*)/).values_at 1, 2
|
31
|
+
tsv[domain] = sequence
|
32
|
+
end
|
33
|
+
|
34
|
+
tsv.to_s
|
35
|
+
end
|
36
|
+
|
37
|
+
|
38
|
+
Rbbt.claim Rbbt.share.databases.CATH.CathRegions, :proc do
|
39
|
+
domains = TSV.setup({}, :key_field => "Cath Domain", :type => :double, :fields => ["Start", "End"])
|
40
|
+
Open.read("http://release.cathdb.info/v3.4.0/CathDomall").split(/\n/).each do |line|
|
41
|
+
next if line =~ /^#/
|
42
|
+
chain, ndomains, nfragments, rest = line.match(/(\w\w\w\w\w)\s+D(\d+)\s+F(\d+)\s+(.*)/).values_at 1,2,3,4
|
43
|
+
|
44
|
+
ndomains.to_i.times do |dn|
|
45
|
+
nsegments, rest = rest.match(/^\s*(\d+)\s+(.*)/).values_at 1, 2
|
46
|
+
segments = []
|
47
|
+
nsegments.to_i.times do |sn|
|
48
|
+
start, eend, rest = rest.match(/\w\s+(-?\d+)\s+.\s+\w\s+(-?\d+)\s+.(.*)/).values_at 1, 2, 3
|
49
|
+
segments << [start, eend]
|
50
|
+
end
|
51
|
+
|
52
|
+
domain = chain + "%02d" % dn.to_i
|
53
|
+
segments = segments[0].zip(*segments[1..-1])
|
54
|
+
domains[domain] = segments
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
domains.to_s
|
59
|
+
end
|
60
|
+
|
61
|
+
# Producer for the CathDomainList resource: downloads the CATH v3.4.0 domain
# list and stores, for every domain, the remaining whitespace-separated
# columns of its line.
#
# The first column of each line (the domain name) is used as the key, so it
# is not repeated in :fields; listing it there left one more header than
# values and shifted every value one column to the left.
# NOTE(review): the remaining eleven headers assume the column order of the
# CathDomainList file - confirm against the CATH README.
Rbbt.claim Rbbt.share.databases.CATH.CathDomainList, :proc do
  fields = ["Class number", "Architecture number", "Topology number", "Homologous superfamily number",
    "S35 sequence cluster number", "S60 sequence cluster number", "S95 sequence cluster number",
    "S100 sequence cluster number", "S100 sequence count number", "Domain length",
    "Structure resolution (Angstroms)"]

  # A single :type entry; the original hash literal gave both :double and
  # :list, of which only the last (:list) took effect.
  domains = TSV.setup({}, :key_field => "Cath Domain", :fields => fields, :type => :list)

  Open.read("http://release.cathdb.info/v3.4.0/CathDomainList").split(/\n/).each do |line|
    next if line =~ /^#/
    parts = line.chomp.split(/\s+/)
    # Blank lines would otherwise be stored under a nil key.
    next if parts.empty?
    domain = parts.shift
    domains[domain] = parts
  end

  domains.to_s
end
|
76
|
+
|
77
|
+
|
78
|
+
def self.cath_index
|
79
|
+
@@cath ||= Rbbt.share.databases.CATH.CathNames.tsv :persist => true, :case_insensitive => true
|
80
|
+
end
|
81
|
+
|
82
|
+
# Index of the domains found in the CathDomainSeqs resource, grouped by the
# first four characters of the domain name (used as the PDB ID).
#
# The index is built on first use and memoized.
#
# Returns a Hash of PDB ID => Array of domain names.
def self.pdb_index
  @pdb_index ||= begin
    index = {}
    Rbbt.share.databases.CATH.CathDomainSeqs.read.split("\n").each do |line|
      # The resource is written with TSV#to_s, so it starts with '#' header
      # lines; those are not domains and must not be indexed.
      next if line.empty? or line =~ /^#/

      domain = line.split(/\t/).first
      next if domain.nil? or domain.empty?

      (index[domain[0..3]] ||= []) << domain
    end
    index
  end
end
|
94
|
+
|
95
|
+
# Index of the domains listed in the CathUnclassifiedList resource, grouped
# by the first four characters of the domain name (used as the PDB ID).
#
# The index is built on first use and memoized: domains_for_pdb calls this
# once per PDB, and re-reading the whole list each time is wasted work.
#
# Returns a Hash of PDB ID => Array of domain names.
def self.unclassified
  @unclassified ||= begin
    index = {}
    Rbbt.share.databases.CATH.CathUnclassifiedList.read.split("\n").each do |domain|
      # Blank lines are not domains.
      next if domain.empty?

      (index[domain[0..3]] ||= []) << domain
    end
    index
  end
end
|
104
|
+
|
105
|
+
def self.domain_sequences
|
106
|
+
@@domain_sequences ||= Rbbt.share.databases.CATH.CathDomainSeqs.tsv(:persist => true)
|
107
|
+
end
|
108
|
+
|
109
|
+
def self.pdbs(cath_code)
|
110
|
+
cath = cath_index
|
111
|
+
if cath.include? cath_code
|
112
|
+
cath[cath_code]["PDB ID"]
|
113
|
+
else
|
114
|
+
nil
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
def self.domains_for_pdb(pdb)
|
119
|
+
pdb2cath = pdb_index
|
120
|
+
(pdb2cath[pdb] || []) + (unclassified[pdb] || [])
|
121
|
+
end
|
122
|
+
|
123
|
+
def self.align(domain, sequence)
|
124
|
+
require 'bio'
|
125
|
+
|
126
|
+
return nil if not domain_sequences.include? domain
|
127
|
+
|
128
|
+
TmpFile.with_file(">target\n" << sequence) do |target|
|
129
|
+
TmpFile.with_file(">domain\n" << domain_sequences[domain]) do |domain|
|
130
|
+
|
131
|
+
result = CMD.cmd("fasta35 #{ target } #{ domain }").read
|
132
|
+
|
133
|
+
if result.match(/([\d\.]+)% identity.*overlap \((\d+)-(\d+):/s)
|
134
|
+
{:identity => $1.to_f, :range => ($2.to_i..$3.to_i)}
|
135
|
+
else
|
136
|
+
false
|
137
|
+
end
|
138
|
+
end
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -91,22 +91,33 @@ if defined? Entity
|
|
91
91
|
extend Entity
|
92
92
|
self.format = "GO ID"
|
93
93
|
|
94
|
+
self.annotation :organism
|
95
|
+
|
94
96
|
property :name => :array2single do
|
95
97
|
@name ||= GO.id2name(self)
|
96
98
|
end
|
97
99
|
|
98
100
|
property :genes => :array2single do |organism|
|
101
|
+
organism ||= self.organism
|
99
102
|
@genes ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :merge => true).values_at *self
|
100
103
|
end
|
104
|
+
|
105
|
+
# Textual definition of the GO term: the 'def' entry of its GO.info record
# with double quotes and bracketed text removed.
#
# Returns the cleaned String, or nil when the term has no record or no
# 'def' entry.
property :description => :single2array do
  info = GO.info[self]
  description = info && info['def']

  # Non-destructive gsub: the string belongs to GO.info and may be shared
  # with other callers, so it is not edited in place.
  description = description.gsub(/"|\[.*\]/,'') if description

  description
end
|
111
|
+
|
101
112
|
end
|
102
113
|
|
103
114
|
if defined? Gene and Entity === Gene
|
104
115
|
module Gene
|
105
|
-
property :go_terms => :array2single do
|
116
|
+
property :go_terms => :array2single do
|
106
117
|
@go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
|
107
118
|
end
|
108
|
-
|
109
|
-
property :go_bp_terms => :array2single do
|
119
|
+
|
120
|
+
property :go_bp_terms => :array2single do
|
110
121
|
@go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
|
111
122
|
end
|
112
123
|
end
|
@@ -7,7 +7,7 @@ module Organism
|
|
7
7
|
self.pkgdir = "rbbt"
|
8
8
|
self.subdir = "share/organisms"
|
9
9
|
|
10
|
-
["Hsa", "Rno", "Sce"].each do |organism|
|
10
|
+
["Hsa", "Mmu", "Rno", "Sce"].each do |organism|
|
11
11
|
claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
|
12
12
|
|
13
13
|
module_eval "#{ organism } = with_key '#{organism}'"
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/tsv'
|
3
|
+
require 'rbbt/resource'
|
4
|
+
|
5
|
+
module Pfam
|
6
|
+
extend Resource
|
7
|
+
self.subdir = "share/databases/Pfam"
|
8
|
+
|
9
|
+
Pfam.claim Pfam.domains, :proc do
|
10
|
+
url = "ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz"
|
11
|
+
tsv = TSV.open(Open.open(url), :key_field => "Pfam Domain ID", :fields => ["Pfam Clan ID", "Code Name", "Name", "Description"])
|
12
|
+
tsv.to_s
|
13
|
+
end
|
14
|
+
|
15
|
+
NAMES_FILE = Rbbt.share.databases.InterPro.pfam_names.find
|
16
|
+
|
17
|
+
def self.name_index
|
18
|
+
@name_index ||= TSV.open NAMES_FILE, :single
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.name(id)
|
22
|
+
name_index[id]
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
if defined? Entity
|
27
|
+
module PfamDomain
|
28
|
+
extend Entity
|
29
|
+
self.format = "Pfam Domain"
|
30
|
+
|
31
|
+
property :name => :array2single do
|
32
|
+
self.collect{|id| Pfam.name(id)}
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
data/lib/rbbt/sources/pubmed.rb
CHANGED
@@ -13,11 +13,13 @@ module PubMed
|
|
13
13
|
|
14
14
|
pmids_complete = pmids.is_a?(Array) ? pmids : [pmids]
|
15
15
|
|
16
|
+
url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
|
16
17
|
articles = []
|
17
|
-
Misc.divide(
|
18
|
-
|
19
|
-
|
20
|
-
|
18
|
+
Misc.divide(pmids.sort, (pmids.length / 1000) + 1) do |pmid_list|
|
19
|
+
postdata = "db=pubmed&retmode=xml&id=#{pmid_list* ","}"
|
20
|
+
xml = TmpFile.with_file(postdata) do |postfile|
|
21
|
+
Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed", "--post-file=" => postfile)
|
22
|
+
end
|
21
23
|
|
22
24
|
articles += xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten
|
23
25
|
end
|
@@ -202,14 +204,8 @@ module PubMed
|
|
202
204
|
}
|
203
205
|
|
204
206
|
return list unless missing.any?
|
205
|
-
chunk_size = [100, missing.length].min
|
206
|
-
chunks = (missing.length.to_f / chunk_size).ceil
|
207
207
|
|
208
|
-
articles =
|
209
|
-
chunks.times do |chunk|
|
210
|
-
pmids = missing[(chunk * chunk_size)..((chunk + 1) *chunk_size)]
|
211
|
-
articles.merge!(get_online(pmids))
|
212
|
-
end
|
208
|
+
articles = get_online(missing)
|
213
209
|
|
214
210
|
articles.each{|p, xml|
|
215
211
|
filename = p + '.xml'
|
data/lib/rbbt/sources/tfacts.rb
CHANGED
@@ -0,0 +1,125 @@
|
|
1
|
+
require 'rbbt/util/open'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
require 'rbbt/sources/cath'
|
4
|
+
require 'rbbt/sources/uniprot'
|
5
|
+
|
6
|
+
module Uniprot
|
7
|
+
extend Resource
|
8
|
+
self.subdir = "share/databases/Uniprot"
|
9
|
+
|
10
|
+
Uniprot.claim Uniprot.annotated_variants, :proc do
|
11
|
+
url = "http://www.uniprot.org/docs/humsavar.txt"
|
12
|
+
tsv = TSV.open(CMD.cmd('tail -n +31 | head -n -4|grep "[[:alpha:]]"', :in => Open.open(url), :pipe => true),
|
13
|
+
:fix => Proc.new{|line| parts = line.split(/\s+/); (parts[0..5] + [(parts[6..-1] || []) * " "]) * "\t"}, :type => :list,:key_field => "Associated Gene Name",
|
14
|
+
:fields => ["Uniprot/SwissProt Accession", "Uniprot Variant ID", "Amino Acid Mutation", "Type of Variant", "SNP ID", "Disease"])
|
15
|
+
|
16
|
+
tsv.unnamed = true
|
17
|
+
tsv.process "Amino Acid Mutation" do |mutation|
|
18
|
+
if mutation.match(/p\.(\w{3})(\d+)(\w{3})/)
|
19
|
+
wt = Misc::THREE_TO_ONE_AA_CODE[$1.downcase]
|
20
|
+
mut = Misc::THREE_TO_ONE_AA_CODE[$3.downcase]
|
21
|
+
[wt, $2, mut] * ""
|
22
|
+
else
|
23
|
+
mutation
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
uniprot_pos = tsv.identify_field "Uniprot/SwissProt Accession"
|
28
|
+
mutation_pos = tsv.identify_field "Amino Acid Mutation"
|
29
|
+
tsv.add_field "Mutated Isoform" do |key, values|
|
30
|
+
[values[uniprot_pos], values[mutation_pos]] * ":"
|
31
|
+
end
|
32
|
+
|
33
|
+
tsv.reorder("Mutated Isoform").to_s
|
34
|
+
end
|
35
|
+
|
36
|
+
|
37
|
+
UNIPROT_TEXT="http://www.uniprot.org/uniprot/[PROTEIN].txt"
|
38
|
+
# PDB structures cross-referenced in the UniProt text entry of +protein+.
#
# Reads the entry from UNIPROT_TEXT and parses every "DR PDB;" line into
# its ID, method, resolution and region ("<chains>=<start>-<end>").
# Cross-references without a parseable region (for example "-") are skipped,
# since they say nothing about which residues the structure covers.
#
# protein - UniProt identifier substituted into the entry URL.
#
# Returns a Hash of lower-cased PDB ID => Hash with the keys :method,
# :resolution, :region (Range of Integers) and :chains.
def self.pdbs(protein)
  url = UNIPROT_TEXT.sub "[PROTEIN]", protein
  text = Open.read(url)

  pdb = {}

  text.split(/\n/).each{|l|
    next unless l =~ /^DR\s+PDB; (.*)\./
    id, technique, resolution, region = $1.split(";").collect{|v| v.strip}

    # The region field may be missing or not follow the expected shape.
    coverage = region.nil? ? nil : region.match(/(\w+)=(\d+)-(\d+)/)
    next if coverage.nil?

    chains, start, eend = coverage.values_at(1,2,3)
    pdb[id.downcase] = {:method => technique, :resolution => resolution, :region => (start.to_i..eend.to_i), :chains => chains}
  }
  pdb
end
|
52
|
+
|
53
|
+
# Sequence variants annotated in the feature table (FT lines) of the UniProt
# text entry of +protein+.
#
# The FT lines are split into alternating feature headers ("FT <KEY>") and
# feature bodies; only VARIANT features are kept. A body is understood either
# as a substitution ("<start> <end> X -> Y <description>. /FTId=<id>") or as
# a generic variant without reference and mutated residues.
# Bodies that match neither form are logged and skipped rather than
# reported as a variant whose fields are all nil.
#
# NOTE(review): the patterns are reproduced as found, including the single
# space after "FT" - confirm against the flat-file column layout. The :id
# capture runs to the end of the body, so it keeps any trailing period.
#
# protein - UniProt identifier substituted into the entry URL.
#
# Returns an Array of Hashes with the keys :start, :end, :ref, :mut, :desc
# and :id (all Strings; :ref and :mut are nil for generic variants).
def self.variants(protein)
  url = UNIPROT_TEXT.sub "[PROTEIN]", protein
  text = Open.read(url)

  feature_text = text.split(/\n/).select{|line| line =~ /^FT/} * "\n"

  # Splitting on a captured separator yields: leading text, header, body, ...
  parts = feature_text.split(/^(FT \w+)/)
  parts.shift

  variants = []

  parts.each_slice(2) do |type, part|
    # A trailing header with no body has nothing to parse.
    next if part.nil?
    next if type !~ /VARIANT/

    # Join continuation lines and normalise whitespace.
    value = part.gsub("\nFT", '').gsub(/\s+/, ' ')
    # 291 291 K -> E (in sporadic cancers; somatic mutation). /FTId=VAR_045413.
    case
    when value.match(/(\d+) (\d+) ([A-Z])\s*\-\>\s*([A-Z]) (.*)\. \/FTId=(.*)/)
      start, eend, ref, mut, desc, id = $1, $2, $3, $4, $5, $6
    when value.match(/(\d+) (\d+) (.*)\. \/FTId=(.*)/)
      start, eend, ref, mut, desc, id = $1, $2, nil, nil, $3, $4
    else
      Log.debug "Value not understood: #{ value }"
      next
    end

    variants << {
      :start => start,
      :end => eend,
      :ref => ref,
      :mut => mut,
      :desc => desc,
      :id => id,
    }
  end

  variants
end
|
98
|
+
|
99
|
+
|
100
|
+
def self.cath(protein)
|
101
|
+
url = UNIPROT_TEXT.sub "[PROTEIN]", protein
|
102
|
+
text = Open.read(url)
|
103
|
+
|
104
|
+
cath = {}
|
105
|
+
text.split(/\n/).each{|l|
|
106
|
+
next unless l =~ /^DR\s+Gene3D; G3DSA:(.*)\./
|
107
|
+
id, description, cuantity = $1.split(";").collect{|v| v.strip}
|
108
|
+
cath[id] = {:description => description, :cuantity => cuantity}
|
109
|
+
}
|
110
|
+
cath
|
111
|
+
end
|
112
|
+
|
113
|
+
def self.cath_domains(protein)
|
114
|
+
pdbs = pdbs(protein).keys.uniq
|
115
|
+
pdbs.collect do |pdb|
|
116
|
+
Cath.domains_for_pdb(pdb)
|
117
|
+
end.flatten.compact
|
118
|
+
end
|
119
|
+
|
120
|
+
def self.pdbs_covering_aa_position(protein, aa_position)
|
121
|
+
Uniprot.pdbs(protein).select do |pdb, info|
|
122
|
+
info[:region].include? aa_position
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
@@ -5,6 +5,7 @@ require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
|
5
5
|
|
6
6
|
$taxs = [9606]
|
7
7
|
$scientific_name = "Homo sapiens"
|
8
|
+
$ortholog_key = "human_ensembl_gene"
|
8
9
|
|
9
10
|
$biomart_db = 'hsapiens_gene_ensembl'
|
10
11
|
$biomart_db_germline_variation = 'hsapiens_snp'
|
@@ -97,9 +98,5 @@ $biomart_go_2009= [
|
|
97
98
|
["GO CC ID", 'go_cellular_component_id'],
|
98
99
|
]
|
99
100
|
|
100
|
-
$biomart_pfam= [
|
101
|
-
["Pfam Domain", 'pfam'],
|
102
|
-
]
|
103
|
-
|
104
101
|
$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
|
105
102
|
load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
|
@@ -0,0 +1,57 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
+
require 'rbbt/sources/biomart'
|
3
|
+
require 'rbbt/sources/entrez'
|
4
|
+
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
|
+
|
6
|
+
$taxs = [10090]
|
7
|
+
$scientific_name = "Mus musculus"
|
8
|
+
$ortholog_key = "mouse_ensembl_gene"
|
9
|
+
|
10
|
+
$biomart_db = 'mmusculus_gene_ensembl'
|
11
|
+
$biomart_db_germline_variation = 'mmusculus_snp'
|
12
|
+
$biomart_db_somatic_variation = 'mmusculus_snp_som'
|
13
|
+
|
14
|
+
$biomart_lexicon = [
|
15
|
+
[ 'Associated Gene Name' , "external_gene_id"],
|
16
|
+
[ 'HGNC symbol', "hgnc_symbol" ],
|
17
|
+
[ 'HGNC automatic gene name', "hgnc_automatic_gene_name" ],
|
18
|
+
[ 'HGNC curated gene name ', "hgnc_curated_gene_name" ],
|
19
|
+
]
|
20
|
+
|
21
|
+
$biomart_protein_identifiers = [
|
22
|
+
[ 'Protein ID', "protein_id" ],
|
23
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
24
|
+
[ 'Unigene ID', "unigene" ],
|
25
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
26
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
27
|
+
]
|
28
|
+
|
29
|
+
$biomart_probe_identifiers = [
|
30
|
+
]
|
31
|
+
|
32
|
+
$biomart_identifiers = [
|
33
|
+
[ 'Entrez Gene ID', "entrezgene"],
|
34
|
+
[ 'Ensembl Protein ID', "ensembl_peptide_id" ],
|
35
|
+
[ 'Associated Gene Name', "external_gene_id" ],
|
36
|
+
[ 'CCDS ID', "ccds" ],
|
37
|
+
[ 'Protein ID', "protein_id" ],
|
38
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
39
|
+
[ 'Unigene ID', "unigene" ],
|
40
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
41
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
42
|
+
[ 'EMBL (Genbank) ID' , "embl"] ,
|
43
|
+
]
|
44
|
+
|
45
|
+
$biomart_go= [
|
46
|
+
["GO ID", 'go_id'],
|
47
|
+
["GO Namespace", 'namespace_1003'],
|
48
|
+
]
|
49
|
+
|
50
|
+
$biomart_go_2009= [
|
51
|
+
["GO BP ID", 'go_biological_process_id'],
|
52
|
+
["GO MF ID", 'go_molecular_function_id'],
|
53
|
+
["GO CC ID", 'go_cellular_component_id'],
|
54
|
+
]
|
55
|
+
|
56
|
+
$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
|
57
|
+
load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
|
@@ -9,6 +9,7 @@ $scientific_name = "Rattus norvegicus"
|
|
9
9
|
$biomart_db = 'rnorvegicus_gene_ensembl'
|
10
10
|
$biomart_db_germline_variation = 'rnorvegicus_snp'
|
11
11
|
$biomart_db_somatic_variation = 'rnorvegicus_snp_som'
|
12
|
+
$ortholog_key = "rat_ensembl_gene"
|
12
13
|
|
13
14
|
$biomart_lexicon = [
|
14
15
|
[ 'Associated Gene Name' , "external_gene_id"],
|
@@ -8,6 +8,7 @@ $native = "SGD ID"
|
|
8
8
|
$url = "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab"
|
9
9
|
$biomart_db = 'scerevisiae_gene_ensembl'
|
10
10
|
$biomart_main = ['Entrez Gene ID', 'entrezgene']
|
11
|
+
$ortholog_key = "yeast_ensembl_gene"
|
11
12
|
|
12
13
|
|
13
14
|
file 'scientific_name' do |t|
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'net/ftp'
|
2
|
+
|
1
3
|
$biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
|
2
4
|
$biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
|
3
5
|
$biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
|
@@ -56,7 +58,9 @@ $biomart_exon_phase = [
|
|
56
58
|
['Phase','phase'],
|
57
59
|
]
|
58
60
|
|
59
|
-
|
61
|
+
$biomart_pfam= [
|
62
|
+
["Pfam Domain", 'pfam'],
|
63
|
+
]
|
60
64
|
|
61
65
|
$biomart_exons = [
|
62
66
|
$biomart_ensembl_gene,
|
@@ -71,6 +75,12 @@ file 'scientific_name' do |t|
|
|
71
75
|
File.open(t.name, 'w') do |f| f.write $scientific_name end
|
72
76
|
end
|
73
77
|
|
78
|
+
file 'ortholog_key' do |t|
|
79
|
+
raise "Ortholog key not defined. Set up $ortholog_key in the organism specific Rakefile; example $ortholog_key = 'human_ensembl_gene'" unless defined? $ortholog_key and not $ortholog_key.nil?
|
80
|
+
|
81
|
+
File.open(t.name, 'w') do |f| f.write $ortholog_key end
|
82
|
+
end
|
83
|
+
|
74
84
|
file 'identifiers' do |t|
|
75
85
|
identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => $namespace)
|
76
86
|
identifiers.unnamed = true
|
@@ -456,6 +466,49 @@ file 'chromosomes' do |t|
|
|
456
466
|
File.open(t.name, 'w') do |f| f.puts goterms end
|
457
467
|
end
|
458
468
|
|
469
|
+
# Rule for files named chromosome_<name>: downloads the DNA FASTA file of
# that chromosome from the Ensembl FTP site and writes the sequence alone
# (header line and all whitespace removed) to the target file.
#
# The Ensembl release is derived from the name of the current directory when
# it looks like an archive (three letters and four digits, e.g. "may2009");
# otherwise the release advertised on the Ensembl FTP index page is used.
rule /^chromosome_.*/ do |t|
  chr = t.name.match(/chromosome_(.*)/)[1]

  archive = File.basename(FileUtils.pwd) =~ /^([a-z]{3}[0-9]{4})$/i ? $1 : nil

  release = case archive
            when "may2009"
              "release-54"
            when "jun2011"
              "release-64"
            when nil
              Open.read("http://www.ensembl.org/info/data/ftp/index.html", :nocache => true).match(/pub\/(\w+-\d+)\/fasta/)[1]
            else
              # Without this the path below would silently become "pub//fasta/".
              raise "Ensembl release not known for archive: #{ archive }"
            end

  ftp = Net::FTP.new("ftp.ensembl.org")
  begin
    ftp.login
    ftp.chdir("pub/#{ release }/fasta/")
    # gsub so that names with more than two words are fully converted.
    ftp.chdir($scientific_name.downcase.gsub(" ",'_'))
    ftp.chdir('dna')

    # The chromosome name is taken literally, not as a pattern.
    file = ftp.nlst.select{|candidate| candidate =~ /chromosome\.#{ Regexp.escape(chr) }\.fa/}.first

    raise "Fasta file for chromosome not found: #{ chr } - #{ archive }, #{ release }" if file.nil?

    Log.debug("Downloading chromosome sequence: #{ file }")
    TmpFile.with_file do |tmpfile|
      ftp.getbinaryfile(file, tmpfile)
      Open.write(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
    end
  ensure
    # Release the connection on failures too, not only after a successful download.
    ftp.close
  end
end
|
500
|
+
|
501
|
+
rule /^possible_ortholog_(.*)/ do |t|
|
502
|
+
other = t.name.match(/ortholog_(.*)/)[1]
|
503
|
+
other_key = Organism.ortholog_key(other).produce.read
|
504
|
+
BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", "inter_paralog_" + other_key]], [], nil, :keep_empty => false, :type => :flat, :filename => t.name, :namespace => $namespace)
|
505
|
+
end
|
506
|
+
|
507
|
+
rule /^ortholog_(.*)/ do |t|
|
508
|
+
other = t.name.match(/ortholog_(.*)/)[1]
|
509
|
+
other_key = Organism.ortholog_key(other).produce.read
|
510
|
+
BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", other_key]], [], nil, :keep_empty => false, :type => :flat, :filename => t.name, :namespace => $namespace)
|
511
|
+
end
|
459
512
|
|
460
513
|
rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
|
461
514
|
t.name =~ /([a-z]{3}[0-9]{4})\/(.*)/i
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 31
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
-
|
8
|
+
- 2
|
9
9
|
- 0
|
10
|
-
version: 1.
|
10
|
+
version: 1.2.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Miguel Vazquez
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2012-01-13 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -108,20 +108,23 @@ files:
|
|
108
108
|
- lib/rbbt/sources/barcode.rb
|
109
109
|
- lib/rbbt/sources/bibtex.rb
|
110
110
|
- lib/rbbt/sources/biomart.rb
|
111
|
+
- lib/rbbt/sources/cath.rb
|
111
112
|
- lib/rbbt/sources/entrez.rb
|
112
113
|
- lib/rbbt/sources/go.rb
|
113
114
|
- lib/rbbt/sources/gscholar.rb
|
114
115
|
- lib/rbbt/sources/jochem.rb
|
115
116
|
- lib/rbbt/sources/organism.rb
|
116
|
-
- lib/rbbt/sources/
|
117
|
+
- lib/rbbt/sources/pfam.rb
|
117
118
|
- lib/rbbt/sources/polysearch.rb
|
118
119
|
- lib/rbbt/sources/pubmed.rb
|
119
120
|
- lib/rbbt/sources/tfacts.rb
|
121
|
+
- lib/rbbt/sources/uniprot.rb
|
120
122
|
- lib/rbbt/sources/wgEncodeBroadHmm.rb
|
121
123
|
- share/install/InterPro/Rakefile
|
122
124
|
- share/install/JoChem/Rakefile
|
123
125
|
- share/install/NCI/Rakefile
|
124
126
|
- share/install/Organism/Hsa/Rakefile
|
127
|
+
- share/install/Organism/Mmu/Rakefile
|
125
128
|
- share/install/Organism/Rno/Rakefile
|
126
129
|
- share/install/Organism/Sce/Rakefile
|
127
130
|
- share/install/Organism/organism_helpers.rb
|
@@ -1,612 +0,0 @@
|
|
1
|
-
require 'rbbt/sources/organism'
|
2
|
-
require 'rbbt/util/workflow'
|
3
|
-
require 'bio'
|
4
|
-
# Sequence analyses
|
5
|
-
module Organism
|
6
|
-
extend WorkFlow
|
7
|
-
relative_to Rbbt, "share/organisms"
|
8
|
-
self.jobdir = Rbbt.var.organism.find
|
9
|
-
|
10
|
-
def self.coding_transcripts_for_exon(org, exon, exon_transcripts, transcript_info)
|
11
|
-
exon_transcripts ||= Organism.transcript_exons(org).tsv(:double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
|
12
|
-
transcript_info ||= Organism.transcripts.tsv(org).tsv(:list, :persistence => true )
|
13
|
-
|
14
|
-
transcripts = begin
|
15
|
-
exon_transcripts[exon].first
|
16
|
-
rescue
|
17
|
-
[]
|
18
|
-
end
|
19
|
-
|
20
|
-
transcripts.select{|transcript| transcript_info[transcript]["Ensembl Protein ID"].any?}
|
21
|
-
end
|
22
|
-
|
23
|
-
def self.codon_at_transcript_position(org, transcript, offset, transcript_sequence = nil, transcript_5utr = nil)
|
24
|
-
transcript_sequence ||= Organism.transcript_sequence(org).tsv(:single, :persistence => true)
|
25
|
-
transcript_5utr ||= Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
|
26
|
-
|
27
|
-
utr5 = transcript_5utr[transcript]
|
28
|
-
|
29
|
-
raise "UTR5 for transcript #{ transcript } was missing" if utr5.nil?
|
30
|
-
|
31
|
-
return nil if utr5 > offset
|
32
|
-
|
33
|
-
sequence = transcript_sequence[transcript]
|
34
|
-
raise "Sequence for transcript #{ transcript } was missing" if sequence.nil? if sequence.nil?
|
35
|
-
|
36
|
-
ccds_offset = offset - utr5
|
37
|
-
return nil if ccds_offset > sequence.length
|
38
|
-
|
39
|
-
range = (utr5..-1)
|
40
|
-
sequence = sequence[range]
|
41
|
-
|
42
|
-
codon = ccds_offset / 3
|
43
|
-
codon_offset = ccds_offset % 3
|
44
|
-
|
45
|
-
[sequence[(codon * 3)..((codon + 1) * 3 - 1)], codon_offset, codon]
|
46
|
-
end
|
47
|
-
|
48
|
-
def self.codon_change(allele, codon, offset)
|
49
|
-
original = Bio::Sequence::NA .new(codon).translate
|
50
|
-
codon = codon.dup
|
51
|
-
codon[offset] = allele
|
52
|
-
new = Bio::Sequence::NA .new(codon).translate
|
53
|
-
[original, new]
|
54
|
-
end
|
55
|
-
|
56
|
-
# Looks up the gene found at one or several positions of a chromosome.
#
# org        - organism code
# chromosome - chromosome name (converted with to_s)
# positions  - a single position or an Array of positions
#
# Returns the first hit for each position (nil when the index returns nil);
# an Array when +positions+ is an Array, a single value otherwise.
def self.genes_at_chromosome_positions(org, chromosome, positions)
  chromosome = chromosome.to_s

  # Persisted per-chromosome index built from "Gene Start" / "Gene End"
  chromosome_bed = Persistence.persist(Organism.gene_positions(org), "Gene_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => true) do |file, options|
    gene_table = file.tsv(:persistence => false, :type => :list)
    gene_table.select("Chromosome Name" => chromosome).collect do |gene, values|
      [gene, values.values_at("Gene Start", "Gene End").collect{|p| p.to_i}]
    end
  end

  first_hit = lambda do |position|
    hits = chromosome_bed[position]
    hits.nil? ? nil : hits.first
  end

  return first_hit.call(positions) unless Array === positions

  positions.collect{|position| first_hit.call(position)}
end
|
72
|
-
|
73
|
-
# Resolves genes for [chromosome, position] pairs spread over chromosomes.
#
# org       - organism code
# positions - one [chr, pos] pair or an Array of such pairs
#
# Returns an Array aligned with the input order. Each slot holds whatever
# genes_at_chromosome_positions reported for that position.
def self.genes_at_genomic_positions(org, positions)
  positions = [positions] unless Array === positions.first

  # Group positions by chromosome, remembering where each one came from
  by_chromosome = {}
  positions.each_with_index do |info, i|
    chr, pos = info
    group = (by_chromosome[chr] ||= { :positions => [], :indices => [] })
    group[:positions] << pos
    group[:indices] << i
  end

  genes = []
  by_chromosome.each do |chr, group|
    found = genes_at_chromosome_positions(org, chr, group[:positions])
    found.zip(group[:indices]).each{|gene, index| genes[index] = gene}
  end

  genes
end
|
93
|
-
|
94
|
-
# Looks up the exons overlapping one or several positions of a chromosome.
#
# org        - organism code
# chromosome - chromosome name (converted with to_s)
# positions  - a single position or an Array of positions
#
# Returns the index lookup result for each position; an Array of results when
# +positions+ is an Array, a single result otherwise.
def self.exons_at_chromosome_positions(org, chromosome, positions)
  chromosome = chromosome.to_s

  # Persisted per-chromosome index built from "Exon Chr Start" / "Exon Chr End"
  chromosome_bed = Persistence.persist(Organism.exons(org), "Exon_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => true) do |file, options|
    exon_table = file.tsv(:persistence => true, :type => :list)
    exon_table.select("Chromosome Name" => chromosome).collect do |exon, values|
      [exon, values.values_at("Exon Chr Start", "Exon Chr End").collect{|p| p.to_i}]
    end
  end

  return chromosome_bed[positions] unless Array === positions

  positions.collect{|position| chromosome_bed[position]}
end
|
111
|
-
|
112
|
-
|
113
|
-
# Resolves exons for [chromosome, position] pairs spread over chromosomes.
#
# org       - organism code
# positions - one [chr, pos] pair or an Array of such pairs
#
# Returns an Array aligned with the input order. Each slot holds whatever
# exons_at_chromosome_positions reported for that position.
def self.exons_at_genomic_positions(org, positions)
  positions = [positions] unless Array === positions.first

  # Group positions by chromosome, remembering where each one came from
  by_chromosome = {}
  positions.each_with_index do |info, i|
    chr, pos = info
    group = (by_chromosome[chr] ||= { :positions => [], :indices => [] })
    group[:positions] << pos
    group[:indices] << i
  end

  exons = []
  by_chromosome.each do |chr, group|
    found = exons_at_chromosome_positions(org, chr, group[:positions])
    found.zip(group[:indices]).each{|exon, index| exons[index] = exon}
  end

  exons
end
|
134
|
-
|
135
|
-
# Computes how many bases precede +exon+ inside +transcript+, adding up the
# sizes of the exons with a lower "Exon Rank in Transcript".
#
# org              - organism code, only used to load the tables below
# exon             - exon id to locate
# transcript       - transcript id
# exons            - optional preloaded exon table (reads "Start" and "End")
# transcript_exons - optional preloaded transcript => exons/ranks table
#
# Returns the Integer offset, or nil when the exon is not in the transcript.
def self.exon_offset_in_transcript(org, exon, transcript, exons = nil, transcript_exons = nil)
  exons ||= Organism.exons(org).tsv(:persistence => true)
  transcript_exons ||= Organism.transcript_exons(org).tsv(:double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"], :persistence => true)

  # sizes[rank] is the length of the exon with that rank; slot 0 stays 0
  sizes = [0]
  rank = nil
  transcript_exons[transcript].zip_fields.each do |current_exon, current_rank|
    current_rank = current_rank.to_i
    first_base, last_base = exons[current_exon].values_at("Start", "End")
    sizes[current_rank] = last_base.to_i - first_base.to_i + 1
    rank = current_rank if current_exon == exon
  end

  return nil if rank.nil?

  sizes[0..rank - 1].inject(0){|total, size| size + total}
end
|
155
|
-
|
156
|
-
# Collects, for each exon, the offset it has inside each of its transcripts.
#
# org          - organism code, only used to load exon_offsets when not given
# exons        - one exon id or an Array of exon ids
# exon_offsets - optional preloaded exon => (transcripts, offsets) table
# exon_info    - accepted for backward compatibility; this method never read
#                it, so the exon table is no longer loaded here
#
# Returns a Hash of exon => { transcript => Integer offset }. Exons absent
# from exon_offsets map to an empty Hash.
def self.exon_transcript_offsets(org, exons, exon_offsets = nil, exon_info = nil)
  exon_offsets ||= Organism.exon_offsets(org).tsv(:double, :persistence => true)

  exons = [exons] unless Array === exons
  transcript_offsets = {}
  exons.each do |exon|
    transcript_offsets[exon] ||= {}
    next unless exon_offsets.include? exon

    exon_offsets[exon].zip_fields.each do |transcript, offset|
      next if transcript.empty?
      transcript_offsets[exon][transcript] = offset.to_i
    end
  end

  transcript_offsets
end
|
176
|
-
|
177
|
-
# Maps genomic positions to offsets inside the transcripts that include them.
#
# org          - organism code
# positions    - one [chr, pos] pair or an Array of such pairs
# exon_offsets - optional preloaded exon => transcript offsets table
# exon_start   - optional preloaded exon => "Exon Chr Start" table
# exon_end     - optional preloaded exon => "Exon Chr End" table
# exon_strand  - optional preloaded exon => "Exon Strand" table
#
# Returns a Hash of position => { transcript => [offset, strand] }. Positions
# with no exons, or whose exons have no offsets, are left out.
def self.genomic_position_transcript_offsets(org, positions, exon_offsets = nil, exon_start = nil, exon_end = nil, exon_strand = nil)
  exon_offsets ||= Organism.exon_offsets(org).tsv(:double, :persistence => true)
  exon_start ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
  exon_end ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
  exon_strand ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)

  # Normalize a lone [chr, pos] pair so it lines up with the exon list below
  positions = [positions] unless Array === positions.first

  exons = exons_at_genomic_positions(org, positions)
  # The original passed an undefined local `exon_info` as fourth argument,
  # which raised NameError; that argument is optional and is omitted here.
  offsets = Organism.exon_transcript_offsets(org, exons.flatten.uniq, exon_offsets)

  position_exons = {}
  positions.zip(exons).each do |position,pos_exons| position_exons[position] = pos_exons end

  position_offsets = {}
  position_exons.each do |position,pos_exons|
    chr, pos = position
    next if pos_exons.nil? or pos_exons.empty?
    pos_exons.each do |exon|
      next unless offsets.include? exon

      # Distance from the transcribed start of the exon: counted from the
      # exon start on strand 1 and from the exon end otherwise
      if exon_strand[exon] == 1
        offset_in_exon = (pos.to_i - exon_start[exon].to_i)
      else
        offset_in_exon = (exon_end[exon].to_i - pos.to_i)
      end

      position_offsets[position] ||= {}
      offsets[exon].each do |transcript, offset|
        next if offset.nil?
        position_offsets[position][transcript] = [offset + offset_in_exon, exon_strand[exon]]
      end
    end
  end

  position_offsets
end
|
212
|
-
|
213
|
-
# Finds exons whose start or end lies within 2 bases of the given positions.
#
# org        - organism code
# chromosome - chromosome name (converted with to_s)
# positions  - a single position or an Array of positions (converted with to_i)
#
# Returns, per position, the hits from the exon-start index followed by the
# hits from the exon-end index; an Array of such results when +positions+ is
# an Array.
def self.exon_junctures_at_chromosome_positions(org, chromosome, positions)
  chromosome = chromosome.to_s

  chromosome_start = Persistence.persist(Organism.exons(org), "Exon_start[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
    exon_table = file.tsv(:persistence => true, :type => :list)
    exon_table.select("Chromosome Name" => chromosome).collect do |exon, values|
      [exon, values["Exon Chr Start"].to_i]
    end
  end

  chromosome_end = Persistence.persist(Organism.exons(org), "Exon_end[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
    exon_table = file.tsv(:persistence => true, :type => :list)
    exon_table.select("Chromosome Name" => chromosome).collect do |exon, values|
      [exon, values["Exon Chr End"].to_i]
    end
  end

  # Query both indices with a window of two bases on each side
  nearby = lambda do |raw_position|
    center = raw_position.to_i
    chromosome_start[(center - 2)..(center + 2)] + chromosome_end[(center - 2)..(center + 2)]
  end

  return nearby.call(positions) unless Array === positions

  positions.collect{|position| nearby.call(position)}
end
|
239
|
-
|
240
|
-
# Resolves exon junctures for [chromosome, position] pairs.
#
# org       - organism code
# positions - one [chr, pos] pair or an Array of such pairs
#
# Returns an Array aligned with the input order. Each slot holds whatever
# exon_junctures_at_chromosome_positions reported for that position.
def self.exon_junctures_at_genomic_positions(org, positions)
  positions = [positions] unless Array === positions.first

  # Group positions by chromosome, remembering where each one came from
  by_chromosome = {}
  positions.each_with_index do |info, i|
    chr, pos = info
    group = (by_chromosome[chr] ||= { :positions => [], :indices => [] })
    group[:positions] << pos
    group[:indices] << i
  end

  exons = []
  by_chromosome.each do |chr, group|
    found = exon_junctures_at_chromosome_positions(org, chr, group[:positions])
    found.zip(group[:indices]).each{|exon, index| exons[index] = exon}
  end

  exons
end
|
261
|
-
|
262
|
-
# Looks up known variations at one or several positions of a chromosome.
#
# org        - organism code (not used inside this method)
# chromosome - chromosome name (converted with to_s)
# positions  - a single position or an Array of positions
# variations - variations file; rows are read as id, chromosome, position
#              separated by tabs
#
# Returns the index lookup result for each position; an Array of results when
# +positions+ is an Array.
def self.identify_variations_at_chromosome_positions(org, chromosome, positions, variations)
  chromosome = chromosome.to_s

  chromosome_bed = Persistence.persist(variations, "Variation_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
    chromosome = options[:chromosome]
    # NOTE(review): chromosome and file are interpolated into a shell command
    # without escaping; confirm neither can carry untrusted input.
    stream = CMD.cmd("grep '[[:space:]]#{chromosome}[[:space:]]' #{ file }", :pipe => true)

    rows = []
    until stream.eof?
      id, chr, pos = stream.gets.chomp.split "\t"
      rows << [id, pos.to_i]
    end

    rows
  end

  return chromosome_bed[positions] unless Array === positions

  positions.collect{|position| chromosome_bed[position]}
end
|
286
|
-
|
287
|
-
|
288
|
-
# Resolves known variations for [chromosome, position] pairs.
#
# org             - organism code
# positions       - one [chr, pos] pair or an Array of such pairs
# variations_file - file handed to identify_variations_at_chromosome_positions
#
# Returns an Array aligned with the input order. Each slot holds whatever
# identify_variations_at_chromosome_positions reported for that position.
def self.identify_variations_at_genomic_positions(org, positions, variations_file)
  positions = [positions] unless Array === positions.first

  # Group positions by chromosome, remembering where each one came from
  by_chromosome = {}
  positions.each_with_index do |info, i|
    chr, pos = info
    group = (by_chromosome[chr] ||= { :positions => [], :indices => [] })
    group[:positions] << pos
    group[:indices] << i
  end

  variations = []
  by_chromosome.each do |chr, group|
    found = identify_variations_at_chromosome_positions(org, chr, group[:positions], variations_file)
    found.zip(group[:indices]).each{|variation, index| variations[index] = variation}
  end

  variations
end
|
309
|
-
|
310
|
-
task_option :organism, "Organism", :string, "Hsa"
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
task_dependencies nil
# Adds an "Exon Junctions" column holding, for each mutation position, the
# results of Organism.exon_junctures_at_genomic_positions joined with "|".
# The result is the table rendered through to_s.
task :genomic_mutations_in_exon_junctions => :tsv do |org,genomic_mutations|
  # Accept either a ready TSV or its text content
  genomic_mutations = TSV.new(StringIO.new(genomic_mutations), :list) unless TSV === genomic_mutations

  genomic_mutations.key_field ||= "Position"
  genomic_mutations.fields ||= ["Mutation"]

  positions = genomic_mutations.keys.collect{|key| key.split(":")}

  step(:resources, "Load Resources")

  mutation_keys = genomic_mutations.keys
  junctures = Organism.exon_junctures_at_genomic_positions(org, positions)

  exon_junctures = {}
  mutation_keys.each_with_index do |position, i|
    exon_junctures[position] = junctures[i]
  end

  genomic_mutations.add_field "Exon Junctions" do |position, values|
    exon_junctures[position] * "|"
  end

  genomic_mutations.to_s :sort, true
end
|
338
|
-
|
339
|
-
|
340
|
-
task_option :organism, "Organism", :string, "Hsa"
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
task_dependencies nil
# Adds an "<organism>:Ensembl Gene ID" column with the gene reported by
# Organism.genes_at_genomic_positions for each mutation position.
task :genomic_mutations_to_genes => :tsv do |org,genomic_mutations|
  # Accept either a ready TSV or its text content
  genomic_mutations = TSV.new(StringIO.new(genomic_mutations), :list) unless TSV === genomic_mutations

  genomic_mutations.key_field ||= "Position"
  genomic_mutations.fields ||= ["Mutation"]

  positions = genomic_mutations.keys.collect{|key| key.split(":")}

  step(:resources, "Load Resources")
  key_gene_pairs = genomic_mutations.keys.zip(Organism.genes_at_genomic_positions(org, positions))
  genes_at_positions = Hash[*key_gene_pairs.flatten]

  # Only the part of the organism code before the first "/" names the field
  gene_field = "#{org.sub(/\/.*/,'')}:Ensembl Gene ID"
  genomic_mutations.add_field gene_field do |position, values|
    genes_at_positions[position]
  end

  genomic_mutations
end
|
364
|
-
|
365
|
-
|
366
|
-
task_description <<-EOF
Translates a collection of mutations in genomic coordinates into mutations in aminoacids for the
protein products of transcripts including those positions.
EOF
task_option :organism, "Organism", :string, "Hsa"
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
task_dependencies nil
# Produces a table keyed by position with the affected transcripts, the
# protein mutation (original amino acid, 1-based codon number, new amino
# acid) and the protein id of each transcript.
task :genomic_mutations_to_protein_mutations => :tsv do |org,genomic_mutations|
  # Accept either a ready TSV or its text content
  genomic_mutations = case
                      when TSV === genomic_mutations
                        genomic_mutations
                      else
                        TSV.new StringIO.new(genomic_mutations), :list
                      end

  genomic_mutations.key_field ||= "Position"
  genomic_mutations.fields ||= ["Mutation"]

  positions = genomic_mutations.keys.collect{|l| l.split(":")}

  step(:prepare, "Prepare Results")
  results = TSV.new({})
  results.key_field = "Position"
  results.fields = ["#{org.sub(/\/.*/,'')}:Ensembl Transcript ID", "Protein Mutation"]
  results.type = :double
  results.filename = path

  step(:resources, "Load Resources")
  transcript_sequence = Organism.transcript_sequence(org).tsv(:single, :persistence => true)
  transcript_5utr = Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
  exon_offsets = Organism.exon_offsets(org).tsv(:double, :persistence => true)
  exon_start = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
  exon_end = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
  exon_strand = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
  transcript_to_protein = Organism.transcripts(org).tsv(:single, :fields => "Ensembl Protein ID", :persistence => true)

  step(:offsets, "Find transcripts and offsets for mutations")
  offsets = Organism.genomic_position_transcript_offsets(org, positions, exon_offsets, exon_start, exon_end, exon_strand)

  step(:aminoacid, "Translate mutation to amino acid substitutions")
  offsets.each do |position, transcripts|
    if genomic_mutations.type === :double
      alleles = genomic_mutations[position * ":"]["Mutation"].collect{|mutation| Misc.IUPAC_to_base(mutation)}.compact.flatten
    else
      alleles = Misc.IUPAC_to_base(genomic_mutations[position * ":"]["Mutation"]) || []
    end

    transcripts.each do |transcript, offset_info|
      offset, strand = offset_info
      # A transcript with missing sequence or UTR data is logged and skipped
      codon = begin
                Organism.codon_at_transcript_position(org, transcript, offset, transcript_sequence, transcript_5utr)
              rescue
                Log.medium $!.message
                next
              end

      if not codon.nil? and not codon.empty?
        alleles.each do |allele|
          # The strand table is loaded with :cast => :to_i, so the strand is
          # an Integer. The original compared it with the String "-1" and so
          # never complemented reverse-strand alleles. to_i accepts both forms.
          allele = Misc::BASE2COMPLEMENT[allele] if strand.to_i == -1
          change = Organism.codon_change(allele, *codon.values_at(0,1))
          pos_code = position * ":"
          mutation = [change.first, codon.last + 1, change.last] * ""
          if results.include? pos_code
            results[pos_code] = results[pos_code].merge [transcript, mutation]
          else
            results[pos_code] = [[transcript], [mutation]]
          end
        end
      end
    end

  end

  step(:identify_proteins, "Identify Proteins for Transcripts")
  transcript_field = results.identify_field "Ensembl Transcript ID"
  results.add_field "#{org.sub(/\/.*/,'')}:Ensembl Protein ID" do |key,values|
    values[transcript_field].collect do |transcript| transcript_to_protein[transcript] end
  end

  results
end
|
448
|
-
|
449
|
-
|
450
|
-
task_option :organism, "Organism", :string, "Hsa"
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
task_dependencies nil
# Adds a "Germline SNP Id" column with the ids of the germline variations
# found at each mutation position, joined with "|".
task :identify_germline_variations => :tsv do |org,genomic_mutations|
  # Accept either a ready TSV or its text content
  genomic_mutations = TSV.new(StringIO.new(genomic_mutations), :list) unless TSV === genomic_mutations

  genomic_mutations.key_field ||= "Position"
  genomic_mutations.fields ||= ["Mutation"]

  positions = genomic_mutations.keys.collect{|key| key.split(":")}

  step(:prepare, "Prepare Results")
  # This table is configured but the task returns genomic_mutations instead
  results = TSV.new({})
  results.key_field = "Position"
  results.fields = ["SNP Id"]
  results.type = :double
  results.filename = path

  step(:resources, "Load Resources")

  variations_file = Organism.germline_variations(org).produce
  found = Organism.identify_variations_at_genomic_positions(org, positions, variations_file)
  snp_ids = found.collect{|ids| ids * "|"}
  snps_for_positions = Hash[*genomic_mutations.keys.zip(snp_ids).flatten]

  genomic_mutations.add_field "Germline SNP Id" do |position, values|
    snps_for_positions[position]
  end

  genomic_mutations
end
|
486
|
-
|
487
|
-
|
488
|
-
task_option :organism, "Organism", :string, "Hsa"
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
task_dependencies nil
# Adds a "Somatic SNP Id" column with the ids of the somatic variations found
# at each mutation position, joined with "|".
task :identify_somatic_variations => :tsv do |org,genomic_mutations|
  # Accept either a ready TSV or its text content
  genomic_mutations = case
                      when TSV === genomic_mutations
                        genomic_mutations
                      else
                        TSV.new StringIO.new(genomic_mutations), :list
                      end

  genomic_mutations.key_field ||= "Position"
  genomic_mutations.fields ||= ["Mutation"]

  positions = genomic_mutations.keys.collect{|l| l.split(":")}

  step(:prepare, "Prepare Results")
  # This table is configured but the task returns genomic_mutations instead
  results = TSV.new({})
  results.key_field = "Position"
  results.fields = ["SNP Id"]
  results.type = :double
  results.filename = path

  step(:resources, "Load Resources")

  snp_ids = Organism.identify_variations_at_genomic_positions(org, positions, Organism.somatic_variations(org).produce).collect{|ids| ids * "|"}
  snps_for_positions = Hash[*genomic_mutations.keys.zip(snp_ids).flatten]

  # The ids come from the somatic variations resource; the original column was
  # labelled "Germline SNP Id", copied from the germline task.
  genomic_mutations.add_field "Somatic SNP Id" do |position, values|
    snps_for_positions[position]
  end

  genomic_mutations
end
|
524
|
-
|
525
|
-
|
526
|
-
end
|
527
|
-
|
528
|
-
# Manual smoke test, run only when this file is executed directly: runs the
# exon junction task on a one-row table, then the protein mutation task on a
# local Metastasis.tsv file.
if __FILE__ == $0
  require 'rbbt/util/log'
  require 'benchmark'

  select = <<-EOF
3:64581875
  EOF
  select = select.split("\n").collect{|l| l.split(":")}

  picmi_test = <<-EOF
#Chromosome Name Position Reference Tumor
1 100382265 C G
1 100380997 A G
22 30163533 A C
X 10094215 G A
X 10085674 C T
20 50071099 G T
21 19638426 G T
2 230633386 C T
2 230312220 C T
1 100624830 T A
4 30723053 G T
  EOF

  # Build 37
  picmi_test = <<-EOF
#Chromosome Name Position Reference Tumor
1 100624830 T A
21 19638426 G T
  EOF

  exon_juncture_test = <<-EOF
#Position Mutation
7:150753996 T
  EOF


  # The task is registered as :genomic_mutations_in_exon_junctions; the
  # original asked for the non-existent :genomic_mutations_in_exon_junctures.
  job = Organism.job :genomic_mutations_in_exon_junctions, "Test1", TSV.new(StringIO.new(exon_juncture_test), :list, :sep => " "), :organism => "Hsa"
  job.run
  job.clean if job.error?
  puts job.messages
  puts job.read

  # # Build 36
  # picmi_test = <<-EOF
  ##Chromosome Name Position Reference Tumor
  #3 81780820 T C
  #2 43881517 A T
  #2 43857514 T C
  #6 88375602 G A
  #16 69875502 G T
  #16 69876078 T C
  #16 69877147 G A
  #17 8101874 C T
  # EOF


  Log.severity = 2
  org = 'Hsa/may2009'
  file = File.join(ENV["HOME"], 'git/rbbt-util/integration_test/data/Metastasis.tsv')

  #positions = TSV.new(StringIO.new(picmi_test), :list, :sep => /\s+/, :fix => Proc.new{|l| l.sub(/\s+/,':')})
  positions = TSV.new(file, :list, :fix => Proc.new{|l| l.sub(/\t/,':')})
  positions.key_field = "Position"
  positions.fields = %w(Reference Control Tumor)
  #positions.fields = %w(Reference Tumor)

  #puts positions.slice(["Reference", "Tumor"]).to_s.split(/\n/).collect{|line| next if line =~ /#/; parts = line.split(/\t|:/); parts[3] = Misc.IUPAC_to_base(parts[3]).first; parts * ","}.compact * "\n"


  #positions = positions.select ["10:98099540"]

  Organism.basedir = Rbbt.tmp.organism.sequence.jobs.find :user
  job = Organism.job :genomic_mutations_to_protein_mutations, "Metastasis", org, positions.slice("Tumor")
  job.run

  # Poll until the job finishes, printing the current step
  while not job.done?
    puts job.step
    sleep 2
  end

  raise job.messages.last if job.error?
  mutations = job.load

end
|