rbbt-sources 0.2.2 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/sources/COSTART.rb +2 -3
- data/lib/rbbt/sources/CTCAE.rb +1 -1
- data/lib/rbbt/sources/biomart.rb +32 -32
- data/lib/rbbt/sources/entrez.rb +14 -10
- data/lib/rbbt/sources/go.rb +9 -8
- data/lib/rbbt/sources/organism.rb +36 -10
- data/lib/rbbt/sources/organism/sequence.rb +337 -0
- data/lib/rbbt/sources/polysearch.rb +5 -5
- data/share/install/Organism/Hsa/Rakefile +7 -68
- data/share/install/Organism/Sce/Rakefile +4 -70
- data/share/install/Organism/organism_helpers.rb +305 -0
- data/share/install/lib/helpers.rb +5 -5
- data/test/rbbt/sources/test_biomart.rb +7 -6
- data/test/rbbt/sources/test_entrez.rb +3 -3
- data/test/rbbt/sources/test_organism.rb +32 -3
- data/test/rbbt/sources/test_pubmed.rb +1 -1
- metadata +7 -6
- data/lib/rbbt/sources/Reactome.rb +0 -16
data/lib/rbbt/sources/COSTART.rb
CHANGED
@@ -2,8 +2,7 @@ require 'rbbt-util'
|
|
2
2
|
|
3
3
|
module COSTART
|
4
4
|
|
5
|
-
Rbbt.
|
6
|
-
Proc.new do
|
5
|
+
Rbbt.share.databases.COSTART.COSTART.define_as_proc do
|
7
6
|
terms = ["#COSTART Terms"]
|
8
7
|
Open.open('http://hedwig.mgh.harvard.edu/biostatistics/files/costart.html').lines.each do |line|
|
9
8
|
puts line
|
@@ -12,5 +11,5 @@ module COSTART
|
|
12
11
|
end
|
13
12
|
|
14
13
|
terms * "\n"
|
15
|
-
|
14
|
+
end
|
16
15
|
end
|
data/lib/rbbt/sources/CTCAE.rb
CHANGED
@@ -2,5 +2,5 @@ require 'rbbt-util'
|
|
2
2
|
require 'rbbt/util/excel2tsv'
|
3
3
|
|
4
4
|
module CTCAE
|
5
|
-
|
5
|
+
Rbbt.share.CTCAE.CTCAE.define_as_url TSV.excel2tsv('http://evs.nci.nih.gov/ftp1/CTCAE/CTCAE_4.03_2010-06-14.xls')
|
6
6
|
end
|
data/lib/rbbt/sources/biomart.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
|
-
require 'rbbt
|
1
|
+
require 'rbbt/util/tsv'
|
2
2
|
require 'rbbt/util/log'
|
3
|
+
require 'cgi'
|
3
4
|
|
4
5
|
# This module interacts with BioMart. It performs queries to BioMart and
|
5
6
|
# synthesises a hash with the results. Note that this module connects to the
|
@@ -27,21 +28,23 @@ module BioMart
|
|
27
28
|
EOT
|
28
29
|
|
29
30
|
def self.set_archive(date)
|
30
|
-
@archive_url = BIOMART_URL.sub(/
|
31
|
+
@archive_url = BIOMART_URL.sub(/http:\/\/biomart\./, 'http://' + date + '.archive.ensembl.')
|
32
|
+
Log.debug "Using Archive URL #{ @archive_url }"
|
31
33
|
end
|
32
34
|
|
33
35
|
def self.unset_archive
|
36
|
+
Log.debug "Restoring current version URL #{BIOMART_URL}"
|
34
37
|
@archive_url = nil
|
35
38
|
end
|
36
39
|
|
37
40
|
def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
41
|
+
repeats = true
|
38
42
|
attrs ||= []
|
39
43
|
filters ||= ["with_#{main}"]
|
40
|
-
|
41
|
-
|
44
|
+
|
42
45
|
query = @@biomart_query_xml.dup
|
43
46
|
query.sub!(/<!--DATABASE-->/,database)
|
44
|
-
query.sub!(/<!--FILTERS-->/, filters.collect{|name| "<Filter name = \"#{ name }\" excluded = \"0\"/>"}.join("\n") )
|
47
|
+
query.sub!(/<!--FILTERS-->/, filters.collect{|name, v| v.nil? ? "<Filter name = \"#{ name }\" excluded = \"0\"/>" : "<Filter name = \"#{ name }\" value = \"#{Array === v ? v * "," : v}\"/>" }.join("\n") )
|
45
48
|
query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
|
46
49
|
query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
|
47
50
|
|
@@ -55,23 +58,18 @@ module BioMart
|
|
55
58
|
raise BioMart::QueryError, response
|
56
59
|
end
|
57
60
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
else
|
71
|
-
data[main][name] << value unless data[main][name].include? value
|
72
|
-
end
|
73
|
-
}
|
74
|
-
}
|
61
|
+
result_file = TmpFile.tmp_file
|
62
|
+
Open.write(result_file, response)
|
63
|
+
|
64
|
+
if data.nil?
|
65
|
+
data = result_file
|
66
|
+
else
|
67
|
+
new_datafile = TmpFile.tmp_file
|
68
|
+
TSV.paste_merge data, result_file, new_datafile
|
69
|
+
FileUtils.rm data
|
70
|
+
data = new_datafile
|
71
|
+
FileUtils.rm result_file
|
72
|
+
end
|
75
73
|
|
76
74
|
data
|
77
75
|
end
|
@@ -95,8 +93,9 @@ module BioMart
|
|
95
93
|
def self.query(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
96
94
|
open_options = Misc.add_defaults open_options, :nocache => false
|
97
95
|
attrs ||= []
|
98
|
-
|
99
|
-
|
96
|
+
|
97
|
+
open_options = Misc.add_defaults open_options, :keep_empty => false, :merge => true
|
98
|
+
|
100
99
|
Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"
|
101
100
|
|
102
101
|
max_items = 2
|
@@ -115,21 +114,22 @@ module BioMart
|
|
115
114
|
|
116
115
|
Log.low "Chunks: #{chunks.length}"
|
117
116
|
chunks.each_with_index{|chunk,i|
|
118
|
-
Log.low "Chunk #{ i }: [#{chunk * ", "}]"
|
117
|
+
Log.low "Chunk #{ i + 1 } / #{chunks.length}: [#{chunk * ", "}]"
|
119
118
|
data = get(database, main, chunk, filters, data, open_options)
|
120
119
|
}
|
121
120
|
|
122
|
-
data
|
121
|
+
result = TSV.new(data, open_options)
|
122
|
+
result.key_field = main
|
123
|
+
result.fields = attrs
|
124
|
+
result.filename = "BioMart: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}"
|
125
|
+
|
126
|
+
FileUtils.rm data
|
127
|
+
result
|
123
128
|
end
|
124
129
|
|
125
130
|
def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
126
131
|
codes = attrs.collect{|attr| attr[1]}
|
127
|
-
|
128
|
-
tsv = TSV.new({})
|
129
|
-
|
130
|
-
data.each do |key, info|
|
131
|
-
tsv[key] = info.values_at(*codes)
|
132
|
-
end
|
132
|
+
tsv = query(database, main.last, codes, filters, data, open_options)
|
133
133
|
|
134
134
|
tsv.key_field = main.first
|
135
135
|
tsv.fields = attrs.collect{|attr| attr.first}
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -5,29 +5,29 @@ require 'set'
|
|
5
5
|
|
6
6
|
module Entrez
|
7
7
|
|
8
|
-
Rbbt.
|
9
|
-
Rbbt.
|
8
|
+
Rbbt.share.databases.entrez.gene_info.define_as_url 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
|
9
|
+
Rbbt.share.databases.entrez.gene2pubmed.define_as_url 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
|
10
10
|
|
11
11
|
def self.entrez2native(taxs, options = {})
|
12
|
-
options = Misc.add_defaults options, :key => 1, :
|
12
|
+
options = Misc.add_defaults options, :key => 1, :fields => 5, :persistence => true, :merge => true
|
13
13
|
|
14
14
|
taxs = [taxs] unless Array === taxs
|
15
|
-
options.merge! :grep => taxs
|
16
|
-
|
17
|
-
tsv =
|
15
|
+
options.merge! :grep => taxs.collect{|t| "^#{ t }\t"}
|
16
|
+
|
17
|
+
tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
|
18
18
|
tsv.key_field = "Entrez Gene ID"
|
19
19
|
tsv.fields = ["Native ID"]
|
20
20
|
tsv
|
21
21
|
end
|
22
22
|
|
23
23
|
def self.entrez2pubmed(taxs)
|
24
|
-
options = {:key => 1, :
|
24
|
+
options = {:key => 1, :fields => 2, :persistence => true, :merge => true}
|
25
25
|
|
26
26
|
taxs = [taxs] unless taxs.is_a?(Array)
|
27
27
|
taxs = taxs.collect{|t| t.to_s}
|
28
|
-
options.merge! :grep => taxs
|
28
|
+
options.merge! :grep => taxs.collect{|t| "^#{ t }\t"}
|
29
29
|
|
30
|
-
|
30
|
+
Rbbt.share.databases.entrez.gene2pubmed.tsv :flat, options
|
31
31
|
end
|
32
32
|
|
33
33
|
class Gene
|
@@ -132,7 +132,11 @@ module Entrez
|
|
132
132
|
when Entrez::Gene === gene
|
133
133
|
gene_text = gene.text
|
134
134
|
when String === gene || Fixnum === gene
|
135
|
-
|
135
|
+
begin
|
136
|
+
gene_text = get_gene(gene).text
|
137
|
+
rescue CMD::CMDError
|
138
|
+
return 0
|
139
|
+
end
|
136
140
|
else
|
137
141
|
return 0
|
138
142
|
end
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -4,19 +4,20 @@ require 'rbbt-util'
|
|
4
4
|
# now all it does is provide a translation form id to the actual names.
|
5
5
|
module GO
|
6
6
|
|
7
|
-
Rbbt.
|
8
|
-
Rbbt.
|
7
|
+
Rbbt.share.databases.GO.gene_ontology.define_as_url 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo'
|
8
|
+
Rbbt.share.databases.GO.gslim_generic.define_as_url 'http://www.geneontology.org/GO_slims/goslim_generic.obo'
|
9
9
|
|
10
10
|
MULTIPLE_VALUE_FIELDS = %w(is_a)
|
11
|
-
TSV_GENE_ONTOLOGY = File.join(
|
11
|
+
TSV_GENE_ONTOLOGY = File.join(Persistence.cachedir, 'gene_ontology')
|
12
12
|
|
13
13
|
# This method needs to be called before any translations can be made, it is
|
14
14
|
# called automatically the first time the id2name method is called. It loads
|
15
15
|
# the gene_ontology.obo file and extracts all the fields, although right now,
|
16
16
|
# only the name field is used.
|
17
17
|
def self.init
|
18
|
-
|
19
|
-
|
18
|
+
init = Persistence.persist_tsv('gene_ontology', :Misc) do
|
19
|
+
info = {}
|
20
|
+
Rbbt.share.databases.GO.gene_ontology.read.split(/\[Term\]/).each{|term|
|
20
21
|
term_info = {}
|
21
22
|
|
22
23
|
term.split(/\n/). select{|l| l =~ /:/}.each{|l|
|
@@ -32,12 +33,12 @@ module GO
|
|
32
33
|
next if term_info["id"].nil?
|
33
34
|
info[term_info["id"]] = term_info
|
34
35
|
}
|
35
|
-
|
36
|
+
info
|
37
|
+
end
|
36
38
|
end
|
37
39
|
|
38
40
|
def self.info
|
39
|
-
self.init
|
40
|
-
TCHash.get(TSV_GENE_ONTOLOGY)
|
41
|
+
self.init
|
41
42
|
end
|
42
43
|
|
43
44
|
def self.goterms
|
@@ -1,19 +1,39 @@
|
|
1
1
|
require 'rbbt-util'
|
2
|
-
require 'rbbt/util/
|
2
|
+
require 'rbbt/util/resource'
|
3
3
|
|
4
4
|
|
5
5
|
module Organism
|
6
|
+
extend Resource
|
7
|
+
relative_to Rbbt, "share/organisms"
|
8
|
+
|
6
9
|
class OrganismNotProcessedError < StandardError; end
|
7
10
|
|
8
11
|
def self.datadir(org)
|
9
12
|
File.join(Rbbt.datadir, 'organisms', org)
|
10
13
|
end
|
11
14
|
|
12
|
-
def self.
|
15
|
+
# Attaches identifier translations for an organism to a TSV.
#
# org::     organism code handed to +identifiers+
# tsv::     table that receives the extra columns through +tsv.attach+
# target::  when not nil, used as the :key option of the identifier table
# fields::  when not nil, used as the :fields option of the identifier table
# options:: extra options; :persistence => true and
#           :case_insensitive => false are filled in through
#           Misc.add_defaults
#
# Returns whatever +tsv.attach+ returns.
def self.attach_translations(org, tsv, target = nil, fields = nil, options = {})
  Log.high "Attaching Translations for #{ org.inspect }, target #{target.inspect}, fields #{fields.inspect}"

  options = Misc.add_defaults options, :persistence => true, :case_insensitive => false
  options[:key] = target unless target.nil?
  options[:fields] = fields unless fields.nil?

  translations = identifiers(org).tsv options
  tsv.attach translations, [:key]
end
|
26
|
+
|
27
|
+
def self.normalize(org, list, target = nil, fields = nil, options = {})
|
13
28
|
return [] if list.nil? or list.empty?
|
14
29
|
options = Misc.add_defaults options, :persistence => true, :case_insensitive => true, :double => false
|
15
30
|
double = Misc.process_options options, :double
|
16
|
-
|
31
|
+
|
32
|
+
options.merge! :target => target unless target.nil?
|
33
|
+
options.merge! :fields => fields unless fields.nil?
|
34
|
+
|
35
|
+
index = identifiers(org).index options
|
36
|
+
|
17
37
|
if Array === list
|
18
38
|
if double
|
19
39
|
index.values_at *list
|
@@ -36,11 +56,11 @@ module Organism
|
|
36
56
|
end
|
37
57
|
|
38
58
|
def self.organisms
|
39
|
-
Dir.glob(File.join(
|
59
|
+
Dir.glob(File.join(Rbbt.share.organisms.find, '*')).collect{|f| File.basename(f)}
|
40
60
|
end
|
41
61
|
|
42
62
|
def self.name(organism)
|
43
|
-
|
63
|
+
Organism.scientific_name(organism).read.strip
|
44
64
|
end
|
45
65
|
|
46
66
|
def self.organism(name)
|
@@ -48,9 +68,15 @@ module Organism
|
|
48
68
|
organism == name or Organism.name(organism) =~ /#{ name }/i
|
49
69
|
}.first
|
50
70
|
end
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
71
|
+
|
72
|
+
["Hsa", "Sce"].each do |organism|
|
73
|
+
rakefile = Rbbt["share/install/Organism/#{ organism }/Rakefile"]
|
74
|
+
rakefile.lib_dir = Resource.caller_lib_dir __FILE__
|
75
|
+
rakefile.pkgdir = 'phgx'
|
76
|
+
Organism[organism].define_as_rake rakefile
|
77
|
+
module_eval "#{ organism } = with_key '#{organism}'"
|
78
|
+
end
|
79
|
+
|
56
80
|
end
|
81
|
+
|
82
|
+
|
@@ -0,0 +1,337 @@
|
|
1
|
+
require 'rbbt/sources/organism'
|
2
|
+
require 'rbbt/util/workflow'
|
3
|
+
require 'bio'
|
4
|
+
# Sequence analyses
|
5
|
+
module Organism
|
6
|
+
extend WorkFlow
|
7
|
+
|
8
|
+
# Lists the transcripts that contain +exon+ and have a protein id.
#
# org::              organism code used to locate the Organism resources
# exon::             exon id used as key into +exon_transcripts+
# exon_transcripts:: optional preloaded table mapping an exon id to
#                    [[transcript ids]]; loaded from
#                    Organism.transcript_exons(org) when nil
# transcript_info::  optional preloaded table mapping a transcript id to
#                    its fields; loaded from Organism.transcripts(org)
#                    when nil
#
# Returns the transcripts whose "Ensembl Protein ID" entry answers true to
# +any?+, or [] when the exon is not present in +exon_transcripts+.
def self.coding_transcripts_for_exon(org, exon, exon_transcripts = nil, transcript_info = nil)
  exon_transcripts ||= Organism.transcript_exons(org).tsv(:double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
  # Resources are addressed as Organism.<resource>(org) throughout this file
  transcript_info ||= Organism.transcripts(org).tsv(:list, :persistence => true )

  entry = exon_transcripts[exon]
  return [] if entry.nil?

  # NOTE(review): +any?+ assumes the protein id value is enumerable - confirm
  # against the :list table type used when the table is loaded here.
  entry.first.select { |transcript| transcript_info[transcript]["Ensembl Protein ID"].any? }
end
|
15
|
+
|
16
|
+
# Locates the codon that covers a given offset of a transcript.
#
# org::                 organism code used to load the tables when they
#                       are not supplied
# transcript::          transcript id used as key in both tables
# offset::              0-based offset into the transcript sequence
# transcript_sequence:: optional table transcript id => sequence String
# transcript_5utr::     optional table transcript id => 5' UTR length
#
# Returns [codon, offset_in_codon, codon_index], where codon_index counts
# codons from the end of the 5' UTR. Returns nil when the offset falls
# inside the 5' UTR or past the end of the sequence.
#
# Raises RuntimeError when the UTR length or the sequence is missing.
def self.codon_at_transcript_position(org, transcript, offset, transcript_sequence = nil, transcript_5utr = nil)
  transcript_sequence ||= Organism.transcript_sequence(org).tsv(:single, :persistence => true)
  transcript_5utr ||= Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')

  utr5 = transcript_5utr[transcript]
  raise "UTR5 for transcript #{ transcript } was missing" if utr5.nil?
  return nil if utr5 > offset

  sequence = transcript_sequence[transcript]
  raise "Sequence for transcript #{ transcript } was missing" if sequence.nil?

  # Drop the 5' UTR first so the bounds check is made against the part of
  # the sequence that is actually indexed below.
  coding = sequence[utr5..-1]
  ccds_offset = offset - utr5
  return nil if coding.nil? or ccds_offset >= coding.length

  codon = ccds_offset / 3
  codon_offset = ccds_offset % 3

  [coding[(codon * 3)..((codon + 1) * 3 - 1)], codon_offset, codon]
end
|
40
|
+
|
41
|
+
# Translates a codon before and after substituting one of its bases.
#
# allele:: replacement base written at +offset+
# codon::  codon String; a copy is modified, the argument is left untouched
# offset:: index inside the codon that receives +allele+
#
# Returns [original_translation, mutated_translation], both produced by
# Bio::Sequence::NA#translate.
def self.codon_change(allele, codon, offset)
  reference = Bio::Sequence::NA.new(codon).translate

  mutated_codon = codon.dup
  mutated_codon[offset] = allele
  mutated = Bio::Sequence::NA.new(mutated_codon).translate

  [reference, mutated]
end
|
48
|
+
|
49
|
+
# Looks up the gene found at one or several positions of a chromosome.
#
# org::        organism code handed to Organism.gene_positions
# chromosome:: chromosome name; converted with +to_s+ before use
# positions::  a single position or an Array of positions
#
# The lookup structure comes from Persistence.persist; its block yields
# [gene, [start, end]] pairs for the rows whose "Chromosome Name" equals
# the requested chromosome.
#
# Returns the first element of each lookup result, or nil when the lookup
# returns nil. An Array of positions produces an Array of results.
def self.genes_at_chromosome_positions(org, chromosome, positions)
  chromosome = chromosome.to_s

  chromosome_bed = Persistence.persist(Organism.gene_positions(org), "Gene_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => true) do |file, options|
    gene_table = file.tsv(:persistence => false, :type => :list)
    gene_table.select("Chromosome Name" => chromosome).collect do |gene, values|
      range = values.values_at("Gene Start", "Gene End").collect { |coordinate| coordinate.to_i }
      [gene, range]
    end
  end

  first_hit = lambda do |position|
    hits = chromosome_bed[position]
    hits.nil? ? nil : hits.first
  end

  case positions
  when Array
    positions.collect { |position| first_hit.call(position) }
  else
    first_hit.call(positions)
  end
end
|
65
|
+
|
66
|
+
# Finds the gene at each of a list of genomic positions.
#
# org::       organism code forwarded to genes_at_chromosome_positions
# positions:: Array of [chromosome, position] pairs, or a single such pair
#
# Positions are grouped by chromosome so that each chromosome is queried
# once; results are written back at the index of the originating position.
#
# Returns an Array aligned with +positions+, or [] for nil or empty input.
def self.genes_at_genomic_positions(org, positions)
  # Without this guard an empty list is wrapped into [[]] and a lookup is
  # attempted for chromosome nil.
  return [] if positions.nil? or positions.empty?

  positions = [positions] unless Array === positions.first

  genes = []
  chromosomes = {}
  indices = {}
  positions.each_with_index do |info, i|
    chr, pos = info
    (chromosomes[chr] ||= []) << pos
    (indices[chr] ||= []) << i
  end

  chromosomes.each do |chr, pos_list|
    chr_genes = genes_at_chromosome_positions(org, chr, pos_list)
    chr_genes.zip(indices[chr]).each { |gene, index| genes[index] = gene }
  end

  genes
end
|
86
|
+
|
87
|
+
# Looks up the exons found at one or several positions of a chromosome.
#
# org::        organism code handed to Organism.exons
# chromosome:: chromosome name; converted with +to_s+ before use
# positions::  a single position or an Array of positions
#
# The lookup structure comes from Persistence.persist; its block yields
# [exon, [start, end]] pairs for the rows whose "Chromosome Name" equals
# the requested chromosome.
#
# Returns the raw lookup result for each position. An Array of positions
# produces an Array of results.
def self.exons_at_chromosome_positions(org, chromosome, positions)
  chromosome = chromosome.to_s

  chromosome_bed = Persistence.persist(Organism.exons(org), "Exon_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => true) do |file, options|
    exon_table = file.tsv(:persistence => true, :type => :list)
    exon_table.select("Chromosome Name" => chromosome).collect do |exon, values|
      range = values.values_at("Exon Chr Start", "Exon Chr End").collect { |coordinate| coordinate.to_i }
      [exon, range]
    end
  end

  case positions
  when Array
    positions.collect { |position| chromosome_bed[position] }
  else
    chromosome_bed[positions]
  end
end
|
104
|
+
|
105
|
+
|
106
|
+
# Finds the exons at each of a list of genomic positions.
#
# org::       organism code forwarded to exons_at_chromosome_positions
# positions:: Array of [chromosome, position] pairs, or a single such pair
#
# Positions are grouped by chromosome so that each chromosome is queried
# once; results are written back at the index of the originating position.
#
# Returns an Array aligned with +positions+, or [] for nil or empty input.
def self.exons_at_genomic_positions(org, positions)
  # Without this guard an empty list is wrapped into [[]] and a lookup is
  # attempted for chromosome nil.
  return [] if positions.nil? or positions.empty?

  positions = [positions] unless Array === positions.first

  exons = []
  chromosomes = {}
  indices = {}
  positions.each_with_index do |info, i|
    chr, pos = info
    (chromosomes[chr] ||= []) << pos
    (indices[chr] ||= []) << i
  end

  chromosomes.each do |chr, pos_list|
    chr_exons = exons_at_chromosome_positions(org, chr, pos_list)
    chr_exons.zip(indices[chr]).each { |exon, index| exons[index] = exon }
  end

  exons
end
|
127
|
+
|
128
|
+
# Computes how far into a transcript a given exon starts.
#
# org::              organism code used to load the tables when they are
#                    not supplied
# exon::             exon id whose offset is wanted
# transcript::       transcript id used as key into +transcript_exons+
# exons::            optional table exon id => fields, read through
#                    "Exon Chr Start" and "Exon Chr End"
# transcript_exons:: optional table transcript id =>
#                    [[exon ids], [exon ranks]]
#
# Returns the summed sizes of the exons ranked before +exon+, or nil when
# the transcript is unknown or does not list the exon.
def self.exon_offset_in_transcript(org, exon, transcript, exons = nil, transcript_exons = nil)
  exons ||= Organism.exons(org).tsv(:persistence => true)
  transcript_exons ||= Organism.transcript_exons(org).tsv(:double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"], :persistence => true)

  entry = transcript_exons[transcript]
  return nil if entry.nil?

  # sizes[rank] holds the size of the exon with that rank; slot 0 stays 0 so
  # that the first exon gets offset 0.
  sizes = [0]
  rank = nil
  entry.zip_fields.each do |_exon, _rank|
    _rank = _rank.to_i
    # Same field names the rest of this file uses for the exons resource
    s, e = exons[_exon].values_at("Exon Chr Start", "Exon Chr End")
    sizes[_rank] = e.to_i - s.to_i + 1
    rank = _rank if _exon == exon
  end

  return nil if rank.nil?

  sizes[0..rank - 1].inject(0) { |sum, size| sum + size }
end
|
148
|
+
|
149
|
+
# Collects, for each exon, its offset inside every transcript that uses it.
#
# org::          organism code used to load +exon_offsets+ when not supplied
# exons::        one exon id or an Array of them; nil entries are ignored
# exon_offsets:: optional table exon id => [[transcript ids], [offsets]]
# exon_info::    accepted for backward compatibility; not used
#
# Returns a Hash exon id => { transcript id => Integer offset }. An exon
# with no entry in +exon_offsets+ maps to an empty Hash.
def self.exon_transcript_offsets(org, exons, exon_offsets = nil, exon_info = nil)
  exon_offsets ||= Organism.exon_offsets(org).tsv(:double, :persistence => true)

  exons = [exons] unless Array === exons
  transcript_offsets = {}
  exons.each do |exon|
    next if exon.nil?
    transcript_offsets[exon] ||= {}

    entry = exon_offsets[exon]
    next if entry.nil?

    entry.zip_fields.each do |transcript, offset|
      next if transcript.nil? or transcript.empty?
      transcript_offsets[exon][transcript] = offset.to_i
    end
  end

  transcript_offsets
end
|
167
|
+
|
168
|
+
# Maps genomic positions to offsets inside the transcripts that cover them.
#
# org::          organism code used for every resource lookup
# positions::    Array of [chromosome, position] pairs
# exon_offsets:: optional table exon id => [[transcript ids], [offsets]]
# exon_start::   optional table exon id => chromosome start
# exon_end::     optional table exon id => chromosome end
# exon_strand::  optional table exon id => strand
#
# Tables that are not supplied are loaded from the Organism resources.
#
# Returns a Hash position => { transcript id => [offset, strand] }.
# Positions where no exon is found are left out. For strand 1 the offset
# inside the exon is counted from the exon start, otherwise from its end.
def self.genomic_position_transcript_offsets(org, positions, exon_offsets = nil, exon_start = nil, exon_end = nil, exon_strand = nil)
  exon_offsets ||= Organism.exon_offsets(org).tsv(:double, :persistence => true)
  exon_start ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
  exon_end ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
  exon_strand ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)

  exons = exons_at_genomic_positions(org, positions)
  # Positions without exons contribute nil entries; drop them before the
  # lookup. No exon_info table is passed because this method has none.
  offsets = Organism.exon_transcript_offsets(org, exons.flatten.compact.uniq, exon_offsets)

  position_offsets = {}
  positions.zip(exons).each do |position, pos_exons|
    next if pos_exons.nil? or pos_exons.empty?
    _chr, pos = position

    pos_exons.each do |exon|
      next unless offsets.include? exon

      strand = exon_strand[exon]
      offset_in_exon = if strand == 1
                         pos.to_i - exon_start[exon].to_i
                       else
                         exon_end[exon].to_i - pos.to_i
                       end

      position_offsets[position] ||= {}
      offsets[exon].each do |transcript, offset|
        next if offset.nil?
        position_offsets[position][transcript] = [offset + offset_in_exon, strand]
      end
    end
  end

  position_offsets
end
|
203
|
+
|
204
|
+
# Workflow task: translates genomic mutations into protein mutations.
#
# Options:
# org::               organism code
# genomic_mutations:: TSV keyed by "chr:position" whose values are alleles
#
# Result: a :double TSV keyed by "Position" with the fields
# "Ensembl Transcript ID" and "Mutation". Each mutation is written as
# <original amino acid><1-based codon number><new amino acid>.
task_option :org, "Organism", :string
task_option :genomic_mutations, "Position (chr:position), Allele", :tsv
task :genomic_mutation_to_protein_mutation => :tsv do |org, genomic_mutations|
  positions = genomic_mutations.keys.collect{|l| l.split(":")}

  step(:prepare, "Prepare Results")
  results = TSV.new({})
  results.key_field = "Position"
  results.fields = ["Ensembl Transcript ID", "Mutation"]
  results.type = :double

  step(:resources, "Load Resources")
  transcript_sequence = Organism.transcript_sequence(org).tsv(:single, :persistence => true)
  transcript_5utr = Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
  exon_offsets = Organism.exon_offsets(org).tsv(:double, :persistence => true)
  exon_start = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
  exon_end = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
  exon_strand = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)

  step(:offsets, "Find transcripts and offsets for mutations")
  offsets = Organism.genomic_position_transcript_offsets(org, positions, exon_offsets, exon_start, exon_end, exon_strand)

  step(:aminoacid, "Translate mutation to amino acid substitutions")
  offsets.each do |position, transcripts|
    # Rebuild the "chr:position" key used by the input table
    pos_code = position * ":"
    alleles = genomic_mutations[pos_code].collect{|allele| Misc.IUPAC_to_base(allele)}.flatten

    transcripts.each do |transcript, offset_info|
      offset, strand = offset_info

      begin
        codon = Organism.codon_at_transcript_position(org, transcript, offset, transcript_sequence, transcript_5utr)
      rescue
        # Best effort: a transcript with missing data is logged and skipped
        Log.medium $!.message
        next
      end

      next if codon.nil?

      alleles.each do |allele|
        # Complement the allele when the stored strand is -1
        allele = Misc::BASE2COMPLEMENT[allele] if strand == -1
        change = Organism.codon_change(allele, *codon.values_at(0,1))
        mutation = [change.first, codon.last + 1, change.last] * ""

        if results.include? pos_code
          results[pos_code] = results[pos_code].merge [transcript, mutation]
        else
          results[pos_code] = [[transcript], [mutation]]
        end
      end
    end
  end

  results
end
|
262
|
+
end
|
263
|
+
|
264
|
+
# Developer harness, executed only when this file is run directly. It loads
# a local mutation table, launches the genomic_mutation_to_protein_mutation
# job for it and polls until the job is done.
if __FILE__ == $0
  require 'rbbt/util/log'
  require 'benchmark'

  select = <<-EOF
3:64581875
EOF
  select = select.split("\n").collect { |line| line.split(":") }

  picmi_test = <<-EOF
#Chromosome Name Position Reference Tumor
1 100382265 C G
1 100380997 A G
22 30163533 A C
X 10094215 G A
X 10085674 C T
20 50071099 G T
21 19638426 G T
2 230633386 C T
2 230312220 C T
1 100624830 T A
4 30723053 G T
EOF

  # Build 37
  picmi_test = <<-EOF
#Chromosome Name Position Reference Tumor
1 100624830 T A
21 19638426 G T
EOF

  # # Build 36
  # picmi_test = <<-EOF
  ##Chromosome Name Position Reference Tumor
  #3 81780820 T C
  #2 43881517 A T
  #2 43857514 T C
  #6 88375602 G A
  #16 69875502 G T
  #16 69876078 T C
  #16 69877147 G A
  #17 8101874 C T
  # EOF

  Log.severity = 2
  org = 'Hsa/may2009'
  file = File.join(ENV["HOME"], 'git/rbbt-util/integration_test/data/Metastasis.tsv')

  #positions = TSV.new(StringIO.new(picmi_test), :list, :sep => /\s+/, :fix => Proc.new{|l| l.sub(/\s+/,':')})
  # Join the first two columns into a "chr:position" key
  positions = TSV.new(file, :list, :fix => Proc.new { |line| line.sub(/\t/,':') })
  positions.key_field = "Position"
  positions.fields = %w(Reference Control Tumor)
  #positions.fields = %w(Reference Tumor)

  #puts positions.slice(["Reference", "Tumor"]).to_s.split(/\n/).collect{|line| next if line =~ /#/; parts = line.split(/\t|:/); parts[3] = Misc.IUPAC_to_base(parts[3]).first; parts * ","}.compact * "\n"

  #positions = positions.select ["10:98099540"]

  Organism.basedir = Rbbt.tmp.organism.sequence.jobs.find :user
  job = Organism.job :genomic_mutation_to_protein_mutation, "Metastasis", org, positions.slice("Tumor")
  job.run

  # Report the current step every two seconds until the job finishes
  until job.done?
    puts job.step
    sleep 2
  end

  raise job.messages.last if job.error?
  mutations = job.load

end
|