rbbt-sources 0.2.2 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/sources/COSTART.rb +2 -3
- data/lib/rbbt/sources/CTCAE.rb +1 -1
- data/lib/rbbt/sources/biomart.rb +32 -32
- data/lib/rbbt/sources/entrez.rb +14 -10
- data/lib/rbbt/sources/go.rb +9 -8
- data/lib/rbbt/sources/organism.rb +36 -10
- data/lib/rbbt/sources/organism/sequence.rb +337 -0
- data/lib/rbbt/sources/polysearch.rb +5 -5
- data/share/install/Organism/Hsa/Rakefile +7 -68
- data/share/install/Organism/Sce/Rakefile +4 -70
- data/share/install/Organism/organism_helpers.rb +305 -0
- data/share/install/lib/helpers.rb +5 -5
- data/test/rbbt/sources/test_biomart.rb +7 -6
- data/test/rbbt/sources/test_entrez.rb +3 -3
- data/test/rbbt/sources/test_organism.rb +32 -3
- data/test/rbbt/sources/test_pubmed.rb +1 -1
- metadata +7 -6
- data/lib/rbbt/sources/Reactome.rb +0 -16
data/lib/rbbt/sources/COSTART.rb
CHANGED
@@ -2,8 +2,7 @@ require 'rbbt-util'
|
|
2
2
|
|
3
3
|
module COSTART
|
4
4
|
|
5
|
-
Rbbt.
|
6
|
-
Proc.new do
|
5
|
+
Rbbt.share.databases.COSTART.COSTART.define_as_proc do
|
7
6
|
terms = ["#COSTART Terms"]
|
8
7
|
Open.open('http://hedwig.mgh.harvard.edu/biostatistics/files/costart.html').lines.each do |line|
|
9
8
|
puts line
|
@@ -12,5 +11,5 @@ module COSTART
|
|
12
11
|
end
|
13
12
|
|
14
13
|
terms * "\n"
|
15
|
-
|
14
|
+
end
|
16
15
|
end
|
data/lib/rbbt/sources/CTCAE.rb
CHANGED
@@ -2,5 +2,5 @@ require 'rbbt-util'
|
|
2
2
|
require 'rbbt/util/excel2tsv'
|
3
3
|
|
4
4
|
module CTCAE
|
5
|
-
|
5
|
+
Rbbt.share.CTCAE.CTCAE.define_as_url TSV.excel2tsv('http://evs.nci.nih.gov/ftp1/CTCAE/CTCAE_4.03_2010-06-14.xls')
|
6
6
|
end
|
data/lib/rbbt/sources/biomart.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
|
-
require 'rbbt
|
1
|
+
require 'rbbt/util/tsv'
|
2
2
|
require 'rbbt/util/log'
|
3
|
+
require 'cgi'
|
3
4
|
|
4
5
|
# This module interacts with BioMart. It performs queries to BioMart and
|
5
6
|
# synthesises a hash with the results. Note that this module connects to the
|
@@ -27,21 +28,23 @@ module BioMart
|
|
27
28
|
EOT
|
28
29
|
|
29
30
|
def self.set_archive(date)
|
30
|
-
@archive_url = BIOMART_URL.sub(/
|
31
|
+
@archive_url = BIOMART_URL.sub(/http:\/\/biomart\./, 'http://' + date + '.archive.ensembl.')
|
32
|
+
Log.debug "Using Archive URL #{ @archive_url }"
|
31
33
|
end
|
32
34
|
|
33
35
|
def self.unset_archive
|
36
|
+
Log.debug "Restoring current version URL #{BIOMART_URL}"
|
34
37
|
@archive_url = nil
|
35
38
|
end
|
36
39
|
|
37
40
|
def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
41
|
+
repeats = true
|
38
42
|
attrs ||= []
|
39
43
|
filters ||= ["with_#{main}"]
|
40
|
-
|
41
|
-
|
44
|
+
|
42
45
|
query = @@biomart_query_xml.dup
|
43
46
|
query.sub!(/<!--DATABASE-->/,database)
|
44
|
-
query.sub!(/<!--FILTERS-->/, filters.collect{|name| "<Filter name = \"#{ name }\" excluded = \"0\"/>"}.join("\n") )
|
47
|
+
query.sub!(/<!--FILTERS-->/, filters.collect{|name, v| v.nil? ? "<Filter name = \"#{ name }\" excluded = \"0\"/>" : "<Filter name = \"#{ name }\" value = \"#{Array === v ? v * "," : v}\"/>" }.join("\n") )
|
45
48
|
query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
|
46
49
|
query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
|
47
50
|
|
@@ -55,23 +58,18 @@ module BioMart
|
|
55
58
|
raise BioMart::QueryError, response
|
56
59
|
end
|
57
60
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
else
|
71
|
-
data[main][name] << value unless data[main][name].include? value
|
72
|
-
end
|
73
|
-
}
|
74
|
-
}
|
61
|
+
result_file = TmpFile.tmp_file
|
62
|
+
Open.write(result_file, response)
|
63
|
+
|
64
|
+
if data.nil?
|
65
|
+
data = result_file
|
66
|
+
else
|
67
|
+
new_datafile = TmpFile.tmp_file
|
68
|
+
TSV.paste_merge data, result_file, new_datafile
|
69
|
+
FileUtils.rm data
|
70
|
+
data = new_datafile
|
71
|
+
FileUtils.rm result_file
|
72
|
+
end
|
75
73
|
|
76
74
|
data
|
77
75
|
end
|
@@ -95,8 +93,9 @@ module BioMart
|
|
95
93
|
def self.query(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
96
94
|
open_options = Misc.add_defaults open_options, :nocache => false
|
97
95
|
attrs ||= []
|
98
|
-
|
99
|
-
|
96
|
+
|
97
|
+
open_options = Misc.add_defaults open_options, :keep_empty => false, :merge => true
|
98
|
+
|
100
99
|
Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"
|
101
100
|
|
102
101
|
max_items = 2
|
@@ -115,21 +114,22 @@ module BioMart
|
|
115
114
|
|
116
115
|
Log.low "Chunks: #{chunks.length}"
|
117
116
|
chunks.each_with_index{|chunk,i|
|
118
|
-
Log.low "Chunk #{ i }: [#{chunk * ", "}]"
|
117
|
+
Log.low "Chunk #{ i + 1 } / #{chunks.length}: [#{chunk * ", "}]"
|
119
118
|
data = get(database, main, chunk, filters, data, open_options)
|
120
119
|
}
|
121
120
|
|
122
|
-
data
|
121
|
+
result = TSV.new(data, open_options)
|
122
|
+
result.key_field = main
|
123
|
+
result.fields = attrs
|
124
|
+
result.filename = "BioMart: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}"
|
125
|
+
|
126
|
+
FileUtils.rm data
|
127
|
+
result
|
123
128
|
end
|
124
129
|
|
125
130
|
def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
126
131
|
codes = attrs.collect{|attr| attr[1]}
|
127
|
-
|
128
|
-
tsv = TSV.new({})
|
129
|
-
|
130
|
-
data.each do |key, info|
|
131
|
-
tsv[key] = info.values_at(*codes)
|
132
|
-
end
|
132
|
+
tsv = query(database, main.last, codes, filters, data, open_options)
|
133
133
|
|
134
134
|
tsv.key_field = main.first
|
135
135
|
tsv.fields = attrs.collect{|attr| attr.first}
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -5,29 +5,29 @@ require 'set'
|
|
5
5
|
|
6
6
|
module Entrez
|
7
7
|
|
8
|
-
Rbbt.
|
9
|
-
Rbbt.
|
8
|
+
Rbbt.share.databases.entrez.gene_info.define_as_url 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
|
9
|
+
Rbbt.share.databases.entrez.gene2pubmed.define_as_url 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
|
10
10
|
|
11
11
|
def self.entrez2native(taxs, options = {})
|
12
|
-
options = Misc.add_defaults options, :key => 1, :
|
12
|
+
options = Misc.add_defaults options, :key => 1, :fields => 5, :persistence => true, :merge => true
|
13
13
|
|
14
14
|
taxs = [taxs] unless Array === taxs
|
15
|
-
options.merge! :grep => taxs
|
16
|
-
|
17
|
-
tsv =
|
15
|
+
options.merge! :grep => taxs.collect{|t| "^#{ t }\t"}
|
16
|
+
|
17
|
+
tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
|
18
18
|
tsv.key_field = "Entrez Gene ID"
|
19
19
|
tsv.fields = ["Native ID"]
|
20
20
|
tsv
|
21
21
|
end
|
22
22
|
|
23
23
|
def self.entrez2pubmed(taxs)
|
24
|
-
options = {:key => 1, :
|
24
|
+
options = {:key => 1, :fields => 2, :persistence => true, :merge => true}
|
25
25
|
|
26
26
|
taxs = [taxs] unless taxs.is_a?(Array)
|
27
27
|
taxs = taxs.collect{|t| t.to_s}
|
28
|
-
options.merge! :grep => taxs
|
28
|
+
options.merge! :grep => taxs.collect{|t| "^#{ t }\t"}
|
29
29
|
|
30
|
-
|
30
|
+
Rbbt.share.databases.entrez.gene2pubmed.tsv :flat, options
|
31
31
|
end
|
32
32
|
|
33
33
|
class Gene
|
@@ -132,7 +132,11 @@ module Entrez
|
|
132
132
|
when Entrez::Gene === gene
|
133
133
|
gene_text = gene.text
|
134
134
|
when String === gene || Fixnum === gene
|
135
|
-
|
135
|
+
begin
|
136
|
+
gene_text = get_gene(gene).text
|
137
|
+
rescue CMD::CMDError
|
138
|
+
return 0
|
139
|
+
end
|
136
140
|
else
|
137
141
|
return 0
|
138
142
|
end
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -4,19 +4,20 @@ require 'rbbt-util'
|
|
4
4
|
# now all it does is provide a translation from id to the actual names.
|
5
5
|
module GO
|
6
6
|
|
7
|
-
Rbbt.
|
8
|
-
Rbbt.
|
7
|
+
Rbbt.share.databases.GO.gene_ontology.define_as_url 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo'
|
8
|
+
Rbbt.share.databases.GO.gslim_generic.define_as_url 'http://www.geneontology.org/GO_slims/goslim_generic.obo'
|
9
9
|
|
10
10
|
MULTIPLE_VALUE_FIELDS = %w(is_a)
|
11
|
-
TSV_GENE_ONTOLOGY = File.join(
|
11
|
+
TSV_GENE_ONTOLOGY = File.join(Persistence.cachedir, 'gene_ontology')
|
12
12
|
|
13
13
|
# This method needs to be called before any translations can be made, it is
|
14
14
|
# called automatically the first time the id2name method is called. It loads
|
15
15
|
# the gene_ontology.obo file and extracts all the fields, although right now,
|
16
16
|
# only the name field is used.
|
17
17
|
def self.init
|
18
|
-
|
19
|
-
|
18
|
+
init = Persistence.persist_tsv('gene_ontology', :Misc) do
|
19
|
+
info = {}
|
20
|
+
Rbbt.share.databases.GO.gene_ontology.read.split(/\[Term\]/).each{|term|
|
20
21
|
term_info = {}
|
21
22
|
|
22
23
|
term.split(/\n/). select{|l| l =~ /:/}.each{|l|
|
@@ -32,12 +33,12 @@ module GO
|
|
32
33
|
next if term_info["id"].nil?
|
33
34
|
info[term_info["id"]] = term_info
|
34
35
|
}
|
35
|
-
|
36
|
+
info
|
37
|
+
end
|
36
38
|
end
|
37
39
|
|
38
40
|
def self.info
|
39
|
-
self.init
|
40
|
-
TCHash.get(TSV_GENE_ONTOLOGY)
|
41
|
+
self.init
|
41
42
|
end
|
42
43
|
|
43
44
|
def self.goterms
|
@@ -1,19 +1,39 @@
|
|
1
1
|
require 'rbbt-util'
|
2
|
-
require 'rbbt/util/
|
2
|
+
require 'rbbt/util/resource'
|
3
3
|
|
4
4
|
|
5
5
|
module Organism
|
6
|
+
extend Resource
|
7
|
+
relative_to Rbbt, "share/organisms"
|
8
|
+
|
6
9
|
class OrganismNotProcessedError < StandardError; end
|
7
10
|
|
8
11
|
def self.datadir(org)
|
9
12
|
File.join(Rbbt.datadir, 'organisms', org)
|
10
13
|
end
|
11
14
|
|
12
|
-
def self.
|
15
|
+
def self.attach_translations(org, tsv, target = nil, fields = nil, options = {})
|
16
|
+
Log.high "Attaching Translations for #{ org.inspect }, target #{target.inspect}, fields #{fields.inspect}"
|
17
|
+
options = Misc.add_defaults options, :persistence => true, :case_insensitive => false
|
18
|
+
|
19
|
+
options.merge! :key => target unless target.nil?
|
20
|
+
options.merge! :fields => fields unless fields.nil?
|
21
|
+
|
22
|
+
index = identifiers(org).tsv options
|
23
|
+
|
24
|
+
tsv.attach index, [:key]
|
25
|
+
end
|
26
|
+
|
27
|
+
def self.normalize(org, list, target = nil, fields = nil, options = {})
|
13
28
|
return [] if list.nil? or list.empty?
|
14
29
|
options = Misc.add_defaults options, :persistence => true, :case_insensitive => true, :double => false
|
15
30
|
double = Misc.process_options options, :double
|
16
|
-
|
31
|
+
|
32
|
+
options.merge! :target => target unless target.nil?
|
33
|
+
options.merge! :fields => fields unless fields.nil?
|
34
|
+
|
35
|
+
index = identifiers(org).index options
|
36
|
+
|
17
37
|
if Array === list
|
18
38
|
if double
|
19
39
|
index.values_at *list
|
@@ -36,11 +56,11 @@ module Organism
|
|
36
56
|
end
|
37
57
|
|
38
58
|
def self.organisms
|
39
|
-
Dir.glob(File.join(
|
59
|
+
Dir.glob(File.join(Rbbt.share.organisms.find, '*')).collect{|f| File.basename(f)}
|
40
60
|
end
|
41
61
|
|
42
62
|
def self.name(organism)
|
43
|
-
|
63
|
+
Organism.scientific_name(organism).read.strip
|
44
64
|
end
|
45
65
|
|
46
66
|
def self.organism(name)
|
@@ -48,9 +68,15 @@ module Organism
|
|
48
68
|
organism == name or Organism.name(organism) =~ /#{ name }/i
|
49
69
|
}.first
|
50
70
|
end
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
71
|
+
|
72
|
+
["Hsa", "Sce"].each do |organism|
|
73
|
+
rakefile = Rbbt["share/install/Organism/#{ organism }/Rakefile"]
|
74
|
+
rakefile.lib_dir = Resource.caller_lib_dir __FILE__
|
75
|
+
rakefile.pkgdir = 'phgx'
|
76
|
+
Organism[organism].define_as_rake rakefile
|
77
|
+
module_eval "#{ organism } = with_key '#{organism}'"
|
78
|
+
end
|
79
|
+
|
56
80
|
end
|
81
|
+
|
82
|
+
|
@@ -0,0 +1,337 @@
|
|
1
|
+
require 'rbbt/sources/organism'
|
2
|
+
require 'rbbt/util/workflow'
|
3
|
+
require 'bio'
|
4
|
+
# Sequence analyses
|
5
|
+
module Organism
|
6
|
+
extend WorkFlow
|
7
|
+
|
8
|
+
def self.coding_transcripts_for_exon(org, exon, exon_transcripts, transcript_info)
|
9
|
+
exon_transcripts ||= Organism.transcript_exons(org).tsv(:double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
|
10
|
+
transcript_info ||= Organism.transcripts.tsv(org).tsv(:list, :persistence => true )
|
11
|
+
|
12
|
+
transcripts = exon_transcripts[exon].first
|
13
|
+
transcripts.select{|transcript| transcript_info[transcript]["Ensembl Protein ID"].any?}
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.codon_at_transcript_position(org, transcript, offset, transcript_sequence = nil, transcript_5utr = nil)
|
17
|
+
transcript_sequence ||= Organism.transcript_sequence(org).tsv(:single, :persistence => true)
|
18
|
+
transcript_5utr ||= Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
|
19
|
+
|
20
|
+
utr5 = transcript_5utr[transcript]
|
21
|
+
|
22
|
+
raise "UTR5 for transcript #{ transcript } was missing" if utr5.nil?
|
23
|
+
|
24
|
+
return nil if utr5 > offset
|
25
|
+
|
26
|
+
sequence = transcript_sequence[transcript]
|
27
|
+
raise "Sequence for transcript #{ transcript } was missing" if sequence.nil? if sequence.nil?
|
28
|
+
|
29
|
+
ccds_offset = offset - utr5
|
30
|
+
return nil if ccds_offset > sequence.length
|
31
|
+
|
32
|
+
range = (utr5..-1)
|
33
|
+
sequence = sequence[range]
|
34
|
+
|
35
|
+
codon = ccds_offset / 3
|
36
|
+
codon_offset = ccds_offset % 3
|
37
|
+
|
38
|
+
[sequence[(codon * 3)..((codon + 1) * 3 - 1)], codon_offset, codon]
|
39
|
+
end
|
40
|
+
|
41
|
+
def self.codon_change(allele, codon, offset)
|
42
|
+
original = Bio::Sequence::NA .new(codon).translate
|
43
|
+
codon = codon.dup
|
44
|
+
codon[offset] = allele
|
45
|
+
new = Bio::Sequence::NA .new(codon).translate
|
46
|
+
[original, new]
|
47
|
+
end
|
48
|
+
|
49
|
+
def self.genes_at_chromosome_positions(org, chromosome, positions)
|
50
|
+
chromosome = chromosome.to_s
|
51
|
+
chromosome_bed = Persistence.persist(Organism.gene_positions(org), "Gene_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => true) do |file, options|
|
52
|
+
tsv = file.tsv(:persistence => false, :type => :list)
|
53
|
+
tsv.select("Chromosome Name" => chromosome).collect do |gene, values|
|
54
|
+
[gene, values.values_at("Gene Start", "Gene End").collect{|p| p.to_i}]
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
if Array === positions
|
59
|
+
positions.collect{|position| pos = chromosome_bed[position]; pos.nil? ? nil : pos.first}
|
60
|
+
else
|
61
|
+
pos = chromosome_bed[positions];
|
62
|
+
pos.nil? ? nil : pos.first
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
def self.genes_at_genomic_positions(org, positions)
|
67
|
+
positions = [positions] unless Array === positions.first
|
68
|
+
genes = []
|
69
|
+
chromosomes = {}
|
70
|
+
indices = {}
|
71
|
+
positions.each_with_index do |info,i|
|
72
|
+
chr, pos = info
|
73
|
+
chromosomes[chr] ||= []
|
74
|
+
indices[chr] ||= []
|
75
|
+
chromosomes[chr] << pos
|
76
|
+
indices[chr] << i
|
77
|
+
end
|
78
|
+
|
79
|
+
chromosomes.each do |chr, pos_list|
|
80
|
+
chr_genes = genes_at_chromosome_positions(org, chr, pos_list)
|
81
|
+
chr_genes.zip(indices[chr]).each do |gene, index| genes[index] = gene end
|
82
|
+
end
|
83
|
+
|
84
|
+
genes
|
85
|
+
end
|
86
|
+
|
87
|
+
def self.exons_at_chromosome_positions(org, chromosome, positions)
|
88
|
+
chromosome = chromosome.to_s
|
89
|
+
chromosome_bed = Persistence.persist(Organism.exons(org), "Exon_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => true) do |file, options|
|
90
|
+
tsv = file.tsv(:persistence => true, :type => :list)
|
91
|
+
tsv.select("Chromosome Name" => chromosome).collect do |exon, values|
|
92
|
+
[exon, values.values_at("Exon Chr Start", "Exon Chr End").collect{|p| p.to_i}]
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
if Array === positions
|
97
|
+
positions.collect{|position|
|
98
|
+
chromosome_bed[position];
|
99
|
+
}
|
100
|
+
else
|
101
|
+
chromosome_bed[positions];
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
|
106
|
+
def self.exons_at_genomic_positions(org, positions)
|
107
|
+
positions = [positions] unless Array === positions.first
|
108
|
+
|
109
|
+
exons = []
|
110
|
+
chromosomes = {}
|
111
|
+
indices = {}
|
112
|
+
positions.each_with_index do |info,i|
|
113
|
+
chr, pos = info
|
114
|
+
chromosomes[chr] ||= []
|
115
|
+
indices[chr] ||= []
|
116
|
+
chromosomes[chr] << pos
|
117
|
+
indices[chr] << i
|
118
|
+
end
|
119
|
+
|
120
|
+
chromosomes.each do |chr, pos_list|
|
121
|
+
chr_exons = exons_at_chromosome_positions(org, chr, pos_list)
|
122
|
+
chr_exons.zip(indices[chr]).each do |exon, index| exons[index] = exon end
|
123
|
+
end
|
124
|
+
|
125
|
+
exons
|
126
|
+
end
|
127
|
+
|
128
|
+
def self.exon_offset_in_transcript(org, exon, transcript, exons = nil, transcript_exons = nil)
|
129
|
+
exons ||= Organism.exons(org).tsv(:persistence => true)
|
130
|
+
transcript_exons ||= Organism.transcript_exons(org).tsv(:double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"], :persistence => true)
|
131
|
+
|
132
|
+
sizes = [0]
|
133
|
+
rank = nil
|
134
|
+
transcript_exons[transcript].zip_fields.each do |_exon, _rank|
|
135
|
+
_rank = _rank.to_i
|
136
|
+
s, e = exons[_exon].values_at("Start", "End")
|
137
|
+
size = e.to_i - s.to_i + 1
|
138
|
+
sizes[_rank] = size
|
139
|
+
rank = _rank if _exon == exon
|
140
|
+
end
|
141
|
+
|
142
|
+
if not rank.nil?
|
143
|
+
sizes[0..rank - 1].inject(0){|e,acc| acc += e}
|
144
|
+
else
|
145
|
+
nil
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
def self.exon_transcript_offsets(org, exons, exon_offsets = nil, exon_info = nil)
|
150
|
+
exon_info ||= Organism.exons(org).tsv(:persistence => true)
|
151
|
+
exon_offsets ||= Organism.exon_offsets(org).tsv(:double, :persistence => true)
|
152
|
+
|
153
|
+
exons = [exons] unless Array === exons
|
154
|
+
transcript_offsets = {}
|
155
|
+
exons.each do |exon|
|
156
|
+
transcript_offsets[exon] ||= {}
|
157
|
+
offsets = exon_offsets[exon].zip_fields
|
158
|
+
|
159
|
+
offsets.collect do |transcript, offset|
|
160
|
+
next if transcript.empty?
|
161
|
+
transcript_offsets[exon][transcript] = offset.to_i
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
transcript_offsets
|
166
|
+
end
|
167
|
+
|
168
|
+
def self.genomic_position_transcript_offsets(org, positions, exon_offsets = nil, exon_start = nil, exon_end = nil, exon_strand = nil)
|
169
|
+
exon_offsets ||= Organism.exon_offsets(org).tsv(:double, :persistence => true)
|
170
|
+
exon_start ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
|
171
|
+
exon_end ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
|
172
|
+
exon_strand ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
|
173
|
+
|
174
|
+
exons = exons_at_genomic_positions(org, positions)
|
175
|
+
offsets = Organism.exon_transcript_offsets(org, exons.flatten.uniq, exon_offsets, exon_info)
|
176
|
+
|
177
|
+
position_exons = {}
|
178
|
+
positions.zip(exons).each do |position,pos_exons| position_exons[position] = pos_exons end
|
179
|
+
|
180
|
+
position_offsets = {}
|
181
|
+
position_exons.each do |position,pos_exons|
|
182
|
+
chr, pos = position
|
183
|
+
next if pos_exons.nil? or pos_exons.empty?
|
184
|
+
pos_exons.each do |exon|
|
185
|
+
if offsets.include? exon
|
186
|
+
if exon_strand[exon] == 1
|
187
|
+
offset_in_exon = (pos.to_i - exon_start[exon].to_i)
|
188
|
+
else
|
189
|
+
offset_in_exon = (exon_end[exon] - pos.to_i)
|
190
|
+
end
|
191
|
+
position_offsets[position] ||= {}
|
192
|
+
offsets[exon].each do |transcript, offset|
|
193
|
+
if not offset.nil?
|
194
|
+
position_offsets[position][transcript] = [offset + offset_in_exon, exon_strand[exon]]
|
195
|
+
end
|
196
|
+
end
|
197
|
+
end
|
198
|
+
end
|
199
|
+
end
|
200
|
+
|
201
|
+
position_offsets
|
202
|
+
end
|
203
|
+
|
204
|
+
task_option :org, "Organism", :string
|
205
|
+
task_option :genomic_mutations, "Position (chr:position), Allele", :tsv
|
206
|
+
task :genomic_mutation_to_protein_mutation => :tsv do |org, genomic_mutations|
|
207
|
+
positions = genomic_mutations.keys.collect{|l| l.split(":")}
|
208
|
+
|
209
|
+
step(:prepare, "Prepare Results")
|
210
|
+
results = TSV.new({})
|
211
|
+
results.key_field = "Position"
|
212
|
+
results.fields = ["Ensembl Transcript ID", "Mutation"]
|
213
|
+
results.type = :double
|
214
|
+
|
215
|
+
step(:resources, "Load Resources")
|
216
|
+
transcript_sequence = Organism.transcript_sequence(org).tsv(:single, :persistence => true)
|
217
|
+
transcript_5utr = Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
|
218
|
+
exon_offsets = Organism.exon_offsets(org).tsv(:double, :persistence => true)
|
219
|
+
exon_start = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
|
220
|
+
exon_end = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
|
221
|
+
exon_strand = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
|
222
|
+
|
223
|
+
step(:offsets, "Find transcripts and offsets for mutations")
|
224
|
+
offsets = Organism.genomic_position_transcript_offsets(org, positions, exon_offsets, exon_start, exon_end, exon_strand)
|
225
|
+
|
226
|
+
step(:aminoacid, "Translate mutation to amino acid substitutions")
|
227
|
+
offsets.each do |position, transcripts|
|
228
|
+
alleles = genomic_mutations[position * ":"].collect{|allele| Misc.IUPAC_to_base(allele)}.flatten
|
229
|
+
|
230
|
+
transcripts.each do |transcript, offset_info|
|
231
|
+
offset, strand = offset_info
|
232
|
+
ddd strand
|
233
|
+
begin
|
234
|
+
codon = Organism.codon_at_transcript_position(org, transcript, offset, transcript_sequence, transcript_5utr)
|
235
|
+
rescue
|
236
|
+
Log.medium $!.message
|
237
|
+
next
|
238
|
+
end
|
239
|
+
|
240
|
+
ddd codon
|
241
|
+
if not codon.nil?
|
242
|
+
alleles.each do |allele|
|
243
|
+
ddd allele
|
244
|
+
allele = Misc::BASE2COMPLEMENT[allele] if strand == -1
|
245
|
+
ddd allele
|
246
|
+
change = Organism.codon_change(allele, *codon.values_at(0,1))
|
247
|
+
pos_code = position * ":"
|
248
|
+
mutation = [change.first, codon.last + 1, change.last] * ""
|
249
|
+
if results.include? pos_code
|
250
|
+
results[pos_code] = results[pos_code].merge [transcript, mutation]
|
251
|
+
else
|
252
|
+
results[pos_code] = [[transcript], [mutation]]
|
253
|
+
end
|
254
|
+
end
|
255
|
+
end
|
256
|
+
end
|
257
|
+
|
258
|
+
end
|
259
|
+
|
260
|
+
results
|
261
|
+
end
|
262
|
+
end
|
263
|
+
|
264
|
+
if __FILE__ == $0
|
265
|
+
require 'rbbt/util/log'
|
266
|
+
require 'benchmark'
|
267
|
+
|
268
|
+
select = <<-EOF
|
269
|
+
3:64581875
|
270
|
+
EOF
|
271
|
+
select = select.split("\n").collect{|l| l.split(":")}
|
272
|
+
|
273
|
+
picmi_test = <<-EOF
|
274
|
+
#Chromosome Name Position Reference Tumor
|
275
|
+
1 100382265 C G
|
276
|
+
1 100380997 A G
|
277
|
+
22 30163533 A C
|
278
|
+
X 10094215 G A
|
279
|
+
X 10085674 C T
|
280
|
+
20 50071099 G T
|
281
|
+
21 19638426 G T
|
282
|
+
2 230633386 C T
|
283
|
+
2 230312220 C T
|
284
|
+
1 100624830 T A
|
285
|
+
4 30723053 G T
|
286
|
+
EOF
|
287
|
+
|
288
|
+
# Build 37
|
289
|
+
picmi_test = <<-EOF
|
290
|
+
#Chromosome Name Position Reference Tumor
|
291
|
+
1 100624830 T A
|
292
|
+
21 19638426 G T
|
293
|
+
EOF
|
294
|
+
|
295
|
+
|
296
|
+
# # Build 36
|
297
|
+
# picmi_test = <<-EOF
|
298
|
+
##Chromosome Name Position Reference Tumor
|
299
|
+
#3 81780820 T C
|
300
|
+
#2 43881517 A T
|
301
|
+
#2 43857514 T C
|
302
|
+
#6 88375602 G A
|
303
|
+
#16 69875502 G T
|
304
|
+
#16 69876078 T C
|
305
|
+
#16 69877147 G A
|
306
|
+
#17 8101874 C T
|
307
|
+
# EOF
|
308
|
+
|
309
|
+
|
310
|
+
Log.severity = 2
|
311
|
+
org = 'Hsa/may2009'
|
312
|
+
file = File.join(ENV["HOME"], 'git/rbbt-util/integration_test/data/Metastasis.tsv')
|
313
|
+
|
314
|
+
#positions = TSV.new(StringIO.new(picmi_test), :list, :sep => /\s+/, :fix => Proc.new{|l| l.sub(/\s+/,':')})
|
315
|
+
positions = TSV.new(file, :list, :fix => Proc.new{|l| l.sub(/\t/,':')})
|
316
|
+
positions.key_field = "Position"
|
317
|
+
positions.fields = %w(Reference Control Tumor)
|
318
|
+
#positions.fields = %w(Reference Tumor)
|
319
|
+
|
320
|
+
#puts positions.slice(["Reference", "Tumor"]).to_s.split(/\n/).collect{|line| next if line =~ /#/; parts = line.split(/\t|:/); parts[3] = Misc.IUPAC_to_base(parts[3]).first; parts * ","}.compact * "\n"
|
321
|
+
|
322
|
+
|
323
|
+
#positions = positions.select ["10:98099540"]
|
324
|
+
|
325
|
+
Organism.basedir = Rbbt.tmp.organism.sequence.jobs.find :user
|
326
|
+
job = Organism.job :genomic_mutation_to_protein_mutation, "Metastasis", org, positions.slice("Tumor")
|
327
|
+
job.run
|
328
|
+
|
329
|
+
while not job.done?
|
330
|
+
puts job.step
|
331
|
+
sleep 2
|
332
|
+
end
|
333
|
+
|
334
|
+
raise job.messages.last if job.error?
|
335
|
+
mutations = job.load
|
336
|
+
|
337
|
+
end
|