rbbt-sources 0.4.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/etc/biomart/missing_in_archive +15 -0
- data/lib/rbbt/sources/COSMIC.rb +14 -0
- data/lib/rbbt/sources/COSTART.rb +1 -1
- data/lib/rbbt/sources/CTCAE.rb +1 -1
- data/lib/rbbt/sources/InterPro.rb +17 -0
- data/lib/rbbt/sources/NCI.rb +7 -0
- data/lib/rbbt/sources/biomart.rb +9 -9
- data/lib/rbbt/sources/entrez.rb +44 -17
- data/lib/rbbt/sources/go.rb +10 -7
- data/lib/rbbt/sources/jochem.rb +4 -0
- data/lib/rbbt/sources/organism.rb +24 -25
- data/lib/rbbt/sources/organism/sequence.rb +253 -19
- data/lib/rbbt/sources/polysearch.rb +5 -5
- data/lib/rbbt/sources/pubmed.rb +10 -5
- data/lib/rbbt/sources/wgEncodeBroadHmm.rb +37 -0
- data/share/install/InterPro/Rakefile +29 -0
- data/share/install/JoChem/Rakefile +67 -0
- data/share/install/NCI/Rakefile +79 -0
- data/share/install/Organism/Hsa/Rakefile +20 -1
- data/share/install/Organism/Rno/Rakefile +2 -0
- data/share/install/Organism/organism_helpers.rb +134 -77
- data/share/install/lib/helpers.rb +6 -5
- data/test/rbbt/sources/test_biomart.rb +8 -5
- data/test/rbbt/sources/test_organism.rb +23 -19
- metadata +39 -14
@@ -0,0 +1,15 @@
|
|
1
|
+
may2009:
|
2
|
+
- agilent_wholegenome
|
3
|
+
- agilent_cgh_44b
|
4
|
+
- illumina_humanwg_6_v2
|
5
|
+
- illumina_humanwg_6_v3
|
6
|
+
dec2007:
|
7
|
+
- protein_id
|
8
|
+
- affy_hc_g110
|
9
|
+
- affy_hg_u133a_2
|
10
|
+
- affy_huex_1_0_st_v2
|
11
|
+
- affy_hugene_1_0_st_v1
|
12
|
+
- agilent_wholegenome
|
13
|
+
- agilent_cgh_44b
|
14
|
+
- illumina_humanwg_6_v2
|
15
|
+
- illumina_humanwg_6_v3
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
module COSMIC
|
4
|
+
extend Resource
|
5
|
+
self.subdir = "share/databases/COSMIC"
|
6
|
+
|
7
|
+
COSMIC.claim COSMIC.Mutations, :proc do
|
8
|
+
url = "ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/CosmicMutantExport_v54_120711.tsv"
|
9
|
+
|
10
|
+
TSV.open(Open.open(url), :header_hash => "", :key_field => "Mutation GRCh37 genome position", :merge => true).to_s
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
puts COSMIC.Mutations.produce
|
data/lib/rbbt/sources/COSTART.rb
CHANGED
@@ -2,7 +2,7 @@ require 'rbbt-util'
|
|
2
2
|
|
3
3
|
module COSTART
|
4
4
|
|
5
|
-
Rbbt.share.databases.COSTART.COSTART
|
5
|
+
Rbbt.claim Rbbt.share.databases.COSTART.COSTART, :proc do
|
6
6
|
terms = ["#COSTART Terms"]
|
7
7
|
Open.open('http://hedwig.mgh.harvard.edu/biostatistics/files/costart.html').lines.each do |line|
|
8
8
|
puts line
|
data/lib/rbbt/sources/CTCAE.rb
CHANGED
@@ -2,5 +2,5 @@ require 'rbbt-util'
|
|
2
2
|
require 'rbbt/util/excel2tsv'
|
3
3
|
|
4
4
|
module CTCAE
|
5
|
-
Rbbt.share.CTCAE.CTCAE
|
5
|
+
Rbbt.claim Rbbt.share.databases.CTCAE.CTCAE, :proc do TSV.excel2tsv('http://evs.nci.nih.gov/ftp1/CTCAE/CTCAE_4.03_2010-06-14.xls').to_s end
|
6
6
|
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
module InterPro
|
3
|
+
extend Resource
|
4
|
+
self.subdir = "share/databases/InterPro"
|
5
|
+
|
6
|
+
InterPro.claim InterPro.root.find, :rake, Rbbt.share.install.InterPro.Rakefile.find(:lib)
|
7
|
+
|
8
|
+
def self.tsv(*args)
|
9
|
+
old_url = BioMart::BIOMART_URL
|
10
|
+
begin
|
11
|
+
BioMart::BIOMART_URL.replace "http://www.ebi.ac.uk/interpro/biomart/martservice?query="
|
12
|
+
BioMart.tsv(*args)
|
13
|
+
ensure
|
14
|
+
BioMart::BIOMART_URL.replace old_url
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
data/lib/rbbt/sources/biomart.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'rbbt'
|
2
|
-
require 'rbbt/
|
2
|
+
require 'rbbt/tsv'
|
3
|
+
require 'rbbt/tsv/attach'
|
3
4
|
require 'rbbt/util/log'
|
4
5
|
require 'cgi'
|
5
6
|
|
@@ -14,7 +15,7 @@ module BioMart
|
|
14
15
|
|
15
16
|
BIOMART_URL = 'http://biomart.org/biomart/martservice?query='
|
16
17
|
|
17
|
-
MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.yaml
|
18
|
+
MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.exists? ? Rbbt.etc.biomart.missing_in_archive.yaml : {}
|
18
19
|
|
19
20
|
private
|
20
21
|
|
@@ -68,10 +69,10 @@ module BioMart
|
|
68
69
|
|
69
70
|
new_datafile = TmpFile.tmp_file
|
70
71
|
if data.nil?
|
71
|
-
TSV.
|
72
|
+
TSV.merge_row_fields Open.open(result_file), new_datafile
|
72
73
|
data = new_datafile
|
73
74
|
else
|
74
|
-
TSV.
|
75
|
+
TSV.merge_different_fields data, result_file, new_datafile
|
75
76
|
FileUtils.rm data
|
76
77
|
data = new_datafile
|
77
78
|
end
|
@@ -117,7 +118,6 @@ module BioMart
|
|
117
118
|
}
|
118
119
|
|
119
120
|
chunks << chunk if chunk.any?
|
120
|
-
|
121
121
|
|
122
122
|
Log.low "Chunks: #{chunks.length}"
|
123
123
|
chunks.each_with_index{|chunk,i|
|
@@ -125,15 +125,15 @@ module BioMart
|
|
125
125
|
data = get(database, main, chunk, filters, data, open_options)
|
126
126
|
}
|
127
127
|
|
128
|
-
open_options[:filename] ||= "BioMart
|
128
|
+
open_options[:filename] ||= "BioMart[#{main}+#{attrs.length}]"
|
129
129
|
if filename.nil?
|
130
|
-
results = TSV.
|
130
|
+
results = TSV.open data, open_options
|
131
131
|
results.key_field = main
|
132
132
|
results.fields = attrs
|
133
133
|
results
|
134
134
|
else
|
135
135
|
Open.write(filename) do |f|
|
136
|
-
f.puts "#: " << Misc.hash2string(TSV::
|
136
|
+
f.puts "#: " << Misc.hash2string(TSV::ENTRIES.collect{|key| [key, open_options[key]]})
|
137
137
|
if field_names.nil?
|
138
138
|
f.puts "#" << [main, attrs].flatten * "\t"
|
139
139
|
else
|
@@ -148,7 +148,7 @@ module BioMart
|
|
148
148
|
|
149
149
|
def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
150
150
|
if @archive_url
|
151
|
-
attrs = attrs.reject{|attr| MISSING_IN_ARCHIVE[@archive].include? attr[1]}
|
151
|
+
attrs = attrs.reject{|attr| (MISSING_IN_ARCHIVE[@archive] || []).include? attr[1]}
|
152
152
|
end
|
153
153
|
|
154
154
|
codes = attrs.collect{|attr| attr[1]}
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -1,18 +1,19 @@
|
|
1
|
-
require 'rbbt
|
2
|
-
require 'rbbt/
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/tsv'
|
3
|
+
require 'rbbt/resource'
|
3
4
|
require 'rbbt/bow/bow'
|
4
5
|
require 'set'
|
5
6
|
|
6
7
|
module Entrez
|
7
8
|
|
8
|
-
Rbbt.share.databases.entrez.gene_info
|
9
|
-
Rbbt.share.databases.entrez.gene2pubmed
|
9
|
+
Rbbt.claim Rbbt.share.databases.entrez.gene_info, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
|
10
|
+
Rbbt.claim Rbbt.share.databases.entrez.gene2pubmed, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
|
10
11
|
|
11
12
|
def self.entrez2native(taxs, options = {})
|
12
|
-
options = Misc.add_defaults options, :
|
13
|
+
options = Misc.add_defaults options, :key_field => 1, :fields => 5, :persist => true, :merge => true
|
13
14
|
|
14
15
|
taxs = [taxs] unless Array === taxs
|
15
|
-
options.merge! :grep => taxs.collect{|t| "
|
16
|
+
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
|
16
17
|
|
17
18
|
tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
|
18
19
|
tsv.key_field = "Entrez Gene ID"
|
@@ -20,12 +21,24 @@ module Entrez
|
|
20
21
|
tsv
|
21
22
|
end
|
22
23
|
|
24
|
+
def self.entrez2name(taxs, options = {})
|
25
|
+
options = Misc.add_defaults options, :key_field => 1, :fields => 2, :persist => true, :merge => true
|
26
|
+
|
27
|
+
taxs = [taxs] unless Array === taxs
|
28
|
+
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
|
29
|
+
|
30
|
+
tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
|
31
|
+
tsv.key_field = "Entrez Gene ID"
|
32
|
+
tsv.fields = ["Associated Gene Name"]
|
33
|
+
tsv
|
34
|
+
end
|
35
|
+
|
36
|
+
|
23
37
|
def self.entrez2pubmed(taxs)
|
24
|
-
options = {:
|
38
|
+
options = {:key_field => 1, :fields => 2, :persist => true, :merge => true}
|
25
39
|
|
26
40
|
taxs = [taxs] unless taxs.is_a?(Array)
|
27
|
-
|
28
|
-
options.merge! :grep => taxs.collect{|t| "^#{ t }\t"}
|
41
|
+
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
|
29
42
|
|
30
43
|
Rbbt.share.databases.entrez.gene2pubmed.tsv :flat, options
|
31
44
|
end
|
@@ -58,18 +71,31 @@ module Entrez
|
|
58
71
|
private
|
59
72
|
|
60
73
|
def self.get_online(geneids)
|
61
|
-
geneids_list = ( geneids.is_a?(Array) ? geneids.join(',') : geneids.to_s )
|
62
|
-
url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list}"
|
63
74
|
|
64
|
-
|
75
|
+
genes_complete = geneids.is_a?(Array) ? geneids : [geneids]
|
65
76
|
|
66
|
-
genes =
|
77
|
+
genes = []
|
78
|
+
Misc.divide(genes_complete, (genes_complete.length / 100) + 1).each do |geneids_list|
|
79
|
+
begin
|
80
|
+
Misc.try3times do
|
81
|
+
url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list * ","}"
|
82
|
+
|
83
|
+
xml = Open.read(url, :wget_options => {:quiet => true}, :nocache => true)
|
84
|
+
|
85
|
+
genes += xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten
|
86
|
+
end
|
87
|
+
rescue
|
88
|
+
puts $!.message
|
89
|
+
genes += geneids_list.collect{|g| nil}
|
90
|
+
end
|
91
|
+
end
|
67
92
|
|
68
93
|
if geneids.is_a? Array
|
69
|
-
list =
|
70
|
-
genes.
|
71
|
-
geneid =
|
72
|
-
|
94
|
+
list = Hash[*genes_complete.zip([nil]).flatten]
|
95
|
+
genes.each{|gene|
|
96
|
+
geneid = gene.match(/<Gene-track_geneid>(\d+)/)[1]
|
97
|
+
geneid = geneid.to_i unless list.include? geneid
|
98
|
+
list[geneid] = gene
|
73
99
|
}
|
74
100
|
return list
|
75
101
|
else
|
@@ -99,6 +125,7 @@ module Entrez
|
|
99
125
|
end
|
100
126
|
}
|
101
127
|
|
128
|
+
|
102
129
|
return list unless missing.any?
|
103
130
|
genes = get_online(missing)
|
104
131
|
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -1,22 +1,24 @@
|
|
1
|
-
require 'rbbt
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
require 'rbbt/persist/tsv'
|
2
4
|
|
3
5
|
# This module holds helper methods to deal with the Gene Ontology files. Right
|
4
6
|
# now all it does is provide a translation from id to the actual names.
|
5
7
|
module GO
|
6
8
|
|
7
|
-
Rbbt.share.databases.GO.gene_ontology
|
8
|
-
Rbbt.share.databases.GO.gslim_generic
|
9
|
+
Rbbt.claim Rbbt.share.databases.GO.gene_ontology, :url, 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo'
|
10
|
+
Rbbt.claim Rbbt.share.databases.GO.gslim_generic, :url, 'http://www.geneontology.org/GO_slims/goslim_generic.obo'
|
9
11
|
|
10
12
|
MULTIPLE_VALUE_FIELDS = %w(is_a)
|
11
|
-
TSV_GENE_ONTOLOGY = File.join(
|
13
|
+
TSV_GENE_ONTOLOGY = File.join(Persist.cachedir, 'gene_ontology')
|
12
14
|
|
13
15
|
# This method needs to be called before any translations can be made, it is
|
14
16
|
# called automatically the first time the id2name method is called. It loads
|
15
17
|
# the gene_ontology.obo file and extracts all the fields, although right now,
|
16
18
|
# only the name field is used.
|
17
19
|
def self.init
|
18
|
-
|
19
|
-
info =
|
20
|
+
Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
|
21
|
+
info.serializer = :marshal if info.respond_to? :serializer and info.serializer == :type
|
20
22
|
Rbbt.share.databases.GO.gene_ontology.read.split(/\[Term\]/).each{|term|
|
21
23
|
term_info = {}
|
22
24
|
|
@@ -33,12 +35,13 @@ module GO
|
|
33
35
|
next if term_info["id"].nil?
|
34
36
|
info[term_info["id"]] = term_info
|
35
37
|
}
|
38
|
+
|
36
39
|
info
|
37
40
|
end
|
38
41
|
end
|
39
42
|
|
40
43
|
def self.info
|
41
|
-
self.init
|
44
|
+
@info ||= self.init
|
42
45
|
end
|
43
46
|
|
44
47
|
def self.goterms
|
@@ -1,32 +1,35 @@
|
|
1
|
-
require 'rbbt
|
2
|
-
require 'rbbt/
|
3
|
-
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
require 'rbbt/resource/with_key'
|
4
4
|
|
5
5
|
module Organism
|
6
6
|
extend Resource
|
7
|
-
|
7
|
+
self.pkgdir = "rbbt"
|
8
|
+
self.subdir = "share/organisms"
|
8
9
|
|
9
|
-
|
10
|
+
["Hsa", "Rno", "Sce"].each do |organism|
|
11
|
+
claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
|
10
12
|
|
11
|
-
|
12
|
-
|
13
|
-
|
13
|
+
module_eval "#{ organism } = with_key '#{organism}'"
|
14
|
+
end
|
15
|
+
|
16
|
+
class OrganismNotProcessedError < StandardError; end
|
14
17
|
|
15
18
|
def self.attach_translations(org, tsv, target = nil, fields = nil, options = {})
|
16
19
|
Log.high "Attaching Translations for #{ org.inspect }, target #{target.inspect}, fields #{fields.inspect}"
|
17
|
-
options = Misc.add_defaults options, :
|
20
|
+
options = Misc.add_defaults options, :persist => true, :case_insensitive => false
|
18
21
|
|
19
|
-
options.merge! :
|
22
|
+
options.merge! :key_field => target unless target.nil?
|
20
23
|
options.merge! :fields => fields unless fields.nil?
|
21
24
|
|
22
25
|
index = identifiers(org).tsv options
|
23
26
|
|
24
|
-
tsv.attach index, [:key]
|
27
|
+
tsv.attach index, :fields => [:key], :persist_input => true
|
25
28
|
end
|
26
29
|
|
27
30
|
def self.normalize(org, list, target = nil, fields = nil, options = {})
|
28
31
|
return [] if list.nil? or list.empty?
|
29
|
-
options = Misc.add_defaults options, :
|
32
|
+
options = Misc.add_defaults options, :persist => true, :case_insensitive => true, :double => false
|
30
33
|
double = Misc.process_options options, :double
|
31
34
|
|
32
35
|
|
@@ -50,14 +53,20 @@ module Organism
|
|
50
53
|
end
|
51
54
|
end
|
52
55
|
|
53
|
-
def self.guess_id(org, values)
|
54
|
-
identifiers
|
56
|
+
def self.guess_id(org, values, identifiers = nil)
|
57
|
+
identifiers ||= TSV.setup(Organism.identifiers(org), :persist => true)
|
55
58
|
field_matches = identifiers.field_matches(values)
|
56
59
|
field_matches.sort_by{|field, matches| matches.uniq.length}.last
|
57
60
|
end
|
58
61
|
|
62
|
+
def self.guess_id(org, values)
|
63
|
+
field_matches = TSV.field_match_counts(Organism.identifiers(org).find, values)
|
64
|
+
field_matches.sort_by{|field, count| count.to_i}.last
|
65
|
+
end
|
66
|
+
|
67
|
+
|
59
68
|
def self.organisms
|
60
|
-
Dir.glob(File.join(
|
69
|
+
Dir.glob(File.join(Organism.root.find, '*')).collect{|f| File.basename(f)}
|
61
70
|
end
|
62
71
|
|
63
72
|
def self.name(organism)
|
@@ -70,14 +79,4 @@ module Organism
|
|
70
79
|
}.first
|
71
80
|
end
|
72
81
|
|
73
|
-
["Hsa", "Rno", "Sce"].each do |organism|
|
74
|
-
rakefile = Rbbt["share/install/Organism/#{ organism }/Rakefile"]
|
75
|
-
rakefile.lib_dir = Resource.caller_lib_dir __FILE__
|
76
|
-
rakefile.pkgdir = 'phgx'
|
77
|
-
Organism[organism].define_as_rake rakefile
|
78
|
-
module_eval "#{ organism } = with_key '#{organism}'"
|
79
|
-
end
|
80
|
-
|
81
82
|
end
|
82
|
-
|
83
|
-
|
@@ -11,7 +11,12 @@ module Organism
|
|
11
11
|
exon_transcripts ||= Organism.transcript_exons(org).tsv(:double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
|
12
12
|
transcript_info ||= Organism.transcripts.tsv(org).tsv(:list, :persistence => true )
|
13
13
|
|
14
|
-
transcripts =
|
14
|
+
transcripts = begin
|
15
|
+
exon_transcripts[exon].first
|
16
|
+
rescue
|
17
|
+
[]
|
18
|
+
end
|
19
|
+
|
15
20
|
transcripts.select{|transcript| transcript_info[transcript]["Ensembl Protein ID"].any?}
|
16
21
|
end
|
17
22
|
|
@@ -156,6 +161,8 @@ module Organism
|
|
156
161
|
transcript_offsets = {}
|
157
162
|
exons.each do |exon|
|
158
163
|
transcript_offsets[exon] ||= {}
|
164
|
+
offsets = nil
|
165
|
+
next unless exon_offsets.include? exon
|
159
166
|
offsets = exon_offsets[exon].zip_fields
|
160
167
|
|
161
168
|
offsets.collect do |transcript, offset|
|
@@ -173,7 +180,7 @@ module Organism
|
|
173
180
|
exon_end ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
|
174
181
|
exon_strand ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
|
175
182
|
|
176
|
-
exons
|
183
|
+
exons = exons_at_genomic_positions(org, positions)
|
177
184
|
offsets = Organism.exon_transcript_offsets(org, exons.flatten.uniq, exon_offsets, exon_info)
|
178
185
|
|
179
186
|
position_exons = {}
|
@@ -203,8 +210,135 @@ module Organism
|
|
203
210
|
position_offsets
|
204
211
|
end
|
205
212
|
|
213
|
+
def self.exon_junctures_at_chromosome_positions(org, chromosome, positions)
|
214
|
+
chromosome = chromosome.to_s
|
215
|
+
chromosome_start = Persistence.persist(Organism.exons(org), "Exon_start[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
|
216
|
+
tsv = file.tsv(:persistence => true, :type => :list)
|
217
|
+
tsv.select("Chromosome Name" => chromosome).collect do |exon, values|
|
218
|
+
[exon, values["Exon Chr Start"].to_i]
|
219
|
+
end
|
220
|
+
end
|
221
|
+
chromosome_end = Persistence.persist(Organism.exons(org), "Exon_end[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
|
222
|
+
tsv = file.tsv(:persistence => true, :type => :list)
|
223
|
+
tsv.select("Chromosome Name" => chromosome).collect do |exon, values|
|
224
|
+
[exon, values["Exon Chr End"].to_i]
|
225
|
+
end
|
226
|
+
end
|
227
|
+
|
228
|
+
if Array === positions
|
229
|
+
positions.collect{|position|
|
230
|
+
position = position.to_i
|
231
|
+
chromosome_start[(position - 2)..(position + 2)] + chromosome_end[(position - 2)..(position + 2)];
|
232
|
+
}
|
233
|
+
else
|
234
|
+
position = positions.to_i
|
235
|
+
chromosome_start[(position - 2)..(position + 2)] + chromosome_end[(position - 2)..(position + 2)];
|
236
|
+
end
|
237
|
+
|
238
|
+
end
|
239
|
+
|
240
|
+
def self.exon_junctures_at_genomic_positions(org, positions)
|
241
|
+
positions = [positions] unless Array === positions.first
|
242
|
+
|
243
|
+
exons = []
|
244
|
+
chromosomes = {}
|
245
|
+
indices = {}
|
246
|
+
positions.each_with_index do |info,i|
|
247
|
+
chr, pos = info
|
248
|
+
chromosomes[chr] ||= []
|
249
|
+
indices[chr] ||= []
|
250
|
+
chromosomes[chr] << pos
|
251
|
+
indices[chr] << i
|
252
|
+
end
|
253
|
+
|
254
|
+
chromosomes.each do |chr, pos_list|
|
255
|
+
chr_exons = exon_junctures_at_chromosome_positions(org, chr, pos_list)
|
256
|
+
chr_exons.zip(indices[chr]).each do |exon, index| exons[index] = exon end
|
257
|
+
end
|
258
|
+
|
259
|
+
exons
|
260
|
+
end
|
261
|
+
|
262
|
+
def self.identify_variations_at_chromosome_positions(org, chromosome, positions, variations)
|
263
|
+
chromosome = chromosome.to_s
|
264
|
+
|
265
|
+
chromosome_bed = Persistence.persist(variations, "Variation_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
|
266
|
+
rows = []
|
267
|
+
chromosome = options[:chromosome]
|
268
|
+
f = CMD.cmd("grep '[[:space:]]#{chromosome}[[:space:]]' #{ file }", :pipe => true)
|
269
|
+
while not f.eof?
|
270
|
+
line = f.gets.chomp
|
271
|
+
id, chr, pos = line.split "\t"
|
272
|
+
rows << [id, pos.to_i]
|
273
|
+
end
|
274
|
+
|
275
|
+
rows
|
276
|
+
end
|
277
|
+
|
278
|
+
if Array === positions
|
279
|
+
positions.collect{|position|
|
280
|
+
chromosome_bed[position];
|
281
|
+
}
|
282
|
+
else
|
283
|
+
chromosome_bed[positions];
|
284
|
+
end
|
285
|
+
end
|
286
|
+
|
287
|
+
|
288
|
+
def self.identify_variations_at_genomic_positions(org, positions, variations_file)
|
289
|
+
positions = [positions] unless Array === positions.first
|
290
|
+
|
291
|
+
variations = []
|
292
|
+
chromosomes = {}
|
293
|
+
indices = {}
|
294
|
+
positions.each_with_index do |info,i|
|
295
|
+
chr, pos = info
|
296
|
+
chromosomes[chr] ||= []
|
297
|
+
indices[chr] ||= []
|
298
|
+
chromosomes[chr] << pos
|
299
|
+
indices[chr] << i
|
300
|
+
end
|
301
|
+
|
302
|
+
chromosomes.each do |chr, pos_list|
|
303
|
+
chr_variations = identify_variations_at_chromosome_positions(org, chr, pos_list, variations_file)
|
304
|
+
chr_variations.zip(indices[chr]).each do |variation, index| variations[index] = variation end
|
305
|
+
end
|
306
|
+
|
307
|
+
variations
|
308
|
+
end
|
309
|
+
|
310
|
+
task_option :organism, "Organism", :string, "Hsa"
|
311
|
+
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
|
312
|
+
task_dependencies nil
|
313
|
+
task :genomic_mutations_in_exon_junctions => :tsv do |org,genomic_mutations|
|
314
|
+
genomic_mutations = case
|
315
|
+
when TSV === genomic_mutations
|
316
|
+
genomic_mutations
|
317
|
+
else
|
318
|
+
TSV.new StringIO.new(genomic_mutations), :list
|
319
|
+
end
|
320
|
+
genomic_mutations.key_field ||= "Position"
|
321
|
+
genomic_mutations.fields ||= ["Mutation"]
|
322
|
+
|
323
|
+
positions = genomic_mutations.keys.collect{|l| l.split(":")}
|
324
|
+
|
325
|
+
step(:resources, "Load Resources")
|
326
|
+
|
327
|
+
exon_junctures = {}
|
328
|
+
genomic_mutations.keys.zip(Organism.exon_junctures_at_genomic_positions(org, positions)).each do |position, exons|
|
329
|
+
exon_junctures[position] = exons
|
330
|
+
end
|
331
|
+
|
332
|
+
genomic_mutations.add_field "Exon Junctions" do |position, values|
|
333
|
+
exon_junctures[position] * "|"
|
334
|
+
end
|
335
|
+
|
336
|
+
genomic_mutations.to_s :sort, true
|
337
|
+
end
|
338
|
+
|
339
|
+
|
206
340
|
task_option :organism, "Organism", :string, "Hsa"
|
207
|
-
task_option :genomic_mutations, "Position (chr:position)
|
341
|
+
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
|
208
342
|
task_dependencies nil
|
209
343
|
task :genomic_mutations_to_genes => :tsv do |org,genomic_mutations|
|
210
344
|
genomic_mutations = case
|
@@ -213,8 +347,8 @@ module Organism
|
|
213
347
|
else
|
214
348
|
TSV.new StringIO.new(genomic_mutations), :list
|
215
349
|
end
|
216
|
-
genomic_mutations.key_field
|
217
|
-
genomic_mutations.fields
|
350
|
+
genomic_mutations.key_field ||= "Position"
|
351
|
+
genomic_mutations.fields ||= ["Mutation"]
|
218
352
|
|
219
353
|
positions = genomic_mutations.keys.collect{|l| l.split(":")}
|
220
354
|
|
@@ -234,7 +368,7 @@ Translates a collection of mutations in genomic coordinates into mutations in am
|
|
234
368
|
protein products of transcripts including those positions.
|
235
369
|
EOF
|
236
370
|
task_option :organism, "Organism", :string, "Hsa"
|
237
|
-
task_option :genomic_mutations, "Position (chr:position)
|
371
|
+
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
|
238
372
|
task_dependencies nil
|
239
373
|
task :genomic_mutations_to_protein_mutations => :tsv do |org,genomic_mutations|
|
240
374
|
genomic_mutations = case
|
@@ -244,8 +378,8 @@ protein products of transcripts including those positions.
|
|
244
378
|
TSV.new StringIO.new(genomic_mutations), :list
|
245
379
|
end
|
246
380
|
|
247
|
-
genomic_mutations.key_field
|
248
|
-
genomic_mutations.fields
|
381
|
+
genomic_mutations.key_field ||= "Position"
|
382
|
+
genomic_mutations.fields ||= ["Mutation"]
|
249
383
|
|
250
384
|
positions = genomic_mutations.keys.collect{|l| l.split(":")}
|
251
385
|
|
@@ -256,7 +390,6 @@ protein products of transcripts including those positions.
|
|
256
390
|
results.type = :double
|
257
391
|
results.filename = path
|
258
392
|
|
259
|
-
|
260
393
|
step(:resources, "Load Resources")
|
261
394
|
transcript_sequence = Organism.transcript_sequence(org).tsv(:single, :persistence => true)
|
262
395
|
transcript_5utr = Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
|
@@ -264,26 +397,31 @@ protein products of transcripts including those positions.
|
|
264
397
|
exon_start = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
|
265
398
|
exon_end = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
|
266
399
|
exon_strand = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
|
400
|
+
transcript_to_protein = Organism.transcripts(org).tsv(:single, :fields => "Ensembl Protein ID", :persistence => true)
|
267
401
|
|
268
402
|
step(:offsets, "Find transcripts and offsets for mutations")
|
269
403
|
offsets = Organism.genomic_position_transcript_offsets(org, positions, exon_offsets, exon_start, exon_end, exon_strand)
|
270
404
|
|
271
405
|
step(:aminoacid, "Translate mutation to amino acid substitutions")
|
272
406
|
offsets.each do |position, transcripts|
|
273
|
-
|
407
|
+
if genomic_mutations.type === :double
|
408
|
+
alleles = genomic_mutations[position * ":"]["Mutation"].collect{|mutation| Misc.IUPAC_to_base(mutation)}.compact.flatten
|
409
|
+
else
|
410
|
+
alleles = Misc.IUPAC_to_base(genomic_mutations[position * ":"]["Mutation"]) || []
|
411
|
+
end
|
274
412
|
|
275
413
|
transcripts.each do |transcript, offset_info|
|
276
414
|
offset, strand = offset_info
|
277
|
-
begin
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
if not codon.nil?
|
415
|
+
codon = begin
|
416
|
+
Organism.codon_at_transcript_position(org, transcript, offset, transcript_sequence, transcript_5utr)
|
417
|
+
rescue
|
418
|
+
Log.medium $!.message
|
419
|
+
next
|
420
|
+
end
|
421
|
+
|
422
|
+
if not codon.nil? and not codon.empty?
|
285
423
|
alleles.each do |allele|
|
286
|
-
allele = Misc::BASE2COMPLEMENT[allele] if strand == -1
|
424
|
+
allele = Misc::BASE2COMPLEMENT[allele] if strand == "-1"
|
287
425
|
change = Organism.codon_change(allele, *codon.values_at(0,1))
|
288
426
|
pos_code = position * ":"
|
289
427
|
mutation = [change.first, codon.last + 1, change.last] * ""
|
@@ -298,8 +436,93 @@ protein products of transcripts including those positions.
|
|
298
436
|
|
299
437
|
end
|
300
438
|
|
439
|
+
step(:identify_proteins, "Identify Proteins for Transcripts")
|
440
|
+
transcript_field = results.identify_field "Ensembl Transcript ID"
|
441
|
+
results.add_field "#{org.sub(/\/.*/,'')}:Ensembl Protein ID" do |key,values|
|
442
|
+
values[transcript_field].collect do |transcript| transcript_to_protein[transcript] end
|
443
|
+
end
|
444
|
+
|
445
|
+
|
301
446
|
results
|
302
447
|
end
|
448
|
+
|
449
|
+
|
450
|
+
task_option :organism, "Organism", :string, "Hsa"
|
451
|
+
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
|
452
|
+
task_dependencies nil
|
453
|
+
task :identify_germline_variations => :tsv do |org,genomic_mutations|
|
454
|
+
genomic_mutations = case
|
455
|
+
when TSV === genomic_mutations
|
456
|
+
genomic_mutations
|
457
|
+
else
|
458
|
+
TSV.new StringIO.new(genomic_mutations), :list
|
459
|
+
end
|
460
|
+
|
461
|
+
genomic_mutations.key_field ||= "Position"
|
462
|
+
genomic_mutations.fields ||= ["Mutation"]
|
463
|
+
|
464
|
+
positions = genomic_mutations.keys.collect{|l| l.split(":")}
|
465
|
+
|
466
|
+
|
467
|
+
step(:prepare, "Prepare Results")
|
468
|
+
results = TSV.new({})
|
469
|
+
results.key_field = "Position"
|
470
|
+
results.fields = ["SNP Id"]
|
471
|
+
results.type = :double
|
472
|
+
results.filename = path
|
473
|
+
|
474
|
+
|
475
|
+
step(:resources, "Load Resources")
|
476
|
+
|
477
|
+
snp_ids = Organism.identify_variations_at_genomic_positions(org, positions, Organism.germline_variations(org).produce).collect{|ids| ids * "|"}
|
478
|
+
snps_for_positions = Hash[*genomic_mutations.keys.zip(snp_ids).flatten]
|
479
|
+
|
480
|
+
genomic_mutations.add_field "Germline SNP Id" do |position, values|
|
481
|
+
snps_for_positions[position]
|
482
|
+
end
|
483
|
+
|
484
|
+
genomic_mutations
|
485
|
+
end
|
486
|
+
|
487
|
+
|
488
|
+
task_option :organism, "Organism", :string, "Hsa"
|
489
|
+
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
|
490
|
+
task_dependencies nil
|
491
|
+
task :identify_somatic_variations => :tsv do |org,genomic_mutations|
|
492
|
+
genomic_mutations = case
|
493
|
+
when TSV === genomic_mutations
|
494
|
+
genomic_mutations
|
495
|
+
else
|
496
|
+
TSV.new StringIO.new(genomic_mutations), :list
|
497
|
+
end
|
498
|
+
|
499
|
+
genomic_mutations.key_field ||= "Position"
|
500
|
+
genomic_mutations.fields ||= ["Mutation"]
|
501
|
+
|
502
|
+
positions = genomic_mutations.keys.collect{|l| l.split(":")}
|
503
|
+
|
504
|
+
|
505
|
+
step(:prepare, "Prepare Results")
|
506
|
+
results = TSV.new({})
|
507
|
+
results.key_field = "Position"
|
508
|
+
results.fields = ["SNP Id"]
|
509
|
+
results.type = :double
|
510
|
+
results.filename = path
|
511
|
+
|
512
|
+
|
513
|
+
step(:resources, "Load Resources")
|
514
|
+
|
515
|
+
snp_ids = Organism.identify_variations_at_genomic_positions(org, positions, Organism.somatic_variations(org).produce).collect{|ids| ids * "|"}
|
516
|
+
snps_for_positions = Hash[*genomic_mutations.keys.zip(snp_ids).flatten]
|
517
|
+
|
518
|
+
genomic_mutations.add_field "Germline SNP Id" do |position, values|
|
519
|
+
snps_for_positions[position]
|
520
|
+
end
|
521
|
+
|
522
|
+
genomic_mutations
|
523
|
+
end
|
524
|
+
|
525
|
+
|
303
526
|
end
|
304
527
|
|
305
528
|
if __FILE__ == $0
|
@@ -333,6 +556,17 @@ X 10085674 C T
|
|
333
556
|
21 19638426 G T
|
334
557
|
EOF
|
335
558
|
|
559
|
+
exon_juncture_test = <<-EOF
|
560
|
+
#Position Mutation
|
561
|
+
7:150753996 T
|
562
|
+
EOF
|
563
|
+
|
564
|
+
|
565
|
+
job = Organism.job :genomic_mutations_in_exon_junctures, "Test1", TSV.new(StringIO.new(exon_juncture_test), :list, :sep => " "), :organism => "Hsa"
|
566
|
+
job.run
|
567
|
+
job.clean if job.error?
|
568
|
+
puts job.messages
|
569
|
+
puts job.read
|
336
570
|
|
337
571
|
# # Build 36
|
338
572
|
# picmi_test = <<-EOF
|