rbbt-sources 0.4.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/etc/biomart/missing_in_archive +15 -0
- data/lib/rbbt/sources/COSMIC.rb +14 -0
- data/lib/rbbt/sources/COSTART.rb +1 -1
- data/lib/rbbt/sources/CTCAE.rb +1 -1
- data/lib/rbbt/sources/InterPro.rb +17 -0
- data/lib/rbbt/sources/NCI.rb +7 -0
- data/lib/rbbt/sources/biomart.rb +9 -9
- data/lib/rbbt/sources/entrez.rb +44 -17
- data/lib/rbbt/sources/go.rb +10 -7
- data/lib/rbbt/sources/jochem.rb +4 -0
- data/lib/rbbt/sources/organism.rb +24 -25
- data/lib/rbbt/sources/organism/sequence.rb +253 -19
- data/lib/rbbt/sources/polysearch.rb +5 -5
- data/lib/rbbt/sources/pubmed.rb +10 -5
- data/lib/rbbt/sources/wgEncodeBroadHmm.rb +37 -0
- data/share/install/InterPro/Rakefile +29 -0
- data/share/install/JoChem/Rakefile +67 -0
- data/share/install/NCI/Rakefile +79 -0
- data/share/install/Organism/Hsa/Rakefile +20 -1
- data/share/install/Organism/Rno/Rakefile +2 -0
- data/share/install/Organism/organism_helpers.rb +134 -77
- data/share/install/lib/helpers.rb +6 -5
- data/test/rbbt/sources/test_biomart.rb +8 -5
- data/test/rbbt/sources/test_organism.rb +23 -19
- metadata +39 -14
@@ -0,0 +1,15 @@
|
|
1
|
+
may2009:
|
2
|
+
- agilent_wholegenome
|
3
|
+
- agilent_cgh_44b
|
4
|
+
- illumina_humanwg_6_v2
|
5
|
+
- illumina_humanwg_6_v3
|
6
|
+
dec2007:
|
7
|
+
- protein_id
|
8
|
+
- affy_hc_g110
|
9
|
+
- affy_hg_u133a_2
|
10
|
+
- affy_huex_1_0_st_v2
|
11
|
+
- affy_hugene_1_0_st_v1
|
12
|
+
- agilent_wholegenome
|
13
|
+
- agilent_cgh_44b
|
14
|
+
- illumina_humanwg_6_v2
|
15
|
+
- illumina_humanwg_6_v3
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
module COSMIC
|
4
|
+
extend Resource
|
5
|
+
self.subdir = "share/databases/COSMIC"
|
6
|
+
|
7
|
+
COSMIC.claim COSMIC.Mutations, :proc do
|
8
|
+
url = "ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/CosmicMutantExport_v54_120711.tsv"
|
9
|
+
|
10
|
+
TSV.open(Open.open(url), :header_hash => "", :key_field => "Mutation GRCh37 genome position", :merge => true).to_s
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
puts COSMIC.Mutations.produce
|
data/lib/rbbt/sources/COSTART.rb
CHANGED
@@ -2,7 +2,7 @@ require 'rbbt-util'
|
|
2
2
|
|
3
3
|
module COSTART
|
4
4
|
|
5
|
-
Rbbt.share.databases.COSTART.COSTART
|
5
|
+
Rbbt.claim Rbbt.share.databases.COSTART.COSTART, :proc do
|
6
6
|
terms = ["#COSTART Terms"]
|
7
7
|
Open.open('http://hedwig.mgh.harvard.edu/biostatistics/files/costart.html').lines.each do |line|
|
8
8
|
puts line
|
data/lib/rbbt/sources/CTCAE.rb
CHANGED
@@ -2,5 +2,5 @@ require 'rbbt-util'
|
|
2
2
|
require 'rbbt/util/excel2tsv'
|
3
3
|
|
4
4
|
module CTCAE
|
5
|
-
Rbbt.share.CTCAE.CTCAE
|
5
|
+
Rbbt.claim Rbbt.share.databases.CTCAE.CTCAE, :proc do TSV.excel2tsv('http://evs.nci.nih.gov/ftp1/CTCAE/CTCAE_4.03_2010-06-14.xls').to_s end
|
6
6
|
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
module InterPro
|
3
|
+
extend Resource
|
4
|
+
self.subdir = "share/databases/InterPro"
|
5
|
+
|
6
|
+
InterPro.claim InterPro.root.find, :rake, Rbbt.share.install.InterPro.Rakefile.find(:lib)
|
7
|
+
|
8
|
+
def self.tsv(*args)
|
9
|
+
old_url = BioMart::BIOMART_URL
|
10
|
+
begin
|
11
|
+
BioMart::BIOMART_URL.replace "http://www.ebi.ac.uk/interpro/biomart/martservice?query="
|
12
|
+
BioMart.tsv(*args)
|
13
|
+
ensure
|
14
|
+
BioMart::BIOMART_URL.replace old_url
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
data/lib/rbbt/sources/biomart.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'rbbt'
|
2
|
-
require 'rbbt/
|
2
|
+
require 'rbbt/tsv'
|
3
|
+
require 'rbbt/tsv/attach'
|
3
4
|
require 'rbbt/util/log'
|
4
5
|
require 'cgi'
|
5
6
|
|
@@ -14,7 +15,7 @@ module BioMart
|
|
14
15
|
|
15
16
|
BIOMART_URL = 'http://biomart.org/biomart/martservice?query='
|
16
17
|
|
17
|
-
MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.yaml
|
18
|
+
MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.exists? ? Rbbt.etc.biomart.missing_in_archive.yaml : {}
|
18
19
|
|
19
20
|
private
|
20
21
|
|
@@ -68,10 +69,10 @@ module BioMart
|
|
68
69
|
|
69
70
|
new_datafile = TmpFile.tmp_file
|
70
71
|
if data.nil?
|
71
|
-
TSV.
|
72
|
+
TSV.merge_row_fields Open.open(result_file), new_datafile
|
72
73
|
data = new_datafile
|
73
74
|
else
|
74
|
-
TSV.
|
75
|
+
TSV.merge_different_fields data, result_file, new_datafile
|
75
76
|
FileUtils.rm data
|
76
77
|
data = new_datafile
|
77
78
|
end
|
@@ -117,7 +118,6 @@ module BioMart
|
|
117
118
|
}
|
118
119
|
|
119
120
|
chunks << chunk if chunk.any?
|
120
|
-
|
121
121
|
|
122
122
|
Log.low "Chunks: #{chunks.length}"
|
123
123
|
chunks.each_with_index{|chunk,i|
|
@@ -125,15 +125,15 @@ module BioMart
|
|
125
125
|
data = get(database, main, chunk, filters, data, open_options)
|
126
126
|
}
|
127
127
|
|
128
|
-
open_options[:filename] ||= "BioMart
|
128
|
+
open_options[:filename] ||= "BioMart[#{main}+#{attrs.length}]"
|
129
129
|
if filename.nil?
|
130
|
-
results = TSV.
|
130
|
+
results = TSV.open data, open_options
|
131
131
|
results.key_field = main
|
132
132
|
results.fields = attrs
|
133
133
|
results
|
134
134
|
else
|
135
135
|
Open.write(filename) do |f|
|
136
|
-
f.puts "#: " << Misc.hash2string(TSV::
|
136
|
+
f.puts "#: " << Misc.hash2string(TSV::ENTRIES.collect{|key| [key, open_options[key]]})
|
137
137
|
if field_names.nil?
|
138
138
|
f.puts "#" << [main, attrs].flatten * "\t"
|
139
139
|
else
|
@@ -148,7 +148,7 @@ module BioMart
|
|
148
148
|
|
149
149
|
def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
150
150
|
if @archive_url
|
151
|
-
attrs = attrs.reject{|attr| MISSING_IN_ARCHIVE[@archive].include? attr[1]}
|
151
|
+
attrs = attrs.reject{|attr| (MISSING_IN_ARCHIVE[@archive] || []).include? attr[1]}
|
152
152
|
end
|
153
153
|
|
154
154
|
codes = attrs.collect{|attr| attr[1]}
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -1,18 +1,19 @@
|
|
1
|
-
require 'rbbt
|
2
|
-
require 'rbbt/
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/tsv'
|
3
|
+
require 'rbbt/resource'
|
3
4
|
require 'rbbt/bow/bow'
|
4
5
|
require 'set'
|
5
6
|
|
6
7
|
module Entrez
|
7
8
|
|
8
|
-
Rbbt.share.databases.entrez.gene_info
|
9
|
-
Rbbt.share.databases.entrez.gene2pubmed
|
9
|
+
Rbbt.claim Rbbt.share.databases.entrez.gene_info, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
|
10
|
+
Rbbt.claim Rbbt.share.databases.entrez.gene2pubmed, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
|
10
11
|
|
11
12
|
def self.entrez2native(taxs, options = {})
|
12
|
-
options = Misc.add_defaults options, :
|
13
|
+
options = Misc.add_defaults options, :key_field => 1, :fields => 5, :persist => true, :merge => true
|
13
14
|
|
14
15
|
taxs = [taxs] unless Array === taxs
|
15
|
-
options.merge! :grep => taxs.collect{|t| "
|
16
|
+
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
|
16
17
|
|
17
18
|
tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
|
18
19
|
tsv.key_field = "Entrez Gene ID"
|
@@ -20,12 +21,24 @@ module Entrez
|
|
20
21
|
tsv
|
21
22
|
end
|
22
23
|
|
24
|
+
def self.entrez2name(taxs, options = {})
|
25
|
+
options = Misc.add_defaults options, :key_field => 1, :fields => 2, :persist => true, :merge => true
|
26
|
+
|
27
|
+
taxs = [taxs] unless Array === taxs
|
28
|
+
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
|
29
|
+
|
30
|
+
tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
|
31
|
+
tsv.key_field = "Entrez Gene ID"
|
32
|
+
tsv.fields = ["Associated Gene Name"]
|
33
|
+
tsv
|
34
|
+
end
|
35
|
+
|
36
|
+
|
23
37
|
def self.entrez2pubmed(taxs)
|
24
|
-
options = {:
|
38
|
+
options = {:key_field => 1, :fields => 2, :persist => true, :merge => true}
|
25
39
|
|
26
40
|
taxs = [taxs] unless taxs.is_a?(Array)
|
27
|
-
|
28
|
-
options.merge! :grep => taxs.collect{|t| "^#{ t }\t"}
|
41
|
+
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
|
29
42
|
|
30
43
|
Rbbt.share.databases.entrez.gene2pubmed.tsv :flat, options
|
31
44
|
end
|
@@ -58,18 +71,31 @@ module Entrez
|
|
58
71
|
private
|
59
72
|
|
60
73
|
def self.get_online(geneids)
|
61
|
-
geneids_list = ( geneids.is_a?(Array) ? geneids.join(',') : geneids.to_s )
|
62
|
-
url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list}"
|
63
74
|
|
64
|
-
|
75
|
+
genes_complete = geneids.is_a?(Array) ? geneids : [geneids]
|
65
76
|
|
66
|
-
genes =
|
77
|
+
genes = []
|
78
|
+
Misc.divide(genes_complete, (genes_complete.length / 100) + 1).each do |geneids_list|
|
79
|
+
begin
|
80
|
+
Misc.try3times do
|
81
|
+
url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list * ","}"
|
82
|
+
|
83
|
+
xml = Open.read(url, :wget_options => {:quiet => true}, :nocache => true)
|
84
|
+
|
85
|
+
genes += xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten
|
86
|
+
end
|
87
|
+
rescue
|
88
|
+
puts $!.message
|
89
|
+
genes += geneids_list.collect{|g| nil}
|
90
|
+
end
|
91
|
+
end
|
67
92
|
|
68
93
|
if geneids.is_a? Array
|
69
|
-
list =
|
70
|
-
genes.
|
71
|
-
geneid =
|
72
|
-
|
94
|
+
list = Hash[*genes_complete.zip([nil]).flatten]
|
95
|
+
genes.each{|gene|
|
96
|
+
geneid = gene.match(/<Gene-track_geneid>(\d+)/)[1]
|
97
|
+
geneid = geneid.to_i unless list.include? geneid
|
98
|
+
list[geneid] = gene
|
73
99
|
}
|
74
100
|
return list
|
75
101
|
else
|
@@ -99,6 +125,7 @@ module Entrez
|
|
99
125
|
end
|
100
126
|
}
|
101
127
|
|
128
|
+
|
102
129
|
return list unless missing.any?
|
103
130
|
genes = get_online(missing)
|
104
131
|
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -1,22 +1,24 @@
|
|
1
|
-
require 'rbbt
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
require 'rbbt/persist/tsv'
|
2
4
|
|
3
5
|
# This module holds helper methods to deal with the Gene Ontology files. Right
|
4
6
|
# now all it does is provide a translation form id to the actual names.
|
5
7
|
module GO
|
6
8
|
|
7
|
-
Rbbt.share.databases.GO.gene_ontology
|
8
|
-
Rbbt.share.databases.GO.gslim_generic
|
9
|
+
Rbbt.claim Rbbt.share.databases.GO.gene_ontology, :url, 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo'
|
10
|
+
Rbbt.claim Rbbt.share.databases.GO.gslim_generic, :url, 'http://www.geneontology.org/GO_slims/goslim_generic.obo'
|
9
11
|
|
10
12
|
MULTIPLE_VALUE_FIELDS = %w(is_a)
|
11
|
-
TSV_GENE_ONTOLOGY = File.join(
|
13
|
+
TSV_GENE_ONTOLOGY = File.join(Persist.cachedir, 'gene_ontology')
|
12
14
|
|
13
15
|
# This method needs to be called before any translations can be made, it is
|
14
16
|
# called automatically the first time the id2name method is called. It loads
|
15
17
|
# the gene_ontology.obo file and extracts all the fields, although right now,
|
16
18
|
# only the name field is used.
|
17
19
|
def self.init
|
18
|
-
|
19
|
-
info =
|
20
|
+
Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
|
21
|
+
info.serializer = :marshal if info.respond_to? :serializer and info.serializer == :type
|
20
22
|
Rbbt.share.databases.GO.gene_ontology.read.split(/\[Term\]/).each{|term|
|
21
23
|
term_info = {}
|
22
24
|
|
@@ -33,12 +35,13 @@ module GO
|
|
33
35
|
next if term_info["id"].nil?
|
34
36
|
info[term_info["id"]] = term_info
|
35
37
|
}
|
38
|
+
|
36
39
|
info
|
37
40
|
end
|
38
41
|
end
|
39
42
|
|
40
43
|
def self.info
|
41
|
-
self.init
|
44
|
+
@info ||= self.init
|
42
45
|
end
|
43
46
|
|
44
47
|
def self.goterms
|
@@ -1,32 +1,35 @@
|
|
1
|
-
require 'rbbt
|
2
|
-
require 'rbbt/
|
3
|
-
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
require 'rbbt/resource/with_key'
|
4
4
|
|
5
5
|
module Organism
|
6
6
|
extend Resource
|
7
|
-
|
7
|
+
self.pkgdir = "rbbt"
|
8
|
+
self.subdir = "share/organisms"
|
8
9
|
|
9
|
-
|
10
|
+
["Hsa", "Rno", "Sce"].each do |organism|
|
11
|
+
claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
|
10
12
|
|
11
|
-
|
12
|
-
|
13
|
-
|
13
|
+
module_eval "#{ organism } = with_key '#{organism}'"
|
14
|
+
end
|
15
|
+
|
16
|
+
class OrganismNotProcessedError < StandardError; end
|
14
17
|
|
15
18
|
def self.attach_translations(org, tsv, target = nil, fields = nil, options = {})
|
16
19
|
Log.high "Attaching Translations for #{ org.inspect }, target #{target.inspect}, fields #{fields.inspect}"
|
17
|
-
options = Misc.add_defaults options, :
|
20
|
+
options = Misc.add_defaults options, :persist => true, :case_insensitive => false
|
18
21
|
|
19
|
-
options.merge! :
|
22
|
+
options.merge! :key_field => target unless target.nil?
|
20
23
|
options.merge! :fields => fields unless fields.nil?
|
21
24
|
|
22
25
|
index = identifiers(org).tsv options
|
23
26
|
|
24
|
-
tsv.attach index, [:key]
|
27
|
+
tsv.attach index, :fields => [:key], :persist_input => true
|
25
28
|
end
|
26
29
|
|
27
30
|
def self.normalize(org, list, target = nil, fields = nil, options = {})
|
28
31
|
return [] if list.nil? or list.empty?
|
29
|
-
options = Misc.add_defaults options, :
|
32
|
+
options = Misc.add_defaults options, :persist => true, :case_insensitive => true, :double => false
|
30
33
|
double = Misc.process_options options, :double
|
31
34
|
|
32
35
|
|
@@ -50,14 +53,20 @@ module Organism
|
|
50
53
|
end
|
51
54
|
end
|
52
55
|
|
53
|
-
def self.guess_id(org, values)
|
54
|
-
identifiers
|
56
|
+
def self.guess_id(org, values, identifiers = nil)
|
57
|
+
identifiers ||= TSV.setup(Organism.identifiers(org), :persist => true)
|
55
58
|
field_matches = identifiers.field_matches(values)
|
56
59
|
field_matches.sort_by{|field, matches| matches.uniq.length}.last
|
57
60
|
end
|
58
61
|
|
62
|
+
def self.guess_id(org, values)
|
63
|
+
field_matches = TSV.field_match_counts(Organism.identifiers(org).find, values)
|
64
|
+
field_matches.sort_by{|field, count| count.to_i}.last
|
65
|
+
end
|
66
|
+
|
67
|
+
|
59
68
|
def self.organisms
|
60
|
-
Dir.glob(File.join(
|
69
|
+
Dir.glob(File.join(Organism.root.find, '*')).collect{|f| File.basename(f)}
|
61
70
|
end
|
62
71
|
|
63
72
|
def self.name(organism)
|
@@ -70,14 +79,4 @@ module Organism
|
|
70
79
|
}.first
|
71
80
|
end
|
72
81
|
|
73
|
-
["Hsa", "Rno", "Sce"].each do |organism|
|
74
|
-
rakefile = Rbbt["share/install/Organism/#{ organism }/Rakefile"]
|
75
|
-
rakefile.lib_dir = Resource.caller_lib_dir __FILE__
|
76
|
-
rakefile.pkgdir = 'phgx'
|
77
|
-
Organism[organism].define_as_rake rakefile
|
78
|
-
module_eval "#{ organism } = with_key '#{organism}'"
|
79
|
-
end
|
80
|
-
|
81
82
|
end
|
82
|
-
|
83
|
-
|
@@ -11,7 +11,12 @@ module Organism
|
|
11
11
|
exon_transcripts ||= Organism.transcript_exons(org).tsv(:double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
|
12
12
|
transcript_info ||= Organism.transcripts.tsv(org).tsv(:list, :persistence => true )
|
13
13
|
|
14
|
-
transcripts =
|
14
|
+
transcripts = begin
|
15
|
+
exon_transcripts[exon].first
|
16
|
+
rescue
|
17
|
+
[]
|
18
|
+
end
|
19
|
+
|
15
20
|
transcripts.select{|transcript| transcript_info[transcript]["Ensembl Protein ID"].any?}
|
16
21
|
end
|
17
22
|
|
@@ -156,6 +161,8 @@ module Organism
|
|
156
161
|
transcript_offsets = {}
|
157
162
|
exons.each do |exon|
|
158
163
|
transcript_offsets[exon] ||= {}
|
164
|
+
offsets = nil
|
165
|
+
next unless exon_offsets.include? exon
|
159
166
|
offsets = exon_offsets[exon].zip_fields
|
160
167
|
|
161
168
|
offsets.collect do |transcript, offset|
|
@@ -173,7 +180,7 @@ module Organism
|
|
173
180
|
exon_end ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
|
174
181
|
exon_strand ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
|
175
182
|
|
176
|
-
exons
|
183
|
+
exons = exons_at_genomic_positions(org, positions)
|
177
184
|
offsets = Organism.exon_transcript_offsets(org, exons.flatten.uniq, exon_offsets, exon_info)
|
178
185
|
|
179
186
|
position_exons = {}
|
@@ -203,8 +210,135 @@ module Organism
|
|
203
210
|
position_offsets
|
204
211
|
end
|
205
212
|
|
213
|
+
def self.exon_junctures_at_chromosome_positions(org, chromosome, positions)
|
214
|
+
chromosome = chromosome.to_s
|
215
|
+
chromosome_start = Persistence.persist(Organism.exons(org), "Exon_start[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
|
216
|
+
tsv = file.tsv(:persistence => true, :type => :list)
|
217
|
+
tsv.select("Chromosome Name" => chromosome).collect do |exon, values|
|
218
|
+
[exon, values["Exon Chr Start"].to_i]
|
219
|
+
end
|
220
|
+
end
|
221
|
+
chromosome_end = Persistence.persist(Organism.exons(org), "Exon_end[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
|
222
|
+
tsv = file.tsv(:persistence => true, :type => :list)
|
223
|
+
tsv.select("Chromosome Name" => chromosome).collect do |exon, values|
|
224
|
+
[exon, values["Exon Chr End"].to_i]
|
225
|
+
end
|
226
|
+
end
|
227
|
+
|
228
|
+
if Array === positions
|
229
|
+
positions.collect{|position|
|
230
|
+
position = position.to_i
|
231
|
+
chromosome_start[(position - 2)..(position + 2)] + chromosome_end[(position - 2)..(position + 2)];
|
232
|
+
}
|
233
|
+
else
|
234
|
+
position = positions.to_i
|
235
|
+
chromosome_start[(position - 2)..(position + 2)] + chromosome_end[(position - 2)..(position + 2)];
|
236
|
+
end
|
237
|
+
|
238
|
+
end
|
239
|
+
|
240
|
+
def self.exon_junctures_at_genomic_positions(org, positions)
|
241
|
+
positions = [positions] unless Array === positions.first
|
242
|
+
|
243
|
+
exons = []
|
244
|
+
chromosomes = {}
|
245
|
+
indices = {}
|
246
|
+
positions.each_with_index do |info,i|
|
247
|
+
chr, pos = info
|
248
|
+
chromosomes[chr] ||= []
|
249
|
+
indices[chr] ||= []
|
250
|
+
chromosomes[chr] << pos
|
251
|
+
indices[chr] << i
|
252
|
+
end
|
253
|
+
|
254
|
+
chromosomes.each do |chr, pos_list|
|
255
|
+
chr_exons = exon_junctures_at_chromosome_positions(org, chr, pos_list)
|
256
|
+
chr_exons.zip(indices[chr]).each do |exon, index| exons[index] = exon end
|
257
|
+
end
|
258
|
+
|
259
|
+
exons
|
260
|
+
end
|
261
|
+
|
262
|
+
def self.identify_variations_at_chromosome_positions(org, chromosome, positions, variations)
|
263
|
+
chromosome = chromosome.to_s
|
264
|
+
|
265
|
+
chromosome_bed = Persistence.persist(variations, "Variation_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
|
266
|
+
rows = []
|
267
|
+
chromosome = options[:chromosome]
|
268
|
+
f = CMD.cmd("grep '[[:space:]]#{chromosome}[[:space:]]' #{ file }", :pipe => true)
|
269
|
+
while not f.eof?
|
270
|
+
line = f.gets.chomp
|
271
|
+
id, chr, pos = line.split "\t"
|
272
|
+
rows << [id, pos.to_i]
|
273
|
+
end
|
274
|
+
|
275
|
+
rows
|
276
|
+
end
|
277
|
+
|
278
|
+
if Array === positions
|
279
|
+
positions.collect{|position|
|
280
|
+
chromosome_bed[position];
|
281
|
+
}
|
282
|
+
else
|
283
|
+
chromosome_bed[positions];
|
284
|
+
end
|
285
|
+
end
|
286
|
+
|
287
|
+
|
288
|
+
def self.identify_variations_at_genomic_positions(org, positions, variations_file)
|
289
|
+
positions = [positions] unless Array === positions.first
|
290
|
+
|
291
|
+
variations = []
|
292
|
+
chromosomes = {}
|
293
|
+
indices = {}
|
294
|
+
positions.each_with_index do |info,i|
|
295
|
+
chr, pos = info
|
296
|
+
chromosomes[chr] ||= []
|
297
|
+
indices[chr] ||= []
|
298
|
+
chromosomes[chr] << pos
|
299
|
+
indices[chr] << i
|
300
|
+
end
|
301
|
+
|
302
|
+
chromosomes.each do |chr, pos_list|
|
303
|
+
chr_variations = identify_variations_at_chromosome_positions(org, chr, pos_list, variations_file)
|
304
|
+
chr_variations.zip(indices[chr]).each do |variation, index| variations[index] = variation end
|
305
|
+
end
|
306
|
+
|
307
|
+
variations
|
308
|
+
end
|
309
|
+
|
310
|
+
task_option :organism, "Organism", :string, "Hsa"
|
311
|
+
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
|
312
|
+
task_dependencies nil
|
313
|
+
task :genomic_mutations_in_exon_junctions => :tsv do |org,genomic_mutations|
|
314
|
+
genomic_mutations = case
|
315
|
+
when TSV === genomic_mutations
|
316
|
+
genomic_mutations
|
317
|
+
else
|
318
|
+
TSV.new StringIO.new(genomic_mutations), :list
|
319
|
+
end
|
320
|
+
genomic_mutations.key_field ||= "Position"
|
321
|
+
genomic_mutations.fields ||= ["Mutation"]
|
322
|
+
|
323
|
+
positions = genomic_mutations.keys.collect{|l| l.split(":")}
|
324
|
+
|
325
|
+
step(:resources, "Load Resources")
|
326
|
+
|
327
|
+
exon_junctures = {}
|
328
|
+
genomic_mutations.keys.zip(Organism.exon_junctures_at_genomic_positions(org, positions)).each do |position, exons|
|
329
|
+
exon_junctures[position] = exons
|
330
|
+
end
|
331
|
+
|
332
|
+
genomic_mutations.add_field "Exon Junctions" do |position, values|
|
333
|
+
exon_junctures[position] * "|"
|
334
|
+
end
|
335
|
+
|
336
|
+
genomic_mutations.to_s :sort, true
|
337
|
+
end
|
338
|
+
|
339
|
+
|
206
340
|
task_option :organism, "Organism", :string, "Hsa"
|
207
|
-
task_option :genomic_mutations, "Position (chr:position)
|
341
|
+
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
|
208
342
|
task_dependencies nil
|
209
343
|
task :genomic_mutations_to_genes => :tsv do |org,genomic_mutations|
|
210
344
|
genomic_mutations = case
|
@@ -213,8 +347,8 @@ module Organism
|
|
213
347
|
else
|
214
348
|
TSV.new StringIO.new(genomic_mutations), :list
|
215
349
|
end
|
216
|
-
genomic_mutations.key_field
|
217
|
-
genomic_mutations.fields
|
350
|
+
genomic_mutations.key_field ||= "Position"
|
351
|
+
genomic_mutations.fields ||= ["Mutation"]
|
218
352
|
|
219
353
|
positions = genomic_mutations.keys.collect{|l| l.split(":")}
|
220
354
|
|
@@ -234,7 +368,7 @@ Translates a collection of mutations in genomic coordinates into mutations in am
|
|
234
368
|
protein products of transcripts including those positions.
|
235
369
|
EOF
|
236
370
|
task_option :organism, "Organism", :string, "Hsa"
|
237
|
-
task_option :genomic_mutations, "Position (chr:position)
|
371
|
+
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
|
238
372
|
task_dependencies nil
|
239
373
|
task :genomic_mutations_to_protein_mutations => :tsv do |org,genomic_mutations|
|
240
374
|
genomic_mutations = case
|
@@ -244,8 +378,8 @@ protein products of transcripts including those positions.
|
|
244
378
|
TSV.new StringIO.new(genomic_mutations), :list
|
245
379
|
end
|
246
380
|
|
247
|
-
genomic_mutations.key_field
|
248
|
-
genomic_mutations.fields
|
381
|
+
genomic_mutations.key_field ||= "Position"
|
382
|
+
genomic_mutations.fields ||= ["Mutation"]
|
249
383
|
|
250
384
|
positions = genomic_mutations.keys.collect{|l| l.split(":")}
|
251
385
|
|
@@ -256,7 +390,6 @@ protein products of transcripts including those positions.
|
|
256
390
|
results.type = :double
|
257
391
|
results.filename = path
|
258
392
|
|
259
|
-
|
260
393
|
step(:resources, "Load Resources")
|
261
394
|
transcript_sequence = Organism.transcript_sequence(org).tsv(:single, :persistence => true)
|
262
395
|
transcript_5utr = Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
|
@@ -264,26 +397,31 @@ protein products of transcripts including those positions.
|
|
264
397
|
exon_start = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
|
265
398
|
exon_end = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
|
266
399
|
exon_strand = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
|
400
|
+
transcript_to_protein = Organism.transcripts(org).tsv(:single, :fields => "Ensembl Protein ID", :persistence => true)
|
267
401
|
|
268
402
|
step(:offsets, "Find transcripts and offsets for mutations")
|
269
403
|
offsets = Organism.genomic_position_transcript_offsets(org, positions, exon_offsets, exon_start, exon_end, exon_strand)
|
270
404
|
|
271
405
|
step(:aminoacid, "Translate mutation to amino acid substitutions")
|
272
406
|
offsets.each do |position, transcripts|
|
273
|
-
|
407
|
+
if genomic_mutations.type === :double
|
408
|
+
alleles = genomic_mutations[position * ":"]["Mutation"].collect{|mutation| Misc.IUPAC_to_base(mutation)}.compact.flatten
|
409
|
+
else
|
410
|
+
alleles = Misc.IUPAC_to_base(genomic_mutations[position * ":"]["Mutation"]) || []
|
411
|
+
end
|
274
412
|
|
275
413
|
transcripts.each do |transcript, offset_info|
|
276
414
|
offset, strand = offset_info
|
277
|
-
begin
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
if not codon.nil?
|
415
|
+
codon = begin
|
416
|
+
Organism.codon_at_transcript_position(org, transcript, offset, transcript_sequence, transcript_5utr)
|
417
|
+
rescue
|
418
|
+
Log.medium $!.message
|
419
|
+
next
|
420
|
+
end
|
421
|
+
|
422
|
+
if not codon.nil? and not codon.empty?
|
285
423
|
alleles.each do |allele|
|
286
|
-
allele = Misc::BASE2COMPLEMENT[allele] if strand == -1
|
424
|
+
allele = Misc::BASE2COMPLEMENT[allele] if strand == "-1"
|
287
425
|
change = Organism.codon_change(allele, *codon.values_at(0,1))
|
288
426
|
pos_code = position * ":"
|
289
427
|
mutation = [change.first, codon.last + 1, change.last] * ""
|
@@ -298,8 +436,93 @@ protein products of transcripts including those positions.
|
|
298
436
|
|
299
437
|
end
|
300
438
|
|
439
|
+
step(:identify_proteins, "Identify Proteins for Transcripts")
|
440
|
+
transcript_field = results.identify_field "Ensembl Transcript ID"
|
441
|
+
results.add_field "#{org.sub(/\/.*/,'')}:Ensembl Protein ID" do |key,values|
|
442
|
+
values[transcript_field].collect do |transcript| transcript_to_protein[transcript] end
|
443
|
+
end
|
444
|
+
|
445
|
+
|
301
446
|
results
|
302
447
|
end
|
448
|
+
|
449
|
+
|
450
|
+
task_option :organism, "Organism", :string, "Hsa"
|
451
|
+
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
|
452
|
+
task_dependencies nil
|
453
|
+
task :identify_germline_variations => :tsv do |org,genomic_mutations|
|
454
|
+
genomic_mutations = case
|
455
|
+
when TSV === genomic_mutations
|
456
|
+
genomic_mutations
|
457
|
+
else
|
458
|
+
TSV.new StringIO.new(genomic_mutations), :list
|
459
|
+
end
|
460
|
+
|
461
|
+
genomic_mutations.key_field ||= "Position"
|
462
|
+
genomic_mutations.fields ||= ["Mutation"]
|
463
|
+
|
464
|
+
positions = genomic_mutations.keys.collect{|l| l.split(":")}
|
465
|
+
|
466
|
+
|
467
|
+
step(:prepare, "Prepare Results")
|
468
|
+
results = TSV.new({})
|
469
|
+
results.key_field = "Position"
|
470
|
+
results.fields = ["SNP Id"]
|
471
|
+
results.type = :double
|
472
|
+
results.filename = path
|
473
|
+
|
474
|
+
|
475
|
+
step(:resources, "Load Resources")
|
476
|
+
|
477
|
+
snp_ids = Organism.identify_variations_at_genomic_positions(org, positions, Organism.germline_variations(org).produce).collect{|ids| ids * "|"}
|
478
|
+
snps_for_positions = Hash[*genomic_mutations.keys.zip(snp_ids).flatten]
|
479
|
+
|
480
|
+
genomic_mutations.add_field "Germline SNP Id" do |position, values|
|
481
|
+
snps_for_positions[position]
|
482
|
+
end
|
483
|
+
|
484
|
+
genomic_mutations
|
485
|
+
end
|
486
|
+
|
487
|
+
|
488
|
+
task_option :organism, "Organism", :string, "Hsa"
|
489
|
+
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
|
490
|
+
task_dependencies nil
|
491
|
+
task :identify_somatic_variations => :tsv do |org,genomic_mutations|
|
492
|
+
genomic_mutations = case
|
493
|
+
when TSV === genomic_mutations
|
494
|
+
genomic_mutations
|
495
|
+
else
|
496
|
+
TSV.new StringIO.new(genomic_mutations), :list
|
497
|
+
end
|
498
|
+
|
499
|
+
genomic_mutations.key_field ||= "Position"
|
500
|
+
genomic_mutations.fields ||= ["Mutation"]
|
501
|
+
|
502
|
+
positions = genomic_mutations.keys.collect{|l| l.split(":")}
|
503
|
+
|
504
|
+
|
505
|
+
step(:prepare, "Prepare Results")
|
506
|
+
results = TSV.new({})
|
507
|
+
results.key_field = "Position"
|
508
|
+
results.fields = ["SNP Id"]
|
509
|
+
results.type = :double
|
510
|
+
results.filename = path
|
511
|
+
|
512
|
+
|
513
|
+
step(:resources, "Load Resources")
|
514
|
+
|
515
|
+
snp_ids = Organism.identify_variations_at_genomic_positions(org, positions, Organism.somatic_variations(org).produce).collect{|ids| ids * "|"}
|
516
|
+
snps_for_positions = Hash[*genomic_mutations.keys.zip(snp_ids).flatten]
|
517
|
+
|
518
|
+
genomic_mutations.add_field "Germline SNP Id" do |position, values|
|
519
|
+
snps_for_positions[position]
|
520
|
+
end
|
521
|
+
|
522
|
+
genomic_mutations
|
523
|
+
end
|
524
|
+
|
525
|
+
|
303
526
|
end
|
304
527
|
|
305
528
|
if __FILE__ == $0
|
@@ -333,6 +556,17 @@ X 10085674 C T
|
|
333
556
|
21 19638426 G T
|
334
557
|
EOF
|
335
558
|
|
559
|
+
exon_juncture_test = <<-EOF
|
560
|
+
#Position Mutation
|
561
|
+
7:150753996 T
|
562
|
+
EOF
|
563
|
+
|
564
|
+
|
565
|
+
job = Organism.job :genomic_mutations_in_exon_junctures, "Test1", TSV.new(StringIO.new(exon_juncture_test), :list, :sep => " "), :organism => "Hsa"
|
566
|
+
job.run
|
567
|
+
job.clean if job.error?
|
568
|
+
puts job.messages
|
569
|
+
puts job.read
|
336
570
|
|
337
571
|
# # Build 36
|
338
572
|
# picmi_test = <<-EOF
|