rbbt-sources 0.4.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ may2009:
2
+ - agilent_wholegenome
3
+ - agilent_cgh_44b
4
+ - illumina_humanwg_6_v2
5
+ - illumina_humanwg_6_v3
6
+ dec2007:
7
+ - protein_id
8
+ - affy_hc_g110
9
+ - affy_hg_u133a_2
10
+ - affy_huex_1_0_st_v2
11
+ - affy_hugene_1_0_st_v1
12
+ - agilent_wholegenome
13
+ - agilent_cgh_44b
14
+ - illumina_humanwg_6_v2
15
+ - illumina_humanwg_6_v3
@@ -0,0 +1,14 @@
1
+ require 'rbbt'
2
+ require 'rbbt/resource'
3
# Resource module for the COSMIC mutation export.
#
# Files live under "share/databases/COSMIC". COSMIC.Mutations is claimed with a
# :proc producer that opens the Sanger FTP export as a TSV keyed by
# "Mutation GRCh37 genome position" (rows merged) and returns it as a String.
module COSMIC
  extend Resource
  self.subdir = "share/databases/COSMIC"

  COSMIC.claim COSMIC.Mutations, :proc do
    url = "ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/CosmicMutantExport_v54_120711.tsv"

    TSV.open(Open.open(url), :header_hash => "", :key_field => "Mutation GRCh37 genome position", :merge => true).to_s
  end
end

# Only produce (and print) the resource when this file is run directly, so a
# plain `require` of the module does not trigger the build as a side effect.
puts COSMIC.Mutations.produce if __FILE__ == $0
@@ -2,7 +2,7 @@ require 'rbbt-util'
2
2
 
3
3
  module COSTART
4
4
 
5
- Rbbt.share.databases.COSTART.COSTART.define_as_proc do
5
+ Rbbt.claim Rbbt.share.databases.COSTART.COSTART, :proc do
6
6
  terms = ["#COSTART Terms"]
7
7
  Open.open('http://hedwig.mgh.harvard.edu/biostatistics/files/costart.html').lines.each do |line|
8
8
  puts line
@@ -2,5 +2,5 @@ require 'rbbt-util'
2
2
  require 'rbbt/util/excel2tsv'
3
3
 
4
4
  module CTCAE
5
- Rbbt.share.CTCAE.CTCAE.define_as_url TSV.excel2tsv('http://evs.nci.nih.gov/ftp1/CTCAE/CTCAE_4.03_2010-06-14.xls')
5
+ Rbbt.claim Rbbt.share.databases.CTCAE.CTCAE, :proc do TSV.excel2tsv('http://evs.nci.nih.gov/ftp1/CTCAE/CTCAE_4.03_2010-06-14.xls').to_s end
6
6
  end
@@ -0,0 +1,17 @@
1
+ require 'rbbt-util'
2
+ module InterPro
3
+ extend Resource
4
+ self.subdir = "share/databases/InterPro"
5
+
6
+ InterPro.claim InterPro.root.find, :rake, Rbbt.share.install.InterPro.Rakefile.find(:lib)
7
+
8
# Runs a BioMart query against the InterPro BioMart endpoint.
#
# BioMart::BIOMART_URL is overwritten in place (String#replace) for the
# duration of the call and put back afterwards, even if the query raises.
#
# args - forwarded unchanged to BioMart.tsv.
#
# Returns whatever BioMart.tsv returns.
def self.tsv(*args)
  # Keep a copy, not a reference: String#replace mutates the very object the
  # constant points to, so an alias would already hold the InterPro URL by the
  # time the ensure clause tries to restore the original value.
  old_url = BioMart::BIOMART_URL.dup
  begin
    BioMart::BIOMART_URL.replace "http://www.ebi.ac.uk/interpro/biomart/martservice?query="
    BioMart.tsv(*args)
  ensure
    BioMart::BIOMART_URL.replace old_url
  end
end
17
+ end
@@ -0,0 +1,7 @@
1
+ require 'rbbt-util'
2
# Resource module for the NCI databases.
#
# Files live under "share/databases/NCI"; the module root is claimed with a
# :rake producer pointing at the NCI install Rakefile located via find(:lib).
module NCI
  extend Resource
  self.subdir = "share/databases/NCI"

  nci_root = NCI.root.find
  nci_rakefile = Rbbt.share.install.NCI.Rakefile.find(:lib)
  NCI.claim(nci_root, :rake, nci_rakefile)
end
@@ -1,5 +1,6 @@
1
1
  require 'rbbt'
2
- require 'rbbt/util/tsv'
2
+ require 'rbbt/tsv'
3
+ require 'rbbt/tsv/attach'
3
4
  require 'rbbt/util/log'
4
5
  require 'cgi'
5
6
 
@@ -14,7 +15,7 @@ module BioMart
14
15
 
15
16
  BIOMART_URL = 'http://biomart.org/biomart/martservice?query='
16
17
 
17
- MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.yaml
18
+ MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.exists? ? Rbbt.etc.biomart.missing_in_archive.yaml : {}
18
19
 
19
20
  private
20
21
 
@@ -68,10 +69,10 @@ module BioMart
68
69
 
69
70
  new_datafile = TmpFile.tmp_file
70
71
  if data.nil?
71
- TSV.merge_rows Open.open(result_file), new_datafile
72
+ TSV.merge_row_fields Open.open(result_file), new_datafile
72
73
  data = new_datafile
73
74
  else
74
- TSV.paste_merge data, result_file, new_datafile
75
+ TSV.merge_different_fields data, result_file, new_datafile
75
76
  FileUtils.rm data
76
77
  data = new_datafile
77
78
  end
@@ -117,7 +118,6 @@ module BioMart
117
118
  }
118
119
 
119
120
  chunks << chunk if chunk.any?
120
-
121
121
 
122
122
  Log.low "Chunks: #{chunks.length}"
123
123
  chunks.each_with_index{|chunk,i|
@@ -125,15 +125,15 @@ module BioMart
125
125
  data = get(database, main, chunk, filters, data, open_options)
126
126
  }
127
127
 
128
- open_options[:filename] ||= "BioMart: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}"
128
+ open_options[:filename] ||= "BioMart[#{main}+#{attrs.length}]"
129
129
  if filename.nil?
130
- results = TSV.new data, open_options
130
+ results = TSV.open data, open_options
131
131
  results.key_field = main
132
132
  results.fields = attrs
133
133
  results
134
134
  else
135
135
  Open.write(filename) do |f|
136
- f.puts "#: " << Misc.hash2string(TSV::EXTRA_ACCESSORS.collect{|key| [key, open_options[key]]})
136
+ f.puts "#: " << Misc.hash2string(TSV::ENTRIES.collect{|key| [key, open_options[key]]})
137
137
  if field_names.nil?
138
138
  f.puts "#" << [main, attrs].flatten * "\t"
139
139
  else
@@ -148,7 +148,7 @@ module BioMart
148
148
 
149
149
  def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
150
150
  if @archive_url
151
- attrs = attrs.reject{|attr| MISSING_IN_ARCHIVE[@archive].include? attr[1]}
151
+ attrs = attrs.reject{|attr| (MISSING_IN_ARCHIVE[@archive] || []).include? attr[1]}
152
152
  end
153
153
 
154
154
  codes = attrs.collect{|attr| attr[1]}
@@ -1,18 +1,19 @@
1
- require 'rbbt-util'
2
- require 'rbbt/util/tsv'
1
+ require 'rbbt'
2
+ require 'rbbt/tsv'
3
+ require 'rbbt/resource'
3
4
  require 'rbbt/bow/bow'
4
5
  require 'set'
5
6
 
6
7
  module Entrez
7
8
 
8
- Rbbt.share.databases.entrez.gene_info.define_as_url 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
9
- Rbbt.share.databases.entrez.gene2pubmed.define_as_url 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
9
+ Rbbt.claim Rbbt.share.databases.entrez.gene_info, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
10
+ Rbbt.claim Rbbt.share.databases.entrez.gene2pubmed, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
10
11
 
11
12
  def self.entrez2native(taxs, options = {})
12
- options = Misc.add_defaults options, :key => 1, :fields => 5, :persistence => true, :merge => true
13
+ options = Misc.add_defaults options, :key_field => 1, :fields => 5, :persist => true, :merge => true
13
14
 
14
15
  taxs = [taxs] unless Array === taxs
15
- options.merge! :grep => taxs.collect{|t| "^#{ t }\t"}
16
+ options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
16
17
 
17
18
  tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
18
19
  tsv.key_field = "Entrez Gene ID"
@@ -20,12 +21,24 @@ module Entrez
20
21
  tsv
21
22
  end
22
23
 
24
# Loads the Entrez gene_info resource as a :flat TSV restricted to the given
# taxa, labelled as "Entrez Gene ID" => "Associated Gene Name".
#
# taxs    - a single taxon or an Array of taxa; each is turned into a
#           "^<taxon>" grep pattern.
# options - extra TSV options; :key_field, :fields, :persist and :merge get
#           defaults through Misc.add_defaults, and :grep is always set here.
#
# Returns the TSV object produced by the gene_info resource.
def self.entrez2name(taxs, options = {})
  options = Misc.add_defaults(options, :key_field => 1, :fields => 2, :persist => true, :merge => true)

  tax_list = taxs.is_a?(Array) ? taxs : [taxs]
  patterns = tax_list.collect { |tax| "^" + tax.to_s }
  options.merge!(:grep => patterns)

  names = Rbbt.share.databases.entrez.gene_info.tsv(:flat, options)
  names.key_field = "Entrez Gene ID"
  names.fields = ["Associated Gene Name"]
  names
end
35
+
36
+
23
37
  def self.entrez2pubmed(taxs)
24
- options = {:key => 1, :fields => 2, :persistence => true, :merge => true}
38
+ options = {:key_field => 1, :fields => 2, :persist => true, :merge => true}
25
39
 
26
40
  taxs = [taxs] unless taxs.is_a?(Array)
27
- taxs = taxs.collect{|t| t.to_s}
28
- options.merge! :grep => taxs.collect{|t| "^#{ t }\t"}
41
+ options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
29
42
 
30
43
  Rbbt.share.databases.entrez.gene2pubmed.tsv :flat, options
31
44
  end
@@ -58,18 +71,31 @@ module Entrez
58
71
  private
59
72
 
60
73
  def self.get_online(geneids)
61
- geneids_list = ( geneids.is_a?(Array) ? geneids.join(',') : geneids.to_s )
62
- url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list}"
63
74
 
64
- xml = Open.read(url, :wget_options => {:quiet => true}, :nocache => true)
75
+ genes_complete = geneids.is_a?(Array) ? geneids : [geneids]
65
76
 
66
- genes = xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten
77
+ genes = []
78
+ Misc.divide(genes_complete, (genes_complete.length / 100) + 1).each do |geneids_list|
79
+ begin
80
+ Misc.try3times do
81
+ url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list * ","}"
82
+
83
+ xml = Open.read(url, :wget_options => {:quiet => true}, :nocache => true)
84
+
85
+ genes += xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten
86
+ end
87
+ rescue
88
+ puts $!.message
89
+ genes += geneids_list.collect{|g| nil}
90
+ end
91
+ end
67
92
 
68
93
  if geneids.is_a? Array
69
- list = {}
70
- genes.each_with_index{|gene,i|
71
- geneid = geneids[i]
72
- list[geneid ] = gene
94
+ list = Hash[*genes_complete.zip([nil]).flatten]
95
+ genes.each{|gene|
96
+ geneid = gene.match(/<Gene-track_geneid>(\d+)/)[1]
97
+ geneid = geneid.to_i unless list.include? geneid
98
+ list[geneid] = gene
73
99
  }
74
100
  return list
75
101
  else
@@ -99,6 +125,7 @@ module Entrez
99
125
  end
100
126
  }
101
127
 
128
+
102
129
  return list unless missing.any?
103
130
  genes = get_online(missing)
104
131
 
@@ -1,22 +1,24 @@
1
- require 'rbbt-util'
1
+ require 'rbbt'
2
+ require 'rbbt/resource'
3
+ require 'rbbt/persist/tsv'
2
4
 
3
5
  # This module holds helper methods to deal with the Gene Ontology files. Right
4
6
  # now all it does is provide a translation form id to the actual names.
5
7
  module GO
6
8
 
7
- Rbbt.share.databases.GO.gene_ontology.define_as_url 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo'
8
- Rbbt.share.databases.GO.gslim_generic.define_as_url 'http://www.geneontology.org/GO_slims/goslim_generic.obo'
9
+ Rbbt.claim Rbbt.share.databases.GO.gene_ontology, :url, 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo'
10
+ Rbbt.claim Rbbt.share.databases.GO.gslim_generic, :url, 'http://www.geneontology.org/GO_slims/goslim_generic.obo'
9
11
 
10
12
  MULTIPLE_VALUE_FIELDS = %w(is_a)
11
- TSV_GENE_ONTOLOGY = File.join(Persistence.cachedir, 'gene_ontology')
13
+ TSV_GENE_ONTOLOGY = File.join(Persist.cachedir, 'gene_ontology')
12
14
 
13
15
  # This method needs to be called before any translations can be made, it is
14
16
  # called automatically the first time the id2name method is called. It loads
15
17
  # the gene_ontology.obo file and extracts all the fields, although right now,
16
18
  # only the name field is used.
17
19
  def self.init
18
- init = Persistence.persist_tsv('gene_ontology', :Misc) do
19
- info = {}
20
+ Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
21
+ info.serializer = :marshal if info.respond_to? :serializer and info.serializer == :type
20
22
  Rbbt.share.databases.GO.gene_ontology.read.split(/\[Term\]/).each{|term|
21
23
  term_info = {}
22
24
 
@@ -33,12 +35,13 @@ module GO
33
35
  next if term_info["id"].nil?
34
36
  info[term_info["id"]] = term_info
35
37
  }
38
+
36
39
  info
37
40
  end
38
41
  end
39
42
 
40
43
  def self.info
41
- self.init
44
+ @info ||= self.init
42
45
  end
43
46
 
44
47
  def self.goterms
@@ -0,0 +1,4 @@
1
+ require 'rbbt-util'
2
# Registers the JoChem database directory with Rbbt, claimed with a :rake
# producer pointing at the JoChem install Rakefile.
module JoChem
  jochem_dir = Rbbt.share.databases.JoChem
  jochem_rakefile = Rbbt.share.install.JoChem.Rakefile.find
  Rbbt.claim(jochem_dir, :rake, jochem_rakefile)
end
@@ -1,32 +1,35 @@
1
- require 'rbbt-util'
2
- require 'rbbt/util/resource'
3
-
1
+ require 'rbbt'
2
+ require 'rbbt/resource'
3
+ require 'rbbt/resource/with_key'
4
4
 
5
5
  module Organism
6
6
  extend Resource
7
- relative_to Rbbt, "share/organisms"
7
+ self.pkgdir = "rbbt"
8
+ self.subdir = "share/organisms"
8
9
 
9
- class OrganismNotProcessedError < StandardError; end
10
+ ["Hsa", "Rno", "Sce"].each do |organism|
11
+ claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
10
12
 
11
- def self.datadir(org)
12
- File.join(Rbbt.datadir, 'organisms', org)
13
- end
13
+ module_eval "#{ organism } = with_key '#{organism}'"
14
+ end
15
+
16
+ class OrganismNotProcessedError < StandardError; end
14
17
 
15
18
  def self.attach_translations(org, tsv, target = nil, fields = nil, options = {})
16
19
  Log.high "Attaching Translations for #{ org.inspect }, target #{target.inspect}, fields #{fields.inspect}"
17
- options = Misc.add_defaults options, :persistence => true, :case_insensitive => false
20
+ options = Misc.add_defaults options, :persist => true, :case_insensitive => false
18
21
 
19
- options.merge! :key => target unless target.nil?
22
+ options.merge! :key_field => target unless target.nil?
20
23
  options.merge! :fields => fields unless fields.nil?
21
24
 
22
25
  index = identifiers(org).tsv options
23
26
 
24
- tsv.attach index, [:key]
27
+ tsv.attach index, :fields => [:key], :persist_input => true
25
28
  end
26
29
 
27
30
  def self.normalize(org, list, target = nil, fields = nil, options = {})
28
31
  return [] if list.nil? or list.empty?
29
- options = Misc.add_defaults options, :persistence => true, :case_insensitive => true, :double => false
32
+ options = Misc.add_defaults options, :persist => true, :case_insensitive => true, :double => false
30
33
  double = Misc.process_options options, :double
31
34
 
32
35
 
@@ -50,14 +53,20 @@ module Organism
50
53
  end
51
54
  end
52
55
 
53
- def self.guess_id(org, values)
54
- identifiers = TSV.new(Organism.identifiers(org), :persistence => true)
56
+ def self.guess_id(org, values, identifiers = nil)
57
+ identifiers ||= TSV.setup(Organism.identifiers(org), :persist => true)
55
58
  field_matches = identifiers.field_matches(values)
56
59
  field_matches.sort_by{|field, matches| matches.uniq.length}.last
57
60
  end
58
61
 
62
+ def self.guess_id(org, values)
63
+ field_matches = TSV.field_match_counts(Organism.identifiers(org).find, values)
64
+ field_matches.sort_by{|field, count| count.to_i}.last
65
+ end
66
+
67
+
59
68
  def self.organisms
60
- Dir.glob(File.join(Rbbt.share.organisms.find, '*')).collect{|f| File.basename(f)}
69
+ Dir.glob(File.join(Organism.root.find, '*')).collect{|f| File.basename(f)}
61
70
  end
62
71
 
63
72
  def self.name(organism)
@@ -70,14 +79,4 @@ module Organism
70
79
  }.first
71
80
  end
72
81
 
73
- ["Hsa", "Rno", "Sce"].each do |organism|
74
- rakefile = Rbbt["share/install/Organism/#{ organism }/Rakefile"]
75
- rakefile.lib_dir = Resource.caller_lib_dir __FILE__
76
- rakefile.pkgdir = 'phgx'
77
- Organism[organism].define_as_rake rakefile
78
- module_eval "#{ organism } = with_key '#{organism}'"
79
- end
80
-
81
82
  end
82
-
83
-
@@ -11,7 +11,12 @@ module Organism
11
11
  exon_transcripts ||= Organism.transcript_exons(org).tsv(:double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
12
12
  transcript_info ||= Organism.transcripts.tsv(org).tsv(:list, :persistence => true )
13
13
 
14
- transcripts = exon_transcripts[exon].first
14
+ transcripts = begin
15
+ exon_transcripts[exon].first
16
+ rescue
17
+ []
18
+ end
19
+
15
20
  transcripts.select{|transcript| transcript_info[transcript]["Ensembl Protein ID"].any?}
16
21
  end
17
22
 
@@ -156,6 +161,8 @@ module Organism
156
161
  transcript_offsets = {}
157
162
  exons.each do |exon|
158
163
  transcript_offsets[exon] ||= {}
164
+ offsets = nil
165
+ next unless exon_offsets.include? exon
159
166
  offsets = exon_offsets[exon].zip_fields
160
167
 
161
168
  offsets.collect do |transcript, offset|
@@ -173,7 +180,7 @@ module Organism
173
180
  exon_end ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
174
181
  exon_strand ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
175
182
 
176
- exons = exons_at_genomic_positions(org, positions)
183
+ exons = exons_at_genomic_positions(org, positions)
177
184
  offsets = Organism.exon_transcript_offsets(org, exons.flatten.uniq, exon_offsets, exon_info)
178
185
 
179
186
  position_exons = {}
@@ -203,8 +210,135 @@ module Organism
203
210
  position_offsets
204
211
  end
205
212
 
213
# Finds exons whose start or end coordinate lies within 2 bases of the given
# position(s) on one chromosome.
#
# Two persisted indices are requested through Persistence.persist (type :fwt),
# one built from "Exon Chr Start" and one from "Exon Chr End", each restricted
# to rows whose "Chromosome Name" equals the chromosome.
#
# org        - organism code handed to Organism.exons.
# chromosome - chromosome name; converted with to_s.
# positions  - a single position or an Array of positions; converted with to_i.
#
# Returns, per position, the start-index hits followed by the end-index hits;
# an Array of such results when positions is an Array.
def self.exon_junctures_at_chromosome_positions(org, chromosome, positions)
  chromosome = chromosome.to_s

  # Builds one boundary index from the exons table for the given column.
  boundary_index = lambda do |index_name, field|
    Persistence.persist(Organism.exons(org), index_name, :fwt, :chromosome => chromosome, :range => false) do |file, options|
      exon_table = file.tsv(:persistence => true, :type => :list)
      exon_table.select("Chromosome Name" => chromosome).collect { |exon, values| [exon, values[field].to_i] }
    end
  end

  chromosome_start = boundary_index.call("Exon_start[#{chromosome}]", "Exon Chr Start")
  chromosome_end = boundary_index.call("Exon_end[#{chromosome}]", "Exon Chr End")

  # Queries both indices over the window [position - 2, position + 2].
  nearby = lambda do |raw_position|
    position = raw_position.to_i
    chromosome_start[(position - 2)..(position + 2)] + chromosome_end[(position - 2)..(position + 2)]
  end

  return nearby.call(positions) unless Array === positions

  positions.collect { |position| nearby.call(position) }
end
239
+
240
# Looks up exon junctures for a list of genomic positions, batching the
# lookups per chromosome and returning results in the input order.
#
# org       - organism code, passed through to
#             exon_junctures_at_chromosome_positions.
# positions - an Array of [chromosome, position] pairs, or a single such pair.
#
# Returns an Array with one entry per input position (whatever
# exon_junctures_at_chromosome_positions yields for it); [] for empty input.
def self.exon_junctures_at_genomic_positions(org, positions)
  # Without this guard an empty list would be wrapped into [[]] below and
  # looked up as one position on a nil chromosome.
  return [] if positions.empty?

  positions = [positions] unless Array === positions.first

  # Group positions by chromosome, remembering each one's original index.
  chromosomes = Hash.new { |hash, chr| hash[chr] = [] }
  indices = Hash.new { |hash, chr| hash[chr] = [] }
  positions.each_with_index do |info, i|
    chr, pos = info
    chromosomes[chr] << pos
    indices[chr] << i
  end

  exons = []
  chromosomes.each do |chr, pos_list|
    chr_exons = exon_junctures_at_chromosome_positions(org, chr, pos_list)
    # Scatter per-chromosome results back to their original slots.
    chr_exons.zip(indices[chr]).each { |exon, index| exons[index] = exon }
  end

  exons
end
261
+
262
# Looks up variation ids at the given position(s) on one chromosome.
#
# A persisted index (type :fwt) is requested through Persistence.persist; its
# rows are [id, position] pairs read from the lines of the variations file
# that grep matches for the chromosome surrounded by whitespace. Lines are
# split on tabs and the third column is taken as the position.
#
# org        - not used by this method.
# chromosome - chromosome name; converted with to_s.
# positions  - a single position or an Array of positions, used as index keys.
# variations - the variations file handed to Persistence.persist.
#
# Returns the index lookup for the position, or an Array of lookups when
# positions is an Array.
def self.identify_variations_at_chromosome_positions(org, chromosome, positions, variations)
  chromosome = chromosome.to_s

  chromosome_bed = Persistence.persist(variations, "Variation_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
    chromosome = options[:chromosome]
    matches = CMD.cmd("grep '[[:space:]]#{chromosome}[[:space:]]' #{ file }", :pipe => true)

    rows = []
    until matches.eof?
      id, _chr, pos = matches.gets.chomp.split("\t")
      rows << [id, pos.to_i]
    end
    rows
  end

  return chromosome_bed[positions] unless Array === positions

  positions.collect { |position| chromosome_bed[position] }
end
286
+
287
+
288
# Looks up known variations for a list of genomic positions, batching the
# lookups per chromosome and returning results in the input order.
#
# org             - organism code, passed through to
#                   identify_variations_at_chromosome_positions.
# positions       - an Array of [chromosome, position] pairs, or a single
#                   such pair.
# variations_file - variations file passed through to the per-chromosome
#                   lookup.
#
# Returns an Array with one entry per input position; [] for empty input.
def self.identify_variations_at_genomic_positions(org, positions, variations_file)
  # Without this guard an empty list would be wrapped into [[]] below and
  # looked up as one position on a nil chromosome.
  return [] if positions.empty?

  positions = [positions] unless Array === positions.first

  # Group positions by chromosome, remembering each one's original index.
  chromosomes = Hash.new { |hash, chr| hash[chr] = [] }
  indices = Hash.new { |hash, chr| hash[chr] = [] }
  positions.each_with_index do |info, i|
    chr, pos = info
    chromosomes[chr] << pos
    indices[chr] << i
  end

  variations = []
  chromosomes.each do |chr, pos_list|
    chr_variations = identify_variations_at_chromosome_positions(org, chr, pos_list, variations_file)
    # Scatter per-chromosome results back to their original slots.
    chr_variations.zip(indices[chr]).each { |variation, index| variations[index] = variation }
  end

  variations
end
309
+
310
# Task: annotate each genomic mutation with the exon junctures found at its
# position.
#
# Input is a TSV (or TSV text, parsed as :list) whose keys are split on ":"
# to obtain the position arguments. An "Exon Junctions" field is added holding
# the junctures joined with "|", and the table is returned through to_s.
task_option(:organism, "Organism", :string, "Hsa")
task_option(:genomic_mutations, "Position (chr:position)\tMutation", :tsv)
task_dependencies(nil)
task(:genomic_mutations_in_exon_junctions => :tsv) do |org, genomic_mutations|
  unless TSV === genomic_mutations
    genomic_mutations = TSV.new(StringIO.new(genomic_mutations), :list)
  end
  # Only fill in the labels when the input did not already carry them.
  genomic_mutations.key_field ||= "Position"
  genomic_mutations.fields ||= ["Mutation"]

  positions = genomic_mutations.keys.collect { |key| key.split(":") }

  step(:resources, "Load Resources")

  junctures_by_key = genomic_mutations.keys.zip(Organism.exon_junctures_at_genomic_positions(org, positions))
  exon_junctures = {}
  junctures_by_key.each { |position, exons| exon_junctures[position] = exons }

  genomic_mutations.add_field("Exon Junctions") { |position, values| exon_junctures[position] * "|" }

  genomic_mutations.to_s(:sort, true)
end
338
+
339
+
206
340
  task_option :organism, "Organism", :string, "Hsa"
207
- task_option :genomic_mutations, "Position (chr:position), Allele", :tsv
341
+ task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
208
342
  task_dependencies nil
209
343
  task :genomic_mutations_to_genes => :tsv do |org,genomic_mutations|
210
344
  genomic_mutations = case
@@ -213,8 +347,8 @@ module Organism
213
347
  else
214
348
  TSV.new StringIO.new(genomic_mutations), :list
215
349
  end
216
- genomic_mutations.key_field = "Position"
217
- genomic_mutations.fields = ["Mutation"]
350
+ genomic_mutations.key_field ||= "Position"
351
+ genomic_mutations.fields ||= ["Mutation"]
218
352
 
219
353
  positions = genomic_mutations.keys.collect{|l| l.split(":")}
220
354
 
@@ -234,7 +368,7 @@ Translates a collection of mutations in genomic coordinates into mutations in am
234
368
  protein products of transcripts including those positions.
235
369
  EOF
236
370
  task_option :organism, "Organism", :string, "Hsa"
237
- task_option :genomic_mutations, "Position (chr:position), Allele", :tsv
371
+ task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
238
372
  task_dependencies nil
239
373
  task :genomic_mutations_to_protein_mutations => :tsv do |org,genomic_mutations|
240
374
  genomic_mutations = case
@@ -244,8 +378,8 @@ protein products of transcripts including those positions.
244
378
  TSV.new StringIO.new(genomic_mutations), :list
245
379
  end
246
380
 
247
- genomic_mutations.key_field = "Position"
248
- genomic_mutations.fields = ["Mutation"]
381
+ genomic_mutations.key_field ||= "Position"
382
+ genomic_mutations.fields ||= ["Mutation"]
249
383
 
250
384
  positions = genomic_mutations.keys.collect{|l| l.split(":")}
251
385
 
@@ -256,7 +390,6 @@ protein products of transcripts including those positions.
256
390
  results.type = :double
257
391
  results.filename = path
258
392
 
259
-
260
393
  step(:resources, "Load Resources")
261
394
  transcript_sequence = Organism.transcript_sequence(org).tsv(:single, :persistence => true)
262
395
  transcript_5utr = Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
@@ -264,26 +397,31 @@ protein products of transcripts including those positions.
264
397
  exon_start = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
265
398
  exon_end = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
266
399
  exon_strand = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
400
+ transcript_to_protein = Organism.transcripts(org).tsv(:single, :fields => "Ensembl Protein ID", :persistence => true)
267
401
 
268
402
  step(:offsets, "Find transcripts and offsets for mutations")
269
403
  offsets = Organism.genomic_position_transcript_offsets(org, positions, exon_offsets, exon_start, exon_end, exon_strand)
270
404
 
271
405
  step(:aminoacid, "Translate mutation to amino acid substitutions")
272
406
  offsets.each do |position, transcripts|
273
- alleles = genomic_mutations[position * ":"].collect{|allele| Misc.IUPAC_to_base(allele)}.flatten
407
+ if genomic_mutations.type === :double
408
+ alleles = genomic_mutations[position * ":"]["Mutation"].collect{|mutation| Misc.IUPAC_to_base(mutation)}.compact.flatten
409
+ else
410
+ alleles = Misc.IUPAC_to_base(genomic_mutations[position * ":"]["Mutation"]) || []
411
+ end
274
412
 
275
413
  transcripts.each do |transcript, offset_info|
276
414
  offset, strand = offset_info
277
- begin
278
- codon = Organism.codon_at_transcript_position(org, transcript, offset, transcript_sequence, transcript_5utr)
279
- rescue
280
- Log.medium $!.message
281
- next
282
- end
283
-
284
- if not codon.nil?
415
+ codon = begin
416
+ Organism.codon_at_transcript_position(org, transcript, offset, transcript_sequence, transcript_5utr)
417
+ rescue
418
+ Log.medium $!.message
419
+ next
420
+ end
421
+
422
+ if not codon.nil? and not codon.empty?
285
423
  alleles.each do |allele|
286
- allele = Misc::BASE2COMPLEMENT[allele] if strand == -1
424
+ allele = Misc::BASE2COMPLEMENT[allele] if strand == "-1"
287
425
  change = Organism.codon_change(allele, *codon.values_at(0,1))
288
426
  pos_code = position * ":"
289
427
  mutation = [change.first, codon.last + 1, change.last] * ""
@@ -298,8 +436,93 @@ protein products of transcripts including those positions.
298
436
 
299
437
  end
300
438
 
439
+ step(:identify_proteins, "Identify Proteins for Transcripts")
440
+ transcript_field = results.identify_field "Ensembl Transcript ID"
441
+ results.add_field "#{org.sub(/\/.*/,'')}:Ensembl Protein ID" do |key,values|
442
+ values[transcript_field].collect do |transcript| transcript_to_protein[transcript] end
443
+ end
444
+
445
+
301
446
  results
302
447
  end
448
+
449
+
450
# Task: annotate each genomic mutation with the germline variation ids found
# at its position.
#
# Input is a TSV (or TSV text, parsed as :list) whose keys are split on ":"
# to obtain the position arguments. A "Germline SNP Id" field is added holding
# the ids joined with "|", and the annotated table is returned.
task_option(:organism, "Organism", :string, "Hsa")
task_option(:genomic_mutations, "Position (chr:position)\tMutation", :tsv)
task_dependencies(nil)
task(:identify_germline_variations => :tsv) do |org, genomic_mutations|
  unless TSV === genomic_mutations
    genomic_mutations = TSV.new(StringIO.new(genomic_mutations), :list)
  end

  # Only fill in the labels when the input did not already carry them.
  genomic_mutations.key_field ||= "Position"
  genomic_mutations.fields ||= ["Mutation"]

  positions = genomic_mutations.keys.collect { |key| key.split(":") }

  step(:prepare, "Prepare Results")
  # NOTE(review): this table is configured but not read again in this block.
  results = TSV.new({})
  results.key_field = "Position"
  results.fields = ["SNP Id"]
  results.type = :double
  results.filename = path

  step(:resources, "Load Resources")

  variation_hits = Organism.identify_variations_at_genomic_positions(org, positions, Organism.germline_variations(org).produce)
  snp_ids = variation_hits.collect { |ids| ids * "|" }
  snps_for_positions = Hash[*genomic_mutations.keys.zip(snp_ids).flatten]

  genomic_mutations.add_field("Germline SNP Id") { |position, values| snps_for_positions[position] }

  genomic_mutations
end
486
+
487
+
488
# Task: annotate each genomic mutation with the somatic variation ids found
# at its position.
#
# Input is a TSV (or TSV text, parsed as :list) whose keys are split on ":"
# to obtain the position arguments. The ids come from
# Organism.somatic_variations, are joined with "|", and are added as a
# "Somatic SNP Id" field; the annotated table is returned.
task_option :organism, "Organism", :string, "Hsa"
task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
task_dependencies nil
task :identify_somatic_variations => :tsv do |org,genomic_mutations|
  genomic_mutations = case
                      when TSV === genomic_mutations
                        genomic_mutations
                      else
                        TSV.new StringIO.new(genomic_mutations), :list
                      end

  # Only fill in the labels when the input did not already carry them.
  genomic_mutations.key_field ||= "Position"
  genomic_mutations.fields ||= ["Mutation"]

  positions = genomic_mutations.keys.collect{|l| l.split(":")}

  step(:prepare, "Prepare Results")
  # NOTE(review): this table is configured but not read again in this block.
  results = TSV.new({})
  results.key_field = "Position"
  results.fields = ["SNP Id"]
  results.type = :double
  results.filename = path

  step(:resources, "Load Resources")

  snp_ids = Organism.identify_variations_at_genomic_positions(org, positions, Organism.somatic_variations(org).produce).collect{|ids| ids * "|"}
  snps_for_positions = Hash[*genomic_mutations.keys.zip(snp_ids).flatten]

  # The ids here come from the somatic variations resource, so the column is
  # labelled accordingly rather than reusing the germline task's label.
  genomic_mutations.add_field "Somatic SNP Id" do |position, values|
    snps_for_positions[position]
  end

  genomic_mutations
end
524
+
525
+
303
526
  end
304
527
 
305
528
  if __FILE__ == $0
@@ -333,6 +556,17 @@ X 10085674 C T
333
556
  21 19638426 G T
334
557
  EOF
335
558
 
559
# Smoke test for the exon-juncture task: one space-separated mutation row.
exon_juncture_test = <<-EOF
#Position Mutation
7:150753996 T
EOF

# The job name must match the task declared as
# :genomic_mutations_in_exon_junctions ("junctions", not "junctures").
job = Organism.job :genomic_mutations_in_exon_junctions, "Test1", TSV.new(StringIO.new(exon_juncture_test), :list, :sep => " "), :organism => "Hsa"
job.run
job.clean if job.error?
puts job.messages
puts job.read
336
570
 
337
571
  # # Build 36
338
572
  # picmi_test = <<-EOF