rbbt-sources 0.4.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,15 @@
1
+ may2009:
2
+ - agilent_wholegenome
3
+ - agilent_cgh_44b
4
+ - illumina_humanwg_6_v2
5
+ - illumina_humanwg_6_v3
6
+ dec2007:
7
+ - protein_id
8
+ - affy_hc_g110
9
+ - affy_hg_u133a_2
10
+ - affy_huex_1_0_st_v2
11
+ - affy_hugene_1_0_st_v1
12
+ - agilent_wholegenome
13
+ - agilent_cgh_44b
14
+ - illumina_humanwg_6_v2
15
+ - illumina_humanwg_6_v3
@@ -0,0 +1,14 @@
1
+ require 'rbbt'
2
+ require 'rbbt/resource'
3
+ module COSMIC
4
+ extend Resource
5
+ self.subdir = "share/databases/COSMIC"
6
+
7
+ COSMIC.claim COSMIC.Mutations, :proc do
8
+ url = "ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/CosmicMutantExport_v54_120711.tsv"
9
+
10
+ TSV.open(Open.open(url), :header_hash => "", :key_field => "Mutation GRCh37 genome position", :merge => true).to_s
11
+ end
12
+ end
13
+
14
+ puts COSMIC.Mutations.produce if __FILE__ == $0 # only produce/print when run as a script, never as a side effect of require
@@ -2,7 +2,7 @@ require 'rbbt-util'
2
2
 
3
3
  module COSTART
4
4
 
5
- Rbbt.share.databases.COSTART.COSTART.define_as_proc do
5
+ Rbbt.claim Rbbt.share.databases.COSTART.COSTART, :proc do
6
6
  terms = ["#COSTART Terms"]
7
7
  Open.open('http://hedwig.mgh.harvard.edu/biostatistics/files/costart.html').lines.each do |line|
8
8
  puts line
@@ -2,5 +2,5 @@ require 'rbbt-util'
2
2
  require 'rbbt/util/excel2tsv'
3
3
 
4
4
  module CTCAE
5
- Rbbt.share.CTCAE.CTCAE.define_as_url TSV.excel2tsv('http://evs.nci.nih.gov/ftp1/CTCAE/CTCAE_4.03_2010-06-14.xls')
5
+ Rbbt.claim Rbbt.share.databases.CTCAE.CTCAE, :proc do TSV.excel2tsv('http://evs.nci.nih.gov/ftp1/CTCAE/CTCAE_4.03_2010-06-14.xls').to_s end
6
6
  end
@@ -0,0 +1,17 @@
1
+ require 'rbbt-util'
2
+ module InterPro
3
+ extend Resource
4
+ self.subdir = "share/databases/InterPro"
5
+
6
+ InterPro.claim InterPro.root.find, :rake, Rbbt.share.install.InterPro.Rakefile.find(:lib)
7
+
8
+ def self.tsv(*args)
9
+ old_url = BioMart::BIOMART_URL
10
+ begin
11
+ BioMart::BIOMART_URL.replace "http://www.ebi.ac.uk/interpro/biomart/martservice?query="
12
+ BioMart.tsv(*args)
13
+ ensure
14
+ BioMart::BIOMART_URL.replace old_url
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,7 @@
1
+ require 'rbbt-util'
2
+ module NCI
3
+ extend Resource
4
+ self.subdir = "share/databases/NCI"
5
+
6
+ NCI.claim NCI.root.find, :rake, Rbbt.share.install.NCI.Rakefile.find(:lib)
7
+ end
@@ -1,5 +1,6 @@
1
1
  require 'rbbt'
2
- require 'rbbt/util/tsv'
2
+ require 'rbbt/tsv'
3
+ require 'rbbt/tsv/attach'
3
4
  require 'rbbt/util/log'
4
5
  require 'cgi'
5
6
 
@@ -14,7 +15,7 @@ module BioMart
14
15
 
15
16
  BIOMART_URL = 'http://biomart.org/biomart/martservice?query='
16
17
 
17
- MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.yaml
18
+ MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.exists? ? Rbbt.etc.biomart.missing_in_archive.yaml : {}
18
19
 
19
20
  private
20
21
 
@@ -68,10 +69,10 @@ module BioMart
68
69
 
69
70
  new_datafile = TmpFile.tmp_file
70
71
  if data.nil?
71
- TSV.merge_rows Open.open(result_file), new_datafile
72
+ TSV.merge_row_fields Open.open(result_file), new_datafile
72
73
  data = new_datafile
73
74
  else
74
- TSV.paste_merge data, result_file, new_datafile
75
+ TSV.merge_different_fields data, result_file, new_datafile
75
76
  FileUtils.rm data
76
77
  data = new_datafile
77
78
  end
@@ -117,7 +118,6 @@ module BioMart
117
118
  }
118
119
 
119
120
  chunks << chunk if chunk.any?
120
-
121
121
 
122
122
  Log.low "Chunks: #{chunks.length}"
123
123
  chunks.each_with_index{|chunk,i|
@@ -125,15 +125,15 @@ module BioMart
125
125
  data = get(database, main, chunk, filters, data, open_options)
126
126
  }
127
127
 
128
- open_options[:filename] ||= "BioMart: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}"
128
+ open_options[:filename] ||= "BioMart[#{main}+#{attrs.length}]"
129
129
  if filename.nil?
130
- results = TSV.new data, open_options
130
+ results = TSV.open data, open_options
131
131
  results.key_field = main
132
132
  results.fields = attrs
133
133
  results
134
134
  else
135
135
  Open.write(filename) do |f|
136
- f.puts "#: " << Misc.hash2string(TSV::EXTRA_ACCESSORS.collect{|key| [key, open_options[key]]})
136
+ f.puts "#: " << Misc.hash2string(TSV::ENTRIES.collect{|key| [key, open_options[key]]})
137
137
  if field_names.nil?
138
138
  f.puts "#" << [main, attrs].flatten * "\t"
139
139
  else
@@ -148,7 +148,7 @@ module BioMart
148
148
 
149
149
  def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
150
150
  if @archive_url
151
- attrs = attrs.reject{|attr| MISSING_IN_ARCHIVE[@archive].include? attr[1]}
151
+ attrs = attrs.reject{|attr| (MISSING_IN_ARCHIVE[@archive] || []).include? attr[1]}
152
152
  end
153
153
 
154
154
  codes = attrs.collect{|attr| attr[1]}
@@ -1,18 +1,19 @@
1
- require 'rbbt-util'
2
- require 'rbbt/util/tsv'
1
+ require 'rbbt'
2
+ require 'rbbt/tsv'
3
+ require 'rbbt/resource'
3
4
  require 'rbbt/bow/bow'
4
5
  require 'set'
5
6
 
6
7
  module Entrez
7
8
 
8
- Rbbt.share.databases.entrez.gene_info.define_as_url 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
9
- Rbbt.share.databases.entrez.gene2pubmed.define_as_url 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
9
+ Rbbt.claim Rbbt.share.databases.entrez.gene_info, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
10
+ Rbbt.claim Rbbt.share.databases.entrez.gene2pubmed, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
10
11
 
11
12
  def self.entrez2native(taxs, options = {})
12
- options = Misc.add_defaults options, :key => 1, :fields => 5, :persistence => true, :merge => true
13
+ options = Misc.add_defaults options, :key_field => 1, :fields => 5, :persist => true, :merge => true
13
14
 
14
15
  taxs = [taxs] unless Array === taxs
15
- options.merge! :grep => taxs.collect{|t| "^#{ t }\t"}
16
+ options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
16
17
 
17
18
  tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
18
19
  tsv.key_field = "Entrez Gene ID"
@@ -20,12 +21,24 @@ module Entrez
20
21
  tsv
21
22
  end
22
23
 
24
+ def self.entrez2name(taxs, options = {})
25
+ options = Misc.add_defaults options, :key_field => 1, :fields => 2, :persist => true, :merge => true
26
+
27
+ taxs = [taxs] unless Array === taxs
28
+ options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
29
+
30
+ tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
31
+ tsv.key_field = "Entrez Gene ID"
32
+ tsv.fields = ["Associated Gene Name"]
33
+ tsv
34
+ end
35
+
36
+
23
37
  def self.entrez2pubmed(taxs)
24
- options = {:key => 1, :fields => 2, :persistence => true, :merge => true}
38
+ options = {:key_field => 1, :fields => 2, :persist => true, :merge => true}
25
39
 
26
40
  taxs = [taxs] unless taxs.is_a?(Array)
27
- taxs = taxs.collect{|t| t.to_s}
28
- options.merge! :grep => taxs.collect{|t| "^#{ t }\t"}
41
+ options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
29
42
 
30
43
  Rbbt.share.databases.entrez.gene2pubmed.tsv :flat, options
31
44
  end
@@ -58,18 +71,31 @@ module Entrez
58
71
  private
59
72
 
60
73
  def self.get_online(geneids)
61
- geneids_list = ( geneids.is_a?(Array) ? geneids.join(',') : geneids.to_s )
62
- url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list}"
63
74
 
64
- xml = Open.read(url, :wget_options => {:quiet => true}, :nocache => true)
75
+ genes_complete = geneids.is_a?(Array) ? geneids : [geneids]
65
76
 
66
- genes = xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten
77
+ genes = []
78
+ Misc.divide(genes_complete, (genes_complete.length / 100) + 1).each do |geneids_list|
79
+ begin
80
+ Misc.try3times do
81
+ url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list * ","}"
82
+
83
+ xml = Open.read(url, :wget_options => {:quiet => true}, :nocache => true)
84
+
85
+ genes += xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten
86
+ end
87
+ rescue
88
+ puts $!.message
89
+ genes += geneids_list.collect{|g| nil}
90
+ end
91
+ end
67
92
 
68
93
  if geneids.is_a? Array
69
- list = {}
70
- genes.each_with_index{|gene,i|
71
- geneid = geneids[i]
72
- list[geneid ] = gene
94
+ list = Hash[*genes_complete.zip([nil]).flatten]
95
+ genes.each{|gene|
96
+ geneid = gene.match(/<Gene-track_geneid>(\d+)/)[1]
97
+ geneid = geneid.to_i unless list.include? geneid
98
+ list[geneid] = gene
73
99
  }
74
100
  return list
75
101
  else
@@ -99,6 +125,7 @@ module Entrez
99
125
  end
100
126
  }
101
127
 
128
+
102
129
  return list unless missing.any?
103
130
  genes = get_online(missing)
104
131
 
@@ -1,22 +1,24 @@
1
- require 'rbbt-util'
1
+ require 'rbbt'
2
+ require 'rbbt/resource'
3
+ require 'rbbt/persist/tsv'
2
4
 
3
5
  # This module holds helper methods to deal with the Gene Ontology files. Right
4
6
  # now all it does is provide a translation form id to the actual names.
5
7
  module GO
6
8
 
7
- Rbbt.share.databases.GO.gene_ontology.define_as_url 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo'
8
- Rbbt.share.databases.GO.gslim_generic.define_as_url 'http://www.geneontology.org/GO_slims/goslim_generic.obo'
9
+ Rbbt.claim Rbbt.share.databases.GO.gene_ontology, :url, 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo'
10
+ Rbbt.claim Rbbt.share.databases.GO.gslim_generic, :url, 'http://www.geneontology.org/GO_slims/goslim_generic.obo'
9
11
 
10
12
  MULTIPLE_VALUE_FIELDS = %w(is_a)
11
- TSV_GENE_ONTOLOGY = File.join(Persistence.cachedir, 'gene_ontology')
13
+ TSV_GENE_ONTOLOGY = File.join(Persist.cachedir, 'gene_ontology')
12
14
 
13
15
  # This method needs to be called before any translations can be made, it is
14
16
  # called automatically the first time the id2name method is called. It loads
15
17
  # the gene_ontology.obo file and extracts all the fields, although right now,
16
18
  # only the name field is used.
17
19
  def self.init
18
- init = Persistence.persist_tsv('gene_ontology', :Misc) do
19
- info = {}
20
+ Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
21
+ info.serializer = :marshal if info.respond_to? :serializer and info.serializer == :type
20
22
  Rbbt.share.databases.GO.gene_ontology.read.split(/\[Term\]/).each{|term|
21
23
  term_info = {}
22
24
 
@@ -33,12 +35,13 @@ module GO
33
35
  next if term_info["id"].nil?
34
36
  info[term_info["id"]] = term_info
35
37
  }
38
+
36
39
  info
37
40
  end
38
41
  end
39
42
 
40
43
  def self.info
41
- self.init
44
+ @info ||= self.init
42
45
  end
43
46
 
44
47
  def self.goterms
@@ -0,0 +1,4 @@
1
+ require 'rbbt-util'
2
+ module JoChem
3
+ Rbbt.claim Rbbt.share.databases.JoChem, :rake, Rbbt.share.install.JoChem.Rakefile.find
4
+ end
@@ -1,32 +1,35 @@
1
- require 'rbbt-util'
2
- require 'rbbt/util/resource'
3
-
1
+ require 'rbbt'
2
+ require 'rbbt/resource'
3
+ require 'rbbt/resource/with_key'
4
4
 
5
5
  module Organism
6
6
  extend Resource
7
- relative_to Rbbt, "share/organisms"
7
+ self.pkgdir = "rbbt"
8
+ self.subdir = "share/organisms"
8
9
 
9
- class OrganismNotProcessedError < StandardError; end
10
+ ["Hsa", "Rno", "Sce"].each do |organism|
11
+ claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
10
12
 
11
- def self.datadir(org)
12
- File.join(Rbbt.datadir, 'organisms', org)
13
- end
13
+ module_eval "#{ organism } = with_key '#{organism}'"
14
+ end
15
+
16
+ class OrganismNotProcessedError < StandardError; end
14
17
 
15
18
  def self.attach_translations(org, tsv, target = nil, fields = nil, options = {})
16
19
  Log.high "Attaching Translations for #{ org.inspect }, target #{target.inspect}, fields #{fields.inspect}"
17
- options = Misc.add_defaults options, :persistence => true, :case_insensitive => false
20
+ options = Misc.add_defaults options, :persist => true, :case_insensitive => false
18
21
 
19
- options.merge! :key => target unless target.nil?
22
+ options.merge! :key_field => target unless target.nil?
20
23
  options.merge! :fields => fields unless fields.nil?
21
24
 
22
25
  index = identifiers(org).tsv options
23
26
 
24
- tsv.attach index, [:key]
27
+ tsv.attach index, :fields => [:key], :persist_input => true
25
28
  end
26
29
 
27
30
  def self.normalize(org, list, target = nil, fields = nil, options = {})
28
31
  return [] if list.nil? or list.empty?
29
- options = Misc.add_defaults options, :persistence => true, :case_insensitive => true, :double => false
32
+ options = Misc.add_defaults options, :persist => true, :case_insensitive => true, :double => false
30
33
  double = Misc.process_options options, :double
31
34
 
32
35
 
@@ -50,14 +53,20 @@ module Organism
50
53
  end
51
54
  end
52
55
 
53
- def self.guess_id(org, values)
54
- identifiers = TSV.new(Organism.identifiers(org), :persistence => true)
56
+ def self.guess_id(org, values, identifiers = nil)
57
+ identifiers ||= TSV.setup(Organism.identifiers(org), :persist => true)
55
58
  field_matches = identifiers.field_matches(values)
56
59
  field_matches.sort_by{|field, matches| matches.uniq.length}.last
57
60
  end
58
61
 
62
+ def self.guess_id(org, values)
63
+ field_matches = TSV.field_match_counts(Organism.identifiers(org).find, values)
64
+ field_matches.sort_by{|field, count| count.to_i}.last
65
+ end
66
+
67
+
59
68
  def self.organisms
60
- Dir.glob(File.join(Rbbt.share.organisms.find, '*')).collect{|f| File.basename(f)}
69
+ Dir.glob(File.join(Organism.root.find, '*')).collect{|f| File.basename(f)}
61
70
  end
62
71
 
63
72
  def self.name(organism)
@@ -70,14 +79,4 @@ module Organism
70
79
  }.first
71
80
  end
72
81
 
73
- ["Hsa", "Rno", "Sce"].each do |organism|
74
- rakefile = Rbbt["share/install/Organism/#{ organism }/Rakefile"]
75
- rakefile.lib_dir = Resource.caller_lib_dir __FILE__
76
- rakefile.pkgdir = 'phgx'
77
- Organism[organism].define_as_rake rakefile
78
- module_eval "#{ organism } = with_key '#{organism}'"
79
- end
80
-
81
82
  end
82
-
83
-
@@ -11,7 +11,12 @@ module Organism
11
11
  exon_transcripts ||= Organism.transcript_exons(org).tsv(:double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
12
12
  transcript_info ||= Organism.transcripts.tsv(org).tsv(:list, :persistence => true )
13
13
 
14
- transcripts = exon_transcripts[exon].first
14
+ transcripts = begin
15
+ exon_transcripts[exon].first
16
+ rescue
17
+ []
18
+ end
19
+
15
20
  transcripts.select{|transcript| transcript_info[transcript]["Ensembl Protein ID"].any?}
16
21
  end
17
22
 
@@ -156,6 +161,8 @@ module Organism
156
161
  transcript_offsets = {}
157
162
  exons.each do |exon|
158
163
  transcript_offsets[exon] ||= {}
164
+ offsets = nil
165
+ next unless exon_offsets.include? exon
159
166
  offsets = exon_offsets[exon].zip_fields
160
167
 
161
168
  offsets.collect do |transcript, offset|
@@ -173,7 +180,7 @@ module Organism
173
180
  exon_end ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
174
181
  exon_strand ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
175
182
 
176
- exons = exons_at_genomic_positions(org, positions)
183
+ exons = exons_at_genomic_positions(org, positions)
177
184
  offsets = Organism.exon_transcript_offsets(org, exons.flatten.uniq, exon_offsets, exon_info)
178
185
 
179
186
  position_exons = {}
@@ -203,8 +210,135 @@ module Organism
203
210
  position_offsets
204
211
  end
205
212
 
213
+ def self.exon_junctures_at_chromosome_positions(org, chromosome, positions)
214
+ chromosome = chromosome.to_s
215
+ chromosome_start = Persistence.persist(Organism.exons(org), "Exon_start[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
216
+ tsv = file.tsv(:persistence => true, :type => :list)
217
+ tsv.select("Chromosome Name" => chromosome).collect do |exon, values|
218
+ [exon, values["Exon Chr Start"].to_i]
219
+ end
220
+ end
221
+ chromosome_end = Persistence.persist(Organism.exons(org), "Exon_end[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
222
+ tsv = file.tsv(:persistence => true, :type => :list)
223
+ tsv.select("Chromosome Name" => chromosome).collect do |exon, values|
224
+ [exon, values["Exon Chr End"].to_i]
225
+ end
226
+ end
227
+
228
+ if Array === positions
229
+ positions.collect{|position|
230
+ position = position.to_i
231
+ chromosome_start[(position - 2)..(position + 2)] + chromosome_end[(position - 2)..(position + 2)];
232
+ }
233
+ else
234
+ position = positions.to_i
235
+ chromosome_start[(position - 2)..(position + 2)] + chromosome_end[(position - 2)..(position + 2)];
236
+ end
237
+
238
+ end
239
+
240
+ def self.exon_junctures_at_genomic_positions(org, positions)
241
+ positions = [positions] unless Array === positions.first
242
+
243
+ exons = []
244
+ chromosomes = {}
245
+ indices = {}
246
+ positions.each_with_index do |info,i|
247
+ chr, pos = info
248
+ chromosomes[chr] ||= []
249
+ indices[chr] ||= []
250
+ chromosomes[chr] << pos
251
+ indices[chr] << i
252
+ end
253
+
254
+ chromosomes.each do |chr, pos_list|
255
+ chr_exons = exon_junctures_at_chromosome_positions(org, chr, pos_list)
256
+ chr_exons.zip(indices[chr]).each do |exon, index| exons[index] = exon end
257
+ end
258
+
259
+ exons
260
+ end
261
+
262
+ def self.identify_variations_at_chromosome_positions(org, chromosome, positions, variations)
263
+ chromosome = chromosome.to_s
264
+
265
+ chromosome_bed = Persistence.persist(variations, "Variation_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
266
+ rows = []
267
+ chromosome = options[:chromosome]
268
+ f = CMD.cmd("grep '[[:space:]]#{chromosome}[[:space:]]' #{ file }", :pipe => true)
269
+ while not f.eof?
270
+ line = f.gets.chomp
271
+ id, chr, pos = line.split "\t"
272
+ rows << [id, pos.to_i]
273
+ end
274
+
275
+ rows
276
+ end
277
+
278
+ if Array === positions
279
+ positions.collect{|position|
280
+ chromosome_bed[position];
281
+ }
282
+ else
283
+ chromosome_bed[positions];
284
+ end
285
+ end
286
+
287
+
288
+ def self.identify_variations_at_genomic_positions(org, positions, variations_file)
289
+ positions = [positions] unless Array === positions.first
290
+
291
+ variations = []
292
+ chromosomes = {}
293
+ indices = {}
294
+ positions.each_with_index do |info,i|
295
+ chr, pos = info
296
+ chromosomes[chr] ||= []
297
+ indices[chr] ||= []
298
+ chromosomes[chr] << pos
299
+ indices[chr] << i
300
+ end
301
+
302
+ chromosomes.each do |chr, pos_list|
303
+ chr_variations = identify_variations_at_chromosome_positions(org, chr, pos_list, variations_file)
304
+ chr_variations.zip(indices[chr]).each do |variation, index| variations[index] = variation end
305
+ end
306
+
307
+ variations
308
+ end
309
+
310
+ task_option :organism, "Organism", :string, "Hsa"
311
+ task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
312
+ task_dependencies nil
313
+ task :genomic_mutations_in_exon_junctions => :tsv do |org,genomic_mutations|
314
+ genomic_mutations = case
315
+ when TSV === genomic_mutations
316
+ genomic_mutations
317
+ else
318
+ TSV.new StringIO.new(genomic_mutations), :list
319
+ end
320
+ genomic_mutations.key_field ||= "Position"
321
+ genomic_mutations.fields ||= ["Mutation"]
322
+
323
+ positions = genomic_mutations.keys.collect{|l| l.split(":")}
324
+
325
+ step(:resources, "Load Resources")
326
+
327
+ exon_junctures = {}
328
+ genomic_mutations.keys.zip(Organism.exon_junctures_at_genomic_positions(org, positions)).each do |position, exons|
329
+ exon_junctures[position] = exons
330
+ end
331
+
332
+ genomic_mutations.add_field "Exon Junctions" do |position, values|
333
+ exon_junctures[position] * "|"
334
+ end
335
+
336
+ genomic_mutations.to_s :sort, true
337
+ end
338
+
339
+
206
340
  task_option :organism, "Organism", :string, "Hsa"
207
- task_option :genomic_mutations, "Position (chr:position), Allele", :tsv
341
+ task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
208
342
  task_dependencies nil
209
343
  task :genomic_mutations_to_genes => :tsv do |org,genomic_mutations|
210
344
  genomic_mutations = case
@@ -213,8 +347,8 @@ module Organism
213
347
  else
214
348
  TSV.new StringIO.new(genomic_mutations), :list
215
349
  end
216
- genomic_mutations.key_field = "Position"
217
- genomic_mutations.fields = ["Mutation"]
350
+ genomic_mutations.key_field ||= "Position"
351
+ genomic_mutations.fields ||= ["Mutation"]
218
352
 
219
353
  positions = genomic_mutations.keys.collect{|l| l.split(":")}
220
354
 
@@ -234,7 +368,7 @@ Translates a collection of mutations in genomic coordinates into mutations in am
234
368
  protein products of transcripts including those positions.
235
369
  EOF
236
370
  task_option :organism, "Organism", :string, "Hsa"
237
- task_option :genomic_mutations, "Position (chr:position), Allele", :tsv
371
+ task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
238
372
  task_dependencies nil
239
373
  task :genomic_mutations_to_protein_mutations => :tsv do |org,genomic_mutations|
240
374
  genomic_mutations = case
@@ -244,8 +378,8 @@ protein products of transcripts including those positions.
244
378
  TSV.new StringIO.new(genomic_mutations), :list
245
379
  end
246
380
 
247
- genomic_mutations.key_field = "Position"
248
- genomic_mutations.fields = ["Mutation"]
381
+ genomic_mutations.key_field ||= "Position"
382
+ genomic_mutations.fields ||= ["Mutation"]
249
383
 
250
384
  positions = genomic_mutations.keys.collect{|l| l.split(":")}
251
385
 
@@ -256,7 +390,6 @@ protein products of transcripts including those positions.
256
390
  results.type = :double
257
391
  results.filename = path
258
392
 
259
-
260
393
  step(:resources, "Load Resources")
261
394
  transcript_sequence = Organism.transcript_sequence(org).tsv(:single, :persistence => true)
262
395
  transcript_5utr = Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
@@ -264,26 +397,31 @@ protein products of transcripts including those positions.
264
397
  exon_start = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
265
398
  exon_end = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
266
399
  exon_strand = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
400
+ transcript_to_protein = Organism.transcripts(org).tsv(:single, :fields => "Ensembl Protein ID", :persistence => true)
267
401
 
268
402
  step(:offsets, "Find transcripts and offsets for mutations")
269
403
  offsets = Organism.genomic_position_transcript_offsets(org, positions, exon_offsets, exon_start, exon_end, exon_strand)
270
404
 
271
405
  step(:aminoacid, "Translate mutation to amino acid substitutions")
272
406
  offsets.each do |position, transcripts|
273
- alleles = genomic_mutations[position * ":"].collect{|allele| Misc.IUPAC_to_base(allele)}.flatten
407
+ if genomic_mutations.type === :double
408
+ alleles = genomic_mutations[position * ":"]["Mutation"].collect{|mutation| Misc.IUPAC_to_base(mutation)}.compact.flatten
409
+ else
410
+ alleles = Misc.IUPAC_to_base(genomic_mutations[position * ":"]["Mutation"]) || []
411
+ end
274
412
 
275
413
  transcripts.each do |transcript, offset_info|
276
414
  offset, strand = offset_info
277
- begin
278
- codon = Organism.codon_at_transcript_position(org, transcript, offset, transcript_sequence, transcript_5utr)
279
- rescue
280
- Log.medium $!.message
281
- next
282
- end
283
-
284
- if not codon.nil?
415
+ codon = begin
416
+ Organism.codon_at_transcript_position(org, transcript, offset, transcript_sequence, transcript_5utr)
417
+ rescue
418
+ Log.medium $!.message
419
+ next
420
+ end
421
+
422
+ if not codon.nil? and not codon.empty?
285
423
  alleles.each do |allele|
286
- allele = Misc::BASE2COMPLEMENT[allele] if strand == -1
424
+ allele = Misc::BASE2COMPLEMENT[allele] if strand == "-1"
287
425
  change = Organism.codon_change(allele, *codon.values_at(0,1))
288
426
  pos_code = position * ":"
289
427
  mutation = [change.first, codon.last + 1, change.last] * ""
@@ -298,8 +436,93 @@ protein products of transcripts including those positions.
298
436
 
299
437
  end
300
438
 
439
+ step(:identify_proteins, "Identify Proteins for Transcripts")
440
+ transcript_field = results.identify_field "Ensembl Transcript ID"
441
+ results.add_field "#{org.sub(/\/.*/,'')}:Ensembl Protein ID" do |key,values|
442
+ values[transcript_field].collect do |transcript| transcript_to_protein[transcript] end
443
+ end
444
+
445
+
301
446
  results
302
447
  end
448
+
449
+
450
+ task_option :organism, "Organism", :string, "Hsa"
451
+ task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
452
+ task_dependencies nil
453
+ task :identify_germline_variations => :tsv do |org,genomic_mutations|
454
+ genomic_mutations = case
455
+ when TSV === genomic_mutations
456
+ genomic_mutations
457
+ else
458
+ TSV.new StringIO.new(genomic_mutations), :list
459
+ end
460
+
461
+ genomic_mutations.key_field ||= "Position"
462
+ genomic_mutations.fields ||= ["Mutation"]
463
+
464
+ positions = genomic_mutations.keys.collect{|l| l.split(":")}
465
+
466
+
467
+ step(:prepare, "Prepare Results")
468
+ results = TSV.new({})
469
+ results.key_field = "Position"
470
+ results.fields = ["SNP Id"]
471
+ results.type = :double
472
+ results.filename = path
473
+
474
+
475
+ step(:resources, "Load Resources")
476
+
477
+ snp_ids = Organism.identify_variations_at_genomic_positions(org, positions, Organism.germline_variations(org).produce).collect{|ids| ids * "|"}
478
+ snps_for_positions = Hash[*genomic_mutations.keys.zip(snp_ids).flatten]
479
+
480
+ genomic_mutations.add_field "Germline SNP Id" do |position, values|
481
+ snps_for_positions[position]
482
+ end
483
+
484
+ genomic_mutations
485
+ end
486
+
487
+
488
+ task_option :organism, "Organism", :string, "Hsa"
489
+ task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
490
+ task_dependencies nil
491
+ task :identify_somatic_variations => :tsv do |org,genomic_mutations|
492
+ genomic_mutations = case
493
+ when TSV === genomic_mutations
494
+ genomic_mutations
495
+ else
496
+ TSV.new StringIO.new(genomic_mutations), :list
497
+ end
498
+
499
+ genomic_mutations.key_field ||= "Position"
500
+ genomic_mutations.fields ||= ["Mutation"]
501
+
502
+ positions = genomic_mutations.keys.collect{|l| l.split(":")}
503
+
504
+
505
+ step(:prepare, "Prepare Results")
506
+ results = TSV.new({})
507
+ results.key_field = "Position"
508
+ results.fields = ["SNP Id"]
509
+ results.type = :double
510
+ results.filename = path
511
+
512
+
513
+ step(:resources, "Load Resources")
514
+
515
+ snp_ids = Organism.identify_variations_at_genomic_positions(org, positions, Organism.somatic_variations(org).produce).collect{|ids| ids * "|"}
516
+ snps_for_positions = Hash[*genomic_mutations.keys.zip(snp_ids).flatten]
517
+
518
+ genomic_mutations.add_field "Germline SNP Id" do |position, values|
519
+ snps_for_positions[position]
520
+ end
521
+
522
+ genomic_mutations
523
+ end
524
+
525
+
303
526
  end
304
527
 
305
528
  if __FILE__ == $0
@@ -333,6 +556,17 @@ X 10085674 C T
333
556
  21 19638426 G T
334
557
  EOF
335
558
 
559
+ exon_juncture_test = <<-EOF
560
+ #Position Mutation
561
+ 7:150753996 T
562
+ EOF
563
+
564
+
565
+ job = Organism.job :genomic_mutations_in_exon_junctions, "Test1", TSV.new(StringIO.new(exon_juncture_test), :list, :sep => " "), :organism => "Hsa" # task is declared as ..._exon_junctions
566
+ job.run
567
+ job.clean if job.error?
568
+ puts job.messages
569
+ puts job.read
336
570
 
337
571
  # # Build 36
338
572
  # picmi_test = <<-EOF