rbbt-sources 0.2.2 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,8 +2,7 @@ require 'rbbt-util'
2
2
 
3
3
  module COSTART
4
4
 
5
- Rbbt.claim "COSTART",
6
- Proc.new do
5
+ Rbbt.share.databases.COSTART.COSTART.define_as_proc do
7
6
  terms = ["#COSTART Terms"]
8
7
  Open.open('http://hedwig.mgh.harvard.edu/biostatistics/files/costart.html').lines.each do |line|
9
8
  puts line
@@ -12,5 +11,5 @@ module COSTART
12
11
  end
13
12
 
14
13
  terms * "\n"
15
- end, 'COSTART'
14
+ end
16
15
  end
@@ -2,5 +2,5 @@ require 'rbbt-util'
2
2
  require 'rbbt/util/excel2tsv'
3
3
 
4
4
  module CTCAE
5
- Rbb.claim "CTCAE", TSV.excel2tsv('http://evs.nci.nih.gov/ftp1/CTCAE/CTCAE_4.03_2010-06-14.xls'), 'CTCAE'
5
+ Rbbt.share.CTCAE.CTCAE.define_as_url TSV.excel2tsv('http://evs.nci.nih.gov/ftp1/CTCAE/CTCAE_4.03_2010-06-14.xls')
6
6
  end
@@ -1,5 +1,6 @@
1
- require 'rbbt-util'
1
+ require 'rbbt/util/tsv'
2
2
  require 'rbbt/util/log'
3
+ require 'cgi'
3
4
 
4
5
  # This module interacts with BioMart. It performs queries to BioMart and
5
6
  # synthesises a hash with the results. Note that this module connects to the
@@ -27,21 +28,23 @@ module BioMart
27
28
  EOT
28
29
 
29
30
  def self.set_archive(date)
30
- @archive_url = BIOMART_URL.sub(/www\.biomar\./, date + '.archive.ensemble')
31
+ @archive_url = BIOMART_URL.sub(/http:\/\/biomart\./, 'http://' + date + '.archive.ensembl.')
32
+ Log.debug "Using Archive URL #{ @archive_url }"
31
33
  end
32
34
 
33
35
  def self.unset_archive
36
+ Log.debug "Restoring current version URL #{BIOMART_URL}"
34
37
  @archive_url = nil
35
38
  end
36
39
 
37
40
  def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
41
+ repeats = true
38
42
  attrs ||= []
39
43
  filters ||= ["with_#{main}"]
40
- data ||= {}
41
-
44
+
42
45
  query = @@biomart_query_xml.dup
43
46
  query.sub!(/<!--DATABASE-->/,database)
44
- query.sub!(/<!--FILTERS-->/, filters.collect{|name| "<Filter name = \"#{ name }\" excluded = \"0\"/>"}.join("\n") )
47
+ query.sub!(/<!--FILTERS-->/, filters.collect{|name, v| v.nil? ? "<Filter name = \"#{ name }\" excluded = \"0\"/>" : "<Filter name = \"#{ name }\" value = \"#{Array === v ? v * "," : v}\"/>" }.join("\n") )
45
48
  query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
46
49
  query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
47
50
 
@@ -55,23 +58,18 @@ module BioMart
55
58
  raise BioMart::QueryError, response
56
59
  end
57
60
 
58
- response.each_line{|l|
59
- parts = l.chomp.split(/\t/)
60
- main = parts.shift
61
- next if main.nil? || main.empty?
62
-
63
- data[main] ||= {}
64
- attrs.each{|name|
65
- value = parts.shift
66
- data[main][name] ||= []
67
- next if value.nil? or value.empty?
68
- if data[main][name]
69
- data[main][name] = [value]
70
- else
71
- data[main][name] << value unless data[main][name].include? value
72
- end
73
- }
74
- }
61
+ result_file = TmpFile.tmp_file
62
+ Open.write(result_file, response)
63
+
64
+ if data.nil?
65
+ data = result_file
66
+ else
67
+ new_datafile = TmpFile.tmp_file
68
+ TSV.paste_merge data, result_file, new_datafile
69
+ FileUtils.rm data
70
+ data = new_datafile
71
+ FileUtils.rm result_file
72
+ end
75
73
 
76
74
  data
77
75
  end
@@ -95,8 +93,9 @@ module BioMart
95
93
  def self.query(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
96
94
  open_options = Misc.add_defaults open_options, :nocache => false
97
95
  attrs ||= []
98
- data ||= {}
99
-
96
+
97
+ open_options = Misc.add_defaults open_options, :keep_empty => false, :merge => true
98
+
100
99
  Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"
101
100
 
102
101
  max_items = 2
@@ -115,21 +114,22 @@ module BioMart
115
114
 
116
115
  Log.low "Chunks: #{chunks.length}"
117
116
  chunks.each_with_index{|chunk,i|
118
- Log.low "Chunk #{ i }: [#{chunk * ", "}]"
117
+ Log.low "Chunk #{ i + 1 } / #{chunks.length}: [#{chunk * ", "}]"
119
118
  data = get(database, main, chunk, filters, data, open_options)
120
119
  }
121
120
 
122
- data
121
+ result = TSV.new(data, open_options)
122
+ result.key_field = main
123
+ result.fields = attrs
124
+ result.filename = "BioMart: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}"
125
+
126
+ FileUtils.rm data
127
+ result
123
128
  end
124
129
 
125
130
  def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
126
131
  codes = attrs.collect{|attr| attr[1]}
127
- data = query(database, main.last, codes, filters, data, open_options)
128
- tsv = TSV.new({})
129
-
130
- data.each do |key, info|
131
- tsv[key] = info.values_at(*codes)
132
- end
132
+ tsv = query(database, main.last, codes, filters, data, open_options)
133
133
 
134
134
  tsv.key_field = main.first
135
135
  tsv.fields = attrs.collect{|attr| attr.first}
@@ -5,29 +5,29 @@ require 'set'
5
5
 
6
6
  module Entrez
7
7
 
8
- Rbbt.claim "gene_info", 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz', 'databases/entrez'
9
- Rbbt.claim "gene2pubmed", 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz', 'databases/entrez'
8
+ Rbbt.share.databases.entrez.gene_info.define_as_url 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
9
+ Rbbt.share.databases.entrez.gene2pubmed.define_as_url 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
10
10
 
11
11
  def self.entrez2native(taxs, options = {})
12
- options = Misc.add_defaults options, :key => 1, :others => 5, :persistence => true, :merge => true
12
+ options = Misc.add_defaults options, :key => 1, :fields => 5, :persistence => true, :merge => true
13
13
 
14
14
  taxs = [taxs] unless Array === taxs
15
- options.merge! :grep => taxs
16
-
17
- tsv = TSV.new(Rbbt.files.databases.entrez.gene_info, :flat, options)
15
+ options.merge! :grep => taxs.collect{|t| "^#{ t }\t"}
16
+
17
+ tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
18
18
  tsv.key_field = "Entrez Gene ID"
19
19
  tsv.fields = ["Native ID"]
20
20
  tsv
21
21
  end
22
22
 
23
23
  def self.entrez2pubmed(taxs)
24
- options = {:key => 1, :others => 2, :persistence => true, :merge => true}
24
+ options = {:key => 1, :fields => 2, :persistence => true, :merge => true}
25
25
 
26
26
  taxs = [taxs] unless taxs.is_a?(Array)
27
27
  taxs = taxs.collect{|t| t.to_s}
28
- options.merge! :grep => taxs
28
+ options.merge! :grep => taxs.collect{|t| "^#{ t }\t"}
29
29
 
30
- TSV.new(Rbbt.files.databases.entrez.gene2pubmed, :flat, options)
30
+ Rbbt.share.databases.entrez.gene2pubmed.tsv :flat, options
31
31
  end
32
32
 
33
33
  class Gene
@@ -132,7 +132,11 @@ module Entrez
132
132
  when Entrez::Gene === gene
133
133
  gene_text = gene.text
134
134
  when String === gene || Fixnum === gene
135
- gene_text = get_gene(gene).text
135
+ begin
136
+ gene_text = get_gene(gene).text
137
+ rescue CMD::CMDError
138
+ return 0
139
+ end
136
140
  else
137
141
  return 0
138
142
  end
@@ -4,19 +4,20 @@ require 'rbbt-util'
4
4
  # now all it does is provide a translation from id to the actual names.
5
5
  module GO
6
6
 
7
- Rbbt.claim :gene_ontology, 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo', 'databases/GO'
8
- Rbbt.claim :goslim_generic, 'http://www.geneontology.org/GO_slims/goslim_generic.obo', 'databases/GO'
7
+ Rbbt.share.databases.GO.gene_ontology.define_as_url 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo'
8
+ Rbbt.share.databases.GO.gslim_generic.define_as_url 'http://www.geneontology.org/GO_slims/goslim_generic.obo'
9
9
 
10
10
  MULTIPLE_VALUE_FIELDS = %w(is_a)
11
- TSV_GENE_ONTOLOGY = File.join(TSV.cachedir, 'gene_ontology')
11
+ TSV_GENE_ONTOLOGY = File.join(Persistence.cachedir, 'gene_ontology')
12
12
 
13
13
  # This method needs to be called before any translations can be made, it is
14
14
  # called automatically the first time the id2name method is called. It loads
15
15
  # the gene_ontology.obo file and extracts all the fields, although right now,
16
16
  # only the name field is used.
17
17
  def self.init
18
- info = TCHash.new(TSV_GENE_ONTOLOGY, true)
19
- File.open(Rbbt.find_datafile('gene_ontology')).read.split(/\[Term\]/).each{|term|
18
+ init = Persistence.persist_tsv('gene_ontology', :Misc) do
19
+ info = {}
20
+ Rbbt.share.databases.GO.gene_ontology.read.split(/\[Term\]/).each{|term|
20
21
  term_info = {}
21
22
 
22
23
  term.split(/\n/). select{|l| l =~ /:/}.each{|l|
@@ -32,12 +33,12 @@ module GO
32
33
  next if term_info["id"].nil?
33
34
  info[term_info["id"]] = term_info
34
35
  }
35
- info.close
36
+ info
37
+ end
36
38
  end
37
39
 
38
40
  def self.info
39
- self.init unless File.exists? TSV_GENE_ONTOLOGY
40
- TCHash.get(TSV_GENE_ONTOLOGY)
41
+ self.init
41
42
  end
42
43
 
43
44
  def self.goterms
@@ -1,19 +1,39 @@
1
1
  require 'rbbt-util'
2
- require 'rbbt/util/data_module'
2
+ require 'rbbt/util/resource'
3
3
 
4
4
 
5
5
  module Organism
6
+ extend Resource
7
+ relative_to Rbbt, "share/organisms"
8
+
6
9
  class OrganismNotProcessedError < StandardError; end
7
10
 
8
11
  def self.datadir(org)
9
12
  File.join(Rbbt.datadir, 'organisms', org)
10
13
  end
11
14
 
12
- def self.normalize(org, list, field = nil, others = nil, options = {})
15
+ def self.attach_translations(org, tsv, target = nil, fields = nil, options = {})
16
+ Log.high "Attaching Translations for #{ org.inspect }, target #{target.inspect}, fields #{fields.inspect}"
17
+ options = Misc.add_defaults options, :persistence => true, :case_insensitive => false
18
+
19
+ options.merge! :key => target unless target.nil?
20
+ options.merge! :fields => fields unless fields.nil?
21
+
22
+ index = identifiers(org).tsv options
23
+
24
+ tsv.attach index, [:key]
25
+ end
26
+
27
+ def self.normalize(org, list, target = nil, fields = nil, options = {})
13
28
  return [] if list.nil? or list.empty?
14
29
  options = Misc.add_defaults options, :persistence => true, :case_insensitive => true, :double => false
15
30
  double = Misc.process_options options, :double
16
-
31
+
32
+ options.merge! :target => target unless target.nil?
33
+ options.merge! :fields => fields unless fields.nil?
34
+
35
+ index = identifiers(org).index options
36
+
17
37
  if Array === list
18
38
  if double
19
39
  index.values_at *list
@@ -36,11 +56,11 @@ module Organism
36
56
  end
37
57
 
38
58
  def self.organisms
39
- Dir.glob(File.join(PKGData.sharedir_for_file(__FILE__), 'install/Organism/*/Rakefile')).collect{|f| File.basename(File.dirname(f))}
59
+ Dir.glob(File.join(Rbbt.share.organisms.find, '*')).collect{|f| File.basename(f)}
40
60
  end
41
61
 
42
62
  def self.name(organism)
43
- Open.read(Organism.scientific_name(organism)).strip
63
+ Organism.scientific_name(organism).read.strip
44
64
  end
45
65
 
46
66
  def self.organism(name)
@@ -48,9 +68,15 @@ module Organism
48
68
  organism == name or Organism.name(organism) =~ /#{ name }/i
49
69
  }.first
50
70
  end
51
-
52
- extend DataModule
53
-
54
- Hsa = with_key('Hsa')
55
- Sce = with_key('Sce')
71
+
72
+ ["Hsa", "Sce"].each do |organism|
73
+ rakefile = Rbbt["share/install/Organism/#{ organism }/Rakefile"]
74
+ rakefile.lib_dir = Resource.caller_lib_dir __FILE__
75
+ rakefile.pkgdir = 'phgx'
76
+ Organism[organism].define_as_rake rakefile
77
+ module_eval "#{ organism } = with_key '#{organism}'"
78
+ end
79
+
56
80
  end
81
+
82
+
@@ -0,0 +1,337 @@
1
+ require 'rbbt/sources/organism'
2
+ require 'rbbt/util/workflow'
3
+ require 'bio'
4
+ # Sequence analyses
5
+ module Organism
6
+ extend WorkFlow
7
+
8
# Lists the transcripts that contain +exon+ and have a protein product.
#
# org              - organism code, used only to load the default tables.
# exon             - Ensembl Exon ID to look up.
# exon_transcripts - table mapping an exon ID to a list whose first element
#                    is the array of transcript IDs. Loaded from
#                    Organism.transcript_exons(org) when nil.
# transcript_info  - table mapping a transcript ID to a record that answers
#                    ["Ensembl Protein ID"]. Loaded from
#                    Organism.transcripts(org) when nil.
#
# Returns an Array of transcript IDs; empty when the exon is not in the table.
def self.coding_transcripts_for_exon(org, exon, exon_transcripts = nil, transcript_info = nil)
  exon_transcripts ||= Organism.transcript_exons(org).tsv(:double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true)
  # Same accessor shape as every other resource in this file: resource(org).tsv(...)
  transcript_info ||= Organism.transcripts(org).tsv(:list, :persistence => true)

  entry = exon_transcripts[exon]
  return [] if entry.nil?

  (entry.first || []).select do |transcript|
    info = transcript_info[transcript]
    protein = info && info["Ensembl Protein ID"]
    # empty? is answered by both String and Array values, unlike any?
    !protein.nil? && !protein.empty?
  end
end
15
+
16
# Finds the codon that covers a given offset inside a transcript.
#
# org                 - organism code, used only to load the default tables.
# transcript          - transcript ID.
# offset              - 0-based offset into the transcript sequence.
# transcript_sequence - table: transcript ID => sequence String. Loaded from
#                       Organism.transcript_sequence(org) when nil.
# transcript_5utr     - table: transcript ID => length of the 5' UTR (Integer).
#                       Loaded from Organism.transcript_5utr(org) when nil.
#
# Returns [codon, offset_in_codon, codon_index], where codon_index is 0-based
# and counted from the first base after the 5' UTR. Returns nil when the
# offset falls inside the 5' UTR or beyond the end of the sequence.
# Raises RuntimeError when the UTR length or the sequence is missing.
def self.codon_at_transcript_position(org, transcript, offset, transcript_sequence = nil, transcript_5utr = nil)
  transcript_sequence ||= Organism.transcript_sequence(org).tsv(:single, :persistence => true)
  transcript_5utr ||= Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')

  utr5 = transcript_5utr[transcript]
  raise "UTR5 for transcript #{ transcript } was missing" if utr5.nil?

  return nil if utr5 > offset

  sequence = transcript_sequence[transcript]
  raise "Sequence for transcript #{ transcript } was missing" if sequence.nil?

  # Drop the 5' UTR first so the bounds check is made against the same
  # string the codon is later sliced from.
  coding = sequence[utr5..-1]
  ccds_offset = offset - utr5
  return nil if coding.nil? || ccds_offset >= coding.length

  codon = ccds_offset / 3
  codon_offset = ccds_offset % 3

  [coding[(codon * 3)..((codon + 1) * 3 - 1)], codon_offset, codon]
end
40
+
41
# Translates a codon before and after substituting one base.
#
# allele - replacement base.
# codon  - original codon String; it is not modified.
# offset - index inside the codon of the base to replace.
#
# Returns [original_translation, mutated_translation], both produced by
# Bio::Sequence::NA#translate.
def self.codon_change(allele, codon, offset)
  mutated = codon.dup
  mutated[offset] = allele

  [codon, mutated].collect { |triplet| Bio::Sequence::NA.new(triplet).translate }
end
48
+
49
# Looks up the gene found at one or several positions of a chromosome.
#
# org        - organism code passed to Organism.gene_positions.
# chromosome - chromosome name; converted with to_s before use.
# positions  - a single position or an Array of positions.
#
# The lookup structure comes from Persistence.persist, which is given the
# gene positions resource, a per-chromosome name and a block producing
# [gene, [start, end]] pairs for the genes on that chromosome.
#
# Returns, per queried position, the first element of what the lookup yields
# for it, or nil when the lookup yields nil. The result is an Array when
# +positions+ is an Array and a single value otherwise.
def self.genes_at_chromosome_positions(org, chromosome, positions)
  chromosome = chromosome.to_s
  chromosome_bed = Persistence.persist(Organism.gene_positions(org), "Gene_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => true) do |file, options|
    on_chromosome = file.tsv(:persistence => false, :type => :list).select("Chromosome Name" => chromosome)
    on_chromosome.collect do |gene, values|
      bounds = values.values_at("Gene Start", "Gene End")
      [gene, bounds.collect { |coordinate| coordinate.to_i }]
    end
  end

  first_gene = lambda do |position|
    hits = chromosome_bed[position]
    hits.nil? ? nil : hits.first
  end

  return first_gene.call(positions) unless Array === positions

  positions.collect { |position| first_gene.call(position) }
end
65
+
66
# Looks up genes for genomic positions that may span several chromosomes.
#
# org       - organism code, forwarded to genes_at_chromosome_positions.
# positions - Array of [chromosome, position] pairs; a single pair is
#             accepted and wrapped into a one-element list.
#
# Positions are grouped by chromosome so that each chromosome is queried
# once; every answer is then written back to the slot of the position it
# belongs to.
#
# Returns an Array with one entry per input position, in input order.
def self.genes_at_genomic_positions(org, positions)
  positions = [positions] unless Array === positions.first

  # chromosome => [[position, index in the input], ...]
  by_chromosome = {}
  positions.each_with_index do |(chr, pos), slot|
    (by_chromosome[chr] ||= []) << [pos, slot]
  end

  genes = []
  by_chromosome.each do |chr, entries|
    pos_list = entries.collect { |entry| entry[0] }
    slots = entries.collect { |entry| entry[1] }

    found = genes_at_chromosome_positions(org, chr, pos_list)
    found.each_with_index { |gene, k| genes[slots[k]] = gene }
  end

  genes
end
86
+
87
# Looks up what the exon index holds at one or several positions of a
# chromosome.
#
# org        - organism code passed to Organism.exons.
# chromosome - chromosome name; converted with to_s before use.
# positions  - a single position or an Array of positions.
#
# The lookup structure comes from Persistence.persist, which is given the
# exons resource, a per-chromosome name and a block producing
# [exon, [start, end]] pairs for the exons on that chromosome.
#
# Returns whatever the lookup yields for each position, unmodified: an Array
# of results when +positions+ is an Array, a single result otherwise.
def self.exons_at_chromosome_positions(org, chromosome, positions)
  chromosome = chromosome.to_s
  chromosome_bed = Persistence.persist(Organism.exons(org), "Exon_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => true) do |file, options|
    on_chromosome = file.tsv(:persistence => true, :type => :list).select("Chromosome Name" => chromosome)
    on_chromosome.collect do |exon, values|
      bounds = values.values_at("Exon Chr Start", "Exon Chr End")
      [exon, bounds.collect { |coordinate| coordinate.to_i }]
    end
  end

  return chromosome_bed[positions] unless Array === positions

  positions.collect { |position| chromosome_bed[position] }
end
104
+
105
+
106
# Looks up exons for genomic positions that may span several chromosomes.
#
# org       - organism code, forwarded to exons_at_chromosome_positions.
# positions - Array of [chromosome, position] pairs; a single pair is
#             accepted and wrapped into a one-element list.
#
# Positions are grouped by chromosome so that each chromosome is queried
# once; every answer is then written back to the slot of the position it
# belongs to.
#
# Returns an Array with one entry per input position, in input order.
def self.exons_at_genomic_positions(org, positions)
  positions = [positions] unless Array === positions.first

  # chromosome => [[position, index in the input], ...]
  by_chromosome = {}
  positions.each_with_index do |(chr, pos), slot|
    (by_chromosome[chr] ||= []) << [pos, slot]
  end

  exons = []
  by_chromosome.each do |chr, entries|
    pos_list = entries.collect { |entry| entry[0] }
    slots = entries.collect { |entry| entry[1] }

    found = exons_at_chromosome_positions(org, chr, pos_list)
    found.each_with_index { |exon, k| exons[slots[k]] = exon }
  end

  exons
end
127
+
128
# Computes how many bases precede an exon inside a transcript by adding up
# the sizes of the exons with a lower rank.
#
# org              - organism code, used only to load the default tables.
# exon             - ID of the exon whose offset is wanted.
# transcript       - ID of the transcript containing the exon.
# exons            - table: exon ID => record answering
#                    values_at("Start", "End"). Loaded from
#                    Organism.exons(org) when nil.
# transcript_exons - table: transcript ID => [exon IDs, ranks]. Loaded from
#                    Organism.transcript_exons(org) when nil.
#
# NOTE(review): other methods in this file read the exon bounds as
# "Exon Chr Start" / "Exon Chr End"; confirm "Start" / "End" resolve to the
# same columns.
#
# Returns the offset as an Integer, or nil when the exon is not part of the
# transcript.
def self.exon_offset_in_transcript(org, exon, transcript, exons = nil, transcript_exons = nil)
  exons ||= Organism.exons(org).tsv(:persistence => true)
  transcript_exons ||= Organism.transcript_exons(org).tsv(:double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"], :persistence => true)

  # Slot 0 holds 0 so that the sizes of ranks 1..n can be indexed by rank.
  sizes_by_rank = [0]
  target_rank = nil
  transcript_exons[transcript].zip_fields.each do |exon_id, rank_text|
    position = rank_text.to_i
    first, last = exons[exon_id].values_at("Start", "End")
    sizes_by_rank[position] = last.to_i - first.to_i + 1
    target_rank = position if exon_id == exon
  end

  return nil if target_rank.nil?

  sizes_by_rank[0..target_rank - 1].inject(0) { |total, size| size + total }
end
148
+
149
# Collects, for each exon, the offset of that exon inside every transcript
# it belongs to.
#
# org          - organism code, used only to load the default table.
# exons        - a single exon ID or an Array of exon IDs.
# exon_offsets - table: exon ID => [transcript IDs, offsets]. Loaded from
#                Organism.exon_offsets(org) when nil.
# exon_info    - accepted so existing callers keep working; this method
#                never reads it, so nothing is loaded for it any more.
#
# Returns a Hash: exon ID => { transcript ID => Integer offset }. Every
# requested exon gets a key; an exon without an entry in the offsets table
# maps to an empty Hash.
def self.exon_transcript_offsets(org, exons, exon_offsets = nil, exon_info = nil)
  exon_offsets ||= Organism.exon_offsets(org).tsv(:double, :persistence => true)

  exons = [exons] unless Array === exons
  transcript_offsets = {}
  exons.each do |exon|
    per_transcript = (transcript_offsets[exon] ||= {})

    entry = exon_offsets[exon]
    next if entry.nil?

    entry.zip_fields.each do |transcript, offset|
      # Rows without a transcript carry no usable offset.
      next if transcript.nil? || transcript.empty?
      per_transcript[transcript] = offset.to_i
    end
  end

  transcript_offsets
end
167
+
168
# Maps genomic positions to offsets inside the transcripts that cover them.
#
# org          - organism code, used for the lookups and default tables.
# positions    - Array of [chromosome, position] pairs; a single pair is
#                accepted and wrapped into a one-element list.
# exon_offsets - table: exon ID => [transcript IDs, offsets]. Loaded from
#                Organism.exon_offsets(org) when nil.
# exon_start   - table: exon ID => "Exon Chr Start" as Integer.
# exon_end     - table: exon ID => "Exon Chr End" as Integer.
# exon_strand  - table: exon ID => "Exon Strand" as Integer.
#                The three exon tables are loaded from Organism.exons(org)
#                when nil.
#
# Returns a Hash: position => { transcript ID => [offset, strand] }, where
# offset is the exon's offset in the transcript plus the distance of the
# position from the exon start (strand 1) or from the exon end (any other
# strand). Positions without exons are left out.
def self.genomic_position_transcript_offsets(org, positions, exon_offsets = nil, exon_start = nil, exon_end = nil, exon_strand = nil)
  exon_offsets ||= Organism.exon_offsets(org).tsv(:double, :persistence => true)
  exon_start ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
  exon_end ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
  exon_strand ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)

  # Same normalisation the exon lookup applies, so that positions and lookup
  # results line up when a single pair is given.
  positions = [positions] unless Array === positions.first

  exons = exons_at_genomic_positions(org, positions)
  # Only real exon ids are forwarded; positions without exons may yield nil.
  offsets = exon_transcript_offsets(org, exons.flatten.compact.uniq, exon_offsets)

  position_exons = {}
  positions.zip(exons).each { |position, pos_exons| position_exons[position] = pos_exons }

  position_offsets = {}
  position_exons.each do |position, pos_exons|
    next if pos_exons.nil? or pos_exons.empty?
    _chr, pos = position

    pos_exons.each do |exon|
      next unless offsets.include? exon

      strand = exon_strand[exon]
      offset_in_exon = if strand == 1
                         pos.to_i - exon_start[exon].to_i
                       else
                         exon_end[exon] - pos.to_i
                       end

      position_offsets[position] ||= {}
      offsets[exon].each do |transcript, offset|
        next if offset.nil?
        position_offsets[position][transcript] = [offset + offset_in_exon, strand]
      end
    end
  end

  position_offsets
end
203
+
204
# Workflow task: translates genomic point mutations into protein-level
# changes.
#
# org               - organism code used to load the sequence resources.
# genomic_mutations - TSV keyed by "chr:position" whose values are the
#                     observed alleles (expanded with Misc.IUPAC_to_base).
#
# Produces a :double TSV keyed by "Position" with the fields
# "Ensembl Transcript ID" and "Mutation". Each mutation is written as
# <original translation><codon index + 1><new translation>.
task_option :org, "Organism", :string
task_option :genomic_mutations, "Position (chr:position), Allele", :tsv
task :genomic_mutation_to_protein_mutation => :tsv do |org, genomic_mutations|
  positions = genomic_mutations.keys.collect{|l| l.split(":")}

  step(:prepare, "Prepare Results")
  results = TSV.new({})
  results.key_field = "Position"
  results.fields = ["Ensembl Transcript ID", "Mutation"]
  results.type = :double

  step(:resources, "Load Resources")
  transcript_sequence = Organism.transcript_sequence(org).tsv(:single, :persistence => true)
  transcript_5utr = Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
  exon_offsets = Organism.exon_offsets(org).tsv(:double, :persistence => true)
  exon_start = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
  exon_end = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
  exon_strand = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)

  step(:offsets, "Find transcripts and offsets for mutations")
  offsets = Organism.genomic_position_transcript_offsets(org, positions, exon_offsets, exon_start, exon_end, exon_strand)

  step(:aminoacid, "Translate mutation to amino acid substitutions")
  offsets.each do |position, transcripts|
    # The key under which this position appears in both input and output.
    pos_code = position * ":"
    alleles = genomic_mutations[pos_code].collect{|allele| Misc.IUPAC_to_base(allele)}.flatten

    transcripts.each do |transcript, offset_info|
      offset, strand = offset_info

      begin
        codon = Organism.codon_at_transcript_position(org, transcript, offset, transcript_sequence, transcript_5utr)
      rescue
        # A transcript with missing data is logged and skipped, not fatal.
        Log.medium $!.message
        next
      end

      next if codon.nil?

      alleles.each do |allele|
        # Transcripts on strand -1 are read from the complementary base.
        allele = Misc::BASE2COMPLEMENT[allele] if strand == -1
        change = Organism.codon_change(allele, *codon.values_at(0,1))
        mutation = [change.first, codon.last + 1, change.last] * ""

        if results.include? pos_code
          results[pos_code] = results[pos_code].merge [transcript, mutation]
        else
          results[pos_code] = [[transcript], [mutation]]
        end
      end
    end
  end

  results
end
262
+ end
263
+
264
# Manual smoke test, executed only when this file is run directly: loads a
# mutation table from the user's home directory, runs the
# genomic_mutation_to_protein_mutation job on its "Tumor" column and polls
# until the job is done.
if __FILE__ == $0
  require 'rbbt/util/log'
  require 'benchmark'

  select = <<-EOF
3:64581875
  EOF
  select = select.split("\n").collect { |line| line.split(":") }

  picmi_test = <<-EOF
#Chromosome Name Position Reference Tumor
1 100382265 C G
1 100380997 A G
22 30163533 A C
X 10094215 G A
X 10085674 C T
20 50071099 G T
21 19638426 G T
2 230633386 C T
2 230312220 C T
1 100624830 T A
4 30723053 G T
  EOF

  # Build 37
  picmi_test = <<-EOF
#Chromosome Name Position Reference Tumor
1 100624830 T A
21 19638426 G T
  EOF


  # # Build 36
  # picmi_test = <<-EOF
  ##Chromosome Name Position Reference Tumor
  #3 81780820 T C
  #2 43881517 A T
  #2 43857514 T C
  #6 88375602 G A
  #16 69875502 G T
  #16 69876078 T C
  #16 69877147 G A
  #17 8101874 C T
  # EOF


  Log.severity = 2
  org = 'Hsa/may2009'
  file = File.join(ENV["HOME"], 'git/rbbt-util/integration_test/data/Metastasis.tsv')

  # The first tab of each line is turned into ':' so keys read "chr:position".
  #positions = TSV.new(StringIO.new(picmi_test), :list, :sep => /\s+/, :fix => Proc.new{|l| l.sub(/\s+/,':')})
  positions = TSV.new(file, :list, :fix => Proc.new { |line| line.sub(/\t/,':') })
  positions.key_field = "Position"
  positions.fields = %w(Reference Control Tumor)
  #positions.fields = %w(Reference Tumor)

  #puts positions.slice(["Reference", "Tumor"]).to_s.split(/\n/).collect{|line| next if line =~ /#/; parts = line.split(/\t|:/); parts[3] = Misc.IUPAC_to_base(parts[3]).first; parts * ","}.compact * "\n"


  #positions = positions.select ["10:98099540"]

  Organism.basedir = Rbbt.tmp.organism.sequence.jobs.find :user
  job = Organism.job :genomic_mutation_to_protein_mutation, "Metastasis", org, positions.slice("Tumor")
  job.run

  # Report the current step every two seconds until the job finishes.
  until job.done?
    puts job.step
    sleep 2
  end

  raise job.messages.last if job.error?
  mutations = job.load

end