rbbt-sources 0.2.2 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,8 +2,7 @@ require 'rbbt-util'
2
2
 
3
3
  module COSTART
4
4
 
5
- Rbbt.claim "COSTART",
6
- Proc.new do
5
+ Rbbt.share.databases.COSTART.COSTART.define_as_proc do
7
6
  terms = ["#COSTART Terms"]
8
7
  Open.open('http://hedwig.mgh.harvard.edu/biostatistics/files/costart.html').lines.each do |line|
9
8
  puts line
@@ -12,5 +11,5 @@ module COSTART
12
11
  end
13
12
 
14
13
  terms * "\n"
15
- end, 'COSTART'
14
+ end
16
15
  end
@@ -2,5 +2,5 @@ require 'rbbt-util'
2
2
  require 'rbbt/util/excel2tsv'
3
3
 
4
4
  module CTCAE
5
- Rbb.claim "CTCAE", TSV.excel2tsv('http://evs.nci.nih.gov/ftp1/CTCAE/CTCAE_4.03_2010-06-14.xls'), 'CTCAE'
5
+ Rbbt.share.CTCAE.CTCAE.define_as_url TSV.excel2tsv('http://evs.nci.nih.gov/ftp1/CTCAE/CTCAE_4.03_2010-06-14.xls')
6
6
  end
@@ -1,5 +1,6 @@
1
- require 'rbbt-util'
1
+ require 'rbbt/util/tsv'
2
2
  require 'rbbt/util/log'
3
+ require 'cgi'
3
4
 
4
5
  # This module interacts with BioMart. It performs queries to BioMart and
5
6
  # synthesises a hash with the results. Note that this module connects to the
@@ -27,21 +28,23 @@ module BioMart
27
28
  EOT
28
29
 
29
30
  def self.set_archive(date)
30
- @archive_url = BIOMART_URL.sub(/www\.biomar\./, date + '.archive.ensemble')
31
+ @archive_url = BIOMART_URL.sub(/http:\/\/biomart\./, 'http://' + date + '.archive.ensembl.')
32
+ Log.debug "Using Archive URL #{ @archive_url }"
31
33
  end
32
34
 
33
35
  def self.unset_archive
36
+ Log.debug "Restoring current version URL #{BIOMART_URL}"
34
37
  @archive_url = nil
35
38
  end
36
39
 
37
40
  def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
41
+ repeats = true
38
42
  attrs ||= []
39
43
  filters ||= ["with_#{main}"]
40
- data ||= {}
41
-
44
+
42
45
  query = @@biomart_query_xml.dup
43
46
  query.sub!(/<!--DATABASE-->/,database)
44
- query.sub!(/<!--FILTERS-->/, filters.collect{|name| "<Filter name = \"#{ name }\" excluded = \"0\"/>"}.join("\n") )
47
+ query.sub!(/<!--FILTERS-->/, filters.collect{|name, v| v.nil? ? "<Filter name = \"#{ name }\" excluded = \"0\"/>" : "<Filter name = \"#{ name }\" value = \"#{Array === v ? v * "," : v}\"/>" }.join("\n") )
45
48
  query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
46
49
  query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
47
50
 
@@ -55,23 +58,18 @@ module BioMart
55
58
  raise BioMart::QueryError, response
56
59
  end
57
60
 
58
- response.each_line{|l|
59
- parts = l.chomp.split(/\t/)
60
- main = parts.shift
61
- next if main.nil? || main.empty?
62
-
63
- data[main] ||= {}
64
- attrs.each{|name|
65
- value = parts.shift
66
- data[main][name] ||= []
67
- next if value.nil? or value.empty?
68
- if data[main][name]
69
- data[main][name] = [value]
70
- else
71
- data[main][name] << value unless data[main][name].include? value
72
- end
73
- }
74
- }
61
+ result_file = TmpFile.tmp_file
62
+ Open.write(result_file, response)
63
+
64
+ if data.nil?
65
+ data = result_file
66
+ else
67
+ new_datafile = TmpFile.tmp_file
68
+ TSV.paste_merge data, result_file, new_datafile
69
+ FileUtils.rm data
70
+ data = new_datafile
71
+ FileUtils.rm result_file
72
+ end
75
73
 
76
74
  data
77
75
  end
@@ -95,8 +93,9 @@ module BioMart
95
93
  def self.query(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
96
94
  open_options = Misc.add_defaults open_options, :nocache => false
97
95
  attrs ||= []
98
- data ||= {}
99
-
96
+
97
+ open_options = Misc.add_defaults open_options, :keep_empty => false, :merge => true
98
+
100
99
  Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"
101
100
 
102
101
  max_items = 2
@@ -115,21 +114,22 @@ module BioMart
115
114
 
116
115
  Log.low "Chunks: #{chunks.length}"
117
116
  chunks.each_with_index{|chunk,i|
118
- Log.low "Chunk #{ i }: [#{chunk * ", "}]"
117
+ Log.low "Chunk #{ i + 1 } / #{chunks.length}: [#{chunk * ", "}]"
119
118
  data = get(database, main, chunk, filters, data, open_options)
120
119
  }
121
120
 
122
- data
121
+ result = TSV.new(data, open_options)
122
+ result.key_field = main
123
+ result.fields = attrs
124
+ result.filename = "BioMart: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}"
125
+
126
+ FileUtils.rm data
127
+ result
123
128
  end
124
129
 
125
130
  def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
126
131
  codes = attrs.collect{|attr| attr[1]}
127
- data = query(database, main.last, codes, filters, data, open_options)
128
- tsv = TSV.new({})
129
-
130
- data.each do |key, info|
131
- tsv[key] = info.values_at(*codes)
132
- end
132
+ tsv = query(database, main.last, codes, filters, data, open_options)
133
133
 
134
134
  tsv.key_field = main.first
135
135
  tsv.fields = attrs.collect{|attr| attr.first}
@@ -5,29 +5,29 @@ require 'set'
5
5
 
6
6
  module Entrez
7
7
 
8
- Rbbt.claim "gene_info", 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz', 'databases/entrez'
9
- Rbbt.claim "gene2pubmed", 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz', 'databases/entrez'
8
+ Rbbt.share.databases.entrez.gene_info.define_as_url 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
9
+ Rbbt.share.databases.entrez.gene2pubmed.define_as_url 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
10
10
 
11
11
  def self.entrez2native(taxs, options = {})
12
- options = Misc.add_defaults options, :key => 1, :others => 5, :persistence => true, :merge => true
12
+ options = Misc.add_defaults options, :key => 1, :fields => 5, :persistence => true, :merge => true
13
13
 
14
14
  taxs = [taxs] unless Array === taxs
15
- options.merge! :grep => taxs
16
-
17
- tsv = TSV.new(Rbbt.files.databases.entrez.gene_info, :flat, options)
15
+ options.merge! :grep => taxs.collect{|t| "^#{ t }\t"}
16
+
17
+ tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
18
18
  tsv.key_field = "Entrez Gene ID"
19
19
  tsv.fields = ["Native ID"]
20
20
  tsv
21
21
  end
22
22
 
23
23
  def self.entrez2pubmed(taxs)
24
- options = {:key => 1, :others => 2, :persistence => true, :merge => true}
24
+ options = {:key => 1, :fields => 2, :persistence => true, :merge => true}
25
25
 
26
26
  taxs = [taxs] unless taxs.is_a?(Array)
27
27
  taxs = taxs.collect{|t| t.to_s}
28
- options.merge! :grep => taxs
28
+ options.merge! :grep => taxs.collect{|t| "^#{ t }\t"}
29
29
 
30
- TSV.new(Rbbt.files.databases.entrez.gene2pubmed, :flat, options)
30
+ Rbbt.share.databases.entrez.gene2pubmed.tsv :flat, options
31
31
  end
32
32
 
33
33
  class Gene
@@ -132,7 +132,11 @@ module Entrez
132
132
  when Entrez::Gene === gene
133
133
  gene_text = gene.text
134
134
  when String === gene || Fixnum === gene
135
- gene_text = get_gene(gene).text
135
+ begin
136
+ gene_text = get_gene(gene).text
137
+ rescue CMD::CMDError
138
+ return 0
139
+ end
136
140
  else
137
141
  return 0
138
142
  end
@@ -4,19 +4,20 @@ require 'rbbt-util'
4
4
  # now all it does is provide a translation from id to the actual names.
5
5
  module GO
6
6
 
7
- Rbbt.claim :gene_ontology, 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo', 'databases/GO'
8
- Rbbt.claim :goslim_generic, 'http://www.geneontology.org/GO_slims/goslim_generic.obo', 'databases/GO'
7
# Declare the URLs backing the two Gene Ontology resources.
Rbbt.share.databases.GO.gene_ontology.define_as_url 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo'
# The resource is named after the file it mirrors (goslim_generic.obo).
Rbbt.share.databases.GO.goslim_generic.define_as_url 'http://www.geneontology.org/GO_slims/goslim_generic.obo'
9
9
 
10
10
  MULTIPLE_VALUE_FIELDS = %w(is_a)
11
- TSV_GENE_ONTOLOGY = File.join(TSV.cachedir, 'gene_ontology')
11
+ TSV_GENE_ONTOLOGY = File.join(Persistence.cachedir, 'gene_ontology')
12
12
 
13
13
  # This method needs to be called before any translations can be made, it is
14
14
  # called automatically the first time the id2name method is called. It loads
15
15
  # the gene_ontology.obo file and extracts all the fields, although right now,
16
16
  # only the name field is used.
17
17
  def self.init
18
- info = TCHash.new(TSV_GENE_ONTOLOGY, true)
19
- File.open(Rbbt.find_datafile('gene_ontology')).read.split(/\[Term\]/).each{|term|
18
+ init = Persistence.persist_tsv('gene_ontology', :Misc) do
19
+ info = {}
20
+ Rbbt.share.databases.GO.gene_ontology.read.split(/\[Term\]/).each{|term|
20
21
  term_info = {}
21
22
 
22
23
  term.split(/\n/). select{|l| l =~ /:/}.each{|l|
@@ -32,12 +33,12 @@ module GO
32
33
  next if term_info["id"].nil?
33
34
  info[term_info["id"]] = term_info
34
35
  }
35
- info.close
36
+ info
37
+ end
36
38
  end
37
39
 
38
40
  def self.info
39
- self.init unless File.exists? TSV_GENE_ONTOLOGY
40
- TCHash.get(TSV_GENE_ONTOLOGY)
41
+ self.init
41
42
  end
42
43
 
43
44
  def self.goterms
@@ -1,19 +1,39 @@
1
1
  require 'rbbt-util'
2
- require 'rbbt/util/data_module'
2
+ require 'rbbt/util/resource'
3
3
 
4
4
 
5
5
  module Organism
6
+ extend Resource
7
+ relative_to Rbbt, "share/organisms"
8
+
6
9
  class OrganismNotProcessedError < StandardError; end
7
10
 
8
11
  def self.datadir(org)
9
12
  File.join(Rbbt.datadir, 'organisms', org)
10
13
  end
11
14
 
12
- def self.normalize(org, list, field = nil, others = nil, options = {})
15
# Attaches identifier translations for +org+ to +tsv+.
#
# org::     organism whose identifiers resource is used
# tsv::     object receiving the translations through its +attach+ method
# target::  when given, passed as the :key option of the identifiers table
# fields::  when given, passed as the :fields option of the identifiers table
# options:: extra options; :persistence => true and :case_insensitive => false
#           are supplied as defaults through Misc.add_defaults
#
# Returns whatever +tsv.attach+ returns.
def self.attach_translations(org, tsv, target = nil, fields = nil, options = {})
  Log.high "Attaching Translations for #{ org.inspect }, target #{target.inspect}, fields #{fields.inspect}"
  options = Misc.add_defaults options, :persistence => true, :case_insensitive => false

  # Only the arguments that were actually supplied override the options
  overrides = {}
  overrides[:key] = target unless target.nil?
  overrides[:fields] = fields unless fields.nil?
  options.merge! overrides

  translations = identifiers(org).tsv(options)
  tsv.attach(translations, [:key])
end
26
+
27
+ def self.normalize(org, list, target = nil, fields = nil, options = {})
13
28
  return [] if list.nil? or list.empty?
14
29
  options = Misc.add_defaults options, :persistence => true, :case_insensitive => true, :double => false
15
30
  double = Misc.process_options options, :double
16
-
31
+
32
+ options.merge! :target => target unless target.nil?
33
+ options.merge! :fields => fields unless fields.nil?
34
+
35
+ index = identifiers(org).index options
36
+
17
37
  if Array === list
18
38
  if double
19
39
  index.values_at *list
@@ -36,11 +56,11 @@ module Organism
36
56
  end
37
57
 
38
58
  def self.organisms
39
- Dir.glob(File.join(PKGData.sharedir_for_file(__FILE__), 'install/Organism/*/Rakefile')).collect{|f| File.basename(File.dirname(f))}
59
+ Dir.glob(File.join(Rbbt.share.organisms.find, '*')).collect{|f| File.basename(f)}
40
60
  end
41
61
 
42
62
  def self.name(organism)
43
- Open.read(Organism.scientific_name(organism)).strip
63
+ Organism.scientific_name(organism).read.strip
44
64
  end
45
65
 
46
66
  def self.organism(name)
@@ -48,9 +68,15 @@ module Organism
48
68
  organism == name or Organism.name(organism) =~ /#{ name }/i
49
69
  }.first
50
70
  end
51
-
52
- extend DataModule
53
-
54
- Hsa = with_key('Hsa')
55
- Sce = with_key('Sce')
71
+
72
# For each bundled organism: point its resource at the organism's install
# Rakefile and expose it as a constant of this module (Hsa, Sce).
%w(Hsa Sce).each do |organism|
  rakefile = Rbbt["share/install/Organism/#{ organism }/Rakefile"]
  rakefile.lib_dir = Resource.caller_lib_dir __FILE__
  rakefile.pkgdir = 'phgx'
  Organism[organism].define_as_rake rakefile
  # const_set defines the same constant the original built by evaluating a
  # string, without going through eval.
  const_set organism, with_key(organism)
end
79
+
56
80
  end
81
+
82
+
@@ -0,0 +1,337 @@
1
+ require 'rbbt/sources/organism'
2
+ require 'rbbt/util/workflow'
3
+ require 'bio'
4
+ # Sequence analyses
5
+ module Organism
6
+ extend WorkFlow
7
+
8
# Lists the transcripts containing +exon+ that have a protein assigned.
#
# org::              organism, used only to load the tables that are not given
# exon::             Ensembl Exon ID
# exon_transcripts:: table from Ensembl Exon ID to a record whose first
#                    element is the list of Ensembl Transcript IDs; loaded
#                    from Organism.transcript_exons when nil
# transcript_info::  table from transcript ID to a record with an
#                    "Ensembl Protein ID" field; loaded from
#                    Organism.transcripts when nil
#
# Returns the transcripts whose "Ensembl Protein ID" is not empty; an empty
# Array when the exon is not in +exon_transcripts+.
def self.coding_transcripts_for_exon(org, exon, exon_transcripts, transcript_info)
  exon_transcripts ||= Organism.transcript_exons(org).tsv(:double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
  # Same loader shape as every other table in this module:
  # Organism.<resource>(org).tsv(options)
  transcript_info ||= Organism.transcripts(org).tsv(:list, :persistence => true )

  entry = exon_transcripts[exon]
  return [] if entry.nil?

  (entry.first || []).select do |transcript|
    info = transcript_info[transcript]
    protein = info.nil? ? nil : info["Ensembl Protein ID"]

    if protein.nil?
      false
    elsif protein.respond_to?(:any?)
      # list-valued field
      protein.any?
    else
      # single-valued field (a String is not Enumerable on Ruby >= 1.9)
      not protein.to_s.empty?
    end
  end
end
15
+
16
# Locates the codon that covers a position of a transcript.
#
# org::                 organism, used only to load the tables that are not given
# transcript::          transcript identifier
# offset::              0-based offset into the transcript sequence
# transcript_sequence:: table from transcript to its sequence; loaded from
#                       Organism.transcript_sequence when nil
# transcript_5utr::     table from transcript to the length of its 5' UTR;
#                       loaded from Organism.transcript_5utr when nil
#
# Returns [codon, offset inside the codon, 0-based codon index], or nil when
# the offset falls in the 5' UTR or beyond the end of the sequence.
#
# Raises RuntimeError when the UTR length or the sequence is missing.
def self.codon_at_transcript_position(org, transcript, offset, transcript_sequence = nil, transcript_5utr = nil)
  transcript_sequence ||= Organism.transcript_sequence(org).tsv(:single, :persistence => true)
  transcript_5utr ||= Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')

  utr5 = transcript_5utr[transcript]
  raise "UTR5 for transcript #{ transcript } was missing" if utr5.nil?

  return nil if utr5 > offset

  sequence = transcript_sequence[transcript]
  raise "Sequence for transcript #{ transcript } was missing" if sequence.nil?

  # Drop the 5' UTR first, so the bound below is checked against the part of
  # the sequence that is actually indexed.
  coding = sequence[utr5..-1]
  return nil if coding.nil?

  ccds_offset = offset - utr5
  # >= rather than >: an offset equal to the length is already past the end
  return nil if ccds_offset >= coding.length

  codon = ccds_offset / 3
  codon_offset = ccds_offset % 3

  [coding[(codon * 3)..((codon + 1) * 3 - 1)], codon_offset, codon]
end
40
+
41
# Translates a codon before and after replacing one of its bases.
#
# allele:: replacement base
# codon::  codon sequence; the argument itself is left untouched
# offset:: index inside the codon of the base to replace
#
# Returns [translation of the original codon, translation of the changed
# codon], both obtained with Bio::Sequence::NA#translate.
def self.codon_change(allele, codon, offset)
  mutated = codon.dup
  mutated[offset] = allele

  [codon, mutated].collect{|bases| Bio::Sequence::NA.new(bases).translate}
end
48
+
49
# Looks up the gene found at one or several positions of a chromosome.
#
# org::        organism passed to Organism.gene_positions
# chromosome:: chromosome name; converted with to_s
# positions::  a single position or an Array of positions
#
# The lookup structure comes from Persistence.persist, fed with
# [gene, [start, end]] pairs for the genes of the chromosome. Each position
# is looked up with [] and the first element of the answer is kept (nil when
# the answer is nil). An Array of positions gives an Array of results in the
# same order.
def self.genes_at_chromosome_positions(org, chromosome, positions)
  chromosome = chromosome.to_s

  gene_ranges = Persistence.persist(Organism.gene_positions(org), "Gene_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => true) do |file, options|
    table = file.tsv(:persistence => false, :type => :list)
    table.select("Chromosome Name" => chromosome).collect do |gene, values|
      limits = values.values_at("Gene Start", "Gene End")
      [gene, limits.collect{|limit| limit.to_i}]
    end
  end

  first_gene = lambda do |position|
    hits = gene_ranges[position]
    hits.nil? ? nil : hits.first
  end

  if Array === positions
    positions.collect{|position| first_gene.call(position)}
  else
    first_gene.call(positions)
  end
end
65
+
66
# Looks up the gene at each genomic position.
#
# org::       organism passed on to genes_at_chromosome_positions
# positions:: Array of [chromosome, position] pairs, or a single such pair
#
# Positions are grouped by chromosome so that each chromosome is queried
# once. Returns an Array with one result per input position, in input
# order; an empty Array for nil or empty input.
def self.genes_at_genomic_positions(org, positions)
  # Without this guard an empty list is wrapped into [[]] and a lookup is
  # issued for a nil chromosome.
  return [] if positions.nil? or positions.empty?

  positions = [positions] unless Array === positions.first

  chromosomes = {}
  indices = {}
  positions.each_with_index do |(chr, pos), i|
    (chromosomes[chr] ||= []) << pos
    (indices[chr] ||= []) << i
  end

  genes = []
  chromosomes.each do |chr, pos_list|
    chr_genes = genes_at_chromosome_positions(org, chr, pos_list)
    # Put each result back in the slot of the position it answers
    chr_genes.zip(indices[chr]).each{|gene, index| genes[index] = gene}
  end

  genes
end
86
+
87
# Looks up the exons found at one or several positions of a chromosome.
#
# org::        organism passed to Organism.exons
# chromosome:: chromosome name; converted with to_s
# positions::  a single position or an Array of positions
#
# The lookup structure comes from Persistence.persist, fed with
# [exon, [start, end]] pairs for the exons of the chromosome. Each position
# is looked up with [] and the answer is returned as is. An Array of
# positions gives an Array of answers in the same order.
def self.exons_at_chromosome_positions(org, chromosome, positions)
  chromosome = chromosome.to_s

  exon_ranges = Persistence.persist(Organism.exons(org), "Exon_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => true) do |file, options|
    table = file.tsv(:persistence => true, :type => :list)
    table.select("Chromosome Name" => chromosome).collect do |exon, values|
      limits = values.values_at("Exon Chr Start", "Exon Chr End")
      [exon, limits.collect{|limit| limit.to_i}]
    end
  end

  return exon_ranges[positions] unless Array === positions

  positions.collect{|position| exon_ranges[position]}
end
104
+
105
+
106
# Looks up the exons at each genomic position.
#
# org::       organism passed on to exons_at_chromosome_positions
# positions:: Array of [chromosome, position] pairs, or a single such pair
#
# Positions are grouped by chromosome so that each chromosome is queried
# once. Returns an Array with one result per input position, in input
# order; an empty Array for nil or empty input.
def self.exons_at_genomic_positions(org, positions)
  # Without this guard an empty list is wrapped into [[]] and a lookup is
  # issued for a nil chromosome.
  return [] if positions.nil? or positions.empty?

  positions = [positions] unless Array === positions.first

  chromosomes = {}
  indices = {}
  positions.each_with_index do |(chr, pos), i|
    (chromosomes[chr] ||= []) << pos
    (indices[chr] ||= []) << i
  end

  exons = []
  chromosomes.each do |chr, pos_list|
    chr_exons = exons_at_chromosome_positions(org, chr, pos_list)
    # Put each result back in the slot of the position it answers
    chr_exons.zip(indices[chr]).each{|exon, index| exons[index] = exon}
  end

  exons
end
127
+
128
# Computes where an exon starts inside a transcript, as the summed sizes of
# the exons ranked before it.
#
# org::              organism, used only to load the tables that are not given
# exon::             exon identifier to locate
# transcript::       transcript identifier
# exons::            table from exon to a record read with "Start" and "End";
#                    loaded from Organism.exons when nil
# transcript_exons:: table from transcript to its exon ids and ranks; loaded
#                    from Organism.transcript_exons when nil
#
# Returns the offset, or nil when the exon is not part of the transcript.
#
# NOTE(review): other methods in this file read the Organism.exons table
# through "Exon Chr Start" / "Exon Chr End"; confirm that "Start" / "End"
# are valid field names for the table loaded by default here.
def self.exon_offset_in_transcript(org, exon, transcript, exons = nil, transcript_exons = nil)
  exons ||= Organism.exons(org).tsv(:persistence => true)
  transcript_exons ||= Organism.transcript_exons(org).tsv(:double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"], :persistence => true)

  # sizes[rank] is the length of the exon with that rank; slot 0 is a zero
  # placeholder because ranks are stored at their own index
  sizes = [0]
  rank = nil
  transcript_exons[transcript].zip_fields.each do |exon_id, exon_rank|
    position = exon_rank.to_i
    first, last = exons[exon_id].values_at("Start", "End")
    sizes[position] = last.to_i - first.to_i + 1
    rank = position if exon_id == exon
  end

  return nil if rank.nil?

  sizes[0..rank - 1].inject(0){|total, size| size + total}
end
148
+
149
# Collects, for each exon, its offset inside every transcript listed for it.
#
# org::          organism, used only to load +exon_offsets+ when not given
# exons::        one exon identifier or an Array of them
# exon_offsets:: table from exon to transcripts and offsets; loaded from
#                Organism.exon_offsets when nil
# exon_info::    never read; kept so existing callers that pass it still work
#
# Returns a Hash from exon to a Hash of transcript => Integer offset. Exons
# missing from +exon_offsets+ get an empty Hash.
def self.exon_transcript_offsets(org, exons, exon_offsets = nil, exon_info = nil)
  # exon_info is deliberately not loaded: nothing below reads it.
  exon_offsets ||= Organism.exon_offsets(org).tsv(:double, :persistence => true)

  exons = [exons] unless Array === exons
  transcript_offsets = {}
  exons.each do |exon|
    transcript_offsets[exon] ||= {}

    entry = exon_offsets[exon]
    next if entry.nil?

    entry.zip_fields.each do |transcript, offset|
      next if transcript.nil? or transcript.empty?
      transcript_offsets[exon][transcript] = offset.to_i
    end
  end

  transcript_offsets
end
167
+
168
# Maps genomic positions to offsets inside the transcripts that cover them.
#
# org::          organism, used to load the tables that are not given
# positions::    Array of [chromosome, position] pairs, or a single such pair
# exon_offsets:: table from exon to transcripts and offsets
# exon_start::   table from exon to its "Exon Chr Start"
# exon_end::     table from exon to its "Exon Chr End"
# exon_strand::  table from exon to its "Exon Strand"
#
# Tables left as nil are loaded from Organism.exon_offsets / Organism.exons.
#
# Returns a Hash from position to a Hash of
# transcript => [offset in transcript, strand]. Positions that hit no exon
# are absent from the result.
def self.genomic_position_transcript_offsets(org, positions, exon_offsets = nil, exon_start = nil, exon_end = nil, exon_strand = nil)
  return {} if positions.nil? or positions.empty?
  positions = [positions] unless Array === positions.first

  exon_offsets ||= Organism.exon_offsets(org).tsv(:double, :persistence => true)
  exon_start ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
  exon_end ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
  exon_strand ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)

  exons = exons_at_genomic_positions(org, positions)
  # The original passed an undefined local (exon_info) as a fourth argument,
  # which raised NameError on every call. That parameter is optional.
  offsets = Organism.exon_transcript_offsets(org, exons.flatten.compact.uniq, exon_offsets)

  position_offsets = {}
  positions.zip(exons).each do |position, pos_exons|
    next if pos_exons.nil? or pos_exons.empty?
    chr, pos = position

    pos_exons.each do |exon|
      next unless offsets.include? exon

      # Distance from the exon's first transcribed base: counted from the
      # start on strand 1, from the end otherwise
      offset_in_exon = if exon_strand[exon] == 1
                         pos.to_i - exon_start[exon].to_i
                       else
                         exon_end[exon].to_i - pos.to_i
                       end

      position_offsets[position] ||= {}
      offsets[exon].each do |transcript, offset|
        next if offset.nil?
        position_offsets[position][transcript] = [offset + offset_in_exon, exon_strand[exon]]
      end
    end
  end

  position_offsets
end
203
+
204
# Task: translate genomic mutations into protein mutations.
#
# Options:
# org::               organism
# genomic_mutations:: TSV keyed by "chr:position" whose values are alleles
#
# Produces a :double TSV keyed by "Position" with the fields
# "Ensembl Transcript ID" and "Mutation". A mutation is written as
# <original translation><1-based codon number><new translation>.
task_option :org, "Organism", :string
task_option :genomic_mutations, "Position (chr:position), Allele", :tsv
task :genomic_mutation_to_protein_mutation => :tsv do |org, genomic_mutations|
  positions = genomic_mutations.keys.collect{|l| l.split(":")}

  step(:prepare, "Prepare Results")
  results = TSV.new({})
  results.key_field = "Position"
  results.fields = ["Ensembl Transcript ID", "Mutation"]
  results.type = :double

  step(:resources, "Load Resources")
  transcript_sequence = Organism.transcript_sequence(org).tsv(:single, :persistence => true)
  transcript_5utr = Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
  exon_offsets = Organism.exon_offsets(org).tsv(:double, :persistence => true)
  exon_start = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
  exon_end = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
  exon_strand = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)

  step(:offsets, "Find transcripts and offsets for mutations")
  offsets = Organism.genomic_position_transcript_offsets(org, positions, exon_offsets, exon_start, exon_end, exon_strand)

  step(:aminoacid, "Translate mutation to amino acid substitutions")
  offsets.each do |position, transcripts|
    pos_code = position * ":"
    alleles = genomic_mutations[pos_code].collect{|allele| Misc.IUPAC_to_base(allele)}.flatten

    transcripts.each do |transcript, offset_info|
      offset, strand = offset_info

      begin
        codon = Organism.codon_at_transcript_position(org, transcript, offset, transcript_sequence, transcript_5utr)
      rescue
        # Missing UTR or sequence data: log it and skip this transcript
        Log.medium $!.message
        next
      end

      next if codon.nil?

      alleles.each do |allele|
        # Alleles are complemented for transcripts on the -1 strand
        allele = Misc::BASE2COMPLEMENT[allele] if strand == -1
        change = Organism.codon_change(allele, *codon.values_at(0,1))
        mutation = [change.first, codon.last + 1, change.last] * ""

        if results.include? pos_code
          results[pos_code] = results[pos_code].merge [transcript, mutation]
        else
          results[pos_code] = [[transcript], [mutation]]
        end
      end
    end
  end

  results
end
262
+ end
263
+
264
# Manual check, run only when this file is executed directly: runs the
# genomic_mutation_to_protein_mutation task on the "Tumor" column of a local
# Metastasis.tsv file and polls the job until it is done.
if __FILE__ == $0
  require 'rbbt/util/log'
  require 'benchmark'

  # Sample inputs that were assigned but never read, kept for reference:
  #
  # select: 3:64581875
  #
  # picmi_test (overwritten by the Build 37 sample before being used):
  # #Chromosome Name Position Reference Tumor
  # 1 100382265 C G
  # 1 100380997 A G
  # 22 30163533 A C
  # X 10094215 G A
  # X 10085674 C T
  # 20 50071099 G T
  # 21 19638426 G T
  # 2 230633386 C T
  # 2 230312220 C T
  # 1 100624830 T A
  # 4 30723053 G T

  # Build 37
  picmi_test = <<-EOF
#Chromosome Name Position Reference Tumor
1 100624830 T A
21 19638426 G T
  EOF

  # # Build 36
  # picmi_test = <<-EOF
  ##Chromosome Name Position Reference Tumor
  #3 81780820 T C
  #2 43881517 A T
  #2 43857514 T C
  #6 88375602 G A
  #16 69875502 G T
  #16 69876078 T C
  #16 69877147 G A
  #17 8101874 C T
  # EOF

  Log.severity = 2
  org = 'Hsa/may2009'
  file = File.join(ENV["HOME"], 'git/rbbt-util/integration_test/data/Metastasis.tsv')

  #positions = TSV.new(StringIO.new(picmi_test), :list, :sep => /\s+/, :fix => Proc.new{|l| l.sub(/\s+/,':')})
  # The first tab of each line becomes ':' so the key reads chr:position
  positions = TSV.new(file, :list, :fix => Proc.new{|l| l.sub(/\t/,':')})
  positions.key_field = "Position"
  positions.fields = %w(Reference Control Tumor)
  #positions.fields = %w(Reference Tumor)

  #puts positions.slice(["Reference", "Tumor"]).to_s.split(/\n/).collect{|line| next if line =~ /#/; parts = line.split(/\t|:/); parts[3] = Misc.IUPAC_to_base(parts[3]).first; parts * ","}.compact * "\n"

  #positions = positions.select ["10:98099540"]

  Organism.basedir = Rbbt.tmp.organism.sequence.jobs.find :user
  job = Organism.job :genomic_mutation_to_protein_mutation, "Metastasis", org, positions.slice("Tumor")
  job.run

  # Report the current step every two seconds until the job finishes
  until job.done?
    puts job.step
    sleep 2
  end

  raise job.messages.last if job.error?
  mutations = job.load
end