rbbt-entities 1.1.1 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/bin/icgc2rbbt.rb ADDED
@@ -0,0 +1,23 @@
1
#!/usr/bin/env ruby
# Splits an ICGC tab-separated mutation dump into one genotype file per sample.
#
# Usage: icgc2rbbt.rb <icgc_file> <output_directory>
#
# Every data row contributes a "chr:pos:mut" entry to the file named after the
# row's sample column; duplicate entries within a sample are written once.
require 'rbbt/util/open'

file = ARGV.shift
directory = ARGV.shift

# Fail with a readable message instead of an error raised from inside
# Open.read / File.join when an argument is missing.
abort "Usage: #{File.basename($0)} <icgc_file> <output_directory>" if file.nil? || directory.nil?

genotypes = {}
Open.read(file).split("\n").each do |line|
  # Skip the header row.
  next if line =~ /^Cancer Type/

  # Fields are picked by zero-based column index; the names reflect how the
  # script uses them (column 6 was read but never used, so it is dropped).
  chr, pos, mut, sample = line.split(/\t/).values_at 2, 3, 10, 35

  # Blank or truncated rows have no chromosome or sample column. Skip them
  # rather than failing on nil, or writing to the output directory itself.
  next if chr.nil? || sample.nil? || sample.empty?

  chr = chr.sub(/chr/, '')
  # A value made of "-" followed by bases is replaced by one "-" per
  # character after the leading "-".
  mut = '-' * (mut.length - 1) if mut =~ /^-[ACGT]/

  genotypes[sample] ||= []
  genotypes[sample] << [chr, pos, mut] * ":"
end

genotypes.each do |sample, mutations|
  Open.write(File.join(directory, sample), mutations.uniq * "\n")
end
data/bin/vcf2rbbt.rb ADDED
@@ -0,0 +1,15 @@
1
#!/usr/bin/env ruby
# Prints one "chr:pos:mut:score" line for every data row of a tab-separated
# file whose first six columns are read as chr, pos, id, ref, mut and score.
#
# Usage: vcf2rbbt.rb <vcf_file>
require 'rbbt/util/open'

file = ARGV.shift

# Fail with a readable message instead of an error raised inside Open.read.
abort "Usage: #{File.basename($0)} <vcf_file>" if file.nil?

Open.read(file).split("\n").each do |line|
  # Lines starting with "#" are headers/comments.
  next if line =~ /^#/

  # The third column (id) is not used in the output.
  chr, pos, _id, ref, mut, score = line.split(/\t/)

  # Blank or truncated rows lack the columns needed below; skip them instead
  # of failing with NoMethodError on nil.
  next if chr.nil? || ref.nil? || mut.nil?

  chr = chr.sub(/chr/, '')
  # When the reference is longer than the mutated allele, pad the allele with
  # "-" up to the reference length.
  mut += '-' * (ref.length - mut.length) if ref.length > mut.length

  puts [chr, pos, mut, score] * ":"
end
@@ -0,0 +1,73 @@
1
+ require 'rbbt/entity'
2
+ require 'rbbt/workflow'
3
+ require 'rbbt/sources/organism'
4
+ require 'rbbt/entity/gene'
5
+
6
+ Workflow.require_workflow "Sequence"
7
+
8
# Entity for genomic ranges written as "chromosome:start:end" strings and
# annotated with an organism.
module ChromosomeRange
  extend Entity

  self.annotation :organism

  self.format = "Chromosome Range"

  # Converts a textual quantity into an Integer.
  #
  # The first "^" in the text is replaced by "+" before parsing. Text ending
  # in "K"/"KB" is multiplied by 1000, text ending in "M"/"MB" by 1000_000,
  # and a plain number (optionally with decimals and an "e+N" exponent) is
  # used as is. The result is truncated with to_i.
  #
  # Raises a RuntimeError when the text matches none of these forms.
  def self.text_to_unit(text)
    normalized = text.sub('^', '+')
    magnitude = normalized.to_f

    scaled =
      if normalized =~ /KB?$/
        magnitude * 1000
      elsif normalized =~ /MB?$/
        magnitude * 1000_000
      elsif normalized =~ /^\d+(\.\d+)?(e\+\d+)?$/
        magnitude
      else
        raise "Text format not understood: #{ normalized }"
      end

    scaled.to_i
  end

  # Same ranges with start and end passed through text_to_unit.
  property :unit => :array2single do
    self.collect do |range|
      name, from, to = range.split(":")
      [name, ChromosomeRange.text_to_unit(from), ChromosomeRange.text_to_unit(to)].join(":")
    end
  end
  persist :unit

  # Result of the Sequence :genes_at_genomic_ranges job, looked up for each
  # unit-normalized range.
  property :genes => :array2single do
    job = Sequence.job(:genes_at_genomic_ranges, "ChromosomeRange", :organism => organism, :ranges => self.unit)
    index = job.run
    index.namespace = organism
    index.values_at(*self.unit)
  end

  # Location-view URL for the range.
  # NOTE(review): the species path segment is fixed to "Homo_sapiens"
  # regardless of the organism annotation -- confirm this is intended.
  property :ensembl_browser => :single2array do
    server = Misc.ensembl_server(self.organism)
    "http://#{server}/Homo_sapiens/Location/View?db=core&r=#{chromosome}:#{start}-#{eend}"
  end

  # First ":"-separated field of each range.
  property :chromosome => :array2single do
    clean_annotations.collect { |entry| entry.split(":").first }
  end
  persist :_ary_chromosome

  # Second ":"-separated field of each range, as an Integer.
  property :start => :array2single do
    clean_annotations.collect { |entry| entry.split(":")[1].to_i }
  end
  persist :_ary_start

  # Third ":"-separated field of each range, as an Integer.
  property :eend => :array2single do
    clean_annotations.collect { |entry| entry.split(":")[2].to_i }
  end
  persist :_ary_eend

  # Alias of eend under the name "end".
  property :end => :array2single do
    eend
  end
  persist :_ary_end

  # Ruby Range (start..end) for each entry.
  property :range => :array2single do
    start.zip(self.end).collect { |from, to| from..to }
  end
  persist :_ary_range

end
@@ -26,12 +26,30 @@ module CNV
26
26
 
27
27
  property :genes => :array2single do
28
28
  @genes ||= begin
29
- genes = Sequence.job(:genes_at_genomic_ranges, jobname, :organism => organism, :ranges => self).run
30
- genes.unnamed = true
29
+ genes = Sequence.job(:genes_at_genomic_ranges, jobname, :organism => organism, :ranges => self, :unnamed => true).run
31
30
  genes = genes.values_at *self
32
31
  Gene.setup(genes, "Ensembl Gene ID", organism)
33
32
  end
34
33
  end
35
34
 
35
# First ":"-separated field of each entry.
property :chromosome => :array2single do
  clean_annotations.collect do |entry|
    entry.split(":").first
  end
end
persist :_ary_chromosome
39
+
40
# Second ":"-separated field of each entry, converted with to_i.
property :start => :array2single do
  clean_annotations.collect do |entry|
    fields = entry.split(":")
    fields[1].to_i
  end
end
persist :_ary_start
44
+
45
# Third ":"-separated field of each entry, converted with to_i.
property :end => :array2single do
  clean_annotations.collect do |entry|
    fields = entry.split(":")
    fields[2].to_i
  end
end
persist :_ary_end
49
+
50
# Ruby Range (start..end) for each entry, pairing the start and end properties.
property :range => :array2single do
  bounds = start.zip(self.end)
  bounds.collect { |from, to| from..to }
end
53
+
36
54
  end
37
55
 
@@ -7,6 +7,7 @@ require 'rbbt/sources/cancer'
7
7
  require 'rbbt/entity/protein'
8
8
  require 'rbbt/entity/pmid'
9
9
  require 'rbbt/entity/transcript'
10
+ require 'rbbt/bow/bow'
10
11
 
11
12
  Workflow.require_workflow "Translation"
12
13
 
@@ -15,7 +16,7 @@ module Gene
15
16
 
16
17
  def self.ensg2enst(organism, gene)
17
18
  @@ensg2enst ||= {}
18
- @@ensg2enst[organism] ||= Organism.gene_transcripts(organism).tsv(:type => :flat, :key_field => "Ensembl Gene ID", :fields => ["Ensembl Transcript ID"], :persist => true).tap{|o| o.unnamed = true}
19
+ @@ensg2enst[organism] ||= Organism.gene_transcripts(organism).tsv(:type => :flat, :key_field => "Ensembl Gene ID", :fields => ["Ensembl Transcript ID"], :persist => true, :unnamed => true)
19
20
 
20
21
  if Array === gene
21
22
  @@ensg2enst[organism].values_at *gene
@@ -24,7 +25,6 @@ module Gene
24
25
  end
25
26
  end
26
27
 
27
-
28
28
  def self.filter(query, field = nil, options = nil, entity = nil)
29
29
  return true if query == entity
30
30
 
@@ -33,39 +33,86 @@ module Gene
33
33
  false
34
34
  end
35
35
 
36
# Sums, chromosome by chromosome, the value Misc.total_length returns for the
# chr_range of the genes on that chromosome. Genes whose chromosome is nil
# are left out.
def self.gene_list_bases(genes)
  genes = genes.ensembl

  # Group the genes by the chromosome reported for each of them.
  by_chromosome = {}
  Misc.process_to_hash(genes) { |list| list.chromosome }.each do |gene, chr|
    (by_chromosome[chr] ||= []) << gene
  end

  total = 0
  by_chromosome.each do |chr, members|
    next if chr.nil?

    ranges = genes.annotate(members).chr_range.compact
    total += Misc.total_length(ranges)
  end

  total
end
48
+
49
# Sums, chromosome by chromosome, the value Misc.total_length returns for the
# exon ranges of the genes on that chromosome. Genes whose chromosome is nil
# are left out, and exons missing from the exon position table are logged
# and ignored.
def self.gene_list_exon_bases(genes)
  genes = genes.ensembl

  # Group the genes by the chromosome reported for each of them.
  by_chromosome = {}
  Misc.process_to_hash(genes) { |list| list.chromosome }.each do |gene, chr|
    (by_chromosome[chr] ||= []) << gene
  end

  # Exon start/end table, loaded once per organism and kept at class level.
  @@exon_range_tsv ||= {}
  organism = genes.organism
  @@exon_range_tsv[organism] ||= Organism.exons(organism).tsv :persist => true, :fields => ["Exon Chr Start", "Exon Chr End"], :type => :list, :cast => :to_i, :unnamed => true
  exon_positions = @@exon_range_tsv[organism]

  total = 0
  by_chromosome.each do |chr, members|
    next if chr.nil?

    exons = genes.annotate(members).transcripts.compact.flatten.exons.compact.flatten.uniq

    ranges = exons.collect do |exon|
      unless exon_positions.include? exon
        Log.low "Exon #{ exon } does not have range"
        next
      end

      pos = exon_positions[exon]
      pos.first..pos.last
    end

    total += Misc.total_length(ranges.compact)
  end

  total
end
74
+
75
+
76
+
36
77
  self.annotation :format
37
78
  self.annotation :organism
38
79
 
39
- self.format = Organism::Hsa.identifiers.all_fields - ["Ensembl Protein ID", "Ensembl Transcript ID"]
80
+ self.format = Organism.identifiers("Hsa").all_fields - ["Ensembl Protein ID", "Ensembl Transcript ID"]
40
81
 
41
82
  property :ortholog => :array2single do |other|
42
83
  return self if organism =~ /^#{ other }(?!\w)/
43
84
  new_organism = organism.split(":")
44
85
  new_organism[0] = other
45
86
  new_organism = new_organism * "/"
46
- Gene.setup(Organism[organism]["ortholog_#{other}"].tsv(:persist => true).values_at(*self.ensembl).collect{|l| l.first}, "Ensembl Gene ID", new_organism)
87
+ Gene.setup(Organism[organism]["ortholog_#{other}"].tsv(:persist => true, :unnamed => true).values_at(*self.ensembl).collect{|l| l.first}, "Ensembl Gene ID", new_organism)
47
88
  end
48
89
  persist :ortholog
49
90
 
50
91
  property :to => :array2single do |new_format|
51
92
  return self if format == new_format
52
- Gene.setup(Translation.job(:tsv_translate, "", :organism => organism, :genes => self, :format => new_format).exec.values_at(*self), new_format, organism)
53
- end
54
-
55
- property :__to => :array2single do |new_format|
56
- return self if format == new_format
57
- to!(new_format).collect!{|v| Array === v ? v.first : v}
93
+ genes = Translation.job(:tsv_translate, "", :organism => organism, :genes => self, :format => new_format).exec.values_at(*self)
94
+ Gene.setup(genes, new_format, organism)
95
+ genes
58
96
  end
59
97
 
60
98
  property :strand => :array2single do
61
- Organism.gene_positions(organism).tsv(:fields => ["Strand"], :type => :single, :persist => true).values_at *self
99
+ @@strand_tsv ||= {}
100
+ @@strand_tsv[organism] ||= Organism.gene_positions(organism).tsv(:fields => ["Strand"], :type => :single, :persist => true, :unnamed => true)
101
+ to("Ensembl Gene ID").collect do |gene|
102
+ @@strand_tsv[organism][gene]
103
+ end
62
104
  end
63
- persist :strand
105
+ persist :_ary_strand
64
106
 
65
107
  property :ensembl => :array2single do
66
108
  to "Ensembl Gene ID"
67
109
  end
68
110
 
111
# Looks up each gene's Ensembl identifier in the organism's gene biotype table.
property :biotype => :array2single do
  biotypes = Organism.gene_biotype(organism).tsv(:persist => true, :type => :single, :unnamed => true)
  biotypes.values_at(*self.ensembl)
end
persist :biotype
115
+
69
116
  property :entrez => :array2single do
70
117
  to "Entrez Gene ID"
71
118
  end
@@ -73,26 +120,25 @@ module Gene
73
120
  property :uniprot => :array2single do
74
121
  to "UniProt/SwissProt Accession"
75
122
  end
76
- persist :uniprot
77
123
 
78
124
  property :name => :array2single do
125
+ return self if self.format == "Associated Gene Name"
79
126
  to "Associated Gene Name"
80
127
  end
81
- persist :name
82
128
 
83
129
  property :chr_start => :array2single do
84
- Organism.gene_positions(organism).tsv(:persist => true, :type => :single, :cast => :to_i, :fields => ["Gene Start"]).values_at *self
130
+ Organism.gene_positions(organism).tsv(:persist => true, :type => :single, :cast => :to_i, :fields => ["Gene Start"], :unnamed => true).values_at *self
85
131
  end
86
132
  persist :chr_start
87
133
 
88
134
  property :go_bp_terms => :array2single do
89
- Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat).values_at *self.ensembl
135
+ Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :unnamed => true).values_at *self.ensembl
90
136
  end
91
137
  persist :go_bp_terms
92
138
 
93
139
  property :long_name => :array2single do
94
140
  entre = self.entrez
95
- gene = Entrez.get_gene(entrez).values_at(*entrez).collect{|gene| gene.nil? ? nil : gene.description.flatten.first}
141
+ gene = Entrez.get_gene(entrez).values_at(*entrez).collect{|gene| gene.nil? ? nil : (gene.description || []).flatten.first}
96
142
  end
97
143
  persist :long_name
98
144
 
@@ -140,26 +186,26 @@ module Gene
140
186
  persist :max_protein_length
141
187
 
142
188
  property :chromosome => :array2single do
143
- chr = Organism.gene_positions(organism).tsv :fields => ["Chromosome Name"], :type => :single, :persist => true
144
- chr.unnamed = true
189
+ @@chromosome_tsv ||= {}
190
+ @@chromosome_tsv[organism] ||= Organism.gene_positions(organism).tsv :fields => ["Chromosome Name"], :type => :single, :persist => true, :unnamed => true
145
191
  if Array === self
146
192
  to("Ensembl Gene ID").collect do |gene|
147
- chr[gene]
193
+ @@chromosome_tsv[organism][gene]
148
194
  end
149
195
  else
150
- chr[to("Ensembl Gene ID")]
196
+ @@chromosome_tsv[organism][to("Ensembl Gene ID")]
151
197
  end
152
198
  end
153
199
  persist :chromosome
154
200
 
155
- property :range => :array2single do
156
- pos = Organism.gene_positions(organism).tsv :fields => ["Gene Start", "Gene End"], :type => :list, :persist => true, :cast => :to_i
201
+ property :chr_range => :array2single do
202
+ chr_range_index ||= Organism.gene_positions(organism).tsv :fields => ["Gene Start", "Gene End"], :type => :list, :persist => true, :cast => :to_i, :unnamed => true
157
203
  to("Ensembl Gene ID").collect do |gene|
158
- next if not pos.include? gene
159
- Range.new *pos[gene]
204
+ next if not chr_range_index.include? gene
205
+ Range.new *chr_range_index[gene]
160
206
  end
161
207
  end
162
- persist :range
208
+ persist :chr_range
163
209
 
164
210
  property :articles => :array2single do
165
211
  PMID.setup(Organism.gene_pmids(organism).tsv(:persist => true, :fields => ["PMID"], :type => :flat, :unnamed => true).values_at *self.entrez)
@@ -167,14 +213,14 @@ module Gene
167
213
  persist :articles
168
214
 
169
215
  property :sequence => :array2single do
170
- Organism.gene_sequence(organism).tsv :persist => true
171
- @gene_sequence.unnamed = true
172
- @gene_sequence.values_at *self.ensembl
216
+ @@sequence_tsv ||= {}
217
+ @@sequence_tsv[organism] ||= Organism.gene_sequence(organism).tsv :persist => true, :unnamed => true
218
+ @@sequence_tsv[organism].values_at *self.ensembl
173
219
  end
174
220
  persist :sequence
175
221
 
176
222
  property :matador_drugs => :array2single do
177
- @@matador ||= Matador.protein_drug.tsv(:persist => false).tap{|o| o.unnamed = true}
223
+ @@matador ||= Matador.protein_drug.tsv(:persist => false, :unnamed => true)
178
224
 
179
225
  ensg = self.to("Ensembl Gene ID")
180
226
 
@@ -225,7 +271,7 @@ module Gene
225
271
  persist :pathway_drugs
226
272
 
227
273
  property :related_cancers => :array2single do
228
- Cancer["cancer_genes.tsv"].tsv(:persist => true, :type => :list).values_at(*self.name).collect{|v| v.nil? ? nil : v["Tumour Types (Somatic Mutations)"].split(", ") + v["Tumour Types (Germline Mutations)"].split(", ")}
274
+ Cancer["cancer_genes.tsv"].tsv(:persist => true, :type => :list).values_at(*self.name).collect{|v| v.nil? ? nil : (v["Tumour Types (Somatic Mutations)"].split(", ") + v["Tumour Types (Germline Mutations)"].split(", ")).uniq}
229
275
  end
230
276
  persist :related_cancers
231
277
 
@@ -234,7 +280,7 @@ module Gene
234
280
  raise "No organism defined" if self.organism.nil?
235
281
  clean_organism = self.organism.sub(/\/.*/,'') + '/jun2011'
236
282
  names.organism = clean_organism
237
- ranges = names.chromosome.zip(name.range).collect do |chromosome, range|
283
+ ranges = names.chromosome.zip(name.chr_range).collect do |chromosome, range|
238
284
  next if range.nil?
239
285
  [chromosome, range.begin, range.end] * ":"
240
286
  end
@@ -243,62 +289,89 @@ module Gene
243
289
  persist :somatic_snvs
244
290
 
245
291
 
292
# Average, over the gene's articles, of how many words in each article's text
# are among the stemmed terms. Returns 0 when there are no articles.
property :literature_score do |terms|
  stems = terms.collect { |term| term.stem }
  papers = self.articles

  if papers.nil? or papers.empty?
    0
  else
    hits = papers.inject(0) do |count, paper|
      count + paper.text.words.select { |word| stems.include? word }.length
    end
    hits.to_f / papers.length
  end
end
persist :literature_score
302
+
303
+
246
304
  property :ihop_interactions => :single do
247
305
  uniprot = self.uniprot
248
- url = "http://ws.bioinfo.cnio.es/iHOP/cgi-bin/getSymbolInteractions?ncbiTaxId=9606&reference=#{uniprot}&namespace=UNIPROT__AC"
249
- doc = Nokogiri::XML(Open.read(url))
250
- sentences = doc.css("iHOPsentence")
251
- sentences
252
- end
306
+ if uniprot.nil?
307
+ nil
308
+ else
309
+ sentences = []
253
310
 
254
- property :tagged_ihop_interactions => :single do
255
- interactors = []
256
- ihop_interactions.each do |sentence|
257
- sentence.css('iHOPatom').collect{|atom|
258
- atom.css('evidence');
259
- }.compact.flatten.each do |evidence|
260
- symbol = evidence.attr('symbol')
261
- taxid = evidence.attr('ncbiTaxId')
262
-
263
- if Organism.entrez_taxids(self.organism).list.include? taxid
264
- interactors << symbol
265
- end
311
+ begin
312
+ url = "http://ws.bioinfo.cnio.es/iHOP/cgi-bin/getSymbolInteractions?ncbiTaxId=9606&reference=#{uniprot}&namespace=UNIPROT__AC"
313
+ doc = Nokogiri::XML(Open.read(url))
314
+ sentences = doc.css("iHOPsentence")
315
+ rescue
266
316
  end
267
- end
268
-
269
- Gene.setup(interactors, "Associated Gene Name", self.organism).organism
270
-
271
- interactors_ensembl = interactors.ensembl
272
317
 
273
- interactors2ensembl = {}
274
- interactors.collect{|i| i}.zip(interactors_ensembl.collect{|i| i}).each do |o,e|
275
- interactors2ensembl[o] = e
318
+ sentences
276
319
  end
320
+ end
277
321
 
278
- ihop_interactions.collect do |sentence|
279
- sentence.css('iHOPatom').each{|atom|
280
- evidences = atom.css('evidence')
281
- symbol = evidences.collect do |evidence|
322
+ property :tagged_ihop_interactions => :single do
323
+ interactors = []
324
+ ihop_interactions = self.ihop_interactions
325
+ if ihop_interactions.nil?
326
+ nil
327
+ else
328
+ ihop_interactions.each do |sentence|
329
+ sentence.css('iHOPatom').collect{|atom|
330
+ atom.css('evidence');
331
+ }.compact.flatten.each do |evidence|
282
332
  symbol = evidence.attr('symbol')
283
333
  taxid = evidence.attr('ncbiTaxId')
284
334
 
285
335
  if Organism.entrez_taxids(self.organism).list.include? taxid
286
- symbol
287
- else
288
- nil
336
+ interactors << symbol
289
337
  end
290
- end.compact.first
338
+ end
339
+ end
291
340
 
292
- evidences.remove
341
+ Gene.setup(interactors, "Associated Gene Name", self.organism).organism
342
+
343
+ interactors_ensembl = interactors.ensembl
293
344
 
294
- if interactors2ensembl.include? symbol and not interactors2ensembl[symbol].nil?
295
- atom.children.remove
296
- interactor = interactors2ensembl[symbol]
297
- atom.replace interactor.respond_to?(:link)? interactor.link : interactor.name
298
- end
299
- }
300
- sentence.to_s
345
+ interactors2ensembl = {}
346
+ interactors.collect{|i| i}.zip(interactors_ensembl.collect{|i| i}).each do |o,e|
347
+ interactors2ensembl[o] = e
348
+ end
349
+
350
+ ihop_interactions.collect do |sentence|
351
+ sentence.css('iHOPatom').each{|atom|
352
+ literal = atom.content()
353
+ evidences = atom.css('evidence')
354
+ symbol = evidences.collect do |evidence|
355
+ symbol = evidence.attr('symbol')
356
+ taxid = evidence.attr('ncbiTaxId')
357
+
358
+ if Organism.entrez_taxids(self.organism).list.include? taxid
359
+ symbol
360
+ else
361
+ nil
362
+ end
363
+ end.compact.first
364
+
365
+ evidences.remove
366
+
367
+ if interactors2ensembl.include? symbol and not interactors2ensembl[symbol].nil?
368
+ atom.children.remove
369
+ interactor = interactors2ensembl[symbol]
370
+ atom.replace interactor.respond_to?(:link)? interactor.link(nil, nil, :html_link_extra_attrs => "title='#{literal}'") : interactor.name
371
+ end
372
+ }
373
+ sentence.to_s
374
+ end
301
375
  end
302
376
  end
303
377
  end
304
-