rbbt-entities 1.1.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/icgc2rbbt.rb ADDED
@@ -0,0 +1,23 @@
1
#!/usr/bin/env ruby
# Splits an ICGC tab-separated mutation file into one genotype file per sample.
#
# Usage: icgc2rbbt.rb <icgc_file> <output_directory>
#
# Each data row contributes a "chr:pos:mut" entry (zero-based columns 2, 3 and
# 10) to the sample named in column 35. Every sample's unique entries are then
# written, one per line, to <output_directory>/<sample>.
require 'rbbt/util/open'

file = ARGV.shift
directory = ARGV.shift
abort "Usage: icgc2rbbt.rb <icgc_file> <output_directory>" if file.nil? || directory.nil?

genotypes = {}
Open.read(file).split("\n").each do |line|
  next if line =~ /^Cancer Type/ # header row
  next if line.strip.empty?

  chr, pos, _ref, mut, sample = line.split(/\t/).values_at 2, 3, 6, 10, 35

  # A row too short to carry every needed column would otherwise crash below
  # (nil chromosome or allele) or at write time (nil sample name).
  if chr.nil? || mut.nil? || sample.nil?
    warn "Skipping incomplete line: #{line}"
    next
  end

  chr = chr.sub(/chr/, '')
  # An allele written as "-" followed by a base is replaced by a run of dashes,
  # one per character after the leading dash (presumably the notation expected
  # downstream for deletions -- confirm).
  mut = '-' * (mut.length - 1) if mut =~ /^-[ACGT]/

  (genotypes[sample] ||= []) << [chr, pos, mut] * ":"
end

genotypes.each do |sample, mutations|
  Open.write(File.join(directory, sample), mutations.uniq * "\n")
end
data/bin/vcf2rbbt.rb ADDED
@@ -0,0 +1,15 @@
1
#!/usr/bin/env ruby
# Prints the variants of a VCF file as "chr:pos:mut:score" lines on stdout.
#
# Usage: vcf2rbbt.rb <vcf_file>
#
# Lines starting with "#" are skipped. The first six tab-separated fields of
# every other line are read as chromosome, position, id, reference allele,
# mutated allele and score.
require 'rbbt/util/open'

file = ARGV.shift
abort "Usage: vcf2rbbt.rb <vcf_file>" if file.nil?

Open.read(file).split("\n").each do |line|
  next if line =~ /^#/
  next if line.strip.empty?

  chr, pos, _id, ref, mut, score = line.split(/\t/)

  # A row without both alleles cannot be converted; report it instead of
  # crashing on a nil field.
  if ref.nil? || mut.nil?
    warn "Skipping incomplete line: #{line}"
    next
  end

  chr = chr.sub(/chr/, '')
  # When the reference is longer than the mutated allele, pad the mutated
  # allele with dashes up to the reference length.
  mut += '-' * (ref.length - mut.length) if ref.length > mut.length

  puts [chr, pos, mut, score] * ":"
end
@@ -0,0 +1,73 @@
1
+ require 'rbbt/entity'
2
+ require 'rbbt/workflow'
3
+ require 'rbbt/sources/organism'
4
+ require 'rbbt/entity/gene'
5
+
6
+ Workflow.require_workflow "Sequence"
7
+
8
+ module ChromosomeRange
9
+ extend Entity
10
+
11
+ self.annotation :organism
12
+
13
+ self.format = "Chromosome Range"
14
+
15
+
16
# Converts a textual genomic coordinate into an integer number of bases.
#
# Accepted forms are a plain number ("12345", "1.5", "1e+6") or a number
# followed by a "K"/"KB" (x 1,000) or "M"/"MB" (x 1,000,000) suffix.
# The first "^" in the text is replaced by "+" before parsing (presumably so
# that exponents written as "1e^6" are read as "1e+6" -- confirm).
#
# @param text [String] the coordinate text
# @return [Integer] the coordinate, truncated to an integer
# @raise [RuntimeError] when the text does not follow one of the accepted forms
def self.text_to_unit(text)
  text = text.sub('^', '+')

  # String#to_f returns 0.0 when the text has no leading number, which would
  # silently turn input such as "chrKB" into position 0. Reject it instead.
  unless text =~ /\A\s*[+-]?\.?\d/
    raise "Text format not understood: #{ text }"
  end

  base = text.to_f
  scaled =
    case text
    when /KB?$/ then base * 1000
    when /MB?$/ then base * 1000_000
    when /^\d+(\.\d+)?(e\+\d+)?$/ then base
    else raise "Text format not understood: #{ text }"
    end
  scaled.to_i
end
30
+
31
# Each "chr:start:end" range with start and end passed through
# ChromosomeRange.text_to_unit.
property :unit => :array2single do
  self.collect do |range|
    chr_name, first, last = range.split(":")
    [chr_name, ChromosomeRange.text_to_unit(first), ChromosomeRange.text_to_unit(last)].join(":")
  end
end
persist :unit

# Result of the Sequence workflow job :genes_at_genomic_ranges for the unit
# ranges, looked up once per range.
property :genes => :array2single do
  ranges = self.unit
  job = Sequence.job(:genes_at_genomic_ranges, "ChromosomeRange", :organism => organism, :ranges => ranges)
  genes_by_range = job.run
  genes_by_range.namespace = organism
  genes_by_range.values_at(*ranges)
end

# Location URL on the Ensembl server reported by Misc.ensembl_server.
property :ensembl_browser => :single2array do
  server = Misc.ensembl_server(self.organism)
  "http://#{server}/Homo_sapiens/Location/View?db=core&r=#{chromosome}:#{start}-#{eend}"
end

# First ":"-separated field of each range.
property :chromosome => :array2single do
  self.clean_annotations.map { |range| range.split(":")[0] }
end
persist :_ary_chromosome

# Second ":"-separated field of each range, as an integer.
property :start => :array2single do
  self.clean_annotations.map { |range| range.split(":")[1].to_i }
end
persist :_ary_start

# Third ":"-separated field of each range, as an integer.
property :eend => :array2single do
  self.clean_annotations.map { |range| range.split(":")[2].to_i }
end
persist :_ary_eend

# Same values as eend.
property :end => :array2single do
  eend
end
persist :_ary_end

# Ruby Range from start to end for each entry.
property :range => :array2single do
  start.zip(self.end).map { |from, to| from..to }
end
persist :_ary_range
71
+
72
+
73
+ end
@@ -26,12 +26,30 @@ module CNV
26
26
 
27
27
  property :genes => :array2single do
28
28
  @genes ||= begin
29
- genes = Sequence.job(:genes_at_genomic_ranges, jobname, :organism => organism, :ranges => self).run
30
- genes.unnamed = true
29
+ genes = Sequence.job(:genes_at_genomic_ranges, jobname, :organism => organism, :ranges => self, :unnamed => true).run
31
30
  genes = genes.values_at *self
32
31
  Gene.setup(genes, "Ensembl Gene ID", organism)
33
32
  end
34
33
  end
35
34
 
35
# First ":"-separated field of each entry.
property :chromosome => :array2single do
  self.clean_annotations.map { |cnv| cnv.split(":")[0] }
end
persist :_ary_chromosome

# Second ":"-separated field of each entry, as an integer.
property :start => :array2single do
  self.clean_annotations.map { |cnv| cnv.split(":")[1].to_i }
end
persist :_ary_start

# Third ":"-separated field of each entry, as an integer.
property :end => :array2single do
  self.clean_annotations.map { |cnv| cnv.split(":")[2].to_i }
end
persist :_ary_end

# Ruby Range from start to end for each entry.
property :range => :array2single do
  start.zip(self.end).map { |from, to| from..to }
end
53
+
36
54
  end
37
55
 
@@ -7,6 +7,7 @@ require 'rbbt/sources/cancer'
7
7
  require 'rbbt/entity/protein'
8
8
  require 'rbbt/entity/pmid'
9
9
  require 'rbbt/entity/transcript'
10
+ require 'rbbt/bow/bow'
10
11
 
11
12
  Workflow.require_workflow "Translation"
12
13
 
@@ -15,7 +16,7 @@ module Gene
15
16
 
16
17
  def self.ensg2enst(organism, gene)
17
18
  @@ensg2enst ||= {}
18
- @@ensg2enst[organism] ||= Organism.gene_transcripts(organism).tsv(:type => :flat, :key_field => "Ensembl Gene ID", :fields => ["Ensembl Transcript ID"], :persist => true).tap{|o| o.unnamed = true}
19
+ @@ensg2enst[organism] ||= Organism.gene_transcripts(organism).tsv(:type => :flat, :key_field => "Ensembl Gene ID", :fields => ["Ensembl Transcript ID"], :persist => true, :unnamed => true)
19
20
 
20
21
  if Array === gene
21
22
  @@ensg2enst[organism].values_at *gene
@@ -24,7 +25,6 @@ module Gene
24
25
  end
25
26
  end
26
27
 
27
-
28
28
  def self.filter(query, field = nil, options = nil, entity = nil)
29
29
  return true if query == entity
30
30
 
@@ -33,39 +33,86 @@ module Gene
33
33
  false
34
34
  end
35
35
 
36
# Sums Misc.total_length over the chromosome ranges of the given genes,
# computed separately for each chromosome. Genes whose chromosome is nil
# are left out.
def self.gene_list_bases(genes)
  genes = genes.ensembl

  by_chromosome = {}
  Misc.process_to_hash(genes) { |list| list.chromosome }.each do |gene, chr|
    (by_chromosome[chr] ||= []) << gene
  end

  by_chromosome.inject(0) do |sum, (chr, chr_genes)|
    next sum if chr.nil?
    sum + Misc.total_length(genes.annotate(chr_genes).chr_range.compact)
  end
end
48
+
49
# Sums Misc.total_length over the exon ranges of the given genes, computed
# separately for each chromosome. Genes whose chromosome is nil are left
# out, and exons missing from the exon position table are logged and skipped.
def self.gene_list_exon_bases(genes)
  genes = genes.ensembl

  by_chromosome = {}
  Misc.process_to_hash(genes) { |list| list.chromosome }.each do |gene, chr|
    (by_chromosome[chr] ||= []) << gene
  end

  # Exon start/end table, loaded once per organism and kept at class level.
  @@exon_range_tsv ||= {}
  organism = genes.organism
  @@exon_range_tsv[organism] ||= Organism.exons(organism).tsv :persist => true, :fields => ["Exon Chr Start", "Exon Chr End"], :type => :list, :cast => :to_i, :unnamed => true
  exon_positions = @@exon_range_tsv[organism]

  by_chromosome.inject(0) do |sum, (chr, chr_genes)|
    next sum if chr.nil?

    exons = genes.annotate(chr_genes).transcripts.compact.flatten.exons.compact.flatten.uniq

    exon_ranges = exons.collect do |exon|
      if exon_positions.include? exon
        pos = exon_positions[exon]
        (pos.first..pos.last)
      else
        Log.low "Exon #{ exon } does not have range"
        nil
      end
    end.compact

    sum + Misc.total_length(exon_ranges)
  end
end
74
+
75
+
76
+
36
77
  self.annotation :format
37
78
  self.annotation :organism
38
79
 
39
- self.format = Organism::Hsa.identifiers.all_fields - ["Ensembl Protein ID", "Ensembl Transcript ID"]
80
+ self.format = Organism.identifiers("Hsa").all_fields - ["Ensembl Protein ID", "Ensembl Transcript ID"]
40
81
 
41
82
  property :ortholog => :array2single do |other|
42
83
  return self if organism =~ /^#{ other }(?!\w)/
43
84
  new_organism = organism.split(":")
44
85
  new_organism[0] = other
45
86
  new_organism = new_organism * "/"
46
- Gene.setup(Organism[organism]["ortholog_#{other}"].tsv(:persist => true).values_at(*self.ensembl).collect{|l| l.first}, "Ensembl Gene ID", new_organism)
87
+ Gene.setup(Organism[organism]["ortholog_#{other}"].tsv(:persist => true, :unnamed => true).values_at(*self.ensembl).collect{|l| l.first}, "Ensembl Gene ID", new_organism)
47
88
  end
48
89
  persist :ortholog
49
90
 
50
91
  property :to => :array2single do |new_format|
51
92
  return self if format == new_format
52
- Gene.setup(Translation.job(:tsv_translate, "", :organism => organism, :genes => self, :format => new_format).exec.values_at(*self), new_format, organism)
53
- end
54
-
55
- property :__to => :array2single do |new_format|
56
- return self if format == new_format
57
- to!(new_format).collect!{|v| Array === v ? v.first : v}
93
+ genes = Translation.job(:tsv_translate, "", :organism => organism, :genes => self, :format => new_format).exec.values_at(*self)
94
+ Gene.setup(genes, new_format, organism)
95
+ genes
58
96
  end
59
97
 
60
98
  property :strand => :array2single do
61
- Organism.gene_positions(organism).tsv(:fields => ["Strand"], :type => :single, :persist => true).values_at *self
99
+ @@strand_tsv ||= {}
100
+ @@strand_tsv[organism] ||= Organism.gene_positions(organism).tsv(:fields => ["Strand"], :type => :single, :persist => true, :unnamed => true)
101
+ to("Ensembl Gene ID").collect do |gene|
102
+ @@strand_tsv[organism][gene]
103
+ end
62
104
  end
63
- persist :strand
105
+ persist :_ary_strand
64
106
 
65
107
  property :ensembl => :array2single do
66
108
  to "Ensembl Gene ID"
67
109
  end
68
110
 
111
# Values of the organism's gene biotype table for the Ensembl IDs of these genes.
property :biotype => :array2single do
  biotype_index = Organism.gene_biotype(organism).tsv(:persist => true, :type => :single, :unnamed => true)
  biotype_index.values_at(*self.ensembl)
end
persist :biotype
115
+
69
116
  property :entrez => :array2single do
70
117
  to "Entrez Gene ID"
71
118
  end
@@ -73,26 +120,25 @@ module Gene
73
120
  property :uniprot => :array2single do
74
121
  to "UniProt/SwissProt Accession"
75
122
  end
76
- persist :uniprot
77
123
 
78
124
  property :name => :array2single do
125
+ return self if self.format == "Associated Gene Name"
79
126
  to "Associated Gene Name"
80
127
  end
81
- persist :name
82
128
 
83
129
  property :chr_start => :array2single do
84
- Organism.gene_positions(organism).tsv(:persist => true, :type => :single, :cast => :to_i, :fields => ["Gene Start"]).values_at *self
130
+ Organism.gene_positions(organism).tsv(:persist => true, :type => :single, :cast => :to_i, :fields => ["Gene Start"], :unnamed => true).values_at *self
85
131
  end
86
132
  persist :chr_start
87
133
 
88
134
  property :go_bp_terms => :array2single do
89
- Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat).values_at *self.ensembl
135
+ Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :unnamed => true).values_at *self.ensembl
90
136
  end
91
137
  persist :go_bp_terms
92
138
 
93
139
  property :long_name => :array2single do
94
140
  entre = self.entrez
95
- gene = Entrez.get_gene(entrez).values_at(*entrez).collect{|gene| gene.nil? ? nil : gene.description.flatten.first}
141
+ gene = Entrez.get_gene(entrez).values_at(*entrez).collect{|gene| gene.nil? ? nil : (gene.description || []).flatten.first}
96
142
  end
97
143
  persist :long_name
98
144
 
@@ -140,26 +186,26 @@ module Gene
140
186
  persist :max_protein_length
141
187
 
142
188
  property :chromosome => :array2single do
143
- chr = Organism.gene_positions(organism).tsv :fields => ["Chromosome Name"], :type => :single, :persist => true
144
- chr.unnamed = true
189
+ @@chromosome_tsv ||= {}
190
+ @@chromosome_tsv[organism] ||= Organism.gene_positions(organism).tsv :fields => ["Chromosome Name"], :type => :single, :persist => true, :unnamed => true
145
191
  if Array === self
146
192
  to("Ensembl Gene ID").collect do |gene|
147
- chr[gene]
193
+ @@chromosome_tsv[organism][gene]
148
194
  end
149
195
  else
150
- chr[to("Ensembl Gene ID")]
196
+ @@chromosome_tsv[organism][to("Ensembl Gene ID")]
151
197
  end
152
198
  end
153
199
  persist :chromosome
154
200
 
155
- property :range => :array2single do
156
- pos = Organism.gene_positions(organism).tsv :fields => ["Gene Start", "Gene End"], :type => :list, :persist => true, :cast => :to_i
201
+ property :chr_range => :array2single do
202
+ chr_range_index ||= Organism.gene_positions(organism).tsv :fields => ["Gene Start", "Gene End"], :type => :list, :persist => true, :cast => :to_i, :unnamed => true
157
203
  to("Ensembl Gene ID").collect do |gene|
158
- next if not pos.include? gene
159
- Range.new *pos[gene]
204
+ next if not chr_range_index.include? gene
205
+ Range.new *chr_range_index[gene]
160
206
  end
161
207
  end
162
- persist :range
208
+ persist :chr_range
163
209
 
164
210
  property :articles => :array2single do
165
211
  PMID.setup(Organism.gene_pmids(organism).tsv(:persist => true, :fields => ["PMID"], :type => :flat, :unnamed => true).values_at *self.entrez)
@@ -167,14 +213,14 @@ module Gene
167
213
  persist :articles
168
214
 
169
215
  property :sequence => :array2single do
170
- Organism.gene_sequence(organism).tsv :persist => true
171
- @gene_sequence.unnamed = true
172
- @gene_sequence.values_at *self.ensembl
216
+ @@sequence_tsv ||= {}
217
+ @@sequence_tsv[organism] ||= Organism.gene_sequence(organism).tsv :persist => true, :unnamed => true
218
+ @@sequence_tsv[organism].values_at *self.ensembl
173
219
  end
174
220
  persist :sequence
175
221
 
176
222
  property :matador_drugs => :array2single do
177
- @@matador ||= Matador.protein_drug.tsv(:persist => false).tap{|o| o.unnamed = true}
223
+ @@matador ||= Matador.protein_drug.tsv(:persist => false, :unnamed => true)
178
224
 
179
225
  ensg = self.to("Ensembl Gene ID")
180
226
 
@@ -225,7 +271,7 @@ module Gene
225
271
  persist :pathway_drugs
226
272
 
227
273
  property :related_cancers => :array2single do
228
- Cancer["cancer_genes.tsv"].tsv(:persist => true, :type => :list).values_at(*self.name).collect{|v| v.nil? ? nil : v["Tumour Types (Somatic Mutations)"].split(", ") + v["Tumour Types (Germline Mutations)"].split(", ")}
274
+ Cancer["cancer_genes.tsv"].tsv(:persist => true, :type => :list).values_at(*self.name).collect{|v| v.nil? ? nil : (v["Tumour Types (Somatic Mutations)"].split(", ") + v["Tumour Types (Germline Mutations)"].split(", ")).uniq}
229
275
  end
230
276
  persist :related_cancers
231
277
 
@@ -234,7 +280,7 @@ module Gene
234
280
  raise "No organism defined" if self.organism.nil?
235
281
  clean_organism = self.organism.sub(/\/.*/,'') + '/jun2011'
236
282
  names.organism = clean_organism
237
- ranges = names.chromosome.zip(name.range).collect do |chromosome, range|
283
+ ranges = names.chromosome.zip(name.chr_range).collect do |chromosome, range|
238
284
  next if range.nil?
239
285
  [chromosome, range.begin, range.end] * ":"
240
286
  end
@@ -243,62 +289,89 @@ module Gene
243
289
  persist :somatic_snvs
244
290
 
245
291
 
292
# Average, over the gene's articles, of the number of words in each article's
# text that match one of the stemmed terms. Evaluates to 0 when there are
# no articles.
property :literature_score do |terms|
  stems = terms.collect { |term| term.stem }
  articles = self.articles
  if !articles.nil? && !articles.empty?
    matches = articles.inject(0) do |count, article|
      count + article.text.words.select { |word| stems.include? word }.length
    end
    matches.to_f / articles.length
  else
    0
  end
end
persist :literature_score
302
+
303
+
246
304
  property :ihop_interactions => :single do
247
305
  uniprot = self.uniprot
248
- url = "http://ws.bioinfo.cnio.es/iHOP/cgi-bin/getSymbolInteractions?ncbiTaxId=9606&reference=#{uniprot}&namespace=UNIPROT__AC"
249
- doc = Nokogiri::XML(Open.read(url))
250
- sentences = doc.css("iHOPsentence")
251
- sentences
252
- end
306
+ if uniprot.nil?
307
+ nil
308
+ else
309
+ sentences = []
253
310
 
254
- property :tagged_ihop_interactions => :single do
255
- interactors = []
256
- ihop_interactions.each do |sentence|
257
- sentence.css('iHOPatom').collect{|atom|
258
- atom.css('evidence');
259
- }.compact.flatten.each do |evidence|
260
- symbol = evidence.attr('symbol')
261
- taxid = evidence.attr('ncbiTaxId')
262
-
263
- if Organism.entrez_taxids(self.organism).list.include? taxid
264
- interactors << symbol
265
- end
311
+ begin
312
+ url = "http://ws.bioinfo.cnio.es/iHOP/cgi-bin/getSymbolInteractions?ncbiTaxId=9606&reference=#{uniprot}&namespace=UNIPROT__AC"
313
+ doc = Nokogiri::XML(Open.read(url))
314
+ sentences = doc.css("iHOPsentence")
315
+ rescue
266
316
  end
267
- end
268
-
269
- Gene.setup(interactors, "Associated Gene Name", self.organism).organism
270
-
271
- interactors_ensembl = interactors.ensembl
272
317
 
273
- interactors2ensembl = {}
274
- interactors.collect{|i| i}.zip(interactors_ensembl.collect{|i| i}).each do |o,e|
275
- interactors2ensembl[o] = e
318
+ sentences
276
319
  end
320
+ end
277
321
 
278
- ihop_interactions.collect do |sentence|
279
- sentence.css('iHOPatom').each{|atom|
280
- evidences = atom.css('evidence')
281
- symbol = evidences.collect do |evidence|
322
+ property :tagged_ihop_interactions => :single do
323
+ interactors = []
324
+ ihop_interactions = self.ihop_interactions
325
+ if ihop_interactions.nil?
326
+ nil
327
+ else
328
+ ihop_interactions.each do |sentence|
329
+ sentence.css('iHOPatom').collect{|atom|
330
+ atom.css('evidence');
331
+ }.compact.flatten.each do |evidence|
282
332
  symbol = evidence.attr('symbol')
283
333
  taxid = evidence.attr('ncbiTaxId')
284
334
 
285
335
  if Organism.entrez_taxids(self.organism).list.include? taxid
286
- symbol
287
- else
288
- nil
336
+ interactors << symbol
289
337
  end
290
- end.compact.first
338
+ end
339
+ end
291
340
 
292
- evidences.remove
341
+ Gene.setup(interactors, "Associated Gene Name", self.organism).organism
342
+
343
+ interactors_ensembl = interactors.ensembl
293
344
 
294
- if interactors2ensembl.include? symbol and not interactors2ensembl[symbol].nil?
295
- atom.children.remove
296
- interactor = interactors2ensembl[symbol]
297
- atom.replace interactor.respond_to?(:link)? interactor.link : interactor.name
298
- end
299
- }
300
- sentence.to_s
345
+ interactors2ensembl = {}
346
+ interactors.collect{|i| i}.zip(interactors_ensembl.collect{|i| i}).each do |o,e|
347
+ interactors2ensembl[o] = e
348
+ end
349
+
350
+ ihop_interactions.collect do |sentence|
351
+ sentence.css('iHOPatom').each{|atom|
352
+ literal = atom.content()
353
+ evidences = atom.css('evidence')
354
+ symbol = evidences.collect do |evidence|
355
+ symbol = evidence.attr('symbol')
356
+ taxid = evidence.attr('ncbiTaxId')
357
+
358
+ if Organism.entrez_taxids(self.organism).list.include? taxid
359
+ symbol
360
+ else
361
+ nil
362
+ end
363
+ end.compact.first
364
+
365
+ evidences.remove
366
+
367
+ if interactors2ensembl.include? symbol and not interactors2ensembl[symbol].nil?
368
+ atom.children.remove
369
+ interactor = interactors2ensembl[symbol]
370
+ atom.replace interactor.respond_to?(:link)? interactor.link(nil, nil, :html_link_extra_attrs => "title='#{literal}'") : interactor.name
371
+ end
372
+ }
373
+ sentence.to_s
374
+ end
301
375
  end
302
376
  end
303
377
  end
304
-