rbbt-sources 3.0.36 → 3.0.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6dc886fa1c1e4052a8c5dee351f46e2c2f536e39
4
- data.tar.gz: 7a9722d19761ed23467f5f812ebb18787aeee534
3
+ metadata.gz: 27d9b524e74122ec28214154aae2646350075b34
4
+ data.tar.gz: 5a41416cadec47a6a65b5dc0fd15f836d8aaf27c
5
5
  SHA512:
6
- metadata.gz: 7da4beb09820957334fed2d9bfaa8af9da5b206fd3f4916ca357ab48224eb266c6711b72d920eb83cdb1f9ef2080db9665515e0bfa86ac7c7c557dd427c5f17f
7
- data.tar.gz: 3576bde597dd51ed3af48c20f8bcf46c57a3ae3e240bf38cc1e5e6b4173c0d9557c1375646b1de9534f2a76dac3a32fd692c84f1404c60a961ec00a372c5b98d
6
+ metadata.gz: f7479d80768eecbd609d4bb15907a36eb142d693ce185122fe7c4fe90d8b66cafa10aa0454dabc22db0a6ef4f73500d60752ef6f317b81c495929bfb7756bd2b
7
+ data.tar.gz: c9227fd124bad413ca05292702b3adcd9fe10bacbbb242022621c64c9fa63cf14ff218158dc93a48b6805e27872fb06e22f24f9cc00555896a12ef7566999037
@@ -0,0 +1,57 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/resource'
3
+ require 'rbbt/sources/organism'
4
+
5
# Rbbt resource wrapping the MSigDB gene set collections.
module MSigDB
  extend Resource
  self.subdir = 'share/databases/MSigDB'

  # Organism code for +org+, as reported by Organism.default_code.
  def self.organism(org="Hsa")
    Organism.default_code(org)
  end

  #self.search_paths = {}
  #self.search_paths[:default] = :lib

  # The zipped gene sets are not fetched automatically: producing this file
  # always raises with instructions for downloading it by hand.
  MSigDB.claim MSigDB['.source/all_sets.zip'], :proc do |filename|
    raise "Download the 'ZIPed file set' from http://software.broadinstitute.org/gsea/downloads.jsp into #{filename}"
  end

  # Unzips the source archive into a temporary directory and writes one flat
  # TSV (gene set ID => gene names) per '*symbols.gmt' file found inside it.
  MSigDB.claim MSigDB.all_sets, :proc do |dirname|
    archive = MSigDB['.source/all_sets.zip'].produce.find

    TmpFile.with_dir do |workdir|
      Misc.unzip_in_dir(archive, workdir)
      Path.setup(workdir)

      workdir.glob('**/*symbols.gmt').each do |gmt_file|
        # Drop the last four dot-separated parts of the file name and join
        # whatever remains with "_" to name the output file.
        collection = File.basename(gmt_file).split(".")[0..-5].join("_")

        dumper = TSV::Dumper.new(:key_field => "MSigDB Geneset ID", :fields => ["Associated Gene Name"], :namespace => MSigDB.organism, :type => :flat)
        dumper.init

        # Tab-separated lines: set name, a second column that is discarded,
        # then the genes of the set.
        sets = TSV.traverse gmt_file, :type => :array, :into => dumper do |line|
          set_name, _url, *genes = line.split("\t")
          [set_name, genes]
        end

        Open.write(dirname[collection], sets.stream)
      end
    end
  end
end
39
+
40
if defined? Entity

  # Entity for MSigDB gene set identifiers.
  module MSigDBGeneSet
    extend Entity
    self.format = "MSigDB Geneset ID"

    # Readable name: the lower-cased ID with underscores turned into spaces.
    property :name => :single2array do
      downcase.tr("_", ' ')
    end

    # Genes of the set, looked up in the 'msigdb' table under MSigDB.all_sets
    # (loaded once and kept in @@pathway_genes), set up as Gene entities in
    # "Associated Gene Name" format and then converted with .ensembl.
    property :genes => :single2array do
      @@pathway_genes ||= MSigDB.all_sets.msigdb.tsv :persist => true
      Gene.setup(@@pathway_genes[self], "Associated Gene Name", MSigDB.organism).ensembl
    end
  end
end
57
+
@@ -72,7 +72,7 @@ module UniProt
72
72
  def self.get_uniprot_sequence(uniprotids)
73
73
  _array = Array === uniprotids
74
74
 
75
- uniprotids = [uniprotids] unless Array === uniprotids
75
+ uniprotids = [uniprotids] unless _array
76
76
  uniprotids = uniprotids.compact.collect{|id| id}
77
77
 
78
78
  result_files = FileCache.cache_online_elements(uniprotids, 'uniprot-sequence-{ID}') do |ids|
@@ -48,6 +48,10 @@ $biomart_transcript_biotype = [
48
48
  ["Ensembl Transcript Biotype", 'transcript_biotype'],
49
49
  ]
50
50
 
51
# Field list handed to BioMart.tsv by the 'transcript_name' task.
# Presumably each entry is [TSV field name, BioMart attribute] — confirm against BioMart.tsv.
$biomart_transcript_name = [
  ['Ensembl Transcript Name', 'external_transcript_id']
]
54
+
51
55
 
52
56
  $biomart_protein_sequence = [
53
57
  ['Protein Sequence','peptide'],
@@ -442,6 +446,12 @@ file 'transcript_biotype' do |t|
442
446
  Misc.sensiblewrite(t.name, biotype.to_s)
443
447
  end
444
448
 
449
# Writes the 'transcript_name' file: a :single TSV obtained from BioMart.tsv
# using $biomart_ensembl_transcript and $biomart_transcript_name, namespaced
# with the current thread's 'namespace' value.
file 'transcript_name' do |t|
  transcript_names = BioMart.tsv(
    $biomart_db, $biomart_ensembl_transcript, $biomart_transcript_name, [], nil,
    :type => :single, :namespace => Thread.current['namespace']
  )

  Misc.sensiblewrite(t.name, transcript_names.to_s)
end
454
+
445
455
  file 'gene_pfam' do |t|
446
456
  pfam = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => Thread.current['namespace'])
447
457
 
@@ -700,3 +710,46 @@ file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr",
700
710
 
701
711
  Misc.sensiblewrite(t.name, protein_sequence.to_s)
702
712
  end
713
+
714
# Builds the 'ensembl2uniprot' file: for every Ensembl protein listed in
# 'protein_sequence', keeps the UniProt/SwissProt accession (taken from
# 'protein_identifiers') whose sequence length is closest to the length of the
# Ensembl protein sequence. Proteins without any accession are skipped.
file 'ensembl2uniprot' => ["protein_sequence", "protein_identifiers"] do |t|
  ensp2unis = TSV.open(File.expand_path('./protein_identifiers'), :key_field => "Ensembl Protein ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true, :unnamed => true)
  dumper = TSV::Dumper.new :key_field => "Ensembl Protein ID", :fields => ["UniProt/SwissProt Accession"], :namespace => Thread.current['namespace'], :type => :single
  dumper.init
  require 'rbbt/sources/uniprot'
  TSV.traverse File.expand_path('./protein_sequence'), :into => dumper, :cpus => 20, :bar => true do |ensp,ensp_seq|
    ensp = ensp.first if Array === ensp

    unis = ensp2unis[ensp]
    next if unis.nil?

    # Discard blank accessions before querying UniProt, as the
    # 'uniprot2ensembl' task does; this also keeps the zip below aligned
    # with the sequences that come back.
    unis = unis.compact.reject{|uni| uni.empty? }
    next if unis.empty?

    uni_seqs = UniProt.get_uniprot_sequence(unis)

    # Closest sequence length wins. An accession whose sequence is missing
    # (NOTE(review): presumably a failed or obsolete lookup — confirm what
    # get_uniprot_sequence returns in that case) is ranked as if its
    # sequence were empty instead of raising on nil.length.
    best_uni, _best_seq = unis.zip(uni_seqs).min_by do |uni,uni_seq|
      uni_seq.nil? ? ensp_seq.length : (ensp_seq.length - uni_seq.length).abs
    end

    [ensp, best_uni]
  end
  Misc.sensiblewrite(t.name, dumper.stream)
end
731
+
732
# Builds the 'uniprot2ensembl' file: for every non-blank UniProt/SwissProt
# accession found in 'protein_identifiers', keeps the Ensembl protein whose
# sequence length (from 'protein_sequence') is closest to the length of the
# UniProt sequence.
file 'uniprot2ensembl' => ["protein_sequence", "protein_identifiers"] do |t|
  uni2ensps = TSV.open(File.expand_path('./protein_identifiers'), :fields => ["Ensembl Protein ID"], :key_field => "UniProt/SwissProt Accession", :type => :flat, :merge => true, :unnamed => true)
  ensp2seq = TSV.open(File.expand_path('./protein_sequence'), :unnamed => true)
  dumper = TSV::Dumper.new :fields => ["Ensembl Protein ID"], :key_field => "UniProt/SwissProt Accession", :namespace => Thread.current['namespace'], :type => :single
  dumper.init
  require 'rbbt/sources/uniprot'
  all_uni = TSV.open(File.expand_path('./protein_identifiers'), :key_field => "UniProt/SwissProt Accession", :fields => [], :type => :double, :merge => true, :unnamed => true).keys.compact.reject{|u| u.empty?}
  TSV.traverse all_uni, :into => dumper, :cpus => 1, :bar => true do |uni|
    uni = uni.first if Array === uni

    # Check the local table first so that no UniProt lookup is made for an
    # accession that has no Ensembl proteins to choose from.
    ensps = uni2ensps[uni]
    next if ensps.nil? || ensps.empty?

    uni_seq = UniProt.get_uniprot_sequence(uni)
    # Without a UniProt sequence there is nothing to rank against; skip the
    # accession instead of raising on nil.length.
    # NOTE(review): presumably nil means a failed or obsolete lookup — confirm.
    next if uni_seq.nil?

    # Closest sequence length wins; an Ensembl protein with no known sequence
    # is ranked as if its sequence were empty.
    best_ensp = ensps.min_by do |ensp|
      ensp_seq = ensp2seq[ensp]
      ensp_seq ? (ensp_seq.length - uni_seq.length).abs : uni_seq.length
    end

    [uni, best_ensp]
  end
  Misc.sensiblewrite(t.name, dumper.stream)
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-sources
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.36
4
+ version: 3.0.37
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-11-06 00:00:00.000000000 Z
11
+ date: 2015-11-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -92,6 +92,7 @@ files:
92
92
  - lib/rbbt/sources/COSTART.rb
93
93
  - lib/rbbt/sources/CTCAE.rb
94
94
  - lib/rbbt/sources/HPRD.rb
95
+ - lib/rbbt/sources/MSigDB.rb
95
96
  - lib/rbbt/sources/NCI.rb
96
97
  - lib/rbbt/sources/PSI_MI.rb
97
98
  - lib/rbbt/sources/STITCH.rb