rbbt-sources 3.0.36 → 3.0.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/sources/MSigDB.rb +57 -0
- data/lib/rbbt/sources/uniprot.rb +1 -1
- data/share/install/Organism/organism_helpers.rb +53 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 27d9b524e74122ec28214154aae2646350075b34
|
4
|
+
data.tar.gz: 5a41416cadec47a6a65b5dc0fd15f836d8aaf27c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f7479d80768eecbd609d4bb15907a36eb142d693ce185122fe7c4fe90d8b66cafa10aa0454dabc22db0a6ef4f73500d60752ef6f317b81c495929bfb7756bd2b
|
7
|
+
data.tar.gz: c9227fd124bad413ca05292702b3adcd9fe10bacbbb242022621c64c9fa63cf14ff218158dc93a48b6805e27872fb06e22f24f9cc00555896a12ef7566999037
|
data/lib/rbbt/sources/MSigDB.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
require 'rbbt/sources/organism'
|
4
|
+
|
5
|
+
module MSigDB
|
6
|
+
extend Resource
|
7
|
+
self.subdir = 'share/databases/MSigDB'
|
8
|
+
|
9
|
+
def self.organism(org="Hsa")
|
10
|
+
Organism.default_code(org)
|
11
|
+
end
|
12
|
+
|
13
|
+
#self.search_paths = {}
|
14
|
+
#self.search_paths[:default] = :lib
|
15
|
+
|
16
|
+
MSigDB.claim MSigDB['.source/all_sets.zip'], :proc do |filename|
|
17
|
+
raise "Download the 'ZIPed file set' from http://software.broadinstitute.org/gsea/downloads.jsp into #{filename}"
|
18
|
+
end
|
19
|
+
|
20
|
+
MSigDB.claim MSigDB.all_sets, :proc do |dirname|
|
21
|
+
zip_file = MSigDB['.source/all_sets.zip'].produce.find
|
22
|
+
TmpFile.with_dir do |tmpdir|
|
23
|
+
Misc.unzip_in_dir(zip_file, tmpdir)
|
24
|
+
Path.setup(tmpdir)
|
25
|
+
tmpdir.glob('**/*symbols.gmt').each do |file|
|
26
|
+
name_parts = File.basename(file).split(".")
|
27
|
+
base_name = name_parts[0..-5] * "_"
|
28
|
+
dumper = TSV::Dumper.new :key_field => "MSigDB Geneset ID", :fields => ["Associated Gene Name"], :namespace => MSigDB.organism, :type => :flat
|
29
|
+
dumper.init
|
30
|
+
io = TSV.traverse file, :type => :array, :into => dumper do |line|
|
31
|
+
name, url, *genes = line.split("\t")
|
32
|
+
[name, genes]
|
33
|
+
end
|
34
|
+
Open.write(dirname[base_name], io.stream)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
if defined? Entity
|
41
|
+
|
42
|
+
module MSigDBGeneSet
|
43
|
+
extend Entity
|
44
|
+
self.format= "MSigDB Geneset ID"
|
45
|
+
|
46
|
+
property :name => :single2array do
|
47
|
+
self.downcase.gsub("_",' ')
|
48
|
+
end
|
49
|
+
|
50
|
+
property :genes => :single2array do
|
51
|
+
@@pathway_genes ||= MSigDB.all_sets.msigdb.tsv :persist => true
|
52
|
+
genes = @@pathway_genes[self]
|
53
|
+
Gene.setup(genes, "Associated Gene Name", MSigDB.organism).ensembl
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
data/lib/rbbt/sources/uniprot.rb
CHANGED
@@ -72,7 +72,7 @@ module UniProt
|
|
72
72
|
def self.get_uniprot_sequence(uniprotids)
|
73
73
|
_array = Array === uniprotids
|
74
74
|
|
75
|
-
uniprotids = [uniprotids] unless
|
75
|
+
uniprotids = [uniprotids] unless _array
|
76
76
|
uniprotids = uniprotids.compact.collect{|id| id}
|
77
77
|
|
78
78
|
result_files = FileCache.cache_online_elements(uniprotids, 'uniprot-sequence-{ID}') do |ids|
|
data/share/install/Organism/organism_helpers.rb
CHANGED
@@ -48,6 +48,10 @@ $biomart_transcript_biotype = [
|
|
48
48
|
["Ensembl Transcript Biotype", 'transcript_biotype'],
|
49
49
|
]
|
50
50
|
|
51
|
+
$biomart_transcript_name = [
|
52
|
+
["Ensembl Transcript Name", 'external_transcript_id'],
|
53
|
+
]
|
54
|
+
|
51
55
|
|
52
56
|
$biomart_protein_sequence = [
|
53
57
|
['Protein Sequence','peptide'],
|
@@ -442,6 +446,12 @@ file 'transcript_biotype' do |t|
|
|
442
446
|
Misc.sensiblewrite(t.name, biotype.to_s)
|
443
447
|
end
|
444
448
|
|
449
|
+
file 'transcript_name' do |t|
|
450
|
+
biotype = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_name, [], nil, :type => :single, :namespace => Thread.current['namespace'])
|
451
|
+
|
452
|
+
Misc.sensiblewrite(t.name, biotype.to_s)
|
453
|
+
end
|
454
|
+
|
445
455
|
file 'gene_pfam' do |t|
|
446
456
|
pfam = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => Thread.current['namespace'])
|
447
457
|
|
@@ -700,3 +710,46 @@ file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr",
|
|
700
710
|
|
701
711
|
Misc.sensiblewrite(t.name, protein_sequence.to_s)
|
702
712
|
end
|
713
|
+
|
714
|
+
file 'ensembl2uniprot' => ["protein_sequence", "protein_identifiers"] do |t|
|
715
|
+
ensp2unis = TSV.open(File.expand_path('./protein_identifiers'), :key_field => "Ensembl Protein ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true, :unnamed => true)
|
716
|
+
dumper = TSV::Dumper.new :key_field => "Ensembl Protein ID", :fields => ["UniProt/SwissProt Accession"], :namespace => Thread.current['namespace'], :type => :single
|
717
|
+
dumper.init
|
718
|
+
require 'rbbt/sources/uniprot'
|
719
|
+
TSV.traverse File.expand_path('./protein_sequence'), :into => dumper, :cpus => 20, :bar => true do |ensp,ensp_seq|
|
720
|
+
ensp = ensp.first if Array === ensp
|
721
|
+
unis = ensp2unis[ensp]
|
722
|
+
next if unis.nil? or unis.empty?
|
723
|
+
uni_seqs = UniProt.get_uniprot_sequence(unis)
|
724
|
+
best_uni = unis.zip(uni_seqs).sort_by do |uni,uni_seq|
|
725
|
+
(ensp_seq.length - uni_seq.length).abs
|
726
|
+
end.first.first
|
727
|
+
[ensp, best_uni]
|
728
|
+
end
|
729
|
+
Misc.sensiblewrite(t.name, dumper.stream)
|
730
|
+
end
|
731
|
+
|
732
|
+
file 'uniprot2ensembl' => ["protein_sequence", "protein_identifiers"] do |t|
|
733
|
+
uni2ensps = TSV.open(File.expand_path('./protein_identifiers'), :fields => ["Ensembl Protein ID"], :key_field => "UniProt/SwissProt Accession", :type => :flat, :merge => true, :unnamed => true)
|
734
|
+
ensp2seq = TSV.open(File.expand_path('./protein_sequence'), :unnamed => true)
|
735
|
+
dumper = TSV::Dumper.new :fields => ["Ensembl Protein ID"], :key_field => "UniProt/SwissProt Accession", :namespace => Thread.current['namespace'], :type => :single
|
736
|
+
dumper.init
|
737
|
+
require 'rbbt/sources/uniprot'
|
738
|
+
all_uni = TSV.open(File.expand_path('./protein_identifiers'), :key_field => "UniProt/SwissProt Accession", :fields => [], :type => :double, :merge => true, :unnamed => true).keys.compact.reject{|u| u.empty?}
|
739
|
+
TSV.traverse all_uni, :into => dumper, :cpus => 1, :bar => true do |uni|
|
740
|
+
uni = uni.first if Array === uni
|
741
|
+
uni_seq = UniProt.get_uniprot_sequence(uni)
|
742
|
+
ensps = uni2ensps[uni]
|
743
|
+
next if ensps.nil? or ensps.empty?
|
744
|
+
best_ensp = ensps.sort_by do |ensp|
|
745
|
+
ensp_seq = ensp2seq[ensp]
|
746
|
+
if ensp_seq
|
747
|
+
(ensp_seq.length - uni_seq.length).abs
|
748
|
+
else
|
749
|
+
uni_seq.length
|
750
|
+
end
|
751
|
+
end.first
|
752
|
+
[uni, best_ensp]
|
753
|
+
end
|
754
|
+
Misc.sensiblewrite(t.name, dumper.stream)
|
755
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.36
|
4
|
+
version: 3.0.37
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-11-
|
11
|
+
date: 2015-11-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -92,6 +92,7 @@ files:
|
|
92
92
|
- lib/rbbt/sources/COSTART.rb
|
93
93
|
- lib/rbbt/sources/CTCAE.rb
|
94
94
|
- lib/rbbt/sources/HPRD.rb
|
95
|
+
- lib/rbbt/sources/MSigDB.rb
|
95
96
|
- lib/rbbt/sources/NCI.rb
|
96
97
|
- lib/rbbt/sources/PSI_MI.rb
|
97
98
|
- lib/rbbt/sources/STITCH.rb
|