rbbt-sources 3.0.36 → 3.0.37
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/sources/MSigDB.rb +57 -0
- data/lib/rbbt/sources/uniprot.rb +1 -1
- data/share/install/Organism/organism_helpers.rb +53 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 27d9b524e74122ec28214154aae2646350075b34
|
4
|
+
data.tar.gz: 5a41416cadec47a6a65b5dc0fd15f836d8aaf27c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f7479d80768eecbd609d4bb15907a36eb142d693ce185122fe7c4fe90d8b66cafa10aa0454dabc22db0a6ef4f73500d60752ef6f317b81c495929bfb7756bd2b
|
7
|
+
data.tar.gz: c9227fd124bad413ca05292702b3adcd9fe10bacbbb242022621c64c9fa63cf14ff218158dc93a48b6805e27872fb06e22f24f9cc00555896a12ef7566999037
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
require 'rbbt/sources/organism'
|
4
|
+
|
5
|
+
# Resource module for the MSigDB gene-set collections; its files live under
# 'share/databases/MSigDB'.
module MSigDB
  extend Resource
  self.subdir = 'share/databases/MSigDB'

  # Organism code used as the namespace of the generated gene-set files.
  # Delegates to Organism.default_code; +org+ defaults to "Hsa".
  def self.organism(org="Hsa")
    Organism.default_code(org)
  end

  #self.search_paths = {}
  #self.search_paths[:default] = :lib

  # The archive is not fetched automatically: this claim block only raises,
  # telling the user where to download it from and where to place it.
  MSigDB.claim(MSigDB['.source/all_sets.zip'], :proc) do |filename|
    raise "Download the 'ZIPed file set' from http://software.broadinstitute.org/gsea/downloads.jsp into #{filename}"
  end

  # Unzips the archive into a temporary directory and writes one flat TSV
  # (gene-set ID => gene symbols) for every '*symbols.gmt' file found in it.
  MSigDB.claim(MSigDB.all_sets, :proc) do |dirname|
    archive = MSigDB['.source/all_sets.zip'].produce.find

    TmpFile.with_dir do |workdir|
      Misc.unzip_in_dir(archive, workdir)
      Path.setup(workdir)

      workdir.glob('**/*symbols.gmt').each do |gmt_file|
        # Drop the last four dot-separated parts of the file name and join
        # what remains with underscores to name the output file.
        target_name = File.basename(gmt_file).split(".")[0..-5].join("_")

        dumper = TSV::Dumper.new(
          :key_field => "MSigDB Geneset ID",
          :fields => ["Associated Gene Name"],
          :namespace => MSigDB.organism,
          :type => :flat
        )
        dumper.init

        # Each line is tab separated: set name, url, then the gene symbols.
        # The url column is discarded.
        traversal = TSV.traverse(gmt_file, :type => :array, :into => dumper) do |line|
          set_name, _url, *symbols = line.split("\t")
          [set_name, symbols]
        end

        Open.write(dirname[target_name], traversal.stream)
      end
    end
  end
end
|
39
|
+
|
40
|
+
# The entity is optional: it is only defined when Entity has already been loaded.
if defined?(Entity)

  # Entity for values in the "MSigDB Geneset ID" format.
  module MSigDBGeneSet
    extend Entity
    self.format = "MSigDB Geneset ID"

    # Display name: the identifier lower-cased, with underscores turned into spaces.
    property :name => :single2array do
      downcase.gsub("_",' ')
    end

    # Genes of the set: looked up by identifier in the persisted 'msigdb' TSV
    # under MSigDB.all_sets (loaded once and memoized), then set up as Gene
    # entities in "Associated Gene Name" format and converted with #ensembl.
    property :genes => :single2array do
      @@pathway_genes ||= MSigDB.all_sets.msigdb.tsv(:persist => true)
      symbols = @@pathway_genes[self]
      Gene.setup(symbols, "Associated Gene Name", MSigDB.organism).ensembl
    end
  end
end
|
57
|
+
|
data/lib/rbbt/sources/uniprot.rb
CHANGED
@@ -72,7 +72,7 @@ module UniProt
|
|
72
72
|
def self.get_uniprot_sequence(uniprotids)
|
73
73
|
_array = Array === uniprotids
|
74
74
|
|
75
|
-
uniprotids = [uniprotids] unless
|
75
|
+
uniprotids = [uniprotids] unless _array
|
76
76
|
uniprotids = uniprotids.compact.collect{|id| id}
|
77
77
|
|
78
78
|
result_files = FileCache.cache_online_elements(uniprotids, 'uniprot-sequence-{ID}') do |ids|
|
@@ -48,6 +48,10 @@ $biomart_transcript_biotype = [
|
|
48
48
|
["Ensembl Transcript Biotype", 'transcript_biotype'],
|
49
49
|
]
|
50
50
|
|
51
|
+
# Attribute list passed to BioMart.tsv by the 'transcript_name' task.
# Presumably [TSV field name, BioMart attribute id] pairs, like the sibling
# $biomart_* lists -- confirm against BioMart.tsv.
$biomart_transcript_name = [["Ensembl Transcript Name", 'external_transcript_id']]
|
54
|
+
|
51
55
|
|
52
56
|
$biomart_protein_sequence = [
|
53
57
|
['Protein Sequence','peptide'],
|
@@ -442,6 +446,12 @@ file 'transcript_biotype' do |t|
|
|
442
446
|
Misc.sensiblewrite(t.name, biotype.to_s)
|
443
447
|
end
|
444
448
|
|
449
|
+
# Builds the 'transcript_name' file: queries BioMart with $biomart_ensembl_transcript
# as the main attribute and $biomart_transcript_name as the requested attributes,
# then writes the resulting TSV to the task's target.
file 'transcript_name' do |t|
  tsv_options = { :type => :single, :namespace => Thread.current['namespace'] }
  transcript_names = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_name, [], nil, tsv_options)

  Misc.sensiblewrite(t.name, transcript_names.to_s)
end
|
454
|
+
|
445
455
|
file 'gene_pfam' do |t|
|
446
456
|
pfam = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => Thread.current['namespace'])
|
447
457
|
|
@@ -700,3 +710,46 @@ file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr",
|
|
700
710
|
|
701
711
|
Misc.sensiblewrite(t.name, protein_sequence.to_s)
|
702
712
|
end
|
713
|
+
|
714
|
+
# Builds the 'ensembl2uniprot' file: for every Ensembl protein in
# 'protein_sequence', picks one UniProt/SwissProt accession among those listed
# for it in 'protein_identifiers' -- the one whose sequence length is closest
# to the Ensembl protein sequence length. Proteins without any usable
# accession are skipped.
file 'ensembl2uniprot' => ["protein_sequence", "protein_identifiers"] do |t|
  ensp2unis = TSV.open(File.expand_path('./protein_identifiers'), :key_field => "Ensembl Protein ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true, :unnamed => true)
  dumper = TSV::Dumper.new :key_field => "Ensembl Protein ID", :fields => ["UniProt/SwissProt Accession"], :namespace => Thread.current['namespace'], :type => :single
  dumper.init
  require 'rbbt/sources/uniprot'
  TSV.traverse File.expand_path('./protein_sequence'), :into => dumper, :cpus => 20, :bar => true do |ensp,ensp_seq|
    ensp = ensp.first if Array === ensp
    unis = ensp2unis[ensp]
    next if unis.nil?

    # Drop nil/blank accessions before fetching: get_uniprot_sequence compacts
    # its input, so a nil entry would leave the zip below misaligned, and the
    # 'uniprot2ensembl' task likewise rejects blank accessions.
    unis = unis.compact.reject(&:empty?)
    next if unis.empty?

    uni_seqs = UniProt.get_uniprot_sequence(unis)

    # min_by is a single pass and resolves ties in favour of the first listed
    # accession (sort_by is not stable, so sort_by{}.first was arbitrary on ties).
    # NOTE(review): a missing UniProt sequence is scored as length 0 instead of
    # raising NoMethodError, mirroring how 'uniprot2ensembl' scores a missing
    # Ensembl sequence -- confirm that get_uniprot_sequence can return nil entries.
    best_uni, _best_seq = unis.zip(uni_seqs).min_by do |_uni, uni_seq|
      (ensp_seq.length - uni_seq.to_s.length).abs
    end

    [ensp, best_uni]
  end
  Misc.sensiblewrite(t.name, dumper.stream)
end
|
731
|
+
|
732
|
+
# Builds the 'uniprot2ensembl' file: for every non-blank UniProt/SwissProt
# accession in 'protein_identifiers', picks one Ensembl protein among those
# listed for it -- the one whose sequence length is closest to the UniProt
# sequence length. Accessions without Ensembl proteins are skipped.
file 'uniprot2ensembl' => ["protein_sequence", "protein_identifiers"] do |t|
  uni2ensps = TSV.open(File.expand_path('./protein_identifiers'), :fields => ["Ensembl Protein ID"], :key_field => "UniProt/SwissProt Accession", :type => :flat, :merge => true, :unnamed => true)
  ensp2seq = TSV.open(File.expand_path('./protein_sequence'), :unnamed => true)
  dumper = TSV::Dumper.new :fields => ["Ensembl Protein ID"], :key_field => "UniProt/SwissProt Accession", :namespace => Thread.current['namespace'], :type => :single
  dumper.init
  require 'rbbt/sources/uniprot'
  all_uni = TSV.open(File.expand_path('./protein_identifiers'), :key_field => "UniProt/SwissProt Accession", :fields => [], :type => :double, :merge => true, :unnamed => true).keys.compact.reject(&:empty?)
  TSV.traverse all_uni, :into => dumper, :cpus => 1, :bar => true do |uni|
    uni = uni.first if Array === uni

    # Check for candidates first so accessions without Ensembl proteins do not
    # trigger a UniProt sequence lookup.
    ensps = uni2ensps[uni]
    next if ensps.nil? || ensps.empty?

    # NOTE(review): uni_seq is assumed to be non-nil here, as in the previous
    # version of this task -- confirm what get_uniprot_sequence returns for an
    # accession whose sequence cannot be retrieved.
    uni_seq = UniProt.get_uniprot_sequence(uni)
    uni_length = uni_seq.length

    # min_by is a single pass and resolves ties in favour of the first listed
    # protein. A protein with no known sequence is scored as if its length
    # were 0, i.e. with the full UniProt sequence length.
    best_ensp = ensps.min_by do |ensp|
      ensp_seq = ensp2seq[ensp]
      ensp_seq ? (ensp_seq.length - uni_length).abs : uni_length
    end

    [uni, best_ensp]
  end
  Misc.sensiblewrite(t.name, dumper.stream)
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.36
|
4
|
+
version: 3.0.37
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-11-
|
11
|
+
date: 2015-11-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -92,6 +92,7 @@ files:
|
|
92
92
|
- lib/rbbt/sources/COSTART.rb
|
93
93
|
- lib/rbbt/sources/CTCAE.rb
|
94
94
|
- lib/rbbt/sources/HPRD.rb
|
95
|
+
- lib/rbbt/sources/MSigDB.rb
|
95
96
|
- lib/rbbt/sources/NCI.rb
|
96
97
|
- lib/rbbt/sources/PSI_MI.rb
|
97
98
|
- lib/rbbt/sources/STITCH.rb
|