WingenderTFClass 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/README.md +36 -0
- data/Rakefile +10 -0
- data/WingenderTFClass.gemspec +32 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/lib/WingenderTFClass.rb +6 -0
- data/lib/WingenderTFClass/motif_family_recognizer.rb +96 -0
- data/lib/WingenderTFClass/obo/term.rb +110 -0
- data/lib/WingenderTFClass/obo/tf_classification.rb +69 -0
- data/lib/WingenderTFClass/rake/WingenderClassification.rake +64 -0
- data/lib/WingenderTFClass/source_data/TFClass_ontologies_temp.zip +0 -0
- data/lib/WingenderTFClass/source_data/TFOntologies/TFClass_human.obo +41262 -0
- data/lib/WingenderTFClass/source_data/TFOntologies/TFClass_mouse.obo +23595 -0
- data/lib/WingenderTFClass/source_data/uniprot_infos/human.tsv +149031 -0
- data/lib/WingenderTFClass/source_data/uniprot_infos/mouse.tsv +79005 -0
- data/lib/WingenderTFClass/source_data_old/TFClass_ontologies_temp.zip +0 -0
- data/lib/WingenderTFClass/source_data_old/TFOntologies/TFClass_human.obo +41262 -0
- data/lib/WingenderTFClass/source_data_old/TFOntologies/TFClass_mouse.obo +23595 -0
- data/lib/WingenderTFClass/source_data_old/uniprot_ID_to_AC.tsv +226840 -0
- data/lib/WingenderTFClass/uniprot_info.rb +22 -0
- data/lib/WingenderTFClass/version.rb +3 -0
- metadata +111 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a5b5fa62fdeb9e8e624a9e63964f0886d135ddcb
|
4
|
+
data.tar.gz: 11372a9c59c0c627664e01faaab77fc3a212ca3e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e6c05b7b6670ec5b00a78524c00ebd4c0780552ae8c5539ad0f33925bd696e5987a5a4f84ceba4e09ea63f0de2ea907aa7d1f7fcdbde19310c8f3a70e7556694
|
7
|
+
data.tar.gz: 0c2168abdd84f9cc32818c60f1dd8dcc03ba062b10921ed018c142bc585492c670ead58b547ebcdc88f77c3a76348e8d2b8101d45538e6176cea012d48060911
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# WingenderTFClass
|
2
|
+
|
3
|
+
Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/WingenderTFClass`. To experiment with that code, run `bin/console` for an interactive prompt.
|
4
|
+
|
5
|
+
TODO: Delete this and the text above, and describe your gem
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'WingenderTFClass'
|
13
|
+
```
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install WingenderTFClass
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
TODO: Write usage instructions here
|
26
|
+
|
27
|
+
## Development
|
28
|
+
|
29
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
30
|
+
|
31
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
32
|
+
|
33
|
+
## Contributing
|
34
|
+
|
35
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/VorontsovIE/WingenderTFClass.
|
36
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for WingenderTFClass.

# Put the gem's own lib/ directory on the load path so the version file can be required.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'WingenderTFClass/version'

Gem::Specification.new do |gem|
  gem.name          = "WingenderTFClass"
  gem.version       = WingenderTFClass::VERSION
  gem.authors       = ["prijutme4ty"]
  gem.email         = ["prijutme4ty@gmail.com"]

  gem.summary       = %q{Gem to acquire TFClass protein class/family/subfamily/etc for a transcription factor by its UniprotID.}
  gem.description   = %q{This gem allows to retrieve information for a transcription factor from Wingender's TFClass ontology.}
  gem.homepage      = "https://github.com/VorontsovIE/WingenderTFClass"

  # # Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
  # # delete this section to allow pushing this gem to any host.
  # if spec.respond_to?(:metadata)
  #   spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
  # else
  #   raise "RubyGems 2.0 or newer is required to protect against public gem pushes."
  # end

  # Package every git-tracked file except those under test/, spec/ or features/.
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files         = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }
  gem.bindir        = "exe"
  # Executables are the base names of packaged files that live under exe/.
  gem.executables   = gem.files.grep(%r{^exe/}) { |path| File.basename(path) }
  gem.require_paths = ["lib"]

  gem.add_development_dependency "bundler", "~> 1.10"
  gem.add_development_dependency "rake", "~> 10.0"
  gem.add_development_dependency "minitest"
end
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Opens an interactive IRB session with the WingenderTFClass gem already loaded.

require "bundler/setup"
require "WingenderTFClass"

# Fixtures and/or initialization code can be added here to make experimenting
# with the gem easier. A different console may be used as well.

# To use Pry instead, add pry to the Gemfile and uncomment:
# require "pry"
# Pry.start

require "irb"
IRB.start
|
data/bin/setup
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
require_relative 'obo/tf_classification'
|
2
|
+
require_relative 'uniprot_info'
|
3
|
+
|
4
|
+
module WingenderTFClass
  # Absolute paths of the data files shipped with the gem, resolved relative
  # to the directory of this source file.
  module FilePaths
    TFOntologyHuman = File.absolute_path('source_data/TFOntologies/TFClass_human.obo', __dir__)
    TFOntologyMouse = File.absolute_path('source_data/TFOntologies/TFClass_mouse.obo', __dir__)

    UniprotHuman = File.absolute_path('source_data/uniprot_infos/human.tsv', __dir__)
    UniprotMouse = File.absolute_path('source_data/uniprot_infos/mouse.tsv', __dir__)
  end

  # Lookup objects that map UniProt accessions (AC) or UniProt IDs to the
  # roots of the TFClass subtrees (cut at a given depth) that contain them.
  module ProteinFamilyRecognizers
    # Builds a ByUniprotID recognizer.
    #
    # deepness                   - depth at which the ontology is cut into groups
    #                              (passed to TFClassification#tf_groups).
    # tf_classification_filename - path of the OBO ontology file.
    # uniprot_infos_filename     - path of the UniProt info table; it is handed to
    #                              UniprotInfo.uniprot_ac_list_by_id_from_file
    #                              (presumably yielding an ID => [AC, ...] mapping;
    #                              that helper is defined in another file).
    def self.by_uniprot_id(deepness:, tf_classification_filename:, uniprot_infos_filename:)
      tf_classification = OBO::TFClassification.from_file(tf_classification_filename)
      ByUniprotID.new(
        ByUniprotAC.new(tf_classification, deepness),
        UniprotInfo.uniprot_ac_list_by_id_from_file(uniprot_infos_filename)
      )
    end

    # Lazily built, cached recognizers for the bundled human data, keyed by depth.
    HumanAtLevel = Hash.new{|h, deepness|
      h[deepness] = self.by_uniprot_id(
        deepness: deepness,
        tf_classification_filename: FilePaths::TFOntologyHuman,
        uniprot_infos_filename: FilePaths::UniprotHuman,
      )
    }

    # Lazily built, cached recognizers for the bundled mouse data, keyed by depth.
    MouseAtLevel = Hash.new{|h, deepness|
      h[deepness] = self.by_uniprot_id(
        deepness: deepness,
        tf_classification_filename: FilePaths::TFOntologyMouse,
        uniprot_infos_filename: FilePaths::UniprotMouse,
      )
    }

    # Finds subtree roots by UniProt accession (AC).
    class ByUniprotAC
      # tf_classification - object responding to #tf_groups(deepness) and returning
      #                     a hash of group root => nodes of that group, where each
      #                     node responds to #uniprot_ACs.
      # deepness          - depth passed to #tf_groups.
      def initialize(tf_classification, deepness)
        @deepness = deepness
        @tf_classification = tf_classification
      end

      # Memoized hash of group root => nodes of the group.
      def subtree_groups
        @subtree_groups ||= @tf_classification.tf_groups(@deepness)
      end

      # Memoized index: UniProt AC => array of group roots whose nodes mention it.
      private def subtree_root_by_uniprot_ac
        @subtree_root_by_uniprot_ac ||= begin
          result = Hash.new{|h,k| h[k] = [] }

          subtree_groups.each{|group_root, group_leafs|
            group_leafs.flat_map(&:uniprot_ACs).uniq.each{|uniprot_ac|
              result[uniprot_ac] << group_root
            }
          }
          result
        end
      end

      # In most cases Uniprot refers the only leaf, but in some cases it refers several leafs in different subtrees.
      # So we return an array of subfamilies (empty when the AC is unknown).
      def subfamilies_by_uniprot_ac(uniprot_ac)
        # fetch (instead of []) keeps lookups of unknown ACs from inserting
        # empty entries into the memoized index.
        subtree_root_by_uniprot_ac.fetch(uniprot_ac) { [] }
      end

      # Union (without duplicates, in first-seen order) of the subfamilies of several ACs.
      def subfamilies_by_multiple_uniprot_acs(uniprot_acs)
        uniprot_acs.flat_map{|uniprot_ac|
          subfamilies_by_uniprot_ac(uniprot_ac)
        }.uniq
      end
    end

    #########################

    # Finds subtree roots by UniProt ID, translating IDs to ACs first.
    class ByUniprotID
      # motif_family_recognizer_by_uniprot_ac - object responding to
      #                                         #subfamilies_by_multiple_uniprot_acs.
      # uniprot_acs_by_id                     - mapping UniProt ID => list of ACs.
      def initialize(motif_family_recognizer_by_uniprot_ac, uniprot_acs_by_id)
        @motif_family_recognizer_by_uniprot_ac = motif_family_recognizer_by_uniprot_ac
        @uniprot_acs_by_id = uniprot_acs_by_id
      end

      # In most cases Uniprot refers the only leaf, but in some cases it refers several leafs in different subtrees.
      # So we return an array of subfamilies (empty when the ID is unknown).
      def subfamilies_by_uniprot_id(uniprot_id)
        # Array() turns a missing (nil) mapping entry into an empty list instead
        # of failing later with NoMethodError.
        uniprot_acs = Array(@uniprot_acs_by_id[uniprot_id])
        @motif_family_recognizer_by_uniprot_ac.subfamilies_by_multiple_uniprot_acs(uniprot_acs)
      end

      # Union (without duplicates, in first-seen order) of the subfamilies of several IDs.
      def subfamilies_by_multiple_uniprot_ids(uniprot_ids)
        uniprot_ids.flat_map{|uniprot_id|
          subfamilies_by_uniprot_id(uniprot_id)
        }.uniq
      end
    end
  end
end
|
@@ -0,0 +1,110 @@
|
|
1
|
+
module WingenderTFClass
  module OBO
    # One term of an OBO ontology.
    #
    # ontology_tree - the owning tree; must respond to #term, #children and #leaf?.
    # id            - dot-separated term id; a first component of '0' marks an
    #                 unclassified term (see #<=>).
    # parent_id     - id taken from the `is_a:` line ('' when there is none).
    # uniprot_ACs   - accessions collected from `xref: UNIPROT:` lines.
    # other         - all remaining `tag: value` lines, unparsed.
    Term = Struct.new(:ontology_tree, :id, :name, :subset, :definition, :parent_id, :uniprot_ACs, :other) do
      # Builds a term from the lines of one stanza. Lines that do not look like
      # `tag: ...` (e.g. the `[Term]` header, blank lines) are ignored.
      def self.from_line_array(ontology_tree, arr)
        id, name, subset, definition, parent_id = nil, nil, nil, nil, nil
        other = []
        uniprot_ACs = []

        arr.select{|line|
          line.match(/^\w+:/)
        }.each{|line|
          case line
          when /^id:/
            id = line[/^id: (?<data>.+)$/, :data]
          when /^name:/
            name = line[/^name: (?<data>.+)$/, :data]
          when /^subset:/
            subset = line[/^subset: (?<data>.+)$/, :data]
          when /^def:/
            definition = line[/^def: (?<data>.+)$/, :data]
          when /^is_a:/
            parent_id = line[/^is_a: (?<data>.+?) ! .+$/, :data]
          when /^xref: UNIPROT:/
            uniprot_ac = line[/^xref: UNIPROT:(?<data>\w+)\b/, :data]
            # A malformed xref (no accession right after the prefix) yields nil; skip it.
            uniprot_ACs << uniprot_ac if uniprot_ac
          else
            other << line
          end
        }

        # Terms without a parsable `is_a:` line get parent id ''.
        self.new(ontology_tree, id, name, subset, definition, parent_id || '', uniprot_ACs, other)
      end

      # Parent term, looked up in the owning tree (nil when the tree has no such term).
      def parent
        ontology_tree.term(parent_id)
      end

      # Orders terms by id (string comparison), placing unclassified terms
      # (first id component '0') after all classified ones.
      def <=>(other)
        # Compute each flag explicitly: `!a == b` parses as `(!a) == b`,
        # which made the classified-vs-classified case unreachable.
        self_unclassified = (id.split('.').first == '0')
        other_unclassified = (other.id.split('.').first == '0')

        if self_unclassified == other_unclassified # both unclassified or both classified
          id <=> other.id
        elsif self_unclassified # unclassified vs classified
          1
        else # classified vs unclassified
          -1
        end
      end

      # Direct children according to the owning tree.
      def children
        ontology_tree.children(id)
      end

      # True when the owning tree reports no children for this term.
      def leaf?
        ontology_tree.leaf?(id)
      end

      # Number of dot-separated components of the id.
      # It can be different from number of ancestors
      def deepness
        id.split('.').size
      end

      # All leaves below this term; the term itself if it is a leaf.
      def descendant_leafs
        leaf? ? [self] : children.flat_map(&:descendant_leafs)
      end

      # All terms strictly below this one.
      def descendants
        children + children.flat_map(&:descendants)
      end

      # This term followed by all of its descendants.
      def subtree_nodes
        [self] + children.flat_map(&:subtree_nodes)
      end

      # Chain of ancestors, topmost first, excluding the term itself.
      def ancestors
        result = []
        term = self
        while term.parent
          term = term.parent
          result.unshift(term)
        end
        result
      end

      # Human-readable name of the level given by #deepness (nil beyond 6).
      def level_name
        case deepness
        when 0
          'all TFs'
        when 1
          'superclass'
        when 2
          'class'
        when 3
          'family'
        when 4
          'subfamily'
        when 5
          'genus'
        when 6
          'species'
        end
      end

      def to_s
        "#{name}{#{id}}"
      end

      def inspect; to_s; end
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require_relative 'term'
|
2
|
+
module WingenderTFClass
  module OBO
    # In-memory tree of ontology terms, indexed by id, by name and by parent id.
    # A synthetic root term with id '' is always present.
    class TFClassification
      # Creates a tree containing only the synthetic root (id '', subset 'Root',
      # no parent).
      def initialize
        @terms_by_id = {}
        @children_by_id = Hash.new{|h,k| h[k] = [] }
        @terms_by_name = Hash.new{|h,k| h[k] = [] }
        self << Term.new(self, '', '', 'Root', '', nil, [], [])
      end

      # Registers a term in all indexes.
      # Raises RuntimeError when a term with the same id is already present.
      def <<(term)
        raise "Duplicate id #{term.id}" if @terms_by_id[term.id]
        @terms_by_id[term.id] = term
        @terms_by_name[term.name] << term
        @children_by_id[term.parent_id] << term if term.parent_id
      end

      # Reads an OBO file and returns a tree holding one Term per `[Term]` stanza.
      # Header lines before the first stanza and non-term stanzas
      # (`[Typedef]`, `[Instance]`) are skipped.
      def self.from_file(filename)
        tf_ontology = self.new
        terms = File.readlines(filename)
                    .map(&:chomp)
                    .slice_before{|line|
                      # Cut at every stanza header so that a non-term stanza is
                      # not glued onto the term preceding it.
                      line.match(/\A\[(Term|Typedef|Instance)\]/)
                    }
                    .select{|stanza_lines|
                      # Keep term stanzas only. Unlike an unconditional drop(1),
                      # this does not lose the first term of a header-less file.
                      stanza_lines.first.start_with?('[Term]')
                    }
                    .map{|stanza_lines|
                      Term.from_line_array(tf_ontology, stanza_lines)
                    }
        terms.each{|term|
          tf_ontology << term
        }
        tf_ontology
      end

      # Array of terms having the given name (names need not be unique).
      def term_by_name(name)
        @terms_by_name[name]
      end

      # Term with the given id, or nil.
      def term(term_id)
        @terms_by_id[term_id]
      end

      # Array of terms whose parent id is term_id.
      def children(term_id)
        @children_by_id[term_id]
      end

      # The synthetic root term.
      def root
        term('')
      end

      # True when the term has no children.
      # Raises RuntimeError when no term with this id exists.
      def leaf?(term_id)
        raise "Term #{term_id} does not exist" unless @terms_by_id[term_id]
        @children_by_id[term_id].empty?
      end

      # Cuts the tree at slice_deepness: selects each term whose deepness is at
      # least slice_deepness while its parent is missing or shallower than that.
      # Returns a hash of such a term => all nodes of its subtree (itself included).
      def tf_groups(slice_deepness)
        @terms_by_id.each_value.select{|term|
          term.deepness >= slice_deepness && (!term.parent || term.parent.deepness < slice_deepness)
        }.map{|term|
          [term, term.subtree_nodes]
        }.to_h
      end
    end
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# Rake tasks that download the TFClass ontologies and the UniProt
# accession/entry-name tables into source_data/ (paths are relative to the
# directory rake is run from).
require 'rake'
require 'rake/clean'
require 'uri'

task default: 'WingenderTFClass'
desc 'Download Wingender TFClass ontology'
task 'WingenderTFClass' => ['WingenderTFClass:download_tfclass', 'WingenderTFClass:download_uniprot_id_ac_mapping']

desc 'Download Wingender ontology files'
task 'WingenderTFClass:download_tfclass' => ['WingenderTFClass:download_tfclass:human', 'WingenderTFClass:download_tfclass:mouse']
task 'WingenderTFClass:download_tfclass:human' => 'source_data/TFOntologies/TFClass_human.obo'
task 'WingenderTFClass:download_tfclass:mouse' => 'source_data/TFOntologies/TFClass_mouse.obo'

directory 'source_data'
directory 'source_data/uniprot_infos/'
directory 'source_data/TFOntologies/'

file 'source_data/TFClass_ontologies_temp.zip' => 'source_data' do
  sh 'wget', 'http://tfclass.bioinf.med.uni-goettingen.de/suplementary/TFClass_ontologies.zip', '-O', 'source_data/TFClass_ontologies_temp.zip'
end

file 'source_data/TFOntologies/TFClass_human.obo' => ['source_data/TFOntologies/', 'source_data/TFClass_ontologies_temp.zip'] do
  sh 'unzip', 'source_data/TFClass_ontologies_temp.zip', 'TFClass_human.obo', '-d', 'source_data/TFOntologies/'
end

file 'source_data/TFOntologies/TFClass_mouse.obo' => ['source_data/TFOntologies/', 'source_data/TFClass_ontologies_temp.zip'] do
  sh 'unzip', 'source_data/TFClass_ontologies_temp.zip', 'TFClass_mouse.obo', '-d', 'source_data/TFOntologies/'
end


desc 'Download Uniprot ID-AC mapping'
task 'WingenderTFClass:download_uniprot_id_ac_mapping'

{'human' => 'Homo sapiens', 'mouse' => 'Mus musculus'}.each do |organism, organism_official_name|
  uniprot_tsv = "source_data/uniprot_infos/#{organism}.tsv"
  uniprot_tsv_gz = "#{uniprot_tsv}.gz"

  task 'WingenderTFClass:download_uniprot_id_ac_mapping' => uniprot_tsv
  file uniprot_tsv => uniprot_tsv_gz do
    sh 'gzip', '--decompress', uniprot_tsv_gz
  end

  file uniprot_tsv_gz => 'source_data/uniprot_infos/' do
    query = 'organism:"%{organism}"' % {organism: organism_official_name}
    columns = ['id', 'entry name' ] # id - is uniprot_ac; entry_name is uniprot_id. Orwell DB

    options = {
      sort: 'score',
      desc: '',
      compress: 'yes',
      query: query,
      fil: '',
      format: 'tab',
      force: 'yes',
      columns: columns.join(','),
    }
    # The values contain spaces, quotes and a colon, so build the query string
    # with proper form encoding rather than plain concatenation.
    options_str = URI.encode_www_form(options)

    sh 'wget', "http://www.uniprot.org/uniprot/?#{options_str}", '-O', uniprot_tsv_gz
  end

  # Register the files these tasks actually produce for `rake clean` / `rake clobber`.
  CLEAN << uniprot_tsv_gz
  CLOBBER << uniprot_tsv
end

CLEAN << 'source_data/TFClass_ontologies_temp.zip'

CLOBBER << 'source_data/TFOntologies/*'
|