DomFun 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/DomFun.gemspec +44 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +39 -0
- data/Rakefile +6 -0
- data/bin/add_protein_functional_families.rb +133 -0
- data/bin/console +14 -0
- data/bin/domains_to_function_predictor.rb +287 -0
- data/bin/generate_CAFA2_dataset.rb +135 -0
- data/bin/generate_CAFA2_tripartite_network.rb +139 -0
- data/bin/generate_cafa_control.rb +45 -0
- data/bin/get_kegg_pathways.R +12 -0
- data/bin/lines.R +74 -0
- data/bin/merge_pairs.rb +139 -0
- data/bin/normalize_combined_scores.rb +118 -0
- data/bin/prepare_cafa_network.rb +96 -0
- data/bin/setup +8 -0
- data/bin/standardize_scores.R +53 -0
- data/bin/translate_kegg_genes2pathways.rb +98 -0
- data/bin/validate_ProtFunSys_predictions.rb +174 -0
- data/lib/DomFun.rb +6 -0
- data/lib/DomFun/generalMethods.rb +105 -0
- data/lib/DomFun/version.rb +3 -0
- metadata +128 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 87063a5c0ddf9a988af77b9aaa73e2c7c676f2ad
|
4
|
+
data.tar.gz: 725fcefa7c7f464526410e1dcc7f50f6a19adf8f
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 13b5059cf7978c2c8fe8583da5118e46725287f5d90658e7f2574ded9eebf8988d61203fe8c576371ded87f93c64d5ef7de18523c1db72ea0180dde8b9f5b881
|
7
|
+
data.tar.gz: 47a817f6bdffa4efce7624199125afa94ea30d4cdc8e0b2d13f48dd5c6cf25af4e0af0c344988ea3f0ae14a8f8d043168c266d8c72f303267ca45344e315d9aa
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/DomFun.gemspec
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
|
2
|
+
lib = File.expand_path("../lib", __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require "DomFun/version"
|
5
|
+
|
6
|
+
# Gem specification for DomFun: identity, packaged files and dependencies.
Gem::Specification.new do |spec|
  spec.name          = "DomFun"
  spec.version       = DomFun::VERSION
  # One array element per person, so each author and address is listed
  # separately instead of as a single comma-joined entry.
  spec.authors       = ["Elena Rojano", "Pedro Seoane"]
  spec.email         = ["elenarojano@uma.es", "seoanezonjic@hotmail.com"]

  spec.summary       = %q{Tool to predict protein functions based on domains-FunSys associations.}
  spec.description   = %q{From associations calculated between protein domains and functional systems (FunSys), DomFun can predict the functions of proteins looking up domains and the FunSys that have been associated with. The system is validated using data from CAFA.}
  spec.homepage      = "https://github.com/ElenaRojano/DomFun"
  spec.license       = "MIT"

  # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
  # to allow pushing to a single host or delete this section to allow pushing to any host.
  # if spec.respond_to?(:metadata)
  #   spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"

  #   spec.metadata["homepage_uri"] = spec.homepage
  #   spec.metadata["source_code_uri"] = "TODO: Put your gem's public repo URL here."
  #   spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
  # else
  #   raise "RubyGems 2.0 or newer is required to protect against " \
  #     "public gem pushes."
  # end

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files         = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  # NOTE(review): the packaged scripts appear to live under bin/, not exe/, so
  # the executables list below is presumably empty - confirm whether that is intended.
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 2.0"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.0"

  spec.add_dependency "NetAnalyzer", "~> 0.1.5"
end
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2019 elenarojano
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# DomFun
|
2
|
+
|
3
|
+
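DomFun is a tool to predict protein functions based on domain-FunSys associations. From associations calculated between protein domains and functional systems (FunSys), it predicts the functions of a protein by looking up its domains and the FunSys associated with them. The library code lives in `lib/DomFun` and the command-line scripts are in `bin/`. To experiment with the code, run `bin/console` for an interactive prompt.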
|
4
|
+
|
5
|
+
TODO: Delete this and the text above, and describe your gem
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'DomFun'
|
13
|
+
```
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install DomFun
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
TODO: Write usage instructions here
|
26
|
+
|
27
|
+
## Development
|
28
|
+
|
29
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
30
|
+
|
31
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
32
|
+
|
33
|
+
## Contributing
|
34
|
+
|
35
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/ElenaRojano/DomFun.
|
36
|
+
|
37
|
+
## License
|
38
|
+
|
39
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
##########################
|
3
|
+
# Rojano E. & Seoane P., June 2019
|
4
|
+
# Generate tripartite networks with domains-proteins-FunSys data
|
5
|
+
# Protein IDs and FunSys (GO-MF, KEGG and Reactome) from UniProtKB.
|
6
|
+
# Protein domains (Superfamilies and FunFams) from CATH.
|
7
|
+
##########################
|
8
|
+
|
9
|
+
REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
|
10
|
+
ROOT_PATH = File.dirname(__FILE__)
|
11
|
+
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
|
12
|
+
require 'generalMethods.rb'
|
13
|
+
require 'csv'
|
14
|
+
require 'optparse'
|
15
|
+
|
16
|
+
##########################
|
17
|
+
#METHODS
|
18
|
+
##########################
|
19
|
+
# Writes one network file per annotation nomenclature and counts the proteins used.
#
# For every protein that has an entry in +cath_data+, an [annotation, protein]
# edge is collected for each of its annotations and a [domain, protein] edge for
# each of its CATH entries. The de-duplicated annotation edges, followed by the
# de-duplicated domain edges, are written tab-separated to
# "<path>/network_<nomenclature>".
#
# nomenclature_annotations - Hash: nomenclature => { protein ID => annotations }.
# cath_data                - Hash: protein ID => list of domain identifiers.
# path                     - directory the network files are written into.
# protein2gene             - accepted for interface compatibility; not used here.
#
# Returns a Hash (default 0): nomenclature => number of distinct proteins that
# contributed at least one annotation edge.
def build_tripartite_networks(nomenclature_annotations, cath_data, path, protein2gene)
  protein_counts = Hash.new(0)
  nomenclature_annotations.each do |nomenclature, protein_annotations|
    annotation_edges = []
    domain_edges = []
    protein_annotations.each do |prot_id, annotations|
      domains = cath_data[prot_id]
      # Proteins without CATH data contribute nothing to the network.
      next if domains.nil?
      annotations.each { |annotation| annotation_edges.push([annotation, prot_id]) }
      domains.each { |domain| domain_edges.push([domain, prot_id]) }
    end
    protein_counts[nomenclature] += annotation_edges.map { |edge| edge.last }.uniq.length
    File.open(File.join(path, "network_#{nomenclature}"), 'w') do |network|
      (annotation_edges.uniq + domain_edges.uniq).each do |edge|
        network.puts edge.join("\t")
      end
    end
  end
  protein_counts
end
|
52
|
+
|
53
|
+
##########################
|
54
|
+
#OPT-PARSER
|
55
|
+
##########################
|
56
|
+
# Command-line options. The values listed here are the defaults used when the
# corresponding flag is not given on the command line.
options = {
  :input_annotations => nil,
  :calculate_proteins_by_domain => false,
  :input_domains => nil,
  :search_domain => true,
  :annotation_types => %w[kegg reactome go],
  :output_stats => 'uniprot_stats.txt',
  :category_type => 'funfamID',
  :unnanotated_proteins => 'unnanotated_proteins_list.txt'
}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-a", "--input_annotations PATH", "Input file with gene annotations") { |value| options[:input_annotations] = value }

  opts.on("-c", "--calculate_proteins_by_domain", "Calculate the number of proteins that a domain has") { options[:calculate_proteins_by_domain] = true }

  opts.on("-d", "--input_domains PATH", "Input file with protein domains") { |value| options[:input_domains] = value }

  # Giving this flag switches the option OFF (the default is true).
  opts.on("-f", "--search_domain", "Search full protein domains. If false, search funfams") { options[:search_domain] = false }

  opts.on("-p", "--annotation_types STRING", "List of annotation types separated by commas") { |value| options[:annotation_types] = value.split(",") }

  opts.on("-s", "--output_stats PATH", "Output file with UniProt stats") { |value| options[:output_stats] = value }

  opts.on("-t", "--category_type STRING", "Input category of domains. Options: funfamID, superfamilyID") { |value| options[:category_type] = value }

  opts.on("-u", "--unnanotated_proteins PATH", "Output file with unnanotated proteins list") { |value| options[:unnanotated_proteins] = value }

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end.parse!
|
106
|
+
|
107
|
+
##########################
|
108
|
+
#MAIN
|
109
|
+
##########################
|
110
|
+
|
111
|
+
# Main flow: load the CATH and annotation data, write one tripartite network
# per annotation type, then write the stats file and the list of proteins
# without annotations.
puts "Loading data..."
cath_data, protein2gene, gene2proteins, cath_proteins_number = load_cath_data(options[:input_domains], options[:category_type])
nomenclature_annotations, number_of_proteins, proteins_without_annotations = load_proteins_file(options[:input_annotations], options[:annotation_types])
# The output folder depends on the domain category that was requested.
networks_path = if options[:category_type] == 'funfamID'
  'networks/funfam_networks'
else
  'networks/superfamily_networks'
end
puts "Generating tripartite networks. This can take a while, please wait."
protein_stats = build_tripartite_networks(nomenclature_annotations, cath_data, networks_path, protein2gene)
# Block-form File.open closes each output file even if a write fails; the
# stats file was previously left open when its handle variable was reused.
File.open(options[:output_stats], 'w') do |stats|
  protein_stats.each do |annotation_type, annotated_proteins|
    stats.puts "#{annotation_type}\t#{annotated_proteins}"
  end
  stats.puts "Total of Uniprot proteins\t#{number_of_proteins}"
  stats.puts "Total of Uniprot proteins without annotations\t#{proteins_without_annotations.length}"
  stats.puts "Total of CATH proteins\t#{cath_proteins_number}"
end
File.open(options[:unnanotated_proteins], 'w') do |unannotated_list|
  proteins_without_annotations.each do |unnanotated_prot|
    unannotated_list.puts unnanotated_prot
  end
end
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "DomFun"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
# Open an interactive IRB session; "DomFun" has already been required above.
IRB.start(__FILE__)
|
@@ -0,0 +1,287 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
##########################
|
3
|
+
# Rojano E. & Seoane P., June 2019
|
4
|
+
# Domain to functional annotation predictor
|
5
|
+
# Based on domain-annotation association, this predictor can add functions to a group of domains of a protein
|
6
|
+
# It predicts the most likely functions associated with a protein based on its domains.
|
7
|
+
# Protein IDs and FunSys (GO-MF, KEGG and Reactome) from UniProtKB.
|
8
|
+
# Protein domains (Superfamilies and FunFams) from CATH.
|
9
|
+
##########################
|
10
|
+
|
11
|
+
REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
|
12
|
+
ROOT_PATH = File.dirname(__FILE__)
|
13
|
+
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
|
14
|
+
require 'generalMethods.rb'
|
15
|
+
require 'csv'
|
16
|
+
require 'optparse'
|
17
|
+
require "statistics2"
|
18
|
+
require "terminal-table"
|
19
|
+
require 'report_html'
|
20
|
+
|
21
|
+
|
22
|
+
##########################
|
23
|
+
#METHODS
|
24
|
+
##########################
|
25
|
+
|
26
|
+
# Returns the distinct domains stored in +cath_data+ for +protein+,
# or nil when +cath_data+ has no entry for it.
def get_protein_domains(cath_data, protein)
  domains = cath_data[protein]
  return nil if domains.nil?
  domains.uniq
end
|
33
|
+
|
34
|
+
|
35
|
+
# Loads domain-annotation associations from a tab-separated file.
#
# Each line is read as "annotation<TAB>domain<TAB>association value". Rows whose
# value (converted with to_f) is below +threshold+ are discarded. Lines that
# have no domain column (e.g. blank lines) are skipped.
#
# associations_file - path of the file to read.
# threshold         - minimum association value for a row to be kept.
#
# Returns a Hash: domain => array of [annotation, association value] pairs,
# in file order.
def load_domain_to_pathway_association(associations_file, threshold)
  domain_to_pathway_associations = {}
  # File.foreach closes the file when iteration ends.
  File.foreach(associations_file) do |line|
    annotation, domain, association_value = line.chomp.split("\t")
    next if domain.nil?
    association_value = association_value.to_f
    next if association_value < threshold
    (domain_to_pathway_associations[domain] ||= []) << [annotation, association_value]
  end
  domain_to_pathway_associations
end
|
51
|
+
|
52
|
+
# Loads a protein-to-domains table from a tab-separated file.
#
# Each line is read as "protein ID<TAB>comma-separated domains". Lines without
# a domains column (e.g. blank lines) are skipped instead of raising.
#
# domains_file - path of the file to read.
#
# Returns a Hash: protein ID => array of domain identifiers.
def load_domains_to_predict(domains_file)
  domains_to_predict = {}
  # File.foreach closes the file when iteration ends.
  File.foreach(domains_file) do |line|
    protein_id, domains = line.chomp.split("\t")
    next if domains.nil?
    domains_to_predict[protein_id] = domains.split(',')
  end
  domains_to_predict
end
|
61
|
+
|
62
|
+
# Picks, for each domain in +domains_to_predict+, its entry in
# +domain_to_pathway_associations+; domains without an entry are left out.
#
# Returns a Hash: domain => the associations stored for that domain.
def search4function(domains_to_predict, domain_to_pathway_associations)
  domains_to_predict.each_with_object({}) do |domain, matched|
    associations = domain_to_pathway_associations[domain]
    matched[domain] = associations unless associations.nil?
  end
end
|
73
|
+
|
74
|
+
|
75
|
+
# Re-indexes domain => [[annotation, score], ...] data by annotation.
#
# Returns two Hashes:
#   1. annotation => list of domains associated with it (in iteration order)
#   2. annotation => { domain => association score }
def group_by_function(domain_to_function_and_association_value)
  function_to_domains = {}
  association_scores = {}
  domain_to_function_and_association_value.each do |domain, annotations|
    annotations.each do |annotation_id, association_score|
      (function_to_domains[annotation_id] ||= []) << domain
      (association_scores[annotation_id] ||= {})[domain] = association_score
    end
  end
  [function_to_domains, association_scores]
end
|
96
|
+
|
97
|
+
# Builds a function x domain score matrix.
#
# One row is produced per key of +function_to_domains+ (in its iteration order)
# and one column per entry of +domains_to_predict+. A cell holds the score found
# in association_scores[function][domain], or +null_value+ when there is none.
#
# Returns an Array of rows (Arrays).
def generate_domain_annotation_matrix(function_to_domains, association_scores, domains_to_predict, null_value=0)
  function_to_domains.map do |function_id, _domains|
    domains_to_predict.map do |user_domain|
      score = association_scores[function_id][user_domain]
      score.nil? ? null_value : score
    end
  end
end
|
116
|
+
|
117
|
+
# Combines the per-domain association scores of every function into a single
# score, appends it to that function's entry in +function_to_domains+ and then
# drops (in place) the functions that do not pass +pvalue_threshold+.
#
# function_to_domains      - Hash: function => list of domains. MUTATED: the
#                            combined score is appended to each list, and
#                            entries failing the threshold are removed.
# domain_annotation_matrix - one row of scores per function, in the same order
#                            as the entries of +function_to_domains+.
# scoring_system           - 'fisher', 'harmonic', 'stouffer', 'average' or 'sum'.
# freedom_degree           - only 'maxnum' is supported: the sample size is the
#                            largest number of non-null cells found in any row.
# null_value               - value that marks an empty matrix cell.
# pvalue_threshold         - 'fisher'/'harmonic' keep scores <= threshold; the
#                            other methods keep scores >= threshold.
#
# Aborts the program on an unknown freedom degree or scoring system.
# Returns whatever Hash#select! returns (nil when nothing was removed).
def scoring_funsys(function_to_domains, domain_annotation_matrix, scoring_system, freedom_degree='maxnum', null_value=0, pvalue_threshold)
  domains_array = function_to_domains.values
  # Count non-null cells using null_value rather than a hard-coded 0.
  max_cluster_length = domain_annotation_matrix.map { |row| row.count { |cell| cell != null_value } }.max if freedom_degree == 'maxnum'
  domain_annotation_matrix.each_with_index do |associations, i|
    abort("Invalid freedom degree calculation method: #{freedom_degree}") unless freedom_degree == 'maxnum'
    sample_length = max_cluster_length
    combined_score =
      case scoring_system
      when 'fisher'
        # https://en.wikipedia.org/wiki/Fisher%27s_method
        # Scores are handled as -log10(p) values, so ln(p) = -score * ln(10).
        # Computing it directly avoids the underflow of 10 ** -score to 0.0
        # (and a resulting -Infinity logarithm) for large scores.
        ln_sum = associations.inject(0) { |acc, score| acc - score * Math.log(10) }
        Statistics2.chi2_x(sample_length * 2, -2 * ln_sum)
      when 'harmonic'
        pvalues = associations.map { |score| 10 ** -score }
        inverse_sum = pvalues.inject(0) { |acc, pvalue| acc + 1.fdiv(pvalue) }
        associations.length.fdiv(inverse_sum)
      when 'stouffer'
        associations.inject(0) { |acc, score| acc + score } / Math.sqrt(sample_length)
      when 'average'
        associations.inject(0) { |acc, score| acc + score.abs }.fdiv(associations.length)
      when 'sum'
        associations.inject(0) { |acc, score| acc + score.abs }
      else
        abort("Invalid integration method: #{scoring_system}")
      end
    domains_array[i] << combined_score
  end
  if scoring_system == 'fisher' || scoring_system == 'harmonic'
    # These methods yield p-values: smaller is better.
    function_to_domains.select! { |_function, attributes| attributes.last <= pvalue_threshold }
  else
    function_to_domains.select! { |_function, attributes| attributes.last >= pvalue_threshold }
  end
end
|
163
|
+
|
164
|
+
|
165
|
+
# Renders the predictions into an HTML report.
#
# The 'report_data.erb' template is read from REPORT_FOLDER, built with
# Report_html using +predictions+ as the :predictions entry of the container,
# and written to +html_file+.
#
# predictions - data made available to the template.
# html_file   - destination passed to Report_html#write.
def report_data(predictions, html_file)
  container = { :predictions => predictions }
  # File.read opens, reads and closes the template in one call.
  template = File.read(File.join(REPORT_FOLDER, 'report_data.erb'))
  report = Report_html.new(container, 'Protein domains and FunSys predictions summary')
  report.build(template)
  report.write(html_file)
end
|
172
|
+
|
173
|
+
##########################
|
174
|
+
#OPT-PARSER
|
175
|
+
##########################
|
176
|
+
|
177
|
+
# Command-line options. The values listed here are the defaults used when the
# corresponding flag is not given on the command line.
options = {
  :input_associations => nil,
  :domain_category => "superfamilyID",
  :protein_domains_file => nil,
  :integration_method => 'fisher',
  :output_file => 'predictions_file.txt',
  :proteins_2predict => nil,
  :pvalue_threshold => 0.05,
  :association_threshold => 2,
  :multiple_proteins => false
}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-a", "--input_associations PATH", "Domain-function associations") { |value| options[:input_associations] = value }

  opts.on("-c", "--domain_category PATH", "Domain category. Please choose one: superfamilyID or funfamID" ) { |value| options[:domain_category] = value }

  opts.on("-f", "--protein_domains_file PATH", "Input protein-domains file from CATH") { |value| options[:protein_domains_file] = value }

  opts.on("-i", "--integration_method STRING", "Integration method") { |value| options[:integration_method] = value }

  opts.on("-o", "--output_file PATH", "Predictions file") { |value| options[:output_file] = value }

  opts.on("-p", "--proteins_2predict PATH", "Protein to predict. Please use UniProt IDs" ) { |value| options[:proteins_2predict] = value }

  # Both thresholds arrive as text and are stored as floats.
  opts.on("-t", "--pvalue_threshold FLOAT", "P-value threshold") { |value| options[:pvalue_threshold] = value.to_f }

  opts.on("-T", "--association_threshold FLOAT", "Association value threshold") { |value| options[:association_threshold] = value.to_f }

  opts.on("-u", "--multiple_proteins", "Set if multiple profiles") { options[:multiple_proteins] = true }

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end.parse!
|
232
|
+
|
233
|
+
##########################
|
234
|
+
#MAIN
|
235
|
+
##########################
|
236
|
+
|
237
|
+
# 1. Load protein domains classification to get domains from proteins to predict
|
238
|
+
cath_data, protein2gene, gene2proteins, cath_proteins_number = load_cath_data(options[:protein_domains_file], options[:domain_category])
# 2. Load protein(s) to predict
# The value of -p is either a path to a file (one entry per line) or an inline
# list: '|' separates items and, with --multiple_proteins, '!' separates profiles.
# Fail with a clear message instead of a TypeError from File.exist?(nil).
abort("Proteins to predict must be provided with -p/--proteins_2predict") if options[:proteins_2predict].nil?
if File.exist?(options[:proteins_2predict])
  # File.readlines closes the file once it has been read.
  listed_proteins = File.readlines(options[:proteins_2predict]).map { |line| line.chomp }
  # NOTE(review): without --multiple_proteins the whole list is wrapped as ONE
  # element, so the loop below looks up an Array in cath_data - confirm that
  # this matches the keys produced by load_cath_data.
  options[:proteins_2predict] = options[:multiple_proteins] ? listed_proteins : [listed_proteins]
elsif options[:multiple_proteins]
  options[:proteins_2predict] = options[:proteins_2predict].split('!').map { |profile| profile.split('|') }
else
  options[:proteins_2predict] = [options[:proteins_2predict].split('|')]
end

# 3. Load domain-FunSys associations
domain_to_pathways_associations = load_domain_to_pathway_association(options[:input_associations], options[:association_threshold])

# 4. Prediction
# Value used for matrix cells without an association; passed to both the
# matrix builder and the scoring step so the two stay consistent.
null_value = 0
# Block-form File.open closes the predictions file even if a prediction fails.
File.open(options[:output_file], 'w') do |handler|
  options[:proteins_2predict].each do |protein|
    domains = get_protein_domains(cath_data, protein)
    next if domains.nil?
    domain_function_assocValue = search4function(domains, domain_to_pathways_associations)

    function_to_domains, association_scores = group_by_function(domain_function_assocValue)
    annotation_matrix = generate_domain_annotation_matrix(function_to_domains, association_scores, domains, null_value)

    # Appends the combined score to each domain list and filters by threshold.
    scoring_funsys(
      function_to_domains,
      annotation_matrix,
      options[:integration_method],
      'maxnum',
      null_value,
      options[:pvalue_threshold]
    )

    function_to_domains.each do |funsys, domains_data|
      # The last element is the combined score added by scoring_funsys.
      score = domains_data.pop
      handler.puts "#{protein}\t#{domains_data.join(',')}\t#{funsys}\t#{score}"
    end
  end
end
|