DomFun 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ab8f445b1ecc8e416c559fb87b46c853e2cc74e1
4
- data.tar.gz: 7d28a92576ffc5f4816c0062b4702f6195d2e178
3
+ metadata.gz: 8d744b8695b8d35ec3ec38ec0ea52f6f6c7b4bbc
4
+ data.tar.gz: ed2b3da0e0db38bcae175d477bcee641106684e9
5
5
  SHA512:
6
- metadata.gz: 88cbe093ee831669f5bc6037bfafa4d94e1865ab032fc613ae62550b46811d502a43ec7f1c632a81bb622d36923cafc493c400d7ecf98c572434461ad8731dd0
7
- data.tar.gz: 9820f00ac5e9d63b88449bc8a0e47b5c97c7e6541791d24522b0a97ba64363d61df1343f4acf04d65a7036ccd9cfc8529ac5c672cf69b7e3f2eec6994812f736
6
+ metadata.gz: 4be2da53056a56a7edae46c41c3ce84e525e93b602f2469084317ca1a9ee49872e684c3a6135a847f34fabc61133fdff7d767a410d976154eefd9a4389a96d24
7
+ data.tar.gz: 43c5717119062bee38dd29eff45fad55f120ac4158579c54d942e3fc688e16d23a2e9a5332608c4324e7196dbdac877397afaa5febbbad4e860e9a76dde98f5b
@@ -37,7 +37,7 @@ Gem::Specification.new do |spec|
37
37
  spec.require_paths = ["lib"]
38
38
 
39
39
  spec.add_development_dependency "bundler", "~> 2.0"
40
- spec.add_development_dependency "rake", "~> 10.0"
40
+ spec.add_development_dependency "rake", "~> 12.3.1"
41
41
  spec.add_development_dependency "rspec", "~> 3.0"
42
42
 
43
43
  spec.add_dependency "NetAnalyzer", "~> 0.1.5"
@@ -9,14 +9,15 @@
9
9
  REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
10
10
  ROOT_PATH = File.dirname(__FILE__)
11
11
  $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
12
- require 'generalMethods.rb'
12
+ require 'generalMethods'
13
13
  require 'csv'
14
14
  require 'optparse'
15
+ require 'fileutils'
15
16
 
16
17
  ##########################
17
18
  #METHODS
18
19
  ##########################
19
- def build_tripartite_networks(nomenclature_annotations, cath_data, path, protein2gene)
20
+ def build_tripartite_networks(nomenclature_annotations, cath_data, path, protein2gene, translate2gene)
20
21
  records = Hash.new(0)
21
22
  nomenclature_annotations.each do |nomenclature, protein_annotations|
22
23
  annots = []
@@ -24,16 +25,17 @@ def build_tripartite_networks(nomenclature_annotations, cath_data, path, protein
24
25
  protein_annotations.each do |protID, annotations|
25
26
  query_cath_data = cath_data[protID]
26
27
  if !query_cath_data.nil?
27
- #gene_ID = protein2gene[protID] unless protein2gene[protID].nil?
28
-
29
- #gene_ID = protID if gene_ID.nil?
28
+ if !translate2gene
29
+ recordID = protID
30
+ else
31
+ recordID = protein2gene[protID]
32
+ recordID = protID if recordID.nil?
33
+ end
30
34
  annotations.each do |annotation|
31
- #annots << [annotation, gene_ID]
32
- annots << [annotation, protID]
35
+ annots << [annotation, recordID]
33
36
  end
34
37
  query_cath_data.each do |data|
35
- #datas << [data, gene_ID]
36
- datas << [data, protID]
38
+ datas << [data, recordID]
37
39
  end
38
40
  end
39
41
  end
@@ -77,6 +79,11 @@ OptionParser.new do |opts|
77
79
  options[:search_domain] = false
78
80
  end
79
81
 
82
+ options[:output_file] = 'uniprot_translated.txt'
83
+ opts.on("-o", "--output_file PATH", "Output file with UniProt GeneName structure for prediction") do |data|
84
+ options[:output_file] = data
85
+ end
86
+
80
87
  options[:annotation_types] = %w[ kegg reactome go]
81
88
  opts.on("-p", "--annotation_types STRING", "List of annotation types separated by commas") do |data|
82
89
  options[:annotation_types] = data.split(",")
@@ -87,6 +94,11 @@ OptionParser.new do |opts|
87
94
  options[:output_stats] = data
88
95
  end
89
96
 
97
+ options[:translate2gene] = false
98
+ opts.on("-T", "--translate2gene", "Translate proteins to genes") do
99
+ options[:translate2gene] = true
100
+ end
101
+
90
102
  options[:category_type] = 'funfamID'
91
103
  opts.on("-t", "--category_type STRING", "Input category of domains. Options: funfamID, superfamilyID") do |data|
92
104
  options[:category_type] = data
@@ -109,25 +121,24 @@ end.parse!
109
121
  ##########################
110
122
 
111
123
  puts "Loading data..."
112
- cath_data, protein2gene, gene2proteins, cath_proteins_number = load_cath_data(options[:input_domains], options[:category_type])
124
+ cath_data, protein2gene, cath_proteins_number = load_cath_data(options[:input_domains], options[:category_type])
113
125
  nomenclature_annotations, number_of_proteins, proteins_without_annotations = load_proteins_file(options[:input_annotations], options[:annotation_types])
126
+
114
127
  networks_path = nil
115
128
  if options[:category_type] == 'funfamID'
116
129
  networks_path = 'networks/funfam_networks'
117
130
  else
118
131
  networks_path = 'networks/superfamily_networks'
119
132
  end
133
+ FileUtils.mkdir_p networks_path
120
134
  puts "Generating tripartite networks. This can take a while, please wait."
121
- protein_stats = build_tripartite_networks(nomenclature_annotations, cath_data, networks_path, protein2gene)
122
- handler = File.open(options[:output_stats], 'w')
123
- protein_stats.each do |annotation_type, number_of_proteins|
124
- handler.puts "#{annotation_type}\t#{number_of_proteins}"
125
- end
126
- handler.puts "Total of Uniprot proteins\t#{number_of_proteins}"
127
- handler.puts "Total of Uniprot proteins without annotations\t#{proteins_without_annotations.length}"
128
- handler.puts "Total of CATH proteins\t#{cath_proteins_number}"
129
- handler = File.open(options[:unnanotated_proteins], 'w')
130
- proteins_without_annotations.each do |unnanotated_prot|
131
- handler.puts unnanotated_prot
135
+ protein_stats = build_tripartite_networks(nomenclature_annotations, cath_data, networks_path, protein2gene, options[:translate2gene])
136
+ File.open(options[:output_stats], 'w') do |f|
137
+ protein_stats.each do |annotation_type, number_of_proteins|
138
+ f.puts "#{annotation_type}\t#{number_of_proteins}"
139
+ end
140
+ f.puts "Total of Uniprot proteins\t#{number_of_proteins}"
141
+ f.puts "Total of Uniprot proteins without annotations\t#{proteins_without_annotations.length}"
142
+ f.puts "Total of CATH proteins\t#{cath_proteins_number}"
132
143
  end
133
- handler.close
144
+ File.open(options[:unnanotated_proteins], 'w') {|f| f.puts proteins_without_annotations.join("\n")}
@@ -0,0 +1,94 @@
1
+ #! /usr/bin/env ruby
2
+ #Tool for calculating averages between different association values file.
3
+ #File structure: prec rec cut meth
4
+ #Load all files (7) stored in the same directory and calculate average;
5
+ #of lines for each method. Return a file with the same structure;
6
+ #giving name as "average" to the last column
7
+
8
+ require 'optparse'
9
+
10
+ ##########################
11
+ #METHODS
12
+ ##########################
13
+
14
+ def load_association_file(filename)
15
+ fileInfo = []
16
+ header = ''
17
+ line_number = 0
18
+ File.open(filename).each do |line|
19
+ line.chomp!
20
+ if line_number == 0
21
+ header = line
22
+ else
23
+ cut, precision, recall, meth = line.split("\t")
24
+ fileInfo << [cut.to_f, precision.to_f, recall.to_f, meth]
25
+ end
26
+ line_number += 1
27
+ end
28
+ return fileInfo, header
29
+ end
30
+
31
+ def calculate_average(all_files, cols_for_average)
32
+ average = []
33
+ n_files = all_files.length.to_f
34
+ ref_file = all_files.shift
35
+ summatory_file = []
36
+ ref_file.each_with_index do |line, i|
37
+ all_files.each do |file|
38
+ line2 = file[i]
39
+ cols_for_average.each do |col|
40
+ line[col] = line[col] + line2[col]
41
+ end
42
+ end
43
+ summatory_file << line
44
+ end
45
+ summatory_file.each do |line|
46
+ cols_for_average.each do |col|
47
+ line[col] = line[col]/n_files
48
+ end
49
+ average << line
50
+ end
51
+ return average
52
+ end
53
+
54
+
55
+ ##########################
56
+ #OPT-PARSER
57
+ ##########################
58
+
59
+ options = {}
60
+ OptionParser.new do |opts|
61
+ opts.banner = "Usage: #{__FILE__} [options]"
62
+
63
+ options[:file_names] = nil
64
+ opts.on("-f", "--file_names STRING", "Input file names to calculate averages. Please separate names by commas") do |file_names|
65
+ options[:file_names] = file_names.split(',')
66
+ end
67
+
68
+ options[:which_cols] = nil
69
+ opts.on("-c", "--which_cols STRING", "Cols for performing average analysis") do |which_cols|
70
+ options[:which_cols] = which_cols.split(',').map{|i| i.to_i - 1}
71
+ end
72
+
73
+ end.parse!
74
+
75
+ ##########################
76
+ #MAIN
77
+ ##########################
78
+
79
+ all_files = []
80
+ header = nil
81
+ options[:file_names].each do |filename|
82
+ file, header = load_association_file(filename)
83
+ all_files << file
84
+ end
85
+
86
+ average = calculate_average(all_files, options[:which_cols])
87
+
88
+ puts header
89
+ average.each do |line|
90
+ puts line.join("\t")
91
+ end
92
+
93
+
94
+
@@ -11,7 +11,7 @@
11
11
  REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
12
12
  ROOT_PATH = File.dirname(__FILE__)
13
13
  $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'DomFun'))
14
- require 'generalMethods.rb'
14
+ require 'generalMethods'
15
15
  require 'csv'
16
16
  require 'optparse'
17
17
  require "statistics2"
@@ -23,15 +23,24 @@ require 'report_html'
23
23
  #METHODS
24
24
  ##########################
25
25
 
26
- def get_protein_domains(cath_data, protein)
27
- domains_to_predict = nil
28
- unless cath_data[protein].nil?
29
- domains_to_predict = cath_data[protein].uniq
26
+ def get_protein_domains(cath_data, protein, gene2protein, identifier_mode)
27
+ domains_to_predict = []
28
+ if identifier_mode == 'mixed'
29
+ proteins = gene2protein[protein]
30
+ if !proteins.nil?
31
+ proteins.each do |protein|
32
+ domains_to_predict.concat(cath_data[protein])
33
+ end
34
+ else
35
+ domains_to_predict = cath_data[protein]
36
+ end
37
+ else
38
+ domains_to_predict = cath_data[protein]
30
39
  end
31
- return domains_to_predict
40
+ domains_to_predict = [] if domains_to_predict.nil?
41
+ return domains_to_predict.uniq
32
42
  end
33
43
 
34
-
35
44
  def load_domain_to_pathway_association(associations_file, threshold)
36
45
  domain_to_pathway_associations = {}
37
46
  File.open(associations_file).each do |line|
@@ -62,7 +71,6 @@ end
62
71
  def search4function(domains_to_predict, domain_to_pathway_associations)
63
72
  domain_to_function_and_association_value = {}
64
73
  domains_to_predict.each do |domain|
65
- #puts domain
66
74
  associations = domain_to_pathway_associations[domain]
67
75
  if !associations.nil?
68
76
  domain_to_function_and_association_value[domain] = associations
@@ -132,7 +140,6 @@ def scoring_funsys(function_to_domains, domain_annotation_matrix, scoring_system
132
140
  combined_pvalue = Statistics2.chi2_x(sample_length *2, -2*sum)
133
141
  domains_array[i] << combined_pvalue
134
142
  elsif scoring_system == 'harmonic'
135
- #STDERR.puts associations.inspect
136
143
  lns = associations.map{|a| 10 ** -a}
137
144
  inv = lns.map{|n| 1.fdiv(n)}
138
145
  sum = inv.inject(0){|s,x| s + x}
@@ -144,7 +151,6 @@ def scoring_funsys(function_to_domains, domain_annotation_matrix, scoring_system
144
151
  domains_array[i] << combined_z_score
145
152
  elsif scoring_system == 'average'
146
153
  sum = associations.inject(0){|s,x| s + x.abs}.fdiv(associations.length)
147
- #STDERR.puts sum.inspect
148
154
  domains_array[i] << sum
149
155
  elsif scoring_system == 'sum'
150
156
  sum = associations.inject(0){|s,x| s + x.abs}
@@ -158,7 +164,6 @@ def scoring_funsys(function_to_domains, domain_annotation_matrix, scoring_system
158
164
  else
159
165
  function_to_domains.select!{|function, attributes| attributes.last >= pvalue_threshold}
160
166
  end
161
- #STDERR.puts function_to_domains.inspect
162
167
  end
163
168
 
164
169
 
@@ -198,6 +203,11 @@ OptionParser.new do |opts|
198
203
  options[:integration_method] = data
199
204
  end
200
205
 
206
+ options[:identifier_mode] = 'normal'
207
+ opts.on("-I", "--identifier_mode STRING", "Identifier mode: normal or mixed") do |data|
208
+ options[:identifier_mode] = data
209
+ end
210
+
201
211
  options[:output_file] = 'predictions_file.txt'
202
212
  opts.on("-o", "--output_file PATH", "Predictions file") do |data|
203
213
  options[:output_file] = data
@@ -235,7 +245,7 @@ end.parse!
235
245
  ##########################
236
246
 
237
247
  # 1. Load protein domains classification to get domains from proteins to predict
238
- cath_data, protein2gene, gene2proteins, cath_proteins_number = load_cath_data(options[:protein_domains_file], options[:domain_category])
248
+ cath_data, protein2gene, cath_proteins_number = load_cath_data(options[:protein_domains_file], options[:domain_category])
239
249
  # 2. Load protein(s) to predict
240
250
  if File.exist?(options[:proteins_2predict])
241
251
  if !options[:multiple_proteins]
@@ -256,17 +266,16 @@ else
256
266
  end
257
267
  end
258
268
 
259
-
260
269
  # 3. Load domain-FunSys associations
261
270
  domain_to_pathways_associations = load_domain_to_pathway_association(options[:input_associations], options[:association_threshold])
262
271
  # 4. Prediction
263
- handler = File.open(options[:output_file], 'w')
272
+ #handler = File.open(options[:output_file], 'w')
273
+ gene2protein = invert_hash(protein2gene) if options[:identifier_mode] == 'mixed'
264
274
  options[:proteins_2predict].each do |protein|
265
- domains = get_protein_domains(cath_data, protein)
266
- next if domains.nil?
275
+ domains = get_protein_domains(cath_data, protein, gene2protein, options[:identifier_mode])
276
+ next if domains.empty?
267
277
  null_value = 0
268
278
  domain_function_assocValue = search4function(domains, domain_to_pathways_associations)
269
-
270
279
  function_to_domains, association_scores = group_by_function(domain_function_assocValue)
271
280
  annotation_matrix = generate_domain_annotation_matrix(function_to_domains, association_scores, domains, 0)
272
281
 
@@ -281,7 +290,8 @@ options[:proteins_2predict].each do |protein|
281
290
 
282
291
  function_to_domains.each do |funsys, domains_data|
283
292
  score = domains_data.pop
284
- handler.puts "#{protein}\t#{domains_data.join(',')}\t#{funsys}\t#{score}"
293
+ #handler.puts "#{protein}\t#{domains_data.join(',')}\t#{funsys}\t#{score}"
294
+ puts "#{protein}\t#{domains_data.join(',')}\t#{funsys}\t#{score}"
285
295
  end
286
296
  end
287
- handler.close
297
+ #handler.close
@@ -4,6 +4,7 @@ def load_proteins_file(file, annotation_types)
4
4
  annotation_types.each do |type| # initialize annotation hashes
5
5
  protein_annotations[type] = {}
6
6
  end
7
+ fields_to_split = annotation_types.length
7
8
  counter = 0
8
9
  File.open(file).each do |line|
9
10
  line.chomp!
@@ -12,7 +13,7 @@ def load_proteins_file(file, annotation_types)
12
13
  next
13
14
  end
14
15
  line.gsub!(' ', '')
15
- fields = line.split("\t", 4)
16
+ fields = line.split("\t", fields_to_split + 1)
16
17
  protID = fields.shift
17
18
  annotation_types.each_with_index do |type, i|
18
19
  annotations = fields[i].split(/[;,]/)
@@ -28,7 +29,7 @@ def load_proteins_file(file, annotation_types)
28
29
  protein_annotations[type][protID] = annotations
29
30
  end
30
31
  end
31
- if fields.count("") == 3
32
+ if fields.count("") == fields_to_split
32
33
  proteins_without_annotations << protID
33
34
  end
34
35
  end
@@ -37,23 +38,22 @@ def load_proteins_file(file, annotation_types)
37
38
  return protein_annotations, counter, proteins_without_annotations.uniq
38
39
  end
39
40
 
40
- def load_cath_data(file, category, meth='protACC')
41
+ def load_cath_data(file, category, dictionary_key='gene_name')
42
+ if dictionary_key == 'gene_name'
43
+ field = 3
44
+ elsif dictionary_key == 'geneID' # UNIPROT entry_name
45
+ field = 4
46
+ end
41
47
  cath_data = {}
42
- protein2gene = {}
43
- gene2proteins = {}
48
+ protein2gene_dict = {}
44
49
  csv_file = CSV.read(file, { :col_sep => "\t" })
45
50
  csv_file.delete_at(0)
46
51
  csv_file.each do |protein_domains_data|
47
52
  next if protein_domains_data.empty?
48
53
  protein_id = protein_domains_data[0]
49
- if meth == 'protACC'
50
- field = 3
51
- elsif meth == 'geneID'
52
- field = 4
53
- end
54
- gene_name = protein_domains_data[field]
55
- next if gene_name.include?('fusion')
56
- gene_name = gene_name.gsub(' ', '_') if gene_name.include?(' ')
54
+ protein_alternative_name = protein_domains_data[field]
55
+ next if protein_domains_data[3].include?('fusion') # Only can checked in cath gene name field
56
+ protein_alternative_name.gsub!(' ', '_') if protein_alternative_name.include?(' ')
57
57
  superfamilyID = protein_domains_data[5]
58
58
  funfamID = protein_domains_data[6]
59
59
  term2save = nil
@@ -63,16 +63,23 @@ def load_cath_data(file, category, meth='protACC')
63
63
  term2save = funfamID
64
64
  end
65
65
  add_term2dictionary(cath_data, protein_id, term2save)
66
- protein2gene[protein_id] = gene_name if gene_name != 'NULL'
67
- query = gene2proteins[gene_name]
68
- if query.nil?
69
- gene2proteins[gene_name] = [protein_id] if protein_id != 'NULL'
70
- else
71
- query << protein_id if protein_id != 'NULL'
72
- end
66
+ protein2gene_dict[protein_id] = protein_alternative_name if protein_alternative_name != 'NULL'
73
67
  end
74
68
  cath_proteins_number = cath_data.keys.length
75
- return cath_data, protein2gene, gene2proteins, cath_proteins_number
69
+ return cath_data, protein2gene_dict, cath_proteins_number
70
+ end
71
+
72
+ def invert_hash(hash)
73
+ new_hash = {}
74
+ hash.each do |k, v|
75
+ query = new_hash[v]
76
+ if query.nil?
77
+ new_hash[v] = [k]
78
+ else
79
+ query << k
80
+ end
81
+ end
82
+ return new_hash
76
83
  end
77
84
 
78
85
  def add_term2dictionary(dict, key, term)
@@ -1,3 +1,3 @@
1
1
  module DomFun
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.2"
3
3
  end
@@ -0,0 +1,59 @@
1
+ <div class="chunk">
2
+ <style type="text/css" scoped>
3
+ .tooltip {
4
+ position: relative;
5
+ display: inline-block;
6
+ border-bottom: 1px dotted black;
7
+ }
8
+
9
+ .tooltip .tooltiptext {
10
+ visibility: hidden;
11
+ width: 120px;
12
+ background-color: #555;
13
+ color: #fff;
14
+ text-align: center;
15
+ border-radius: 6px;
16
+ padding: 5px 0;
17
+ position: absolute;
18
+ z-index: 1;
19
+ bottom: 125%;
20
+ left: 50%;
21
+ margin-left: -60px;
22
+ opacity: 0;
23
+ transition: opacity 1s;
24
+ }
25
+
26
+ .tooltip .tooltiptext::after {
27
+ content: "";
28
+ position: absolute;
29
+ top: 100%;
30
+ left: 50%;
31
+ margin-left: -5px;
32
+ border-width: 5px;
33
+ border-style: solid;
34
+ border-color: #555 transparent transparent transparent;
35
+ }
36
+
37
+ .tooltip:hover .tooltiptext {
38
+ visibility: visible;
39
+ opacity: 1;
40
+ }
41
+ </style>
42
+ </div>
43
+
44
+ <div>
45
+ <h2 style="text-align: center; background-color:#d6eaf8">Ranked FunSys predicted based on queried protein domains</h2>
46
+ <p>The following table shows the protein domains, the FunSys associated, the association value calculated between the domain and the FunSys and the combined scored. Results are ranked from the highest to the lowest combined score.</p>
47
+ <%=
48
+ if !@hash_vars[:predictions].nil?
49
+ # gene_var = @hash_vars[:genes_with_kegg_data]
50
+ table(id: :predictions, header: true, border: 2,
51
+ cell_align: %w(center, center center center center)) do |data|
52
+ predictions = @hash_vars[:predictions]
53
+ # data.each_with_index do |row, row_number|
54
+ data.unshift(["ProteinID", "Domains", "FunSys", "Association values", "Combined score"])
55
+ end
56
+ end
57
+ %>
58
+
59
+ </div>
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: DomFun
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Elena Rojano, Pedro Seoane
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-11-21 00:00:00.000000000 Z
11
+ date: 2020-06-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -30,14 +30,14 @@ dependencies:
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '10.0'
33
+ version: 12.3.1
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '10.0'
40
+ version: 12.3.1
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -83,6 +83,7 @@ files:
83
83
  - README.md
84
84
  - Rakefile
85
85
  - bin/add_protein_functional_families.rb
86
+ - bin/association_metrics_average.rb
86
87
  - bin/console
87
88
  - bin/domains_to_function_predictor.rb
88
89
  - bin/generate_CAFA2_dataset.rb
@@ -100,6 +101,7 @@ files:
100
101
  - lib/DomFun.rb
101
102
  - lib/DomFun/generalMethods.rb
102
103
  - lib/DomFun/version.rb
104
+ - templates/report_data.erb
103
105
  homepage: https://bitbucket.org/elenarojano/domfun
104
106
  licenses:
105
107
  - MIT