semtools 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +9 -0
- data/LICENSE.txt +21 -0
- data/README.md +43 -0
- data/Rakefile +17 -0
- data/bin/console +14 -0
- data/bin/onto2json.rb +45 -0
- data/bin/setup +8 -0
- data/bin/strsimnet.rb +134 -0
- data/lib/data/hp.obo +152267 -0
- data/lib/data/phenotype_annotation.tab +159504 -0
- data/lib/semtools.rb +8 -0
- data/lib/semtools/math_methods.rb +140 -0
- data/lib/semtools/ontology.rb +2041 -0
- data/lib/semtools/sim_handler.rb +113 -0
- data/lib/semtools/version.rb +3 -0
- data/semtools.gemspec +37 -0
- metadata +113 -0
data/lib/semtools/math_methods.rb
ADDED
@@ -0,0 +1,140 @@
# TODO: Make a pull request to https://rubygems.org/gems/ruby-statistics, with all the statistic code implemented here.
# To compute Fisher's exact test
# Fisher => http://www.biostathandbook.com/fishers.html
def get_fisher_exact_test(listA, listB, all_elements_count, tail = 'two_sided', weigths = nil)
  listA_listB = listA & listB
  listA_nolistB = listA - listB
  nolistA_listB = listB - listA
  if weigths.nil?
    listA_listB_count = listA_listB.length
    listA_nolistB_count = listA_nolistB.length
    nolistA_listB_count = nolistA_listB.length
    nolistA_nolistB_count = all_elements_count - (listA | listB).length
  else
    # Weighted Fisher exact test as proposed in "Improved scoring of functional groups from gene expression data by decorrelating GO graph structure"
    # https://academic.oup.com/bioinformatics/article/22/13/1600/193669
    listA_listB_count = listA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
    listA_nolistB_count = listA_nolistB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
    nolistA_listB_count = nolistA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
    nolistA_nolistB_count = (weigths.keys - (listA | listB)).map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
    all_elements_count = weigths.values.inject(0){|sum, n| sum + n}.ceil
  end
  if tail == 'two_sided'
    accumulated_prob = get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
  elsif tail == 'less'
    accumulated_prob = get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
  end
  return accumulated_prob
end
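Usage sketch (not part of the package): a hypothetical enrichment-style call with invented gene lists, assuming the functions above are loaded.
# Hypothetical example: 10 genes annotated to a term (listA), 8 differentially
# expressed genes (listB), out of a 100-gene background; 5 genes overlap.
listA = (1..10).to_a
listB = (6..13).to_a
background = 100
p get_fisher_exact_test(listA, listB, background)          # two-sided p-value
p get_fisher_exact_test(listA, listB, background, 'less')  # left (depletion) tail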

def get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
  # https://www.sheffield.ac.uk/polopoly_fs/1.43998!/file/tutorial-9-fishers.pdf
  accumulated_prob = 0
  ref_prob = compute_hyper_prob(
    listA_listB_count,
    listA_nolistB_count,
    nolistA_listB_count,
    nolistA_nolistB_count,
    all_elements_count
  )
  accumulated_prob += ref_prob
  [listA_listB_count, nolistA_nolistB_count].min.times do |n| # less
    n += 1
    prob = compute_hyper_prob(
      listA_listB_count - n,
      listA_nolistB_count + n,
      nolistA_listB_count + n,
      nolistA_nolistB_count - n,
      all_elements_count
    )
    prob <= ref_prob ? accumulated_prob += prob : break
  end

  [listA_nolistB_count, nolistA_listB_count].min.times do |n| # greater
    n += 1
    prob = compute_hyper_prob(
      listA_listB_count + n,
      listA_nolistB_count - n,
      nolistA_listB_count - n,
      nolistA_nolistB_count + n,
      all_elements_count
    )
    accumulated_prob += prob if prob <= ref_prob
  end

  return accumulated_prob
end

def get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
  accumulated_prob = 0
  [listA_listB_count, nolistA_nolistB_count].min.times do |n|
    accumulated_prob += compute_hyper_prob(
      listA_listB_count - n,
      listA_nolistB_count + n,
      nolistA_listB_count + n,
      nolistA_nolistB_count - n,
      all_elements_count
    )
  end
  return accumulated_prob
end

def compute_hyper_prob(a, b, c, d, n)
  # https://en.wikipedia.org/wiki/Fisher%27s_exact_test
  binomA = binom(a + b, a)
  binomC = binom(c + d, c)
  divisor = binom(n, a + c)
  return (binomA * binomC).fdiv(divisor)
end

def binom(n, k)
  if k > 0 && k < n
    res = (1 + n - k..n).inject(:*) / (1..k).inject(:*)
  else
    res = 1
  end
end
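As a sanity check (not part of the package), the point probability of a single 2x2 table matches the classic tea-tasting example from the Wikipedia page cited above: C(4,1) * C(12,3) / C(16,4) = 880/1820.
# Table [[1, 3], [3, 9]], n = 16
p compute_hyper_prob(1, 3, 3, 9, 16) # => ~0.4835 (880/1820)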

# To compute adjusted p-values
# https://rosettacode.org/wiki/P-value_correction#Ruby
def get_benjaminiHochberg_pvalues(arr_pvalues)
  n = arr_pvalues.length
  arr_o = order(arr_pvalues, true)
  arr_cummin_input = []
  (0..(n - 1)).each do |i|
    arr_cummin_input[i] = (n / (n - i).to_f) * arr_pvalues[arr_o[i]]
  end
  arr_ro = order(arr_o)
  arr_cummin = cummin(arr_cummin_input)
  arr_pmin = pmin(arr_cummin)
  return arr_pmin.values_at(*arr_ro)
end

def order(array, decreasing = false)
  if decreasing == false
    array.sort.map { |n| array.index(n) }
  else
    array.sort.map { |n| array.index(n) }.reverse
  end
end

def cummin(array)
  cumulative_min = array.first
  arr_cummin = []
  array.each do |p|
    cumulative_min = [p, cumulative_min].min
    arr_cummin << cumulative_min
  end
  return arr_cummin
end

def pmin(array)
  x = 1
  pmin_array = []
  array.each_index do |i|
    pmin_array[i] = [array[i], x].min
    abort if pmin_array[i] > 1
  end
  return pmin_array
end
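Usage sketch (not part of the package): Benjamini-Hochberg adjustment of a small batch of raw p-values. Note that order() relies on Array#index, so exactly tied p-values share the first occurrence's position.
raw_pvalues = [0.005, 0.03, 0.02, 0.5]
p get_benjaminiHochberg_pvalues(raw_pvalues) # => [0.02, 0.04, 0.04, 0.5]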
data/lib/semtools/ontology.rb
ADDED
@@ -0,0 +1,2041 @@
require 'json'


class Ontology
  #########################################################
  # AUTHOR NOTES
  #########################################################

  # 1 - Store @profiles as @stanzas[:instances]
  # 2 - Item values (not keys) are imported as strings, not as symbols (maybe add a flag which indicates if values are, or not, symbols?)


  #############################################
  # FIELDS
  #############################################
  # Handled class variables
  # => @@basic_tags :: hash with main OBO structure tags
  # => @@allowed_calcs :: hash with allowed IC and similarity calcs
  # => @@symbolizable_ids :: tags which can be symbolized
  # => @@tags_with_trailing_modifiers :: tags which can include extra info after specific text modifiers
  #
  # Handled object variables
  # => @header :: file header (if available)
  # => @stanzas :: OBO stanzas {:terms, :typedefs, :instances}
  # => @ancestors_index :: hash of ancestors per term, handled with any structure relationships
  # => @descendants_index :: hash of descendants per term, handled with any structure relationships
  # => @alternatives_index :: hash of alternative IDs (includes alt_id and obsoletes)
  # => @obsoletes_index :: hash of obsoletes and their new IDs
  # => @special_tags :: set of special tags to be expanded (:is_a, :obsolete, :alt_id)
  # => @structureType :: type of ontology structure depending on ancestors relationship. Allowed: {atomic, sparse, circular, hierarchical}
  # => @ics :: already calculated ICs for handled terms and IC types
  # => @meta :: meta-information about handled terms like [ancestors, descendants, struct_freq, observed_freq]
  # => @max_freqs :: maximum freqs found for structural and observed freqs
  # => @dicts :: bidirectional dictionaries with three levels <key|value>: 1) <tag|hash2>; 2) <(:byTerm/:byValue)|hash3>; 3) dictionary <k|v>
  # => @profiles :: set of terms assigned to an ID
  # => @profilesDict :: set of profile IDs assigned to a term
  # => @items :: hash with item relations to terms
  # => @removable_terms :: array of terms not to be considered
  # => @term_paths :: meta-info about parental paths of each term

  @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:alt_id, :replaced_by, :consider]}
  @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
  @@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider]
  @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
  @@multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider, :subsetdef, :synonymtypedef, :property_value, :remark]
  @@symbolizable_ids.concat(@@tags_with_trailing_modifiers)

  #############################################
  # CONSTRUCTOR
  #############################################

  # Instantiate an OBO_Handler object
  # ===== Parameters
  # +file+:: with info to be loaded (.obo ; .json)
  # +load_file+:: activate load process automatically (only for .obo)
  # +removable_terms+:: terms to be removed from calcs
  def initialize(file: nil, load_file: false, removable_terms: [])
    # Initialize object variables
    @header = nil
    @stanzas = {terms: {}, typedefs: {}, instances: {}}
    @ancestors_index = {}
    @descendants_index = {}
    @alternatives_index = {}
    @obsoletes_index = {}
    @structureType = nil
    @ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
    @meta = {}
    @special_tags = @@basic_tags.clone
    @max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
    @dicts = {}
    @profiles = {}
    @profilesDict = {}
    @items = {}
    @removable_terms = []
    @term_paths = {}
    # Load if applicable
    add_removable_terms(removable_terms) if !removable_terms.empty?
    load(file) if load_file
  end
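Usage sketch (not part of the package; the path below is illustrative and depends on where the gem is installed): loading the bundled Human Phenotype Ontology and checking a term.
require 'semtools'

hpo_path = File.join('lib', 'data', 'hp.obo') # hypothetical location of the bundled file
onto = Ontology.new(file: hpo_path, load_file: true)
puts onto.exists?(:'HP:0000001') # => true when the HPO root term is present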


  #############################################
  # CLASS METHODS
  #############################################

  # Expand a (starting) term using a specific tag and return all extended terms into an array and
  # the relationship structure observed (hierarchical or circular). If a circular structure is
  # found, the extended array will be a unique vector without the starting term (no loops).
  # +Note+: we strongly recommend using the get_related_ids_by_tag function instead of calling this one directly
  # ===== Parameters
  # +start_id+:: term to start expanding from
  # +terms+:: set to be used to expand
  # +target_tag+:: tag used to expand
  # +related_ids+:: already expanded info
  # +alt_ids+:: set of alternative IDs
  # ===== Returns
  # A vector with the observed structure (string) and the array with extended terms.
  def self.get_related_ids(start_id, terms, target_tag, related_ids = {}, alt_ids = {})
    # Take start_id term available info and already accumulated info
    current_associations = related_ids[start_id]
    current_associations = [] if current_associations.nil?
    return [:no_term, []] if terms[start_id].nil?
    id_relations = terms[start_id][target_tag]
    return [:source, []] if id_relations.nil?

    # Prepare auxiliary variables
    struct = :hierarchical

    # Study direct extensions
    id_relations = id_relations.clone
    while id_relations.length > 0
      id = id_relations.shift
      id = alt_ids[id].first if alt_ids.include?(id) # NOTE: if you want to persist the current ID instead of the source ID, re-implement this

      # Handle
      if current_associations.include?(id) # Check if it has already been included into this expansion
        struct = :circular
      else
        current_associations << id
        if related_ids.include?(id) # Check if the current ID has already been expanded
          current_associations = current_associations | related_ids[id]
          if current_associations.include?(start_id) # Check circular case
            struct = :circular
            [id, start_id].each{|repeated| current_associations.delete(repeated)}
          end
        else # Expand
          related_ids[start_id] = current_associations
          structExp, current_related_ids = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids) # Expand current
          current_associations = current_associations | current_related_ids
          struct = :circular if structExp == :circular # Check struct
          if current_associations.include?(start_id) # Check circular case
            struct = :circular
            current_associations.delete(start_id)
          end
        end
      end
    end
    related_ids[start_id] = current_associations

    return struct, current_associations
  end


  # Expand terms using a specific tag and return all extended terms into an array and
  # the relationship structure observed (hierarchical or circular). If a circular structure is
  # found, the extended array will be a unique vector without the starting term (no loops)
  # ===== Parameters
  # +terms+:: set to be used to expand
  # +target_tag+:: tag used to expand
  # +alt_ids+:: set of alternative IDs
  # +obsoletes+:: integer with the number of obsolete IDs, used to calculate the structure type.
  # ===== Returns
  # A vector with the observed structure (string) and the hash with extended terms
  def self.get_related_ids_by_tag(terms:, target_tag:, alt_ids: {}, obsoletes: 0)
    # Define structure type
    structType = :hierarchical
    related_ids = {}
    terms.each do |id, tags|
      # Check if target tag is defined
      if !tags[target_tag].nil?
        # Obtain related terms
        set_structure, _ = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids)
        # Check structure
        structType = :circular if set_structure == :circular
      end
    end

    # Check special cases
    structType = :atomic if related_ids.length <= 0
    structType = :sparse if related_ids.length > 0 && ((terms.length - related_ids.length - obsoletes) >= 2)
    # Return type and hash with related_ids
    return structType, related_ids
  end


  # Class method to transform a string with <tag : info> into a hash structure
  # ===== Parameters
  # +attributes+:: array of tuples with info to be transformed into hash format
  # ===== Returns
  # Attributes stored into a hash structure
  def self.info2hash(attributes, split_char = " ! ", selected_field = 0)
    # Load info
    info_hash = {}
    # Only TERMS multivalue tags (future: add Typedefs and Instances)
    # multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider]
    attributes.each do |tag, value|
      # Check
      raise EncodingError, 'Info element incorrect format' if (tag.nil?) || (value.nil?)
      # Prepare
      tag = tag.lstrip.to_sym
      value.lstrip!
      value = value.split(split_char)[selected_field].to_sym if @@tags_with_trailing_modifiers.include?(tag)

      # Store
      query = info_hash[tag]
      if !query.nil? # Tag already exists
        if !query.kind_of?(Array) # Check that tag is multivalue
          raise('Attempt to concatenate plain text with another. The tag is not declared as multivalue. [' + tag.to_s + '](' + query + ')')
        else
          query << value # Add new value to tag
        end
      else # New entry
        if @@multivalue_tags.include?(tag)
          info_hash[tag] = [value]
        else
          info_hash[tag] = value
        end
      end
    end
    self.symbolize_ids(info_hash)
    return info_hash
  end
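Illustrative sketch (not part of the package) of the hash produced for a small Term stanza; note how the trailing ' ! ' modifier is stripped from is_a and how symbolizable tags end up as symbols.
tuples = [['id', ' HP:0000002'], ['name', ' Abnormality of body height'], ['is_a', ' HP:0001507 ! Growth abnormality']]
p Ontology.info2hash(tuples)
# => {:id=>:"HP:0000002", :name=>"Abnormality of body height", :is_a=>[:"HP:0001507"]}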


  # Class method to load an OBO format file (based on OBO 1.4 format). Specially focused on loading
  # the Header, the Terms, the Typedefs and the Instances.
  # ===== Parameters
  # +file+:: OBO file to be loaded
  # ===== Returns
  # Hash with FILE, HEADER and STANZAS info
  def self.load_obo(file) # TODO: Send to obo_parser class
    raise("File is not defined") if file.nil?
    # Data variables
    header = ''
    stanzas = {terms: {}, typedefs: {}, instances: {}}
    # Auxiliary variables
    infoType = 'Header'
    currInfo = []
    stanzas_flags = %w[[Term] [Typedef] [Instance]]
    # Read file
    File.open(file).each do |line|
      line.chomp!
      next if line.empty?
      fields = line.split(':', 2)
      # Check if a new instance is found
      if stanzas_flags.include?(line)
        header = self.process_entity(header, infoType, stanzas, currInfo)
        # Update info variables
        currInfo = []
        infoType = line.gsub!(/[\[\]]/, '')
        next
      end
      # Concat info
      currInfo << fields
    end
    # Store last loaded info
    header = self.process_entity(header, infoType, stanzas, currInfo) if !currInfo.empty?

    # Prepare to return
    finfo = {:file => file, :name => File.basename(file, File.extname(file))}
    return finfo, header, stanzas
  end


  # Handle OBO loaded info and store it into the correct container and format
  # ===== Parameters
  # +header+:: container
  # +infoType+:: current ontology item type detected
  # +stanzas+:: container
  # +currInfo+:: info to be stored
  # ===== Returns
  # header newly/already stored
  def self.process_entity(header, infoType, stanzas, currInfo)
    info = self.info2hash(currInfo)
    # Store current info
    if infoType.eql?('Header')
      header = info
    else
      id = info[:id]
      case infoType
      when 'Term'
        stanzas[:terms][id] = info
      when 'Typedef'
        stanzas[:typedefs][id] = info
      when 'Instance'
        stanzas[:instances][id] = info
      end
    end
    return header
  end


  # Symbolize all values in hashes using symbolizable tags as keys
  # ===== Parameters
  # +item_hash+:: hash to be checked
  def self.symbolize_ids(item_hash)
    @@symbolizable_ids.each do |tag|
      query = item_hash[tag]
      if !query.nil?
        if query.kind_of?(Array)
          query.map!{|item| item.to_sym}
        else
          item_hash[tag] = query.to_sym if !query.nil?
        end
      end
    end
  end


  # Cut an ontology around a given root term
  # ===== Parameters
  # +root+:: main term to expand
  # +ontology+:: to be cut
  # +clone+:: if true, the given ontology object will not be mutated
  # +remove_up+:: if true, stores only the root term given and its descendants. If false, only root ancestors will be stored
  # ===== Returns
  # An Ontology object with the terms left after cutting the ontology.
  def self.mutate(root, ontology, clone: true, remove_up: true)
    ontology = ontology.clone if clone
    # Obtain affected IDs
    descendants = ontology.descendants_index[root]
    descendants << root # Store itself to avoid removing it
    # Remove unnecessary terms
    ontology.stanzas[:terms] = ontology.stanzas[:terms].select{|id, v| remove_up ? descendants.include?(id) : !descendants.include?(id)}
    ontology.ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
    ontology.max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
    ontology.dicts = {}
    ontology.removable_terms = []
    ontology.term_paths = {}
    # Recalculate metadata
    ontology.build_index
    ontology.add_observed_terms_from_profiles
    # Finish
    return ontology
  end



  #############################################
  # GENERAL METHODS
  #############################################

  # Include removable terms into the current removable terms list
  # ===== Parameters
  # +terms+:: terms array to be concatenated
  def add_removable_terms(terms)
    terms = terms.map{|term| term.to_sym}
    @removable_terms.concat(terms)
  end


  # Include removable terms into the current removable terms list, loading the new
  # terms from a one-column plain text file
  # ===== Parameters
  # +file+:: to be loaded
  def add_removable_terms_from_file(file)
    File.open(file).each do |line|
      line.chomp!
      @removable_terms << line.to_sym
    end
  end


  # Increase the observed frequency for a specific term
  # ===== Parameters
  # +term+:: term whose frequency is going to be increased
  # +increase+:: frequency rate to be increased. Default = 1
  # ===== Return
  # true if the process ends without errors, false in other cases
  def add_observed_term(term:, increase: 1.0)
    # Check
    raise ArgumentError, "Term given is NIL" if term.nil?
    return false unless @stanzas[:terms].include?(term)
    return false if @removable_terms.include?(term)
    if @alternatives_index.include?(term)
      alt_id = @alternatives_index[term]
      @meta[alt_id] = {:ancestors => -1.0, :descendants => -1.0, :struct_freq => 0.0, :observed_freq => 0.0} if @meta[alt_id].nil?
      @meta[term] = @meta[alt_id]
    end
    # Check if it exists
    @meta[term] = {:ancestors => -1.0, :descendants => -1.0, :struct_freq => 0.0, :observed_freq => 0.0} if @meta[term].nil?
    # Add frequency
    @meta[term][:observed_freq] = 0 if @meta[term][:observed_freq] == -1
    @meta[term][:observed_freq] += increase
    # Check maximum frequency
    @max_freqs[:observed_freq] = @meta[term][:observed_freq] if @max_freqs[:observed_freq] < @meta[term][:observed_freq]
    return true
  end


  # Increase the arbitrary frequency of a given term set
  # ===== Parameters
  # +terms+:: set of terms to be updated
  # +increase+:: amount to be increased
  # +transform_to_sym+:: if true, transform observed terms to symbols. Default: false
  # ===== Return
  # true if the process ends without errors and false in other cases
  def add_observed_terms(terms:, increase: 1.0, transform_to_sym: false)
    # Check
    raise ArgumentError, 'Terms array given is NIL' if terms.nil?
    raise ArgumentError, 'Terms given is not an array' if !terms.is_a? Array
    # Add observations
    if transform_to_sym
      checks = terms.map{|id| self.add_observed_term(term: id.to_sym, increase: increase)}
    else
      checks = terms.map{|id| self.add_observed_term(term: id, increase: increase)}
    end
    return checks
  end


  # Compare two term sets
  # ===== Parameters
  # +termsA+:: set to be compared
  # +termsB+:: set to be compared
  # +sim_type+:: similarity method to be used. Default: resnik
  # +ic_type+:: IC type to be used. Default: resnik
  # +bidirectional+:: calculate bidirectional similarity. Default: true
  # ===== Return
  # similarity calculated
  def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
    # Check
    raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
    raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
    micasA = []
    # Compare A -> B
    termsA.each do |tA|
      micas = termsB.map{|tB| self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)}
      # Remove special cases
      [false, nil].each{|err_value| micas.delete(err_value)}
      # Obtain maximum value
      micasA << micas.max if micas.length > 0
      micasA << 0 if micas.length <= 0
    end
    means_sim = micasA.inject{ |sum, el| sum + el }.to_f / micasA.size
    # Compare B -> A
    if bidirectional
      means_simA = means_sim * micasA.size
      means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false) * termsB.size
      means_sim = (means_simA + means_simB) / (termsA.size + termsB.size)
    end
    # Return
    return means_sim
  end
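Usage sketch (not part of the package; term IDs are illustrative HPO identifiers and onto is assumed loaded as above): scoring two phenotype profiles with a bidirectional best-match mean.
profileA = [:'HP:0000002', :'HP:0001507']
profileB = [:'HP:0001507']
puts onto.compare(profileA, profileB, sim_type: :lin, ic_type: :seco)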


  # Compare internally stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with themselves
  # ===== Parameters
  # +external_profiles+:: set of external profiles. If nil, internal profiles will be compared with themselves
  # +sim_type+:: similarity method to be used. Default: resnik
  # +ic_type+:: IC type to be used. Default: resnik
  # +bidirectional+:: calculate bidirectional similarity. Default: true
  # ===== Return
  # Similarities calculated
  def compare_profiles(external_profiles: nil, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
    profiles_similarity = {} # calculate similarity between patient profiles
    profiles_ids = @profiles.keys
    if external_profiles.nil?
      comp_ids = profiles_ids
      comp_profiles = @profiles
      main_ids = comp_ids
      main_profiles = comp_profiles
    else
      comp_ids = external_profiles.keys
      comp_profiles = external_profiles
      main_ids = profiles_ids
      main_profiles = @profiles
    end
    # Compare
    while !main_ids.empty?
      curr_id = main_ids.shift
      current_profile = main_profiles[curr_id]
      comp_ids.each do |id|
        profile = comp_profiles[id]
        value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional)
        query = profiles_similarity[curr_id]
        if query.nil?
          profiles_similarity[curr_id] = {id => value}
        else
          query[id] = value
        end
      end
    end
    return profiles_similarity
  end


  # Expand alternative IDs around all already stored terms
  # ===== Parameters
  # +alt_tag+:: tag used to expand alternative IDs
  # ===== Returns
  # true if the process ends without errors and false in other cases
  def get_index_alternatives(alt_tag: @@basic_tags[:alternative][0])
    # Check input
    raise('stanzas terms empty') if @stanzas[:terms].empty?
    # Take all alternative IDs
    alt_ids2add = {}
    @stanzas[:terms].each do |id, tags|
      alt_ids = tags[alt_tag]
      if !alt_ids.nil?
        alt_ids = alt_ids - @removable_terms
        # Update info
        alt_ids.each do |alt_term|
          @alternatives_index[alt_term] = id
          alt_ids2add[alt_term] = @stanzas[:terms][id] if !@stanzas[:terms].include?(alt_term)
          @ancestors_index[alt_term] = @ancestors_index[id] if !@ancestors_index[id].nil?
        end
      end
    end
    @stanzas[:terms].merge!(alt_ids2add)
  end


  # Executes basic expansions of tags (alternatives, obsoletes and parentals) with default values
  # ===== Returns
  # true if the process ends without errors and false in other cases
  def build_index()
    self.get_index_alternatives
    self.get_index_obsoletes
    self.get_index_child_parent_relations
    @alternatives_index.map{|k, v| @alternatives_index[k] = self.extract_id(v)}
    @alternatives_index.compact!
    @obsoletes_index.map{|k, v| @obsoletes_index[k] = self.extract_id(v)}
    @obsoletes_index.compact!
    @ancestors_index.map{|k, v| @ancestors_index[k] = v.map{|t| self.extract_id(t)}.compact}
    @ancestors_index.compact!
    @descendants_index.map{|k, v| @descendants_index[k] = v.map{|t| self.extract_id(t)}.compact}
    @descendants_index.compact!
    self.get_index_frequencies
    self.calc_dictionary(:name)
    self.calc_dictionary(:synonym, select_regex: /\"(.*)\"/)
    self.calc_term_levels(calc_paths: true)
  end


  # Calculates regular frequencies based on ontology structure (using parentals)
  # ===== Returns
  # true if everything ends without errors and false in other cases
  def get_index_frequencies()
    # Check
    if @ancestors_index.empty?
      warn('ancestors_index object is empty')
    else
      # Prepare useful variables
      alternative_terms = @alternatives_index.keys
      # Per each term, add frequencies
      @stanzas[:terms].each do |id, tags|
        if @alternatives_index.include?(id)
          alt_id = @alternatives_index[id]
          query = @meta[alt_id] # Check if it exists
          if query.nil?
            query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
            @meta[alt_id] = query
          end
          @meta[id] = query
          # Note: alternative terms do not increase structural frequencies
        else # Official term
          query = @meta[id] # Check if it exists
          if query.nil?
            query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
            @meta[id] = query
          end
          # Store metadata
          query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !alternative_terms.include?(anc)}.to_f : 0.0
          query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !alternative_terms.include?(desc)}.to_f : 0.0
          query[:struct_freq] = query[:descendants] + 1.0
          # Update maximums
          @max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
          @max_freqs[:max_depth] = query[:descendants] if @max_freqs[:max_depth] < query[:descendants]
        end
      end
    end
  end


  # Expand the obsoletes set and link info to their alternative IDs
  # ===== Parameters
  # +obs_tag+:: tag to be used to find obsoletes
  # +alt_tags+:: tags to find alternative IDs (if available)
  # ===== Returns
  # true if the process ends without errors and false in other cases
  def get_index_obsoletes(obs_tag: @@basic_tags[:obsolete], alt_tags: @@basic_tags[:alternative])
    if @stanzas[:terms].empty?
      warn('stanzas terms empty')
    else
      # Check obsoletes
      @stanzas[:terms].each do |id, term_tags|
        next if term_tags.nil?
        query = term_tags[obs_tag]
        if !query.nil? && query == 'true' # Obsolete tag presence
          next if !@obsoletes_index[id].nil? # Already stored
          # Check if an alternative value is available
          alt_ids = alt_tags.map{|alt| term_tags[alt]}.compact
          if !alt_ids.empty?
            alt_id = alt_ids.first.first # FIRST tag, FIRST id
            # Store
            @alternatives_index[id] = alt_id
            @obsoletes_index[id] = alt_id
          end
        end
      end
    end
  end


  # Expand the parentals set and link all info to their alternative IDs. Also launches the frequencies process
  # ===== Parameters
  # +tag+:: tag used to expand parentals
  # ===== Returns
  # true if the process ends without errors and false in other cases
  def get_index_child_parent_relations(tag: @@basic_tags[:ancestors][0])
    # Check
    if @stanzas[:terms].nil?
      warn('stanzas terms empty')
    else
      # Expand
      structType, parentals = self.class.get_related_ids_by_tag(terms: @stanzas[:terms],
                                                                target_tag: tag,
                                                                alt_ids: @alternatives_index,
                                                                obsoletes: @obsoletes_index.length)
      # Check
      raise('Error expanding parentals') if (structType.nil?) || parentals.nil?
      # Prepare ancestors structure
      anc = {}
      des = {}
      parentals.each do |id, parents|
        parents = parents - @removable_terms
        anc[id] = parents
        parents.each do |anc_id| # Add descendants
          if !des.include?(anc_id)
            des[anc_id] = [id]
          else
            des[anc_id] << id
          end
        end
      end
      # Store alternatives
      @alternatives_index.each do |id, alt|
        anc[id] = anc[alt] if anc.include?(alt)
        des[id] = des[alt] if des.include?(alt)
      end
      # Check structure
      if ![:atomic, :sparse].include? structType
        structType = structType == :circular ? :circular : :hierarchical
      end
      # Store
      @ancestors_index = anc
      @descendants_index = des
      @structureType = structType
    end
    # Finish
  end


  # Find the ancestors of a given term
  # ===== Parameters
  # +term+:: to be checked
  # +filter_alternatives+:: if true, remove alternatives from final results
  # ===== Returns
  # an array with all ancestors of the given term (empty if parents are not available yet)
  def get_ancestors(term, filter_alternatives = false)
    return self.get_familiar(term, true, filter_alternatives)
  end


  # Find the descendants of a given term
  # ===== Parameters
  # +term+:: to be checked
  # +filter_alternatives+:: if true, remove alternatives from final results
  # ===== Returns
  # an array with all descendants of the given term (empty if parents are not available yet)
  def get_descendants(term, filter_alternatives = false)
    return self.get_familiar(term, false, filter_alternatives)
  end


  # Find the ancestors/descendants of a given term
  # ===== Parameters
  # +term+:: to be checked
  # +return_ancestors+:: return ancestors if true or descendants if false
  # +filter_alternatives+:: if true, remove alternatives from final results
  # ===== Returns
  # an array with all ancestors/descendants of the given term (empty if parents are not available yet)
  def get_familiar(term, return_ancestors = true, filter_alternatives = false)
    # Find into parentals
    familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
    if !familiars.nil?
      familiars = familiars.clone
      if filter_alternatives
        familiars.reject!{|fm| @alternatives_index.include?(fm)}
      end
    else
      familiars = []
    end
    return familiars
  end
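Usage sketch (not part of the package; assumes onto from above): listing ancestors while filtering alternative IDs.
ancestors = onto.get_ancestors(:'HP:0000002', true)
p ancestors # e.g. [:"HP:0001507", :"HP:0000118", :"HP:0000001"]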


  # Obtain the IC of a specific term
  # ===== Parameters
  # +termRaw+:: term whose IC will be calculated
  # +type+:: of IC to be calculated. Default: resnik
  # +force+:: force re-calculation of the IC. Do not check if it is already calculated
  # +zhou_k+:: special coefficient for the Zhou IC method
  # ===== Returns
  # the IC calculated
  def get_IC(termRaw, type: :resnik, force: false, zhou_k: 0.5)
    term = termRaw.to_sym
    # Check
    raise ArgumentError, "IC type specified (#{type}) is not allowed" if !@@allowed_calcs[:ics].include?(type)
    # Check if it's already calculated
    return @ics[type][term] if (@ics[type].include? term) && !force
    # Calculate
    ic = -1
    case type # https://arxiv.org/ftp/arxiv/papers/1310/1310.8059.pdf ||| https://sci-hub.st/https://doi.org/10.1016/j.eswa.2012.01.082
    ###########################################
    #### STRUCTURE BASED METRICS
    ###########################################
    # Shortest path
    # Weighted Link
    # Hirst and St-Onge Measure
    # Wu and Palmer
    # Slimani
    # Li
    # Leacock and Chodorow
    ###########################################
    #### INFORMATION CONTENT METRICS
    ###########################################
    when :resnik # Resnik P: Using Information Content to Evaluate Semantic Similarity in a Taxonomy
      # -log(Freq(x) / Max_Freq)
      ic = -Math.log10(@meta[term][:struct_freq].fdiv(@max_freqs[:struct_freq]))
    when :resnik_observed
      # -log(Freq(x) / Max_Freq)
      ic = -Math.log10(@meta[term][:observed_freq].fdiv(@max_freqs[:observed_freq]))
    # Lin
    # Jiang & Conrath

    ###########################################
    #### FEATURE-BASED METRICS
    ###########################################
    # Tversky
    # x-similarity
    # Rodriguez

    ###########################################
    #### HYBRID METRICS
    ###########################################
    when :seco, :zhou # SECO: An intrinsic information content metric for semantic similarity in WordNet
      # 1 - ( log(hypo(x) + 1) / log(max_nodes) )
      ic = 1 - Math.log10(@meta[term][:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
      if type == :zhou # New Model of Semantic Similarity Measuring in WordNet
        # k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
        @ics[:seco][term] = ic # Special store
        ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(@meta[term][:descendants]).fdiv(Math.log10(@max_freqs[:max_depth])))
      end
    when :sanchez # Semantic similarity estimation in the biomedical domain: An ontology-based information-theoretic perspective
      ic = -Math.log10((@meta[term][:descendants].fdiv(@meta[term][:ancestors]) + 1.0).fdiv(@max_freqs[:max_depth] + 1.0))
    # Knappe
    end
    @ics[type][term] = ic
    return ic
  end
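For intuition (not part of the package), the Resnik branch above is a plain -log10 of the relative structural frequency, so rarer (more specific) terms score higher:
# struct_freq = 10 (the term plus 9 descendants) in an ontology whose maximum
# struct_freq is 10,000 yields IC = -log10(10 / 10000) = 3.0
p(-Math.log10(10.fdiv(10_000))) # => ~3.0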


  # Calculates and returns Resnik ICs (by ontology and observed frequency) for observed terms
  # ===== Returns
  # two hashes with resnik and resnik_observed ICs for observed terms
  def get_observed_ics_by_onto_and_freq
    # Check there are observed terms
    if @profiles.empty?
      resnik = {}
      resnik_observed = {}
    else
      # Calc ICs for all terms
      observed_terms = @profiles.values.flatten.uniq
      observed_terms.each{ |term| get_IC(term)}
      observed_terms.each{ |term| get_IC(term, type: :resnik_observed)}
      resnik = @ics[:resnik].select{|k, v| observed_terms.include?(k)}
      resnik_observed = @ics[:resnik_observed].select{|k, v| observed_terms.include?(k)}
    end
    return resnik.clone, resnik_observed.clone
  end


  # Find the IC of the Most Informative Common Ancestor (MICA) of two given terms
  # ===== Parameters
  # +termA+:: term to be checked
  # +termB+:: term to be checked
  # +ic_type+:: IC formula to be used
  # ===== Returns
  # the IC of the MICA(termA, termB)
  def get_ICMICA(termA, termB, ic_type = :resnik)
    mica = self.get_MICA(termA, termB, ic_type)
    return mica.first.nil? ? nil : mica.last
  end


  # Find the Most Informative Common Ancestor (MICA) of two given terms
  # ===== Parameters
  # +termA+:: term to be checked
  # +termB+:: term to be checked
  # +ic_type+:: IC formula to be used
  # ===== Returns
  # the MICA(termA, termB) and its IC
  def get_MICA(termA, termB, ic_type = :resnik)
    termA = @alternatives_index[termA] if @alternatives_index.include?(termA)
    termB = @alternatives_index[termB] if @alternatives_index.include?(termB)
    mica = [nil, -1.0]
    # Special case
    if termA.eql?(termB)
      ic = self.get_IC(termA, type: ic_type)
      mica = [termA, ic]
    else
      # Obtain ancestors (including the terms themselves)
      anc_A = self.get_ancestors(termA)
      anc_B = self.get_ancestors(termB)

      if !(anc_A.empty? && anc_B.empty?)
        anc_A << termA
        anc_B << termB
        # Find shared ancestors
        shared_ancestors = anc_A & anc_B
        # Find MICA
        if shared_ancestors.length > 0
          shared_ancestors.each do |anc|
            ic = self.get_IC(anc, type: ic_type)
            # Check
            mica = [anc, ic] if ic > mica[1]
          end
        end
      end
    end
    return mica
  end
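Usage sketch (not part of the package; IDs illustrative, onto as above): the MICA of two sibling phenotypes is their deepest shared ancestor.
mica, ic = onto.get_MICA(:'HP:0000098', :'HP:0004322') # tall stature vs short stature
puts "MICA: #{mica} (IC: #{ic})" # expected around :"HP:0000002" (Abnormality of body height)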


  # Calculate the similarity between two given terms
  # ===== Parameters
  # +termA+:: to be compared
  # +termB+:: to be compared
  # +type+:: similarity formula to be used
  # +ic_type+:: IC formula to be used
  # ===== Returns
  # the similarity between both terms or nil if frequencies are not available yet
  def get_similarity(termA, termB, type: :resnik, ic_type: :resnik)
    # Check
    raise ArgumentError, "SIM type specified (#{type}) is not allowed" if !@@allowed_calcs[:sims].include?(type)
    sim = nil
    # Launch comparisons
    sim_res = get_ICMICA(termA, termB, ic_type)
    if !sim_res.nil?
      case type
      when :resnik
        sim = sim_res
      when :lin
        sim = (2.0 * sim_res).fdiv(self.get_IC(termA, type: ic_type) + self.get_IC(termB, type: ic_type))
      when :jiang_conrath # This is not a similarity, this is a dissimilarity (distance)
        sim = (self.get_IC(termA, type: ic_type) + self.get_IC(termB, type: ic_type)) - (2.0 * sim_res)
      end
    end
    return sim
  end


  # Method used to load information stored into an OBO file and store it into this object.
  # If a file is specified by input parameter, the current @file value is updated
  # ===== Parameters
  # +file+:: optional file to update the object's stored file
  def load(file, build: true)
    _, header, stanzas = self.class.load_obo(file)
    @header = header
    @stanzas = stanzas
    self.remove_removable()
    # @removable_terms.each{|removableID| @stanzas[:terms].delete(removableID)} if !@removable_terms.empty? # Remove if applicable
    self.build_index() if build
  end

  # Remove terms marked as removable from the loaded stanzas
  def remove_removable()
    @removable_terms.each{|removableID| @stanzas[:terms].delete(removableID)} if !@removable_terms.empty? # Remove if applicable
  end


  # Exports an OBO_Handler object in JSON format
  # ===== Parameters
  # +file+:: where the info will be stored
  def write(file)
    # Take object stored info
    obj_info = {header: @header,
                stanzas: @stanzas,
                ancestors_index: @ancestors_index,
                descendants_index: @descendants_index,
                alternatives_index: @alternatives_index,
                obsoletes_index: @obsoletes_index,
                structureType: @structureType,
                ics: @ics,
                meta: @meta,
                special_tags: @special_tags,
                max_freqs: @max_freqs,
                dicts: @dicts,
                profiles: @profiles,
                profilesDict: @profilesDict,
                items: @items,
                removable_terms: @removable_terms,
                term_paths: @term_paths}
    # Convert to JSON format & write
    File.open(file, "w") { |f| f.write obj_info.to_json }
  end


  def is_number? string
    true if Float(string) rescue false
  end
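Usage sketch (not part of the package; file name hypothetical): a write/read round trip avoids re-parsing the OBO file on the next run.
onto.write('hp_onto.json') # serialize all indexes to JSON
onto2 = Ontology.new
onto2.read('hp_onto.json') # restore the parsed state without touching hp.obo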


  # Read a JSON file with an OBO_Handler object stored
  # ===== Parameters
  # +file+:: with object info
  # ===== Return
  # OBO_Handler internal fields
  def read(file)
    # Read file
    jsonFile = File.open(file)
    jsonInfo = JSON.parse(jsonFile.read, :symbolize_names => true)
    # Pre-process (symbolize some hash values)
    jsonInfo[:stanzas][:terms].map{|id, info| self.class.symbolize_ids(info)} # STANZAS
    jsonInfo[:stanzas][:typedefs].map{|id, info| self.class.symbolize_ids(info)}
    jsonInfo[:stanzas][:instances].map{|id, info| self.class.symbolize_ids(info)}
    jsonInfo[:alternatives_index] = jsonInfo[:alternatives_index].map{|id, value| [id, value.to_sym]}.to_h
    jsonInfo[:ancestors_index].map {|id, family_arr| family_arr.map!{|item| item.to_sym}}
    jsonInfo[:descendants_index].map {|id, family_arr| family_arr.map!{|item| item.to_sym}}
    jsonInfo[:obsoletes_index] = jsonInfo[:obsoletes_index].map{|id, value| [id, value.to_sym]}.to_h
    jsonInfo[:dicts] = jsonInfo[:dicts].each do |flag, dictionaries|
      # Special case: byTerm
      dictionaries[:byTerm] = dictionaries[:byTerm].map do |term, value|
        if !term.to_s.scan(/\A[-+]?[0-9]*\.?[0-9]+\Z/).empty? # Numeric dictionary
          [term.to_s.to_i, value.map{|term| term.to_sym}]
        elsif value.is_a? Numeric # Numeric dictionary
          [term.to_sym, value]
        elsif value.kind_of?(Array) && flag == :is_a
          [term.to_sym, value.map{|v| v.to_sym}]
        else
          [term.to_sym, value]
        end
      end
      dictionaries[:byTerm] = dictionaries[:byTerm].to_h
      # By value
      dictionaries[:byValue] = dictionaries[:byValue].map do |value, term|
        if value.is_a? Numeric # Numeric dictionary
          [value, term.to_sym]
        elsif term.is_a? Numeric # Numeric dictionary
          [value.to_s.to_sym, term]
        elsif flag == :is_a
          [value.to_sym, term.to_sym]
        elsif term.kind_of?(Array)
          [value.to_sym, term.map{|t| t.to_sym}]
        else
          [value.to_s, term.to_sym]
        end
      end
      dictionaries[:byValue] = dictionaries[:byValue].to_h
    end
    jsonInfo[:profiles].map{|id, terms| terms.map!{|term| term.to_sym}}
    jsonInfo[:profiles].keys.map{|id| jsonInfo[:profiles][id.to_s.to_i] = jsonInfo[:profiles].delete(id) if self.is_number?(id.to_s)}
    jsonInfo[:profilesDict].map{|term, ids| ids.map!{|id| id.to_sym if !id.is_a?(Numeric)}}
    jsonInfo[:removable_terms] = jsonInfo[:removable_terms].map{|term| term.to_sym}
    jsonInfo[:special_tags] = jsonInfo[:special_tags].each do |k, v|
      if v.kind_of?(Array)
        jsonInfo[:special_tags][k] = v.map{|tag| tag.to_sym}
      else
        jsonInfo[:special_tags][k] = v.to_sym
      end
    end
    jsonInfo[:items].each{|k, v| jsonInfo[:items][k] = v.map{|item| item.to_sym}}
    jsonInfo[:term_paths].each{|term, info| jsonInfo[:term_paths][term][:paths] = info[:paths].map{|path| path.map{|t| t.to_sym}}}
    # Store info
    @header = jsonInfo[:header]
    @stanzas = jsonInfo[:stanzas]
    @ancestors_index = jsonInfo[:ancestors_index]
    @descendants_index = jsonInfo[:descendants_index]
    @alternatives_index = jsonInfo[:alternatives_index]
    @obsoletes_index = jsonInfo[:obsoletes_index]
    @structureType = jsonInfo[:structureType].to_sym
    @ics = jsonInfo[:ics]
    @meta = jsonInfo[:meta]
    @special_tags = jsonInfo[:special_tags]
    @max_freqs = jsonInfo[:max_freqs]
    @dicts = jsonInfo[:dicts]
    @profiles = jsonInfo[:profiles]
    @profilesDict = jsonInfo[:profilesDict]
    @items = jsonInfo[:items]
    @removable_terms = jsonInfo[:removable_terms]
    @term_paths = jsonInfo[:term_paths]
  end


  # Check if a given ID is stored as a term in this object
  # ===== Parameters
  # +id+:: to be checked
  # ===== Return
  # True if the term is allowed or false in other cases
  def exists? id
    return stanzas[:terms].include?(id)
  end


  # This method assumes that the given text contains an allowed ID and tries to obtain it by splitting the text
  # ===== Parameters
  # +text+:: to be checked
  # ===== Return
  # The correct ID if it can be found or nil in other cases
  def extract_id(text, splitBy: ' ')
    if self.exists?(text)
      return text
    else
      splittedText = text.to_s.split(splitBy).first.to_sym
      return self.exists?(splittedText) ? splittedText : nil
    end
  end


  # Generate a bidirectional dictionary set using a specific tag and the terms stanzas set
  # This function stores the calculated dictionary into the @dicts field.
  # This function stores the first value for multivalue tags
  # This function does not handle synonyms for byValue dictionaries
  # ===== Parameters
  # +tag+:: to be used to calculate the dictionary
  # +select_regex+:: a regex that can be used to modify the value to be stored
  # +substitute_alternatives+:: flag used to indicate if alternatives must, or not, be replaced by their official ID
  # +store_tag+:: flag used to store the dictionary. If nil, the mandatory tag given will be used
  # +multiterm+:: if true, byValue will allow multi-term linkage (array)
  # +self_type_references+:: if true, the program assumes that references will be between Ontology terms, and their term IDs will be checked
  # ===== Return
  # void. Stores the calculated bidirectional dictionary into the main dictionaries container
  def calc_dictionary(tag, select_regex: nil, substitute_alternatives: true, store_tag: nil, multiterm: false, self_type_references: false)
    tag = tag.to_sym
    store_tag = tag if store_tag.nil?
    if @stanzas[:terms].empty?
      warn('Terms are not already loaded. Aborting dictionary calc')
    else
      byTerm = {}
      byValue = {}
      # Calc per term
      @stanzas[:terms].each do |term, tags|
        referenceTerm = term
        if @alternatives_index.include?(term) && substitute_alternatives # Special case
          referenceTerm = @alternatives_index[term] if !@obsoletes_index.include?(@alternatives_index[term])
        end
        queryTag = tags[tag]
        if !queryTag.nil?
          # Pre-process
          if !select_regex.nil?
            if queryTag.kind_of?(Array)
              queryTag = queryTag.map{|value| value.scan(select_regex).first}
              queryTag.flatten!
            else
              queryTag = queryTag.scan(select_regex).first
            end
            queryTag.compact!
          end
          if queryTag.kind_of?(Array) # Store
            if !queryTag.empty?
              if byTerm.include?(referenceTerm)
                byTerm[referenceTerm] = (byTerm[referenceTerm] + queryTag).uniq
              else
                byTerm[referenceTerm] = queryTag
              end
              if multiterm
                queryTag.each do |value|
                  byValue[value] = [] if byValue[value].nil?
                  byValue[value] << referenceTerm
                end
              else
                queryTag.each{|value| byValue[value] = referenceTerm}
              end
            end
          else
            if byTerm.include?(referenceTerm)
              byTerm[referenceTerm] = (byTerm[referenceTerm] + [queryTag]).uniq
            else
              byTerm[referenceTerm] = [queryTag]
            end
            if multiterm
              byValue[queryTag] = [] if byValue[queryTag].nil?
              byValue[queryTag] << referenceTerm
            else
              byValue[queryTag] = referenceTerm
            end
          end
        end
      end

      # Check self-references
      if self_type_references
        byTerm.map do |term, references|
          corrected_references = references.map do |t|
            checked = self.extract_id(t)
            if checked.nil?
              t
            else
              byValue[checked] = byValue.delete(t) if checked != t && !byValue.keys.include?(checked) # Update in byValue
              checked
            end
          end
          byTerm[term] = corrected_references.uniq
        end
      end

      # Check order
      byTerm.map do |term, values|
        if self.exists?(term)
          referenceValue = @stanzas[:terms][term][tag]
          if !referenceValue.nil?
            if !select_regex.nil?
              if referenceValue.kind_of?(Array)
                referenceValue = referenceValue.map{|value| value.scan(select_regex).first}
                referenceValue.flatten!
              else
                referenceValue = referenceValue.scan(select_regex).first
              end
              referenceValue.compact!
            end
            if self_type_references
              if referenceValue.kind_of?(Array)
                aux = referenceValue.map{|t| self.extract_id(t)}
              else
                aux = self.extract_id(referenceValue)
              end
              referenceValue = aux if !aux.nil?
            end
            referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
            byTerm[term] = referenceValue + (values - referenceValue)
          end
        end
      end

      # Store
      @dicts[store_tag] = {byTerm: byTerm, byValue: byValue}
    end
  end
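Illustrative sketch (not part of the package; assumes the attribute readers this class exposes elsewhere, as used in Ontology.mutate): inspecting the :name dictionary that build_index creates by default.
onto.calc_dictionary(:name)
name_dict = onto.dicts[:name]
p name_dict[:byTerm][:'HP:0000002'] # => ["Abnormality of body height"]
p name_dict[:byValue]['Abnormality of body height'] # => :"HP:0000002"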
|
1138
|
+
|
1139
|
+
|
1140
|
+
# Calculates :is_a dictionary without alternatives substitution
|
1141
|
+
def calc_ancestors_dictionary
|
1142
|
+
self.calc_dictionary(:is_a, substitute_alternatives: false, self_type_references: true)
|
1143
|
+
end
|
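
  # ===== Usage sketch (illustrative)
  # A minimal, hypothetical view of the dictionaries produced above; the term
  # IDs are invented and the exact keys depend on the loaded OBO file:
  #   onto.calc_ancestors_dictionary
  #   onto.dicts[:is_a][:byTerm]  # term => its direct :is_a targets, e.g. {:"HP:0000002" => [:"HP:0000001"]}
  #   onto.dicts[:is_a][:byValue] # value => term(s) referencing it (a single term or an array, depending on the multiterm flag)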


  # Translate a given value using an already calculated dictionary
  # ===== Parameters
  # +toTranslate+:: value to be translated using the dictionary
  # +tag+:: tag used to generate the dictionary
  # +byValue+:: boolean flag indicating whether the dictionary uses values as keys (true, the default) or terms as keys (false)
  # ===== Return
  # translation
  def translate(toTranslate, tag, byValue: true)
    dict = byValue ? @dicts[tag][:byValue] : @dicts[tag][:byTerm]
    toTranslate = get_main_id(toTranslate) if !byValue
    return dict[toTranslate]
  end


  # Translate a given name
  # ===== Parameters
  # +name+:: name to be translated
  # ===== Return
  # translated name or nil if it's not stored in this ontology
  def translate_name(name)
    term = self.translate(name, :name)
    term = self.translate(name, :synonym) if term.nil?
    return term
  end
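
  # ===== Usage sketch (illustrative; assumes the :name and :synonym
  # dictionaries have been calculated beforehand and uses a hypothetical HPO term)
  #   onto.calc_dictionary(:name)
  #   onto.translate_name('Arachnodactyly') #=> :"HP:0001166"
  #   onto.translate_name('No such name')   #=> nil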


  # Translate several names and return the translations and a list of names which couldn't be translated
  # ===== Parameters
  # +names+:: array of names to be translated
  # ===== Return
  # two arrays with the translations and the names which couldn't be translated, respectively
  def translate_names(names)
    translated = []
    rejected = []
    names.each do |name|
      tr = self.translate_name(name)
      if tr.nil?
        rejected << name
      else
        translated << tr
      end
    end
    return translated, rejected
  end


  # Translates a given ID to its assigned name
  # ===== Parameters
  # +id+:: ID to be translated
  # ===== Return
  # main name or nil if it's not included in this ontology
  def translate_id(id)
    name = self.translate(id, :name, byValue: false)
    return name.nil? ? nil : name.first
  end


  # Translates several IDs and returns the translations and a list of IDs which couldn't be translated
  # ===== Parameters
  # +ids+:: IDs to be translated
  # ===== Return
  # two arrays with the translations and the IDs which couldn't be translated, respectively
  def translate_ids(ids)
    translated = []
    rejected = []
    ids.each do |term_id|
      tr = self.translate_id(term_id.to_sym)
      if !tr.nil?
        translated << tr
      else
        rejected << term_id # Keep the offending ID, not the nil translation
      end
    end
    return translated, rejected
  end
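
  # ===== Usage sketch (illustrative, hypothetical IDs and name)
  #   names, rejected = onto.translate_ids([:"HP:0001166", :"HP:9999999"])
  #   names    #=> ["Arachnodactyly"]
  #   rejected #=> [:"HP:9999999"]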


  # ===== Parameters
  # +id+:: ID to be translated
  # ===== Return
  # the main ID assigned to the given ID; if it's neither an alternative nor an obsolete ID, the ID itself is returned. Returns nil if the given ID is not an allowed ID
  def get_main_id(id)
    return nil if !@stanzas[:terms].include? id
    new_id = id
    mainID = @alternatives_index[id]
    new_id = mainID if !mainID.nil? && !@obsoletes_index.include?(mainID)
    return new_id
  end


  # Check a pool of IDs and return the allowed IDs, removing those which are not official terms of this ontology
  # ===== Parameters
  # +ids+:: IDs to be checked
  # ===== Return
  # two arrays with the allowed and the rejected IDs, respectively
  def check_ids(ids, substitute: true)
    checked_codes = []
    rejected_codes = []
    ids.each do |id|
      if @stanzas[:terms].include? id
        if substitute
          checked_codes << self.get_main_id(id)
        else
          checked_codes << id
        end
      else
        rejected_codes << id
      end
    end
    return checked_codes, rejected_codes
  end
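
  # ===== Usage sketch (illustrative, hypothetical IDs)
  #   valid, rejected = onto.check_ids([:"HP:0001166", :"HP:9999999"])
  #   valid    #=> [:"HP:0001166"]
  #   rejected #=> [:"HP:9999999"]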


  # Stores a given profile with a specific ID. If the ID is already assigned to a profile, it will be replaced
  # ===== Parameters
  # +id+:: ID assigned to the profile
  # +terms+:: array of terms
  # +substitute+:: substitute flag from check_ids
  def add_profile(id, terms, substitute: true)
    warn("Profile assigned to ID (#{id}) is going to be replaced") if @profiles.include? id
    correct_terms, rejected_terms = self.check_ids(terms, substitute: substitute)
    if !rejected_terms.empty?
      warn('Given terms contain erroneous IDs. These IDs will be removed')
    end
    if id.is_a? Numeric
      @profiles[id] = correct_terms
    else
      @profiles[id.to_sym] = correct_terms
    end
  end


  # Method used to store a pool of profiles
  # ===== Parameters
  # +profiles+:: array/hash of profiles to be stored. If it's an array, numerical IDs will be assigned starting at 0
  # +calc_metadata+:: if true, launch the calc_profiles_dictionary process
  # +reset_stored+:: if true, remove already stored profiles
  # +substitute+:: substitute flag from check_ids
  def load_profiles(profiles, calc_metadata: true, reset_stored: false, substitute: false)
    self.reset_profiles if reset_stored
    # Check
    if profiles.kind_of?(Array)
      profiles.each_with_index do |items, i|
        self.add_profile(i, items.map{|item| item.to_sym}, substitute: substitute)
      end
    else # Hash
      if !profiles.keys.select{|id| @profiles.include?(id)}.empty?
        warn('Some of the given profiles are already stored. The stored versions will be replaced')
      end
      profiles.each{|id, prof| self.add_profile(id, prof, substitute: substitute)}
    end

    self.add_observed_terms_from_profiles(reset: true)

    if calc_metadata
      self.calc_profiles_dictionary
    end
  end
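
  # ===== Usage sketch (illustrative, hypothetical profile IDs and terms)
  #   onto.load_profiles({P1: [:"HP:0001166"], P2: [:"HP:0001166", :"HP:0000002"]})
  #   onto.get_profile(:P1) #=> [:"HP:0001166"]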


  # Internal method used to remove already stored profiles and to reset the observed frequencies
  def reset_profiles
    # Clean profiles storage
    @profiles = {}
    # Reset observed frequencies
    @meta.each{|term, info| info[:observed_freq] = 0}
    @max_freqs[:observed_freq] = 0
  end


  # ===== Parameters
  # +id+:: profile ID
  # ===== Return
  # the profile assigned to the given ID or nil if it's not stored
  def get_profile(id)
    return @profiles[id]
  end


  # ===== Return
  # an array with the sizes of all stored profiles
  def get_profiles_sizes
    return @profiles.map{|id, terms| terms.length}
  end


  # ===== Parameters
  # +round_digits+:: number of digits to round the result. Default: 4
  # ===== Returns
  # mean size of stored profiles
  def get_profiles_mean_size(round_digits: 4)
    sizes = self.get_profiles_sizes
    return sizes.inject(0){|sum, n| sum + n}.fdiv(@profiles.length).round(round_digits)
  end


  # Calculates profile sizes and returns the size assigned to the given percentile
  # ===== Parameters
  # +perc+:: percentile to be returned
  # +increasing_sort+:: flag to indicate if sizes must be sorted in increasing order. Default: false
  # ===== Returns
  # the size assigned to the requested percentile
  def get_profile_length_at_percentile(perc=50, increasing_sort: false)
    prof_lengths = self.get_profiles_sizes.sort
    prof_lengths.reverse! if !increasing_sort
    n_profiles = prof_lengths.length
    percentile_index = ((perc * (n_profiles - 1)).fdiv(100) - 0.5).round # Take the length which does not overpass the selected percentile
    percentile_index = 0 if percentile_index < 0 # Special case (caused by the literal calculation)
    return prof_lengths[percentile_index]
  end


  # Translate a given profile to term names
  # ===== Parameters
  # +prof+:: array of terms to be translated
  # ===== Returns
  # array of translated terms. Can include nils if some IDs are not allowed
  def profile_names(prof)
    return prof.map{|term| self.translate_id(term)}
  end
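
  # ===== Worked example (illustrative)
  # For stored profile sizes [1, 2, 3, 4] and perc = 50 with the default
  # decreasing sort, prof_lengths is [4, 3, 2, 1] and
  # percentile_index = ((50 * 3).fdiv(100) - 0.5).round = 1, so the method
  # returns 3.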


  # Translates a set of profiles into their sets of term names
  # ===== Parameters
  # +profs+:: array of profiles
  # +asArray+:: flag to indicate how results must be returned: true => an array with the translated terms of each profile; false => a hash of translations keyed by profile ID
  # ===== Returns
  # translated profiles
  def translate_profiles_ids(profs = [], asArray: true)
    profs = @profiles if profs.empty?
    profs = profs.each_with_index.map{|terms, index| [index, terms]}.to_h if profs.kind_of?(Array)
    profs_names = profs.map{|id, terms| [id, self.profile_names(terms)]}.to_h
    return asArray ? profs_names.values : profs_names
  end


  # Includes as "observed_terms" all terms included in stored profiles
  # ===== Parameters
  # +reset+:: if true, reset the already stored observed frequencies before re-calculating
  def add_observed_terms_from_profiles(reset: false)
    @meta.each{|term, freqs| freqs[:observed_freq] = -1} if reset
    @profiles.each{|id, terms| self.add_observed_terms(terms: terms)}
  end
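
  # ===== Usage sketch (illustrative, hypothetical names)
  #   onto.translate_profiles_ids                 #=> [["Arachnodactyly"], ...] (one name array per profile)
  #   onto.translate_profiles_ids(asArray: false) #=> {P1: ["Arachnodactyly"], ...}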


  # Get a term frequency
  # ===== Parameters
  # +term+:: term to be checked
  # +type+:: type of frequency to be returned. Allowed: [:struct_freq, :observed_freq]
  # ===== Returns
  # frequency of the given term or nil if the term is not allowed
  def get_frequency(term, type: :struct_freq)
    queryFreq = @meta[term]
    return queryFreq.nil? ? nil : queryFreq[type]
  end


  # Gets the structural frequency of a given term
  # ===== Parameters
  # +term+:: term to be checked
  # ===== Returns
  # structural frequency of the given term or nil if the term is not allowed
  def get_structural_frequency(term)
    return self.get_frequency(term, type: :struct_freq)
  end


  # Gets the observed frequency of a given term
  # ===== Parameters
  # +term+:: term to be checked
  # ===== Returns
  # observed frequency of the given term or nil if the term is not allowed
  def get_observed_frequency(term)
    return self.get_frequency(term, type: :observed_freq)
  end


  # Calculates the frequencies of stored profile terms
  # ===== Parameters
  # +ratio+:: if true, frequencies will be returned as ratios between 0 and 1
  # +literal+:: if true, literal terms will be used to calculate frequencies instead of translating alternative terms
  # +asArray+:: used to transform the returned structure from a hash of Term => Frequency into an array of tuples [Term, Frequency]
  # +translate+:: if true, term IDs will be translated to their names
  # ===== Returns
  # frequencies of stored profile terms
  def get_profiles_terms_frequency(ratio: true, literal: true, asArray: true, translate: true)
    n_profiles = @profiles.length
    if literal
      freqs = {}
      @profiles.each do |id, terms|
        terms.each do |literalTerm|
          if freqs.include?(literalTerm)
            freqs[literalTerm] += 1
          else
            freqs[literalTerm] = 1
          end
        end
      end
      if (ratio || translate)
        aux_keys = freqs.keys
        aux_keys.each do |term|
          freqs[term] = freqs[term].fdiv(n_profiles) if ratio
          if translate
            tr = self.translate_id(term)
            freqs[tr] = freqs.delete(term) if !tr.nil?
          end
        end
      end
      if asArray
        freqs = freqs.map{|term, freq| [term, freq]}
        freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
      end
    else # Frequencies translating alternatives
      freqs = @meta.select{|id, info| info[:observed_freq] > 0}.map{|id, info| [id, ratio ? info[:observed_freq].fdiv(n_profiles) : info[:observed_freq]]}
      freqs = freqs.to_h if !asArray
      if translate
        freqs = freqs.map do |term, freq|
          tr = self.translate_id(term)
          tr.nil? ? [term, freq] : [tr, freq]
        end
      end
      if asArray
        freqs = freqs.map{|term, freq| [term, freq]}
        freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
      else
        freqs = freqs.to_h
      end
    end
    return freqs
  end
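
  # ===== Usage sketch (illustrative)
  # With the two hypothetical profiles loaded earlier (both containing
  # :"HP:0001166" and one also containing :"HP:0000002"), the defaults yield
  # ratios sorted in decreasing order:
  #   onto.get_profiles_terms_frequency #=> [["Arachnodactyly", 1.0], ["...", 0.5]]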


  # Clean a given profile, returning the cleaned set of terms and the removed ancestor terms
  # ===== Parameters
  # +prof+:: array of terms to be checked
  # ===== Returns
  # two arrays: first the cleaned profile and second the removed elements
  def remove_ancestors_from_profile(prof)
    ancestors = prof.map{|term| self.get_ancestors(term)}.flatten.uniq
    redundant = prof.select{|term| ancestors.include?(term)}
    return prof - redundant, redundant
  end


  # Remove alternative IDs if the official ID is present. DOES NOT REMOVE synonyms or alternative IDs of the same official ID
  # ===== Parameters
  # +prof+:: array of terms to be checked
  # ===== Returns
  # two arrays: first the cleaned profile and second the removed elements
  def remove_alternatives_from_profile(prof)
    alternatives = prof.select{|term| @alternatives_index.include?(term)}
    redundant = alternatives.select{|alt_id| prof.include?(@alternatives_index[alt_id])}
    return prof - redundant, redundant
  end


  # Remove alternatives (if the official term is present) and ancestor terms of a given profile
  # ===== Parameters
  # +profile+:: profile to be cleaned
  # +remove_alternatives+:: if true, alternative IDs will also be removed
  # ===== Returns
  # cleaned profile
  def clean_profile(profile, remove_alternatives: true)
    terms_without_ancestors, _ = self.remove_ancestors_from_profile(profile)
    if remove_alternatives
      terms_without_ancestors_and_alternatives, _ = self.remove_alternatives_from_profile(terms_without_ancestors)
    else
      terms_without_ancestors_and_alternatives = terms_without_ancestors
    end
    return terms_without_ancestors_and_alternatives
  end


  # Remove alternatives (if the official term is present) and ancestor terms of stored profiles
  # ===== Parameters
  # +store+:: if true, cleaned profiles will replace the already stored profiles
  # +remove_alternatives+:: if true, alternative IDs will also be removed
  # ===== Returns
  # a hash with the cleaned profiles
  def clean_profiles(store: false, remove_alternatives: true)
    cleaned_profiles = {}
    @profiles.each{|id, terms| cleaned_profiles[id] = self.clean_profile(terms, remove_alternatives: remove_alternatives)}
    @profiles = cleaned_profiles if store
    return cleaned_profiles
  end
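
  # ===== Usage sketch (illustrative, hypothetical IDs)
  # If :"HP:0000001" is an ancestor of :"HP:0001166", the ancestor is dropped:
  #   onto.clean_profile([:"HP:0000001", :"HP:0001166"]) #=> [:"HP:0001166"]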


  # Calculates the number of ancestors present (redundant) in each stored profile
  # ===== Returns
  # array with the number of parentals for each profile
  def parentals_per_profile
    cleaned_profiles = self.clean_profiles(remove_alternatives: false)
    parentals = @profiles.map{|id, terms| terms.length - cleaned_profiles[id].length}
    return parentals
  end


  # Calculates the mean IC of a given profile
  # ===== Parameters
  # +prof+:: profile to be checked
  # +ic_type+:: IC type to be used
  # +zhou_k+:: special coefficient for the Zhou IC method
  # ===== Returns
  # mean IC for the given profile
  def get_profile_mean_IC(prof, ic_type: :resnik, zhou_k: 0.5)
    return prof.map{|term| self.get_IC(term, type: ic_type, zhou_k: zhou_k)}.inject(0){|sum, x| sum + x}.fdiv(prof.length)
  end


  # Calculates the structural (Resnik) and observed (Resnik) mean ICs for all stored profiles
  # ===== Returns
  # two hashes (Profile ID => IC) calculated with structural and observed Resnik ICs, respectively
  def get_profiles_resnik_dual_ICs
    struct_ics = {}
    observ_ics = {}
    @profiles.each do |id, terms|
      struct_ics[id] = self.get_profile_mean_IC(terms, ic_type: :resnik)
      observ_ics[id] = self.get_profile_mean_IC(terms, ic_type: :resnik_observed)
    end
    return struct_ics.clone, observ_ics.clone
  end
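
  # ===== Worked example (illustrative)
  # The profile mean IC is the arithmetic mean of the per-term ICs: for a
  # profile whose two terms have ICs 2.0 and 4.0, get_profile_mean_IC returns
  # (2.0 + 4.0) / 2 = 3.0.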


  # Calculates ontology structural levels for all ontology terms
  # ===== Parameters
  # +calc_paths+:: calculates term paths if they are not already calculated
  # +shortest_path+:: if true, the level is calculated with the shortest path; otherwise the largest path is used
  def calc_term_levels(calc_paths: false, shortest_path: true)
    if @term_paths.empty?
      if calc_paths
        self.calc_term_paths
      else
        warn('Term paths have not been calculated yet. Aborting level calculation')
      end
    end
    if !@term_paths.empty?
      byTerm = {}
      byValue = {}
      # Calc per term
      @term_paths.each do |term, info|
        level = shortest_path ? info[:shortest_path] : info[:largest_path]
        if level.nil?
          level = -1
        else
          level = level.round(0)
        end
        byTerm[term] = level
        queryLevels = byValue[level]
        if queryLevels.nil?
          byValue[level] = [term]
        else
          byValue[level] << term
        end
      end
      @dicts[:level] = {byTerm: byValue, byValue: byTerm} # Note: in this case, the value has multiplicity and the term is the unique value
      # Update maximum depth
      @max_freqs[:max_depth] = byValue.keys.max
    end
  end
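
  # ===== Usage sketch (illustrative, hypothetical ID)
  # A root term's only path is itself, so roots sit at level 1:
  #   onto.calc_term_levels(calc_paths: true)
  #   onto.get_term_level(:"HP:0000001") #=> 1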


  # Check if a given term is marked as obsolete
  def is_obsolete?(term)
    return @obsoletes_index.include?(term)
  end

  # Check if a given term is marked as alternative
  def is_alternative?(term)
    return @alternatives_index.include?(term)
  end

  # Finds the paths of a term following its ancestors and stores all possible paths for it and its parentals.
  # Also calculates path metadata and stores it into @term_paths
  def calc_term_paths
    self.calc_ancestors_dictionary if !@dicts.keys.include?(:is_a) # Calculate the direct parentals dictionary if it's not already calculated
    visited_terms = []
    @term_paths = {}
    if [:hierarchical, :sparse].include? @structureType
      terms = @stanzas[:terms].keys
      terms.each do |term|
        if self.is_obsolete?(term) || self.is_alternative?(term) # Special case (obsoletes)
          special_term = term
          term = self.is_obsolete?(term) ? @obsoletes_index[term] : @alternatives_index[term]
          @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
          @term_paths[special_term] = @term_paths[term]
          visited_terms << special_term
        end

        if !visited_terms.include?(term)
          @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
          parentals = @dicts[:is_a][:byTerm][term]
          if parentals.nil?
            @term_paths[term][:paths] << [term]
          else
            parentals.each do |direct_parental|
              if visited_terms.include? direct_parental # Use the already calculated paths of direct_parental
                new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
              else # Calculate new paths
                self.expand_path(direct_parental, visited_terms)
                new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
              end
              new_paths.each{|path| @term_paths[term][:paths] << path}
            end
          end
          visited_terms << term
        end
        # Update metadata
        @term_paths[term][:total_paths] = @term_paths[term][:paths].length
        paths_sizes = @term_paths[term][:paths].map{|path| path.length}
        @term_paths[term][:largest_path] = paths_sizes.max
        @term_paths[term][:shortest_path] = paths_sizes.min
      end
    else
      warn('The ontology structure must be hierarchical or sparse to calculate term levels. Aborting path calculation')
    end
  end
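
  # ===== Illustrative note (hypothetical chain)
  # For a chain :root -> :A -> :B with single :is_a parents, @term_paths[:B]
  # ends up as {total_paths: 1, largest_path: 3, shortest_path: 3, paths: [[:B, :A, :root]]}.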


  # Recursive function which finds the paths of a term following its ancestors and stores all possible paths for it and its parentals
  # ===== Parameters
  # +curr_term+:: currently visited term
  # +visited_terms+:: already expanded terms
  def expand_path(curr_term, visited_terms)
    if !visited_terms.include?(curr_term) # Not already expanded
      @term_paths[curr_term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if @term_paths[curr_term].nil?
      direct_parentals = @dicts[:is_a][:byTerm][curr_term]
      if direct_parentals.nil? # No parents :: end of recursion
        @term_paths[curr_term][:paths] << [curr_term]
      else # Expand and concat
        direct_parentals.each do |ancestor|
          self.expand_path(ancestor, visited_terms) if !visited_terms.include?(ancestor)
          new_paths = @term_paths[ancestor][:paths].map{|path| [curr_term, path].flatten}
          new_paths.each{|path| @term_paths[curr_term][:paths] << path}
        end
      end
      visited_terms << curr_term
    end
  end


  # Gets the calculated ontology levels
  # ===== Returns
  # calculated ontology levels
  def get_ontology_levels
    return @dicts[:level][:byTerm].clone # byTerm, in this case, is Key::Level, Value::Terms
  end


  # Gets the ontology level of a specific term
  # ===== Returns
  # term level
  def get_term_level(term)
    return @dicts[:level][:byValue][term]
  end


  # Return ontology levels from profile terms
  # ===== Returns
  # hash of term levels (Key: level; Value: array of term IDs)
  def get_ontology_levels_from_profiles(uniq = true) # TODO: remove uniq and check dependencies
    profiles_terms = @profiles.values.flatten
    profiles_terms.uniq! if uniq
    term_freqs_byProfile = {}
    profiles_terms.each do |term|
      query = term_freqs_byProfile[term]
      if query.nil?
        term_freqs_byProfile[term] = 1
      else
        term_freqs_byProfile[term] += 1
      end
    end
    levels_filtered = @dicts[:level][:byTerm].map do |level, terms|
      filtered_terms = terms.map{|t| profiles_terms.include?(t) ? Array.new(term_freqs_byProfile[t], t) : nil}.flatten.compact
      [level, filtered_terms]
    end
    levels_filtered = levels_filtered.select{|level, filtered_terms| !filtered_terms.empty?}.to_h
    return levels_filtered
  end


  # Calculate the profiles dictionary with Key = term; Value = profiles
  def calc_profiles_dictionary
    if @profiles.empty?
      warn('Profiles have not been loaded yet. Aborting dictionary calculation')
    else
      byTerm = {} # Key: terms
      # byValue -- Key: profile == @profiles
      @profiles.each do |id, terms|
        terms.each do |term|
          if byTerm.include?(term)
            byTerm[term] << id
          else
            byTerm[term] = [id]
          end
        end
      end
      @profilesDict = byTerm
    end
  end


  # Gets the calculated profiles dictionary
  # ===== Return
  # profiles dictionary (clone)
  def get_terms_linked_profiles
    return @profilesDict.clone
  end


  # Get the profiles related to a given term
  # ===== Parameters
  # +term+:: term to be checked
  # ===== Returns
  # profiles which contain the given term
  def get_term_linked_profiles(term)
    return @profilesDict[term]
  end
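
  # ===== Usage sketch (illustrative, hypothetical IDs)
  #   onto.calc_profiles_dictionary
  #   onto.get_term_linked_profiles(:"HP:0001166") #=> [:P1, :P2] (profiles containing the term)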


  # Gets a metainfo table from a set of terms
  # ===== Parameters
  # +terms+:: IDs to be expanded
  # +filter_alternatives+:: flag to be used in the get_descendants method
  # ===== Returns
  # an array of entries [[TermID, TermName], [[ChildID, ChildName], ...]] for each given term
  def get_childs_table(terms, filter_alternatives = false)
    expanded_terms = []
    terms.each do |t|
      expanded_terms << [[t, self.translate_id(t)], self.get_descendants(t, filter_alternatives).map{|child| [child, self.translate_id(child)]}]
    end
    return expanded_terms
  end


  # Store a given hash of specific relations into the ITEMS structure
  # ===== Parameters
  # +relations+:: relations to be stored
  # +remove_old_relations+:: substitute the ITEMS structure instead of merging the new relations
  # +expand+:: if true, already stored keys will be updated with the unique union of both sets
  def load_item_relations_to_terms(relations, remove_old_relations = false, expand = false)
    @items = {} if remove_old_relations
    if !relations.select{|term, items| !@stanzas[:terms].include?(term)}.empty?
      warn('Some specified terms are not stored in this ontology. These incorrect terms will be stored anyway')
    end
    if !remove_old_relations
      if !relations.select{|term, items| @items.include?(term)}.empty? && !expand
        warn('Some of the given terms are already stored. The stored versions will be replaced')
      end
    end
    if expand
      relations.each do |k, v|
        if @items.keys.include?(k)
          @items[k] = (@items[k] + v).uniq
        else
          @items[k] = v
        end
      end
    else
      @items.merge!(relations)
    end
  end
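
  # ===== Usage sketch (illustrative; the term ID and item names are hypothetical)
  #   onto.load_item_relations_to_terms({:"HP:0001166" => ['geneA', 'geneB']})
  #   onto.items[:"HP:0001166"] #=> ['geneA', 'geneB']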


  # Assign an already calculated dictionary as an items set.
  # ===== Parameters
  # +dictID+:: ID of the dictionary to be stored (:byTerm will be used)
  def set_items_from_dict(dictID, remove_old_relations = false)
    @items = {} if remove_old_relations
    if @dicts.keys.include?(dictID)
      @items.merge!(@dicts[dictID][:byTerm]) # merge! is required; a plain merge would discard its result
    else
      warn('The specified ID has not been calculated. The dict will not be added as an items set')
    end
  end


  # This method computes child similarity and imputes items to their parentals. To do that, item keys must be terms allowed in this ontology.
  # Similarity is calculated by exact text matching unless an ontology object is provided; in that case, MICAs are used
  # ===== Parameters
  # +ontology+:: (optional) ontology object to which the given items belong
  # +minimum_childs+:: minimum number of childs needed to infer relations to the parental. Default: 2
  # +clean_profiles+:: if true, the clean_profile ontology method will be applied to inferred profiles. Only if an ontology object is provided
  # ===== Returns
  # void; updates the items object
  def expand_items_to_parentals(ontology: nil, minimum_childs: 2, clean_profiles: true)
    # Check item keys
    if @items.empty?
      warn('Items have not been provided yet')
      return nil
    end
    targetKeys = @items.keys.select{|k| self.exists?(k)}
    if targetKeys.length == 0
      warn('None of the item keys are allowed')
      return nil
    elsif targetKeys.length < @items.keys.length
      warn('Some item keys are not allowed')
    end

    # Expand to parentals
    targetKeys << targetKeys.map{|t| self.get_ancestors(t, true)}
    targetKeys.flatten!
    targetKeys.uniq!

    # Obtain levels (go from leaves to roots)
    levels = targetKeys.map{|term| self.get_term_level(term)}
    levels.compact!
    levels.uniq!
    levels.sort!
    levels.reverse!
    levels.shift # Leaves are not expandable

    # Expand from leaves to roots
    levels.map do |lvl|
      curr_keys = targetKeys.select{|k| self.get_term_level(k) == lvl}
      curr_keys.map do |term_expand|
        to_infer = []
        # Obtain childs
        childs = self.get_descendants(term_expand, true).select{|t| @items.keys.include?(t)}
        # Expand
        if childs.length > 0 && minimum_childs == 1 # Special case
          to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
        elsif childs.length >= minimum_childs
          to_infer = Hash.new(0)
          # Compare
          while childs.length > 1
            curr_term = childs.shift
            childs.each do |compare_term|
              pivot_items = @items[curr_term]
              compare_items = @items[compare_term]
              if ontology.nil? # Exact match
                pivot_items.map do |pitem|
                  if compare_items.include?(pitem)
                    to_infer[pitem] += 2
                  end
                end
              else # Find MICAs
                local_infer = Hash.new(0)
                pivot_items.map do |pitem|
                  micas = compare_items.map{|citem| ontology.get_MICA(pitem, citem)}
                  maxmica = micas[0]
                  micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
                  local_infer[maxmica.first] += 1
                end
                compare_items.map do |citem|
                  micas = pivot_items.map{|pitem| ontology.get_MICA(pitem, citem)}
                  maxmica = micas[0]
                  micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
                  local_infer[maxmica.first] += 1
                end
                local_infer.each{|t, freq| to_infer[t] += freq if freq >= 2}
              end
            end
          end
          # Filter inferences
          to_infer = to_infer.select{|k, v| v >= minimum_childs}
        end
        # Infer
        if to_infer.length > 0
          @items[term_expand] = [] if @items[term_expand].nil?
          if to_infer.kind_of?(Array)
            @items[term_expand] = (@items[term_expand] + to_infer).uniq
          else
            @items[term_expand] = (@items[term_expand] + to_infer.keys).uniq
          end
          @items[term_expand] = ontology.clean_profile(@items[term_expand]) if clean_profiles && !ontology.nil?
        elsif !@items.include?(term_expand)
          targetKeys.delete(term_expand)
        end
      end
    end
  end
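
  # ===== Usage sketch (illustrative; all IDs and item names are hypothetical,
  # and term levels are assumed to be already calculated)
  # With minimum_childs: 2 and no ontology given, an item shared by two child
  # terms scores 2 per matching pair and is imputed to the parent:
  #   onto.load_item_relations_to_terms({:"HP:x" => ['geneA'], :"HP:y" => ['geneA']})
  #   onto.expand_items_to_parentals
  #   onto.items[:"HP:parent_of_x_and_y"] # now includes 'geneA'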


  # NO IDEA WHAT THIS DOES. DON'T USE: THIS METHOD IS NOT CHECKED
  # ===== Parameters
  # +external_item_list+:: external item list to be compared against the stored item relations
  # +mode+:: comparison mode (only :elim is handled)
  # +threshold+:: p-value threshold
  # ===== Returns
  # an array of [term, p-value] pairs
  def compute_relations_to_items(external_item_list, mode, threshold)
    results = []
    penalized_terms = {}
    # terms_levels = get_terms_levels(@items_relations.keys)
    terms_with_items_levels = @items_relations.keys.map{|term| self.get_term_level(term)}.uniq
    terms_levels = self.get_ontology_levels().select{|k, v| terms_with_items_levels.include?(k)}
    terms_levels = terms_levels.map{|level, terms| [level, terms.select{|t| @items_relations.keys.include?(t)}]}.to_h # Use only items terms. MAYBE IT'S NOT OUR TARGET (line added by fmj)
    levels = terms_levels.keys.sort
    levels.reverse_each do |level|
      terms_levels[level].each do |term|
        associated_items = @items_relations[term]
        if mode == :elim
          items_to_remove = penalized_terms[term]
          items_to_remove = [] if items_to_remove.nil?
          pval = get_fisher_exact_test(
            external_item_list - items_to_remove,
            associated_items - items_to_remove,
            ((associated_items | external_item_list) - items_to_remove).length
          )
          if pval <= threshold
            parents = get_parents(term) # Save the items for each parent term to remove them later in the Fisher test
            parents.each do |prnt|
              query = penalized_terms[prnt]
              if query.nil?
                penalized_terms[prnt] = @items_relations[term].clone # We need a new array to store the following iterations
              else
                query.concat(@items_relations[term])
              end
            end
          end
        end
        results << [term, pval]
      end
    end
    return results
  end


  # Check if a given ID is a removable (blacklist) term.
  # +DEPRECATED+ use is_removable? instead
  # ===== Parameters
  # +id+:: ID to be checked
  # ===== Returns
  # true if the given term is a removable (blacklisted) term, false otherwise
  def is_removable(id)
    warn "[DEPRECATION] `is_removable` is deprecated. Please use `is_removable?` instead."
    return @removable_terms.include?(id.to_sym)
  end

  # Check if a given ID is a removable (blacklist) term
  # ===== Parameters
  # +id+:: ID to be checked
  # ===== Returns
  # true if the given term is a removable (blacklisted) term, false otherwise
  def is_removable?(id)
    return @removable_terms.include?(id.to_sym)
  end

  #############################################
  # SPECIAL METHODS
  #############################################
  def ==(other)
    self.header == other.header &&
    self.stanzas == other.stanzas &&
    self.ancestors_index == other.ancestors_index &&
    self.alternatives_index == other.alternatives_index &&
    self.obsoletes_index == other.obsoletes_index &&
    self.structureType == other.structureType &&
    self.ics == other.ics &&
    self.meta == other.meta &&
    self.dicts == other.dicts &&
    self.profiles == other.profiles &&
    self.profilesDict == other.profilesDict &&
    (self.items.keys - other.items.keys).empty? &&
    self.removable_terms == other.removable_terms &&
    self.special_tags == other.special_tags &&
    self.items == other.items &&
    self.term_paths == other.term_paths &&
    self.max_freqs == other.max_freqs
  end


  def clone
    copy = Ontology.new
    copy.header = self.header.clone
    copy.stanzas[:terms] = self.stanzas[:terms].clone
    copy.stanzas[:typedefs] = self.stanzas[:typedefs].clone
    copy.stanzas[:instances] = self.stanzas[:instances].clone
    copy.ancestors_index = self.ancestors_index.clone
    copy.descendants_index = self.descendants_index.clone
    copy.alternatives_index = self.alternatives_index.clone
    copy.obsoletes_index = self.obsoletes_index.clone
    copy.structureType = self.structureType.clone
    copy.ics = self.ics.clone
    copy.meta = self.meta.clone
    copy.dicts = self.dicts.clone
    copy.profiles = self.profiles.clone
    copy.profilesDict = self.profilesDict.clone
    copy.items = self.items.clone
    copy.removable_terms = self.removable_terms.clone
    copy.term_paths = self.term_paths.clone
    copy.max_freqs = self.max_freqs.clone
    return copy
  end


  #############################################
  # ACCESS CONTROL
  #############################################

  attr_accessor :file, :header, :stanzas, :ancestors_index, :descendants_index, :special_tags, :alternatives_index, :obsoletes_index, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :profilesDict, :items, :removable_terms, :term_paths
end