semtools 0.1.6 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +4 -1
- data/README.md +2 -0
- data/bin/semtools.rb +521 -0
- data/bin/strsimnet.rb +1 -2
- data/external_data/ontologies.txt +4 -0
- data/lib/semtools/ontology.rb +1241 -2002
- data/lib/semtools/parsers/file_parser.rb +32 -0
- data/lib/semtools/parsers/json_parser.rb +84 -0
- data/lib/semtools/parsers/oboparser.rb +511 -0
- data/lib/semtools/sim_handler.rb +1 -1
- data/lib/semtools/version.rb +1 -1
- data/lib/semtools.rb +3 -1
- data/semtools.gemspec +3 -1
- metadata +40 -6
- data/lib/semtools/math_methods.rb +0 -148
data/lib/semtools/ontology.rb
CHANGED
@@ -1,3 +1,4 @@
+require 'expcalc'
 require 'json'
 require 'colorize'

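Note: the new require above pulls in the expcalc gem, which lines up with the dependency bumps listed for data/Gemfile and semtools.gemspec in the file summary. A minimal sketch of the matching Gemfile entry (hypothetical; the exact version constraint is not visible in this diff):

    # Gemfile — hedged sketch; pin a version in real use
    gem 'expcalc'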
@@ -7,44 +8,29 @@ class Ontology
 # AUTHOR NOTES
 #########################################################

-# 1 - Store @profiles as @stanzas[:instances]
 # 2 - Items values (not keys) are imported as strings, not as symbols (maybe add a flag which indicates if values are, or not, symbols?)

-
 #############################################
 # FIELDS
 #############################################
-# Handled class variables
-# => @@basic_tags :: hash with main OBO structure tags
-# => @@allowed_calcs :: hash with allowed ICs and similaritites calcs
-# => @@symbolizable_ids :: tags which can be symbolized
-# => @@tags_with_trailing_modifiers :: tags which can include extra info after specific text modifiers
-#
 # Handled object variables
-# => @
-# => @stanzas :: OBO stanzas {:terms,:typedefs,:instances}
+# => @terms :: OBO terms descriptions
 # => @ancestors_index :: hash of ancestors per each term handled with any structure relationships
 # => @descendants_index :: hash of descendants per each term handled with any structure relationships
 # => @alternatives_index :: has of alternative IDs (include alt_id and obsoletes)
-# => @obsoletes_index :: hash of obsoletes and it's new ids
-# => @special_tags :: set of special tags to be expanded (:is_a, :obsolete, :alt_id)
 # => @structureType :: type of ontology structure depending on ancestors relationship. Allowed: {atomic, sparse, circular, hierarchical}
-# => @ics :: already calculated ICs for handled terms and IC types
-# => @meta :: meta_information about handled terms like [ancestors, descendants, struct_freq, observed_freq]
-# => @max_freqs :: maximum freqs found for structural and observed freqs
 # => @dicts :: bidirectional dictionaries with three levels <key|value>: 1º) <tag|hash2>; 2º) <(:byTerm/:byValue)|hash3>; 3º) dictionary <k|v>
-# => @profiles :: set of terms assigned to an ID
-# => @profilesDict :: set of profile IDs assigned to a term
-# => @items :: hash with items relations to terms
 # => @removable_terms :: array of terms to not be considered
+# => @meta :: meta_information about handled terms like [ancestors, descendants, struct_freq, observed_freq]
+# => @ics :: already calculated ICs for handled terms and IC types
 # => @term_paths :: metainfo about parental paths of each term
+# => @max_freqs :: maximum freqs found for structural and observed freqs
+# => @items :: hash with items relations to terms
+# => @profiles :: set of terms assigned to an ID

-@@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:replaced_by,:consider,:alt_id]}
 @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
-
-
-@@multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider, :subsetdef, :synonymtypedef, :property_value, :remark]
-@@symbolizable_ids.concat(@@tags_with_trailing_modifiers)
+
+attr_accessor :terms, :ancestors_index, :descendants_index, :alternatives_index, :obsoletes, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :items, :term_paths, :reroot

 #############################################
 # CONSTRUCTOR
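With @stanzas and the class-level tag tables removed, term data and all derived indexes are now reachable through the attr_accessor list above. A short usage sketch (hypothetical file name; attribute semantics as documented in the comments of this hunk):

    ont = Ontology.new(file: 'sample.obo')
    ont.terms.keys.first   # term descriptions, keyed by term id
    ont.obsoletes          # replaces the removed @obsoletes_index
    ont.structureType      # :atomic, :sparse, :circular or :hierarchical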
@@ -57,265 +43,138 @@ class Ontology
 # +removable_terms+: term to be removed from calcs
 # +build+: flag to launch metainfo calculation
 # +file_format+: force format type despite file extension. Can be :obo or :json
-def initialize(file: nil, load_file: false, removable_terms: [], build: true, file_format: nil)
-
-@header = nil
-@stanzas = {terms: {}, typedefs: {}, instances: {}}
+def initialize(file: nil, load_file: false, removable_terms: [], build: true, file_format: nil, extra_dicts: [])
+@terms = {}
 @ancestors_index = {}
 @descendants_index = {}
 @alternatives_index = {}
-@
+@obsoletes = {} # id is obsolete but it could or not have an alt id
 @structureType = nil
 @ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
 @meta = {}
-@special_tags = @@basic_tags.clone
 @max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
 @dicts = {}
 @profiles = {}
-@profilesDict = {}
 @items = {}
-@removable_terms = []
 @term_paths = {}
-
+@reroot = false
 load_file = true unless file.nil? # This should remove load_file argument, keep it for old scripts
 # Load if proceeds
 if load_file
 fformat = file_format
 fformat = File.extname(file) if fformat.nil? && !file.nil?
 if fformat == :obo || fformat == ".obo"
-load(file, build: build)
+OboParser.load(self, file, build: build, black_list: removable_terms, extra_dicts: extra_dicts)
 elsif fformat == :json || fformat == ".json"
-
+JsonParser.load(self, file, build: build)
 elsif !fformat.nil?
 warn 'Format not allowed. Loading process will not be performed'
 end
+precompute if build
 end
 end

-
 #############################################
-#
+# GENERATE METADATA FOR ALL TERMS
 #############################################

-
-
-
-# +Note+: we extremly recomend use get_related_ids_by_tag function instead of it (directly)
-# ===== Parameters
-# +start+:: term where start to expand
-# +terms+:: set to be used to expand
-# +target_tag+:: tag used to expand
-# +eexpansion+:: already expanded info
-# +split_info_char+:: special regex used to split info (if it is necessary)
-# +split_info_indx+:: special index to take splitted info (if it is necessary)
-# +alt_ids+:: set of alternative IDs
-# ===== Returns
-# A vector with the observed structure (string) and the array with extended terms.
-def self.get_related_ids(start_id, terms, target_tag, related_ids = {}, alt_ids = {})
-# Take start_id term available info and already accumulated info
-current_associations = related_ids[start_id]
-current_associations = [] if current_associations.nil?
-return [:no_term,[]] if terms[start_id].nil?
-id_relations = terms[start_id][target_tag]
-return [:source,[]] if id_relations.nil?
-
-# Prepare auxiliar variables
-struct = :hierarchical
-
-# Study direct extensions
-id_relations = id_relations.clone
-while id_relations.length > 0
-id = id_relations.shift
-id = alt_ids[id].first if alt_ids.include?(id) # NOTE: if you want to persist current ID instead source ID, re-implement this
-
-# Handle
-if current_associations.include?(id) # Check if already have been included into this expansion
-struct = :circular
-else
-current_associations << id
-if related_ids.include?(id) # Check if current already has been expanded
-current_associations = current_associations | related_ids[id]
-if current_associations.include?(start_id) # Check circular case
-struct = :circular
-[id, start_id].each{|repeated| current_associations.delete(repeated)}
-end
-else # Expand
-related_ids[start_id] = current_associations
-structExp, current_related_ids = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids) # Expand current
-current_associations = current_associations | current_related_ids
-struct = :circular if structExp == :circular # Check struct
-if current_associations.include?(start_id) # Check circular case
-struct = :circular
-current_associations.delete(start_id)
-end
-end
-end
-end
-related_ids[start_id] = current_associations
-
-return struct, current_associations
-end
-
-
-# Expand terms using a specific tag and return all extended terms into an array and
-# the relationship structuture observed (hierarchical or circular). If circular structure is
-# foumd, extended array will be an unique vector without starting term (no loops)
-# ===== Parameters
-# +terms+:: set to be used to expand
-# +target_tag+:: tag used to expand
-# +split_info_char+:: special regex used to split info (if it is necessary)
-# +split_info_indx+:: special index to take splitted info (if it is necessary)
-# +alt_ids+:: set of alternative IDs
-# +obsoletes+:: integer with the number of obsolete IDs. used to calculate structure type.
-# ===== Returns
-# A vector with the observed structure (string) and the hash with extended terms
-def self.get_related_ids_by_tag(terms:,target_tag:, alt_ids: {}, obsoletes: 0)
-# Define structure type
-structType = :hierarchical
-related_ids = {}
-terms.each do |id, tags|
-# Check if target tag is defined
-if !tags[target_tag].nil?
-# Obtain related terms
-set_structure, _ = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids)
-# Check structure
-structType = :circular if set_structure == :circular
-end
-end
-
-# Check special case
-structType = :atomic if related_ids.length <= 0
-structType = :sparse if related_ids.length > 0 && ((terms.length - related_ids.length - obsoletes) >= 2)
-# Return type and hash with related_ids
-return structType, related_ids
+def precompute
+get_index_frequencies
+calc_term_levels(calc_paths: true)
 end

-
-# Class method to transform string with <tag : info> into hash structure
-# ===== Parameters
-# +attributes+:: array tuples with info to be transformed into hash format
+# Calculates regular frequencies based on ontology structure (using parentals)
 # ===== Returns
-#
-def
-
-
-
-
-
-
-
-
-tag = tag.lstrip.to_sym
-value.lstrip!
-value = value.split(split_char)[selected_field].to_sym if @@tags_with_trailing_modifiers.include?(tag)
-
-# Store
-query = info_hash[tag]
-if !query.nil? # Tag already exists
-if !query.kind_of?(Array) # Check that tag is multivalue
-raise('Attempt to concatenate plain text with another. The tag is not declared as multivalue. [' + tag.to_s + '](' + query + ')')
-else
-query << value # Add new value to tag
-end
-else # New entry
-if @@multivalue_tags.include?(tag)
-info_hash[tag] = [value]
-else
-info_hash[tag] = value
+# true if everything end without errors and false in other cases
+def get_index_frequencies() # Per each term, add frequencies
+if @ancestors_index.empty?
+warn('ancestors_index object is empty')
+else
+each(att = true) do |id, tags|
+query = @meta[id]
+if query.nil?
+query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
+@meta[id] = query
 end
+query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].length.to_f : 0.0
+query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].length.to_f : 0.0
+query[:struct_freq] = query[:descendants] + 1.0
+# Update maximums
+@max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
+@max_freqs[:max_depth] = query[:descendants] if @max_freqs[:max_depth] < query[:descendants]
 end
 end
-self.symbolize_ids(info_hash)
-return info_hash
 end

-
-# Class method to load an OBO format file (based on OBO 1.4 format). Specially focused on load
-# the Header, the Terms, the Typedefs and the Instances.
+# Calculates ontology structural levels for all ontology terms
 # ===== Parameters
-# +
-#
-
-
-
-
-
-
-
-
-
-
-# Read file
-File.open(file).each do |line|
-line.chomp!
-next if line.empty?
-fields = line.split(':', 2)
-# Check if new instance is found
-if stanzas_flags.include?(line)
-header = self.process_entity(header, infoType, stanzas, currInfo)
-# Update info variables
-currInfo = []
-infoType = line.gsub!(/[\[\]]/, '')
-next
+# +calc_paths+:: calculates term paths if it's not already calculated
+# +shortest_path+:: if true, level is calculated with shortest path, largest path will be used in other cases
+def calc_term_levels(calc_paths: false, shortest_path: true)
+self.calc_term_paths if @term_paths.empty? && calc_paths
+if !@term_paths.empty?
+byTerm = {}
+byValue = {}
+@term_paths.each do |term, info|
+level = shortest_path ? info[:shortest_path] : info[:largest_path]
+level = level.nil? ? -1 : level.round(0)
+byTerm[term] = level
+add2hash(byValue, level, term)
 end
-#
-
+@dicts[:level] = {byTerm: byValue, byValue: byTerm} # Note: in this case, value has multiplicity and term is unique value
+@max_freqs[:max_depth] = byValue.keys.max # Update maximum depth
 end
-# Store last loaded info
-header = self.process_entity(header, infoType, stanzas, currInfo) if !currInfo.empty?
-
-# Prepare to return
-finfo = {:file => file, :name => File.basename(file, File.extname(file))}
-return finfo, header, stanzas
 end

-
-#
-
-
-
-
-
-
-
-
-
-
-
-header = info
-else
-id = info[:id]
-case infoType
-when 'Term'
-stanzas[:terms][id] = info
-when 'Typedef'
-stanzas[:typedefs][id] = info
-when 'Instance'
-stanzas[:instances][id] = info
+# Find paths of a term following it ancestors and stores all possible paths for it and it's parentals.
+# Also calculates paths metadata and stores into @term_paths
+def calc_term_paths
+@term_paths = {}
+if [:hierarchical, :sparse].include? @structureType
+each do |term|
+expand_path(term)
+path_attr = @term_paths[term]
+# expand_path is arecursive function so these pat attributes must be calculated once the recursion is finished
+path_attr[:total_paths] = path_attr[:paths].length
+paths_sizes = path_attr[:paths].map{|path| path.length}
+path_attr[:largest_path] = paths_sizes.max
+path_attr[:shortest_path] = paths_sizes.min
 end
+else
+warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
 end
-return header
 end

-
-# Symboliza all values into hashs using symbolizable tags as keys
+# Recursive function whic finds paths of a term following it ancestors and stores all possible paths for it and it's parentals
 # ===== Parameters
-# +
-
-
-
-
-
-
-
-
+# +curr_term+:: current visited term
+# +visited_terms+:: already expanded terms
+def expand_path(curr_term)
+if !@term_paths.include?(curr_term)
+path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []}
+@term_paths[curr_term] = path_attr
+direct_parentals = @dicts[:is_a][:byTerm][curr_term]
+if direct_parentals.nil? # No parents :: End of recurrence
+path_attr[:paths] << [curr_term]
+else # Expand and concat
+direct_parentals.each do |ancestor|
+path_attr_parental = @term_paths[ancestor]
+if path_attr_parental.nil? # Calculate new paths
+self.expand_path(ancestor)
+new_paths = @term_paths[ancestor][:paths]
+else # Use direct_parental paths already calculated
+new_paths = path_attr_parental[:paths]
+end
+path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(curr_term)})
 end
 end
 end
 end

+#############################################
+# CLASS METHODS (TODO: TO BE TRANFORMED IN INSTANCE METHODS)
+#############################################

-#
 # ===== Parameters
 # +root+:: main term to expand
 # +ontology+:: to be cutted
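Parsing now lives in the new OboParser and JsonParser classes (see data/lib/semtools/parsers/ in the file summary), and metadata is built through the precompute, get_index_frequencies and calc_term_levels methods shown above. A hedged sketch of the updated entry point (file path and removable term are hypothetical):

    # Format is inferred from the extension; build: true triggers precompute
    ont = Ontology.new(file: 'hp.obo', build: true, removable_terms: [:'HP:0000001'])
    # which internally delegates roughly to:
    # OboParser.load(ont, 'hp.obo', build: true, black_list: [:'HP:0000001'], extra_dicts: [])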
@@ -323,18 +182,32 @@ class Ontology
 # +remove_up+:: if true, stores only the root term given an it descendants. If false, only root ancestors will be stored
 # ===== Returns
 # An Ontology object with terms after cut the ontology.
-def self.mutate(root, ontology, clone: true, remove_up: true)
+def self.mutate(root, ontology, clone: true, remove_up: true) #TODO, pending to fix and pass to instance method
 ontology = ontology.clone if clone
 # Obtain affected IDs
 descendants = ontology.descendants_index[root]
 descendants << root # Store itself to do not remove it
 # Remove unnecesary terms
-
+terms = ontology.terms.select{|id,v| remove_up ? descendants.include?(id) : !descendants.include?(id)}
+ids = terms.keys
+terms.each do |id, term|
+term[:is_a] = term[:is_a] & ids # Clean parental relations to keep only whose that exist between selected terms
+end
+ontology.terms = terms
 ontology.ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
 ontology.max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
 ontology.dicts = {}
-ontology.removable_terms = []
 ontology.term_paths = {}
+ontology.reroot = true
+
+ontology.ancestors_index = {}
+ontology.descendants_index = {}
+ontology.alternatives_index = {}
+ontology.meta = {}
+ontology.profiles = {}
+ontology.items = {}
+
+
 # Recalculate metadata
 ontology.build_index
 ontology.add_observed_terms_from_profiles
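Ontology.mutate now filters @terms directly and resets every derived index before rebuilding; the added #TODO marks it as pending a rewrite into an instance method, so the following sketch (hypothetical root term) should be treated as provisional:

    sub_ont = Ontology.mutate(:'HP:0000118', ont, clone: true, remove_up: true)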
@@ -342,33 +215,13 @@ class Ontology
 return ontology
 end

-
-
 #############################################
-#
+# TERM METHODS
 #############################################

-#
-
-# +terms+:: terms array to be concatenated
-def add_removable_terms(terms)
-terms = terms.map{|term| term.to_sym}
-@removable_terms.concat(terms)
-end
-
-
-# Include removable terms to current removable terms list loading new
-# terms from a one column plain text file
-# ===== Parameters
-# +file+:: to be loaded
-def add_removable_terms_from_file(file)
-File.open(excluded_codes_file).each do |line|
-line.chomp!
-@removable_terms << line.to_sym
-end
-end
+# I/O observed term from data
+####################################

-
 # Increase observed frequency for a specific term
 # ===== Parameters
 # +term+:: term which frequency is going to be increased
@@ -376,15 +229,7 @@ class Ontology
 # ===== Return
 # true if process ends without errors, false in other cases
 def add_observed_term(term:,increase: 1.0)
-
-raise ArgumentError, "Term given is NIL" if term.nil?
-return false unless @stanzas[:terms].include?(term)
-return false if @removable_terms.include?(term)
-if @alternatives_index.include?(term)
-alt_id = @alternatives_index[term]
-@meta[alt_id] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[alt_id].nil?
-@meta[term] = @meta[alt_id]
-end
+return false unless term_exist?(term)
 # Check if exists
 @meta[term] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[term].nil?
 # Add frequency
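The alternative-ID bookkeeping is gone from add_observed_term; unknown ids are now rejected up front by the term_exist? guard. A minimal sketch (hypothetical term ids):

    ont.add_observed_term(term: :'HP:0000118', increase: 1.0)  # => true
    ont.add_observed_term(term: :no_such_term)                 # => false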
@@ -395,345 +240,199 @@ class Ontology
 return true
 end

+# Obtain level and term relations
+####################################

-# Increase the arbitrary frequency of a given term set
 # ===== Parameters
-# +
-# +
-#
-#
-
-
-
-
-
-
-
-
-
-checks = terms.map{|id| self.add_observed_term(term: id,increase: increase)}
+# +term+:: which are requested
+# +relation+:: can be :ancestor or :descendant
+# ===== Returns
+# Direct ancestors/descendants of given term or nil if any error occurs
+def get_direct_related(term, relation)
+target = nil
+case relation
+when :ancestor
+target = :byTerm
+when :descendant
+target = :byValue
+else
+warn('Relation type not allowed. Returning nil')
 end
-
+query = @dicts.dig(:is_a, target, term)
+return query
 end

-
-#
+# Return direct ancestors/descendants of a given term
+# Return direct ancestors of a given term
 # ===== Parameters
-# +
-#
-#
-
-
-# ===== Return
-# similitude calculated
-def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true, store_mica: false)
-# Check
-raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
-raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
-micasA = []
-# Compare A -> B
-termsA.each do |tA|
-micas = []
-termsB.each do |tB|
-if store_mica
-value = @mica_index.dig(tA, tB)
-else
-value = nil
-end
-if value.nil?
-value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)
-if store_mica
-value = true if value.nil? # We use true to save that the operation was made but there is not mica value
-add2nestHash(@mica_index, tA, tB, value)
-end
-end
-micas << value if value.class == Float
-end
-if !micas.empty?
-micasA << micas.max # Obtain maximum value
-else
-micasA << 0
-end
-end
-means_sim = micasA.inject{ |sum, el| sum + el }.fdiv(micasA.size)
-# Compare B -> A
-if bidirectional
-means_simA = means_sim * micasA.size
-means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false, store_mica: store_mica) * termsB.size
-means_sim = (means_simA + means_simB).fdiv(termsA.size + termsB.size)
-end
-# Return
-return means_sim
+# +term+:: which ancestors are requested
+# ===== Returns
+# Direct ancestors of given term or nil if any error occurs
+def get_direct_ancentors(term)
+return self.get_direct_related(term, :ancestor)
 end

-
-
-
-
-
-
-
+# Return direct descendants of a given term
+# ===== Parameters
+# +term+:: which descendants are requested
+# ===== Returns
+# Direct descendants of given term or nil if any error occurs
+def get_direct_descendants(term)
+return self.get_direct_related(term, :descendant)
 end

-#
+# Find ancestors/descendants of a given term
 # ===== Parameters
-# +
-# +
-#
-#
-
-
-
-
-profiles_ids = @profiles.keys
-if external_profiles.nil?
-comp_ids = profiles_ids
-comp_profiles = @profiles
-main_ids = comp_ids
-main_profiles = comp_profiles
+# +term+:: to be checked
+# +return_ancestors+:: return ancestors if true or descendants if false
+# ===== Returns
+# an array with all ancestors/descendants of given term or nil if parents are not available yet
+def get_familiar(term, return_ancestors = true)
+familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
+if !familiars.nil?
+familiars = familiars.clone
 else
-
-comp_profiles = external_profiles
-main_ids = profiles_ids
-main_profiles = @profiles
-end
-# Compare
-@mica_index = {}
-while !main_ids.empty?
-curr_id = main_ids.shift
-current_profile = main_profiles[curr_id]
-comp_ids.each do |id|
-profile = comp_profiles[id]
-value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional, store_mica: true)
-query = profiles_similarity[curr_id]
-if query.nil?
-profiles_similarity[curr_id] = {id => value}
-else
-query[id] = value
-end
-end
+familiars = []
 end
-return
+return familiars
 end

+# Find ancestors of a given term
+# ===== Parameters
+# +term+:: to be checked
+# ===== Returns
+# an array with all ancestors of given term or false if parents are not available yet
+def get_ancestors(term)
+return self.get_familiar(term, true)
+end

-#
+# Find descendants of a given term
 # ===== Parameters
-# +
+# +term+:: to be checked
 # ===== Returns
-#
-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# an array with all descendants of given term or false if parents are not available yet
+def get_descendants(term)
+return self.get_familiar(term, false)
+end
+
+# Gets ontology level of a specific term
+# ===== Returns
+# Term level
+def get_term_level(term)
+return @dicts[:level][:byValue][term]
+end
+
+# nil, term not found, [] term exists but not has parents
+def get_parental_path(term, which_path = :shortest_path, level = 0)
+path = nil
+path_attr = @term_paths[term]
+if !path_attr.nil?
+path_length = path_attr[which_path]
+all_paths = path_attr[:paths]
+if all_paths.empty?
+path = []
+else
+path = all_paths.select{|pt| pt.length == path_length}.first.clone
+if level > 0 # we want the term and his ascendants until a specific level
+n_parents = path_length - level
+path = path[0..n_parents]
 end
+path.shift # Discard the term itself
 end
 end
-
+return path
 end

+# ID Handlers
+####################################

-# Executes basic expansions of tags (alternatives, obsoletes and parentals) with default values
 # ===== Returns
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-self.calc_dictionary(:synonym, select_regex: /\"(.*)\"/)
-self.calc_term_levels(calc_paths: true)
+# the main ID assigned to a given ID. If it's a non alternative/obsolete ID itself will be returned
+# ===== Parameters
+# +id+:: to be translated
+# ===== Return
+# main ID related to a given ID. Returns nil if given ID is not an allowed ID
+def get_main_id(id)
+mainID = @alternatives_index[id]
+return nil if !term_exist?(id) && mainID.nil?
+if !mainID.nil? # Recursive code to get the definitive final term id if there are several alt_id in chain
+new_id = get_main_id(mainID)
+if new_id != mainID
+new_id = get_main_id(new_id)
+end
+id = new_id
+end
+return id
 end

-
-#
-#
-#
-
-
-
-
-
-
-
-if @alternatives_index.include?(id)
-alt_id = @alternatives_index[id]
-query = @meta[alt_id] # Check if exist
-if query.nil?
-query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
-@meta[alt_id] = query
-end
-@meta[id] = query
-# Note: alternative terms do not increase structural frequencies
-else # Official term
-query = @meta[id] # Check if exist
-if query.nil?
-query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
-@meta[id] = query
-end
-# Store metadata
-query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !@alternatives_index.include?(anc)}.to_f : 0.0
-query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !@alternatives_index.include?(desc)}.to_f : 0.0
-query[:struct_freq] = query[:descendants] + 1.0
-# Update maximums
-@max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
-@max_freqs[:max_depth] = query[:descendants] if @max_freqs[:max_depth] < query[:descendants]
-end
-end
-end
+# Translate a given value using an already calcualted dictionary
+# ===== Parameters
+# +toTranslate+:: value to be translated using dictiontionary
+# +tag+:: used to generate the dictionary
+# +byValue+:: boolean flag to indicate if dictionary must be used values as keys or terms as keys. Default: values as keys = true
+# ===== Return
+# translation
+def translate(toTranslate, tag, byValue: true)
+dict = byValue ? @dicts[tag][:byValue] : @dicts[tag][:byTerm]
+toTranslate = get_main_id(toTranslate) if !byValue
+return dict[toTranslate]
 end

-
-# Expand obsoletes set and link info to their alternative IDs
+# Translate a name given
 # ===== Parameters
-# +
-#
-#
-
-
-
-
-warn('stanzas terms empty')
-else
-# Check obsoletes
-@stanzas[:terms].each do |id, term_tags|
-next if term_tags.nil?
-next if self.is_alternative?(id)
-query = term_tags[obs_tag]
-if !query.nil? && query == 'true' # Obsolete tag presence
-next if !@obsoletes_index[id].nil? # Already stored
-# Check if alternative value is available
-alt_ids = alt_tags.map{|alt| term_tags[alt]}.compact
-if !alt_ids.empty?
-alt_id = alt_ids.first.first #FIRST tag, FIRST id
-# Store
-@alternatives_index[id] = alt_id
-@obsoletes_index[id] = alt_id
-end
-end
-end
-end
+# +name+:: to be translated
+# ===== Return
+# translated name or nil if it's not stored into this ontology
+def translate_name(name)
+term = self.translate(name, :name)
+term = self.translate(name, :synonym) if term.nil?
+return term
 end

-
-# Expand parentals set and link all info to their alternative IDs. Also launch frequencies process
+# Translates a given ID to it assigned name
 # ===== Parameters
-# +
-#
-#
-
-
-
-# Check
-if @stanzas[:terms].nil?
-warn('stanzas terms empty')
-else
-# Expand
-structType, parentals = self.class.get_related_ids_by_tag(terms: @stanzas[:terms],
-target_tag: tag,
-alt_ids: @alternatives_index,
-obsoletes: @obsoletes_index.length)
-# Check
-raise('Error expanding parentals') if (structType.nil?) || parentals.nil?
-# Prepare ancestors structure
-anc = {}
-des = {}
-parentals.each do |id, parents|
-parents = parents - @removable_terms
-anc[id] = parents
-parents.each do |anc_id| # Add descendants
-if !des.include?(anc_id)
-des[anc_id] = [id]
-else
-des[anc_id] << id
-end
-end
-end
-# Store alternatives
-# @alternatives_index.each do |id,alt|
-# anc[id] = anc[alt] if anc.include?(alt)
-# des[id] = des[alt] if des.include?(alt)
-# end
-# Check structure
-if ![:atomic,:sparse].include? structType
-structType = structType == :circular ? :circular : :hierarchical
-end
-# Store
-@ancestors_index = anc
-@descendants_index = des
-@structureType = structType
-end
-# Finish
+# +id+:: to be translated
+# ===== Return
+# main name or nil if it's not included into this ontology
+def translate_id(id)
+name = self.translate(id, :name, byValue: false)
+return name.nil? ? nil : name.first
 end

+# Get term frequency and information
+####################################

-#
+# One single term #
+
+# Get a term frequency
 # ===== Parameters
-# +term+:: to be checked
-# +
+# +term+:: term to be checked
+# +type+:: type of frequency to be returned. Allowed: [:struct_freq, :observed_freq]
 # ===== Returns
-#
-def
-
+# frequency of term given or nil if term is not allowed
+def get_frequency(term, type: :struct_freq)
+queryFreq = @meta[term]
+return queryFreq.nil? ? nil : queryFreq[type]
 end

-
-# Find descendants of a given term
+# Geys structural frequency of a term given
 # ===== Parameters
 # +term+:: to be checked
-# +filter_alternatives+:: if true, remove alternatives from final results
 # ===== Returns
-#
-def
-return self.
+# structural frequency of given term or nil if term is not allowed
+def get_structural_frequency(term)
+return self.get_frequency(term, type: :struct_freq)
 end

-
-# Find ancestors/descendants of a given term
+# Gets observed frequency of a term given
 # ===== Parameters
 # +term+:: to be checked
-# +return_ancestors+:: return ancestors if true or descendants if false
-# +filter_alternatives+:: if true, remove alternatives from final results
 # ===== Returns
-#
-def
-
-familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
-if !familiars.nil?
-familiars = familiars.clone
-if filter_alternatives
-familiars.reject!{|fm| @alternatives_index.include?(fm)}
-end
-else
-familiars = []
-end
-return familiars
+# observed frequency of given term or nil if term is not allowed
+def get_observed_frequency(term)
+return self.get_frequency(term, type: :observed_freq)
 end

-
 # Obtain IC of an specific term
 # ===== Parameters
 # +term+:: which IC will be calculated
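The hunk above swaps the removed bulk-expansion and profile-comparison code for small instance helpers that read the precomputed dictionaries and indexes. A usage sketch (hypothetical term values; note that get_direct_ancentors ships with that spelling):

    ont.get_direct_ancentors(term)               # direct :is_a parents via @dicts
    ont.get_ancestors(term)                      # full ancestor closure
    ont.get_term_level(term)                     # level from @dicts[:level]
    ont.get_parental_path(term, :shortest_path)  # ancestors along one path
    id = ont.translate_name('Phenotypic abnormality')
    ont.translate_id(id) unless id.nil?
    ont.get_structural_frequency(term)           # descendants + 1, via @meta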
@@ -787,7 +486,7 @@ class Ontology
 ###########################################
 when :seco, :zhou # SECO:: An intrinsic information content metric for semantic similarity in WordNet
 # 1 - ( log(hypo(x) + 1) / log(max_nodes) )
-ic = 1 - Math.log10(term_meta[:struct_freq]).fdiv(Math.log10(@
+ic = 1 - Math.log10(term_meta[:struct_freq]).fdiv(Math.log10(@terms.length))
 if :zhou # New Model of Semantic Similarity Measuring in Wordnet
 # k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
 @ics[:seco][term] = ic # Special store
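The Seco IC now takes its node count from @terms.length. A worked instance of the commented formula 1 - (log(hypo(x) + 1) / log(max_nodes)), for a hypothetical term with 10 descendants (struct_freq = 11) in a 10,000-term ontology:

    ic = 1 - Math.log10(11).fdiv(Math.log10(10_000))  # => ~0.74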
@@ -801,40 +500,25 @@ class Ontology
 return ic
 end

+# Term vs Term #

-
-
-
-
-
-
-
-
-
-
-
-
-
-resnik = @ics[:resnik].select{|k,v| observed_terms.include?(k)}
-resnik_observed = @ics[:resnik_observed].select{|k,v| observed_terms.include?(k)}
+def get_LCA(termA, termB, lca_index: false)
+lca = []
+if lca_index
+res = @lca_index.dig(termA, termB)
+lca = [res] if !res.nil?
+else # Obtain ancestors (include itselfs too)
+anc_A = self.get_ancestors(termA)
+anc_B = self.get_ancestors(termB)
+if !(anc_A.empty? && anc_B.empty?)
+anc_A << termA
+anc_B << termB
+lca = anc_A & anc_B
+end
 end
-return
-end
-
-
-# Find the IC of the Most Index Content shared Ancestor (MICA) of two given terms
-# ===== Parameters
-# +termA+:: term to be cheked
-# +termB+:: term to be checked
-# +ic_type+:: IC formula to be used
-# ===== Returns
-# the IC of the MICA(termA,termB)
-def get_ICMICA(termA, termB, ic_type = :resnik)
-term, ic = self.get_MICA(termA, termB, ic_type)
-return term.nil? ? nil : ic
+return lca
 end

-
 # Find the Most Index Content shared Ancestor (MICA) of two given terms
 # ===== Parameters
 # +termA+:: term to be cheked
@@ -842,30 +526,31 @@ class Ontology
 # +ic_type+:: IC formula to be used
 # ===== Returns
 # the MICA(termA,termB) and it's IC
-def get_MICA(termA, termB, ic_type = :resnik)
-termA = @alternatives_index[termA] if @alternatives_index.include?(termA)
-termB = @alternatives_index[termB] if @alternatives_index.include?(termB)
+def get_MICA(termA, termB, ic_type = :resnik, lca_index = false)
 mica = [nil,-1.0]
-# Special case
-if termA.eql?(termB)
+if termA.eql?(termB) # Special case
 ic = self.get_IC(termA, type: ic_type)
 mica = [termA, ic]
-else
-#
-
-
-if !(anc_A.empty? && anc_B.empty?)
-anc_A << termA
-anc_B << termB
-(anc_A & anc_B).each do |anc| # Find MICA in shared ancestors
-ic = self.get_IC(anc, type: ic_type)
-mica = [anc,ic] if ic > mica[1]
-end
+else
+get_LCA(termA, termB, lca_index: lca_index).each do |lca| # Find MICA in shared ancestors
+ic = self.get_IC(lca, type: ic_type)
+mica = [lca, ic] if ic > mica[1]
 end
 end
 return mica
 end

+# Find the IC of the Most Index Content shared Ancestor (MICA) of two given terms
+# ===== Parameters
+# +termA+:: term to be cheked
+# +termB+:: term to be checked
+# +ic_type+:: IC formula to be used
+# ===== Returns
+# the IC of the MICA(termA,termB)
+def get_ICMICA(termA, termB, ic_type = :resnik)
+term, ic = self.get_MICA(termA, termB, ic_type)
+return term.nil? ? nil : ic
+end

 # Calculate similarity between two given terms
 # ===== Parameters
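get_MICA now iterates the set returned by the new get_LCA helper instead of intersecting ancestor lists inline, and with lca_index: true it reads a precomputed @lca_index (how that index gets populated is not shown in this excerpt). Sketch with hypothetical terms:

    shared   = ont.get_LCA(termA, termB)            # shared ancestors, terms included
    mica, ic = ont.get_MICA(termA, termB, :resnik)  # highest-IC shared ancestor
    ont.get_ICMICA(termA, termB, :resnik)           # just the IC, or nil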
@@ -875,11 +560,10 @@ class Ontology
 # +ic_type+:: IC formula to be used
 # ===== Returns
 # the similarity between both sets or false if frequencies are not available yet
-def get_similarity(termA, termB, type: :resnik, ic_type: :resnik)
-# Check
+def get_similarity(termA, termB, type: :resnik, ic_type: :resnik, lca_index: false)
 raise ArgumentError, "SIM type specified (#{type}) is not allowed" if !@@allowed_calcs[:sims].include?(type)
 sim = nil
-mica, sim_res = get_MICA(termA, termB, ic_type)
+mica, sim_res = get_MICA(termA, termB, ic_type, lca_index)
 if !mica.nil?
 case type
 when :resnik
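get_similarity only threads the new lca_index flag through to get_MICA; the :resnik, :lin and :jiang_conrath formulas from @@allowed_calcs are otherwise unchanged. Sketch (hypothetical terms):

    sim = ont.get_similarity(termA, termB, type: :lin, ic_type: :resnik)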
@@ -893,1568 +577,1027 @@ class Ontology
|
|
893
577
|
return sim
|
894
578
|
end
|
895
579
|
|
580
|
+
# Checking valid terms
|
581
|
+
####################################
|
896
582
|
|
897
|
-
|
898
|
-
|
899
|
-
# ===== Parameters
|
900
|
-
# +file+:: optional file to update object stored file
|
901
|
-
def load(file, build: true)
|
902
|
-
_, header, stanzas = self.class.load_obo(file)
|
903
|
-
@header = header
|
904
|
-
@stanzas = stanzas
|
905
|
-
self.remove_removable()
|
906
|
-
# @removable_terms.each{|removableID| @stanzas[:terms].delete(removableID)} if !@removable_terms.empty? # Remove if proceed
|
907
|
-
self.build_index() if build
|
583
|
+
def term_exist?(id)
|
584
|
+
return @terms.include?(id)
|
908
585
|
end
|
909
586
|
|
910
|
-
#
|
911
|
-
def
|
912
|
-
|
587
|
+
# Check if a term given is marked as obsolete
|
588
|
+
def is_obsolete?(term)
|
589
|
+
return @obsoletes.include?(term)
|
913
590
|
end
|
914
591
|
|
592
|
+
#############################################
|
593
|
+
# ITEMS METHODS
|
594
|
+
#############################################
|
915
595
|
|
916
|
-
#
|
596
|
+
# I/O Items
|
597
|
+
####################################
|
598
|
+
|
599
|
+
# Store specific relations hash given into ITEMS structure
|
917
600
|
# ===== Parameters
|
918
|
-
# +
|
919
|
-
|
920
|
-
|
921
|
-
|
922
|
-
|
923
|
-
|
924
|
-
|
925
|
-
|
926
|
-
|
927
|
-
|
928
|
-
|
929
|
-
|
930
|
-
|
931
|
-
|
932
|
-
|
933
|
-
|
934
|
-
|
935
|
-
items: @items,
|
936
|
-
removable_terms: @removable_terms,
|
937
|
-
term_paths: @term_paths}
|
938
|
-
# Convert to JSON format & write
|
939
|
-
File.open(file, "w") { |f| f.write obj_info.to_json }
|
940
|
-
end
|
601
|
+
# +relations+:: hash to be stored
|
602
|
+
# +remove_old_relations+:: substitute ITEMS structure instead of merge new relations
|
603
|
+
# +expand+:: if true, already stored keys will be updated with the unique union of both sets
|
604
|
+
def load_item_relations_to_terms(relations, remove_old_relations = false, expand = false)
|
605
|
+
@items = {} if remove_old_relations
|
606
|
+
relations.each do |term, items|
|
607
|
+
if !term_exist?(term)
|
608
|
+
warn('Some terms specified are not stored into this ontology. These not correct terms will be stored too')
|
609
|
+
break
|
610
|
+
end
|
611
|
+
end
|
612
|
+
if expand
|
613
|
+
@items = self.concatItems(@items, relations)
|
614
|
+
else
|
615
|
+
@items.merge!(relations)
|
616
|
+
end
|
617
|
+
end
|
941
618
|
|
619
|
+
# Defining Items from instance variables
|
620
|
+
########################################
|
942
621
|
|
943
|
-
|
944
|
-
|
622
|
+
# Assign a dictionary already calculated as a items set.
|
623
|
+
# ===== Parameters
|
624
|
+
# +dictID+:: dictionary ID to be stored (:byTerm will be used)
|
625
|
+
def set_items_from_dict(dictID, remove_old_relations = false)
|
626
|
+
@items = {} if remove_old_relations
|
627
|
+
query = @dicts[dictID]
|
628
|
+
if !query.nil?
|
629
|
+
@items.merge!(query[:byTerm])
|
630
|
+
else
|
631
|
+
warn('Specified ID is not calculated. Dict will not be added as a items set')
|
632
|
+
end
|
945
633
|
end
|
946
634
|
|
947
|
-
|
948
|
-
# Read a JSON file with an OBO_Handler object stored
|
635
|
+
# Get related profiles to a given term
|
949
636
|
# ===== Parameters
|
950
|
-
# +
|
951
|
-
#
|
952
|
-
#
|
953
|
-
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
959
|
-
|
960
|
-
|
961
|
-
if info.kind_of?(Array) && @@symbolizable_ids.include?(entry)
|
962
|
-
[entry,info.map{|item| item.to_sym}]
|
963
|
-
else
|
964
|
-
[entry,info]
|
965
|
-
end
|
966
|
-
end
|
967
|
-
jsonInfo[:header] = aux.to_h
|
968
|
-
end
|
969
|
-
jsonInfo[:stanzas][:terms].map{|id,info| self.class.symbolize_ids(info)} # STANZAS
|
970
|
-
jsonInfo[:stanzas][:typedefs].map{|id,info| self.class.symbolize_ids(info)}
|
971
|
-
jsonInfo[:stanzas][:instances].map{|id,info| self.class.symbolize_ids(info)}
|
972
|
-
# Optional
|
973
|
-
jsonInfo[:alternatives_index] = jsonInfo[:alternatives_index].map{|id,value| [id, value.to_sym]}.to_h unless jsonInfo[:alternatives_index].nil?
|
974
|
-
jsonInfo[:ancestors_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}} unless jsonInfo[:ancestors_index].nil?
|
975
|
-
jsonInfo[:descendants_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}} unless jsonInfo[:descendants_index].nil?
|
976
|
-
jsonInfo[:obsoletes_index] = jsonInfo[:obsoletes_index].map{|id,value| [id, value.to_sym]}.to_h unless jsonInfo[:obsoletes_index].nil?
|
977
|
-
jsonInfo[:dicts] = jsonInfo[:dicts].each do |flag, dictionaries|
|
978
|
-
next if dictionaries.nil?
|
979
|
-
# Special case: byTerm
|
980
|
-
dictionaries[:byTerm] = dictionaries[:byTerm].map do |term, value|
|
981
|
-
if !term.to_s.scan(/\A[-+]?[0-9]*\.?[0-9]+\Z/).empty? # Numeric dictionary
|
982
|
-
[term.to_s.to_i, value.map{|term| term.to_sym}]
|
983
|
-
elsif value.is_a? Numeric # Numeric dictionary
|
984
|
-
[term.to_sym, value]
|
985
|
-
elsif value.kind_of?(Array) && flag == :is_a
|
986
|
-
[term.to_sym, value.map{|v| v.to_sym}]
|
987
|
-
else
|
988
|
-
[term.to_sym, value]
|
989
|
-
end
|
990
|
-
end
|
991
|
-
dictionaries[:byTerm] = dictionaries[:byTerm].to_h
|
992
|
-
# By value
|
993
|
-
dictionaries[:byValue] = dictionaries[:byValue].map do |value, term|
|
994
|
-
if value.is_a? Numeric # Numeric dictionary
|
995
|
-
[value, term.to_sym]
|
996
|
-
elsif term.is_a? Numeric # Numeric dictionary
|
997
|
-
[value.to_s.to_sym, term]
|
998
|
-
elsif flag == :is_a
|
999
|
-
[value.to_sym, term.map{|v| v.to_sym}]
|
1000
|
-
elsif term.kind_of?(Array)
|
1001
|
-
[value.to_sym, term.map{|t| t.to_sym}]
|
1002
|
-
else
|
1003
|
-
[value.to_s, term.to_sym]
|
1004
|
-
end
|
1005
|
-
end
|
1006
|
-
dictionaries[:byValue] = dictionaries[:byValue].to_h
|
1007
|
-
end
|
1008
|
-
if !jsonInfo[:profiles].nil?
|
1009
|
-
jsonInfo[:profiles].map{|id,terms| terms.map!{|term| term.to_sym}}
|
1010
|
-
jsonInfo[:profiles].keys.map{|id| jsonInfo[:profiles][id.to_s.to_i] = jsonInfo[:profiles].delete(id) if self.is_number?(id.to_s)}
|
1011
|
-
end
|
1012
|
-
jsonInfo[:profilesDict].map{|term,ids| ids.map!{|id| id.to_sym if !id.is_a?(Numeric)}} unless jsonInfo[:profilesDict].nil?
|
1013
|
-
jsonInfo[:removable_terms] = jsonInfo[:removable_terms].map{|term| term.to_sym} unless jsonInfo[:removable_terms].nil?
|
1014
|
-
jsonInfo[:special_tags] = jsonInfo[:special_tags].each do |k, v|
|
1015
|
-
next if v.nil?
|
1016
|
-
if v.kind_of?(Array)
|
1017
|
-
jsonInfo[:special_tags][k] = v.map{|tag| tag.to_sym}
|
1018
|
-
else
|
1019
|
-
jsonInfo[:special_tags][k] = v.to_sym
|
1020
|
-
end
|
637
|
+
# +term+:: to be checked
|
638
|
+
# ===== Returns
|
639
|
+
# profiles which contains given term
|
640
|
+
def get_items_from_term(term)
|
641
|
+
return @items[term]
|
642
|
+
end
|
643
|
+
|
644
|
+
# For each term in profiles add the ids in the items term-id dictionary
|
645
|
+
def get_items_from_profiles
|
646
|
+
@profiles.each do |id, terms|
|
647
|
+
terms.each {|term| add2hash(@items, term, id) }
|
1021
648
|
end
|
1022
|
-
jsonInfo[:items].each{|k,v| jsonInfo[:items][k] = v.map{|item| item.to_sym}} unless jsonInfo[:items].nil?
|
1023
|
-
jsonInfo[:term_paths].each{|term,info| jsonInfo[:term_paths][term][:paths] = info[:paths].map{|path| path.map{|t| t.to_sym}}} unless jsonInfo[:term_paths].nil?
|
1024
|
-
|
1025
|
-
# Store info
|
1026
|
-
@header = jsonInfo[:header]
|
1027
|
-
@stanzas = jsonInfo[:stanzas]
|
1028
|
-
@ancestors_index = jsonInfo[:ancestors_index]
|
1029
|
-
@descendants_index = jsonInfo[:descendants_index]
|
1030
|
-
@alternatives_index = jsonInfo[:alternatives_index]
|
1031
|
-
@obsoletes_index = jsonInfo[:obsoletes_index]
|
1032
|
-
jsonInfo[:structureType] = jsonInfo[:structureType].to_sym unless jsonInfo[:structureType].nil?
|
1033
|
-
@structureType = jsonInfo[:structureType]
|
1034
|
-
@ics = jsonInfo[:ics]
|
1035
|
-
@meta = jsonInfo[:meta]
|
1036
|
-
@special_tags = jsonInfo[:special_tags]
|
1037
|
-
@max_freqs = jsonInfo[:max_freqs]
|
1038
|
-
@dicts = jsonInfo[:dicts]
|
1039
|
-
@profiles = jsonInfo[:profiles]
|
1040
|
-
@profilesDict = jsonInfo[:profilesDict]
|
1041
|
-
@items = jsonInfo[:items]
|
1042
|
-
@removable_terms = jsonInfo[:removable_terms]
|
1043
|
-
@term_paths = jsonInfo[:term_paths]
|
1044
|
-
|
1045
|
-
self.build_index() if build
|
1046
|
-
end
|
1047
|
-
|
1048
|
-
|
1049
|
-
# Check if a given ID is stored as term into this object
|
1050
|
-
# ===== Parameters
|
1051
|
-
# +id+:: to be checked
|
1052
|
-
# ===== Return
|
1053
|
-
# True if term is allowed or false in other cases
|
1054
|
-
def exists? id
|
1055
|
-
return stanzas[:terms].include?(id)
|
1056
649
|
end
|
1057
650
|
|
651
|
+
# Defining instance variables from items
|
652
|
+
########################################
|
1058
653
|
|
1059
|
-
|
1060
|
-
|
1061
|
-
|
1062
|
-
|
1063
|
-
# The correct ID if it can be found or nil in other cases
|
1064
|
-
def extract_id(text, splitBy: ' ')
|
1065
|
-
if self.exists?(text)
|
1066
|
-
return text
|
1067
|
-
else
|
1068
|
-
splittedText = text.to_s.split(splitBy).first.to_sym
|
1069
|
-
return self.exists?(splittedText) ? splittedText : nil
|
654
|
+
def get_profiles_from_items
|
655
|
+
new_profiles = {}
|
656
|
+
@items.each do |term, ids|
|
657
|
+
ids.each{|id| add2hash(new_profiles, id, term) }
|
1070
658
|
end
|
659
|
+
@profiles = new_profiles
|
1071
660
|
end
|
1072
661
|
|
662
|
+
# Expanding items
|
663
|
+
####################################
|
1073
664
|
|
1074
|
-
#
|
1075
|
-
#
|
1076
|
-
-  # This functions stores first value for multivalue tags
-  # This function does not handle synonyms for byValue dictionaries
+  # This method computes childs similarity and impute items to it parentals. To do that Item keys must be this ontology allowed terms.
+  # Similarity will be calculated by text extact similarity unless an ontology object will be provided. In this case, MICAs will be used
   # ===== Parameters
-  # +
-  # +
-  # +
-  #
-  #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-          queryTag = queryTag.scan(select_regex).first
-        end
-        queryTag.compact!
-      end
-      if queryTag.kind_of?(Array) # Store
-        if !queryTag.empty?
-          if byTerm.include?(referenceTerm)
-            byTerm[referenceTerm] = (byTerm[referenceTerm] + queryTag).uniq
-          else
-            byTerm[referenceTerm] = queryTag
+  # +ontology+:: (Optional) ontology object which items given belongs
+  # +minimum_childs+:: minimum of childs needed to infer relations to parental. Default: 2
+  # +clean_profiles+:: if true, clena_profiles ontology method will be used over inferred profiles. Only if an ontology object is provided
+  # ===== Returns
+  # void and update items object
+  def expand_items_to_parentals(ontology: nil, minimum_childs: 2, clean_profiles: true)
+    targetKeys = expand_profile_with_parents(@items.keys)
+    terms_per_level = list_terms_per_level(targetKeys)
+    terms_per_level = terms_per_level.to_a.sort{|l1, l2| l1.first <=> l2.first} # Obtain sorted levels
+    terms_per_level.pop # Leaves are not expandable # FRED: Thats comment could be not true
+
+    terms_per_level.reverse_each do |lvl, terms| # Expand from leaves to roots
+      terms.each do |term|
+        childs = self.get_descendants(term).select{|t| @items.include?(t)} # Get child with items
+        next if childs.length < minimum_childs
+        propagated_item_count = Hash.new(0)
+        if ontology.nil? # Count how many times is presented an item in childs
+          childs.each do |child|
+            @items[child].each{|i| propagated_item_count[i] += 1}
+          end
+        else # Count take into account similarity between terms in other ontology. Not pretty clear the full logic
+          while childs.length > 1
+            curr_term = childs.shift
+            childs.each do |child|
+              maxmica_counts = Hash.new(0)
+              curr_items = @items[curr_term]
+              child_items = @items[child]
+              curr_items.each do |item|
+                maxmica = ontology.get_maxmica_term2profile(item, child_items)
+                maxmica_counts[maxmica.first] += 1
               end
-
-
-
-            byValue[value] << referenceTerm
-          end
-        else
-          queryTag.each{|value| byValue[value] = referenceTerm}
+              child_items.each do |item|
+                maxmica = ontology.get_maxmica_term2profile(item, curr_items)
+                maxmica_counts[maxmica.first] += 1
               end
-
-
-          if byTerm.include?(referenceTerm)
-            byTerm[referenceTerm] = (byTerm[referenceTerm] + [queryTag]).uniq
-          else
-            byTerm[referenceTerm] = [queryTag]
-          end
-          if multiterm
-            byValue[queryTag] = [] if byValue[queryTag].nil?
-            byValue[queryTag] << referenceTerm
-          else
-            byValue[queryTag] = referenceTerm
+              maxmica_counts.each{|t,freq| propagated_item_count[t] += freq if freq >= 2} #TODO: Maybe need Division by 2 due to the calculation of mica two times but test fails.
+              # FRED: Maybe for the childs.shift there is uniqueness
             end
           end
         end
-
-
-
-
-
-
-
-        if
-
-        else
-          byValue[checked] = byValue.delete(t) if checked != t && byValue[checked].nil? # Update in byValue
-          checked
-        end
+        propagated_items = propagated_item_count.select{|k,v| v >= minimum_childs}.keys
+        if propagated_items.length > 0
+          query = @items[term]
+          if query.nil?
+            @items[term] = propagated_items
+          else
+            terms = @items[term] | propagated_items
+            terms = ontology.clean_profile(terms) if clean_profiles && !ontology.nil?
+            @items[term] = terms
           end
-        byTerm[term] = corrected_references.uniq
         end
       end
+    end
+  end

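The counting rule in the default (no second ontology) branch is easier to see on a toy case. A minimal sketch, assuming items are plain strings; the terms :c1/:c2 and items 'g1'..'g3' are hypothetical and not part of the diff:

    # parent with childs :c1 and :c2; with minimum_childs = 2 only items
    # observed in at least 2 childs are imputed to the parent
    items = { c1: ['g1', 'g2'], c2: ['g1', 'g3'] }
    counts = Hash.new(0)
    items.each_value { |its| its.each { |i| counts[i] += 1 } }
    counts.select { |_item, n| n >= 2 }.keys #=> ["g1"] is propagated upward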
-
-
-        if self.exists?(term)
-          referenceValue = @stanzas[:terms][term][tag]
-          if !referenceValue.nil?
-            if !select_regex.nil?
-              if referenceValue.kind_of?(Array)
-                referenceValue = referenceValue.map{|value| value.scan(select_regex).first}
-                referenceValue.flatten!
-              else
-                referenceValue = referenceValue.scan(select_regex).first
-              end
-              referenceValue.compact!
-            end
-            if self_type_references
-              if referenceValue.kind_of?(Array)
-                aux = referenceValue.map{|t| self.extract_id(t)}
-              else
-                aux = self.extract_id(referenceValue)
-              end
-              aux.compact! unless aux.nil?
-              referenceValue = aux unless aux.nil?
-            end
-            referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
-            byTerm[term] = referenceValue + (values - referenceValue)
-          end
-        end
-      end
+  # Compute modified fisher between terms and items based on topgo methodology. Refactor to use all the possible methods of this class
+  #-------------------------------------------------------------------------------------------------------------------------------------

-
-
+  def compute_relations_to_items(external_item_list, total_items, mode, thresold) # NEED TEST, check with PSZ how to maintain these methods
+    terms_levels = list_terms_per_level_from_items
+    connect_familiars!(terms_levels)
+    item_list_with_transf_parental = get_item_list_parental(terms_levels)
+    results = []
+    if mode == :elim
+      results = compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list_with_transf_parental)
+    elsif mode == :weight
+      results = compute_relations_weight(terms_levels, external_item_list, total_items, item_list_with_transf_parental)
     end
+    return results
   end

-
-
-  def calc_ancestors_dictionary
-    self.calc_dictionary(:is_a, substitute_alternatives: false, self_type_references: true, multiterm: true)
+  def list_terms_per_level_from_items
+    return list_terms_per_level(@items.keys)
   end

+  def list_terms_per_level(terms)
+    terms_levels = {}
+    terms.each do |term|
+      level = self.get_term_level(term)
+      add2hash(terms_levels, level, term)
+    end
+    return terms_levels
+  end

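For orientation, a hedged usage sketch of the new enrichment entry point. The `onto` instance, the gene names and the total count are hypothetical; `@items` must already map terms to items before calling:

    genes_of_interest = ['geneA', 'geneB']
    results = onto.compute_relations_to_items(genes_of_interest, 20000, :elim, 0.05)
    results.each { |term, pval| puts "#{term}\t#{pval}" } # one [term, p-value] pair per tested term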
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-  # ===== Return
-  # translated name or nil if it's not stored into this ontology
-  def translate_name(name)
-    term = self.translate(name, :name)
-    term = self.translate(name, :synonym) if term.nil?
-    return term
-  end
-
-
-  # Translate several names and return translations and a list of names which couldn't be translated
-  # ===== Parameters
-  # +names+:: array to be translated
-  # ===== Return
-  # two arrays with translations and names which couldn't be translated respectively
-  def translate_names(names)
-    translated = []
-    rejected = []
-    names.each do |name|
-      tr = self.translate_name(name)
-      if tr.nil?
-        rejected << name
-      else
-        translated << tr
-      end
-    end
-    return translated, rejected
-  end
-
-
-  # Translates a given ID to it assigned name
-  # ===== Parameters
-  # +id+:: to be translated
-  # ===== Return
-  # main name or nil if it's not included into this ontology
-  def translate_id(id)
-    name = self.translate(id, :name, byValue: false)
-    return name.nil? ? nil : name.first
-  end
-
-
-  # Translates several IDs and returns translations and not allowed IDs list
-  # ===== Parameters
-  # +ids+:: to be translated
-  # ===== Return
-  # two arrays with translations and names which couldn't be translated respectively
-  def translate_ids(ids)
-    translated = []
-    rejected = []
-    ids.each do |term_id|
-      tr = self.translate_id(term_id.to_sym)
-      if !tr.nil?
-        translated << tr
-      else
-        rejected << tr
-      end
-    end
-    return translated, rejected
-  end
-
-
-  # ===== Returns
-  # the main ID assigned to a given ID. If it's a non alternative/obsolete ID itself will be returned
-  # ===== Parameters
-  # +id+:: to be translated
-  # ===== Return
-  # main ID related to a given ID. Returns nil if given ID is not an allowed ID
-  def get_main_id(id)
-    return nil if !@stanzas[:terms].include? id
-    new_id = id
-    mainID = @alternatives_index[id]
-    new_id = mainID if !mainID.nil? & !@obsoletes_index.include?(mainID)
-    return new_id
-  end
-
-
-  # Check a pull of IDs and return allowed IDs removing which are not official terms on this ontology
-  # ===== Parameters
-  # +ids+:: to be checked
-  # ===== Return
-  # two arrays whit allowed and rejected IDs respectively
-  def check_ids(ids, substitute: true)
-    checked_codes = []
-    rejected_codes = []
-    ids.each do |id|
-      if @stanzas[:terms].include? id
-        if substitute
-          checked_codes << self.get_main_id(id)
-        else
-          checked_codes << id
-        end
-      else
-        rejected_codes << id
-      end
-    end
-    return checked_codes, rejected_codes
-  end
-
-
-  # Stores a given profile with an specific ID. If ID is already assigend to a profile, it will be replaced
-  # ===== Parameters
-  # +id+:: assigned to profile
-  # +terms+:: array of terms
-  # +substitute+:: subsstitute flag from check_ids
-  def add_profile(id, terms, substitute: true)
-    warn("Profile assigned to ID (#{id}) is going to be replaced") if @profiles.include? id
-    correct_terms, rejected_terms = self.check_ids(terms, substitute: substitute)
-    if !rejected_terms.empty?
-      warn('Given terms contains erroneus IDs. These IDs will be removed')
-    end
-    if id.is_a? Numeric
-      @profiles[id] = correct_terms
-    else
-      @profiles[id.to_sym] = correct_terms
-    end
-  end
-
-
-  # Method used to store a pull of profiles
-  # ===== Parameters
-  # +profiles+:: array/hash of profiles to be stored. If it's an array, numerical IDs will be assigned starting at 1
-  # +calc_metadata+:: if true, launch calc_profiles_dictionary process
-  # +reset_stored+:: if true, remove already stored profiles
-  # +substitute+:: subsstitute flag from check_ids
-  def load_profiles(profiles, calc_metadata: true, reset_stored: false, substitute: false)
-    self.reset_profiles if reset_stored
-    # Check
-    if profiles.kind_of?(Array)
-      profiles.each_with_index do |items, i|
-        self.add_profile(i, items.map {|item| item.to_sym}, substitute: substitute)
-      end
-    else # Hash
-      if !profiles.keys.select{|id| @profiles.include?(id)}.empty?
-        warn('Some profiles given are already stored. Stored version will be replaced')
-      end
-      profiles.each{|id, prof| self.add_profile(id, prof, substitute: substitute)}
-    end
-
-    self.add_observed_terms_from_profiles(reset: true)
-
-    if calc_metadata
-      self.calc_profiles_dictionary
-    end
-  end
-
-
-  # Internal method used to remove already stored profiles and restore observed frequencies
-  def reset_profiles
-    # Clean profiles storage
-    @profiles = {}
-    # Reset frequency observed
-    @meta.each{|term,info| info[:observed_freq] = 0}
-    @max_freqs[:observed_freq] = 0
-  end
-
-
-  # ===== Returns
-  # profiles assigned to a given ID
-  # ===== Parameters
-  # +id+:: profile ID
-  # ===== Return
-  # specific profile or nil if it's not stored
-  def get_profile(id)
-    return @profiles[id]
-  end
-
-
-  # ===== Returns
-  # an array of sizes for all stored profiles
-  # ===== Return
-  # array of profile sizes
-  def get_profiles_sizes()
-    return @profiles.map{|id,terms| terms.length}
-  end
-
-
-  # ===== Returns
-  # mean size of stored profiles
-  # ===== Parameters
-  # +round_digits+:: number of digits to round result. Default: 4
-  # ===== Returns
-  # mean size of stored profiles
-  def get_profiles_mean_size(round_digits: 4)
-    sizes = self.get_profiles_sizes
-    return sizes.inject(0){|sum, n| sum + n}.fdiv(@profiles.length).round(round_digits)
-  end
-
-
-  # Calculates profiles sizes and returns size assigned to percentile given
-  # ===== Parameters
-  # +perc+:: percentile to be returned
-  # +increasing_sort+:: flag to indicate if sizes order must be increasing. Default: false
-  # ===== Returns
-  # values assigned to percentile asked
-  def get_profile_length_at_percentile(perc=50, increasing_sort: false)
-    prof_lengths = self.get_profiles_sizes.sort
-    prof_lengths.reverse! if !increasing_sort
-    n_profiles = prof_lengths.length
-    percentile_index = ((perc * (n_profiles - 1)).fdiv(100) - 0.5).round # Take length which not overpass percentile selected
-    percentile_index = 0 if percentile_index < 0 # Special case (caused by literal calc)
-    return prof_lengths[percentile_index]
-  end
-
-
-  # Translate a given profile to terms names
-  # ===== Parameters
-  # +prof+:: array of terms to be translated
-  # ===== Returns
-  # array of translated terms. Can include nils if some IDs are not allowed
-  def profile_names(prof)
-    return prof.map{|term| self.translate_id(term)}
-  end
-
-
-  # Trnaslates a bunch of profiles to it sets of term names
-  # ===== Parameters
-  # +profs+:: array of profiles
-  # +asArray+:: flag to indicate if results must be returned as: true => an array of tuples [ProfID, ArrayOdNames] or ; false => hashs of translations
-  # ===== Returns
-  # translated profiles
-  def translate_profiles_ids(profs = [], asArray: true)
-    profs = @profiles if profs.empty?
-    profs = profs.each_with_index.map{|terms, index| [index, terms]}.to_h if profs.kind_of?(Array)
-    profs_names = profs.map{|id, terms| [id, self.profile_names(terms)]}.to_h
-    return asArray ? profs_names.values : profs_names
-  end
-
-
-  # Includes as "observed_terms" all terms included into stored profiles
-  # ===== Parameters
-  # +reset+:: if true, reset observed freqs alreeady stored befor re-calculate
-  def add_observed_terms_from_profiles(reset: false)
-    @meta.each{|term, freqs| freqs[:observed_freq] = -1} if reset
-    @profiles.each{|id, terms| self.add_observed_terms(terms: terms)}
-  end
-
-
-  # Get a term frequency
-  # ===== Parameters
-  # +term+:: term to be checked
-  # +type+:: type of frequency to be returned. Allowed: [:struct_freq, :observed_freq]
-  # ===== Returns
-  # frequency of term given or nil if term is not allowed
-  def get_frequency(term, type: :struct_freq)
-    queryFreq = @meta[term]
-    return queryFreq.nil? ? nil : queryFreq[type]
-  end
-
-
-  # Geys structural frequency of a term given
-  # ===== Parameters
-  # +term+:: to be checked
-  # ===== Returns
-  # structural frequency of given term or nil if term is not allowed
-  def get_structural_frequency(term)
-    return self.get_frequency(term, type: :struct_freq)
-  end
-
-
-  # Gets observed frequency of a term given
-  # ===== Parameters
-  # +term+:: to be checked
-  # ===== Returns
-  # observed frequency of given term or nil if term is not allowed
-  def get_observed_frequency(term)
-    return self.get_frequency(term, type: :observed_freq)
-  end
-
-
-  # Calculates frequencies of stored profiles terms
-  # ===== Parameters
-  # +ratio+:: if true, frequencies will be returned as ratios between 0 and 1.
-  # +literal+:: if true, literal terms will be used to calculate frequencies instead translate alternative terms
-  # +asArray+:: used to transform returned structure format from hash of Term-Frequency to an array of tuples [Term, Frequency]
-  # +translate+:: if true, term IDs will be translated to
-  # ===== Returns
-  # stored profiles terms frequencies
-  def get_profiles_terms_frequency(ratio: true, literal: true, asArray: true, translate: true)
-    n_profiles = @profiles.length
-    if literal
-      freqs = {}
-      @profiles.each do |id, terms|
-        terms.each do |literalTerm|
-          if freqs.include?(literalTerm)
-            freqs[literalTerm] += 1
-          else
-            freqs[literalTerm] = 1
-          end
-        end
-      end
-      if (ratio || translate)
-        aux_keys = freqs.keys
-        aux_keys.each do |term|
-          freqs[term] = freqs[term].fdiv(n_profiles) if ratio
-          if translate
-            tr = self.translate_id(term)
-            freqs[tr] = freqs.delete(term) if !tr.nil?
-          end
-        end
-      end
-      if asArray
-        freqs = freqs.map{|term, freq| [term, freq]}
-        freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
-      end
-    else # Freqs translating alternatives
-      freqs = @meta.select{|id, freqs| freqs[:observed_freq] > 0}.map{|id, freqs| [id, ratio ? freqs[:observed_freq].fdiv(n_profiles) : freqs[:observed_freq]]}
-      freqs = freqs.to_h if !asArray
-      if translate
-        freqs = freqs.map do |term, freq|
-          tr = self.translate_id(term)
-          tr.nil? ? [term, freq] : [tr, freq]
-        end
-      end
-      if asArray
-        freqs = freqs.map{|term, freq| [term, freq]}
-        freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
-      else
-        freqs = freqs.to_h
-      end
-    end
-    return freqs
-  end
-
-
-  # Clean a given profile returning cleaned set of terms and removed ancestors term.
-  # ===== Parameters
-  # +prof+:: array of terms to be checked
-  # ===== Returns
-  # two arrays, first is the cleaned profile and second is the removed elements array
-  def remove_ancestors_from_profile(prof)
-    ancestors = prof.map{|term| self.get_ancestors(term)}.flatten.uniq
-    redundant = prof.select{|term| ancestors.include?(term)}
-    return prof - redundant, redundant
-  end
-
-
-  # Remove alternative IDs if official ID is present. DOES NOT REMOVE synonyms or alternative IDs of the same official ID
-  # ===== Parameters
-  # +prof+:: array of terms to be checked
-  # ===== Returns
-  # two arrays, first is the cleaned profile and second is the removed elements array
-  def remove_alternatives_from_profile(prof)
-    alternatives = prof.select{|term| @alternatives_index.include?(term)}
-    redundant = alternatives.select{|alt_id| prof.include?(@alternatives_index[alt_id])}
-    return prof - redundant, redundant
-  end
-
-
-  # Remove alternatives (if official term is present) and ancestors terms of a given profile
-  # ===== Parameters
-  # +profile+:: profile to be cleaned
-  # +remove_alternatives+:: if true, clenaed profiles will replace already stored profiles
-  # ===== Returns
-  # cleaned profile
-  def clean_profile(profile, remove_alternatives: true)
-    warn('Estructure is circular, behaviour could not be which is expected') if @structureType == :circular
-    terms_without_ancestors, _ = self.remove_ancestors_from_profile(profile)
-    if remove_alternatives
-      terms_without_ancestors_and_alternatices, _ = self.remove_alternatives_from_profile(terms_without_ancestors)
-    else
-      terms_without_ancestors_and_alternatices = terms_without_ancestors
-    end
-    return terms_without_ancestors_and_alternatices
-  end
-
-  def clean_profile_hard(profile)
-    profile, _ = check_ids(profile)
-    profile = profile.select{|t| !is_obsolete?(t)}
-    profile = clean_profile(profile.uniq)
-    return profile
-  end
-
-  # Remove terms from a given profile using hierarchical info and scores set given
-  # ===== Parameters
-  # +profile+:: profile to be cleaned
-  # +scores+:: hash with terms by keys and numerical values (scores)
-  # +byMax+:: if true, maximum scored term will be keeped, if false, minimum will be keeped
-  # +remove_without_score+:: if true, terms without score will be removed. Default: true
-  # ===== Returns
-  # cleaned profile
-  def clean_profile_by_score(profile, scores, byMax: true, remove_without_score: true)
-    scores = scores.sort_by{|term,score| score}.to_h
-    keep = profile.map do |term|
-      if scores.include?(term)
-        parentals = [self.get_ancestors(term), self.get_descendants(term)].flatten
-        targetable = parentals.select{|parent| profile.include?(parent)}
-        if targetable.empty?
-          term
-        else
-          targetable << term
-          targets = scores.select{|term,score| targetable.include?(term)}.to_h
-          byMax ? targets.keys.last : targets.keys.first
-        end
-      elsif remove_without_score
-        nil
-      else
-        term
+  def connect_familiars!(terms_levels)
+    levels = terms_levels.keys.sort
+    while levels.length > 1 # Process when current level has a parental level
+      level = levels.pop
+      parental_level = level - 1
+      parental_terms = terms_levels[parental_level]
+      if parental_terms.nil? # The queried parent level not exists but there is a parental level above of the non existant
+        parental_terms = [] # Initialize required parental level
+        terms_levels[parental_level] = parental_terms
+        levels << parental_level
+      end
+      terms_levels[level].each do |term|
+        path_info = @term_paths[term]
+        shortest_path_length = path_info[:shortest_path]
+        path = path_info[:paths].select{|p| p.length == shortest_path_length}.first
+        parental = path[1] # the first elements is the term itself
+        parental_terms << parental if !parental_terms.include?(parental)
       end
     end
-    return keep.compact.uniq
-  end
-
-
-  # Remove alternatives (if official term is present) and ancestors terms of stored profiles
-  # ===== Parameters
-  # +store+:: if true, clenaed profiles will replace already stored profiles
-  # +remove_alternatives+:: if true, clenaed profiles will replace already stored profiles
-  # ===== Returns
-  # a hash with cleaned profiles
-  def clean_profiles(store: false, remove_alternatives: true)
-    cleaned_profiles = {}
-    @profiles.each{ |id, terms| cleaned_profiles[id] = self.clean_profile(terms, remove_alternatives: remove_alternatives)}
-    @profiles = cleaned_profiles if store
-    return cleaned_profiles
   end

-
-
-
-
-
-
-
-
-
-
-
-
-  # ===== Parameters
-  # +prof+:: profile to be checked
-  # +ic_type+:: ic_type to be used
-  # +zhou_k+:: special coeficient for Zhou IC method
-  # ===== Returns
-  # mean IC for a given profile
-  def get_profile_mean_IC(prof, ic_type: :resnik, zhou_k: 0.5)
-    return prof.map{|term| self.get_IC(term, type: ic_type, zhou_k: zhou_k)}.inject(0){|sum,x| sum + x}.fdiv(prof.length)
-  end
-
-
-  # Calculates resnik ontology, and resnik observed mean ICs for all profiles stored
-  # ===== Returns
-  # two hashes with Profiles and IC calculated for resnik and observed resnik respectively
-  def get_profiles_resnik_dual_ICs
-    struct_ics = {}
-    observ_ics = {}
-    @profiles.each do |id, terms|
-      struct_ics[id] = self.get_profile_mean_IC(terms, ic_type: :resnik)
-      observ_ics[id] = self.get_profile_mean_IC(terms, ic_type: :resnik_observed)
-    end
-    return struct_ics.clone, observ_ics.clone
-  end
-
-
-  # Calculates ontology structural levels for all ontology terms
-  # ===== Parameters
-  # +calc_paths+:: calculates term paths if it's not already calculated
-  # +shortest_path+:: if true, level is calculated with shortest path, largest path will be used in other cases
-  def calc_term_levels(calc_paths: false, shortest_path: true)
-    if @term_paths.empty?
-      if calc_paths
-        self.calc_term_paths
-      else
-        warn('Term paths are not already loaded. Aborting dictionary calc')
-      end
-    end
-    if !@term_paths.empty?
-      byTerm = {}
-      byValue = {}
-      # Calc per term
-      @term_paths.each do |term, info|
-        level = shortest_path ? info[:shortest_path] : info[:largest_path]
-        if level.nil?
-          level = -1
-        else
-          level = level.round(0)
-        end
-        byTerm[term] = level
-        queryLevels = byValue[level]
-        if queryLevels.nil?
-          byValue[level] = [term]
+  def get_item_list_parental(terms_levels)
+    transfered_list = {}
+    parent_dict = @dicts[:is_a][:byTerm]
+    levels = terms_levels.keys.sort
+    while levels.length > 1
+      level = levels.pop
+      terms_levels[level].each do |term|
+        parents = parent_dict[term]
+        if parents.nil?
+          next
+        elsif parents.length == 1
+          parent = parents.first
         else
-
+          parent = (parents | terms_levels[level - 1]).first
         end
+        term_it = @items[term]
+        parent_it = @items[parent]
+        curr_it = transfered_list[term]
+        parent_all_items = merge_groups([term_it, parent_it, curr_it])
+        transfered_list[parent] = parent_all_items if !parent_all_items.empty?
+        term_all_items = merge_groups([term_it, curr_it])
+        transfered_list[term] = term_all_items if !term_all_items.empty?
       end
-      @dicts[:level] = {byTerm: byValue, byValue: byTerm} # Note: in this case, value has multiplicity and term is unique value
-      # Update maximum depth
-      @max_freqs[:max_depth] = byValue.keys.max
     end
+    terms_levels[levels.first].each do |term| # Rescue lower level terms that not have children so they cannot receive items
+      transfered_list[term] = @items[term] if transfered_list[term].nil?
+    end
+    return transfered_list
   end

-
-
-  def is_obsolete? term
-    return @obsoletes_index.include?(term)
+  def merge_groups(groups)
+    return groups.compact.inject([ ]){|it, a| it | a}
   end

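merge_groups is a nil-tolerant set union, which is what lets get_item_list_parental combine @items entries that may be missing. Two quick illustrations with hypothetical values:

    # merge_groups([['g1', 'g2'], nil, ['g2', 'g3']]) #=> ["g1", "g2", "g3"]
    # merge_groups([nil, nil])                        #=> []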
-
-
-
+  def compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list)
+    results = []
+    penalized_terms = {}
+    levels = terms_levels.keys.sort
+    levels.reverse_each do |level|
+      terms_levels[level].each do |term|
+        associated_items = item_list[term]
+        items_to_remove = penalized_terms[term]
+        items_to_remove = [] if items_to_remove.nil?
+        pval = get_fisher_exact_test(
+          external_item_list - items_to_remove,
+          associated_items - items_to_remove,
+          #((associated_items | external_item_list) - items_to_remove).length
+          total_items
+        )
+        if pval <= thresold
+          parents = get_ancestors(term) # Save the items for each parent term to remove them later in the fisher test
+          parents.each do |prnt|
+            query = penalized_terms[prnt]
+            if query.nil?
+              penalized_terms[prnt] = item_list[term].clone # We need a new array to store the following iterations
+            else
+              query.concat(item_list[term])
+            end
+          end
+        end
+        results << [term, pval]
+      end
+    end
+    return results
   end

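The :elim bookkeeping mirrors topGO's elim strategy: once a term tests at or below the threshold, its items are parked in penalized_terms for every ancestor, so ancestors are re-tested without them. A toy trace of that accumulation, with hypothetical terms and items:

    penalized_terms = {}
    # a child term passed with items ['g1', 'g2']; its ancestors get penalized:
    ['HP:A_parent', 'HP:A_root'].each do |anc|
      query = penalized_terms[anc]
      query.nil? ? penalized_terms[anc] = ['g1', 'g2'] : query.concat(['g1', 'g2'])
    end
    # each ancestor is later tested with external_item_list - ['g1', 'g2']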
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        end
-        if !visited_terms.include?(term)
-          # PEDRO: This code is very similar to expand_path method, but cannot be replaced by it (test fail). We must work to use this method here
-          path_attr = @term_paths[term]
-          if path_attr.nil?
-            path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} # create new path data
-            @term_paths[term] = path_attr #save path data container
-          end
-          parentals = @dicts[:is_a][:byTerm][term]
-          if parentals.nil?
-            path_attr[:paths] << [term]
-          else
-            parentals.each do |direct_parental|
-              self.expand_path(direct_parental)
-              new_paths = @term_paths[direct_parental][:paths]
-              path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(term)})
-            end
-          end
-          anc = @ancestors_index[term].each{|anc| visited_terms[anc] = true} if @ancestors_index.include?(term)
-          visited_terms[term] = true
+  def compute_relations_weight(terms_levels, external_item_list, total_items, item_list)
+    pvals = {}
+    item_weigths_per_term = Hash.new { |hash, key| Hash.new(1) } #https://mensfeld.pl/2016/09/ruby-hash-default-value-be-cautious-when-you-use-it/
+    levels = terms_levels.keys.sort
+    levels.reverse_each do |level|
+      terms_levels[level].each do |term|
+        associated_items = item_list[term]
+        #initialize observed items in item_weigths_per_term list
+        add_items_to_weigthed_list(term, associated_items, item_weigths_per_term) if !associated_items.nil?
+        children = @dicts[:is_a][:byValue][term]
+        if children.nil?
+          children = []
+        else
+          children = children.select{|ch| item_weigths_per_term[ch].length > 0} # Only use children with items associated to them OR transfered to them
         end
-
-        path_attr = @term_paths[term]
-        path_attr[:total_paths] = path_attr[:paths].length
-        paths_sizes = path_attr[:paths].map{|path| path.length}
-        path_attr[:largest_path] = paths_sizes.max
-        path_attr[:shortest_path] = paths_sizes.min
+        computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
       end
-    else
-      warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
     end
+    return pvals.to_a
   end

+  def add_items_to_weigthed_list(term, associated_items, weigthed_list)
+    term_weigthing = weigthed_list[term]
+    associated_items.each{|ai| term_weigthing[ai] = 1}
+    weigthed_list[term] = term_weigthing
+  end

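add_items_to_weigthed_list seeds each observed item with weight 1 and, importantly, assigns the inner hash back. The Hash.new { Hash.new(1) } default above only returns a fresh hash on read and never stores it (the linked mensfeld.pl note explains the pitfall), hence the explicit write-back. A minimal check with hypothetical values:

    weights = Hash.new { |hash, key| Hash.new(1) }
    weights[:t]['g1'] #=> 1, yet weights.empty? is still true
    # after add_items_to_weigthed_list(:t, ['g1'], weights),
    # weights[:t] really contains {'g1' => 1}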
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  def computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
+    #puts term.to_s.red
+    #puts @term_paths[term].inspect
+    #puts @dicts[:is_a][:byValue][term].inspect.light_blue
+    associated_items = item_weigths_per_term[term].keys
+    pval = get_fisher_exact_test(external_item_list, associated_items, total_items,
+      'two_sided', item_weigths_per_term[term], true)
+    pvals[term] = pval
+    if children.length > 0
+      rates = {}
+      sig_child = 0
+      children.each do |child|
+        ratio = sigRatio(pvals[child], pval)
+        rates[child] = ratio
+        sig_child += 1 if ratio >= 1
+      end
+      if sig_child == 0 # CASE 1
+        children.each do |child|
+          current_ratio = rates[child]
+          query_child = item_weigths_per_term[child]
+          query_child.transform_values!{|weight| weight * current_ratio}
+          pvals[child] = get_fisher_exact_test(external_item_list, item_weigths_per_term[child].keys, total_items,
+            'two_sided', item_weigths_per_term[child], true)
+        end
+      else
+        ancs = get_ancestors(term)
+        ancs << term
+        rates.each do |ch, ratio|# CASE 2
+          if ratio >= 1 # The child is better than parent
+            ancs.each do |anc|
+              query_anc = item_weigths_per_term[anc]
+              associated_items.each do |item|
+                query_anc[item] /= ratio # /= --> query_anc[item]/ratio
+              end
+            end
           end
-          path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(curr_term)})
         end
+        computeTermSig(term, children - rates.keys, external_item_list, total_items, pvals, item_weigths_per_term)
       end
     end
   end

-
-
-  # ===== Returns
-  # ontology levels calculated
-  def get_ontology_levels
-    return @dicts[:level][:byTerm].clone # By term, in this case, is Key::Level, Value::Terms
+  def sigRatio(pvalA, pvalB)
+    return Math.log(pvalA)/Math.log(pvalB)
   end

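sigRatio compares two p-values on a log scale; since sigRatio(pvals[child], pval) puts the child first, a ratio >= 1 means the child is at least as significant as the parent. Two worked values:

    Math.log(0.001) / Math.log(0.01) #=> 1.5 (child more significant than parent)
    Math.log(0.05)  / Math.log(0.01) #=> ~0.65 (child weaker, so its weights get rescaled)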
+  # END of methods involved with compute_relations_to_items
+  #-----------------------------------------------------------------------------------
+
+  #############################################
+  # PROFILE EXTERNAL METHODS
+  #############################################

-  #
-
-
-
-
+  # I/O profile
+  ####################################
+
+  # Increase the arbitrary frequency of a given term set
+  # ===== Parameters
+  # +terms+:: set of terms to be updated
+  # +increase+:: amount to be increased
+  # +transform_to_sym+:: if true, transform observed terms to symbols. Default: false
+  # ===== Return
+  # true if process ends without errors and false in other cases
+  def add_observed_terms(terms:, increase: 1.0, transform_to_sym: false, expand2parentals: true)
+    terms = terms.map{|term| [term] + get_ancestors(term.to_sym)}.flatten if expand2parentals
+    return terms.map{|id| self.add_observed_term(
+      term: transform_to_sym ? id.to_sym : id,
+      increase: increase)} # FRED: It is necessary the return?
   end

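A hedged usage sketch of the new expand2parentals behaviour; the `onto` instance and the HPO-style ID are hypothetical:

    # onto.add_observed_terms(terms: [:'HP:0000118'], increase: 1.0)
    # the term's :observed_freq in @meta rises by 1.0, and so does
    # the frequency of every ancestor, since expand2parentals defaults to true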
-  #
-
-
-
-
-
-
-      if all_paths.empty?
-        path = []
-      else
-        path = all_paths.select{|pt| pt.length == path_length}.first.clone
-        if level > 0 # we want the term and his ascendants until a specific level
-          n_parents = path_length - level
-          path = path[0..n_parents]
-        end
-        path.shift # Discard the term itself
-      end
+  # Modifying Profile
+  ####################################
+
+  def expand_profile_with_parents(profile)
+    new_terms = []
+    profile.each do |term|
+      new_terms = new_terms | get_ancestors(term)
     end
-    return
+    return new_terms | profile
+  end
+
+  # Clean a given profile returning cleaned set of terms and removed ancestors term.
+  # ===== Parameters
+  # +prof+:: array of terms to be checked
+  # ===== Returns
+  # two arrays, first is the cleaned profile and second is the removed elements array
+  def remove_ancestors_from_profile(prof)
+    ancestors = prof.map{|term| self.get_ancestors(term)}.flatten.uniq
+    redundant = prof & ancestors
+    return prof - redundant, redundant
   end

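The new `prof & ancestors` intersection keeps profile order and drops any term that is an ancestor of another profile term. A sketch with hypothetical IDs where :root is an ancestor of :leaf:

    # clean, removed = onto.remove_ancestors_from_profile([:leaf, :root])
    # clean   #=> [:leaf]
    # removed #=> [:root]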
-  #
+  # Remove alternative IDs if official ID is present. DOES NOT REMOVE synonyms or alternative IDs of the same official ID
+  # ===== Parameters
+  # +prof+:: array of terms to be checked
   # ===== Returns
-  #
-  def
-
-
-
-    profiles_terms.each do |term|
-      query = term_freqs_byProfile[term]
-      if query.nil?
-        term_freqs_byProfile[term] = 1
-      else
-        term_freqs_byProfile[term] += 1
-      end
-    end
-    levels_filtered = @dicts[:level][:byTerm].map{|level, terms| [level,terms.map{|t| profiles_terms.include?(t) ? Array.new(term_freqs_byProfile[t], t) : nil}.flatten.compact]}.select{|level, filteredTerms| !filteredTerms.empty?}.to_h
-    return levels_filtered
+  # two arrays, first is the cleaned profile and second is the removed elements array
+  def remove_alternatives_from_profile(prof)
+    alternatives = prof.select{|term| @alternatives_index.include?(term)}
+    redundant = alternatives.select{|alt_id| prof.include?(@alternatives_index[alt_id])}
+    return prof - redundant, redundant
   end

-
-
-
-
-
-
-
-
-
-
-
-      cohort_terms = cohort_ontology_levels[level]
-      uniq_cohort_terms = uniq_cohort_ontology_levels[level]
-      if cohort_terms.nil? || uniq_cohort_terms.nil?
-        num = 0
-        u_num = 0
-      else
-        num = cohort_terms.length
-        u_num = uniq_cohort_terms.length
-      end
-      ontology_levels << [level, terms.length, num]
-      distribution_percentage << [
-        level,
-        (terms.length.fdiv(total_ontology_terms)*100).round(3),
-        (num.fdiv(total_cohort_terms)*100).round(3),
-        (u_num.fdiv(total_uniq_cohort_terms)*100).round(3)
-      ]
-    end
-    ontology_levels.sort! { |x,y| x.first <=> y.first }
-    distribution_percentage.sort! { |x,y| x.first <=> y.first }
-    return ontology_levels, distribution_percentage
+  # Remove alternatives (if official term is present) and ancestors terms of a given profile
+  # ===== Parameters
+  # +profile+:: profile to be cleaned
+  # +remove_alternatives+:: if true, clenaed profiles will replace already stored profiles
+  # ===== Returns
+  # cleaned profile
+  def clean_profile(profile, remove_alternatives: true)
+    warn('Estructure is circular, behaviour could not be which is expected') if @structureType == :circular
+    terms_without_ancestors, _ = remove_ancestors_from_profile(profile)
+    terms_without_ancestors, _ = remove_alternatives_from_profile(terms_without_ancestors) if remove_alternatives
+    return terms_without_ancestors
   end

-  def
-
-
-
-
-
-
-
-    maxL = nil
-    distribution_percentage.each do |level_info|
-      maxL = level_info.first if level_info[1] == max_terms
-    end
-    diffL = distribution_percentage.map{|l| [l[0], l[observed_distribution] - l[1]]}
-    diffL.select!{|dL| dL.last > 0}
-    lowSection = diffL.select{|dL| dL.first <= maxL}
-    highSection = diffL.select{|dL| dL.first > maxL}
-    dsi = nil
-    if highSection.empty?
-      dsi = 0
-    else
-      accumulated_weigth = 0
-      accumulated_weigthed_diffL = 0
-      hss = get_weigthed_level_contribution(highSection, maxL, ontology_levels.length - maxL)
-      lss = get_weigthed_level_contribution(lowSection, maxL, maxL)
-      dsi = hss.fdiv(lss)
-    end
-    return dsi
+  def clean_profile_hard(profile, options = {})
+    profile, _ = check_ids(profile)
+    profile = profile.select{|t| !is_obsolete?(t)}
+    if !options[:term_filter].nil?
+      profile.select! {|term| get_ancestors(term).include?(options[:term_filter])}
+    end
+    profile = clean_profile(profile.uniq)
+    return profile
   end

-
-
-
-
-
-
+  # Remove terms from a given profile using hierarchical info and scores set given
+  # ===== Parameters
+  # +profile+:: profile to be cleaned
+  # +scores+:: hash with terms by keys and numerical values (scores)
+  # +byMax+:: if true, maximum scored term will be keeped, if false, minimum will be keeped
+  # +remove_without_score+:: if true, terms without score will be removed. Default: true
+  # ===== Returns
+  # cleaned profile
+  def clean_profile_by_score(profile, scores, byMax: true, remove_without_score: true)
+    scores = scores.sort_by{|term,score| score}.to_h
+    keep = profile.map do |term|
+      if scores.include?(term)
+        parentals = [self.get_ancestors(term), self.get_descendants(term)].flatten
+        targetable = parentals.select{|parent| profile.include?(parent)}
+        if targetable.empty?
+          term
+        else
+          targetable << term
+          targets = scores.select{|term,score| targetable.include?(term)}.to_h
+          byMax ? targets.keys.last : targets.keys.first
+        end
+      elsif remove_without_score
+        nil
       else
-
+        term
       end
-      accumulated_weigthed_diffL += diff * weightL
     end
-
-    return weigthed_contribution
+    return keep.compact.uniq
   end

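Among hierarchically related profile terms only the best-scored one survives: scores are sorted ascending, so byMax keeps the last key of the selected targets. A sketch with hypothetical terms where :parent is an ancestor of :child:

    # scores = { parent: 1.0, child: 3.0 }
    # onto.clean_profile_by_score([:parent, :child], scores) #=> [:child]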
+  # ID Handlers
+  ####################################

-  #
-
-
-
-
-
-
-
-
-
-
-
-
-
+  # Check a set of IDs and return allowed IDs removing which are not official terms on this ontology
+  # ===== Parameters
+  # +ids+:: to be checked
+  # ===== Return
+  # two arrays whit allowed and rejected IDs respectively
+  def check_ids(ids, substitute: true)
+    checked_codes = []
+    rejected_codes = []
+    ids.each do |id|
+      new_id = get_main_id(id)
+      if new_id.nil?
+        rejected_codes << id
+      else
+        if substitute
+          checked_codes << new_id
+        else
+          checked_codes << id
         end
       end
-      @profilesDict = byTerm
     end
+    return checked_codes, rejected_codes
   end


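check_ids now funnels everything through get_main_id, so alternative IDs are substituted by their main ID and unknown IDs are rejected. A sketch with hypothetical IDs:

    # valid, rejected = onto.check_ids([:'GO:0000001', :'GO:9999999'])
    # valid    #=> [:'GO:0000001']
    # rejected #=> [:'GO:9999999']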
-  #
+  # Translates several IDs and returns translations and not allowed IDs list
+  # ===== Parameters
+  # +ids+:: to be translated
   # ===== Return
-  #
-  def
-
-
-
+  # two arrays with translations and ids which couldn't be translated respectively
+  def translate_ids(ids)
+    translated = []
+    rejected = []
+    ids.each do |term_id|
+      tr = self.translate_id(term_id.to_sym)
+      if !tr.nil?
+        translated << tr # FRED: Why have this a different behaviour from ...->
+      else
+        rejected << tr
+      end
+    end
+    return translated, rejected
+  end

-  #
+  # Translate several names and return translations and a list of names which couldn't be translated
   # ===== Parameters
-  # +
-  # =====
-  #
-  def
-
+  # +names+:: array to be translated
+  # ===== Return
+  # two arrays with translations and names which couldn't be translated respectively
+  def translate_names(names)
+    translated = []
+    rejected = []
+    names.each do |name|
+      tr = self.translate_name(name)
+      if tr.nil?
+        rejected << name # FRED: <-... this?
+      else
+        translated << tr
+      end
+    end
+    return translated, rejected
   end

+  # Description of profile's terms
+  ####################################

   # Gets metainfo table from a set of terms
   # ===== Parameters
   # +terms+:: IDs to be expanded
-  # +filter_alternatives+:: flag to be used in get_descendants method
   # ===== Returns
   # an array with triplets [TermID, TermName, DescendantsNames]
-  def get_childs_table(
-
-
-
+  def get_childs_table(profile)
+    expanded_profile = []
+    profile.each do |t|
+      expanded_profile << [[t, translate_id(t)], get_descendants(t).map{|child| [child, translate_id(child)]}]
     end
-    return
+    return expanded_profile
   end

-
-
-
-
-  # +remove_old_relations+:: substitute ITEMS structure instead of merge new relations
-  # +expand+:: if true, already stored keys will be updated with the unique union of both sets
-  def load_item_relations_to_terms(relations, remove_old_relations = false, expand = false)
-    @items = {} if remove_old_relations
-    if !relations.select{|term, items| !@stanzas[:terms].include?(term)}.empty?
-      warn('Some terms specified are not stored into this ontology. These not correct terms will be stored too')
+  def get_terms_levels(profile)
+    termsAndLevels = []
+    profile.each do |term|
+      termsAndLevels << [term, get_term_level(term)]
     end
-
-
-      warn('Some terms given are already stored. Stored version will be replaced')
-      end
-    end
-    if expand
-      @items = self.concatItems(@items,relations)
-      # relations.each do |k,v| # MUST UPDATE THIS USING A CONCAT SPECIFIC FUNCTION
-      #   if @items.keys.include?(k)
-      #     if v.kind_of?(Array)
-      #       @items[k] = (@items[k] + v).uniq
-      #     elsif v.kind_of?(Hash)
-      #       @items.merge!(relations) do |k, oldV, newV|
-      #         if oldV.kind_of?(Array)
-      #           return (oldV + newV).uniq
-      #         else
-      #           oldV = [oldV,newV]
-      #         end
-      #       end
-      #     elsif @items[k].kind_of?(Array) # We suppose a single value/object from here
-      #       @items[k] = (@items[k] + [v]).uniq
-      #     else
-      #       @items[k] = [@items[k],v]
-      #     end
-      #   else
-      #     @items[k] = v
-      #   end
-      # end
-    else
-      @items.merge!(relations)
-    end
-  end
+    return termsAndLevels
+  end

-  #
+  # IC data
+  ####################################
+
+  # Get information coefficient from profiles #
+
+  # Calculates mean IC of a given profile
   # ===== Parameters
-  # +
-  # +
-  #
-  #
-
-
-
-
-  # A_array : B_single => NOT ALLOWED
-  # A is Hash :: RETURN HASH
-  # A_hash : B_array => NOT ALLOWED
-  # A_hash : B_hash
-  # A_hash : B_single => NOT ALLOWED
-  # A is single element => RETURN ARRAY
-  # A_single : B_array
-  # A_single : B_hash => NOT ALLOWED
-  # A_single : B_single
-    concatenated = nil
-    if itemA.kind_of?(Array) && itemB.kind_of?(Array)
-      concatenated = (itemA + itemB).uniq
-    elsif itemA.kind_of?(Hash) && itemB.kind_of?(Hash)
-      concatenated = itemA.merge(itemB) do |k, oldV, newV|
-        self.concatItems(oldV,newV)
-      end
-    elsif itemB.kind_of?(Array)
-      concatenated = ([itemA] + itemB).uniq
-    elsif ![Array, Hash].include?(itemB.class)
-      concatenated = [itemA,itemB].uniq
-    end
-    return concatenated
-  end
+  # +prof+:: profile to be checked
+  # +ic_type+:: ic_type to be used
+  # +zhou_k+:: special coeficient for Zhou IC method
+  # ===== Returns
+  # mean IC for a given profile
+  def get_profile_mean_IC(prof, ic_type: :resnik, zhou_k: 0.5)
+    return prof.map{|term| self.get_IC(term, type: ic_type, zhou_k: zhou_k)}.sum.fdiv(prof.length)
+  end

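get_profile_mean_IC is a plain arithmetic mean over per-term ICs obtained from get_IC. With hypothetical IC values:

    # ICs of 2.0, 4.0 and 6.0 for a three-term profile:
    # (2.0 + 4.0 + 6.0) / 3.0 #=> 4.0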
+  # Term ref vs profile #

-
+  def get_maxmica_term2profile(ref_term, profile)
+    micas = profile.map{|term| get_MICA(ref_term, term)}
+    maxmica = micas.first
+    micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
+    return maxmica
+  end
+
+  # Profile vs Profile #
+
+  # Get semantic similarity from two term sets
   # ===== Parameters
-  # +
-
-
-
-
-
-
+  # +termsA+:: set to be compared
+  # +termsB+:: set to be compared
+  # +sim_type+:: similitude method to be used. Default: resnik
+  # +ic_type+:: ic type to be used. Default: resnik
+  # +bidirectional+:: calculate bidirectional similitude. Default: false
+  # ===== Return
+  # similitude calculated
+  def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true, store_mica: false)
+    # Check
+    raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
+    raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
+    micasA = []
+    # Compare A -> B
+    termsA.each do |tA|
+      micas = []
+      termsB.each do |tB|
+        if store_mica
+          value = @mica_index[tA][tB]
+        else
+          value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)
+        end
+        micas << value if value.class == Float
+      end
+      !micas.empty? ? micasA << micas.max : micasA << 0
+    end
+    means_sim = micasA.sum.fdiv(micasA.size)
+    # Compare B -> A
+    if bidirectional
+      means_simA = means_sim * micasA.size
+      means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false, store_mica: store_mica) * termsB.size
+      means_sim = (means_simA + means_simB).fdiv(termsA.size + termsB.size)
     end
+    # Return
+    return means_sim
   end


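A hedged usage sketch of the profile-vs-profile similarity; the HPO-style IDs are hypothetical. With bidirectional: true the A-to-B and B-to-A means are combined weighted by set size:

    # onto.compare([:'HP:0000118', :'HP:0001627'], [:'HP:0001627'],
    #              sim_type: :resnik, ic_type: :resnik, bidirectional: true)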
-
-  #
+  #############################################
+  # PROFILE INTERNAL METHODS
+  #############################################
+
+  # I/O profiles
+  ####################################
+
+  # Method used to store a pool of profiles
   # ===== Parameters
-  # +
-  # +
-  # +
-  #
-
-
-    # Check
-    if
-
-
-
-
-
-
-      return nil
-    elsif targetKeys.length < @items.keys.length
-      warn('Some item keys are not allowed')
-    end
-
-    # Expand to parentals
-    targetKeys << targetKeys.map{|t| self.get_ancestors(t, true)}
-    targetKeys.flatten!
-    targetKeys.uniq!
-
-    # Obtain levels (go from leaves to roots)
-    levels = targetKeys.map{|term| self.get_term_level(term)}
-    levels.compact!
-    levels.uniq!
-    levels.sort!
-    levels.reverse!
-    levels.shift # Leaves are not expandable
-
-    # Expand from leaves to roots
-    levels.map do |lvl|
-      curr_keys = targetKeys.select{|k| self.get_term_level(k) == lvl}
-      curr_keys.map do |term_expand|
-        to_infer = []
-        # Obtain childs
-        childs = self.get_descendants(term_expand,true).select{|t| !@items[t].nil?}
-        # Expand
-        if childs.length > 0 && minimum_childs == 1 # Special case
-          to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
-        elsif childs.length >= minimum_childs
-          to_infer = Hash.new(0)
-          # Compare
-          while childs.length > 1
-            curr_term = childs.shift
-            childs.each do |compare_term|
-              pivot_items = @items[curr_term]
-              compare_items = @items[compare_term]
-              if ontology.nil? # Exact match
-                pivot_items.map do |pitem|
-                  if compare_items.include?(pitem)
-                    to_infer[pitem] += 2
-                  end
-                end
-              else # Find MICAs
-                local_infer = Hash.new(0)
-                pivot_items.map do |pitem|
-                  micas = compare_items.map{|citem| ontology.get_MICA(pitem, citem)}
-                  maxmica = micas[0]
-                  micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
-                  local_infer[maxmica.first] += 1
-                end
-                compare_items.map do |citem|
-                  micas = pivot_items.map{|pitem| ontology.get_MICA(pitem, citem)}
-                  maxmica = micas[0]
-                  micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
-                  local_infer[maxmica.first] += 1
-                end
-                local_infer.each{|t,freq| to_infer[t] += freq if freq >= 2}
-              end
-            end
-          end
-          # Filter infer
-          to_infer = to_infer.select{|k,v| v >= minimum_childs}
-        end
-        # Infer
-        if to_infer.length > 0
-          @items[term_expand] = [] if @items[term_expand].nil?
-          if to_infer.kind_of?(Array)
-            @items[term_expand] = (@items[term_expand] + to_infer).uniq
-          else
-            @items[term_expand] = (@items[term_expand] + to_infer.keys).uniq
-          end
-          @items[term_expand] = ontology.clean_profile(@items[term_expand]) if clean_profiles && !ontology.nil?
-        elsif !@items.include?(term_expand)
-          targetKeys.delete(term_expand)
-        end
+  # +profiles+:: array/hash of profiles to be stored. If it's an array, numerical IDs will be assigned starting at 1
+  # +calc_metadata+:: if true, launch get_items_from_profiles process
+  # +reset_stored+:: if true, remove already stored profiles
+  # +substitute+:: subsstitute flag from check_ids
+  def load_profiles(profiles, calc_metadata: true, reset_stored: false, substitute: false)
+    self.reset_profiles if reset_stored
+    # Check
+    if profiles.kind_of?(Array)
+      profiles.each_with_index do |items, i|
+        self.add_profile(i, items.map {|item| item.to_sym}, substitute: substitute)
+      end
+    else # Hash
+      if !profiles.keys.select{|id| @profiles.include?(id)}.empty?
+        warn('Some profiles given are already stored. Stored version will be replaced')
       end
+      profiles.each{|id, prof| self.add_profile(id, prof, substitute: substitute)}
     end
-  end

+    self.add_observed_terms_from_profiles(reset: true)

-
-
-  # +term+:: which are requested
-  # +relation+:: can be :ancestor or :descendant
-  # +remove_alternatives+:: if true, alternatives will be removed
-  # ===== Returns
-  # Direct ancestors/descendants of given term or nil if any error occurs
-  def get_direct_related(term, relation, remove_alternatives: false)
-    if @dicts[:is_a].nil?
-      warn("Hierarchy dictionary is not already calculated. Returning nil")
-      return nil
-    end
-    target = nil
-    case relation
-    when :ancestor
-      target = :byTerm
-    when :descendant
-      target = :byValue
-    else
-      warn('Relation type not allowed. Returning nil')
+    if calc_metadata
+      self.get_items_from_profiles
     end
-    return nil if target.nil?
-    query = @dicts[:is_a][target][term]
-    return query if query.nil?
-    query, _ = remove_alternatives_from_profile(query) if remove_alternatives
-    return query
   end

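A hedged usage sketch of profile loading; the patient IDs and terms are hypothetical. Hashes keep their own keys, arrays get numeric IDs from each_with_index:

    # onto.load_profiles({ P1: [:'HP:0000118'], P2: [:'HP:0001627', :'HP:0000118'] })
    # onto.load_profiles([[:'HP:0000118'], [:'HP:0001627']]) # numeric profile IDs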
|
1211
|
+
# Stores a given profile with an specific ID. If ID is already assigend to a profile, it will be replaced
|
1212
|
+
# ===== Parameters
|
1213
|
+
# +id+:: assigned to profile
|
1214
|
+
# +terms+:: array of terms
|
1215
|
+
# +substitute+:: subsstitute flag from check_ids
|
1216
|
+
def add_profile(id, terms, substitute: true) # FRED: Talk with PSZ about the uniqness of IDs translated
|
1217
|
+
warn("Profile assigned to ID (#{id}) is going to be replaced") if @profiles.include? id
|
1218
|
+
correct_terms, rejected_terms = self.check_ids(terms, substitute: substitute)
|
1219
|
+
if !rejected_terms.empty?
|
1220
|
+
warn("Given terms contains erroneus IDs: #{rejected_terms.join(",")}. These IDs will be removed")
|
1221
|
+
end
|
1222
|
+
if id.is_a? Numeric
|
1223
|
+
@profiles[id] = correct_terms
|
1224
|
+
else
|
1225
|
+
@profiles[id.to_sym] = correct_terms
|
1226
|
+
end
|
1227
|
+
end
|
1228
|
+
|
2197
1229
|
|
2198
|
-
#
|
1230
|
+
# Includes as "observed_terms" all terms included into stored profiles
|
2199
1231
|
# ===== Parameters
|
2200
|
-
# +
|
2201
|
-
|
2202
|
-
|
2203
|
-
|
2204
|
-
def get_direct_ancentors(term, remove_alternatives: false)
|
2205
|
-
return self.get_direct_related(term, :ancestor, remove_alternatives: remove_alternatives)
|
1232
|
+
# +reset+:: if true, reset observed freqs alreeady stored befor re-calculate
|
1233
|
+
def add_observed_terms_from_profiles(reset: false)
|
1234
|
+
@meta.each{|term, freqs| freqs[:observed_freq] = -1} if reset
|
1235
|
+
@profiles.each{|id, terms| self.add_observed_terms(terms: terms)}
|
2206
1236
|
end
|
2207
1237
|
|
2208
|
-
#
|
1238
|
+
# ===== Returns
|
1239
|
+
# profiles assigned to a given ID
|
2209
1240
|
# ===== Parameters
|
2210
|
-
# +
|
2211
|
-
#
|
2212
|
-
#
|
2213
|
-
|
2214
|
-
|
2215
|
-
return self.get_direct_related(term, :descendant, remove_alternatives: remove_alternatives)
|
1241
|
+
# +id+:: profile ID
|
1242
|
+
# ===== Return
|
1243
|
+
# specific profile or nil if it's not stored
|
1244
|
+
def get_profile(id)
|
1245
|
+
return @profiles[id]
|
2216
1246
|
end
|
2217
1247
|
|
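A minimal usage sketch of the new profile API (illustrative, not part of the diff; assumes `ont` is an Ontology instance already populated from an OBO file, and the HP-style IDs are placeholders):

    ont.load_profiles({ P1: [:'HP:0000001', :'HP:0000118'] }, calc_metadata: false) # hash form: ID => terms
    ont.add_profile(:P2, [:'HP:0000002'], substitute: false)                        # single profile
    ont.get_profile(:P2)  # => the terms that survived check_ids validation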
+    # Modifying profiles
+    ####################################
 
+    def reset_profiles # Internal method used to remove already stored profiles and restore observed frequencies # TODO FRED: Modify test for this method.
+      @profiles = {} # Clean profiles storage
+      # Reset observed frequencies
+      @meta.each{|term,info| info[:observed_freq] = 0}
+      @max_freqs[:observed_freq] = 0
+      @items = {}
+    end
 
-
-
+    def expand_profiles(meth, unwanted_terms: [], calc_metadata: true, ontology: nil, minimum_childs: 1, clean_profiles: true)
+      if meth == 'parental'
+        @profiles.each do |id, terms|
+          @profiles[id] = expand_profile_with_parents(terms) - unwanted_terms
+        end
+        get_items_from_profiles if calc_metadata
+      elsif meth == 'propagate'
+        get_items_from_profiles
+        expand_items_to_parentals(ontology: ontology, minimum_childs: minimum_childs, clean_profiles: clean_profiles)
+        get_profiles_from_items
+      end
+      add_observed_terms_from_profiles(reset: true)
+    end
 
-    #
+    # Removes alternative IDs (if the official term is present) and ancestor terms from stored profiles
     # ===== Parameters
-    #
-    #
-    #
- [... removed lines 2228-2233 unrecoverable in this rendering ...]
-      results = []
-      if mode == :elim
-        results = compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list_with_transf_parental)
-      elsif mode == :weight
-        results = compute_relations_weight(terms_levels, external_item_list, total_items, item_list_with_transf_parental)
-      end
-      return results
+    # +store+:: if true, cleaned profiles will replace the already stored profiles
+    # +remove_alternatives+:: if true, alternative term IDs will also be removed
+    # ===== Returns
+    # a hash with the cleaned profiles
+    def clean_profiles(store: false, remove_alternatives: true)
+      cleaned_profiles = {}
+      @profiles.each{ |id, terms| cleaned_profiles[id] = self.clean_profile(terms, remove_alternatives: remove_alternatives)}
+      @profiles = cleaned_profiles if store
+      return cleaned_profiles
     end
 
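Profile clean-up and expansion sketch (illustrative, same assumed `ont`):

    cleaned = ont.clean_profiles(store: true, remove_alternatives: true) # drop ancestors and alternative IDs
    ont.expand_profiles('parental', unwanted_terms: [])                  # or grow each profile with its direct parents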
- [... removed lines 2243-2257 unrecoverable in this rendering ...]
-        term_it = @items[term]
-        parent_it = @items[parent]
-        curr_it = transfered_list[term]
-        parent_all_items = merge_groups([term_it, parent_it, curr_it])
-        transfered_list[parent] = parent_all_items if !parent_all_items.empty?
-        term_all_items = merge_groups([term_it, curr_it])
-        transfered_list[term] = term_all_items if !term_all_items.empty?
-      end
+    # ID Handlers
+    ####################################
+
+    # Translates a bunch of profiles to their sets of term names
+    # ===== Parameters
+    # +profs+:: array of profiles
+    # +asArray+:: if true, returns an array with the translated profiles; if false, a hash of translations indexed by profile ID
+    # ===== Returns
+    # translated profiles
+    def translate_profiles_ids(profs = [], asArray: true)
+      profs2proc = {}
+      if profs.empty?
+        profs2proc = @profiles
+      else
+        profs.each_with_index{|terms, index| profs2proc[index] = terms} if profs.kind_of?(Array)
       end
-
-
+      profs_names = {}
+      profs2proc.each do |id, terms|
+        names, _ = translate_ids(terms)
+        profs_names[id] = names
       end
-      return
+      return asArray ? profs_names.values : profs_names
     end
 
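Translation sketch (illustrative): with asArray left true the method returns one array of names per stored profile; with false, a hash keyed by profile ID.

    ont.translate_profiles_ids                 # => [[name, ...], ...]
    ont.translate_profiles_ids(asArray: false) # => { profile_id => [name, ...] }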
-
-
+    # Description of profile size
+    ####################################
+
+    def profile_stats
+      stats = Hash.new(0)
+      data = get_profiles_sizes
+      stats[:average] = data.sum().fdiv(data.size)
+      sum_devs = data.sum{|element| (element - stats[:average]) ** 2}
+      stats[:variance] = sum_devs.fdiv(data.size)
+      stats[:standardDeviation] = stats[:variance] ** 0.5
+      stats[:max] = data.max
+      stats[:min] = data.min
+
+      stats[:count] = data.size
+      data.each do |value|
+        stats[:countNonZero] += 1 if value != 0
+      end
+
+      stats[:q1] = data.get_quantiles(0.25)
+      stats[:median] = data.get_quantiles(0.5)
+      stats[:q3] = data.get_quantiles(0.75)
+      return stats
+
     end
 
- [... removed lines 2277-2285 unrecoverable in this rendering ...]
-      end
-    end
-    return terms_levels
+    # ===== Parameters
+    # +round_digits+:: number of digits to round the result. Default: 4
+    # ===== Returns
+    # mean size of stored profiles
+    def get_profiles_mean_size(round_digits: 4)
+      sizes = self.get_profiles_sizes
+      return sizes.sum.fdiv(@profiles.length).round(round_digits)
    end
 
- [... removed lines 2291-2307 unrecoverable in this rendering ...]
+    # ===== Returns
+    # an array with the sizes of all stored profiles
+    def get_profiles_sizes()
+      return @profiles.map{|id,terms| terms.length}
+    end
+
+    # Calculates profile sizes and returns the size at the given percentile
+    # ===== Parameters
+    # +perc+:: percentile to be returned
+    # +increasing_sort+:: flag to indicate if the size order must be increasing. Default: false
+    # ===== Returns
+    # the size at the asked percentile
+    def get_profile_length_at_percentile(perc=50, increasing_sort: false)
+      prof_lengths = self.get_profiles_sizes
+      percentile_profile = prof_lengths.get_quantiles(perc.fdiv(100), decreasing_sort = !increasing_sort)
+      return percentile_profile
+    end
+
+    # IC data
+    ####################################
+
+    # Get term frequencies and information coefficients from profiles #
+
+    # Calculates frequencies of stored profile terms
+    # ===== Parameters
+    # +ratio+:: if true, frequencies will be returned as ratios between 0 and 1.
+    # +asArray+:: used to transform the returned structure from a hash of Term-Frequency into an array of tuples [Term, Frequency]
+    # +translate+:: if true, term IDs will be translated to their term names
+    # ===== Returns
+    # stored profiles term frequencies
+    def get_profiles_terms_frequency(ratio: true, asArray: true, translate: true)
+      freqs = Hash.new(0)
+      @profiles.each do |id, terms|
+        terms.each{|term| freqs[term] += 1}
+      end
+      if translate
+        translated_freqs = {}
+        freqs.each do |term, freq|
+          tr = self.translate_id(term)
+          translated_freqs[tr] = freq if !tr.nil?
         end
+        freqs = translated_freqs
+      end
+      n_profiles = @profiles.length
+      freqs.transform_values!{|freq| freq.fdiv(n_profiles)} if ratio
+      if asArray
+        freqs = freqs.to_a
+        freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
       end
+      return freqs
     end
 
- [... removed lines 2312-2335 unrecoverable in this rendering ...]
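Profile-description sketch (illustrative, assumed `ont` with profiles loaded):

    stats = ont.profile_stats                    # :average, :variance, :q1, :median, :q3, ...
    ont.get_profiles_mean_size(round_digits: 2)  # mean profile length
    ont.get_profile_length_at_percentile(90)     # length at the 90th percentile
    ont.get_profiles_terms_frequency.first(5)    # five most frequent terms as [name, ratio] pairs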
+    # Calculates Resnik (ontology structure) and Resnik observed mean ICs for all stored profiles
+    # ===== Returns
+    # two hashes mapping profiles to the IC calculated for Resnik and observed Resnik respectively
+    def get_profiles_resnik_dual_ICs(struct: :resnik, observ: :resnik_observed) # Maybe change name during migration to get_profiles_dual_ICs
+      struct_ics = {}
+      observ_ics = {}
+      @profiles.each do |id, terms|
+        struct_ics[id] = self.get_profile_mean_IC(terms, ic_type: struct)
+        observ_ics[id] = self.get_profile_mean_IC(terms, ic_type: observ)
+      end
+      return struct_ics, observ_ics
+    end
+
+
+    # Calculates and returns Resnik ICs (by ontology structure and by observed frequency) for observed terms
+    # ===== Returns
+    # two hashes with resnik and resnik_observed ICs for observed terms
+    def get_observed_ics_by_onto_and_freq()
+      ic_ont = {}
+      resnik_observed = {}
+      observed_terms = @profiles.values.flatten.uniq
+      observed_terms.each do |term|
+        ic_ont[term] = get_IC(term)
+        resnik_observed[term] = get_IC(term, type: :resnik_observed)
+      end
+      return ic_ont, resnik_observed
+    end
+
|
1431
|
+
pair_index = {}
|
1432
|
+
profiles_A.each do |curr_id, profile_A|
|
1433
|
+
profiles_B.each do |id, profile_B|
|
1434
|
+
profile_A.each do |term_A|
|
1435
|
+
profile_B.each do |term_B|
|
1436
|
+
pair_index[[term_A, term_B].sort] = true
|
2336
1437
|
end
|
2337
1438
|
end
|
2338
|
-
|
2339
|
-
end
|
1439
|
+
end
|
2340
1440
|
end
|
2341
|
-
return
|
1441
|
+
return pair_index
|
2342
1442
|
end
|
2343
1443
|
|
2344
|
-
def
|
2345
|
-
|
2346
|
-
|
2347
|
-
|
2348
|
-
|
2349
|
-
|
2350
|
-
|
2351
|
-
#initialize observed items in item_weigths_per_term list
|
2352
|
-
add_items_to_weigthed_list(term, associated_items, item_weigths_per_term) if !associated_items.nil?
|
2353
|
-
children = @dicts[:is_a][:byValue][term]
|
2354
|
-
if children.nil?
|
2355
|
-
children = []
|
2356
|
-
else
|
2357
|
-
children = children.select{|ch| item_weigths_per_term[ch].length > 0} # Only use children with items associated to them OR transfered to them
|
2358
|
-
end
|
2359
|
-
computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
|
2360
|
-
end
|
1444
|
+
def get_mica_index_from_profiles(pair_index, sim_type: :resnik, ic_type: :resnik, lca_index: true)
|
1445
|
+
pair_index.each do |pair, val|
|
1446
|
+
tA, tB = pair
|
1447
|
+
value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type, lca_index: lca_index)
|
1448
|
+
value = true if value.nil? # We use true to save that the operation was made but there is not mica value
|
1449
|
+
add2nestHash(@mica_index, tA, tB, value)
|
1450
|
+
add2nestHash(@mica_index, tB, tA, value)
|
2361
1451
|
end
|
2362
|
-
return pvals.to_a
|
2363
1452
|
end
|
2364
1453
|
|
2365
|
-
|
2366
|
-
|
2367
|
-
|
2368
|
-
|
1454
|
+
# Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with itself
|
1455
|
+
# ===== Parameters
|
1456
|
+
# +external_profiles+:: set of external profiles. If nil, internal profiles will be compared with itself
|
1457
|
+
# +sim_type+:: similitude method to be used. Default: resnik
|
1458
|
+
# +ic_type+:: ic type to be used. Default: resnik
|
1459
|
+
# +bidirectional+:: calculate bidirectional similitude. Default: false
|
1460
|
+
# ===== Return
|
1461
|
+
# Similitudes calculated
|
1462
|
+
def compare_profiles(external_profiles: nil, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
|
1463
|
+
profiles_similarity = {} #calculate similarity between patients profile
|
1464
|
+
if external_profiles.nil?
|
1465
|
+
comp_profiles = @profiles
|
1466
|
+
main_profiles = comp_profiles
|
1467
|
+
else
|
1468
|
+
comp_profiles = external_profiles
|
1469
|
+
main_profiles = @profiles
|
1470
|
+
end
|
1471
|
+
# Compare
|
1472
|
+
pair_index = get_pair_index(main_profiles, comp_profiles)
|
1473
|
+
@mica_index = {}
|
1474
|
+
get_mica_index_from_profiles(pair_index, sim_type: sim_type, ic_type: ic_type, lca_index: false)
|
1475
|
+
main_profiles.each do |curr_id, current_profile|
|
1476
|
+
comp_profiles.each do |id, profile|
|
1477
|
+
value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional, store_mica: true)
|
1478
|
+
add2nestHash(profiles_similarity, curr_id, id, value)
|
1479
|
+
end
|
1480
|
+
end
|
1481
|
+
return profiles_similarity
|
2369
1482
|
end
|
2370
1483
|
|
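All-versus-all comparison sketch (illustrative): MICA values for every needed term pair are cached in @mica_index first, then each profile pair is scored.

    sims = ont.compare_profiles(sim_type: :resnik, bidirectional: true)
    sims[:P1][:P2] # semantic similarity between stored profiles P1 and P2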
- [... removed lines 2371-2391 unrecoverable in this rendering ...]
-          pvals[child] = get_fisher_exact_test(external_item_list, item_weigths_per_term[child].keys, total_items,
-            'two_sided', item_weigths_per_term[child], true)
-        end
+    # specifity_index related methods
+    ####################################
+
+    # Returns ontology levels from profile terms
+    # ===== Returns
+    # hash of term levels (Key: level; Value: array of term IDs)
+    def get_ontology_levels_from_profiles(uniq = true)
+      profiles_terms = @profiles.values.flatten
+      profiles_terms.uniq! if uniq
+      term_freqs_byProfile = Hash.new(0)
+      profiles_terms.each do |term|
+        term_freqs_byProfile[term] += 1
+      end
+      levels_filtered = {}
+      terms_levels = @dicts[:level][:byValue]
+      term_freqs_byProfile.each do |term, count|
+        level = terms_levels[term]
+        term_repeat = Array.new(count, term)
+        query = levels_filtered[level]
+        if query.nil?
+          levels_filtered[level] = term_repeat
         else
-
-          ancs << term
-          rates.each do |ch, ratio| # CASE 2
-            if ratio >= 1 # The child is better than the parent
-              ancs.each do |anc|
-                query_anc = item_weigths_per_term[anc]
-                associated_items.each do |item|
-                  query_anc[item] /= ratio # /= --> query_anc[item]/ratio
-                end
-              end
-            end
-          end
-          computeTermSig(term, children - rates.keys, external_item_list, total_items, pvals, item_weigths_per_term)
+          query.concat(term_repeat)
         end
       end
+      return levels_filtered
     end
 
-    def
-
-
+    def get_profile_ontology_distribution_tables
+      cohort_ontology_levels = get_ontology_levels_from_profiles(uniq=false)
+      uniq_cohort_ontology_levels = get_ontology_levels_from_profiles
+      ontology_levels = get_ontology_levels
+      total_ontology_terms = ontology_levels.values.flatten.length
+      total_cohort_terms = cohort_ontology_levels.values.flatten.length
+      total_uniq_cohort_terms = uniq_cohort_ontology_levels.values.flatten.length
 
-
-
+      distribution_ontology_levels = []
+      distribution_percentage = []
+      ontology_levels.each do |level, terms|
+        cohort_terms = cohort_ontology_levels[level]
+        uniq_cohort_terms = uniq_cohort_ontology_levels[level]
+        if cohort_terms.nil? || uniq_cohort_terms.nil?
+          num = 0
+          u_num = 0
+        else
+          num = cohort_terms.length
+          u_num = uniq_cohort_terms.length
+        end
+        distribution_ontology_levels << [level, terms.length, num]
+        distribution_percentage << [
+          level,
+          (terms.length.fdiv(total_ontology_terms)*100).round(3),
+          (num.fdiv(total_cohort_terms)*100).round(3),
+          (u_num.fdiv(total_uniq_cohort_terms)*100).round(3)
+        ]
+      end
+      distribution_ontology_levels.sort! { |x,y| x.first <=> y.first }
+      distribution_percentage.sort! { |x,y| x.first <=> y.first }
+      return distribution_ontology_levels, distribution_percentage
+    end
 
- [... removed lines 2420-2428 unrecoverable in this rendering ...]
+    def get_dataset_specifity_index(mode)
+      ontology_levels, distribution_percentage = get_profile_ontology_distribution_tables
+      if mode == 'uniq'
+        observed_distribution = 3
+      elsif mode == 'weigthed'
+        observed_distribution = 2
+      end
+      max_terms = distribution_percentage.map{|row| row[1]}.max
+      maxL = nil
+      distribution_percentage.each do |level_info|
+        maxL = level_info.first if level_info[1] == max_terms
+      end
+      diffL = distribution_percentage.map{|l| [l[0], l[observed_distribution] - l[1]]}
+      diffL.select!{|dL| dL.last > 0}
+      highSection = diffL.select{|dL| dL.first > maxL}
+      lowSection = diffL.select{|dL| dL.first <= maxL}
+      dsi = nil
+      if highSection.empty?
+        dsi = 0
+      else
+        hss = get_weigthed_level_contribution(highSection, maxL, ontology_levels.length - maxL)
+        lss = get_weigthed_level_contribution(lowSection, maxL, maxL)
+        dsi = hss.fdiv(lss)
+      end
+      return dsi
     end
 
- [... removed lines 2431-2437 unrecoverable in this rendering ...]
+    def get_weigthed_level_contribution(section, maxL, nLevels)
+      accumulated_weigthed_diffL = 0
+      section.each do |level, diff|
+        weightL = maxL - level
+        if weightL >= 0
+          weightL += 1
+        else
+          weightL = weightL.abs
+        end
+        accumulated_weigthed_diffL += diff * weightL
+      end
+      weigthed_contribution = accumulated_weigthed_diffL.fdiv(nLevels)
+      return weigthed_contribution
    end
 
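Specificity-index sketch (illustrative): the index compares the cohort's term-level distribution against the ontology's own distribution, and grows when annotations concentrate at levels deeper than the ontology's modal level.

    dsi = ont.get_dataset_specifity_index('uniq') # or 'weigthed'; 0 when no deeper level is over-represented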
-
-
-
+    ########################################
+    ## GENERAL ONTOLOGY METHODS
+    ########################################
+
     def ==(other)
-      self.
-      self.stanzas == other.stanzas &&
+      self.terms == other.terms &&
       self.ancestors_index == other.ancestors_index &&
       self.alternatives_index == other.alternatives_index &&
-      self.obsoletes_index == other.obsoletes_index &&
       self.structureType == other.structureType &&
       self.ics == other.ics &&
       self.meta == other.meta &&
       self.dicts == other.dicts &&
       self.profiles == other.profiles &&
-      self.profilesDict == other.profilesDict &&
       (self.items.keys - other.items.keys).empty? &&
-      self.removable_terms == other.removable_terms &&
-      self.special_tags == other.special_tags &&
       self.items == other.items &&
       self.term_paths == other.term_paths &&
       self.max_freqs == other.max_freqs
@@ -2463,32 +1606,128 @@ class Ontology
 
     def clone
       copy = Ontology.new
-      copy.
-      copy.stanzas[:terms] = self.stanzas[:terms].clone
-      copy.stanzas[:typedefs] = self.stanzas[:typedefs].clone
-      copy.stanzas[:instances] = self.stanzas[:instances].clone
+      copy.terms = self.terms.clone
       copy.ancestors_index = self.ancestors_index.clone
       copy.descendants_index = self.descendants_index.clone
       copy.alternatives_index = self.alternatives_index.clone
-      copy.obsoletes_index = self.obsoletes_index.clone
       copy.structureType = self.structureType.clone
       copy.ics = self.ics.clone
       copy.meta = self.meta.clone
       copy.dicts = self.dicts.clone
       copy.profiles = self.profiles.clone
-      copy.profilesDict = self.profilesDict.clone
       copy.items = self.items.clone
-      copy.removable_terms = self.removable_terms.clone
       copy.term_paths = self.term_paths.clone
       copy.max_freqs = self.max_freqs.clone
       return copy
     end
 
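Equality and deep-copy sketch (illustrative): == now compares terms instead of the removed stanzas/obsoletes fields, so a fresh clone compares equal to its source.

    copy = ont.clone
    copy == ont # => true until either object is mutated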
+    # Exports an Ontology object in JSON format
+    # ===== Parameters
+    # +file+:: where the info will be stored
+    def write(file)
+      # Take object stored info
+      obj_info = {terms: @terms,
+                  ancestors_index: @ancestors_index,
+                  descendants_index: @descendants_index,
+                  alternatives_index: @alternatives_index,
+                  structureType: @structureType,
+                  ics: @ics,
+                  meta: @meta,
+                  max_freqs: @max_freqs,
+                  dicts: @dicts,
+                  profiles: @profiles,
+                  items: @items,
+                  term_paths: @term_paths}
+      # Convert to JSON format & write
+      File.open(file, "w") { |f| f.write obj_info.to_json }
+    end
+
+
+    def each(att = false)
+      warn('terms empty') if @terms.empty?
+      @terms.each do |id, tags|
+        if att
+          yield(id, tags)
+        else
+          yield(id)
+        end
+      end
+    end
+
+    def get_root
+      roots = []
+      each do |term|
+        roots << term if @ancestors_index[term].nil?
+      end
+      return roots
+    end
+
+    def list_term_attributes
+      terms = []
+      each do |code|
+        terms << [code, translate_id(code), get_term_level(code)]
+      end
+      return terms
+    end
+
+    # Gets the calculated ontology levels
+    # ===== Returns
+    # the calculated ontology levels
+    def get_ontology_levels
+      return @dicts[:level][:byTerm].clone # byTerm, in this case, is Key::Level, Value::Terms
+    end
 
-
-
-
+    private
+
+    def add2hash(hash, key, val)
+      query = hash[key]
+      if query.nil?
+        hash[key] = [val]
+      else
+        query << val
+      end
+    end
+
+    def add2nestHash(h, key1, key2, val)
+      query1 = h[key1]
+      if query1.nil?
+        h[key1] = {key2 => val}
+      else
+        query1[key2] = val
+      end
+    end
 
-
-
+    # Internal function to concatenate two elements.
+    # ===== Parameters
+    # +itemA+:: item to be concatenated
+    # +itemB+:: item to be concatenated
+    # ===== Returns
+    # the concatenated object
+    def concatItems(itemA,itemB) # NEED TEST, CHECK WITH PSZ THIS METHOD
+      # A is Array :: RETURN ARRAY
+      #   A_array : B_array
+      #   A_array : B_hash => NOT ALLOWED
+      #   A_array : B_single => NOT ALLOWED
+      # A is Hash :: RETURN HASH
+      #   A_hash : B_array => NOT ALLOWED
+      #   A_hash : B_hash
+      #   A_hash : B_single => NOT ALLOWED
+      # A is single element => RETURN ARRAY
+      #   A_single : B_array
+      #   A_single : B_hash => NOT ALLOWED
+      #   A_single : B_single
+      concatenated = nil
+      if itemA.kind_of?(Array) && itemB.kind_of?(Array)
+        concatenated = itemA | itemB
+      elsif itemA.kind_of?(Hash) && itemB.kind_of?(Hash)
+        concatenated = itemA.merge(itemB) do |k, oldV, newV|
+          self.concatItems(oldV,newV)
+        end
+      elsif itemB.kind_of?(Array)
+        concatenated = ([itemA] + itemB).uniq
+      elsif ![Array, Hash].include?(itemB.class)
+        concatenated = [itemA,itemB].uniq
+      end
+      return concatenated
+    end
   end
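Closing sketches for the general methods and private helpers (illustrative, same assumed `ont`):

    ont.write('ontology_dump.json')  # JSON snapshot of terms, indexes and profiles
    ont.each { |id| puts id }        # iterate stored term IDs; each(true) also yields tag hashes
    roots = ont.get_root             # terms without ancestors
    # concatItems merge semantics: arrays union, hashes deep-merge, scalars promote to arrays:
    #   concatItems([1, 2], [2, 3])     #=> [1, 2, 3]
    #   concatItems({a: [1]}, {a: [2]}) #=> {a: [1, 2]}
    #   concatItems(:x, [:y])           #=> [:x, :y]
    # add2nestHash(h, :a, :b, 1) builds the two-level index h == {a: {b: 1}}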