semtools 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/semtools.rb ADDED
@@ -0,0 +1,8 @@
1
# Entry point of the semtools gem: pulls in every library component.
require "semtools/version"
require "semtools/sim_handler"
require "semtools/math_methods"
require "semtools/ontology"

# Top-level namespace for the semtools gem (currently empty; the
# functionality lives in the required files above).
module Semtools
  # Your code goes here...
end
@@ -0,0 +1,140 @@
1
# TODO: Make a pull request to https://rubygems.org/gems/ruby-statistics, with all the statistic code implemented here.
# Compute Fisher's exact test for two item lists drawn from a common universe.
# Fisher => http://www.biostathandbook.com/fishers.html
# ===== Parameters
# +listA+:: first set of items
# +listB+:: second set of items
# +all_elements_count+:: size of the universe both lists are drawn from
# +tail+:: 'two_sided' or 'less'; any other value raises ArgumentError
# +weigths+:: optional hash of per-item weights. When given, the weighted Fisher
#   test is used (as proposed in "Improved scoring of functional groups from gene
#   expression data by decorrelating GO graph structure",
#   https://academic.oup.com/bioinformatics/article/22/13/1600/193669): each
#   contingency cell is the ceiled sum of its members' weights and the universe
#   size is the ceiled total weight.
# ===== Returns
# the accumulated tail probability (p-value)
def get_fisher_exact_test(listA, listB, all_elements_count, tail = 'two_sided', weigths = nil)
  listA_listB = listA & listB
  listA_nolistB = listA - listB
  nolistA_listB = listB - listA
  if weigths.nil?
    listA_listB_count = listA_listB.length
    listA_nolistB_count = listA_nolistB.length
    nolistA_listB_count = nolistA_listB.length
    nolistA_nolistB_count = all_elements_count - (listA | listB).length
  else
    # Weighted variant: each cell is the (ceiled) sum of the weights of its members.
    listA_listB_count = listA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
    listA_nolistB_count = listA_nolistB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
    nolistA_listB_count = nolistA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
    nolistA_nolistB_count = (weigths.keys - (listA | listB)).map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
    all_elements_count = weigths.values.inject(0){|sum, n| sum + n}.ceil
  end
  if tail == 'two_sided'
    accumulated_prob = get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
  elsif tail == 'less'
    accumulated_prob = get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
  else
    # Previously an unsupported tail fell through and raised a confusing NameError
    # (accumulated_prob undefined). Fail fast with a clear message instead.
    raise ArgumentError, "Unsupported tail type: #{tail.inspect} (expected 'two_sided' or 'less')"
  end
  return accumulated_prob
end
29
+
30
# Two-sided Fisher p-value: sums the probability of every contingency table whose
# probability is <= the observed table's probability (method of small p-values).
# https://www.sheffield.ac.uk/polopoly_fs/1.43998!/file/tutorial-9-fishers.pdf
# ===== Parameters
# +listA_listB_count+ .. +nolistA_nolistB_count+:: cells of the 2x2 contingency table
# +all_elements_count+:: table total
# ===== Returns
# accumulated two-sided probability
def get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
  ref_prob = compute_hyper_prob(
    listA_listB_count,
    listA_nolistB_count,
    nolistA_listB_count,
    nolistA_nolistB_count,
    all_elements_count
  )
  accumulated_prob = ref_prob
  # Walk towards the 'less' extreme. BUGFIX: the original broke out of this loop at
  # the first table with prob > ref_prob, wrongly discarding the far tail beyond the
  # distribution mode; every table must be tested (as the 'greater' loop already does).
  [listA_listB_count, nolistA_nolistB_count].min.times do |n|
    n += 1
    prob = compute_hyper_prob(
      listA_listB_count - n,
      listA_nolistB_count + n,
      nolistA_listB_count + n,
      nolistA_nolistB_count - n,
      all_elements_count
    )
    accumulated_prob += prob if prob <= ref_prob
  end

  # Walk towards the 'greater' extreme.
  [listA_nolistB_count, nolistA_listB_count].min.times do |n|
    n += 1
    prob = compute_hyper_prob(
      listA_listB_count + n,
      listA_nolistB_count - n,
      nolistA_listB_count - n,
      nolistA_nolistB_count + n,
      all_elements_count
    )
    accumulated_prob += prob if prob <= ref_prob
  end

  return accumulated_prob
end
67
+
68
# Lower-tail Fisher p-value: sum of the probabilities of the observed table and
# every table more skewed towards the 'less' side.
# ===== Parameters
# +listA_listB_count+ .. +nolistA_nolistB_count+:: cells of the 2x2 contingency table
# +all_elements_count+:: table total
# ===== Returns
# accumulated lower-tail probability
def get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
  accumulated_prob = 0
  # n runs from 0 (observed table) to the most extreme table INCLUSIVE.
  # BUGFIX: the original iterated `min.times` (n = 0..min-1), skipping the final,
  # most extreme table — an off-by-one also inconsistent with get_two_tail, which
  # does cover that table.
  ([listA_listB_count, nolistA_nolistB_count].min + 1).times do |n|
    accumulated_prob += compute_hyper_prob(
      listA_listB_count - n,
      listA_nolistB_count + n,
      nolistA_listB_count + n,
      nolistA_nolistB_count - n,
      all_elements_count
    )
  end
  return accumulated_prob
end
81
+
82
# Hypergeometric probability of a single 2x2 contingency table.
# https://en.wikipedia.org/wiki/Fisher%27s_exact_test
# ===== Parameters
# +a+, +b+, +c+, +d+:: contingency table cells
# +n+:: table total
# ===== Returns
# P = C(a+b, a) * C(c+d, c) / C(n, a+c) as a Float
def compute_hyper_prob(a, b, c, d, n)
  numerator = binom(a + b, a) * binom(c + d, c)
  numerator.fdiv(binom(n, a + c))
end
89
+
90
# Binomial coefficient C(n, k) using exact integer arithmetic.
# ===== Parameters
# +n+:: population size
# +k+:: draw size
# ===== Returns
# C(n, k); 0 when k lies outside 0..n (the original wrongly returned 1 there)
def binom(n, k)
  return 0 if k < 0 || k > n
  return 1 if k == 0 || k == n
  # Product of the k largest factors of n! divided by k!
  (1 + n - k..n).inject(:*) / (1..k).inject(:*)
end
97
+
98
# Compute Benjamini-Hochberg adjusted p-values.
# https://rosettacode.org/wiki/P-value_correction#Ruby
# ===== Parameters
# +arr_pvalues+:: raw p-values
# ===== Returns
# adjusted p-values, in the same order as the input
def get_benjaminiHochberg_pvalues(arr_pvalues)
  total = arr_pvalues.length
  # Indices of the p-values from largest to smallest.
  descending = order(arr_pvalues, true)
  # BH scaling: p * n / rank, visited from the largest p-value downwards.
  scaled = (0...total).map do |idx|
    (total / (total - idx).to_f) * arr_pvalues[descending[idx]]
  end
  # Enforce monotonicity, cap at 1 and restore the original ordering.
  original_positions = order(descending)
  capped = pmin(cummin(scaled))
  return capped.values_at(*original_positions)
end
112
+
113
# Return the indices that would sort +array+ (R-style order()).
# ===== Parameters
# +array+:: values to rank
# +decreasing+:: if true, return indices for a descending sort
# ===== Returns
# array of indices into +array+
# BUGFIX: the original used `array.sort.map { |n| array.index(n) }`, which returns
# the SAME index repeatedly for duplicated values (e.g. order([0.5, 0.5]) => [0, 0]).
# Sorting the indices by their value fixes that and keeps identical output for
# arrays of distinct values.
def order(array, decreasing = false)
  ordered_indices = array.each_index.sort_by { |i| array[i] }
  return decreasing ? ordered_indices.reverse : ordered_indices
end
120
+
121
# Running (cumulative) minimum of an array.
# ===== Parameters
# +array+:: input values
# ===== Returns
# array where element i is min(array[0..i]); empty input yields an empty array
def cummin(array)
  running_min = nil
  array.map do |value|
    # First element seeds the minimum; afterwards keep the smaller of the pair.
    running_min = running_min.nil? ? value : [value, running_min].min
  end
end
130
+
131
# Cap every value at 1 (parallel minimum against 1, R-style pmin).
# ===== Parameters
# +array+:: input values
# ===== Returns
# array with each element replaced by min(element, 1)
# NOTE: the original also ran `abort if pmin_array[i] > 1`, which was dead code —
# min(x, 1) can never exceed 1 — and would have killed the whole process anyway;
# it has been removed.
def pmin(array)
  return array.map { |value| [value, 1].min }
end
140
+
@@ -0,0 +1,2041 @@
1
+ require 'json'
2
+
3
+
4
+ class Ontology
5
+ #########################################################
6
+ # AUTHOR NOTES
7
+ #########################################################
8
+
9
+ # 1 - Store @profiles as @stanzas[:instances]
10
+ # 2 - Items values (not keys) are imported as strings, not as symbols (maybe add a flag which indicates if values are, or not, symbols?)
11
+
12
+
13
+ #############################################
14
+ # FIELDS
15
+ #############################################
16
+ # Handled class variables
17
+ # => @@basic_tags :: hash with main OBO structure tags
18
+ # => @@allowed_calcs :: hash with allowed ICs and similaritites calcs
19
+ # => @@symbolizable_ids :: tags which can be symbolized
20
+ # => @@tags_with_trailing_modifiers :: tags which can include extra info after specific text modifiers
21
+ #
22
+ # Handled object variables
23
+ # => @header :: file header (if is available)
24
+ # => @stanzas :: OBO stanzas {:terms,:typedefs,:instances}
25
+ # => @ancestors_index :: hash of ancestors per each term handled with any structure relationships
26
+ # => @descendants_index :: hash of descendants per each term handled with any structure relationships
27
+ # => @alternatives_index :: has of alternative IDs (include alt_id and obsoletes)
28
+ # => @obsoletes_index :: hash of obsoletes and it's new ids
29
+ # => @special_tags :: set of special tags to be expanded (:is_a, :obsolete, :alt_id)
30
+ # => @structureType :: type of ontology structure depending on ancestors relationship. Allowed: {atomic, sparse, circular, hierarchical}
31
+ # => @ics :: already calculated ICs for handled terms and IC types
32
+ # => @meta :: meta_information about handled terms like [ancestors, descendants, struct_freq, observed_freq]
33
+ # => @max_freqs :: maximum freqs found for structural and observed freqs
34
+ # => @dicts :: bidirectional dictionaries with three levels <key|value>: 1º) <tag|hash2>; 2º) <(:byTerm/:byValue)|hash3>; 3º) dictionary <k|v>
35
+ # => @profiles :: set of terms assigned to an ID
36
+ # => @profilesDict :: set of profile IDs assigned to a term
37
+ # => @items :: hash with items relations to terms
38
+ # => @removable_terms :: array of terms to not be considered
39
+ # => @term_paths :: metainfo about parental paths of each term
40
+
41
+ @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:alt_id,:replaced_by,:consider]}
42
+ @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
43
+ @@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider]
44
+ @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
45
+ @@multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider, :subsetdef, :synonymtypedef, :property_value, :remark]
46
+ @@symbolizable_ids.concat(@@tags_with_trailing_modifiers)
47
+
48
+ #############################################
49
+ # CONSTRUCTOR
50
+ #############################################
51
+
52
+ # Instantiate a OBO_Handler object
53
+ # ===== Parameters
54
+ # +file+:: with info to be loaded (.obo ; .json)
55
+ # +load_file+:: activate load process automatically (only for .obo)
56
+ # +removable_terms+: term to be removed from calcs
57
+ def initialize(file: nil, load_file: false, removable_terms: [])
58
+ # Initialize object variables
59
+ @header = nil
60
+ @stanzas = {terms: {}, typedefs: {}, instances: {}}
61
+ @ancestors_index = {}
62
+ @descendants_index = {}
63
+ @alternatives_index = {}
64
+ @obsoletes_index = {}
65
+ @structureType = nil
66
+ @ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
67
+ @meta = {}
68
+ @special_tags = @@basic_tags.clone
69
+ @max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
70
+ @dicts = {}
71
+ @profiles = {}
72
+ @profilesDict = {}
73
+ @items = {}
74
+ @removable_terms = []
75
+ @term_paths = {}
76
+ # Load if proceeds
77
+ add_removable_terms(removable_terms) if !removable_terms.empty?
78
+ load(file) if load_file
79
+ end
80
+
81
+
82
+ #############################################
83
+ # CLASS METHODS
84
+ #############################################
85
+
86
  # Expand a (starting) term using a specific tag and return all extended terms into an array and
  # the relationship structure observed (hierarchical or circular). If a circular structure is
  # found, the extended array will be a unique vector without the starting term (no loops).
  # +Note+: we strongly recommend using the get_related_ids_by_tag function instead of this (directly)
  # ===== Parameters
  # +start_id+:: term where expansion starts
  # +terms+:: set to be used to expand
  # +target_tag+:: tag used to expand
  # +related_ids+:: accumulator of already expanded info (mutated in place across recursive calls)
  # +alt_ids+:: set of alternative IDs
  # ===== Returns
  # A vector with the observed structure (:hierarchical/:circular/:no_term/:source)
  # and the array with extended terms.
  def self.get_related_ids(start_id, terms, target_tag, related_ids = {}, alt_ids = {})
    # Take start_id term available info and already accumulated info
    current_associations = related_ids[start_id]
    current_associations = [] if current_associations.nil?
    return [:no_term,[]] if terms[start_id].nil?
    id_relations = terms[start_id][target_tag]
    return [:source,[]] if id_relations.nil?

    # Prepare auxiliar variables
    struct = :hierarchical

    # Study direct extensions; the queue is cloned so the stanza itself is not consumed.
    id_relations = id_relations.clone
    while id_relations.length > 0
      id = id_relations.shift
      id = alt_ids[id].first if alt_ids.include?(id) # NOTE: if you want to persist current ID instead source ID, re-implement this

      # Handle
      if current_associations.include?(id) # Check if already have been included into this expansion
        struct = :circular
      else
        current_associations << id
        if related_ids.include?(id) # Check if current already has been expanded
          current_associations = current_associations | related_ids[id]
          if current_associations.include?(start_id) # Check circular case
            struct = :circular
            # Break the loop by removing both ends of the cycle.
            [id, start_id].each{|repeated| current_associations.delete(repeated)}
          end
        else # Expand
          # Persist partial results before recursing so the recursive call can see them.
          related_ids[start_id] = current_associations
          structExp, current_related_ids = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids) # Expand current
          current_associations = current_associations | current_related_ids
          struct = :circular if structExp == :circular # Check struct
          if current_associations.include?(start_id) # Check circular case
            struct = :circular
            current_associations.delete(start_id)
          end
        end
      end
    end
    related_ids[start_id] = current_associations

    return struct, current_associations
  end
144
+
145
+
146
+ # Expand terms using a specific tag and return all extended terms into an array and
147
+ # the relationship structuture observed (hierarchical or circular). If circular structure is
148
+ # foumd, extended array will be an unique vector without starting term (no loops)
149
+ # ===== Parameters
150
+ # +terms+:: set to be used to expand
151
+ # +target_tag+:: tag used to expand
152
+ # +split_info_char+:: special regex used to split info (if it is necessary)
153
+ # +split_info_indx+:: special index to take splitted info (if it is necessary)
154
+ # +alt_ids+:: set of alternative IDs
155
+ # +obsoletes+:: integer with the number of obsolete IDs. used to calculate structure type.
156
+ # ===== Returns
157
+ # A vector with the observed structure (string) and the hash with extended terms
158
+ def self.get_related_ids_by_tag(terms:,target_tag:, alt_ids: {}, obsoletes: 0)
159
+ # Define structure type
160
+ structType = :hierarchical
161
+ related_ids = {}
162
+ terms.each do |id, tags|
163
+ # Check if target tag is defined
164
+ if !tags[target_tag].nil?
165
+ # Obtain related terms
166
+ set_structure, _ = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids)
167
+ # Check structure
168
+ structType = :circular if set_structure == :circular
169
+ end
170
+ end
171
+
172
+ # Check special case
173
+ structType = :atomic if related_ids.length <= 0
174
+ structType = :sparse if related_ids.length > 0 && ((terms.length - related_ids.length - obsoletes) >= 2)
175
+ # Return type and hash with related_ids
176
+ return structType, related_ids
177
+ end
178
+
179
+
180
+ # Class method to transform string with <tag : info> into hash structure
181
+ # ===== Parameters
182
+ # +attributes+:: array tuples with info to be transformed into hash format
183
+ # ===== Returns
184
+ # Attributes stored into hash structure
185
+ def self.info2hash(attributes, split_char = " ! ", selected_field = 0)
186
+ # Load info
187
+ info_hash = {}
188
+ # Only TERMS multivalue tags (future add Typedefs and Instance)
189
+ # multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider]
190
+ attributes.each do |tag, value|
191
+ # Check
192
+ raise EncodingError, 'Info element incorrect format' if (tag.nil?) || (value.nil?)
193
+ # Prepare
194
+ tag = tag.lstrip.to_sym
195
+ value.lstrip!
196
+ value = value.split(split_char)[selected_field].to_sym if @@tags_with_trailing_modifiers.include?(tag)
197
+
198
+ # Store
199
+ query = info_hash[tag]
200
+ if !query.nil? # Tag already exists
201
+ if !query.kind_of?(Array) # Check that tag is multivalue
202
+ raise('Attempt to concatenate plain text with another. The tag is not declared as multivalue. [' + tag.to_s + '](' + query + ')')
203
+ else
204
+ query << value # Add new value to tag
205
+ end
206
+ else # New entry
207
+ if @@multivalue_tags.include?(tag)
208
+ info_hash[tag] = [value]
209
+ else
210
+ info_hash[tag] = value
211
+ end
212
+ end
213
+ end
214
+ self.symbolize_ids(info_hash)
215
+ return info_hash
216
+ end
217
+
218
+
219
+ # Class method to load an OBO format file (based on OBO 1.4 format). Specially focused on load
220
+ # the Header, the Terms, the Typedefs and the Instances.
221
+ # ===== Parameters
222
+ # +file+:: OBO file to be loaded
223
+ # ===== Returns
224
+ # Hash with FILE, HEADER and STANZAS info
225
+ def self.load_obo(file) #TODO: Send to obo_parser class
226
+ raise("File is not defined") if file.nil?
227
+ # Data variables
228
+ header = ''
229
+ stanzas = {terms: {}, typedefs: {}, instances: {}}
230
+ # Auxiliar variables
231
+ infoType = 'Header'
232
+ currInfo = []
233
+ stanzas_flags = %w[[Term] [Typedef] [Instance]]
234
+ # Read file
235
+ File.open(file).each do |line|
236
+ line.chomp!
237
+ next if line.empty?
238
+ fields = line.split(':', 2)
239
+ # Check if new instance is found
240
+ if stanzas_flags.include?(line)
241
+ header = self.process_entity(header, infoType, stanzas, currInfo)
242
+ # Update info variables
243
+ currInfo = []
244
+ infoType = line.gsub!(/[\[\]]/, '')
245
+ next
246
+ end
247
+ # Concat info
248
+ currInfo << fields
249
+ end
250
+ # Store last loaded info
251
+ header = self.process_entity(header, infoType, stanzas, currInfo) if !currInfo.empty?
252
+
253
+ # Prepare to return
254
+ finfo = {:file => file, :name => File.basename(file, File.extname(file))}
255
+ return finfo, header, stanzas
256
+ end
257
+
258
+
259
+ # Handle OBO loaded info and stores it into correct container and format
260
+ # ===== Parameters
261
+ # +header+:: container
262
+ # +infoType+:: current ontology item type detected
263
+ # +stanzas+:: container
264
+ # +currInfo+:: info to be stored
265
+ # ===== Returns
266
+ # header newly/already stored
267
+ def self.process_entity(header, infoType, stanzas, currInfo)
268
+ info = self.info2hash(currInfo)
269
+ # Store current info
270
+ if infoType.eql?('Header')
271
+ header = info
272
+ else
273
+ id = info[:id]
274
+ case infoType
275
+ when 'Term'
276
+ stanzas[:terms][id] = info
277
+ when 'Typedef'
278
+ stanzas[:typedefs][id] = info
279
+ when 'Instance'
280
+ stanzas[:instances][id] = info
281
+ end
282
+ end
283
+ return header
284
+ end
285
+
286
+
287
+ # Symboliza all values into hashs using symbolizable tags as keys
288
+ # ===== Parameters
289
+ # +item_hash+:: hash to be checked
290
+ def self.symbolize_ids(item_hash)
291
+ @@symbolizable_ids.each do |tag|
292
+ query = item_hash[tag]
293
+ if !query.nil?
294
+ if query.kind_of?(Array)
295
+ query.map!{|item| item.to_sym}
296
+ else
297
+ item_hash[tag] = query.to_sym if !query.nil?
298
+ end
299
+ end
300
+ end
301
+ end
302
+
303
+
304
+ #
305
+ # ===== Parameters
306
+ # +root+:: main term to expand
307
+ # +ontology+:: to be cutted
308
+ # +clone+:: if true, given ontology object will not be mutated
309
+ # +remove_up+:: if true, stores only the root term given an it descendants. If false, only root ancestors will be stored
310
+ # ===== Returns
311
+ # An Ontology object with terms after cut the ontology.
312
+ def self.mutate(root, ontology, clone: true, remove_up: true)
313
+ ontology = ontology.clone if clone
314
+ # Obtain affected IDs
315
+ descendants = ontology.descendants_index[root]
316
+ descendants << root # Store itself to do not remove it
317
+ # Remove unnecesary terms
318
+ ontology.stanzas[:terms] = ontology.stanzas[:terms].select{|id,v| remove_up ? descendants.include?(id) : !descendants.include?(id)}
319
+ ontology.ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
320
+ ontology.max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
321
+ ontology.dicts = {}
322
+ ontology.removable_terms = []
323
+ ontology.term_paths = {}
324
+ # Recalculate metadata
325
+ ontology.build_index
326
+ ontology.add_observed_terms_from_profiles
327
+ # Finish
328
+ return ontology
329
+ end
330
+
331
+
332
+
333
+ #############################################
334
+ # GENERAL METHODS
335
+ #############################################
336
+
337
+ # Include removable terms to current removable terms list
338
+ # ===== Parameters
339
+ # +terms+:: terms array to be concatenated
340
+ def add_removable_terms(terms)
341
+ terms = terms.map{|term| term.to_sym}
342
+ @removable_terms.concat(terms)
343
+ end
344
+
345
+
346
+ # Include removable terms to current removable terms list loading new
347
+ # terms from a one column plain text file
348
+ # ===== Parameters
349
+ # +file+:: to be loaded
350
+ def add_removable_terms_from_file(file)
351
+ File.open(excluded_codes_file).each do |line|
352
+ line.chomp!
353
+ @removable_terms << line.to_sym
354
+ end
355
+ end
356
+
357
+
358
+ # Increase observed frequency for a specific term
359
+ # ===== Parameters
360
+ # +term+:: term which frequency is going to be increased
361
+ # +increas+:: frequency rate to be increased. Default = 1
362
+ # ===== Return
363
+ # true if process ends without errors, false in other cases
364
+ def add_observed_term(term:,increase: 1.0)
365
+ # Check
366
+ raise ArgumentError, "Term given is NIL" if term.nil?
367
+ return false unless @stanzas[:terms].include?(term)
368
+ return false if @removable_terms.include?(term)
369
+ if @alternatives_index.include?(term)
370
+ alt_id = @alternatives_index[term]
371
+ @meta[alt_id] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[alt_id].nil?
372
+ @meta[term] = @meta[alt_id]
373
+ end
374
+ # Check if exists
375
+ @meta[term] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[term].nil?
376
+ # Add frequency
377
+ @meta[term][:observed_freq] = 0 if @meta[term][:observed_freq] == -1
378
+ @meta[term][:observed_freq] += increase
379
+ # Check maximum frequency
380
+ @max_freqs[:observed_freq] = @meta[term][:observed_freq] if @max_freqs[:observed_freq] < @meta[term][:observed_freq]
381
+ return true
382
+ end
383
+
384
+
385
+ # Increase the arbitrary frequency of a given term set
386
+ # ===== Parameters
387
+ # +terms+:: set of terms to be updated
388
+ # +increase+:: amount to be increased
389
+ # +transform_to_sym+:: if true, transform observed terms to symbols. Default: false
390
+ # ===== Return
391
+ # true if process ends without errors and false in other cases
392
+ def add_observed_terms(terms:, increase: 1.0, transform_to_sym: false)
393
+ # Check
394
+ raise ArgumentError, 'Terms array given is NIL' if terms.nil?
395
+ raise ArgumentError, 'Terms given is not an array' if !terms.is_a? Array
396
+ # Add observations
397
+ if transform_to_sym
398
+ checks = terms.map{|id| self.add_observed_term(term: id.to_sym,increase: increase)}
399
+ else
400
+ checks = terms.map{|id| self.add_observed_term(term: id,increase: increase)}
401
+ end
402
+ return checks
403
+ end
404
+
405
+
406
+ # Compare to terms sets
407
+ # ===== Parameters
408
+ # +termsA+:: set to be compared
409
+ # +termsB+:: set to be compared
410
+ # +sim_type+:: similitude method to be used. Default: resnik
411
+ # +ic_type+:: ic type to be used. Default: resnik
412
+ # +bidirectional+:: calculate bidirectional similitude. Default: false
413
+ # ===== Return
414
+ # similitude calculated
415
+ def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
416
+ # Check
417
+ raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
418
+ raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
419
+ micasA = []
420
+ # Compare A -> B
421
+ termsA.each do |tA|
422
+ micas = termsB.map{|tB| self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)}
423
+ # Remove special cases
424
+ [false,nil].each do |err_value| micas.delete(err_value) end
425
+ # Obtain maximum value
426
+ micasA << micas.max if micas.length > 0
427
+ micasA << 0 if micas.length <= 0
428
+ end
429
+ means_sim = micasA.inject{ |sum, el| sum + el }.to_f / micasA.size
430
+ # Compare B -> A
431
+ if bidirectional
432
+ means_simA = means_sim * micasA.size
433
+ means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false) * termsB.size
434
+ means_sim = (means_simA + means_simB) / (termsA.size + termsB.size)
435
+ end
436
+ # Return
437
+ return means_sim
438
+ end
439
+
440
+
441
  # Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with itself
  # ===== Parameters
  # +external_profiles+:: set of external profiles. If nil, internal profiles will be compared with itself
  # +sim_type+:: similitude method to be used. Default: resnik
  # +ic_type+:: ic type to be used. Default: resnik
  # +bidirectional+:: calculate bidirectional similitude. Default: true
  # ===== Return
  # Similitudes calculated (hash of hashes: {main_id => {comp_id => similarity}})
  def compare_profiles(external_profiles: nil, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
    profiles_similarity = {} #calculate similarity between patients profile
    profiles_ids = @profiles.keys
    if external_profiles.nil?
      # NOTE(review): main_ids and comp_ids alias the SAME array here, so the
      # destructive shift in the loop below also shrinks comp_ids. Net effect:
      # each internal profile is only compared against profiles not yet
      # processed (upper triangle, no self-comparison). Presumably intentional
      # to avoid duplicate pairs — confirm before refactoring.
      comp_ids = profiles_ids
      comp_profiles = @profiles
      main_ids = comp_ids
      main_profiles = comp_profiles
    else
      comp_ids = external_profiles.keys
      comp_profiles = external_profiles
      main_ids = profiles_ids
      main_profiles = @profiles
    end
    # Compare each main profile against every (remaining) comparison profile
    while !main_ids.empty?
      curr_id = main_ids.shift
      current_profile = main_profiles[curr_id]
      comp_ids.each do |id|
        profile = comp_profiles[id]
        value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional)
        query = profiles_similarity[curr_id]
        if query.nil?
          profiles_similarity[curr_id] = {id => value}
        else
          query[id] = value
        end
      end
    end
    return profiles_similarity
  end
480
+
481
+
482
+ # Expand alternative IDs arround all already stored terms
483
+ # ===== Parameters
484
+ # +alt_tag+:: tag used to expand alternative IDs
485
+ # ===== Returns
486
+ # true if process ends without errors and false in other cases
487
+ def get_index_alternatives(alt_tag: @@basic_tags[:alternative][0])
488
+ # Check input
489
+ raise('stanzas terms empty') if @stanzas[:terms].empty?
490
+ # Take all alternative IDs
491
+ alt_ids2add = {}
492
+ @stanzas[:terms].each do |id, tags|
493
+ alt_ids = tags[alt_tag]
494
+ if !alt_ids.nil?
495
+ alt_ids = alt_ids - @removable_terms
496
+ # Update info
497
+ alt_ids.each do |alt_term|
498
+ @alternatives_index[alt_term] = id
499
+ alt_ids2add[alt_term] = @stanzas[:terms][id] if !@stanzas[:terms].include?(alt_term)
500
+ @ancestors_index[alt_term] = @ancestors_index[id] if !@ancestors_index[id].nil?
501
+ end
502
+ end
503
+ end
504
+ @stanzas[:terms].merge!(alt_ids2add)
505
+ end
506
+
507
+
508
+ # Executes basic expansions of tags (alternatives, obsoletes and parentals) with default values
509
+ # ===== Returns
510
+ # true if eprocess ends without errors and false in other cases
511
+ def build_index()
512
+ self.get_index_alternatives
513
+ self.get_index_obsoletes
514
+ self.get_index_child_parent_relations
515
+ @alternatives_index.map{|k,v| @alternatives_index[k] = self.extract_id(v)}
516
+ @alternatives_index.compact!
517
+ @obsoletes_index.map{|k,v| @obsoletes_index[k] = self.extract_id(v)}
518
+ @obsoletes_index.compact!
519
+ @ancestors_index.map{|k,v| @ancestors_index[k] = v.map{|t| self.extract_id(t)}.compact}
520
+ @ancestors_index.compact!
521
+ @descendants_index.map{|k,v| @descendants_index[k] = v.map{|t| self.extract_id(t)}.compact}
522
+ @descendants_index.compact!
523
+ self.get_index_frequencies
524
+ self.calc_dictionary(:name)
525
+ self.calc_dictionary(:synonym, select_regex: /\"(.*)\"/)
526
+ self.calc_term_levels(calc_paths: true)
527
+ end
528
+
529
+
530
+ # Calculates regular frequencies based on ontology structure (using parentals)
531
+ # ===== Returns
532
+ # true if everything end without errors and false in other cases
533
+ def get_index_frequencies()
534
+ # Check
535
+ if @ancestors_index.empty?
536
+ warn('ancestors_index object is empty')
537
+ else
538
+ # Prepare useful variables
539
+ alternative_terms = @alternatives_index.keys
540
+ # Per each term, add frequencies
541
+ @stanzas[:terms].each do |id, tags|
542
+ if @alternatives_index.include?(id)
543
+ alt_id = @alternatives_index[id]
544
+ query = @meta[alt_id] # Check if exist
545
+ if query.nil?
546
+ query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
547
+ @meta[alt_id] = query
548
+ end
549
+ @meta[id] = query
550
+ # Note: alternative terms do not increase structural frequencies
551
+ else # Official term
552
+ query = @meta[id] # Check if exist
553
+ if query.nil?
554
+ query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
555
+ @meta[id] = query
556
+ end
557
+ # Store metadata
558
+ query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !alternative_terms.include?(anc)}.to_f : 0.0
559
+ query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !alternative_terms.include?(desc)}.to_f : 0.0
560
+ query[:struct_freq] = query[:descendants] + 1.0
561
+ # Update maximums
562
+ @max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
563
+ @max_freqs[:max_depth] = query[:descendants] if @max_freqs[:max_depth] < query[:descendants]
564
+ end
565
+ end
566
+ end
567
+ end
568
+
569
+
570
+ # Expand obsoletes set and link info to their alternative IDs
571
+ # ===== Parameters
572
+ # +obs_tags+:: tags to be used to find obsoletes
573
+ # +alt_tags+:: tags to find alternative IDs (if are available)
574
+ # +reset_obsoletes+:: flag to indicate if obsoletes set must be reset. Default: true
575
+ # ===== Returns
576
+ # true if process ends without errors and false in other cases
577
+ def get_index_obsoletes(obs_tag: @@basic_tags[:obsolete], alt_tags: @@basic_tags[:alternative])
578
+ if @stanzas[:terms].empty?
579
+ warn('stanzas terms empty')
580
+ else
581
+ # Check obsoletes
582
+ @stanzas[:terms].each do |id, term_tags|
583
+ next if term_tags.nil?
584
+ query = term_tags[obs_tag]
585
+ if !query.nil? && query == 'true' # Obsolete tag presence
586
+ next if !@obsoletes_index[id].nil? # Already stored
587
+ # Check if alternative value is available
588
+ alt_ids = alt_tags.map{|alt| term_tags[alt]}.compact
589
+ if !alt_ids.empty?
590
+ alt_id = alt_ids.first.first #FIRST tag, FIRST id
591
+ # Store
592
+ @alternatives_index[id] = alt_id
593
+ @obsoletes_index[id] = alt_id
594
+ end
595
+ end
596
+ end
597
+ end
598
+ end
599
+
600
+
601
# Expand parentals set and link all info to their alternative IDs. Also launch frequencies process
# ===== Parameters
# +tag+:: tag used to expand parentals
# ===== Returns
# true if process ends without errors and false in other cases
def get_index_child_parent_relations(tag: @@basic_tags[:ancestors][0])
  if @stanzas[:terms].nil?
    warn('stanzas terms empty')
  else
    # Expand parental relations for every term using the class helper
    struct_type, parentals = self.class.get_related_ids_by_tag(terms: @stanzas[:terms],
                                                               target_tag: tag,
                                                               alt_ids: @alternatives_index,
                                                               obsoletes: @obsoletes_index.length)
    raise('Error expanding parentals') if struct_type.nil? || parentals.nil?
    # Build ancestors and descendants lookup tables
    ancestors = {}
    descendants = {}
    parentals.each do |id, parents|
      parents = parents - @removable_terms
      ancestors[id] = parents
      parents.each do |parent_id| # Register the reverse (descendant) link
        descendants[parent_id] = [] unless descendants.include?(parent_id)
        descendants[parent_id] << id
      end
    end
    # Mirror family info onto alternative IDs
    @alternatives_index.each do |id, alt|
      ancestors[id] = ancestors[alt] if ancestors.include?(alt)
      descendants[id] = descendants[alt] if descendants.include?(alt)
    end
    # Normalize the detected structure type
    unless [:atomic, :sparse].include?(struct_type)
      struct_type = struct_type == :circular ? :circular : :hierarchical
    end
    # Store results into object state
    @ancestors_index = ancestors
    @descendants_index = descendants
    @structureType = struct_type
  end
end
650
+
651
+
652
# Find ancestors of a given term
# ===== Parameters
# +term+:: to be checked
# +filter_alternatives+:: if true, remove alternatives from final results
# ===== Returns
# an array with all ancestors of given term (empty when parents are not available yet)
def get_ancestors(term, filter_alternatives = false)
  get_familiar(term, true, filter_alternatives)
end
661
+
662
+
663
# Find descendants of a given term
# ===== Parameters
# +term+:: to be checked
# +filter_alternatives+:: if true, remove alternatives from final results
# ===== Returns
# an array with all descendants of given term (empty when parents are not available yet)
def get_descendants(term, filter_alternatives = false)
  get_familiar(term, false, filter_alternatives)
end
672
+
673
+
674
# Find ancestors/descendants of a given term
# ===== Parameters
# +term+:: to be checked
# +return_ancestors+:: return ancestors if true or descendants if false
# +filter_alternatives+:: if true, remove alternatives from final results
# ===== Returns
# an array with all ancestors/descendants of given term (empty when none are indexed)
def get_familiar(term, return_ancestors = true, filter_alternatives = false)
  source = return_ancestors ? @ancestors_index : @descendants_index
  familiars = source[term]
  return [] if familiars.nil?
  # Clone so callers cannot mutate the internal index
  familiars = familiars.clone
  familiars.reject!{|fm| @alternatives_index.include?(fm)} if filter_alternatives
  familiars
end
694
+
695
+
696
# Obtain IC of an specific term
# ===== Parameters
# +termRaw+:: term which IC will be calculated
# +type+:: of IC to be calculated. Default: resnik
# +force+:: force re-calculate the IC. Do not check if it is already calculated
# +zhou_k+:: special coeficient for Zhou IC method
# ===== Returns
# the IC calculated
def get_IC(termRaw, type: :resnik, force: false, zhou_k: 0.5)
  term = termRaw.to_sym
  raise ArgumentError, "IC type specified (#{type}) is not allowed" if !@@allowed_calcs[:ics].include?(type)
  # Return cached value unless a recalculation is forced
  return @ics[type][term] if (@ics[type].include? term) && !force
  ic = -1
  case type # References: https://arxiv.org/ftp/arxiv/papers/1310/1310.8059.pdf ||| https://doi.org/10.1016/j.eswa.2012.01.082
  when :resnik # Resnik: -log(Freq(x) / Max_Freq), structural frequencies
    ic = -Math.log10(@meta[term][:struct_freq].fdiv(@max_freqs[:struct_freq]))
  when :resnik_observed # Resnik over observed (profile) frequencies
    ic = -Math.log10(@meta[term][:observed_freq].fdiv(@max_freqs[:observed_freq]))
  when :seco, :zhou # Seco: 1 - ( log(hypo(x) + 1) / log(max_nodes) )
    ic = 1 - Math.log10(@meta[term][:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
    # BUGFIX: was `if :zhou`, which is always truthy — the Zhou correction also
    # overwrote Seco ICs. Only apply it when :zhou was actually requested.
    if type == :zhou
      # Zhou: k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
      @ics[:seco][term] = ic # Keep the plain Seco value too
      ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(@meta[term][:descendants]).fdiv(Math.log10(@max_freqs[:max_depth])))
    end
  when :sanchez # Sanchez et al., ontology-based information-theoretic IC
    ic = -Math.log10((@meta[term][:descendants].fdiv(@meta[term][:ancestors]) + 1.0).fdiv(@max_freqs[:max_depth] + 1.0))
  end
  @ics[type][term] = ic
  return ic
end
760
+
761
+
762
# Calculates and return resnik ICs (by ontology and observed frequency) for observed terms
# ===== Returns
# two hashes with resnik and resnik_observed ICs for observed terms
def get_observed_ics_by_onto_and_freq
  resnik = {}
  resnik_observed = {}
  unless @profiles.empty?
    # Compute both IC flavours for every distinct observed term
    observed_terms = @profiles.values.flatten.uniq
    observed_terms.each do |term|
      get_IC(term)
      get_IC(term, type: :resnik_observed)
    end
    resnik = @ics[:resnik].select{|k, v| observed_terms.include?(k)}
    resnik_observed = @ics[:resnik_observed].select{|k, v| observed_terms.include?(k)}
  end
  return resnik.clone, resnik_observed.clone
end
780
+
781
+
782
# Find the IC of the Most Informative Common Ancestor (MICA) of two given terms
# ===== Parameters
# +termA+:: term to be cheked
# +termB+:: term to be checked
# +ic_type+:: IC formula to be used
# ===== Returns
# the IC of the MICA(termA,termB), or nil when no shared ancestor exists
def get_ICMICA(termA, termB, ic_type = :resnik)
  term, ic = get_MICA(termA, termB, ic_type)
  term.nil? ? nil : ic
end
793
+
794
+
795
# Find the Most Informative Common Ancestor (MICA) of two given terms
# ===== Parameters
# +termA+:: term to be cheked
# +termB+:: term to be checked
# +ic_type+:: IC formula to be used
# ===== Returns
# the MICA(termA,termB) and it's IC as a [term, ic] pair ([nil, -1.0] when none)
def get_MICA(termA, termB, ic_type = :resnik)
  # Resolve alternative IDs to their official terms first
  termA = @alternatives_index[termA] if @alternatives_index.include?(termA)
  termB = @alternatives_index[termB] if @alternatives_index.include?(termB)
  mica = [nil, -1.0]
  if termA.eql?(termB) # Trivial case: a term is its own MICA
    mica = [termA, get_IC(termA, type: ic_type)]
  else
    anc_A = get_ancestors(termA)
    anc_B = get_ancestors(termB)
    unless anc_A.empty? && anc_B.empty?
      # Each term counts as its own ancestor for the intersection
      anc_A << termA
      anc_B << termB
      # Keep the shared ancestor with the highest IC
      (anc_A & anc_B).each do |anc|
        ic = get_IC(anc, type: ic_type)
        mica = [anc, ic] if ic > mica[1]
      end
    end
  end
  mica
end
832
+
833
+
834
# Calculate similarity between two given terms
# ===== Parameters
# +termA+:: to be compared
# +termB+:: to be compared
# +type+:: similitude formula to be used
# +ic_type+:: IC formula to be used
# ===== Returns
# the similarity between both terms or nil when no MICA is available
def get_similarity(termA, termB, type: :resnik, ic_type: :resnik)
  raise ArgumentError, "SIM type specified (#{type}) is not allowed" if !@@allowed_calcs[:sims].include?(type)
  sim = nil
  mica_ic = get_ICMICA(termA, termB, ic_type)
  unless mica_ic.nil?
    case type
    when :resnik
      sim = mica_ic
    when :lin
      sim = (2.0 * mica_ic).fdiv(get_IC(termA, type: ic_type) + get_IC(termB, type: ic_type))
    when :jiang_conrath # This is not a similarity, this is a disimilarity (distance)
      sim = (get_IC(termA, type: ic_type) + get_IC(termB, type: ic_type)) - (2.0 * mica_ic)
    end
  end
  sim
end
860
+
861
+
862
# Method used to load information stored into an OBO file and store it into this object.
# ===== Parameters
# +file+:: OBO file to be parsed
# +build+:: if true, launch indexes building after loading
def load(file, build: true)
  _, header, stanzas = self.class.load_obo(file)
  @header = header
  @stanzas = stanzas
  # Drop terms flagged as removable before any index is built
  remove_removable()
  build_index() if build
end
874
+
875
# Delete every term listed in @removable_terms from the loaded term stanzas
def remove_removable()
  return if @removable_terms.empty?
  @removable_terms.each{|removable_id| @stanzas[:terms].delete(removable_id)}
end
879
+
880
+
881
# Exports an OBO_Handler object in json format
# ===== Parameters
# +file+:: where info will be stored
def write(file)
  # Collect every serializable field of the object
  obj_info = {header: @header,
              stanzas: @stanzas,
              ancestors_index: @ancestors_index,
              descendants_index: @descendants_index,
              alternatives_index: @alternatives_index,
              obsoletes_index: @obsoletes_index,
              structureType: @structureType,
              ics: @ics,
              meta: @meta,
              special_tags: @special_tags,
              max_freqs: @max_freqs,
              dicts: @dicts,
              profiles: @profiles,
              profilesDict: @profilesDict,
              items: @items,
              removable_terms: @removable_terms,
              term_paths: @term_paths}
  # Serialize as JSON and dump to disk
  File.write(file, obj_info.to_json)
end
906
+
907
+
908
# Check if a value can be parsed as a Float
# ===== Parameters
# +string+:: value to be checked
# ===== Returns
# true when Float() accepts the value, false otherwise
def is_number? string
  begin
    Float(string)
    true
  rescue StandardError
    false
  end
end
911
+
912
+
913
# Read a JSON file with an OBO_Handler object stored
# ===== Parameters
# +file+:: with object info
# ===== Return
# OBO_Handler internal fields
def read(file)
  # Load and parse the JSON dump (keys symbolized)
  info = JSON.parse(File.open(file).read, :symbolize_names => true)
  # Re-symbolize values that JSON flattened into strings
  info[:stanzas][:terms].map{|id, data| self.class.symbolize_ids(data)} # STANZAS
  info[:stanzas][:typedefs].map{|id, data| self.class.symbolize_ids(data)}
  info[:stanzas][:instances].map{|id, data| self.class.symbolize_ids(data)}
  info[:alternatives_index] = info[:alternatives_index].map{|id, value| [id, value.to_sym]}.to_h
  info[:ancestors_index].map{|id, family_arr| family_arr.map!{|item| item.to_sym}}
  info[:descendants_index].map{|id, family_arr| family_arr.map!{|item| item.to_sym}}
  info[:obsoletes_index] = info[:obsoletes_index].map{|id, value| [id, value.to_sym]}.to_h
  info[:dicts] = info[:dicts].each do |flag, dictionaries|
    # byTerm: keys may be numeric-as-string, values may be numbers, term arrays or plain values
    dictionaries[:byTerm] = dictionaries[:byTerm].map do |term, value|
      if !term.to_s.scan(/\A[-+]?[0-9]*\.?[0-9]+\Z/).empty? # Numeric dictionary
        [term.to_s.to_i, value.map{|t| t.to_sym}]
      elsif value.is_a? Numeric # Numeric dictionary
        [term.to_sym, value]
      elsif value.kind_of?(Array) && flag == :is_a
        [term.to_sym, value.map{|v| v.to_sym}]
      else
        [term.to_sym, value]
      end
    end
    dictionaries[:byTerm] = dictionaries[:byTerm].to_h
    # byValue: mirror handling for the reverse dictionary
    dictionaries[:byValue] = dictionaries[:byValue].map do |value, term|
      if value.is_a? Numeric # Numeric dictionary
        [value, term.to_sym]
      elsif term.is_a? Numeric # Numeric dictionary
        [value.to_s.to_sym, term]
      elsif flag == :is_a
        [value.to_sym, term.to_sym]
      elsif term.kind_of?(Array)
        [value.to_sym, term.map{|t| t.to_sym}]
      else
        [value.to_s, term.to_sym]
      end
    end
    dictionaries[:byValue] = dictionaries[:byValue].to_h
  end
  # Profiles: terms back to symbols; numeric-looking IDs back to integers
  info[:profiles].map{|id, terms| terms.map!{|term| term.to_sym}}
  info[:profiles].keys.map{|id| info[:profiles][id.to_s.to_i] = info[:profiles].delete(id) if self.is_number?(id.to_s)}
  info[:profilesDict].map{|term, ids| ids.map!{|id| id.to_sym if !id.is_a?(Numeric)}}
  info[:removable_terms] = info[:removable_terms].map{|term| term.to_sym}
  info[:special_tags] = info[:special_tags].each do |k, v|
    if v.kind_of?(Array)
      info[:special_tags][k] = v.map{|tag| tag.to_sym}
    else
      info[:special_tags][k] = v.to_sym
    end
  end
  info[:items].each{|k, v| info[:items][k] = v.map{|item| item.to_sym}}
  info[:term_paths].each{|term, data| info[:term_paths][term][:paths] = data[:paths].map{|path| path.map{|t| t.to_sym}}}
  # Assign every recovered field to the object state
  @header = info[:header]
  @stanzas = info[:stanzas]
  @ancestors_index = info[:ancestors_index]
  @descendants_index = info[:descendants_index]
  @alternatives_index = info[:alternatives_index]
  @obsoletes_index = info[:obsoletes_index]
  @structureType = info[:structureType].to_sym
  @ics = info[:ics]
  @meta = info[:meta]
  @special_tags = info[:special_tags]
  @max_freqs = info[:max_freqs]
  @dicts = info[:dicts]
  @profiles = info[:profiles]
  @profilesDict = info[:profilesDict]
  @items = info[:items]
  @removable_terms = info[:removable_terms]
  @term_paths = info[:term_paths]
end
992
+
993
+
994
# Check if a given ID is stored as term into this object
# ===== Parameters
# +id+:: to be checked
# ===== Return
# True if term is allowed or false in other cases
def exists? id
  stanzas[:terms].include?(id)
end
1002
+
1003
+
1004
# This method assumes that a text given contains an allowed ID. And will try to obtain it splitting it
# ===== Parameters
# +text+:: to be checked
# +splitBy+:: separator used to split the text
# ===== Return
# The correct ID if it can be found or nil in other cases
def extract_id(text, splitBy: ' ')
  return text if exists?(text)
  # Try the first token of the split text as a candidate ID
  candidate = text.to_s.split(splitBy).first.to_sym
  exists?(candidate) ? candidate : nil
end
1017
+
1018
+
1019
# Generate a bidirectional dictionary set using a specific tag and terms stanzas set
# This functions stores calculated dictionary into @dicts field.
# This functions stores first value for multivalue tags
# This function does not handle synonyms for byValue dictionaries
# ===== Parameters
# +tag+:: to be used to calculate dictionary
# +select_regex+:: gives a regex that can be used to modify value to be stored
# +substitute_alternatives+:: flag used to indicate if alternatives must, or not, be replaced by it official ID
# +store_tag+:: flag used to store dictionary. If nil, mandatory tag given will be used
# +multiterm+:: if true, byValue will allows multi-term linkage (array)
# +self_type_references+:: if true, program assumes that references will be between Ontology terms, and it term IDs will be checked
# ===== Return
# void. And stores calculated bidirectional dictionary into dictionaries main container
def calc_dictionary(tag, select_regex: nil, substitute_alternatives: true, store_tag: nil, multiterm: false, self_type_references: false)
  tag = tag.to_sym
  store_tag = tag if store_tag.nil?
  if @stanzas[:terms].empty?
    warn('Terms are not already loaded. Aborting dictionary calc')
  else
    by_term = {}
    by_value = {}
    # Walk every term stanza and harvest the requested tag values
    @stanzas[:terms].each do |term, tags|
      reference_term = term
      # Substitute alternative IDs by their official ID (unless the official one is obsolete)
      if @alternatives_index.include?(term) && substitute_alternatives
        reference_term = @alternatives_index[term] unless @obsoletes_index.include?(@alternatives_index[term])
      end
      query_tag = tags[tag]
      next if query_tag.nil?
      # Optionally reduce each raw value through the given regex
      unless select_regex.nil?
        if query_tag.kind_of?(Array)
          query_tag = query_tag.map{|value| value.scan(select_regex).first}
          query_tag.flatten!
        else
          query_tag = query_tag.scan(select_regex).first
        end
        query_tag.compact!
      end
      if query_tag.kind_of?(Array) # Multivalue tag
        unless query_tag.empty?
          if by_term.include?(reference_term)
            by_term[reference_term] = (by_term[reference_term] + query_tag).uniq
          else
            by_term[reference_term] = query_tag
          end
          if multiterm
            query_tag.each{|value| (by_value[value] ||= []) << reference_term}
          else
            query_tag.each{|value| by_value[value] = reference_term}
          end
        end
      else # Monovalue tag
        if by_term.include?(reference_term)
          by_term[reference_term] = (by_term[reference_term] + [query_tag]).uniq
        else
          by_term[reference_term] = [query_tag]
        end
        if multiterm
          (by_value[query_tag] ||= []) << reference_term
        else
          by_value[query_tag] = reference_term
        end
      end
    end

    # Normalize references that point to ontology terms themselves
    if self_type_references
      by_term.map do |term, references|
        corrected_references = references.map do |t|
          checked = self.extract_id(t)
          if checked.nil?
            t
          else
            # Keep byValue keys consistent with the corrected reference
            by_value[checked] = by_value.delete(t) if checked != t && !by_value.keys.include?(checked)
            checked
          end
        end
        by_term[term] = corrected_references.uniq
      end
    end

    # Re-impose the original stanza ordering over collected values
    by_term.map do |term, values|
      next unless self.exists?(term)
      reference_value = @stanzas[:terms][term][tag]
      next if reference_value.nil?
      unless select_regex.nil?
        if reference_value.kind_of?(Array)
          reference_value = reference_value.map{|value| value.scan(select_regex).first}
          reference_value.flatten!
        else
          reference_value = reference_value.scan(select_regex).first
        end
        reference_value.compact!
      end
      if self_type_references
        if reference_value.kind_of?(Array)
          aux = reference_value.map{|t| self.extract_id(t)}
        else
          aux = self.extract_id(reference_value)
        end
        reference_value = aux if !aux.nil?
      end
      reference_value = [reference_value] if !reference_value.kind_of?(Array)
      by_term[term] = reference_value + (values - reference_value)
    end

    # Store both directions under the requested tag
    @dicts[store_tag] = {byTerm: by_term, byValue: by_value}
  end
end
1138
+
1139
+
1140
# Calculates :is_a dictionary without alternatives substitution
def calc_ancestors_dictionary
  calc_dictionary(:is_a, substitute_alternatives: false, self_type_references: true)
end
1144
+
1145
+
1146
# Translate a given value using an already calculated dictionary
# ===== Parameters
# +toTranslate+:: value to be translated using dictionary
# +tag+:: used to generate the dictionary
# +byValue+:: boolean flag to indicate if dictionary must be used values as keys or terms as keys. Default: values as keys = true
# ===== Return
# translation
def translate(toTranslate, tag, byValue: true)
  if byValue
    dict = @dicts[tag][:byValue]
  else
    # Term lookups are always done through the main (official) ID
    dict = @dicts[tag][:byTerm]
    toTranslate = get_main_id(toTranslate)
  end
  dict[toTranslate]
end
1158
+
1159
+
1160
# Translate a name given
# ===== Parameters
# +name+:: to be translated
# ===== Return
# translated name or nil if it's not stored into this ontology
def translate_name(name)
  term = translate(name, :name)
  # Fall back to the synonym dictionary when the main name is unknown
  term = translate(name, :synonym) if term.nil?
  term
end
1170
+
1171
+
1172
# Translate several names and return translations and a list of names which couldn't be translated
# ===== Parameters
# +names+:: array to be translated
# ===== Return
# two arrays with translations and names which couldn't be translated respectively
def translate_names(names)
  translated = []
  rejected = []
  names.each do |name|
    tr = translate_name(name)
    tr.nil? ? rejected << name : translated << tr
  end
  return translated, rejected
end
1190
+
1191
+
1192
# Translates a given ID to it assigned name
# ===== Parameters
# +id+:: to be translated
# ===== Return
# main name or nil if it's not included into this ontology
def translate_id(id)
  names = translate(id, :name, byValue: false)
  # The byTerm dictionary stores an array of names; the first one is the main name
  names.nil? ? nil : names.first
end
1201
+
1202
+
1203
# Translates several IDs and returns translations and not allowed IDs list
# ===== Parameters
# +ids+:: to be translated
# ===== Return
# two arrays with translations and IDs which couldn't be translated respectively
def translate_ids(ids)
  translated = []
  rejected = []
  ids.each do |term_id|
    tr = self.translate_id(term_id.to_sym)
    if !tr.nil?
      translated << tr
    else
      # BUGFIX: previously `rejected << tr`, which pushed the nil translation
      # instead of the ID that failed to translate
      rejected << term_id
    end
  end
  return translated, rejected
end
1221
+
1222
+
1223
# ===== Returns
# the main ID assigned to a given ID. If it's a non alternative/obsolete ID itself will be returned
# ===== Parameters
# +id+:: to be translated
# ===== Return
# main ID related to a given ID. Returns nil if given ID is not an allowed ID
def get_main_id(id)
  return nil if !@stanzas[:terms].include? id
  new_id = id
  mainID = @alternatives_index[id]
  # BUGFIX/idiom: use short-circuit `&&` instead of eager `&`, so the obsolete
  # lookup is skipped entirely when no alternative is registered
  new_id = mainID if !mainID.nil? && !@obsoletes_index.include?(mainID)
  return new_id
end
1236
+
1237
+
1238
# Check a pull of IDs and return allowed IDs removing which are not official terms on this ontology
# ===== Parameters
# +ids+:: to be checked
# +substitute+:: if true, translate allowed IDs to their main ID
# ===== Return
# two arrays with allowed and rejected IDs respectively
def check_ids(ids, substitute: true)
  checked_codes = []
  rejected_codes = []
  ids.each do |id|
    if @stanzas[:terms].include? id
      checked_codes << (substitute ? get_main_id(id) : id)
    else
      rejected_codes << id
    end
  end
  return checked_codes, rejected_codes
end
1259
+
1260
+
1261
# Stores a given profile with an specific ID. If ID is already assigned to a profile, it will be replaced
# ===== Parameters
# +id+:: assigned to profile
# +terms+:: array of terms
# +substitute+:: substitute flag from check_ids
def add_profile(id, terms, substitute: true)
  warn("Profile assigned to ID (#{id}) is going to be replaced") if @profiles.include? id
  correct_terms, rejected_terms = check_ids(terms, substitute: substitute)
  warn('Given terms contains erroneus IDs. These IDs will be removed') unless rejected_terms.empty?
  # Numeric IDs are kept as-is; any other ID is symbolized
  key = id.is_a?(Numeric) ? id : id.to_sym
  @profiles[key] = correct_terms
end
1278
+
1279
+
1280
# Method used to store a pull of profiles
# ===== Parameters
# +profiles+:: array/hash of profiles to be stored. If it's an array, sequential numeric IDs will be assigned (starting at 0)
# +calc_metadata+:: if true, launch calc_profiles_dictionary process
# +reset_stored+:: if true, remove already stored profiles
# +substitute+:: substitute flag from check_ids
def load_profiles(profiles, calc_metadata: true, reset_stored: false, substitute: false)
  reset_profiles if reset_stored
  if profiles.kind_of?(Array)
    # Array input: index position becomes the profile ID
    profiles.each_with_index do |items, i|
      add_profile(i, items.map{|item| item.to_sym}, substitute: substitute)
    end
  else # Hash
    unless profiles.keys.select{|id| @profiles.include?(id)}.empty?
      warn('Some profiles given are already stored. Stored version will be replaced')
    end
    profiles.each{|id, prof| add_profile(id, prof, substitute: substitute)}
  end
  # Refresh observed term frequencies from scratch
  add_observed_terms_from_profiles(reset: true)
  calc_profiles_dictionary if calc_metadata
end
1306
+
1307
+
1308
# Internal method used to remove already stored profiles and restore observed frequencies
def reset_profiles
  @profiles = {}
  # Zero out every observed frequency counter
  @meta.each{|_term, info| info[:observed_freq] = 0}
  @max_freqs[:observed_freq] = 0
end
1316
+
1317
+
1318
# ===== Parameters
# +id+:: profile ID
# ===== Return
# specific profile or nil if it's not stored
def get_profile(id)
  @profiles[id]
end
1327
+
1328
+
1329
# ===== Return
# array with the size (term count) of every stored profile
def get_profiles_sizes()
  @profiles.values.map(&:length)
end
1336
+
1337
+
1338
# ===== Parameters
# +round_digits+:: number of digits to round result. Default: 4
# ===== Returns
# mean size of stored profiles
def get_profiles_mean_size(round_digits: 4)
  total = get_profiles_sizes.sum
  total.fdiv(@profiles.length).round(round_digits)
end
1348
+
1349
+
1350
# Calculates profiles sizes and returns size assigned to percentile given
# ===== Parameters
# +perc+:: percentile to be returned
# +increasing_sort+:: flag to indicate if sizes order must be increasing. Default: false
# ===== Returns
# value assigned to percentile asked
def get_profile_length_at_percentile(perc=50, increasing_sort: false)
  lengths = get_profiles_sizes.sort
  lengths.reverse! unless increasing_sort
  # Take the size index which does not overpass the selected percentile
  index = ((perc * (lengths.length - 1)).fdiv(100) - 0.5).round
  index = 0 if index < 0 # Guard against negative index from the literal calc
  lengths[index]
end
1364
+
1365
+
1366
# Translate a given profile to terms names
# ===== Parameters
# +prof+:: array of terms to be translated
# ===== Returns
# array of translated terms. Can include nils if some IDs are not allowed
def profile_names(prof)
  prof.map{|term| translate_id(term)}
end
1374
+
1375
+
1376
# Translates a bunch of profiles to it sets of term names
# ===== Parameters
# +profs+:: array of profiles
# +asArray+:: flag to indicate if results must be returned as: true => an array of name arrays; false => hash of translations
# ===== Returns
# translated profiles
def translate_profiles_ids(profs = [], asArray: true)
  profs = @profiles if profs.empty?
  # Array input gets sequential numeric IDs before translating
  profs = profs.each_with_index.map{|terms, index| [index, terms]}.to_h if profs.kind_of?(Array)
  names = profs.map{|id, terms| [id, profile_names(terms)]}.to_h
  asArray ? names.values : names
end
1388
+
1389
+
1390
# Includes as "observed_terms" all terms included into stored profiles
# ===== Parameters
# +reset+:: if true, reset observed freqs already stored before re-calculate
def add_observed_terms_from_profiles(reset: false)
  # NOTE(review): reset writes -1, not 0 — presumably a sentinel consumed by
  # add_observed_terms; confirm against that method
  @meta.each{|_term, freqs| freqs[:observed_freq] = -1} if reset
  @profiles.each{|_id, terms| add_observed_terms(terms: terms)}
end
1397
+
1398
+
1399
# Get a term frequency
# ===== Parameters
# +term+:: term to be checked
# +type+:: type of frequency to be returned. Allowed: [:struct_freq, :observed_freq]
# ===== Returns
# frequency of term given or nil if term is not allowed
def get_frequency(term, type: :struct_freq)
  meta_info = @meta[term]
  meta_info.nil? ? nil : meta_info[type]
end
1409
+
1410
+
1411
# Gets structural frequency of a term given
# ===== Parameters
# +term+:: to be checked
# ===== Returns
# structural frequency of given term or nil if term is not allowed
def get_structural_frequency(term)
  get_frequency(term, type: :struct_freq)
end
1419
+
1420
+
1421
# Gets observed frequency of a term given
# ===== Parameters
# +term+:: to be checked
# ===== Returns
# observed frequency of given term or nil if term is not allowed
def get_observed_frequency(term)
  get_frequency(term, type: :observed_freq)
end
1429
+
1430
+
1431
# Calculates frequencies of stored profiles terms
# ===== Parameters
# +ratio+:: if true, frequencies will be returned as ratios between 0 and 1.
# +literal+:: if true, literal terms will be used to calculate frequencies instead translate alternative terms
# +asArray+:: used to transform returned structure format from hash of Term-Frequency to an array of tuples [Term, Frequency]
# +translate+:: if true, term IDs will be translated to names
# ===== Returns
# stored profiles terms frequencies
def get_profiles_terms_frequency(ratio: true, literal: true, asArray: true, translate: true)
  n_profiles = @profiles.length
  if literal
    # Count every term occurrence exactly as written in the profiles
    freqs = {}
    @profiles.each do |_id, terms|
      terms.each do |literal_term|
        freqs[literal_term] = (freqs[literal_term] || 0) + 1
      end
    end
    if ratio || translate
      freqs.keys.each do |term| # Snapshot keys: entries may be re-keyed while translating
        freqs[term] = freqs[term].fdiv(n_profiles) if ratio
        if translate
          tr = self.translate_id(term)
          freqs[tr] = freqs.delete(term) unless tr.nil?
        end
      end
    end
    if asArray
      freqs = freqs.to_a
      freqs.sort!{|h1, h2| h2[1] <=> h1[1]} # Most frequent first
    end
  else # Use observed frequencies (alternatives already translated by the meta index)
    freqs = @meta.select{|_id, meta| meta[:observed_freq] > 0}.map{|id, meta| [id, ratio ? meta[:observed_freq].fdiv(n_profiles) : meta[:observed_freq]]}
    freqs = freqs.to_h if !asArray
    if translate
      freqs = freqs.map do |term, freq|
        tr = self.translate_id(term)
        tr.nil? ? [term, freq] : [tr, freq]
      end
    end
    if asArray
      freqs = freqs.map{|term, freq| [term, freq]}
      freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
    else
      freqs = freqs.to_h
    end
  end
  return freqs
end
1484
+
1485
+
1486
# Clean a given profile returning cleaned set of terms and removed ancestors term.
# ===== Parameters
# +prof+:: array of terms to be checked
# ===== Returns
# two arrays, first is the cleaned profile and second is the removed elements array
def remove_ancestors_from_profile(prof)
  # A term is redundant when it is an ancestor of another term in the same profile
  all_ancestors = prof.flat_map{|term| get_ancestors(term)}.uniq
  redundant = prof.select{|term| all_ancestors.include?(term)}
  return prof - redundant, redundant
end
1496
+
1497
+
1498
# Remove alternative IDs if their official ID is present. DOES NOT REMOVE
# synonyms or alternative IDs of the same official ID.
# ===== Parameters
# +prof+:: array of terms to be checked
# ===== Returns
# two arrays: the cleaned profile and the removed elements
def remove_alternatives_from_profile(prof)
  alt_ids = prof.select { |term| @alternatives_index.include?(term) }
  # Only drop an alternative when its official counterpart is also present.
  redundant = alt_ids.select { |alt| prof.include?(@alternatives_index[alt]) }
  return prof - redundant, redundant
end
1508
+
1509
+
1510
# Remove alternatives (if the official term is present) and ancestor terms
# of a given profile.
# ===== Parameters
# +profile+:: profile to be cleaned
# +remove_alternatives+:: if true, alternative IDs are also stripped
# ===== Returns
# cleaned profile
def clean_profile(profile, remove_alternatives: true)
  cleaned, _removed = self.remove_ancestors_from_profile(profile)
  cleaned, _removed = self.remove_alternatives_from_profile(cleaned) if remove_alternatives
  return cleaned
end
1525
+
1526
+
1527
# Remove alternatives (if the official term is present) and ancestor terms
# of every stored profile.
# ===== Parameters
# +store+:: if true, cleaned profiles replace the stored @profiles
# +remove_alternatives+:: forwarded to clean_profile
# ===== Returns
# a hash with cleaned profiles
def clean_profiles(store: false, remove_alternatives: true)
  cleaned_profiles = @profiles.each_with_object({}) do |(id, terms), acc|
    acc[id] = self.clean_profile(terms, remove_alternatives: remove_alternatives)
  end
  @profiles = cleaned_profiles if store
  return cleaned_profiles
end
1539
+
1540
+
1541
# Calculates the number of redundant ancestor terms present in each stored
# profile (difference in size before and after cleaning).
# ===== Returns
# array with the parental count for each profile
def parentals_per_profile
  cleaned = self.clean_profiles(remove_alternatives: false)
  @profiles.map { |id, terms| terms.length - cleaned[id].length }
end
1549
+
1550
+
1551
# Calculates the mean IC of a given profile.
# ===== Parameters
# +prof+:: profile to be checked
# +ic_type+:: IC flavour to be used
# +zhou_k+:: special coefficient for the Zhou IC method
# ===== Returns
# mean IC for the given profile
def get_profile_mean_IC(prof, ic_type: :resnik, zhou_k: 0.5)
  total_ic = prof.inject(0) { |sum, term| sum + self.get_IC(term, type: ic_type, zhou_k: zhou_k) }
  total_ic.fdiv(prof.length)
end
1561
+
1562
+
1563
# Calculates structural (ontology) and observed Resnik mean ICs for all
# stored profiles.
# ===== Returns
# two hashes: profile ID => Resnik IC, and profile ID => observed Resnik IC
def get_profiles_resnik_dual_ICs
  struct_ics = {}
  observ_ics = {}
  @profiles.each_pair do |profile_id, terms|
    struct_ics[profile_id] = self.get_profile_mean_IC(terms, ic_type: :resnik)
    observ_ics[profile_id] = self.get_profile_mean_IC(terms, ic_type: :resnik_observed)
  end
  return struct_ics.clone, observ_ics.clone
end
1575
+
1576
+
1577
# Calculates ontology structural levels for all ontology terms from the
# path metadata in @term_paths, and stores the result in @dicts[:level].
# ===== Parameters
# +calc_paths+:: calculates term paths if they are not already calculated
# +shortest_path+:: if true, level is calculated with the shortest path;
#                   the largest path is used otherwise
def calc_term_levels(calc_paths: false, shortest_path: true)
  if @term_paths.empty?
    if calc_paths
      self.calc_term_paths
    else
      # Nothing to work from and not allowed to compute it: bail out below.
      warn('Term paths are not already loaded. Aborting dictionary calc')
    end
  end
  if !@term_paths.empty?
    byTerm = {}   # term  => level
    byValue = {}  # level => [terms]
    # Calc per term
    @term_paths.each do |term, info|
      level = shortest_path ? info[:shortest_path] : info[:largest_path]
      if level.nil?
        level = -1  # sentinel level for terms without a computed path length
      else
        level = level.round(0)
      end
      byTerm[term] = level
      queryLevels = byValue[level]
      if queryLevels.nil?
        byValue[level] = [term]
      else
        byValue[level] << term
      end
    end
    # NOTE: keys are intentionally swapped relative to the local variables:
    # :byTerm holds level => [terms] and :byValue holds term => level,
    # because in this dictionary the "value" (level) has multiplicity and
    # the term is the unique side.
    @dicts[:level] = {byTerm: byValue, byValue: byTerm} # Note: in this case, value has multiplicity and term is unique value
    # Update maximum depth
    @max_freqs[:max_depth] = byValue.keys.max
  end
end
1613
+
1614
+
1615
# Check whether the given term ID is marked as obsolete.
def is_obsolete?(term)
  @obsoletes_index.include?(term)
end
1619
+
1620
# Check whether the given term ID is marked as an alternative ID.
def is_alternative?(term)
  @alternatives_index.include?(term)
end
1624
+
1625
# Finds the paths of every term by following its ancestors and stores all
# possible root-ward paths for the term and its parentals into @term_paths,
# together with per-term path metadata (count, longest, shortest).
def calc_term_paths
  self.calc_ancestors_dictionary if !@dicts.keys.include?(:is_a) # Calculate direct parentals dictionary if it's not already calculated
  visited_terms = []
  @term_paths = {}
  if [:hierarchical, :sparse].include? @structureType
    terms = @stanzas[:terms].keys
    terms.each do |term|
      if self.is_obsolete?(term) || self.is_alternative?(term) # Special case (obsoletes)
        special_term = term
        # Redirect to the official/replacement term and compute paths there.
        term = self.is_obsolete?(term) ? @obsoletes_index[term] : @alternatives_index[term]
        @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
        # NOTE: this aliases the SAME hash under both keys, so the obsolete/
        # alternative entry always mirrors the official term's paths/metadata.
        @term_paths[special_term] = @term_paths[term]
        visited_terms << special_term
      end

      if !visited_terms.include?(term)
        @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
        parentals = @dicts[:is_a][:byTerm][term]
        if parentals.nil?
          # Root term: its only path is itself.
          @term_paths[term][:paths] << [term]
        else
          parentals.each do |direct_parental|
            if visited_terms.include? direct_parental # Use direct_parental already calculated paths
              new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
            else # Calculate new paths
              self.expand_path(direct_parental, visited_terms)
              new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
            end
            new_paths.each{|path| @term_paths[term][:paths] << path}
          end
        end
        visited_terms << term
      end
      # Update metadata
      @term_paths[term][:total_paths] = @term_paths[term][:paths].length
      paths_sizes = @term_paths[term][:paths].map{|path| path.length}
      @term_paths[term][:largest_path] = paths_sizes.max
      @term_paths[term][:shortest_path] = paths_sizes.min
    end
  else
    warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
  end
end
1670
+
1671
+
1672
# Recursive function which finds paths of a term by following its ancestors
# and stores all possible paths for it and its parentals into @term_paths.
# Used as the recursive helper of calc_term_paths.
# ===== Parameters
# +curr_term+:: current visited term
# +visited_terms+:: already expanded terms (mutated in place as terms finish)
def expand_path(curr_term, visited_terms)
  if !visited_terms.include?(curr_term) # Not already expanded
    @term_paths[curr_term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if @term_paths[curr_term].nil?
    direct_parentals = @dicts[:is_a][:byTerm][curr_term]
    if direct_parentals.nil? # No parents :: End of recurrence
      @term_paths[curr_term][:paths] << [curr_term]
    else # Expand and concat
      direct_parentals.each do |ancestor|
        # Recurse first so the ancestor's paths exist before prefixing them.
        self.expand_path(ancestor,visited_terms) if !visited_terms.include?(ancestor)
        new_paths = @term_paths[ancestor][:paths].map{|path| [curr_term, path].flatten}
        new_paths.each{|path| @term_paths[curr_term][:paths] << path}
      end
    end
    visited_terms << curr_term
  end
end
1692
+
1693
+
1694
# Gets the calculated ontology levels.
# ===== Returns
# calculated ontology levels (Key: level; Value: array of terms)
def get_ontology_levels
  # :byTerm is intentionally the level => terms direction for this dict.
  @dicts[:level][:byTerm].clone
end
1700
+
1701
+
1702
# Gets the ontology level of a specific term.
# ===== Returns
# term level (nil if the term has no recorded level)
def get_term_level(term)
  @dicts[:level][:byValue][term]
end
1708
+
1709
+
1710
# Return ontology levels restricted to the terms used by the stored profiles,
# repeating each term once per occurrence across profiles.
# ===== Parameters
# +uniq+:: if true, profile terms are de-duplicated before counting
# ===== Returns
# hash of term levels (Key: level; Value: array of term IDs)
def get_ontology_levels_from_profiles(uniq = true) # TODO: remove uniq and check dependencies
  profiles_terms = @profiles.values.flatten
  profiles_terms.uniq! if uniq
  occurrences = {} # term => number of occurrences in profiles_terms
  profiles_terms.each do |term|
    occurrences[term] = occurrences.fetch(term, 0) + 1
  end
  levels_filtered = {}
  @dicts[:level][:byTerm].each do |level, terms|
    # Keep only profile terms, repeated according to their multiplicity.
    present = terms.map { |t| profiles_terms.include?(t) ? Array.new(occurrences[t], t) : nil }.flatten.compact
    levels_filtered[level] = present unless present.empty?
  end
  return levels_filtered
end
1728
+
1729
+
1730
# Calculates the profiles dictionary (Key: term; Value: IDs of profiles
# containing that term) and stores it in @profilesDict.
def calc_profiles_dictionary
  if @profiles.empty?
    warn('Profiles are not already loaded. Aborting dictionary calc')
  else
    byTerm = {} # term => [profile IDs]
    @profiles.each do |id, terms|
      terms.each { |term| (byTerm[term] ||= []) << id }
    end
    @profilesDict = byTerm
  end
end
1749
+
1750
+
1751
# Gets the calculated profiles dictionary (term => profile IDs).
# ===== Return
# profiles dictionary (clone)
def get_terms_linked_profiles
  @profilesDict.clone
end
1757
+
1758
+
1759
# Get the profiles related to a given term.
# ===== Parameters
# +term+:: to be checked
# ===== Returns
# profiles which contain the given term (nil if none recorded)
def get_term_linked_profiles(term)
  @profilesDict[term]
end
1767
+
1768
+
1769
# Gets a metainfo table from a set of terms.
# ===== Parameters
# +terms+:: IDs to be expanded
# +filter_alternatives+:: flag forwarded to get_descendants
# ===== Returns
# an array of pairs [[TermID, TermName], [[ChildID, ChildName], ...]]
def get_childs_table(terms, filter_alternatives = false)
  terms.map do |term_id|
    children = self.get_descendants(term_id, filter_alternatives)
    [[term_id, self.translate_id(term_id)], children.map { |child| [child, self.translate_id(child)] }]
  end
end
1782
+
1783
+
1784
# Store a specific relations hash into the ITEMS structure (@items).
# ===== Parameters
# +relations+:: term => items hash to be stored
# +remove_old_relations+:: substitute the ITEMS structure instead of merging
# +expand+:: if true, already stored keys are updated with the unique union
#            of both item sets
def load_item_relations_to_terms(relations, remove_old_relations = false, expand = false)
  @items = {} if remove_old_relations
  unknown = relations.reject { |term, _items| @stanzas[:terms].include?(term) }
  unless unknown.empty?
    warn('Some terms specified are not stored into this ontology. These not correct terms will be stored too')
  end
  unless remove_old_relations
    duplicated = relations.select { |term, _items| @items.include?(term) }
    if !duplicated.empty? && !expand
      warn('Some terms given are already stored. Stored version will be replaced')
    end
  end
  if expand
    relations.each do |term, items|
      # Union with any previously stored items for this term.
      @items[term] = @items.keys.include?(term) ? (@items[term] + items).uniq : items
    end
  else
    @items.merge!(relations)
  end
end
1811
+
1812
+
1813
# Assign an already-calculated dictionary as the items set.
# ===== Parameters
# +dictID+:: dictionary ID to be stored (its :byTerm side will be used)
# +remove_old_relations+:: if true, previously stored items are discarded first
def set_items_from_dict(dictID, remove_old_relations = false)
  @items = {} if remove_old_relations
  if(@dicts.keys.include?(dictID))
    # BUGFIX: the previous code called the non-mutating Hash#merge and
    # discarded its result, so @items was never actually updated.
    @items.merge!(@dicts[dictID][:byTerm])
  else
    warn('Specified ID is not calculated. Dict will not be added as a items set')
  end
end
1824
+
1825
+
1826
# Computes child-term similarity and imputes items up to parental terms.
# Item keys must be terms allowed by this ontology. Similarity is computed by
# exact item match unless an ontology object is provided, in which case MICAs
# (most informative common ancestors) are used.
# ===== Parameters
# +ontology+:: (Optional) ontology object which the given items belong to
# +minimum_childs+:: minimum number of childs needed to infer relations to a
#                    parental. Default: 2
# +clean_profiles+:: if true, the ontology's clean_profile method is applied
#                    to inferred profiles. Only if an ontology object is given
# ===== Returns
# nil (updates @items in place)
def expand_items_to_parentals(ontology: nil, minimum_childs: 2, clean_profiles: true)
  # Check item keys
  if @items.empty?
    warn('Items have been not provided yet')
    return nil
  end
  targetKeys = @items.keys.select{|k| self.exists?(k)}
  if targetKeys.length == 0
    warn('Any item key is allowed')
    return nil
  elsif targetKeys.length < @items.keys.length
    warn('Some item keys are not allowed')
  end

  # Expand to parentals
  targetKeys << targetKeys.map{|t| self.get_ancestors(t, true)}
  targetKeys.flatten!
  targetKeys.uniq!

  # Obtain levels (go from leaves to roots)
  levels = targetKeys.map{|term| self.get_term_level(term)}
  levels.compact!
  levels.uniq!
  levels.sort!
  levels.reverse!
  levels.shift # Leaves are not expandable

  # Expand from leaves to roots
  levels.map do |lvl|
    curr_keys = targetKeys.select{|k| self.get_term_level(k) == lvl}
    curr_keys.map do |term_expand|
      # to_infer ends up as an Array (minimum_childs == 1 special case) or a
      # Hash of item => evidence score; both shapes are handled below.
      to_infer = []
      # Obtain childs
      childs = self.get_descendants(term_expand,true).select{|t| @items.keys.include?(t)}
      # Expand
      if childs.length > 0 && minimum_childs == 1 # Special case
        to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
      elsif childs.length >= minimum_childs
        to_infer = Hash.new(0)
        # Compare each (unordered) pair of child terms exactly once
        while childs.length > 1
          curr_term = childs.shift
          childs.each do |compare_term|
            pivot_items = @items[curr_term]
            compare_items = @items[compare_term]
            if ontology.nil? # Exact match
              # +2 because the item was seen in both children of the pair
              pivot_items.map do |pitem|
                if compare_items.include?(pitem)
                  to_infer[pitem] += 2
                end
              end
            else # Find MICAs
              local_infer = Hash.new(0)
              pivot_items.map do |pitem|
                micas = compare_items.map{|citem| ontology.get_MICA(pitem, citem)}
                maxmica = micas[0]
                micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
                local_infer[maxmica.first] += 1
              end
              compare_items.map do |citem|
                micas = pivot_items.map{|pitem| ontology.get_MICA(pitem, citem)}
                maxmica = micas[0]
                micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
                local_infer[maxmica.first] += 1
              end
              # Keep only MICAs supported from both directions of the pair
              local_infer.each{|t,freq| to_infer[t] += freq if freq >= 2}
            end
          end
        end
        # Filter infer
        to_infer = to_infer.select{|k,v| v >= minimum_childs}
      end
      # Infer
      if to_infer.length > 0
        @items[term_expand] = [] if @items[term_expand].nil?
        if to_infer.kind_of?(Array)
          @items[term_expand] = (@items[term_expand] + to_infer).uniq
        else
          @items[term_expand] = (@items[term_expand] + to_infer.keys).uniq
        end
        @items[term_expand] = ontology.clean_profile(@items[term_expand]) if clean_profiles && !ontology.nil?
      elsif !@items.include?(term_expand)
        # Nothing inferred and no own items: stop propagating this term upward
        targetKeys.delete(term_expand)
      end
    end
  end
end
1921
+
1922
+
1923
+
1924
# Runs a Fisher exact test between an external item list and the items
# associated to each ontology term, walking term levels from deepest to
# shallowest. The original author flagged this method as UNCHECKED — review
# before use.
# ===== Parameters
# +external_item_list+:: items to test against each term's associated items
# +mode+:: test strategy; only :elim is implemented (penalizes parents of
#          significant terms by removing their items from later tests)
# +thresold+:: p-value significance threshold (sic, original spelling)
# ===== Returns
# array of [term, pval] pairs
def compute_relations_to_items(external_item_list, mode, thresold)
  results = []
  penalized_terms = {}
  # terms_levels = get_terms_levels(@items_relations.keys)
  terms_with_items_levels = @items_relations.keys.map{|term| self.get_term_level(term)}.uniq
  terms_levels = self.get_ontology_levels().select{|k,v| terms_with_items_levels.include?(k)}
  # NOTE(review): Hash#each returns the receiver, so the block below builds
  # pairs that are discarded — this filtering line is a no-op as written;
  # it was presumably meant to be map{...}.to_h. Confirm intent before fixing.
  terms_levels = terms_levels.each{|level,terms| [level, terms.select{|t| @items_relations.keys.include?(t)}] } # Use only items terms. MAYBE IT'S NOT OUR TARGET (line added by fmj)
  levels = terms_levels.keys.sort
  levels.reverse_each do |level|
    terms_levels[level].each do |term|
      associated_items = @items_relations[term]
      if mode == :elim
        items_to_remove = penalized_terms[term]
        items_to_remove = [] if items_to_remove.nil?
        pval = get_fisher_exact_test(
          external_item_list - items_to_remove,
          associated_items - items_to_remove,
          ((associated_items | external_item_list) - items_to_remove).length
        )
        if pval <= thresold
          parents = get_parents(term) # Save the items for each parent term to remove them later in the fisher test
          parents.each do |prnt|
            query = penalized_terms[prnt]
            if query.nil?
              penalized_terms[prnt] = @items_relations[term].clone # We need a new array to store the following iterations
            else
              query.concat(@items_relations[term])
            end
          end
        end
      end
      # NOTE(review): when mode != :elim, pval is never assigned and Ruby
      # hoists it to nil, so results would contain [term, nil] pairs.
      results << [term, pval]
    end
  end
  return results
end
1965
+
1966
+
1967
# Check if a given ID is a removable (blacklist) term.
# +DEPRECATED+ use is_removable? instead.
# ===== Parameters
# +id+:: to be checked
# ===== Returns
# true if the given term is a removable (blacklist) term, false otherwise
def is_removable(id)
  warn "[DEPRECATION] `is_removable` is deprecated. Please use `is_removable?` instead."
  @removable_terms.include?(id.to_sym)
end
1977
+
1978
# Check if a given ID is a removable (blacklist) term.
# ===== Parameters
# +id+:: to be checked
# ===== Returns
# true if the given term is a removable (blacklist) term, false otherwise
def is_removable?(id)
  @removable_terms.include?(id.to_sym)
end
1986
+
1987
############################################
# SPECIAL METHODS
#############################################

# Deep equality: two ontologies are equal when every piece of internal state
# compares equal. The extra items-keys check mirrors the original behavior.
def ==(other)
  compared_attrs = [:header, :stanzas, :ancestors_index, :alternatives_index,
                    :obsoletes_index, :structureType, :ics, :meta, :dicts,
                    :profiles, :profilesDict, :removable_terms, :special_tags,
                    :items, :term_paths, :max_freqs]
  compared_attrs.all? { |attr| self.send(attr) == other.send(attr) } &&
    (self.items.keys - other.items.keys).empty?
end
2009
+
2010
+
2011
# Builds a shallow per-attribute copy of this ontology: each top-level
# structure is cloned, but nested objects are shared with the original.
def clone
  copy = Ontology.new
  copy.header = self.header.clone
  # Stanzas are cloned per section so the copy gets fresh section hashes.
  [:terms, :typedefs, :instances].each do |section|
    copy.stanzas[section] = self.stanzas[section].clone
  end
  [:ancestors_index, :descendants_index, :alternatives_index, :obsoletes_index,
   :structureType, :ics, :meta, :dicts, :profiles, :profilesDict, :items,
   :removable_terms, :term_paths, :max_freqs].each do |attr|
    copy.send("#{attr}=", self.send(attr).clone)
  end
  return copy
end
2033
+
2034
+
2035
#############################################
# ACCESS CONTROL
#############################################

# The original declared identical attr_reader and attr_writer lists;
# attr_accessor is the idiomatic, interface-identical equivalent.
attr_accessor :file, :header, :stanzas, :ancestors_index, :descendants_index, :special_tags, :alternatives_index, :obsoletes_index, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :profilesDict, :items, :removable_terms, :term_paths
2041
+ end