semtools 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/semtools.rb ADDED
@@ -0,0 +1,8 @@
1
# Entry point for the semtools gem: pulls in every library component.
require "semtools/version"
require "semtools/sim_handler"
require "semtools/math_methods"
require "semtools/ontology"

# Top-level namespace for the semtools gem.
module Semtools
  # Your code goes here...
end
@@ -0,0 +1,140 @@
1
# TODO: Make a pull request to https://rubygems.org/gems/ruby-statistics, with all the statistic code implemented here.
# Compute Fisher's exact test => http://www.biostathandbook.com/fishers.html
# ===== Parameters
# +listA+:: first set of elements
# +listB+:: second set of elements
# +all_elements_count+:: total number of elements in the universe
# +tail+:: 'two_sided' or 'less'
# +weigths+:: optional hash of per-element weights (element => weight); enables the weighted variant
# ===== Returns
# the accumulated hypergeometric probability (p-value) for the requested tail
def get_fisher_exact_test(listA, listB, all_elements_count, tail = 'two_sided', weigths = nil)
  listA_listB = listA & listB
  listA_nolistB = listA - listB
  nolistA_listB = listB - listA
  if weigths.nil?
    listA_listB_count = listA_listB.length
    listA_nolistB_count = listA_nolistB.length
    nolistA_listB_count = nolistA_listB.length
    nolistA_nolistB_count = all_elements_count - (listA | listB).length
  else
    # Weighted Fisher exact test as proposed in "Improved scoring of functional groups
    # from gene expression data by decorrelating GO graph structure"
    # https://academic.oup.com/bioinformatics/article/22/13/1600/193669
    listA_listB_count = listA_listB.sum { |i| weigths[i] }.ceil
    listA_nolistB_count = listA_nolistB.sum { |i| weigths[i] }.ceil
    nolistA_listB_count = nolistA_listB.sum { |i| weigths[i] }.ceil
    nolistA_nolistB_count = (weigths.keys - (listA | listB)).sum { |i| weigths[i] }.ceil
    all_elements_count = weigths.values.sum.ceil
  end
  if tail == 'two_sided'
    accumulated_prob = get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
  elsif tail == 'less'
    accumulated_prob = get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
  else
    # NOTE(fix): the original silently returned nil for an unknown tail
    raise ArgumentError, "Unsupported tail: #{tail}"
  end
  return accumulated_prob
end


# Two-sided tail: sum the probability of every 2x2 table (with the same margins)
# whose probability is <= the observed table's probability.
# https://www.sheffield.ac.uk/polopoly_fs/1.43998!/file/tutorial-9-fishers.pdf
# ===== Returns
# the two-sided p-value
def get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
  ref_prob = compute_hyper_prob(
    listA_listB_count,
    listA_nolistB_count,
    nolistA_listB_count,
    nolistA_nolistB_count,
    all_elements_count
  )
  accumulated_prob = ref_prob
  # Tables shifted towards the 'less' side (decreasing the top-left cell).
  # NOTE(fix): the original broke out of this loop at the first table with
  # prob > ref_prob; since the hypergeometric distribution is unimodal, tables
  # past the mode can again have prob <= ref_prob and must still be counted
  # (e.g. [[2,0],[0,2]] previously gave 1/6 instead of the correct 1/3).
  [listA_listB_count, nolistA_nolistB_count].min.times do |n|
    n += 1
    prob = compute_hyper_prob(
      listA_listB_count - n,
      listA_nolistB_count + n,
      nolistA_listB_count + n,
      nolistA_nolistB_count - n,
      all_elements_count
    )
    accumulated_prob += prob if prob <= ref_prob
  end

  # Tables shifted towards the 'greater' side (increasing the top-left cell).
  [listA_nolistB_count, nolistA_listB_count].min.times do |n|
    n += 1
    prob = compute_hyper_prob(
      listA_listB_count + n,
      listA_nolistB_count - n,
      nolistA_listB_count - n,
      nolistA_nolistB_count + n,
      all_elements_count
    )
    accumulated_prob += prob if prob <= ref_prob
  end

  return accumulated_prob
end


# Lower tail: sum P(X <= observed overlap) over all tables reachable by
# decreasing the top-left cell.
# ===== Returns
# the lower-tail p-value
def get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
  accumulated_prob = 0
  # NOTE(fix): iterate min+1 times so the most extreme table (shift == min) is
  # included; the original `min.times` dropped it (off-by-one).
  ([listA_listB_count, nolistA_nolistB_count].min + 1).times do |n|
    accumulated_prob += compute_hyper_prob(
      listA_listB_count - n,
      listA_nolistB_count + n,
      nolistA_listB_count + n,
      nolistA_nolistB_count - n,
      all_elements_count
    )
  end
  return accumulated_prob
end


# Hypergeometric probability of a single 2x2 table [[a,b],[c,d]] with total n.
# https://en.wikipedia.org/wiki/Fisher%27s_exact_test
def compute_hyper_prob(a, b, c, d, n)
  binomA = binom(a + b, a)
  binomC = binom(c + d, c)
  divisor = binom(n, a + c)
  return (binomA * binomC).fdiv(divisor)
end


# Binomial coefficient C(n, k) using exact integer arithmetic.
# Boundary cases (k <= 0 or k >= n) return 1, matching the callers' usage
# where k is always within [0, n].
def binom(n, k)
  if k > 0 && k < n
    (1 + n - k..n).inject(:*) / (1..k).inject(:*)
  else
    1
  end
end
97
+
98
# Compute adjusted p-values with the Benjamini-Hochberg FDR correction.
# https://rosettacode.org/wiki/P-value_correction#Ruby
# ===== Parameters
# +arr_pvalues+:: array of raw p-values
# ===== Returns
# array of BH-adjusted p-values, in the same order as the input
def get_benjaminiHochberg_pvalues(arr_pvalues)
  n = arr_pvalues.length
  arr_o = order(arr_pvalues, true)
  arr_cummin_input = []
  (0..(n - 1)).each do |i|
    # Scale each p-value (largest first) by n / rank
    arr_cummin_input[i] = (n / (n - i).to_f) * arr_pvalues[arr_o[i]]
  end
  arr_ro = order(arr_o) # Inverse permutation, to restore input order
  arr_cummin = cummin(arr_cummin_input)
  arr_pmin = pmin(arr_cummin)
  return arr_pmin.values_at(*arr_ro)
end


# Return the permutation of indices that sorts +array+ (R-style order()).
# NOTE(fix): the original used array.index(value), which returns the FIRST
# matching index and therefore produced an invalid (repeating) permutation when
# the array contained duplicate values. Sorting (value, index) pairs is
# duplicate-safe and deterministic (stable by original index).
# ===== Parameters
# +array+:: values to order
# +decreasing+:: if true, return the ordering for a descending sort
# ===== Returns
# array of indices
def order(array, decreasing = false)
  pairs = array.each_with_index.to_a
  if decreasing
    pairs.sort_by! { |value, idx| [-value, idx] }
  else
    pairs.sort_by! { |value, idx| [value, idx] }
  end
  return pairs.map { |_, idx| idx }
end


# Running (cumulative) minimum of an array.
def cummin(array)
  cumulative_min = array.first
  arr_cummin = []
  array.each do |p|
    cumulative_min = [p, cumulative_min].min
    arr_cummin << cumulative_min
  end
  return arr_cummin
end


# Element-wise minimum against 1 (clamps every adjusted p-value to <= 1).
# NOTE(fix): the original contained an unreachable `abort` (min against 1
# guarantees no element exceeds 1); removed — aborting the whole process from a
# library helper would be wrong even if it were reachable.
def pmin(array)
  return array.map { |value| [value, 1].min }
end
140
+
@@ -0,0 +1,2041 @@
1
+ require 'json'
2
+
3
+
4
+ class Ontology
5
+ #########################################################
6
+ # AUTHOR NOTES
7
+ #########################################################
8
+
9
+ # 1 - Store @profiles as @stanzas[:instances]
10
+ # 2 - Items values (not keys) are imported as strings, not as symbols (maybe add a flag which indicates if values are, or not, symbols?)
11
+
12
+
13
+ #############################################
14
+ # FIELDS
15
+ #############################################
16
+ # Handled class variables
17
+ # => @@basic_tags :: hash with main OBO structure tags
18
+ # => @@allowed_calcs :: hash with allowed ICs and similarities calcs
19
+ # => @@symbolizable_ids :: tags which can be symbolized
20
+ # => @@tags_with_trailing_modifiers :: tags which can include extra info after specific text modifiers
21
+ #
22
+ # Handled object variables
23
+ # => @header :: file header (if is available)
24
+ # => @stanzas :: OBO stanzas {:terms,:typedefs,:instances}
25
+ # => @ancestors_index :: hash of ancestors per each term handled with any structure relationships
26
+ # => @descendants_index :: hash of descendants per each term handled with any structure relationships
27
+ # => @alternatives_index :: hash of alternative IDs (includes alt_id and obsoletes)
28
+ # => @obsoletes_index :: hash of obsoletes and it's new ids
29
+ # => @special_tags :: set of special tags to be expanded (:is_a, :obsolete, :alt_id)
30
+ # => @structureType :: type of ontology structure depending on ancestors relationship. Allowed: {atomic, sparse, circular, hierarchical}
31
+ # => @ics :: already calculated ICs for handled terms and IC types
32
+ # => @meta :: meta_information about handled terms like [ancestors, descendants, struct_freq, observed_freq]
33
+ # => @max_freqs :: maximum freqs found for structural and observed freqs
34
+ # => @dicts :: bidirectional dictionaries with three levels <key|value>: 1º) <tag|hash2>; 2º) <(:byTerm/:byValue)|hash3>; 3º) dictionary <k|v>
35
+ # => @profiles :: set of terms assigned to an ID
36
+ # => @profilesDict :: set of profile IDs assigned to a term
37
+ # => @items :: hash with items relations to terms
38
+ # => @removable_terms :: array of terms to not be considered
39
+ # => @term_paths :: metainfo about parental paths of each term
40
+
41
+ @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:alt_id,:replaced_by,:consider]}
42
+ @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
43
+ @@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider]
44
+ @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
45
+ @@multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider, :subsetdef, :synonymtypedef, :property_value, :remark]
46
+ @@symbolizable_ids.concat(@@tags_with_trailing_modifiers)
47
+
48
+ #############################################
49
+ # CONSTRUCTOR
50
+ #############################################
51
+
52
# Instantiate a OBO_Handler object
# ===== Parameters
# +file+:: file with info to be loaded (.obo ; .json)
# +load_file+:: if true, automatically launch the load process (only for .obo)
# +removable_terms+:: terms to be excluded from calculations
def initialize(file: nil, load_file: false, removable_terms: [])
  # Initialize object variables
  @header = nil
  @stanzas = {terms: {}, typedefs: {}, instances: {}}
  @ancestors_index = {}
  @descendants_index = {}
  @alternatives_index = {}
  @obsoletes_index = {}
  @structureType = nil
  # One empty IC cache per allowed IC type
  @ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
  @meta = {}
  @special_tags = @@basic_tags.clone
  # -1.0 acts as a "not computed yet" sentinel
  @max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
  @dicts = {}
  @profiles = {}
  @profilesDict = {}
  @items = {}
  @removable_terms = []
  @term_paths = {}
  # Load if proceeds
  add_removable_terms(removable_terms) if !removable_terms.empty?
  # NOTE(review): `load` here presumably resolves to an instance method defined
  # elsewhere in this class (otherwise it would hit Kernel#load) — confirm.
  load(file) if load_file
end
80
+
81
+
82
+ #############################################
83
+ # CLASS METHODS
84
+ #############################################
85
+
86
# Expand a (starting) term using a specific tag and return all extended terms into an array and
# the relationship structure observed (hierarchical or circular). If a circular structure is
# found, the extended array will be a unique vector without the starting term (no loops).
# +Note+: we strongly recommend using get_related_ids_by_tag instead of calling this directly.
# ===== Parameters
# +start_id+:: term from which to start expanding
# +terms+:: set of terms used to expand
# +target_tag+:: tag used to expand
# +related_ids+:: hash accumulator with already expanded info (mutated in place)
# +alt_ids+:: set of alternative IDs
# ===== Returns
# A vector with the observed structure (symbol) and the array with extended terms.
def self.get_related_ids(start_id, terms, target_tag, related_ids = {}, alt_ids = {})
  # Take start_id term available info and already accumulated info
  current_associations = related_ids[start_id]
  current_associations = [] if current_associations.nil?
  return [:no_term,[]] if terms[start_id].nil?
  id_relations = terms[start_id][target_tag]
  return [:source,[]] if id_relations.nil?

  # Prepare auxiliar variables
  struct = :hierarchical

  # Study direct extensions
  id_relations = id_relations.clone
  while id_relations.length > 0
    id = id_relations.shift
    # Translate alternative IDs to their main ID before expanding
    id = alt_ids[id].first if alt_ids.include?(id) # NOTE: if you want to persist current ID instead source ID, re-implement this

    # Handle
    if current_associations.include?(id) # Check if it has already been included into this expansion
      struct = :circular
    else
      current_associations << id
      if related_ids.include?(id) # Check if current has already been expanded
        current_associations = current_associations | related_ids[id]
        if current_associations.include?(start_id) # Check circular case
          struct = :circular
          [id, start_id].each{|repeated| current_associations.delete(repeated)}
        end
      else # Expand
        # Store partial state before recursing so the recursion can see it
        related_ids[start_id] = current_associations
        structExp, current_related_ids = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids) # Expand current
        current_associations = current_associations | current_related_ids
        struct = :circular if structExp == :circular # Propagate circular flag
        if current_associations.include?(start_id) # Check circular case
          struct = :circular
          current_associations.delete(start_id)
        end
      end
    end
  end
  related_ids[start_id] = current_associations

  return struct, current_associations
end
144
+
145
+
146
# Expand terms using a specific tag and return all extended terms into an array and
# the relationship structure observed (hierarchical or circular). If a circular structure is
# found, the extended array will be a unique vector without the starting term (no loops).
# ===== Parameters
# +terms+:: set of terms to be used to expand
# +target_tag+:: tag used to expand
# +alt_ids+:: set of alternative IDs
# +obsoletes+:: integer with the number of obsolete IDs, used to calculate the structure type
# ===== Returns
# A vector with the observed structure (symbol) and the hash with extended terms
def self.get_related_ids_by_tag(terms:,target_tag:, alt_ids: {}, obsoletes: 0)
  # Define structure type
  structType = :hierarchical
  related_ids = {}
  terms.each do |id, tags|
    # Check if target tag is defined
    if !tags[target_tag].nil?
      # Obtain related terms (accumulates into related_ids)
      set_structure, _ = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids)
      # Check structure
      structType = :circular if set_structure == :circular
    end
  end

  # Special cases: no relations at all => atomic;
  # two or more unconnected (non-obsolete) terms remain => sparse
  structType = :atomic if related_ids.length <= 0
  structType = :sparse if related_ids.length > 0 && ((terms.length - related_ids.length - obsoletes) >= 2)
  # Return type and hash with related_ids
  return structType, related_ids
end
178
+
179
+
180
# Class method to transform string with <tag : info> into hash structure
# ===== Parameters
# +attributes+:: array of [tag, value] tuples with info to be transformed into hash format
# +split_char+:: delimiter used to strip trailing modifiers
# +selected_field+:: index of the field to keep after splitting on +split_char+
# ===== Returns
# Attributes stored into hash structure
def self.info2hash(attributes, split_char = " ! ", selected_field = 0)
  # Load info
  info_hash = {}
  # Only TERMS multivalue tags (future add Typedefs and Instance)
  # multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider]
  attributes.each do |tag, value|
    # Check
    raise EncodingError, 'Info element incorrect format' if (tag.nil?) || (value.nil?)
    # Prepare
    tag = tag.lstrip.to_sym
    value.lstrip!
    value = value.split(split_char)[selected_field].to_sym if @@tags_with_trailing_modifiers.include?(tag)

    # Store
    query = info_hash[tag]
    if !query.nil? # Tag already exists
      if !query.kind_of?(Array) # Check that tag is multivalue
        # NOTE(fix): build the message with interpolation; the original
        # concatenated +query+ (which may be a Symbol after the trailing-modifier
        # split) onto a String, raising TypeError instead of this error.
        raise("Attempt to concatenate plain text with another. The tag is not declared as multivalue. [#{tag}](#{query})")
      else
        query << value # Add new value to tag
      end
    else # New entry
      if @@multivalue_tags.include?(tag)
        info_hash[tag] = [value]
      else
        info_hash[tag] = value
      end
    end
  end
  self.symbolize_ids(info_hash)
  return info_hash
end
217
+
218
+
219
# Class method to load an OBO format file (based on OBO 1.4 format). Specially focused on load
# the Header, the Terms, the Typedefs and the Instances.
# ===== Parameters
# +file+:: OBO file to be loaded
# ===== Returns
# Hash with FILE, HEADER and STANZAS info
def self.load_obo(file) #TODO: Send to obo_parser class
  raise("File is not defined") if file.nil?
  # Data variables
  header = ''
  stanzas = {terms: {}, typedefs: {}, instances: {}}
  # Auxiliar variables
  infoType = 'Header'
  currInfo = []
  stanzas_flags = %w[[Term] [Typedef] [Instance]]
  # Read file
  # NOTE(fix): File.foreach closes the file handle when iteration ends;
  # the original File.open(file).each leaked the descriptor until GC.
  File.foreach(file) do |line|
    line.chomp!
    next if line.empty?
    fields = line.split(':', 2)
    # Check if a new stanza starts here
    if stanzas_flags.include?(line)
      header = self.process_entity(header, infoType, stanzas, currInfo)
      # Update info variables
      currInfo = []
      infoType = line.gsub!(/[\[\]]/, '') # '[Term]' -> 'Term'
      next
    end
    # Concat info
    currInfo << fields
  end
  # Store last loaded info
  header = self.process_entity(header, infoType, stanzas, currInfo) if !currInfo.empty?

  # Prepare to return
  finfo = {:file => file, :name => File.basename(file, File.extname(file))}
  return finfo, header, stanzas
end
257
+
258
+
259
# Handle OBO loaded info and store it into the correct container and format.
# ===== Parameters
# +header+:: current header container
# +infoType+:: ontology item type detected ('Header', 'Term', 'Typedef' or 'Instance')
# +stanzas+:: stanzas container to be filled
# +currInfo+:: raw [tag, value] tuples to be stored
# ===== Returns
# the header (newly parsed info for 'Header', unchanged otherwise)
def self.process_entity(header, infoType, stanzas, currInfo)
  info = self.info2hash(currInfo)
  if infoType.eql?('Header')
    # Header info replaces the current header
    header = info
  else
    # Stanza info is indexed by its :id under the matching container
    container = case infoType
                when 'Term' then :terms
                when 'Typedef' then :typedefs
                when 'Instance' then :instances
                end
    stanzas[container][info[:id]] = info unless container.nil?
  end
  return header
end
285
+
286
+
287
# Symbolize all values in a hash whose keys are symbolizable tags.
# Array values are symbolized element-wise in place.
# ===== Parameters
# +item_hash+:: hash to be checked (mutated in place)
def self.symbolize_ids(item_hash)
  @@symbolizable_ids.each do |tag|
    value = item_hash[tag]
    next if value.nil?
    if value.kind_of?(Array)
      value.map!(&:to_sym)
    else
      item_hash[tag] = value.to_sym
    end
  end
end
302
+
303
+
304
# Cut an ontology around a given root term.
# ===== Parameters
# +root+:: main term to expand
# +ontology+:: ontology to be cut
# +clone+:: if true, given ontology object will not be mutated
# +remove_up+:: if true, stores only the root term and its descendants. If false, only root ancestors will be stored
# ===== Returns
# An Ontology object with terms after cutting the ontology.
def self.mutate(root, ontology, clone: true, remove_up: true)
  ontology = ontology.clone if clone
  # Obtain affected IDs.
  # NOTE(fix): duplicate the index entry so appending the root does not mutate
  # the ontology's stored descendants_index array, and tolerate a root with no
  # descendants (the original raised NoMethodError on nil).
  descendants = ontology.descendants_index[root]
  descendants = descendants.nil? ? [] : descendants.dup
  descendants << root # Keep the root itself
  # Remove unnecessary terms
  ontology.stanzas[:terms] = ontology.stanzas[:terms].select{|id,v| remove_up ? descendants.include?(id) : !descendants.include?(id)}
  # Reset caches/metadata that depend on the removed terms
  ontology.ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
  ontology.max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
  ontology.dicts = {}
  ontology.removable_terms = []
  ontology.term_paths = {}
  # Recalculate metadata
  ontology.build_index
  ontology.add_observed_terms_from_profiles
  # Finish
  return ontology
end
330
+
331
+
332
+
333
+ #############################################
334
+ # GENERAL METHODS
335
+ #############################################
336
+
337
# Concatenate the given terms (converted to symbols) onto the current
# removable terms list.
# ===== Parameters
# +terms+:: terms array to be concatenated
def add_removable_terms(terms)
  @removable_terms.concat(terms.map(&:to_sym))
end
344
+
345
+
346
# Include removable terms to current removable terms list loading new
# terms from a one column plain text file
# ===== Parameters
# +file+:: file to be loaded
def add_removable_terms_from_file(file)
  # NOTE(fix): the original iterated over an undefined variable
  # (excluded_codes_file) instead of the +file+ parameter, raising NameError on
  # every call. File.foreach also closes the handle when done
  # (File.open(...).each leaked it).
  File.foreach(file) do |line|
    line.chomp!
    @removable_terms << line.to_sym
  end
end
356
+
357
+
358
# Increase the observed frequency for a specific term.
# ===== Parameters
# +term+:: term (symbol) whose frequency is going to be increased
# +increase+:: frequency rate to be increased. Default = 1
# ===== Return
# true if the process ends without errors, false in other cases
def add_observed_term(term:,increase: 1.0)
  # Check
  raise ArgumentError, "Term given is NIL" if term.nil?
  return false unless @stanzas[:terms].include?(term)
  return false if @removable_terms.include?(term)
  if @alternatives_index.include?(term)
    # Alternative IDs alias the SAME metadata hash as their main ID, so the
    # increment below updates both entries at once.
    alt_id = @alternatives_index[term]
    @meta[alt_id] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[alt_id].nil?
    @meta[term] = @meta[alt_id]
  end
  # Create a metadata entry if none exists yet
  @meta[term] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[term].nil?
  # Add frequency (reset the -1 "not computed" sentinel before first increment)
  @meta[term][:observed_freq] = 0 if @meta[term][:observed_freq] == -1
  @meta[term][:observed_freq] += increase
  # Track the maximum observed frequency
  @max_freqs[:observed_freq] = @meta[term][:observed_freq] if @max_freqs[:observed_freq] < @meta[term][:observed_freq]
  return true
end
383
+
384
+
385
# Increase the arbitrary frequency of a given term set.
# ===== Parameters
# +terms+:: set of terms to be updated
# +increase+:: amount to be increased
# +transform_to_sym+:: if true, transform observed terms to symbols. Default: false
# ===== Return
# array with the boolean result of add_observed_term for each term
def add_observed_terms(terms:, increase: 1.0, transform_to_sym: false)
  raise ArgumentError, 'Terms array given is NIL' if terms.nil?
  raise ArgumentError, 'Terms given is not an array' unless terms.is_a?(Array)
  # Register each observation, symbolizing first when requested
  terms.map do |term|
    term = term.to_sym if transform_to_sym
    self.add_observed_term(term: term, increase: increase)
  end
end
404
+
405
+
406
# Compare two term sets and return a similitude score.
# ===== Parameters
# +termsA+:: set to be compared
# +termsB+:: set to be compared
# +sim_type+:: similitude method to be used. Default: resnik
# +ic_type+:: ic type to be used. Default: resnik
# +bidirectional+:: calculate bidirectional similitude. Default: true
# ===== Return
# similitude calculated
def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
  # Check (NOTE(review): `|` is a non-short-circuit boolean OR here — both
  # operands are evaluated; it works, but `||` would be conventional)
  raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
  raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
  micasA = []
  # Compare A -> B: for each term in A keep the best (maximum) similarity against B
  termsA.each do |tA|
    micas = termsB.map{|tB| self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)}
    # Remove special cases (failed/undefined similarities)
    [false,nil].each do |err_value| micas.delete(err_value) end
    # Obtain maximum value; 0 when no valid similarity was found
    micasA << micas.max if micas.length > 0
    micasA << 0 if micas.length <= 0
  end
  means_sim = micasA.inject{ |sum, el| sum + el }.to_f / micasA.size
  # Compare B -> A and average both directions, weighted by set sizes
  if bidirectional
    means_simA = means_sim * micasA.size
    means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false) * termsB.size
    means_sim = (means_simA + means_simB) / (termsA.size + termsB.size)
  end
  # Return
  return means_sim
end
439
+
440
+
441
# Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with themselves.
# ===== Parameters
# +external_profiles+:: set of external profiles. If nil, internal profiles will be compared with themselves
# +sim_type+:: similitude method to be used. Default: resnik
# +ic_type+:: ic type to be used. Default: resnik
# +bidirectional+:: calculate bidirectional similitude. Default: true
# ===== Return
# Similitudes calculated
def compare_profiles(external_profiles: nil, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
  profiles_similarity = {} #calculate similarity between patients profile
  profiles_ids = @profiles.keys
  if external_profiles.nil?
    # Self comparison: main_ids and comp_ids reference the SAME array, so
    # shifting main_ids below also shrinks comp_ids — each unordered pair of
    # distinct profiles is compared exactly once (self pairs excluded).
    comp_ids = profiles_ids
    comp_profiles = @profiles
    main_ids = comp_ids
    main_profiles = comp_profiles
  else
    comp_ids = external_profiles.keys
    comp_profiles = external_profiles
    main_ids = profiles_ids
    main_profiles = @profiles
  end
  # Compare each main profile against every (remaining) comp profile
  while !main_ids.empty?
    curr_id = main_ids.shift
    current_profile = main_profiles[curr_id]
    comp_ids.each do |id|
      profile = comp_profiles[id]
      value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional)
      query = profiles_similarity[curr_id]
      if query.nil?
        profiles_similarity[curr_id] = {id => value}
      else
        query[id] = value
      end
    end
  end
  return profiles_similarity
end
480
+
481
+
482
# Expand alternative IDs around all already stored terms.
# ===== Parameters
# +alt_tag+:: tag used to expand alternative IDs
# ===== Returns
# the merged stanzas terms hash (NOTE(review): the original doc claimed a
# boolean return; the last expression is actually the result of Hash#merge!)
def get_index_alternatives(alt_tag: @@basic_tags[:alternative][0])
  # Check input
  raise('stanzas terms empty') if @stanzas[:terms].empty?
  # Take all alternative IDs
  alt_ids2add = {}
  @stanzas[:terms].each do |id, tags|
    alt_ids = tags[alt_tag]
    if !alt_ids.nil?
      alt_ids = alt_ids - @removable_terms
      # Map each alternative to its main ID and share the main ID's stanza/ancestors
      alt_ids.each do |alt_term|
        @alternatives_index[alt_term] = id
        alt_ids2add[alt_term] = @stanzas[:terms][id] if !@stanzas[:terms].include?(alt_term)
        @ancestors_index[alt_term] = @ancestors_index[id] if !@ancestors_index[id].nil?
      end
    end
  end
  @stanzas[:terms].merge!(alt_ids2add)
end
506
+
507
+
508
# Executes basic expansions of tags (alternatives, obsoletes and parentals) with default values.
# Afterwards all four indexes are normalized to plain IDs (unresolvable entries
# dropped via compact!) and frequencies, dictionaries and term levels are
# (re)computed.
# ===== Returns
# the result of the last recalculation step
def build_index()
  self.get_index_alternatives
  self.get_index_obsoletes
  self.get_index_child_parent_relations
  # Normalize index values to plain IDs and drop entries that did not resolve
  @alternatives_index.map{|k,v| @alternatives_index[k] = self.extract_id(v)}
  @alternatives_index.compact!
  @obsoletes_index.map{|k,v| @obsoletes_index[k] = self.extract_id(v)}
  @obsoletes_index.compact!
  @ancestors_index.map{|k,v| @ancestors_index[k] = v.map{|t| self.extract_id(t)}.compact}
  @ancestors_index.compact!
  @descendants_index.map{|k,v| @descendants_index[k] = v.map{|t| self.extract_id(t)}.compact}
  @descendants_index.compact!
  self.get_index_frequencies
  self.calc_dictionary(:name)
  # Synonym dictionary keeps only the quoted part of each synonym entry
  self.calc_dictionary(:synonym, select_regex: /\"(.*)\"/)
  self.calc_term_levels(calc_paths: true)
end
528
+
529
+
530
# Calculates regular frequencies based on ontology structure (using parentals).
# Fills @meta per term and updates @max_freqs.
# ===== Returns
# nil (warns and does nothing if the ancestors index has not been built yet)
def get_index_frequencies()
  # Check
  if @ancestors_index.empty?
    warn('ancestors_index object is empty')
  else
    # Prepare useful variables
    alternative_terms = @alternatives_index.keys
    # Per each term, add frequencies
    @stanzas[:terms].each do |id, tags|
      if @alternatives_index.include?(id)
        # Alternative term: alias the main ID's metadata hash (shared object)
        alt_id = @alternatives_index[id]
        query = @meta[alt_id] # Check if exist
        if query.nil?
          query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
          @meta[alt_id] = query
        end
        @meta[id] = query
        # Note: alternative terms do not increase structural frequencies
      else # Official term
        query = @meta[id] # Check if exist
        if query.nil?
          query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
          @meta[id] = query
        end
        # Store metadata; alternative IDs are excluded from the counts
        query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !alternative_terms.include?(anc)}.to_f : 0.0
        query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !alternative_terms.include?(desc)}.to_f : 0.0
        query[:struct_freq] = query[:descendants] + 1.0
        # Update maximums
        @max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
        @max_freqs[:max_depth] = query[:descendants] if @max_freqs[:max_depth] < query[:descendants]
      end
    end
  end
end
568
+
569
+
570
# Expand obsoletes set and link info to their alternative IDs.
# ===== Parameters
# +obs_tag+:: tag used to find obsoletes
# +alt_tags+:: tags used to find alternative IDs (if they are available)
# ===== Returns
# nil (warns and does nothing if stanzas terms are empty)
def get_index_obsoletes(obs_tag: @@basic_tags[:obsolete], alt_tags: @@basic_tags[:alternative])
  if @stanzas[:terms].empty?
    warn('stanzas terms empty')
  else
    # Check obsoletes
    @stanzas[:terms].each do |id, term_tags|
      next if term_tags.nil?
      query = term_tags[obs_tag]
      if !query.nil? && query == 'true' # Obsolete tag presence
        next if !@obsoletes_index[id].nil? # Already stored
        # Check if an alternative value is available
        alt_ids = alt_tags.map{|alt| term_tags[alt]}.compact
        if !alt_ids.empty?
          alt_id = alt_ids.first.first # FIRST tag, FIRST id
          # Store
          @alternatives_index[id] = alt_id
          @obsoletes_index[id] = alt_id
        end
      end
    end
  end
end
599
+
600
+
601
# Expand parentals set and link all info to their alternative IDs. Also sets the
# ontology structure type.
# ===== Parameters
# +tag+:: tag used to expand parentals
# ===== Returns
# nil (results stored in @ancestors_index, @descendants_index and @structureType)
def get_index_child_parent_relations(tag: @@basic_tags[:ancestors][0])
  # Check
  if @stanzas[:terms].nil?
    warn('stanzas terms empty')
  else
    # Expand parental relations for every term
    structType, parentals = self.class.get_related_ids_by_tag(terms: @stanzas[:terms],
                                                              target_tag: tag,
                                                              alt_ids: @alternatives_index,
                                                              obsoletes: @obsoletes_index.length)
    # Check
    raise('Error expanding parentals') if (structType.nil?) || parentals.nil?
    # Build ancestor (anc) and inverse descendant (des) maps
    anc = {}
    des = {}
    parentals.each do |id, parents|
      parents = parents - @removable_terms
      anc[id] = parents
      parents.each do |anc_id| # Add descendants
        if !des.include?(anc_id)
          des[anc_id] = [id]
        else
          des[anc_id] << id
        end
      end
    end
    # Alternative IDs share their main ID's ancestor/descendant arrays
    @alternatives_index.each do |id,alt|
      anc[id] = anc[alt] if anc.include?(alt)
      des[id] = des[alt] if des.include?(alt)
    end
    # Normalize structure type: anything not atomic/sparse/circular is hierarchical
    if ![:atomic,:sparse].include? structType
      structType = structType == :circular ? :circular : :hierarchical
    end
    # Store
    @ancestors_index = anc
    @descendants_index = des
    @structureType = structType
  end
  # Finish
end
650
+
651
+
652
# Find ancestors of a given term.
# ===== Parameters
# +term+:: to be checked
# +filter_alternatives+:: if true, remove alternatives from final results
# ===== Returns
# an array with all ancestors of the given term (empty if none are indexed)
def get_ancestors(term, filter_alternatives = false)
  get_familiar(term, true, filter_alternatives)
end
661
+
662
+
663
+ # Find descendants of a given term
664
+ # ===== Parameters
665
+ # +term+:: to be checked
666
+ # +filter_alternatives+:: if true, remove alternatives from final results
667
+ # ===== Returns
668
+ # an array with all descendants of given term or false if parents are not available yet
669
+ def get_descendants(term, filter_alternatives = false)
670
+ return self.get_familiar(term, false, filter_alternatives)
671
+ end
672
+
673
+
674
+ # Find ancestors/descendants of a given term
675
+ # ===== Parameters
676
+ # +term+:: to be checked
677
+ # +return_ancestors+:: return ancestors if true or descendants if false
678
+ # +filter_alternatives+:: if true, remove alternatives from final results
679
+ # ===== Returns
680
+ # an array with all ancestors/descendants of given term or nil if parents are not available yet
681
+ def get_familiar(term, return_ancestors = true, filter_alternatives = false)
682
+ # Find into parentals
683
+ familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
684
+ if !familiars.nil?
685
+ familiars = familiars.clone
686
+ if filter_alternatives
687
+ familiars.reject!{|fm| @alternatives_index.include?(fm)}
688
+ end
689
+ else
690
+ familiars = []
691
+ end
692
+ return familiars
693
+ end
694
+
695
+
696
+ # Obtain IC of an specific term
697
+ # ===== Parameters
698
+ # +term+:: which IC will be calculated
699
+ # +type+:: of IC to be calculated. Default: resnik
700
+ # +force+:: force re-calculate the IC. Do not check if it is already calculated
701
+ # +zhou_k+:: special coeficient for Zhou IC method
702
+ # ===== Returns
703
+ # the IC calculated
704
+ def get_IC(termRaw, type: :resnik, force: false, zhou_k: 0.5)
705
+ term = termRaw.to_sym
706
+ # Check
707
+ raise ArgumentError, "IC type specified (#{type}) is not allowed" if !@@allowed_calcs[:ics].include?(type)
708
+ # Check if it's already calculated
709
+ return @ics[type][term] if (@ics[type].include? term) && !force
710
+ # Calculate
711
+ ic = - 1
712
+ case type # https://arxiv.org/ftp/arxiv/papers/1310/1310.8059.pdf ||| https://sci-hub.st/https://doi.org/10.1016/j.eswa.2012.01.082
713
+ ###########################################
714
+ #### STRUCTURE BASED METRICS
715
+ ###########################################
716
+ # Shortest path
717
+ # Weighted Link
718
+ # Hirst and St-Onge Measure
719
+ # Wu and Palmer
720
+ # Slimani
721
+ # Li
722
+ # Leacock and Chodorow
723
+ ###########################################
724
+ #### INFORMATION CONTENT METRICS
725
+ ###########################################
726
+ when :resnik # Resnik P: Using Information Content to Evaluate Semantic Similarity in a Taxonomy
727
+ # -log(Freq(x) / Max_Freq)
728
+ ic = -Math.log10(@meta[term][:struct_freq].fdiv(@max_freqs[:struct_freq]))
729
+ when :resnik_observed
730
+ # -log(Freq(x) / Max_Freq)
731
+ ic = -Math.log10(@meta[term][:observed_freq].fdiv(@max_freqs[:observed_freq]))
732
+ # Lin
733
+ # Jiang & Conrath
734
+
735
+ ###########################################
736
+ #### FEATURE-BASED METRICS
737
+ ###########################################
738
+ # Tversky
739
+ # x-similarity
740
+ # Rodirguez
741
+
742
+ ###########################################
743
+ #### HYBRID METRICS
744
+ ###########################################
745
+ when :seco, :zhou # SECO:: An intrinsic information content metric for semantic similarity in WordNet
746
+ # 1 - ( log(hypo(x) + 1) / log(max_nodes) )
747
+ ic = 1 - Math.log10(@meta[term][:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
748
+ if :zhou # New Model of Semantic Similarity Measuring in Wordnet
749
+ # k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
750
+ @ics[:seco][term] = ic # Special store
751
+ ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(@meta[term][:descendants]).fdiv(Math.log10(@max_freqs[:max_depth])))
752
+ end
753
+ when :sanchez # Semantic similarity estimation in the biomedical domain: An ontology-basedinformation-theoretic perspective
754
+ ic = -Math.log10((@meta[term][:descendants].fdiv(@meta[term][:ancestors]) + 1.0).fdiv(@max_freqs[:max_depth] + 1.0))
755
+ # Knappe
756
+ end
757
+ @ics[type][term] = ic
758
+ return ic
759
+ end
760
+
761
+
762
+ # Calculates and return resnik ICs (by ontology and observed frequency) for observed terms
763
+ # ===== Returns
764
+ # two hashes with resnik and resnik_observed ICs for observed terms
765
+ def get_observed_ics_by_onto_and_freq
766
+ # Chech there are observed terms
767
+ if @profiles.empty?
768
+ resnik = {}
769
+ resnik_observed = {}
770
+ else
771
+ # Calc ICs for all terms
772
+ observed_terms = @profiles.values.flatten.uniq
773
+ observed_terms.each{ |term| get_IC(term)}
774
+ observed_terms.each{ |term| get_IC(term, type: :resnik_observed)}
775
+ resnik = @ics[:resnik].select{|k,v| observed_terms.include?(k)}
776
+ resnik_observed = @ics[:resnik_observed].select{|k,v| observed_terms.include?(k)}
777
+ end
778
+ return resnik.clone, resnik_observed.clone
779
+ end
780
+
781
+
782
+ # Find the IC of the Most Index Content shared Ancestor (MICA) of two given terms
783
+ # ===== Parameters
784
+ # +termA+:: term to be cheked
785
+ # +termB+:: term to be checked
786
+ # +ic_type+:: IC formula to be used
787
+ # ===== Returns
788
+ # the IC of the MICA(termA,termB)
789
+ def get_ICMICA(termA, termB, ic_type = :resnik)
790
+ mica = self.get_MICA(termA, termB, ic_type)
791
+ return mica.first.nil? ? nil : mica.last
792
+ end
793
+
794
+
795
+ # Find the Most Index Content shared Ancestor (MICA) of two given terms
796
+ # ===== Parameters
797
+ # +termA+:: term to be cheked
798
+ # +termB+:: term to be checked
799
+ # +ic_type+:: IC formula to be used
800
+ # ===== Returns
801
+ # the MICA(termA,termB) and it's IC
802
+ def get_MICA(termA, termB, ic_type = :resnik)
803
+ termA = @alternatives_index[termA] if @alternatives_index.include?(termA)
804
+ termB = @alternatives_index[termB] if @alternatives_index.include?(termB)
805
+ mica = [nil,-1.0]
806
+ # Special case
807
+ if termA.eql?(termB)
808
+ ic = self.get_IC(termA, type: ic_type)
809
+ mica = [termA, ic]
810
+ else
811
+ # Obtain ancestors (include itselfs too)
812
+ anc_A = self.get_ancestors(termA)
813
+ anc_B = self.get_ancestors(termB)
814
+
815
+ if !(anc_A.empty? && anc_B.empty?)
816
+ anc_A << termA
817
+ anc_B << termB
818
+ # Find shared ancestors
819
+ shared_ancestors = anc_A & anc_B
820
+ # Find MICA
821
+ if shared_ancestors.length > 0
822
+ shared_ancestors.each do |anc|
823
+ ic = self.get_IC(anc, type: ic_type)
824
+ # Check
825
+ mica = [anc,ic] if ic > mica[1]
826
+ end
827
+ end
828
+ end
829
+ end
830
+ return mica
831
+ end
832
+
833
+
834
+ # Calculate similarity between two given terms
835
+ # ===== Parameters
836
+ # +termsA+:: to be compared
837
+ # +termsB+:: to be compared
838
+ # +type+:: similitude formula to be used
839
+ # +ic_type+:: IC formula to be used
840
+ # ===== Returns
841
+ # the similarity between both sets or false if frequencies are not available yet
842
+ def get_similarity(termA, termB, type: :resnik, ic_type: :resnik)
843
+ # Check
844
+ raise ArgumentError, "SIM type specified (#{type}) is not allowed" if !@@allowed_calcs[:sims].include?(type)
845
+ sim = nil
846
+ # Launch comparissons
847
+ sim_res = get_ICMICA(termA, termB, ic_type)
848
+ if !sim_res.nil?
849
+ case type
850
+ when :resnik
851
+ sim = sim_res
852
+ when :lin
853
+ sim = (2.0 * sim_res).fdiv(self.get_IC(termA,type: ic_type) + self.get_IC(termB,type: ic_type))
854
+ when :jiang_conrath # This is not a similarity, this is a disimilarity (distance)
855
+ sim = (self.get_IC(termA, type: ic_type) + self.get_IC(termB, type: ic_type)) - (2.0 * sim_res)
856
+ end
857
+ end
858
+ return sim
859
+ end
860
+
861
+
862
  # Method used to load information stored into an OBO file and store it into this object.
  # ===== Parameters
  # +file+:: OBO file path to be parsed
  # +build+:: if true (default), launch build_index after loading
  def load(file, build: true)
    _, header, stanzas = self.class.load_obo(file)
    @header = header
    @stanzas = stanzas
    self.remove_removable() # drop terms flagged in @removable_terms
    # @removable_terms.each{|removableID| @stanzas[:terms].delete(removableID)} if !@removable_terms.empty? # Remove if proceed
    self.build_index() if build
  end
874
+
875
+ #
876
+ def remove_removable()
877
+ @removable_terms.each{|removableID| @stanzas[:terms].delete(removableID)} if !@removable_terms.empty? # Remove if proceed
878
+ end
879
+
880
+
881
  # Exports an OBO_Handler object in json format.
  # Serializes every internal index/field as a single JSON document.
  # ===== Parameters
  # +file+:: path where info will be stored
  def write(file)
    # Take object stored info (one key per internal field)
    obj_info = {header: @header,
                stanzas: @stanzas,
                ancestors_index: @ancestors_index,
                descendants_index: @descendants_index,
                alternatives_index: @alternatives_index,
                obsoletes_index: @obsoletes_index,
                structureType: @structureType,
                ics: @ics,
                meta: @meta,
                special_tags: @special_tags,
                max_freqs: @max_freqs,
                dicts: @dicts,
                profiles: @profiles,
                profilesDict: @profilesDict,
                items: @items,
                removable_terms: @removable_terms,
                term_paths: @term_paths}
    # Convert to JSON format & write. Symbol keys/values become strings;
    # `read` re-symbolizes them on load.
    File.open(file, "w") { |f| f.write obj_info.to_json }
  end
906
+
907
+
908
+ def is_number? string
909
+ true if Float(string) rescue false
910
+ end
911
+
912
+
913
  # Read a JSON file with an OBO_Handler object stored (the counterpart of `write`).
  # JSON round-trips symbols as strings, so most of this method re-symbolizes
  # keys/values before assigning the internal fields.
  # ===== Parameters
  # +file+:: with object info
  # ===== Return
  # OBO_Handler internal fields
  def read(file)
    # Read file
    jsonFile = File.open(file)
    jsonInfo = JSON.parse(jsonFile.read, :symbolize_names => true)
    # Pre-process (Symbolize some hashs values)
    jsonInfo[:stanzas][:terms].map{|id,info| self.class.symbolize_ids(info)} # STANZAS
    jsonInfo[:stanzas][:typedefs].map{|id,info| self.class.symbolize_ids(info)}
    jsonInfo[:stanzas][:instances].map{|id,info| self.class.symbolize_ids(info)}
    jsonInfo[:alternatives_index] = jsonInfo[:alternatives_index].map{|id,value| [id, value.to_sym]}.to_h
    jsonInfo[:ancestors_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}}
    jsonInfo[:descendants_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}}
    jsonInfo[:obsoletes_index] = jsonInfo[:obsoletes_index].map{|id,value| [id, value.to_sym]}.to_h
    # Dictionaries: keys/values may be terms, numbers or arrays depending on the tag
    jsonInfo[:dicts] = jsonInfo[:dicts].each do |flag, dictionaries|
      # Special case: byTerm
      dictionaries[:byTerm] = dictionaries[:byTerm].map do |term, value|
        if !term.to_s.scan(/\A[-+]?[0-9]*\.?[0-9]+\Z/).empty? # Numeric dictionary
          [term.to_s.to_i, value.map{|term| term.to_sym}]
        elsif value.is_a? Numeric # Numeric dictionary
          [term.to_sym, value]
        elsif value.kind_of?(Array) && flag == :is_a
          [term.to_sym, value.map{|v| v.to_sym}]
        else
          [term.to_sym, value]
        end
      end
      dictionaries[:byTerm] = dictionaries[:byTerm].to_h
      # By value
      dictionaries[:byValue] = dictionaries[:byValue].map do |value, term|
        if value.is_a? Numeric # Numeric dictionary
          [value, term.to_sym]
        elsif term.is_a? Numeric # Numeric dictionary
          [value.to_s.to_sym, term]
        elsif flag == :is_a
          [value.to_sym, term.to_sym]
        elsif term.kind_of?(Array)
          [value.to_sym, term.map{|t| t.to_sym}]
        else
          [value.to_s, term.to_sym]
        end
      end
      dictionaries[:byValue] = dictionaries[:byValue].to_h
    end
    # Profiles: symbolize terms; numeric-string IDs go back to Integers
    jsonInfo[:profiles].map{|id,terms| terms.map!{|term| term.to_sym}}
    jsonInfo[:profiles].keys.map{|id| jsonInfo[:profiles][id.to_s.to_i] = jsonInfo[:profiles].delete(id) if self.is_number?(id.to_s)}
    jsonInfo[:profilesDict].map{|term,ids| ids.map!{|id| id.to_sym if !id.is_a?(Numeric)}}
    jsonInfo[:removable_terms] = jsonInfo[:removable_terms].map{|term| term.to_sym}
    jsonInfo[:special_tags] = jsonInfo[:special_tags].each do |k, v|
      if v.kind_of?(Array)
        jsonInfo[:special_tags][k] = v.map{|tag| tag.to_sym}
      else
        jsonInfo[:special_tags][k] = v.to_sym
      end
    end
    jsonInfo[:items].each{|k,v| jsonInfo[:items][k] = v.map{|item| item.to_sym}}
    jsonInfo[:term_paths].each{|term,info| jsonInfo[:term_paths][term][:paths] = info[:paths].map{|path| path.map{|t| t.to_sym}}}
    # Store info
    @header = jsonInfo[:header]
    @stanzas = jsonInfo[:stanzas]
    @ancestors_index = jsonInfo[:ancestors_index]
    @descendants_index = jsonInfo[:descendants_index]
    @alternatives_index = jsonInfo[:alternatives_index]
    @obsoletes_index = jsonInfo[:obsoletes_index]
    @structureType = jsonInfo[:structureType].to_sym
    @ics = jsonInfo[:ics]
    @meta = jsonInfo[:meta]
    @special_tags = jsonInfo[:special_tags]
    @max_freqs = jsonInfo[:max_freqs]
    @dicts = jsonInfo[:dicts]
    @profiles = jsonInfo[:profiles]
    @profilesDict = jsonInfo[:profilesDict]
    @items = jsonInfo[:items]
    @removable_terms = jsonInfo[:removable_terms]
    @term_paths = jsonInfo[:term_paths]
  end
992
+
993
+
994
+ # Check if a given ID is stored as term into this object
995
+ # ===== Parameters
996
+ # +id+:: to be checked
997
+ # ===== Return
998
+ # True if term is allowed or false in other cases
999
+ def exists? id
1000
+ return stanzas[:terms].include?(id)
1001
+ end
1002
+
1003
+
1004
+ # This method assumes that a text given contains an allowed ID. And will try to obtain it splitting it
1005
+ # ===== Parameters
1006
+ # +text+:: to be checked
1007
+ # ===== Return
1008
+ # The correct ID if it can be found or nil in other cases
1009
+ def extract_id(text, splitBy: ' ')
1010
+ if self.exists?(text)
1011
+ return text
1012
+ else
1013
+ splittedText = text.to_s.split(splitBy).first.to_sym
1014
+ return self.exists?(splittedText) ? splittedText : nil
1015
+ end
1016
+ end
1017
+
1018
+
1019
  # Generate a bidirectional dictionary set (byTerm / byValue) using a specific tag
  # over the terms stanzas and store it into @dicts.
  # Stores the first value for multivalue tags; does not handle synonyms for byValue dictionaries.
  # ===== Parameters
  # +tag+:: to be used to calculate the dictionary
  # +select_regex+:: optional regex applied (via String#scan) to each value before storing
  # +substitute_alternatives+:: if true, alternative IDs are replaced by their official ID
  # +store_tag+:: key used to store the dictionary; when nil, +tag+ itself is used
  # +multiterm+:: if true, byValue allows multi-term linkage (value => array of terms)
  # +self_type_references+:: if true, values are assumed to be ontology term IDs and are checked
  # ===== Return
  # void. Stores the calculated bidirectional dictionary into @dicts
  def calc_dictionary(tag, select_regex: nil, substitute_alternatives: true, store_tag: nil, multiterm: false, self_type_references: false)
    tag = tag.to_sym
    store_tag = tag if store_tag.nil?
    if @stanzas[:terms].empty?
      warn('Terms are not already loaded. Aborting dictionary calc')
    else
      byTerm = {}
      byValue = {}
      # Calc per term
      @stanzas[:terms].each do |term, tags|
        referenceTerm = term
        if @alternatives_index.include?(term) && substitute_alternatives # Special case
          # Use the official ID, unless it is itself obsolete
          referenceTerm = @alternatives_index[term] if !@obsoletes_index.include?(@alternatives_index[term])
        end
        queryTag = tags[tag]
        if !queryTag.nil?
          # Pre-process: reduce each value to its first regex capture
          if !select_regex.nil?
            if queryTag.kind_of?(Array)
              queryTag = queryTag.map{|value| value.scan(select_regex).first}
              queryTag.flatten!
            else
              queryTag = queryTag.scan(select_regex).first
            end
            queryTag.compact!
          end
          if queryTag.kind_of?(Array) # Store (multivalue tag)
            if !queryTag.empty?
              if byTerm.include?(referenceTerm)
                byTerm[referenceTerm] = (byTerm[referenceTerm] + queryTag).uniq
              else
                byTerm[referenceTerm] = queryTag
              end
              if multiterm
                queryTag.each do |value|
                  byValue[value] = [] if byValue[value].nil?
                  byValue[value] << referenceTerm
                end
              else
                queryTag.each{|value| byValue[value] = referenceTerm}
              end
            end
          else # Store (single value tag)
            if byTerm.include?(referenceTerm)
              byTerm[referenceTerm] = (byTerm[referenceTerm] + [queryTag]).uniq
            else
              byTerm[referenceTerm] = [queryTag]
            end
            if multiterm
              byValue[queryTag] = [] if byValue[queryTag].nil?
              byValue[queryTag] << referenceTerm
            else
              byValue[queryTag] = referenceTerm
            end
          end
        end
      end

      # Check self-references: normalize values that are themselves term IDs
      if self_type_references
        byTerm.map do |term, references|
          corrected_references = references.map do |t|
            checked = self.extract_id(t)
            if checked.nil?
              t # keep the raw value when it is not a recognizable term ID
            else
              byValue[checked] = byValue.delete(t) if checked != t && !byValue.keys.include?(checked) # Update in byValue
              checked
            end
          end
          byTerm[term] = corrected_references.uniq
        end
      end

      # Check order: make each byTerm entry start with the values as declared in the stanza
      byTerm.map do |term,values|
        if self.exists?(term)
          referenceValue = @stanzas[:terms][term][tag]
          if !referenceValue.nil?
            if !select_regex.nil?
              if referenceValue.kind_of?(Array)
                referenceValue = referenceValue.map{|value| value.scan(select_regex).first}
                referenceValue.flatten!
              else
                referenceValue = referenceValue.scan(select_regex).first
              end
              referenceValue.compact!
            end
            if self_type_references
              if referenceValue.kind_of?(Array)
                aux = referenceValue.map{|t| self.extract_id(t)}
              else
                aux = self.extract_id(referenceValue)
              end
              referenceValue = aux if !aux.nil?
            end
            referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
            byTerm[term] = referenceValue + (values - referenceValue)
          end
        end
      end

      # Store
      @dicts[store_tag] = {byTerm: byTerm, byValue: byValue}
    end
  end
1138
+
1139
+
1140
+ # Calculates :is_a dictionary without alternatives substitution
1141
+ def calc_ancestors_dictionary
1142
+ self.calc_dictionary(:is_a, substitute_alternatives: false, self_type_references: true)
1143
+ end
1144
+
1145
+
1146
+ # Translate a given value using an already calcualted dictionary
1147
+ # ===== Parameters
1148
+ # +toTranslate+:: value to be translated using dictiontionary
1149
+ # +tag+:: used to generate the dictionary
1150
+ # +byValue+:: boolean flag to indicate if dictionary must be used values as keys or terms as keys. Default: values as keys = true
1151
+ # ===== Return
1152
+ # translation
1153
+ def translate(toTranslate, tag, byValue: true)
1154
+ dict = byValue ? @dicts[tag][:byValue] : @dicts[tag][:byTerm]
1155
+ toTranslate = get_main_id(toTranslate) if !byValue
1156
+ return dict[toTranslate]
1157
+ end
1158
+
1159
+
1160
+ # Translate a name given
1161
+ # ===== Parameters
1162
+ # +name+:: to be translated
1163
+ # ===== Return
1164
+ # translated name or nil if it's not stored into this ontology
1165
+ def translate_name(name)
1166
+ term = self.translate(name, :name)
1167
+ term = self.translate(name, :synonym) if term.nil?
1168
+ return term
1169
+ end
1170
+
1171
+
1172
+ # Translate several names and return translations and a list of names which couldn't be translated
1173
+ # ===== Parameters
1174
+ # +names+:: array to be translated
1175
+ # ===== Return
1176
+ # two arrays with translations and names which couldn't be translated respectively
1177
+ def translate_names(names)
1178
+ translated = []
1179
+ rejected = []
1180
+ names.each do |name|
1181
+ tr = self.translate_name(name)
1182
+ if tr.nil?
1183
+ rejected << name
1184
+ else
1185
+ translated << tr
1186
+ end
1187
+ end
1188
+ return translated, rejected
1189
+ end
1190
+
1191
+
1192
+ # Translates a given ID to it assigned name
1193
+ # ===== Parameters
1194
+ # +id+:: to be translated
1195
+ # ===== Return
1196
+ # main name or nil if it's not included into this ontology
1197
+ def translate_id(id)
1198
+ name = self.translate(id, :name, byValue: false)
1199
+ return name.nil? ? nil : name.first
1200
+ end
1201
+
1202
+
1203
+ # Translates several IDs and returns translations and not allowed IDs list
1204
+ # ===== Parameters
1205
+ # +ids+:: to be translated
1206
+ # ===== Return
1207
+ # two arrays with translations and names which couldn't be translated respectively
1208
+ def translate_ids(ids)
1209
+ translated = []
1210
+ rejected = []
1211
+ ids.each do |term_id|
1212
+ tr = self.translate_id(term_id.to_sym)
1213
+ if !tr.nil?
1214
+ translated << tr
1215
+ else
1216
+ rejected << tr
1217
+ end
1218
+ end
1219
+ return translated, rejected
1220
+ end
1221
+
1222
+
1223
+ # ===== Returns
1224
+ # the main ID assigned to a given ID. If it's a non alternative/obsolete ID itself will be returned
1225
+ # ===== Parameters
1226
+ # +id+:: to be translated
1227
+ # ===== Return
1228
+ # main ID related to a given ID. Returns nil if given ID is not an allowed ID
1229
+ def get_main_id(id)
1230
+ return nil if !@stanzas[:terms].include? id
1231
+ new_id = id
1232
+ mainID = @alternatives_index[id]
1233
+ new_id = mainID if !mainID.nil? & !@obsoletes_index.include?(mainID)
1234
+ return new_id
1235
+ end
1236
+
1237
+
1238
+ # Check a pull of IDs and return allowed IDs removing which are not official terms on this ontology
1239
+ # ===== Parameters
1240
+ # +ids+:: to be checked
1241
+ # ===== Return
1242
+ # two arrays whit allowed and rejected IDs respectively
1243
+ def check_ids(ids, substitute: true)
1244
+ checked_codes = []
1245
+ rejected_codes = []
1246
+ ids.each do |id|
1247
+ if @stanzas[:terms].include? id
1248
+ if substitute
1249
+ checked_codes << self.get_main_id(id)
1250
+ else
1251
+ checked_codes << id
1252
+ end
1253
+ else
1254
+ rejected_codes << id
1255
+ end
1256
+ end
1257
+ return checked_codes, rejected_codes
1258
+ end
1259
+
1260
+
1261
+ # Stores a given profile with an specific ID. If ID is already assigend to a profile, it will be replaced
1262
+ # ===== Parameters
1263
+ # +id+:: assigned to profile
1264
+ # +terms+:: array of terms
1265
+ # +substitute+:: subsstitute flag from check_ids
1266
+ def add_profile(id, terms, substitute: true)
1267
+ warn("Profile assigned to ID (#{id}) is going to be replaced") if @profiles.include? id
1268
+ correct_terms, rejected_terms = self.check_ids(terms, substitute: substitute)
1269
+ if !rejected_terms.empty?
1270
+ warn('Given terms contains erroneus IDs. These IDs will be removed')
1271
+ end
1272
+ if id.is_a? Numeric
1273
+ @profiles[id] = correct_terms
1274
+ else
1275
+ @profiles[id.to_sym] = correct_terms
1276
+ end
1277
+ end
1278
+
1279
+
1280
  # Method used to store a pool of profiles, then refresh observed frequencies
  # and (optionally) the profiles dictionary.
  # ===== Parameters
  # +profiles+:: array/hash of profiles. For an array, numerical IDs are assigned
  #   starting at 0 (each_with_index) — NOTE(review): the original doc said "starting at 1"
  # +calc_metadata+:: if true, launch calc_profiles_dictionary process
  # +reset_stored+:: if true, remove already stored profiles first
  # +substitute+:: substitute flag forwarded to check_ids via add_profile
  def load_profiles(profiles, calc_metadata: true, reset_stored: false, substitute: false)
    self.reset_profiles if reset_stored
    # Check
    if profiles.kind_of?(Array)
      profiles.each_with_index do |items, i|
        self.add_profile(i, items.map {|item| item.to_sym}, substitute: substitute)
      end
    else # Hash
      if !profiles.keys.select{|id| @profiles.include?(id)}.empty?
        warn('Some profiles given are already stored. Stored version will be replaced')
      end
      profiles.each{|id, prof| self.add_profile(id, prof, substitute: substitute)}
    end

    # Re-count observed term frequencies from scratch
    self.add_observed_terms_from_profiles(reset: true)

    if calc_metadata
      self.calc_profiles_dictionary
    end
  end
1306
+
1307
+
1308
+ # Internal method used to remove already stored profiles and restore observed frequencies
1309
+ def reset_profiles
1310
+ # Clean profiles storage
1311
+ @profiles = {}
1312
+ # Reset frequency observed
1313
+ @meta.each{|term,info| info[:observed_freq] = 0}
1314
+ @max_freqs[:observed_freq] = 0
1315
+ end
1316
+
1317
+
1318
+ # ===== Returns
1319
+ # profiles assigned to a given ID
1320
+ # ===== Parameters
1321
+ # +id+:: profile ID
1322
+ # ===== Return
1323
+ # specific profile or nil if it's not stored
1324
+ def get_profile(id)
1325
+ return @profiles[id]
1326
+ end
1327
+
1328
+
1329
+ # ===== Returns
1330
+ # an array of sizes for all stored profiles
1331
+ # ===== Return
1332
+ # array of profile sizes
1333
+ def get_profiles_sizes()
1334
+ return @profiles.map{|id,terms| terms.length}
1335
+ end
1336
+
1337
+
1338
+ # ===== Returns
1339
+ # mean size of stored profiles
1340
+ # ===== Parameters
1341
+ # +round_digits+:: number of digits to round result. Default: 4
1342
+ # ===== Returns
1343
+ # mean size of stored profiles
1344
+ def get_profiles_mean_size(round_digits: 4)
1345
+ sizes = self.get_profiles_sizes
1346
+ return sizes.inject(0){|sum, n| sum + n}.fdiv(@profiles.length).round(round_digits)
1347
+ end
1348
+
1349
+
1350
+ # Calculates profiles sizes and returns size assigned to percentile given
1351
+ # ===== Parameters
1352
+ # +perc+:: percentile to be returned
1353
+ # +increasing_sort+:: flag to indicate if sizes order must be increasing. Default: false
1354
+ # ===== Returns
1355
+ # values assigned to percentile asked
1356
+ def get_profile_length_at_percentile(perc=50, increasing_sort: false)
1357
+ prof_lengths = self.get_profiles_sizes.sort
1358
+ prof_lengths.reverse! if !increasing_sort
1359
+ n_profiles = prof_lengths.length
1360
+ percentile_index = ((perc * (n_profiles - 1)).fdiv(100) - 0.5).round # Take length which not overpass percentile selected
1361
+ percentile_index = 0 if percentile_index < 0 # Special case (caused by literal calc)
1362
+ return prof_lengths[percentile_index]
1363
+ end
1364
+
1365
+
1366
+ # Translate a given profile to terms names
1367
+ # ===== Parameters
1368
+ # +prof+:: array of terms to be translated
1369
+ # ===== Returns
1370
+ # array of translated terms. Can include nils if some IDs are not allowed
1371
+ def profile_names(prof)
1372
+ return prof.map{|term| self.translate_id(term)}
1373
+ end
1374
+
1375
+
1376
+ # Trnaslates a bunch of profiles to it sets of term names
1377
+ # ===== Parameters
1378
+ # +profs+:: array of profiles
1379
+ # +asArray+:: flag to indicate if results must be returned as: true => an array of tuples [ProfID, ArrayOdNames] or ; false => hashs of translations
1380
+ # ===== Returns
1381
+ # translated profiles
1382
+ def translate_profiles_ids(profs = [], asArray: true)
1383
+ profs = @profiles if profs.empty?
1384
+ profs = profs.each_with_index.map{|terms, index| [index, terms]}.to_h if profs.kind_of?(Array)
1385
+ profs_names = profs.map{|id, terms| [id, self.profile_names(terms)]}.to_h
1386
+ return asArray ? profs_names.values : profs_names
1387
+ end
1388
+
1389
+
1390
  # Includes as "observed_terms" all terms included into stored profiles.
  # ===== Parameters
  # +reset+:: if true, set all observed frequencies to -1 before re-counting.
  #   NOTE(review): reset_profiles clears the same field to 0; the -1 here may be a
  #   deliberate "not observed" marker for downstream code — confirm the asymmetry.
  def add_observed_terms_from_profiles(reset: false)
    @meta.each{|term, freqs| freqs[:observed_freq] = -1} if reset
    @profiles.each{|id, terms| self.add_observed_terms(terms: terms)}
  end
1397
+
1398
+
1399
+ # Get a term frequency
1400
+ # ===== Parameters
1401
+ # +term+:: term to be checked
1402
+ # +type+:: type of frequency to be returned. Allowed: [:struct_freq, :observed_freq]
1403
+ # ===== Returns
1404
+ # frequency of term given or nil if term is not allowed
1405
+ def get_frequency(term, type: :struct_freq)
1406
+ queryFreq = @meta[term]
1407
+ return queryFreq.nil? ? nil : queryFreq[type]
1408
+ end
1409
+
1410
+
1411
+ # Geys structural frequency of a term given
1412
+ # ===== Parameters
1413
+ # +term+:: to be checked
1414
+ # ===== Returns
1415
+ # structural frequency of given term or nil if term is not allowed
1416
+ def get_structural_frequency(term)
1417
+ return self.get_frequency(term, type: :struct_freq)
1418
+ end
1419
+
1420
+
1421
+ # Gets observed frequency of a term given
1422
+ # ===== Parameters
1423
+ # +term+:: to be checked
1424
+ # ===== Returns
1425
+ # observed frequency of given term or nil if term is not allowed
1426
+ def get_observed_frequency(term)
1427
+ return self.get_frequency(term, type: :observed_freq)
1428
+ end
1429
+
1430
+
1431
  # Calculates frequencies of stored profiles terms.
  # ===== Parameters
  # +ratio+:: if true, frequencies are returned as ratios (count / number of profiles)
  # +literal+:: if true, count the literal term IDs as stored; otherwise reuse the
  #   observed frequencies in @meta (which resolve alternative terms)
  # +asArray+:: if true, return an array of [term, frequency] tuples sorted by
  #   descending frequency; otherwise a hash
  # +translate+:: if true, term IDs are translated to their names when possible
  # ===== Returns
  # stored profiles terms frequencies
  def get_profiles_terms_frequency(ratio: true, literal: true, asArray: true, translate: true)
    n_profiles = @profiles.length
    if literal
      # Count raw term occurrences across all profiles
      freqs = {}
      @profiles.each do |id, terms|
        terms.each do |literalTerm|
          if freqs.include?(literalTerm)
            freqs[literalTerm] += 1
          else
            freqs[literalTerm] = 1
          end
        end
      end
      if (ratio || translate)
        aux_keys = freqs.keys # snapshot: keys are re-mapped in place below
        aux_keys.each do |term|
          freqs[term] = freqs[term].fdiv(n_profiles) if ratio
          if translate
            tr = self.translate_id(term)
            freqs[tr] = freqs.delete(term) if !tr.nil? # replace ID key by its name
          end
        end
      end
      if asArray
        freqs = freqs.map{|term, freq| [term, freq]}
        freqs.sort!{|h1, h2| h2[1] <=> h1[1]} # descending by frequency
      end
    else # Freqs translating alternatives (use @meta observed frequencies)
      freqs = @meta.select{|id, freqs| freqs[:observed_freq] > 0}.map{|id, freqs| [id, ratio ? freqs[:observed_freq].fdiv(n_profiles) : freqs[:observed_freq]]}
      freqs = freqs.to_h if !asArray
      if translate
        freqs = freqs.map do |term, freq|
          tr = self.translate_id(term)
          tr.nil? ? [term, freq] : [tr, freq]
        end
      end
      if asArray
        freqs = freqs.map{|term, freq| [term, freq]}
        freqs.sort!{|h1, h2| h2[1] <=> h1[1]} # descending by frequency
      else
        freqs = freqs.to_h
      end
    end
    return freqs
  end
1484
+
1485
+
1486
+ # Clean a given profile returning cleaned set of terms and removed ancestors term.
1487
+ # ===== Parameters
1488
+ # +prof+:: array of terms to be checked
1489
+ # ===== Returns
1490
+ # two arrays, first is the cleaned profile and second is the removed elements array
1491
+ def remove_ancestors_from_profile(prof)
1492
+ ancestors = prof.map{|term| self.get_ancestors(term)}.flatten.uniq
1493
+ redundant = prof.select{|term| ancestors.include?(term)}
1494
+ return prof - redundant, redundant
1495
+ end
1496
+
1497
+
1498
# Remove alternative IDs if official ID is present. DOES NOT REMOVE synonyms or alternative IDs of the same official ID
# ===== Parameters
# +prof+:: array of terms to be checked
# ===== Returns
# two arrays, first is the cleaned profile and second is the removed elements array
def remove_alternatives_from_profile(prof)
  # An alternative ID is redundant only when its official counterpart is also in the profile
  redundant = prof.select do |term|
    @alternatives_index.include?(term) && prof.include?(@alternatives_index[term])
  end
  return prof - redundant, redundant
end
1508
+
1509
+
1510
# Remove alternatives (if official term is present) and ancestors terms of a given profile
# ===== Parameters
# +profile+:: profile to be cleaned
# +remove_alternatives+:: if true, alternative IDs with an official counterpart are also removed
# ===== Returns
# cleaned profile
def clean_profile(profile, remove_alternatives: true)
  cleaned, _ = self.remove_ancestors_from_profile(profile)
  cleaned, _ = self.remove_alternatives_from_profile(cleaned) if remove_alternatives
  return cleaned
end
1525
+
1526
+
1527
# Remove alternatives (if official term is present) and ancestors terms of stored profiles
# ===== Parameters
# +store+:: if true, cleaned profiles will replace already stored profiles
# +remove_alternatives+:: if true, alternative IDs are also removed from each profile
# ===== Returns
# a hash with cleaned profiles
def clean_profiles(store: false, remove_alternatives: true)
  cleaned_profiles = @profiles.map do |id, terms|
    [id, self.clean_profile(terms, remove_alternatives: remove_alternatives)]
  end.to_h
  @profiles = cleaned_profiles if store
  return cleaned_profiles
end
1539
+
1540
+
1541
# Calculates number of ancestors present (redundant) in each profile stored
# ===== Returns
# array of parentals for each profile
def parentals_per_profile
  cleaned = self.clean_profiles(remove_alternatives: false)
  # Difference in size between raw and cleaned profile == number of redundant parentals
  return @profiles.map{|id, terms| terms.length - cleaned[id].length}
end
1549
+
1550
+
1551
# Calculates mean IC of a given profile
# ===== Parameters
# +prof+:: profile to be checked
# +ic_type+:: IC calculation method to be used
# +zhou_k+:: special coeficient for Zhou IC method
# ===== Returns
# mean IC for a given profile (NaN when the profile is empty)
def get_profile_mean_IC(prof, ic_type: :resnik, zhou_k: 0.5)
  total_ic = prof.inject(0){|acc, term| acc + self.get_IC(term, type: ic_type, zhou_k: zhou_k)}
  return total_ic.fdiv(prof.length)
end
1561
+
1562
+
1563
# Calculates resnik ontology, and resnik observed mean ICs for all profiles stored
# ===== Returns
# two hashes with Profiles and IC calculated for resnik and observed resnik respectively
def get_profiles_resnik_dual_ICs
  structural = {}
  observed = {}
  @profiles.each do |profile_id, terms|
    structural[profile_id] = self.get_profile_mean_IC(terms, ic_type: :resnik)
    observed[profile_id] = self.get_profile_mean_IC(terms, ic_type: :resnik_observed)
  end
  return structural.clone, observed.clone
end
1575
+
1576
+
1577
# Calculates ontology structural levels for all ontology terms and stores them
# into @dicts[:level]; also updates @max_freqs[:max_depth].
# ===== Parameters
# +calc_paths+:: calculates term paths if it's not already calculated
# +shortest_path+:: if true, level is calculated with shortest path, largest path will be used in other cases
def calc_term_levels(calc_paths: false, shortest_path: true)
  if @term_paths.empty?
    if calc_paths
      self.calc_term_paths
    else
      warn('Term paths are not already loaded. Aborting dictionary calc')
    end
  end
  if !@term_paths.empty? # Paths available (preloaded or just calculated above)
    byTerm = {}  # term => level
    byValue = {} # level => [terms]
    # Calc per term
    @term_paths.each do |term, info|
      level = shortest_path ? info[:shortest_path] : info[:largest_path]
      if level.nil?
        level = -1 # Sentinel level for terms without path information
      else
        level = level.round(0)
      end
      byTerm[term] = level
      queryLevels = byValue[level]
      if queryLevels.nil?
        byValue[level] = [term]
      else
        byValue[level] << term
      end
    end
    # NOTE(review): the two maps are intentionally stored swapped with respect to
    # their names — :byTerm holds level=>terms and :byValue holds term=>level,
    # as the original comment below explains.
    @dicts[:level] = {byTerm: byValue, byValue: byTerm} # Note: in this case, value has multiplicity and term is unique value
    # Update maximum depth
    @max_freqs[:max_depth] = byValue.keys.max
  end
end
1613
+
1614
+
1615
# Check if a term given is marked as obsolete
# ===== Parameters
# +term+:: term identifier to test
# ===== Returns
# true when the term appears in the obsoletes index
def is_obsolete?(term)
  @obsoletes_index.include?(term)
end
1619
+
1620
# Check if a term given is marked as alternative
# ===== Parameters
# +term+:: term identifier to test
# ===== Returns
# true when the term appears in the alternatives index
def is_alternative?(term)
  @alternatives_index.include?(term)
end
1624
+
1625
# Find paths of a term following it ancestors and stores all possible paths for it and it's parentals.
# Also calculates paths metadata (total/largest/shortest path) and stores into @term_paths.
# Obsolete and alternative IDs share (alias) the path record of their official term.
def calc_term_paths
  self.calc_ancestors_dictionary if !@dicts.keys.include?(:is_a) # Calculate direct parentals dictionary if it's not already calculated
  visited_terms = []
  @term_paths = {}
  if [:hierarchical, :sparse].include? @structureType
    terms = @stanzas[:terms].keys
    terms.each do |term|
      if self.is_obsolete?(term) || self.is_alternative?(term) # Special case (obsoletes)
        special_term = term
        # Redirect to the official term; the special ID shares the same record (same object)
        term = self.is_obsolete?(term) ? @obsoletes_index[term] : @alternatives_index[term]
        @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
        @term_paths[special_term] = @term_paths[term]
        visited_terms << special_term
      end

      if !visited_terms.include?(term)
        @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
        parentals = @dicts[:is_a][:byTerm][term]
        if parentals.nil?
          # Root term: its only path is itself
          @term_paths[term][:paths] << [term]
        else
          parentals.each do |direct_parental|
            if visited_terms.include? direct_parental # Use direct_parental already calculated paths
              new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
            else # Calculate new paths
              self.expand_path(direct_parental, visited_terms)
              new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
            end
            new_paths.each{|path| @term_paths[term][:paths] << path}
          end
        end
        visited_terms << term
      end
      # Update metadata (recomputed even for already-visited/aliased terms)
      @term_paths[term][:total_paths] = @term_paths[term][:paths].length
      paths_sizes = @term_paths[term][:paths].map{|path| path.length}
      @term_paths[term][:largest_path] = paths_sizes.max
      @term_paths[term][:shortest_path] = paths_sizes.min
    end
  else
    warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
  end
end
1670
+
1671
+
1672
# Recursive function which finds paths of a term following it ancestors and stores all possible paths for it and it's parentals
# ===== Parameters
# +curr_term+:: current visited term
# +visited_terms+:: already expanded terms (mutated: curr_term is appended once expanded)
def expand_path(curr_term, visited_terms)
  if !visited_terms.include?(curr_term) # Not already expanded
    # Ensure a metadata record exists before paths are appended
    @term_paths[curr_term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if @term_paths[curr_term].nil?
    direct_parentals = @dicts[:is_a][:byTerm][curr_term]
    if direct_parentals.nil? # No parents :: End of recurrence
      @term_paths[curr_term][:paths] << [curr_term]
    else # Expand and concat
      direct_parentals.each do |ancestor|
        self.expand_path(ancestor,visited_terms) if !visited_terms.include?(ancestor)
        # Prepend the current term to each of the ancestor's paths
        new_paths = @term_paths[ancestor][:paths].map{|path| [curr_term, path].flatten}
        new_paths.each{|path| @term_paths[curr_term][:paths] << path}
      end
    end
    visited_terms << curr_term
  end
end
1692
+
1693
+
1694
# Gets ontology levels calculated
# ===== Returns
# ontology levels calculated (clone); note the stored :byTerm map is Key::Level, Value::Terms
def get_ontology_levels
  @dicts[:level][:byTerm].clone
end
1700
+
1701
+
1702
# Gets ontology level of a specific term
# ===== Parameters
# +term+:: term identifier to look up
# ===== Returns
# Term level, or nil when the term has no calculated level
def get_term_level(term)
  @dicts[:level][:byValue][term]
end
1708
+
1709
+
1710
# Return ontology levels from profile terms
# ===== Parameters
# +uniq+:: if true, each term is counted once even if repeated across profiles
# ===== Returns
# hash of term levels (Key: level; Value: array of term IDs, repeated by their frequency)
def get_ontology_levels_from_profiles(uniq = true) # TODO: remove uniq and check dependencies
  profiles_terms = @profiles.values.flatten
  profiles_terms.uniq! if uniq
  # Count occurrences of each term across profiles
  term_freqs_byProfile = Hash.new(0)
  profiles_terms.each{|term| term_freqs_byProfile[term] += 1}
  levels_filtered = {}
  @dicts[:level][:byTerm].each do |level, terms|
    present = terms.select{|t| profiles_terms.include?(t)}
    expanded = present.flat_map{|t| Array.new(term_freqs_byProfile[t], t)}
    levels_filtered[level] = expanded unless expanded.empty?
  end
  return levels_filtered
end
1728
+
1729
+
1730
# Calculate profiles dictionary with Key= Term; Value = Profiles and store it into @profilesDict
def calc_profiles_dictionary
  if @profiles.empty?
    warn('Profiles are not already loaded. Aborting dictionary calc')
  else
    byTerm = {} # Key: Terms; Value: profile IDs containing the term
    # byValue -- Key: Profile == @profiles
    @profiles.each do |id, terms|
      terms.each do |term|
        (byTerm[term] ||= []) << id
      end
    end
    @profilesDict = byTerm
  end
end
1749
+
1750
+
1751
# Gets profiles dictionary calculated
# ===== Return
# profiles dictionary (clone)
def get_terms_linked_profiles
  @profilesDict.clone
end
1757
+
1758
+
1759
# Get related profiles to a given term
# ===== Parameters
# +term+:: to be checked
# ===== Returns
# profiles which contains given term (nil when the term is not linked to any profile)
def get_term_linked_profiles(term)
  @profilesDict[term]
end
1767
+
1768
+
1769
# Gets metainfo table from a set of terms
# ===== Parameters
# +terms+:: IDs to be expanded
# +filter_alternatives+:: flag to be used in get_descendants method
# ===== Returns
# an array with pairs [[TermID, TermName], [[ChildID, ChildName], ...]]
def get_childs_table(terms, filter_alternatives = false)
  return terms.map do |t|
    descendants = self.get_descendants(t, filter_alternatives)
    [[t, self.translate_id(t)], descendants.map{|child| [child, self.translate_id(child)]}]
  end
end
1782
+
1783
+
1784
# Store specific relations hash given into ITEMS structure
# ===== Parameters
# +relations+:: to be stored
# +remove_old_relations+:: substitute ITEMS structure instead of merge new relations
# +expand+:: if true, already stored keys will be updated with the unique union of both sets
def load_item_relations_to_terms(relations, remove_old_relations = false, expand = false)
  @items = {} if remove_old_relations
  has_unknown_terms = relations.any?{|term, _| !@stanzas[:terms].include?(term)}
  if has_unknown_terms
    warn('Some terms specified are not stored into this ontology. These not correct terms will be stored too')
  end
  if !remove_old_relations && !expand
    warn('Some terms given are already stored. Stored version will be replaced') if relations.any?{|term, _| @items.include?(term)}
  end
  if expand
    # Union each new set with any previously stored set for the same term
    relations.each do |term, term_items|
      @items[term] = @items.key?(term) ? (@items[term] + term_items).uniq : term_items
    end
  else
    @items.merge!(relations)
  end
end
1811
+
1812
+
1813
# Assign a dictionary already calculated as a items set.
# ===== Parameters
# +dictID+:: dictionary ID to be stored (:byTerm will be used)
# +remove_old_relations+:: if true, already stored items are discarded first
def set_items_from_dict(dictID, remove_old_relations = false)
  @items = {} if remove_old_relations
  if @dicts.keys.include?(dictID)
    # Bug fix: the original used non-mutating 'merge', whose return value was
    # discarded, so @items was never updated. 'merge!' updates it in place.
    @items.merge!(@dicts[dictID][:byTerm])
  else
    warn('Specified ID is not calculated. Dict will not be added as a items set')
  end
end
1824
+
1825
+
1826
# This method computes childs similarity and impute items to it parentals. To do that Item keys must be this ontology allowed terms.
# Similarity will be calculated by text exact similarity unless an ontology object will be provided. In this case, MICAs will be used
# ===== Parameters
# +ontology+:: (Optional) ontology object which items given belongs
# +minimum_childs+:: minimum of childs needed to infer relations to parental. Default: 2
# +clean_profiles+:: if true, clean_profile ontology method will be used over inferred profiles. Only if an ontology object is provided
# ===== Returns
# void and update items object
def expand_items_to_parentals(ontology: nil, minimum_childs: 2, clean_profiles: true)
  # Check item keys
  if @items.empty?
    warn('Items have been not provided yet')
    return nil
  end
  targetKeys = @items.keys.select{|k| self.exists?(k)}
  if targetKeys.length == 0
    warn('Any item key is allowed')
    return nil
  elsif targetKeys.length < @items.keys.length
    warn('Some item keys are not allowed')
  end

  # Expand to parentals
  targetKeys << targetKeys.map{|t| self.get_ancestors(t, true)}
  targetKeys.flatten!
  targetKeys.uniq!

  # Obtain levels (go from leaves to roots)
  levels = targetKeys.map{|term| self.get_term_level(term)}
  levels.compact!
  levels.uniq!
  levels.sort!
  levels.reverse!
  levels.shift # Leaves are not expandable

  # Expand from leaves to roots
  levels.map do |lvl|
    curr_keys = targetKeys.select{|k| self.get_term_level(k) == lvl}
    curr_keys.map do |term_expand|
      to_infer = []
      # Obtain childs (only those that already carry items)
      childs = self.get_descendants(term_expand,true).select{|t| @items.keys.include?(t)}
      # Expand
      if childs.length > 0 && minimum_childs == 1 # Special case
        to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
      elsif childs.length >= minimum_childs
        # to_infer becomes a Hash (item => support count) in this branch
        to_infer = Hash.new(0)
        # Compare each pair of child item sets (destructive: childs is consumed)
        while childs.length > 1
          curr_term = childs.shift
          childs.each do |compare_term|
            pivot_items = @items[curr_term]
            compare_items = @items[compare_term]
            if ontology.nil? # Exact match
              pivot_items.map do |pitem|
                if compare_items.include?(pitem)
                  to_infer[pitem] += 2
                end
              end
            else # Find MICAs
              # NOTE(review): assumes ontology.get_MICA returns a [term, ic]-like
              # pair ('first' is the MICA term, 'last' its score) — confirm against get_MICA.
              local_infer = Hash.new(0)
              pivot_items.map do |pitem|
                micas = compare_items.map{|citem| ontology.get_MICA(pitem, citem)}
                maxmica = micas[0]
                micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
                local_infer[maxmica.first] += 1
              end
              compare_items.map do |citem|
                micas = pivot_items.map{|pitem| ontology.get_MICA(pitem, citem)}
                maxmica = micas[0]
                micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
                local_infer[maxmica.first] += 1
              end
              # Keep only MICAs supported by both directions of the comparison
              local_infer.each{|t,freq| to_infer[t] += freq if freq >= 2}
            end
          end
        end
        # Filter infer: require support from at least minimum_childs comparisons
        to_infer = to_infer.select{|k,v| v >= minimum_childs}
      end
      # Infer: attach the supported items to the parental term
      if to_infer.length > 0
        @items[term_expand] = [] if @items[term_expand].nil?
        if to_infer.kind_of?(Array)
          @items[term_expand] = (@items[term_expand] + to_infer).uniq
        else
          @items[term_expand] = (@items[term_expand] + to_infer.keys).uniq
        end
        @items[term_expand] = ontology.clean_profile(@items[term_expand]) if clean_profiles && !ontology.nil?
      elsif !@items.include?(term_expand)
        # Nothing inferred and no pre-existing items: drop the term from further expansion
        targetKeys.delete(term_expand)
      end
    end
  end
end
1921
+
1922
+
1923
+
1924
# Computes Fisher exact test enrichment of each items-bearing term against an
# external item list, walking ontology levels from deepest to shallowest.
# WARNING (from original author): this method is not fully checked; use with care.
# ===== Parameters
# +external_item_list+:: items to test each term's associated items against
# +mode+:: only :elim is currently implemented (items of significant terms are removed from their parents' tests)
# +thresold+:: p-value threshold below which a term's items are penalized in its parents
# ===== Returns
# array of [term, pval] pairs (pval is nil for modes other than :elim)
def compute_relations_to_items(external_item_list, mode, thresold)
  results = []
  penalized_terms = {}
  # terms_levels = get_terms_levels(@items_relations.keys)
  terms_with_items_levels = @items_relations.keys.map{|term| self.get_term_level(term)}.uniq
  terms_levels = self.get_ontology_levels().select{|k,v| terms_with_items_levels.include?(k)}
  # Bug fix: the original used 'each' here, which returns the receiver unchanged,
  # so the filtering to items-bearing terms was silently discarded. 'map ... to_h'
  # actually applies it. Use only items terms. MAYBE IT'S NOT OUR TARGET (line added by fmj)
  terms_levels = terms_levels.map{|level, terms| [level, terms.select{|t| @items_relations.keys.include?(t)}]}.to_h
  levels = terms_levels.keys.sort
  levels.reverse_each do |level|
    terms_levels[level].each do |term|
      associated_items = @items_relations[term]
      pval = nil
      if mode == :elim
        items_to_remove = penalized_terms[term]
        items_to_remove = [] if items_to_remove.nil?
        pval = get_fisher_exact_test(
          external_item_list - items_to_remove,
          associated_items - items_to_remove,
          ((associated_items | external_item_list) - items_to_remove).length
        )
        if pval <= thresold
          parents = get_parents(term) # Save the items for each parent term to remove them later in the fisher test
          parents.each do |prnt|
            query = penalized_terms[prnt]
            if query.nil?
              penalized_terms[prnt] = @items_relations[term].clone # We need a new array to store the following iterations
            else
              query.concat(@items_relations[term])
            end
          end
        end
      end
      results << [term, pval]
    end
  end
  return results
end
1965
+
1966
+
1967
# Check if a given ID is a removable (blacklist) term.
# +DEPRECATED+ use is_removable? instead
# ===== Parameters
# +id+:: to be checked
# ===== Returns
# true if given term is a removable (blacklist) term or false in other cases
def is_removable(id)
  warn "[DEPRECATION] `is_removable` is deprecated. Please use `is_removable?` instead."
  @removable_terms.include?(id.to_sym)
end
1977
+
1978
# Check if a given ID is a removable (blacklist) term
# ===== Parameters
# +id+:: to be checked
# ===== Returns
# true if given term is a removable (blacklist) term or false in other cases
def is_removable?(id)
  @removable_terms.include?(id.to_sym)
end
1986
+
1987
############################################
# SPECIAL METHODS
#############################################

# Attribute-wise equality: two ontologies are equal when every piece of their
# exposed state matches (items keys are additionally compared as sets first).
def ==(other)
  simple_attrs = [
    :header, :stanzas, :ancestors_index, :alternatives_index, :obsoletes_index,
    :structureType, :ics, :meta, :dicts, :profiles, :profilesDict
  ]
  return false unless simple_attrs.all?{|attr| self.send(attr) == other.send(attr)}
  return false unless (self.items.keys - other.items.keys).empty?
  [:removable_terms, :special_tags, :items, :term_paths, :max_freqs].all? do |attr|
    self.send(attr) == other.send(attr)
  end
end
2009
+
2010
+
2011
# Shallow-per-attribute copy of the ontology: every top-level container is
# cloned so the copy can be mutated independently (nested objects are shared).
# ===== Returns
# a new Ontology instance mirroring this one
def clone
  copy = Ontology.new
  copy.header = self.header.clone
  copy.stanzas[:terms] = self.stanzas[:terms].clone
  copy.stanzas[:typedefs] = self.stanzas[:typedefs].clone
  copy.stanzas[:instances] = self.stanzas[:instances].clone
  copy.ancestors_index = self.ancestors_index.clone
  copy.descendants_index = self.descendants_index.clone
  copy.alternatives_index = self.alternatives_index.clone
  copy.obsoletes_index = self.obsoletes_index.clone
  copy.structureType = self.structureType.clone
  copy.ics = self.ics.clone
  copy.meta = self.meta.clone
  copy.dicts = self.dicts.clone
  copy.profiles = self.profiles.clone
  copy.profilesDict = self.profilesDict.clone
  copy.items = self.items.clone
  copy.removable_terms = self.removable_terms.clone
  # Bug fix: special_tags was never copied, although == compares it; without
  # this line a clone could compare unequal to its source once tags diverge.
  copy.special_tags = self.special_tags.clone
  copy.term_paths = self.term_paths.clone
  copy.max_freqs = self.max_freqs.clone
  return copy
end
2033
+
2034
+
2035
#############################################
# ACCESS CONTROL
#############################################

# Public read/write access to the ontology internal state. Readers and writers
# are declared separately (rather than attr_accessor) so the exposed attribute
# lists stay explicit; both lists cover the same attributes.
attr_reader :file, :header, :stanzas, :ancestors_index, :descendants_index, :special_tags, :alternatives_index, :obsoletes_index, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :profilesDict, :items, :removable_terms, :term_paths
attr_writer :file, :header, :stanzas, :ancestors_index, :descendants_index, :special_tags, :alternatives_index, :obsoletes_index, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :profilesDict, :items, :removable_terms, :term_paths
2041
+ end