semtools 0.1.2 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,2041 +1,2559 @@
1
+ require 'expcalc'
1
2
  require 'json'
3
+ require 'colorize'
2
4
 
3
5
 
4
6
  class Ontology
5
- #########################################################
6
- # AUTHOR NOTES
7
- #########################################################
8
-
9
- # 1 - Store @profiles as @stanzas[:instances]
10
- # 2 - Items values (not keys) are imported as strings, not as symbols (maybe add a flag which indicates if values are, or not, symbols?)
11
-
12
-
13
- #############################################
14
- # FIELDS
15
- #############################################
16
- # Handled class variables
17
- # => @@basic_tags :: hash with main OBO structure tags
18
- # => @@allowed_calcs :: hash with allowed ICs and similaritites calcs
19
- # => @@symbolizable_ids :: tags which can be symbolized
20
- # => @@tags_with_trailing_modifiers :: tags which can include extra info after specific text modifiers
21
- #
22
- # Handled object variables
23
- # => @header :: file header (if is available)
24
- # => @stanzas :: OBO stanzas {:terms,:typedefs,:instances}
25
- # => @ancestors_index :: hash of ancestors per each term handled with any structure relationships
26
- # => @descendants_index :: hash of descendants per each term handled with any structure relationships
27
- # => @alternatives_index :: has of alternative IDs (include alt_id and obsoletes)
28
- # => @obsoletes_index :: hash of obsoletes and it's new ids
29
- # => @special_tags :: set of special tags to be expanded (:is_a, :obsolete, :alt_id)
30
- # => @structureType :: type of ontology structure depending on ancestors relationship. Allowed: {atomic, sparse, circular, hierarchical}
31
- # => @ics :: already calculated ICs for handled terms and IC types
32
- # => @meta :: meta_information about handled terms like [ancestors, descendants, struct_freq, observed_freq]
33
- # => @max_freqs :: maximum freqs found for structural and observed freqs
34
- # => @dicts :: bidirectional dictionaries with three levels <key|value>: 1º) <tag|hash2>; 2º) <(:byTerm/:byValue)|hash3>; 3º) dictionary <k|v>
35
- # => @profiles :: set of terms assigned to an ID
36
- # => @profilesDict :: set of profile IDs assigned to a term
37
- # => @items :: hash with items relations to terms
38
- # => @removable_terms :: array of terms to not be considered
39
- # => @term_paths :: metainfo about parental paths of each term
40
-
41
- @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:alt_id,:replaced_by,:consider]}
42
- @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
43
- @@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider]
44
- @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
45
- @@multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider, :subsetdef, :synonymtypedef, :property_value, :remark]
46
- @@symbolizable_ids.concat(@@tags_with_trailing_modifiers)
47
-
48
- #############################################
49
- # CONSTRUCTOR
50
- #############################################
51
-
52
- # Instantiate a OBO_Handler object
53
- # ===== Parameters
54
- # +file+:: with info to be loaded (.obo ; .json)
55
- # +load_file+:: activate load process automatically (only for .obo)
56
- # +removable_terms+: term to be removed from calcs
57
- def initialize(file: nil, load_file: false, removable_terms: [])
58
- # Initialize object variables
59
- @header = nil
60
- @stanzas = {terms: {}, typedefs: {}, instances: {}}
61
- @ancestors_index = {}
62
- @descendants_index = {}
63
- @alternatives_index = {}
64
- @obsoletes_index = {}
65
- @structureType = nil
66
- @ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
67
- @meta = {}
68
- @special_tags = @@basic_tags.clone
69
- @max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
70
- @dicts = {}
71
- @profiles = {}
72
- @profilesDict = {}
73
- @items = {}
74
- @removable_terms = []
75
- @term_paths = {}
76
- # Load if proceeds
77
- add_removable_terms(removable_terms) if !removable_terms.empty?
78
- load(file) if load_file
79
- end
80
-
81
-
82
- #############################################
83
- # CLASS METHODS
84
- #############################################
85
-
86
- # Expand a (starting) term using a specific tag and return all extended terms into an array and
87
- # the relationship structuture observed (hierarchical or circular). If circular structure is
88
- # foumd, extended array will be an unique vector without starting term (no loops).
89
- # +Note+: we extremly recomend use get_related_ids_by_tag function instead of it (directly)
90
- # ===== Parameters
91
- # +start+:: term where start to expand
92
- # +terms+:: set to be used to expand
93
- # +target_tag+:: tag used to expand
94
- # +eexpansion+:: already expanded info
95
- # +split_info_char+:: special regex used to split info (if it is necessary)
96
- # +split_info_indx+:: special index to take splitted info (if it is necessary)
97
- # +alt_ids+:: set of alternative IDs
98
- # ===== Returns
99
- # A vector with the observed structure (string) and the array with extended terms.
100
- def self.get_related_ids(start_id, terms, target_tag, related_ids = {}, alt_ids = {})
101
- # Take start_id term available info and already accumulated info
102
- current_associations = related_ids[start_id]
103
- current_associations = [] if current_associations.nil?
104
- return [:no_term,[]] if terms[start_id].nil?
105
- id_relations = terms[start_id][target_tag]
106
- return [:source,[]] if id_relations.nil?
107
-
108
- # Prepare auxiliar variables
109
- struct = :hierarchical
110
-
111
- # Study direct extensions
112
- id_relations = id_relations.clone
113
- while id_relations.length > 0
114
- id = id_relations.shift
115
- id = alt_ids[id].first if alt_ids.include?(id) # NOTE: if you want to persist current ID instead source ID, re-implement this
116
-
117
- # Handle
118
- if current_associations.include?(id) # Check if already have been included into this expansion
119
- struct = :circular
120
- else
121
- current_associations << id
122
- if related_ids.include?(id) # Check if current already has been expanded
123
- current_associations = current_associations | related_ids[id]
124
- if current_associations.include?(start_id) # Check circular case
125
- struct = :circular
126
- [id, start_id].each{|repeated| current_associations.delete(repeated)}
127
- end
128
- else # Expand
129
- related_ids[start_id] = current_associations
130
- structExp, current_related_ids = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids) # Expand current
131
- current_associations = current_associations | current_related_ids
132
- struct = :circular if structExp == :circular # Check struct
133
- if current_associations.include?(start_id) # Check circular case
134
- struct = :circular
135
- current_associations.delete(start_id)
136
- end
137
- end
138
- end
139
- end
140
- related_ids[start_id] = current_associations
141
-
142
- return struct, current_associations
143
- end
144
-
145
-
146
- # Expand terms using a specific tag and return all extended terms into an array and
147
- # the relationship structuture observed (hierarchical or circular). If circular structure is
148
- # foumd, extended array will be an unique vector without starting term (no loops)
149
- # ===== Parameters
150
- # +terms+:: set to be used to expand
151
- # +target_tag+:: tag used to expand
152
- # +split_info_char+:: special regex used to split info (if it is necessary)
153
- # +split_info_indx+:: special index to take splitted info (if it is necessary)
154
- # +alt_ids+:: set of alternative IDs
155
- # +obsoletes+:: integer with the number of obsolete IDs. used to calculate structure type.
156
- # ===== Returns
157
- # A vector with the observed structure (string) and the hash with extended terms
158
- def self.get_related_ids_by_tag(terms:,target_tag:, alt_ids: {}, obsoletes: 0)
159
- # Define structure type
160
- structType = :hierarchical
161
- related_ids = {}
162
- terms.each do |id, tags|
163
- # Check if target tag is defined
164
- if !tags[target_tag].nil?
165
- # Obtain related terms
166
- set_structure, _ = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids)
167
- # Check structure
168
- structType = :circular if set_structure == :circular
169
- end
170
- end
171
-
172
- # Check special case
173
- structType = :atomic if related_ids.length <= 0
174
- structType = :sparse if related_ids.length > 0 && ((terms.length - related_ids.length - obsoletes) >= 2)
175
- # Return type and hash with related_ids
176
- return structType, related_ids
177
- end
178
-
179
-
180
- # Class method to transform string with <tag : info> into hash structure
181
- # ===== Parameters
182
- # +attributes+:: array tuples with info to be transformed into hash format
183
- # ===== Returns
184
- # Attributes stored into hash structure
185
- def self.info2hash(attributes, split_char = " ! ", selected_field = 0)
186
- # Load info
187
- info_hash = {}
188
- # Only TERMS multivalue tags (future add Typedefs and Instance)
189
- # multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider]
190
- attributes.each do |tag, value|
191
- # Check
192
- raise EncodingError, 'Info element incorrect format' if (tag.nil?) || (value.nil?)
193
- # Prepare
194
- tag = tag.lstrip.to_sym
195
- value.lstrip!
196
- value = value.split(split_char)[selected_field].to_sym if @@tags_with_trailing_modifiers.include?(tag)
197
-
198
- # Store
199
- query = info_hash[tag]
200
- if !query.nil? # Tag already exists
201
- if !query.kind_of?(Array) # Check that tag is multivalue
202
- raise('Attempt to concatenate plain text with another. The tag is not declared as multivalue. [' + tag.to_s + '](' + query + ')')
203
- else
204
- query << value # Add new value to tag
205
- end
206
- else # New entry
207
- if @@multivalue_tags.include?(tag)
208
- info_hash[tag] = [value]
209
- else
210
- info_hash[tag] = value
211
- end
212
- end
213
- end
214
- self.symbolize_ids(info_hash)
215
- return info_hash
216
- end
217
-
218
-
219
- # Class method to load an OBO format file (based on OBO 1.4 format). Specially focused on load
220
- # the Header, the Terms, the Typedefs and the Instances.
221
- # ===== Parameters
222
- # +file+:: OBO file to be loaded
223
- # ===== Returns
224
- # Hash with FILE, HEADER and STANZAS info
225
- def self.load_obo(file) #TODO: Send to obo_parser class
226
- raise("File is not defined") if file.nil?
227
- # Data variables
228
- header = ''
229
- stanzas = {terms: {}, typedefs: {}, instances: {}}
230
- # Auxiliar variables
231
- infoType = 'Header'
232
- currInfo = []
233
- stanzas_flags = %w[[Term] [Typedef] [Instance]]
234
- # Read file
235
- File.open(file).each do |line|
236
- line.chomp!
237
- next if line.empty?
238
- fields = line.split(':', 2)
239
- # Check if new instance is found
240
- if stanzas_flags.include?(line)
241
- header = self.process_entity(header, infoType, stanzas, currInfo)
242
- # Update info variables
243
- currInfo = []
244
- infoType = line.gsub!(/[\[\]]/, '')
245
- next
246
- end
247
- # Concat info
248
- currInfo << fields
249
- end
250
- # Store last loaded info
251
- header = self.process_entity(header, infoType, stanzas, currInfo) if !currInfo.empty?
252
-
253
- # Prepare to return
254
- finfo = {:file => file, :name => File.basename(file, File.extname(file))}
255
- return finfo, header, stanzas
256
- end
257
-
258
-
259
- # Handle OBO loaded info and stores it into correct container and format
260
- # ===== Parameters
261
- # +header+:: container
262
- # +infoType+:: current ontology item type detected
263
- # +stanzas+:: container
264
- # +currInfo+:: info to be stored
265
- # ===== Returns
266
- # header newly/already stored
267
- def self.process_entity(header, infoType, stanzas, currInfo)
268
- info = self.info2hash(currInfo)
269
- # Store current info
270
- if infoType.eql?('Header')
271
- header = info
272
- else
273
- id = info[:id]
274
- case infoType
275
- when 'Term'
276
- stanzas[:terms][id] = info
277
- when 'Typedef'
278
- stanzas[:typedefs][id] = info
279
- when 'Instance'
280
- stanzas[:instances][id] = info
281
- end
282
- end
283
- return header
284
- end
285
-
286
-
287
- # Symboliza all values into hashs using symbolizable tags as keys
288
- # ===== Parameters
289
- # +item_hash+:: hash to be checked
290
- def self.symbolize_ids(item_hash)
291
- @@symbolizable_ids.each do |tag|
292
- query = item_hash[tag]
293
- if !query.nil?
294
- if query.kind_of?(Array)
295
- query.map!{|item| item.to_sym}
296
- else
297
- item_hash[tag] = query.to_sym if !query.nil?
298
- end
299
- end
300
- end
301
- end
302
-
303
-
304
- #
305
- # ===== Parameters
306
- # +root+:: main term to expand
307
- # +ontology+:: to be cutted
308
- # +clone+:: if true, given ontology object will not be mutated
309
- # +remove_up+:: if true, stores only the root term given an it descendants. If false, only root ancestors will be stored
310
- # ===== Returns
311
- # An Ontology object with terms after cut the ontology.
312
- def self.mutate(root, ontology, clone: true, remove_up: true)
313
- ontology = ontology.clone if clone
314
- # Obtain affected IDs
315
- descendants = ontology.descendants_index[root]
316
- descendants << root # Store itself to do not remove it
317
- # Remove unnecesary terms
318
- ontology.stanzas[:terms] = ontology.stanzas[:terms].select{|id,v| remove_up ? descendants.include?(id) : !descendants.include?(id)}
319
- ontology.ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
320
- ontology.max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
321
- ontology.dicts = {}
322
- ontology.removable_terms = []
323
- ontology.term_paths = {}
324
- # Recalculate metadata
325
- ontology.build_index
326
- ontology.add_observed_terms_from_profiles
327
- # Finish
328
- return ontology
329
- end
330
-
331
-
332
-
333
- #############################################
334
- # GENERAL METHODS
335
- #############################################
336
-
337
- # Include removable terms to current removable terms list
338
- # ===== Parameters
339
- # +terms+:: terms array to be concatenated
340
- def add_removable_terms(terms)
341
- terms = terms.map{|term| term.to_sym}
342
- @removable_terms.concat(terms)
343
- end
344
-
345
-
346
- # Include removable terms to current removable terms list loading new
347
- # terms from a one column plain text file
348
- # ===== Parameters
349
- # +file+:: to be loaded
350
- def add_removable_terms_from_file(file)
351
- File.open(excluded_codes_file).each do |line|
352
- line.chomp!
353
- @removable_terms << line.to_sym
354
- end
355
- end
356
-
357
-
358
- # Increase observed frequency for a specific term
359
- # ===== Parameters
360
- # +term+:: term which frequency is going to be increased
361
- # +increas+:: frequency rate to be increased. Default = 1
362
- # ===== Return
363
- # true if process ends without errors, false in other cases
364
- def add_observed_term(term:,increase: 1.0)
365
- # Check
366
- raise ArgumentError, "Term given is NIL" if term.nil?
367
- return false unless @stanzas[:terms].include?(term)
368
- return false if @removable_terms.include?(term)
369
- if @alternatives_index.include?(term)
370
- alt_id = @alternatives_index[term]
371
- @meta[alt_id] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[alt_id].nil?
372
- @meta[term] = @meta[alt_id]
373
- end
374
- # Check if exists
375
- @meta[term] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[term].nil?
376
- # Add frequency
377
- @meta[term][:observed_freq] = 0 if @meta[term][:observed_freq] == -1
378
- @meta[term][:observed_freq] += increase
379
- # Check maximum frequency
380
- @max_freqs[:observed_freq] = @meta[term][:observed_freq] if @max_freqs[:observed_freq] < @meta[term][:observed_freq]
381
- return true
382
- end
383
-
384
-
385
- # Increase the arbitrary frequency of a given term set
386
- # ===== Parameters
387
- # +terms+:: set of terms to be updated
388
- # +increase+:: amount to be increased
389
- # +transform_to_sym+:: if true, transform observed terms to symbols. Default: false
390
- # ===== Return
391
- # true if process ends without errors and false in other cases
392
- def add_observed_terms(terms:, increase: 1.0, transform_to_sym: false)
393
- # Check
394
- raise ArgumentError, 'Terms array given is NIL' if terms.nil?
395
- raise ArgumentError, 'Terms given is not an array' if !terms.is_a? Array
396
- # Add observations
397
- if transform_to_sym
398
- checks = terms.map{|id| self.add_observed_term(term: id.to_sym,increase: increase)}
399
- else
400
- checks = terms.map{|id| self.add_observed_term(term: id,increase: increase)}
401
- end
402
- return checks
403
- end
404
-
405
-
406
- # Compare to terms sets
407
- # ===== Parameters
408
- # +termsA+:: set to be compared
409
- # +termsB+:: set to be compared
410
- # +sim_type+:: similitude method to be used. Default: resnik
411
- # +ic_type+:: ic type to be used. Default: resnik
412
- # +bidirectional+:: calculate bidirectional similitude. Default: false
413
- # ===== Return
414
- # similitude calculated
415
- def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
416
- # Check
417
- raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
418
- raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
419
- micasA = []
420
- # Compare A -> B
421
- termsA.each do |tA|
422
- micas = termsB.map{|tB| self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)}
423
- # Remove special cases
424
- [false,nil].each do |err_value| micas.delete(err_value) end
425
- # Obtain maximum value
426
- micasA << micas.max if micas.length > 0
427
- micasA << 0 if micas.length <= 0
428
- end
429
- means_sim = micasA.inject{ |sum, el| sum + el }.to_f / micasA.size
430
- # Compare B -> A
431
- if bidirectional
432
- means_simA = means_sim * micasA.size
433
- means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false) * termsB.size
434
- means_sim = (means_simA + means_simB) / (termsA.size + termsB.size)
435
- end
436
- # Return
437
- return means_sim
438
- end
439
-
440
-
441
- # Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with itself
442
- # ===== Parameters
443
- # +external_profiles+:: set of external profiles. If nil, internal profiles will be compared with itself
444
- # +sim_type+:: similitude method to be used. Default: resnik
445
- # +ic_type+:: ic type to be used. Default: resnik
446
- # +bidirectional+:: calculate bidirectional similitude. Default: false
447
- # ===== Return
448
- # Similitudes calculated
449
- def compare_profiles(external_profiles: nil, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
450
- profiles_similarity = {} #calculate similarity between patients profile
451
- profiles_ids = @profiles.keys
452
- if external_profiles.nil?
453
- comp_ids = profiles_ids
454
- comp_profiles = @profiles
455
- main_ids = comp_ids
456
- main_profiles = comp_profiles
457
- else
458
- comp_ids = external_profiles.keys
459
- comp_profiles = external_profiles
460
- main_ids = profiles_ids
461
- main_profiles = @profiles
462
- end
463
- # Compare
464
- while !main_ids.empty?
465
- curr_id = main_ids.shift
466
- current_profile = main_profiles[curr_id]
467
- comp_ids.each do |id|
468
- profile = comp_profiles[id]
469
- value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional)
470
- query = profiles_similarity[curr_id]
471
- if query.nil?
472
- profiles_similarity[curr_id] = {id => value}
473
- else
474
- query[id] = value
475
- end
476
- end
477
- end
478
- return profiles_similarity
479
- end
480
-
481
-
482
- # Expand alternative IDs arround all already stored terms
483
- # ===== Parameters
484
- # +alt_tag+:: tag used to expand alternative IDs
485
- # ===== Returns
486
- # true if process ends without errors and false in other cases
487
- def get_index_alternatives(alt_tag: @@basic_tags[:alternative][0])
488
- # Check input
489
- raise('stanzas terms empty') if @stanzas[:terms].empty?
490
- # Take all alternative IDs
491
- alt_ids2add = {}
492
- @stanzas[:terms].each do |id, tags|
493
- alt_ids = tags[alt_tag]
494
- if !alt_ids.nil?
495
- alt_ids = alt_ids - @removable_terms
496
- # Update info
497
- alt_ids.each do |alt_term|
498
- @alternatives_index[alt_term] = id
499
- alt_ids2add[alt_term] = @stanzas[:terms][id] if !@stanzas[:terms].include?(alt_term)
500
- @ancestors_index[alt_term] = @ancestors_index[id] if !@ancestors_index[id].nil?
501
- end
502
- end
503
- end
504
- @stanzas[:terms].merge!(alt_ids2add)
505
- end
506
-
507
-
508
- # Executes basic expansions of tags (alternatives, obsoletes and parentals) with default values
509
- # ===== Returns
510
- # true if eprocess ends without errors and false in other cases
511
- def build_index()
512
- self.get_index_alternatives
513
- self.get_index_obsoletes
514
- self.get_index_child_parent_relations
515
- @alternatives_index.map{|k,v| @alternatives_index[k] = self.extract_id(v)}
516
- @alternatives_index.compact!
517
- @obsoletes_index.map{|k,v| @obsoletes_index[k] = self.extract_id(v)}
518
- @obsoletes_index.compact!
519
- @ancestors_index.map{|k,v| @ancestors_index[k] = v.map{|t| self.extract_id(t)}.compact}
520
- @ancestors_index.compact!
521
- @descendants_index.map{|k,v| @descendants_index[k] = v.map{|t| self.extract_id(t)}.compact}
522
- @descendants_index.compact!
523
- self.get_index_frequencies
524
- self.calc_dictionary(:name)
525
- self.calc_dictionary(:synonym, select_regex: /\"(.*)\"/)
526
- self.calc_term_levels(calc_paths: true)
527
- end
528
-
529
-
530
- # Calculates regular frequencies based on ontology structure (using parentals)
531
- # ===== Returns
532
- # true if everything end without errors and false in other cases
533
- def get_index_frequencies()
534
- # Check
535
- if @ancestors_index.empty?
536
- warn('ancestors_index object is empty')
537
- else
538
- # Prepare useful variables
539
- alternative_terms = @alternatives_index.keys
540
- # Per each term, add frequencies
541
- @stanzas[:terms].each do |id, tags|
542
- if @alternatives_index.include?(id)
543
- alt_id = @alternatives_index[id]
544
- query = @meta[alt_id] # Check if exist
545
- if query.nil?
546
- query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
547
- @meta[alt_id] = query
548
- end
549
- @meta[id] = query
550
- # Note: alternative terms do not increase structural frequencies
551
- else # Official term
552
- query = @meta[id] # Check if exist
553
- if query.nil?
554
- query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
555
- @meta[id] = query
556
- end
557
- # Store metadata
558
- query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !alternative_terms.include?(anc)}.to_f : 0.0
559
- query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !alternative_terms.include?(desc)}.to_f : 0.0
560
- query[:struct_freq] = query[:descendants] + 1.0
561
- # Update maximums
562
- @max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
563
- @max_freqs[:max_depth] = query[:descendants] if @max_freqs[:max_depth] < query[:descendants]
564
- end
565
- end
566
- end
567
- end
568
-
569
-
570
- # Expand obsoletes set and link info to their alternative IDs
571
- # ===== Parameters
572
- # +obs_tags+:: tags to be used to find obsoletes
573
- # +alt_tags+:: tags to find alternative IDs (if are available)
574
- # +reset_obsoletes+:: flag to indicate if obsoletes set must be reset. Default: true
575
- # ===== Returns
576
- # true if process ends without errors and false in other cases
577
- def get_index_obsoletes(obs_tag: @@basic_tags[:obsolete], alt_tags: @@basic_tags[:alternative])
578
- if @stanzas[:terms].empty?
579
- warn('stanzas terms empty')
580
- else
581
- # Check obsoletes
582
- @stanzas[:terms].each do |id, term_tags|
583
- next if term_tags.nil?
584
- query = term_tags[obs_tag]
585
- if !query.nil? && query == 'true' # Obsolete tag presence
586
- next if !@obsoletes_index[id].nil? # Already stored
587
- # Check if alternative value is available
588
- alt_ids = alt_tags.map{|alt| term_tags[alt]}.compact
589
- if !alt_ids.empty?
590
- alt_id = alt_ids.first.first #FIRST tag, FIRST id
591
- # Store
592
- @alternatives_index[id] = alt_id
593
- @obsoletes_index[id] = alt_id
594
- end
595
- end
596
- end
597
- end
598
- end
599
-
600
-
601
- # Expand parentals set and link all info to their alternative IDs. Also launch frequencies process
602
- # ===== Parameters
603
- # +tag+:: tag used to expand parentals
604
- # +split_info_char+:: special regex used to split info (if it is necessary)
605
- # +split_info_indx+:: special index to take splitted info (if it is necessary)
606
- # ===== Returns
607
- # true if process ends without errors and false in other cases
608
- def get_index_child_parent_relations(tag: @@basic_tags[:ancestors][0])
609
- # Check
610
- if @stanzas[:terms].nil?
611
- warn('stanzas terms empty')
612
- else
613
- # Expand
614
- structType, parentals = self.class.get_related_ids_by_tag(terms: @stanzas[:terms],
615
- target_tag: tag,
616
- alt_ids: @alternatives_index,
617
- obsoletes: @obsoletes_index.length)
618
- # Check
619
- raise('Error expanding parentals') if (structType.nil?) || parentals.nil?
620
- # Prepare ancestors structure
621
- anc = {}
622
- des = {}
623
- parentals.each do |id, parents|
624
- parents = parents - @removable_terms
625
- anc[id] = parents
626
- parents.each do |anc_id| # Add descendants
627
- if !des.include?(anc_id)
628
- des[anc_id] = [id]
629
- else
630
- des[anc_id] << id
631
- end
632
- end
633
- end
634
- # Store alternatives
635
- @alternatives_index.each do |id,alt|
636
- anc[id] = anc[alt] if anc.include?(alt)
637
- des[id] = des[alt] if des.include?(alt)
638
- end
639
- # Check structure
640
- if ![:atomic,:sparse].include? structType
641
- structType = structType == :circular ? :circular : :hierarchical
642
- end
643
- # Store
644
- @ancestors_index = anc
645
- @descendants_index = des
646
- @structureType = structType
647
- end
648
- # Finish
649
- end
650
-
651
-
652
- # Find ancestors of a given term
653
- # ===== Parameters
654
- # +term+:: to be checked
655
- # +filter_alternatives+:: if true, remove alternatives from final results
656
- # ===== Returns
657
- # an array with all ancestors of given term or false if parents are not available yet
658
- def get_ancestors(term, filter_alternatives = false)
659
- return self.get_familiar(term, true, filter_alternatives)
660
- end
661
-
662
-
663
- # Find descendants of a given term
664
- # ===== Parameters
665
- # +term+:: to be checked
666
- # +filter_alternatives+:: if true, remove alternatives from final results
667
- # ===== Returns
668
- # an array with all descendants of given term or false if parents are not available yet
669
- def get_descendants(term, filter_alternatives = false)
670
- return self.get_familiar(term, false, filter_alternatives)
671
- end
672
-
673
-
674
- # Find ancestors/descendants of a given term
675
- # ===== Parameters
676
- # +term+:: to be checked
677
- # +return_ancestors+:: return ancestors if true or descendants if false
678
- # +filter_alternatives+:: if true, remove alternatives from final results
679
- # ===== Returns
680
- # an array with all ancestors/descendants of given term or nil if parents are not available yet
681
- def get_familiar(term, return_ancestors = true, filter_alternatives = false)
682
- # Find into parentals
683
- familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
684
- if !familiars.nil?
685
- familiars = familiars.clone
686
- if filter_alternatives
687
- familiars.reject!{|fm| @alternatives_index.include?(fm)}
688
- end
689
- else
690
- familiars = []
691
- end
692
- return familiars
693
- end
694
-
695
-
696
- # Obtain IC of an specific term
697
- # ===== Parameters
698
- # +term+:: which IC will be calculated
699
- # +type+:: of IC to be calculated. Default: resnik
700
- # +force+:: force re-calculate the IC. Do not check if it is already calculated
701
- # +zhou_k+:: special coeficient for Zhou IC method
702
- # ===== Returns
703
- # the IC calculated
704
- def get_IC(termRaw, type: :resnik, force: false, zhou_k: 0.5)
705
- term = termRaw.to_sym
706
- # Check
707
- raise ArgumentError, "IC type specified (#{type}) is not allowed" if !@@allowed_calcs[:ics].include?(type)
708
- # Check if it's already calculated
709
- return @ics[type][term] if (@ics[type].include? term) && !force
710
- # Calculate
711
- ic = - 1
712
- case type # https://arxiv.org/ftp/arxiv/papers/1310/1310.8059.pdf ||| https://sci-hub.st/https://doi.org/10.1016/j.eswa.2012.01.082
713
- ###########################################
714
- #### STRUCTURE BASED METRICS
715
- ###########################################
716
- # Shortest path
717
- # Weighted Link
718
- # Hirst and St-Onge Measure
719
- # Wu and Palmer
720
- # Slimani
721
- # Li
722
- # Leacock and Chodorow
723
- ###########################################
724
- #### INFORMATION CONTENT METRICS
725
- ###########################################
726
- when :resnik # Resnik P: Using Information Content to Evaluate Semantic Similarity in a Taxonomy
727
- # -log(Freq(x) / Max_Freq)
728
- ic = -Math.log10(@meta[term][:struct_freq].fdiv(@max_freqs[:struct_freq]))
729
- when :resnik_observed
730
- # -log(Freq(x) / Max_Freq)
731
- ic = -Math.log10(@meta[term][:observed_freq].fdiv(@max_freqs[:observed_freq]))
732
- # Lin
733
- # Jiang & Conrath
734
-
735
- ###########################################
736
- #### FEATURE-BASED METRICS
737
- ###########################################
738
- # Tversky
739
- # x-similarity
740
- # Rodirguez
741
-
742
- ###########################################
743
- #### HYBRID METRICS
744
- ###########################################
745
- when :seco, :zhou # SECO:: An intrinsic information content metric for semantic similarity in WordNet
746
- # 1 - ( log(hypo(x) + 1) / log(max_nodes) )
747
- ic = 1 - Math.log10(@meta[term][:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
748
- if :zhou # New Model of Semantic Similarity Measuring in Wordnet
749
- # k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
750
- @ics[:seco][term] = ic # Special store
751
- ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(@meta[term][:descendants]).fdiv(Math.log10(@max_freqs[:max_depth])))
752
- end
753
- when :sanchez # Semantic similarity estimation in the biomedical domain: An ontology-basedinformation-theoretic perspective
754
- ic = -Math.log10((@meta[term][:descendants].fdiv(@meta[term][:ancestors]) + 1.0).fdiv(@max_freqs[:max_depth] + 1.0))
755
- # Knappe
756
- end
757
- @ics[type][term] = ic
758
- return ic
759
- end
760
-
761
-
762
# Calculates resnik ICs (ontology-based and observed-frequency-based) for every
# term that appears in the stored profiles.
# ===== Returns
# two hashes with resnik and resnik_observed ICs for observed terms
def get_observed_ics_by_onto_and_freq
  # No profiles loaded => nothing observed
  return {}, {} if @profiles.empty?
  seen_terms = @profiles.values.flatten.uniq
  # Ensure both IC flavours are cached for every observed term
  seen_terms.each do |term|
    get_IC(term)
    get_IC(term, type: :resnik_observed)
  end
  onto_ics = @ics[:resnik].select { |term, _ic| seen_terms.include?(term) }
  freq_ics = @ics[:resnik_observed].select { |term, _ic| seen_terms.include?(term) }
  return onto_ics.clone, freq_ics.clone
end
780
-
781
-
782
# Finds the IC of the Most Informative Common Ancestor (MICA) of two terms
# ===== Parameters
# +termA+:: first term to be checked
# +termB+:: second term to be checked
# +ic_type+:: IC formula to be used
# ===== Returns
# the IC of MICA(termA, termB), or nil when no MICA exists
def get_ICMICA(termA, termB, ic_type = :resnik)
  mica_term, mica_ic = get_MICA(termA, termB, ic_type)
  mica_term.nil? ? nil : mica_ic
end
793
-
794
-
795
# Find the Most Informative Common Ancestor (MICA) of two given terms,
# i.e. the shared ancestor with the highest IC under the chosen formula.
# ===== Parameters
# +termA+:: term to be cheked
# +termB+:: term to be checked
# +ic_type+:: IC formula to be used
# ===== Returns
# a pair [mica_term, ic]; [nil, -1.0] when the two terms share no ancestor
def get_MICA(termA, termB, ic_type = :resnik)
  # Map alternative IDs to their official terms before comparing
  termA = @alternatives_index[termA] if @alternatives_index.include?(termA)
  termB = @alternatives_index[termB] if @alternatives_index.include?(termB)
  mica = [nil,-1.0]
  # Special case: identical terms are their own MICA
  if termA.eql?(termB)
    ic = self.get_IC(termA, type: ic_type)
    mica = [termA, ic]
  else
    # Obtain ancestors (include itselfs too)
    anc_A = self.get_ancestors(termA)
    anc_B = self.get_ancestors(termB)

    if !(anc_A.empty? && anc_B.empty?)
      # NOTE(review): << mutates the arrays returned by get_ancestors; if those are
      # references to internal index arrays the query terms accumulate there — confirm
      # get_ancestors returns a copy.
      anc_A << termA
      anc_B << termB
      # Find shared ancestors
      shared_ancestors = anc_A & anc_B
      # Find MICA: keep the shared ancestor with the maximum IC
      if shared_ancestors.length > 0
        shared_ancestors.each do |anc|
          ic = self.get_IC(anc, type: ic_type)
          # Check
          mica = [anc,ic] if ic > mica[1]
        end
      end
    end
  end
  return mica
end
832
-
833
-
834
# Calculates the semantic similarity between two terms.
# ===== Parameters
# +termA+:: to be compared
# +termB+:: to be compared
# +type+:: similitude formula to be used (:resnik, :lin, :jiang_conrath)
# +ic_type+:: IC formula to be used
# ===== Returns
# the similarity value, or nil when the terms share no ancestor
# ===== Raises
# ArgumentError when the similarity type is not allowed
def get_similarity(termA, termB, type: :resnik, ic_type: :resnik)
  raise ArgumentError, "SIM type specified (#{type}) is not allowed" unless @@allowed_calcs[:sims].include?(type)
  mica_ic = get_ICMICA(termA, termB, ic_type)
  return nil if mica_ic.nil?
  case type
  when :resnik
    mica_ic
  when :lin
    (2.0 * mica_ic).fdiv(get_IC(termA, type: ic_type) + get_IC(termB, type: ic_type))
  when :jiang_conrath # This is a dissimilarity (distance), not a similarity
    (get_IC(termA, type: ic_type) + get_IC(termB, type: ic_type)) - (2.0 * mica_ic)
  end
end
860
-
861
-
862
# Loads the information stored in an OBO file into this object.
# ===== Parameters
# +file+:: path of the OBO file to be parsed
# +build+:: if true, indexes are built right after parsing
def load(file, build: true)
  obo_data = self.class.load_obo(file)
  @header = obo_data[1]
  @stanzas = obo_data[2]
  # Drop terms flagged as removable before any index is built
  remove_removable
  build_index if build
end
874
-
875
# Drops every ID listed in @removable_terms from the terms stanzas.
def remove_removable()
  return if @removable_terms.empty?
  @removable_terms.each { |term_id| @stanzas[:terms].delete(term_id) }
end
879
-
880
-
881
# Exports this Ontology object in JSON format.
# ===== Parameters
# +file+:: path where the serialized object will be stored
def write(file)
  # Snapshot of every serializable field of the object
  snapshot = {
    header: @header,
    stanzas: @stanzas,
    ancestors_index: @ancestors_index,
    descendants_index: @descendants_index,
    alternatives_index: @alternatives_index,
    obsoletes_index: @obsoletes_index,
    structureType: @structureType,
    ics: @ics,
    meta: @meta,
    special_tags: @special_tags,
    max_freqs: @max_freqs,
    dicts: @dicts,
    profiles: @profiles,
    profilesDict: @profilesDict,
    items: @items,
    removable_terms: @removable_terms,
    term_paths: @term_paths
  }
  # Serialize to JSON and write to disk (block form closes the handler)
  File.open(file, "w") { |handler| handler.write(snapshot.to_json) }
end
906
-
907
-
908
# Tells whether a string can be parsed as a floating point number.
# ===== Parameters
# +string+:: value to test
# ===== Returns
# true when parseable, false otherwise
def is_number? string
  parsed = Float(string) rescue nil
  !parsed.nil?
end
911
-
912
-
913
# Read a JSON file with an OBO_Handler object stored and load its content into
# this object's fields, re-symbolizing keys/values that JSON turned into strings.
# ===== Parameters
# +file+:: with object info
# ===== Return
# OBO_Handler internal fields
def read(file)
  # Read file
  # NOTE(review): the File handle is never closed explicitly — relies on GC; consider File.read
  jsonFile = File.open(file)
  jsonInfo = JSON.parse(jsonFile.read, :symbolize_names => true)
  # Pre-process (Symbolize some hashs values)
  jsonInfo[:stanzas][:terms].map{|id,info| self.class.symbolize_ids(info)} # STANZAS
  jsonInfo[:stanzas][:typedefs].map{|id,info| self.class.symbolize_ids(info)}
  jsonInfo[:stanzas][:instances].map{|id,info| self.class.symbolize_ids(info)}
  # Index values are term IDs: convert back to symbols
  jsonInfo[:alternatives_index] = jsonInfo[:alternatives_index].map{|id,value| [id, value.to_sym]}.to_h
  jsonInfo[:ancestors_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}}
  jsonInfo[:descendants_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}}
  jsonInfo[:obsoletes_index] = jsonInfo[:obsoletes_index].map{|id,value| [id, value.to_sym]}.to_h
  # Rebuild each stored dictionary, restoring numeric keys and symbol terms
  jsonInfo[:dicts] = jsonInfo[:dicts].each do |flag, dictionaries|
    # Special case: byTerm
    dictionaries[:byTerm] = dictionaries[:byTerm].map do |term, value|
      if !term.to_s.scan(/\A[-+]?[0-9]*\.?[0-9]+\Z/).empty? # Numeric dictionary
        [term.to_s.to_i, value.map{|term| term.to_sym}]
      elsif value.is_a? Numeric # Numeric dictionary
        [term.to_sym, value]
      elsif value.kind_of?(Array) && flag == :is_a
        [term.to_sym, value.map{|v| v.to_sym}]
      else
        [term.to_sym, value]
      end
    end
    dictionaries[:byTerm] = dictionaries[:byTerm].to_h
    # By value
    dictionaries[:byValue] = dictionaries[:byValue].map do |value, term|
      if value.is_a? Numeric # Numeric dictionary
        [value, term.to_sym]
      elsif term.is_a? Numeric # Numeric dictionary
        [value.to_s.to_sym, term]
      elsif flag == :is_a
        [value.to_sym, term.map{|v| v.to_sym}]
      elsif term.kind_of?(Array)
        [value.to_sym, term.map{|t| t.to_sym}]
      else
        [value.to_s, term.to_sym]
      end
    end
    dictionaries[:byValue] = dictionaries[:byValue].to_h
  end
  # Profiles: symbolize terms and restore numeric profile IDs
  jsonInfo[:profiles].map{|id,terms| terms.map!{|term| term.to_sym}}
  jsonInfo[:profiles].keys.map{|id| jsonInfo[:profiles][id.to_s.to_i] = jsonInfo[:profiles].delete(id) if self.is_number?(id.to_s)}
  jsonInfo[:profilesDict].map{|term,ids| ids.map!{|id| id.to_sym if !id.is_a?(Numeric)}}
  jsonInfo[:removable_terms] = jsonInfo[:removable_terms].map{|term| term.to_sym}
  # Special tags may be scalar or arrays of tags
  jsonInfo[:special_tags] = jsonInfo[:special_tags].each do |k, v|
    if v.kind_of?(Array)
      jsonInfo[:special_tags][k] = v.map{|tag| tag.to_sym}
    else
      jsonInfo[:special_tags][k] = v.to_sym
    end
  end
  jsonInfo[:items].each{|k,v| jsonInfo[:items][k] = v.map{|item| item.to_sym}}
  jsonInfo[:term_paths].each{|term,info| jsonInfo[:term_paths][term][:paths] = info[:paths].map{|path| path.map{|t| t.to_sym}}}
  # Store info
  @header = jsonInfo[:header]
  @stanzas = jsonInfo[:stanzas]
  @ancestors_index = jsonInfo[:ancestors_index]
  @descendants_index = jsonInfo[:descendants_index]
  @alternatives_index = jsonInfo[:alternatives_index]
  @obsoletes_index = jsonInfo[:obsoletes_index]
  @structureType = jsonInfo[:structureType].to_sym
  @ics = jsonInfo[:ics]
  @meta = jsonInfo[:meta]
  @special_tags = jsonInfo[:special_tags]
  @max_freqs = jsonInfo[:max_freqs]
  @dicts = jsonInfo[:dicts]
  @profiles = jsonInfo[:profiles]
  @profilesDict = jsonInfo[:profilesDict]
  @items = jsonInfo[:items]
  @removable_terms = jsonInfo[:removable_terms]
  @term_paths = jsonInfo[:term_paths]
end
992
-
993
-
994
# Tells whether a given ID is stored as a term in this ontology.
# ===== Parameters
# +id+:: identifier to be checked
# ===== Return
# true when the term exists, false otherwise
def exists? id
  stanzas[:terms].include?(id)
end
1002
-
1003
-
1004
# Tries to obtain a valid term ID from a text, first as-is and then by taking
# the first token after splitting.
# ===== Parameters
# +text+:: to be checked
# +splitBy+:: separator used to isolate the candidate ID
# ===== Return
# the correct ID if it can be found or nil in other cases
def extract_id(text, splitBy: ' ')
  return text if exists?(text)
  candidate = text.to_s.split(splitBy).first.to_sym
  exists?(candidate) ? candidate : nil
end
1017
-
1018
-
1019
# Generate a bidirectional dictionary set (byTerm / byValue) using a specific tag
# over the terms stanzas set. Stores the calculated dictionary into @dicts.
# This function stores first value for multivalue tags and does not handle
# synonyms for byValue dictionaries.
# ===== Parameters
# +tag+:: to be used to calculate dictionary
# +select_regex+:: gives a regex that can be used to modify value to be stored
# +substitute_alternatives+:: flag used to indicate if alternatives must, or not, be replaced by its official ID
# +store_tag+:: flag used to store dictionary. If nil, mandatory tag given will be used
# +multiterm+:: if true, byValue will allow multi-term linkage (array)
# +self_type_references+:: if true, program assumes that references will be between Ontology terms, and its term IDs will be checked
# ===== Return
# void. Stores the calculated bidirectional dictionary into @dicts
def calc_dictionary(tag, select_regex: nil, substitute_alternatives: true, store_tag: nil, multiterm: false, self_type_references: false)
  tag = tag.to_sym
  store_tag = tag if store_tag.nil?
  if @stanzas[:terms].empty?
    warn('Terms are not already loaded. Aborting dictionary calc')
  else
    byTerm = {}
    byValue = {}
    # Calc per term
    @stanzas[:terms].each do |term, tags|
      referenceTerm = term
      # Replace alternative ID by its official term, unless the official is obsolete
      if @alternatives_index.include?(term) && substitute_alternatives # Special case
        referenceTerm = @alternatives_index[term] if !@obsoletes_index.include?(@alternatives_index[term])
      end
      queryTag = tags[tag]
      if !queryTag.nil?
        # Pre-process: keep only the part of each value matched by select_regex
        if !select_regex.nil?
          if queryTag.kind_of?(Array)
            queryTag = queryTag.map{|value| value.scan(select_regex).first}
            queryTag.flatten!
          else
            queryTag = queryTag.scan(select_regex).first
          end
          # NOTE(review): when select_regex has no capture group, scan(...).first is a
          # String (or nil on no match) and compact! would raise — presumably the
          # regexes used always carry a capture group; confirm
          queryTag.compact!
        end
        if queryTag.kind_of?(Array) # Store
          if !queryTag.empty?
            # Merge with previously stored values for the same reference term
            if byTerm.include?(referenceTerm)
              byTerm[referenceTerm] = (byTerm[referenceTerm] + queryTag).uniq
            else
              byTerm[referenceTerm] = queryTag
            end
            if multiterm
              queryTag.each do |value|
                byValue[value] = [] if byValue[value].nil?
                byValue[value] << referenceTerm
              end
            else
              queryTag.each{|value| byValue[value] = referenceTerm}
            end
          end
        else
          # Scalar tag value
          if byTerm.include?(referenceTerm)
            byTerm[referenceTerm] = (byTerm[referenceTerm] + [queryTag]).uniq
          else
            byTerm[referenceTerm] = [queryTag]
          end
          if multiterm
            byValue[queryTag] = [] if byValue[queryTag].nil?
            byValue[queryTag] << referenceTerm
          else
            byValue[queryTag] = referenceTerm
          end
        end
      end
    end

    # Check self-references: normalize stored references to official term IDs
    if self_type_references
      byTerm.map do |term, references|
        corrected_references = references.map do |t|
          checked = self.extract_id(t)
          if checked.nil?
            t
          else
            byValue[checked] = byValue.delete(t) if checked != t && !byValue.keys.include?(checked) # Update in byValue
            checked
          end
        end
        byTerm[term] = corrected_references.uniq
      end
    end

    # Check order: place the stanza's own (reference) values first in each byTerm entry
    byTerm.map do |term,values|
      if self.exists?(term)
        referenceValue = @stanzas[:terms][term][tag]
        if !referenceValue.nil?
          if !select_regex.nil?
            if referenceValue.kind_of?(Array)
              referenceValue = referenceValue.map{|value| value.scan(select_regex).first}
              referenceValue.flatten!
            else
              referenceValue = referenceValue.scan(select_regex).first
            end
            referenceValue.compact!
          end
          if self_type_references
            if referenceValue.kind_of?(Array)
              aux = referenceValue.map{|t| self.extract_id(t)}
            else
              aux = self.extract_id(referenceValue)
            end
            referenceValue = aux if !aux.nil?
          end
          referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
          byTerm[term] = referenceValue + (values - referenceValue)
        end
      end
    end

    # Store
    @dicts[store_tag] = {byTerm: byTerm, byValue: byValue}
  end
end
1138
-
1139
-
1140
# Builds the :is_a dictionary keeping alternative IDs untouched
# (direct-parental dictionary with self-reference checking).
def calc_ancestors_dictionary
  calc_dictionary(:is_a, substitute_alternatives: false, self_type_references: true, multiterm: true)
end
1144
-
1145
-
1146
# Translate a given value using an already calculated dictionary.
# ===== Parameters
# +toTranslate+:: value to be translated using the dictionary
# +tag+:: used to generate the dictionary
# +byValue+:: if true, lookup uses values as keys; otherwise terms as keys (main ID is resolved first)
# ===== Return
# translation
def translate(toTranslate, tag, byValue: true)
  if byValue
    @dicts[tag][:byValue][toTranslate]
  else
    @dicts[tag][:byTerm][get_main_id(toTranslate)]
  end
end
1158
-
1159
-
1160
# Translates a given name into its term ID, falling back to synonyms.
# ===== Parameters
# +name+:: to be translated
# ===== Return
# translated name or nil if it's not stored into this ontology
def translate_name(name)
  translate(name, :name) || translate(name, :synonym)
end
1170
-
1171
-
1172
# Translates several names, collecting both the successful translations and
# the names which couldn't be translated.
# ===== Parameters
# +names+:: array to be translated
# ===== Return
# two arrays with translations and names which couldn't be translated respectively
def translate_names(names)
  rejected = []
  translated = names.each_with_object([]) do |name, acc|
    tr = translate_name(name)
    tr.nil? ? rejected << name : acc << tr
  end
  return translated, rejected
end
1190
-
1191
-
1192
# Translates a given ID to its assigned (main) name.
# ===== Parameters
# +id+:: to be translated
# ===== Return
# main name or nil if it's not included into this ontology
def translate_id(id)
  names = translate(id, :name, byValue: false)
  names&.first
end
1201
-
1202
-
1203
# Translates several IDs and returns translations and the not-allowed IDs list.
# ===== Parameters
# +ids+:: to be translated
# ===== Return
# two arrays with translations and IDs which couldn't be translated respectively
def translate_ids(ids)
  translated = []
  rejected = []
  ids.each do |term_id|
    tr = self.translate_id(term_id.to_sym)
    if !tr.nil?
      translated << tr
    else
      # BUG FIX: previously pushed tr (always nil here) instead of the failing ID,
      # so callers received [nil, nil, ...] instead of the untranslatable IDs
      # (mirrors translate_names, which pushes the rejected name).
      rejected << term_id
    end
  end
  return translated, rejected
end
1221
-
1222
-
1223
# Returns the main ID assigned to a given ID. A non alternative/obsolete ID
# is returned unchanged.
# ===== Parameters
# +id+:: to be translated
# ===== Return
# main ID related to a given ID. Returns nil if given ID is not an allowed ID
def get_main_id(id)
  return nil if !@stanzas[:terms].include? id
  new_id = id
  mainID = @alternatives_index[id]
  # FIX: use boolean && instead of bitwise & — short-circuits and avoids the
  # needless obsoletes lookup when mainID is nil (results identical for booleans)
  new_id = mainID if !mainID.nil? && !@obsoletes_index.include?(mainID)
  return new_id
end
1236
-
1237
-
1238
# Checks a pool of IDs, splitting them into official terms of this ontology
# and rejected identifiers.
# ===== Parameters
# +ids+:: to be checked
# +substitute+:: if true, accepted IDs are replaced by their main ID
# ===== Return
# two arrays with allowed and rejected IDs respectively
def check_ids(ids, substitute: true)
  allowed = []
  rejected = []
  ids.each do |id|
    if @stanzas[:terms].include?(id)
      allowed << (substitute ? get_main_id(id) : id)
    else
      rejected << id
    end
  end
  return allowed, rejected
end
1259
-
1260
-
1261
# Stores a given profile under a specific ID. If the ID is already assigned
# to a profile, it will be replaced.
# ===== Parameters
# +id+:: assigned to profile
# +terms+:: array of terms
# +substitute+:: substitute flag forwarded to check_ids
def add_profile(id, terms, substitute: true)
  warn("Profile assigned to ID (#{id}) is going to be replaced") if @profiles.include? id
  correct_terms, rejected_terms = check_ids(terms, substitute: substitute)
  warn('Given terms contains erroneus IDs. These IDs will be removed') unless rejected_terms.empty?
  # Numeric IDs are kept as-is; everything else is symbolized
  profile_key = id.is_a?(Numeric) ? id : id.to_sym
  @profiles[profile_key] = correct_terms
end
1278
-
1279
-
1280
# Stores a pool of profiles.
# ===== Parameters
# +profiles+:: array/hash of profiles to be stored. If it's an array, numerical IDs will be assigned starting at 0
# +calc_metadata+:: if true, launch calc_profiles_dictionary process
# +reset_stored+:: if true, remove already stored profiles
# +substitute+:: substitute flag forwarded to check_ids
def load_profiles(profiles, calc_metadata: true, reset_stored: false, substitute: false)
  reset_profiles if reset_stored
  if profiles.kind_of?(Array)
    profiles.each_with_index do |items, i|
      add_profile(i, items.map(&:to_sym), substitute: substitute)
    end
  else # Hash
    already_stored = profiles.keys.any? { |id| @profiles.include?(id) }
    warn('Some profiles given are already stored. Stored version will be replaced') if already_stored
    profiles.each { |id, prof| add_profile(id, prof, substitute: substitute) }
  end

  # Refresh observed term frequencies for the new profile set
  add_observed_terms_from_profiles(reset: true)

  calc_profiles_dictionary if calc_metadata
end
1306
-
1307
-
1308
# Internal method used to remove already stored profiles and restore
# observed frequencies to zero.
def reset_profiles
  # Clean profiles storage
  @profiles = {}
  # Reset frequency observed
  @meta.each_value { |info| info[:observed_freq] = 0 }
  @max_freqs[:observed_freq] = 0
end
1316
-
1317
-
1318
# Fetches the profile assigned to a given ID.
# ===== Parameters
# +id+:: profile ID
# ===== Return
# specific profile or nil if it's not stored
def get_profile(id)
  @profiles[id]
end
1327
-
1328
-
1329
# Computes the size of each stored profile.
# ===== Return
# array of profile sizes
def get_profiles_sizes()
  @profiles.values.map(&:length)
end
1336
-
1337
-
1338
# Computes the mean size of the stored profiles.
# ===== Parameters
# +round_digits+:: number of digits to round result. Default: 4
# ===== Returns
# mean size of stored profiles
def get_profiles_mean_size(round_digits: 4)
  total = get_profiles_sizes.sum
  total.fdiv(@profiles.length).round(round_digits)
end
1348
-
1349
-
1350
# Calculates profile sizes and returns the size assigned to the given percentile.
# ===== Parameters
# +perc+:: percentile to be returned
# +increasing_sort+:: flag to indicate if sizes order must be increasing. Default: false
# ===== Returns
# value assigned to the percentile asked
def get_profile_length_at_percentile(perc=50, increasing_sort: false)
  lengths = get_profiles_sizes.sort
  lengths.reverse! unless increasing_sort
  # Take the closest rank which does not overpass the selected percentile
  index = ((perc * (lengths.length - 1)).fdiv(100) - 0.5).round
  index = 0 if index < 0 # Special case (caused by literal calc)
  lengths[index]
end
1364
-
1365
-
1366
# Translates a given profile to term names.
# ===== Parameters
# +prof+:: array of terms to be translated
# ===== Returns
# array of translated terms. Can include nils if some IDs are not allowed
def profile_names(prof)
  prof.map { |term| translate_id(term) }
end
1374
-
1375
-
1376
# Translates a bunch of profiles to their sets of term names.
# ===== Parameters
# +profs+:: array of profiles (stored profiles are used when empty)
# +asArray+:: if true, returns an array of name arrays; otherwise a hash of id => names
# ===== Returns
# translated profiles
def translate_profiles_ids(profs = [], asArray: true)
  profs = @profiles if profs.empty?
  profs = profs.each_with_index.map { |terms, index| [index, terms] }.to_h if profs.kind_of?(Array)
  names_by_id = {}
  profs.each { |id, terms| names_by_id[id] = profile_names(terms) }
  asArray ? names_by_id.values : names_by_id
end
1388
-
1389
-
1390
# Includes as "observed_terms" all terms included into stored profiles
# ===== Parameters
# +reset+:: if true, reset observed freqs already stored before re-calculating
def add_observed_terms_from_profiles(reset: false)
  # NOTE(review): reset here sets observed_freq to -1, while reset_profiles sets it
  # to 0 — presumably -1 is a sentinel consumed by add_observed_terms; confirm intended
  @meta.each{|term, freqs| freqs[:observed_freq] = -1} if reset
  @profiles.each{|id, terms| self.add_observed_terms(terms: terms)}
end
1397
-
1398
-
1399
# Gets a term frequency.
# ===== Parameters
# +term+:: term to be checked
# +type+:: type of frequency to be returned. Allowed: [:struct_freq, :observed_freq]
# ===== Returns
# frequency of term given or nil if term is not allowed
def get_frequency(term, type: :struct_freq)
  # dig returns nil when the term has no metadata entry
  @meta.dig(term, type)
end
1409
-
1410
-
1411
# Gets the structural frequency of a given term.
# ===== Parameters
# +term+:: to be checked
# ===== Returns
# structural frequency of given term or nil if term is not allowed
def get_structural_frequency(term)
  get_frequency(term, type: :struct_freq)
end
1419
-
1420
-
1421
# Gets the observed frequency of a given term.
# ===== Parameters
# +term+:: to be checked
# ===== Returns
# observed frequency of given term or nil if term is not allowed
def get_observed_frequency(term)
  get_frequency(term, type: :observed_freq)
end
1429
-
1430
-
1431
# Calculates frequencies of stored profiles terms
# ===== Parameters
# +ratio+:: if true, frequencies will be returned as ratios between 0 and 1.
# +literal+:: if true, literal terms will be used to calculate frequencies instead translate alternative terms
# +asArray+:: used to transform returned structure format from hash of Term-Frequency to an array of tuples [Term, Frequency]
# +translate+:: if true, term IDs will be translated to their names
# ===== Returns
# stored profiles terms frequencies, sorted by descending frequency when asArray
def get_profiles_terms_frequency(ratio: true, literal: true, asArray: true, translate: true)
  n_profiles = @profiles.length
  if literal
    # Count raw term occurrences across all profiles (alternatives NOT merged)
    freqs = {}
    @profiles.each do |id, terms|
      terms.each do |literalTerm|
        if freqs.include?(literalTerm)
          freqs[literalTerm] += 1
        else
          freqs[literalTerm] = 1
        end
      end
    end
    if (ratio || translate)
      aux_keys = freqs.keys
      aux_keys.each do |term|
        freqs[term] = freqs[term].fdiv(n_profiles) if ratio
        if translate
          tr = self.translate_id(term)
          # NOTE(review): if two distinct IDs translate to the same name, the later
          # entry overwrites the earlier count — confirm acceptable
          freqs[tr] = freqs.delete(term) if !tr.nil?
        end
      end
    end
    if asArray
      freqs = freqs.map{|term, freq| [term, freq]}
      freqs.sort!{|h1, h2| h2[1] <=> h1[1]} # Descending by frequency
    end
  else # Freqs translating alternatives (uses @meta observed frequencies)
    freqs = @meta.select{|id, freqs| freqs[:observed_freq] > 0}.map{|id, freqs| [id, ratio ? freqs[:observed_freq].fdiv(n_profiles) : freqs[:observed_freq]]}
    freqs = freqs.to_h if !asArray
    if translate
      freqs = freqs.map do |term, freq|
        tr = self.translate_id(term)
        tr.nil? ? [term, freq] : [tr, freq]
      end
    end
    if asArray
      freqs = freqs.map{|term, freq| [term, freq]}
      freqs.sort!{|h1, h2| h2[1] <=> h1[1]} # Descending by frequency
    else
      freqs = freqs.to_h
    end
  end
  return freqs
end
1484
-
1485
-
1486
# Cleans a given profile, returning the cleaned set of terms and the removed
# ancestor terms.
# ===== Parameters
# +prof+:: array of terms to be checked
# ===== Returns
# two arrays: the cleaned profile and the removed elements array
def remove_ancestors_from_profile(prof)
  all_ancestors = prof.flat_map { |term| get_ancestors(term) }.uniq
  kept, redundant = prof.partition { |term| !all_ancestors.include?(term) }
  return kept, redundant
end
1496
-
1497
-
1498
# Removes alternative IDs if their official ID is present. DOES NOT REMOVE
# synonyms or alternative IDs of the same official ID.
# ===== Parameters
# +prof+:: array of terms to be checked
# ===== Returns
# two arrays: the cleaned profile and the removed elements array
def remove_alternatives_from_profile(prof)
  redundant = prof.select do |term|
    @alternatives_index.include?(term) && prof.include?(@alternatives_index[term])
  end
  return prof - redundant, redundant
end
1508
-
1509
-
1510
# Removes ancestor terms and, optionally, alternative IDs (when the official
# term is present) from a given profile.
# ===== Parameters
# +profile+:: profile to be cleaned
# +remove_alternatives+:: if true, alternative IDs are also removed
# ===== Returns
# cleaned profile
def clean_profile(profile, remove_alternatives: true)
  cleaned, _removed = remove_ancestors_from_profile(profile)
  cleaned, _removed = remove_alternatives_from_profile(cleaned) if remove_alternatives
  cleaned
end
1525
-
1526
-
1527
# Cleans every stored profile (removing ancestors and, optionally, alternatives).
# ===== Parameters
# +store+:: if true, cleaned profiles will replace already stored profiles
# +remove_alternatives+:: if true, alternative IDs are also removed
# ===== Returns
# a hash with cleaned profiles
def clean_profiles(store: false, remove_alternatives: true)
  cleaned = {}
  @profiles.each do |id, terms|
    cleaned[id] = clean_profile(terms, remove_alternatives: remove_alternatives)
  end
  @profiles = cleaned if store
  cleaned
end
1539
-
1540
-
1541
# Calculates the number of redundant parental terms present in each stored profile.
# ===== Returns
# array of parentals for each profile
def parentals_per_profile
  cleaned = clean_profiles(remove_alternatives: false)
  @profiles.map { |id, terms| terms.length - cleaned[id].length }
end
1549
-
1550
-
1551
# Calculates the mean IC of a given profile.
# ===== Parameters
# +prof+:: profile to be checked
# +ic_type+:: ic_type to be used
# +zhou_k+:: special coeficient for Zhou IC method
# ===== Returns
# mean IC for a given profile
def get_profile_mean_IC(prof, ic_type: :resnik, zhou_k: 0.5)
  total_ic = prof.sum { |term| get_IC(term, type: ic_type, zhou_k: zhou_k) }
  total_ic.fdiv(prof.length)
end
1561
-
1562
-
1563
# Calculates resnik (ontology) and resnik observed mean ICs for all stored profiles.
# ===== Returns
# two hashes (profile ID => IC) for resnik and observed resnik respectively
def get_profiles_resnik_dual_ICs
  onto_based = {}
  freq_based = {}
  @profiles.each do |prof_id, terms|
    onto_based[prof_id] = get_profile_mean_IC(terms, ic_type: :resnik)
    freq_based[prof_id] = get_profile_mean_IC(terms, ic_type: :resnik_observed)
  end
  return onto_based.clone, freq_based.clone
end
1575
-
1576
-
1577
# Calculates ontology structural levels for all ontology terms
# ===== Parameters
# +calc_paths+:: calculates term paths if it's not already calculated
# +shortest_path+:: if true, level is calculated with shortest path, largest path will be used in other cases
def calc_term_levels(calc_paths: false, shortest_path: true)
  if @term_paths.empty?
    if calc_paths
      self.calc_term_paths
    else
      warn('Term paths are not already loaded. Aborting dictionary calc')
    end
  end
  if !@term_paths.empty?
    byTerm = {}
    byValue = {}
    # Calc per term
    @term_paths.each do |term, info|
      level = shortest_path ? info[:shortest_path] : info[:largest_path]
      # -1 marks terms with no computed level
      if level.nil?
        level = -1
      else
        level = level.round(0)
      end
      byTerm[term] = level
      # Group terms by level for the reverse lookup
      queryLevels = byValue[level]
      if queryLevels.nil?
        byValue[level] = [term]
      else
        byValue[level] << term
      end
    end
    # The two maps are intentionally swapped when stored: :byTerm maps level => [terms]
    @dicts[:level] = {byTerm: byValue, byValue: byTerm} # Note: in this case, value has multiplicity and term is unique value
    # Update maximum depth
    @max_freqs[:max_depth] = byValue.keys.max
  end
end
1613
-
1614
-
1615
# Tells whether a given term is marked as obsolete.
def is_obsolete? term
  @obsoletes_index.include?(term)
end
1619
-
1620
# Tells whether a given term is marked as an alternative ID.
def is_alternative? term
  @alternatives_index.include?(term)
end
1624
-
1625
- # Find paths of a term following it ancestors and stores all possible paths for it and it's parentals.
1626
- # Also calculates paths metadata and stores into @term_paths
1627
- def calc_term_paths
1628
- self.calc_ancestors_dictionary if !@dicts.keys.include?(:is_a) # Calculate direct parentals dictionary if it's not already calculated
1629
- visited_terms = []
1630
- @term_paths = {}
1631
- if [:hierarchical, :sparse].include? @structureType
1632
- terms = @stanzas[:terms].keys
1633
- terms.each do |term|
1634
- if self.is_obsolete?(term) || self.is_alternative?(term) # Special case (obsoletes)
1635
- special_term = term
1636
- term = self.is_obsolete?(term) ? @obsoletes_index[term] : @alternatives_index[term]
1637
- @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
1638
- @term_paths[special_term] = @term_paths[term]
1639
- visited_terms << special_term
1640
- end
1641
-
1642
- if !visited_terms.include?(term)
1643
- @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
1644
- parentals = @dicts[:is_a][:byTerm][term]
1645
- if parentals.nil?
1646
- @term_paths[term][:paths] << [term]
1647
- else
1648
- parentals.each do |direct_parental|
1649
- if visited_terms.include? direct_parental # Use direct_parental already calculated paths
1650
- new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
1651
- else # Calculate new paths
1652
- self.expand_path(direct_parental, visited_terms)
1653
- new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
1654
- end
1655
- new_paths.each{|path| @term_paths[term][:paths] << path}
1656
- end
1657
- end
1658
- visited_terms << term
1659
- end
1660
- # Update metadata
1661
- @term_paths[term][:total_paths] = @term_paths[term][:paths].length
1662
- paths_sizes = @term_paths[term][:paths].map{|path| path.length}
1663
- @term_paths[term][:largest_path] = paths_sizes.max
1664
- @term_paths[term][:shortest_path] = paths_sizes.min
1665
- end
1666
- else
1667
- warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
1668
- end
1669
- end
1670
-
1671
-
1672
- # Recursive function whic finds paths of a term following it ancestors and stores all possible paths for it and it's parentals
1673
- # ===== Parameters
1674
- # +curr_term+:: current visited term
1675
- # +visited_terms+:: already expanded terms
1676
- def expand_path(curr_term, visited_terms)
1677
- if !visited_terms.include?(curr_term) # Not already expanded
1678
- @term_paths[curr_term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if @term_paths[curr_term].nil?
1679
- direct_parentals = @dicts[:is_a][:byTerm][curr_term]
1680
- if direct_parentals.nil? # No parents :: End of recurrence
1681
- @term_paths[curr_term][:paths] << [curr_term]
1682
- else # Expand and concat
1683
- direct_parentals.each do |ancestor|
1684
- self.expand_path(ancestor,visited_terms) if !visited_terms.include?(ancestor)
1685
- new_paths = @term_paths[ancestor][:paths].map{|path| [curr_term, path].flatten}
1686
- new_paths.each{|path| @term_paths[curr_term][:paths] << path}
1687
- end
1688
- end
1689
- visited_terms << curr_term
1690
- end
1691
- end
1692
-
1693
-
1694
- # Gets ontology levels calculated
1695
- # ===== Returns
1696
- # ontology levels calculated
1697
- def get_ontology_levels
1698
- return @dicts[:level][:byTerm].clone # By term, in this case, is Key::Level, Value::Terms
1699
- end
1700
-
1701
-
1702
- # Gets ontology level of a specific term
1703
- # ===== Returns
1704
- # Term level
1705
- def get_term_level(term)
1706
- return @dicts[:level][:byValue][term]
1707
- end
1708
-
1709
-
1710
- # Return ontology levels from profile terms
1711
- # ===== Returns
1712
- # hash of term levels (Key: level; Value: array of term IDs)
1713
- def get_ontology_levels_from_profiles(uniq = true) # TODO: remove uniq and check dependencies
1714
- profiles_terms = @profiles.values.flatten
1715
- profiles_terms.uniq! if uniq
1716
- term_freqs_byProfile = {}
1717
- profiles_terms.each do |term|
1718
- query = term_freqs_byProfile[term]
1719
- if query.nil?
1720
- term_freqs_byProfile[term] = 1
1721
- else
1722
- term_freqs_byProfile[term] += 1
1723
- end
1724
- end
1725
- levels_filtered = @dicts[:level][:byTerm].map{|level, terms| [level,terms.map{|t| profiles_terms.include?(t) ? Array.new(term_freqs_byProfile[t], t) : nil}.flatten.compact]}.select{|level, filteredTerms| !filteredTerms.empty?}.to_h
1726
- return levels_filtered
1727
- end
1728
-
1729
-
1730
- # Calculate profiles dictionary with Key= Term; Value = Profiles
1731
- def calc_profiles_dictionary
1732
- if @profiles.empty?
1733
- warn('Profiles are not already loaded. Aborting dictionary calc')
1734
- else
1735
- byTerm = {} # Key: Terms
1736
- # byValue -- Key: Profile == @profiles
1737
- @profiles.each do |id, terms|
1738
- terms.each do |term|
1739
- if byTerm.include?(term)
1740
- byTerm[term] << id
1741
- else
1742
- byTerm[term] = [id]
1743
- end
1744
- end
1745
- end
1746
- @profilesDict = byTerm
1747
- end
1748
- end
1749
-
1750
-
1751
- # Gets profiles dictionary calculated
1752
- # ===== Return
1753
- # profiles dictionary (clone)
1754
- def get_terms_linked_profiles
1755
- return @profilesDict.clone
1756
- end
1757
-
1758
-
1759
- # Get related profiles to a given term
1760
- # ===== Parameters
1761
- # +term+:: to be checked
1762
- # ===== Returns
1763
- # profiles which contains given term
1764
- def get_term_linked_profiles(term)
1765
- return @profilesDict[term]
1766
- end
1767
-
1768
-
1769
- # Gets metainfo table from a set of terms
1770
- # ===== Parameters
1771
- # +terms+:: IDs to be expanded
1772
- # +filter_alternatives+:: flag to be used in get_descendants method
1773
- # ===== Returns
1774
- # an array with triplets [TermID, TermName, DescendantsNames]
1775
- def get_childs_table(terms, filter_alternatives = false)
1776
- expanded_terms = []
1777
- terms.each do |t|
1778
- expanded_terms << [[t, self.translate_id(t)], self.get_descendants(t, filter_alternatives).map{|child| [child, self.translate_id(child)]}]
1779
- end
1780
- return expanded_terms
1781
- end
1782
-
1783
-
1784
- # Store specific relations hash given into ITEMS structure
1785
- # ===== Parameters
1786
- # +relations+:: to be stored
1787
- # +remove_old_relations+:: substitute ITEMS structure instead of merge new relations
1788
- # +expand+:: if true, already stored keys will be updated with the unique union of both sets
1789
- def load_item_relations_to_terms(relations, remove_old_relations = false, expand = false)
1790
- @items = {} if remove_old_relations
1791
- if !relations.select{|term, items| !@stanzas[:terms].include?(term)}.empty?
1792
- warn('Some terms specified are not stored into this ontology. These not correct terms will be stored too')
1793
- end
1794
- if !remove_old_relations
1795
- if !relations.select{|term, items| @items.include?(term)}.empty? && !expand
1796
- warn('Some terms given are already stored. Stored version will be replaced')
1797
- end
1798
- end
1799
- if expand
1800
- relations.each do |k,v|
1801
- if @items.keys.include?(k)
1802
- @items[k] = (@items[k] + v).uniq
1803
- else
1804
- @items[k] = v
1805
- end
1806
- end
1807
- else
1808
- @items.merge!(relations)
1809
- end
1810
- end
1811
-
1812
-
1813
- # Assign a dictionary already calculated as a items set.
1814
- # ===== Parameters
1815
- # +dictID+:: dictionary ID to be stored (:byTerm will be used)
1816
- def set_items_from_dict(dictID, remove_old_relations = false)
1817
- @items = {} if remove_old_relations
1818
- if(@dicts.keys.include?(dictID))
1819
- @items.merge(@dicts[dictID][:byTerm])
1820
- else
1821
- warn('Specified ID is not calculated. Dict will not be added as a items set')
1822
- end
1823
- end
1824
-
1825
-
1826
- # This method computes childs similarity and impute items to it parentals. To do that Item keys must be this ontology allowed terms.
1827
- # Similarity will be calculated by text extact similarity unless an ontology object will be provided. In this case, MICAs will be used
1828
- # ===== Parameters
1829
- # +ontology+:: (Optional) ontology object which items given belongs
1830
- # +minimum_childs+:: minimum of childs needed to infer relations to parental. Default: 2
1831
- # +clean_profiles+:: if true, clena_profiles ontology method will be used over inferred profiles. Only if an ontology object is provided
1832
- # ===== Returns
1833
- # void and update items object
1834
- def expand_items_to_parentals(ontology: nil, minimum_childs: 2, clean_profiles: true)
1835
- # Check item keys
1836
- if @items.empty?
1837
- warn('Items have been not provided yet')
1838
- return nil
1839
- end
1840
- targetKeys = @items.keys.select{|k| self.exists?(k)}
1841
- if targetKeys.length == 0
1842
- warn('Any item key is allowed')
1843
- return nil
1844
- elsif targetKeys.length < @items.keys.length
1845
- warn('Some item keys are not allowed')
1846
- end
1847
-
1848
- # Expand to parentals
1849
- targetKeys << targetKeys.map{|t| self.get_ancestors(t, true)}
1850
- targetKeys.flatten!
1851
- targetKeys.uniq!
1852
-
1853
- # Obtain levels (go from leaves to roots)
1854
- levels = targetKeys.map{|term| self.get_term_level(term)}
1855
- levels.compact!
1856
- levels.uniq!
1857
- levels.sort!
1858
- levels.reverse!
1859
- levels.shift # Leaves are not expandable
1860
-
1861
- # Expand from leaves to roots
1862
- levels.map do |lvl|
1863
- curr_keys = targetKeys.select{|k| self.get_term_level(k) == lvl}
1864
- curr_keys.map do |term_expand|
1865
- to_infer = []
1866
- # Obtain childs
1867
- childs = self.get_descendants(term_expand,true).select{|t| @items.keys.include?(t)}
1868
- # Expand
1869
- if childs.length > 0 && minimum_childs == 1 # Special case
1870
- to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
1871
- elsif childs.length >= minimum_childs
1872
- to_infer = Hash.new(0)
1873
- # Compare
1874
- while childs.length > 1
1875
- curr_term = childs.shift
1876
- childs.each do |compare_term|
1877
- pivot_items = @items[curr_term]
1878
- compare_items = @items[compare_term]
1879
- if ontology.nil? # Exact match
1880
- pivot_items.map do |pitem|
1881
- if compare_items.include?(pitem)
1882
- to_infer[pitem] += 2
1883
- end
1884
- end
1885
- else # Find MICAs
1886
- local_infer = Hash.new(0)
1887
- pivot_items.map do |pitem|
1888
- micas = compare_items.map{|citem| ontology.get_MICA(pitem, citem)}
1889
- maxmica = micas[0]
1890
- micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
1891
- local_infer[maxmica.first] += 1
1892
- end
1893
- compare_items.map do |citem|
1894
- micas = pivot_items.map{|pitem| ontology.get_MICA(pitem, citem)}
1895
- maxmica = micas[0]
1896
- micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
1897
- local_infer[maxmica.first] += 1
1898
- end
1899
- local_infer.each{|t,freq| to_infer[t] += freq if freq >= 2}
1900
- end
1901
- end
1902
- end
1903
- # Filter infer
1904
- to_infer = to_infer.select{|k,v| v >= minimum_childs}
1905
- end
1906
- # Infer
1907
- if to_infer.length > 0
1908
- @items[term_expand] = [] if @items[term_expand].nil?
1909
- if to_infer.kind_of?(Array)
1910
- @items[term_expand] = (@items[term_expand] + to_infer).uniq
1911
- else
1912
- @items[term_expand] = (@items[term_expand] + to_infer.keys).uniq
1913
- end
1914
- @items[term_expand] = ontology.clean_profile(@items[term_expand]) if clean_profiles && !ontology.nil?
1915
- elsif !@items.include?(term_expand)
1916
- targetKeys.delete(term_expand)
1917
- end
1918
- end
1919
- end
1920
- end
1921
-
1922
-
1923
-
1924
- # NO IDEA WHAT THIS DOES. DON'T USE THIS METHODS IS NOT CHECKED
1925
- # ===== Parameters
1926
- # ++::
1927
- # ===== Returns
1928
- # ...
1929
- def compute_relations_to_items(external_item_list, mode, thresold)
1930
- results = []
1931
- penalized_terms = {}
1932
- # terms_levels = get_terms_levels(@items_relations.keys)
1933
- terms_with_items_levels = @items_relations.keys.map{|term| self.get_term_level(term)}.uniq
1934
- terms_levels = self.get_ontology_levels().select{|k,v| terms_with_items_levels.include?(k)}
1935
- terms_levels = terms_levels.each{|level,terms| [level, terms.select{|t| @items_relations.keys.include?(t)}] } # Use only items terms. MAYBE IT'S NOT OUR TARGET (line added by fmj)
1936
- levels = terms_levels.keys.sort
1937
- levels.reverse_each do |level|
1938
- terms_levels[level].each do |term|
1939
- associated_items = @items_relations[term]
1940
- if mode == :elim
1941
- items_to_remove = penalized_terms[term]
1942
- items_to_remove = [] if items_to_remove.nil?
1943
- pval = get_fisher_exact_test(
1944
- external_item_list - items_to_remove,
1945
- associated_items - items_to_remove,
1946
- ((associated_items | external_item_list) - items_to_remove).length
1947
- )
1948
- if pval <= thresold
1949
- parents = get_parents(term) # Save the items for each parent term to remove them later in the fisher test
1950
- parents.each do |prnt|
1951
- query = penalized_terms[prnt]
1952
- if query.nil?
1953
- penalized_terms[prnt] = @items_relations[term].clone # We need a new array to store the following iterations
1954
- else
1955
- query.concat(@items_relations[term])
1956
- end
1957
- end
1958
- end
1959
- end
1960
- results << [term, pval]
1961
- end
1962
- end
1963
- return results
1964
- end
1965
-
1966
-
1967
- # Check if a given ID is a removable (blacklist) term.
1968
- # +DEPRECATED+ use is_removable? instead
1969
- # ===== Parameters
1970
- # +id+:: to be checked
1971
- # ===== Returns
1972
- # true if given term is a removable (blacklist) term or false in other cases
1973
- def is_removable(id)
1974
- warn "[DEPRECATION] `is_removable` is deprecated. Please use `is_removable?` instead."
1975
- return @removable_terms.include?(id.to_sym)
1976
- end
1977
-
1978
- # Check if a given ID is a removable (blacklist) term
1979
- # ===== Parameters
1980
- # +id+:: to be checked
1981
- # ===== Returns
1982
- # true if given term is a removable (blacklist) term or false in other cases
1983
- def is_removable? id
1984
- return @removable_terms.include?(id.to_sym)
1985
- end
1986
-
1987
- ############################################
1988
- # SPECIAL METHODS
1989
- #############################################
1990
- def ==(other)
1991
- self.header == other.header &&
1992
- self.stanzas == other.stanzas &&
1993
- self.ancestors_index == other.ancestors_index &&
1994
- self.alternatives_index == other.alternatives_index &&
1995
- self.obsoletes_index == other.obsoletes_index &&
1996
- self.structureType == other.structureType &&
1997
- self.ics == other.ics &&
1998
- self.meta == other.meta &&
1999
- self.dicts == other.dicts &&
2000
- self.profiles == other.profiles &&
2001
- self.profilesDict == other.profilesDict &&
2002
- (self.items.keys - other.items.keys).empty? &&
2003
- self.removable_terms == other.removable_terms &&
2004
- self.special_tags == other.special_tags &&
2005
- self.items == other.items &&
2006
- self.term_paths == other.term_paths &&
2007
- self.max_freqs == other.max_freqs
7
+ #########################################################
8
+ # AUTHOR NOTES
9
+ #########################################################
10
+
11
+ # 1 - Store @profiles as @stanzas[:instances]
12
+ # 2 - Items values (not keys) are imported as strings, not as symbols (maybe add a flag which indicates if values are, or not, symbols?)
13
+
14
+
15
+ #############################################
16
+ # FIELDS
17
+ #############################################
18
+ # Handled class variables
19
+ # => @@basic_tags :: hash with main OBO structure tags
20
+ # => @@allowed_calcs :: hash with allowed ICs and similaritites calcs
21
+ # => @@symbolizable_ids :: tags which can be symbolized
22
+ # => @@tags_with_trailing_modifiers :: tags which can include extra info after specific text modifiers
23
+ #
24
+ # Handled object variables
25
+ # => @header :: file header (if is available)
26
+ # => @stanzas :: OBO stanzas {:terms,:typedefs,:instances}
27
+ # => @ancestors_index :: hash of ancestors per each term handled with any structure relationships
28
+ # => @descendants_index :: hash of descendants per each term handled with any structure relationships
29
+ # => @alternatives_index :: has of alternative IDs (include alt_id and obsoletes)
30
+ # => @obsoletes_index :: hash of obsoletes and it's new ids
31
+ # => @special_tags :: set of special tags to be expanded (:is_a, :obsolete, :alt_id)
32
+ # => @structureType :: type of ontology structure depending on ancestors relationship. Allowed: {atomic, sparse, circular, hierarchical}
33
+ # => @ics :: already calculated ICs for handled terms and IC types
34
+ # => @meta :: meta_information about handled terms like [ancestors, descendants, struct_freq, observed_freq]
35
+ # => @max_freqs :: maximum freqs found for structural and observed freqs
36
+ # => @dicts :: bidirectional dictionaries with three levels <key|value>: 1º) <tag|hash2>; 2º) <(:byTerm/:byValue)|hash3>; 3º) dictionary <k|v>
37
+ # => @profiles :: set of terms assigned to an ID
38
+ # => @profilesDict :: set of profile IDs assigned to a term
39
+ # => @items :: hash with items relations to terms
40
+ # => @removable_terms :: array of terms to not be considered
41
+ # => @term_paths :: metainfo about parental paths of each term
42
+
43
+ @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:replaced_by,:consider,:alt_id]}
44
+ @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
45
+ @@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider]
46
+ @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
47
+ @@multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider, :subsetdef, :synonymtypedef, :property_value, :remark]
48
+ @@symbolizable_ids.concat(@@tags_with_trailing_modifiers)
49
+
50
+ #############################################
51
+ # CONSTRUCTOR
52
+ #############################################
53
+
54
+ # Instantiate a OBO_Handler object
55
+ # ===== Parameters
56
+ # +file+:: with info to be loaded (.obo ; .json)
57
+ # +load_file+:: activate load process automatically
58
+ # +removable_terms+: term to be removed from calcs
59
+ # +build+: flag to launch metainfo calculation
60
+ # +file_format+: force format type despite file extension. Can be :obo or :json
61
+ def initialize(file: nil, load_file: false, removable_terms: [], build: true, file_format: nil)
62
+ # Initialize object variables
63
+ @header = nil
64
+ @stanzas = {terms: {}, typedefs: {}, instances: {}}
65
+ @ancestors_index = {}
66
+ @descendants_index = {}
67
+ @alternatives_index = {}
68
+ @obsoletes_index = {}
69
+ @structureType = nil
70
+ @ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
71
+ @meta = {}
72
+ @special_tags = @@basic_tags.clone
73
+ @max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
74
+ @dicts = {}
75
+ @profiles = {}
76
+ @profilesDict = {}
77
+ @items = {}
78
+ @removable_terms = []
79
+ @term_paths = {}
80
+ add_removable_terms(removable_terms) if !removable_terms.empty?
81
+ load_file = true unless file.nil? # This should remove load_file argument, keep it for old scripts
82
+ # Load if proceeds
83
+ if load_file
84
+ fformat = file_format
85
+ fformat = File.extname(file) if fformat.nil? && !file.nil?
86
+ if fformat == :obo || fformat == ".obo"
87
+ load(file, build: build)
88
+ elsif fformat == :json || fformat == ".json"
89
+ self.read(file, build: build)
90
+ elsif !fformat.nil?
91
+ warn 'Format not allowed. Loading process will not be performed'
92
+ end
93
+ end
94
+ end
95
+
96
+
97
+ #############################################
98
+ # CLASS METHODS
99
+ #############################################
100
+
101
+ # Expand a (starting) term using a specific tag and return all extended terms into an array and
102
+ # the relationship structuture observed (hierarchical or circular). If circular structure is
103
+ # foumd, extended array will be an unique vector without starting term (no loops).
104
+ # +Note+: we extremly recomend use get_related_ids_by_tag function instead of it (directly)
105
+ # ===== Parameters
106
+ # +start+:: term where start to expand
107
+ # +terms+:: set to be used to expand
108
+ # +target_tag+:: tag used to expand
109
+ # +eexpansion+:: already expanded info
110
+ # +split_info_char+:: special regex used to split info (if it is necessary)
111
+ # +split_info_indx+:: special index to take splitted info (if it is necessary)
112
+ # +alt_ids+:: set of alternative IDs
113
+ # ===== Returns
114
+ # A vector with the observed structure (string) and the array with extended terms.
115
+ def self.get_related_ids(start_id, terms, target_tag, related_ids = {}, alt_ids = {})
116
+ # Take start_id term available info and already accumulated info
117
+ current_associations = related_ids[start_id]
118
+ current_associations = [] if current_associations.nil?
119
+ return [:no_term,[]] if terms[start_id].nil?
120
+ id_relations = terms[start_id][target_tag]
121
+ return [:source,[]] if id_relations.nil?
122
+
123
+ # Prepare auxiliar variables
124
+ struct = :hierarchical
125
+
126
+ # Study direct extensions
127
+ id_relations = id_relations.clone
128
+ while id_relations.length > 0
129
+ id = id_relations.shift
130
+ id = alt_ids[id].first if alt_ids.include?(id) # NOTE: if you want to persist current ID instead source ID, re-implement this
131
+
132
+ # Handle
133
+ if current_associations.include?(id) # Check if already have been included into this expansion
134
+ struct = :circular
135
+ else
136
+ current_associations << id
137
+ if related_ids.include?(id) # Check if current already has been expanded
138
+ current_associations = current_associations | related_ids[id]
139
+ if current_associations.include?(start_id) # Check circular case
140
+ struct = :circular
141
+ [id, start_id].each{|repeated| current_associations.delete(repeated)}
142
+ end
143
+ else # Expand
144
+ related_ids[start_id] = current_associations
145
+ structExp, current_related_ids = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids) # Expand current
146
+ current_associations = current_associations | current_related_ids
147
+ struct = :circular if structExp == :circular # Check struct
148
+ if current_associations.include?(start_id) # Check circular case
149
+ struct = :circular
150
+ current_associations.delete(start_id)
151
+ end
152
+ end
153
+ end
154
+ end
155
+ related_ids[start_id] = current_associations
156
+
157
+ return struct, current_associations
158
+ end
159
+
160
+
161
+ # Expand terms using a specific tag and return all extended terms into an array and
162
+ # the relationship structuture observed (hierarchical or circular). If circular structure is
163
+ # foumd, extended array will be an unique vector without starting term (no loops)
164
+ # ===== Parameters
165
+ # +terms+:: set to be used to expand
166
+ # +target_tag+:: tag used to expand
167
+ # +split_info_char+:: special regex used to split info (if it is necessary)
168
+ # +split_info_indx+:: special index to take splitted info (if it is necessary)
169
+ # +alt_ids+:: set of alternative IDs
170
+ # +obsoletes+:: integer with the number of obsolete IDs. used to calculate structure type.
171
+ # ===== Returns
172
+ # A vector with the observed structure (string) and the hash with extended terms
173
+ def self.get_related_ids_by_tag(terms:,target_tag:, alt_ids: {}, obsoletes: 0)
174
+ # Define structure type
175
+ structType = :hierarchical
176
+ related_ids = {}
177
+ terms.each do |id, tags|
178
+ # Check if target tag is defined
179
+ if !tags[target_tag].nil?
180
+ # Obtain related terms
181
+ set_structure, _ = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids)
182
+ # Check structure
183
+ structType = :circular if set_structure == :circular
184
+ end
185
+ end
186
+
187
+ # Check special case
188
+ structType = :atomic if related_ids.length <= 0
189
+ structType = :sparse if related_ids.length > 0 && ((terms.length - related_ids.length - obsoletes) >= 2)
190
+ # Return type and hash with related_ids
191
+ return structType, related_ids
192
+ end
193
+
194
+
195
+ # Class method to transform string with <tag : info> into hash structure
196
+ # ===== Parameters
197
+ # +attributes+:: array tuples with info to be transformed into hash format
198
+ # ===== Returns
199
+ # Attributes stored into hash structure
200
+ def self.info2hash(attributes, split_char = " ! ", selected_field = 0)
201
+ # Load info
202
+ info_hash = {}
203
+ # Only TERMS multivalue tags (future add Typedefs and Instance)
204
+ # multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider]
205
+ attributes.each do |tag, value|
206
+ value.gsub!(/{source=[\\\":A-Za-z0-9\/\.\-, =]+} /, '') if tag == 'is_a' # To delete "source" attributes in is_a tag of MONDO ontology
207
+ # Check
208
+ raise EncodingError, 'Info element incorrect format' if (tag.nil?) || (value.nil?)
209
+ # Prepare
210
+ tag = tag.lstrip.to_sym
211
+ value.lstrip!
212
+ value = value.split(split_char)[selected_field].to_sym if @@tags_with_trailing_modifiers.include?(tag)
213
+
214
+ # Store
215
+ query = info_hash[tag]
216
+ if !query.nil? # Tag already exists
217
+ if !query.kind_of?(Array) # Check that tag is multivalue
218
+ raise('Attempt to concatenate plain text with another. The tag is not declared as multivalue. [' + tag.to_s + '](' + query + ')')
219
+ else
220
+ query << value # Add new value to tag
221
+ end
222
+ else # New entry
223
+ if @@multivalue_tags.include?(tag)
224
+ info_hash[tag] = [value]
225
+ else
226
+ info_hash[tag] = value
227
+ end
228
+ end
229
+ end
230
+ self.symbolize_ids(info_hash)
231
+ return info_hash
232
+ end
233
+
234
+
235
+ # Class method to load an OBO format file (based on OBO 1.4 format). Specially focused on load
236
+ # the Header, the Terms, the Typedefs and the Instances.
237
+ # ===== Parameters
238
+ # +file+:: OBO file to be loaded
239
+ # ===== Returns
240
+ # Hash with FILE, HEADER and STANZAS info
241
+ def self.load_obo(file) #TODO: Send to obo_parser class
242
+ raise("File is not defined") if file.nil?
243
+ # Data variables
244
+ header = ''
245
+ stanzas = {terms: {}, typedefs: {}, instances: {}}
246
+ # Auxiliar variables
247
+ infoType = 'Header'
248
+ currInfo = []
249
+ stanzas_flags = %w[[Term] [Typedef] [Instance]]
250
+ # Read file
251
+ File.open(file).each do |line|
252
+ line.chomp!
253
+ next if line.empty?
254
+ fields = line.split(':', 2)
255
+ # Check if new instance is found
256
+ if stanzas_flags.include?(line)
257
+ header = self.process_entity(header, infoType, stanzas, currInfo)
258
+ # Update info variables
259
+ currInfo = []
260
+ infoType = line.gsub!(/[\[\]]/, '')
261
+ next
262
+ end
263
+ # Concat info
264
+ currInfo << fields
265
+ end
266
+ # Store last loaded info
267
+ header = self.process_entity(header, infoType, stanzas, currInfo) if !currInfo.empty?
268
+
269
+ # Prepare to return
270
+ finfo = {:file => file, :name => File.basename(file, File.extname(file))}
271
+ return finfo, header, stanzas
272
+ end
273
+
274
+
275
+ # Handle OBO loaded info and stores it into correct container and format
276
+ # ===== Parameters
277
+ # +header+:: container
278
+ # +infoType+:: current ontology item type detected
279
+ # +stanzas+:: container
280
+ # +currInfo+:: info to be stored
281
+ # ===== Returns
282
+ # header newly/already stored
283
+ def self.process_entity(header, infoType, stanzas, currInfo)
284
+ info = self.info2hash(currInfo)
285
+ # Store current info
286
+ if infoType.eql?('Header')
287
+ header = info
288
+ else
289
+ id = info[:id]
290
+ case infoType
291
+ when 'Term'
292
+ stanzas[:terms][id] = info
293
+ when 'Typedef'
294
+ stanzas[:typedefs][id] = info
295
+ when 'Instance'
296
+ stanzas[:instances][id] = info
297
+ end
298
+ end
299
+ return header
300
+ end
301
+
302
+
303
+ # Symboliza all values into hashs using symbolizable tags as keys
304
+ # ===== Parameters
305
+ # +item_hash+:: hash to be checked
306
+ def self.symbolize_ids(item_hash)
307
+ @@symbolizable_ids.each do |tag|
308
+ query = item_hash[tag]
309
+ if !query.nil?
310
+ if query.kind_of?(Array)
311
+ query.map!{|item| item.to_sym}
312
+ else
313
+ item_hash[tag] = query.to_sym if !query.nil?
314
+ end
315
+ end
316
+ end
317
+ end
318
+
319
+
320
+ #
321
+ # ===== Parameters
322
+ # +root+:: main term to expand
323
+ # +ontology+:: to be cutted
324
+ # +clone+:: if true, given ontology object will not be mutated
325
+ # +remove_up+:: if true, stores only the root term given an it descendants. If false, only root ancestors will be stored
326
+ # ===== Returns
327
+ # An Ontology object with terms after cut the ontology.
328
+ def self.mutate(root, ontology, clone: true, remove_up: true)
329
+ ontology = ontology.clone if clone
330
+ # Obtain affected IDs
331
+ descendants = ontology.descendants_index[root]
332
+ descendants << root # Store itself to do not remove it
333
+ # Remove unnecesary terms
334
+ ontology.stanzas[:terms] = ontology.stanzas[:terms].select{|id,v| remove_up ? descendants.include?(id) : !descendants.include?(id)}
335
+ ontology.ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
336
+ ontology.max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
337
+ ontology.dicts = {}
338
+ ontology.removable_terms = []
339
+ ontology.term_paths = {}
340
+ # Recalculate metadata
341
+ ontology.build_index
342
+ ontology.add_observed_terms_from_profiles
343
+ # Finish
344
+ return ontology
345
+ end
346
+
347
+
348
+
349
+ #############################################
350
+ # GENERAL METHODS
351
+ #############################################
352
+
353
+ # Include removable terms to current removable terms list
354
+ # ===== Parameters
355
+ # +terms+:: terms array to be concatenated
356
+ def add_removable_terms(terms)
357
+ terms = terms.map{|term| term.to_sym}
358
+ @removable_terms.concat(terms)
359
+ end
360
+
361
+
362
+ # Include removable terms to current removable terms list loading new
363
+ # terms from a one column plain text file
364
+ # ===== Parameters
365
+ # +file+:: to be loaded
366
+ def add_removable_terms_from_file(file)
367
+ File.open(excluded_codes_file).each do |line|
368
+ line.chomp!
369
+ @removable_terms << line.to_sym
370
+ end
371
+ end
372
+
373
+
374
  # Increase observed frequency for a specific term
  # ===== Parameters
  # +term+:: term which frequency is going to be increased
  # +increase+:: frequency rate to be increased. Default = 1
  # ===== Return
  # true if process ends without errors, false in other cases
  def add_observed_term(term:,increase: 1.0)
    # Check
    raise ArgumentError, "Term given is NIL" if term.nil?
    return false unless @stanzas[:terms].include?(term)
    return false if @removable_terms.include?(term)
    # Alternative IDs share the very same metadata record as their official
    # term, so increasing either one updates both.
    if @alternatives_index.include?(term)
      alt_id = @alternatives_index[term]
      @meta[alt_id] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[alt_id].nil?
      @meta[term] = @meta[alt_id]
    end
    # Check if exists; lazily create the metadata record (-1.0 marks
    # "not yet computed" for the structural fields)
    @meta[term] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[term].nil?
    # Add frequency: a -1 sentinel means "never observed", reset it first
    @meta[term][:observed_freq] = 0 if @meta[term][:observed_freq] == -1
    @meta[term][:observed_freq] += increase
    # Keep the global maximum observed frequency up to date
    @max_freqs[:observed_freq] = @meta[term][:observed_freq] if @max_freqs[:observed_freq] < @meta[term][:observed_freq]
    return true
  end
399
+
400
+
401
+ # Increase the arbitrary frequency of a given term set
402
+ # ===== Parameters
403
+ # +terms+:: set of terms to be updated
404
+ # +increase+:: amount to be increased
405
+ # +transform_to_sym+:: if true, transform observed terms to symbols. Default: false
406
+ # ===== Return
407
+ # true if process ends without errors and false in other cases
408
+ def add_observed_terms(terms:, increase: 1.0, transform_to_sym: false)
409
+ # Check
410
+ raise ArgumentError, 'Terms array given is NIL' if terms.nil?
411
+ raise ArgumentError, 'Terms given is not an array' if !terms.is_a? Array
412
+ # Add observations
413
+ if transform_to_sym
414
+ checks = terms.map{|id| self.add_observed_term(term: id.to_sym,increase: increase)}
415
+ else
416
+ checks = terms.map{|id| self.add_observed_term(term: id,increase: increase)}
417
+ end
418
+ return checks
419
+ end
420
+
421
+
422
  # Compare two terms sets
  # ===== Parameters
  # +termsA+:: set to be compared
  # +termsB+:: set to be compared
  # +sim_type+:: similitude method to be used. Default: resnik
  # +ic_type+:: ic type to be used. Default: resnik
  # +bidirectional+:: calculate bidirectional similitude. Default: true
  # +store_mica+:: if true, cache pairwise similarities in @mica_index
  #   (assumes @mica_index was initialised by the caller, e.g. compare_profiles
  #   — TODO confirm it is set before standalone calls with store_mica: true)
  # ===== Return
  # similitude calculated
  def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true, store_mica: false)
    # Check
    raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
    raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
    micasA = []
    # Compare A -> B: for each term of A keep the best similarity against B
    termsA.each do |tA|
      micas = []
      termsB.each do |tB|
        if store_mica
          value = @mica_index.dig(tA, tB) # Reuse cached pairwise value if any
        else
          value = nil
        end
        if value.nil?
          value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)
          if store_mica
            value = true if value.nil? # We use true to save that the operation was made but there is not mica value
            add2nestHash(@mica_index, tA, tB, value)
          end
        end
        # Only real similarities count; the `true` cache sentinel is skipped
        micas << value if value.class == Float
      end
      if !micas.empty?
        micasA << micas.max # Obtain maximum value
      else
        micasA << 0 # No comparable pair found for this term
      end
    end
    means_sim = micasA.inject{ |sum, el| sum + el }.fdiv(micasA.size)
    # Compare B -> A and average both directions weighted by set sizes
    if bidirectional
      means_simA = means_sim * micasA.size
      means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false, store_mica: store_mica) * termsB.size
      means_sim = (means_simA + means_simB).fdiv(termsA.size + termsB.size)
    end
    # Return
    return means_sim
  end
470
+
471
+ def add2nestHash(h, key1, key2, val)
472
+ query1 = h[key1]
473
+ if query1.nil?
474
+ h[key1] = {key2 => val}
475
+ else
476
+ query1[key2] = val
477
+ end
478
+ end
479
+
480
  # Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with itself
  # ===== Parameters
  # +external_profiles+:: set of external profiles. If nil, internal profiles will be compared with itself
  # +sim_type+:: similitude method to be used. Default: resnik
  # +ic_type+:: ic type to be used. Default: resnik
  # +bidirectional+:: calculate bidirectional similitude. Default: true
  # ===== Return
  # Similitudes calculated
  def compare_profiles(external_profiles: nil, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
    profiles_similarity = {} #calculate similarity between patients profile
    profiles_ids = @profiles.keys # Fresh array, so @profiles itself is not mutated below
    if external_profiles.nil?
      # Self comparison: comp_ids and main_ids are the SAME array on purpose;
      # as main_ids is consumed, each unordered pair is visited only once.
      comp_ids = profiles_ids
      comp_profiles = @profiles
      main_ids = comp_ids
      main_profiles = comp_profiles
    else
      comp_ids = external_profiles.keys
      comp_profiles = external_profiles
      main_ids = profiles_ids
      main_profiles = @profiles
    end
    # Compare: reset the pairwise similarity cache used by compare(store_mica: true)
    @mica_index = {}
    while !main_ids.empty?
      curr_id = main_ids.shift # Destructive consumption drives the loop
      current_profile = main_profiles[curr_id]
      comp_ids.each do |id|
        profile = comp_profiles[id]
        value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional, store_mica: true)
        query = profiles_similarity[curr_id]
        if query.nil?
          profiles_similarity[curr_id] = {id => value}
        else
          query[id] = value
        end
      end
    end
    return profiles_similarity
  end
520
+
521
+
522
  # Expand alternative IDs around all already stored terms, filling
  # @alternatives_index (alt -> official) and mirroring stanzas/ancestors
  # onto the alternative IDs.
  # ===== Parameters
  # +alt_tag+:: tag used to expand alternative IDs
  # ===== Returns
  # the merged stanzas terms hash (raises if no terms are loaded)
  def get_index_alternatives(alt_tag: @@basic_tags[:alternative].last)
    # Check input
    raise('stanzas terms empty') if @stanzas[:terms].empty?
    # Take all alternative IDs
    alt_ids2add = {}
    @stanzas[:terms].each do |id, tags|
      if id == tags[:id] # Avoid simulated alternative terms
        # id = tags[:id] # Take always real ID in case of alternative terms simulted
        alt_ids = tags[alt_tag]
        if !alt_ids.nil?
          # Never register removable terms or the term itself as alternatives
          alt_ids = alt_ids - @removable_terms - [id]
          # Update info
          alt_ids.each do |alt_term|
            @alternatives_index[alt_term] = id
            # Alternative terms reuse the official term stanza and ancestors
            alt_ids2add[alt_term] = @stanzas[:terms][id] if !@stanzas[:terms].include?(alt_term)
            @ancestors_index[alt_term] = @ancestors_index[id] if !@ancestors_index[id].nil?
          end
        end
      end
    end
    @stanzas[:terms].merge!(alt_ids2add)
  end
549
+
550
+
551
+ # Executes basic expansions of tags (alternatives, obsoletes and parentals) with default values
552
+ # ===== Returns
553
+ # true if eprocess ends without errors and false in other cases
554
+ def build_index()
555
+ self.get_index_obsoletes
556
+ self.get_index_alternatives
557
+ self.get_index_child_parent_relations
558
+ @alternatives_index.each{|k,v| @alternatives_index[k] = self.extract_id(v)}
559
+ ## @alternatives_index.map {|k,v| @alternatives_index[k] = self.stanzas[:terms][v][:id] if k == v} unless self.stanzas[:terms].empty?
560
+ @alternatives_index.compact!
561
+ @obsoletes_index.each{|k,v| @obsoletes_index[k] = self.extract_id(v)}
562
+ @obsoletes_index.compact!
563
+ @ancestors_index.each{|k,v| @ancestors_index[k] = v.map{|t| self.extract_id(t)}.compact}
564
+ @ancestors_index.compact!
565
+ @descendants_index.each{|k,v| @descendants_index[k] = v.map{|t| self.extract_id(t)}.compact}
566
+ @descendants_index.compact!
567
+ self.get_index_frequencies
568
+ self.calc_dictionary(:name)
569
+ self.calc_dictionary(:synonym, select_regex: /\"(.*)\"/)
570
+ self.calc_term_levels(calc_paths: true)
571
+ end
572
+
573
+
574
  # Calculates regular frequencies based on ontology structure (using parentals)
  # ===== Returns
  # nothing meaningful; fills @meta and updates @max_freqs as a side effect
  def get_index_frequencies()
    # Check
    if @ancestors_index.empty?
      warn('ancestors_index object is empty')
    else
      # Per each term, add frequencies
      @stanzas[:terms].each do |id, tags|
        if @alternatives_index.include?(id)
          # Alternative terms point at (and share) the official term's record
          alt_id = @alternatives_index[id]
          query = @meta[alt_id] # Check if exist
          if query.nil?
            query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
            @meta[alt_id] = query
          end
          @meta[id] = query
          # Note: alternative terms do not increase structural frequencies
        else # Official term
          query = @meta[id] # Check if exist
          if query.nil?
            query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
            @meta[id] = query
          end
          # Store metadata: counts exclude alternative IDs so each concept
          # is counted once
          query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !@alternatives_index.include?(anc)}.to_f : 0.0
          query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !@alternatives_index.include?(desc)}.to_f : 0.0
          query[:struct_freq] = query[:descendants] + 1.0 # +1 counts the term itself
          # Update maximums
          @max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
          @max_freqs[:max_depth] = query[:descendants] if @max_freqs[:max_depth] < query[:descendants]
        end
      end
    end
  end
610
+
611
+
612
  # Expand obsoletes set and link info to their alternative IDs
  # ===== Parameters
  # +obs_tag+:: tag to be used to find obsoletes
  # +alt_tags+:: tags to find alternative IDs (if are available)
  # ===== Returns
  # nothing meaningful; fills @obsoletes_index and @alternatives_index as a side effect
  def get_index_obsoletes(obs_tag: @@basic_tags[:obsolete], alt_tags: @@basic_tags[:alternative])
    if @stanzas[:terms].empty?
      warn('stanzas terms empty')
    else
      # Check obsoletes
      @stanzas[:terms].each do |id, term_tags|
        next if term_tags.nil?
        next if self.is_alternative?(id)
        query = term_tags[obs_tag]
        if !query.nil? && query == 'true' # Obsolete tag presence (OBO stores it as the string 'true')
          next if !@obsoletes_index[id].nil? # Already stored
          # Check if alternative value is available
          alt_ids = alt_tags.map{|alt| term_tags[alt]}.compact
          if !alt_ids.empty?
            alt_id = alt_ids.first.first #FIRST tag, FIRST id
            # Store: obsolete terms with no replacement are NOT indexed
            @alternatives_index[id] = alt_id
            @obsoletes_index[id] = alt_id
          end
        end
      end
    end
  end
642
+
643
+
644
  # Expand parentals set, building the ancestors/descendants indexes and
  # detecting the ontology structure type.
  # ===== Parameters
  # +tag+:: tag used to expand parentals (default: first ancestors tag, e.g. :is_a)
  # ===== Returns
  # nothing meaningful; fills @ancestors_index, @descendants_index and
  # @structureType as a side effect
  def get_index_child_parent_relations(tag: @@basic_tags[:ancestors][0])
    # Check
    if @stanzas[:terms].nil?
      warn('stanzas terms empty')
    else
      # Expand full parental closure per term (delegated to the class helper)
      structType, parentals = self.class.get_related_ids_by_tag(terms: @stanzas[:terms],
        target_tag: tag,
        alt_ids: @alternatives_index,
        obsoletes: @obsoletes_index.length)
      # Check
      raise('Error expanding parentals') if (structType.nil?) || parentals.nil?
      # Prepare ancestors structure and invert it into descendants
      anc = {}
      des = {}
      parentals.each do |id, parents|
        parents = parents - @removable_terms
        anc[id] = parents
        parents.each do |anc_id| # Add descendants
          if !des.include?(anc_id)
            des[anc_id] = [id]
          else
            des[anc_id] << id
          end
        end
      end
      # Store alternatives
      # @alternatives_index.each do |id,alt|
      #   anc[id] = anc[alt] if anc.include?(alt)
      #   des[id] = des[alt] if des.include?(alt)
      # end
      # Check structure: anything that is not atomic/sparse/circular is
      # treated as hierarchical
      if ![:atomic,:sparse].include? structType
        structType = structType == :circular ? :circular : :hierarchical
      end
      # Store
      @ancestors_index = anc
      @descendants_index = des
      @structureType = structType
    end
    # Finish
  end
693
+
694
+
695
+ # Find ancestors of a given term
696
+ # ===== Parameters
697
+ # +term+:: to be checked
698
+ # +filter_alternatives+:: if true, remove alternatives from final results
699
+ # ===== Returns
700
+ # an array with all ancestors of given term or false if parents are not available yet
701
+ def get_ancestors(term, filter_alternatives = false)
702
+ return self.get_familiar(term, true, filter_alternatives)
703
+ end
704
+
705
+
706
+ # Find descendants of a given term
707
+ # ===== Parameters
708
+ # +term+:: to be checked
709
+ # +filter_alternatives+:: if true, remove alternatives from final results
710
+ # ===== Returns
711
+ # an array with all descendants of given term or false if parents are not available yet
712
+ def get_descendants(term, filter_alternatives = false)
713
+ return self.get_familiar(term, false, filter_alternatives)
714
+ end
715
+
716
+
717
+ # Find ancestors/descendants of a given term
718
+ # ===== Parameters
719
+ # +term+:: to be checked
720
+ # +return_ancestors+:: return ancestors if true or descendants if false
721
+ # +filter_alternatives+:: if true, remove alternatives from final results
722
+ # ===== Returns
723
+ # an array with all ancestors/descendants of given term or nil if parents are not available yet
724
+ def get_familiar(term, return_ancestors = true, filter_alternatives = false)
725
+ # Find into parentals
726
+ familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
727
+ if !familiars.nil?
728
+ familiars = familiars.clone
729
+ if filter_alternatives
730
+ familiars.reject!{|fm| @alternatives_index.include?(fm)}
731
+ end
732
+ else
733
+ familiars = []
734
+ end
735
+ return familiars
736
+ end
737
+
738
+
739
+ # Obtain IC of an specific term
740
+ # ===== Parameters
741
+ # +term+:: which IC will be calculated
742
+ # +type+:: of IC to be calculated. Default: resnik
743
+ # +force+:: force re-calculate the IC. Do not check if it is already calculated
744
+ # +zhou_k+:: special coeficient for Zhou IC method
745
+ # ===== Returns
746
+ # the IC calculated
747
+ def get_IC(termRaw, type: :resnik, force: false, zhou_k: 0.5)
748
+ term = termRaw.to_sym
749
+ curr_ics = @ics[type]
750
+ # Check
751
+ raise ArgumentError, "IC type specified (#{type}) is not allowed" if !@@allowed_calcs[:ics].include?(type)
752
+ # Check if it's already calculated
753
+ return curr_ics[term] if (curr_ics.include? term) && !force
754
+ # Calculate
755
+ ic = - 1
756
+ term_meta = @meta[term]
757
+ case type # https://arxiv.org/ftp/arxiv/papers/1310/1310.8059.pdf ||| https://sci-hub.st/https://doi.org/10.1016/j.eswa.2012.01.082
758
+ ###########################################
759
+ #### STRUCTURE BASED METRICS
760
+ ###########################################
761
+ # Shortest path
762
+ # Weighted Link
763
+ # Hirst and St-Onge Measure
764
+ # Wu and Palmer
765
+ # Slimani
766
+ # Li
767
+ # Leacock and Chodorow
768
+ ###########################################
769
+ #### INFORMATION CONTENT METRICS
770
+ ###########################################
771
+ when :resnik # Resnik P: Using Information Content to Evaluate Semantic Similarity in a Taxonomy
772
+ # -log(Freq(x) / Max_Freq)
773
+ ic = -Math.log10(term_meta[:struct_freq].fdiv(@max_freqs[:struct_freq]))
774
+ when :resnik_observed
775
+ # -log(Freq(x) / Max_Freq)
776
+ ic = -Math.log10(term_meta[:observed_freq].fdiv(@max_freqs[:observed_freq]))
777
+ # Lin
778
+ # Jiang & Conrath
779
+
780
+ ###########################################
781
+ #### FEATURE-BASED METRICS
782
+ ###########################################
783
+ # Tversky
784
+ # x-similarity
785
+ # Rodirguez
786
+
787
+ ###########################################
788
+ #### HYBRID METRICS
789
+ ###########################################
790
+ when :seco, :zhou # SECO:: An intrinsic information content metric for semantic similarity in WordNet
791
+ # 1 - ( log(hypo(x) + 1) / log(max_nodes) )
792
+ ic = 1 - Math.log10(term_meta[:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
793
+ if :zhou # New Model of Semantic Similarity Measuring in Wordnet
794
+ # k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
795
+ @ics[:seco][term] = ic # Special store
796
+ ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(term_meta[:descendants]).fdiv(Math.log10(@max_freqs[:max_depth])))
797
+ end
798
+ when :sanchez # Semantic similarity estimation in the biomedical domain: An ontology-basedinformation-theoretic perspective
799
+ ic = -Math.log10((term_meta[:descendants].fdiv(term_meta[:ancestors]) + 1.0).fdiv(@max_freqs[:max_depth] + 1.0))
800
+ # Knappe
801
+ end
802
+ curr_ics[term] = ic
803
+ return ic
804
+ end
805
+
806
+
807
+ # Calculates and return resnik ICs (by ontology and observed frequency) for observed terms
808
+ # ===== Returns
809
+ # two hashes with resnik and resnik_observed ICs for observed terms
810
+ def get_observed_ics_by_onto_and_freq
811
+ # Chech there are observed terms
812
+ if @profiles.empty?
813
+ resnik = {}
814
+ resnik_observed = {}
815
+ else
816
+ # Calc ICs for all terms
817
+ observed_terms = @profiles.values.flatten.uniq
818
+ observed_terms.each{ |term| get_IC(term)}
819
+ observed_terms.each{ |term| get_IC(term, type: :resnik_observed)}
820
+ resnik = @ics[:resnik].select{|k,v| observed_terms.include?(k)}
821
+ resnik_observed = @ics[:resnik_observed].select{|k,v| observed_terms.include?(k)}
822
+ end
823
+ return resnik.clone, resnik_observed.clone
824
+ end
825
+
826
+
827
+ # Find the IC of the Most Index Content shared Ancestor (MICA) of two given terms
828
+ # ===== Parameters
829
+ # +termA+:: term to be cheked
830
+ # +termB+:: term to be checked
831
+ # +ic_type+:: IC formula to be used
832
+ # ===== Returns
833
+ # the IC of the MICA(termA,termB)
834
+ def get_ICMICA(termA, termB, ic_type = :resnik)
835
+ term, ic = self.get_MICA(termA, termB, ic_type)
836
+ return term.nil? ? nil : ic
837
+ end
838
+
839
+
840
  # Find the Most Informative Common Ancestor (MICA) of two given terms
  # ===== Parameters
  # +termA+:: term to be cheked
  # +termB+:: term to be checked
  # +ic_type+:: IC formula to be used
  # ===== Returns
  # a two-element array [MICA(termA,termB), its IC]; [nil, -1.0] when the
  # terms share no ancestors
  def get_MICA(termA, termB, ic_type = :resnik)
    # Work on official IDs only
    termA = @alternatives_index[termA] if @alternatives_index.include?(termA)
    termB = @alternatives_index[termB] if @alternatives_index.include?(termB)
    mica = [nil,-1.0]
    # Special case: a term compared with itself is its own MICA
    if termA.eql?(termB)
      ic = self.get_IC(termA, type: ic_type)
      mica = [termA, ic]
    else
      # Obtain ancestors (include itselfs too)
      anc_A = self.get_ancestors(termA)
      anc_B = self.get_ancestors(termB)
      if !(anc_A.empty? && anc_B.empty?)
        anc_A << termA
        anc_B << termB
        (anc_A & anc_B).each do |anc| # Find MICA in shared ancestors
          ic = self.get_IC(anc, type: ic_type)
          mica = [anc,ic] if ic > mica[1]
        end
      end
    end
    return mica
  end
870
+
871
+
872
+ # Calculate similarity between two given terms
873
+ # ===== Parameters
874
+ # +termsA+:: to be compared
875
+ # +termsB+:: to be compared
876
+ # +type+:: similitude formula to be used
877
+ # +ic_type+:: IC formula to be used
878
+ # ===== Returns
879
+ # the similarity between both sets or false if frequencies are not available yet
880
+ def get_similarity(termA, termB, type: :resnik, ic_type: :resnik)
881
+ # Check
882
+ raise ArgumentError, "SIM type specified (#{type}) is not allowed" if !@@allowed_calcs[:sims].include?(type)
883
+ sim = nil
884
+ mica, sim_res = get_MICA(termA, termB, ic_type)
885
+ if !mica.nil?
886
+ case type
887
+ when :resnik
888
+ sim = sim_res
889
+ when :lin
890
+ sim = (2.0 * sim_res).fdiv(self.get_IC(termA,type: ic_type) + self.get_IC(termB,type: ic_type))
891
+ when :jiang_conrath # This is not a similarity, this is a disimilarity (distance)
892
+ sim = (self.get_IC(termA, type: ic_type) + self.get_IC(termB, type: ic_type)) - (2.0 * sim_res)
893
+ end
894
+ end
895
+ return sim
896
+ end
897
+
898
+
899
+ # Method used to load information stored into an OBO file and store it into this object.
900
+ # If a file is specified by input parameter, current @file value is updated
901
+ # ===== Parameters
902
+ # +file+:: optional file to update object stored file
903
+ def load(file, build: true)
904
+ _, header, stanzas = self.class.load_obo(file)
905
+ @header = header
906
+ @stanzas = stanzas
907
+ self.remove_removable()
908
+ # @removable_terms.each{|removableID| @stanzas[:terms].delete(removableID)} if !@removable_terms.empty? # Remove if proceed
909
+ self.build_index() if build
910
+ end
911
+
912
+ #
913
+ def remove_removable()
914
+ @removable_terms.each{|removableID| @stanzas[:terms].delete(removableID)} if !@removable_terms.empty? # Remove if proceed
915
+ end
916
+
917
+
918
+ # Exports an OBO_Handler object in json format
919
+ # ===== Parameters
920
+ # +file+:: where info will be stored
921
+ def write(file)
922
+ # Take object stored info
923
+ obj_info = {header: @header,
924
+ stanzas: @stanzas,
925
+ ancestors_index: @ancestors_index,
926
+ descendants_index: @descendants_index,
927
+ alternatives_index: @alternatives_index,
928
+ obsoletes_index: @obsoletes_index,
929
+ structureType: @structureType,
930
+ ics: @ics,
931
+ meta: @meta,
932
+ special_tags: @special_tags,
933
+ max_freqs: @max_freqs,
934
+ dicts: @dicts,
935
+ profiles: @profiles,
936
+ profilesDict: @profilesDict,
937
+ items: @items,
938
+ removable_terms: @removable_terms,
939
+ term_paths: @term_paths}
940
+ # Convert to JSON format & write
941
+ File.open(file, "w") { |f| f.write obj_info.to_json }
942
+ end
943
+
944
+
945
+ def is_number? string
946
+ true if Float(string) rescue false
947
+ end
948
+
949
+
950
  # Read a JSON file with an OBO_Handler object stored
  # ===== Parameters
  # +file+:: with object info
  # +build+:: if true, calculate indexes. Default: true
  # ===== Return
  # OBO_Handler internal fields
  def read(file, build: true)
    # Read file
    jsonFile = File.open(file)
    jsonInfo = JSON.parse(jsonFile.read, :symbolize_names => true)
    # Pre-process (Symbolize some hashs values). JSON serialisation turned
    # symbols into strings, so every field that held symbols is converted back.
    if !jsonInfo[:header].nil?
      aux = jsonInfo[:header].map do |entry,info|
        if info.kind_of?(Array) && @@symbolizable_ids.include?(entry)
          [entry,info.map{|item| item.to_sym}]
        else
          [entry,info]
        end
      end
      jsonInfo[:header] = aux.to_h
    end
    jsonInfo[:stanzas][:terms].map{|id,info| self.class.symbolize_ids(info)} # STANZAS
    jsonInfo[:stanzas][:typedefs].map{|id,info| self.class.symbolize_ids(info)}
    jsonInfo[:stanzas][:instances].map{|id,info| self.class.symbolize_ids(info)}
    # Optional fields: skipped when absent from the stored object
    jsonInfo[:alternatives_index] = jsonInfo[:alternatives_index].map{|id,value| [id, value.to_sym]}.to_h unless jsonInfo[:alternatives_index].nil?
    jsonInfo[:ancestors_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}} unless jsonInfo[:ancestors_index].nil?
    jsonInfo[:descendants_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}} unless jsonInfo[:descendants_index].nil?
    jsonInfo[:obsoletes_index] = jsonInfo[:obsoletes_index].map{|id,value| [id, value.to_sym]}.to_h unless jsonInfo[:obsoletes_index].nil?
    jsonInfo[:dicts] = jsonInfo[:dicts].each do |flag, dictionaries|
      next if dictionaries.nil?
      # Special case: byTerm. Keys may be numeric (stored as strings by JSON)
      dictionaries[:byTerm] = dictionaries[:byTerm].map do |term, value|
        if !term.to_s.scan(/\A[-+]?[0-9]*\.?[0-9]+\Z/).empty? # Numeric dictionary
          [term.to_s.to_i, value.map{|term| term.to_sym}]
        elsif value.is_a? Numeric # Numeric dictionary
          [term.to_sym, value]
        elsif value.kind_of?(Array) && flag == :is_a
          [term.to_sym, value.map{|v| v.to_sym}]
        else
          [term.to_sym, value]
        end
      end
      dictionaries[:byTerm] = dictionaries[:byTerm].to_h
      # By value
      dictionaries[:byValue] = dictionaries[:byValue].map do |value, term|
        if value.is_a? Numeric # Numeric dictionary
          [value, term.to_sym]
        elsif term.is_a? Numeric # Numeric dictionary
          [value.to_s.to_sym, term]
        elsif flag == :is_a
          [value.to_sym, term.map{|v| v.to_sym}]
        elsif term.kind_of?(Array)
          [value.to_sym, term.map{|t| t.to_sym}]
        else
          [value.to_s, term.to_sym]
        end
      end
      dictionaries[:byValue] = dictionaries[:byValue].to_h
    end
    if !jsonInfo[:profiles].nil?
      jsonInfo[:profiles].map{|id,terms| terms.map!{|term| term.to_sym}}
      # Restore numeric profile IDs (JSON object keys are always strings)
      jsonInfo[:profiles].keys.map{|id| jsonInfo[:profiles][id.to_s.to_i] = jsonInfo[:profiles].delete(id) if self.is_number?(id.to_s)}
    end
    jsonInfo[:profilesDict].map{|term,ids| ids.map!{|id| id.to_sym if !id.is_a?(Numeric)}} unless jsonInfo[:profilesDict].nil?
    jsonInfo[:removable_terms] = jsonInfo[:removable_terms].map{|term| term.to_sym} unless jsonInfo[:removable_terms].nil?
    jsonInfo[:special_tags] = jsonInfo[:special_tags].each do |k, v|
      next if v.nil?
      if v.kind_of?(Array)
        jsonInfo[:special_tags][k] = v.map{|tag| tag.to_sym}
      else
        jsonInfo[:special_tags][k] = v.to_sym
      end
    end
    jsonInfo[:items].each{|k,v| jsonInfo[:items][k] = v.map{|item| item.to_sym}} unless jsonInfo[:items].nil?
    jsonInfo[:term_paths].each{|term,info| jsonInfo[:term_paths][term][:paths] = info[:paths].map{|path| path.map{|t| t.to_sym}}} unless jsonInfo[:term_paths].nil?

    # Store info into this object's fields
    @header = jsonInfo[:header]
    @stanzas = jsonInfo[:stanzas]
    @ancestors_index = jsonInfo[:ancestors_index]
    @descendants_index = jsonInfo[:descendants_index]
    @alternatives_index = jsonInfo[:alternatives_index]
    @obsoletes_index = jsonInfo[:obsoletes_index]
    jsonInfo[:structureType] = jsonInfo[:structureType].to_sym unless jsonInfo[:structureType].nil?
    @structureType = jsonInfo[:structureType]
    @ics = jsonInfo[:ics]
    @meta = jsonInfo[:meta]
    @special_tags = jsonInfo[:special_tags]
    @max_freqs = jsonInfo[:max_freqs]
    @dicts = jsonInfo[:dicts]
    @profiles = jsonInfo[:profiles]
    @profilesDict = jsonInfo[:profilesDict]
    @items = jsonInfo[:items]
    @removable_terms = jsonInfo[:removable_terms]
    @term_paths = jsonInfo[:term_paths]

    self.build_index() if build
  end
1049
+
1050
+
1051
+ # Check if a given ID is stored as term into this object
1052
+ # ===== Parameters
1053
+ # +id+:: to be checked
1054
+ # ===== Return
1055
+ # True if term is allowed or false in other cases
1056
+ def exists? id
1057
+ return stanzas[:terms].include?(id)
1058
+ end
1059
+
1060
+
1061
+ # This method assumes that a text given contains an allowed ID. And will try to obtain it splitting it
1062
+ # ===== Parameters
1063
+ # +text+:: to be checked
1064
+ # ===== Return
1065
+ # The correct ID if it can be found or nil in other cases
1066
+ def extract_id(text, splitBy: ' ')
1067
+ if self.exists?(text)
1068
+ return text
1069
+ else
1070
+ splittedText = text.to_s.split(splitBy).first.to_sym
1071
+ return self.exists?(splittedText) ? splittedText : nil
1072
+ end
1073
+ end
1074
+
1075
+
1076
  # Generate a bidirectinal dictionary set using a specific tag and terms stanzas set
  # This functions stores calculated dictionary into @dicts field.
  # This functions stores first value for multivalue tags
  # This function does not handle synonyms for byValue dictionaries
  # ===== Parameters
  # +tag+:: to be used to calculate dictionary
  # +select_regex+:: gives a regfex that can be used to modify value to be stored
  # +substitute_alternatives+:: flag used to indicate if alternatives must, or not, be replaced by it official ID
  # +store_tag+:: flag used to store dictionary. If nil, mandatory tag given will be used
  # +multiterm+:: if true, byValue will allows multi-term linkage (array)
  # +self_type_references+:: if true, program assumes that refrences will be between Ontology terms, and it term IDs will be checked
  # ===== Return
  # void. And stores calcualted bidirectional dictonary into dictionaries main container
  def calc_dictionary(tag, select_regex: nil, substitute_alternatives: true, store_tag: nil, multiterm: false, self_type_references: false)
    tag = tag.to_sym
    store_tag = tag if store_tag.nil?
    if @stanzas[:terms].empty?
      warn('Terms are not already loaded. Aborting dictionary calc')
    else
      byTerm = {}
      byValue = {}
      # Calc per term
      @stanzas[:terms].each do |term, tags|
        referenceTerm = term
        if @alternatives_index.include?(term) && substitute_alternatives # Special case: map to official ID unless it is obsolete
          referenceTerm = @alternatives_index[term] if !@obsoletes_index.include?(@alternatives_index[term])
        end
        queryTag = tags[tag]
        if !queryTag.nil?
          # Pre-process: apply the selection regex keeping only the first capture
          if !select_regex.nil?
            if queryTag.kind_of?(Array)
              queryTag = queryTag.map{|value| value.scan(select_regex).first}
              queryTag.flatten!
            else
              queryTag = queryTag.scan(select_regex).first
            end
            queryTag.compact!
          end
          if queryTag.kind_of?(Array) # Store (multivalue tag)
            if !queryTag.empty?
              if byTerm.include?(referenceTerm)
                byTerm[referenceTerm] = (byTerm[referenceTerm] + queryTag).uniq
              else
                byTerm[referenceTerm] = queryTag
              end
              if multiterm
                queryTag.each do |value|
                  byValue[value] = [] if byValue[value].nil?
                  byValue[value] << referenceTerm
                end
              else
                queryTag.each{|value| byValue[value] = referenceTerm}
              end
            end
          else # Single-value tag
            if byTerm.include?(referenceTerm)
              byTerm[referenceTerm] = (byTerm[referenceTerm] + [queryTag]).uniq
            else
              byTerm[referenceTerm] = [queryTag]
            end
            if multiterm
              byValue[queryTag] = [] if byValue[queryTag].nil?
              byValue[queryTag] << referenceTerm
            else
              byValue[queryTag] = referenceTerm
            end
          end
        end
      end

      # Check self-references: replace raw reference strings with the real
      # term IDs they contain (keeping the raw value when no ID is found)
      if self_type_references
        byTerm.map do |term, references|
          corrected_references = references.map do |t|
            checked = self.extract_id(t)
            if checked.nil?
              t
            else
              byValue[checked] = byValue.delete(t) if checked != t && byValue[checked].nil? # Update in byValue
              checked
            end
          end
          byTerm[term] = corrected_references.uniq
        end
      end

      # Check order: make each byTerm entry start with the values in the same
      # order as the original stanza tag
      byTerm.map do |term,values|
        if self.exists?(term)
          referenceValue = @stanzas[:terms][term][tag]
          if !referenceValue.nil?
            if !select_regex.nil?
              if referenceValue.kind_of?(Array)
                referenceValue = referenceValue.map{|value| value.scan(select_regex).first}
                referenceValue.flatten!
              else
                referenceValue = referenceValue.scan(select_regex).first
              end
              referenceValue.compact!
            end
            if self_type_references
              if referenceValue.kind_of?(Array)
                aux = referenceValue.map{|t| self.extract_id(t)}
              else
                aux = self.extract_id(referenceValue)
              end
              aux.compact! unless aux.nil?
              referenceValue = aux unless aux.nil?
            end
            referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
            byTerm[term] = referenceValue + (values - referenceValue)
          end
        end
      end

      # Store
      @dicts[store_tag] = {byTerm: byTerm, byValue: byValue}
    end
  end
1196
+
1197
+
1198
+ # Calculates :is_a dictionary without alternatives substitution
1199
+ def calc_ancestors_dictionary
1200
+ self.calc_dictionary(:is_a, substitute_alternatives: false, self_type_references: true, multiterm: true)
1201
+ end
1202
+
1203
+
1204
# Translate a given value using an already calculated dictionary.
# ===== Parameters
# +toTranslate+:: value to be translated using the dictionary
# +tag+:: tag used to generate the dictionary
# +byValue+:: when true (default) the byValue dictionary is used; otherwise byTerm
# ===== Return
# translation stored in the dictionary (nil if not present)
def translate(toTranslate, tag, byValue: true)
  if byValue
    lookup = @dicts[tag][:byValue]
  else
    # Term-keyed lookup: normalize to the main ID first
    lookup = @dicts[tag][:byTerm]
    toTranslate = get_main_id(toTranslate)
  end
  lookup[toTranslate]
end
1216
+
1217
+
1218
# Translate a given name to its term ID, falling back to synonyms.
# ===== Parameters
# +name+:: to be translated
# ===== Return
# translated name or nil if it's not stored into this ontology
def translate_name(name)
  by_name = translate(name, :name)
  return by_name unless by_name.nil?
  translate(name, :synonym)
end
1228
+
1229
+
1230
# Translate several names, splitting results into translated and untranslatable.
# ===== Parameters
# +names+:: array to be translated
# ===== Return
# two arrays: translations, and names which couldn't be translated
def translate_names(names)
  translated = []
  rejected = []
  names.each do |name|
    translation = translate_name(name)
    if translation.nil?
      rejected << name
    else
      translated << translation
    end
  end
  [translated, rejected]
end
1248
+
1249
+
1250
# Translates a given ID to its assigned (main) name.
# ===== Parameters
# +id+:: to be translated
# ===== Return
# main name or nil if it's not included into this ontology
def translate_id(id)
  names = translate(id, :name, byValue: false)
  names&.first
end
1259
+
1260
+
1261
# Translates several IDs and returns translations and the not-allowed IDs list.
# ===== Parameters
# +ids+:: to be translated
# ===== Return
# two arrays: translated names, and the IDs which couldn't be translated
def translate_ids(ids)
  translated = []
  rejected = []
  ids.each do |term_id|
    tr = self.translate_id(term_id.to_sym)
    if !tr.nil?
      translated << tr
    else
      # BUGFIX: previously pushed `tr` (always nil here) so callers received
      # an array of nils instead of the IDs that failed to translate
      rejected << term_id
    end
  end
  return translated, rejected
end
1279
+
1280
+
1281
# Resolves the main ID assigned to a given ID. If the ID is not an
# alternative/obsolete ID, the ID itself is returned.
# ===== Parameters
# +id+:: to be translated
# ===== Return
# main ID related to the given ID; nil if the given ID is not an allowed ID
def get_main_id(id)
  return nil if !@stanzas[:terms].include? id
  new_id = id
  mainID = @alternatives_index[id]
  # FIX: short-circuit `&&` instead of bitwise `&`, which always evaluated
  # both sides (same truth table on booleans, but non-idiomatic and fragile)
  new_id = mainID if !mainID.nil? && !@obsoletes_index.include?(mainID)
  return new_id
end
1294
+
1295
+
1296
# Check a pull of IDs, keeping allowed IDs and rejecting those which are not
# official terms of this ontology.
# ===== Parameters
# +ids+:: to be checked
# +substitute+:: when true, allowed IDs are normalized to their main ID
# ===== Return
# two arrays with allowed and rejected IDs respectively
def check_ids(ids, substitute: true)
  allowed = []
  rejected = []
  ids.each do |id|
    unless @stanzas[:terms].include?(id)
      rejected << id
      next
    end
    allowed << (substitute ? get_main_id(id) : id)
  end
  [allowed, rejected]
end
1317
+
1318
+
1319
# Stores a given profile under a specific ID. An already-assigned ID has its
# profile replaced (with a warning).
# ===== Parameters
# +id+:: assigned to the profile (numeric kept as-is, anything else symbolized)
# +terms+:: array of terms
# +substitute+:: substitute flag forwarded to check_ids
def add_profile(id, terms, substitute: true)
  warn("Profile assigned to ID (#{id}) is going to be replaced") if @profiles.include? id
  correct_terms, rejected_terms = check_ids(terms, substitute: substitute)
  warn('Given terms contains erroneus IDs. These IDs will be removed') unless rejected_terms.empty?
  profile_key = id.is_a?(Numeric) ? id : id.to_sym
  @profiles[profile_key] = correct_terms
end
1336
+
1337
+
1338
# Stores a pull of profiles.
# ===== Parameters
# +profiles+:: array/hash of profiles to be stored. If it's an array, numerical IDs are assigned by position
# +calc_metadata+:: if true, launch calc_profiles_dictionary afterwards
# +reset_stored+:: if true, remove already stored profiles first
# +substitute+:: substitute flag forwarded to check_ids
def load_profiles(profiles, calc_metadata: true, reset_stored: false, substitute: false)
  reset_profiles if reset_stored
  case profiles
  when Array
    profiles.each_with_index do |items, i|
      add_profile(i, items.map(&:to_sym), substitute: substitute)
    end
  else # Hash
    collisions = profiles.keys.select { |id| @profiles.include?(id) }
    warn('Some profiles given are already stored. Stored version will be replaced') unless collisions.empty?
    profiles.each { |id, prof| add_profile(id, prof, substitute: substitute) }
  end
  add_observed_terms_from_profiles(reset: true)
  calc_profiles_dictionary if calc_metadata
end
1364
+
1365
+
1366
# Internal method used to drop stored profiles and zero observed frequencies.
def reset_profiles
  @profiles = {}
  @meta.each_value { |info| info[:observed_freq] = 0 }
  @max_freqs[:observed_freq] = 0
end
1374
+
1375
+
1376
# Fetch the profile assigned to a given ID.
# ===== Parameters
# +id+:: profile ID
# ===== Return
# specific profile or nil if it's not stored
def get_profile(id)
  @profiles[id]
end
1385
+
1386
+
1387
# Sizes of all stored profiles.
# ===== Return
# array with the term count of every stored profile
def get_profiles_sizes()
  @profiles.values.map(&:length)
end
1394
+
1395
+
1396
# Mean size of stored profiles.
# ===== Parameters
# +round_digits+:: number of digits to round the result. Default: 4
# ===== Returns
# mean size of stored profiles
def get_profiles_mean_size(round_digits: 4)
  total_terms = @profiles.values.sum(&:length)
  total_terms.fdiv(@profiles.length).round(round_digits)
end
1406
+
1407
+
1408
# Profile size located at a given percentile of the size distribution.
# ===== Parameters
# +perc+:: percentile to be returned
# +increasing_sort+:: if true sizes are ranked increasingly. Default: false
# ===== Returns
# profile length assigned to the requested percentile
def get_profile_length_at_percentile(perc=50, increasing_sort: false)
  lengths = @profiles.values.map(&:length).sort
  lengths.reverse! unless increasing_sort
  # Index of the length which does not overpass the selected percentile
  index = ((perc * (lengths.length - 1)).fdiv(100) - 0.5).round
  index = 0 if index < 0 # Guard against the literal calculation going negative
  lengths[index]
end
1422
+
1423
+
1424
# Translate a given profile to term names.
# ===== Parameters
# +prof+:: array of terms to be translated
# ===== Returns
# array of translated terms; may contain nils for not-allowed IDs
def profile_names(prof)
  prof.map { |term| translate_id(term) }
end
1432
+
1433
+
1434
# Translates a bunch of profiles to their sets of term names.
# ===== Parameters
# +profs+:: array/hash of profiles (stored profiles are used when empty)
# +asArray+:: true => array of name arrays; false => hash keyed by profile ID
# ===== Returns
# translated profiles
def translate_profiles_ids(profs = [], asArray: true)
  profs = @profiles if profs.empty?
  profs = profs.each_with_index.map { |terms, index| [index, terms] }.to_h if profs.kind_of?(Array)
  names_by_id = {}
  profs.each { |id, terms| names_by_id[id] = profile_names(terms) }
  asArray ? names_by_id.values : names_by_id
end
1446
+
1447
+
1448
# Registers every term of the stored profiles as "observed terms".
# ===== Parameters
# +reset+:: if true, observed freqs already stored are reset before re-counting
def add_observed_terms_from_profiles(reset: false)
  @meta.each_value { |freqs| freqs[:observed_freq] = -1 } if reset
  @profiles.each_value { |terms| add_observed_terms(terms: terms) }
end
1455
+
1456
+
1457
# Get a term frequency.
# ===== Parameters
# +term+:: term to be checked
# +type+:: frequency kind to return. Allowed: [:struct_freq, :observed_freq]
# ===== Returns
# frequency of the given term, or nil if the term is not allowed
def get_frequency(term, type: :struct_freq)
  term_meta = @meta[term]
  return nil if term_meta.nil?
  term_meta[type]
end
1467
+
1468
+
1469
# Gets the structural frequency of a given term.
# ===== Parameters
# +term+:: to be checked
# ===== Returns
# structural frequency of the given term, or nil if the term is not allowed
def get_structural_frequency(term)
  get_frequency(term, type: :struct_freq)
end
1477
+
1478
+
1479
# Gets the observed frequency of a given term.
# ===== Parameters
# +term+:: to be checked
# ===== Returns
# observed frequency of the given term, or nil if the term is not allowed
def get_observed_frequency(term)
  get_frequency(term, type: :observed_freq)
end
1487
+
1488
+
1489
# Calculates frequencies of stored profiles terms
# ===== Parameters
# +ratio+:: if true, frequencies will be returned as ratios between 0 and 1.
# +literal+:: if true, literal terms will be used to calculate frequencies instead translate alternative terms
# +asArray+:: used to transform returned structure format from hash of Term-Frequency to an array of tuples [Term, Frequency]
# +translate+:: if true, term IDs will be translated to names (IDs with no name keep their ID key)
# ===== Returns
# stored profiles terms frequencies (sorted by decreasing frequency when +asArray+)
def get_profiles_terms_frequency(ratio: true, literal: true, asArray: true, translate: true)
  n_profiles = @profiles.length
  if literal
    # Count every term exactly as it appears in the profiles (no alternative-ID folding)
    freqs = {}
    @profiles.each do |id, terms|
      terms.each do |literalTerm|
        if freqs.include?(literalTerm)
          freqs[literalTerm] += 1
        else
          freqs[literalTerm] = 1
        end
      end
    end
    if (ratio || translate)
      # Iterate over a snapshot of the keys because translation re-keys the hash in place
      aux_keys = freqs.keys
      aux_keys.each do |term|
        freqs[term] = freqs[term].fdiv(n_profiles) if ratio
        if translate
          tr = self.translate_id(term)
          freqs[tr] = freqs.delete(term) if !tr.nil?
        end
      end
    end
    if asArray
      freqs = freqs.map{|term, freq| [term, freq]}
      freqs.sort!{|h1, h2| h2[1] <=> h1[1]} # Decreasing frequency
    end
  else # Freqs translating alternatives
    # Use @meta observed frequencies, where alternative IDs were already folded into main IDs
    freqs = @meta.select{|id, freqs| freqs[:observed_freq] > 0}.map{|id, freqs| [id, ratio ? freqs[:observed_freq].fdiv(n_profiles) : freqs[:observed_freq]]}
    freqs = freqs.to_h if !asArray
    if translate
      # Replace IDs by names where a translation exists (keeps [term, freq] tuples)
      freqs = freqs.map do |term, freq|
        tr = self.translate_id(term)
        tr.nil? ? [term, freq] : [tr, freq]
      end
    end
    if asArray
      freqs = freqs.map{|term, freq| [term, freq]}
      freqs.sort!{|h1, h2| h2[1] <=> h1[1]} # Decreasing frequency
    else
      freqs = freqs.to_h
    end
  end
  return freqs
end
1542
+
1543
+
1544
# Clean a given profile by dropping terms that are ancestors of other terms
# in the same profile.
# ===== Parameters
# +prof+:: array of terms to be checked
# ===== Returns
# two arrays: the cleaned profile and the removed (redundant) terms
def remove_ancestors_from_profile(prof)
  all_ancestors = prof.flat_map { |term| get_ancestors(term) }.uniq
  redundant = prof.select { |term| all_ancestors.include?(term) }
  [prof - redundant, redundant]
end
1554
+
1555
+
1556
# Remove alternative IDs whose official ID is also present. DOES NOT REMOVE
# synonyms or alternative IDs of the same official ID.
# ===== Parameters
# +prof+:: array of terms to be checked
# ===== Returns
# two arrays: the cleaned profile and the removed (redundant) terms
def remove_alternatives_from_profile(prof)
  redundant = prof.select do |term|
    @alternatives_index.include?(term) && prof.include?(@alternatives_index[term])
  end
  [prof - redundant, redundant]
end
1566
+
1567
+
1568
# Remove alternatives (when the official term is present) and ancestor terms
# of a given profile.
# ===== Parameters
# +profile+:: profile to be cleaned
# +remove_alternatives+:: if true, also drop redundant alternative IDs
# ===== Returns
# cleaned profile
def clean_profile(profile, remove_alternatives: true)
  warn('Estructure is circular, behaviour could not be which is expected') if @structureType == :circular
  cleaned, _ = remove_ancestors_from_profile(profile)
  cleaned, _ = remove_alternatives_from_profile(cleaned) if remove_alternatives
  cleaned
end
1584
+
1585
# Aggressive profile cleaning: drops invalid IDs, obsoletes, optionally terms
# outside a given branch, then applies the standard clean_profile.
# ===== Parameters
# +profile+:: profile to be cleaned
# +options+:: optional :term_filter — keep only terms descending from it
# ===== Returns
# cleaned profile
def clean_profile_hard(profile, options = {})
  valid_terms, _ = check_ids(profile)
  valid_terms = valid_terms.reject { |t| is_obsolete?(t) }
  term_filter = options[:term_filter]
  valid_terms.select! { |term| get_ancestors(term).include?(term_filter) } unless term_filter.nil?
  clean_profile(valid_terms.uniq)
end
1594
+
1595
# Remove terms from a given profile using hierarchical info and a score set.
# When hierarchically-related terms coexist in the profile, only the best
# scored one (max or min) of each related group is kept.
# ===== Parameters
# +profile+:: profile to be cleaned
# +scores+:: hash with terms as keys and numerical scores as values
# +byMax+:: if true the maximum-scored term is kept, otherwise the minimum
# +remove_without_score+:: if true, terms without score are removed. Default: true
# ===== Returns
# cleaned profile
def clean_profile_by_score(profile, scores, byMax: true, remove_without_score: true)
  ordered_scores = scores.sort_by { |_term, score| score }.to_h
  kept = profile.map do |term|
    unless ordered_scores.include?(term)
      next remove_without_score ? nil : term
    end
    relatives = [get_ancestors(term), get_descendants(term)].flatten
    in_profile = relatives.select { |relative| profile.include?(relative) }
    if in_profile.empty?
      term
    else
      in_profile << term
      candidates = ordered_scores.select { |t, _score| in_profile.include?(t) }
      byMax ? candidates.keys.last : candidates.keys.first
    end
  end
  kept.compact.uniq
end
1624
+
1625
+
1626
# Remove alternatives (when the official term is present) and ancestor terms
# of every stored profile.
# ===== Parameters
# +store+:: if true, cleaned profiles replace the stored ones
# +remove_alternatives+:: forwarded to clean_profile
# ===== Returns
# a hash with cleaned profiles
def clean_profiles(store: false, remove_alternatives: true)
  cleaned = {}
  @profiles.each { |id, terms| cleaned[id] = clean_profile(terms, remove_alternatives: remove_alternatives) }
  @profiles = cleaned if store
  cleaned
end
1638
+
1639
+
1640
# Calculates the number of redundant (parental) terms in each stored profile.
# ===== Returns
# array with the parental count of each profile
def parentals_per_profile
  cleaned = clean_profiles(remove_alternatives: false)
  @profiles.map { |id, terms| terms.length - cleaned[id].length }
end
1648
+
1649
+
1650
# Pairs each profile size with its redundant (parental) term count, sorted by
# decreasing profile size.
# ===== Returns
# two index-aligned arrays: profile sizes and parental counts
def get_profile_redundancy()
  profile_sizes = self.get_profiles_sizes
  parental_terms_per_profile = self.parentals_per_profile# clean_profiles
  # NOTE(review): parentals_per_profile returns plain Integers, so `item[0]` is
  # Integer#[] (bit access: lowest bit of the count), not array indexing.
  # This looks like a leftover from when items were arrays — confirm whether
  # this map should be removed.
  parental_terms_per_profile = parental_terms_per_profile.map{|item| item[0]}
  profile_sizes, parental_terms_per_profile = profile_sizes.zip(parental_terms_per_profile).sort_by{|i| i.first}.reverse.transpose
  return profile_sizes, parental_terms_per_profile
end
1657
+
1658
# Builds the childs table of every stored profile and the ratio of profile
# terms that have more specific (descendant) terms available.
# ===== Returns
# hash of childs tables per profile ID, and the ratio of terms with more
# specific childs over the total number of profile terms
def compute_term_list_and_childs()
  suggested_childs = {}
  total_terms = 0
  terms_with_more_specific_childs = 0
  @profiles.each do |id, terms|
    total_terms += terms.length
    childs_table = get_childs_table(terms, true)
    # Exclude records whose descendants list is empty (no more specific childs)
    terms_with_more_specific_childs += childs_table.count { |record| !record.last.empty? }
    suggested_childs[id] = childs_table
  end
  [suggested_childs, terms_with_more_specific_childs.fdiv(total_terms)]
end
1670
+
1671
# Calculates the mean IC of a given profile.
# ===== Parameters
# +prof+:: profile to be checked
# +ic_type+:: IC type to be used
# +zhou_k+:: special coefficient for the Zhou IC method
# ===== Returns
# mean IC for the given profile
def get_profile_mean_IC(prof, ic_type: :resnik, zhou_k: 0.5)
  ic_values = prof.map { |term| get_IC(term, type: ic_type, zhou_k: zhou_k) }
  ic_values.inject(0) { |acc, val| acc + val }.fdiv(prof.length)
end
1681
+
1682
+
1683
# Calculates Resnik structural and Resnik observed mean ICs for all profiles.
# ===== Returns
# two hashes keyed by profile ID: structural resnik ICs and observed resnik ICs
def get_profiles_resnik_dual_ICs
  struct_ics = @profiles.map { |id, terms| [id, get_profile_mean_IC(terms, ic_type: :resnik)] }.to_h
  observ_ics = @profiles.map { |id, terms| [id, get_profile_mean_IC(terms, ic_type: :resnik_observed)] }.to_h
  return struct_ics, observ_ics
end
1695
+
1696
+
1697
# Calculates ontology structural levels for all ontology terms
# ===== Parameters
# +calc_paths+:: calculates term paths if it's not already calculated
# +shortest_path+:: if true, level is calculated with shortest path, largest path will be used in other cases
def calc_term_levels(calc_paths: false, shortest_path: true)
  if @term_paths.empty?
    if calc_paths
      self.calc_term_paths
    else
      warn('Term paths are not already loaded. Aborting dictionary calc')
    end
  end
  if !@term_paths.empty?
    byTerm = {}  # term => level
    byValue = {} # level => [terms]
    # Calc per term
    @term_paths.each do |term, info|
      level = shortest_path ? info[:shortest_path] : info[:largest_path]
      if level.nil?
        level = -1 # Sentinel for terms without a computed path
      else
        level = level.round(0)
      end
      byTerm[term] = level
      queryLevels = byValue[level]
      if queryLevels.nil?
        byValue[level] = [term]
      else
        byValue[level] << term
      end
    end
    # Intentionally stored swapped: :byTerm holds level=>terms and :byValue holds term=>level
    @dicts[:level] = {byTerm: byValue, byValue: byTerm} # Note: in this case, value has multiplicity and term is unique value
    # Update maximum depth
    @max_freqs[:max_depth] = byValue.keys.max
  end
end
1733
+
1734
+
1735
# Check if a given term is marked as obsolete
def is_obsolete? term
  @obsoletes_index.include?(term)
end
1739
+
1740
# Check if a given term is marked as an alternative ID
def is_alternative? term
  @alternatives_index.include?(term)
end
1744
+
1745
# Find paths of a term following it ancestors and stores all possible paths for it and it's parentals.
# Also calculates paths metadata (total/largest/shortest) and stores into @term_paths.
# ===== Parameters
# +only_main_terms+:: when false (default), obsolete/alternative IDs share the
#                     path record of the main term they point to
def calc_term_paths(only_main_terms=false)
  self.calc_ancestors_dictionary if @dicts[:is_a].nil? # Calculate direct parentals dictionary if it's not already calculated
  visited_terms = {} # PEDRO: To keep track of visited data, hash accesions are fast than array includes. I don't understant why use this variable instead of check @term_paths to see if the data is calculated
  @term_paths = {}
  # Paths only make sense for acyclic structures
  if [:hierarchical, :sparse].include? @structureType
    @stanzas[:terms].each do |term, t_attributes|
      if !only_main_terms && (self.is_obsolete?(term) || self.is_alternative?(term)) # Special case (obsoletes)
        # Redirect the special ID to its main term and alias its path record
        special_term = term
        term = self.is_obsolete?(term) ? @obsoletes_index[term] : @alternatives_index[term]
        @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
        @term_paths[special_term] = @term_paths[term]
        visited_terms[special_term] = true
      end
      if !visited_terms.include?(term)
        # PEDRO: This code is very similar to expand_path method, but cannot be replaced by it (test fail). We must work to use this method here
        path_attr = @term_paths[term]
        if path_attr.nil?
          path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} # create new path data
          @term_paths[term] = path_attr #save path data container
        end
        parentals = @dicts[:is_a][:byTerm][term]
        if parentals.nil?
          # Root term: its only path is itself
          path_attr[:paths] << [term]
        else
          # Prepend this term to every path of each direct parental
          parentals.each do |direct_parental|
            self.expand_path(direct_parental)
            new_paths = @term_paths[direct_parental][:paths]
            path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(term)})
          end
        end
        # Mark all ancestors as visited (their paths were filled by expand_path)
        anc = @ancestors_index[term].each{|anc| visited_terms[anc] = true} if @ancestors_index.include?(term)
        visited_terms[term] = true
      end
      # Update metadata
      path_attr = @term_paths[term]
      path_attr[:total_paths] = path_attr[:paths].length
      paths_sizes = path_attr[:paths].map{|path| path.length}
      path_attr[:largest_path] = paths_sizes.max
      path_attr[:shortest_path] = paths_sizes.min
    end
  else
    warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
  end
end
1791
+
1792
+
1793
# Recursive helper that finds the paths of a term by following its ancestors,
# storing all possible paths for the term and its parentals in @term_paths.
# Does nothing when the term's paths were already computed.
# ===== Parameters
# +curr_term+:: current visited term
def expand_path(curr_term)
  return if @term_paths.include?(curr_term)
  node = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []}
  @term_paths[curr_term] = node
  parents = @dicts[:is_a][:byTerm][curr_term]
  if parents.nil? # No parents :: end of recursion
    node[:paths] << [curr_term]
    return
  end
  parents.each do |ancestor|
    # Compute the ancestor's paths on demand, then reuse them
    expand_path(ancestor) if @term_paths[ancestor].nil?
    parental_paths = @term_paths[ancestor][:paths]
    node[:paths].concat(parental_paths.map { |path| [curr_term] + path })
  end
end
1818
+
1819
+
1820
# Gets the calculated ontology levels.
# ===== Returns
# clone of the level dictionary (Key: level; Value: terms at that level)
def get_ontology_levels
  # :byTerm is intentionally stored swapped by calc_term_levels (level => terms)
  @dicts[:level][:byTerm].clone
end
1826
+
1827
+
1828
+ # Gets ontology level of a specific term
1829
+ # ===== Returns
1830
+ # Term level
1831
+ def get_term_level(term)
1832
+ return @dicts[:level][:byValue][term]
1833
+ end
1834
+
1835
# Parental path of a term (the term itself is excluded).
# Returns nil when the term is not found, [] when it exists but has no parents.
# ===== Parameters
# +term+:: term whose path is requested
# +which_path+:: :shortest_path (default) or :largest_path
# +level+:: when > 0, truncate the path to ascendants up to that level
def get_parental_path(term, which_path = :shortest_path, level = 0)
  path_attr = @term_paths[term]
  return nil if path_attr.nil?
  all_paths = path_attr[:paths]
  return [] if all_paths.empty?
  target_length = path_attr[which_path]
  path = all_paths.find { |pt| pt.length == target_length }.clone
  if level > 0 # keep the term and its ascendants until the requested level
    n_parents = target_length - level
    path = path[0..n_parents]
  end
  path.shift # Discard the term itself
  path
end
1855
+
1856
# Ontology levels restricted to the terms present in stored profiles; each term
# is replicated once per occurrence across profiles (once when +uniq+).
# ===== Returns
# hash of term levels (Key: level; Value: array of term IDs)
def get_ontology_levels_from_profiles(uniq = true) # TODO: remove uniq and check dependencies
  profiles_terms = @profiles.values.flatten
  profiles_terms.uniq! if uniq
  term_counts = Hash.new(0)
  profiles_terms.each { |term| term_counts[term] += 1 }
  filtered_levels = {}
  @dicts[:level][:byTerm].each do |level, terms|
    # Replicate each term by its frequency; terms absent from profiles vanish
    present = terms.flat_map { |t| Array.new(term_counts[t], t) }
    filtered_levels[level] = present unless present.empty?
  end
  filtered_levels
end
1874
+
1875
# Builds per-level distribution tables comparing the whole ontology against the
# stored profiles (cohort).
# ===== Returns
# two arrays sorted by level:
# - ontology_levels: [level, ontology term count, cohort term count]
# - distribution_percentage: [level, % ontology terms, % cohort terms, % uniq cohort terms]
def get_profile_ontology_distribution_tables
  # `uniq=false` just passes false positionally (the assignment creates a throwaway local)
  cohort_ontology_levels = get_ontology_levels_from_profiles(uniq=false)
  uniq_cohort_ontology_levels = get_ontology_levels_from_profiles
  hpo_ontology_levels = get_ontology_levels
  total_ontology_terms = hpo_ontology_levels.values.flatten.length
  total_cohort_terms = cohort_ontology_levels.values.flatten.length
  total_uniq_cohort_terms = uniq_cohort_ontology_levels.values.flatten.length

  ontology_levels = []
  distribution_percentage = []
  hpo_ontology_levels.each do |level, terms|
    cohort_terms = cohort_ontology_levels[level]
    uniq_cohort_terms = uniq_cohort_ontology_levels[level]
    # Levels absent from the cohort contribute zero counts
    if cohort_terms.nil? || uniq_cohort_terms.nil?
      num = 0
      u_num = 0
    else
      num = cohort_terms.length
      u_num = uniq_cohort_terms.length
    end
    ontology_levels << [level, terms.length, num]
    distribution_percentage << [
      level,
      (terms.length.fdiv(total_ontology_terms)*100).round(3),
      (num.fdiv(total_cohort_terms)*100).round(3),
      (u_num.fdiv(total_uniq_cohort_terms)*100).round(3)
    ]
  end
  ontology_levels.sort! { |x,y| x.first <=> y.first }
  distribution_percentage.sort! { |x,y| x.first <=> y.first }
  return ontology_levels, distribution_percentage
end
1907
+
1908
# Computes the dataset specificity index (DSI) from the profile/ontology level
# distribution tables: weighs over-represented levels deeper than the modal
# ontology level against the shallower ones.
# ===== Parameters
# +mode+:: 'uniq' to use the unique-terms distribution column, 'weigthed' for
#          the weighted one (any other value leaves the column undefined and
#          raises, as in the original behaviour)
# ===== Returns
# DSI value (0 when no level deeper than the modal level is over-represented)
def get_dataset_specifity_index(mode)
  ontology_levels, distribution_percentage = get_profile_ontology_distribution_tables
  if mode == 'uniq'
    observed_distribution = 3
  elsif mode == 'weigthed'
    observed_distribution = 2
  end
  # Modal level: the one holding the highest percentage of ontology terms
  max_terms = distribution_percentage.map{|row| row[1]}.max
  maxL = nil
  distribution_percentage.each do |level_info|
    maxL = level_info.first if level_info[1] == max_terms
  end
  # Per-level positive difference between observed and ontology distributions
  diffL = distribution_percentage.map{|l| [l[0], l[observed_distribution] - l[1]]}
  diffL.select!{|dL| dL.last > 0}
  lowSection = diffL.select{|dL| dL.first <= maxL}
  highSection = diffL.select{|dL| dL.first > maxL}
  # CLEANUP: removed unused locals (accumulated_weigth, accumulated_weigthed_diffL)
  if highSection.empty?
    dsi = 0
  else
    hss = get_weigthed_level_contribution(highSection, maxL, ontology_levels.length - maxL)
    lss = get_weigthed_level_contribution(lowSection, maxL, maxL)
    dsi = hss.fdiv(lss)
  end
  return dsi
end
1936
+
1937
# Weighted contribution of a section of levels: each level's difference is
# weighted by its distance to the modal level, then averaged over +nLevels+.
# ===== Parameters
# +section+:: array of [level, diff] pairs
# +maxL+:: modal ontology level
# +nLevels+:: number of levels the section spans
# ===== Returns
# weighted contribution value
def get_weigthed_level_contribution(section, maxL, nLevels)
  weighted_sum = section.inject(0) do |acc, (level, diff)|
    distance = maxL - level
    # Levels at or above the modal level weigh distance+1; deeper levels weigh |distance|
    weight = distance >= 0 ? distance + 1 : distance.abs
    acc + diff * weight
  end
  weighted_sum.fdiv(nLevels)
end
1951
+
1952
+
1953
# Builds @profilesDict: Key = term; Value = IDs of the profiles containing it.
# Warns and aborts when no profiles are loaded.
def calc_profiles_dictionary
  if @profiles.empty?
    warn('Profiles are not already loaded. Aborting dictionary calc')
    return
  end
  term_to_profiles = {} # Key: Terms; byValue counterpart is @profiles itself
  @profiles.each do |id, terms|
    terms.each { |term| (term_to_profiles[term] ||= []) << id }
  end
  @profilesDict = term_to_profiles
end
1972
+
1973
+
1974
# Gets the calculated profiles dictionary.
# ===== Return
# profiles dictionary (clone)
def get_terms_linked_profiles
  @profilesDict.clone
end
1980
+
1981
+
1982
# Get the profiles related to a given term.
# ===== Parameters
# +term+:: to be checked
# ===== Returns
# profiles which contain the given term (nil when unknown)
def get_term_linked_profiles(term)
  @profilesDict[term]
end
1990
+
1991
+
1992
# Gets a metainfo table from a set of terms.
# ===== Parameters
# +terms+:: IDs to be expanded
# +filter_alternatives+:: flag forwarded to get_descendants
# ===== Returns
# an array of pairs [[TermID, TermName], [[ChildID, ChildName], ...]]
def get_childs_table(terms, filter_alternatives = false)
  terms.map do |code|
    descendants = get_descendants(code, filter_alternatives)
    [[code, translate_id(code)], descendants.map { |child| [child, translate_id(child)] }]
  end
end
2005
+
2006
+
2007
# Store a specific relations hash into the ITEMS structure.
# ===== Parameters
# +relations+:: hash to be stored (Key: ontology term; Value: related item(s))
# +remove_old_relations+:: substitute ITEMS structure instead of merging new relations
# +expand+:: if true, already stored keys will be updated with the unique union of both sets
def load_item_relations_to_terms(relations, remove_old_relations = false, expand = false)
  @items = {} if remove_old_relations
  if !relations.select{|term, items| !@stanzas[:terms].include?(term)}.empty?
    warn('Some terms specified are not stored into this ontology. These not correct terms will be stored too')
  end
  if !remove_old_relations
    if !relations.select{|term, items| @items.include?(term)}.empty? && !expand
      warn('Some terms given are already stored. Stored version will be replaced')
    end
  end
  if expand
    # Deep-concatenate new relations into the stored ones (per-key unique union)
    @items = self.concatItems(@items, relations)
  else
    @items.merge!(relations)
  end
  # CLEANUP: removed the superseded commented-out merge implementation that
  # concatItems replaced.
end
2049
+
2050
+ # Internal function to concat two elements.
2051
+ # ===== Parameters
2052
+ # +itemA+:: item to be concatenated
2053
+ # +itemB+:: item to be concatenated
2054
+ # ===== Returns
2055
+ # Concatenated objects
2056
+ def concatItems(itemA,itemB)
2057
+ # A is Array :: RETURN ARRAY
2058
+ # A_array : B_array
2059
+ # A_array : B_hash => NOT ALLOWED
2060
+ # A_array : B_single => NOT ALLOWED
2061
+ # A is Hash :: RETURN HASH
2062
+ # A_hash : B_array => NOT ALLOWED
2063
+ # A_hash : B_hash
2064
+ # A_hash : B_single => NOT ALLOWED
2065
+ # A is single element => RETURN ARRAY
2066
+ # A_single : B_array
2067
+ # A_single : B_hash => NOT ALLOWED
2068
+ # A_single : B_single
2069
+ concatenated = nil
2070
+ if itemA.kind_of?(Array) && itemB.kind_of?(Array)
2071
+ concatenated = (itemA + itemB).uniq
2072
+ elsif itemA.kind_of?(Hash) && itemB.kind_of?(Hash)
2073
+ concatenated = itemA.merge(itemB) do |k, oldV, newV|
2074
+ self.concatItems(oldV,newV)
2075
+ end
2076
+ elsif itemB.kind_of?(Array)
2077
+ concatenated = ([itemA] + itemB).uniq
2078
+ elsif ![Array, Hash].include?(itemB.class)
2079
+ concatenated = [itemA,itemB].uniq
2080
+ end
2081
+ return concatenated
2082
+ end
2083
+
2084
+
2085
+ # Assign a dictionary already calculated as a items set.
2086
+ # ===== Parameters
2087
+ # +dictID+:: dictionary ID to be stored (:byTerm will be used)
2088
+ def set_items_from_dict(dictID, remove_old_relations = false)
2089
+ @items = {} if remove_old_relations
2090
+ if !@dicts[dictID].nil?
2091
+ @items.merge(@dicts[dictID][:byTerm])
2092
+ else
2093
+ warn('Specified ID is not calculated. Dict will not be added as a items set')
2094
+ end
2095
+ end
2096
+
2097
+
2098
  # This method computes childs similarity and imputes items to their parentals. Item keys must be allowed terms of this ontology.
  # Similarity will be calculated by exact text match unless an ontology object is provided; in that case, MICAs will be used.
  # ===== Parameters
  # +ontology+:: (Optional) ontology object which given items belong to
  # +minimum_childs+:: minimum of childs needed to infer relations to parental. Default: 2
  # +clean_profiles+:: if true, clean_profile ontology method will be used over inferred profiles. Only if an ontology object is provided
  # ===== Returns
  # void and update items object
  def expand_items_to_parentals(ontology: nil, minimum_childs: 2, clean_profiles: true)
    # Check item keys: only keys that exist as terms in this ontology are expandable
    if @items.empty?
      warn('Items have been not provided yet')
      return nil
    end
    targetKeys = @items.keys.select{|k| self.exists?(k)}
    if targetKeys.length == 0
      warn('Any item key is allowed')
      return nil
    elsif targetKeys.length < @items.keys.length
      warn('Some item keys are not allowed')
    end

    # Expand to parentals: add every ancestor of the valid keys to the working set
    targetKeys << targetKeys.map{|t| self.get_ancestors(t, true)}
    targetKeys.flatten!
    targetKeys.uniq!

    # Obtain levels (go from leaves to roots); deepest level first after reverse
    levels = targetKeys.map{|term| self.get_term_level(term)}
    levels.compact!
    levels.uniq!
    levels.sort!
    levels.reverse!
    levels.shift # Leaves are not expandable

    # Expand from leaves to roots, one level at a time
    levels.map do |lvl|
      curr_keys = targetKeys.select{|k| self.get_term_level(k) == lvl}
      curr_keys.map do |term_expand|
        to_infer = []
        # Obtain childs that already carry items
        childs = self.get_descendants(term_expand,true).select{|t| !@items[t].nil?}
        # Expand
        if childs.length > 0 && minimum_childs == 1 # Special case: any single child's items are inherited directly
          to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
        elsif childs.length >= minimum_childs
          to_infer = Hash.new(0) # vote counter: item => support accumulated over child pairs
          # Compare every pair of children exactly once (shift-and-scan)
          while childs.length > 1
            curr_term = childs.shift
            childs.each do |compare_term|
              pivot_items = @items[curr_term]
              compare_items = @items[compare_term]
              if ontology.nil? # Exact match: a shared item scores 2 (one per side)
                pivot_items.map do |pitem|
                  if compare_items.include?(pitem)
                    to_infer[pitem] += 2
                  end
                end
              else # Find MICAs: vote for the best common ancestor in both directions
                local_infer = Hash.new(0)
                pivot_items.map do |pitem|
                  micas = compare_items.map{|citem| ontology.get_MICA(pitem, citem)}
                  maxmica = micas[0]
                  micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
                  local_infer[maxmica.first] += 1
                end
                compare_items.map do |citem|
                  micas = pivot_items.map{|pitem| ontology.get_MICA(pitem, citem)}
                  maxmica = micas[0]
                  micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
                  local_infer[maxmica.first] += 1
                end
                # Only MICAs supported by both directions (freq >= 2) contribute votes
                local_infer.each{|t,freq| to_infer[t] += freq if freq >= 2}
              end
            end
          end
          # Filter infer: keep items supported by at least minimum_childs votes
          to_infer = to_infer.select{|k,v| v >= minimum_childs}
        end
        # Infer: attach the surviving items to the parental term
        if to_infer.length > 0
          @items[term_expand] = [] if @items[term_expand].nil?
          if to_infer.kind_of?(Array) # minimum_childs == 1 path yields an Array, voting path a Hash
            @items[term_expand] = (@items[term_expand] + to_infer).uniq
          else
            @items[term_expand] = (@items[term_expand] + to_infer.keys).uniq
          end
          @items[term_expand] = ontology.clean_profile(@items[term_expand]) if clean_profiles && !ontology.nil?
        elsif !@items.include?(term_expand)
          # Nothing inferred and no own items: drop from working set so upper levels ignore it
          targetKeys.delete(term_expand)
        end
      end
    end
  end
2193
+
2194
+
2195
+ # Return direct ancestors/descendants of a given term
2196
+ # ===== Parameters
2197
+ # +term+:: which are requested
2198
+ # +relation+:: can be :ancestor or :descendant
2199
+ # +remove_alternatives+:: if true, alternatives will be removed
2200
+ # ===== Returns
2201
+ # Direct ancestors/descendants of given term or nil if any error occurs
2202
+ def get_direct_related(term, relation, remove_alternatives: false)
2203
+ if @dicts[:is_a].nil?
2204
+ warn("Hierarchy dictionary is not already calculated. Returning nil")
2205
+ return nil
2206
+ end
2207
+ target = nil
2208
+ case relation
2209
+ when :ancestor
2210
+ target = :byTerm
2211
+ when :descendant
2212
+ target = :byValue
2213
+ else
2214
+ warn('Relation type not allowed. Returning nil')
2215
+ end
2216
+ return nil if target.nil?
2217
+ query = @dicts[:is_a][target][term]
2218
+ return query if query.nil?
2219
+ query, _ = remove_alternatives_from_profile(query) if remove_alternatives
2220
+ return query
2221
+ end
2222
+
2223
+
2224
+ # Return direct ancestors of a given term
2225
+ # ===== Parameters
2226
+ # +term+:: which ancestors are requested
2227
+ # +remove_alternatives+:: if true, alternatives will be removed
2228
+ # ===== Returns
2229
+ # Direct ancestors of given term or nil if any error occurs
2230
+ def get_direct_ancentors(term, remove_alternatives: false)
2231
+ return self.get_direct_related(term, :ancestor, remove_alternatives: remove_alternatives)
2232
+ end
2233
+
2234
+ # Return direct descendants of a given term
2235
+ # ===== Parameters
2236
+ # +term+:: which descendants are requested
2237
+ # +remove_alternatives+:: if true, alternatives will be removed
2238
+ # ===== Returns
2239
+ # Direct descendants of given term or nil if any error occurs
2240
+ def get_direct_descendants(term, remove_alternatives: false)
2241
+ return self.get_direct_related(term, :descendant, remove_alternatives: remove_alternatives)
2242
+ end
2243
+
2244
+ def each(att = false)
2245
+ @stanzas[:terms].each do |id, tags|
2246
+ next if @alternatives_index.include?(id)
2247
+ if att
2248
+ yield(id, tags)
2249
+ else
2250
+ yield(id)
2251
+ end
2252
+ end
2253
+ end
2254
+
2255
+ def list_term_attributes
2256
+ terms = []
2257
+ each do |code|
2258
+ terms << [code, translate_id(code), get_term_level(code)]
2259
+ end
2260
+ return terms
2261
+ end
2262
+
2263
+ #============================================================================
2264
+ #============================================================================
2265
+
2266
+ # NO IDEA WHAT THIS DOES. DON'T USE THIS METHODS IS NOT CHECKED
2267
+ # ===== Parameters
2268
+ # ++::
2269
+ # ===== Returns
2270
+ # ...
2271
+ def compute_relations_to_items(external_item_list, total_items, mode, thresold)
2272
+ terms_levels = list_terms_per_level_from_items
2273
+ #puts terms_levels.inspect.yellow
2274
+ connect_familiars!(terms_levels)
2275
+ #puts terms_levels.inspect.blue
2276
+ item_list_with_transf_parental = get_item_list_parental(terms_levels)
2277
+ results = []
2278
+ if mode == :elim
2279
+ results = compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list_with_transf_parental)
2280
+ elsif mode == :weight
2281
+ results = compute_relations_weight(terms_levels, external_item_list, total_items, item_list_with_transf_parental)
2282
+ end
2283
+ return results
2284
+ end
2285
+
2286
  # Transfer stored items upwards through the hierarchy: terms are processed from
  # the deepest level up, merging each term's items (plus anything already
  # transferred to it) into one chosen parent.
  # ===== Parameters
  # +terms_levels+:: hash of level (Integer) => array of terms at that level
  # ===== Returns
  # hash of term => merged item list (terms that end up with no items are absent)
  def get_item_list_parental(terms_levels)
    transfered_list = {}
    parent_dict = @dicts[:is_a][:byTerm]
    levels = terms_levels.keys.sort
    while levels.length > 1
      level = levels.pop # deepest remaining level first
      terms_levels[level].each do |term|
        parents = parent_dict[term]
        if parents.nil?
          next
        elsif parents.length == 1
          parent = parents.first
        else
          # NOTE(review): `|` (union) keeps left-hand order, so this always picks
          # parents.first; an intersection (`&`) with the parental level may have
          # been intended — confirm against callers before changing.
          parent = (parents | terms_levels[level - 1]).first
        end
        term_it = @items[term]
        parent_it = @items[parent]
        curr_it = transfered_list[term] # items already transferred into this term
        parent_all_items = merge_groups([term_it, parent_it, curr_it])
        transfered_list[parent] = parent_all_items if !parent_all_items.empty?
        term_all_items = merge_groups([term_it, curr_it])
        transfered_list[term] = term_all_items if !term_all_items.empty?
      end
    end
    terms_levels[levels.first].each do |term| # Rescue lower level terms that not have children so they cannot receive items
      transfered_list[term] = @items[term] if transfered_list[term].nil?
    end
    return transfered_list
  end
2315
+
2316
+ def merge_groups(groups)
2317
+ return groups.compact.inject([]){|it, a| it | a}
2318
+ end
2319
+
2320
+ def list_terms_per_level_from_items
2321
+ terms_levels = {}
2322
+ @items.each do |term, items|
2323
+ level = self.get_term_level(term)
2324
+ query = terms_levels[level]
2325
+ if query.nil?
2326
+ terms_levels[level] = [term]
2327
+ else
2328
+ query << term
2329
+ end
2330
+ end
2331
+ return terms_levels
2332
+ end
2333
+
2334
  # Ensure every term has a direct representative in the immediately upper level:
  # walking from the deepest level up, the first parental on each term's shortest
  # path is inserted into the parental level, creating that level if missing.
  # Requires @term_paths to be precomputed.
  # ===== Parameters
  # +terms_levels+:: hash of level => array of terms; MUTATED in place
  # ===== Returns
  # nil; the given hash is modified
  def connect_familiars!(terms_levels)
    levels = terms_levels.keys.sort
    while levels.length > 1 # Process when current level has a parental level
      level = levels.pop
      parental_level = level - 1
      parental_terms = terms_levels[parental_level]
      if parental_terms.nil? # The queried parent level not exists but there is a parental level above of the non existant
        parental_terms = [] # Initialize required parental level
        terms_levels[parental_level] = parental_terms
        levels << parental_level # re-enqueue the new level so it is processed too
      end
      terms_levels[level].each do |term|
        path_info = @term_paths[term]
        shortest_path_length = path_info[:shortest_path]
        path = path_info[:paths].select{|p| p.length == shortest_path_length}.first
        parental = path[1] # the first elements is the term itself
        parental_terms << parental if !parental_terms.include?(parental)
      end
    end
  end
2354
+
2355
  # "elim"-style enrichment: terms are tested from the deepest level upwards and,
  # when a term is significant (pval <= thresold), its items are excluded from
  # the Fisher tests of all of its ancestors.
  # ===== Parameters
  # +terms_levels+:: hash of level => terms at that level
  # +external_item_list+:: items to test against each term's associated items
  # +total_items+:: universe size for the Fisher exact test
  # +thresold+:: p-value significance cutoff
  # +item_list+:: hash of term => associated items (e.g. from get_item_list_parental)
  # ===== Returns
  # array of [term, pval] pairs
  def compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list)
    results = []
    penalized_terms = {} # term => items to exclude, filled by significant descendants
    levels = terms_levels.keys.sort
    levels.reverse_each do |level| # leaves first
      terms_levels[level].each do |term|
        associated_items = item_list[term]
        items_to_remove = penalized_terms[term]
        items_to_remove = [] if items_to_remove.nil?
        pval = get_fisher_exact_test(
          external_item_list - items_to_remove,
          associated_items - items_to_remove,
          #((associated_items | external_item_list) - items_to_remove).length
          total_items
        )
        if pval <= thresold
          parents = get_ancestors(term) # Save the items for each parent term to remove them later in the fisher test
          parents.each do |prnt|
            query = penalized_terms[prnt]
            if query.nil?
              penalized_terms[prnt] = item_list[term].clone # We need a new array to store the following iterations
            else
              query.concat(item_list[term])
            end
          end
        end
        results << [term, pval]
      end
    end
    return results
  end
2386
+
2387
  # "weight"-style enrichment: terms are tested from the deepest level upwards;
  # item weights are shared between a term and its children through computeTermSig.
  # ===== Parameters
  # +terms_levels+:: hash of level => terms at that level
  # +external_item_list+:: items to test against each term's weighted items
  # +total_items+:: universe size for the Fisher exact test
  # +item_list+:: hash of term => associated items
  # ===== Returns
  # array of [term, pval] pairs (pvals hash converted to array)
  def compute_relations_weight(terms_levels, external_item_list, total_items, item_list)
    pvals = {}
    # CAUTION: the default block returns a fresh Hash.new(1) on read but does NOT
    # store it back; entries only persist when assigned explicitly (see
    # add_items_to_weigthed_list). This is deliberate — see the linked article.
    item_weigths_per_term = Hash.new { |hash, key| Hash.new(1) } #https://mensfeld.pl/2016/09/ruby-hash-default-value-be-cautious-when-you-use-it/
    levels = terms_levels.keys.sort
    levels.reverse_each do |level| # leaves first
      terms_levels[level].each do |term|
        associated_items = item_list[term]
        #initialize observed items in item_weigths_per_term list
        add_items_to_weigthed_list(term, associated_items, item_weigths_per_term) if !associated_items.nil?
        children = @dicts[:is_a][:byValue][term]
        if children.nil?
          children = []
        else
          children = children.select{|ch| item_weigths_per_term[ch].length > 0} # Only use children with items associated to them OR transfered to them
        end
        computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
      end
    end
    return pvals.to_a
  end
2407
+
2408
+ def add_items_to_weigthed_list(term, associated_items, weigthed_list)
2409
+ term_weigthing = weigthed_list[term]
2410
+ associated_items.each{|ai| term_weigthing[ai] = 1}
2411
+ weigthed_list[term] = term_weigthing
2412
+ end
2413
+
2414
  # Compute a term's weighted Fisher significance and propagate weights between
  # the term and its children (weight algorithm). Recurses until every child has
  # been rated against the parent.
  # ===== Parameters
  # +term+:: term under test
  # +children+:: children of +term+ that carry item weights
  # +external_item_list+:: items to test against
  # +total_items+:: universe size for the Fisher exact test
  # +pvals+:: hash of term => pval; MUTATED in place
  # +item_weigths_per_term+:: hash of term => {item => weight}; MUTATED in place
  #puts term.to_s.red
  #puts @term_paths[term].inspect
  #puts @dicts[:is_a][:byValue][term].inspect.light_blue
  def computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
    associated_items = item_weigths_per_term[term].keys
    pval = get_fisher_exact_test(external_item_list, associated_items, total_items,
      'two_sided', item_weigths_per_term[term], true)
    pvals[term] = pval
    if children.length > 0
      rates = {}
      sig_child = 0
      children.each do |child|
        # ratio >= 1 means the child is at least as significant as the parent
        ratio = sigRatio(pvals[child], pval)
        rates[child] = ratio
        sig_child += 1 if ratio >= 1
      end
      if sig_child == 0 # CASE 1: parent wins — downweight all children and retest them
        children.each do |child|
          current_ratio = rates[child]
          query_child = item_weigths_per_term[child]
          query_child.transform_values!{|weight| weight * current_ratio}
          pvals[child] = get_fisher_exact_test(external_item_list, item_weigths_per_term[child].keys, total_items,
            'two_sided', item_weigths_per_term[child], true)
        end
      else
        # NOTE: `filter_alternatives = true` is just a local assignment used as an
        # argument; it simply passes true to get_ancestors
        ancs = get_ancestors(term, filter_alternatives = true)
        ancs << term
        rates.each do |ch, ratio|# CASE 2
          if ratio >= 1 # The child is better than parent: downweight its items in the whole ancestry
            ancs.each do |anc|
              query_anc = item_weigths_per_term[anc]
              associated_items.each do |item|
                query_anc[item] /= ratio # /= --> query_anc[item]/ratio
              end
            end
          end
        end
        # Recurse with the already-rated children removed (terminates when empty)
        computeTermSig(term, children - rates.keys, external_item_list, total_items, pvals, item_weigths_per_term)
      end
    end
  end
2455
+
2456
+ def sigRatio(pvalA, pvalB)
2457
+ return Math.log(pvalA)/Math.log(pvalB)
2458
+ end
2459
+
2460
+ def profile_stats
2461
+ stats = Hash.new(0)
2462
+ data = @profiles.values.map{|ont_ids| ont_ids.size}
2463
+ stats[:average] = data.sum().fdiv(data.size)
2464
+ sum_devs = data.sum{|element| (element - stats[:avg]) ** 2}
2465
+ stats[:variance] = sum_devs.fdiv(data.size)
2466
+ stats[:standardDeviation] = stats[:variance] ** 0.5
2467
+ stats[:max] = data.max
2468
+ stats[:min] = data.min
2469
+
2470
+ stats[:count] = data.size
2471
+ data.each do |value|
2472
+ stats[:countNonZero] += 1 if value != 0
2473
+ end
2474
+
2475
+ stats[:q1] = data.get_quantiles(0.25)
2476
+ stats[:median] = data.get_quantiles(0.5)
2477
+ stats[:q3] = data.get_quantiles(0.75)
2478
+ return stats
2479
+
2480
+ end
2481
+
2482
+ #============================================================================
2483
+ #============================================================================
2484
+
2485
+ # Check if a given ID is a removable (blacklist) term.
2486
+ # +DEPRECATED+ use is_removable? instead
2487
+ # ===== Parameters
2488
+ # +id+:: to be checked
2489
+ # ===== Returns
2490
+ # true if given term is a removable (blacklist) term or false in other cases
2491
+ def is_removable(id)
2492
+ warn "[DEPRECATION] `is_removable` is deprecated. Please use `is_removable?` instead."
2493
+ return @removable_terms.include?(id.to_sym)
2494
+ end
2495
+
2496
+ # Check if a given ID is a removable (blacklist) term
2497
+ # ===== Parameters
2498
+ # +id+:: to be checked
2499
+ # ===== Returns
2500
+ # true if given term is a removable (blacklist) term or false in other cases
2501
+ def is_removable? id
2502
+ return @removable_terms.include?(id.to_sym)
2503
+ end
2504
+
2505
+ ############################################
2506
+ # SPECIAL METHODS
2507
+ #############################################
2508
+ def ==(other)
2509
+ self.header == other.header &&
2510
+ self.stanzas == other.stanzas &&
2511
+ self.ancestors_index == other.ancestors_index &&
2512
+ self.alternatives_index == other.alternatives_index &&
2513
+ self.obsoletes_index == other.obsoletes_index &&
2514
+ self.structureType == other.structureType &&
2515
+ self.ics == other.ics &&
2516
+ self.meta == other.meta &&
2517
+ self.dicts == other.dicts &&
2518
+ self.profiles == other.profiles &&
2519
+ self.profilesDict == other.profilesDict &&
2520
+ (self.items.keys - other.items.keys).empty? &&
2521
+ self.removable_terms == other.removable_terms &&
2522
+ self.special_tags == other.special_tags &&
2523
+ self.items == other.items &&
2524
+ self.term_paths == other.term_paths &&
2525
+ self.max_freqs == other.max_freqs
2008
2526
  end
2009
2527
 
2010
2528
 
2011
2529
  def clone
2012
- copy = Ontology.new
2013
- copy.header = self.header.clone
2014
- copy.stanzas[:terms] = self.stanzas[:terms].clone
2015
- copy.stanzas[:typedefs] = self.stanzas[:typedefs].clone
2016
- copy.stanzas[:instances] = self.stanzas[:instances].clone
2017
- copy.ancestors_index = self.ancestors_index.clone
2018
- copy.descendants_index = self.descendants_index.clone
2019
- copy.alternatives_index = self.alternatives_index.clone
2020
- copy.obsoletes_index = self.obsoletes_index.clone
2021
- copy.structureType = self.structureType.clone
2022
- copy.ics = self.ics.clone
2023
- copy.meta = self.meta.clone
2024
- copy.dicts = self.dicts.clone
2025
- copy.profiles = self.profiles.clone
2026
- copy.profilesDict = self.profilesDict.clone
2027
- copy.items = self.items.clone
2028
- copy.removable_terms = self.removable_terms.clone
2029
- copy.term_paths = self.term_paths.clone
2030
- copy.max_freqs = self.max_freqs.clone
2031
- return copy
2032
- end
2033
-
2034
-
2035
- #############################################
2036
- # ACCESS CONTROL
2037
- #############################################
2038
-
2039
- attr_reader :file, :header, :stanzas, :ancestors_index, :descendants_index, :special_tags, :alternatives_index, :obsoletes_index, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :profilesDict, :items, :removable_terms, :term_paths
2040
- attr_writer :file, :header, :stanzas, :ancestors_index, :descendants_index, :special_tags, :alternatives_index, :obsoletes_index, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :profilesDict, :items, :removable_terms, :term_paths
2530
+ copy = Ontology.new
2531
+ copy.header = self.header.clone
2532
+ copy.stanzas[:terms] = self.stanzas[:terms].clone
2533
+ copy.stanzas[:typedefs] = self.stanzas[:typedefs].clone
2534
+ copy.stanzas[:instances] = self.stanzas[:instances].clone
2535
+ copy.ancestors_index = self.ancestors_index.clone
2536
+ copy.descendants_index = self.descendants_index.clone
2537
+ copy.alternatives_index = self.alternatives_index.clone
2538
+ copy.obsoletes_index = self.obsoletes_index.clone
2539
+ copy.structureType = self.structureType.clone
2540
+ copy.ics = self.ics.clone
2541
+ copy.meta = self.meta.clone
2542
+ copy.dicts = self.dicts.clone
2543
+ copy.profiles = self.profiles.clone
2544
+ copy.profilesDict = self.profilesDict.clone
2545
+ copy.items = self.items.clone
2546
+ copy.removable_terms = self.removable_terms.clone
2547
+ copy.term_paths = self.term_paths.clone
2548
+ copy.max_freqs = self.max_freqs.clone
2549
+ return copy
2550
+ end
2551
+
2552
+
2553
+ #############################################
2554
+ # ACCESS CONTROL
2555
+ #############################################
2556
+
2557
+ attr_reader :file, :header, :stanzas, :ancestors_index, :descendants_index, :special_tags, :alternatives_index, :obsoletes_index, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :profilesDict, :items, :removable_terms, :term_paths
2558
+ attr_writer :file, :header, :stanzas, :ancestors_index, :descendants_index, :special_tags, :alternatives_index, :obsoletes_index, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :profilesDict, :items, :removable_terms, :term_paths
2041
2559
  end