semtools 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8fc29918a31045893647355dd72264a04386c5171c48ea868f7e9bbc93062151
4
- data.tar.gz: 692ce02343cb00ac37bbc34476da08386bedf0eaca7946689eb62c9a1f06d555
3
+ metadata.gz: e68630d42a4faf01dc15fdfa9f1acd64425ef1396ed6f9ce0a8d76319922ba06
4
+ data.tar.gz: 952d908af5370031df0f19c98ab69fbb59b51825f050b69714f4494e15f77f77
5
5
  SHA512:
6
- metadata.gz: 1b52667c81a0a25786b91156e9ed88a8de47e86fd18baddffc43b05ff199f95129b09da4e03025b6fb709d18a0274e22bf4a55c81471fda748e75aadca4d6ef1
7
- data.tar.gz: 46e5b49f611c021ee8576a522a0a6ef22a8b9ed349084dadb9e44fd76c712c05221e6314985f08bdba575ac2dd849f1f14d84d5ae686889f33fac993132a8372
6
+ metadata.gz: 85792433d82f824297df87cb0927b24116425ddb2a72a3e2f461748e014aa27f4efc8f73fcd7d1e6c423acd7487b77d21c2a8c0b7b0f8530030f6246ad62ad64
7
+ data.tar.gz: 2d0e0953f19d8c2cad2cc85a0c6d8c1cb9bf95f4dd1ee2d75aebcf15bdd3929d2938ede6544ed3f145ac5a8804b97af64f50a859ae7ecf8164f0ed4f07208fb2
data/bin/onto2json.rb CHANGED
@@ -18,14 +18,20 @@ OptionParser.new do |opts|
18
18
  opts.banner = "Usage: #{__FILE__} [options]"
19
19
 
20
20
  options[:input_file] = nil
21
- opts.on("-i", "--input_file PATH", "Input file with ontology in OBO format") do |data|
21
+ opts.on("-i", "--input_file FILE", "Input file with ontology in OBO format") do |data|
22
22
  options[:input_file] = data
23
23
  end
24
24
 
25
25
  options[:output_file] = nil
26
- opts.on("-o", "--output_file PATH", "Output path") do |data|
26
+ opts.on("-o", "--output_file FILE", "Output path") do |data|
27
27
  options[:output_file] = data
28
28
  end
29
+
30
+ options[:build] = false
31
+ opts.on("-b", "--build", "Activate build mode (calculate dictionaries)") do
32
+ options[:build] = true
33
+ end
34
+
29
35
 
30
36
  opts.on_tail("-h", "--help", "Show this message") do
31
37
  puts opts
@@ -39,7 +45,7 @@ end.parse!
39
45
  # MAIN
40
46
  ##########################
41
47
  puts "Loading ontology ..."
42
- onto = Ontology.new(file: options[:input_file], load_file: true)
48
+ onto = Ontology.new(file: options[:input_file], load_file: true, build: options[:build])
43
49
  puts "Exporting ontology to JSON ..."
44
50
  onto.write(options[:output_file])
45
51
  puts "Ontology exported"
@@ -2,2040 +2,2051 @@ require 'json'
2
2
 
3
3
 
4
4
  class Ontology
5
- #########################################################
6
- # AUTHOR NOTES
7
- #########################################################
8
-
9
- # 1 - Store @profiles as @stanzas[:instances]
10
- # 2 - Items values (not keys) are imported as strings, not as symbols (maybe add a flag which indicates if values are, or not, symbols?)
11
-
12
-
13
- #############################################
14
- # FIELDS
15
- #############################################
16
- # Handled class variables
17
- # => @@basic_tags :: hash with main OBO structure tags
18
- # => @@allowed_calcs :: hash with allowed ICs and similaritites calcs
19
- # => @@symbolizable_ids :: tags which can be symbolized
20
- # => @@tags_with_trailing_modifiers :: tags which can include extra info after specific text modifiers
21
- #
22
- # Handled object variables
23
- # => @header :: file header (if is available)
24
- # => @stanzas :: OBO stanzas {:terms,:typedefs,:instances}
25
- # => @ancestors_index :: hash of ancestors per each term handled with any structure relationships
26
- # => @descendants_index :: hash of descendants per each term handled with any structure relationships
27
- # => @alternatives_index :: has of alternative IDs (include alt_id and obsoletes)
28
- # => @obsoletes_index :: hash of obsoletes and it's new ids
29
- # => @special_tags :: set of special tags to be expanded (:is_a, :obsolete, :alt_id)
30
- # => @structureType :: type of ontology structure depending on ancestors relationship. Allowed: {atomic, sparse, circular, hierarchical}
31
- # => @ics :: already calculated ICs for handled terms and IC types
32
- # => @meta :: meta_information about handled terms like [ancestors, descendants, struct_freq, observed_freq]
33
- # => @max_freqs :: maximum freqs found for structural and observed freqs
34
- # => @dicts :: bidirectional dictionaries with three levels <key|value>: 1º) <tag|hash2>; 2º) <(:byTerm/:byValue)|hash3>; 3º) dictionary <k|v>
35
- # => @profiles :: set of terms assigned to an ID
36
- # => @profilesDict :: set of profile IDs assigned to a term
37
- # => @items :: hash with items relations to terms
38
- # => @removable_terms :: array of terms to not be considered
39
- # => @term_paths :: metainfo about parental paths of each term
40
-
41
- @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:alt_id,:replaced_by,:consider]}
42
- @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
43
- @@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider]
44
- @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
45
- @@multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider, :subsetdef, :synonymtypedef, :property_value, :remark]
46
- @@symbolizable_ids.concat(@@tags_with_trailing_modifiers)
47
-
48
- #############################################
49
- # CONSTRUCTOR
50
- #############################################
51
-
52
- # Instantiate a OBO_Handler object
53
- # ===== Parameters
54
- # +file+:: with info to be loaded (.obo ; .json)
55
- # +load_file+:: activate load process automatically (only for .obo)
56
- # +removable_terms+: term to be removed from calcs
57
- def initialize(file: nil, load_file: false, removable_terms: [])
58
- # Initialize object variables
59
- @header = nil
60
- @stanzas = {terms: {}, typedefs: {}, instances: {}}
61
- @ancestors_index = {}
62
- @descendants_index = {}
63
- @alternatives_index = {}
64
- @obsoletes_index = {}
65
- @structureType = nil
66
- @ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
67
- @meta = {}
68
- @special_tags = @@basic_tags.clone
69
- @max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
70
- @dicts = {}
71
- @profiles = {}
72
- @profilesDict = {}
73
- @items = {}
74
- @removable_terms = []
75
- @term_paths = {}
76
- # Load if proceeds
77
- add_removable_terms(removable_terms) if !removable_terms.empty?
78
- load(file) if load_file
79
- end
80
-
81
-
82
- #############################################
83
- # CLASS METHODS
84
- #############################################
85
-
86
- # Expand a (starting) term using a specific tag and return all extended terms into an array and
87
- # the relationship structuture observed (hierarchical or circular). If circular structure is
88
- # foumd, extended array will be an unique vector without starting term (no loops).
89
- # +Note+: we extremly recomend use get_related_ids_by_tag function instead of it (directly)
90
- # ===== Parameters
91
- # +start+:: term where start to expand
92
- # +terms+:: set to be used to expand
93
- # +target_tag+:: tag used to expand
94
- # +eexpansion+:: already expanded info
95
- # +split_info_char+:: special regex used to split info (if it is necessary)
96
- # +split_info_indx+:: special index to take splitted info (if it is necessary)
97
- # +alt_ids+:: set of alternative IDs
98
- # ===== Returns
99
- # A vector with the observed structure (string) and the array with extended terms.
100
- def self.get_related_ids(start_id, terms, target_tag, related_ids = {}, alt_ids = {})
101
- # Take start_id term available info and already accumulated info
102
- current_associations = related_ids[start_id]
103
- current_associations = [] if current_associations.nil?
104
- return [:no_term,[]] if terms[start_id].nil?
105
- id_relations = terms[start_id][target_tag]
106
- return [:source,[]] if id_relations.nil?
107
-
108
- # Prepare auxiliar variables
109
- struct = :hierarchical
110
-
111
- # Study direct extensions
112
- id_relations = id_relations.clone
113
- while id_relations.length > 0
114
- id = id_relations.shift
115
- id = alt_ids[id].first if alt_ids.include?(id) # NOTE: if you want to persist current ID instead source ID, re-implement this
116
-
117
- # Handle
118
- if current_associations.include?(id) # Check if already have been included into this expansion
119
- struct = :circular
120
- else
121
- current_associations << id
122
- if related_ids.include?(id) # Check if current already has been expanded
123
- current_associations = current_associations | related_ids[id]
124
- if current_associations.include?(start_id) # Check circular case
125
- struct = :circular
126
- [id, start_id].each{|repeated| current_associations.delete(repeated)}
127
- end
128
- else # Expand
129
- related_ids[start_id] = current_associations
130
- structExp, current_related_ids = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids) # Expand current
131
- current_associations = current_associations | current_related_ids
132
- struct = :circular if structExp == :circular # Check struct
133
- if current_associations.include?(start_id) # Check circular case
134
- struct = :circular
135
- current_associations.delete(start_id)
136
- end
137
- end
138
- end
139
- end
140
- related_ids[start_id] = current_associations
141
-
142
- return struct, current_associations
143
- end
144
-
145
-
146
- # Expand terms using a specific tag and return all extended terms into an array and
147
- # the relationship structuture observed (hierarchical or circular). If circular structure is
148
- # foumd, extended array will be an unique vector without starting term (no loops)
149
- # ===== Parameters
150
- # +terms+:: set to be used to expand
151
- # +target_tag+:: tag used to expand
152
- # +split_info_char+:: special regex used to split info (if it is necessary)
153
- # +split_info_indx+:: special index to take splitted info (if it is necessary)
154
- # +alt_ids+:: set of alternative IDs
155
- # +obsoletes+:: integer with the number of obsolete IDs. used to calculate structure type.
156
- # ===== Returns
157
- # A vector with the observed structure (string) and the hash with extended terms
158
- def self.get_related_ids_by_tag(terms:,target_tag:, alt_ids: {}, obsoletes: 0)
159
- # Define structure type
160
- structType = :hierarchical
161
- related_ids = {}
162
- terms.each do |id, tags|
163
- # Check if target tag is defined
164
- if !tags[target_tag].nil?
165
- # Obtain related terms
166
- set_structure, _ = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids)
167
- # Check structure
168
- structType = :circular if set_structure == :circular
169
- end
170
- end
171
-
172
- # Check special case
173
- structType = :atomic if related_ids.length <= 0
174
- structType = :sparse if related_ids.length > 0 && ((terms.length - related_ids.length - obsoletes) >= 2)
175
- # Return type and hash with related_ids
176
- return structType, related_ids
177
- end
178
-
179
-
180
- # Class method to transform string with <tag : info> into hash structure
181
- # ===== Parameters
182
- # +attributes+:: array tuples with info to be transformed into hash format
183
- # ===== Returns
184
- # Attributes stored into hash structure
185
- def self.info2hash(attributes, split_char = " ! ", selected_field = 0)
186
- # Load info
187
- info_hash = {}
188
- # Only TERMS multivalue tags (future add Typedefs and Instance)
189
- # multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider]
190
- attributes.each do |tag, value|
191
- # Check
192
- raise EncodingError, 'Info element incorrect format' if (tag.nil?) || (value.nil?)
193
- # Prepare
194
- tag = tag.lstrip.to_sym
195
- value.lstrip!
196
- value = value.split(split_char)[selected_field].to_sym if @@tags_with_trailing_modifiers.include?(tag)
197
-
198
- # Store
199
- query = info_hash[tag]
200
- if !query.nil? # Tag already exists
201
- if !query.kind_of?(Array) # Check that tag is multivalue
202
- raise('Attempt to concatenate plain text with another. The tag is not declared as multivalue. [' + tag.to_s + '](' + query + ')')
203
- else
204
- query << value # Add new value to tag
205
- end
206
- else # New entry
207
- if @@multivalue_tags.include?(tag)
208
- info_hash[tag] = [value]
209
- else
210
- info_hash[tag] = value
211
- end
212
- end
213
- end
214
- self.symbolize_ids(info_hash)
215
- return info_hash
216
- end
217
-
218
-
219
- # Class method to load an OBO format file (based on OBO 1.4 format). Specially focused on load
220
- # the Header, the Terms, the Typedefs and the Instances.
221
- # ===== Parameters
222
- # +file+:: OBO file to be loaded
223
- # ===== Returns
224
- # Hash with FILE, HEADER and STANZAS info
225
- def self.load_obo(file) #TODO: Send to obo_parser class
226
- raise("File is not defined") if file.nil?
227
- # Data variables
228
- header = ''
229
- stanzas = {terms: {}, typedefs: {}, instances: {}}
230
- # Auxiliar variables
231
- infoType = 'Header'
232
- currInfo = []
233
- stanzas_flags = %w[[Term] [Typedef] [Instance]]
234
- # Read file
235
- File.open(file).each do |line|
236
- line.chomp!
237
- next if line.empty?
238
- fields = line.split(':', 2)
239
- # Check if new instance is found
240
- if stanzas_flags.include?(line)
241
- header = self.process_entity(header, infoType, stanzas, currInfo)
242
- # Update info variables
243
- currInfo = []
244
- infoType = line.gsub!(/[\[\]]/, '')
245
- next
246
- end
247
- # Concat info
248
- currInfo << fields
249
- end
250
- # Store last loaded info
251
- header = self.process_entity(header, infoType, stanzas, currInfo) if !currInfo.empty?
252
-
253
- # Prepare to return
254
- finfo = {:file => file, :name => File.basename(file, File.extname(file))}
255
- return finfo, header, stanzas
256
- end
257
-
258
-
259
- # Handle OBO loaded info and stores it into correct container and format
260
- # ===== Parameters
261
- # +header+:: container
262
- # +infoType+:: current ontology item type detected
263
- # +stanzas+:: container
264
- # +currInfo+:: info to be stored
265
- # ===== Returns
266
- # header newly/already stored
267
- def self.process_entity(header, infoType, stanzas, currInfo)
268
- info = self.info2hash(currInfo)
269
- # Store current info
270
- if infoType.eql?('Header')
271
- header = info
272
- else
273
- id = info[:id]
274
- case infoType
275
- when 'Term'
276
- stanzas[:terms][id] = info
277
- when 'Typedef'
278
- stanzas[:typedefs][id] = info
279
- when 'Instance'
280
- stanzas[:instances][id] = info
281
- end
282
- end
283
- return header
284
- end
285
-
286
-
287
- # Symboliza all values into hashs using symbolizable tags as keys
288
- # ===== Parameters
289
- # +item_hash+:: hash to be checked
290
- def self.symbolize_ids(item_hash)
291
- @@symbolizable_ids.each do |tag|
292
- query = item_hash[tag]
293
- if !query.nil?
294
- if query.kind_of?(Array)
295
- query.map!{|item| item.to_sym}
296
- else
297
- item_hash[tag] = query.to_sym if !query.nil?
298
- end
299
- end
300
- end
301
- end
302
-
303
-
304
- #
305
- # ===== Parameters
306
- # +root+:: main term to expand
307
- # +ontology+:: to be cutted
308
- # +clone+:: if true, given ontology object will not be mutated
309
- # +remove_up+:: if true, stores only the root term given an it descendants. If false, only root ancestors will be stored
310
- # ===== Returns
311
- # An Ontology object with terms after cut the ontology.
312
- def self.mutate(root, ontology, clone: true, remove_up: true)
313
- ontology = ontology.clone if clone
314
- # Obtain affected IDs
315
- descendants = ontology.descendants_index[root]
316
- descendants << root # Store itself to do not remove it
317
- # Remove unnecesary terms
318
- ontology.stanzas[:terms] = ontology.stanzas[:terms].select{|id,v| remove_up ? descendants.include?(id) : !descendants.include?(id)}
319
- ontology.ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
320
- ontology.max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
321
- ontology.dicts = {}
322
- ontology.removable_terms = []
323
- ontology.term_paths = {}
324
- # Recalculate metadata
325
- ontology.build_index
326
- ontology.add_observed_terms_from_profiles
327
- # Finish
328
- return ontology
329
- end
330
-
331
-
332
-
333
- #############################################
334
- # GENERAL METHODS
335
- #############################################
336
-
337
- # Include removable terms to current removable terms list
338
- # ===== Parameters
339
- # +terms+:: terms array to be concatenated
340
- def add_removable_terms(terms)
341
- terms = terms.map{|term| term.to_sym}
342
- @removable_terms.concat(terms)
343
- end
344
-
345
-
346
- # Include removable terms to current removable terms list loading new
347
- # terms from a one column plain text file
348
- # ===== Parameters
349
- # +file+:: to be loaded
350
- def add_removable_terms_from_file(file)
351
- File.open(excluded_codes_file).each do |line|
352
- line.chomp!
353
- @removable_terms << line.to_sym
354
- end
355
- end
356
-
357
-
358
- # Increase observed frequency for a specific term
359
- # ===== Parameters
360
- # +term+:: term which frequency is going to be increased
361
- # +increas+:: frequency rate to be increased. Default = 1
362
- # ===== Return
363
- # true if process ends without errors, false in other cases
364
- def add_observed_term(term:,increase: 1.0)
365
- # Check
366
- raise ArgumentError, "Term given is NIL" if term.nil?
367
- return false unless @stanzas[:terms].include?(term)
368
- return false if @removable_terms.include?(term)
369
- if @alternatives_index.include?(term)
370
- alt_id = @alternatives_index[term]
371
- @meta[alt_id] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[alt_id].nil?
372
- @meta[term] = @meta[alt_id]
373
- end
374
- # Check if exists
375
- @meta[term] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[term].nil?
376
- # Add frequency
377
- @meta[term][:observed_freq] = 0 if @meta[term][:observed_freq] == -1
378
- @meta[term][:observed_freq] += increase
379
- # Check maximum frequency
380
- @max_freqs[:observed_freq] = @meta[term][:observed_freq] if @max_freqs[:observed_freq] < @meta[term][:observed_freq]
381
- return true
382
- end
383
-
384
-
385
- # Increase the arbitrary frequency of a given term set
386
- # ===== Parameters
387
- # +terms+:: set of terms to be updated
388
- # +increase+:: amount to be increased
389
- # +transform_to_sym+:: if true, transform observed terms to symbols. Default: false
390
- # ===== Return
391
- # true if process ends without errors and false in other cases
392
- def add_observed_terms(terms:, increase: 1.0, transform_to_sym: false)
393
- # Check
394
- raise ArgumentError, 'Terms array given is NIL' if terms.nil?
395
- raise ArgumentError, 'Terms given is not an array' if !terms.is_a? Array
396
- # Add observations
397
- if transform_to_sym
398
- checks = terms.map{|id| self.add_observed_term(term: id.to_sym,increase: increase)}
399
- else
400
- checks = terms.map{|id| self.add_observed_term(term: id,increase: increase)}
401
- end
402
- return checks
403
- end
404
-
405
-
406
- # Compare to terms sets
407
- # ===== Parameters
408
- # +termsA+:: set to be compared
409
- # +termsB+:: set to be compared
410
- # +sim_type+:: similitude method to be used. Default: resnik
411
- # +ic_type+:: ic type to be used. Default: resnik
412
- # +bidirectional+:: calculate bidirectional similitude. Default: false
413
- # ===== Return
414
- # similitude calculated
415
- def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
416
- # Check
417
- raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
418
- raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
419
- micasA = []
420
- # Compare A -> B
421
- termsA.each do |tA|
422
- micas = termsB.map{|tB| self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)}
423
- # Remove special cases
424
- [false,nil].each do |err_value| micas.delete(err_value) end
425
- # Obtain maximum value
426
- micasA << micas.max if micas.length > 0
427
- micasA << 0 if micas.length <= 0
428
- end
429
- means_sim = micasA.inject{ |sum, el| sum + el }.to_f / micasA.size
430
- # Compare B -> A
431
- if bidirectional
432
- means_simA = means_sim * micasA.size
433
- means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false) * termsB.size
434
- means_sim = (means_simA + means_simB) / (termsA.size + termsB.size)
435
- end
436
- # Return
437
- return means_sim
438
- end
439
-
440
-
441
- # Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with itself
442
- # ===== Parameters
443
- # +external_profiles+:: set of external profiles. If nil, internal profiles will be compared with itself
444
- # +sim_type+:: similitude method to be used. Default: resnik
445
- # +ic_type+:: ic type to be used. Default: resnik
446
- # +bidirectional+:: calculate bidirectional similitude. Default: false
447
- # ===== Return
448
- # Similitudes calculated
449
- def compare_profiles(external_profiles: nil, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
450
- profiles_similarity = {} #calculate similarity between patients profile
451
- profiles_ids = @profiles.keys
452
- if external_profiles.nil?
453
- comp_ids = profiles_ids
454
- comp_profiles = @profiles
455
- main_ids = comp_ids
456
- main_profiles = comp_profiles
457
- else
458
- comp_ids = external_profiles.keys
459
- comp_profiles = external_profiles
460
- main_ids = profiles_ids
461
- main_profiles = @profiles
462
- end
463
- # Compare
464
- while !main_ids.empty?
465
- curr_id = main_ids.shift
466
- current_profile = main_profiles[curr_id]
467
- comp_ids.each do |id|
468
- profile = comp_profiles[id]
469
- value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional)
470
- query = profiles_similarity[curr_id]
471
- if query.nil?
472
- profiles_similarity[curr_id] = {id => value}
473
- else
474
- query[id] = value
475
- end
476
- end
477
- end
478
- return profiles_similarity
479
- end
480
-
481
-
482
- # Expand alternative IDs arround all already stored terms
483
- # ===== Parameters
484
- # +alt_tag+:: tag used to expand alternative IDs
485
- # ===== Returns
486
- # true if process ends without errors and false in other cases
487
- def get_index_alternatives(alt_tag: @@basic_tags[:alternative][0])
488
- # Check input
489
- raise('stanzas terms empty') if @stanzas[:terms].empty?
490
- # Take all alternative IDs
491
- alt_ids2add = {}
492
- @stanzas[:terms].each do |id, tags|
493
- alt_ids = tags[alt_tag]
494
- if !alt_ids.nil?
495
- alt_ids = alt_ids - @removable_terms
496
- # Update info
497
- alt_ids.each do |alt_term|
498
- @alternatives_index[alt_term] = id
499
- alt_ids2add[alt_term] = @stanzas[:terms][id] if !@stanzas[:terms].include?(alt_term)
500
- @ancestors_index[alt_term] = @ancestors_index[id] if !@ancestors_index[id].nil?
501
- end
502
- end
503
- end
504
- @stanzas[:terms].merge!(alt_ids2add)
505
- end
506
-
507
-
508
- # Executes basic expansions of tags (alternatives, obsoletes and parentals) with default values
509
- # ===== Returns
510
- # true if eprocess ends without errors and false in other cases
511
- def build_index()
512
- self.get_index_alternatives
513
- self.get_index_obsoletes
514
- self.get_index_child_parent_relations
515
- @alternatives_index.map{|k,v| @alternatives_index[k] = self.extract_id(v)}
516
- @alternatives_index.compact!
517
- @obsoletes_index.map{|k,v| @obsoletes_index[k] = self.extract_id(v)}
518
- @obsoletes_index.compact!
519
- @ancestors_index.map{|k,v| @ancestors_index[k] = v.map{|t| self.extract_id(t)}.compact}
520
- @ancestors_index.compact!
521
- @descendants_index.map{|k,v| @descendants_index[k] = v.map{|t| self.extract_id(t)}.compact}
522
- @descendants_index.compact!
523
- self.get_index_frequencies
524
- self.calc_dictionary(:name)
525
- self.calc_dictionary(:synonym, select_regex: /\"(.*)\"/)
526
- self.calc_term_levels(calc_paths: true)
527
- end
528
-
529
-
530
- # Calculates regular frequencies based on ontology structure (using parentals)
531
- # ===== Returns
532
- # true if everything end without errors and false in other cases
533
- def get_index_frequencies()
534
- # Check
535
- if @ancestors_index.empty?
536
- warn('ancestors_index object is empty')
537
- else
538
- # Prepare useful variables
539
- alternative_terms = @alternatives_index.keys
540
- # Per each term, add frequencies
541
- @stanzas[:terms].each do |id, tags|
542
- if @alternatives_index.include?(id)
543
- alt_id = @alternatives_index[id]
544
- query = @meta[alt_id] # Check if exist
545
- if query.nil?
546
- query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
547
- @meta[alt_id] = query
548
- end
549
- @meta[id] = query
550
- # Note: alternative terms do not increase structural frequencies
551
- else # Official term
552
- query = @meta[id] # Check if exist
553
- if query.nil?
554
- query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
555
- @meta[id] = query
556
- end
557
- # Store metadata
558
- query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !alternative_terms.include?(anc)}.to_f : 0.0
559
- query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !alternative_terms.include?(desc)}.to_f : 0.0
560
- query[:struct_freq] = query[:descendants] + 1.0
561
- # Update maximums
562
- @max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
563
- @max_freqs[:max_depth] = query[:descendants] if @max_freqs[:max_depth] < query[:descendants]
564
- end
565
- end
566
- end
567
- end
568
-
569
-
570
- # Expand obsoletes set and link info to their alternative IDs
571
- # ===== Parameters
572
- # +obs_tags+:: tags to be used to find obsoletes
573
- # +alt_tags+:: tags to find alternative IDs (if are available)
574
- # +reset_obsoletes+:: flag to indicate if obsoletes set must be reset. Default: true
575
- # ===== Returns
576
- # true if process ends without errors and false in other cases
577
- def get_index_obsoletes(obs_tag: @@basic_tags[:obsolete], alt_tags: @@basic_tags[:alternative])
578
- if @stanzas[:terms].empty?
579
- warn('stanzas terms empty')
580
- else
581
- # Check obsoletes
582
- @stanzas[:terms].each do |id, term_tags|
583
- next if term_tags.nil?
584
- query = term_tags[obs_tag]
585
- if !query.nil? && query == 'true' # Obsolete tag presence
586
- next if !@obsoletes_index[id].nil? # Already stored
587
- # Check if alternative value is available
588
- alt_ids = alt_tags.map{|alt| term_tags[alt]}.compact
589
- if !alt_ids.empty?
590
- alt_id = alt_ids.first.first #FIRST tag, FIRST id
591
- # Store
592
- @alternatives_index[id] = alt_id
593
- @obsoletes_index[id] = alt_id
594
- end
595
- end
596
- end
597
- end
598
- end
599
-
600
-
601
- # Expand parentals set and link all info to their alternative IDs. Also launch frequencies process
602
- # ===== Parameters
603
- # +tag+:: tag used to expand parentals
604
- # +split_info_char+:: special regex used to split info (if it is necessary)
605
- # +split_info_indx+:: special index to take splitted info (if it is necessary)
606
- # ===== Returns
607
- # true if process ends without errors and false in other cases
608
- def get_index_child_parent_relations(tag: @@basic_tags[:ancestors][0])
609
- # Check
610
- if @stanzas[:terms].nil?
611
- warn('stanzas terms empty')
612
- else
613
- # Expand
614
- structType, parentals = self.class.get_related_ids_by_tag(terms: @stanzas[:terms],
615
- target_tag: tag,
616
- alt_ids: @alternatives_index,
617
- obsoletes: @obsoletes_index.length)
618
- # Check
619
- raise('Error expanding parentals') if (structType.nil?) || parentals.nil?
620
- # Prepare ancestors structure
621
- anc = {}
622
- des = {}
623
- parentals.each do |id, parents|
624
- parents = parents - @removable_terms
625
- anc[id] = parents
626
- parents.each do |anc_id| # Add descendants
627
- if !des.include?(anc_id)
628
- des[anc_id] = [id]
629
- else
630
- des[anc_id] << id
631
- end
632
- end
633
- end
634
- # Store alternatives
635
- @alternatives_index.each do |id,alt|
636
- anc[id] = anc[alt] if anc.include?(alt)
637
- des[id] = des[alt] if des.include?(alt)
638
- end
639
- # Check structure
640
- if ![:atomic,:sparse].include? structType
641
- structType = structType == :circular ? :circular : :hierarchical
642
- end
643
- # Store
644
- @ancestors_index = anc
645
- @descendants_index = des
646
- @structureType = structType
647
- end
648
- # Finish
649
- end
650
-
651
-
652
- # Find ancestors of a given term
653
- # ===== Parameters
654
- # +term+:: to be checked
655
- # +filter_alternatives+:: if true, remove alternatives from final results
656
- # ===== Returns
657
- # an array with all ancestors of given term or false if parents are not available yet
658
- def get_ancestors(term, filter_alternatives = false)
659
- return self.get_familiar(term, true, filter_alternatives)
660
- end
661
-
662
-
663
- # Find descendants of a given term
664
- # ===== Parameters
665
- # +term+:: to be checked
666
- # +filter_alternatives+:: if true, remove alternatives from final results
667
- # ===== Returns
668
- # an array with all descendants of given term or false if parents are not available yet
669
- def get_descendants(term, filter_alternatives = false)
670
- return self.get_familiar(term, false, filter_alternatives)
671
- end
672
-
673
-
674
- # Find ancestors/descendants of a given term
675
- # ===== Parameters
676
- # +term+:: to be checked
677
- # +return_ancestors+:: return ancestors if true or descendants if false
678
- # +filter_alternatives+:: if true, remove alternatives from final results
679
- # ===== Returns
680
- # an array with all ancestors/descendants of given term or nil if parents are not available yet
681
- def get_familiar(term, return_ancestors = true, filter_alternatives = false)
682
- # Find into parentals
683
- familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
684
- if !familiars.nil?
685
- familiars = familiars.clone
686
- if filter_alternatives
687
- familiars.reject!{|fm| @alternatives_index.include?(fm)}
688
- end
689
- else
690
- familiars = []
691
- end
692
- return familiars
693
- end
694
-
695
-
696
- # Obtain IC of an specific term
697
- # ===== Parameters
698
- # +term+:: which IC will be calculated
699
- # +type+:: of IC to be calculated. Default: resnik
700
- # +force+:: force re-calculate the IC. Do not check if it is already calculated
701
- # +zhou_k+:: special coeficient for Zhou IC method
702
- # ===== Returns
703
- # the IC calculated
704
- def get_IC(termRaw, type: :resnik, force: false, zhou_k: 0.5)
705
- term = termRaw.to_sym
706
- # Check
707
- raise ArgumentError, "IC type specified (#{type}) is not allowed" if !@@allowed_calcs[:ics].include?(type)
708
- # Check if it's already calculated
709
- return @ics[type][term] if (@ics[type].include? term) && !force
710
- # Calculate
711
- ic = - 1
712
- case type # https://arxiv.org/ftp/arxiv/papers/1310/1310.8059.pdf ||| https://sci-hub.st/https://doi.org/10.1016/j.eswa.2012.01.082
713
- ###########################################
714
- #### STRUCTURE BASED METRICS
715
- ###########################################
716
- # Shortest path
717
- # Weighted Link
718
- # Hirst and St-Onge Measure
719
- # Wu and Palmer
720
- # Slimani
721
- # Li
722
- # Leacock and Chodorow
723
- ###########################################
724
- #### INFORMATION CONTENT METRICS
725
- ###########################################
726
- when :resnik # Resnik P: Using Information Content to Evaluate Semantic Similarity in a Taxonomy
727
- # -log(Freq(x) / Max_Freq)
728
- ic = -Math.log10(@meta[term][:struct_freq].fdiv(@max_freqs[:struct_freq]))
729
- when :resnik_observed
730
- # -log(Freq(x) / Max_Freq)
731
- ic = -Math.log10(@meta[term][:observed_freq].fdiv(@max_freqs[:observed_freq]))
732
- # Lin
733
- # Jiang & Conrath
734
-
735
- ###########################################
736
- #### FEATURE-BASED METRICS
737
- ###########################################
738
- # Tversky
739
- # x-similarity
740
- # Rodirguez
741
-
742
- ###########################################
743
- #### HYBRID METRICS
744
- ###########################################
745
- when :seco, :zhou # SECO:: An intrinsic information content metric for semantic similarity in WordNet
746
- # 1 - ( log(hypo(x) + 1) / log(max_nodes) )
747
- ic = 1 - Math.log10(@meta[term][:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
748
- if :zhou # New Model of Semantic Similarity Measuring in Wordnet
749
- # k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
750
- @ics[:seco][term] = ic # Special store
751
- ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(@meta[term][:descendants]).fdiv(Math.log10(@max_freqs[:max_depth])))
752
- end
753
- when :sanchez # Semantic similarity estimation in the biomedical domain: An ontology-basedinformation-theoretic perspective
754
- ic = -Math.log10((@meta[term][:descendants].fdiv(@meta[term][:ancestors]) + 1.0).fdiv(@max_freqs[:max_depth] + 1.0))
755
- # Knappe
756
- end
757
- @ics[type][term] = ic
758
- return ic
759
- end
760
-
761
-
762
- # Calculates and return resnik ICs (by ontology and observed frequency) for observed terms
763
- # ===== Returns
764
- # two hashes with resnik and resnik_observed ICs for observed terms
765
- def get_observed_ics_by_onto_and_freq
766
- # Chech there are observed terms
767
- if @profiles.empty?
768
- resnik = {}
769
- resnik_observed = {}
770
- else
771
- # Calc ICs for all terms
772
- observed_terms = @profiles.values.flatten.uniq
773
- observed_terms.each{ |term| get_IC(term)}
774
- observed_terms.each{ |term| get_IC(term, type: :resnik_observed)}
775
- resnik = @ics[:resnik].select{|k,v| observed_terms.include?(k)}
776
- resnik_observed = @ics[:resnik_observed].select{|k,v| observed_terms.include?(k)}
777
- end
778
- return resnik.clone, resnik_observed.clone
779
- end
780
-
781
-
782
- # Find the IC of the Most Index Content shared Ancestor (MICA) of two given terms
783
- # ===== Parameters
784
- # +termA+:: term to be cheked
785
- # +termB+:: term to be checked
786
- # +ic_type+:: IC formula to be used
787
- # ===== Returns
788
- # the IC of the MICA(termA,termB)
789
- def get_ICMICA(termA, termB, ic_type = :resnik)
790
- mica = self.get_MICA(termA, termB, ic_type)
791
- return mica.first.nil? ? nil : mica.last
792
- end
793
-
794
-
795
- # Find the Most Index Content shared Ancestor (MICA) of two given terms
796
- # ===== Parameters
797
- # +termA+:: term to be cheked
798
- # +termB+:: term to be checked
799
- # +ic_type+:: IC formula to be used
800
- # ===== Returns
801
- # the MICA(termA,termB) and it's IC
802
- def get_MICA(termA, termB, ic_type = :resnik)
803
- termA = @alternatives_index[termA] if @alternatives_index.include?(termA)
804
- termB = @alternatives_index[termB] if @alternatives_index.include?(termB)
805
- mica = [nil,-1.0]
806
- # Special case
807
- if termA.eql?(termB)
808
- ic = self.get_IC(termA, type: ic_type)
809
- mica = [termA, ic]
810
- else
811
- # Obtain ancestors (include itselfs too)
812
- anc_A = self.get_ancestors(termA)
813
- anc_B = self.get_ancestors(termB)
814
-
815
- if !(anc_A.empty? && anc_B.empty?)
816
- anc_A << termA
817
- anc_B << termB
818
- # Find shared ancestors
819
- shared_ancestors = anc_A & anc_B
820
- # Find MICA
821
- if shared_ancestors.length > 0
822
- shared_ancestors.each do |anc|
823
- ic = self.get_IC(anc, type: ic_type)
824
- # Check
825
- mica = [anc,ic] if ic > mica[1]
826
- end
827
- end
828
- end
829
- end
830
- return mica
831
- end
832
-
833
-
834
- # Calculate similarity between two given terms
835
- # ===== Parameters
836
- # +termsA+:: to be compared
837
- # +termsB+:: to be compared
838
- # +type+:: similitude formula to be used
839
- # +ic_type+:: IC formula to be used
840
- # ===== Returns
841
- # the similarity between both sets or false if frequencies are not available yet
842
- def get_similarity(termA, termB, type: :resnik, ic_type: :resnik)
843
- # Check
844
- raise ArgumentError, "SIM type specified (#{type}) is not allowed" if !@@allowed_calcs[:sims].include?(type)
845
- sim = nil
846
- # Launch comparissons
847
- sim_res = get_ICMICA(termA, termB, ic_type)
848
- if !sim_res.nil?
849
- case type
850
- when :resnik
851
- sim = sim_res
852
- when :lin
853
- sim = (2.0 * sim_res).fdiv(self.get_IC(termA,type: ic_type) + self.get_IC(termB,type: ic_type))
854
- when :jiang_conrath # This is not a similarity, this is a disimilarity (distance)
855
- sim = (self.get_IC(termA, type: ic_type) + self.get_IC(termB, type: ic_type)) - (2.0 * sim_res)
856
- end
857
- end
858
- return sim
859
- end
860
-
861
-
862
- # Method used to load information stored into an OBO file and store it into this object.
863
- # If a file is specified by input parameter, current @file value is updated
864
- # ===== Parameters
865
- # +file+:: optional file to update object stored file
866
- def load(file, build: true)
867
- _, header, stanzas = self.class.load_obo(file)
868
- @header = header
869
- @stanzas = stanzas
870
- self.remove_removable()
871
- # @removable_terms.each{|removableID| @stanzas[:terms].delete(removableID)} if !@removable_terms.empty? # Remove if proceed
872
- self.build_index() if build
873
- end
874
-
875
- #
876
- def remove_removable()
877
- @removable_terms.each{|removableID| @stanzas[:terms].delete(removableID)} if !@removable_terms.empty? # Remove if proceed
878
- end
879
-
880
-
881
- # Exports an OBO_Handler object in json format
882
- # ===== Parameters
883
- # +file+:: where info will be stored
884
- def write(file)
885
- # Take object stored info
886
- obj_info = {header: @header,
887
- stanzas: @stanzas,
888
- ancestors_index: @ancestors_index,
889
- descendants_index: @descendants_index,
890
- alternatives_index: @alternatives_index,
891
- obsoletes_index: @obsoletes_index,
892
- structureType: @structureType,
893
- ics: @ics,
894
- meta: @meta,
895
- special_tags: @special_tags,
896
- max_freqs: @max_freqs,
897
- dicts: @dicts,
898
- profiles: @profiles,
899
- profilesDict: @profilesDict,
900
- items: @items,
901
- removable_terms: @removable_terms,
902
- term_paths: @term_paths}
903
- # Convert to JSON format & write
904
- File.open(file, "w") { |f| f.write obj_info.to_json }
905
- end
906
-
907
-
908
- def is_number? string
909
- true if Float(string) rescue false
910
- end
911
-
912
-
913
- # Read a JSON file with an OBO_Handler object stored
914
- # ===== Parameters
915
- # +file+:: with object info
916
- # ===== Return
917
- # OBO_Handler internal fields
918
- def read(file)
919
- # Read file
920
- jsonFile = File.open(file)
921
- jsonInfo = JSON.parse(jsonFile.read, :symbolize_names => true)
922
- # Pre-process (Symbolize some hashs values)
923
- jsonInfo[:stanzas][:terms].map{|id,info| self.class.symbolize_ids(info)} # STANZAS
924
- jsonInfo[:stanzas][:typedefs].map{|id,info| self.class.symbolize_ids(info)}
925
- jsonInfo[:stanzas][:instances].map{|id,info| self.class.symbolize_ids(info)}
926
- jsonInfo[:alternatives_index] = jsonInfo[:alternatives_index].map{|id,value| [id, value.to_sym]}.to_h
927
- jsonInfo[:ancestors_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}}
928
- jsonInfo[:descendants_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}}
929
- jsonInfo[:obsoletes_index] = jsonInfo[:obsoletes_index].map{|id,value| [id, value.to_sym]}.to_h
930
- jsonInfo[:dicts] = jsonInfo[:dicts].each do |flag, dictionaries|
931
- # Special case: byTerm
932
- dictionaries[:byTerm] = dictionaries[:byTerm].map do |term, value|
933
- if !term.to_s.scan(/\A[-+]?[0-9]*\.?[0-9]+\Z/).empty? # Numeric dictionary
934
- [term.to_s.to_i, value.map{|term| term.to_sym}]
935
- elsif value.is_a? Numeric # Numeric dictionary
936
- [term.to_sym, value]
937
- elsif value.kind_of?(Array) && flag == :is_a
938
- [term.to_sym, value.map{|v| v.to_sym}]
939
- else
940
- [term.to_sym, value]
941
- end
942
- end
943
- dictionaries[:byTerm] = dictionaries[:byTerm].to_h
944
- # By value
945
- dictionaries[:byValue] = dictionaries[:byValue].map do |value, term|
946
- if value.is_a? Numeric # Numeric dictionary
947
- [value, term.to_sym]
948
- elsif term.is_a? Numeric # Numeric dictionary
949
- [value.to_s.to_sym, term]
950
- elsif flag == :is_a
951
- [value.to_sym, term.map{|v| v.to_sym}]
952
- elsif term.kind_of?(Array)
953
- [value.to_sym, term.map{|t| t.to_sym}]
954
- else
955
- [value.to_s, term.to_sym]
956
- end
957
- end
958
- dictionaries[:byValue] = dictionaries[:byValue].to_h
959
- end
960
- jsonInfo[:profiles].map{|id,terms| terms.map!{|term| term.to_sym}}
961
- jsonInfo[:profiles].keys.map{|id| jsonInfo[:profiles][id.to_s.to_i] = jsonInfo[:profiles].delete(id) if self.is_number?(id.to_s)}
962
- jsonInfo[:profilesDict].map{|term,ids| ids.map!{|id| id.to_sym if !id.is_a?(Numeric)}}
963
- jsonInfo[:removable_terms] = jsonInfo[:removable_terms].map{|term| term.to_sym}
964
- jsonInfo[:special_tags] = jsonInfo[:special_tags].each do |k, v|
965
- if v.kind_of?(Array)
966
- jsonInfo[:special_tags][k] = v.map{|tag| tag.to_sym}
967
- else
968
- jsonInfo[:special_tags][k] = v.to_sym
969
- end
970
- end
971
- jsonInfo[:items].each{|k,v| jsonInfo[:items][k] = v.map{|item| item.to_sym}}
972
- jsonInfo[:term_paths].each{|term,info| jsonInfo[:term_paths][term][:paths] = info[:paths].map{|path| path.map{|t| t.to_sym}}}
973
- # Store info
974
- @header = jsonInfo[:header]
975
- @stanzas = jsonInfo[:stanzas]
976
- @ancestors_index = jsonInfo[:ancestors_index]
977
- @descendants_index = jsonInfo[:descendants_index]
978
- @alternatives_index = jsonInfo[:alternatives_index]
979
- @obsoletes_index = jsonInfo[:obsoletes_index]
980
- @structureType = jsonInfo[:structureType].to_sym
981
- @ics = jsonInfo[:ics]
982
- @meta = jsonInfo[:meta]
983
- @special_tags = jsonInfo[:special_tags]
984
- @max_freqs = jsonInfo[:max_freqs]
985
- @dicts = jsonInfo[:dicts]
986
- @profiles = jsonInfo[:profiles]
987
- @profilesDict = jsonInfo[:profilesDict]
988
- @items = jsonInfo[:items]
989
- @removable_terms = jsonInfo[:removable_terms]
990
- @term_paths = jsonInfo[:term_paths]
991
- end
992
-
993
-
994
- # Check if a given ID is stored as term into this object
995
- # ===== Parameters
996
- # +id+:: to be checked
997
- # ===== Return
998
- # True if term is allowed or false in other cases
999
- def exists? id
1000
- return stanzas[:terms].include?(id)
1001
- end
1002
-
1003
-
1004
- # This method assumes that a text given contains an allowed ID. And will try to obtain it splitting it
1005
- # ===== Parameters
1006
- # +text+:: to be checked
1007
- # ===== Return
1008
- # The correct ID if it can be found or nil in other cases
1009
- def extract_id(text, splitBy: ' ')
1010
- if self.exists?(text)
1011
- return text
1012
- else
1013
- splittedText = text.to_s.split(splitBy).first.to_sym
1014
- return self.exists?(splittedText) ? splittedText : nil
1015
- end
1016
- end
1017
-
1018
-
1019
- # Generate a bidirectinal dictionary set using a specific tag and terms stanzas set
1020
- # This functions stores calculated dictionary into @dicts field.
1021
- # This functions stores first value for multivalue tags
1022
- # This function does not handle synonyms for byValue dictionaries
1023
- # ===== Parameters
1024
- # +tag+:: to be used to calculate dictionary
1025
- # +select_regex+:: gives a regfex that can be used to modify value to be stored
1026
- # +substitute_alternatives+:: flag used to indicate if alternatives must, or not, be replaced by it official ID
1027
- # +store_tag+:: flag used to store dictionary. If nil, mandatory tag given will be used
1028
- # +multiterm+:: if true, byValue will allows multi-term linkage (array)
1029
- # +self_type_references+:: if true, program assumes that refrences will be between Ontology terms, and it term IDs will be checked
1030
- # ===== Return
1031
- # void. And stores calcualted bidirectional dictonary into dictionaries main container
1032
- def calc_dictionary(tag, select_regex: nil, substitute_alternatives: true, store_tag: nil, multiterm: false, self_type_references: false)
1033
- tag = tag.to_sym
1034
- store_tag = tag if store_tag.nil?
1035
- if @stanzas[:terms].empty?
1036
- warn('Terms are not already loaded. Aborting dictionary calc')
1037
- else
1038
- byTerm = {}
1039
- byValue = {}
1040
- # Calc per term
1041
- @stanzas[:terms].each do |term, tags|
1042
- referenceTerm = term
1043
- if @alternatives_index.include?(term) && substitute_alternatives # Special case
1044
- referenceTerm = @alternatives_index[term] if !@obsoletes_index.include?(@alternatives_index[term])
1045
- end
1046
- queryTag = tags[tag]
1047
- if !queryTag.nil?
1048
- # Pre-process
1049
- if !select_regex.nil?
1050
- if queryTag.kind_of?(Array)
1051
- queryTag = queryTag.map{|value| value.scan(select_regex).first}
1052
- queryTag.flatten!
1053
- else
1054
- queryTag = queryTag.scan(select_regex).first
1055
- end
1056
- queryTag.compact!
1057
- end
1058
- if queryTag.kind_of?(Array) # Store
1059
- if !queryTag.empty?
1060
- if byTerm.include?(referenceTerm)
1061
- byTerm[referenceTerm] = (byTerm[referenceTerm] + queryTag).uniq
1062
- else
1063
- byTerm[referenceTerm] = queryTag
1064
- end
1065
- if multiterm
1066
- queryTag.each do |value|
1067
- byValue[value] = [] if byValue[value].nil?
1068
- byValue[value] << referenceTerm
1069
- end
1070
- else
1071
- queryTag.each{|value| byValue[value] = referenceTerm}
1072
- end
1073
- end
1074
- else
1075
- if byTerm.include?(referenceTerm)
1076
- byTerm[referenceTerm] = (byTerm[referenceTerm] + [queryTag]).uniq
1077
- else
1078
- byTerm[referenceTerm] = [queryTag]
1079
- end
1080
- if multiterm
1081
- byValue[queryTag] = [] if byValue[queryTag].nil?
1082
- byValue[queryTag] << referenceTerm
1083
- else
1084
- byValue[queryTag] = referenceTerm
1085
- end
1086
- end
1087
- end
1088
- end
1089
-
1090
- # Check self-references
1091
- if self_type_references
1092
- byTerm.map do |term, references|
1093
- corrected_references = references.map do |t|
1094
- checked = self.extract_id(t)
1095
- if checked.nil?
1096
- t
1097
- else
1098
- byValue[checked] = byValue.delete(t) if checked != t && !byValue.keys.include?(checked) # Update in byValue
1099
- checked
1100
- end
1101
- end
1102
- byTerm[term] = corrected_references.uniq
1103
- end
1104
- end
1105
-
1106
- # Check order
1107
- byTerm.map do |term,values|
1108
- if self.exists?(term)
1109
- referenceValue = @stanzas[:terms][term][tag]
1110
- if !referenceValue.nil?
1111
- if !select_regex.nil?
1112
- if referenceValue.kind_of?(Array)
1113
- referenceValue = referenceValue.map{|value| value.scan(select_regex).first}
1114
- referenceValue.flatten!
1115
- else
1116
- referenceValue = referenceValue.scan(select_regex).first
1117
- end
1118
- referenceValue.compact!
1119
- end
1120
- if self_type_references
1121
- if referenceValue.kind_of?(Array)
1122
- aux = referenceValue.map{|t| self.extract_id(t)}
1123
- else
1124
- aux = self.extract_id(referenceValue)
1125
- end
1126
- referenceValue = aux if !aux.nil?
1127
- end
1128
- referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
1129
- byTerm[term] = referenceValue + (values - referenceValue)
1130
- end
1131
- end
1132
- end
1133
-
1134
- # Store
1135
- @dicts[store_tag] = {byTerm: byTerm, byValue: byValue}
1136
- end
1137
- end
1138
-
1139
-
1140
- # Calculates :is_a dictionary without alternatives substitution
1141
- def calc_ancestors_dictionary
1142
- self.calc_dictionary(:is_a, substitute_alternatives: false, self_type_references: true, multiterm: true)
1143
- end
1144
-
1145
-
1146
- # Translate a given value using an already calcualted dictionary
1147
- # ===== Parameters
1148
- # +toTranslate+:: value to be translated using dictiontionary
1149
- # +tag+:: used to generate the dictionary
1150
- # +byValue+:: boolean flag to indicate if dictionary must be used values as keys or terms as keys. Default: values as keys = true
1151
- # ===== Return
1152
- # translation
1153
- def translate(toTranslate, tag, byValue: true)
1154
- dict = byValue ? @dicts[tag][:byValue] : @dicts[tag][:byTerm]
1155
- toTranslate = get_main_id(toTranslate) if !byValue
1156
- return dict[toTranslate]
1157
- end
1158
-
1159
-
1160
- # Translate a name given
1161
- # ===== Parameters
1162
- # +name+:: to be translated
1163
- # ===== Return
1164
- # translated name or nil if it's not stored into this ontology
1165
- def translate_name(name)
1166
- term = self.translate(name, :name)
1167
- term = self.translate(name, :synonym) if term.nil?
1168
- return term
1169
- end
1170
-
1171
-
1172
- # Translate several names and return translations and a list of names which couldn't be translated
1173
- # ===== Parameters
1174
- # +names+:: array to be translated
1175
- # ===== Return
1176
- # two arrays with translations and names which couldn't be translated respectively
1177
- def translate_names(names)
1178
- translated = []
1179
- rejected = []
1180
- names.each do |name|
1181
- tr = self.translate_name(name)
1182
- if tr.nil?
1183
- rejected << name
1184
- else
1185
- translated << tr
1186
- end
1187
- end
1188
- return translated, rejected
1189
- end
1190
-
1191
-
1192
- # Translates a given ID to it assigned name
1193
- # ===== Parameters
1194
- # +id+:: to be translated
1195
- # ===== Return
1196
- # main name or nil if it's not included into this ontology
1197
- def translate_id(id)
1198
- name = self.translate(id, :name, byValue: false)
1199
- return name.nil? ? nil : name.first
1200
- end
1201
-
1202
-
1203
- # Translates several IDs and returns translations and not allowed IDs list
1204
- # ===== Parameters
1205
- # +ids+:: to be translated
1206
- # ===== Return
1207
- # two arrays with translations and names which couldn't be translated respectively
1208
- def translate_ids(ids)
1209
- translated = []
1210
- rejected = []
1211
- ids.each do |term_id|
1212
- tr = self.translate_id(term_id.to_sym)
1213
- if !tr.nil?
1214
- translated << tr
1215
- else
1216
- rejected << tr
1217
- end
1218
- end
1219
- return translated, rejected
1220
- end
1221
-
1222
-
1223
- # ===== Returns
1224
- # the main ID assigned to a given ID. If it's a non alternative/obsolete ID itself will be returned
1225
- # ===== Parameters
1226
- # +id+:: to be translated
1227
- # ===== Return
1228
- # main ID related to a given ID. Returns nil if given ID is not an allowed ID
1229
- def get_main_id(id)
1230
- return nil if !@stanzas[:terms].include? id
1231
- new_id = id
1232
- mainID = @alternatives_index[id]
1233
- new_id = mainID if !mainID.nil? & !@obsoletes_index.include?(mainID)
1234
- return new_id
1235
- end
1236
-
1237
-
1238
- # Check a pull of IDs and return allowed IDs removing which are not official terms on this ontology
1239
- # ===== Parameters
1240
- # +ids+:: to be checked
1241
- # ===== Return
1242
- # two arrays whit allowed and rejected IDs respectively
1243
- def check_ids(ids, substitute: true)
1244
- checked_codes = []
1245
- rejected_codes = []
1246
- ids.each do |id|
1247
- if @stanzas[:terms].include? id
1248
- if substitute
1249
- checked_codes << self.get_main_id(id)
1250
- else
1251
- checked_codes << id
1252
- end
1253
- else
1254
- rejected_codes << id
1255
- end
1256
- end
1257
- return checked_codes, rejected_codes
1258
- end
1259
-
1260
-
1261
- # Stores a given profile with an specific ID. If ID is already assigend to a profile, it will be replaced
1262
- # ===== Parameters
1263
- # +id+:: assigned to profile
1264
- # +terms+:: array of terms
1265
- # +substitute+:: subsstitute flag from check_ids
1266
- def add_profile(id, terms, substitute: true)
1267
- warn("Profile assigned to ID (#{id}) is going to be replaced") if @profiles.include? id
1268
- correct_terms, rejected_terms = self.check_ids(terms, substitute: substitute)
1269
- if !rejected_terms.empty?
1270
- warn('Given terms contains erroneus IDs. These IDs will be removed')
1271
- end
1272
- if id.is_a? Numeric
1273
- @profiles[id] = correct_terms
1274
- else
1275
- @profiles[id.to_sym] = correct_terms
1276
- end
1277
- end
1278
-
1279
-
1280
- # Method used to store a pull of profiles
1281
- # ===== Parameters
1282
- # +profiles+:: array/hash of profiles to be stored. If it's an array, numerical IDs will be assigned starting at 1
1283
- # +calc_metadata+:: if true, launch calc_profiles_dictionary process
1284
- # +reset_stored+:: if true, remove already stored profiles
1285
- # +substitute+:: subsstitute flag from check_ids
1286
- def load_profiles(profiles, calc_metadata: true, reset_stored: false, substitute: false)
1287
- self.reset_profiles if reset_stored
1288
- # Check
1289
- if profiles.kind_of?(Array)
1290
- profiles.each_with_index do |items, i|
1291
- self.add_profile(i, items.map {|item| item.to_sym}, substitute: substitute)
1292
- end
1293
- else # Hash
1294
- if !profiles.keys.select{|id| @profiles.include?(id)}.empty?
1295
- warn('Some profiles given are already stored. Stored version will be replaced')
1296
- end
1297
- profiles.each{|id, prof| self.add_profile(id, prof, substitute: substitute)}
1298
- end
1299
-
1300
- self.add_observed_terms_from_profiles(reset: true)
1301
-
1302
- if calc_metadata
1303
- self.calc_profiles_dictionary
1304
- end
1305
- end
1306
-
1307
-
1308
- # Internal method used to remove already stored profiles and restore observed frequencies
1309
- def reset_profiles
1310
- # Clean profiles storage
1311
- @profiles = {}
1312
- # Reset frequency observed
1313
- @meta.each{|term,info| info[:observed_freq] = 0}
1314
- @max_freqs[:observed_freq] = 0
1315
- end
1316
-
1317
-
1318
- # ===== Returns
1319
- # profiles assigned to a given ID
1320
- # ===== Parameters
1321
- # +id+:: profile ID
1322
- # ===== Return
1323
- # specific profile or nil if it's not stored
1324
- def get_profile(id)
1325
- return @profiles[id]
1326
- end
1327
-
1328
-
1329
- # ===== Returns
1330
- # an array of sizes for all stored profiles
1331
- # ===== Return
1332
- # array of profile sizes
1333
- def get_profiles_sizes()
1334
- return @profiles.map{|id,terms| terms.length}
1335
- end
1336
-
1337
-
1338
- # ===== Returns
1339
- # mean size of stored profiles
1340
- # ===== Parameters
1341
- # +round_digits+:: number of digits to round result. Default: 4
1342
- # ===== Returns
1343
- # mean size of stored profiles
1344
- def get_profiles_mean_size(round_digits: 4)
1345
- sizes = self.get_profiles_sizes
1346
- return sizes.inject(0){|sum, n| sum + n}.fdiv(@profiles.length).round(round_digits)
1347
- end
1348
-
1349
-
1350
- # Calculates profiles sizes and returns size assigned to percentile given
1351
- # ===== Parameters
1352
- # +perc+:: percentile to be returned
1353
- # +increasing_sort+:: flag to indicate if sizes order must be increasing. Default: false
1354
- # ===== Returns
1355
- # values assigned to percentile asked
1356
- def get_profile_length_at_percentile(perc=50, increasing_sort: false)
1357
- prof_lengths = self.get_profiles_sizes.sort
1358
- prof_lengths.reverse! if !increasing_sort
1359
- n_profiles = prof_lengths.length
1360
- percentile_index = ((perc * (n_profiles - 1)).fdiv(100) - 0.5).round # Take length which not overpass percentile selected
1361
- percentile_index = 0 if percentile_index < 0 # Special case (caused by literal calc)
1362
- return prof_lengths[percentile_index]
1363
- end
1364
-
1365
-
1366
- # Translate a given profile to terms names
1367
- # ===== Parameters
1368
- # +prof+:: array of terms to be translated
1369
- # ===== Returns
1370
- # array of translated terms. Can include nils if some IDs are not allowed
1371
- def profile_names(prof)
1372
- return prof.map{|term| self.translate_id(term)}
1373
- end
1374
-
1375
-
1376
- # Trnaslates a bunch of profiles to it sets of term names
1377
- # ===== Parameters
1378
- # +profs+:: array of profiles
1379
- # +asArray+:: flag to indicate if results must be returned as: true => an array of tuples [ProfID, ArrayOdNames] or ; false => hashs of translations
1380
- # ===== Returns
1381
- # translated profiles
1382
- def translate_profiles_ids(profs = [], asArray: true)
1383
- profs = @profiles if profs.empty?
1384
- profs = profs.each_with_index.map{|terms, index| [index, terms]}.to_h if profs.kind_of?(Array)
1385
- profs_names = profs.map{|id, terms| [id, self.profile_names(terms)]}.to_h
1386
- return asArray ? profs_names.values : profs_names
1387
- end
1388
-
1389
-
1390
- # Includes as "observed_terms" all terms included into stored profiles
1391
- # ===== Parameters
1392
- # +reset+:: if true, reset observed freqs alreeady stored befor re-calculate
1393
- def add_observed_terms_from_profiles(reset: false)
1394
- @meta.each{|term, freqs| freqs[:observed_freq] = -1} if reset
1395
- @profiles.each{|id, terms| self.add_observed_terms(terms: terms)}
1396
- end
1397
-
1398
-
1399
- # Get a term frequency
1400
- # ===== Parameters
1401
- # +term+:: term to be checked
1402
- # +type+:: type of frequency to be returned. Allowed: [:struct_freq, :observed_freq]
1403
- # ===== Returns
1404
- # frequency of term given or nil if term is not allowed
1405
- def get_frequency(term, type: :struct_freq)
1406
- queryFreq = @meta[term]
1407
- return queryFreq.nil? ? nil : queryFreq[type]
1408
- end
1409
-
1410
-
1411
- # Geys structural frequency of a term given
1412
- # ===== Parameters
1413
- # +term+:: to be checked
1414
- # ===== Returns
1415
- # structural frequency of given term or nil if term is not allowed
1416
- def get_structural_frequency(term)
1417
- return self.get_frequency(term, type: :struct_freq)
1418
- end
1419
-
1420
-
1421
- # Gets observed frequency of a term given
1422
- # ===== Parameters
1423
- # +term+:: to be checked
1424
- # ===== Returns
1425
- # observed frequency of given term or nil if term is not allowed
1426
- def get_observed_frequency(term)
1427
- return self.get_frequency(term, type: :observed_freq)
1428
- end
1429
-
1430
-
1431
- # Calculates frequencies of stored profiles terms
1432
- # ===== Parameters
1433
- # +ratio+:: if true, frequencies will be returned as ratios between 0 and 1.
1434
- # +literal+:: if true, literal terms will be used to calculate frequencies instead translate alternative terms
1435
- # +asArray+:: used to transform returned structure format from hash of Term-Frequency to an array of tuples [Term, Frequency]
1436
- # +translate+:: if true, term IDs will be translated to
1437
- # ===== Returns
1438
- # stored profiles terms frequencies
1439
- def get_profiles_terms_frequency(ratio: true, literal: true, asArray: true, translate: true)
1440
- n_profiles = @profiles.length
1441
- if literal
1442
- freqs = {}
1443
- @profiles.each do |id, terms|
1444
- terms.each do |literalTerm|
1445
- if freqs.include?(literalTerm)
1446
- freqs[literalTerm] += 1
1447
- else
1448
- freqs[literalTerm] = 1
1449
- end
1450
- end
1451
- end
1452
- if (ratio || translate)
1453
- aux_keys = freqs.keys
1454
- aux_keys.each do |term|
1455
- freqs[term] = freqs[term].fdiv(n_profiles) if ratio
1456
- if translate
1457
- tr = self.translate_id(term)
1458
- freqs[tr] = freqs.delete(term) if !tr.nil?
1459
- end
1460
- end
1461
- end
1462
- if asArray
1463
- freqs = freqs.map{|term, freq| [term, freq]}
1464
- freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
1465
- end
1466
- else # Freqs translating alternatives
1467
- freqs = @meta.select{|id, freqs| freqs[:observed_freq] > 0}.map{|id, freqs| [id, ratio ? freqs[:observed_freq].fdiv(n_profiles) : freqs[:observed_freq]]}
1468
- freqs = freqs.to_h if !asArray
1469
- if translate
1470
- freqs = freqs.map do |term, freq|
1471
- tr = self.translate_id(term)
1472
- tr.nil? ? [term, freq] : [tr, freq]
1473
- end
1474
- end
1475
- if asArray
1476
- freqs = freqs.map{|term, freq| [term, freq]}
1477
- freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
1478
- else
1479
- freqs = freqs.to_h
1480
- end
1481
- end
1482
- return freqs
1483
- end
1484
-
1485
-
1486
- # Clean a given profile returning cleaned set of terms and removed ancestors term.
1487
- # ===== Parameters
1488
- # +prof+:: array of terms to be checked
1489
- # ===== Returns
1490
- # two arrays, first is the cleaned profile and second is the removed elements array
1491
- def remove_ancestors_from_profile(prof)
1492
- ancestors = prof.map{|term| self.get_ancestors(term)}.flatten.uniq
1493
- redundant = prof.select{|term| ancestors.include?(term)}
1494
- return prof - redundant, redundant
1495
- end
1496
-
1497
-
1498
- # Remove alternative IDs if official ID is present. DOES NOT REMOVE synonyms or alternative IDs of the same official ID
1499
- # ===== Parameters
1500
- # +prof+:: array of terms to be checked
1501
- # ===== Returns
1502
- # two arrays, first is the cleaned profile and second is the removed elements array
1503
- def remove_alternatives_from_profile(prof)
1504
- alternatives = prof.select{|term| @alternatives_index.include?(term)}
1505
- redundant = alternatives.select{|alt_id| prof.include?(@alternatives_index[alt_id])}
1506
- return prof - redundant, redundant
1507
- end
1508
-
1509
-
1510
- # Remove alternatives (if official term is present) and ancestors terms of a given profile
1511
- # ===== Parameters
1512
- # +profile+:: profile to be cleaned
1513
- # +remove_alternatives+:: if true, clenaed profiles will replace already stored profiles
1514
- # ===== Returns
1515
- # cleaned profile
1516
- def clean_profile(profile, remove_alternatives: true)
1517
- terms_without_ancestors, _ = self.remove_ancestors_from_profile(profile)
1518
- if remove_alternatives
1519
- terms_without_ancestors_and_alternatices, _ = self.remove_alternatives_from_profile(terms_without_ancestors)
1520
- else
1521
- terms_without_ancestors_and_alternatices = terms_without_ancestors
1522
- end
1523
- return terms_without_ancestors_and_alternatices
1524
- end
1525
-
1526
-
1527
- # Remove alternatives (if official term is present) and ancestors terms of stored profiles
1528
- # ===== Parameters
1529
- # +store+:: if true, clenaed profiles will replace already stored profiles
1530
- # +remove_alternatives+:: if true, clenaed profiles will replace already stored profiles
1531
- # ===== Returns
1532
- # a hash with cleaned profiles
1533
- def clean_profiles(store: false, remove_alternatives: true)
1534
- cleaned_profiles = {}
1535
- @profiles.each{ |id, terms| cleaned_profiles[id] = self.clean_profile(terms, remove_alternatives: remove_alternatives)}
1536
- @profiles = cleaned_profiles if store
1537
- return cleaned_profiles
1538
- end
1539
-
1540
-
1541
- # Calculates number of ancestors present (redundant) in each profile stored
1542
- # ===== Returns
1543
- # array of parentals for each profile
1544
- def parentals_per_profile
1545
- cleaned_profiles = self.clean_profiles(remove_alternatives: false)
1546
- parentals = @profiles.map{ |id, terms| terms.length - cleaned_profiles[id].length}
1547
- return parentals
1548
- end
1549
-
1550
-
1551
- # Calculates mean IC of a given profile
1552
- # ===== Parameters
1553
- # +prof+:: profile to be checked
1554
- # +ic_type+:: ic_type to be used
1555
- # +zhou_k+:: special coeficient for Zhou IC method
1556
- # ===== Returns
1557
- # mean IC for a given profile
1558
- def get_profile_mean_IC(prof, ic_type: :resnik, zhou_k: 0.5)
1559
- return prof.map{|term| self.get_IC(term, type: ic_type, zhou_k: zhou_k)}.inject(0){|sum,x| sum + x}.fdiv(prof.length)
1560
- end
1561
-
1562
-
1563
- # Calculates resnik ontology, and resnik observed mean ICs for all profiles stored
1564
- # ===== Returns
1565
- # two hashes with Profiles and IC calculated for resnik and observed resnik respectively
1566
- def get_profiles_resnik_dual_ICs
1567
- struct_ics = {}
1568
- observ_ics = {}
1569
- @profiles.each do |id, terms|
1570
- struct_ics[id] = self.get_profile_mean_IC(terms, ic_type: :resnik)
1571
- observ_ics[id] = self.get_profile_mean_IC(terms, ic_type: :resnik_observed)
1572
- end
1573
- return struct_ics.clone, observ_ics.clone
1574
- end
1575
-
1576
-
1577
- # Calculates ontology structural levels for all ontology terms
1578
- # ===== Parameters
1579
- # +calc_paths+:: calculates term paths if it's not already calculated
1580
- # +shortest_path+:: if true, level is calculated with shortest path, largest path will be used in other cases
1581
- def calc_term_levels(calc_paths: false, shortest_path: true)
1582
- if @term_paths.empty?
1583
- if calc_paths
1584
- self.calc_term_paths
1585
- else
1586
- warn('Term paths are not already loaded. Aborting dictionary calc')
1587
- end
1588
- end
1589
- if !@term_paths.empty?
1590
- byTerm = {}
1591
- byValue = {}
1592
- # Calc per term
1593
- @term_paths.each do |term, info|
1594
- level = shortest_path ? info[:shortest_path] : info[:largest_path]
1595
- if level.nil?
1596
- level = -1
1597
- else
1598
- level = level.round(0)
1599
- end
1600
- byTerm[term] = level
1601
- queryLevels = byValue[level]
1602
- if queryLevels.nil?
1603
- byValue[level] = [term]
1604
- else
1605
- byValue[level] << term
1606
- end
1607
- end
1608
- @dicts[:level] = {byTerm: byValue, byValue: byTerm} # Note: in this case, value has multiplicity and term is unique value
1609
- # Update maximum depth
1610
- @max_freqs[:max_depth] = byValue.keys.max
1611
- end
1612
- end
1613
-
1614
-
1615
- # Check if a term given is marked as obsolete
1616
- def is_obsolete? term
1617
- return @obsoletes_index.include?(term)
1618
- end
1619
-
1620
- # Check if a term given is marked as alternative
1621
- def is_alternative? term
1622
- return @alternatives_index.include?(term)
1623
- end
1624
-
1625
- # Find paths of a term following it ancestors and stores all possible paths for it and it's parentals.
1626
- # Also calculates paths metadata and stores into @term_paths
1627
- def calc_term_paths
1628
- self.calc_ancestors_dictionary if !@dicts.keys.include?(:is_a) # Calculate direct parentals dictionary if it's not already calculated
1629
- visited_terms = []
1630
- @term_paths = {}
1631
- if [:hierarchical, :sparse].include? @structureType
1632
- terms = @stanzas[:terms].keys
1633
- terms.each do |term|
1634
- if self.is_obsolete?(term) || self.is_alternative?(term) # Special case (obsoletes)
1635
- special_term = term
1636
- term = self.is_obsolete?(term) ? @obsoletes_index[term] : @alternatives_index[term]
1637
- @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
1638
- @term_paths[special_term] = @term_paths[term]
1639
- visited_terms << special_term
1640
- end
1641
-
1642
- if !visited_terms.include?(term)
1643
- @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
1644
- parentals = @dicts[:is_a][:byTerm][term]
1645
- if parentals.nil?
1646
- @term_paths[term][:paths] << [term]
1647
- else
1648
- parentals.each do |direct_parental|
1649
- if visited_terms.include? direct_parental # Use direct_parental already calculated paths
1650
- new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
1651
- else # Calculate new paths
1652
- self.expand_path(direct_parental, visited_terms)
1653
- new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
1654
- end
1655
- new_paths.each{|path| @term_paths[term][:paths] << path}
1656
- end
1657
- end
1658
- visited_terms << term
1659
- end
1660
- # Update metadata
1661
- @term_paths[term][:total_paths] = @term_paths[term][:paths].length
1662
- paths_sizes = @term_paths[term][:paths].map{|path| path.length}
1663
- @term_paths[term][:largest_path] = paths_sizes.max
1664
- @term_paths[term][:shortest_path] = paths_sizes.min
1665
- end
1666
- else
1667
- warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
1668
- end
1669
- end
1670
-
1671
-
1672
- # Recursive function whic finds paths of a term following it ancestors and stores all possible paths for it and it's parentals
1673
- # ===== Parameters
1674
- # +curr_term+:: current visited term
1675
- # +visited_terms+:: already expanded terms
1676
- def expand_path(curr_term, visited_terms)
1677
- if !visited_terms.include?(curr_term) # Not already expanded
1678
- @term_paths[curr_term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if @term_paths[curr_term].nil?
1679
- direct_parentals = @dicts[:is_a][:byTerm][curr_term]
1680
- if direct_parentals.nil? # No parents :: End of recurrence
1681
- @term_paths[curr_term][:paths] << [curr_term]
1682
- else # Expand and concat
1683
- direct_parentals.each do |ancestor|
1684
- self.expand_path(ancestor,visited_terms) if !visited_terms.include?(ancestor)
1685
- new_paths = @term_paths[ancestor][:paths].map{|path| [curr_term, path].flatten}
1686
- new_paths.each{|path| @term_paths[curr_term][:paths] << path}
1687
- end
1688
- end
1689
- visited_terms << curr_term
1690
- end
1691
- end
1692
-
1693
-
1694
- # Gets ontology levels calculated
1695
- # ===== Returns
1696
- # ontology levels calculated
1697
- def get_ontology_levels
1698
- return @dicts[:level][:byTerm].clone # By term, in this case, is Key::Level, Value::Terms
1699
- end
1700
-
1701
-
1702
- # Gets ontology level of a specific term
1703
- # ===== Returns
1704
- # Term level
1705
- def get_term_level(term)
1706
- return @dicts[:level][:byValue][term]
1707
- end
1708
-
1709
-
1710
- # Return ontology levels from profile terms
1711
- # ===== Returns
1712
- # hash of term levels (Key: level; Value: array of term IDs)
1713
- def get_ontology_levels_from_profiles(uniq = true) # TODO: remove uniq and check dependencies
1714
- profiles_terms = @profiles.values.flatten
1715
- profiles_terms.uniq! if uniq
1716
- term_freqs_byProfile = {}
1717
- profiles_terms.each do |term|
1718
- query = term_freqs_byProfile[term]
1719
- if query.nil?
1720
- term_freqs_byProfile[term] = 1
1721
- else
1722
- term_freqs_byProfile[term] += 1
1723
- end
1724
- end
1725
- levels_filtered = @dicts[:level][:byTerm].map{|level, terms| [level,terms.map{|t| profiles_terms.include?(t) ? Array.new(term_freqs_byProfile[t], t) : nil}.flatten.compact]}.select{|level, filteredTerms| !filteredTerms.empty?}.to_h
1726
- return levels_filtered
1727
- end
1728
-
1729
-
1730
- # Calculate profiles dictionary with Key= Term; Value = Profiles
1731
- def calc_profiles_dictionary
1732
- if @profiles.empty?
1733
- warn('Profiles are not already loaded. Aborting dictionary calc')
1734
- else
1735
- byTerm = {} # Key: Terms
1736
- # byValue -- Key: Profile == @profiles
1737
- @profiles.each do |id, terms|
1738
- terms.each do |term|
1739
- if byTerm.include?(term)
1740
- byTerm[term] << id
1741
- else
1742
- byTerm[term] = [id]
1743
- end
1744
- end
1745
- end
1746
- @profilesDict = byTerm
1747
- end
1748
- end
1749
-
1750
-
1751
- # Gets profiles dictionary calculated
1752
- # ===== Return
1753
- # profiles dictionary (clone)
1754
- def get_terms_linked_profiles
1755
- return @profilesDict.clone
1756
- end
1757
-
1758
-
1759
- # Get related profiles to a given term
1760
- # ===== Parameters
1761
- # +term+:: to be checked
1762
- # ===== Returns
1763
- # profiles which contains given term
1764
- def get_term_linked_profiles(term)
1765
- return @profilesDict[term]
1766
- end
1767
-
1768
-
1769
- # Gets metainfo table from a set of terms
1770
- # ===== Parameters
1771
- # +terms+:: IDs to be expanded
1772
- # +filter_alternatives+:: flag to be used in get_descendants method
1773
- # ===== Returns
1774
- # an array with triplets [TermID, TermName, DescendantsNames]
1775
- def get_childs_table(terms, filter_alternatives = false)
1776
- expanded_terms = []
1777
- terms.each do |t|
1778
- expanded_terms << [[t, self.translate_id(t)], self.get_descendants(t, filter_alternatives).map{|child| [child, self.translate_id(child)]}]
1779
- end
1780
- return expanded_terms
1781
- end
1782
-
1783
-
1784
- # Store specific relations hash given into ITEMS structure
1785
- # ===== Parameters
1786
- # +relations+:: to be stored
1787
- # +remove_old_relations+:: substitute ITEMS structure instead of merge new relations
1788
- # +expand+:: if true, already stored keys will be updated with the unique union of both sets
1789
- def load_item_relations_to_terms(relations, remove_old_relations = false, expand = false)
1790
- @items = {} if remove_old_relations
1791
- if !relations.select{|term, items| !@stanzas[:terms].include?(term)}.empty?
1792
- warn('Some terms specified are not stored into this ontology. These not correct terms will be stored too')
1793
- end
1794
- if !remove_old_relations
1795
- if !relations.select{|term, items| @items.include?(term)}.empty? && !expand
1796
- warn('Some terms given are already stored. Stored version will be replaced')
1797
- end
1798
- end
1799
- if expand
1800
- relations.each do |k,v|
1801
- if @items.keys.include?(k)
1802
- @items[k] = (@items[k] + v).uniq
1803
- else
1804
- @items[k] = v
1805
- end
1806
- end
1807
- else
1808
- @items.merge!(relations)
1809
- end
1810
- end
1811
-
1812
-
1813
- # Assign a dictionary already calculated as a items set.
1814
- # ===== Parameters
1815
- # +dictID+:: dictionary ID to be stored (:byTerm will be used)
1816
- def set_items_from_dict(dictID, remove_old_relations = false)
1817
- @items = {} if remove_old_relations
1818
- if(@dicts.keys.include?(dictID))
1819
- @items.merge(@dicts[dictID][:byTerm])
1820
- else
1821
- warn('Specified ID is not calculated. Dict will not be added as a items set')
1822
- end
1823
- end
1824
-
1825
-
1826
- # This method computes childs similarity and impute items to it parentals. To do that Item keys must be this ontology allowed terms.
1827
- # Similarity will be calculated by text extact similarity unless an ontology object will be provided. In this case, MICAs will be used
1828
- # ===== Parameters
1829
- # +ontology+:: (Optional) ontology object which items given belongs
1830
- # +minimum_childs+:: minimum of childs needed to infer relations to parental. Default: 2
1831
- # +clean_profiles+:: if true, clena_profiles ontology method will be used over inferred profiles. Only if an ontology object is provided
1832
- # ===== Returns
1833
- # void and update items object
1834
- def expand_items_to_parentals(ontology: nil, minimum_childs: 2, clean_profiles: true)
1835
- # Check item keys
1836
- if @items.empty?
1837
- warn('Items have been not provided yet')
1838
- return nil
1839
- end
1840
- targetKeys = @items.keys.select{|k| self.exists?(k)}
1841
- if targetKeys.length == 0
1842
- warn('Any item key is allowed')
1843
- return nil
1844
- elsif targetKeys.length < @items.keys.length
1845
- warn('Some item keys are not allowed')
1846
- end
1847
-
1848
- # Expand to parentals
1849
- targetKeys << targetKeys.map{|t| self.get_ancestors(t, true)}
1850
- targetKeys.flatten!
1851
- targetKeys.uniq!
1852
-
1853
- # Obtain levels (go from leaves to roots)
1854
- levels = targetKeys.map{|term| self.get_term_level(term)}
1855
- levels.compact!
1856
- levels.uniq!
1857
- levels.sort!
1858
- levels.reverse!
1859
- levels.shift # Leaves are not expandable
1860
-
1861
- # Expand from leaves to roots
1862
- levels.map do |lvl|
1863
- curr_keys = targetKeys.select{|k| self.get_term_level(k) == lvl}
1864
- curr_keys.map do |term_expand|
1865
- to_infer = []
1866
- # Obtain childs
1867
- childs = self.get_descendants(term_expand,true).select{|t| @items.keys.include?(t)}
1868
- # Expand
1869
- if childs.length > 0 && minimum_childs == 1 # Special case
1870
- to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
1871
- elsif childs.length >= minimum_childs
1872
- to_infer = Hash.new(0)
1873
- # Compare
1874
- while childs.length > 1
1875
- curr_term = childs.shift
1876
- childs.each do |compare_term|
1877
- pivot_items = @items[curr_term]
1878
- compare_items = @items[compare_term]
1879
- if ontology.nil? # Exact match
1880
- pivot_items.map do |pitem|
1881
- if compare_items.include?(pitem)
1882
- to_infer[pitem] += 2
1883
- end
1884
- end
1885
- else # Find MICAs
1886
- local_infer = Hash.new(0)
1887
- pivot_items.map do |pitem|
1888
- micas = compare_items.map{|citem| ontology.get_MICA(pitem, citem)}
1889
- maxmica = micas[0]
1890
- micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
1891
- local_infer[maxmica.first] += 1
1892
- end
1893
- compare_items.map do |citem|
1894
- micas = pivot_items.map{|pitem| ontology.get_MICA(pitem, citem)}
1895
- maxmica = micas[0]
1896
- micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
1897
- local_infer[maxmica.first] += 1
1898
- end
1899
- local_infer.each{|t,freq| to_infer[t] += freq if freq >= 2}
1900
- end
1901
- end
1902
- end
1903
- # Filter infer
1904
- to_infer = to_infer.select{|k,v| v >= minimum_childs}
1905
- end
1906
- # Infer
1907
- if to_infer.length > 0
1908
- @items[term_expand] = [] if @items[term_expand].nil?
1909
- if to_infer.kind_of?(Array)
1910
- @items[term_expand] = (@items[term_expand] + to_infer).uniq
1911
- else
1912
- @items[term_expand] = (@items[term_expand] + to_infer.keys).uniq
1913
- end
1914
- @items[term_expand] = ontology.clean_profile(@items[term_expand]) if clean_profiles && !ontology.nil?
1915
- elsif !@items.include?(term_expand)
1916
- targetKeys.delete(term_expand)
1917
- end
1918
- end
1919
- end
1920
- end
1921
-
1922
-
1923
-
1924
- # NO IDEA WHAT THIS DOES. DON'T USE THIS METHODS IS NOT CHECKED
1925
- # ===== Parameters
1926
- # ++::
1927
- # ===== Returns
1928
- # ...
1929
- def compute_relations_to_items(external_item_list, mode, thresold)
1930
- results = []
1931
- penalized_terms = {}
1932
- # terms_levels = get_terms_levels(@items_relations.keys)
1933
- terms_with_items_levels = @items_relations.keys.map{|term| self.get_term_level(term)}.uniq
1934
- terms_levels = self.get_ontology_levels().select{|k,v| terms_with_items_levels.include?(k)}
1935
- terms_levels = terms_levels.each{|level,terms| [level, terms.select{|t| @items_relations.keys.include?(t)}] } # Use only items terms. MAYBE IT'S NOT OUR TARGET (line added by fmj)
1936
- levels = terms_levels.keys.sort
1937
- levels.reverse_each do |level|
1938
- terms_levels[level].each do |term|
1939
- associated_items = @items_relations[term]
1940
- if mode == :elim
1941
- items_to_remove = penalized_terms[term]
1942
- items_to_remove = [] if items_to_remove.nil?
1943
- pval = get_fisher_exact_test(
1944
- external_item_list - items_to_remove,
1945
- associated_items - items_to_remove,
1946
- ((associated_items | external_item_list) - items_to_remove).length
1947
- )
1948
- if pval <= thresold
1949
- parents = get_parents(term) # Save the items for each parent term to remove them later in the fisher test
1950
- parents.each do |prnt|
1951
- query = penalized_terms[prnt]
1952
- if query.nil?
1953
- penalized_terms[prnt] = @items_relations[term].clone # We need a new array to store the following iterations
1954
- else
1955
- query.concat(@items_relations[term])
1956
- end
1957
- end
1958
- end
1959
- end
1960
- results << [term, pval]
1961
- end
1962
- end
1963
- return results
1964
- end
1965
-
1966
-
1967
- # Check if a given ID is a removable (blacklist) term.
1968
- # +DEPRECATED+ use is_removable? instead
1969
- # ===== Parameters
1970
- # +id+:: to be checked
1971
- # ===== Returns
1972
- # true if given term is a removable (blacklist) term or false in other cases
1973
- def is_removable(id)
1974
- warn "[DEPRECATION] `is_removable` is deprecated. Please use `is_removable?` instead."
1975
- return @removable_terms.include?(id.to_sym)
1976
- end
1977
-
1978
- # Check if a given ID is a removable (blacklist) term
1979
- # ===== Parameters
1980
- # +id+:: to be checked
1981
- # ===== Returns
1982
- # true if given term is a removable (blacklist) term or false in other cases
1983
- def is_removable? id
1984
- return @removable_terms.include?(id.to_sym)
1985
- end
1986
-
1987
- ############################################
1988
- # SPECIAL METHODS
1989
- #############################################
1990
- def ==(other)
1991
- self.header == other.header &&
1992
- self.stanzas == other.stanzas &&
1993
- self.ancestors_index == other.ancestors_index &&
1994
- self.alternatives_index == other.alternatives_index &&
1995
- self.obsoletes_index == other.obsoletes_index &&
1996
- self.structureType == other.structureType &&
1997
- self.ics == other.ics &&
1998
- self.meta == other.meta &&
1999
- self.dicts == other.dicts &&
2000
- self.profiles == other.profiles &&
2001
- self.profilesDict == other.profilesDict &&
2002
- (self.items.keys - other.items.keys).empty? &&
2003
- self.removable_terms == other.removable_terms &&
2004
- self.special_tags == other.special_tags &&
2005
- self.items == other.items &&
2006
- self.term_paths == other.term_paths &&
2007
- self.max_freqs == other.max_freqs
5
+ #########################################################
6
+ # AUTHOR NOTES
7
+ #########################################################
8
+
9
+ # 1 - Store @profiles as @stanzas[:instances]
10
+ # 2 - Items values (not keys) are imported as strings, not as symbols (maybe add a flag which indicates if values are, or not, symbols?)
11
+
12
+
13
+ #############################################
14
+ # FIELDS
15
+ #############################################
16
+ # Handled class variables
17
+ # => @@basic_tags :: hash with main OBO structure tags
18
+ # => @@allowed_calcs :: hash with allowed ICs and similaritites calcs
19
+ # => @@symbolizable_ids :: tags which can be symbolized
20
+ # => @@tags_with_trailing_modifiers :: tags which can include extra info after specific text modifiers
21
+ #
22
+ # Handled object variables
23
+ # => @header :: file header (if is available)
24
+ # => @stanzas :: OBO stanzas {:terms,:typedefs,:instances}
25
+ # => @ancestors_index :: hash of ancestors per each term handled with any structure relationships
26
+ # => @descendants_index :: hash of descendants per each term handled with any structure relationships
27
+ # => @alternatives_index :: has of alternative IDs (include alt_id and obsoletes)
28
+ # => @obsoletes_index :: hash of obsoletes and it's new ids
29
+ # => @special_tags :: set of special tags to be expanded (:is_a, :obsolete, :alt_id)
30
+ # => @structureType :: type of ontology structure depending on ancestors relationship. Allowed: {atomic, sparse, circular, hierarchical}
31
+ # => @ics :: already calculated ICs for handled terms and IC types
32
+ # => @meta :: meta_information about handled terms like [ancestors, descendants, struct_freq, observed_freq]
33
+ # => @max_freqs :: maximum freqs found for structural and observed freqs
34
+ # => @dicts :: bidirectional dictionaries with three levels <key|value>: 1º) <tag|hash2>; 2º) <(:byTerm/:byValue)|hash3>; 3º) dictionary <k|v>
35
+ # => @profiles :: set of terms assigned to an ID
36
+ # => @profilesDict :: set of profile IDs assigned to a term
37
+ # => @items :: hash with items relations to terms
38
+ # => @removable_terms :: array of terms to not be considered
39
+ # => @term_paths :: metainfo about parental paths of each term
40
+
41
+ @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:alt_id,:replaced_by,:consider]}
42
+ @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
43
+ @@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider]
44
+ @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
45
+ @@multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider, :subsetdef, :synonymtypedef, :property_value, :remark]
46
+ @@symbolizable_ids.concat(@@tags_with_trailing_modifiers)
47
+
48
+ #############################################
49
+ # CONSTRUCTOR
50
+ #############################################
51
+
52
+ # Instantiate a OBO_Handler object
53
+ # ===== Parameters
54
+ # +file+:: with info to be loaded (.obo ; .json)
55
+ # +load_file+:: activate load process automatically (only for .obo)
56
+ # +removable_terms+: term to be removed from calcs
57
+ # +build+: flag to launch metainfo calculation
58
+ def initialize(file: nil, load_file: false, removable_terms: [], build: true)
59
+ # Initialize object variables
60
+ @header = nil
61
+ @stanzas = {terms: {}, typedefs: {}, instances: {}}
62
+ @ancestors_index = {}
63
+ @descendants_index = {}
64
+ @alternatives_index = {}
65
+ @obsoletes_index = {}
66
+ @structureType = nil
67
+ @ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
68
+ @meta = {}
69
+ @special_tags = @@basic_tags.clone
70
+ @max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
71
+ @dicts = {}
72
+ @profiles = {}
73
+ @profilesDict = {}
74
+ @items = {}
75
+ @removable_terms = []
76
+ @term_paths = {}
77
+ # Load if proceeds
78
+ add_removable_terms(removable_terms) if !removable_terms.empty?
79
+ load(file, build: build) if load_file
80
+ end
81
+
82
+
83
+ #############################################
84
+ # CLASS METHODS
85
+ #############################################
86
+
87
+ # Expand a (starting) term using a specific tag and return all extended terms into an array and
88
+ # the relationship structuture observed (hierarchical or circular). If circular structure is
89
+ # foumd, extended array will be an unique vector without starting term (no loops).
90
+ # +Note+: we extremly recomend use get_related_ids_by_tag function instead of it (directly)
91
+ # ===== Parameters
92
+ # +start+:: term where start to expand
93
+ # +terms+:: set to be used to expand
94
+ # +target_tag+:: tag used to expand
95
+ # +eexpansion+:: already expanded info
96
+ # +split_info_char+:: special regex used to split info (if it is necessary)
97
+ # +split_info_indx+:: special index to take splitted info (if it is necessary)
98
+ # +alt_ids+:: set of alternative IDs
99
+ # ===== Returns
100
+ # A vector with the observed structure (string) and the array with extended terms.
101
+ def self.get_related_ids(start_id, terms, target_tag, related_ids = {}, alt_ids = {})
102
+ # Take start_id term available info and already accumulated info
103
+ current_associations = related_ids[start_id]
104
+ current_associations = [] if current_associations.nil?
105
+ return [:no_term,[]] if terms[start_id].nil?
106
+ id_relations = terms[start_id][target_tag]
107
+ return [:source,[]] if id_relations.nil?
108
+
109
+ # Prepare auxiliar variables
110
+ struct = :hierarchical
111
+
112
+ # Study direct extensions
113
+ id_relations = id_relations.clone
114
+ while id_relations.length > 0
115
+ id = id_relations.shift
116
+ id = alt_ids[id].first if alt_ids.include?(id) # NOTE: if you want to persist current ID instead source ID, re-implement this
117
+
118
+ # Handle
119
+ if current_associations.include?(id) # Check if already have been included into this expansion
120
+ struct = :circular
121
+ else
122
+ current_associations << id
123
+ if related_ids.include?(id) # Check if current already has been expanded
124
+ current_associations = current_associations | related_ids[id]
125
+ if current_associations.include?(start_id) # Check circular case
126
+ struct = :circular
127
+ [id, start_id].each{|repeated| current_associations.delete(repeated)}
128
+ end
129
+ else # Expand
130
+ related_ids[start_id] = current_associations
131
+ structExp, current_related_ids = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids) # Expand current
132
+ current_associations = current_associations | current_related_ids
133
+ struct = :circular if structExp == :circular # Check struct
134
+ if current_associations.include?(start_id) # Check circular case
135
+ struct = :circular
136
+ current_associations.delete(start_id)
137
+ end
138
+ end
139
+ end
140
+ end
141
+ related_ids[start_id] = current_associations
142
+
143
+ return struct, current_associations
144
+ end
145
+
146
+
147
+ # Expand terms using a specific tag and return all extended terms into an array and
148
+ # the relationship structuture observed (hierarchical or circular). If circular structure is
149
+ # foumd, extended array will be an unique vector without starting term (no loops)
150
+ # ===== Parameters
151
+ # +terms+:: set to be used to expand
152
+ # +target_tag+:: tag used to expand
153
+ # +split_info_char+:: special regex used to split info (if it is necessary)
154
+ # +split_info_indx+:: special index to take splitted info (if it is necessary)
155
+ # +alt_ids+:: set of alternative IDs
156
+ # +obsoletes+:: integer with the number of obsolete IDs. used to calculate structure type.
157
+ # ===== Returns
158
+ # A vector with the observed structure (string) and the hash with extended terms
159
+ def self.get_related_ids_by_tag(terms:,target_tag:, alt_ids: {}, obsoletes: 0)
160
+ # Define structure type
161
+ structType = :hierarchical
162
+ related_ids = {}
163
+ terms.each do |id, tags|
164
+ # Check if target tag is defined
165
+ if !tags[target_tag].nil?
166
+ # Obtain related terms
167
+ set_structure, _ = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids)
168
+ # Check structure
169
+ structType = :circular if set_structure == :circular
170
+ end
171
+ end
172
+
173
+ # Check special case
174
+ structType = :atomic if related_ids.length <= 0
175
+ structType = :sparse if related_ids.length > 0 && ((terms.length - related_ids.length - obsoletes) >= 2)
176
+ # Return type and hash with related_ids
177
+ return structType, related_ids
178
+ end
179
+
180
+
181
+ # Class method to transform string with <tag : info> into hash structure
182
+ # ===== Parameters
183
+ # +attributes+:: array tuples with info to be transformed into hash format
184
+ # ===== Returns
185
+ # Attributes stored into hash structure
186
+ def self.info2hash(attributes, split_char = " ! ", selected_field = 0)
187
+ # Load info
188
+ info_hash = {}
189
+ # Only TERMS multivalue tags (future add Typedefs and Instance)
190
+ # multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider]
191
+ attributes.each do |tag, value|
192
+ # Check
193
+ raise EncodingError, 'Info element incorrect format' if (tag.nil?) || (value.nil?)
194
+ # Prepare
195
+ tag = tag.lstrip.to_sym
196
+ value.lstrip!
197
+ value = value.split(split_char)[selected_field].to_sym if @@tags_with_trailing_modifiers.include?(tag)
198
+
199
+ # Store
200
+ query = info_hash[tag]
201
+ if !query.nil? # Tag already exists
202
+ if !query.kind_of?(Array) # Check that tag is multivalue
203
+ raise('Attempt to concatenate plain text with another. The tag is not declared as multivalue. [' + tag.to_s + '](' + query + ')')
204
+ else
205
+ query << value # Add new value to tag
206
+ end
207
+ else # New entry
208
+ if @@multivalue_tags.include?(tag)
209
+ info_hash[tag] = [value]
210
+ else
211
+ info_hash[tag] = value
212
+ end
213
+ end
214
+ end
215
+ self.symbolize_ids(info_hash)
216
+ return info_hash
217
+ end
218
+
219
+
220
+ # Class method to load an OBO format file (based on OBO 1.4 format). Specially focused on load
221
+ # the Header, the Terms, the Typedefs and the Instances.
222
+ # ===== Parameters
223
+ # +file+:: OBO file to be loaded
224
+ # ===== Returns
225
+ # Hash with FILE, HEADER and STANZAS info
226
+ def self.load_obo(file) #TODO: Send to obo_parser class
227
+ raise("File is not defined") if file.nil?
228
+ # Data variables
229
+ header = ''
230
+ stanzas = {terms: {}, typedefs: {}, instances: {}}
231
+ # Auxiliar variables
232
+ infoType = 'Header'
233
+ currInfo = []
234
+ stanzas_flags = %w[[Term] [Typedef] [Instance]]
235
+ # Read file
236
+ File.open(file).each do |line|
237
+ line.chomp!
238
+ next if line.empty?
239
+ fields = line.split(':', 2)
240
+ # Check if new instance is found
241
+ if stanzas_flags.include?(line)
242
+ header = self.process_entity(header, infoType, stanzas, currInfo)
243
+ # Update info variables
244
+ currInfo = []
245
+ infoType = line.gsub!(/[\[\]]/, '')
246
+ next
247
+ end
248
+ # Concat info
249
+ currInfo << fields
250
+ end
251
+ # Store last loaded info
252
+ header = self.process_entity(header, infoType, stanzas, currInfo) if !currInfo.empty?
253
+
254
+ # Prepare to return
255
+ finfo = {:file => file, :name => File.basename(file, File.extname(file))}
256
+ return finfo, header, stanzas
257
+ end
258
+
259
+
260
+ # Handle OBO loaded info and stores it into correct container and format
261
+ # ===== Parameters
262
+ # +header+:: container
263
+ # +infoType+:: current ontology item type detected
264
+ # +stanzas+:: container
265
+ # +currInfo+:: info to be stored
266
+ # ===== Returns
267
+ # header newly/already stored
268
+ def self.process_entity(header, infoType, stanzas, currInfo)
269
+ info = self.info2hash(currInfo)
270
+ # Store current info
271
+ if infoType.eql?('Header')
272
+ header = info
273
+ else
274
+ id = info[:id]
275
+ case infoType
276
+ when 'Term'
277
+ stanzas[:terms][id] = info
278
+ when 'Typedef'
279
+ stanzas[:typedefs][id] = info
280
+ when 'Instance'
281
+ stanzas[:instances][id] = info
282
+ end
283
+ end
284
+ return header
285
+ end
286
+
287
+
288
+ # Symboliza all values into hashs using symbolizable tags as keys
289
+ # ===== Parameters
290
+ # +item_hash+:: hash to be checked
291
+ def self.symbolize_ids(item_hash)
292
+ @@symbolizable_ids.each do |tag|
293
+ query = item_hash[tag]
294
+ if !query.nil?
295
+ if query.kind_of?(Array)
296
+ query.map!{|item| item.to_sym}
297
+ else
298
+ item_hash[tag] = query.to_sym if !query.nil?
299
+ end
300
+ end
301
+ end
302
+ end
303
+
304
+
305
+ #
306
+ # ===== Parameters
307
+ # +root+:: main term to expand
308
+ # +ontology+:: to be cutted
309
+ # +clone+:: if true, given ontology object will not be mutated
310
+ # +remove_up+:: if true, stores only the root term given an it descendants. If false, only root ancestors will be stored
311
+ # ===== Returns
312
+ # An Ontology object with terms after cut the ontology.
313
+ def self.mutate(root, ontology, clone: true, remove_up: true)
314
+ ontology = ontology.clone if clone
315
+ # Obtain affected IDs
316
+ descendants = ontology.descendants_index[root]
317
+ descendants << root # Store itself to do not remove it
318
+ # Remove unnecesary terms
319
+ ontology.stanzas[:terms] = ontology.stanzas[:terms].select{|id,v| remove_up ? descendants.include?(id) : !descendants.include?(id)}
320
+ ontology.ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
321
+ ontology.max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
322
+ ontology.dicts = {}
323
+ ontology.removable_terms = []
324
+ ontology.term_paths = {}
325
+ # Recalculate metadata
326
+ ontology.build_index
327
+ ontology.add_observed_terms_from_profiles
328
+ # Finish
329
+ return ontology
330
+ end
331
+
332
+
333
+
334
+ #############################################
335
+ # GENERAL METHODS
336
+ #############################################
337
+
338
+ # Include removable terms to current removable terms list
339
+ # ===== Parameters
340
+ # +terms+:: terms array to be concatenated
341
def add_removable_terms(terms)
  # Symbolize incoming terms and append them to the removable list.
  @removable_terms.concat(terms.map{|term| term.to_sym})
end
345
+
346
+
347
+ # Include removable terms to current removable terms list loading new
348
+ # terms from a one column plain text file
349
+ # ===== Parameters
350
+ # +file+:: to be loaded
351
def add_removable_terms_from_file(file)
  # BUGFIX: the original iterated File.open(excluded_codes_file), but no such
  # variable/method exists in this scope — every call raised NameError instead
  # of reading the `file` parameter. File.foreach also guarantees the handle
  # is closed after iteration (the original leaked it).
  File.foreach(file) do |line|
    @removable_terms << line.chomp.to_sym
  end
end
357
+
358
+
359
+ # Increase observed frequency for a specific term
360
+ # ===== Parameters
361
+ # +term+:: term which frequency is going to be increased
362
+ # +increas+:: frequency rate to be increased. Default = 1
363
+ # ===== Return
364
+ # true if process ends without errors, false in other cases
365
def add_observed_term(term:,increase: 1.0)
  # Validate input: nil is an error, unknown/removable terms are just rejected.
  raise ArgumentError, "Term given is NIL" if term.nil?
  return false unless @stanzas[:terms].include?(term)
  return false if @removable_terms.include?(term)
  fresh_meta = lambda { {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} }
  # Alternative IDs share the metadata record of their official term
  if @alternatives_index.include?(term)
    official = @alternatives_index[term]
    @meta[official] = fresh_meta.call if @meta[official].nil?
    @meta[term] = @meta[official]
  end
  # Ensure a record exists for the term itself
  @meta[term] = fresh_meta.call if @meta[term].nil?
  record = @meta[term]
  # Reset sentinel before accumulating the observation
  record[:observed_freq] = 0 if record[:observed_freq] == -1
  record[:observed_freq] += increase
  # Keep the running maximum up to date
  @max_freqs[:observed_freq] = record[:observed_freq] if @max_freqs[:observed_freq] < record[:observed_freq]
  return true
end
384
+
385
+
386
+ # Increase the arbitrary frequency of a given term set
387
+ # ===== Parameters
388
+ # +terms+:: set of terms to be updated
389
+ # +increase+:: amount to be increased
390
+ # +transform_to_sym+:: if true, transform observed terms to symbols. Default: false
391
+ # ===== Return
392
+ # true if process ends without errors and false in other cases
393
def add_observed_terms(terms:, increase: 1.0, transform_to_sym: false)
  # Validate input
  raise ArgumentError, 'Terms array given is NIL' if terms.nil?
  raise ArgumentError, 'Terms given is not an array' if !terms.is_a? Array
  # Optionally symbolize, then register each observation; collect per-term results.
  ids = transform_to_sym ? terms.map{|t| t.to_sym} : terms
  return ids.map{|id| self.add_observed_term(term: id, increase: increase)}
end
405
+
406
+
407
+ # Compare to terms sets
408
+ # ===== Parameters
409
+ # +termsA+:: set to be compared
410
+ # +termsB+:: set to be compared
411
+ # +sim_type+:: similitude method to be used. Default: resnik
412
+ # +ic_type+:: ic type to be used. Default: resnik
413
+ # +bidirectional+:: calculate bidirectional similitude. Default: false
414
+ # ===== Return
415
+ # similitude calculated
416
def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
  # Validate input
  raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
  raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
  # A -> B: best match of each term of A against the whole of B (0 when no
  # comparison yields a usable value)
  best_matches = termsA.map do |tA|
    sims = termsB.map{|tB| self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)}
    sims = sims - [false, nil] # Drop failed comparisons
    sims.empty? ? 0 : sims.max
  end
  means_sim = best_matches.inject{|sum, el| sum + el}.to_f / best_matches.size
  # B -> A: average both directions, weighted by set sizes
  if bidirectional
    total_A = means_sim * best_matches.size
    total_B = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false) * termsB.size
    means_sim = (total_A + total_B) / (termsA.size + termsB.size)
  end
  return means_sim
end
440
+
441
+
442
+ # Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with itself
443
+ # ===== Parameters
444
+ # +external_profiles+:: set of external profiles. If nil, internal profiles will be compared with itself
445
+ # +sim_type+:: similitude method to be used. Default: resnik
446
+ # +ic_type+:: ic type to be used. Default: resnik
447
+ # +bidirectional+:: calculate bidirectional similitude. Default: false
448
+ # ===== Return
449
+ # Similitudes calculated
450
def compare_profiles(external_profiles: nil, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
  profiles_similarity = {} #calculate similarity between patients profile
  profiles_ids = @profiles.keys
  if external_profiles.nil?
    # Self comparison. NOTE: comp_ids and main_ids reference the SAME array,
    # so every `shift` below also shrinks comp_ids — each unordered pair is
    # therefore compared exactly once and no profile is compared to itself.
    comp_ids = profiles_ids
    comp_profiles = @profiles
    main_ids = comp_ids
    main_profiles = comp_profiles
  else
    # Compare every internal profile against every external profile
    comp_ids = external_profiles.keys
    comp_profiles = external_profiles
    main_ids = profiles_ids
    main_profiles = @profiles
  end
  # Compare. main_ids is consumed destructively, but @profiles.keys returned a
  # fresh array above, so the stored profiles container itself is untouched.
  while !main_ids.empty?
    curr_id = main_ids.shift
    current_profile = main_profiles[curr_id]
    comp_ids.each do |id|
      profile = comp_profiles[id]
      value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional)
      # Lazily create the per-profile result hash
      query = profiles_similarity[curr_id]
      if query.nil?
        profiles_similarity[curr_id] = {id => value}
      else
        query[id] = value
      end
    end
  end
  return profiles_similarity
end
481
+
482
+
483
+ # Expand alternative IDs arround all already stored terms
484
+ # ===== Parameters
485
+ # +alt_tag+:: tag used to expand alternative IDs
486
+ # ===== Returns
487
+ # true if process ends without errors and false in other cases
488
def get_index_alternatives(alt_tag: @@basic_tags[:alternative][0])
  # Check input
  raise('stanzas terms empty') if @stanzas[:terms].empty?
  # Collect alternative IDs; new stanza entries are merged after iteration so
  # the terms hash is not modified while being traversed.
  pending_terms = {}
  @stanzas[:terms].each do |id, tags|
    alt_ids = tags[alt_tag]
    next if alt_ids.nil?
    (alt_ids - @removable_terms).each do |alt_term|
      @alternatives_index[alt_term] = id
      pending_terms[alt_term] = @stanzas[:terms][id] unless @stanzas[:terms].include?(alt_term)
      @ancestors_index[alt_term] = @ancestors_index[id] unless @ancestors_index[id].nil?
    end
  end
  @stanzas[:terms].merge!(pending_terms)
end
507
+
508
+
509
+ # Executes basic expansions of tags (alternatives, obsoletes and parentals) with default values
510
+ # ===== Returns
511
+ # true if eprocess ends without errors and false in other cases
512
def build_index()
  # Expansions must run in this order: alternatives, obsoletes, parentals.
  self.get_index_alternatives
  self.get_index_obsoletes
  self.get_index_child_parent_relations
  # Normalize every stored ID to its official form; entries that cannot be
  # resolved become nil and are dropped by compact!.
  @alternatives_index.each{|k,v| @alternatives_index[k] = self.extract_id(v)}
  @alternatives_index.compact!
  @obsoletes_index.each{|k,v| @obsoletes_index[k] = self.extract_id(v)}
  @obsoletes_index.compact!
  @ancestors_index.each{|k,v| @ancestors_index[k] = v.map{|t| self.extract_id(t)}.compact}
  @ancestors_index.compact!
  @descendants_index.each{|k,v| @descendants_index[k] = v.map{|t| self.extract_id(t)}.compact}
  @descendants_index.compact!
  # Frequencies, dictionaries and term levels/paths
  self.get_index_frequencies
  self.calc_dictionary(:name)
  self.calc_dictionary(:synonym, select_regex: /\"(.*)\"/)
  self.calc_term_levels(calc_paths: true)
end
529
+
530
+
531
+ # Calculates regular frequencies based on ontology structure (using parentals)
532
+ # ===== Returns
533
+ # true if everything end without errors and false in other cases
534
def get_index_frequencies()
  # Check
  if @ancestors_index.empty?
    warn('ancestors_index object is empty')
  else
    alternative_terms = @alternatives_index.keys
    # Per each term, add frequencies
    @stanzas[:terms].each do |id, tags|
      if @alternatives_index.include?(id)
        # Alternative IDs share (and do not increase) the official record
        official = @alternatives_index[id]
        record = @meta[official]
        if record.nil?
          record = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
          @meta[official] = record
        end
        @meta[id] = record
      else # Official term
        record = @meta[id]
        if record.nil?
          record = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
          @meta[id] = record
        end
        # Counts exclude alternative IDs so each relation is counted once
        record[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !alternative_terms.include?(anc)}.to_f : 0.0
        record[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !alternative_terms.include?(desc)}.to_f : 0.0
        record[:struct_freq] = record[:descendants] + 1.0
        # Update maximums
        @max_freqs[:struct_freq] = record[:struct_freq] if @max_freqs[:struct_freq] < record[:struct_freq]
        @max_freqs[:max_depth] = record[:descendants] if @max_freqs[:max_depth] < record[:descendants]
      end
    end
  end
end
569
+
570
+
571
+ # Expand obsoletes set and link info to their alternative IDs
572
+ # ===== Parameters
573
# +obs_tag+:: tag to be used to find obsoletes
574
+ # +alt_tags+:: tags to find alternative IDs (if are available)
575
+ # +reset_obsoletes+:: flag to indicate if obsoletes set must be reset. Default: true
576
+ # ===== Returns
577
+ # true if process ends without errors and false in other cases
578
def get_index_obsoletes(obs_tag: @@basic_tags[:obsolete], alt_tags: @@basic_tags[:alternative])
  if @stanzas[:terms].empty?
    warn('stanzas terms empty')
  else
    # Check obsoletes
    @stanzas[:terms].each do |id, term_tags|
      next if term_tags.nil?
      next unless term_tags[obs_tag] == 'true' # Not flagged as obsolete
      next unless @obsoletes_index[id].nil? # Already stored
      # Link to the first ID of the first alternative tag available
      replacements = alt_tags.map{|alt| term_tags[alt]}.compact
      next if replacements.empty?
      replacement = replacements.first.first
      @alternatives_index[id] = replacement
      @obsoletes_index[id] = replacement
    end
  end
end
600
+
601
+
602
+ # Expand parentals set and link all info to their alternative IDs. Also launch frequencies process
603
+ # ===== Parameters
604
+ # +tag+:: tag used to expand parentals
605
+ # +split_info_char+:: special regex used to split info (if it is necessary)
606
+ # +split_info_indx+:: special index to take splitted info (if it is necessary)
607
+ # ===== Returns
608
+ # true if process ends without errors and false in other cases
609
def get_index_child_parent_relations(tag: @@basic_tags[:ancestors][0])
  # Check
  # NOTE(review): the guard tests for nil but the message says "empty" — an
  # empty (non-nil) terms hash falls through to the expansion below.
  if @stanzas[:terms].nil?
    warn('stanzas terms empty')
  else
    # Expand: delegate transitive parental expansion to the class helper,
    # which also reports the detected graph structure type.
    structType, parentals = self.class.get_related_ids_by_tag(terms: @stanzas[:terms],
                                                              target_tag: tag,
                                                              alt_ids: @alternatives_index,
                                                              obsoletes: @obsoletes_index.length)
    # Check
    raise('Error expanding parentals') if (structType.nil?) || parentals.nil?
    # Prepare ancestors structure and invert it into descendants on the fly
    anc = {}
    des = {}
    parentals.each do |id, parents|
      parents = parents - @removable_terms
      anc[id] = parents
      parents.each do |anc_id| # Add descendants
        if !des.include?(anc_id)
          des[anc_id] = [id]
        else
          des[anc_id] << id
        end
      end
    end
    # Store alternatives: each alternative ID shares the family of its
    # official term (same array objects, not copies)
    @alternatives_index.each do |id,alt|
      anc[id] = anc[alt] if anc.include?(alt)
      des[id] = des[alt] if des.include?(alt)
    end
    # Check structure: anything that is not atomic/sparse/circular is
    # classified as hierarchical
    if ![:atomic,:sparse].include? structType
      structType = structType == :circular ? :circular : :hierarchical
    end
    # Store
    @ancestors_index = anc
    @descendants_index = des
    @structureType = structType
  end
  # Finish
end
651
+
652
+
653
+ # Find ancestors of a given term
654
+ # ===== Parameters
655
+ # +term+:: to be checked
656
+ # +filter_alternatives+:: if true, remove alternatives from final results
657
+ # ===== Returns
658
+ # an array with all ancestors of given term or false if parents are not available yet
659
def get_ancestors(term, filter_alternatives = false)
  # Delegate to get_familiar in ancestors mode
  self.get_familiar(term, true, filter_alternatives)
end
662
+
663
+
664
+ # Find descendants of a given term
665
+ # ===== Parameters
666
+ # +term+:: to be checked
667
+ # +filter_alternatives+:: if true, remove alternatives from final results
668
+ # ===== Returns
669
+ # an array with all descendants of given term or false if parents are not available yet
670
def get_descendants(term, filter_alternatives = false)
  # Delegate to get_familiar in descendants mode
  self.get_familiar(term, false, filter_alternatives)
end
673
+
674
+
675
+ # Find ancestors/descendants of a given term
676
+ # ===== Parameters
677
+ # +term+:: to be checked
678
+ # +return_ancestors+:: return ancestors if true or descendants if false
679
+ # +filter_alternatives+:: if true, remove alternatives from final results
680
+ # ===== Returns
681
+ # an array with all ancestors/descendants of given term or nil if parents are not available yet
682
def get_familiar(term, return_ancestors = true, filter_alternatives = false)
  # Pick the requested index and copy the entry so callers cannot mutate
  # the internal structures. Unknown terms yield an empty array.
  source = return_ancestors ? @ancestors_index : @descendants_index
  relatives = source[term]
  return [] if relatives.nil?
  relatives = relatives.clone
  relatives.reject!{|fm| @alternatives_index.include?(fm)} if filter_alternatives
  return relatives
end
695
+
696
+
697
+ # Obtain IC of an specific term
698
+ # ===== Parameters
699
+ # +term+:: which IC will be calculated
700
+ # +type+:: of IC to be calculated. Default: resnik
701
+ # +force+:: force re-calculate the IC. Do not check if it is already calculated
702
+ # +zhou_k+:: special coeficient for Zhou IC method
703
+ # ===== Returns
704
+ # the IC calculated
705
def get_IC(termRaw, type: :resnik, force: false, zhou_k: 0.5)
  term = termRaw.to_sym
  # Check
  raise ArgumentError, "IC type specified (#{type}) is not allowed" if !@@allowed_calcs[:ics].include?(type)
  # Return cached value unless a recalculation is forced
  return @ics[type][term] if (@ics[type].include? term) && !force
  # Calculate
  ic = - 1
  case type # https://arxiv.org/ftp/arxiv/papers/1310/1310.8059.pdf ||| https://sci-hub.st/https://doi.org/10.1016/j.eswa.2012.01.082
    ###########################################
    #### STRUCTURE BASED METRICS
    ###########################################
    # Shortest path
    # Weighted Link
    # Hirst and St-Onge Measure
    # Wu and Palmer
    # Slimani
    # Li
    # Leacock and Chodorow
    ###########################################
    #### INFORMATION CONTENT METRICS
    ###########################################
    when :resnik # Resnik P: Using Information Content to Evaluate Semantic Similarity in a Taxonomy
      # -log(Freq(x) / Max_Freq)
      ic = -Math.log10(@meta[term][:struct_freq].fdiv(@max_freqs[:struct_freq]))
    when :resnik_observed
      # -log(Freq(x) / Max_Freq)
      ic = -Math.log10(@meta[term][:observed_freq].fdiv(@max_freqs[:observed_freq]))
    # Lin
    # Jiang & Conrath
    ###########################################
    #### FEATURE-BASED METRICS
    ###########################################
    # Tversky
    # x-similarity
    # Rodirguez
    ###########################################
    #### HYBRID METRICS
    ###########################################
    when :seco, :zhou # SECO:: An intrinsic information content metric for semantic similarity in WordNet
      # 1 - ( log(hypo(x) + 1) / log(max_nodes) )
      ic = 1 - Math.log10(@meta[term][:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
      # BUGFIX: the original guard was `if :zhou` — a symbol literal is always
      # truthy, so the Zhou transformation was applied to :seco requests too.
      # Compare against the requested type instead.
      if type == :zhou # New Model of Semantic Similarity Measuring in Wordnet
        # k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
        @ics[:seco][term] = ic # Special store
        ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(@meta[term][:descendants]).fdiv(Math.log10(@max_freqs[:max_depth])))
      end
    when :sanchez # Semantic similarity estimation in the biomedical domain: An ontology-basedinformation-theoretic perspective
      ic = -Math.log10((@meta[term][:descendants].fdiv(@meta[term][:ancestors]) + 1.0).fdiv(@max_freqs[:max_depth] + 1.0))
    # Knappe
  end
  # Cache and return
  @ics[type][term] = ic
  return ic
end
761
+
762
+
763
+ # Calculates and return resnik ICs (by ontology and observed frequency) for observed terms
764
+ # ===== Returns
765
+ # two hashes with resnik and resnik_observed ICs for observed terms
766
def get_observed_ics_by_onto_and_freq
  # Nothing observed: both tables are empty
  return {}, {} if @profiles.empty?
  # Ensure both IC flavours are cached for every observed term
  observed_terms = @profiles.values.flatten.uniq
  observed_terms.each do |term|
    get_IC(term)
    get_IC(term, type: :resnik_observed)
  end
  resnik = @ics[:resnik].select{|k,v| observed_terms.include?(k)}
  resnik_observed = @ics[:resnik_observed].select{|k,v| observed_terms.include?(k)}
  return resnik.clone, resnik_observed.clone
end
781
+
782
+
783
+ # Find the IC of the Most Index Content shared Ancestor (MICA) of two given terms
784
+ # ===== Parameters
785
+ # +termA+:: term to be cheked
786
+ # +termB+:: term to be checked
787
+ # +ic_type+:: IC formula to be used
788
+ # ===== Returns
789
+ # the IC of the MICA(termA,termB)
790
def get_ICMICA(termA, termB, ic_type = :resnik)
  # get_MICA returns [term, ic]; a nil term means no MICA was found
  term, ic = self.get_MICA(termA, termB, ic_type)
  term.nil? ? nil : ic
end
794
+
795
+
796
+ # Find the Most Index Content shared Ancestor (MICA) of two given terms
797
+ # ===== Parameters
798
+ # +termA+:: term to be cheked
799
+ # +termB+:: term to be checked
800
+ # +ic_type+:: IC formula to be used
801
+ # ===== Returns
802
+ # the MICA(termA,termB) and it's IC
803
def get_MICA(termA, termB, ic_type = :resnik)
  # Resolve alternative IDs to their official terms first
  termA = @alternatives_index[termA] if @alternatives_index.include?(termA)
  termB = @alternatives_index[termB] if @alternatives_index.include?(termB)
  # Special case: a term is its own MICA
  return [termA, self.get_IC(termA, type: ic_type)] if termA.eql?(termB)
  mica = [nil,-1.0]
  # Obtain ancestors (each term is included in its own set below)
  anc_A = self.get_ancestors(termA)
  anc_B = self.get_ancestors(termB)
  unless anc_A.empty? && anc_B.empty?
    # Shared ancestors; keep the one with the highest IC
    shared_ancestors = (anc_A << termA) & (anc_B << termB)
    shared_ancestors.each do |anc|
      ic = self.get_IC(anc, type: ic_type)
      mica = [anc,ic] if ic > mica[1]
    end
  end
  return mica
end
833
+
834
+
835
+ # Calculate similarity between two given terms
836
+ # ===== Parameters
837
+ # +termsA+:: to be compared
838
+ # +termsB+:: to be compared
839
+ # +type+:: similitude formula to be used
840
+ # +ic_type+:: IC formula to be used
841
+ # ===== Returns
842
+ # the similarity between both sets or false if frequencies are not available yet
843
def get_similarity(termA, termB, type: :resnik, ic_type: :resnik)
  # Check
  raise ArgumentError, "SIM type specified (#{type}) is not allowed" if !@@allowed_calcs[:sims].include?(type)
  mica_ic = get_ICMICA(termA, termB, ic_type)
  return nil if mica_ic.nil? # No shared ancestor: no similarity to report
  case type
  when :resnik
    mica_ic
  when :lin
    (2.0 * mica_ic).fdiv(self.get_IC(termA,type: ic_type) + self.get_IC(termB,type: ic_type))
  when :jiang_conrath # This is not a similarity, this is a disimilarity (distance)
    (self.get_IC(termA, type: ic_type) + self.get_IC(termB, type: ic_type)) - (2.0 * mica_ic)
  end
end
861
+
862
+
863
+ # Method used to load information stored into an OBO file and store it into this object.
864
+ # If a file is specified by input parameter, current @file value is updated
865
+ # ===== Parameters
866
+ # +file+:: optional file to update object stored file
867
def load(file, build: true)
  # Parse the OBO file and store header/stanzas on this object
  _, header, stanzas = self.class.load_obo(file)
  @header = header
  @stanzas = stanzas
  # Drop terms flagged as removable before any index is calculated
  self.remove_removable()
  self.build_index() if build
end
875
+
876
+ #
877
# Deletes every removable term from the terms stanzas (no-op when the list is empty)
def remove_removable()
  return if @removable_terms.empty?
  @removable_terms.each{|removable_id| @stanzas[:terms].delete(removable_id)}
end
880
+
881
+
882
+ # Exports an OBO_Handler object in json format
883
+ # ===== Parameters
884
+ # +file+:: where info will be stored
885
def write(file)
  # Gather every serialisable field of this object
  obj_info = {
    header: @header,
    stanzas: @stanzas,
    ancestors_index: @ancestors_index,
    descendants_index: @descendants_index,
    alternatives_index: @alternatives_index,
    obsoletes_index: @obsoletes_index,
    structureType: @structureType,
    ics: @ics,
    meta: @meta,
    special_tags: @special_tags,
    max_freqs: @max_freqs,
    dicts: @dicts,
    profiles: @profiles,
    profilesDict: @profilesDict,
    items: @items,
    removable_terms: @removable_terms,
    term_paths: @term_paths
  }
  # Convert to JSON format & write
  File.open(file, "w") { |f| f.write obj_info.to_json }
end
907
+
908
+
909
# Returns true when the given value parses as a Float, false otherwise
def is_number? string
  Float(string)
  true
rescue StandardError
  false
end
912
+
913
+
914
+ # Read a JSON file with an OBO_Handler object stored
915
+ # ===== Parameters
916
+ # +file+:: with object info
917
# +build+:: if true, calculate indexes. Default: true
918
+ # ===== Return
919
+ # OBO_Handler internal fields
920
def read(file, build: true)
  # Read file
  # NOTE(review): the File handle is never closed — File.read(file) would be safer.
  jsonFile = File.open(file)
  jsonInfo = JSON.parse(jsonFile.read, :symbolize_names => true)
  # Pre-process (Symbolize some hashs values). JSON round-trips symbols as
  # strings, so every field below is converted back to symbols.
  jsonInfo[:stanzas][:terms].map{|id,info| self.class.symbolize_ids(info)} # STANZAS
  jsonInfo[:stanzas][:typedefs].map{|id,info| self.class.symbolize_ids(info)}
  jsonInfo[:stanzas][:instances].map{|id,info| self.class.symbolize_ids(info)}
  # Optional
  jsonInfo[:alternatives_index] = jsonInfo[:alternatives_index].map{|id,value| [id, value.to_sym]}.to_h unless jsonInfo[:alternatives_index].nil?
  jsonInfo[:ancestors_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}} unless jsonInfo[:ancestors_index].nil?
  jsonInfo[:descendants_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}} unless jsonInfo[:descendants_index].nil?
  jsonInfo[:obsoletes_index] = jsonInfo[:obsoletes_index].map{|id,value| [id, value.to_sym]}.to_h unless jsonInfo[:obsoletes_index].nil?
  # Dictionaries: keys/values may be term symbols, numbers or arrays depending
  # on the tag they were built from, so each shape is handled separately.
  jsonInfo[:dicts] = jsonInfo[:dicts].each do |flag, dictionaries|
    next if dictionaries.nil?
    # Special case: byTerm
    dictionaries[:byTerm] = dictionaries[:byTerm].map do |term, value|
      if !term.to_s.scan(/\A[-+]?[0-9]*\.?[0-9]+\Z/).empty? # Numeric dictionary
        [term.to_s.to_i, value.map{|term| term.to_sym}]
      elsif value.is_a? Numeric # Numeric dictionary
        [term.to_sym, value]
      elsif value.kind_of?(Array) && flag == :is_a
        [term.to_sym, value.map{|v| v.to_sym}]
      else
        [term.to_sym, value]
      end
    end
    dictionaries[:byTerm] = dictionaries[:byTerm].to_h
    # By value
    dictionaries[:byValue] = dictionaries[:byValue].map do |value, term|
      if value.is_a? Numeric # Numeric dictionary
        [value, term.to_sym]
      elsif term.is_a? Numeric # Numeric dictionary
        [value.to_s.to_sym, term]
      elsif flag == :is_a
        [value.to_sym, term.map{|v| v.to_sym}]
      elsif term.kind_of?(Array)
        [value.to_sym, term.map{|t| t.to_sym}]
      else
        [value.to_s, term.to_sym]
      end
    end
    dictionaries[:byValue] = dictionaries[:byValue].to_h
  end
  # Profiles: symbolize terms and restore numeric profile IDs
  if !jsonInfo[:profiles].nil?
    jsonInfo[:profiles].map{|id,terms| terms.map!{|term| term.to_sym}}
    jsonInfo[:profiles].keys.map{|id| jsonInfo[:profiles][id.to_s.to_i] = jsonInfo[:profiles].delete(id) if self.is_number?(id.to_s)}
  end
  jsonInfo[:profilesDict].map{|term,ids| ids.map!{|id| id.to_sym if !id.is_a?(Numeric)}} unless jsonInfo[:profilesDict].nil?
  jsonInfo[:removable_terms] = jsonInfo[:removable_terms].map{|term| term.to_sym} unless jsonInfo[:removable_terms].nil?
  # Special tags may be scalar or arrays of tags
  jsonInfo[:special_tags] = jsonInfo[:special_tags].each do |k, v|
    next if v.nil?
    if v.kind_of?(Array)
      jsonInfo[:special_tags][k] = v.map{|tag| tag.to_sym}
    else
      jsonInfo[:special_tags][k] = v.to_sym
    end
  end
  jsonInfo[:items].each{|k,v| jsonInfo[:items][k] = v.map{|item| item.to_sym}} unless jsonInfo[:items].nil?
  jsonInfo[:term_paths].each{|term,info| jsonInfo[:term_paths][term][:paths] = info[:paths].map{|path| path.map{|t| t.to_sym}}} unless jsonInfo[:term_paths].nil?

  # Store info
  @header = jsonInfo[:header]
  @stanzas = jsonInfo[:stanzas]
  @ancestors_index = jsonInfo[:ancestors_index]
  @descendants_index = jsonInfo[:descendants_index]
  @alternatives_index = jsonInfo[:alternatives_index]
  @obsoletes_index = jsonInfo[:obsoletes_index]
  jsonInfo[:structureType] = jsonInfo[:structureType].to_sym unless jsonInfo[:structureType].nil?
  @structureType = jsonInfo[:structureType]
  @ics = jsonInfo[:ics]
  @meta = jsonInfo[:meta]
  @special_tags = jsonInfo[:special_tags]
  @max_freqs = jsonInfo[:max_freqs]
  @dicts = jsonInfo[:dicts]
  @profiles = jsonInfo[:profiles]
  @profilesDict = jsonInfo[:profilesDict]
  @items = jsonInfo[:items]
  @removable_terms = jsonInfo[:removable_terms]
  @term_paths = jsonInfo[:term_paths]

  self.build_index() if build
end
1003
+
1004
+
1005
+ # Check if a given ID is stored as term into this object
1006
+ # ===== Parameters
1007
+ # +id+:: to be checked
1008
+ # ===== Return
1009
+ # True if term is allowed or false in other cases
1010
# True when the given ID is stored as a term in this object
def exists? id
  stanzas[:terms].include?(id)
end
1013
+
1014
+
1015
+ # This method assumes that a text given contains an allowed ID. And will try to obtain it splitting it
1016
+ # ===== Parameters
1017
+ # +text+:: to be checked
1018
+ # ===== Return
1019
+ # The correct ID if it can be found or nil in other cases
1020
def extract_id(text, splitBy: ' ')
  # Exact match wins; otherwise try the first token of the split text
  return text if self.exists?(text)
  candidate = text.to_s.split(splitBy).first.to_sym
  self.exists?(candidate) ? candidate : nil
end
1028
+
1029
+
1030
+ # Generate a bidirectinal dictionary set using a specific tag and terms stanzas set
1031
+ # This functions stores calculated dictionary into @dicts field.
1032
+ # This functions stores first value for multivalue tags
1033
+ # This function does not handle synonyms for byValue dictionaries
1034
+ # ===== Parameters
1035
+ # +tag+:: to be used to calculate dictionary
1036
+ # +select_regex+:: gives a regfex that can be used to modify value to be stored
1037
+ # +substitute_alternatives+:: flag used to indicate if alternatives must, or not, be replaced by it official ID
1038
+ # +store_tag+:: flag used to store dictionary. If nil, mandatory tag given will be used
1039
+ # +multiterm+:: if true, byValue will allows multi-term linkage (array)
1040
+ # +self_type_references+:: if true, program assumes that refrences will be between Ontology terms, and it term IDs will be checked
1041
+ # ===== Return
1042
+ # void. And stores calcualted bidirectional dictonary into dictionaries main container
1043
def calc_dictionary(tag, select_regex: nil, substitute_alternatives: true, store_tag: nil, multiterm: false, self_type_references: false)
  tag = tag.to_sym
  store_tag = tag if store_tag.nil?
  if @stanzas[:terms].empty?
    warn('Terms are not already loaded. Aborting dictionary calc')
  else
    byTerm = {}
    byValue = {}
    # Calc per term
    @stanzas[:terms].each do |term, tags|
      referenceTerm = term
      # Alternative IDs are (optionally) replaced by their official term,
      # unless that official term is itself obsolete
      if @alternatives_index.include?(term) && substitute_alternatives # Special case
        referenceTerm = @alternatives_index[term] if !@obsoletes_index.include?(@alternatives_index[term])
      end
      queryTag = tags[tag]
      if !queryTag.nil?
        # Pre-process: optionally reduce each raw value to its regex capture
        if !select_regex.nil?
          if queryTag.kind_of?(Array)
            queryTag = queryTag.map{|value| value.scan(select_regex).first}
            queryTag.flatten!
          else
            queryTag = queryTag.scan(select_regex).first
          end
          queryTag.compact!
        end
        if queryTag.kind_of?(Array) # Store
          if !queryTag.empty?
            if byTerm.include?(referenceTerm)
              byTerm[referenceTerm] = (byTerm[referenceTerm] + queryTag).uniq
            else
              byTerm[referenceTerm] = queryTag
            end
            # multiterm: byValue maps value => [terms]; otherwise value => term
            # (later terms overwrite earlier ones)
            if multiterm
              queryTag.each do |value|
                byValue[value] = [] if byValue[value].nil?
                byValue[value] << referenceTerm
              end
            else
              queryTag.each{|value| byValue[value] = referenceTerm}
            end
          end
        else
          if byTerm.include?(referenceTerm)
            byTerm[referenceTerm] = (byTerm[referenceTerm] + [queryTag]).uniq
          else
            byTerm[referenceTerm] = [queryTag]
          end
          if multiterm
            byValue[queryTag] = [] if byValue[queryTag].nil?
            byValue[queryTag] << referenceTerm
          else
            byValue[queryTag] = referenceTerm
          end
        end
      end
    end

    # Check self-references: when values are themselves ontology terms,
    # normalize them to official IDs in both directions of the dictionary
    if self_type_references
      byTerm.map do |term, references|
        corrected_references = references.map do |t|
          checked = self.extract_id(t)
          if checked.nil?
            t
          else
            byValue[checked] = byValue.delete(t) if checked != t && !byValue.keys.include?(checked) # Update in byValue
            checked
          end
        end
        byTerm[term] = corrected_references.uniq
      end
    end

    # Check order: make sure the values of each term appear in the same order
    # as in the original stanza (uniq/merging above may have reordered them)
    byTerm.map do |term,values|
      if self.exists?(term)
        referenceValue = @stanzas[:terms][term][tag]
        if !referenceValue.nil?
          if !select_regex.nil?
            if referenceValue.kind_of?(Array)
              referenceValue = referenceValue.map{|value| value.scan(select_regex).first}
              referenceValue.flatten!
            else
              referenceValue = referenceValue.scan(select_regex).first
            end
            referenceValue.compact!
          end
          if self_type_references
            if referenceValue.kind_of?(Array)
              aux = referenceValue.map{|t| self.extract_id(t)}
            else
              aux = self.extract_id(referenceValue)
            end
            referenceValue = aux if !aux.nil?
          end
          referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
          byTerm[term] = referenceValue + (values - referenceValue)
        end
      end
    end

    # Store
    @dicts[store_tag] = {byTerm: byTerm, byValue: byValue}
  end
end
1149
+
1150
+
1151
+ # Calculates :is_a dictionary without alternatives substitution
1152
# Builds the :is_a dictionary keeping alternative IDs as-is (no substitution)
def calc_ancestors_dictionary
  self.calc_dictionary(:is_a, substitute_alternatives: false, self_type_references: true, multiterm: true)
end
1155
+
1156
+
1157
+ # Translate a given value using an already calcualted dictionary
1158
+ # ===== Parameters
1159
+ # +toTranslate+:: value to be translated using dictiontionary
1160
+ # +tag+:: used to generate the dictionary
1161
+ # +byValue+:: boolean flag to indicate if dictionary must be used values as keys or terms as keys. Default: values as keys = true
1162
+ # ===== Return
1163
+ # translation
1164
+ def translate(toTranslate, tag, byValue: true)
1165
+ dict = byValue ? @dicts[tag][:byValue] : @dicts[tag][:byTerm]
1166
+ toTranslate = get_main_id(toTranslate) if !byValue
1167
+ return dict[toTranslate]
1168
+ end
1169
+
1170
+
1171
+ # Translate a name given
1172
+ # ===== Parameters
1173
+ # +name+:: to be translated
1174
+ # ===== Return
1175
+ # translated name or nil if it's not stored into this ontology
1176
+ def translate_name(name)
1177
+ term = self.translate(name, :name)
1178
+ term = self.translate(name, :synonym) if term.nil?
1179
+ return term
1180
+ end
1181
+
1182
+
1183
+ # Translate several names and return translations and a list of names which couldn't be translated
1184
+ # ===== Parameters
1185
+ # +names+:: array to be translated
1186
+ # ===== Return
1187
+ # two arrays with translations and names which couldn't be translated respectively
1188
+ def translate_names(names)
1189
+ translated = []
1190
+ rejected = []
1191
+ names.each do |name|
1192
+ tr = self.translate_name(name)
1193
+ if tr.nil?
1194
+ rejected << name
1195
+ else
1196
+ translated << tr
1197
+ end
1198
+ end
1199
+ return translated, rejected
1200
+ end
1201
+
1202
+
1203
+ # Translates a given ID to it assigned name
1204
+ # ===== Parameters
1205
+ # +id+:: to be translated
1206
+ # ===== Return
1207
+ # main name or nil if it's not included into this ontology
1208
+ def translate_id(id)
1209
+ name = self.translate(id, :name, byValue: false)
1210
+ return name.nil? ? nil : name.first
1211
+ end
1212
+
1213
+
1214
+ # Translates several IDs and returns translations and not allowed IDs list
1215
+ # ===== Parameters
1216
+ # +ids+:: to be translated
1217
+ # ===== Return
1218
+ # two arrays with translations and names which couldn't be translated respectively
1219
+ def translate_ids(ids)
1220
+ translated = []
1221
+ rejected = []
1222
+ ids.each do |term_id|
1223
+ tr = self.translate_id(term_id.to_sym)
1224
+ if !tr.nil?
1225
+ translated << tr
1226
+ else
1227
+ rejected << tr
1228
+ end
1229
+ end
1230
+ return translated, rejected
1231
+ end
1232
+
1233
+
1234
+ # ===== Returns
1235
+ # the main ID assigned to a given ID. If it's a non alternative/obsolete ID itself will be returned
1236
+ # ===== Parameters
1237
+ # +id+:: to be translated
1238
+ # ===== Return
1239
+ # main ID related to a given ID. Returns nil if given ID is not an allowed ID
1240
+ def get_main_id(id)
1241
+ return nil if !@stanzas[:terms].include? id
1242
+ new_id = id
1243
+ mainID = @alternatives_index[id]
1244
+ new_id = mainID if !mainID.nil? & !@obsoletes_index.include?(mainID)
1245
+ return new_id
1246
+ end
1247
+
1248
+
1249
+ # Check a pull of IDs and return allowed IDs removing which are not official terms on this ontology
1250
+ # ===== Parameters
1251
+ # +ids+:: to be checked
1252
+ # ===== Return
1253
+ # two arrays whit allowed and rejected IDs respectively
1254
+ def check_ids(ids, substitute: true)
1255
+ checked_codes = []
1256
+ rejected_codes = []
1257
+ ids.each do |id|
1258
+ if @stanzas[:terms].include? id
1259
+ if substitute
1260
+ checked_codes << self.get_main_id(id)
1261
+ else
1262
+ checked_codes << id
1263
+ end
1264
+ else
1265
+ rejected_codes << id
1266
+ end
1267
+ end
1268
+ return checked_codes, rejected_codes
1269
+ end
1270
+
1271
+
1272
+ # Stores a given profile with an specific ID. If ID is already assigend to a profile, it will be replaced
1273
+ # ===== Parameters
1274
+ # +id+:: assigned to profile
1275
+ # +terms+:: array of terms
1276
+ # +substitute+:: subsstitute flag from check_ids
1277
+ def add_profile(id, terms, substitute: true)
1278
+ warn("Profile assigned to ID (#{id}) is going to be replaced") if @profiles.include? id
1279
+ correct_terms, rejected_terms = self.check_ids(terms, substitute: substitute)
1280
+ if !rejected_terms.empty?
1281
+ warn('Given terms contains erroneus IDs. These IDs will be removed')
1282
+ end
1283
+ if id.is_a? Numeric
1284
+ @profiles[id] = correct_terms
1285
+ else
1286
+ @profiles[id.to_sym] = correct_terms
1287
+ end
1288
+ end
1289
+
1290
+
1291
+ # Method used to store a pull of profiles
1292
+ # ===== Parameters
1293
+ # +profiles+:: array/hash of profiles to be stored. If it's an array, numerical IDs will be assigned starting at 1
1294
+ # +calc_metadata+:: if true, launch calc_profiles_dictionary process
1295
+ # +reset_stored+:: if true, remove already stored profiles
1296
+ # +substitute+:: subsstitute flag from check_ids
1297
+ def load_profiles(profiles, calc_metadata: true, reset_stored: false, substitute: false)
1298
+ self.reset_profiles if reset_stored
1299
+ # Check
1300
+ if profiles.kind_of?(Array)
1301
+ profiles.each_with_index do |items, i|
1302
+ self.add_profile(i, items.map {|item| item.to_sym}, substitute: substitute)
1303
+ end
1304
+ else # Hash
1305
+ if !profiles.keys.select{|id| @profiles.include?(id)}.empty?
1306
+ warn('Some profiles given are already stored. Stored version will be replaced')
1307
+ end
1308
+ profiles.each{|id, prof| self.add_profile(id, prof, substitute: substitute)}
1309
+ end
1310
+
1311
+ self.add_observed_terms_from_profiles(reset: true)
1312
+
1313
+ if calc_metadata
1314
+ self.calc_profiles_dictionary
1315
+ end
1316
+ end
1317
+
1318
+
1319
+ # Internal method used to remove already stored profiles and restore observed frequencies
1320
+ def reset_profiles
1321
+ # Clean profiles storage
1322
+ @profiles = {}
1323
+ # Reset frequency observed
1324
+ @meta.each{|term,info| info[:observed_freq] = 0}
1325
+ @max_freqs[:observed_freq] = 0
1326
+ end
1327
+
1328
+
1329
+ # ===== Returns
1330
+ # profiles assigned to a given ID
1331
+ # ===== Parameters
1332
+ # +id+:: profile ID
1333
+ # ===== Return
1334
+ # specific profile or nil if it's not stored
1335
+ def get_profile(id)
1336
+ return @profiles[id]
1337
+ end
1338
+
1339
+
1340
+ # ===== Returns
1341
+ # an array of sizes for all stored profiles
1342
+ # ===== Return
1343
+ # array of profile sizes
1344
+ def get_profiles_sizes()
1345
+ return @profiles.map{|id,terms| terms.length}
1346
+ end
1347
+
1348
+
1349
+ # ===== Returns
1350
+ # mean size of stored profiles
1351
+ # ===== Parameters
1352
+ # +round_digits+:: number of digits to round result. Default: 4
1353
+ # ===== Returns
1354
+ # mean size of stored profiles
1355
+ def get_profiles_mean_size(round_digits: 4)
1356
+ sizes = self.get_profiles_sizes
1357
+ return sizes.inject(0){|sum, n| sum + n}.fdiv(@profiles.length).round(round_digits)
1358
+ end
1359
+
1360
+
1361
+ # Calculates profiles sizes and returns size assigned to percentile given
1362
+ # ===== Parameters
1363
+ # +perc+:: percentile to be returned
1364
+ # +increasing_sort+:: flag to indicate if sizes order must be increasing. Default: false
1365
+ # ===== Returns
1366
+ # values assigned to percentile asked
1367
+ def get_profile_length_at_percentile(perc=50, increasing_sort: false)
1368
+ prof_lengths = self.get_profiles_sizes.sort
1369
+ prof_lengths.reverse! if !increasing_sort
1370
+ n_profiles = prof_lengths.length
1371
+ percentile_index = ((perc * (n_profiles - 1)).fdiv(100) - 0.5).round # Take length which not overpass percentile selected
1372
+ percentile_index = 0 if percentile_index < 0 # Special case (caused by literal calc)
1373
+ return prof_lengths[percentile_index]
1374
+ end
1375
+
1376
+
1377
+ # Translate a given profile to terms names
1378
+ # ===== Parameters
1379
+ # +prof+:: array of terms to be translated
1380
+ # ===== Returns
1381
+ # array of translated terms. Can include nils if some IDs are not allowed
1382
+ def profile_names(prof)
1383
+ return prof.map{|term| self.translate_id(term)}
1384
+ end
1385
+
1386
+
1387
+ # Trnaslates a bunch of profiles to it sets of term names
1388
+ # ===== Parameters
1389
+ # +profs+:: array of profiles
1390
+ # +asArray+:: flag to indicate if results must be returned as: true => an array of tuples [ProfID, ArrayOdNames] or ; false => hashs of translations
1391
+ # ===== Returns
1392
+ # translated profiles
1393
+ def translate_profiles_ids(profs = [], asArray: true)
1394
+ profs = @profiles if profs.empty?
1395
+ profs = profs.each_with_index.map{|terms, index| [index, terms]}.to_h if profs.kind_of?(Array)
1396
+ profs_names = profs.map{|id, terms| [id, self.profile_names(terms)]}.to_h
1397
+ return asArray ? profs_names.values : profs_names
1398
+ end
1399
+
1400
+
1401
+ # Includes as "observed_terms" all terms included into stored profiles
1402
+ # ===== Parameters
1403
+ # +reset+:: if true, reset observed freqs alreeady stored befor re-calculate
1404
+ def add_observed_terms_from_profiles(reset: false)
1405
+ @meta.each{|term, freqs| freqs[:observed_freq] = -1} if reset
1406
+ @profiles.each{|id, terms| self.add_observed_terms(terms: terms)}
1407
+ end
1408
+
1409
+
1410
+ # Get a term frequency
1411
+ # ===== Parameters
1412
+ # +term+:: term to be checked
1413
+ # +type+:: type of frequency to be returned. Allowed: [:struct_freq, :observed_freq]
1414
+ # ===== Returns
1415
+ # frequency of term given or nil if term is not allowed
1416
+ def get_frequency(term, type: :struct_freq)
1417
+ queryFreq = @meta[term]
1418
+ return queryFreq.nil? ? nil : queryFreq[type]
1419
+ end
1420
+
1421
+
1422
+ # Geys structural frequency of a term given
1423
+ # ===== Parameters
1424
+ # +term+:: to be checked
1425
+ # ===== Returns
1426
+ # structural frequency of given term or nil if term is not allowed
1427
+ def get_structural_frequency(term)
1428
+ return self.get_frequency(term, type: :struct_freq)
1429
+ end
1430
+
1431
+
1432
+ # Gets observed frequency of a term given
1433
+ # ===== Parameters
1434
+ # +term+:: to be checked
1435
+ # ===== Returns
1436
+ # observed frequency of given term or nil if term is not allowed
1437
+ def get_observed_frequency(term)
1438
+ return self.get_frequency(term, type: :observed_freq)
1439
+ end
1440
+
1441
+
1442
+ # Calculates frequencies of stored profiles terms
1443
+ # ===== Parameters
1444
+ # +ratio+:: if true, frequencies will be returned as ratios between 0 and 1.
1445
+ # +literal+:: if true, literal terms will be used to calculate frequencies instead translate alternative terms
1446
+ # +asArray+:: used to transform returned structure format from hash of Term-Frequency to an array of tuples [Term, Frequency]
1447
+ # +translate+:: if true, term IDs will be translated to
1448
+ # ===== Returns
1449
+ # stored profiles terms frequencies
1450
+ def get_profiles_terms_frequency(ratio: true, literal: true, asArray: true, translate: true)
1451
+ n_profiles = @profiles.length
1452
+ if literal
1453
+ freqs = {}
1454
+ @profiles.each do |id, terms|
1455
+ terms.each do |literalTerm|
1456
+ if freqs.include?(literalTerm)
1457
+ freqs[literalTerm] += 1
1458
+ else
1459
+ freqs[literalTerm] = 1
1460
+ end
1461
+ end
1462
+ end
1463
+ if (ratio || translate)
1464
+ aux_keys = freqs.keys
1465
+ aux_keys.each do |term|
1466
+ freqs[term] = freqs[term].fdiv(n_profiles) if ratio
1467
+ if translate
1468
+ tr = self.translate_id(term)
1469
+ freqs[tr] = freqs.delete(term) if !tr.nil?
1470
+ end
1471
+ end
1472
+ end
1473
+ if asArray
1474
+ freqs = freqs.map{|term, freq| [term, freq]}
1475
+ freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
1476
+ end
1477
+ else # Freqs translating alternatives
1478
+ freqs = @meta.select{|id, freqs| freqs[:observed_freq] > 0}.map{|id, freqs| [id, ratio ? freqs[:observed_freq].fdiv(n_profiles) : freqs[:observed_freq]]}
1479
+ freqs = freqs.to_h if !asArray
1480
+ if translate
1481
+ freqs = freqs.map do |term, freq|
1482
+ tr = self.translate_id(term)
1483
+ tr.nil? ? [term, freq] : [tr, freq]
1484
+ end
1485
+ end
1486
+ if asArray
1487
+ freqs = freqs.map{|term, freq| [term, freq]}
1488
+ freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
1489
+ else
1490
+ freqs = freqs.to_h
1491
+ end
1492
+ end
1493
+ return freqs
1494
+ end
1495
+
1496
+
1497
+ # Clean a given profile returning cleaned set of terms and removed ancestors term.
1498
+ # ===== Parameters
1499
+ # +prof+:: array of terms to be checked
1500
+ # ===== Returns
1501
+ # two arrays, first is the cleaned profile and second is the removed elements array
1502
+ def remove_ancestors_from_profile(prof)
1503
+ ancestors = prof.map{|term| self.get_ancestors(term)}.flatten.uniq
1504
+ redundant = prof.select{|term| ancestors.include?(term)}
1505
+ return prof - redundant, redundant
1506
+ end
1507
+
1508
+
1509
+ # Remove alternative IDs if official ID is present. DOES NOT REMOVE synonyms or alternative IDs of the same official ID
1510
+ # ===== Parameters
1511
+ # +prof+:: array of terms to be checked
1512
+ # ===== Returns
1513
+ # two arrays, first is the cleaned profile and second is the removed elements array
1514
+ def remove_alternatives_from_profile(prof)
1515
+ alternatives = prof.select{|term| @alternatives_index.include?(term)}
1516
+ redundant = alternatives.select{|alt_id| prof.include?(@alternatives_index[alt_id])}
1517
+ return prof - redundant, redundant
1518
+ end
1519
+
1520
+
1521
+ # Remove alternatives (if official term is present) and ancestors terms of a given profile
1522
+ # ===== Parameters
1523
+ # +profile+:: profile to be cleaned
1524
+ # +remove_alternatives+:: if true, clenaed profiles will replace already stored profiles
1525
+ # ===== Returns
1526
+ # cleaned profile
1527
+ def clean_profile(profile, remove_alternatives: true)
1528
+ terms_without_ancestors, _ = self.remove_ancestors_from_profile(profile)
1529
+ if remove_alternatives
1530
+ terms_without_ancestors_and_alternatices, _ = self.remove_alternatives_from_profile(terms_without_ancestors)
1531
+ else
1532
+ terms_without_ancestors_and_alternatices = terms_without_ancestors
1533
+ end
1534
+ return terms_without_ancestors_and_alternatices
1535
+ end
1536
+
1537
+
1538
+ # Remove alternatives (if official term is present) and ancestors terms of stored profiles
1539
+ # ===== Parameters
1540
+ # +store+:: if true, clenaed profiles will replace already stored profiles
1541
+ # +remove_alternatives+:: if true, clenaed profiles will replace already stored profiles
1542
+ # ===== Returns
1543
+ # a hash with cleaned profiles
1544
+ def clean_profiles(store: false, remove_alternatives: true)
1545
+ cleaned_profiles = {}
1546
+ @profiles.each{ |id, terms| cleaned_profiles[id] = self.clean_profile(terms, remove_alternatives: remove_alternatives)}
1547
+ @profiles = cleaned_profiles if store
1548
+ return cleaned_profiles
1549
+ end
1550
+
1551
+
1552
+ # Calculates number of ancestors present (redundant) in each profile stored
1553
+ # ===== Returns
1554
+ # array of parentals for each profile
1555
+ def parentals_per_profile
1556
+ cleaned_profiles = self.clean_profiles(remove_alternatives: false)
1557
+ parentals = @profiles.map{ |id, terms| terms.length - cleaned_profiles[id].length}
1558
+ return parentals
1559
+ end
1560
+
1561
+
1562
+ # Calculates mean IC of a given profile
1563
+ # ===== Parameters
1564
+ # +prof+:: profile to be checked
1565
+ # +ic_type+:: ic_type to be used
1566
+ # +zhou_k+:: special coeficient for Zhou IC method
1567
+ # ===== Returns
1568
+ # mean IC for a given profile
1569
+ def get_profile_mean_IC(prof, ic_type: :resnik, zhou_k: 0.5)
1570
+ return prof.map{|term| self.get_IC(term, type: ic_type, zhou_k: zhou_k)}.inject(0){|sum,x| sum + x}.fdiv(prof.length)
1571
+ end
1572
+
1573
+
1574
+ # Calculates resnik ontology, and resnik observed mean ICs for all profiles stored
1575
+ # ===== Returns
1576
+ # two hashes with Profiles and IC calculated for resnik and observed resnik respectively
1577
+ def get_profiles_resnik_dual_ICs
1578
+ struct_ics = {}
1579
+ observ_ics = {}
1580
+ @profiles.each do |id, terms|
1581
+ struct_ics[id] = self.get_profile_mean_IC(terms, ic_type: :resnik)
1582
+ observ_ics[id] = self.get_profile_mean_IC(terms, ic_type: :resnik_observed)
1583
+ end
1584
+ return struct_ics.clone, observ_ics.clone
1585
+ end
1586
+
1587
+
1588
+ # Calculates ontology structural levels for all ontology terms
1589
+ # ===== Parameters
1590
+ # +calc_paths+:: calculates term paths if it's not already calculated
1591
+ # +shortest_path+:: if true, level is calculated with shortest path, largest path will be used in other cases
1592
+ def calc_term_levels(calc_paths: false, shortest_path: true)
1593
+ if @term_paths.empty?
1594
+ if calc_paths
1595
+ self.calc_term_paths
1596
+ else
1597
+ warn('Term paths are not already loaded. Aborting dictionary calc')
1598
+ end
1599
+ end
1600
+ if !@term_paths.empty?
1601
+ byTerm = {}
1602
+ byValue = {}
1603
+ # Calc per term
1604
+ @term_paths.each do |term, info|
1605
+ level = shortest_path ? info[:shortest_path] : info[:largest_path]
1606
+ if level.nil?
1607
+ level = -1
1608
+ else
1609
+ level = level.round(0)
1610
+ end
1611
+ byTerm[term] = level
1612
+ queryLevels = byValue[level]
1613
+ if queryLevels.nil?
1614
+ byValue[level] = [term]
1615
+ else
1616
+ byValue[level] << term
1617
+ end
1618
+ end
1619
+ @dicts[:level] = {byTerm: byValue, byValue: byTerm} # Note: in this case, value has multiplicity and term is unique value
1620
+ # Update maximum depth
1621
+ @max_freqs[:max_depth] = byValue.keys.max
1622
+ end
1623
+ end
1624
+
1625
+
1626
+ # Check if a term given is marked as obsolete
1627
+ def is_obsolete? term
1628
+ return @obsoletes_index.include?(term)
1629
+ end
1630
+
1631
+ # Check if a term given is marked as alternative
1632
+ def is_alternative? term
1633
+ return @alternatives_index.include?(term)
1634
+ end
1635
+
1636
+ # Find paths of a term following it ancestors and stores all possible paths for it and it's parentals.
1637
+ # Also calculates paths metadata and stores into @term_paths
1638
+ def calc_term_paths
1639
+ self.calc_ancestors_dictionary if !@dicts.keys.include?(:is_a) # Calculate direct parentals dictionary if it's not already calculated
1640
+ visited_terms = []
1641
+ @term_paths = {}
1642
+ if [:hierarchical, :sparse].include? @structureType
1643
+ terms = @stanzas[:terms].keys
1644
+ terms.each do |term|
1645
+ if self.is_obsolete?(term) || self.is_alternative?(term) # Special case (obsoletes)
1646
+ special_term = term
1647
+ term = self.is_obsolete?(term) ? @obsoletes_index[term] : @alternatives_index[term]
1648
+ @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
1649
+ @term_paths[special_term] = @term_paths[term]
1650
+ visited_terms << special_term
1651
+ end
1652
+
1653
+ if !visited_terms.include?(term)
1654
+ @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
1655
+ parentals = @dicts[:is_a][:byTerm][term]
1656
+ if parentals.nil?
1657
+ @term_paths[term][:paths] << [term]
1658
+ else
1659
+ parentals.each do |direct_parental|
1660
+ if visited_terms.include? direct_parental # Use direct_parental already calculated paths
1661
+ new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
1662
+ else # Calculate new paths
1663
+ self.expand_path(direct_parental, visited_terms)
1664
+ new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
1665
+ end
1666
+ new_paths.each{|path| @term_paths[term][:paths] << path}
1667
+ end
1668
+ end
1669
+ visited_terms << term
1670
+ end
1671
+ # Update metadata
1672
+ @term_paths[term][:total_paths] = @term_paths[term][:paths].length
1673
+ paths_sizes = @term_paths[term][:paths].map{|path| path.length}
1674
+ @term_paths[term][:largest_path] = paths_sizes.max
1675
+ @term_paths[term][:shortest_path] = paths_sizes.min
1676
+ end
1677
+ else
1678
+ warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
1679
+ end
1680
+ end
1681
+
1682
+
1683
+ # Recursive function whic finds paths of a term following it ancestors and stores all possible paths for it and it's parentals
1684
+ # ===== Parameters
1685
+ # +curr_term+:: current visited term
1686
+ # +visited_terms+:: already expanded terms
1687
+ def expand_path(curr_term, visited_terms)
1688
+ if !visited_terms.include?(curr_term) # Not already expanded
1689
+ @term_paths[curr_term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if @term_paths[curr_term].nil?
1690
+ direct_parentals = @dicts[:is_a][:byTerm][curr_term]
1691
+ if direct_parentals.nil? # No parents :: End of recurrence
1692
+ @term_paths[curr_term][:paths] << [curr_term]
1693
+ else # Expand and concat
1694
+ direct_parentals.each do |ancestor|
1695
+ self.expand_path(ancestor,visited_terms) if !visited_terms.include?(ancestor)
1696
+ new_paths = @term_paths[ancestor][:paths].map{|path| [curr_term, path].flatten}
1697
+ new_paths.each{|path| @term_paths[curr_term][:paths] << path}
1698
+ end
1699
+ end
1700
+ visited_terms << curr_term
1701
+ end
1702
+ end
1703
+
1704
+
1705
+ # Gets ontology levels calculated
1706
+ # ===== Returns
1707
+ # ontology levels calculated
1708
+ def get_ontology_levels
1709
+ return @dicts[:level][:byTerm].clone # By term, in this case, is Key::Level, Value::Terms
1710
+ end
1711
+
1712
+
1713
+ # Gets ontology level of a specific term
1714
+ # ===== Returns
1715
+ # Term level
1716
+ def get_term_level(term)
1717
+ return @dicts[:level][:byValue][term]
1718
+ end
1719
+
1720
+
1721
+ # Return ontology levels from profile terms
1722
+ # ===== Returns
1723
+ # hash of term levels (Key: level; Value: array of term IDs)
1724
+ def get_ontology_levels_from_profiles(uniq = true) # TODO: remove uniq and check dependencies
1725
+ profiles_terms = @profiles.values.flatten
1726
+ profiles_terms.uniq! if uniq
1727
+ term_freqs_byProfile = {}
1728
+ profiles_terms.each do |term|
1729
+ query = term_freqs_byProfile[term]
1730
+ if query.nil?
1731
+ term_freqs_byProfile[term] = 1
1732
+ else
1733
+ term_freqs_byProfile[term] += 1
1734
+ end
1735
+ end
1736
+ levels_filtered = @dicts[:level][:byTerm].map{|level, terms| [level,terms.map{|t| profiles_terms.include?(t) ? Array.new(term_freqs_byProfile[t], t) : nil}.flatten.compact]}.select{|level, filteredTerms| !filteredTerms.empty?}.to_h
1737
+ return levels_filtered
1738
+ end
1739
+
1740
+
1741
+ # Calculate profiles dictionary with Key= Term; Value = Profiles
1742
+ def calc_profiles_dictionary
1743
+ if @profiles.empty?
1744
+ warn('Profiles are not already loaded. Aborting dictionary calc')
1745
+ else
1746
+ byTerm = {} # Key: Terms
1747
+ # byValue -- Key: Profile == @profiles
1748
+ @profiles.each do |id, terms|
1749
+ terms.each do |term|
1750
+ if byTerm.include?(term)
1751
+ byTerm[term] << id
1752
+ else
1753
+ byTerm[term] = [id]
1754
+ end
1755
+ end
1756
+ end
1757
+ @profilesDict = byTerm
1758
+ end
1759
+ end
1760
+
1761
+
1762
+ # Gets profiles dictionary calculated
1763
+ # ===== Return
1764
+ # profiles dictionary (clone)
1765
+ def get_terms_linked_profiles
1766
+ return @profilesDict.clone
1767
+ end
1768
+
1769
+
1770
+ # Get related profiles to a given term
1771
+ # ===== Parameters
1772
+ # +term+:: to be checked
1773
+ # ===== Returns
1774
+ # profiles which contains given term
1775
+ def get_term_linked_profiles(term)
1776
+ return @profilesDict[term]
1777
+ end
1778
+
1779
+
1780
+ # Gets metainfo table from a set of terms
1781
+ # ===== Parameters
1782
+ # +terms+:: IDs to be expanded
1783
+ # +filter_alternatives+:: flag to be used in get_descendants method
1784
+ # ===== Returns
1785
+ # an array with triplets [TermID, TermName, DescendantsNames]
1786
+ def get_childs_table(terms, filter_alternatives = false)
1787
+ expanded_terms = []
1788
+ terms.each do |t|
1789
+ expanded_terms << [[t, self.translate_id(t)], self.get_descendants(t, filter_alternatives).map{|child| [child, self.translate_id(child)]}]
1790
+ end
1791
+ return expanded_terms
1792
+ end
1793
+
1794
+
1795
+ # Store specific relations hash given into ITEMS structure
1796
+ # ===== Parameters
1797
+ # +relations+:: hash to be stored
1798
+ # +remove_old_relations+:: substitute ITEMS structure instead of merge new relations
1799
+ # +expand+:: if true, already stored keys will be updated with the unique union of both sets
1800
+ def load_item_relations_to_terms(relations, remove_old_relations = false, expand = false)
1801
+ @items = {} if remove_old_relations
1802
+ if !relations.select{|term, items| !@stanzas[:terms].include?(term)}.empty?
1803
+ warn('Some terms specified are not stored into this ontology. These not correct terms will be stored too')
1804
+ end
1805
+ if !remove_old_relations
1806
+ if !relations.select{|term, items| @items.include?(term)}.empty? && !expand
1807
+ warn('Some terms given are already stored. Stored version will be replaced')
1808
+ end
1809
+ end
1810
+ if expand
1811
+ relations.each do |k,v|
1812
+ if @items.keys.include?(k)
1813
+ @items[k] = (@items[k] + v).uniq
1814
+ else
1815
+ @items[k] = v
1816
+ end
1817
+ end
1818
+ else
1819
+ @items.merge!(relations)
1820
+ end
1821
+ end
1822
+
1823
+
1824
+ # Assign a dictionary already calculated as a items set.
1825
+ # ===== Parameters
1826
+ # +dictID+:: dictionary ID to be stored (:byTerm will be used)
1827
+ def set_items_from_dict(dictID, remove_old_relations = false)
1828
+ @items = {} if remove_old_relations
1829
+ if(@dicts.keys.include?(dictID))
1830
+ @items.merge(@dicts[dictID][:byTerm])
1831
+ else
1832
+ warn('Specified ID is not calculated. Dict will not be added as a items set')
1833
+ end
1834
+ end
1835
+
1836
+
1837
+ # This method computes childs similarity and impute items to it parentals. To do that Item keys must be this ontology allowed terms.
1838
+ # Similarity will be calculated by text extact similarity unless an ontology object will be provided. In this case, MICAs will be used
1839
+ # ===== Parameters
1840
+ # +ontology+:: (Optional) ontology object which items given belongs
1841
+ # +minimum_childs+:: minimum of childs needed to infer relations to parental. Default: 2
1842
+ # +clean_profiles+:: if true, clena_profiles ontology method will be used over inferred profiles. Only if an ontology object is provided
1843
+ # ===== Returns
1844
+ # void and update items object
1845
+ def expand_items_to_parentals(ontology: nil, minimum_childs: 2, clean_profiles: true)
1846
+ # Check item keys
1847
+ if @items.empty?
1848
+ warn('Items have been not provided yet')
1849
+ return nil
1850
+ end
1851
+ targetKeys = @items.keys.select{|k| self.exists?(k)}
1852
+ if targetKeys.length == 0
1853
+ warn('Any item key is allowed')
1854
+ return nil
1855
+ elsif targetKeys.length < @items.keys.length
1856
+ warn('Some item keys are not allowed')
1857
+ end
1858
+
1859
+ # Expand to parentals
1860
+ targetKeys << targetKeys.map{|t| self.get_ancestors(t, true)}
1861
+ targetKeys.flatten!
1862
+ targetKeys.uniq!
1863
+
1864
+ # Obtain levels (go from leaves to roots)
1865
+ levels = targetKeys.map{|term| self.get_term_level(term)}
1866
+ levels.compact!
1867
+ levels.uniq!
1868
+ levels.sort!
1869
+ levels.reverse!
1870
+ levels.shift # Leaves are not expandable
1871
+
1872
+ # Expand from leaves to roots
1873
+ levels.map do |lvl|
1874
+ curr_keys = targetKeys.select{|k| self.get_term_level(k) == lvl}
1875
+ curr_keys.map do |term_expand|
1876
+ to_infer = []
1877
+ # Obtain childs
1878
+ childs = self.get_descendants(term_expand,true).select{|t| @items.keys.include?(t)}
1879
+ # Expand
1880
+ if childs.length > 0 && minimum_childs == 1 # Special case
1881
+ to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
1882
+ elsif childs.length >= minimum_childs
1883
+ to_infer = Hash.new(0)
1884
+ # Compare
1885
+ while childs.length > 1
1886
+ curr_term = childs.shift
1887
+ childs.each do |compare_term|
1888
+ pivot_items = @items[curr_term]
1889
+ compare_items = @items[compare_term]
1890
+ if ontology.nil? # Exact match
1891
+ pivot_items.map do |pitem|
1892
+ if compare_items.include?(pitem)
1893
+ to_infer[pitem] += 2
1894
+ end
1895
+ end
1896
+ else # Find MICAs
1897
+ local_infer = Hash.new(0)
1898
+ pivot_items.map do |pitem|
1899
+ micas = compare_items.map{|citem| ontology.get_MICA(pitem, citem)}
1900
+ maxmica = micas[0]
1901
+ micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
1902
+ local_infer[maxmica.first] += 1
1903
+ end
1904
+ compare_items.map do |citem|
1905
+ micas = pivot_items.map{|pitem| ontology.get_MICA(pitem, citem)}
1906
+ maxmica = micas[0]
1907
+ micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
1908
+ local_infer[maxmica.first] += 1
1909
+ end
1910
+ local_infer.each{|t,freq| to_infer[t] += freq if freq >= 2}
1911
+ end
1912
+ end
1913
+ end
1914
+ # Filter infer
1915
+ to_infer = to_infer.select{|k,v| v >= minimum_childs}
1916
+ end
1917
+ # Infer
1918
+ if to_infer.length > 0
1919
+ @items[term_expand] = [] if @items[term_expand].nil?
1920
+ if to_infer.kind_of?(Array)
1921
+ @items[term_expand] = (@items[term_expand] + to_infer).uniq
1922
+ else
1923
+ @items[term_expand] = (@items[term_expand] + to_infer.keys).uniq
1924
+ end
1925
+ @items[term_expand] = ontology.clean_profile(@items[term_expand]) if clean_profiles && !ontology.nil?
1926
+ elsif !@items.include?(term_expand)
1927
+ targetKeys.delete(term_expand)
1928
+ end
1929
+ end
1930
+ end
1931
+ end
1932
+
1933
+
1934
+
1935
+ # NO IDEA WHAT THIS DOES. DON'T USE THIS METHODS IS NOT CHECKED
1936
+ # ===== Parameters
1937
+ # ++::
1938
+ # ===== Returns
1939
+ # ...
1940
+ def compute_relations_to_items(external_item_list, mode, thresold)
1941
+ results = []
1942
+ penalized_terms = {}
1943
+ # terms_levels = get_terms_levels(@items_relations.keys)
1944
+ terms_with_items_levels = @items_relations.keys.map{|term| self.get_term_level(term)}.uniq
1945
+ terms_levels = self.get_ontology_levels().select{|k,v| terms_with_items_levels.include?(k)}
1946
+ terms_levels = terms_levels.each{|level,terms| [level, terms.select{|t| @items_relations.keys.include?(t)}] } # Use only items terms. MAYBE IT'S NOT OUR TARGET (line added by fmj)
1947
+ levels = terms_levels.keys.sort
1948
+ levels.reverse_each do |level|
1949
+ terms_levels[level].each do |term|
1950
+ associated_items = @items_relations[term]
1951
+ if mode == :elim
1952
+ items_to_remove = penalized_terms[term]
1953
+ items_to_remove = [] if items_to_remove.nil?
1954
+ pval = get_fisher_exact_test(
1955
+ external_item_list - items_to_remove,
1956
+ associated_items - items_to_remove,
1957
+ ((associated_items | external_item_list) - items_to_remove).length
1958
+ )
1959
+ if pval <= thresold
1960
+ parents = get_parents(term) # Save the items for each parent term to remove them later in the fisher test
1961
+ parents.each do |prnt|
1962
+ query = penalized_terms[prnt]
1963
+ if query.nil?
1964
+ penalized_terms[prnt] = @items_relations[term].clone # We need a new array to store the following iterations
1965
+ else
1966
+ query.concat(@items_relations[term])
1967
+ end
1968
+ end
1969
+ end
1970
+ end
1971
+ results << [term, pval]
1972
+ end
1973
+ end
1974
+ return results
1975
+ end
1976
+
1977
+
1978
+ # Check if a given ID is a removable (blacklist) term.
1979
+ # +DEPRECATED+ use is_removable? instead
1980
+ # ===== Parameters
1981
+ # +id+:: to be checked
1982
+ # ===== Returns
1983
+ # true if given term is a removable (blacklist) term or false in other cases
1984
+ def is_removable(id)
1985
+ warn "[DEPRECATION] `is_removable` is deprecated. Please use `is_removable?` instead."
1986
+ return @removable_terms.include?(id.to_sym)
1987
+ end
1988
+
1989
+ # Check if a given ID is a removable (blacklist) term
1990
+ # ===== Parameters
1991
+ # +id+:: to be checked
1992
+ # ===== Returns
1993
+ # true if given term is a removable (blacklist) term or false in other cases
1994
+ def is_removable? id
1995
+ return @removable_terms.include?(id.to_sym)
1996
+ end
1997
+
1998
+ ############################################
1999
+ # SPECIAL METHODS
2000
+ #############################################
2001
+ def ==(other)
2002
+ self.header == other.header &&
2003
+ self.stanzas == other.stanzas &&
2004
+ self.ancestors_index == other.ancestors_index &&
2005
+ self.alternatives_index == other.alternatives_index &&
2006
+ self.obsoletes_index == other.obsoletes_index &&
2007
+ self.structureType == other.structureType &&
2008
+ self.ics == other.ics &&
2009
+ self.meta == other.meta &&
2010
+ self.dicts == other.dicts &&
2011
+ self.profiles == other.profiles &&
2012
+ self.profilesDict == other.profilesDict &&
2013
+ (self.items.keys - other.items.keys).empty? &&
2014
+ self.removable_terms == other.removable_terms &&
2015
+ self.special_tags == other.special_tags &&
2016
+ self.items == other.items &&
2017
+ self.term_paths == other.term_paths &&
2018
+ self.max_freqs == other.max_freqs
2008
2019
  end
2009
2020
 
2010
2021
 
2011
2022
  def clone
2012
- copy = Ontology.new
2013
- copy.header = self.header.clone
2014
- copy.stanzas[:terms] = self.stanzas[:terms].clone
2015
- copy.stanzas[:typedefs] = self.stanzas[:typedefs].clone
2016
- copy.stanzas[:instances] = self.stanzas[:instances].clone
2017
- copy.ancestors_index = self.ancestors_index.clone
2018
- copy.descendants_index = self.descendants_index.clone
2019
- copy.alternatives_index = self.alternatives_index.clone
2020
- copy.obsoletes_index = self.obsoletes_index.clone
2021
- copy.structureType = self.structureType.clone
2022
- copy.ics = self.ics.clone
2023
- copy.meta = self.meta.clone
2024
- copy.dicts = self.dicts.clone
2025
- copy.profiles = self.profiles.clone
2026
- copy.profilesDict = self.profilesDict.clone
2027
- copy.items = self.items.clone
2028
- copy.removable_terms = self.removable_terms.clone
2029
- copy.term_paths = self.term_paths.clone
2030
- copy.max_freqs = self.max_freqs.clone
2031
- return copy
2032
- end
2033
-
2034
-
2035
- #############################################
2036
- # ACCESS CONTROL
2037
- #############################################
2038
-
2039
- attr_reader :file, :header, :stanzas, :ancestors_index, :descendants_index, :special_tags, :alternatives_index, :obsoletes_index, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :profilesDict, :items, :removable_terms, :term_paths
2040
- attr_writer :file, :header, :stanzas, :ancestors_index, :descendants_index, :special_tags, :alternatives_index, :obsoletes_index, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :profilesDict, :items, :removable_terms, :term_paths
2023
+ copy = Ontology.new
2024
+ copy.header = self.header.clone
2025
+ copy.stanzas[:terms] = self.stanzas[:terms].clone
2026
+ copy.stanzas[:typedefs] = self.stanzas[:typedefs].clone
2027
+ copy.stanzas[:instances] = self.stanzas[:instances].clone
2028
+ copy.ancestors_index = self.ancestors_index.clone
2029
+ copy.descendants_index = self.descendants_index.clone
2030
+ copy.alternatives_index = self.alternatives_index.clone
2031
+ copy.obsoletes_index = self.obsoletes_index.clone
2032
+ copy.structureType = self.structureType.clone
2033
+ copy.ics = self.ics.clone
2034
+ copy.meta = self.meta.clone
2035
+ copy.dicts = self.dicts.clone
2036
+ copy.profiles = self.profiles.clone
2037
+ copy.profilesDict = self.profilesDict.clone
2038
+ copy.items = self.items.clone
2039
+ copy.removable_terms = self.removable_terms.clone
2040
+ copy.term_paths = self.term_paths.clone
2041
+ copy.max_freqs = self.max_freqs.clone
2042
+ return copy
2043
+ end
2044
+
2045
+
2046
+ #############################################
2047
+ # ACCESS CONTROL
2048
+ #############################################
2049
+
2050
+ attr_reader :file, :header, :stanzas, :ancestors_index, :descendants_index, :special_tags, :alternatives_index, :obsoletes_index, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :profilesDict, :items, :removable_terms, :term_paths
2051
+ attr_writer :file, :header, :stanzas, :ancestors_index, :descendants_index, :special_tags, :alternatives_index, :obsoletes_index, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :profilesDict, :items, :removable_terms, :term_paths
2041
2052
  end