semtools 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8fc29918a31045893647355dd72264a04386c5171c48ea868f7e9bbc93062151
4
- data.tar.gz: 692ce02343cb00ac37bbc34476da08386bedf0eaca7946689eb62c9a1f06d555
3
+ metadata.gz: e68630d42a4faf01dc15fdfa9f1acd64425ef1396ed6f9ce0a8d76319922ba06
4
+ data.tar.gz: 952d908af5370031df0f19c98ab69fbb59b51825f050b69714f4494e15f77f77
5
5
  SHA512:
6
- metadata.gz: 1b52667c81a0a25786b91156e9ed88a8de47e86fd18baddffc43b05ff199f95129b09da4e03025b6fb709d18a0274e22bf4a55c81471fda748e75aadca4d6ef1
7
- data.tar.gz: 46e5b49f611c021ee8576a522a0a6ef22a8b9ed349084dadb9e44fd76c712c05221e6314985f08bdba575ac2dd849f1f14d84d5ae686889f33fac993132a8372
6
+ metadata.gz: 85792433d82f824297df87cb0927b24116425ddb2a72a3e2f461748e014aa27f4efc8f73fcd7d1e6c423acd7487b77d21c2a8c0b7b0f8530030f6246ad62ad64
7
+ data.tar.gz: 2d0e0953f19d8c2cad2cc85a0c6d8c1cb9bf95f4dd1ee2d75aebcf15bdd3929d2938ede6544ed3f145ac5a8804b97af64f50a859ae7ecf8164f0ed4f07208fb2
data/bin/onto2json.rb CHANGED
@@ -18,14 +18,20 @@ OptionParser.new do |opts|
18
18
  opts.banner = "Usage: #{__FILE__} [options]"
19
19
 
20
20
  options[:input_file] = nil
21
- opts.on("-i", "--input_file PATH", "Input file with ontology in OBO format") do |data|
21
+ opts.on("-i", "--input_file FILE", "Input file with ontology in OBO format") do |data|
22
22
  options[:input_file] = data
23
23
  end
24
24
 
25
25
  options[:output_file] = nil
26
- opts.on("-o", "--output_file PATH", "Output path") do |data|
26
+ opts.on("-o", "--output_file FILE", "Output path") do |data|
27
27
  options[:output_file] = data
28
28
  end
29
+
30
+ options[:build] = false
31
+ opts.on("-b", "--build", "Activate build mode (calculate dictionaries)") do
32
+ options[:build] = true
33
+ end
34
+
29
35
 
30
36
  opts.on_tail("-h", "--help", "Show this message") do
31
37
  puts opts
@@ -39,7 +45,7 @@ end.parse!
39
45
  # MAIN
40
46
  ##########################
41
47
  puts "Loading ontology ..."
42
- onto = Ontology.new(file: options[:input_file], load_file: true)
48
+ onto = Ontology.new(file: options[:input_file], load_file: true, build: options[:build])
43
49
  puts "Exporting ontology to JSON ..."
44
50
  onto.write(options[:output_file])
45
51
  puts "Ontology exported"
@@ -2,2040 +2,2051 @@ require 'json'
2
2
 
3
3
 
4
4
  class Ontology
5
- #########################################################
6
- # AUTHOR NOTES
7
- #########################################################
8
-
9
- # 1 - Store @profiles as @stanzas[:instances]
10
- # 2 - Items values (not keys) are imported as strings, not as symbols (maybe add a flag which indicates if values are, or not, symbols?)
11
-
12
-
13
- #############################################
14
- # FIELDS
15
- #############################################
16
- # Handled class variables
17
- # => @@basic_tags :: hash with main OBO structure tags
18
- # => @@allowed_calcs :: hash with allowed ICs and similaritites calcs
19
- # => @@symbolizable_ids :: tags which can be symbolized
20
- # => @@tags_with_trailing_modifiers :: tags which can include extra info after specific text modifiers
21
- #
22
- # Handled object variables
23
- # => @header :: file header (if is available)
24
- # => @stanzas :: OBO stanzas {:terms,:typedefs,:instances}
25
- # => @ancestors_index :: hash of ancestors per each term handled with any structure relationships
26
- # => @descendants_index :: hash of descendants per each term handled with any structure relationships
27
- # => @alternatives_index :: has of alternative IDs (include alt_id and obsoletes)
28
- # => @obsoletes_index :: hash of obsoletes and it's new ids
29
- # => @special_tags :: set of special tags to be expanded (:is_a, :obsolete, :alt_id)
30
- # => @structureType :: type of ontology structure depending on ancestors relationship. Allowed: {atomic, sparse, circular, hierarchical}
31
- # => @ics :: already calculated ICs for handled terms and IC types
32
- # => @meta :: meta_information about handled terms like [ancestors, descendants, struct_freq, observed_freq]
33
- # => @max_freqs :: maximum freqs found for structural and observed freqs
34
- # => @dicts :: bidirectional dictionaries with three levels <key|value>: 1º) <tag|hash2>; 2º) <(:byTerm/:byValue)|hash3>; 3º) dictionary <k|v>
35
- # => @profiles :: set of terms assigned to an ID
36
- # => @profilesDict :: set of profile IDs assigned to a term
37
- # => @items :: hash with items relations to terms
38
- # => @removable_terms :: array of terms to not be considered
39
- # => @term_paths :: metainfo about parental paths of each term
40
-
41
- @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:alt_id,:replaced_by,:consider]}
42
- @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
43
- @@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider]
44
- @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
45
- @@multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider, :subsetdef, :synonymtypedef, :property_value, :remark]
46
- @@symbolizable_ids.concat(@@tags_with_trailing_modifiers)
47
-
48
- #############################################
49
- # CONSTRUCTOR
50
- #############################################
51
-
52
- # Instantiate a OBO_Handler object
53
- # ===== Parameters
54
- # +file+:: with info to be loaded (.obo ; .json)
55
- # +load_file+:: activate load process automatically (only for .obo)
56
- # +removable_terms+: term to be removed from calcs
57
- def initialize(file: nil, load_file: false, removable_terms: [])
58
- # Initialize object variables
59
- @header = nil
60
- @stanzas = {terms: {}, typedefs: {}, instances: {}}
61
- @ancestors_index = {}
62
- @descendants_index = {}
63
- @alternatives_index = {}
64
- @obsoletes_index = {}
65
- @structureType = nil
66
- @ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
67
- @meta = {}
68
- @special_tags = @@basic_tags.clone
69
- @max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
70
- @dicts = {}
71
- @profiles = {}
72
- @profilesDict = {}
73
- @items = {}
74
- @removable_terms = []
75
- @term_paths = {}
76
- # Load if proceeds
77
- add_removable_terms(removable_terms) if !removable_terms.empty?
78
- load(file) if load_file
79
- end
80
-
81
-
82
- #############################################
83
- # CLASS METHODS
84
- #############################################
85
-
86
- # Expand a (starting) term using a specific tag and return all extended terms into an array and
87
- # the relationship structuture observed (hierarchical or circular). If circular structure is
88
- # foumd, extended array will be an unique vector without starting term (no loops).
89
- # +Note+: we extremly recomend use get_related_ids_by_tag function instead of it (directly)
90
- # ===== Parameters
91
- # +start+:: term where start to expand
92
- # +terms+:: set to be used to expand
93
- # +target_tag+:: tag used to expand
94
- # +eexpansion+:: already expanded info
95
- # +split_info_char+:: special regex used to split info (if it is necessary)
96
- # +split_info_indx+:: special index to take splitted info (if it is necessary)
97
- # +alt_ids+:: set of alternative IDs
98
- # ===== Returns
99
- # A vector with the observed structure (string) and the array with extended terms.
100
- def self.get_related_ids(start_id, terms, target_tag, related_ids = {}, alt_ids = {})
101
- # Take start_id term available info and already accumulated info
102
- current_associations = related_ids[start_id]
103
- current_associations = [] if current_associations.nil?
104
- return [:no_term,[]] if terms[start_id].nil?
105
- id_relations = terms[start_id][target_tag]
106
- return [:source,[]] if id_relations.nil?
107
-
108
- # Prepare auxiliar variables
109
- struct = :hierarchical
110
-
111
- # Study direct extensions
112
- id_relations = id_relations.clone
113
- while id_relations.length > 0
114
- id = id_relations.shift
115
- id = alt_ids[id].first if alt_ids.include?(id) # NOTE: if you want to persist current ID instead source ID, re-implement this
116
-
117
- # Handle
118
- if current_associations.include?(id) # Check if already have been included into this expansion
119
- struct = :circular
120
- else
121
- current_associations << id
122
- if related_ids.include?(id) # Check if current already has been expanded
123
- current_associations = current_associations | related_ids[id]
124
- if current_associations.include?(start_id) # Check circular case
125
- struct = :circular
126
- [id, start_id].each{|repeated| current_associations.delete(repeated)}
127
- end
128
- else # Expand
129
- related_ids[start_id] = current_associations
130
- structExp, current_related_ids = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids) # Expand current
131
- current_associations = current_associations | current_related_ids
132
- struct = :circular if structExp == :circular # Check struct
133
- if current_associations.include?(start_id) # Check circular case
134
- struct = :circular
135
- current_associations.delete(start_id)
136
- end
137
- end
138
- end
139
- end
140
- related_ids[start_id] = current_associations
141
-
142
- return struct, current_associations
143
- end
144
-
145
-
146
- # Expand terms using a specific tag and return all extended terms into an array and
147
- # the relationship structuture observed (hierarchical or circular). If circular structure is
148
- # foumd, extended array will be an unique vector without starting term (no loops)
149
- # ===== Parameters
150
- # +terms+:: set to be used to expand
151
- # +target_tag+:: tag used to expand
152
- # +split_info_char+:: special regex used to split info (if it is necessary)
153
- # +split_info_indx+:: special index to take splitted info (if it is necessary)
154
- # +alt_ids+:: set of alternative IDs
155
- # +obsoletes+:: integer with the number of obsolete IDs. used to calculate structure type.
156
- # ===== Returns
157
- # A vector with the observed structure (string) and the hash with extended terms
158
- def self.get_related_ids_by_tag(terms:,target_tag:, alt_ids: {}, obsoletes: 0)
159
- # Define structure type
160
- structType = :hierarchical
161
- related_ids = {}
162
- terms.each do |id, tags|
163
- # Check if target tag is defined
164
- if !tags[target_tag].nil?
165
- # Obtain related terms
166
- set_structure, _ = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids)
167
- # Check structure
168
- structType = :circular if set_structure == :circular
169
- end
170
- end
171
-
172
- # Check special case
173
- structType = :atomic if related_ids.length <= 0
174
- structType = :sparse if related_ids.length > 0 && ((terms.length - related_ids.length - obsoletes) >= 2)
175
- # Return type and hash with related_ids
176
- return structType, related_ids
177
- end
178
-
179
-
180
- # Class method to transform string with <tag : info> into hash structure
181
- # ===== Parameters
182
- # +attributes+:: array tuples with info to be transformed into hash format
183
- # ===== Returns
184
- # Attributes stored into hash structure
185
- def self.info2hash(attributes, split_char = " ! ", selected_field = 0)
186
- # Load info
187
- info_hash = {}
188
- # Only TERMS multivalue tags (future add Typedefs and Instance)
189
- # multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider]
190
- attributes.each do |tag, value|
191
- # Check
192
- raise EncodingError, 'Info element incorrect format' if (tag.nil?) || (value.nil?)
193
- # Prepare
194
- tag = tag.lstrip.to_sym
195
- value.lstrip!
196
- value = value.split(split_char)[selected_field].to_sym if @@tags_with_trailing_modifiers.include?(tag)
197
-
198
- # Store
199
- query = info_hash[tag]
200
- if !query.nil? # Tag already exists
201
- if !query.kind_of?(Array) # Check that tag is multivalue
202
- raise('Attempt to concatenate plain text with another. The tag is not declared as multivalue. [' + tag.to_s + '](' + query + ')')
203
- else
204
- query << value # Add new value to tag
205
- end
206
- else # New entry
207
- if @@multivalue_tags.include?(tag)
208
- info_hash[tag] = [value]
209
- else
210
- info_hash[tag] = value
211
- end
212
- end
213
- end
214
- self.symbolize_ids(info_hash)
215
- return info_hash
216
- end
217
-
218
-
219
- # Class method to load an OBO format file (based on OBO 1.4 format). Specially focused on load
220
- # the Header, the Terms, the Typedefs and the Instances.
221
- # ===== Parameters
222
- # +file+:: OBO file to be loaded
223
- # ===== Returns
224
- # Hash with FILE, HEADER and STANZAS info
225
- def self.load_obo(file) #TODO: Send to obo_parser class
226
- raise("File is not defined") if file.nil?
227
- # Data variables
228
- header = ''
229
- stanzas = {terms: {}, typedefs: {}, instances: {}}
230
- # Auxiliar variables
231
- infoType = 'Header'
232
- currInfo = []
233
- stanzas_flags = %w[[Term] [Typedef] [Instance]]
234
- # Read file
235
- File.open(file).each do |line|
236
- line.chomp!
237
- next if line.empty?
238
- fields = line.split(':', 2)
239
- # Check if new instance is found
240
- if stanzas_flags.include?(line)
241
- header = self.process_entity(header, infoType, stanzas, currInfo)
242
- # Update info variables
243
- currInfo = []
244
- infoType = line.gsub!(/[\[\]]/, '')
245
- next
246
- end
247
- # Concat info
248
- currInfo << fields
249
- end
250
- # Store last loaded info
251
- header = self.process_entity(header, infoType, stanzas, currInfo) if !currInfo.empty?
252
-
253
- # Prepare to return
254
- finfo = {:file => file, :name => File.basename(file, File.extname(file))}
255
- return finfo, header, stanzas
256
- end
257
-
258
-
259
- # Handle OBO loaded info and stores it into correct container and format
260
- # ===== Parameters
261
- # +header+:: container
262
- # +infoType+:: current ontology item type detected
263
- # +stanzas+:: container
264
- # +currInfo+:: info to be stored
265
- # ===== Returns
266
- # header newly/already stored
267
- def self.process_entity(header, infoType, stanzas, currInfo)
268
- info = self.info2hash(currInfo)
269
- # Store current info
270
- if infoType.eql?('Header')
271
- header = info
272
- else
273
- id = info[:id]
274
- case infoType
275
- when 'Term'
276
- stanzas[:terms][id] = info
277
- when 'Typedef'
278
- stanzas[:typedefs][id] = info
279
- when 'Instance'
280
- stanzas[:instances][id] = info
281
- end
282
- end
283
- return header
284
- end
285
-
286
-
287
- # Symboliza all values into hashs using symbolizable tags as keys
288
- # ===== Parameters
289
- # +item_hash+:: hash to be checked
290
- def self.symbolize_ids(item_hash)
291
- @@symbolizable_ids.each do |tag|
292
- query = item_hash[tag]
293
- if !query.nil?
294
- if query.kind_of?(Array)
295
- query.map!{|item| item.to_sym}
296
- else
297
- item_hash[tag] = query.to_sym if !query.nil?
298
- end
299
- end
300
- end
301
- end
302
-
303
-
304
- #
305
- # ===== Parameters
306
- # +root+:: main term to expand
307
- # +ontology+:: to be cutted
308
- # +clone+:: if true, given ontology object will not be mutated
309
- # +remove_up+:: if true, stores only the root term given an it descendants. If false, only root ancestors will be stored
310
- # ===== Returns
311
- # An Ontology object with terms after cut the ontology.
312
- def self.mutate(root, ontology, clone: true, remove_up: true)
313
- ontology = ontology.clone if clone
314
- # Obtain affected IDs
315
- descendants = ontology.descendants_index[root]
316
- descendants << root # Store itself to do not remove it
317
- # Remove unnecesary terms
318
- ontology.stanzas[:terms] = ontology.stanzas[:terms].select{|id,v| remove_up ? descendants.include?(id) : !descendants.include?(id)}
319
- ontology.ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
320
- ontology.max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
321
- ontology.dicts = {}
322
- ontology.removable_terms = []
323
- ontology.term_paths = {}
324
- # Recalculate metadata
325
- ontology.build_index
326
- ontology.add_observed_terms_from_profiles
327
- # Finish
328
- return ontology
329
- end
330
-
331
-
332
-
333
- #############################################
334
- # GENERAL METHODS
335
- #############################################
336
-
337
- # Include removable terms to current removable terms list
338
- # ===== Parameters
339
- # +terms+:: terms array to be concatenated
340
- def add_removable_terms(terms)
341
- terms = terms.map{|term| term.to_sym}
342
- @removable_terms.concat(terms)
343
- end
344
-
345
-
346
- # Include removable terms to current removable terms list loading new
347
- # terms from a one column plain text file
348
- # ===== Parameters
349
- # +file+:: to be loaded
350
- def add_removable_terms_from_file(file)
351
- File.open(excluded_codes_file).each do |line|
352
- line.chomp!
353
- @removable_terms << line.to_sym
354
- end
355
- end
356
-
357
-
358
- # Increase observed frequency for a specific term
359
- # ===== Parameters
360
- # +term+:: term which frequency is going to be increased
361
- # +increas+:: frequency rate to be increased. Default = 1
362
- # ===== Return
363
- # true if process ends without errors, false in other cases
364
- def add_observed_term(term:,increase: 1.0)
365
- # Check
366
- raise ArgumentError, "Term given is NIL" if term.nil?
367
- return false unless @stanzas[:terms].include?(term)
368
- return false if @removable_terms.include?(term)
369
- if @alternatives_index.include?(term)
370
- alt_id = @alternatives_index[term]
371
- @meta[alt_id] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[alt_id].nil?
372
- @meta[term] = @meta[alt_id]
373
- end
374
- # Check if exists
375
- @meta[term] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[term].nil?
376
- # Add frequency
377
- @meta[term][:observed_freq] = 0 if @meta[term][:observed_freq] == -1
378
- @meta[term][:observed_freq] += increase
379
- # Check maximum frequency
380
- @max_freqs[:observed_freq] = @meta[term][:observed_freq] if @max_freqs[:observed_freq] < @meta[term][:observed_freq]
381
- return true
382
- end
383
-
384
-
385
- # Increase the arbitrary frequency of a given term set
386
- # ===== Parameters
387
- # +terms+:: set of terms to be updated
388
- # +increase+:: amount to be increased
389
- # +transform_to_sym+:: if true, transform observed terms to symbols. Default: false
390
- # ===== Return
391
- # true if process ends without errors and false in other cases
392
- def add_observed_terms(terms:, increase: 1.0, transform_to_sym: false)
393
- # Check
394
- raise ArgumentError, 'Terms array given is NIL' if terms.nil?
395
- raise ArgumentError, 'Terms given is not an array' if !terms.is_a? Array
396
- # Add observations
397
- if transform_to_sym
398
- checks = terms.map{|id| self.add_observed_term(term: id.to_sym,increase: increase)}
399
- else
400
- checks = terms.map{|id| self.add_observed_term(term: id,increase: increase)}
401
- end
402
- return checks
403
- end
404
-
405
-
406
- # Compare to terms sets
407
- # ===== Parameters
408
- # +termsA+:: set to be compared
409
- # +termsB+:: set to be compared
410
- # +sim_type+:: similitude method to be used. Default: resnik
411
- # +ic_type+:: ic type to be used. Default: resnik
412
- # +bidirectional+:: calculate bidirectional similitude. Default: false
413
- # ===== Return
414
- # similitude calculated
415
- def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
416
- # Check
417
- raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
418
- raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
419
- micasA = []
420
- # Compare A -> B
421
- termsA.each do |tA|
422
- micas = termsB.map{|tB| self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)}
423
- # Remove special cases
424
- [false,nil].each do |err_value| micas.delete(err_value) end
425
- # Obtain maximum value
426
- micasA << micas.max if micas.length > 0
427
- micasA << 0 if micas.length <= 0
428
- end
429
- means_sim = micasA.inject{ |sum, el| sum + el }.to_f / micasA.size
430
- # Compare B -> A
431
- if bidirectional
432
- means_simA = means_sim * micasA.size
433
- means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false) * termsB.size
434
- means_sim = (means_simA + means_simB) / (termsA.size + termsB.size)
435
- end
436
- # Return
437
- return means_sim
438
- end
439
-
440
-
441
- # Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with itself
442
- # ===== Parameters
443
- # +external_profiles+:: set of external profiles. If nil, internal profiles will be compared with itself
444
- # +sim_type+:: similitude method to be used. Default: resnik
445
- # +ic_type+:: ic type to be used. Default: resnik
446
- # +bidirectional+:: calculate bidirectional similitude. Default: false
447
- # ===== Return
448
- # Similitudes calculated
449
- def compare_profiles(external_profiles: nil, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
450
- profiles_similarity = {} #calculate similarity between patients profile
451
- profiles_ids = @profiles.keys
452
- if external_profiles.nil?
453
- comp_ids = profiles_ids
454
- comp_profiles = @profiles
455
- main_ids = comp_ids
456
- main_profiles = comp_profiles
457
- else
458
- comp_ids = external_profiles.keys
459
- comp_profiles = external_profiles
460
- main_ids = profiles_ids
461
- main_profiles = @profiles
462
- end
463
- # Compare
464
- while !main_ids.empty?
465
- curr_id = main_ids.shift
466
- current_profile = main_profiles[curr_id]
467
- comp_ids.each do |id|
468
- profile = comp_profiles[id]
469
- value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional)
470
- query = profiles_similarity[curr_id]
471
- if query.nil?
472
- profiles_similarity[curr_id] = {id => value}
473
- else
474
- query[id] = value
475
- end
476
- end
477
- end
478
- return profiles_similarity
479
- end
480
-
481
-
482
- # Expand alternative IDs arround all already stored terms
483
- # ===== Parameters
484
- # +alt_tag+:: tag used to expand alternative IDs
485
- # ===== Returns
486
- # true if process ends without errors and false in other cases
487
- def get_index_alternatives(alt_tag: @@basic_tags[:alternative][0])
488
- # Check input
489
- raise('stanzas terms empty') if @stanzas[:terms].empty?
490
- # Take all alternative IDs
491
- alt_ids2add = {}
492
- @stanzas[:terms].each do |id, tags|
493
- alt_ids = tags[alt_tag]
494
- if !alt_ids.nil?
495
- alt_ids = alt_ids - @removable_terms
496
- # Update info
497
- alt_ids.each do |alt_term|
498
- @alternatives_index[alt_term] = id
499
- alt_ids2add[alt_term] = @stanzas[:terms][id] if !@stanzas[:terms].include?(alt_term)
500
- @ancestors_index[alt_term] = @ancestors_index[id] if !@ancestors_index[id].nil?
501
- end
502
- end
503
- end
504
- @stanzas[:terms].merge!(alt_ids2add)
505
- end
506
-
507
-
508
- # Executes basic expansions of tags (alternatives, obsoletes and parentals) with default values
509
- # ===== Returns
510
- # true if eprocess ends without errors and false in other cases
511
- def build_index()
512
- self.get_index_alternatives
513
- self.get_index_obsoletes
514
- self.get_index_child_parent_relations
515
- @alternatives_index.map{|k,v| @alternatives_index[k] = self.extract_id(v)}
516
- @alternatives_index.compact!
517
- @obsoletes_index.map{|k,v| @obsoletes_index[k] = self.extract_id(v)}
518
- @obsoletes_index.compact!
519
- @ancestors_index.map{|k,v| @ancestors_index[k] = v.map{|t| self.extract_id(t)}.compact}
520
- @ancestors_index.compact!
521
- @descendants_index.map{|k,v| @descendants_index[k] = v.map{|t| self.extract_id(t)}.compact}
522
- @descendants_index.compact!
523
- self.get_index_frequencies
524
- self.calc_dictionary(:name)
525
- self.calc_dictionary(:synonym, select_regex: /\"(.*)\"/)
526
- self.calc_term_levels(calc_paths: true)
527
- end
528
-
529
-
530
- # Calculates regular frequencies based on ontology structure (using parentals)
531
- # ===== Returns
532
- # true if everything end without errors and false in other cases
533
- def get_index_frequencies()
534
- # Check
535
- if @ancestors_index.empty?
536
- warn('ancestors_index object is empty')
537
- else
538
- # Prepare useful variables
539
- alternative_terms = @alternatives_index.keys
540
- # Per each term, add frequencies
541
- @stanzas[:terms].each do |id, tags|
542
- if @alternatives_index.include?(id)
543
- alt_id = @alternatives_index[id]
544
- query = @meta[alt_id] # Check if exist
545
- if query.nil?
546
- query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
547
- @meta[alt_id] = query
548
- end
549
- @meta[id] = query
550
- # Note: alternative terms do not increase structural frequencies
551
- else # Official term
552
- query = @meta[id] # Check if exist
553
- if query.nil?
554
- query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
555
- @meta[id] = query
556
- end
557
- # Store metadata
558
- query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !alternative_terms.include?(anc)}.to_f : 0.0
559
- query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !alternative_terms.include?(desc)}.to_f : 0.0
560
- query[:struct_freq] = query[:descendants] + 1.0
561
- # Update maximums
562
- @max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
563
- @max_freqs[:max_depth] = query[:descendants] if @max_freqs[:max_depth] < query[:descendants]
564
- end
565
- end
566
- end
567
- end
568
-
569
-
570
- # Expand obsoletes set and link info to their alternative IDs
571
- # ===== Parameters
572
- # +obs_tags+:: tags to be used to find obsoletes
573
- # +alt_tags+:: tags to find alternative IDs (if are available)
574
- # +reset_obsoletes+:: flag to indicate if obsoletes set must be reset. Default: true
575
- # ===== Returns
576
- # true if process ends without errors and false in other cases
577
- def get_index_obsoletes(obs_tag: @@basic_tags[:obsolete], alt_tags: @@basic_tags[:alternative])
578
- if @stanzas[:terms].empty?
579
- warn('stanzas terms empty')
580
- else
581
- # Check obsoletes
582
- @stanzas[:terms].each do |id, term_tags|
583
- next if term_tags.nil?
584
- query = term_tags[obs_tag]
585
- if !query.nil? && query == 'true' # Obsolete tag presence
586
- next if !@obsoletes_index[id].nil? # Already stored
587
- # Check if alternative value is available
588
- alt_ids = alt_tags.map{|alt| term_tags[alt]}.compact
589
- if !alt_ids.empty?
590
- alt_id = alt_ids.first.first #FIRST tag, FIRST id
591
- # Store
592
- @alternatives_index[id] = alt_id
593
- @obsoletes_index[id] = alt_id
594
- end
595
- end
596
- end
597
- end
598
- end
599
-
600
-
601
- # Expand parentals set and link all info to their alternative IDs. Also launch frequencies process
602
- # ===== Parameters
603
- # +tag+:: tag used to expand parentals
604
- # +split_info_char+:: special regex used to split info (if it is necessary)
605
- # +split_info_indx+:: special index to take splitted info (if it is necessary)
606
- # ===== Returns
607
- # true if process ends without errors and false in other cases
608
- def get_index_child_parent_relations(tag: @@basic_tags[:ancestors][0])
609
- # Check
610
- if @stanzas[:terms].nil?
611
- warn('stanzas terms empty')
612
- else
613
- # Expand
614
- structType, parentals = self.class.get_related_ids_by_tag(terms: @stanzas[:terms],
615
- target_tag: tag,
616
- alt_ids: @alternatives_index,
617
- obsoletes: @obsoletes_index.length)
618
- # Check
619
- raise('Error expanding parentals') if (structType.nil?) || parentals.nil?
620
- # Prepare ancestors structure
621
- anc = {}
622
- des = {}
623
- parentals.each do |id, parents|
624
- parents = parents - @removable_terms
625
- anc[id] = parents
626
- parents.each do |anc_id| # Add descendants
627
- if !des.include?(anc_id)
628
- des[anc_id] = [id]
629
- else
630
- des[anc_id] << id
631
- end
632
- end
633
- end
634
- # Store alternatives
635
- @alternatives_index.each do |id,alt|
636
- anc[id] = anc[alt] if anc.include?(alt)
637
- des[id] = des[alt] if des.include?(alt)
638
- end
639
- # Check structure
640
- if ![:atomic,:sparse].include? structType
641
- structType = structType == :circular ? :circular : :hierarchical
642
- end
643
- # Store
644
- @ancestors_index = anc
645
- @descendants_index = des
646
- @structureType = structType
647
- end
648
- # Finish
649
- end
650
-
651
-
652
- # Find ancestors of a given term
653
- # ===== Parameters
654
- # +term+:: to be checked
655
- # +filter_alternatives+:: if true, remove alternatives from final results
656
- # ===== Returns
657
- # an array with all ancestors of given term or false if parents are not available yet
658
- def get_ancestors(term, filter_alternatives = false)
659
- return self.get_familiar(term, true, filter_alternatives)
660
- end
661
-
662
-
663
- # Find descendants of a given term
664
- # ===== Parameters
665
- # +term+:: to be checked
666
- # +filter_alternatives+:: if true, remove alternatives from final results
667
- # ===== Returns
668
- # an array with all descendants of given term or false if parents are not available yet
669
- def get_descendants(term, filter_alternatives = false)
670
- return self.get_familiar(term, false, filter_alternatives)
671
- end
672
-
673
-
674
- # Find ancestors/descendants of a given term
675
- # ===== Parameters
676
- # +term+:: to be checked
677
- # +return_ancestors+:: return ancestors if true or descendants if false
678
- # +filter_alternatives+:: if true, remove alternatives from final results
679
- # ===== Returns
680
- # an array with all ancestors/descendants of given term or nil if parents are not available yet
681
- def get_familiar(term, return_ancestors = true, filter_alternatives = false)
682
- # Find into parentals
683
- familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
684
- if !familiars.nil?
685
- familiars = familiars.clone
686
- if filter_alternatives
687
- familiars.reject!{|fm| @alternatives_index.include?(fm)}
688
- end
689
- else
690
- familiars = []
691
- end
692
- return familiars
693
- end
694
-
695
-
696
- # Obtain IC of an specific term
697
- # ===== Parameters
698
- # +term+:: which IC will be calculated
699
- # +type+:: of IC to be calculated. Default: resnik
700
- # +force+:: force re-calculate the IC. Do not check if it is already calculated
701
- # +zhou_k+:: special coeficient for Zhou IC method
702
- # ===== Returns
703
- # the IC calculated
704
# Obtain the IC of a specific term.
# +termRaw+:: term whose IC will be calculated
# +type+:: IC formula to use (:resnik, :resnik_observed, :seco, :zhou, :sanchez). Default: :resnik
# +force+:: force re-calculation, ignoring any cached value
# +zhou_k+:: special coefficient for the Zhou IC method
# Returns the calculated IC.
# Raises ArgumentError if the IC type is not allowed.
def get_IC(termRaw, type: :resnik, force: false, zhou_k: 0.5)
  term = termRaw.to_sym
  raise ArgumentError, "IC type specified (#{type}) is not allowed" if !@@allowed_calcs[:ics].include?(type)
  # Return the cached value unless a re-calculation is forced
  return @ics[type][term] if (@ics[type].include? term) && !force
  ic = - 1
  case type # https://arxiv.org/ftp/arxiv/papers/1310/1310.8059.pdf ||| https://doi.org/10.1016/j.eswa.2012.01.082
  when :resnik # Resnik: -log(Freq(x) / Max_Freq) over structural frequencies
    ic = -Math.log10(@meta[term][:struct_freq].fdiv(@max_freqs[:struct_freq]))
  when :resnik_observed # -log(Freq(x) / Max_Freq) over observed frequencies
    ic = -Math.log10(@meta[term][:observed_freq].fdiv(@max_freqs[:observed_freq]))
  when :seco, :zhou # Seco: 1 - ( log(hypo(x) + 1) / log(max_nodes) )
    ic = 1 - Math.log10(@meta[term][:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
    # BUGFIX: the original tested `if :zhou` — a symbol literal is always truthy,
    # so the Zhou transformation was also applied when :seco was requested.
    if type == :zhou # Zhou: k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
      @ics[:seco][term] = ic # store the Seco component as well
      ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(@meta[term][:descendants]).fdiv(Math.log10(@max_freqs[:max_depth])))
    end
  when :sanchez # Sanchez: ontology-based information-theoretic IC
    ic = -Math.log10((@meta[term][:descendants].fdiv(@meta[term][:ancestors]) + 1.0).fdiv(@max_freqs[:max_depth] + 1.0))
  end
  @ics[type][term] = ic
  return ic
end
760
-
761
-
762
- # Calculates and return resnik ICs (by ontology and observed frequency) for observed terms
763
- # ===== Returns
764
- # two hashes with resnik and resnik_observed ICs for observed terms
765
# Calculate resnik ICs (by ontology structure and by observed frequency) for all observed terms.
# Returns two hashes: resnik and resnik_observed ICs for observed terms.
def get_observed_ics_by_onto_and_freq
  resnik = {}
  resnik_observed = {}
  unless @profiles.empty?
    observed_terms = @profiles.values.flatten.uniq
    observed_terms.each do |term|
      get_IC(term)
      get_IC(term, type: :resnik_observed)
    end
    resnik = @ics[:resnik].select{|k, v| observed_terms.include?(k)}
    resnik_observed = @ics[:resnik_observed].select{|k, v| observed_terms.include?(k)}
  end
  return resnik.clone, resnik_observed.clone
end
780
-
781
-
782
- # Find the IC of the Most Index Content shared Ancestor (MICA) of two given terms
783
- # ===== Parameters
784
 - # +termA+:: term to be checked
785
- # +termB+:: term to be checked
786
- # +ic_type+:: IC formula to be used
787
- # ===== Returns
788
- # the IC of the MICA(termA,termB)
789
# Find the IC of the Most Informative Common Ancestor (MICA) of two terms.
# +termA+:: term to be checked
# +termB+:: term to be checked
# +ic_type+:: IC formula to be used
# Returns the IC of MICA(termA, termB), or nil if no MICA exists.
def get_ICMICA(termA, termB, ic_type = :resnik)
  mica_term, mica_ic = self.get_MICA(termA, termB, ic_type)
  return mica_term.nil? ? nil : mica_ic
end
793
-
794
-
795
- # Find the Most Index Content shared Ancestor (MICA) of two given terms
796
- # ===== Parameters
797
- # +termA+:: term to be cheked
798
- # +termB+:: term to be checked
799
- # +ic_type+:: IC formula to be used
800
- # ===== Returns
801
- # the MICA(termA,termB) and it's IC
802
# Find the Most Informative Common Ancestor (MICA) of two terms.
# +termA+:: term to be checked
# +termB+:: term to be checked
# +ic_type+:: IC formula to be used
# Returns a pair [MICA(termA, termB), its IC]; [nil, -1.0] if there is none.
def get_MICA(termA, termB, ic_type = :resnik)
  # Resolve alternative IDs to their official terms
  termA = @alternatives_index[termA] if @alternatives_index.include?(termA)
  termB = @alternatives_index[termB] if @alternatives_index.include?(termB)
  mica = [nil, -1.0]
  if termA.eql?(termB) # trivial case: a term is its own MICA
    mica = [termA, self.get_IC(termA, type: ic_type)]
  else
    anc_A = self.get_ancestors(termA)
    anc_B = self.get_ancestors(termB)
    if !(anc_A.empty? && anc_B.empty?)
      # Include the terms themselves among the candidates
      anc_A << termA
      anc_B << termB
      shared_ancestors = anc_A & anc_B
      # Keep the shared ancestor with maximum IC
      shared_ancestors.each do |anc|
        ic = self.get_IC(anc, type: ic_type)
        mica = [anc, ic] if ic > mica[1]
      end
    end
  end
  return mica
end
832
-
833
-
834
- # Calculate similarity between two given terms
835
- # ===== Parameters
836
- # +termsA+:: to be compared
837
- # +termsB+:: to be compared
838
- # +type+:: similitude formula to be used
839
- # +ic_type+:: IC formula to be used
840
- # ===== Returns
841
- # the similarity between both sets or false if frequencies are not available yet
842
# Calculate similarity between two given terms.
# +termA+:: term to be compared
# +termB+:: term to be compared
# +type+:: similarity formula to be used
# +ic_type+:: IC formula to be used
# Returns the similarity value, or nil if no MICA IC is available.
# Raises ArgumentError if the similarity type is not allowed.
def get_similarity(termA, termB, type: :resnik, ic_type: :resnik)
  raise ArgumentError, "SIM type specified (#{type}) is not allowed" if !@@allowed_calcs[:sims].include?(type)
  sim = nil
  mica_ic = get_ICMICA(termA, termB, ic_type)
  unless mica_ic.nil?
    case type
    when :resnik
      sim = mica_ic
    when :lin
      sim = (2.0 * mica_ic).fdiv(self.get_IC(termA, type: ic_type) + self.get_IC(termB, type: ic_type))
    when :jiang_conrath # NOTE: this is a dissimilarity (distance), not a similarity
      sim = (self.get_IC(termA, type: ic_type) + self.get_IC(termB, type: ic_type)) - (2.0 * mica_ic)
    end
  end
  return sim
end
860
-
861
-
862
- # Method used to load information stored into an OBO file and store it into this object.
863
- # If a file is specified by input parameter, current @file value is updated
864
- # ===== Parameters
865
- # +file+:: optional file to update object stored file
866
# Load an OBO file into this object (header + stanzas) and optionally build indexes.
# +file+:: OBO file to load
# +build+:: if true, build internal indexes after loading
def load(file, build: true)
  _, header, stanzas = self.class.load_obo(file)
  @header = header
  @stanzas = stanzas
  self.remove_removable()
  self.build_index() if build
end
874
-
875
- #
876
# Drop every term listed in @removable_terms from the loaded term stanzas.
def remove_removable()
  return if @removable_terms.empty?
  @removable_terms.each{|removableID| @stanzas[:terms].delete(removableID)}
end
879
-
880
-
881
- # Exports an OBO_Handler object in json format
882
- # ===== Parameters
883
- # +file+:: where info will be stored
884
# Export this OBO_Handler object in JSON format.
# +file+:: path where the serialised object will be stored
def write(file)
  # Gather every serialisable field of the object
  obj_info = {
    header: @header,
    stanzas: @stanzas,
    ancestors_index: @ancestors_index,
    descendants_index: @descendants_index,
    alternatives_index: @alternatives_index,
    obsoletes_index: @obsoletes_index,
    structureType: @structureType,
    ics: @ics,
    meta: @meta,
    special_tags: @special_tags,
    max_freqs: @max_freqs,
    dicts: @dicts,
    profiles: @profiles,
    profilesDict: @profilesDict,
    items: @items,
    removable_terms: @removable_terms,
    term_paths: @term_paths
  }
  # Serialise to JSON and write to disk
  File.open(file, "w") { |f| f.write obj_info.to_json }
end
906
-
907
-
908
# Check whether a value can be parsed as a Float.
# +string+:: value to check
# Returns true when parseable as a number, false otherwise.
# NOTE: the original used the inline `rescue` modifier, which silently swallows
# every StandardError; only the parse errors Float actually raises are rescued now.
def is_number? string
  Float(string)
  true
rescue ArgumentError, TypeError
  false
end
911
-
912
-
913
- # Read a JSON file with an OBO_Handler object stored
914
- # ===== Parameters
915
- # +file+:: with object info
916
- # ===== Return
917
- # OBO_Handler internal fields
918
# Read a JSON file with a stored OBO_Handler object and restore every internal
# field of this instance from it.
# +file+:: path of the JSON file with the object info
def read(file)
  # Load and parse the serialised object
  jsonInfo = JSON.parse(File.open(file).read, :symbolize_names => true)
  # Pre-process: symbolize stanza info hashes
  jsonInfo[:stanzas][:terms].map{|id, info| self.class.symbolize_ids(info)}
  jsonInfo[:stanzas][:typedefs].map{|id, info| self.class.symbolize_ids(info)}
  jsonInfo[:stanzas][:instances].map{|id, info| self.class.symbolize_ids(info)}
  # Symbolize index values
  jsonInfo[:alternatives_index] = jsonInfo[:alternatives_index].map{|id, value| [id, value.to_sym]}.to_h
  jsonInfo[:ancestors_index].map{|id, family_arr| family_arr.map!{|item| item.to_sym}}
  jsonInfo[:descendants_index].map{|id, family_arr| family_arr.map!{|item| item.to_sym}}
  jsonInfo[:obsoletes_index] = jsonInfo[:obsoletes_index].map{|id, value| [id, value.to_sym]}.to_h
  # Restore dictionaries: keys/values may be numeric, symbols or arrays
  jsonInfo[:dicts] = jsonInfo[:dicts].each do |flag, dictionaries|
    dictionaries[:byTerm] = dictionaries[:byTerm].map do |term, value|
      if !term.to_s.scan(/\A[-+]?[0-9]*\.?[0-9]+\Z/).empty? # numeric key
        [term.to_s.to_i, value.map{|t| t.to_sym}]
      elsif value.is_a? Numeric # numeric value
        [term.to_sym, value]
      elsif value.kind_of?(Array) && flag == :is_a
        [term.to_sym, value.map{|v| v.to_sym}]
      else
        [term.to_sym, value]
      end
    end
    dictionaries[:byTerm] = dictionaries[:byTerm].to_h
    dictionaries[:byValue] = dictionaries[:byValue].map do |value, term|
      if value.is_a? Numeric # numeric key
        [value, term.to_sym]
      elsif term.is_a? Numeric # numeric value
        [value.to_s.to_sym, term]
      elsif flag == :is_a
        [value.to_sym, term.map{|v| v.to_sym}]
      elsif term.kind_of?(Array)
        [value.to_sym, term.map{|t| t.to_sym}]
      else
        [value.to_s, term.to_sym]
      end
    end
    dictionaries[:byValue] = dictionaries[:byValue].to_h
  end
  # Profiles: symbolize terms and restore numeric profile IDs
  jsonInfo[:profiles].map{|id, terms| terms.map!{|term| term.to_sym}}
  jsonInfo[:profiles].keys.map{|id| jsonInfo[:profiles][id.to_s.to_i] = jsonInfo[:profiles].delete(id) if self.is_number?(id.to_s)}
  jsonInfo[:profilesDict].map{|term, ids| ids.map!{|id| id.to_sym if !id.is_a?(Numeric)}}
  jsonInfo[:removable_terms] = jsonInfo[:removable_terms].map{|term| term.to_sym}
  jsonInfo[:special_tags] = jsonInfo[:special_tags].each do |k, v|
    jsonInfo[:special_tags][k] = v.kind_of?(Array) ? v.map{|tag| tag.to_sym} : v.to_sym
  end
  jsonInfo[:items].each{|k, v| jsonInfo[:items][k] = v.map{|item| item.to_sym}}
  jsonInfo[:term_paths].each{|term, info| jsonInfo[:term_paths][term][:paths] = info[:paths].map{|path| path.map{|t| t.to_sym}}}
  # Store the restored info into this object
  @header = jsonInfo[:header]
  @stanzas = jsonInfo[:stanzas]
  @ancestors_index = jsonInfo[:ancestors_index]
  @descendants_index = jsonInfo[:descendants_index]
  @alternatives_index = jsonInfo[:alternatives_index]
  @obsoletes_index = jsonInfo[:obsoletes_index]
  @structureType = jsonInfo[:structureType].to_sym
  @ics = jsonInfo[:ics]
  @meta = jsonInfo[:meta]
  @special_tags = jsonInfo[:special_tags]
  @max_freqs = jsonInfo[:max_freqs]
  @dicts = jsonInfo[:dicts]
  @profiles = jsonInfo[:profiles]
  @profilesDict = jsonInfo[:profilesDict]
  @items = jsonInfo[:items]
  @removable_terms = jsonInfo[:removable_terms]
  @term_paths = jsonInfo[:term_paths]
end
992
-
993
-
994
- # Check if a given ID is stored as term into this object
995
- # ===== Parameters
996
- # +id+:: to be checked
997
- # ===== Return
998
- # True if term is allowed or false in other cases
999
# Check if a given ID is stored as a term in this ontology.
# +id+:: ID to be checked
# Returns true if the term exists, false otherwise.
# Consistency fix: every sibling method reads @stanzas directly; the original
# called the bare `stanzas` reader here.
def exists? id
  return @stanzas[:terms].include?(id)
end
1002
-
1003
-
1004
- # This method assumes that a text given contains an allowed ID. And will try to obtain it splitting it
1005
- # ===== Parameters
1006
- # +text+:: to be checked
1007
- # ===== Return
1008
- # The correct ID if it can be found or nil in other cases
1009
# Assume the given text contains an allowed ID and try to obtain it, splitting if needed.
# +text+:: text to be checked
# +splitBy+:: separator used to isolate the candidate ID
# Returns the correct ID if it can be found, nil otherwise.
def extract_id(text, splitBy: ' ')
  return text if self.exists?(text)
  candidate = text.to_s.split(splitBy).first.to_sym
  return self.exists?(candidate) ? candidate : nil
end
1017
-
1018
-
1019
- # Generate a bidirectinal dictionary set using a specific tag and terms stanzas set
1020
- # This functions stores calculated dictionary into @dicts field.
1021
- # This functions stores first value for multivalue tags
1022
- # This function does not handle synonyms for byValue dictionaries
1023
- # ===== Parameters
1024
- # +tag+:: to be used to calculate dictionary
1025
- # +select_regex+:: gives a regfex that can be used to modify value to be stored
1026
- # +substitute_alternatives+:: flag used to indicate if alternatives must, or not, be replaced by it official ID
1027
- # +store_tag+:: flag used to store dictionary. If nil, mandatory tag given will be used
1028
- # +multiterm+:: if true, byValue will allows multi-term linkage (array)
1029
- # +self_type_references+:: if true, program assumes that refrences will be between Ontology terms, and it term IDs will be checked
1030
- # ===== Return
1031
- # void. And stores calcualted bidirectional dictonary into dictionaries main container
1032
# Generate a bidirectional dictionary set (byTerm/byValue) for a tag over the terms
# stanzas and store it into @dicts. Stores the first value for multivalue tags; does
# not handle synonyms for byValue dictionaries.
# +tag+:: tag used to build the dictionary
# +select_regex+:: regex used to modify each value before it is stored
# +substitute_alternatives+:: if true, alternative IDs are replaced by their official ID
# +store_tag+:: key used to store the dictionary; mandatory tag is used when nil
# +multiterm+:: if true, byValue allows multi-term linkage (arrays)
# +self_type_references+:: if true, values are assumed to be ontology term IDs and are checked
# Returns nothing; stores the calculated dictionary into @dicts.
def calc_dictionary(tag, select_regex: nil, substitute_alternatives: true, store_tag: nil, multiterm: false, self_type_references: false)
  tag = tag.to_sym
  store_tag = tag if store_tag.nil?
  if @stanzas[:terms].empty?
    warn('Terms are not already loaded. Aborting dictionary calc')
    return
  end
  byTerm = {}
  byValue = {}
  # Collect tag values per term
  @stanzas[:terms].each do |term, tags|
    referenceTerm = term
    if @alternatives_index.include?(term) && substitute_alternatives # alternative ID
      # Only substitute when the official ID is not obsolete
      referenceTerm = @alternatives_index[term] if !@obsoletes_index.include?(@alternatives_index[term])
    end
    queryTag = tags[tag]
    next if queryTag.nil?
    # Pre-process values with the selection regex
    if !select_regex.nil?
      if queryTag.kind_of?(Array)
        queryTag = queryTag.map{|value| value.scan(select_regex).first}
        queryTag.flatten!
      else
        queryTag = queryTag.scan(select_regex).first
      end
      queryTag.compact!
    end
    if queryTag.kind_of?(Array) # store multivalued tag
      if !queryTag.empty?
        if byTerm.include?(referenceTerm)
          byTerm[referenceTerm] = (byTerm[referenceTerm] + queryTag).uniq
        else
          byTerm[referenceTerm] = queryTag
        end
        if multiterm
          queryTag.each do |value|
            byValue[value] = [] if byValue[value].nil?
            byValue[value] << referenceTerm
          end
        else
          queryTag.each{|value| byValue[value] = referenceTerm}
        end
      end
    else # store single value
      if byTerm.include?(referenceTerm)
        byTerm[referenceTerm] = (byTerm[referenceTerm] + [queryTag]).uniq
      else
        byTerm[referenceTerm] = [queryTag]
      end
      if multiterm
        byValue[queryTag] = [] if byValue[queryTag].nil?
        byValue[queryTag] << referenceTerm
      else
        byValue[queryTag] = referenceTerm
      end
    end
  end

  # Verify self-type references in byTerm and fix byValue keys accordingly
  if self_type_references
    byTerm.map do |term, references|
      corrected_references = references.map do |t|
        checked = self.extract_id(t)
        if checked.nil?
          t
        else
          byValue[checked] = byValue.delete(t) if checked != t && !byValue.keys.include?(checked) # update byValue key
          checked
        end
      end
      byTerm[term] = corrected_references.uniq
    end
  end

  # Restore the original stanza order of each term's values
  byTerm.map do |term, values|
    if self.exists?(term)
      referenceValue = @stanzas[:terms][term][tag]
      if !referenceValue.nil?
        if !select_regex.nil?
          if referenceValue.kind_of?(Array)
            referenceValue = referenceValue.map{|value| value.scan(select_regex).first}
            referenceValue.flatten!
          else
            referenceValue = referenceValue.scan(select_regex).first
          end
          referenceValue.compact!
        end
        if self_type_references
          if referenceValue.kind_of?(Array)
            aux = referenceValue.map{|t| self.extract_id(t)}
          else
            aux = self.extract_id(referenceValue)
          end
          referenceValue = aux if !aux.nil?
        end
        referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
        byTerm[term] = referenceValue + (values - referenceValue)
      end
    end
  end

  # Store the calculated dictionary
  @dicts[store_tag] = {byTerm: byTerm, byValue: byValue}
end
1138
-
1139
-
1140
- # Calculates :is_a dictionary without alternatives substitution
1141
# Calculate the :is_a dictionary without alternatives substitution.
def calc_ancestors_dictionary
  self.calc_dictionary(:is_a, substitute_alternatives: false, self_type_references: true, multiterm: true)
end
1144
-
1145
-
1146
- # Translate a given value using an already calcualted dictionary
1147
- # ===== Parameters
1148
- # +toTranslate+:: value to be translated using dictiontionary
1149
- # +tag+:: used to generate the dictionary
1150
- # +byValue+:: boolean flag to indicate if dictionary must be used values as keys or terms as keys. Default: values as keys = true
1151
- # ===== Return
1152
- # translation
1153
# Translate a given value using an already calculated dictionary.
# +toTranslate+:: value to be translated
# +tag+:: tag used to generate the dictionary
# +byValue+:: if true, use values as keys; otherwise use terms as keys. Default: true
# Returns the translation (nil when not found).
def translate(toTranslate, tag, byValue: true)
  if byValue
    return @dicts[tag][:byValue][toTranslate]
  else
    # Resolve alternative/obsolete IDs to the main ID before looking up
    return @dicts[tag][:byTerm][get_main_id(toTranslate)]
  end
end
1158
-
1159
-
1160
- # Translate a name given
1161
- # ===== Parameters
1162
- # +name+:: to be translated
1163
- # ===== Return
1164
- # translated name or nil if it's not stored into this ontology
1165
# Translate a given name into its term ID, falling back to synonyms.
# +name+:: name to be translated
# Returns the translated term or nil if it's not stored in this ontology.
def translate_name(name)
  term = self.translate(name, :name)
  term.nil? ? self.translate(name, :synonym) : term
end
1170
-
1171
-
1172
- # Translate several names and return translations and a list of names which couldn't be translated
1173
- # ===== Parameters
1174
- # +names+:: array to be translated
1175
- # ===== Return
1176
- # two arrays with translations and names which couldn't be translated respectively
1177
# Translate several names, separating successes from failures.
# +names+:: array of names to be translated
# Returns two arrays: translations and names which couldn't be translated.
def translate_names(names)
  translated = []
  rejected = []
  names.each do |name|
    tr = self.translate_name(name)
    tr.nil? ? rejected << name : translated << tr
  end
  return translated, rejected
end
1190
-
1191
-
1192
- # Translates a given ID to it assigned name
1193
- # ===== Parameters
1194
- # +id+:: to be translated
1195
- # ===== Return
1196
- # main name or nil if it's not included into this ontology
1197
# Translate a given ID to its assigned (main) name.
# +id+:: ID to be translated
# Returns the main name or nil if the ID is not included in this ontology.
def translate_id(id)
  names = self.translate(id, :name, byValue: false)
  names.nil? ? nil : names.first
end
1201
-
1202
-
1203
- # Translates several IDs and returns translations and not allowed IDs list
1204
- # ===== Parameters
1205
- # +ids+:: to be translated
1206
- # ===== Return
1207
- # two arrays with translations and names which couldn't be translated respectively
1208
# Translate several IDs, separating translations from IDs that could not be translated.
# +ids+:: array of term IDs to translate
# Returns two arrays: translated names and the rejected IDs.
def translate_ids(ids)
  translated = []
  rejected = []
  ids.each do |term_id|
    tr = self.translate_id(term_id.to_sym)
    if !tr.nil?
      translated << tr
    else
      # BUGFIX: the original pushed `tr` (always nil here) instead of the
      # untranslatable ID, so the rejected list was an array of nils.
      rejected << term_id
    end
  end
  return translated, rejected
end
1221
-
1222
-
1223
- # ===== Returns
1224
- # the main ID assigned to a given ID. If it's a non alternative/obsolete ID itself will be returned
1225
- # ===== Parameters
1226
- # +id+:: to be translated
1227
- # ===== Return
1228
- # main ID related to a given ID. Returns nil if given ID is not an allowed ID
1229
# Obtain the main ID assigned to a given ID. A non-alternative/obsolete ID maps to itself.
# +id+:: ID to be translated
# Returns the main ID related to the given ID; nil if the ID is not an allowed term.
def get_main_id(id)
  return nil if !@stanzas[:terms].include? id
  mainID = @alternatives_index[id]
  # Idiom fix: the original used the non-short-circuit `&` operator for boolean
  # logic; keep the ID itself when there is no replacement or it is obsolete.
  return id if mainID.nil? || @obsoletes_index.include?(mainID)
  return mainID
end
1236
-
1237
-
1238
- # Check a pull of IDs and return allowed IDs removing which are not official terms on this ontology
1239
- # ===== Parameters
1240
- # +ids+:: to be checked
1241
- # ===== Return
1242
- # two arrays whit allowed and rejected IDs respectively
1243
# Check a pool of IDs, separating official terms from unknown ones.
# +ids+:: IDs to be checked
# +substitute+:: if true, accepted IDs are replaced by their main ID
# Returns two arrays with allowed and rejected IDs respectively.
def check_ids(ids, substitute: true)
  checked_codes = []
  rejected_codes = []
  ids.each do |id|
    if @stanzas[:terms].include? id
      checked_codes << (substitute ? self.get_main_id(id) : id)
    else
      rejected_codes << id
    end
  end
  return checked_codes, rejected_codes
end
1259
-
1260
-
1261
- # Stores a given profile with an specific ID. If ID is already assigend to a profile, it will be replaced
1262
- # ===== Parameters
1263
- # +id+:: assigned to profile
1264
- # +terms+:: array of terms
1265
- # +substitute+:: subsstitute flag from check_ids
1266
# Store a profile under a specific ID; an already assigned ID is replaced.
# +id+:: ID assigned to the profile
# +terms+:: array of terms
# +substitute+:: substitute flag forwarded to check_ids
def add_profile(id, terms, substitute: true)
  warn("Profile assigned to ID (#{id}) is going to be replaced") if @profiles.include? id
  correct_terms, rejected_terms = self.check_ids(terms, substitute: substitute)
  warn('Given terms contains erroneus IDs. These IDs will be removed') if !rejected_terms.empty?
  # Numeric IDs are kept as-is, anything else is symbolized
  key = id.is_a?(Numeric) ? id : id.to_sym
  @profiles[key] = correct_terms
end
1278
-
1279
-
1280
- # Method used to store a pull of profiles
1281
- # ===== Parameters
1282
- # +profiles+:: array/hash of profiles to be stored. If it's an array, numerical IDs will be assigned starting at 1
1283
- # +calc_metadata+:: if true, launch calc_profiles_dictionary process
1284
- # +reset_stored+:: if true, remove already stored profiles
1285
- # +substitute+:: subsstitute flag from check_ids
1286
# Store a pool of profiles.
# +profiles+:: array or hash of profiles; arrays get sequential numeric IDs starting at 0
# +calc_metadata+:: if true, launch the calc_profiles_dictionary process
# +reset_stored+:: if true, remove already stored profiles first
# +substitute+:: substitute flag forwarded to check_ids
def load_profiles(profiles, calc_metadata: true, reset_stored: false, substitute: false)
  self.reset_profiles if reset_stored
  if profiles.kind_of?(Array) # assign sequential numeric IDs
    profiles.each_with_index do |items, i|
      self.add_profile(i, items.map{|item| item.to_sym}, substitute: substitute)
    end
  else # Hash of ID => profile
    if !profiles.keys.select{|id| @profiles.include?(id)}.empty?
      warn('Some profiles given are already stored. Stored version will be replaced')
    end
    profiles.each{|id, prof| self.add_profile(id, prof, substitute: substitute)}
  end
  self.add_observed_terms_from_profiles(reset: true)
  self.calc_profiles_dictionary if calc_metadata
end
1306
-
1307
-
1308
- # Internal method used to remove already stored profiles and restore observed frequencies
1309
# Remove already stored profiles and restore observed frequencies to zero.
def reset_profiles
  @profiles = {}
  @meta.each{|term, info| info[:observed_freq] = 0}
  @max_freqs[:observed_freq] = 0
end
1316
-
1317
-
1318
- # ===== Returns
1319
- # profiles assigned to a given ID
1320
- # ===== Parameters
1321
- # +id+:: profile ID
1322
- # ===== Return
1323
- # specific profile or nil if it's not stored
1324
# Fetch the profile assigned to a given ID.
# +id+:: profile ID
# Returns the specific profile or nil if it's not stored.
def get_profile(id)
  @profiles[id]
end
1327
-
1328
-
1329
- # ===== Returns
1330
- # an array of sizes for all stored profiles
1331
- # ===== Return
1332
- # array of profile sizes
1333
# Sizes of all stored profiles.
# Returns an array with the length of each stored profile.
def get_profiles_sizes()
  @profiles.values.map(&:length)
end
1336
-
1337
-
1338
- # ===== Returns
1339
- # mean size of stored profiles
1340
- # ===== Parameters
1341
- # +round_digits+:: number of digits to round result. Default: 4
1342
- # ===== Returns
1343
- # mean size of stored profiles
1344
# Mean size of stored profiles.
# +round_digits+:: number of digits to round the result. Default: 4
# Returns the mean profile size; 0 when no profiles are stored (the original
# computed 0.fdiv(0) = NaN there and NaN.round raised FloatDomainError).
def get_profiles_mean_size(round_digits: 4)
  return 0 if @profiles.empty?
  sizes = self.get_profiles_sizes
  sizes.sum.fdiv(@profiles.length).round(round_digits)
end
1348
-
1349
-
1350
- # Calculates profiles sizes and returns size assigned to percentile given
1351
- # ===== Parameters
1352
- # +perc+:: percentile to be returned
1353
- # +increasing_sort+:: flag to indicate if sizes order must be increasing. Default: false
1354
- # ===== Returns
1355
- # values assigned to percentile asked
1356
# Calculate profile sizes and return the size at the given percentile.
# +perc+:: percentile to be returned
# +increasing_sort+:: if true, sizes are ordered increasingly. Default: false
# Returns the profile length assigned to the asked percentile.
def get_profile_length_at_percentile(perc=50, increasing_sort: false)
  lengths = self.get_profiles_sizes.sort
  lengths.reverse! unless increasing_sort
  n_profiles = lengths.length
  # Take the length which does not overpass the selected percentile
  idx = ((perc * (n_profiles - 1)).fdiv(100) - 0.5).round
  idx = 0 if idx < 0 # guard against a negative index from the literal calc
  lengths[idx]
end
1364
-
1365
-
1366
- # Translate a given profile to terms names
1367
- # ===== Parameters
1368
- # +prof+:: array of terms to be translated
1369
- # ===== Returns
1370
- # array of translated terms. Can include nils if some IDs are not allowed
1371
# Translate a given profile to term names.
# +prof+:: array of terms to be translated
# Returns an array of translated terms; may include nils for unknown IDs.
def profile_names(prof)
  prof.map{|term| self.translate_id(term)}
end
1374
-
1375
-
1376
 - # Translates a bunch of profiles to their sets of term names
1377
- # ===== Parameters
1378
- # +profs+:: array of profiles
1379
- # +asArray+:: flag to indicate if results must be returned as: true => an array of tuples [ProfID, ArrayOdNames] or ; false => hashs of translations
1380
- # ===== Returns
1381
- # translated profiles
1382
# Translate a bunch of profiles to their sets of term names.
# +profs+:: array of profiles; stored profiles are used when empty
# +asArray+:: true => array of name arrays; false => hash of ID => names
# Returns the translated profiles.
def translate_profiles_ids(profs = [], asArray: true)
  profs = @profiles if profs.empty?
  profs = profs.each_with_index.map{|terms, index| [index, terms]}.to_h if profs.kind_of?(Array)
  translations = profs.map{|id, terms| [id, self.profile_names(terms)]}.to_h
  asArray ? translations.values : translations
end
1388
-
1389
-
1390
- # Includes as "observed_terms" all terms included into stored profiles
1391
- # ===== Parameters
1392
- # +reset+:: if true, reset observed freqs alreeady stored befor re-calculate
1393
# Register every term included in stored profiles as an "observed term".
# +reset+:: if true, reset already stored observed freqs before re-calculating
def add_observed_terms_from_profiles(reset: false)
  @meta.each{|term, freqs| freqs[:observed_freq] = -1} if reset
  @profiles.each{|id, terms| self.add_observed_terms(terms: terms)}
end
1397
-
1398
-
1399
- # Get a term frequency
1400
- # ===== Parameters
1401
- # +term+:: term to be checked
1402
- # +type+:: type of frequency to be returned. Allowed: [:struct_freq, :observed_freq]
1403
- # ===== Returns
1404
- # frequency of term given or nil if term is not allowed
1405
# Get a term frequency.
# +term+:: term to be checked
# +type+:: frequency type. Allowed: [:struct_freq, :observed_freq]
# Returns the frequency of the given term or nil if the term is not allowed.
def get_frequency(term, type: :struct_freq)
  info = @meta[term]
  info.nil? ? nil : info[type]
end
1409
-
1410
-
1411
- # Geys structural frequency of a term given
1412
- # ===== Parameters
1413
- # +term+:: to be checked
1414
- # ===== Returns
1415
- # structural frequency of given term or nil if term is not allowed
1416
# Get the structural frequency of a given term.
# +term+:: term to be checked
# Returns the structural frequency or nil if the term is not allowed.
def get_structural_frequency(term)
  self.get_frequency(term, type: :struct_freq)
end
1419
-
1420
-
1421
- # Gets observed frequency of a term given
1422
- # ===== Parameters
1423
- # +term+:: to be checked
1424
- # ===== Returns
1425
- # observed frequency of given term or nil if term is not allowed
1426
# Get the observed frequency of a given term.
# +term+:: term to be checked
# Returns the observed frequency or nil if the term is not allowed.
def get_observed_frequency(term)
  self.get_frequency(term, type: :observed_freq)
end
1429
-
1430
-
1431
- # Calculates frequencies of stored profiles terms
1432
- # ===== Parameters
1433
- # +ratio+:: if true, frequencies will be returned as ratios between 0 and 1.
1434
- # +literal+:: if true, literal terms will be used to calculate frequencies instead translate alternative terms
1435
- # +asArray+:: used to transform returned structure format from hash of Term-Frequency to an array of tuples [Term, Frequency]
1436
- # +translate+:: if true, term IDs will be translated to
1437
- # ===== Returns
1438
- # stored profiles terms frequencies
1439
# Calculate frequencies of the terms in the stored profiles.
# +ratio+:: if true, frequencies are returned as ratios between 0 and 1
# +literal+:: if true, count literal terms instead of using observed (alternative-merged) freqs
# +asArray+:: if true, return an array of [Term, Frequency] tuples sorted by frequency desc
# +translate+:: if true, term IDs are translated to names when possible
# Returns the stored profiles' term frequencies.
def get_profiles_terms_frequency(ratio: true, literal: true, asArray: true, translate: true)
  n_profiles = @profiles.length
  if literal
    # Count each term exactly as it appears in the profiles
    freqs = {}
    @profiles.each do |id, terms|
      terms.each do |literalTerm|
        freqs[literalTerm] = (freqs[literalTerm] || 0) + 1
      end
    end
    if ratio || translate
      freqs.keys.each do |term| # snapshot of keys: safe while mutating freqs
        freqs[term] = freqs[term].fdiv(n_profiles) if ratio
        if translate
          tr = self.translate_id(term)
          freqs[tr] = freqs.delete(term) if !tr.nil?
        end
      end
    end
    if asArray
      freqs = freqs.map{|term, freq| [term, freq]}
      freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
    end
  else # use observed freqs, which already merge alternative IDs
    freqs = @meta.select{|id, meta| meta[:observed_freq] > 0}.map{|id, meta| [id, ratio ? meta[:observed_freq].fdiv(n_profiles) : meta[:observed_freq]]}
    freqs = freqs.to_h if !asArray
    if translate
      freqs = freqs.map do |term, freq|
        tr = self.translate_id(term)
        tr.nil? ? [term, freq] : [tr, freq]
      end
    end
    if asArray
      freqs = freqs.map{|term, freq| [term, freq]}
      freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
    else
      freqs = freqs.to_h
    end
  end
  return freqs
end
1484
-
1485
-
1486
- # Clean a given profile returning cleaned set of terms and removed ancestors term.
1487
- # ===== Parameters
1488
- # +prof+:: array of terms to be checked
1489
- # ===== Returns
1490
- # two arrays, first is the cleaned profile and second is the removed elements array
1491
# Clean a given profile, removing terms that are ancestors of other profile terms.
# +prof+:: array of terms to be checked
# Returns two arrays: the cleaned profile and the removed (redundant) elements.
def remove_ancestors_from_profile(prof)
  all_ancestors = prof.flat_map{|term| self.get_ancestors(term)}.uniq
  redundant = prof.select{|term| all_ancestors.include?(term)}
  return prof - redundant, redundant
end
1496
-
1497
-
1498
- # Remove alternative IDs if official ID is present. DOES NOT REMOVE synonyms or alternative IDs of the same official ID
1499
- # ===== Parameters
1500
- # +prof+:: array of terms to be checked
1501
- # ===== Returns
1502
- # two arrays, first is the cleaned profile and second is the removed elements array
1503
# Remove alternative IDs if their official ID is also present. DOES NOT REMOVE
# synonyms or alternative IDs of the same official ID.
# +prof+:: array of terms to be checked
# Returns two arrays: the cleaned profile and the removed elements.
def remove_alternatives_from_profile(prof)
  redundant = prof.select do |term|
    @alternatives_index.include?(term) && prof.include?(@alternatives_index[term])
  end
  return prof - redundant, redundant
end
1508
-
1509
-
1510
- # Remove alternatives (if official term is present) and ancestors terms of a given profile
1511
- # ===== Parameters
1512
- # +profile+:: profile to be cleaned
1513
 - # +remove_alternatives+:: if true, alternative IDs are also removed from the cleaned profile
1514
- # ===== Returns
1515
- # cleaned profile
1516
# Remove ancestors and (optionally) redundant alternative IDs from a given profile.
# +profile+:: profile to be cleaned
# +remove_alternatives+:: if true, also drop alternatives whose official term is present
# Returns the cleaned profile.
def clean_profile(profile, remove_alternatives: true)
  cleaned, _ = self.remove_ancestors_from_profile(profile)
  cleaned, _ = self.remove_alternatives_from_profile(cleaned) if remove_alternatives
  return cleaned
end
1525
-
1526
-
1527
- # Remove alternatives (if official term is present) and ancestors terms of stored profiles
1528
- # ===== Parameters
1529
- # +store+:: if true, clenaed profiles will replace already stored profiles
1530
- # +remove_alternatives+:: if true, clenaed profiles will replace already stored profiles
1531
- # ===== Returns
1532
- # a hash with cleaned profiles
1533
# Clean every stored profile (ancestors and, optionally, alternatives).
# +store+:: if true, cleaned profiles replace the stored ones
# +remove_alternatives+:: if true, alternatives are removed when the official term is present
# Returns a hash with the cleaned profiles.
def clean_profiles(store: false, remove_alternatives: true)
  cleaned_profiles = {}
  @profiles.each{|id, terms| cleaned_profiles[id] = self.clean_profile(terms, remove_alternatives: remove_alternatives)}
  @profiles = cleaned_profiles if store
  return cleaned_profiles
end
1539
-
1540
-
1541
- # Calculates number of ancestors present (redundant) in each profile stored
1542
- # ===== Returns
1543
- # array of parentals for each profile
1544
# Count how many redundant parental terms each stored profile contains.
# Returns an array with the number of parentals for each profile.
def parentals_per_profile
  cleaned = self.clean_profiles(remove_alternatives: false)
  @profiles.map{|id, terms| terms.length - cleaned[id].length}
end
1549
-
1550
-
1551
- # Calculates mean IC of a given profile
1552
- # ===== Parameters
1553
- # +prof+:: profile to be checked
1554
- # +ic_type+:: ic_type to be used
1555
- # +zhou_k+:: special coeficient for Zhou IC method
1556
- # ===== Returns
1557
- # mean IC for a given profile
1558
# Calculate the mean IC of a given profile.
# +prof+:: profile to be checked
# +ic_type+:: IC formula to be used
# +zhou_k+:: special coefficient for the Zhou IC method
# Returns the mean IC for the given profile.
def get_profile_mean_IC(prof, ic_type: :resnik, zhou_k: 0.5)
  total = prof.map{|term| self.get_IC(term, type: ic_type, zhou_k: zhou_k)}.inject(0){|sum, x| sum + x}
  total.fdiv(prof.length)
end
1561
-
1562
-
1563
- # Calculates resnik ontology, and resnik observed mean ICs for all profiles stored
1564
- # ===== Returns
1565
- # two hashes with Profiles and IC calculated for resnik and observed resnik respectively
1566
- def get_profiles_resnik_dual_ICs
1567
- struct_ics = {}
1568
- observ_ics = {}
1569
- @profiles.each do |id, terms|
1570
- struct_ics[id] = self.get_profile_mean_IC(terms, ic_type: :resnik)
1571
- observ_ics[id] = self.get_profile_mean_IC(terms, ic_type: :resnik_observed)
1572
- end
1573
- return struct_ics.clone, observ_ics.clone
1574
- end
1575
-
1576
-
1577
- # Calculates ontology structural levels for all ontology terms
1578
- # ===== Parameters
1579
- # +calc_paths+:: calculates term paths if it's not already calculated
1580
- # +shortest_path+:: if true, level is calculated with shortest path, largest path will be used in other cases
1581
- def calc_term_levels(calc_paths: false, shortest_path: true)
1582
- if @term_paths.empty?
1583
- if calc_paths
1584
- self.calc_term_paths
1585
- else
1586
- warn('Term paths are not already loaded. Aborting dictionary calc')
1587
- end
1588
- end
1589
- if !@term_paths.empty?
1590
- byTerm = {}
1591
- byValue = {}
1592
- # Calc per term
1593
- @term_paths.each do |term, info|
1594
- level = shortest_path ? info[:shortest_path] : info[:largest_path]
1595
- if level.nil?
1596
- level = -1
1597
- else
1598
- level = level.round(0)
1599
- end
1600
- byTerm[term] = level
1601
- queryLevels = byValue[level]
1602
- if queryLevels.nil?
1603
- byValue[level] = [term]
1604
- else
1605
- byValue[level] << term
1606
- end
1607
- end
1608
- @dicts[:level] = {byTerm: byValue, byValue: byTerm} # Note: in this case, value has multiplicity and term is unique value
1609
- # Update maximum depth
1610
- @max_freqs[:max_depth] = byValue.keys.max
1611
- end
1612
- end
1613
-
1614
-
1615
- # Check if a term given is marked as obsolete
1616
- def is_obsolete? term
1617
- return @obsoletes_index.include?(term)
1618
- end
1619
-
1620
- # Check if a term given is marked as alternative
1621
- def is_alternative? term
1622
- return @alternatives_index.include?(term)
1623
- end
1624
-
1625
- # Find paths of a term following it ancestors and stores all possible paths for it and it's parentals.
1626
- # Also calculates paths metadata and stores into @term_paths
1627
- def calc_term_paths
1628
- self.calc_ancestors_dictionary if !@dicts.keys.include?(:is_a) # Calculate direct parentals dictionary if it's not already calculated
1629
- visited_terms = []
1630
- @term_paths = {}
1631
- if [:hierarchical, :sparse].include? @structureType
1632
- terms = @stanzas[:terms].keys
1633
- terms.each do |term|
1634
- if self.is_obsolete?(term) || self.is_alternative?(term) # Special case (obsoletes)
1635
- special_term = term
1636
- term = self.is_obsolete?(term) ? @obsoletes_index[term] : @alternatives_index[term]
1637
- @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
1638
- @term_paths[special_term] = @term_paths[term]
1639
- visited_terms << special_term
1640
- end
1641
-
1642
- if !visited_terms.include?(term)
1643
- @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
1644
- parentals = @dicts[:is_a][:byTerm][term]
1645
- if parentals.nil?
1646
- @term_paths[term][:paths] << [term]
1647
- else
1648
- parentals.each do |direct_parental|
1649
- if visited_terms.include? direct_parental # Use direct_parental already calculated paths
1650
- new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
1651
- else # Calculate new paths
1652
- self.expand_path(direct_parental, visited_terms)
1653
- new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
1654
- end
1655
- new_paths.each{|path| @term_paths[term][:paths] << path}
1656
- end
1657
- end
1658
- visited_terms << term
1659
- end
1660
- # Update metadata
1661
- @term_paths[term][:total_paths] = @term_paths[term][:paths].length
1662
- paths_sizes = @term_paths[term][:paths].map{|path| path.length}
1663
- @term_paths[term][:largest_path] = paths_sizes.max
1664
- @term_paths[term][:shortest_path] = paths_sizes.min
1665
- end
1666
- else
1667
- warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
1668
- end
1669
- end
1670
-
1671
-
1672
- # Recursive function whic finds paths of a term following it ancestors and stores all possible paths for it and it's parentals
1673
- # ===== Parameters
1674
- # +curr_term+:: current visited term
1675
- # +visited_terms+:: already expanded terms
1676
- def expand_path(curr_term, visited_terms)
1677
- if !visited_terms.include?(curr_term) # Not already expanded
1678
- @term_paths[curr_term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if @term_paths[curr_term].nil?
1679
- direct_parentals = @dicts[:is_a][:byTerm][curr_term]
1680
- if direct_parentals.nil? # No parents :: End of recurrence
1681
- @term_paths[curr_term][:paths] << [curr_term]
1682
- else # Expand and concat
1683
- direct_parentals.each do |ancestor|
1684
- self.expand_path(ancestor,visited_terms) if !visited_terms.include?(ancestor)
1685
- new_paths = @term_paths[ancestor][:paths].map{|path| [curr_term, path].flatten}
1686
- new_paths.each{|path| @term_paths[curr_term][:paths] << path}
1687
- end
1688
- end
1689
- visited_terms << curr_term
1690
- end
1691
- end
1692
-
1693
-
1694
- # Gets ontology levels calculated
1695
- # ===== Returns
1696
- # ontology levels calculated
1697
- def get_ontology_levels
1698
- return @dicts[:level][:byTerm].clone # By term, in this case, is Key::Level, Value::Terms
1699
- end
1700
-
1701
-
1702
- # Gets ontology level of a specific term
1703
- # ===== Returns
1704
- # Term level
1705
- def get_term_level(term)
1706
- return @dicts[:level][:byValue][term]
1707
- end
1708
-
1709
-
1710
- # Return ontology levels from profile terms
1711
- # ===== Returns
1712
- # hash of term levels (Key: level; Value: array of term IDs)
1713
- def get_ontology_levels_from_profiles(uniq = true) # TODO: remove uniq and check dependencies
1714
- profiles_terms = @profiles.values.flatten
1715
- profiles_terms.uniq! if uniq
1716
- term_freqs_byProfile = {}
1717
- profiles_terms.each do |term|
1718
- query = term_freqs_byProfile[term]
1719
- if query.nil?
1720
- term_freqs_byProfile[term] = 1
1721
- else
1722
- term_freqs_byProfile[term] += 1
1723
- end
1724
- end
1725
- levels_filtered = @dicts[:level][:byTerm].map{|level, terms| [level,terms.map{|t| profiles_terms.include?(t) ? Array.new(term_freqs_byProfile[t], t) : nil}.flatten.compact]}.select{|level, filteredTerms| !filteredTerms.empty?}.to_h
1726
- return levels_filtered
1727
- end
1728
-
1729
-
1730
- # Calculate profiles dictionary with Key= Term; Value = Profiles
1731
- def calc_profiles_dictionary
1732
- if @profiles.empty?
1733
- warn('Profiles are not already loaded. Aborting dictionary calc')
1734
- else
1735
- byTerm = {} # Key: Terms
1736
- # byValue -- Key: Profile == @profiles
1737
- @profiles.each do |id, terms|
1738
- terms.each do |term|
1739
- if byTerm.include?(term)
1740
- byTerm[term] << id
1741
- else
1742
- byTerm[term] = [id]
1743
- end
1744
- end
1745
- end
1746
- @profilesDict = byTerm
1747
- end
1748
- end
1749
-
1750
-
1751
- # Gets profiles dictionary calculated
1752
- # ===== Return
1753
- # profiles dictionary (clone)
1754
- def get_terms_linked_profiles
1755
- return @profilesDict.clone
1756
- end
1757
-
1758
-
1759
- # Get related profiles to a given term
1760
- # ===== Parameters
1761
- # +term+:: to be checked
1762
- # ===== Returns
1763
- # profiles which contains given term
1764
- def get_term_linked_profiles(term)
1765
- return @profilesDict[term]
1766
- end
1767
-
1768
-
1769
- # Gets metainfo table from a set of terms
1770
- # ===== Parameters
1771
- # +terms+:: IDs to be expanded
1772
- # +filter_alternatives+:: flag to be used in get_descendants method
1773
- # ===== Returns
1774
- # an array with triplets [TermID, TermName, DescendantsNames]
1775
- def get_childs_table(terms, filter_alternatives = false)
1776
- expanded_terms = []
1777
- terms.each do |t|
1778
- expanded_terms << [[t, self.translate_id(t)], self.get_descendants(t, filter_alternatives).map{|child| [child, self.translate_id(child)]}]
1779
- end
1780
- return expanded_terms
1781
- end
1782
-
1783
-
1784
- # Store specific relations hash given into ITEMS structure
1785
- # ===== Parameters
1786
- # +relations+:: to be stored
1787
- # +remove_old_relations+:: substitute ITEMS structure instead of merge new relations
1788
- # +expand+:: if true, already stored keys will be updated with the unique union of both sets
1789
- def load_item_relations_to_terms(relations, remove_old_relations = false, expand = false)
1790
- @items = {} if remove_old_relations
1791
- if !relations.select{|term, items| !@stanzas[:terms].include?(term)}.empty?
1792
- warn('Some terms specified are not stored into this ontology. These not correct terms will be stored too')
1793
- end
1794
- if !remove_old_relations
1795
- if !relations.select{|term, items| @items.include?(term)}.empty? && !expand
1796
- warn('Some terms given are already stored. Stored version will be replaced')
1797
- end
1798
- end
1799
- if expand
1800
- relations.each do |k,v|
1801
- if @items.keys.include?(k)
1802
- @items[k] = (@items[k] + v).uniq
1803
- else
1804
- @items[k] = v
1805
- end
1806
- end
1807
- else
1808
- @items.merge!(relations)
1809
- end
1810
- end
1811
-
1812
-
1813
- # Assign a dictionary already calculated as a items set.
1814
- # ===== Parameters
1815
- # +dictID+:: dictionary ID to be stored (:byTerm will be used)
1816
- def set_items_from_dict(dictID, remove_old_relations = false)
1817
- @items = {} if remove_old_relations
1818
- if(@dicts.keys.include?(dictID))
1819
- @items.merge(@dicts[dictID][:byTerm])
1820
- else
1821
- warn('Specified ID is not calculated. Dict will not be added as a items set')
1822
- end
1823
- end
1824
-
1825
-
1826
- # This method computes childs similarity and impute items to it parentals. To do that Item keys must be this ontology allowed terms.
1827
- # Similarity will be calculated by text extact similarity unless an ontology object will be provided. In this case, MICAs will be used
1828
- # ===== Parameters
1829
- # +ontology+:: (Optional) ontology object which items given belongs
1830
- # +minimum_childs+:: minimum of childs needed to infer relations to parental. Default: 2
1831
- # +clean_profiles+:: if true, clena_profiles ontology method will be used over inferred profiles. Only if an ontology object is provided
1832
- # ===== Returns
1833
- # void and update items object
1834
- def expand_items_to_parentals(ontology: nil, minimum_childs: 2, clean_profiles: true)
1835
- # Check item keys
1836
- if @items.empty?
1837
- warn('Items have been not provided yet')
1838
- return nil
1839
- end
1840
- targetKeys = @items.keys.select{|k| self.exists?(k)}
1841
- if targetKeys.length == 0
1842
- warn('Any item key is allowed')
1843
- return nil
1844
- elsif targetKeys.length < @items.keys.length
1845
- warn('Some item keys are not allowed')
1846
- end
1847
-
1848
- # Expand to parentals
1849
- targetKeys << targetKeys.map{|t| self.get_ancestors(t, true)}
1850
- targetKeys.flatten!
1851
- targetKeys.uniq!
1852
-
1853
- # Obtain levels (go from leaves to roots)
1854
- levels = targetKeys.map{|term| self.get_term_level(term)}
1855
- levels.compact!
1856
- levels.uniq!
1857
- levels.sort!
1858
- levels.reverse!
1859
- levels.shift # Leaves are not expandable
1860
-
1861
- # Expand from leaves to roots
1862
- levels.map do |lvl|
1863
- curr_keys = targetKeys.select{|k| self.get_term_level(k) == lvl}
1864
- curr_keys.map do |term_expand|
1865
- to_infer = []
1866
- # Obtain childs
1867
- childs = self.get_descendants(term_expand,true).select{|t| @items.keys.include?(t)}
1868
- # Expand
1869
- if childs.length > 0 && minimum_childs == 1 # Special case
1870
- to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
1871
- elsif childs.length >= minimum_childs
1872
- to_infer = Hash.new(0)
1873
- # Compare
1874
- while childs.length > 1
1875
- curr_term = childs.shift
1876
- childs.each do |compare_term|
1877
- pivot_items = @items[curr_term]
1878
- compare_items = @items[compare_term]
1879
- if ontology.nil? # Exact match
1880
- pivot_items.map do |pitem|
1881
- if compare_items.include?(pitem)
1882
- to_infer[pitem] += 2
1883
- end
1884
- end
1885
- else # Find MICAs
1886
- local_infer = Hash.new(0)
1887
- pivot_items.map do |pitem|
1888
- micas = compare_items.map{|citem| ontology.get_MICA(pitem, citem)}
1889
- maxmica = micas[0]
1890
- micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
1891
- local_infer[maxmica.first] += 1
1892
- end
1893
- compare_items.map do |citem|
1894
- micas = pivot_items.map{|pitem| ontology.get_MICA(pitem, citem)}
1895
- maxmica = micas[0]
1896
- micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
1897
- local_infer[maxmica.first] += 1
1898
- end
1899
- local_infer.each{|t,freq| to_infer[t] += freq if freq >= 2}
1900
- end
1901
- end
1902
- end
1903
- # Filter infer
1904
- to_infer = to_infer.select{|k,v| v >= minimum_childs}
1905
- end
1906
- # Infer
1907
- if to_infer.length > 0
1908
- @items[term_expand] = [] if @items[term_expand].nil?
1909
- if to_infer.kind_of?(Array)
1910
- @items[term_expand] = (@items[term_expand] + to_infer).uniq
1911
- else
1912
- @items[term_expand] = (@items[term_expand] + to_infer.keys).uniq
1913
- end
1914
- @items[term_expand] = ontology.clean_profile(@items[term_expand]) if clean_profiles && !ontology.nil?
1915
- elsif !@items.include?(term_expand)
1916
- targetKeys.delete(term_expand)
1917
- end
1918
- end
1919
- end
1920
- end
1921
-
1922
-
1923
-
1924
- # NO IDEA WHAT THIS DOES. DON'T USE THIS METHODS IS NOT CHECKED
1925
- # ===== Parameters
1926
- # ++::
1927
- # ===== Returns
1928
- # ...
1929
- def compute_relations_to_items(external_item_list, mode, thresold)
1930
- results = []
1931
- penalized_terms = {}
1932
- # terms_levels = get_terms_levels(@items_relations.keys)
1933
- terms_with_items_levels = @items_relations.keys.map{|term| self.get_term_level(term)}.uniq
1934
- terms_levels = self.get_ontology_levels().select{|k,v| terms_with_items_levels.include?(k)}
1935
- terms_levels = terms_levels.each{|level,terms| [level, terms.select{|t| @items_relations.keys.include?(t)}] } # Use only items terms. MAYBE IT'S NOT OUR TARGET (line added by fmj)
1936
- levels = terms_levels.keys.sort
1937
- levels.reverse_each do |level|
1938
- terms_levels[level].each do |term|
1939
- associated_items = @items_relations[term]
1940
- if mode == :elim
1941
- items_to_remove = penalized_terms[term]
1942
- items_to_remove = [] if items_to_remove.nil?
1943
- pval = get_fisher_exact_test(
1944
- external_item_list - items_to_remove,
1945
- associated_items - items_to_remove,
1946
- ((associated_items | external_item_list) - items_to_remove).length
1947
- )
1948
- if pval <= thresold
1949
- parents = get_parents(term) # Save the items for each parent term to remove them later in the fisher test
1950
- parents.each do |prnt|
1951
- query = penalized_terms[prnt]
1952
- if query.nil?
1953
- penalized_terms[prnt] = @items_relations[term].clone # We need a new array to store the following iterations
1954
- else
1955
- query.concat(@items_relations[term])
1956
- end
1957
- end
1958
- end
1959
- end
1960
- results << [term, pval]
1961
- end
1962
- end
1963
- return results
1964
- end
1965
-
1966
-
1967
- # Check if a given ID is a removable (blacklist) term.
1968
- # +DEPRECATED+ use is_removable? instead
1969
- # ===== Parameters
1970
- # +id+:: to be checked
1971
- # ===== Returns
1972
- # true if given term is a removable (blacklist) term or false in other cases
1973
- def is_removable(id)
1974
- warn "[DEPRECATION] `is_removable` is deprecated. Please use `is_removable?` instead."
1975
- return @removable_terms.include?(id.to_sym)
1976
- end
1977
-
1978
- # Check if a given ID is a removable (blacklist) term
1979
- # ===== Parameters
1980
- # +id+:: to be checked
1981
- # ===== Returns
1982
- # true if given term is a removable (blacklist) term or false in other cases
1983
- def is_removable? id
1984
- return @removable_terms.include?(id.to_sym)
1985
- end
1986
-
1987
- ############################################
1988
- # SPECIAL METHODS
1989
- #############################################
1990
- def ==(other)
1991
- self.header == other.header &&
1992
- self.stanzas == other.stanzas &&
1993
- self.ancestors_index == other.ancestors_index &&
1994
- self.alternatives_index == other.alternatives_index &&
1995
- self.obsoletes_index == other.obsoletes_index &&
1996
- self.structureType == other.structureType &&
1997
- self.ics == other.ics &&
1998
- self.meta == other.meta &&
1999
- self.dicts == other.dicts &&
2000
- self.profiles == other.profiles &&
2001
- self.profilesDict == other.profilesDict &&
2002
- (self.items.keys - other.items.keys).empty? &&
2003
- self.removable_terms == other.removable_terms &&
2004
- self.special_tags == other.special_tags &&
2005
- self.items == other.items &&
2006
- self.term_paths == other.term_paths &&
2007
- self.max_freqs == other.max_freqs
5
+ #########################################################
6
+ # AUTHOR NOTES
7
+ #########################################################
8
+
9
+ # 1 - Store @profiles as @stanzas[:instances]
10
+ # 2 - Items values (not keys) are imported as strings, not as symbols (maybe add a flag which indicates if values are, or not, symbols?)
11
+
12
+
13
+ #############################################
14
+ # FIELDS
15
+ #############################################
16
+ # Handled class variables
17
+ # => @@basic_tags :: hash with main OBO structure tags
18
+ # => @@allowed_calcs :: hash with allowed ICs and similaritites calcs
19
+ # => @@symbolizable_ids :: tags which can be symbolized
20
+ # => @@tags_with_trailing_modifiers :: tags which can include extra info after specific text modifiers
21
+ #
22
+ # Handled object variables
23
+ # => @header :: file header (if is available)
24
+ # => @stanzas :: OBO stanzas {:terms,:typedefs,:instances}
25
+ # => @ancestors_index :: hash of ancestors per each term handled with any structure relationships
26
+ # => @descendants_index :: hash of descendants per each term handled with any structure relationships
27
+ # => @alternatives_index :: has of alternative IDs (include alt_id and obsoletes)
28
+ # => @obsoletes_index :: hash of obsoletes and it's new ids
29
+ # => @special_tags :: set of special tags to be expanded (:is_a, :obsolete, :alt_id)
30
+ # => @structureType :: type of ontology structure depending on ancestors relationship. Allowed: {atomic, sparse, circular, hierarchical}
31
+ # => @ics :: already calculated ICs for handled terms and IC types
32
+ # => @meta :: meta_information about handled terms like [ancestors, descendants, struct_freq, observed_freq]
33
+ # => @max_freqs :: maximum freqs found for structural and observed freqs
34
+ # => @dicts :: bidirectional dictionaries with three levels <key|value>: 1º) <tag|hash2>; 2º) <(:byTerm/:byValue)|hash3>; 3º) dictionary <k|v>
35
+ # => @profiles :: set of terms assigned to an ID
36
+ # => @profilesDict :: set of profile IDs assigned to a term
37
+ # => @items :: hash with items relations to terms
38
+ # => @removable_terms :: array of terms to not be considered
39
+ # => @term_paths :: metainfo about parental paths of each term
40
+
41
+ @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:alt_id,:replaced_by,:consider]}
42
+ @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
43
+ @@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider]
44
+ @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
45
+ @@multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider, :subsetdef, :synonymtypedef, :property_value, :remark]
46
+ @@symbolizable_ids.concat(@@tags_with_trailing_modifiers)
47
+
48
+ #############################################
49
+ # CONSTRUCTOR
50
+ #############################################
51
+
52
+ # Instantiate a OBO_Handler object
53
+ # ===== Parameters
54
+ # +file+:: with info to be loaded (.obo ; .json)
55
+ # +load_file+:: activate load process automatically (only for .obo)
56
+ # +removable_terms+: term to be removed from calcs
57
+ # +build+: flag to launch metainfo calculation
58
+ def initialize(file: nil, load_file: false, removable_terms: [], build: true)
59
+ # Initialize object variables
60
+ @header = nil
61
+ @stanzas = {terms: {}, typedefs: {}, instances: {}}
62
+ @ancestors_index = {}
63
+ @descendants_index = {}
64
+ @alternatives_index = {}
65
+ @obsoletes_index = {}
66
+ @structureType = nil
67
+ @ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
68
+ @meta = {}
69
+ @special_tags = @@basic_tags.clone
70
+ @max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
71
+ @dicts = {}
72
+ @profiles = {}
73
+ @profilesDict = {}
74
+ @items = {}
75
+ @removable_terms = []
76
+ @term_paths = {}
77
+ # Load if proceeds
78
+ add_removable_terms(removable_terms) if !removable_terms.empty?
79
+ load(file, build: build) if load_file
80
+ end
81
+
82
+
83
+ #############################################
84
+ # CLASS METHODS
85
+ #############################################
86
+
87
+ # Expand a (starting) term using a specific tag and return all extended terms into an array and
88
+ # the relationship structuture observed (hierarchical or circular). If circular structure is
89
+ # foumd, extended array will be an unique vector without starting term (no loops).
90
+ # +Note+: we extremly recomend use get_related_ids_by_tag function instead of it (directly)
91
+ # ===== Parameters
92
+ # +start+:: term where start to expand
93
+ # +terms+:: set to be used to expand
94
+ # +target_tag+:: tag used to expand
95
+ # +eexpansion+:: already expanded info
96
+ # +split_info_char+:: special regex used to split info (if it is necessary)
97
+ # +split_info_indx+:: special index to take splitted info (if it is necessary)
98
+ # +alt_ids+:: set of alternative IDs
99
+ # ===== Returns
100
+ # A vector with the observed structure (string) and the array with extended terms.
101
+ def self.get_related_ids(start_id, terms, target_tag, related_ids = {}, alt_ids = {})
102
+ # Take start_id term available info and already accumulated info
103
+ current_associations = related_ids[start_id]
104
+ current_associations = [] if current_associations.nil?
105
+ return [:no_term,[]] if terms[start_id].nil?
106
+ id_relations = terms[start_id][target_tag]
107
+ return [:source,[]] if id_relations.nil?
108
+
109
+ # Prepare auxiliar variables
110
+ struct = :hierarchical
111
+
112
+ # Study direct extensions
113
+ id_relations = id_relations.clone
114
+ while id_relations.length > 0
115
+ id = id_relations.shift
116
+ id = alt_ids[id].first if alt_ids.include?(id) # NOTE: if you want to persist current ID instead source ID, re-implement this
117
+
118
+ # Handle
119
+ if current_associations.include?(id) # Check if already have been included into this expansion
120
+ struct = :circular
121
+ else
122
+ current_associations << id
123
+ if related_ids.include?(id) # Check if current already has been expanded
124
+ current_associations = current_associations | related_ids[id]
125
+ if current_associations.include?(start_id) # Check circular case
126
+ struct = :circular
127
+ [id, start_id].each{|repeated| current_associations.delete(repeated)}
128
+ end
129
+ else # Expand
130
+ related_ids[start_id] = current_associations
131
+ structExp, current_related_ids = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids) # Expand current
132
+ current_associations = current_associations | current_related_ids
133
+ struct = :circular if structExp == :circular # Check struct
134
+ if current_associations.include?(start_id) # Check circular case
135
+ struct = :circular
136
+ current_associations.delete(start_id)
137
+ end
138
+ end
139
+ end
140
+ end
141
+ related_ids[start_id] = current_associations
142
+
143
+ return struct, current_associations
144
+ end
145
+
146
+
147
+ # Expand terms using a specific tag and return all extended terms into an array and
148
+ # the relationship structuture observed (hierarchical or circular). If circular structure is
149
+ # foumd, extended array will be an unique vector without starting term (no loops)
150
+ # ===== Parameters
151
+ # +terms+:: set to be used to expand
152
+ # +target_tag+:: tag used to expand
153
+ # +split_info_char+:: special regex used to split info (if it is necessary)
154
+ # +split_info_indx+:: special index to take splitted info (if it is necessary)
155
+ # +alt_ids+:: set of alternative IDs
156
+ # +obsoletes+:: integer with the number of obsolete IDs. used to calculate structure type.
157
+ # ===== Returns
158
+ # A vector with the observed structure (string) and the hash with extended terms
159
+ def self.get_related_ids_by_tag(terms:,target_tag:, alt_ids: {}, obsoletes: 0)
160
+ # Define structure type
161
+ structType = :hierarchical
162
+ related_ids = {}
163
+ terms.each do |id, tags|
164
+ # Check if target tag is defined
165
+ if !tags[target_tag].nil?
166
+ # Obtain related terms
167
+ set_structure, _ = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids)
168
+ # Check structure
169
+ structType = :circular if set_structure == :circular
170
+ end
171
+ end
172
+
173
+ # Check special case
174
+ structType = :atomic if related_ids.length <= 0
175
+ structType = :sparse if related_ids.length > 0 && ((terms.length - related_ids.length - obsoletes) >= 2)
176
+ # Return type and hash with related_ids
177
+ return structType, related_ids
178
+ end
179
+
180
+
181
+ # Class method to transform string with <tag : info> into hash structure
182
+ # ===== Parameters
183
+ # +attributes+:: array tuples with info to be transformed into hash format
184
+ # ===== Returns
185
+ # Attributes stored into hash structure
186
+ def self.info2hash(attributes, split_char = " ! ", selected_field = 0)
187
+ # Load info
188
+ info_hash = {}
189
+ # Only TERMS multivalue tags (future add Typedefs and Instance)
190
+ # multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider]
191
+ attributes.each do |tag, value|
192
+ # Check
193
+ raise EncodingError, 'Info element incorrect format' if (tag.nil?) || (value.nil?)
194
+ # Prepare
195
+ tag = tag.lstrip.to_sym
196
+ value.lstrip!
197
+ value = value.split(split_char)[selected_field].to_sym if @@tags_with_trailing_modifiers.include?(tag)
198
+
199
+ # Store
200
+ query = info_hash[tag]
201
+ if !query.nil? # Tag already exists
202
+ if !query.kind_of?(Array) # Check that tag is multivalue
203
+ raise('Attempt to concatenate plain text with another. The tag is not declared as multivalue. [' + tag.to_s + '](' + query + ')')
204
+ else
205
+ query << value # Add new value to tag
206
+ end
207
+ else # New entry
208
+ if @@multivalue_tags.include?(tag)
209
+ info_hash[tag] = [value]
210
+ else
211
+ info_hash[tag] = value
212
+ end
213
+ end
214
+ end
215
+ self.symbolize_ids(info_hash)
216
+ return info_hash
217
+ end
218
+
219
+
220
+ # Class method to load an OBO format file (based on OBO 1.4 format). Specially focused on load
221
+ # the Header, the Terms, the Typedefs and the Instances.
222
+ # ===== Parameters
223
+ # +file+:: OBO file to be loaded
224
+ # ===== Returns
225
+ # Hash with FILE, HEADER and STANZAS info
226
+ def self.load_obo(file) #TODO: Send to obo_parser class
227
+ raise("File is not defined") if file.nil?
228
+ # Data variables
229
+ header = ''
230
+ stanzas = {terms: {}, typedefs: {}, instances: {}}
231
+ # Auxiliar variables
232
+ infoType = 'Header'
233
+ currInfo = []
234
+ stanzas_flags = %w[[Term] [Typedef] [Instance]]
235
+ # Read file
236
+ File.open(file).each do |line|
237
+ line.chomp!
238
+ next if line.empty?
239
+ fields = line.split(':', 2)
240
+ # Check if new instance is found
241
+ if stanzas_flags.include?(line)
242
+ header = self.process_entity(header, infoType, stanzas, currInfo)
243
+ # Update info variables
244
+ currInfo = []
245
+ infoType = line.gsub!(/[\[\]]/, '')
246
+ next
247
+ end
248
+ # Concat info
249
+ currInfo << fields
250
+ end
251
+ # Store last loaded info
252
+ header = self.process_entity(header, infoType, stanzas, currInfo) if !currInfo.empty?
253
+
254
+ # Prepare to return
255
+ finfo = {:file => file, :name => File.basename(file, File.extname(file))}
256
+ return finfo, header, stanzas
257
+ end
258
+
259
+
260
+ # Handle OBO loaded info and stores it into correct container and format
261
+ # ===== Parameters
262
+ # +header+:: container
263
+ # +infoType+:: current ontology item type detected
264
+ # +stanzas+:: container
265
+ # +currInfo+:: info to be stored
266
+ # ===== Returns
267
+ # header newly/already stored
268
+ def self.process_entity(header, infoType, stanzas, currInfo)
269
+ info = self.info2hash(currInfo)
270
+ # Store current info
271
+ if infoType.eql?('Header')
272
+ header = info
273
+ else
274
+ id = info[:id]
275
+ case infoType
276
+ when 'Term'
277
+ stanzas[:terms][id] = info
278
+ when 'Typedef'
279
+ stanzas[:typedefs][id] = info
280
+ when 'Instance'
281
+ stanzas[:instances][id] = info
282
+ end
283
+ end
284
+ return header
285
+ end
286
+
287
+
288
+ # Symboliza all values into hashs using symbolizable tags as keys
289
+ # ===== Parameters
290
+ # +item_hash+:: hash to be checked
291
+ def self.symbolize_ids(item_hash)
292
+ @@symbolizable_ids.each do |tag|
293
+ query = item_hash[tag]
294
+ if !query.nil?
295
+ if query.kind_of?(Array)
296
+ query.map!{|item| item.to_sym}
297
+ else
298
+ item_hash[tag] = query.to_sym if !query.nil?
299
+ end
300
+ end
301
+ end
302
+ end
303
+
304
+
305
+ #
306
+ # ===== Parameters
307
+ # +root+:: main term to expand
308
+ # +ontology+:: to be cutted
309
+ # +clone+:: if true, given ontology object will not be mutated
310
+ # +remove_up+:: if true, stores only the root term given an it descendants. If false, only root ancestors will be stored
311
+ # ===== Returns
312
+ # An Ontology object with terms after cut the ontology.
313
+ def self.mutate(root, ontology, clone: true, remove_up: true)
314
+ ontology = ontology.clone if clone
315
+ # Obtain affected IDs
316
+ descendants = ontology.descendants_index[root]
317
+ descendants << root # Store itself to do not remove it
318
+ # Remove unnecesary terms
319
+ ontology.stanzas[:terms] = ontology.stanzas[:terms].select{|id,v| remove_up ? descendants.include?(id) : !descendants.include?(id)}
320
+ ontology.ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
321
+ ontology.max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
322
+ ontology.dicts = {}
323
+ ontology.removable_terms = []
324
+ ontology.term_paths = {}
325
+ # Recalculate metadata
326
+ ontology.build_index
327
+ ontology.add_observed_terms_from_profiles
328
+ # Finish
329
+ return ontology
330
+ end
331
+
332
+
333
+
334
+ #############################################
335
+ # GENERAL METHODS
336
+ #############################################
337
+
338
+ # Include removable terms to current removable terms list
339
+ # ===== Parameters
340
+ # +terms+:: terms array to be concatenated
341
+ def add_removable_terms(terms)
342
+ terms = terms.map{|term| term.to_sym}
343
+ @removable_terms.concat(terms)
344
+ end
345
+
346
+
347
# Include removable terms to current removable terms list loading new
# terms from a one column plain text file
# ===== Parameters
# +file+:: to be loaded
def add_removable_terms_from_file(file)
  # BUGFIX: previously read from an undefined `excluded_codes_file` variable,
  # raising NameError on every call. File.foreach also closes the handle
  # (File.open without a block leaked it).
  File.foreach(file) do |line|
    line.chomp!
    @removable_terms << line.to_sym
  end
end
357
+
358
+
359
# Increase observed frequency for a specific term
# ===== Parameters
# +term+:: term which frequency is going to be increased
# +increase+:: frequency rate to be increased. Default = 1
# ===== Return
# true if process ends without errors, false in other cases
def add_observed_term(term:,increase: 1.0)
  raise ArgumentError, "Term given is NIL" if term.nil?
  # Unknown or removable terms are silently skipped
  return false if !@stanzas[:terms].include?(term) || @removable_terms.include?(term)
  blank_record = lambda{ {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} }
  if @alternatives_index.include?(term)
    # Alternative IDs share the metadata record of their official term
    official = @alternatives_index[term]
    @meta[official] = blank_record.call if @meta[official].nil?
    @meta[term] = @meta[official]
  end
  @meta[term] = blank_record.call if @meta[term].nil?
  record = @meta[term]
  # Add frequency (a -1 sentinel means "never observed")
  record[:observed_freq] = 0 if record[:observed_freq] == -1
  record[:observed_freq] += increase
  # Keep the running maximum up to date
  @max_freqs[:observed_freq] = record[:observed_freq] if @max_freqs[:observed_freq] < record[:observed_freq]
  return true
end
384
+
385
+
386
# Increase the arbitrary frequency of a given term set
# ===== Parameters
# +terms+:: set of terms to be updated
# +increase+:: amount to be increased
# +transform_to_sym+:: if true, transform observed terms to symbols. Default: false
# ===== Return
# array with one boolean per term (result of each add_observed_term call)
def add_observed_terms(terms:, increase: 1.0, transform_to_sym: false)
  raise ArgumentError, 'Terms array given is NIL' if terms.nil?
  raise ArgumentError, 'Terms given is not an array' unless terms.is_a?(Array)
  terms.map do |term_id|
    term_id = term_id.to_sym if transform_to_sym
    self.add_observed_term(term: term_id, increase: increase)
  end
end
405
+
406
+
407
# Compare two term sets by best-match averaging of pairwise similitudes
# ===== Parameters
# +termsA+:: set to be compared
# +termsB+:: set to be compared
# +sim_type+:: similitude method to be used. Default: resnik
# +ic_type+:: ic type to be used. Default: resnik
# +bidirectional+:: calculate bidirectional similitude. Default: true
# ===== Return
# similitude calculated
def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
  raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
  raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
  # A -> B: for each term of A keep the best similitude against any term of B
  best_matches = termsA.map do |term_A|
    pair_sims = termsB.map{|term_B| self.get_similarity(term_A, term_B, type: sim_type, ic_type: ic_type)}
    pair_sims.reject!{|val| val.nil? || val == false} # drop error markers
    pair_sims.empty? ? 0 : pair_sims.max
  end
  means_sim = best_matches.sum.to_f / best_matches.size
  if bidirectional
    # Size-weighted average of both directions (A->B and B->A)
    weighted_AB = means_sim * best_matches.size
    weighted_BA = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false) * termsB.size
    means_sim = (weighted_AB + weighted_BA) / (termsA.size + termsB.size)
  end
  return means_sim
end
440
+
441
+
442
# Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with itself
# ===== Parameters
# +external_profiles+:: set of external profiles. If nil, internal profiles will be compared with itself
# +sim_type+:: similitude method to be used. Default: resnik
# +ic_type+:: ic type to be used. Default: resnik
# +bidirectional+:: calculate bidirectional similitude. Default: true
# ===== Return
# Similitudes calculated, as a nested hash {main_id => {comp_id => similitude}}
def compare_profiles(external_profiles: nil, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
  profiles_similarity = {} #calculate similarity between patients profile
  profiles_ids = @profiles.keys
  if external_profiles.nil?
    # NOTE(review): in this branch comp_ids and main_ids alias the SAME array,
    # so the destructive `shift` below also shrinks comp_ids: each unordered
    # pair is visited once and a profile is never compared with itself.
    # Presumably intentional (triangular matrix) -- confirm before expecting
    # a full square result.
    comp_ids = profiles_ids
    comp_profiles = @profiles
    main_ids = comp_ids
    main_profiles = comp_profiles
  else
    comp_ids = external_profiles.keys
    comp_profiles = external_profiles
    main_ids = profiles_ids
    main_profiles = @profiles
  end
  # Compare every main profile against the (remaining) comparison profiles
  while !main_ids.empty?
    curr_id = main_ids.shift # destructive: consumes the ids array
    current_profile = main_profiles[curr_id]
    comp_ids.each do |id|
      profile = comp_profiles[id]
      value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional)
      query = profiles_similarity[curr_id]
      if query.nil? # first result for this main profile
        profiles_similarity[curr_id] = {id => value}
      else
        query[id] = value
      end
    end
  end
  return profiles_similarity
end
481
+
482
+
483
# Expand alternative IDs arround all already stored terms
# ===== Parameters
# +alt_tag+:: tag used to expand alternative IDs
# ===== Returns
# the updated stanzas terms hash (alternative IDs merged in)
def get_index_alternatives(alt_tag: @@basic_tags[:alternative][0])
  raise('stanzas terms empty') if @stanzas[:terms].empty?
  # Collect alternative IDs; additions are deferred so we never mutate the
  # hash while iterating it
  pending_terms = {}
  @stanzas[:terms].each do |official_id, tags|
    alternative_ids = tags[alt_tag]
    next if alternative_ids.nil?
    (alternative_ids - @removable_terms).each do |alt_id|
      @alternatives_index[alt_id] = official_id
      pending_terms[alt_id] = @stanzas[:terms][official_id] unless @stanzas[:terms].include?(alt_id)
      @ancestors_index[alt_id] = @ancestors_index[official_id] unless @ancestors_index[official_id].nil?
    end
  end
  @stanzas[:terms].merge!(pending_terms)
end
507
+
508
+
509
# Executes basic expansions of tags (alternatives, obsoletes and parentals) with default values
# ===== Returns
# true if process ends without errors and false in other cases
def build_index()
  get_index_alternatives
  get_index_obsoletes
  get_index_child_parent_relations
  # Normalize every index so it only references canonical, existing term IDs
  @alternatives_index.transform_values!{|val| extract_id(val)}
  @alternatives_index.compact!
  @obsoletes_index.transform_values!{|val| extract_id(val)}
  @obsoletes_index.compact!
  @ancestors_index.transform_values!{|family| family.map{|t| extract_id(t)}.compact}
  @ancestors_index.compact!
  @descendants_index.transform_values!{|family| family.map{|t| extract_id(t)}.compact}
  @descendants_index.compact!
  get_index_frequencies
  calc_dictionary(:name)
  calc_dictionary(:synonym, select_regex: /\"(.*)\"/)
  calc_term_levels(calc_paths: true)
end
529
+
530
+
531
# Calculates regular frequencies based on ontology structure (using parentals)
# ===== Returns
# the stanzas terms hash after filling @meta, or nil (with a warning) if no ancestors index
def get_index_frequencies()
  return warn('ancestors_index object is empty') if @ancestors_index.empty?
  alternative_terms = @alternatives_index.keys
  # Per each term, add frequencies
  @stanzas[:terms].each do |term_id, _tags|
    if @alternatives_index.include?(term_id)
      # Alternative IDs share the official term record and do not add
      # structural frequencies of their own
      official_id = @alternatives_index[term_id]
      record = (@meta[official_id] ||= {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0})
      @meta[term_id] = record
    else # Official term
      record = (@meta[term_id] ||= {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0})
      # Count only official (non-alternative) relatives
      record[:ancestors] = @ancestors_index.include?(term_id) ? @ancestors_index[term_id].count{|anc| !alternative_terms.include?(anc)}.to_f : 0.0
      record[:descendants] = @descendants_index.include?(term_id) ? @descendants_index[term_id].count{|desc| !alternative_terms.include?(desc)}.to_f : 0.0
      record[:struct_freq] = record[:descendants] + 1.0
      # Update running maximums
      @max_freqs[:struct_freq] = record[:struct_freq] if @max_freqs[:struct_freq] < record[:struct_freq]
      @max_freqs[:max_depth] = record[:descendants] if @max_freqs[:max_depth] < record[:descendants]
    end
  end
end
569
+
570
+
571
# Expand obsoletes set and link info to their alternative IDs
# ===== Parameters
# +obs_tag+:: tag to be used to find obsoletes
# +alt_tags+:: tags to find alternative IDs (if are available)
# ===== Returns
# the stanzas terms hash, or nil (with a warning) if stanzas are empty
def get_index_obsoletes(obs_tag: @@basic_tags[:obsolete], alt_tags: @@basic_tags[:alternative])
  if @stanzas[:terms].empty?
    warn('stanzas terms empty')
  else
    # Check obsoletes
    @stanzas[:terms].each do |term_id, term_tags|
      next if term_tags.nil?
      next unless term_tags[obs_tag] == 'true' # not flagged as obsolete
      next if !@obsoletes_index[term_id].nil? # Already stored
      # Look for a replacement among the alternative tags
      replacement_lists = alt_tags.map{|alt_tag| term_tags[alt_tag]}.compact
      next if replacement_lists.empty?
      replacement = replacement_lists.first.first # FIRST tag, FIRST id
      @alternatives_index[term_id] = replacement
      @obsoletes_index[term_id] = replacement
    end
  end
end
600
+
601
+
602
# Expand parentals set and link all info to their alternative IDs. Also launch frequencies process
# ===== Parameters
# +tag+:: tag used to expand parentals
# ===== Returns
# nothing meaningful; fills @ancestors_index, @descendants_index and @structureType
def get_index_child_parent_relations(tag: @@basic_tags[:ancestors][0])
  # Check
  if @stanzas[:terms].nil?
    warn('stanzas terms empty')
  else
    # Expand parental relations through the class helper.
    # NOTE(review): get_related_ids_by_tag is defined elsewhere; presumably it
    # returns [structure_type_symbol, {id => [parent_ids]}] -- confirm there.
    structType, parentals = self.class.get_related_ids_by_tag(terms: @stanzas[:terms],
      target_tag: tag,
      alt_ids: @alternatives_index,
      obsoletes: @obsoletes_index.length)
    # Check
    raise('Error expanding parentals') if (structType.nil?) || parentals.nil?
    # Prepare ancestors structure (anc) and its inverse (des)
    anc = {}
    des = {}
    parentals.each do |id, parents|
      parents = parents - @removable_terms # removable terms never appear as parents
      anc[id] = parents
      parents.each do |anc_id| # Add descendants
        if !des.include?(anc_id)
          des[anc_id] = [id]
        else
          des[anc_id] << id
        end
      end
    end
    # Alternative IDs share (alias) the arrays of their official term
    @alternatives_index.each do |id,alt|
      anc[id] = anc[alt] if anc.include?(alt)
      des[id] = des[alt] if des.include?(alt)
    end
    # Collapse any non-atomic/non-sparse structure to :circular or :hierarchical.
    # NOTE(review): the ternary below maps :circular to itself, so effectively
    # everything else becomes :hierarchical -- looks redundant but kept as-is.
    if ![:atomic,:sparse].include? structType
      structType = structType == :circular ? :circular : :hierarchical
    end
    # Store
    @ancestors_index = anc
    @descendants_index = des
    @structureType = structType
  end
  # Finish
end
651
+
652
+
653
# Find ancestors of a given term
# ===== Parameters
# +term+:: to be checked
# +filter_alternatives+:: if true, remove alternatives from final results
# ===== Returns
# an array with all ancestors of given term (empty if none are indexed)
def get_ancestors(term, filter_alternatives = false)
  get_familiar(term, true, filter_alternatives)
end
662
+
663
+
664
# Find descendants of a given term
# ===== Parameters
# +term+:: to be checked
# +filter_alternatives+:: if true, remove alternatives from final results
# ===== Returns
# an array with all descendants of given term (empty if none are indexed)
def get_descendants(term, filter_alternatives = false)
  get_familiar(term, false, filter_alternatives)
end
673
+
674
+
675
# Find ancestors/descendants of a given term
# ===== Parameters
# +term+:: to be checked
# +return_ancestors+:: return ancestors if true or descendants if false
# +filter_alternatives+:: if true, remove alternatives from final results
# ===== Returns
# an array with all ancestors/descendants of given term (empty if none are indexed)
def get_familiar(term, return_ancestors = true, filter_alternatives = false)
  index = return_ancestors ? @ancestors_index : @descendants_index
  relatives = index[term]
  return [] if relatives.nil?
  relatives = relatives.clone # never expose the internal index arrays
  relatives.reject!{|relative| @alternatives_index.include?(relative)} if filter_alternatives
  return relatives
end
695
+
696
+
697
# Obtain IC of an specific term
# ===== Parameters
# +termRaw+:: term which IC will be calculated
# +type+:: of IC to be calculated. Default: resnik
# +force+:: force re-calculate the IC. Do not check if it is already calculated
# +zhou_k+:: special coeficient for Zhou IC method
# ===== Returns
# the IC calculated
def get_IC(termRaw, type: :resnik, force: false, zhou_k: 0.5)
  term = termRaw.to_sym
  # Check
  raise ArgumentError, "IC type specified (#{type}) is not allowed" if !@@allowed_calcs[:ics].include?(type)
  # Check if it's already calculated (memoized in @ics)
  return @ics[type][term] if (@ics[type].include? term) && !force
  # Calculate
  ic = - 1
  case type # https://arxiv.org/ftp/arxiv/papers/1310/1310.8059.pdf ||| https://sci-hub.st/https://doi.org/10.1016/j.eswa.2012.01.082
    ###########################################
    #### STRUCTURE BASED METRICS
    ###########################################
    # Shortest path
    # Weighted Link
    # Hirst and St-Onge Measure
    # Wu and Palmer
    # Slimani
    # Li
    # Leacock and Chodorow
    ###########################################
    #### INFORMATION CONTENT METRICS
    ###########################################
    when :resnik # Resnik P: Using Information Content to Evaluate Semantic Similarity in a Taxonomy
      # -log(Freq(x) / Max_Freq)
      ic = -Math.log10(@meta[term][:struct_freq].fdiv(@max_freqs[:struct_freq]))
    when :resnik_observed
      # -log(Freq(x) / Max_Freq)
      ic = -Math.log10(@meta[term][:observed_freq].fdiv(@max_freqs[:observed_freq]))
    # Lin
    # Jiang & Conrath

    ###########################################
    #### FEATURE-BASED METRICS
    ###########################################
    # Tversky
    # x-similarity
    # Rodirguez

    ###########################################
    #### HYBRID METRICS
    ###########################################
    when :seco, :zhou # SECO:: An intrinsic information content metric for semantic similarity in WordNet
      # 1 - ( log(hypo(x) + 1) / log(max_nodes) )
      ic = 1 - Math.log10(@meta[term][:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
      # BUGFIX: was `if :zhou` -- a literal symbol, always truthy -- so :seco
      # requests also computed, returned and cached the Zhou value.
      if type == :zhou # New Model of Semantic Similarity Measuring in Wordnet
        # k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
        @ics[:seco][term] = ic # Special store
        ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(@meta[term][:descendants]).fdiv(Math.log10(@max_freqs[:max_depth])))
      end
    when :sanchez # Semantic similarity estimation in the biomedical domain: An ontology-basedinformation-theoretic perspective
      ic = -Math.log10((@meta[term][:descendants].fdiv(@meta[term][:ancestors]) + 1.0).fdiv(@max_freqs[:max_depth] + 1.0))
    # Knappe
  end
  @ics[type][term] = ic
  return ic
end
761
+
762
+
763
# Calculates and return resnik ICs (by ontology and observed frequency) for observed terms
# ===== Returns
# two hashes with resnik and resnik_observed ICs for observed terms
def get_observed_ics_by_onto_and_freq
  resnik = {}
  resnik_observed = {}
  unless @profiles.empty? # nothing observed yet -> two empty hashes
    observed_terms = @profiles.values.flatten.uniq
    # Make sure both IC flavours are cached for every observed term
    observed_terms.each{|term| get_IC(term)}
    observed_terms.each{|term| get_IC(term, type: :resnik_observed)}
    resnik = @ics[:resnik].select{|term, _ic| observed_terms.include?(term)}
    resnik_observed = @ics[:resnik_observed].select{|term, _ic| observed_terms.include?(term)}
  end
  return resnik.clone, resnik_observed.clone
end
781
+
782
+
783
# Find the IC of the Most Informative Common Ancestor (MICA) of two given terms
# ===== Parameters
# +termA+:: term to be cheked
# +termB+:: term to be checked
# +ic_type+:: IC formula to be used
# ===== Returns
# the IC of the MICA(termA,termB), or nil if no MICA exists
def get_ICMICA(termA, termB, ic_type = :resnik)
  mica_term, mica_ic = self.get_MICA(termA, termB, ic_type)
  return mica_term.nil? ? nil : mica_ic
end
794
+
795
+
796
# Find the Most Informative Common Ancestor (MICA) of two given terms
# ===== Parameters
# +termA+:: term to be cheked
# +termB+:: term to be checked
# +ic_type+:: IC formula to be used
# ===== Returns
# the MICA(termA,termB) and it's IC as [term, ic]; [nil, -1.0] if none found
def get_MICA(termA, termB, ic_type = :resnik)
  # Resolve alternative IDs to their official terms first
  termA = @alternatives_index.fetch(termA, termA)
  termB = @alternatives_index.fetch(termB, termB)
  # Special case: a term is its own MICA
  return [termA, self.get_IC(termA, type: ic_type)] if termA.eql?(termB)
  mica = [nil,-1.0]
  anc_A = self.get_ancestors(termA)
  anc_B = self.get_ancestors(termB)
  unless anc_A.empty? && anc_B.empty?
    # Include the terms themselves so "A is ancestor of B" cases are covered
    shared_ancestors = (anc_A << termA) & (anc_B << termB)
    shared_ancestors.each do |anc|
      anc_ic = self.get_IC(anc, type: ic_type)
      mica = [anc, anc_ic] if anc_ic > mica[1]
    end
  end
  return mica
end
833
+
834
+
835
# Calculate similarity between two given terms
# ===== Parameters
# +termA+:: to be compared
# +termB+:: to be compared
# +type+:: similitude formula to be used
# +ic_type+:: IC formula to be used
# ===== Returns
# the similarity between both terms, or nil if no MICA is available
def get_similarity(termA, termB, type: :resnik, ic_type: :resnik)
  raise ArgumentError, "SIM type specified (#{type}) is not allowed" if !@@allowed_calcs[:sims].include?(type)
  mica_ic = get_ICMICA(termA, termB, ic_type)
  return nil if mica_ic.nil?
  case type
  when :resnik
    mica_ic
  when :lin
    (2.0 * mica_ic).fdiv(self.get_IC(termA, type: ic_type) + self.get_IC(termB, type: ic_type))
  when :jiang_conrath # This is not a similarity, this is a disimilarity (distance)
    (self.get_IC(termA, type: ic_type) + self.get_IC(termB, type: ic_type)) - (2.0 * mica_ic)
  end
end
861
+
862
+
863
# Method used to load information stored into an OBO file and store it into this object.
# ===== Parameters
# +file+:: OBO file path to be loaded
# +build+:: if true, calculate indexes after loading. Default: true
def load(file, build: true)
  _format, obo_header, obo_stanzas = self.class.load_obo(file)
  @header = obo_header
  @stanzas = obo_stanzas
  remove_removable() # drop flagged terms before indexing
  build_index() if build
end
875
+
876
# Deletes every term listed in @removable_terms from the stanzas terms set
def remove_removable()
  return if @removable_terms.empty?
  @removable_terms.each{|removable_id| @stanzas[:terms].delete(removable_id)}
end
880
+
881
+
882
# Exports an OBO_Handler object in json format
# ===== Parameters
# +file+:: where info will be stored
def write(file)
  # Snapshot of every internal field this object maintains
  state = {
    header: @header,
    stanzas: @stanzas,
    ancestors_index: @ancestors_index,
    descendants_index: @descendants_index,
    alternatives_index: @alternatives_index,
    obsoletes_index: @obsoletes_index,
    structureType: @structureType,
    ics: @ics,
    meta: @meta,
    special_tags: @special_tags,
    max_freqs: @max_freqs,
    dicts: @dicts,
    profiles: @profiles,
    profilesDict: @profilesDict,
    items: @items,
    removable_terms: @removable_terms,
    term_paths: @term_paths
  }
  # Convert to JSON format & write
  File.write(file, state.to_json)
end
907
+
908
+
909
# Check if a given value can be parsed as a Float
# ===== Parameters
# +string+:: value to be checked
# ===== Return
# true if parseable as a number, false in other cases
def is_number? string
  # Kernel#Float with exception: false replaces the inline-rescue
  # anti-pattern (which silently swallowed ANY StandardError).
  !!Float(string, exception: false)
end
912
+
913
+
914
# Read a JSON file with an OBO_Handler object stored
# ===== Parameters
# +file+:: with object info
# +build+:: if true, calculate indexes after loading. Default: true
# ===== Return
# OBO_Handler internal fields
def read(file, build: true)
  # Read file. File.read avoids leaking the handle (File.open without a
  # block was never closed).
  jsonInfo = JSON.parse(File.read(file), :symbolize_names => true)
  # Pre-process (Symbolize some hashs values)
  jsonInfo[:stanzas][:terms].map{|id,info| self.class.symbolize_ids(info)} # STANZAS
  jsonInfo[:stanzas][:typedefs].map{|id,info| self.class.symbolize_ids(info)}
  jsonInfo[:stanzas][:instances].map{|id,info| self.class.symbolize_ids(info)}
  # Optional fields: symbolize only when present
  jsonInfo[:alternatives_index] = jsonInfo[:alternatives_index].map{|id,value| [id, value.to_sym]}.to_h unless jsonInfo[:alternatives_index].nil?
  jsonInfo[:ancestors_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}} unless jsonInfo[:ancestors_index].nil?
  jsonInfo[:descendants_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}} unless jsonInfo[:descendants_index].nil?
  jsonInfo[:obsoletes_index] = jsonInfo[:obsoletes_index].map{|id,value| [id, value.to_sym]}.to_h unless jsonInfo[:obsoletes_index].nil?
  jsonInfo[:dicts] = jsonInfo[:dicts].each do |flag, dictionaries|
    next if dictionaries.nil?
    # Special case: byTerm
    dictionaries[:byTerm] = dictionaries[:byTerm].map do |term, value|
      if !term.to_s.scan(/\A[-+]?[0-9]*\.?[0-9]+\Z/).empty? # Numeric dictionary
        [term.to_s.to_i, value.map{|term| term.to_sym}]
      elsif value.is_a? Numeric # Numeric dictionary
        [term.to_sym, value]
      elsif value.kind_of?(Array) && flag == :is_a
        [term.to_sym, value.map{|v| v.to_sym}]
      else
        [term.to_sym, value]
      end
    end
    dictionaries[:byTerm] = dictionaries[:byTerm].to_h
    # By value
    dictionaries[:byValue] = dictionaries[:byValue].map do |value, term|
      if value.is_a? Numeric # Numeric dictionary
        [value, term.to_sym]
      elsif term.is_a? Numeric # Numeric dictionary
        [value.to_s.to_sym, term]
      elsif flag == :is_a
        [value.to_sym, term.map{|v| v.to_sym}]
      elsif term.kind_of?(Array)
        [value.to_sym, term.map{|t| t.to_sym}]
      else
        [value.to_s, term.to_sym]
      end
    end
    dictionaries[:byValue] = dictionaries[:byValue].to_h
  end
  if !jsonInfo[:profiles].nil?
    jsonInfo[:profiles].map{|id,terms| terms.map!{|term| term.to_sym}}
    jsonInfo[:profiles].keys.map{|id| jsonInfo[:profiles][id.to_s.to_i] = jsonInfo[:profiles].delete(id) if self.is_number?(id.to_s)}
  end
  # BUGFIX: the previous `id.to_sym if !id.is_a?(Numeric)` replaced every
  # Numeric id with nil; keep numeric ids untouched instead.
  jsonInfo[:profilesDict].map{|term,ids| ids.map!{|id| id.is_a?(Numeric) ? id : id.to_sym}} unless jsonInfo[:profilesDict].nil?
  jsonInfo[:removable_terms] = jsonInfo[:removable_terms].map{|term| term.to_sym} unless jsonInfo[:removable_terms].nil?
  jsonInfo[:special_tags] = jsonInfo[:special_tags].each do |k, v|
    next if v.nil?
    if v.kind_of?(Array)
      jsonInfo[:special_tags][k] = v.map{|tag| tag.to_sym}
    else
      jsonInfo[:special_tags][k] = v.to_sym
    end
  end
  jsonInfo[:items].each{|k,v| jsonInfo[:items][k] = v.map{|item| item.to_sym}} unless jsonInfo[:items].nil?
  jsonInfo[:term_paths].each{|term,info| jsonInfo[:term_paths][term][:paths] = info[:paths].map{|path| path.map{|t| t.to_sym}}} unless jsonInfo[:term_paths].nil?

  # Store info
  @header = jsonInfo[:header]
  @stanzas = jsonInfo[:stanzas]
  @ancestors_index = jsonInfo[:ancestors_index]
  @descendants_index = jsonInfo[:descendants_index]
  @alternatives_index = jsonInfo[:alternatives_index]
  @obsoletes_index = jsonInfo[:obsoletes_index]
  jsonInfo[:structureType] = jsonInfo[:structureType].to_sym unless jsonInfo[:structureType].nil?
  @structureType = jsonInfo[:structureType]
  @ics = jsonInfo[:ics]
  @meta = jsonInfo[:meta]
  @special_tags = jsonInfo[:special_tags]
  @max_freqs = jsonInfo[:max_freqs]
  @dicts = jsonInfo[:dicts]
  @profiles = jsonInfo[:profiles]
  @profilesDict = jsonInfo[:profilesDict]
  @items = jsonInfo[:items]
  @removable_terms = jsonInfo[:removable_terms]
  @term_paths = jsonInfo[:term_paths]

  self.build_index() if build
end
1003
+
1004
+
1005
# Check if a given ID is stored as term into this object
# ===== Parameters
# +id+:: to be checked
# ===== Return
# True if term is allowed or false in other cases
def exists? id
  stanzas[:terms].include?(id)
end
1013
+
1014
+
1015
# This method assumes that a text given contains an allowed ID. And will try to obtain it splitting it
# ===== Parameters
# +text+:: to be checked
# +splitBy+:: delimiter used to split the text. Default: blank space
# ===== Return
# The correct ID if it can be found or nil in other cases
def extract_id(text, splitBy: ' ')
  return text if self.exists?(text)
  # Fall back to the first chunk of the text, symbolized
  candidate = text.to_s.split(splitBy).first.to_sym
  self.exists?(candidate) ? candidate : nil
end
1028
+
1029
+
1030
# Generate a bidirectinal dictionary set using a specific tag and terms stanzas set
# This functions stores calculated dictionary into @dicts field.
# This functions stores first value for multivalue tags
# This function does not handle synonyms for byValue dictionaries
# ===== Parameters
# +tag+:: to be used to calculate dictionary
# +select_regex+:: gives a regfex that can be used to modify value to be stored
# +substitute_alternatives+:: flag used to indicate if alternatives must, or not, be replaced by it official ID
# +store_tag+:: flag used to store dictionary. If nil, mandatory tag given will be used
# +multiterm+:: if true, byValue will allows multi-term linkage (array)
# +self_type_references+:: if true, program assumes that refrences will be between Ontology terms, and it term IDs will be checked
# ===== Return
# void. And stores calcualted bidirectional dictonary into dictionaries main container
def calc_dictionary(tag, select_regex: nil, substitute_alternatives: true, store_tag: nil, multiterm: false, self_type_references: false)
  tag = tag.to_sym
  store_tag = tag if store_tag.nil?
  if @stanzas[:terms].empty?
    warn('Terms are not already loaded. Aborting dictionary calc')
  else
    byTerm = {}  # term id => [values]
    byValue = {} # value => term id (or [term ids] when multiterm)
    # Calc per term
    @stanzas[:terms].each do |term, tags|
      referenceTerm = term
      if @alternatives_index.include?(term) && substitute_alternatives # Special case
        # Map alternative IDs onto their official term, unless that official term is obsolete
        referenceTerm = @alternatives_index[term] if !@obsoletes_index.include?(@alternatives_index[term])
      end
      queryTag = tags[tag]
      if !queryTag.nil?
        # Pre-process: keep only the first regex capture of each value
        if !select_regex.nil?
          if queryTag.kind_of?(Array)
            queryTag = queryTag.map{|value| value.scan(select_regex).first}
            queryTag.flatten!
          else
            queryTag = queryTag.scan(select_regex).first
          end
          queryTag.compact!
        end
        if queryTag.kind_of?(Array) # Store
          if !queryTag.empty?
            if byTerm.include?(referenceTerm)
              byTerm[referenceTerm] = (byTerm[referenceTerm] + queryTag).uniq
            else
              byTerm[referenceTerm] = queryTag
            end
            if multiterm
              queryTag.each do |value|
                byValue[value] = [] if byValue[value].nil?
                byValue[value] << referenceTerm
              end
            else
              # NOTE: later terms overwrite earlier ones for a repeated value
              queryTag.each{|value| byValue[value] = referenceTerm}
            end
          end
        else # scalar tag value
          if byTerm.include?(referenceTerm)
            byTerm[referenceTerm] = (byTerm[referenceTerm] + [queryTag]).uniq
          else
            byTerm[referenceTerm] = [queryTag]
          end
          if multiterm
            byValue[queryTag] = [] if byValue[queryTag].nil?
            byValue[queryTag] << referenceTerm
          else
            byValue[queryTag] = referenceTerm
          end
        end
      end
    end

    # Check self-references: replace stored values by canonical term IDs when
    # they actually name ontology terms (map is used for its side effects here)
    if self_type_references
      byTerm.map do |term, references|
        corrected_references = references.map do |t|
          checked = self.extract_id(t)
          if checked.nil?
            t
          else
            byValue[checked] = byValue.delete(t) if checked != t && !byValue.keys.include?(checked) # Update in byValue
            checked
          end
        end
        byTerm[term] = corrected_references.uniq
      end
    end

    # Check order: make sure the original stanza value(s) come first in byTerm
    byTerm.map do |term,values|
      if self.exists?(term)
        referenceValue = @stanzas[:terms][term][tag]
        if !referenceValue.nil?
          # Re-apply the same pre-processing used above so both sides compare equal
          if !select_regex.nil?
            if referenceValue.kind_of?(Array)
              referenceValue = referenceValue.map{|value| value.scan(select_regex).first}
              referenceValue.flatten!
            else
              referenceValue = referenceValue.scan(select_regex).first
            end
            referenceValue.compact!
          end
          if self_type_references
            if referenceValue.kind_of?(Array)
              aux = referenceValue.map{|t| self.extract_id(t)}
            else
              aux = self.extract_id(referenceValue)
            end
            referenceValue = aux if !aux.nil?
          end
          referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
          byTerm[term] = referenceValue + (values - referenceValue)
        end
      end
    end

    # Store
    @dicts[store_tag] = {byTerm: byTerm, byValue: byValue}
  end
end
1149
+
1150
+
1151
# Calculates the :is_a dictionary keeping alternative IDs as-is (no substitution)
def calc_ancestors_dictionary
  calc_dictionary(:is_a, substitute_alternatives: false, multiterm: true, self_type_references: true)
end
1155
+
1156
+
1157
# Translate a given value using an already calcualted dictionary
# ===== Parameters
# +toTranslate+:: value to be translated using dictiontionary
# +tag+:: used to generate the dictionary
# +byValue+:: boolean flag to indicate if dictionary must be used values as keys or terms as keys. Default: values as keys = true
# ===== Return
# translation
def translate(toTranslate, tag, byValue: true)
  if byValue
    @dicts[tag][:byValue][toTranslate]
  else
    @dicts[tag][:byTerm][get_main_id(toTranslate)]
  end
end
1169
+
1170
+
1171
# Translate a name to a term, trying the :name dictionary first and
# falling back to :synonym when no match is found.
# ===== Parameters
# +name+:: name to be translated
# ===== Return
# translated term or nil if it's not stored into this ontology
def translate_name(name)
  result = self.translate(name, :name)
  result = self.translate(name, :synonym) if result.nil?
  result
end
1181
+
1182
+
1183
# Translate a batch of names.
# ===== Parameters
# +names+:: array of names to be translated
# ===== Return
# two arrays: successful translations and names which couldn't be translated
def translate_names(names)
  translated = []
  rejected = []
  names.each do |query|
    result = self.translate_name(query)
    result.nil? ? rejected << query : translated << result
  end
  return translated, rejected
end
1201
+
1202
+
1203
# Translates a term ID to its assigned main name.
# ===== Parameters
# +id+:: ID to be translated
# ===== Return
# first stored name, or nil if the ID is not included into this ontology
def translate_id(id)
  names = self.translate(id, :name, byValue: false)
  names.nil? ? nil : names.first
end
1212
+
1213
+
1214
# Translates several IDs and returns translations plus the list of IDs
# that could not be translated.
# ===== Parameters
# +ids+:: IDs to be translated
# ===== Return
# two arrays: translations, and the IDs which couldn't be translated
def translate_ids(ids)
  translated = []
  rejected = []
  ids.each do |term_id|
    tr = self.translate_id(term_id.to_sym)
    if !tr.nil?
      translated << tr
    else
      # Bug fix: the original pushed +tr+ here, which is always nil in this
      # branch; keep the offending input ID instead so callers can report it.
      rejected << term_id
    end
  end
  return translated, rejected
end
1232
+
1233
+
1234
# ===== Returns
# the main ID assigned to a given ID. A non alternative/obsolete ID maps to itself
# ===== Parameters
# +id+:: ID to be resolved
# ===== Return
# main ID related to the given ID; nil when the ID is not an allowed ID
def get_main_id(id)
  return nil if !@stanzas[:terms].include? id
  new_id = id
  mainID = @alternatives_index[id]
  # Bug fix: use the short-circuit boolean operator && instead of bitwise &,
  # so the obsoletes lookup is skipped entirely when there is no alternative.
  new_id = mainID if !mainID.nil? && !@obsoletes_index.include?(mainID)
  return new_id
end
1247
+
1248
+
1249
# Check a set of IDs, separating official ontology terms from unknown ones.
# ===== Parameters
# +ids+:: IDs to be checked
# +substitute+:: when true, accepted IDs are resolved to their main ID
# ===== Return
# two arrays with allowed and rejected IDs respectively
def check_ids(ids, substitute: true)
  accepted, rejected = ids.partition { |id| @stanzas[:terms].include?(id) }
  accepted = accepted.map { |id| self.get_main_id(id) } if substitute
  return accepted, rejected
end
1270
+
1271
+
1272
# Store a profile under a specific ID; an already assigned profile is replaced.
# Numeric IDs are kept as-is, any other ID is symbolized.
# ===== Parameters
# +id+:: ID assigned to the profile
# +terms+:: array of terms
# +substitute+:: substitute flag forwarded to check_ids
def add_profile(id, terms, substitute: true)
  warn("Profile assigned to ID (#{id}) is going to be replaced") if @profiles.include? id
  correct_terms, rejected_terms = self.check_ids(terms, substitute: substitute)
  warn('Given terms contains erroneus IDs. These IDs will be removed') unless rejected_terms.empty?
  key = id.is_a?(Numeric) ? id : id.to_sym
  @profiles[key] = correct_terms
end
1289
+
1290
+
1291
# Bulk-load a set of profiles. Array input gets sequential numeric IDs
# (starting at 0, from each_with_index); hash input keeps its own keys.
# ===== Parameters
# +profiles+:: array or hash of profiles to be stored
# +calc_metadata+:: when true, recompute the profiles dictionary afterwards
# +reset_stored+:: when true, already stored profiles are removed first
# +substitute+:: substitute flag forwarded through add_profile to check_ids
def load_profiles(profiles, calc_metadata: true, reset_stored: false, substitute: false)
  self.reset_profiles if reset_stored
  if profiles.kind_of?(Array)
    profiles.each_with_index do |items, i|
      self.add_profile(i, items.map { |item| item.to_sym }, substitute: substitute)
    end
  else # Hash keyed by profile ID
    unless profiles.keys.select { |id| @profiles.include?(id) }.empty?
      warn('Some profiles given are already stored. Stored version will be replaced')
    end
    profiles.each { |id, prof| self.add_profile(id, prof, substitute: substitute) }
  end
  self.add_observed_terms_from_profiles(reset: true)
  self.calc_profiles_dictionary if calc_metadata
end
1317
+
1318
+
1319
# Internal helper: drop every stored profile and zero the observed
# frequencies that were derived from them.
def reset_profiles
  @profiles = {}
  @meta.each_value { |info| info[:observed_freq] = 0 }
  @max_freqs[:observed_freq] = 0
end
1327
+
1328
+
1329
# ===== Parameters
# +id+:: profile ID
# ===== Return
# the stored profile for +id+, or nil when none is stored
def get_profile(id)
  @profiles[id]
end
1338
+
1339
+
1340
# ===== Return
# array with the number of terms of every stored profile
def get_profiles_sizes()
  @profiles.values.map(&:length)
end
1347
+
1348
+
1349
# Mean number of terms across stored profiles.
# ===== Parameters
# +round_digits+:: number of digits to round result. Default: 4
# ===== Returns
# mean size of stored profiles
def get_profiles_mean_size(round_digits: 4)
  total = self.get_profiles_sizes.sum
  total.fdiv(@profiles.length).round(round_digits)
end
1359
+
1360
+
1361
# Profile size found at a given percentile of the stored profile sizes.
# ===== Parameters
# +perc+:: percentile to be returned
# +increasing_sort+:: when false (default) sizes are ranked largest first
# ===== Returns
# profile length assigned to the requested percentile
def get_profile_length_at_percentile(perc=50, increasing_sort: false)
  lengths = self.get_profiles_sizes.sort
  lengths.reverse! unless increasing_sort
  # Pick the length which does not overpass the selected percentile
  idx = ((perc * (lengths.length - 1)).fdiv(100) - 0.5).round
  idx = 0 if idx.negative? # literal calc can land below zero
  lengths[idx]
end
1375
+
1376
+
1377
# Translate every term of a profile to its name.
# ===== Parameters
# +prof+:: array of terms to be translated
# ===== Returns
# array of names; entries are nil for IDs that cannot be translated
def profile_names(prof)
  prof.map { |term| self.translate_id(term) }
end
1385
+
1386
+
1387
# Translate one or several profiles into sets of term names.
# ===== Parameters
# +profs+:: array/hash of profiles; stored profiles are used when empty
# +asArray+:: true returns an array of name arrays; false a hash keyed by profile ID
# ===== Returns
# translated profiles
def translate_profiles_ids(profs = [], asArray: true)
  profs = @profiles if profs.empty?
  profs = profs.each_with_index.map { |terms, index| [index, terms] }.to_h if profs.kind_of?(Array)
  named = {}
  profs.each { |id, terms| named[id] = self.profile_names(terms) }
  asArray ? named.values : named
end
1399
+
1400
+
1401
# Register every term included in stored profiles as an observed term.
# ===== Parameters
# +reset+:: when true, observed freqs already stored are reset (to -1) before recounting
def add_observed_terms_from_profiles(reset: false)
  @meta.each_value { |freqs| freqs[:observed_freq] = -1 } if reset
  @profiles.each_value { |terms| self.add_observed_terms(terms: terms) }
end
1408
+
1409
+
1410
# Fetch a stored frequency for a term.
# ===== Parameters
# +term+:: term to be checked
# +type+:: frequency kind. Allowed: [:struct_freq, :observed_freq]
# ===== Returns
# requested frequency, or nil when the term is not allowed
def get_frequency(term, type: :struct_freq)
  entry = @meta[term]
  entry.nil? ? nil : entry[type]
end
1420
+
1421
+
1422
# Structural-frequency shortcut over get_frequency.
# ===== Parameters
# +term+:: term to be checked
# ===== Returns
# structural frequency of the given term, or nil when not allowed
def get_structural_frequency(term)
  self.get_frequency(term, type: :struct_freq)
end
1430
+
1431
+
1432
# Observed-frequency shortcut over get_frequency.
# ===== Parameters
# +term+:: term to be checked
# ===== Returns
# observed frequency of the given term, or nil when not allowed
def get_observed_frequency(term)
  self.get_frequency(term, type: :observed_freq)
end
1440
+
1441
+
1442
# Calculates frequencies of stored profiles terms.
# ===== Parameters
# +ratio+:: if true, frequencies are returned as ratios (count / number of profiles)
# +literal+:: if true, count the literal terms as stored in profiles; otherwise use
#             @meta observed frequencies (which already account for alternative IDs)
# +asArray+:: transform the result from Hash{Term=>Freq} into an array of [Term, Freq]
#             tuples sorted by descending frequency
# +translate+:: if true, term IDs are replaced by their names when translatable
# ===== Returns
# stored profiles terms frequencies (hash or sorted tuple array, see +asArray+)
def get_profiles_terms_frequency(ratio: true, literal: true, asArray: true, translate: true)
  n_profiles = @profiles.length
  if literal
    # Count raw occurrences of each term across all profiles
    freqs = {}
    @profiles.each do |id, terms|
      terms.each do |literalTerm|
        if freqs.include?(literalTerm)
          freqs[literalTerm] += 1
        else
          freqs[literalTerm] = 1
        end
      end
    end
    if (ratio || translate)
      # Snapshot keys first: translation re-keys the hash while iterating
      aux_keys = freqs.keys
      aux_keys.each do |term|
        freqs[term] = freqs[term].fdiv(n_profiles) if ratio
        if translate
          tr = self.translate_id(term)
          # Re-key by name; untranslatable terms keep their ID key
          freqs[tr] = freqs.delete(term) if !tr.nil?
        end
      end
    end
    if asArray
      freqs = freqs.map{|term, freq| [term, freq]}
      freqs.sort!{|h1, h2| h2[1] <=> h1[1]} # descending by frequency
    end
  else # Freqs translating alternatives (taken from @meta observed_freq)
    freqs = @meta.select{|id, freqs| freqs[:observed_freq] > 0}.map{|id, freqs| [id, ratio ? freqs[:observed_freq].fdiv(n_profiles) : freqs[:observed_freq]]}
    freqs = freqs.to_h if !asArray
    if translate
      # NOTE(review): when asArray is false this turns the hash back into tuples;
      # the trailing to_h below restores it
      freqs = freqs.map do |term, freq|
        tr = self.translate_id(term)
        tr.nil? ? [term, freq] : [tr, freq]
      end
    end
    if asArray
      freqs = freqs.map{|term, freq| [term, freq]}
      freqs.sort!{|h1, h2| h2[1] <=> h1[1]} # descending by frequency
    else
      freqs = freqs.to_h
    end
  end
  return freqs
end
1495
+
1496
+
1497
# Split a profile into non-redundant terms and terms that are ancestors of
# other terms in the same profile.
# ===== Parameters
# +prof+:: array of terms to be checked
# ===== Returns
# two arrays: the cleaned profile and the removed (ancestor) elements
def remove_ancestors_from_profile(prof)
  all_ancestors = prof.flat_map { |term| self.get_ancestors(term) }.uniq
  redundant = prof.select { |term| all_ancestors.include?(term) }
  return prof - redundant, redundant
end
1507
+
1508
+
1509
# Drop alternative IDs whose official ID is also present in the profile.
# DOES NOT remove synonyms or alternative IDs of the same official ID.
# ===== Parameters
# +prof+:: array of terms to be checked
# ===== Returns
# two arrays: the cleaned profile and the removed elements
def remove_alternatives_from_profile(prof)
  redundant = prof.select do |term|
    @alternatives_index.include?(term) && prof.include?(@alternatives_index[term])
  end
  return prof - redundant, redundant
end
1519
+
1520
+
1521
# Remove ancestors and (optionally) redundant alternative IDs from a profile.
# ===== Parameters
# +profile+:: profile to be cleaned
# +remove_alternatives+:: when true, also strip alternatives whose official term remains
# ===== Returns
# cleaned profile
def clean_profile(profile, remove_alternatives: true)
  cleaned, _ = self.remove_ancestors_from_profile(profile)
  cleaned, _ = self.remove_alternatives_from_profile(cleaned) if remove_alternatives
  cleaned
end
1536
+
1537
+
1538
# Clean every stored profile (ancestors and optionally alternatives removed).
# ===== Parameters
# +store+:: if true, cleaned profiles replace the stored ones
# +remove_alternatives+:: forwarded to clean_profile
# ===== Returns
# a hash with cleaned profiles
def clean_profiles(store: false, remove_alternatives: true)
  cleaned = @profiles.map do |id, terms|
    [id, self.clean_profile(terms, remove_alternatives: remove_alternatives)]
  end.to_h
  @profiles = cleaned if store
  cleaned
end
1550
+
1551
+
1552
# Count redundant (ancestor) terms present in each stored profile.
# ===== Returns
# array with the number of parental terms per profile
def parentals_per_profile
  cleaned = self.clean_profiles(remove_alternatives: false)
  @profiles.map { |id, terms| terms.length - cleaned[id].length }
end
1560
+
1561
+
1562
# Mean information content of a given profile.
# ===== Parameters
# +prof+:: profile to be checked
# +ic_type+:: IC flavour to be used
# +zhou_k+:: special coeficient for the Zhou IC method
# ===== Returns
# mean IC for the given profile
def get_profile_mean_IC(prof, ic_type: :resnik, zhou_k: 0.5)
  total = prof.inject(0) { |acc, term| acc + self.get_IC(term, type: ic_type, zhou_k: zhou_k) }
  total.fdiv(prof.length)
end
1572
+
1573
+
1574
# Resnik structural and Resnik observed mean ICs for every stored profile.
# ===== Returns
# two hashes: profile => structural mean IC, and profile => observed mean IC
def get_profiles_resnik_dual_ICs
  structural = {}
  observed = {}
  @profiles.each do |profile_id, terms|
    structural[profile_id] = self.get_profile_mean_IC(terms, ic_type: :resnik)
    observed[profile_id] = self.get_profile_mean_IC(terms, ic_type: :resnik_observed)
  end
  return structural.clone, observed.clone
end
1586
+
1587
+
1588
# Calculates ontology structural levels for all ontology terms.
# Results go to @dicts[:level] and @max_freqs[:max_depth] is refreshed.
# ===== Parameters
# +calc_paths+:: calculates term paths if it's not already calculated
# +shortest_path+:: if true, level is calculated with shortest path, largest path will be used in other cases
def calc_term_levels(calc_paths: false, shortest_path: true)
  if @term_paths.empty?
    if calc_paths
      self.calc_term_paths
    else
      warn('Term paths are not already loaded. Aborting dictionary calc')
    end
  end
  if !@term_paths.empty?
    byTerm = {}   # term -> level
    byValue = {}  # level -> [terms]
    # Calc per term
    @term_paths.each do |term, info|
      level = shortest_path ? info[:shortest_path] : info[:largest_path]
      if level.nil?
        level = -1 # sentinel for terms with no computed path length
      else
        level = level.round(0)
      end
      byTerm[term] = level
      queryLevels = byValue[level]
      if queryLevels.nil?
        byValue[level] = [term]
      else
        byValue[level] << term
      end
    end
    # Deliberately swapped storage: the :byTerm slot holds level->terms and
    # :byValue holds term->level (see inline note below).
    @dicts[:level] = {byTerm: byValue, byValue: byTerm} # Note: in this case, value has multiplicity and term is unique value
    # Update maximum depth
    @max_freqs[:max_depth] = byValue.keys.max
  end
end
1624
+
1625
+
1626
# True when the given term is registered in the obsoletes index.
def is_obsolete? term
  @obsoletes_index.include?(term)
end
1630
+
1631
# True when the given term is registered in the alternatives index.
def is_alternative? term
  @alternatives_index.include?(term)
end
1635
+
1636
# Find paths of a term following it ancestors and stores all possible paths for it and it's parentals.
# Also calculates paths metadata (total/largest/shortest path) and stores into @term_paths.
# Only runs for :hierarchical or :sparse structures.
def calc_term_paths
  self.calc_ancestors_dictionary if !@dicts.keys.include?(:is_a) # Calculate direct parentals dictionary if it's not already calculated
  visited_terms = []
  @term_paths = {}
  if [:hierarchical, :sparse].include? @structureType
    terms = @stanzas[:terms].keys
    terms.each do |term|
      if self.is_obsolete?(term) || self.is_alternative?(term) # Special case (obsoletes)
        special_term = term
        # Redirect to the referenced main term; NOTE: the special term entry
        # is assigned the SAME hash object, so both keys share paths/metadata.
        term = self.is_obsolete?(term) ? @obsoletes_index[term] : @alternatives_index[term]
        @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
        @term_paths[special_term] = @term_paths[term]
        visited_terms << special_term
      end

      if !visited_terms.include?(term)
        @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
        parentals = @dicts[:is_a][:byTerm][term]
        if parentals.nil?
          # Root term: its only path is itself
          @term_paths[term][:paths] << [term]
        else
          parentals.each do |direct_parental|
            if visited_terms.include? direct_parental # Use direct_parental already calculated paths
              new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
            else # Calculate new paths
              self.expand_path(direct_parental, visited_terms)
              new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
            end
            new_paths.each{|path| @term_paths[term][:paths] << path}
          end
        end
        visited_terms << term
      end
      # Update metadata
      @term_paths[term][:total_paths] = @term_paths[term][:paths].length
      paths_sizes = @term_paths[term][:paths].map{|path| path.length}
      @term_paths[term][:largest_path] = paths_sizes.max
      @term_paths[term][:shortest_path] = paths_sizes.min
    end
  else
    warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
  end
end
1681
+
1682
+
1683
# Recursively collect all root-ward paths of +curr_term+, memoising results in
# @term_paths and recording expanded terms in +visited_terms+ (mutated in place).
# ===== Parameters
# +curr_term+:: current visited term
# +visited_terms+:: already expanded terms
def expand_path(curr_term, visited_terms)
  return if visited_terms.include?(curr_term) # already expanded
  @term_paths[curr_term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if @term_paths[curr_term].nil?
  parentals = @dicts[:is_a][:byTerm][curr_term]
  if parentals.nil? # no parents: end of recurrence, the path is the term itself
    @term_paths[curr_term][:paths] << [curr_term]
  else # expand each ancestor and prepend the current term to its paths
    parentals.each do |ancestor|
      self.expand_path(ancestor, visited_terms) unless visited_terms.include?(ancestor)
      @term_paths[ancestor][:paths].each do |path|
        @term_paths[curr_term][:paths] << [curr_term, path].flatten
      end
    end
  end
  visited_terms << curr_term
end
1703
+
1704
+
1705
# ===== Returns
# clone of the calculated ontology levels (Key: level; Value: terms)
def get_ontology_levels
  @dicts[:level][:byTerm].clone # byTerm here maps level -> terms
end
1711
+
1712
+
1713
# ===== Returns
# structural level of the given term (nil when unknown)
def get_term_level(term)
  @dicts[:level][:byValue][term]
end
1719
+
1720
+
1721
# Ontology levels restricted to terms appearing in stored profiles; each term
# is repeated once per occurrence (or once overall when +uniq+ is true).
# ===== Returns
# hash of term levels (Key: level; Value: array of term IDs, with multiplicity)
def get_ontology_levels_from_profiles(uniq = true) # TODO: remove uniq and check dependencies
  used_terms = @profiles.values.flatten
  used_terms.uniq! if uniq
  occurrences = Hash.new(0)
  used_terms.each { |term| occurrences[term] += 1 }
  filtered = {}
  @dicts[:level][:byTerm].each do |level, terms|
    expanded = terms.select { |t| used_terms.include?(t) }
                    .flat_map { |t| Array.new(occurrences[t], t) }
    filtered[level] = expanded unless expanded.empty?
  end
  filtered
end
1739
+
1740
+
1741
# Build the term -> profile IDs dictionary (@profilesDict) from stored profiles.
# (The profile -> terms direction is @profiles itself.)
def calc_profiles_dictionary
  if @profiles.empty?
    warn('Profiles are not already loaded. Aborting dictionary calc')
  else
    term_index = {} # Key: term; Value: IDs of profiles containing it
    @profiles.each do |id, terms|
      terms.each { |term| (term_index[term] ||= []) << id }
    end
    @profilesDict = term_index
  end
end
1760
+
1761
+
1762
# ===== Return
# clone of the term -> profiles dictionary
def get_terms_linked_profiles
  @profilesDict.clone
end
1768
+
1769
+
1770
# Profiles related to a given term.
# ===== Parameters
# +term+:: term to be checked
# ===== Returns
# profile IDs which contain the given term (nil when none)
def get_term_linked_profiles(term)
  @profilesDict[term]
end
1778
+
1779
+
1780
# Build a metainfo table for a set of terms.
# ===== Parameters
# +terms+:: IDs to be expanded
# +filter_alternatives+:: forwarded to get_descendants
# ===== Returns
# array of pairs [[TermID, TermName], [[ChildID, ChildName], ...]]
def get_childs_table(terms, filter_alternatives = false)
  terms.map do |t|
    children = self.get_descendants(t, filter_alternatives)
    [[t, self.translate_id(t)], children.map { |child| [child, self.translate_id(child)] }]
  end
end
1793
+
1794
+
1795
# Merge a term => items relations hash into the ITEMS structure.
# ===== Parameters
# +relations+:: hash to be stored
# +remove_old_relations+:: when true, wipe @items before loading
# +expand+:: when true, already stored keys get the unique union of both value sets
def load_item_relations_to_terms(relations, remove_old_relations = false, expand = false)
  @items = {} if remove_old_relations
  unless relations.select { |term, items| !@stanzas[:terms].include?(term) }.empty?
    warn('Some terms specified are not stored into this ontology. These not correct terms will be stored too')
  end
  if !remove_old_relations && !expand
    unless relations.select { |term, items| @items.include?(term) }.empty?
      warn('Some terms given are already stored. Stored version will be replaced')
    end
  end
  if expand
    relations.each do |term, new_items|
      if @items.keys.include?(term)
        @items[term] = (@items[term] + new_items).uniq
      else
        @items[term] = new_items
      end
    end
  else
    @items.merge!(relations)
  end
end
1822
+
1823
+
1824
# Assign an already calculated dictionary (its :byTerm side) as the items set.
# ===== Parameters
# +dictID+:: dictionary ID to be stored (:byTerm will be used)
# +remove_old_relations+:: when true, wipe @items before loading
def set_items_from_dict(dictID, remove_old_relations = false)
  @items = {} if remove_old_relations
  if(@dicts.keys.include?(dictID))
    # Bug fix: Hash#merge returns a NEW hash which was being discarded, so
    # @items was never updated; merge! mutates @items in place.
    @items.merge!(@dicts[dictID][:byTerm])
  else
    warn('Specified ID is not calculated. Dict will not be added as a items set')
  end
end
1835
+
1836
+
1837
# This method computes childs similarity and impute items to it parentals. To do that Item keys must be this ontology allowed terms.
# Similarity will be calculated by text extact similarity unless an ontology object will be provided. In this case, MICAs will be used
# ===== Parameters
# +ontology+:: (Optional) ontology object which items given belongs
# +minimum_childs+:: minimum of childs needed to infer relations to parental. Default: 2
# +clean_profiles+:: if true, clena_profiles ontology method will be used over inferred profiles. Only if an ontology object is provided
# ===== Returns
# void and update items object
def expand_items_to_parentals(ontology: nil, minimum_childs: 2, clean_profiles: true)
  # Check item keys
  if @items.empty?
    warn('Items have been not provided yet')
    return nil
  end
  targetKeys = @items.keys.select{|k| self.exists?(k)}
  if targetKeys.length == 0
    warn('Any item key is allowed')
    return nil
  elsif targetKeys.length < @items.keys.length
    warn('Some item keys are not allowed')
  end

  # Expand to parentals
  targetKeys << targetKeys.map{|t| self.get_ancestors(t, true)}
  targetKeys.flatten!
  targetKeys.uniq!

  # Obtain levels (go from leaves to roots)
  levels = targetKeys.map{|term| self.get_term_level(term)}
  levels.compact!
  levels.uniq!
  levels.sort!
  levels.reverse!
  levels.shift # Leaves are not expandable

  # Expand from leaves to roots
  levels.map do |lvl|
    curr_keys = targetKeys.select{|k| self.get_term_level(k) == lvl}
    curr_keys.map do |term_expand|
      to_infer = []
      # Obtain childs which already have items assigned
      childs = self.get_descendants(term_expand,true).select{|t| @items.keys.include?(t)}
      # Expand
      if childs.length > 0 && minimum_childs == 1 # Special case: union of all child items
        to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
      elsif childs.length >= minimum_childs
        to_infer = Hash.new(0) # item -> support count across child pairs
        # Compare each child against every other (pairwise, destructive shift)
        while childs.length > 1
          curr_term = childs.shift
          childs.each do |compare_term|
            pivot_items = @items[curr_term]
            compare_items = @items[compare_term]
            if ontology.nil? # Exact match: shared items get +2 support per pair
              pivot_items.map do |pitem|
                if compare_items.include?(pitem)
                  to_infer[pitem] += 2
                end
              end
            else # Find MICAs: best common ancestor of each item pair
              local_infer = Hash.new(0)
              pivot_items.map do |pitem|
                micas = compare_items.map{|citem| ontology.get_MICA(pitem, citem)}
                maxmica = micas[0]
                micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
                local_infer[maxmica.first] += 1
              end
              compare_items.map do |citem|
                micas = pivot_items.map{|pitem| ontology.get_MICA(pitem, citem)}
                maxmica = micas[0]
                micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
                local_infer[maxmica.first] += 1
              end
              # Only MICAs supported from both directions (freq >= 2) are kept
              local_infer.each{|t,freq| to_infer[t] += freq if freq >= 2}
            end
          end
        end
        # Filter infer: keep items supported by at least minimum_childs
        to_infer = to_infer.select{|k,v| v >= minimum_childs}
      end
      # Infer: merge the inferred items into the parental entry
      if to_infer.length > 0
        @items[term_expand] = [] if @items[term_expand].nil?
        if to_infer.kind_of?(Array) # minimum_childs == 1 special case
          @items[term_expand] = (@items[term_expand] + to_infer).uniq
        else
          @items[term_expand] = (@items[term_expand] + to_infer.keys).uniq
        end
        @items[term_expand] = ontology.clean_profile(@items[term_expand]) if clean_profiles && !ontology.nil?
      elsif !@items.include?(term_expand)
        # Nothing inferred and no previous items: stop considering this key
        targetKeys.delete(term_expand)
      end
    end
  end
end
1932
+
1933
+
1934
+
1935
# NO IDEA WHAT THIS DOES. DON'T USE THIS METHODS IS NOT CHECKED
# ===== Parameters
# ++::
# ===== Returns
# ...
# NOTE(review): when +mode+ is not :elim, +pval+ below is never assigned, so
# `results << [term, pval]` raises NameError — confirm intended usage is :elim only.
def compute_relations_to_items(external_item_list, mode, thresold)
  results = []
  penalized_terms = {}
  # terms_levels = get_terms_levels(@items_relations.keys)
  terms_with_items_levels = @items_relations.keys.map{|term| self.get_term_level(term)}.uniq
  terms_levels = self.get_ontology_levels().select{|k,v| terms_with_items_levels.include?(k)}
  # NOTE(review): Hash#each returns the receiver, so the block below has NO
  # effect — the per-level filtering to items-only terms is silently dropped.
  terms_levels = terms_levels.each{|level,terms| [level, terms.select{|t| @items_relations.keys.include?(t)}] } # Use only items terms. MAYBE IT'S NOT OUR TARGET (line added by fmj)
  levels = terms_levels.keys.sort
  # Walk levels from deepest to shallowest
  levels.reverse_each do |level|
    terms_levels[level].each do |term|
      associated_items = @items_relations[term]
      if mode == :elim
        # Remove items already credited to significant child terms
        items_to_remove = penalized_terms[term]
        items_to_remove = [] if items_to_remove.nil?
        pval = get_fisher_exact_test(
          external_item_list - items_to_remove,
          associated_items - items_to_remove,
          ((associated_items | external_item_list) - items_to_remove).length
        )
        if pval <= thresold
          parents = get_parents(term) # Save the items for each parent term to remove them later in the fisher test
          parents.each do |prnt|
            query = penalized_terms[prnt]
            if query.nil?
              penalized_terms[prnt] = @items_relations[term].clone # We need a new array to store the following iterations
            else
              query.concat(@items_relations[term])
            end
          end
        end
      end
      results << [term, pval]
    end
  end
  return results
end
1976
+
1977
+
1978
# Check if a given ID is a removable (blacklist) term.
# +DEPRECATED+ use is_removable? instead
# ===== Parameters
# +id+:: to be checked
# ===== Returns
# true if given term is a removable (blacklist) term, false otherwise
def is_removable(id)
  warn "[DEPRECATION] `is_removable` is deprecated. Please use `is_removable?` instead."
  @removable_terms.include?(id.to_sym)
end
1988
+
1989
# Check if a given ID is a removable (blacklist) term.
# ===== Parameters
# +id+:: to be checked
# ===== Returns
# true if given term is a removable (blacklist) term, false otherwise
def is_removable? id
  @removable_terms.include?(id.to_sym)
end
1997
+
1998
############################################
# SPECIAL METHODS
#############################################

# Structural equality: two ontologies are equal when every piece of their
# indexed state matches. Comparison short-circuits on the first mismatch and
# raises NoMethodError if +other+ does not expose the same readers.
# NOTE(review): descendants_index is not part of the comparison — confirm
# whether that omission is intentional.
def ==(other)
  state_readers = %i[header stanzas ancestors_index alternatives_index
                     obsoletes_index structureType ics meta dicts profiles
                     profilesDict removable_terms special_tags items
                     term_paths max_freqs]
  state_readers.all? { |reader| send(reader) == other.send(reader) } &&
    (items.keys - other.items.keys).empty?
end
2009
2020
 
2010
2021
 
2011
2022
# Build an independent copy of this ontology.
# Each top-level structure is shallow-cloned, so the copy can be reassigned
# or have entries added/removed without touching the original; deeply nested
# values are still shared between both objects.
# Fix: @file and @special_tags were previously not copied, so a fresh clone
# could compare unequal (==) to its source, since == checks special_tags.
# ===== Returns
# a new Ontology carrying cloned state
def clone
  copy = Ontology.new
  copy.file = self.file.clone unless self.file.nil?
  copy.header = self.header.clone
  copy.stanzas[:terms] = self.stanzas[:terms].clone
  copy.stanzas[:typedefs] = self.stanzas[:typedefs].clone
  copy.stanzas[:instances] = self.stanzas[:instances].clone
  copy.ancestors_index = self.ancestors_index.clone
  copy.descendants_index = self.descendants_index.clone
  copy.alternatives_index = self.alternatives_index.clone
  copy.obsoletes_index = self.obsoletes_index.clone
  copy.structureType = self.structureType.clone
  copy.ics = self.ics.clone
  copy.meta = self.meta.clone
  copy.dicts = self.dicts.clone
  copy.profiles = self.profiles.clone
  copy.profilesDict = self.profilesDict.clone
  copy.items = self.items.clone
  copy.removable_terms = self.removable_terms.clone
  copy.special_tags = self.special_tags.clone unless self.special_tags.nil?
  copy.term_paths = self.term_paths.clone
  copy.max_freqs = self.max_freqs.clone
  return copy
end
2044
+
2045
+
2046
+ #############################################
2047
+ # ACCESS CONTROL
2048
+ #############################################
2049
+
2050
+ attr_reader :file, :header, :stanzas, :ancestors_index, :descendants_index, :special_tags, :alternatives_index, :obsoletes_index, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :profilesDict, :items, :removable_terms, :term_paths
2051
+ attr_writer :file, :header, :stanzas, :ancestors_index, :descendants_index, :special_tags, :alternatives_index, :obsoletes_index, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :profilesDict, :items, :removable_terms, :term_paths
2041
2052
  end