semtools 0.1.6 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
1
+ require 'expcalc'
1
2
  require 'json'
2
3
  require 'colorize'
3
4
 
@@ -7,44 +8,29 @@ class Ontology
7
8
  # AUTHOR NOTES
8
9
  #########################################################
9
10
 
10
- # 1 - Store @profiles as @stanzas[:instances]
11
11
  # 2 - Items values (not keys) are imported as strings, not as symbols (maybe add a flag which indicates if values are, or not, symbols?)
12
12
 
13
-
14
13
  #############################################
15
14
  # FIELDS
16
15
  #############################################
17
- # Handled class variables
18
- # => @@basic_tags :: hash with main OBO structure tags
19
- # => @@allowed_calcs :: hash with allowed ICs and similaritites calcs
20
- # => @@symbolizable_ids :: tags which can be symbolized
21
- # => @@tags_with_trailing_modifiers :: tags which can include extra info after specific text modifiers
22
- #
23
16
  # Handled object variables
24
- # => @header :: file header (if is available)
25
- # => @stanzas :: OBO stanzas {:terms,:typedefs,:instances}
17
+ # => @terms :: OBO terms descriptions
26
18
  # => @ancestors_index :: hash of ancestors per each term handled with any structure relationships
27
19
  # => @descendants_index :: hash of descendants per each term handled with any structure relationships
28
20
  # => @alternatives_index :: has of alternative IDs (include alt_id and obsoletes)
29
- # => @obsoletes_index :: hash of obsoletes and it's new ids
30
- # => @special_tags :: set of special tags to be expanded (:is_a, :obsolete, :alt_id)
31
21
  # => @structureType :: type of ontology structure depending on ancestors relationship. Allowed: {atomic, sparse, circular, hierarchical}
32
- # => @ics :: already calculated ICs for handled terms and IC types
33
- # => @meta :: meta_information about handled terms like [ancestors, descendants, struct_freq, observed_freq]
34
- # => @max_freqs :: maximum freqs found for structural and observed freqs
35
22
  # => @dicts :: bidirectional dictionaries with three levels <key|value>: 1º) <tag|hash2>; 2º) <(:byTerm/:byValue)|hash3>; 3º) dictionary <k|v>
36
- # => @profiles :: set of terms assigned to an ID
37
- # => @profilesDict :: set of profile IDs assigned to a term
38
- # => @items :: hash with items relations to terms
39
23
  # => @removable_terms :: array of terms to not be considered
24
+ # => @meta :: meta_information about handled terms like [ancestors, descendants, struct_freq, observed_freq]
25
+ # => @ics :: already calculated ICs for handled terms and IC types
40
26
  # => @term_paths :: metainfo about parental paths of each term
27
+ # => @max_freqs :: maximum freqs found for structural and observed freqs
28
+ # => @items :: hash with items relations to terms
29
+ # => @profiles :: set of terms assigned to an ID
41
30
 
42
- @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:replaced_by,:consider,:alt_id]}
43
31
  @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
44
- @@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider]
45
- @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
46
- @@multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider, :subsetdef, :synonymtypedef, :property_value, :remark]
47
- @@symbolizable_ids.concat(@@tags_with_trailing_modifiers)
32
+
33
+ attr_accessor :terms, :ancestors_index, :descendants_index, :alternatives_index, :obsoletes, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :items, :term_paths, :reroot
48
34
 
49
35
  #############################################
50
36
  # CONSTRUCTOR
@@ -57,265 +43,138 @@ class Ontology
57
43
  # +removable_terms+: term to be removed from calcs
58
44
  # +build+: flag to launch metainfo calculation
59
45
  # +file_format+: force format type despite file extension. Can be :obo or :json
60
- def initialize(file: nil, load_file: false, removable_terms: [], build: true, file_format: nil)
61
- # Initialize object variables
62
- @header = nil
63
- @stanzas = {terms: {}, typedefs: {}, instances: {}}
46
+ def initialize(file: nil, load_file: false, removable_terms: [], build: true, file_format: nil, extra_dicts: [])
47
+ @terms = {}
64
48
  @ancestors_index = {}
65
49
  @descendants_index = {}
66
50
  @alternatives_index = {}
67
- @obsoletes_index = {}
51
+ @obsoletes = {} # id is obsolete but it could or not have an alt id
68
52
  @structureType = nil
69
53
  @ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
70
54
  @meta = {}
71
- @special_tags = @@basic_tags.clone
72
55
  @max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
73
56
  @dicts = {}
74
57
  @profiles = {}
75
- @profilesDict = {}
76
58
  @items = {}
77
- @removable_terms = []
78
59
  @term_paths = {}
79
- add_removable_terms(removable_terms) if !removable_terms.empty?
60
+ @reroot = false
80
61
  load_file = true unless file.nil? # This should remove load_file argument, keep it for old scripts
81
62
  # Load if proceeds
82
63
  if load_file
83
64
  fformat = file_format
84
65
  fformat = File.extname(file) if fformat.nil? && !file.nil?
85
66
  if fformat == :obo || fformat == ".obo"
86
- load(file, build: build)
67
+ OboParser.load(self, file, build: build, black_list: removable_terms, extra_dicts: extra_dicts)
87
68
  elsif fformat == :json || fformat == ".json"
88
- self.read(file, build: build)
69
+ JsonParser.load(self, file, build: build)
89
70
  elsif !fformat.nil?
90
71
  warn 'Format not allowed. Loading process will not be performed'
91
72
  end
73
+ precompute if build
92
74
  end
93
75
  end
94
76
 
95
-
96
77
  #############################################
97
- # CLASS METHODS
78
+ # GENERATE METADATA FOR ALL TERMS
98
79
  #############################################
99
80
 
100
- # Expand a (starting) term using a specific tag and return all extended terms into an array and
101
- # the relationship structuture observed (hierarchical or circular). If circular structure is
102
- # foumd, extended array will be an unique vector without starting term (no loops).
103
- # +Note+: we extremly recomend use get_related_ids_by_tag function instead of it (directly)
104
- # ===== Parameters
105
- # +start+:: term where start to expand
106
- # +terms+:: set to be used to expand
107
- # +target_tag+:: tag used to expand
108
- # +eexpansion+:: already expanded info
109
- # +split_info_char+:: special regex used to split info (if it is necessary)
110
- # +split_info_indx+:: special index to take splitted info (if it is necessary)
111
- # +alt_ids+:: set of alternative IDs
112
- # ===== Returns
113
- # A vector with the observed structure (string) and the array with extended terms.
114
- def self.get_related_ids(start_id, terms, target_tag, related_ids = {}, alt_ids = {})
115
- # Take start_id term available info and already accumulated info
116
- current_associations = related_ids[start_id]
117
- current_associations = [] if current_associations.nil?
118
- return [:no_term,[]] if terms[start_id].nil?
119
- id_relations = terms[start_id][target_tag]
120
- return [:source,[]] if id_relations.nil?
121
-
122
- # Prepare auxiliar variables
123
- struct = :hierarchical
124
-
125
- # Study direct extensions
126
- id_relations = id_relations.clone
127
- while id_relations.length > 0
128
- id = id_relations.shift
129
- id = alt_ids[id].first if alt_ids.include?(id) # NOTE: if you want to persist current ID instead source ID, re-implement this
130
-
131
- # Handle
132
- if current_associations.include?(id) # Check if already have been included into this expansion
133
- struct = :circular
134
- else
135
- current_associations << id
136
- if related_ids.include?(id) # Check if current already has been expanded
137
- current_associations = current_associations | related_ids[id]
138
- if current_associations.include?(start_id) # Check circular case
139
- struct = :circular
140
- [id, start_id].each{|repeated| current_associations.delete(repeated)}
141
- end
142
- else # Expand
143
- related_ids[start_id] = current_associations
144
- structExp, current_related_ids = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids) # Expand current
145
- current_associations = current_associations | current_related_ids
146
- struct = :circular if structExp == :circular # Check struct
147
- if current_associations.include?(start_id) # Check circular case
148
- struct = :circular
149
- current_associations.delete(start_id)
150
- end
151
- end
152
- end
153
- end
154
- related_ids[start_id] = current_associations
155
-
156
- return struct, current_associations
157
- end
158
-
159
-
160
- # Expand terms using a specific tag and return all extended terms into an array and
161
- # the relationship structuture observed (hierarchical or circular). If circular structure is
162
- # foumd, extended array will be an unique vector without starting term (no loops)
163
- # ===== Parameters
164
- # +terms+:: set to be used to expand
165
- # +target_tag+:: tag used to expand
166
- # +split_info_char+:: special regex used to split info (if it is necessary)
167
- # +split_info_indx+:: special index to take splitted info (if it is necessary)
168
- # +alt_ids+:: set of alternative IDs
169
- # +obsoletes+:: integer with the number of obsolete IDs. used to calculate structure type.
170
- # ===== Returns
171
- # A vector with the observed structure (string) and the hash with extended terms
172
- def self.get_related_ids_by_tag(terms:,target_tag:, alt_ids: {}, obsoletes: 0)
173
- # Define structure type
174
- structType = :hierarchical
175
- related_ids = {}
176
- terms.each do |id, tags|
177
- # Check if target tag is defined
178
- if !tags[target_tag].nil?
179
- # Obtain related terms
180
- set_structure, _ = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids)
181
- # Check structure
182
- structType = :circular if set_structure == :circular
183
- end
184
- end
185
-
186
- # Check special case
187
- structType = :atomic if related_ids.length <= 0
188
- structType = :sparse if related_ids.length > 0 && ((terms.length - related_ids.length - obsoletes) >= 2)
189
- # Return type and hash with related_ids
190
- return structType, related_ids
81
+ def precompute
82
+ get_index_frequencies
83
+ calc_term_levels(calc_paths: true)
191
84
  end
192
85
 
193
-
194
- # Class method to transform string with <tag : info> into hash structure
195
- # ===== Parameters
196
- # +attributes+:: array tuples with info to be transformed into hash format
86
+ # Calculates regular frequencies based on ontology structure (using parentals)
197
87
  # ===== Returns
198
- # Attributes stored into hash structure
199
- def self.info2hash(attributes, split_char = " ! ", selected_field = 0)
200
- # Load info
201
- info_hash = {}
202
- # Only TERMS multivalue tags (future add Typedefs and Instance)
203
- # multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider]
204
- attributes.each do |tag, value|
205
- # Check
206
- raise EncodingError, 'Info element incorrect format' if (tag.nil?) || (value.nil?)
207
- # Prepare
208
- tag = tag.lstrip.to_sym
209
- value.lstrip!
210
- value = value.split(split_char)[selected_field].to_sym if @@tags_with_trailing_modifiers.include?(tag)
211
-
212
- # Store
213
- query = info_hash[tag]
214
- if !query.nil? # Tag already exists
215
- if !query.kind_of?(Array) # Check that tag is multivalue
216
- raise('Attempt to concatenate plain text with another. The tag is not declared as multivalue. [' + tag.to_s + '](' + query + ')')
217
- else
218
- query << value # Add new value to tag
219
- end
220
- else # New entry
221
- if @@multivalue_tags.include?(tag)
222
- info_hash[tag] = [value]
223
- else
224
- info_hash[tag] = value
88
+ # true if everything end without errors and false in other cases
89
+ def get_index_frequencies() # Per each term, add frequencies
90
+ if @ancestors_index.empty?
91
+ warn('ancestors_index object is empty')
92
+ else
93
+ each(att = true) do |id, tags|
94
+ query = @meta[id]
95
+ if query.nil?
96
+ query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
97
+ @meta[id] = query
225
98
  end
99
+ query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].length.to_f : 0.0
100
+ query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].length.to_f : 0.0
101
+ query[:struct_freq] = query[:descendants] + 1.0
102
+ # Update maximums
103
+ @max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
104
+ @max_freqs[:max_depth] = query[:descendants] if @max_freqs[:max_depth] < query[:descendants]
226
105
  end
227
106
  end
228
- self.symbolize_ids(info_hash)
229
- return info_hash
230
107
  end
231
108
 
232
-
233
- # Class method to load an OBO format file (based on OBO 1.4 format). Specially focused on load
234
- # the Header, the Terms, the Typedefs and the Instances.
109
+ # Calculates ontology structural levels for all ontology terms
235
110
  # ===== Parameters
236
- # +file+:: OBO file to be loaded
237
- # ===== Returns
238
- # Hash with FILE, HEADER and STANZAS info
239
- def self.load_obo(file) #TODO: Send to obo_parser class
240
- raise("File is not defined") if file.nil?
241
- # Data variables
242
- header = ''
243
- stanzas = {terms: {}, typedefs: {}, instances: {}}
244
- # Auxiliar variables
245
- infoType = 'Header'
246
- currInfo = []
247
- stanzas_flags = %w[[Term] [Typedef] [Instance]]
248
- # Read file
249
- File.open(file).each do |line|
250
- line.chomp!
251
- next if line.empty?
252
- fields = line.split(':', 2)
253
- # Check if new instance is found
254
- if stanzas_flags.include?(line)
255
- header = self.process_entity(header, infoType, stanzas, currInfo)
256
- # Update info variables
257
- currInfo = []
258
- infoType = line.gsub!(/[\[\]]/, '')
259
- next
111
+ # +calc_paths+:: calculates term paths if it's not already calculated
112
+ # +shortest_path+:: if true, level is calculated with shortest path, largest path will be used in other cases
113
+ def calc_term_levels(calc_paths: false, shortest_path: true)
114
+ self.calc_term_paths if @term_paths.empty? && calc_paths
115
+ if !@term_paths.empty?
116
+ byTerm = {}
117
+ byValue = {}
118
+ @term_paths.each do |term, info|
119
+ level = shortest_path ? info[:shortest_path] : info[:largest_path]
120
+ level = level.nil? ? -1 : level.round(0)
121
+ byTerm[term] = level
122
+ add2hash(byValue, level, term)
260
123
  end
261
- # Concat info
262
- currInfo << fields
124
+ @dicts[:level] = {byTerm: byValue, byValue: byTerm} # Note: in this case, value has multiplicity and term is unique value
125
+ @max_freqs[:max_depth] = byValue.keys.max # Update maximum depth
263
126
  end
264
- # Store last loaded info
265
- header = self.process_entity(header, infoType, stanzas, currInfo) if !currInfo.empty?
266
-
267
- # Prepare to return
268
- finfo = {:file => file, :name => File.basename(file, File.extname(file))}
269
- return finfo, header, stanzas
270
127
  end
271
128
 
272
-
273
- # Handle OBO loaded info and stores it into correct container and format
274
- # ===== Parameters
275
- # +header+:: container
276
- # +infoType+:: current ontology item type detected
277
- # +stanzas+:: container
278
- # +currInfo+:: info to be stored
279
- # ===== Returns
280
- # header newly/already stored
281
- def self.process_entity(header, infoType, stanzas, currInfo)
282
- info = self.info2hash(currInfo)
283
- # Store current info
284
- if infoType.eql?('Header')
285
- header = info
286
- else
287
- id = info[:id]
288
- case infoType
289
- when 'Term'
290
- stanzas[:terms][id] = info
291
- when 'Typedef'
292
- stanzas[:typedefs][id] = info
293
- when 'Instance'
294
- stanzas[:instances][id] = info
129
+ # Find paths of a term following it ancestors and stores all possible paths for it and it's parentals.
130
+ # Also calculates paths metadata and stores into @term_paths
131
+ def calc_term_paths
132
+ @term_paths = {}
133
+ if [:hierarchical, :sparse].include? @structureType
134
+ each do |term|
135
+ expand_path(term)
136
+ path_attr = @term_paths[term]
137
+ # expand_path is arecursive function so these pat attributes must be calculated once the recursion is finished
138
+ path_attr[:total_paths] = path_attr[:paths].length
139
+ paths_sizes = path_attr[:paths].map{|path| path.length}
140
+ path_attr[:largest_path] = paths_sizes.max
141
+ path_attr[:shortest_path] = paths_sizes.min
295
142
  end
143
+ else
144
+ warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
296
145
  end
297
- return header
298
146
  end
299
147
 
300
-
301
- # Symboliza all values into hashs using symbolizable tags as keys
148
+ # Recursive function whic finds paths of a term following it ancestors and stores all possible paths for it and it's parentals
302
149
  # ===== Parameters
303
- # +item_hash+:: hash to be checked
304
- def self.symbolize_ids(item_hash)
305
- @@symbolizable_ids.each do |tag|
306
- query = item_hash[tag]
307
- if !query.nil?
308
- if query.kind_of?(Array)
309
- query.map!{|item| item.to_sym}
310
- else
311
- item_hash[tag] = query.to_sym if !query.nil?
150
+ # +curr_term+:: current visited term
151
+ # +visited_terms+:: already expanded terms
152
+ def expand_path(curr_term)
153
+ if !@term_paths.include?(curr_term)
154
+ path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []}
155
+ @term_paths[curr_term] = path_attr
156
+ direct_parentals = @dicts[:is_a][:byTerm][curr_term]
157
+ if direct_parentals.nil? # No parents :: End of recurrence
158
+ path_attr[:paths] << [curr_term]
159
+ else # Expand and concat
160
+ direct_parentals.each do |ancestor|
161
+ path_attr_parental = @term_paths[ancestor]
162
+ if path_attr_parental.nil? # Calculate new paths
163
+ self.expand_path(ancestor)
164
+ new_paths = @term_paths[ancestor][:paths]
165
+ else # Use direct_parental paths already calculated
166
+ new_paths = path_attr_parental[:paths]
167
+ end
168
+ path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(curr_term)})
312
169
  end
313
170
  end
314
171
  end
315
172
  end
316
173
 
174
+ #############################################
175
+ # CLASS METHODS (TODO: TO BE TRANFORMED IN INSTANCE METHODS)
176
+ #############################################
317
177
 
318
- #
319
178
  # ===== Parameters
320
179
  # +root+:: main term to expand
321
180
  # +ontology+:: to be cutted
@@ -323,18 +182,32 @@ class Ontology
323
182
  # +remove_up+:: if true, stores only the root term given an it descendants. If false, only root ancestors will be stored
324
183
  # ===== Returns
325
184
  # An Ontology object with terms after cut the ontology.
326
- def self.mutate(root, ontology, clone: true, remove_up: true)
185
+ def self.mutate(root, ontology, clone: true, remove_up: true) #TODO, pending to fix and pass to instance method
327
186
  ontology = ontology.clone if clone
328
187
  # Obtain affected IDs
329
188
  descendants = ontology.descendants_index[root]
330
189
  descendants << root # Store itself to do not remove it
331
190
  # Remove unnecesary terms
332
- ontology.stanzas[:terms] = ontology.stanzas[:terms].select{|id,v| remove_up ? descendants.include?(id) : !descendants.include?(id)}
191
+ terms = ontology.terms.select{|id,v| remove_up ? descendants.include?(id) : !descendants.include?(id)}
192
+ ids = terms.keys
193
+ terms.each do |id, term|
194
+ term[:is_a] = term[:is_a] & ids # Clean parental relations to keep only whose that exist between selected terms
195
+ end
196
+ ontology.terms = terms
333
197
  ontology.ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
334
198
  ontology.max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
335
199
  ontology.dicts = {}
336
- ontology.removable_terms = []
337
200
  ontology.term_paths = {}
201
+ ontology.reroot = true
202
+
203
+ ontology.ancestors_index = {}
204
+ ontology.descendants_index = {}
205
+ ontology.alternatives_index = {}
206
+ ontology.meta = {}
207
+ ontology.profiles = {}
208
+ ontology.items = {}
209
+
210
+
338
211
  # Recalculate metadata
339
212
  ontology.build_index
340
213
  ontology.add_observed_terms_from_profiles
@@ -342,33 +215,13 @@ class Ontology
342
215
  return ontology
343
216
  end
344
217
 
345
-
346
-
347
218
  #############################################
348
- # GENERAL METHODS
219
+ # TERM METHODS
349
220
  #############################################
350
221
 
351
- # Include removable terms to current removable terms list
352
- # ===== Parameters
353
- # +terms+:: terms array to be concatenated
354
- def add_removable_terms(terms)
355
- terms = terms.map{|term| term.to_sym}
356
- @removable_terms.concat(terms)
357
- end
358
-
359
-
360
- # Include removable terms to current removable terms list loading new
361
- # terms from a one column plain text file
362
- # ===== Parameters
363
- # +file+:: to be loaded
364
- def add_removable_terms_from_file(file)
365
- File.open(excluded_codes_file).each do |line|
366
- line.chomp!
367
- @removable_terms << line.to_sym
368
- end
369
- end
222
+ # I/O observed term from data
223
+ ####################################
370
224
 
371
-
372
225
  # Increase observed frequency for a specific term
373
226
  # ===== Parameters
374
227
  # +term+:: term which frequency is going to be increased
@@ -376,15 +229,7 @@ class Ontology
376
229
  # ===== Return
377
230
  # true if process ends without errors, false in other cases
378
231
  def add_observed_term(term:,increase: 1.0)
379
- # Check
380
- raise ArgumentError, "Term given is NIL" if term.nil?
381
- return false unless @stanzas[:terms].include?(term)
382
- return false if @removable_terms.include?(term)
383
- if @alternatives_index.include?(term)
384
- alt_id = @alternatives_index[term]
385
- @meta[alt_id] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[alt_id].nil?
386
- @meta[term] = @meta[alt_id]
387
- end
232
+ return false unless term_exist?(term)
388
233
  # Check if exists
389
234
  @meta[term] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[term].nil?
390
235
  # Add frequency
@@ -395,345 +240,199 @@ class Ontology
395
240
  return true
396
241
  end
397
242
 
243
+ # Obtain level and term relations
244
+ ####################################
398
245
 
399
- # Increase the arbitrary frequency of a given term set
400
246
  # ===== Parameters
401
- # +terms+:: set of terms to be updated
402
- # +increase+:: amount to be increased
403
- # +transform_to_sym+:: if true, transform observed terms to symbols. Default: false
404
- # ===== Return
405
- # true if process ends without errors and false in other cases
406
- def add_observed_terms(terms:, increase: 1.0, transform_to_sym: false)
407
- # Check
408
- raise ArgumentError, 'Terms array given is NIL' if terms.nil?
409
- raise ArgumentError, 'Terms given is not an array' if !terms.is_a? Array
410
- # Add observations
411
- if transform_to_sym
412
- checks = terms.map{|id| self.add_observed_term(term: id.to_sym,increase: increase)}
413
- else
414
- checks = terms.map{|id| self.add_observed_term(term: id,increase: increase)}
247
+ # +term+:: which are requested
248
+ # +relation+:: can be :ancestor or :descendant
249
+ # ===== Returns
250
+ # Direct ancestors/descendants of given term or nil if any error occurs
251
+ def get_direct_related(term, relation)
252
+ target = nil
253
+ case relation
254
+ when :ancestor
255
+ target = :byTerm
256
+ when :descendant
257
+ target = :byValue
258
+ else
259
+ warn('Relation type not allowed. Returning nil')
415
260
  end
416
- return checks
261
+ query = @dicts.dig(:is_a, target, term)
262
+ return query
417
263
  end
418
264
 
419
-
420
- # Compare to terms sets
265
+ # Return direct ancestors/descendants of a given term
266
+ # Return direct ancestors of a given term
421
267
  # ===== Parameters
422
- # +termsA+:: set to be compared
423
- # +termsB+:: set to be compared
424
- # +sim_type+:: similitude method to be used. Default: resnik
425
- # +ic_type+:: ic type to be used. Default: resnik
426
- # +bidirectional+:: calculate bidirectional similitude. Default: false
427
- # ===== Return
428
- # similitude calculated
429
- def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true, store_mica: false)
430
- # Check
431
- raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
432
- raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
433
- micasA = []
434
- # Compare A -> B
435
- termsA.each do |tA|
436
- micas = []
437
- termsB.each do |tB|
438
- if store_mica
439
- value = @mica_index.dig(tA, tB)
440
- else
441
- value = nil
442
- end
443
- if value.nil?
444
- value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)
445
- if store_mica
446
- value = true if value.nil? # We use true to save that the operation was made but there is not mica value
447
- add2nestHash(@mica_index, tA, tB, value)
448
- end
449
- end
450
- micas << value if value.class == Float
451
- end
452
- if !micas.empty?
453
- micasA << micas.max # Obtain maximum value
454
- else
455
- micasA << 0
456
- end
457
- end
458
- means_sim = micasA.inject{ |sum, el| sum + el }.fdiv(micasA.size)
459
- # Compare B -> A
460
- if bidirectional
461
- means_simA = means_sim * micasA.size
462
- means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false, store_mica: store_mica) * termsB.size
463
- means_sim = (means_simA + means_simB).fdiv(termsA.size + termsB.size)
464
- end
465
- # Return
466
- return means_sim
268
+ # +term+:: which ancestors are requested
269
+ # ===== Returns
270
+ # Direct ancestors of given term or nil if any error occurs
271
+ def get_direct_ancentors(term)
272
+ return self.get_direct_related(term, :ancestor)
467
273
  end
468
274
 
469
- def add2nestHash(h, key1, key2, val)
470
- query1 = h[key1]
471
- if query1.nil?
472
- h[key1] = {key2 => val}
473
- else
474
- query1[key2] = val
475
- end
275
+ # Return direct descendants of a given term
276
+ # ===== Parameters
277
+ # +term+:: which descendants are requested
278
+ # ===== Returns
279
+ # Direct descendants of given term or nil if any error occurs
280
+ def get_direct_descendants(term)
281
+ return self.get_direct_related(term, :descendant)
476
282
  end
477
283
 
478
- # Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with itself
284
+ # Find ancestors/descendants of a given term
479
285
  # ===== Parameters
480
- # +external_profiles+:: set of external profiles. If nil, internal profiles will be compared with itself
481
- # +sim_type+:: similitude method to be used. Default: resnik
482
- # +ic_type+:: ic type to be used. Default: resnik
483
- # +bidirectional+:: calculate bidirectional similitude. Default: false
484
- # ===== Return
485
- # Similitudes calculated
486
- def compare_profiles(external_profiles: nil, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
487
- profiles_similarity = {} #calculate similarity between patients profile
488
- profiles_ids = @profiles.keys
489
- if external_profiles.nil?
490
- comp_ids = profiles_ids
491
- comp_profiles = @profiles
492
- main_ids = comp_ids
493
- main_profiles = comp_profiles
286
+ # +term+:: to be checked
287
+ # +return_ancestors+:: return ancestors if true or descendants if false
288
+ # ===== Returns
289
+ # an array with all ancestors/descendants of given term or nil if parents are not available yet
290
+ def get_familiar(term, return_ancestors = true)
291
+ familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
292
+ if !familiars.nil?
293
+ familiars = familiars.clone
494
294
  else
495
- comp_ids = external_profiles.keys
496
- comp_profiles = external_profiles
497
- main_ids = profiles_ids
498
- main_profiles = @profiles
499
- end
500
- # Compare
501
- @mica_index = {}
502
- while !main_ids.empty?
503
- curr_id = main_ids.shift
504
- current_profile = main_profiles[curr_id]
505
- comp_ids.each do |id|
506
- profile = comp_profiles[id]
507
- value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional, store_mica: true)
508
- query = profiles_similarity[curr_id]
509
- if query.nil?
510
- profiles_similarity[curr_id] = {id => value}
511
- else
512
- query[id] = value
513
- end
514
- end
295
+ familiars = []
515
296
  end
516
- return profiles_similarity
297
+ return familiars
517
298
  end
518
299
 
300
+ # Find ancestors of a given term
301
+ # ===== Parameters
302
+ # +term+:: to be checked
303
+ # ===== Returns
304
+ # an array with all ancestors of given term or false if parents are not available yet
305
+ def get_ancestors(term)
306
+ return self.get_familiar(term, true)
307
+ end
519
308
 
520
- # Expand alternative IDs arround all already stored terms
309
+ # Find descendants of a given term
521
310
  # ===== Parameters
522
- # +alt_tag+:: tag used to expand alternative IDs
311
+ # +term+:: to be checked
523
312
  # ===== Returns
524
- # true if process ends without errors and false in other cases
525
- def get_index_alternatives(alt_tag: @@basic_tags[:alternative].last)
526
- # Check input
527
- raise('stanzas terms empty') if @stanzas[:terms].empty?
528
- # Take all alternative IDs
529
- alt_ids2add = {}
530
- @stanzas[:terms].each do |id, tags|
531
- if id == tags[:id] # Avoid simulated alternative terms
532
- # id = tags[:id] # Take always real ID in case of alternative terms simulted
533
- alt_ids = tags[alt_tag]
534
- if !alt_ids.nil?
535
- alt_ids = alt_ids - @removable_terms - [id]
536
- # Update info
537
- alt_ids.each do |alt_term|
538
- @alternatives_index[alt_term] = id
539
- alt_ids2add[alt_term] = @stanzas[:terms][id] if !@stanzas[:terms].include?(alt_term)
540
- @ancestors_index[alt_term] = @ancestors_index[id] if !@ancestors_index[id].nil?
541
- end
313
+ # an array with all descendants of given term or false if parents are not available yet
314
+ def get_descendants(term)
315
+ return self.get_familiar(term, false)
316
+ end
317
+
318
+ # Gets ontology level of a specific term
319
+ # ===== Returns
320
+ # Term level
321
+ def get_term_level(term)
322
+ return @dicts[:level][:byValue][term]
323
+ end
324
+
325
+ # nil, term not found, [] term exists but not has parents
326
+ def get_parental_path(term, which_path = :shortest_path, level = 0)
327
+ path = nil
328
+ path_attr = @term_paths[term]
329
+ if !path_attr.nil?
330
+ path_length = path_attr[which_path]
331
+ all_paths = path_attr[:paths]
332
+ if all_paths.empty?
333
+ path = []
334
+ else
335
+ path = all_paths.select{|pt| pt.length == path_length}.first.clone
336
+ if level > 0 # we want the term and his ascendants until a specific level
337
+ n_parents = path_length - level
338
+ path = path[0..n_parents]
542
339
  end
340
+ path.shift # Discard the term itself
543
341
  end
544
342
  end
545
- @stanzas[:terms].merge!(alt_ids2add)
343
+ return path
546
344
  end
547
345
 
346
+ # ID Handlers
347
+ ####################################
548
348
 
549
- # Executes basic expansions of tags (alternatives, obsoletes and parentals) with default values
550
349
  # ===== Returns
551
- # true if eprocess ends without errors and false in other cases
552
- def build_index()
553
- self.get_index_obsoletes
554
- self.get_index_alternatives
555
- self.get_index_child_parent_relations
556
- @alternatives_index.map{|k,v| @alternatives_index[k] = self.extract_id(v)}
557
- ## @alternatives_index.map {|k,v| @alternatives_index[k] = self.stanzas[:terms][v][:id] if k == v} unless self.stanzas[:terms].empty?
558
- @alternatives_index.compact!
559
- @obsoletes_index.map{|k,v| @obsoletes_index[k] = self.extract_id(v)}
560
- @obsoletes_index.compact!
561
- @ancestors_index.map{|k,v| @ancestors_index[k] = v.map{|t| self.extract_id(t)}.compact}
562
- @ancestors_index.compact!
563
- @descendants_index.map{|k,v| @descendants_index[k] = v.map{|t| self.extract_id(t)}.compact}
564
- @descendants_index.compact!
565
- self.get_index_frequencies
566
- self.calc_dictionary(:name)
567
- self.calc_dictionary(:synonym, select_regex: /\"(.*)\"/)
568
- self.calc_term_levels(calc_paths: true)
350
+ # the main ID assigned to a given ID. If it's a non alternative/obsolete ID itself will be returned
351
+ # ===== Parameters
352
+ # +id+:: to be translated
353
+ # ===== Return
354
+ # main ID related to a given ID. Returns nil if given ID is not an allowed ID
355
+ def get_main_id(id)
356
+ mainID = @alternatives_index[id]
357
+ return nil if !term_exist?(id) && mainID.nil?
358
+ if !mainID.nil? # Recursive code to get the definitive final term id if there are several alt_id in chain
359
+ new_id = get_main_id(mainID)
360
+ if new_id != mainID
361
+ new_id = get_main_id(new_id)
362
+ end
363
+ id = new_id
364
+ end
365
+ return id
569
366
  end
570
367
 
571
-
572
- # Calculates regular frequencies based on ontology structure (using parentals)
573
- # ===== Returns
574
- # true if everything end without errors and false in other cases
575
- def get_index_frequencies()
576
- # Check
577
- if @ancestors_index.empty?
578
- warn('ancestors_index object is empty')
579
- else
580
- # Per each term, add frequencies
581
- @stanzas[:terms].each do |id, tags|
582
- if @alternatives_index.include?(id)
583
- alt_id = @alternatives_index[id]
584
- query = @meta[alt_id] # Check if exist
585
- if query.nil?
586
- query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
587
- @meta[alt_id] = query
588
- end
589
- @meta[id] = query
590
- # Note: alternative terms do not increase structural frequencies
591
- else # Official term
592
- query = @meta[id] # Check if exist
593
- if query.nil?
594
- query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
595
- @meta[id] = query
596
- end
597
- # Store metadata
598
- query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !@alternatives_index.include?(anc)}.to_f : 0.0
599
- query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !@alternatives_index.include?(desc)}.to_f : 0.0
600
- query[:struct_freq] = query[:descendants] + 1.0
601
- # Update maximums
602
- @max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
603
- @max_freqs[:max_depth] = query[:descendants] if @max_freqs[:max_depth] < query[:descendants]
604
- end
605
- end
606
- end
368
+ # Translate a given value using an already calcualted dictionary
369
+ # ===== Parameters
370
+ # +toTranslate+:: value to be translated using dictiontionary
371
+ # +tag+:: used to generate the dictionary
372
+ # +byValue+:: boolean flag to indicate if dictionary must be used values as keys or terms as keys. Default: values as keys = true
373
+ # ===== Return
374
+ # translation
375
+ def translate(toTranslate, tag, byValue: true)
376
+ dict = byValue ? @dicts[tag][:byValue] : @dicts[tag][:byTerm]
377
+ toTranslate = get_main_id(toTranslate) if !byValue
378
+ return dict[toTranslate]
607
379
  end
608
380
 
609
-
610
- # Expand obsoletes set and link info to their alternative IDs
381
+ # Translate a name given
611
382
  # ===== Parameters
612
- # +obs_tags+:: tags to be used to find obsoletes
613
- # +alt_tags+:: tags to find alternative IDs (if are available)
614
- # +reset_obsoletes+:: flag to indicate if obsoletes set must be reset. Default: true
615
- # ===== Returns
616
- # true if process ends without errors and false in other cases
617
- def get_index_obsoletes(obs_tag: @@basic_tags[:obsolete], alt_tags: @@basic_tags[:alternative])
618
- if @stanzas[:terms].empty?
619
- warn('stanzas terms empty')
620
- else
621
- # Check obsoletes
622
- @stanzas[:terms].each do |id, term_tags|
623
- next if term_tags.nil?
624
- next if self.is_alternative?(id)
625
- query = term_tags[obs_tag]
626
- if !query.nil? && query == 'true' # Obsolete tag presence
627
- next if !@obsoletes_index[id].nil? # Already stored
628
- # Check if alternative value is available
629
- alt_ids = alt_tags.map{|alt| term_tags[alt]}.compact
630
- if !alt_ids.empty?
631
- alt_id = alt_ids.first.first #FIRST tag, FIRST id
632
- # Store
633
- @alternatives_index[id] = alt_id
634
- @obsoletes_index[id] = alt_id
635
- end
636
- end
637
- end
638
- end
383
+ # +name+:: to be translated
384
+ # ===== Return
385
+ # translated name or nil if it's not stored into this ontology
386
+ def translate_name(name)
387
+ term = self.translate(name, :name)
388
+ term = self.translate(name, :synonym) if term.nil?
389
+ return term
639
390
  end
640
391
 
641
-
642
- # Expand parentals set and link all info to their alternative IDs. Also launch frequencies process
392
+ # Translates a given ID to it assigned name
643
393
  # ===== Parameters
644
- # +tag+:: tag used to expand parentals
645
- # +split_info_char+:: special regex used to split info (if it is necessary)
646
- # +split_info_indx+:: special index to take splitted info (if it is necessary)
647
- # ===== Returns
648
- # true if process ends without errors and false in other cases
649
- def get_index_child_parent_relations(tag: @@basic_tags[:ancestors][0])
650
- # Check
651
- if @stanzas[:terms].nil?
652
- warn('stanzas terms empty')
653
- else
654
- # Expand
655
- structType, parentals = self.class.get_related_ids_by_tag(terms: @stanzas[:terms],
656
- target_tag: tag,
657
- alt_ids: @alternatives_index,
658
- obsoletes: @obsoletes_index.length)
659
- # Check
660
- raise('Error expanding parentals') if (structType.nil?) || parentals.nil?
661
- # Prepare ancestors structure
662
- anc = {}
663
- des = {}
664
- parentals.each do |id, parents|
665
- parents = parents - @removable_terms
666
- anc[id] = parents
667
- parents.each do |anc_id| # Add descendants
668
- if !des.include?(anc_id)
669
- des[anc_id] = [id]
670
- else
671
- des[anc_id] << id
672
- end
673
- end
674
- end
675
- # Store alternatives
676
- # @alternatives_index.each do |id,alt|
677
- # anc[id] = anc[alt] if anc.include?(alt)
678
- # des[id] = des[alt] if des.include?(alt)
679
- # end
680
- # Check structure
681
- if ![:atomic,:sparse].include? structType
682
- structType = structType == :circular ? :circular : :hierarchical
683
- end
684
- # Store
685
- @ancestors_index = anc
686
- @descendants_index = des
687
- @structureType = structType
688
- end
689
- # Finish
394
+ # +id+:: to be translated
395
+ # ===== Return
396
+ # main name or nil if it's not included into this ontology
397
+ def translate_id(id)
398
+ name = self.translate(id, :name, byValue: false)
399
+ return name.nil? ? nil : name.first
690
400
  end
691
401
 
402
+ # Get term frequency and information
403
+ ####################################
692
404
 
693
- # Find ancestors of a given term
405
+ # One single term #
406
+
407
+ # Get a term frequency
694
408
  # ===== Parameters
695
- # +term+:: to be checked
696
- # +filter_alternatives+:: if true, remove alternatives from final results
409
+ # +term+:: term to be checked
410
+ # +type+:: type of frequency to be returned. Allowed: [:struct_freq, :observed_freq]
697
411
  # ===== Returns
698
- # an array with all ancestors of given term or false if parents are not available yet
699
- def get_ancestors(term, filter_alternatives = false)
700
- return self.get_familiar(term, true, filter_alternatives)
412
+ # frequency of term given or nil if term is not allowed
413
+ def get_frequency(term, type: :struct_freq)
414
+ queryFreq = @meta[term]
415
+ return queryFreq.nil? ? nil : queryFreq[type]
701
416
  end
702
417
 
703
-
704
- # Find descendants of a given term
418
+ # Geys structural frequency of a term given
705
419
  # ===== Parameters
706
420
  # +term+:: to be checked
707
- # +filter_alternatives+:: if true, remove alternatives from final results
708
421
  # ===== Returns
709
- # an array with all descendants of given term or false if parents are not available yet
710
- def get_descendants(term, filter_alternatives = false)
711
- return self.get_familiar(term, false, filter_alternatives)
422
+ # structural frequency of given term or nil if term is not allowed
423
+ def get_structural_frequency(term)
424
+ return self.get_frequency(term, type: :struct_freq)
712
425
  end
713
426
 
714
-
715
- # Find ancestors/descendants of a given term
427
+ # Gets observed frequency of a term given
716
428
  # ===== Parameters
717
429
  # +term+:: to be checked
718
- # +return_ancestors+:: return ancestors if true or descendants if false
719
- # +filter_alternatives+:: if true, remove alternatives from final results
720
430
  # ===== Returns
721
- # an array with all ancestors/descendants of given term or nil if parents are not available yet
722
- def get_familiar(term, return_ancestors = true, filter_alternatives = false)
723
- # Find into parentals
724
- familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
725
- if !familiars.nil?
726
- familiars = familiars.clone
727
- if filter_alternatives
728
- familiars.reject!{|fm| @alternatives_index.include?(fm)}
729
- end
730
- else
731
- familiars = []
732
- end
733
- return familiars
431
+ # observed frequency of given term or nil if term is not allowed
432
+ def get_observed_frequency(term)
433
+ return self.get_frequency(term, type: :observed_freq)
734
434
  end
735
435
 
736
-
737
436
  # Obtain IC of an specific term
738
437
  # ===== Parameters
739
438
  # +term+:: which IC will be calculated
@@ -787,7 +486,7 @@ class Ontology
787
486
  ###########################################
788
487
  when :seco, :zhou # SECO:: An intrinsic information content metric for semantic similarity in WordNet
789
488
  # 1 - ( log(hypo(x) + 1) / log(max_nodes) )
790
- ic = 1 - Math.log10(term_meta[:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
489
+ ic = 1 - Math.log10(term_meta[:struct_freq]).fdiv(Math.log10(@terms.length))
791
490
  if :zhou # New Model of Semantic Similarity Measuring in Wordnet
792
491
  # k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
793
492
  @ics[:seco][term] = ic # Special store
@@ -801,40 +500,25 @@ class Ontology
801
500
  return ic
802
501
  end
803
502
 
503
+ # Term vs Term #
804
504
 
805
- # Calculates and return resnik ICs (by ontology and observed frequency) for observed terms
806
- # ===== Returns
807
- # two hashes with resnik and resnik_observed ICs for observed terms
808
- def get_observed_ics_by_onto_and_freq
809
- # Chech there are observed terms
810
- if @profiles.empty?
811
- resnik = {}
812
- resnik_observed = {}
813
- else
814
- # Calc ICs for all terms
815
- observed_terms = @profiles.values.flatten.uniq
816
- observed_terms.each{ |term| get_IC(term)}
817
- observed_terms.each{ |term| get_IC(term, type: :resnik_observed)}
818
- resnik = @ics[:resnik].select{|k,v| observed_terms.include?(k)}
819
- resnik_observed = @ics[:resnik_observed].select{|k,v| observed_terms.include?(k)}
505
+ def get_LCA(termA, termB, lca_index: false)
506
+ lca = []
507
+ if lca_index
508
+ res = @lca_index.dig(termA, termB)
509
+ lca = [res] if !res.nil?
510
+ else # Obtain ancestors (include itselfs too)
511
+ anc_A = self.get_ancestors(termA)
512
+ anc_B = self.get_ancestors(termB)
513
+ if !(anc_A.empty? && anc_B.empty?)
514
+ anc_A << termA
515
+ anc_B << termB
516
+ lca = anc_A & anc_B
517
+ end
820
518
  end
821
- return resnik.clone, resnik_observed.clone
822
- end
823
-
824
-
825
- # Find the IC of the Most Index Content shared Ancestor (MICA) of two given terms
826
- # ===== Parameters
827
- # +termA+:: term to be cheked
828
- # +termB+:: term to be checked
829
- # +ic_type+:: IC formula to be used
830
- # ===== Returns
831
- # the IC of the MICA(termA,termB)
832
- def get_ICMICA(termA, termB, ic_type = :resnik)
833
- term, ic = self.get_MICA(termA, termB, ic_type)
834
- return term.nil? ? nil : ic
519
+ return lca
835
520
  end
836
521
 
837
-
838
522
  # Find the Most Index Content shared Ancestor (MICA) of two given terms
839
523
  # ===== Parameters
840
524
  # +termA+:: term to be cheked
@@ -842,30 +526,31 @@ class Ontology
842
526
  # +ic_type+:: IC formula to be used
843
527
  # ===== Returns
844
528
  # the MICA(termA,termB) and it's IC
845
- def get_MICA(termA, termB, ic_type = :resnik)
846
- termA = @alternatives_index[termA] if @alternatives_index.include?(termA)
847
- termB = @alternatives_index[termB] if @alternatives_index.include?(termB)
529
+ def get_MICA(termA, termB, ic_type = :resnik, lca_index = false)
848
530
  mica = [nil,-1.0]
849
- # Special case
850
- if termA.eql?(termB)
531
+ if termA.eql?(termB) # Special case
851
532
  ic = self.get_IC(termA, type: ic_type)
852
533
  mica = [termA, ic]
853
- else
854
- # Obtain ancestors (include itselfs too)
855
- anc_A = self.get_ancestors(termA)
856
- anc_B = self.get_ancestors(termB)
857
- if !(anc_A.empty? && anc_B.empty?)
858
- anc_A << termA
859
- anc_B << termB
860
- (anc_A & anc_B).each do |anc| # Find MICA in shared ancestors
861
- ic = self.get_IC(anc, type: ic_type)
862
- mica = [anc,ic] if ic > mica[1]
863
- end
534
+ else
535
+ get_LCA(termA, termB, lca_index: lca_index).each do |lca| # Find MICA in shared ancestors
536
+ ic = self.get_IC(lca, type: ic_type)
537
+ mica = [lca, ic] if ic > mica[1]
864
538
  end
865
539
  end
866
540
  return mica
867
541
  end
868
542
 
543
+ # Find the IC of the Most Index Content shared Ancestor (MICA) of two given terms
544
+ # ===== Parameters
545
+ # +termA+:: term to be cheked
546
+ # +termB+:: term to be checked
547
+ # +ic_type+:: IC formula to be used
548
+ # ===== Returns
549
+ # the IC of the MICA(termA,termB)
550
+ def get_ICMICA(termA, termB, ic_type = :resnik)
551
+ term, ic = self.get_MICA(termA, termB, ic_type)
552
+ return term.nil? ? nil : ic
553
+ end
869
554
 
870
555
  # Calculate similarity between two given terms
871
556
  # ===== Parameters
@@ -875,11 +560,10 @@ class Ontology
875
560
  # +ic_type+:: IC formula to be used
876
561
  # ===== Returns
877
562
  # the similarity between both sets or false if frequencies are not available yet
878
- def get_similarity(termA, termB, type: :resnik, ic_type: :resnik)
879
- # Check
563
+ def get_similarity(termA, termB, type: :resnik, ic_type: :resnik, lca_index: false)
880
564
  raise ArgumentError, "SIM type specified (#{type}) is not allowed" if !@@allowed_calcs[:sims].include?(type)
881
565
  sim = nil
882
- mica, sim_res = get_MICA(termA, termB, ic_type)
566
+ mica, sim_res = get_MICA(termA, termB, ic_type, lca_index)
883
567
  if !mica.nil?
884
568
  case type
885
569
  when :resnik
@@ -893,1568 +577,1027 @@ class Ontology
893
577
  return sim
894
578
  end
895
579
 
580
+ # Checking valid terms
581
+ ####################################
896
582
 
897
- # Method used to load information stored into an OBO file and store it into this object.
898
- # If a file is specified by input parameter, current @file value is updated
899
- # ===== Parameters
900
- # +file+:: optional file to update object stored file
901
- def load(file, build: true)
902
- _, header, stanzas = self.class.load_obo(file)
903
- @header = header
904
- @stanzas = stanzas
905
- self.remove_removable()
906
- # @removable_terms.each{|removableID| @stanzas[:terms].delete(removableID)} if !@removable_terms.empty? # Remove if proceed
907
- self.build_index() if build
583
+ def term_exist?(id)
584
+ return @terms.include?(id)
908
585
  end
909
586
 
910
- #
911
- def remove_removable()
912
- @removable_terms.each{|removableID| @stanzas[:terms].delete(removableID)} if !@removable_terms.empty? # Remove if proceed
587
+ # Check if a term given is marked as obsolete
588
+ def is_obsolete?(term)
589
+ return @obsoletes.include?(term)
913
590
  end
914
591
 
592
+ #############################################
593
+ # ITEMS METHODS
594
+ #############################################
915
595
 
916
- # Exports an OBO_Handler object in json format
596
+ # I/O Items
597
+ ####################################
598
+
599
+ # Store specific relations hash given into ITEMS structure
917
600
  # ===== Parameters
918
- # +file+:: where info will be stored
919
- def write(file)
920
- # Take object stored info
921
- obj_info = {header: @header,
922
- stanzas: @stanzas,
923
- ancestors_index: @ancestors_index,
924
- descendants_index: @descendants_index,
925
- alternatives_index: @alternatives_index,
926
- obsoletes_index: @obsoletes_index,
927
- structureType: @structureType,
928
- ics: @ics,
929
- meta: @meta,
930
- special_tags: @special_tags,
931
- max_freqs: @max_freqs,
932
- dicts: @dicts,
933
- profiles: @profiles,
934
- profilesDict: @profilesDict,
935
- items: @items,
936
- removable_terms: @removable_terms,
937
- term_paths: @term_paths}
938
- # Convert to JSON format & write
939
- File.open(file, "w") { |f| f.write obj_info.to_json }
940
- end
601
+ # +relations+:: hash to be stored
602
+ # +remove_old_relations+:: substitute ITEMS structure instead of merge new relations
603
+ # +expand+:: if true, already stored keys will be updated with the unique union of both sets
604
+ def load_item_relations_to_terms(relations, remove_old_relations = false, expand = false)
605
+ @items = {} if remove_old_relations
606
+ relations.each do |term, items|
607
+ if !term_exist?(term)
608
+ warn('Some terms specified are not stored into this ontology. These not correct terms will be stored too')
609
+ break
610
+ end
611
+ end
612
+ if expand
613
+ @items = self.concatItems(@items, relations)
614
+ else
615
+ @items.merge!(relations)
616
+ end
617
+ end
941
618
 
619
+ # Defining Items from instance variables
620
+ ########################################
942
621
 
943
- def is_number? string
944
- true if Float(string) rescue false
622
+ # Assign a dictionary already calculated as a items set.
623
+ # ===== Parameters
624
+ # +dictID+:: dictionary ID to be stored (:byTerm will be used)
625
+ def set_items_from_dict(dictID, remove_old_relations = false)
626
+ @items = {} if remove_old_relations
627
+ query = @dicts[dictID]
628
+ if !query.nil?
629
+ @items.merge!(query[:byTerm])
630
+ else
631
+ warn('Specified ID is not calculated. Dict will not be added as a items set')
632
+ end
945
633
  end
946
634
 
947
-
948
- # Read a JSON file with an OBO_Handler object stored
635
+ # Get related profiles to a given term
949
636
  # ===== Parameters
950
- # +file+:: with object info
951
- # +file+:: if true, calculate indexes. Default: true
952
- # ===== Return
953
- # OBO_Handler internal fields
954
- def read(file, build: true)
955
- # Read file
956
- jsonFile = File.open(file)
957
- jsonInfo = JSON.parse(jsonFile.read, :symbolize_names => true)
958
- # Pre-process (Symbolize some hashs values)
959
- if !jsonInfo[:header].nil?
960
- aux = jsonInfo[:header].map do |entry,info|
961
- if info.kind_of?(Array) && @@symbolizable_ids.include?(entry)
962
- [entry,info.map{|item| item.to_sym}]
963
- else
964
- [entry,info]
965
- end
966
- end
967
- jsonInfo[:header] = aux.to_h
968
- end
969
- jsonInfo[:stanzas][:terms].map{|id,info| self.class.symbolize_ids(info)} # STANZAS
970
- jsonInfo[:stanzas][:typedefs].map{|id,info| self.class.symbolize_ids(info)}
971
- jsonInfo[:stanzas][:instances].map{|id,info| self.class.symbolize_ids(info)}
972
- # Optional
973
- jsonInfo[:alternatives_index] = jsonInfo[:alternatives_index].map{|id,value| [id, value.to_sym]}.to_h unless jsonInfo[:alternatives_index].nil?
974
- jsonInfo[:ancestors_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}} unless jsonInfo[:ancestors_index].nil?
975
- jsonInfo[:descendants_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}} unless jsonInfo[:descendants_index].nil?
976
- jsonInfo[:obsoletes_index] = jsonInfo[:obsoletes_index].map{|id,value| [id, value.to_sym]}.to_h unless jsonInfo[:obsoletes_index].nil?
977
- jsonInfo[:dicts] = jsonInfo[:dicts].each do |flag, dictionaries|
978
- next if dictionaries.nil?
979
- # Special case: byTerm
980
- dictionaries[:byTerm] = dictionaries[:byTerm].map do |term, value|
981
- if !term.to_s.scan(/\A[-+]?[0-9]*\.?[0-9]+\Z/).empty? # Numeric dictionary
982
- [term.to_s.to_i, value.map{|term| term.to_sym}]
983
- elsif value.is_a? Numeric # Numeric dictionary
984
- [term.to_sym, value]
985
- elsif value.kind_of?(Array) && flag == :is_a
986
- [term.to_sym, value.map{|v| v.to_sym}]
987
- else
988
- [term.to_sym, value]
989
- end
990
- end
991
- dictionaries[:byTerm] = dictionaries[:byTerm].to_h
992
- # By value
993
- dictionaries[:byValue] = dictionaries[:byValue].map do |value, term|
994
- if value.is_a? Numeric # Numeric dictionary
995
- [value, term.to_sym]
996
- elsif term.is_a? Numeric # Numeric dictionary
997
- [value.to_s.to_sym, term]
998
- elsif flag == :is_a
999
- [value.to_sym, term.map{|v| v.to_sym}]
1000
- elsif term.kind_of?(Array)
1001
- [value.to_sym, term.map{|t| t.to_sym}]
1002
- else
1003
- [value.to_s, term.to_sym]
1004
- end
1005
- end
1006
- dictionaries[:byValue] = dictionaries[:byValue].to_h
1007
- end
1008
- if !jsonInfo[:profiles].nil?
1009
- jsonInfo[:profiles].map{|id,terms| terms.map!{|term| term.to_sym}}
1010
- jsonInfo[:profiles].keys.map{|id| jsonInfo[:profiles][id.to_s.to_i] = jsonInfo[:profiles].delete(id) if self.is_number?(id.to_s)}
1011
- end
1012
- jsonInfo[:profilesDict].map{|term,ids| ids.map!{|id| id.to_sym if !id.is_a?(Numeric)}} unless jsonInfo[:profilesDict].nil?
1013
- jsonInfo[:removable_terms] = jsonInfo[:removable_terms].map{|term| term.to_sym} unless jsonInfo[:removable_terms].nil?
1014
- jsonInfo[:special_tags] = jsonInfo[:special_tags].each do |k, v|
1015
- next if v.nil?
1016
- if v.kind_of?(Array)
1017
- jsonInfo[:special_tags][k] = v.map{|tag| tag.to_sym}
1018
- else
1019
- jsonInfo[:special_tags][k] = v.to_sym
1020
- end
637
+ # +term+:: to be checked
638
+ # ===== Returns
639
+ # profiles which contains given term
640
+ def get_items_from_term(term)
641
+ return @items[term]
642
+ end
643
+
644
+ # For each term in profiles add the ids in the items term-id dictionary
645
+ def get_items_from_profiles
646
+ @profiles.each do |id, terms|
647
+ terms.each {|term| add2hash(@items, term, id) }
1021
648
  end
1022
- jsonInfo[:items].each{|k,v| jsonInfo[:items][k] = v.map{|item| item.to_sym}} unless jsonInfo[:items].nil?
1023
- jsonInfo[:term_paths].each{|term,info| jsonInfo[:term_paths][term][:paths] = info[:paths].map{|path| path.map{|t| t.to_sym}}} unless jsonInfo[:term_paths].nil?
1024
-
1025
- # Store info
1026
- @header = jsonInfo[:header]
1027
- @stanzas = jsonInfo[:stanzas]
1028
- @ancestors_index = jsonInfo[:ancestors_index]
1029
- @descendants_index = jsonInfo[:descendants_index]
1030
- @alternatives_index = jsonInfo[:alternatives_index]
1031
- @obsoletes_index = jsonInfo[:obsoletes_index]
1032
- jsonInfo[:structureType] = jsonInfo[:structureType].to_sym unless jsonInfo[:structureType].nil?
1033
- @structureType = jsonInfo[:structureType]
1034
- @ics = jsonInfo[:ics]
1035
- @meta = jsonInfo[:meta]
1036
- @special_tags = jsonInfo[:special_tags]
1037
- @max_freqs = jsonInfo[:max_freqs]
1038
- @dicts = jsonInfo[:dicts]
1039
- @profiles = jsonInfo[:profiles]
1040
- @profilesDict = jsonInfo[:profilesDict]
1041
- @items = jsonInfo[:items]
1042
- @removable_terms = jsonInfo[:removable_terms]
1043
- @term_paths = jsonInfo[:term_paths]
1044
-
1045
- self.build_index() if build
1046
- end
1047
-
1048
-
1049
- # Check if a given ID is stored as term into this object
1050
- # ===== Parameters
1051
- # +id+:: to be checked
1052
- # ===== Return
1053
- # True if term is allowed or false in other cases
1054
- def exists? id
1055
- return stanzas[:terms].include?(id)
1056
649
  end
1057
650
 
651
+ # Defining instance variables from items
652
+ ########################################
1058
653
 
1059
- # This method assumes that a text given contains an allowed ID. And will try to obtain it splitting it
1060
- # ===== Parameters
1061
- # +text+:: to be checked
1062
- # ===== Return
1063
- # The correct ID if it can be found or nil in other cases
1064
- def extract_id(text, splitBy: ' ')
1065
- if self.exists?(text)
1066
- return text
1067
- else
1068
- splittedText = text.to_s.split(splitBy).first.to_sym
1069
- return self.exists?(splittedText) ? splittedText : nil
654
+ def get_profiles_from_items
655
+ new_profiles = {}
656
+ @items.each do |term, ids|
657
+ ids.each{|id| add2hash(new_profiles, id, term) }
1070
658
  end
659
+ @profiles = new_profiles
1071
660
  end
1072
661
 
662
+ # Expanding items
663
+ ####################################
1073
664
 
1074
- # Generate a bidirectinal dictionary set using a specific tag and terms stanzas set
1075
- # This functions stores calculated dictionary into @dicts field.
1076
- # This functions stores first value for multivalue tags
1077
- # This function does not handle synonyms for byValue dictionaries
665
+ # This method computes childs similarity and impute items to it parentals. To do that Item keys must be this ontology allowed terms.
666
+ # Similarity will be calculated by text extact similarity unless an ontology object will be provided. In this case, MICAs will be used
1078
667
  # ===== Parameters
1079
- # +tag+:: to be used to calculate dictionary
1080
- # +select_regex+:: gives a regfex that can be used to modify value to be stored
1081
- # +substitute_alternatives+:: flag used to indicate if alternatives must, or not, be replaced by it official ID
1082
- # +store_tag+:: flag used to store dictionary. If nil, mandatory tag given will be used
1083
- # +multiterm+:: if true, byValue will allows multi-term linkage (array)
1084
- # +self_type_references+:: if true, program assumes that refrences will be between Ontology terms, and it term IDs will be checked
1085
- # ===== Return
1086
- # void. And stores calcualted bidirectional dictonary into dictionaries main container
1087
- def calc_dictionary(tag, select_regex: nil, substitute_alternatives: true, store_tag: nil, multiterm: false, self_type_references: false)
1088
- tag = tag.to_sym
1089
- store_tag = tag if store_tag.nil?
1090
- if @stanzas[:terms].empty?
1091
- warn('Terms are not already loaded. Aborting dictionary calc')
1092
- else
1093
- byTerm = {}
1094
- byValue = {}
1095
- # Calc per term
1096
- @stanzas[:terms].each do |term, tags|
1097
- referenceTerm = term
1098
- if @alternatives_index.include?(term) && substitute_alternatives # Special case
1099
- referenceTerm = @alternatives_index[term] if !@obsoletes_index.include?(@alternatives_index[term])
1100
- end
1101
- queryTag = tags[tag]
1102
- if !queryTag.nil?
1103
- # Pre-process
1104
- if !select_regex.nil?
1105
- if queryTag.kind_of?(Array)
1106
- queryTag = queryTag.map{|value| value.scan(select_regex).first}
1107
- queryTag.flatten!
1108
- else
1109
- queryTag = queryTag.scan(select_regex).first
1110
- end
1111
- queryTag.compact!
1112
- end
1113
- if queryTag.kind_of?(Array) # Store
1114
- if !queryTag.empty?
1115
- if byTerm.include?(referenceTerm)
1116
- byTerm[referenceTerm] = (byTerm[referenceTerm] + queryTag).uniq
1117
- else
1118
- byTerm[referenceTerm] = queryTag
668
+ # +ontology+:: (Optional) ontology object which items given belongs
669
+ # +minimum_childs+:: minimum of childs needed to infer relations to parental. Default: 2
670
+ # +clean_profiles+:: if true, clena_profiles ontology method will be used over inferred profiles. Only if an ontology object is provided
671
+ # ===== Returns
672
+ # void and update items object
673
+ def expand_items_to_parentals(ontology: nil, minimum_childs: 2, clean_profiles: true)
674
+ targetKeys = expand_profile_with_parents(@items.keys)
675
+ terms_per_level = list_terms_per_level(targetKeys)
676
+ terms_per_level = terms_per_level.to_a.sort{|l1, l2| l1.first <=> l2.first} # Obtain sorted levels
677
+ terms_per_level.pop # Leaves are not expandable # FRED: Thats comment could be not true
678
+
679
+ terms_per_level.reverse_each do |lvl, terms| # Expand from leaves to roots
680
+ terms.each do |term|
681
+ childs = self.get_descendants(term).select{|t| @items.include?(t)} # Get child with items
682
+ next if childs.length < minimum_childs
683
+ propagated_item_count = Hash.new(0)
684
+ if ontology.nil? # Count how many times is presented an item in childs
685
+ childs.each do |child|
686
+ @items[child].each{|i| propagated_item_count[i] += 1}
687
+ end
688
+ else # Count take into account similarity between terms in other ontology. Not pretty clear the full logic
689
+ while childs.length > 1
690
+ curr_term = childs.shift
691
+ childs.each do |child|
692
+ maxmica_counts = Hash.new(0)
693
+ curr_items = @items[curr_term]
694
+ child_items = @items[child]
695
+ curr_items.each do |item|
696
+ maxmica = ontology.get_maxmica_term2profile(item, child_items)
697
+ maxmica_counts[maxmica.first] += 1
1119
698
  end
1120
- if multiterm
1121
- queryTag.each do |value|
1122
- byValue[value] = [] if byValue[value].nil?
1123
- byValue[value] << referenceTerm
1124
- end
1125
- else
1126
- queryTag.each{|value| byValue[value] = referenceTerm}
699
+ child_items.each do |item|
700
+ maxmica = ontology.get_maxmica_term2profile(item, curr_items)
701
+ maxmica_counts[maxmica.first] += 1
1127
702
  end
1128
- end
1129
- else
1130
- if byTerm.include?(referenceTerm)
1131
- byTerm[referenceTerm] = (byTerm[referenceTerm] + [queryTag]).uniq
1132
- else
1133
- byTerm[referenceTerm] = [queryTag]
1134
- end
1135
- if multiterm
1136
- byValue[queryTag] = [] if byValue[queryTag].nil?
1137
- byValue[queryTag] << referenceTerm
1138
- else
1139
- byValue[queryTag] = referenceTerm
703
+ maxmica_counts.each{|t,freq| propagated_item_count[t] += freq if freq >= 2} #TODO: Maybe need Division by 2 due to the calculation of mica two times but test fails.
704
+ # FRED: Maybe for the childs.shift there is uniqueness
1140
705
  end
1141
706
  end
1142
707
  end
1143
- end
1144
-
1145
- # Check self-references
1146
- if self_type_references
1147
- byTerm.map do |term, references|
1148
- corrected_references = references.map do |t|
1149
- checked = self.extract_id(t)
1150
- if checked.nil?
1151
- t
1152
- else
1153
- byValue[checked] = byValue.delete(t) if checked != t && byValue[checked].nil? # Update in byValue
1154
- checked
1155
- end
708
+ propagated_items = propagated_item_count.select{|k,v| v >= minimum_childs}.keys
709
+ if propagated_items.length > 0
710
+ query = @items[term]
711
+ if query.nil?
712
+ @items[term] = propagated_items
713
+ else
714
+ terms = @items[term] | propagated_items
715
+ terms = ontology.clean_profile(terms) if clean_profiles && !ontology.nil?
716
+ @items[term] = terms
1156
717
  end
1157
- byTerm[term] = corrected_references.uniq
1158
718
  end
1159
719
  end
720
+ end
721
+ end
1160
722
 
1161
- # Check order
1162
- byTerm.map do |term,values|
1163
- if self.exists?(term)
1164
- referenceValue = @stanzas[:terms][term][tag]
1165
- if !referenceValue.nil?
1166
- if !select_regex.nil?
1167
- if referenceValue.kind_of?(Array)
1168
- referenceValue = referenceValue.map{|value| value.scan(select_regex).first}
1169
- referenceValue.flatten!
1170
- else
1171
- referenceValue = referenceValue.scan(select_regex).first
1172
- end
1173
- referenceValue.compact!
1174
- end
1175
- if self_type_references
1176
- if referenceValue.kind_of?(Array)
1177
- aux = referenceValue.map{|t| self.extract_id(t)}
1178
- else
1179
- aux = self.extract_id(referenceValue)
1180
- end
1181
- aux.compact! unless aux.nil?
1182
- referenceValue = aux unless aux.nil?
1183
- end
1184
- referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
1185
- byTerm[term] = referenceValue + (values - referenceValue)
1186
- end
1187
- end
1188
- end
723
+ # Compute modified fisher between terms and items based on topgo methodology. Refactor to use all the possible methods of this class
724
+ #-------------------------------------------------------------------------------------------------------------------------------------
1189
725
 
1190
- # Store
1191
- @dicts[store_tag] = {byTerm: byTerm, byValue: byValue}
726
+ def compute_relations_to_items(external_item_list, total_items, mode, thresold) # NEED TEST, check with PSZ how to maintain these methods
727
+ terms_levels = list_terms_per_level_from_items
728
+ connect_familiars!(terms_levels)
729
+ item_list_with_transf_parental = get_item_list_parental(terms_levels)
730
+ results = []
731
+ if mode == :elim
732
+ results = compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list_with_transf_parental)
733
+ elsif mode == :weight
734
+ results = compute_relations_weight(terms_levels, external_item_list, total_items, item_list_with_transf_parental)
1192
735
  end
736
+ return results
1193
737
  end
1194
738
 
1195
-
1196
- # Calculates :is_a dictionary without alternatives substitution
1197
- def calc_ancestors_dictionary
1198
- self.calc_dictionary(:is_a, substitute_alternatives: false, self_type_references: true, multiterm: true)
739
+ def list_terms_per_level_from_items
740
+ return list_terms_per_level(@items.keys)
1199
741
  end
1200
742
 
743
+ def list_terms_per_level(terms)
744
+ terms_levels = {}
745
+ terms.each do |term|
746
+ level = self.get_term_level(term)
747
+ add2hash(terms_levels, level, term)
748
+ end
749
+ return terms_levels
750
+ end
1201
751
 
1202
- # Translate a given value using an already calcualted dictionary
1203
- # ===== Parameters
1204
- # +toTranslate+:: value to be translated using dictiontionary
1205
- # +tag+:: used to generate the dictionary
1206
- # +byValue+:: boolean flag to indicate if dictionary must be used values as keys or terms as keys. Default: values as keys = true
1207
- # ===== Return
1208
- # translation
1209
- def translate(toTranslate, tag, byValue: true)
1210
- dict = byValue ? @dicts[tag][:byValue] : @dicts[tag][:byTerm]
1211
- toTranslate = get_main_id(toTranslate) if !byValue
1212
- return dict[toTranslate]
1213
- end
1214
-
1215
-
1216
- # Translate a name given
1217
- # ===== Parameters
1218
- # +name+:: to be translated
1219
- # ===== Return
1220
- # translated name or nil if it's not stored into this ontology
1221
- def translate_name(name)
1222
- term = self.translate(name, :name)
1223
- term = self.translate(name, :synonym) if term.nil?
1224
- return term
1225
- end
1226
-
1227
-
1228
- # Translate several names and return translations and a list of names which couldn't be translated
1229
- # ===== Parameters
1230
- # +names+:: array to be translated
1231
- # ===== Return
1232
- # two arrays with translations and names which couldn't be translated respectively
1233
- def translate_names(names)
1234
- translated = []
1235
- rejected = []
1236
- names.each do |name|
1237
- tr = self.translate_name(name)
1238
- if tr.nil?
1239
- rejected << name
1240
- else
1241
- translated << tr
1242
- end
1243
- end
1244
- return translated, rejected
1245
- end
1246
-
1247
-
1248
- # Translates a given ID to it assigned name
1249
- # ===== Parameters
1250
- # +id+:: to be translated
1251
- # ===== Return
1252
- # main name or nil if it's not included into this ontology
1253
- def translate_id(id)
1254
- name = self.translate(id, :name, byValue: false)
1255
- return name.nil? ? nil : name.first
1256
- end
1257
-
1258
-
1259
- # Translates several IDs and returns translations and not allowed IDs list
1260
- # ===== Parameters
1261
- # +ids+:: to be translated
1262
- # ===== Return
1263
- # two arrays with translations and names which couldn't be translated respectively
1264
- def translate_ids(ids)
1265
- translated = []
1266
- rejected = []
1267
- ids.each do |term_id|
1268
- tr = self.translate_id(term_id.to_sym)
1269
- if !tr.nil?
1270
- translated << tr
1271
- else
1272
- rejected << tr
1273
- end
1274
- end
1275
- return translated, rejected
1276
- end
1277
-
1278
-
1279
- # ===== Returns
1280
- # the main ID assigned to a given ID. If it's a non alternative/obsolete ID itself will be returned
1281
- # ===== Parameters
1282
- # +id+:: to be translated
1283
- # ===== Return
1284
- # main ID related to a given ID. Returns nil if given ID is not an allowed ID
1285
- def get_main_id(id)
1286
- return nil if !@stanzas[:terms].include? id
1287
- new_id = id
1288
- mainID = @alternatives_index[id]
1289
- new_id = mainID if !mainID.nil? & !@obsoletes_index.include?(mainID)
1290
- return new_id
1291
- end
1292
-
1293
-
1294
- # Check a pull of IDs and return allowed IDs removing which are not official terms on this ontology
1295
- # ===== Parameters
1296
- # +ids+:: to be checked
1297
- # ===== Return
1298
- # two arrays whit allowed and rejected IDs respectively
1299
- def check_ids(ids, substitute: true)
1300
- checked_codes = []
1301
- rejected_codes = []
1302
- ids.each do |id|
1303
- if @stanzas[:terms].include? id
1304
- if substitute
1305
- checked_codes << self.get_main_id(id)
1306
- else
1307
- checked_codes << id
1308
- end
1309
- else
1310
- rejected_codes << id
1311
- end
1312
- end
1313
- return checked_codes, rejected_codes
1314
- end
1315
-
1316
-
1317
- # Stores a given profile with an specific ID. If ID is already assigend to a profile, it will be replaced
1318
- # ===== Parameters
1319
- # +id+:: assigned to profile
1320
- # +terms+:: array of terms
1321
- # +substitute+:: subsstitute flag from check_ids
1322
- def add_profile(id, terms, substitute: true)
1323
- warn("Profile assigned to ID (#{id}) is going to be replaced") if @profiles.include? id
1324
- correct_terms, rejected_terms = self.check_ids(terms, substitute: substitute)
1325
- if !rejected_terms.empty?
1326
- warn('Given terms contains erroneus IDs. These IDs will be removed')
1327
- end
1328
- if id.is_a? Numeric
1329
- @profiles[id] = correct_terms
1330
- else
1331
- @profiles[id.to_sym] = correct_terms
1332
- end
1333
- end
1334
-
1335
-
1336
- # Method used to store a pull of profiles
1337
- # ===== Parameters
1338
- # +profiles+:: array/hash of profiles to be stored. If it's an array, numerical IDs will be assigned starting at 1
1339
- # +calc_metadata+:: if true, launch calc_profiles_dictionary process
1340
- # +reset_stored+:: if true, remove already stored profiles
1341
- # +substitute+:: subsstitute flag from check_ids
1342
- def load_profiles(profiles, calc_metadata: true, reset_stored: false, substitute: false)
1343
- self.reset_profiles if reset_stored
1344
- # Check
1345
- if profiles.kind_of?(Array)
1346
- profiles.each_with_index do |items, i|
1347
- self.add_profile(i, items.map {|item| item.to_sym}, substitute: substitute)
1348
- end
1349
- else # Hash
1350
- if !profiles.keys.select{|id| @profiles.include?(id)}.empty?
1351
- warn('Some profiles given are already stored. Stored version will be replaced')
1352
- end
1353
- profiles.each{|id, prof| self.add_profile(id, prof, substitute: substitute)}
1354
- end
1355
-
1356
- self.add_observed_terms_from_profiles(reset: true)
1357
-
1358
- if calc_metadata
1359
- self.calc_profiles_dictionary
1360
- end
1361
- end
1362
-
1363
-
1364
- # Internal method used to remove already stored profiles and restore observed frequencies
1365
- def reset_profiles
1366
- # Clean profiles storage
1367
- @profiles = {}
1368
- # Reset frequency observed
1369
- @meta.each{|term,info| info[:observed_freq] = 0}
1370
- @max_freqs[:observed_freq] = 0
1371
- end
1372
-
1373
-
1374
- # ===== Returns
1375
- # profiles assigned to a given ID
1376
- # ===== Parameters
1377
- # +id+:: profile ID
1378
- # ===== Return
1379
- # specific profile or nil if it's not stored
1380
- def get_profile(id)
1381
- return @profiles[id]
1382
- end
1383
-
1384
-
1385
- # ===== Returns
1386
- # an array of sizes for all stored profiles
1387
- # ===== Return
1388
- # array of profile sizes
1389
- def get_profiles_sizes()
1390
- return @profiles.map{|id,terms| terms.length}
1391
- end
1392
-
1393
-
1394
- # ===== Returns
1395
- # mean size of stored profiles
1396
- # ===== Parameters
1397
- # +round_digits+:: number of digits to round result. Default: 4
1398
- # ===== Returns
1399
- # mean size of stored profiles
1400
- def get_profiles_mean_size(round_digits: 4)
1401
- sizes = self.get_profiles_sizes
1402
- return sizes.inject(0){|sum, n| sum + n}.fdiv(@profiles.length).round(round_digits)
1403
- end
1404
-
1405
-
1406
- # Calculates profiles sizes and returns size assigned to percentile given
1407
- # ===== Parameters
1408
- # +perc+:: percentile to be returned
1409
- # +increasing_sort+:: flag to indicate if sizes order must be increasing. Default: false
1410
- # ===== Returns
1411
- # values assigned to percentile asked
1412
- def get_profile_length_at_percentile(perc=50, increasing_sort: false)
1413
- prof_lengths = self.get_profiles_sizes.sort
1414
- prof_lengths.reverse! if !increasing_sort
1415
- n_profiles = prof_lengths.length
1416
- percentile_index = ((perc * (n_profiles - 1)).fdiv(100) - 0.5).round # Take length which not overpass percentile selected
1417
- percentile_index = 0 if percentile_index < 0 # Special case (caused by literal calc)
1418
- return prof_lengths[percentile_index]
1419
- end
1420
-
1421
-
1422
- # Translate a given profile to terms names
1423
- # ===== Parameters
1424
- # +prof+:: array of terms to be translated
1425
- # ===== Returns
1426
- # array of translated terms. Can include nils if some IDs are not allowed
1427
- def profile_names(prof)
1428
- return prof.map{|term| self.translate_id(term)}
1429
- end
1430
-
1431
-
1432
- # Trnaslates a bunch of profiles to it sets of term names
1433
- # ===== Parameters
1434
- # +profs+:: array of profiles
1435
- # +asArray+:: flag to indicate if results must be returned as: true => an array of tuples [ProfID, ArrayOdNames] or ; false => hashs of translations
1436
- # ===== Returns
1437
- # translated profiles
1438
- def translate_profiles_ids(profs = [], asArray: true)
1439
- profs = @profiles if profs.empty?
1440
- profs = profs.each_with_index.map{|terms, index| [index, terms]}.to_h if profs.kind_of?(Array)
1441
- profs_names = profs.map{|id, terms| [id, self.profile_names(terms)]}.to_h
1442
- return asArray ? profs_names.values : profs_names
1443
- end
1444
-
1445
-
1446
- # Includes as "observed_terms" all terms included into stored profiles
1447
- # ===== Parameters
1448
- # +reset+:: if true, reset observed freqs alreeady stored befor re-calculate
1449
- def add_observed_terms_from_profiles(reset: false)
1450
- @meta.each{|term, freqs| freqs[:observed_freq] = -1} if reset
1451
- @profiles.each{|id, terms| self.add_observed_terms(terms: terms)}
1452
- end
1453
-
1454
-
1455
- # Get a term frequency
1456
- # ===== Parameters
1457
- # +term+:: term to be checked
1458
- # +type+:: type of frequency to be returned. Allowed: [:struct_freq, :observed_freq]
1459
- # ===== Returns
1460
- # frequency of term given or nil if term is not allowed
1461
- def get_frequency(term, type: :struct_freq)
1462
- queryFreq = @meta[term]
1463
- return queryFreq.nil? ? nil : queryFreq[type]
1464
- end
1465
-
1466
-
1467
- # Geys structural frequency of a term given
1468
- # ===== Parameters
1469
- # +term+:: to be checked
1470
- # ===== Returns
1471
- # structural frequency of given term or nil if term is not allowed
1472
- def get_structural_frequency(term)
1473
- return self.get_frequency(term, type: :struct_freq)
1474
- end
1475
-
1476
-
1477
- # Gets observed frequency of a term given
1478
- # ===== Parameters
1479
- # +term+:: to be checked
1480
- # ===== Returns
1481
- # observed frequency of given term or nil if term is not allowed
1482
- def get_observed_frequency(term)
1483
- return self.get_frequency(term, type: :observed_freq)
1484
- end
1485
-
1486
-
1487
- # Calculates frequencies of stored profiles terms
1488
- # ===== Parameters
1489
- # +ratio+:: if true, frequencies will be returned as ratios between 0 and 1.
1490
- # +literal+:: if true, literal terms will be used to calculate frequencies instead translate alternative terms
1491
- # +asArray+:: used to transform returned structure format from hash of Term-Frequency to an array of tuples [Term, Frequency]
1492
- # +translate+:: if true, term IDs will be translated to
1493
- # ===== Returns
1494
- # stored profiles terms frequencies
1495
- def get_profiles_terms_frequency(ratio: true, literal: true, asArray: true, translate: true)
1496
- n_profiles = @profiles.length
1497
- if literal
1498
- freqs = {}
1499
- @profiles.each do |id, terms|
1500
- terms.each do |literalTerm|
1501
- if freqs.include?(literalTerm)
1502
- freqs[literalTerm] += 1
1503
- else
1504
- freqs[literalTerm] = 1
1505
- end
1506
- end
1507
- end
1508
- if (ratio || translate)
1509
- aux_keys = freqs.keys
1510
- aux_keys.each do |term|
1511
- freqs[term] = freqs[term].fdiv(n_profiles) if ratio
1512
- if translate
1513
- tr = self.translate_id(term)
1514
- freqs[tr] = freqs.delete(term) if !tr.nil?
1515
- end
1516
- end
1517
- end
1518
- if asArray
1519
- freqs = freqs.map{|term, freq| [term, freq]}
1520
- freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
1521
- end
1522
- else # Freqs translating alternatives
1523
- freqs = @meta.select{|id, freqs| freqs[:observed_freq] > 0}.map{|id, freqs| [id, ratio ? freqs[:observed_freq].fdiv(n_profiles) : freqs[:observed_freq]]}
1524
- freqs = freqs.to_h if !asArray
1525
- if translate
1526
- freqs = freqs.map do |term, freq|
1527
- tr = self.translate_id(term)
1528
- tr.nil? ? [term, freq] : [tr, freq]
1529
- end
1530
- end
1531
- if asArray
1532
- freqs = freqs.map{|term, freq| [term, freq]}
1533
- freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
1534
- else
1535
- freqs = freqs.to_h
1536
- end
1537
- end
1538
- return freqs
1539
- end
1540
-
1541
-
1542
- # Clean a given profile returning cleaned set of terms and removed ancestors term.
1543
- # ===== Parameters
1544
- # +prof+:: array of terms to be checked
1545
- # ===== Returns
1546
- # two arrays, first is the cleaned profile and second is the removed elements array
1547
- def remove_ancestors_from_profile(prof)
1548
- ancestors = prof.map{|term| self.get_ancestors(term)}.flatten.uniq
1549
- redundant = prof.select{|term| ancestors.include?(term)}
1550
- return prof - redundant, redundant
1551
- end
1552
-
1553
-
1554
- # Remove alternative IDs if official ID is present. DOES NOT REMOVE synonyms or alternative IDs of the same official ID
1555
- # ===== Parameters
1556
- # +prof+:: array of terms to be checked
1557
- # ===== Returns
1558
- # two arrays, first is the cleaned profile and second is the removed elements array
1559
- def remove_alternatives_from_profile(prof)
1560
- alternatives = prof.select{|term| @alternatives_index.include?(term)}
1561
- redundant = alternatives.select{|alt_id| prof.include?(@alternatives_index[alt_id])}
1562
- return prof - redundant, redundant
1563
- end
1564
-
1565
-
1566
- # Remove alternatives (if official term is present) and ancestors terms of a given profile
1567
- # ===== Parameters
1568
- # +profile+:: profile to be cleaned
1569
- # +remove_alternatives+:: if true, clenaed profiles will replace already stored profiles
1570
- # ===== Returns
1571
- # cleaned profile
1572
- def clean_profile(profile, remove_alternatives: true)
1573
- warn('Estructure is circular, behaviour could not be which is expected') if @structureType == :circular
1574
- terms_without_ancestors, _ = self.remove_ancestors_from_profile(profile)
1575
- if remove_alternatives
1576
- terms_without_ancestors_and_alternatices, _ = self.remove_alternatives_from_profile(terms_without_ancestors)
1577
- else
1578
- terms_without_ancestors_and_alternatices = terms_without_ancestors
1579
- end
1580
- return terms_without_ancestors_and_alternatices
1581
- end
1582
-
1583
- def clean_profile_hard(profile)
1584
- profile, _ = check_ids(profile)
1585
- profile = profile.select{|t| !is_obsolete?(t)}
1586
- profile = clean_profile(profile.uniq)
1587
- return profile
1588
- end
1589
-
1590
- # Remove terms from a given profile using hierarchical info and scores set given
1591
- # ===== Parameters
1592
- # +profile+:: profile to be cleaned
1593
- # +scores+:: hash with terms by keys and numerical values (scores)
1594
- # +byMax+:: if true, maximum scored term will be keeped, if false, minimum will be keeped
1595
- # +remove_without_score+:: if true, terms without score will be removed. Default: true
1596
- # ===== Returns
1597
- # cleaned profile
1598
- def clean_profile_by_score(profile, scores, byMax: true, remove_without_score: true)
1599
- scores = scores.sort_by{|term,score| score}.to_h
1600
- keep = profile.map do |term|
1601
- if scores.include?(term)
1602
- parentals = [self.get_ancestors(term), self.get_descendants(term)].flatten
1603
- targetable = parentals.select{|parent| profile.include?(parent)}
1604
- if targetable.empty?
1605
- term
1606
- else
1607
- targetable << term
1608
- targets = scores.select{|term,score| targetable.include?(term)}.to_h
1609
- byMax ? targets.keys.last : targets.keys.first
1610
- end
1611
- elsif remove_without_score
1612
- nil
1613
- else
1614
- term
752
+ def connect_familiars!(terms_levels)
753
+ levels = terms_levels.keys.sort
754
+ while levels.length > 1 # Process when current level has a parental level
755
+ level = levels.pop
756
+ parental_level = level - 1
757
+ parental_terms = terms_levels[parental_level]
758
+ if parental_terms.nil? # The queried parent level not exists but there is a parental level above of the non existant
759
+ parental_terms = [] # Initialize required parental level
760
+ terms_levels[parental_level] = parental_terms
761
+ levels << parental_level
762
+ end
763
+ terms_levels[level].each do |term|
764
+ path_info = @term_paths[term]
765
+ shortest_path_length = path_info[:shortest_path]
766
+ path = path_info[:paths].select{|p| p.length == shortest_path_length}.first
767
+ parental = path[1] # the first elements is the term itself
768
+ parental_terms << parental if !parental_terms.include?(parental)
1615
769
  end
1616
770
  end
1617
- return keep.compact.uniq
1618
- end
1619
-
1620
-
1621
- # Remove alternatives (if official term is present) and ancestors terms of stored profiles
1622
- # ===== Parameters
1623
- # +store+:: if true, clenaed profiles will replace already stored profiles
1624
- # +remove_alternatives+:: if true, clenaed profiles will replace already stored profiles
1625
- # ===== Returns
1626
- # a hash with cleaned profiles
1627
- def clean_profiles(store: false, remove_alternatives: true)
1628
- cleaned_profiles = {}
1629
- @profiles.each{ |id, terms| cleaned_profiles[id] = self.clean_profile(terms, remove_alternatives: remove_alternatives)}
1630
- @profiles = cleaned_profiles if store
1631
- return cleaned_profiles
1632
771
  end
1633
772
 
1634
-
1635
- # Calculates number of ancestors present (redundant) in each profile stored
1636
- # ===== Returns
1637
- # array of parentals for each profile
1638
- def parentals_per_profile
1639
- cleaned_profiles = self.clean_profiles(remove_alternatives: false)
1640
- parentals = @profiles.map{ |id, terms| terms.length - cleaned_profiles[id].length}
1641
- return parentals
1642
- end
1643
-
1644
-
1645
- # Calculates mean IC of a given profile
1646
- # ===== Parameters
1647
- # +prof+:: profile to be checked
1648
- # +ic_type+:: ic_type to be used
1649
- # +zhou_k+:: special coeficient for Zhou IC method
1650
- # ===== Returns
1651
- # mean IC for a given profile
1652
- def get_profile_mean_IC(prof, ic_type: :resnik, zhou_k: 0.5)
1653
- return prof.map{|term| self.get_IC(term, type: ic_type, zhou_k: zhou_k)}.inject(0){|sum,x| sum + x}.fdiv(prof.length)
1654
- end
1655
-
1656
-
1657
- # Calculates resnik ontology, and resnik observed mean ICs for all profiles stored
1658
- # ===== Returns
1659
- # two hashes with Profiles and IC calculated for resnik and observed resnik respectively
1660
- def get_profiles_resnik_dual_ICs
1661
- struct_ics = {}
1662
- observ_ics = {}
1663
- @profiles.each do |id, terms|
1664
- struct_ics[id] = self.get_profile_mean_IC(terms, ic_type: :resnik)
1665
- observ_ics[id] = self.get_profile_mean_IC(terms, ic_type: :resnik_observed)
1666
- end
1667
- return struct_ics.clone, observ_ics.clone
1668
- end
1669
-
1670
-
1671
- # Calculates ontology structural levels for all ontology terms
1672
- # ===== Parameters
1673
- # +calc_paths+:: calculates term paths if it's not already calculated
1674
- # +shortest_path+:: if true, level is calculated with shortest path, largest path will be used in other cases
1675
- def calc_term_levels(calc_paths: false, shortest_path: true)
1676
- if @term_paths.empty?
1677
- if calc_paths
1678
- self.calc_term_paths
1679
- else
1680
- warn('Term paths are not already loaded. Aborting dictionary calc')
1681
- end
1682
- end
1683
- if !@term_paths.empty?
1684
- byTerm = {}
1685
- byValue = {}
1686
- # Calc per term
1687
- @term_paths.each do |term, info|
1688
- level = shortest_path ? info[:shortest_path] : info[:largest_path]
1689
- if level.nil?
1690
- level = -1
1691
- else
1692
- level = level.round(0)
1693
- end
1694
- byTerm[term] = level
1695
- queryLevels = byValue[level]
1696
- if queryLevels.nil?
1697
- byValue[level] = [term]
773
+ def get_item_list_parental(terms_levels)
774
+ transfered_list = {}
775
+ parent_dict = @dicts[:is_a][:byTerm]
776
+ levels = terms_levels.keys.sort
777
+ while levels.length > 1
778
+ level = levels.pop
779
+ terms_levels[level].each do |term|
780
+ parents = parent_dict[term]
781
+ if parents.nil?
782
+ next
783
+ elsif parents.length == 1
784
+ parent = parents.first
1698
785
  else
1699
- byValue[level] << term
786
+ parent = (parents | terms_levels[level - 1]).first
1700
787
  end
788
+ term_it = @items[term]
789
+ parent_it = @items[parent]
790
+ curr_it = transfered_list[term]
791
+ parent_all_items = merge_groups([term_it, parent_it, curr_it])
792
+ transfered_list[parent] = parent_all_items if !parent_all_items.empty?
793
+ term_all_items = merge_groups([term_it, curr_it])
794
+ transfered_list[term] = term_all_items if !term_all_items.empty?
1701
795
  end
1702
- @dicts[:level] = {byTerm: byValue, byValue: byTerm} # Note: in this case, value has multiplicity and term is unique value
1703
- # Update maximum depth
1704
- @max_freqs[:max_depth] = byValue.keys.max
1705
796
  end
797
+ terms_levels[levels.first].each do |term| # Rescue lower level terms that not have children so they cannot receive items
798
+ transfered_list[term] = @items[term] if transfered_list[term].nil?
799
+ end
800
+ return transfered_list
1706
801
  end
1707
802
 
1708
-
1709
- # Check if a term given is marked as obsolete
1710
- def is_obsolete? term
1711
- return @obsoletes_index.include?(term)
803
+ def merge_groups(groups)
804
+ return groups.compact.inject([ ]){|it, a| it | a}
1712
805
  end
1713
806
 
1714
- # Check if a term given is marked as alternative
1715
- def is_alternative? term
1716
- return @alternatives_index.include?(term)
807
+ def compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list)
808
+ results = []
809
+ penalized_terms = {}
810
+ levels = terms_levels.keys.sort
811
+ levels.reverse_each do |level|
812
+ terms_levels[level].each do |term|
813
+ associated_items = item_list[term]
814
+ items_to_remove = penalized_terms[term]
815
+ items_to_remove = [] if items_to_remove.nil?
816
+ pval = get_fisher_exact_test(
817
+ external_item_list - items_to_remove,
818
+ associated_items - items_to_remove,
819
+ #((associated_items | external_item_list) - items_to_remove).length
820
+ total_items
821
+ )
822
+ if pval <= thresold
823
+ parents = get_ancestors(term) # Save the items for each parent term to remove them later in the fisher test
824
+ parents.each do |prnt|
825
+ query = penalized_terms[prnt]
826
+ if query.nil?
827
+ penalized_terms[prnt] = item_list[term].clone # We need a new array to store the following iterations
828
+ else
829
+ query.concat(item_list[term])
830
+ end
831
+ end
832
+ end
833
+ results << [term, pval]
834
+ end
835
+ end
836
+ return results
1717
837
  end
1718
838
 
1719
- # Find paths of a term following it ancestors and stores all possible paths for it and it's parentals.
1720
- # Also calculates paths metadata and stores into @term_paths
1721
- def calc_term_paths(only_main_terms=false)
1722
- self.calc_ancestors_dictionary if @dicts[:is_a].nil? # Calculate direct parentals dictionary if it's not already calculated
1723
- visited_terms = {} # PEDRO: To keep track of visited data, hash accesions are fast than array includes. I don't understant why use this variable instead of check @term_paths to see if the data is calculated
1724
- @term_paths = {}
1725
- if [:hierarchical, :sparse].include? @structureType
1726
- @stanzas[:terms].each do |term, t_attributes|
1727
- if !only_main_terms && (self.is_obsolete?(term) || self.is_alternative?(term)) # Special case (obsoletes)
1728
- special_term = term
1729
- term = self.is_obsolete?(term) ? @obsoletes_index[term] : @alternatives_index[term]
1730
- @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
1731
- @term_paths[special_term] = @term_paths[term]
1732
- visited_terms[special_term] = true
1733
- end
1734
- if !visited_terms.include?(term)
1735
- # PEDRO: This code is very similar to expand_path method, but cannot be replaced by it (test fail). We must work to use this method here
1736
- path_attr = @term_paths[term]
1737
- if path_attr.nil?
1738
- path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} # create new path data
1739
- @term_paths[term] = path_attr #save path data container
1740
- end
1741
- parentals = @dicts[:is_a][:byTerm][term]
1742
- if parentals.nil?
1743
- path_attr[:paths] << [term]
1744
- else
1745
- parentals.each do |direct_parental|
1746
- self.expand_path(direct_parental)
1747
- new_paths = @term_paths[direct_parental][:paths]
1748
- path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(term)})
1749
- end
1750
- end
1751
- anc = @ancestors_index[term].each{|anc| visited_terms[anc] = true} if @ancestors_index.include?(term)
1752
- visited_terms[term] = true
839
+ def compute_relations_weight(terms_levels, external_item_list, total_items, item_list)
840
+ pvals = {}
841
+ item_weigths_per_term = Hash.new { |hash, key| Hash.new(1) } #https://mensfeld.pl/2016/09/ruby-hash-default-value-be-cautious-when-you-use-it/
842
+ levels = terms_levels.keys.sort
843
+ levels.reverse_each do |level|
844
+ terms_levels[level].each do |term|
845
+ associated_items = item_list[term]
846
+ #initialize observed items in item_weigths_per_term list
847
+ add_items_to_weigthed_list(term, associated_items, item_weigths_per_term) if !associated_items.nil?
848
+ children = @dicts[:is_a][:byValue][term]
849
+ if children.nil?
850
+ children = []
851
+ else
852
+ children = children.select{|ch| item_weigths_per_term[ch].length > 0} # Only use children with items associated to them OR transfered to them
1753
853
  end
1754
- # Update metadata
1755
- path_attr = @term_paths[term]
1756
- path_attr[:total_paths] = path_attr[:paths].length
1757
- paths_sizes = path_attr[:paths].map{|path| path.length}
1758
- path_attr[:largest_path] = paths_sizes.max
1759
- path_attr[:shortest_path] = paths_sizes.min
854
+ computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
1760
855
  end
1761
- else
1762
- warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
1763
856
  end
857
+ return pvals.to_a
1764
858
  end
1765
859
 
860
+ def add_items_to_weigthed_list(term, associated_items, weigthed_list)
861
+ term_weigthing = weigthed_list[term]
862
+ associated_items.each{|ai| term_weigthing[ai] = 1}
863
+ weigthed_list[term] = term_weigthing
864
+ end
1766
865
 
1767
- # Recursive function whic finds paths of a term following it ancestors and stores all possible paths for it and it's parentals
1768
- # ===== Parameters
1769
- # +curr_term+:: current visited term
1770
- # +visited_terms+:: already expanded terms
1771
- def expand_path(curr_term)
1772
- if !@term_paths.include?(curr_term)
1773
- path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []}
1774
- @term_paths[curr_term] = path_attr
1775
- direct_parentals = @dicts[:is_a][:byTerm][curr_term]
1776
- if direct_parentals.nil? # No parents :: End of recurrence
1777
- path_attr[:paths] << [curr_term]
1778
- else # Expand and concat
1779
- direct_parentals.each do |ancestor|
1780
- path_attr_parental = @term_paths[ancestor]
1781
- if path_attr_parental.nil? # Calculate new paths
1782
- self.expand_path(ancestor)
1783
- new_paths = @term_paths[ancestor][:paths]
1784
- else # Use direct_parental paths already calculated
1785
- new_paths = path_attr_parental[:paths]
866
+ def computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
867
+ #puts term.to_s.red
868
+ #puts @term_paths[term].inspect
869
+ #puts @dicts[:is_a][:byValue][term].inspect.light_blue
870
+ associated_items = item_weigths_per_term[term].keys
871
+ pval = get_fisher_exact_test(external_item_list, associated_items, total_items,
872
+ 'two_sided', item_weigths_per_term[term], true)
873
+ pvals[term] = pval
874
+ if children.length > 0
875
+ rates = {}
876
+ sig_child = 0
877
+ children.each do |child|
878
+ ratio = sigRatio(pvals[child], pval)
879
+ rates[child] = ratio
880
+ sig_child += 1 if ratio >= 1
881
+ end
882
+ if sig_child == 0 # CASE 1
883
+ children.each do |child|
884
+ current_ratio = rates[child]
885
+ query_child = item_weigths_per_term[child]
886
+ query_child.transform_values!{|weight| weight * current_ratio}
887
+ pvals[child] = get_fisher_exact_test(external_item_list, item_weigths_per_term[child].keys, total_items,
888
+ 'two_sided', item_weigths_per_term[child], true)
889
+ end
890
+ else
891
+ ancs = get_ancestors(term)
892
+ ancs << term
893
+ rates.each do |ch, ratio|# CASE 2
894
+ if ratio >= 1 # The child is better than parent
895
+ ancs.each do |anc|
896
+ query_anc = item_weigths_per_term[anc]
897
+ associated_items.each do |item|
898
+ query_anc[item] /= ratio # /= --> query_anc[item]/ratio
899
+ end
900
+ end
1786
901
  end
1787
- path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(curr_term)})
1788
902
  end
903
+ computeTermSig(term, children - rates.keys, external_item_list, total_items, pvals, item_weigths_per_term)
1789
904
  end
1790
905
  end
1791
906
  end
1792
907
 
1793
-
1794
- # Gets ontology levels calculated
1795
- # ===== Returns
1796
- # ontology levels calculated
1797
- def get_ontology_levels
1798
- return @dicts[:level][:byTerm].clone # By term, in this case, is Key::Level, Value::Terms
908
+ def sigRatio(pvalA, pvalB)
909
+ return Math.log(pvalA)/Math.log(pvalB)
1799
910
  end
1800
911
 
912
+ # END of methods involved with compute_relations_to_items
913
+ #-----------------------------------------------------------------------------------
914
+
915
+ #############################################
916
+ # PROFILE EXTERNAL METHODS
917
+ #############################################
1801
918
 
1802
- # Gets ontology level of a specific term
1803
- # ===== Returns
1804
- # Term level
1805
- def get_term_level(term)
1806
- return @dicts[:level][:byValue][term]
919
+ # I/O profile
920
+ ####################################
921
+
922
+ # Increase the arbitrary frequency of a given term set
923
+ # ===== Parameters
924
+ # +terms+:: set of terms to be updated
925
+ # +increase+:: amount to be increased
926
+ # +transform_to_sym+:: if true, transform observed terms to symbols. Default: false
927
+ # ===== Return
928
+ # true if process ends without errors and false in other cases
929
+ def add_observed_terms(terms:, increase: 1.0, transform_to_sym: false, expand2parentals: true)
930
+ terms = terms.map{|term| [term] + get_ancestors(term.to_sym)}.flatten if expand2parentals
931
+ return terms.map{|id| self.add_observed_term(
932
+ term: transform_to_sym ? id.to_sym : id,
933
+ increase: increase)} # FRED: It is necessary the return?
1807
934
  end
1808
935
 
1809
- # nil, term not found, [] term exists but not has parents
1810
- def get_parental_path(term, which_path = :shortest_path, level = 0)
1811
- path = nil
1812
- path_attr = @term_paths[term]
1813
- if !path_attr.nil?
1814
- path_length = path_attr[which_path]
1815
- all_paths = path_attr[:paths]
1816
- if all_paths.empty?
1817
- path = []
1818
- else
1819
- path = all_paths.select{|pt| pt.length == path_length}.first.clone
1820
- if level > 0 # we want the term and his ascendants until a specific level
1821
- n_parents = path_length - level
1822
- path = path[0..n_parents]
1823
- end
1824
- path.shift # Discard the term itself
1825
- end
936
+ # Modifying Profile
937
+ ####################################
938
+
939
+ def expand_profile_with_parents(profile)
940
+ new_terms = []
941
+ profile.each do |term|
942
+ new_terms = new_terms | get_ancestors(term)
1826
943
  end
1827
- return path
944
+ return new_terms | profile
945
+ end
946
+
947
+ # Clean a given profile returning cleaned set of terms and removed ancestors term.
948
+ # ===== Parameters
949
+ # +prof+:: array of terms to be checked
950
+ # ===== Returns
951
+ # two arrays, first is the cleaned profile and second is the removed elements array
952
+ def remove_ancestors_from_profile(prof)
953
+ ancestors = prof.map{|term| self.get_ancestors(term)}.flatten.uniq
954
+ redundant = prof & ancestors
955
+ return prof - redundant, redundant
1828
956
  end
1829
957
 
1830
- # Return ontology levels from profile terms
958
+ # Remove alternative IDs if official ID is present. DOES NOT REMOVE synonyms or alternative IDs of the same official ID
959
+ # ===== Parameters
960
+ # +prof+:: array of terms to be checked
1831
961
  # ===== Returns
1832
- # hash of term levels (Key: level; Value: array of term IDs)
1833
- def get_ontology_levels_from_profiles(uniq = true) # TODO: remove uniq and check dependencies
1834
- profiles_terms = @profiles.values.flatten
1835
- profiles_terms.uniq! if uniq
1836
- term_freqs_byProfile = {}
1837
- profiles_terms.each do |term|
1838
- query = term_freqs_byProfile[term]
1839
- if query.nil?
1840
- term_freqs_byProfile[term] = 1
1841
- else
1842
- term_freqs_byProfile[term] += 1
1843
- end
1844
- end
1845
- levels_filtered = @dicts[:level][:byTerm].map{|level, terms| [level,terms.map{|t| profiles_terms.include?(t) ? Array.new(term_freqs_byProfile[t], t) : nil}.flatten.compact]}.select{|level, filteredTerms| !filteredTerms.empty?}.to_h
1846
- return levels_filtered
962
+ # two arrays, first is the cleaned profile and second is the removed elements array
963
+ def remove_alternatives_from_profile(prof)
964
+ alternatives = prof.select{|term| @alternatives_index.include?(term)}
965
+ redundant = alternatives.select{|alt_id| prof.include?(@alternatives_index[alt_id])}
966
+ return prof - redundant, redundant
1847
967
  end
1848
968
 
1849
- def get_profile_ontology_distribution_tables
1850
- cohort_ontology_levels = get_ontology_levels_from_profiles(uniq=false)
1851
- uniq_cohort_ontology_levels = get_ontology_levels_from_profiles
1852
- hpo_ontology_levels = get_ontology_levels
1853
- total_ontology_terms = hpo_ontology_levels.values.flatten.length
1854
- total_cohort_terms = cohort_ontology_levels.values.flatten.length
1855
- total_uniq_cohort_terms = uniq_cohort_ontology_levels.values.flatten.length
1856
-
1857
- ontology_levels = []
1858
- distribution_percentage = []
1859
- hpo_ontology_levels.each do |level, terms|
1860
- cohort_terms = cohort_ontology_levels[level]
1861
- uniq_cohort_terms = uniq_cohort_ontology_levels[level]
1862
- if cohort_terms.nil? || uniq_cohort_terms.nil?
1863
- num = 0
1864
- u_num = 0
1865
- else
1866
- num = cohort_terms.length
1867
- u_num = uniq_cohort_terms.length
1868
- end
1869
- ontology_levels << [level, terms.length, num]
1870
- distribution_percentage << [
1871
- level,
1872
- (terms.length.fdiv(total_ontology_terms)*100).round(3),
1873
- (num.fdiv(total_cohort_terms)*100).round(3),
1874
- (u_num.fdiv(total_uniq_cohort_terms)*100).round(3)
1875
- ]
1876
- end
1877
- ontology_levels.sort! { |x,y| x.first <=> y.first }
1878
- distribution_percentage.sort! { |x,y| x.first <=> y.first }
1879
- return ontology_levels, distribution_percentage
969
+ # Remove alternatives (if official term is present) and ancestors terms of a given profile
970
+ # ===== Parameters
971
+ # +profile+:: profile to be cleaned
972
+ # +remove_alternatives+:: if true, clenaed profiles will replace already stored profiles
973
+ # ===== Returns
974
+ # cleaned profile
975
+ def clean_profile(profile, remove_alternatives: true)
976
+ warn('Estructure is circular, behaviour could not be which is expected') if @structureType == :circular
977
+ terms_without_ancestors, _ = remove_ancestors_from_profile(profile)
978
+ terms_without_ancestors, _ = remove_alternatives_from_profile(terms_without_ancestors) if remove_alternatives
979
+ return terms_without_ancestors
1880
980
  end
1881
981
 
1882
- def get_dataset_specifity_index(mode)
1883
- ontology_levels, distribution_percentage = get_profile_ontology_distribution_tables
1884
- if mode == 'uniq'
1885
- observed_distribution = 3
1886
- elsif mode == 'weigthed'
1887
- observed_distribution = 2
1888
- end
1889
- max_terms = distribution_percentage.map{|row| row[1]}.max
1890
- maxL = nil
1891
- distribution_percentage.each do |level_info|
1892
- maxL = level_info.first if level_info[1] == max_terms
1893
- end
1894
- diffL = distribution_percentage.map{|l| [l[0], l[observed_distribution] - l[1]]}
1895
- diffL.select!{|dL| dL.last > 0}
1896
- lowSection = diffL.select{|dL| dL.first <= maxL}
1897
- highSection = diffL.select{|dL| dL.first > maxL}
1898
- dsi = nil
1899
- if highSection.empty?
1900
- dsi = 0
1901
- else
1902
- accumulated_weigth = 0
1903
- accumulated_weigthed_diffL = 0
1904
- hss = get_weigthed_level_contribution(highSection, maxL, ontology_levels.length - maxL)
1905
- lss = get_weigthed_level_contribution(lowSection, maxL, maxL)
1906
- dsi = hss.fdiv(lss)
1907
- end
1908
- return dsi
982
+ def clean_profile_hard(profile, options = {})
983
+ profile, _ = check_ids(profile)
984
+ profile = profile.select{|t| !is_obsolete?(t)}
985
+ if !options[:term_filter].nil?
986
+ profile.select! {|term| get_ancestors(term).include?(options[:term_filter])}
987
+ end
988
+ profile = clean_profile(profile.uniq)
989
+ return profile
1909
990
  end
1910
991
 
1911
- def get_weigthed_level_contribution(section, maxL, nLevels)
1912
- accumulated_weigthed_diffL = 0
1913
- section.each do |level, diff|
1914
- weightL = maxL - level
1915
- if weightL >= 0
1916
- weightL += 1
992
+ # Remove terms from a given profile using hierarchical info and scores set given
993
+ # ===== Parameters
994
+ # +profile+:: profile to be cleaned
995
+ # +scores+:: hash with terms by keys and numerical values (scores)
996
+ # +byMax+:: if true, maximum scored term will be keeped, if false, minimum will be keeped
997
+ # +remove_without_score+:: if true, terms without score will be removed. Default: true
998
+ # ===== Returns
999
+ # cleaned profile
1000
+ def clean_profile_by_score(profile, scores, byMax: true, remove_without_score: true)
1001
+ scores = scores.sort_by{|term,score| score}.to_h
1002
+ keep = profile.map do |term|
1003
+ if scores.include?(term)
1004
+ parentals = [self.get_ancestors(term), self.get_descendants(term)].flatten
1005
+ targetable = parentals.select{|parent| profile.include?(parent)}
1006
+ if targetable.empty?
1007
+ term
1008
+ else
1009
+ targetable << term
1010
+ targets = scores.select{|term,score| targetable.include?(term)}.to_h
1011
+ byMax ? targets.keys.last : targets.keys.first
1012
+ end
1013
+ elsif remove_without_score
1014
+ nil
1917
1015
  else
1918
- weightL = weightL.abs
1016
+ term
1919
1017
  end
1920
- accumulated_weigthed_diffL += diff * weightL
1921
1018
  end
1922
- weigthed_contribution = accumulated_weigthed_diffL.fdiv(nLevels)
1923
- return weigthed_contribution
1019
+ return keep.compact.uniq
1924
1020
  end
1925
1021
 
1022
+ # ID Handlers
1023
+ ####################################
1926
1024
 
1927
- # Calculate profiles dictionary with Key= Term; Value = Profiles
1928
- def calc_profiles_dictionary
1929
- if @profiles.empty?
1930
- warn('Profiles are not already loaded. Aborting dictionary calc')
1931
- else
1932
- byTerm = {} # Key: Terms
1933
- # byValue -- Key: Profile == @profiles
1934
- @profiles.each do |id, terms|
1935
- terms.each do |term|
1936
- if byTerm.include?(term)
1937
- byTerm[term] << id
1938
- else
1939
- byTerm[term] = [id]
1940
- end
1025
+ # Check a set of IDs and return allowed IDs removing which are not official terms on this ontology
1026
+ # ===== Parameters
1027
+ # +ids+:: to be checked
1028
+ # ===== Return
1029
+ # two arrays whit allowed and rejected IDs respectively
1030
+ def check_ids(ids, substitute: true)
1031
+ checked_codes = []
1032
+ rejected_codes = []
1033
+ ids.each do |id|
1034
+ new_id = get_main_id(id)
1035
+ if new_id.nil?
1036
+ rejected_codes << id
1037
+ else
1038
+ if substitute
1039
+ checked_codes << new_id
1040
+ else
1041
+ checked_codes << id
1941
1042
  end
1942
1043
  end
1943
- @profilesDict = byTerm
1944
1044
  end
1045
+ return checked_codes, rejected_codes
1945
1046
  end
1946
1047
 
1947
1048
 
1948
- # Gets profiles dictionary calculated
1049
+ # Translates several IDs and returns translations and not allowed IDs list
1050
+ # ===== Parameters
1051
+ # +ids+:: to be translated
1949
1052
  # ===== Return
1950
- # profiles dictionary (clone)
1951
- def get_terms_linked_profiles
1952
- return @profilesDict.clone
1953
- end
1954
-
1053
+ # two arrays with translations and ids which couldn't be translated respectively
1054
+ def translate_ids(ids)
1055
+ translated = []
1056
+ rejected = []
1057
+ ids.each do |term_id|
1058
+ tr = self.translate_id(term_id.to_sym)
1059
+ if !tr.nil?
1060
+ translated << tr # FRED: Why have this a different behaviour from ...->
1061
+ else
1062
+ rejected << tr
1063
+ end
1064
+ end
1065
+ return translated, rejected
1066
+ end
1955
1067
 
1956
- # Get related profiles to a given term
1068
+ # Translate several names and return translations and a list of names which couldn't be translated
1957
1069
  # ===== Parameters
1958
- # +term+:: to be checked
1959
- # ===== Returns
1960
- # profiles which contains given term
1961
- def get_term_linked_profiles(term)
1962
- return @profilesDict[term]
1070
+ # +names+:: array to be translated
1071
+ # ===== Return
1072
+ # two arrays with translations and names which couldn't be translated respectively
1073
+ def translate_names(names)
1074
+ translated = []
1075
+ rejected = []
1076
+ names.each do |name|
1077
+ tr = self.translate_name(name)
1078
+ if tr.nil?
1079
+ rejected << name # FRED: <-... this?
1080
+ else
1081
+ translated << tr
1082
+ end
1083
+ end
1084
+ return translated, rejected
1963
1085
  end
1964
1086
 
1087
+ # Description of profile's terms
1088
+ ####################################
1965
1089
 
1966
1090
  # Gets metainfo table from a set of terms
1967
1091
  # ===== Parameters
1968
1092
  # +terms+:: IDs to be expanded
1969
- # +filter_alternatives+:: flag to be used in get_descendants method
1970
1093
  # ===== Returns
1971
1094
  # an array with triplets [TermID, TermName, DescendantsNames]
1972
- def get_childs_table(terms, filter_alternatives = false)
1973
- expanded_terms = []
1974
- terms.each do |t|
1975
- expanded_terms << [[t, self.translate_id(t)], self.get_descendants(t, filter_alternatives).map{|child| [child, self.translate_id(child)]}]
1095
+ def get_childs_table(profile)
1096
+ expanded_profile = []
1097
+ profile.each do |t|
1098
+ expanded_profile << [[t, translate_id(t)], get_descendants(t).map{|child| [child, translate_id(child)]}]
1976
1099
  end
1977
- return expanded_terms
1100
+ return expanded_profile
1978
1101
  end
1979
1102
 
1980
-
1981
- # Store specific relations hash given into ITEMS structure
1982
- # ===== Parameters
1983
- # +relations+:: hash to be stored
1984
- # +remove_old_relations+:: substitute ITEMS structure instead of merge new relations
1985
- # +expand+:: if true, already stored keys will be updated with the unique union of both sets
1986
- def load_item_relations_to_terms(relations, remove_old_relations = false, expand = false)
1987
- @items = {} if remove_old_relations
1988
- if !relations.select{|term, items| !@stanzas[:terms].include?(term)}.empty?
1989
- warn('Some terms specified are not stored into this ontology. These not correct terms will be stored too')
1103
+ def get_terms_levels(profile)
1104
+ termsAndLevels = []
1105
+ profile.each do |term|
1106
+ termsAndLevels << [term, get_term_level(term)]
1990
1107
  end
1991
- if !remove_old_relations
1992
- if !relations.select{|term, items| @items.include?(term)}.empty? && !expand
1993
- warn('Some terms given are already stored. Stored version will be replaced')
1994
- end
1995
- end
1996
- if expand
1997
- @items = self.concatItems(@items,relations)
1998
- # relations.each do |k,v| # MUST UPDATE THIS USING A CONCAT SPECIFIC FUNCTION
1999
- # if @items.keys.include?(k)
2000
- # if v.kind_of?(Array)
2001
- # @items[k] = (@items[k] + v).uniq
2002
- # elsif v.kind_of?(Hash)
2003
- # @items.merge!(relations) do |k, oldV, newV|
2004
- # if oldV.kind_of?(Array)
2005
- # return (oldV + newV).uniq
2006
- # else
2007
- # oldV = [oldV,newV]
2008
- # end
2009
- # end
2010
- # elsif @items[k].kind_of?(Array) # We suppose a single value/object from here
2011
- # @items[k] = (@items[k] + [v]).uniq
2012
- # else
2013
- # @items[k] = [@items[k],v]
2014
- # end
2015
- # else
2016
- # @items[k] = v
2017
- # end
2018
- # end
2019
- else
2020
- @items.merge!(relations)
2021
- end
2022
- end
1108
+ return termsAndLevels
1109
+ end
2023
1110
 
2024
- # Internal function to concat two elements.
1111
+ # IC data
1112
+ ####################################
1113
+
1114
+ # Get information coefficient from profiles #
1115
+
1116
+ # Calculates mean IC of a given profile
2025
1117
  # ===== Parameters
2026
- # +itemA+:: item to be concatenated
2027
- # +itemB+:: item to be concatenated
2028
- # ===== Returns
2029
- # Concatenated objects
2030
- def concatItems(itemA,itemB)
2031
- # A is Array :: RETURN ARRAY
2032
- # A_array : B_array
2033
- # A_array : B_hash => NOT ALLOWED
2034
- # A_array : B_single => NOT ALLOWED
2035
- # A is Hash :: RETURN HASH
2036
- # A_hash : B_array => NOT ALLOWED
2037
- # A_hash : B_hash
2038
- # A_hash : B_single => NOT ALLOWED
2039
- # A is single element => RETURN ARRAY
2040
- # A_single : B_array
2041
- # A_single : B_hash => NOT ALLOWED
2042
- # A_single : B_single
2043
- concatenated = nil
2044
- if itemA.kind_of?(Array) && itemB.kind_of?(Array)
2045
- concatenated = (itemA + itemB).uniq
2046
- elsif itemA.kind_of?(Hash) && itemB.kind_of?(Hash)
2047
- concatenated = itemA.merge(itemB) do |k, oldV, newV|
2048
- self.concatItems(oldV,newV)
2049
- end
2050
- elsif itemB.kind_of?(Array)
2051
- concatenated = ([itemA] + itemB).uniq
2052
- elsif ![Array, Hash].include?(itemB.class)
2053
- concatenated = [itemA,itemB].uniq
2054
- end
2055
- return concatenated
2056
- end
1118
+ # +prof+:: profile to be checked
1119
+ # +ic_type+:: ic_type to be used
1120
+ # +zhou_k+:: special coeficient for Zhou IC method
1121
+ # ===== Returns
1122
+ # mean IC for a given profile
1123
+ def get_profile_mean_IC(prof, ic_type: :resnik, zhou_k: 0.5)
1124
+ return prof.map{|term| self.get_IC(term, type: ic_type, zhou_k: zhou_k)}.sum.fdiv(prof.length)
1125
+ end
2057
1126
 
1127
+ # Term ref vs profile #
2058
1128
 
2059
- # Assign a dictionary already calculated as a items set.
1129
+ def get_maxmica_term2profile(ref_term, profile)
1130
+ micas = profile.map{|term| get_MICA(ref_term, term)}
1131
+ maxmica = micas.first
1132
+ micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
1133
+ return maxmica
1134
+ end
1135
+
1136
+ # Profile vs Profile #
1137
+
1138
+ # Get semantic similarity from two term sets
2060
1139
  # ===== Parameters
2061
- # +dictID+:: dictionary ID to be stored (:byTerm will be used)
2062
- def set_items_from_dict(dictID, remove_old_relations = false)
2063
- @items = {} if remove_old_relations
2064
- if !@dicts[dictID].nil?
2065
- @items.merge(@dicts[dictID][:byTerm])
2066
- else
2067
- warn('Specified ID is not calculated. Dict will not be added as a items set')
1140
+ # +termsA+:: set to be compared
1141
+ # +termsB+:: set to be compared
1142
+ # +sim_type+:: similitude method to be used. Default: resnik
1143
+ # +ic_type+:: ic type to be used. Default: resnik
1144
+ # +bidirectional+:: calculate bidirectional similitude. Default: false
1145
+ # ===== Return
1146
+ # similitude calculated
1147
+ def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true, store_mica: false)
1148
+ # Check
1149
+ raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
1150
+ raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
1151
+ micasA = []
1152
+ # Compare A -> B
1153
+ termsA.each do |tA|
1154
+ micas = []
1155
+ termsB.each do |tB|
1156
+ if store_mica
1157
+ value = @mica_index[tA][tB]
1158
+ else
1159
+ value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)
1160
+ end
1161
+ micas << value if value.class == Float
1162
+ end
1163
+ !micas.empty? ? micasA << micas.max : micasA << 0
1164
+ end
1165
+ means_sim = micasA.sum.fdiv(micasA.size)
1166
+ # Compare B -> A
1167
+ if bidirectional
1168
+ means_simA = means_sim * micasA.size
1169
+ means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false, store_mica: store_mica) * termsB.size
1170
+ means_sim = (means_simA + means_simB).fdiv(termsA.size + termsB.size)
2068
1171
  end
1172
+ # Return
1173
+ return means_sim
2069
1174
  end
2070
1175
 
2071
1176
 
2072
- # This method computes childs similarity and impute items to it parentals. To do that Item keys must be this ontology allowed terms.
2073
- # Similarity will be calculated by text extact similarity unless an ontology object will be provided. In this case, MICAs will be used
1177
+ #############################################
1178
+ # PROFILE INTERNAL METHODS
1179
+ #############################################
1180
+
1181
+ # I/O profiles
1182
+ ####################################
1183
+
1184
+ # Method used to store a pool of profiles
2074
1185
  # ===== Parameters
2075
- # +ontology+:: (Optional) ontology object which items given belongs
2076
- # +minimum_childs+:: minimum of childs needed to infer relations to parental. Default: 2
2077
- # +clean_profiles+:: if true, clena_profiles ontology method will be used over inferred profiles. Only if an ontology object is provided
2078
- # ===== Returns
2079
- # void and update items object
2080
- def expand_items_to_parentals(ontology: nil, minimum_childs: 2, clean_profiles: true)
2081
- # Check item keys
2082
- if @items.empty?
2083
- warn('Items have been not provided yet')
2084
- return nil
2085
- end
2086
- targetKeys = @items.keys.select{|k| self.exists?(k)}
2087
- if targetKeys.length == 0
2088
- warn('Any item key is allowed')
2089
- return nil
2090
- elsif targetKeys.length < @items.keys.length
2091
- warn('Some item keys are not allowed')
2092
- end
2093
-
2094
- # Expand to parentals
2095
- targetKeys << targetKeys.map{|t| self.get_ancestors(t, true)}
2096
- targetKeys.flatten!
2097
- targetKeys.uniq!
2098
-
2099
- # Obtain levels (go from leaves to roots)
2100
- levels = targetKeys.map{|term| self.get_term_level(term)}
2101
- levels.compact!
2102
- levels.uniq!
2103
- levels.sort!
2104
- levels.reverse!
2105
- levels.shift # Leaves are not expandable
2106
-
2107
- # Expand from leaves to roots
2108
- levels.map do |lvl|
2109
- curr_keys = targetKeys.select{|k| self.get_term_level(k) == lvl}
2110
- curr_keys.map do |term_expand|
2111
- to_infer = []
2112
- # Obtain childs
2113
- childs = self.get_descendants(term_expand,true).select{|t| !@items[t].nil?}
2114
- # Expand
2115
- if childs.length > 0 && minimum_childs == 1 # Special case
2116
- to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
2117
- elsif childs.length >= minimum_childs
2118
- to_infer = Hash.new(0)
2119
- # Compare
2120
- while childs.length > 1
2121
- curr_term = childs.shift
2122
- childs.each do |compare_term|
2123
- pivot_items = @items[curr_term]
2124
- compare_items = @items[compare_term]
2125
- if ontology.nil? # Exact match
2126
- pivot_items.map do |pitem|
2127
- if compare_items.include?(pitem)
2128
- to_infer[pitem] += 2
2129
- end
2130
- end
2131
- else # Find MICAs
2132
- local_infer = Hash.new(0)
2133
- pivot_items.map do |pitem|
2134
- micas = compare_items.map{|citem| ontology.get_MICA(pitem, citem)}
2135
- maxmica = micas[0]
2136
- micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
2137
- local_infer[maxmica.first] += 1
2138
- end
2139
- compare_items.map do |citem|
2140
- micas = pivot_items.map{|pitem| ontology.get_MICA(pitem, citem)}
2141
- maxmica = micas[0]
2142
- micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
2143
- local_infer[maxmica.first] += 1
2144
- end
2145
- local_infer.each{|t,freq| to_infer[t] += freq if freq >= 2}
2146
- end
2147
- end
2148
- end
2149
- # Filter infer
2150
- to_infer = to_infer.select{|k,v| v >= minimum_childs}
2151
- end
2152
- # Infer
2153
- if to_infer.length > 0
2154
- @items[term_expand] = [] if @items[term_expand].nil?
2155
- if to_infer.kind_of?(Array)
2156
- @items[term_expand] = (@items[term_expand] + to_infer).uniq
2157
- else
2158
- @items[term_expand] = (@items[term_expand] + to_infer.keys).uniq
2159
- end
2160
- @items[term_expand] = ontology.clean_profile(@items[term_expand]) if clean_profiles && !ontology.nil?
2161
- elsif !@items.include?(term_expand)
2162
- targetKeys.delete(term_expand)
2163
- end
1186
+ # +profiles+:: array/hash of profiles to be stored. If it's an array, numerical IDs will be assigned starting at 1
1187
+ # +calc_metadata+:: if true, launch get_items_from_profiles process
1188
+ # +reset_stored+:: if true, remove already stored profiles
1189
+ # +substitute+:: subsstitute flag from check_ids
1190
+ def load_profiles(profiles, calc_metadata: true, reset_stored: false, substitute: false)
1191
+ self.reset_profiles if reset_stored
1192
+ # Check
1193
+ if profiles.kind_of?(Array)
1194
+ profiles.each_with_index do |items, i|
1195
+ self.add_profile(i, items.map {|item| item.to_sym}, substitute: substitute)
1196
+ end
1197
+ else # Hash
1198
+ if !profiles.keys.select{|id| @profiles.include?(id)}.empty?
1199
+ warn('Some profiles given are already stored. Stored version will be replaced')
2164
1200
  end
1201
+ profiles.each{|id, prof| self.add_profile(id, prof, substitute: substitute)}
2165
1202
  end
2166
- end
2167
1203
 
1204
+ self.add_observed_terms_from_profiles(reset: true)
2168
1205
 
2169
- # Return direct ancestors/descendants of a given term
2170
- # ===== Parameters
2171
- # +term+:: which are requested
2172
- # +relation+:: can be :ancestor or :descendant
2173
- # +remove_alternatives+:: if true, alternatives will be removed
2174
- # ===== Returns
2175
- # Direct ancestors/descendants of given term or nil if any error occurs
2176
- def get_direct_related(term, relation, remove_alternatives: false)
2177
- if @dicts[:is_a].nil?
2178
- warn("Hierarchy dictionary is not already calculated. Returning nil")
2179
- return nil
2180
- end
2181
- target = nil
2182
- case relation
2183
- when :ancestor
2184
- target = :byTerm
2185
- when :descendant
2186
- target = :byValue
2187
- else
2188
- warn('Relation type not allowed. Returning nil')
1206
+ if calc_metadata
1207
+ self.get_items_from_profiles
2189
1208
  end
2190
- return nil if target.nil?
2191
- query = @dicts[:is_a][target][term]
2192
- return query if query.nil?
2193
- query, _ = remove_alternatives_from_profile(query) if remove_alternatives
2194
- return query
2195
1209
  end
2196
1210
 
1211
+ # Stores a given profile with an specific ID. If ID is already assigend to a profile, it will be replaced
1212
+ # ===== Parameters
1213
+ # +id+:: assigned to profile
1214
+ # +terms+:: array of terms
1215
+ # +substitute+:: subsstitute flag from check_ids
1216
+ def add_profile(id, terms, substitute: true) # FRED: Talk with PSZ about the uniqness of IDs translated
1217
+ warn("Profile assigned to ID (#{id}) is going to be replaced") if @profiles.include? id
1218
+ correct_terms, rejected_terms = self.check_ids(terms, substitute: substitute)
1219
+ if !rejected_terms.empty?
1220
+ warn("Given terms contains erroneus IDs: #{rejected_terms.join(",")}. These IDs will be removed")
1221
+ end
1222
+ if id.is_a? Numeric
1223
+ @profiles[id] = correct_terms
1224
+ else
1225
+ @profiles[id.to_sym] = correct_terms
1226
+ end
1227
+ end
1228
+
2197
1229
 
2198
- # Return direct ancestors of a given term
1230
+ # Includes as "observed_terms" all terms included into stored profiles
2199
1231
  # ===== Parameters
2200
- # +term+:: which ancestors are requested
2201
- # +remove_alternatives+:: if true, alternatives will be removed
2202
- # ===== Returns
2203
- # Direct ancestors of given term or nil if any error occurs
2204
- def get_direct_ancentors(term, remove_alternatives: false)
2205
- return self.get_direct_related(term, :ancestor, remove_alternatives: remove_alternatives)
1232
+ # +reset+:: if true, reset observed freqs alreeady stored befor re-calculate
1233
+ def add_observed_terms_from_profiles(reset: false)
1234
+ @meta.each{|term, freqs| freqs[:observed_freq] = -1} if reset
1235
+ @profiles.each{|id, terms| self.add_observed_terms(terms: terms)}
2206
1236
  end
2207
1237
 
2208
- # Return direct descendants of a given term
1238
+ # ===== Returns
1239
+ # profiles assigned to a given ID
2209
1240
  # ===== Parameters
2210
- # +term+:: which descendants are requested
2211
- # +remove_alternatives+:: if true, alternatives will be removed
2212
- # ===== Returns
2213
- # Direct descendants of given term or nil if any error occurs
2214
- def get_direct_descendants(term, remove_alternatives: false)
2215
- return self.get_direct_related(term, :descendant, remove_alternatives: remove_alternatives)
1241
+ # +id+:: profile ID
1242
+ # ===== Return
1243
+ # specific profile or nil if it's not stored
1244
+ def get_profile(id)
1245
+ return @profiles[id]
2216
1246
  end
2217
1247
 
1248
+ # Modifying profiles
1249
+ ####################################
2218
1250
 
1251
+ def reset_profiles # Internal method used to remove already stored profiles and restore observed frequencies #TODO FRED: Modify test for this method.
1252
+ @profiles = {} # Clean profiles storage
1253
+ # Reset frequency observed
1254
+ @meta.each{|term,info| info[:observed_freq] = 0}
1255
+ @max_freqs[:observed_freq] = 0
1256
+ @items = {}
1257
+ end
2219
1258
 
2220
- #============================================================================
2221
- #============================================================================
1259
+ def expand_profiles(meth, unwanted_terms: [], calc_metadata: true, ontology: nil, minimum_childs: 1, clean_profiles: true)
1260
+ if meth == 'parental'
1261
+ @profiles.each do |id, terms|
1262
+ @profiles[id] = expand_profile_with_parents(terms) - unwanted_terms
1263
+ end
1264
+ get_items_from_profiles if calc_metadata
1265
+ elsif meth == 'propagate'
1266
+ get_items_from_profiles
1267
+ expand_items_to_parentals(ontology: ontology, minimum_childs: minimum_childs, clean_profiles: clean_profiles)
1268
+ get_profiles_from_items
1269
+ end
1270
+ add_observed_terms_from_profiles(reset: true)
1271
+ end
2222
1272
 
2223
- # NO IDEA WHAT THIS DOES. DON'T USE THIS METHODS IS NOT CHECKED
1273
+ # Remove alternatives (if official term is present) and ancestors terms of stored profiles
2224
1274
  # ===== Parameters
2225
- # ++::
2226
- # ===== Returns
2227
- # ...
2228
- def compute_relations_to_items(external_item_list, total_items, mode, thresold)
2229
- terms_levels = list_terms_per_level_from_items
2230
- #puts terms_levels.inspect.yellow
2231
- connect_familiars!(terms_levels)
2232
- #puts terms_levels.inspect.blue
2233
- item_list_with_transf_parental = get_item_list_parental(terms_levels)
2234
- results = []
2235
- if mode == :elim
2236
- results = compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list_with_transf_parental)
2237
- elsif mode == :weight
2238
- results = compute_relations_weight(terms_levels, external_item_list, total_items, item_list_with_transf_parental)
2239
- end
2240
- return results
1275
+ # +store+:: if true, clenaed profiles will replace already stored profiles
1276
+ # +remove_alternatives+:: if true, clenaed profiles will replace already stored profiles
1277
+ # ===== Returns
1278
+ # a hash with cleaned profiles
1279
+ def clean_profiles(store: false, remove_alternatives: true)
1280
+ cleaned_profiles = {}
1281
+ @profiles.each{ |id, terms| cleaned_profiles[id] = self.clean_profile(terms, remove_alternatives: remove_alternatives)}
1282
+ @profiles = cleaned_profiles if store
1283
+ return cleaned_profiles
2241
1284
  end
2242
1285
 
2243
- def get_item_list_parental(terms_levels)
2244
- transfered_list = {}
2245
- parent_dict = @dicts[:is_a][:byTerm]
2246
- levels = terms_levels.keys.sort
2247
- while levels.length > 1
2248
- level = levels.pop
2249
- terms_levels[level].each do |term|
2250
- parents = parent_dict[term]
2251
- if parents.nil?
2252
- next
2253
- elsif parents.length == 1
2254
- parent = parents.first
2255
- else
2256
- parent = (parents | terms_levels[level - 1]).first
2257
- end
2258
- term_it = @items[term]
2259
- parent_it = @items[parent]
2260
- curr_it = transfered_list[term]
2261
- parent_all_items = merge_groups([term_it, parent_it, curr_it])
2262
- transfered_list[parent] = parent_all_items if !parent_all_items.empty?
2263
- term_all_items = merge_groups([term_it, curr_it])
2264
- transfered_list[term] = term_all_items if !term_all_items.empty?
2265
- end
1286
+ # ID Handlers
1287
+ ####################################
1288
+
1289
+ # Trnaslates a bunch of profiles to it sets of term names
1290
+ # ===== Parameters
1291
+ # +profs+:: array of profiles
1292
+ # +asArray+:: flag to indicate if results must be returned as: true => an array of tuples [ProfID, ArrayOdNames] or ; false => hashs of translations
1293
+ # ===== Returns
1294
+ # translated profiles
1295
+ def translate_profiles_ids(profs = [], asArray: true)
1296
+ profs2proc = {}
1297
+ if profs.empty?
1298
+ profs2proc = @profiles
1299
+ else
1300
+ profs.each_with_index{|terms, index| profs2proc[index] = terms} if profs.kind_of?(Array)
2266
1301
  end
2267
- terms_levels[levels.first].each do |term| # Rescue lower level terms that not have children so they cannot receive items
2268
- transfered_list[term] = @items[term] if transfered_list[term].nil?
1302
+ profs_names = {}
1303
+ profs2proc.each do |id, terms|
1304
+ names, _ = translate_ids(terms)
1305
+ profs_names[id] = names
2269
1306
  end
2270
- return transfered_list
1307
+ return asArray ? profs_names.values : profs_names
2271
1308
  end
2272
1309
 
2273
- def merge_groups(groups)
2274
- return groups.compact.inject([]){|it, a| it | a}
1310
+ # Description of profile size
1311
+ ####################################
1312
+
1313
+ def profile_stats
1314
+ stats = Hash.new(0)
1315
+ data = get_profiles_sizes
1316
+ stats[:average] = data.sum().fdiv(data.size)
1317
+ sum_devs = data.sum{|element| (element - stats[:average]) ** 2}
1318
+ stats[:variance] = sum_devs.fdiv(data.size)
1319
+ stats[:standardDeviation] = stats[:variance] ** 0.5
1320
+ stats[:max] = data.max
1321
+ stats[:min] = data.min
1322
+
1323
+ stats[:count] = data.size
1324
+ data.each do |value|
1325
+ stats[:countNonZero] += 1 if value != 0
1326
+ end
1327
+
1328
+ stats[:q1] = data.get_quantiles(0.25)
1329
+ stats[:median] = data.get_quantiles(0.5)
1330
+ stats[:q3] = data.get_quantiles(0.75)
1331
+ return stats
1332
+
2275
1333
  end
2276
1334
 
2277
- def list_terms_per_level_from_items
2278
- terms_levels = {}
2279
- @items.each do |term, items|
2280
- level = self.get_term_level(term)
2281
- query = terms_levels[level]
2282
- if query.nil?
2283
- terms_levels[level] = [term]
2284
- else
2285
- query << term
2286
- end
2287
- end
2288
- return terms_levels
1335
+ # ===== Returns
1336
+ # mean size of stored profiles
1337
+ # ===== Parameters
1338
+ # +round_digits+:: number of digits to round result. Default: 4
1339
+ # ===== Returns
1340
+ # mean size of stored profiles
1341
+ def get_profiles_mean_size(round_digits: 4)
1342
+ sizes = self.get_profiles_sizes
1343
+ return sizes.sum.fdiv(@profiles.length).round(round_digits)
2289
1344
  end
2290
1345
 
2291
- def connect_familiars!(terms_levels)
2292
- levels = terms_levels.keys.sort
2293
- while levels.length > 1 # Process when current level has a parental level
2294
- level = levels.pop
2295
- parental_level = level - 1
2296
- parental_terms = terms_levels[parental_level]
2297
- if parental_terms.nil? # The queried parent level not exists but there is a parental level above of the non existant
2298
- parental_terms = [] # Initialize required parental level
2299
- terms_levels[parental_level] = parental_terms
2300
- levels << parental_level
2301
- end
2302
- terms_levels[level].each do |term|
2303
- path_info = @term_paths[term]
2304
- shortest_path_length = path_info[:shortest_path]
2305
- path = path_info[:paths].select{|p| p.length == shortest_path_length}.first
2306
- parental = path[1] # the first elements is the term itself
2307
- parental_terms << parental if !parental_terms.include?(parental)
1346
+ # ===== Returns
1347
+ # an array of sizes for all stored profiles
1348
+ # ===== Return
1349
+ # array of profile sizes
1350
+ def get_profiles_sizes()
1351
+ return @profiles.map{|id,terms| terms.length}
1352
+ end
1353
+
1354
+ # Calculates profiles sizes and returns size assigned to percentile given
1355
+ # ===== Parameters
1356
+ # +perc+:: percentile to be returned
1357
+ # +increasing_sort+:: flag to indicate if sizes order must be increasing. Default: false
1358
+ # ===== Returns
1359
+ # values assigned to percentile asked
1360
+ def get_profile_length_at_percentile(perc=50, increasing_sort: false)
1361
+ prof_lengths = self.get_profiles_sizes
1362
+ percentile_profile = prof_lengths.get_quantiles(perc.fdiv(100), decreasing_sort = !increasing_sort)
1363
+ return percentile_profile
1364
+ end
1365
+
1366
+ # IC data
1367
+ ####################################
1368
+
1369
+ # Get frequency terms and information coefficient from profiles #
1370
+
1371
+ # Calculates frequencies of stored profiles terms
1372
+ # ===== Parameters
1373
+ # +ratio+:: if true, frequencies will be returned as ratios between 0 and 1.
1374
+ # +asArray+:: used to transform returned structure format from hash of Term-Frequency to an array of tuples [Term, Frequency]
1375
+ # +translate+:: if true, term IDs will be translated to their names
1376
+ # ===== Returns
1377
+ # stored profiles terms frequencies
1378
+ def get_profiles_terms_frequency(ratio: true, asArray: true, translate: true)
1379
+ freqs = Hash.new(0)
1380
+ @profiles.each do |id, terms|
1381
+ terms.each{|term| freqs[term] += 1}
1382
+ end
1383
+ if translate
1384
+ translated_freqs = {}
1385
+ freqs.each do |term, freq|
1386
+ tr = self.translate_id(term)
1387
+ translated_freqs[tr] = freq if !tr.nil?
2308
1388
  end
1389
+ freqs = translated_freqs
1390
+ end
1391
+ n_profiles = @profiles.length
1392
+ freqs.transform_values!{|freq| freq.fdiv(n_profiles)} if ratio
1393
+ if asArray
1394
+ freqs = freqs.to_a
1395
+ freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
2309
1396
  end
1397
+ return freqs
2310
1398
  end
2311
1399
 
2312
- def compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list)
2313
- results = []
2314
- penalized_terms = {}
2315
- levels = terms_levels.keys.sort
2316
- levels.reverse_each do |level|
2317
- terms_levels[level].each do |term|
2318
- associated_items = item_list[term]
2319
- items_to_remove = penalized_terms[term]
2320
- items_to_remove = [] if items_to_remove.nil?
2321
- pval = get_fisher_exact_test(
2322
- external_item_list - items_to_remove,
2323
- associated_items - items_to_remove,
2324
- #((associated_items | external_item_list) - items_to_remove).length
2325
- total_items
2326
- )
2327
- if pval <= thresold
2328
- parents = get_ancestors(term) # Save the items for each parent term to remove them later in the fisher test
2329
- parents.each do |prnt|
2330
- query = penalized_terms[prnt]
2331
- if query.nil?
2332
- penalized_terms[prnt] = item_list[term].clone # We need a new array to store the following iterations
2333
- else
2334
- query.concat(item_list[term])
2335
- end
1400
+ # Calculates Resnik (ontology-structure) and Resnik-observed mean ICs for all stored profiles
1401
+ # ===== Returns
1402
+ # two hashes with Profiles and IC calculated for resnik and observed resnik respectively
1403
+ def get_profiles_resnik_dual_ICs(struct: :resnik, observ: :resnik_observed) # Maybe change name during migration to get_profiles_dual_ICs
1404
+ struct_ics = {}
1405
+ observ_ics = {}
1406
+ @profiles.each do |id, terms|
1407
+ struct_ics[id] = self.get_profile_mean_IC(terms, ic_type: struct)
1408
+ observ_ics[id] = self.get_profile_mean_IC(terms, ic_type: observ)
1409
+ end
1410
+ return struct_ics, observ_ics
1411
+ end
1412
+
1413
+
1414
+ # Calculates and return resnik ICs (by ontology and observed frequency) for observed terms
1415
+ # ===== Returns
1416
+ # two hashes with resnik and resnik_observed ICs for observed terms
1417
+ def get_observed_ics_by_onto_and_freq()
1418
+ ic_ont = {}
1419
+ resnik_observed = {}
1420
+ observed_terms = @profiles.values.flatten.uniq
1421
+ observed_terms.each do |term|
1422
+ ic_ont[term] = get_IC(term)
1423
+ resnik_observed[term] = get_IC(term, type: :resnik_observed)
1424
+ end
1425
+ return ic_ont, resnik_observed
1426
+ end
1427
+
1428
+ # Profiles vs Profiles #
1429
+
1430
+ def get_pair_index(profiles_A, profiles_B)
1431
+ pair_index = {}
1432
+ profiles_A.each do |curr_id, profile_A|
1433
+ profiles_B.each do |id, profile_B|
1434
+ profile_A.each do |term_A|
1435
+ profile_B.each do |term_B|
1436
+ pair_index[[term_A, term_B].sort] = true
2336
1437
  end
2337
1438
  end
2338
- results << [term, pval]
2339
- end
1439
+ end
2340
1440
  end
2341
- return results
1441
+ return pair_index
2342
1442
  end
2343
1443
 
2344
- def compute_relations_weight(terms_levels, external_item_list, total_items, item_list)
2345
- pvals = {}
2346
- item_weigths_per_term = Hash.new { |hash, key| Hash.new(1) } #https://mensfeld.pl/2016/09/ruby-hash-default-value-be-cautious-when-you-use-it/
2347
- levels = terms_levels.keys.sort
2348
- levels.reverse_each do |level|
2349
- terms_levels[level].each do |term|
2350
- associated_items = item_list[term]
2351
- #initialize observed items in item_weigths_per_term list
2352
- add_items_to_weigthed_list(term, associated_items, item_weigths_per_term) if !associated_items.nil?
2353
- children = @dicts[:is_a][:byValue][term]
2354
- if children.nil?
2355
- children = []
2356
- else
2357
- children = children.select{|ch| item_weigths_per_term[ch].length > 0} # Only use children with items associated to them OR transfered to them
2358
- end
2359
- computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
2360
- end
1444
+ def get_mica_index_from_profiles(pair_index, sim_type: :resnik, ic_type: :resnik, lca_index: true)
1445
+ pair_index.each do |pair, val|
1446
+ tA, tB = pair
1447
+ value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type, lca_index: lca_index)
1448
+ value = true if value.nil? # We use true to record that the operation was performed but there is no MICA value
1449
+ add2nestHash(@mica_index, tA, tB, value)
1450
+ add2nestHash(@mica_index, tB, tA, value)
2361
1451
  end
2362
- return pvals.to_a
2363
1452
  end
2364
1453
 
2365
- def add_items_to_weigthed_list(term, associated_items, weigthed_list)
2366
- term_weigthing = weigthed_list[term]
2367
- associated_items.each{|ai| term_weigthing[ai] = 1}
2368
- weigthed_list[term] = term_weigthing
1454
+ # Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with themselves
1455
+ # ===== Parameters
1456
+ # +external_profiles+:: set of external profiles. If nil, internal profiles will be compared with themselves
1457
+ # +sim_type+:: similitude method to be used. Default: resnik
1458
+ # +ic_type+:: ic type to be used. Default: resnik
1459
+ # +bidirectional+:: calculate bidirectional similitude. Default: false
1460
+ # ===== Return
1461
+ # Similitudes calculated
1462
+ def compare_profiles(external_profiles: nil, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
1463
+ profiles_similarity = {} #calculate similarity between patients profile
1464
+ if external_profiles.nil?
1465
+ comp_profiles = @profiles
1466
+ main_profiles = comp_profiles
1467
+ else
1468
+ comp_profiles = external_profiles
1469
+ main_profiles = @profiles
1470
+ end
1471
+ # Compare
1472
+ pair_index = get_pair_index(main_profiles, comp_profiles)
1473
+ @mica_index = {}
1474
+ get_mica_index_from_profiles(pair_index, sim_type: sim_type, ic_type: ic_type, lca_index: false)
1475
+ main_profiles.each do |curr_id, current_profile|
1476
+ comp_profiles.each do |id, profile|
1477
+ value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional, store_mica: true)
1478
+ add2nestHash(profiles_similarity, curr_id, id, value)
1479
+ end
1480
+ end
1481
+ return profiles_similarity
2369
1482
  end
2370
1483
 
2371
- def computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
2372
- #puts term.to_s.red
2373
- #puts @term_paths[term].inspect
2374
- #puts @dicts[:is_a][:byValue][term].inspect.light_blue
2375
- associated_items = item_weigths_per_term[term].keys
2376
- pval = get_fisher_exact_test(external_item_list, associated_items, total_items,
2377
- 'two_sided', item_weigths_per_term[term], true)
2378
- pvals[term] = pval
2379
- if children.length > 0
2380
- rates = {}
2381
- sig_child = 0
2382
- children.each do |child|
2383
- ratio = sigRatio(pvals[child], pval)
2384
- rates[child] = ratio
2385
- sig_child += 1 if ratio >= 1
2386
- end
2387
- if sig_child == 0 # CASE 1
2388
- children.each do |child|
2389
- current_ratio = rates[child]
2390
- query_child = item_weigths_per_term[child]
2391
- query_child.transform_values!{|weight| weight * current_ratio}
2392
- pvals[child] = get_fisher_exact_test(external_item_list, item_weigths_per_term[child].keys, total_items,
2393
- 'two_sided', item_weigths_per_term[child], true)
2394
- end
1484
+ # specifity_index related methods
1485
+ ####################################
1486
+
1487
+ # Return ontology levels from profile terms
1488
+ # ===== Returns
1489
+ # hash of term levels (Key: level; Value: array of term IDs)
1490
+ def get_ontology_levels_from_profiles(uniq = true)
1491
+ profiles_terms = @profiles.values.flatten
1492
+ profiles_terms.uniq! if uniq
1493
+ term_freqs_byProfile = Hash.new(0)
1494
+ profiles_terms.each do |term|
1495
+ term_freqs_byProfile[term] += 1
1496
+ end
1497
+ levels_filtered = {}
1498
+ terms_levels = @dicts[:level][:byValue]
1499
+ term_freqs_byProfile.each do |term, count|
1500
+ level = terms_levels[term]
1501
+ term_repeat = Array.new(count, term)
1502
+ query = levels_filtered[level]
1503
+ if query.nil?
1504
+ levels_filtered[level] = term_repeat
2395
1505
  else
2396
- ancs = get_ancestors(term, filter_alternatives = true)
2397
- ancs << term
2398
- rates.each do |ch, ratio|# CASE 2
2399
- if ratio >= 1 # The child is better than parent
2400
- ancs.each do |anc|
2401
- query_anc = item_weigths_per_term[anc]
2402
- associated_items.each do |item|
2403
- query_anc[item] /= ratio # /= --> query_anc[item]/ratio
2404
- end
2405
- end
2406
- end
2407
- end
2408
- computeTermSig(term, children - rates.keys, external_item_list, total_items, pvals, item_weigths_per_term)
1506
+ query.concat(term_repeat)
2409
1507
  end
2410
1508
  end
1509
+ return levels_filtered
2411
1510
  end
2412
1511
 
2413
- def sigRatio(pvalA, pvalB)
2414
- return Math.log(pvalA)/Math.log(pvalB)
2415
- end
1512
+ def get_profile_ontology_distribution_tables
1513
+ cohort_ontology_levels = get_ontology_levels_from_profiles(uniq=false)
1514
+ uniq_cohort_ontology_levels = get_ontology_levels_from_profiles
1515
+ ontology_levels = get_ontology_levels
1516
+ total_ontology_terms = ontology_levels.values.flatten.length
1517
+ total_cohort_terms = cohort_ontology_levels.values.flatten.length
1518
+ total_uniq_cohort_terms = uniq_cohort_ontology_levels.values.flatten.length
2416
1519
 
2417
- #============================================================================
2418
- #============================================================================
1520
+ distribution_ontology_levels = []
1521
+ distribution_percentage = []
1522
+ ontology_levels.each do |level, terms|
1523
+ cohort_terms = cohort_ontology_levels[level]
1524
+ uniq_cohort_terms = uniq_cohort_ontology_levels[level]
1525
+ if cohort_terms.nil? || uniq_cohort_terms.nil?
1526
+ num = 0
1527
+ u_num = 0
1528
+ else
1529
+ num = cohort_terms.length
1530
+ u_num = uniq_cohort_terms.length
1531
+ end
1532
+ distribution_ontology_levels << [level, terms.length, num]
1533
+ distribution_percentage << [
1534
+ level,
1535
+ (terms.length.fdiv(total_ontology_terms)*100).round(3),
1536
+ (num.fdiv(total_cohort_terms)*100).round(3),
1537
+ (u_num.fdiv(total_uniq_cohort_terms)*100).round(3)
1538
+ ]
1539
+ end
1540
+ distribution_ontology_levels.sort! { |x,y| x.first <=> y.first }
1541
+ distribution_percentage.sort! { |x,y| x.first <=> y.first }
1542
+ return distribution_ontology_levels, distribution_percentage
1543
+ end
2419
1544
 
2420
- # Check if a given ID is a removable (blacklist) term.
2421
- # +DEPRECATED+ use is_removable? instead
2422
- # ===== Parameters
2423
- # +id+:: to be checked
2424
- # ===== Returns
2425
- # true if given term is a removable (blacklist) term or false in other cases
2426
- def is_removable(id)
2427
- warn "[DEPRECATION] `is_removable` is deprecated. Please use `is_removable?` instead."
2428
- return @removable_terms.include?(id.to_sym)
1545
+ def get_dataset_specifity_index(mode)
1546
+ ontology_levels, distribution_percentage = get_profile_ontology_distribution_tables
1547
+ if mode == 'uniq'
1548
+ observed_distribution = 3
1549
+ elsif mode == 'weigthed'
1550
+ observed_distribution = 2
1551
+ end
1552
+ max_terms = distribution_percentage.map{|row| row[1]}.max
1553
+ maxL = nil
1554
+ distribution_percentage.each do |level_info|
1555
+ maxL = level_info.first if level_info[1] == max_terms
1556
+ end
1557
+ diffL = distribution_percentage.map{|l| [l[0], l[observed_distribution] - l[1]]}
1558
+ diffL.select!{|dL| dL.last > 0}
1559
+ highSection = diffL.select{|dL| dL.first > maxL}
1560
+ lowSection = diffL.select{|dL| dL.first <= maxL}
1561
+ dsi = nil
1562
+ if highSection.empty?
1563
+ dsi = 0
1564
+ else
1565
+ hss = get_weigthed_level_contribution(highSection, maxL, ontology_levels.length - maxL)
1566
+ lss = get_weigthed_level_contribution(lowSection, maxL, maxL)
1567
+ dsi = hss.fdiv(lss)
1568
+ end
1569
+ return dsi
2429
1570
  end
2430
1571
 
2431
- # Check if a given ID is a removable (blacklist) term
2432
- # ===== Parameters
2433
- # +id+:: to be checked
2434
- # ===== Returns
2435
- # true if given term is a removable (blacklist) term or false in other cases
2436
- def is_removable? id
2437
- return @removable_terms.include?(id.to_sym)
1572
+ def get_weigthed_level_contribution(section, maxL, nLevels)
1573
+ accumulated_weigthed_diffL = 0
1574
+ section.each do |level, diff|
1575
+ weightL = maxL - level
1576
+ if weightL >= 0
1577
+ weightL += 1
1578
+ else
1579
+ weightL = weightL.abs
1580
+ end
1581
+ accumulated_weigthed_diffL += diff * weightL
1582
+ end
1583
+ weigthed_contribution = accumulated_weigthed_diffL.fdiv(nLevels)
1584
+ return weigthed_contribution
2438
1585
  end
2439
1586
 
2440
- ############################################
2441
- # SPECIAL METHODS
2442
- #############################################
1587
+ ########################################
1588
+ ## GENERAL ONTOLOGY METHODS
1589
+ ########################################
1590
+
2443
1591
  def ==(other)
2444
- self.header == other.header &&
2445
- self.stanzas == other.stanzas &&
1592
+ self.terms == other.terms &&
2446
1593
  self.ancestors_index == other.ancestors_index &&
2447
1594
  self.alternatives_index == other.alternatives_index &&
2448
- self.obsoletes_index == other.obsoletes_index &&
2449
1595
  self.structureType == other.structureType &&
2450
1596
  self.ics == other.ics &&
2451
1597
  self.meta == other.meta &&
2452
1598
  self.dicts == other.dicts &&
2453
1599
  self.profiles == other.profiles &&
2454
- self.profilesDict == other.profilesDict &&
2455
1600
  (self.items.keys - other.items.keys).empty? &&
2456
- self.removable_terms == other.removable_terms &&
2457
- self.special_tags == other.special_tags &&
2458
1601
  self.items == other.items &&
2459
1602
  self.term_paths == other.term_paths &&
2460
1603
  self.max_freqs == other.max_freqs
@@ -2463,32 +1606,128 @@ class Ontology
2463
1606
 
2464
1607
  def clone
2465
1608
  copy = Ontology.new
2466
- copy.header = self.header.clone
2467
- copy.stanzas[:terms] = self.stanzas[:terms].clone
2468
- copy.stanzas[:typedefs] = self.stanzas[:typedefs].clone
2469
- copy.stanzas[:instances] = self.stanzas[:instances].clone
1609
+ copy.terms = self.terms.clone
2470
1610
  copy.ancestors_index = self.ancestors_index.clone
2471
1611
  copy.descendants_index = self.descendants_index.clone
2472
1612
  copy.alternatives_index = self.alternatives_index.clone
2473
- copy.obsoletes_index = self.obsoletes_index.clone
2474
1613
  copy.structureType = self.structureType.clone
2475
1614
  copy.ics = self.ics.clone
2476
1615
  copy.meta = self.meta.clone
2477
1616
  copy.dicts = self.dicts.clone
2478
1617
  copy.profiles = self.profiles.clone
2479
- copy.profilesDict = self.profilesDict.clone
2480
1618
  copy.items = self.items.clone
2481
- copy.removable_terms = self.removable_terms.clone
2482
1619
  copy.term_paths = self.term_paths.clone
2483
1620
  copy.max_freqs = self.max_freqs.clone
2484
1621
  return copy
2485
1622
  end
2486
1623
 
1624
+ # Exports an Ontology object in JSON format
1625
+ # ===== Parameters
1626
+ # +file+:: where info will be stored
1627
+ def write(file)
1628
+ # Take object stored info
1629
+ obj_info = {terms: @terms,
1630
+ ancestors_index: @ancestors_index,
1631
+ descendants_index: @descendants_index,
1632
+ alternatives_index: @alternatives_index,
1633
+ structureType: @structureType,
1634
+ ics: @ics,
1635
+ meta: @meta,
1636
+ max_freqs: @max_freqs,
1637
+ dicts: @dicts,
1638
+ profiles: @profiles,
1639
+ items: @items,
1640
+ term_paths: @term_paths}
1641
+ # Convert to JSON format & write
1642
+ File.open(file, "w") { |f| f.write obj_info.to_json }
1643
+ end
1644
+
1645
+
1646
+ def each(att = false)
1647
+ warn('terms empty') if @terms.empty?
1648
+ @terms.each do |id, tags|
1649
+ if att
1650
+ yield(id, tags)
1651
+ else
1652
+ yield(id)
1653
+ end
1654
+ end
1655
+ end
1656
+
1657
+ def get_root
1658
+ roots = []
1659
+ each do |term|
1660
+ roots << term if @ancestors_index[term].nil?
1661
+ end
1662
+ return roots
1663
+ end
1664
+
1665
+ def list_term_attributes
1666
+ terms = []
1667
+ each do |code|
1668
+ terms << [code, translate_id(code), get_term_level(code)]
1669
+ end
1670
+ return terms
1671
+ end
1672
+
1673
+ # Gets ontology levels calculated
1674
+ # ===== Returns
1675
+ # ontology levels calculated
1676
+ def get_ontology_levels
1677
+ return @dicts[:level][:byTerm].clone # By term, in this case, is Key::Level, Value::Terms
1678
+ end
2487
1679
 
2488
- #############################################
2489
- # ACCESS CONTROL
2490
- #############################################
1680
+ private
1681
+
1682
+ def add2hash(hash, key, val)
1683
+ query = hash[key]
1684
+ if query.nil?
1685
+ hash[key] = [val]
1686
+ else
1687
+ query << val
1688
+ end
1689
+ end
1690
+
1691
+ def add2nestHash(h, key1, key2, val)
1692
+ query1 = h[key1]
1693
+ if query1.nil?
1694
+ h[key1] = {key2 => val}
1695
+ else
1696
+ query1[key2] = val
1697
+ end
1698
+ end
2491
1699
 
2492
- attr_reader :file, :header, :stanzas, :ancestors_index, :descendants_index, :special_tags, :alternatives_index, :obsoletes_index, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :profilesDict, :items, :removable_terms, :term_paths
2493
- attr_writer :file, :header, :stanzas, :ancestors_index, :descendants_index, :special_tags, :alternatives_index, :obsoletes_index, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :profilesDict, :items, :removable_terms, :term_paths
1700
+ # Internal function to concat two elements.
1701
+ # ===== Parameters
1702
+ # +itemA+:: item to be concatenated
1703
+ # +itemB+:: item to be concatenated
1704
+ # ===== Returns
1705
+ # Concatenated objects
1706
+ def concatItems(itemA,itemB) # NEED TEST, CHECK WITH PSZ THIS METHOD
1707
+ # A is Array :: RETURN ARRAY
1708
+ # A_array : B_array
1709
+ # A_array : B_hash => NOT ALLOWED
1710
+ # A_array : B_single => NOT ALLOWED
1711
+ # A is Hash :: RETURN HASH
1712
+ # A_hash : B_array => NOT ALLOWED
1713
+ # A_hash : B_hash
1714
+ # A_hash : B_single => NOT ALLOWED
1715
+ # A is single element => RETURN ARRAY
1716
+ # A_single : B_array
1717
+ # A_single : B_hash => NOT ALLOWED
1718
+ # A_single : B_single
1719
+ concatenated = nil
1720
+ if itemA.kind_of?(Array) && itemB.kind_of?(Array)
1721
+ concatenated = itemA | itemB
1722
+ elsif itemA.kind_of?(Hash) && itemB.kind_of?(Hash)
1723
+ concatenated = itemA.merge(itemB) do |k, oldV, newV|
1724
+ self.concatItems(oldV,newV)
1725
+ end
1726
+ elsif itemB.kind_of?(Array)
1727
+ concatenated = ([itemA] + itemB).uniq
1728
+ elsif ![Array, Hash].include?(itemB.class)
1729
+ concatenated = [itemA,itemB].uniq
1730
+ end
1731
+ return concatenated
1732
+ end
2494
1733
  end