semtools 0.1.8 → 0.1.91

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,45 +8,30 @@ class Ontology
8
8
  # AUTHOR NOTES
9
9
  #########################################################
10
10
 
11
- # 1 - Store @profiles as @stanzas[:instances]
12
11
  # 2 - Items values (not keys) are imported as strings, not as symbols (maybe add a flag which indicates if values are, or not, symbols?)
13
12
 
14
-
15
13
  #############################################
16
14
  # FIELDS
17
15
  #############################################
18
- # Handled class variables
19
- # => @@basic_tags :: hash with main OBO structure tags
20
- # => @@allowed_calcs :: hash with allowed ICs and similaritites calcs
21
- # => @@symbolizable_ids :: tags which can be symbolized
22
- # => @@tags_with_trailing_modifiers :: tags which can include extra info after specific text modifiers
23
- #
24
16
  # Handled object variables
25
- # => @header :: file header (if is available)
26
- # => @stanzas :: OBO stanzas {:terms,:typedefs,:instances}
17
+ # => @terms :: OBO terms descriptions
27
18
  # => @ancestors_index :: hash of ancestors per each term handled with any structure relationships
28
19
  # => @descendants_index :: hash of descendants per each term handled with any structure relationships
29
20
  # => @alternatives_index :: has of alternative IDs (include alt_id and obsoletes)
30
- # => @obsoletes_index :: hash of obsoletes and it's new ids
31
- # => @special_tags :: set of special tags to be expanded (:is_a, :obsolete, :alt_id)
32
21
  # => @structureType :: type of ontology structure depending on ancestors relationship. Allowed: {atomic, sparse, circular, hierarchical}
33
- # => @ics :: already calculated ICs for handled terms and IC types
34
- # => @meta :: meta_information about handled terms like [ancestors, descendants, struct_freq, observed_freq]
35
- # => @max_freqs :: maximum freqs found for structural and observed freqs
36
22
  # => @dicts :: bidirectional dictionaries with three levels <key|value>: 1º) <tag|hash2>; 2º) <(:byTerm/:byValue)|hash3>; 3º) dictionary <k|v>
37
- # => @profiles :: set of terms assigned to an ID
38
- # => @profilesDict :: set of profile IDs assigned to a term
39
- # => @items :: hash with items relations to terms
40
23
  # => @removable_terms :: array of terms to not be considered
24
+ # => @meta :: meta_information about handled terms like [ancestors, descendants, struct_freq, observed_freq]
25
+ # => @ics :: already calculated ICs for handled terms and IC types
41
26
  # => @term_paths :: metainfo about parental paths of each term
27
+ # => @max_freqs :: maximum freqs found for structural and observed freqs
28
+ # => @items :: hash with items relations to terms
29
+ # => @profiles :: set of terms assigned to an ID
42
30
 
43
- @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:replaced_by,:consider,:alt_id]}
44
31
  @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
45
- @@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider]
46
- @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
47
- @@multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider, :subsetdef, :synonymtypedef, :property_value, :remark]
48
- @@symbolizable_ids.concat(@@tags_with_trailing_modifiers)
49
32
 
33
+ attr_accessor :terms, :ancestors_index, :descendants_index, :alternatives_index, :obsoletes, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :items, :term_paths, :reroot
34
+
50
35
  #############################################
51
36
  # CONSTRUCTOR
52
37
  #############################################
@@ -58,266 +43,138 @@ class Ontology
58
43
  # +removable_terms+: term to be removed from calcs
59
44
  # +build+: flag to launch metainfo calculation
60
45
  # +file_format+: force format type despite file extension. Can be :obo or :json
61
- def initialize(file: nil, load_file: false, removable_terms: [], build: true, file_format: nil)
62
- # Initialize object variables
63
- @header = nil
64
- @stanzas = {terms: {}, typedefs: {}, instances: {}}
46
+ def initialize(file: nil, load_file: false, removable_terms: [], build: true, file_format: nil, extra_dicts: [])
47
+ @terms = {}
65
48
  @ancestors_index = {}
66
49
  @descendants_index = {}
67
50
  @alternatives_index = {}
68
- @obsoletes_index = {}
51
+ @obsoletes = {} # id is obsolete but it could or not have an alt id
69
52
  @structureType = nil
70
53
  @ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
71
54
  @meta = {}
72
- @special_tags = @@basic_tags.clone
73
55
  @max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
74
56
  @dicts = {}
75
57
  @profiles = {}
76
- @profilesDict = {}
77
58
  @items = {}
78
- @removable_terms = []
79
59
  @term_paths = {}
80
- add_removable_terms(removable_terms) if !removable_terms.empty?
60
+ @reroot = false
81
61
  load_file = true unless file.nil? # This should remove load_file argument, keep it for old scripts
82
62
  # Load if proceeds
83
63
  if load_file
84
64
  fformat = file_format
85
65
  fformat = File.extname(file) if fformat.nil? && !file.nil?
86
66
  if fformat == :obo || fformat == ".obo"
87
- load(file, build: build)
67
+ OboParser.load(self, file, build: build, black_list: removable_terms, extra_dicts: extra_dicts)
88
68
  elsif fformat == :json || fformat == ".json"
89
- self.read(file, build: build)
69
+ JsonParser.load(self, file, build: build)
90
70
  elsif !fformat.nil?
91
71
  warn 'Format not allowed. Loading process will not be performed'
92
72
  end
73
+ precompute if build
93
74
  end
94
75
  end
95
76
 
96
-
97
77
  #############################################
98
- # CLASS METHODS
78
+ # GENERATE METADATA FOR ALL TERMS
99
79
  #############################################
100
80
 
101
- # Expand a (starting) term using a specific tag and return all extended terms into an array and
102
- # the relationship structuture observed (hierarchical or circular). If circular structure is
103
- # foumd, extended array will be an unique vector without starting term (no loops).
104
- # +Note+: we extremly recomend use get_related_ids_by_tag function instead of it (directly)
105
- # ===== Parameters
106
- # +start+:: term where start to expand
107
- # +terms+:: set to be used to expand
108
- # +target_tag+:: tag used to expand
109
- # +eexpansion+:: already expanded info
110
- # +split_info_char+:: special regex used to split info (if it is necessary)
111
- # +split_info_indx+:: special index to take splitted info (if it is necessary)
112
- # +alt_ids+:: set of alternative IDs
113
- # ===== Returns
114
- # A vector with the observed structure (string) and the array with extended terms.
115
- def self.get_related_ids(start_id, terms, target_tag, related_ids = {}, alt_ids = {})
116
- # Take start_id term available info and already accumulated info
117
- current_associations = related_ids[start_id]
118
- current_associations = [] if current_associations.nil?
119
- return [:no_term,[]] if terms[start_id].nil?
120
- id_relations = terms[start_id][target_tag]
121
- return [:source,[]] if id_relations.nil?
122
-
123
- # Prepare auxiliar variables
124
- struct = :hierarchical
125
-
126
- # Study direct extensions
127
- id_relations = id_relations.clone
128
- while id_relations.length > 0
129
- id = id_relations.shift
130
- id = alt_ids[id].first if alt_ids.include?(id) # NOTE: if you want to persist current ID instead source ID, re-implement this
131
-
132
- # Handle
133
- if current_associations.include?(id) # Check if already have been included into this expansion
134
- struct = :circular
135
- else
136
- current_associations << id
137
- if related_ids.include?(id) # Check if current already has been expanded
138
- current_associations = current_associations | related_ids[id]
139
- if current_associations.include?(start_id) # Check circular case
140
- struct = :circular
141
- [id, start_id].each{|repeated| current_associations.delete(repeated)}
142
- end
143
- else # Expand
144
- related_ids[start_id] = current_associations
145
- structExp, current_related_ids = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids) # Expand current
146
- current_associations = current_associations | current_related_ids
147
- struct = :circular if structExp == :circular # Check struct
148
- if current_associations.include?(start_id) # Check circular case
149
- struct = :circular
150
- current_associations.delete(start_id)
151
- end
152
- end
153
- end
154
- end
155
- related_ids[start_id] = current_associations
156
-
157
- return struct, current_associations
158
- end
159
-
160
-
161
- # Expand terms using a specific tag and return all extended terms into an array and
162
- # the relationship structuture observed (hierarchical or circular). If circular structure is
163
- # foumd, extended array will be an unique vector without starting term (no loops)
164
- # ===== Parameters
165
- # +terms+:: set to be used to expand
166
- # +target_tag+:: tag used to expand
167
- # +split_info_char+:: special regex used to split info (if it is necessary)
168
- # +split_info_indx+:: special index to take splitted info (if it is necessary)
169
- # +alt_ids+:: set of alternative IDs
170
- # +obsoletes+:: integer with the number of obsolete IDs. used to calculate structure type.
171
- # ===== Returns
172
- # A vector with the observed structure (string) and the hash with extended terms
173
- def self.get_related_ids_by_tag(terms:,target_tag:, alt_ids: {}, obsoletes: 0)
174
- # Define structure type
175
- structType = :hierarchical
176
- related_ids = {}
177
- terms.each do |id, tags|
178
- # Check if target tag is defined
179
- if !tags[target_tag].nil?
180
- # Obtain related terms
181
- set_structure, _ = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids)
182
- # Check structure
183
- structType = :circular if set_structure == :circular
184
- end
185
- end
186
-
187
- # Check special case
188
- structType = :atomic if related_ids.length <= 0
189
- structType = :sparse if related_ids.length > 0 && ((terms.length - related_ids.length - obsoletes) >= 2)
190
- # Return type and hash with related_ids
191
- return structType, related_ids
81
+ def precompute
82
+ get_index_frequencies
83
+ calc_term_levels(calc_paths: true)
192
84
  end
193
85
 
194
-
195
- # Class method to transform string with <tag : info> into hash structure
196
- # ===== Parameters
197
- # +attributes+:: array tuples with info to be transformed into hash format
86
+ # Calculates regular frequencies based on ontology structure (using parentals)
198
87
  # ===== Returns
199
- # Attributes stored into hash structure
200
- def self.info2hash(attributes, split_char = " ! ", selected_field = 0)
201
- # Load info
202
- info_hash = {}
203
- # Only TERMS multivalue tags (future add Typedefs and Instance)
204
- # multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider]
205
- attributes.each do |tag, value|
206
- value.gsub!(/{source=[\\\":A-Za-z0-9\/\.\-, =]+} /, '') if tag == 'is_a' # To delete "source" attributes in is_a tag of MONDO ontology
207
- # Check
208
- raise EncodingError, 'Info element incorrect format' if (tag.nil?) || (value.nil?)
209
- # Prepare
210
- tag = tag.lstrip.to_sym
211
- value.lstrip!
212
- value = value.split(split_char)[selected_field].to_sym if @@tags_with_trailing_modifiers.include?(tag)
213
-
214
- # Store
215
- query = info_hash[tag]
216
- if !query.nil? # Tag already exists
217
- if !query.kind_of?(Array) # Check that tag is multivalue
218
- raise('Attempt to concatenate plain text with another. The tag is not declared as multivalue. [' + tag.to_s + '](' + query + ')')
219
- else
220
- query << value # Add new value to tag
221
- end
222
- else # New entry
223
- if @@multivalue_tags.include?(tag)
224
- info_hash[tag] = [value]
225
- else
226
- info_hash[tag] = value
88
+ # true if everything end without errors and false in other cases
89
+ def get_index_frequencies() # Per each term, add frequencies
90
+ if @ancestors_index.empty?
91
+ warn('ancestors_index object is empty')
92
+ else
93
+ each(att = true) do |id, tags|
94
+ query = @meta[id]
95
+ if query.nil?
96
+ query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
97
+ @meta[id] = query
227
98
  end
99
+ query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].length.to_f : 0.0
100
+ query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].length.to_f : 0.0
101
+ query[:struct_freq] = query[:descendants] + 1.0
102
+ # Update maximums
103
+ @max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
104
+ @max_freqs[:max_depth] = query[:descendants] if @max_freqs[:max_depth] < query[:descendants]
228
105
  end
229
106
  end
230
- self.symbolize_ids(info_hash)
231
- return info_hash
232
107
  end
233
108
 
234
-
235
- # Class method to load an OBO format file (based on OBO 1.4 format). Specially focused on load
236
- # the Header, the Terms, the Typedefs and the Instances.
109
+ # Calculates ontology structural levels for all ontology terms
237
110
  # ===== Parameters
238
- # +file+:: OBO file to be loaded
239
- # ===== Returns
240
- # Hash with FILE, HEADER and STANZAS info
241
- def self.load_obo(file) #TODO: Send to obo_parser class
242
- raise("File is not defined") if file.nil?
243
- # Data variables
244
- header = ''
245
- stanzas = {terms: {}, typedefs: {}, instances: {}}
246
- # Auxiliar variables
247
- infoType = 'Header'
248
- currInfo = []
249
- stanzas_flags = %w[[Term] [Typedef] [Instance]]
250
- # Read file
251
- File.open(file).each do |line|
252
- line.chomp!
253
- next if line.empty?
254
- fields = line.split(':', 2)
255
- # Check if new instance is found
256
- if stanzas_flags.include?(line)
257
- header = self.process_entity(header, infoType, stanzas, currInfo)
258
- # Update info variables
259
- currInfo = []
260
- infoType = line.gsub!(/[\[\]]/, '')
261
- next
111
+ # +calc_paths+:: calculates term paths if it's not already calculated
112
+ # +shortest_path+:: if true, level is calculated with shortest path, largest path will be used in other cases
113
+ def calc_term_levels(calc_paths: false, shortest_path: true)
114
+ self.calc_term_paths if @term_paths.empty? && calc_paths
115
+ if !@term_paths.empty?
116
+ byTerm = {}
117
+ byValue = {}
118
+ @term_paths.each do |term, info|
119
+ level = shortest_path ? info[:shortest_path] : info[:largest_path]
120
+ level = level.nil? ? -1 : level.round(0)
121
+ byTerm[term] = level
122
+ add2hash(byValue, level, term)
262
123
  end
263
- # Concat info
264
- currInfo << fields
124
+ @dicts[:level] = {byTerm: byValue, byValue: byTerm} # Note: in this case, value has multiplicity and term is unique value
125
+ @max_freqs[:max_depth] = byValue.keys.max # Update maximum depth
265
126
  end
266
- # Store last loaded info
267
- header = self.process_entity(header, infoType, stanzas, currInfo) if !currInfo.empty?
268
-
269
- # Prepare to return
270
- finfo = {:file => file, :name => File.basename(file, File.extname(file))}
271
- return finfo, header, stanzas
272
127
  end
273
128
 
274
-
275
- # Handle OBO loaded info and stores it into correct container and format
276
- # ===== Parameters
277
- # +header+:: container
278
- # +infoType+:: current ontology item type detected
279
- # +stanzas+:: container
280
- # +currInfo+:: info to be stored
281
- # ===== Returns
282
- # header newly/already stored
283
- def self.process_entity(header, infoType, stanzas, currInfo)
284
- info = self.info2hash(currInfo)
285
- # Store current info
286
- if infoType.eql?('Header')
287
- header = info
288
- else
289
- id = info[:id]
290
- case infoType
291
- when 'Term'
292
- stanzas[:terms][id] = info
293
- when 'Typedef'
294
- stanzas[:typedefs][id] = info
295
- when 'Instance'
296
- stanzas[:instances][id] = info
129
+ # Find paths of a term following it ancestors and stores all possible paths for it and it's parentals.
130
+ # Also calculates paths metadata and stores into @term_paths
131
+ def calc_term_paths
132
+ @term_paths = {}
133
+ if [:hierarchical, :sparse].include? @structureType
134
+ each do |term|
135
+ expand_path(term)
136
+ path_attr = @term_paths[term]
137
+ # expand_path is arecursive function so these pat attributes must be calculated once the recursion is finished
138
+ path_attr[:total_paths] = path_attr[:paths].length
139
+ paths_sizes = path_attr[:paths].map{|path| path.length}
140
+ path_attr[:largest_path] = paths_sizes.max
141
+ path_attr[:shortest_path] = paths_sizes.min
297
142
  end
143
+ else
144
+ warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
298
145
  end
299
- return header
300
146
  end
301
147
 
302
-
303
- # Symboliza all values into hashs using symbolizable tags as keys
148
+ # Recursive function whic finds paths of a term following it ancestors and stores all possible paths for it and it's parentals
304
149
  # ===== Parameters
305
- # +item_hash+:: hash to be checked
306
- def self.symbolize_ids(item_hash)
307
- @@symbolizable_ids.each do |tag|
308
- query = item_hash[tag]
309
- if !query.nil?
310
- if query.kind_of?(Array)
311
- query.map!{|item| item.to_sym}
312
- else
313
- item_hash[tag] = query.to_sym if !query.nil?
150
+ # +curr_term+:: current visited term
151
+ # +visited_terms+:: already expanded terms
152
+ def expand_path(curr_term)
153
+ if !@term_paths.include?(curr_term)
154
+ path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []}
155
+ @term_paths[curr_term] = path_attr
156
+ direct_parentals = @dicts[:is_a][:byTerm][curr_term]
157
+ if direct_parentals.nil? # No parents :: End of recurrence
158
+ path_attr[:paths] << [curr_term]
159
+ else # Expand and concat
160
+ direct_parentals.each do |ancestor|
161
+ path_attr_parental = @term_paths[ancestor]
162
+ if path_attr_parental.nil? # Calculate new paths
163
+ self.expand_path(ancestor)
164
+ new_paths = @term_paths[ancestor][:paths]
165
+ else # Use direct_parental paths already calculated
166
+ new_paths = path_attr_parental[:paths]
167
+ end
168
+ path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(curr_term)})
314
169
  end
315
170
  end
316
171
  end
317
172
  end
318
173
 
174
+ #############################################
175
+ # CLASS METHODS (TODO: TO BE TRANFORMED IN INSTANCE METHODS)
176
+ #############################################
319
177
 
320
- #
321
178
  # ===== Parameters
322
179
  # +root+:: main term to expand
323
180
  # +ontology+:: to be cutted
@@ -325,18 +182,32 @@ class Ontology
325
182
  # +remove_up+:: if true, stores only the root term given an it descendants. If false, only root ancestors will be stored
326
183
  # ===== Returns
327
184
  # An Ontology object with terms after cut the ontology.
328
- def self.mutate(root, ontology, clone: true, remove_up: true)
185
+ def self.mutate(root, ontology, clone: true, remove_up: true) #TODO, pending to fix and pass to instance method
329
186
  ontology = ontology.clone if clone
330
187
  # Obtain affected IDs
331
188
  descendants = ontology.descendants_index[root]
332
189
  descendants << root # Store itself to do not remove it
333
190
  # Remove unnecesary terms
334
- ontology.stanzas[:terms] = ontology.stanzas[:terms].select{|id,v| remove_up ? descendants.include?(id) : !descendants.include?(id)}
191
+ terms = ontology.terms.select{|id,v| remove_up ? descendants.include?(id) : !descendants.include?(id)}
192
+ ids = terms.keys
193
+ terms.each do |id, term|
194
+ term[:is_a] = term[:is_a] & ids # Clean parental relations to keep only whose that exist between selected terms
195
+ end
196
+ ontology.terms = terms
335
197
  ontology.ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
336
198
  ontology.max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
337
199
  ontology.dicts = {}
338
- ontology.removable_terms = []
339
200
  ontology.term_paths = {}
201
+ ontology.reroot = true
202
+
203
+ ontology.ancestors_index = {}
204
+ ontology.descendants_index = {}
205
+ ontology.alternatives_index = {}
206
+ ontology.meta = {}
207
+ ontology.profiles = {}
208
+ ontology.items = {}
209
+
210
+
340
211
  # Recalculate metadata
341
212
  ontology.build_index
342
213
  ontology.add_observed_terms_from_profiles
@@ -344,33 +215,13 @@ class Ontology
344
215
  return ontology
345
216
  end
346
217
 
347
-
348
-
349
218
  #############################################
350
- # GENERAL METHODS
219
+ # TERM METHODS
351
220
  #############################################
352
221
 
353
- # Include removable terms to current removable terms list
354
- # ===== Parameters
355
- # +terms+:: terms array to be concatenated
356
- def add_removable_terms(terms)
357
- terms = terms.map{|term| term.to_sym}
358
- @removable_terms.concat(terms)
359
- end
360
-
361
-
362
- # Include removable terms to current removable terms list loading new
363
- # terms from a one column plain text file
364
- # ===== Parameters
365
- # +file+:: to be loaded
366
- def add_removable_terms_from_file(file)
367
- File.open(excluded_codes_file).each do |line|
368
- line.chomp!
369
- @removable_terms << line.to_sym
370
- end
371
- end
222
+ # I/O observed term from data
223
+ ####################################
372
224
 
373
-
374
225
  # Increase observed frequency for a specific term
375
226
  # ===== Parameters
376
227
  # +term+:: term which frequency is going to be increased
@@ -378,15 +229,7 @@ class Ontology
378
229
  # ===== Return
379
230
  # true if process ends without errors, false in other cases
380
231
  def add_observed_term(term:,increase: 1.0)
381
- # Check
382
- raise ArgumentError, "Term given is NIL" if term.nil?
383
- return false unless @stanzas[:terms].include?(term)
384
- return false if @removable_terms.include?(term)
385
- if @alternatives_index.include?(term)
386
- alt_id = @alternatives_index[term]
387
- @meta[alt_id] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[alt_id].nil?
388
- @meta[term] = @meta[alt_id]
389
- end
232
+ return false unless term_exist?(term)
390
233
  # Check if exists
391
234
  @meta[term] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[term].nil?
392
235
  # Add frequency
@@ -397,345 +240,199 @@ class Ontology
397
240
  return true
398
241
  end
399
242
 
243
+ # Obtain level and term relations
244
+ ####################################
400
245
 
401
- # Increase the arbitrary frequency of a given term set
402
246
  # ===== Parameters
403
- # +terms+:: set of terms to be updated
404
- # +increase+:: amount to be increased
405
- # +transform_to_sym+:: if true, transform observed terms to symbols. Default: false
406
- # ===== Return
407
- # true if process ends without errors and false in other cases
408
- def add_observed_terms(terms:, increase: 1.0, transform_to_sym: false)
409
- # Check
410
- raise ArgumentError, 'Terms array given is NIL' if terms.nil?
411
- raise ArgumentError, 'Terms given is not an array' if !terms.is_a? Array
412
- # Add observations
413
- if transform_to_sym
414
- checks = terms.map{|id| self.add_observed_term(term: id.to_sym,increase: increase)}
415
- else
416
- checks = terms.map{|id| self.add_observed_term(term: id,increase: increase)}
247
+ # +term+:: which are requested
248
+ # +relation+:: can be :ancestor or :descendant
249
+ # ===== Returns
250
+ # Direct ancestors/descendants of given term or nil if any error occurs
251
+ def get_direct_related(term, relation)
252
+ target = nil
253
+ case relation
254
+ when :ancestor
255
+ target = :byTerm
256
+ when :descendant
257
+ target = :byValue
258
+ else
259
+ warn('Relation type not allowed. Returning nil')
417
260
  end
418
- return checks
261
+ query = @dicts.dig(:is_a, target, term)
262
+ return query
419
263
  end
420
264
 
421
-
422
- # Compare to terms sets
265
+ # Return direct ancestors/descendants of a given term
266
+ # Return direct ancestors of a given term
423
267
  # ===== Parameters
424
- # +termsA+:: set to be compared
425
- # +termsB+:: set to be compared
426
- # +sim_type+:: similitude method to be used. Default: resnik
427
- # +ic_type+:: ic type to be used. Default: resnik
428
- # +bidirectional+:: calculate bidirectional similitude. Default: false
429
- # ===== Return
430
- # similitude calculated
431
- def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true, store_mica: false)
432
- # Check
433
- raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
434
- raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
435
- micasA = []
436
- # Compare A -> B
437
- termsA.each do |tA|
438
- micas = []
439
- termsB.each do |tB|
440
- if store_mica
441
- value = @mica_index.dig(tA, tB)
442
- else
443
- value = nil
444
- end
445
- if value.nil?
446
- value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)
447
- if store_mica
448
- value = true if value.nil? # We use true to save that the operation was made but there is not mica value
449
- add2nestHash(@mica_index, tA, tB, value)
450
- end
451
- end
452
- micas << value if value.class == Float
453
- end
454
- if !micas.empty?
455
- micasA << micas.max # Obtain maximum value
456
- else
457
- micasA << 0
458
- end
459
- end
460
- means_sim = micasA.inject{ |sum, el| sum + el }.fdiv(micasA.size)
461
- # Compare B -> A
462
- if bidirectional
463
- means_simA = means_sim * micasA.size
464
- means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false, store_mica: store_mica) * termsB.size
465
- means_sim = (means_simA + means_simB).fdiv(termsA.size + termsB.size)
466
- end
467
- # Return
468
- return means_sim
268
+ # +term+:: which ancestors are requested
269
+ # ===== Returns
270
+ # Direct ancestors of given term or nil if any error occurs
271
+ def get_direct_ancentors(term)
272
+ return self.get_direct_related(term, :ancestor)
469
273
  end
470
274
 
471
- def add2nestHash(h, key1, key2, val)
472
- query1 = h[key1]
473
- if query1.nil?
474
- h[key1] = {key2 => val}
475
- else
476
- query1[key2] = val
477
- end
275
+ # Return direct descendants of a given term
276
+ # ===== Parameters
277
+ # +term+:: which descendants are requested
278
+ # ===== Returns
279
+ # Direct descendants of given term or nil if any error occurs
280
+ def get_direct_descendants(term)
281
+ return self.get_direct_related(term, :descendant)
478
282
  end
479
283
 
480
- # Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with itself
284
+ # Find ancestors/descendants of a given term
481
285
  # ===== Parameters
482
- # +external_profiles+:: set of external profiles. If nil, internal profiles will be compared with itself
483
- # +sim_type+:: similitude method to be used. Default: resnik
484
- # +ic_type+:: ic type to be used. Default: resnik
485
- # +bidirectional+:: calculate bidirectional similitude. Default: false
486
- # ===== Return
487
- # Similitudes calculated
488
- def compare_profiles(external_profiles: nil, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
489
- profiles_similarity = {} #calculate similarity between patients profile
490
- profiles_ids = @profiles.keys
491
- if external_profiles.nil?
492
- comp_ids = profiles_ids
493
- comp_profiles = @profiles
494
- main_ids = comp_ids
495
- main_profiles = comp_profiles
286
+ # +term+:: to be checked
287
+ # +return_ancestors+:: return ancestors if true or descendants if false
288
+ # ===== Returns
289
+ # an array with all ancestors/descendants of given term or nil if parents are not available yet
290
+ def get_familiar(term, return_ancestors = true)
291
+ familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
292
+ if !familiars.nil?
293
+ familiars = familiars.clone
496
294
  else
497
- comp_ids = external_profiles.keys
498
- comp_profiles = external_profiles
499
- main_ids = profiles_ids
500
- main_profiles = @profiles
501
- end
502
- # Compare
503
- @mica_index = {}
504
- while !main_ids.empty?
505
- curr_id = main_ids.shift
506
- current_profile = main_profiles[curr_id]
507
- comp_ids.each do |id|
508
- profile = comp_profiles[id]
509
- value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional, store_mica: true)
510
- query = profiles_similarity[curr_id]
511
- if query.nil?
512
- profiles_similarity[curr_id] = {id => value}
513
- else
514
- query[id] = value
515
- end
516
- end
295
+ familiars = []
517
296
  end
518
- return profiles_similarity
297
+ return familiars
519
298
  end
520
299
 
300
+ # Find ancestors of a given term
301
+ # ===== Parameters
302
+ # +term+:: to be checked
303
+ # ===== Returns
304
+ # an array with all ancestors of given term or false if parents are not available yet
305
+ def get_ancestors(term)
306
+ return self.get_familiar(term, true)
307
+ end
521
308
 
522
- # Expand alternative IDs arround all already stored terms
309
+ # Find descendants of a given term
523
310
  # ===== Parameters
524
- # +alt_tag+:: tag used to expand alternative IDs
311
+ # +term+:: to be checked
525
312
  # ===== Returns
526
- # true if process ends without errors and false in other cases
527
- def get_index_alternatives(alt_tag: @@basic_tags[:alternative].last)
528
- # Check input
529
- raise('stanzas terms empty') if @stanzas[:terms].empty?
530
- # Take all alternative IDs
531
- alt_ids2add = {}
532
- @stanzas[:terms].each do |id, tags|
533
- if id == tags[:id] # Avoid simulated alternative terms
534
- # id = tags[:id] # Take always real ID in case of alternative terms simulted
535
- alt_ids = tags[alt_tag]
536
- if !alt_ids.nil?
537
- alt_ids = alt_ids - @removable_terms - [id]
538
- # Update info
539
- alt_ids.each do |alt_term|
540
- @alternatives_index[alt_term] = id
541
- alt_ids2add[alt_term] = @stanzas[:terms][id] if !@stanzas[:terms].include?(alt_term)
542
- @ancestors_index[alt_term] = @ancestors_index[id] if !@ancestors_index[id].nil?
543
- end
313
+ # an array with all descendants of given term or false if parents are not available yet
314
+ def get_descendants(term)
315
+ return self.get_familiar(term, false)
316
+ end
317
+
318
+ # Gets ontology level of a specific term
319
+ # ===== Returns
320
+ # Term level
321
+ def get_term_level(term)
322
+ return @dicts[:level][:byValue][term]
323
+ end
324
+
325
+ # nil, term not found, [] term exists but not has parents
326
+ def get_parental_path(term, which_path = :shortest_path, level = 0)
327
+ path = nil
328
+ path_attr = @term_paths[term]
329
+ if !path_attr.nil?
330
+ path_length = path_attr[which_path]
331
+ all_paths = path_attr[:paths]
332
+ if all_paths.empty?
333
+ path = []
334
+ else
335
+ path = all_paths.select{|pt| pt.length == path_length}.first.clone
336
+ if level > 0 # we want the term and his ascendants until a specific level
337
+ n_parents = path_length - level
338
+ path = path[0..n_parents]
544
339
  end
340
+ path.shift # Discard the term itself
545
341
  end
546
342
  end
547
- @stanzas[:terms].merge!(alt_ids2add)
343
+ return path
548
344
  end
549
345
 
346
+ # ID Handlers
347
+ ####################################
550
348
 
551
- # Executes basic expansions of tags (alternatives, obsoletes and parentals) with default values
552
349
  # ===== Returns
553
- # true if eprocess ends without errors and false in other cases
554
- def build_index()
555
- self.get_index_obsoletes
556
- self.get_index_alternatives
557
- self.get_index_child_parent_relations
558
- @alternatives_index.each{|k,v| @alternatives_index[k] = self.extract_id(v)}
559
- ## @alternatives_index.map {|k,v| @alternatives_index[k] = self.stanzas[:terms][v][:id] if k == v} unless self.stanzas[:terms].empty?
560
- @alternatives_index.compact!
561
- @obsoletes_index.each{|k,v| @obsoletes_index[k] = self.extract_id(v)}
562
- @obsoletes_index.compact!
563
- @ancestors_index.each{|k,v| @ancestors_index[k] = v.map{|t| self.extract_id(t)}.compact}
564
- @ancestors_index.compact!
565
- @descendants_index.each{|k,v| @descendants_index[k] = v.map{|t| self.extract_id(t)}.compact}
566
- @descendants_index.compact!
567
- self.get_index_frequencies
568
- self.calc_dictionary(:name)
569
- self.calc_dictionary(:synonym, select_regex: /\"(.*)\"/)
570
- self.calc_term_levels(calc_paths: true)
350
# Resolve the main ID assigned to a given ID. A non alternative/obsolete ID
# resolves to itself.
# ===== Parameters
# +id+:: ID to be translated
# ===== Return
# main ID related to the given ID, or nil if the given ID is not an allowed ID
def get_main_id(id)
  mainID = @alternatives_index[id]
  return nil if !term_exist?(id) && mainID.nil?
  # Walk the alt_id chain iteratively until the definitive ID is reached.
  # The visited guard terminates on cyclic or self-referencing alt_id chains,
  # which made the previous recursive version recurse without bound.
  visited = {}
  while !mainID.nil? && !visited.include?(id)
    visited[id] = true
    id = mainID
    mainID = @alternatives_index[id]
  end
  return id
end
572
367
 
573
-
574
- # Calculates regular frequencies based on ontology structure (using parentals)
575
- # ===== Returns
576
- # true if everything end without errors and false in other cases
577
- def get_index_frequencies()
578
- # Check
579
- if @ancestors_index.empty?
580
- warn('ancestors_index object is empty')
581
- else
582
- # Per each term, add frequencies
583
- @stanzas[:terms].each do |id, tags|
584
- if @alternatives_index.include?(id)
585
- alt_id = @alternatives_index[id]
586
- query = @meta[alt_id] # Check if exist
587
- if query.nil?
588
- query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
589
- @meta[alt_id] = query
590
- end
591
- @meta[id] = query
592
- # Note: alternative terms do not increase structural frequencies
593
- else # Official term
594
- query = @meta[id] # Check if exist
595
- if query.nil?
596
- query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
597
- @meta[id] = query
598
- end
599
- # Store metadata
600
- query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !@alternatives_index.include?(anc)}.to_f : 0.0
601
- query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !@alternatives_index.include?(desc)}.to_f : 0.0
602
- query[:struct_freq] = query[:descendants] + 1.0
603
- # Update maximums
604
- @max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
605
- @max_freqs[:max_depth] = query[:descendants] if @max_freqs[:max_depth] < query[:descendants]
606
- end
607
- end
608
- end
368
# Translate a given value using an already calculated dictionary.
# ===== Parameters
# +toTranslate+:: value to be translated using the dictionary
# +tag+:: tag used to generate the dictionary
# +byValue+:: if true use the values-as-keys dictionary, otherwise terms-as-keys. Default: true
# ===== Return
# translation (nil when not present in the dictionary)
def translate(toTranslate, tag, byValue: true)
  if byValue
    dict = @dicts[tag][:byValue]
  else
    dict = @dicts[tag][:byTerm]
    toTranslate = get_main_id(toTranslate)
  end
  dict[toTranslate]
end
610
380
 
611
-
612
- # Expand obsoletes set and link info to their alternative IDs
381
+ # Translate a name given
613
382
  # ===== Parameters
614
- # +obs_tags+:: tags to be used to find obsoletes
615
- # +alt_tags+:: tags to find alternative IDs (if are available)
616
- # +reset_obsoletes+:: flag to indicate if obsoletes set must be reset. Default: true
617
- # ===== Returns
618
- # true if process ends without errors and false in other cases
619
- def get_index_obsoletes(obs_tag: @@basic_tags[:obsolete], alt_tags: @@basic_tags[:alternative])
620
- if @stanzas[:terms].empty?
621
- warn('stanzas terms empty')
622
- else
623
- # Check obsoletes
624
- @stanzas[:terms].each do |id, term_tags|
625
- next if term_tags.nil?
626
- next if self.is_alternative?(id)
627
- query = term_tags[obs_tag]
628
- if !query.nil? && query == 'true' # Obsolete tag presence
629
- next if !@obsoletes_index[id].nil? # Already stored
630
- # Check if alternative value is available
631
- alt_ids = alt_tags.map{|alt| term_tags[alt]}.compact
632
- if !alt_ids.empty?
633
- alt_id = alt_ids.first.first #FIRST tag, FIRST id
634
- # Store
635
- @alternatives_index[id] = alt_id
636
- @obsoletes_index[id] = alt_id
637
- end
638
- end
639
- end
640
- end
383
+ # +name+:: to be translated
384
+ # ===== Return
385
+ # translated name or nil if it's not stored into this ontology
386
+ def translate_name(name)
387
+ term = self.translate(name, :name)
388
+ term = self.translate(name, :synonym) if term.nil?
389
+ return term
641
390
  end
642
391
 
643
-
644
- # Expand parentals set and link all info to their alternative IDs. Also launch frequencies process
392
# Translates a given ID to its assigned name.
# ===== Parameters
# +id+:: ID to be translated
# ===== Return
# main name or nil if it's not included into this ontology
def translate_id(id)
  names = translate(id, :name, byValue: false)
  names.nil? ? nil : names.first
end
693
401
 
402
+ # Get term frequency and information
403
+ ####################################
694
404
 
695
- # Find ancestors of a given term
405
+ # One single term #
406
+
407
# Get a term frequency.
# ===== Parameters
# +term+:: term to be checked
# +type+:: type of frequency to be returned. Allowed: [:struct_freq, :observed_freq]
# ===== Returns
# frequency of the given term or nil if the term is not allowed
def get_frequency(term, type: :struct_freq)
  meta_entry = @meta[term]
  meta_entry && meta_entry[type]
end
704
417
 
705
-
706
- # Find descendants of a given term
418
# Gets structural frequency of a given term.
# ===== Parameters
# +term+:: term to be checked
# ===== Returns
# structural frequency of the given term or nil if the term is not allowed
def get_structural_frequency(term)
  get_frequency(term, type: :struct_freq)
end
715
426
 
716
-
717
- # Find ancestors/descendants of a given term
427
# Gets observed frequency of a given term.
# ===== Parameters
# +term+:: term to be checked
# ===== Returns
# observed frequency of the given term or nil if the term is not allowed
def get_observed_frequency(term)
  get_frequency(term, type: :observed_freq)
end
737
435
 
738
-
739
436
  # Obtain IC of an specific term
740
437
  # ===== Parameters
741
438
  # +term+:: which IC will be calculated
@@ -789,7 +486,7 @@ class Ontology
789
486
  ###########################################
790
487
  when :seco, :zhou # SECO:: An intrinsic information content metric for semantic similarity in WordNet
791
488
  # 1 - ( log(hypo(x) + 1) / log(max_nodes) )
792
- ic = 1 - Math.log10(term_meta[:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
489
+ ic = 1 - Math.log10(term_meta[:struct_freq]).fdiv(Math.log10(@terms.length))
793
490
  if :zhou # New Model of Semantic Similarity Measuring in Wordnet
794
491
  # k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
795
492
  @ics[:seco][term] = ic # Special store
@@ -803,40 +500,25 @@ class Ontology
803
500
  return ic
804
501
  end
805
502
 
503
+ # Term vs Term #
806
504
 
807
- # Calculates and return resnik ICs (by ontology and observed frequency) for observed terms
808
- # ===== Returns
809
- # two hashes with resnik and resnik_observed ICs for observed terms
810
- def get_observed_ics_by_onto_and_freq
811
- # Chech there are observed terms
812
- if @profiles.empty?
813
- resnik = {}
814
- resnik_observed = {}
815
- else
816
- # Calc ICs for all terms
817
- observed_terms = @profiles.values.flatten.uniq
818
- observed_terms.each{ |term| get_IC(term)}
819
- observed_terms.each{ |term| get_IC(term, type: :resnik_observed)}
820
- resnik = @ics[:resnik].select{|k,v| observed_terms.include?(k)}
821
- resnik_observed = @ics[:resnik_observed].select{|k,v| observed_terms.include?(k)}
505
# Compute the set of common ancestors (LCA candidates) of two terms.
# ===== Parameters
# +termA+:: first term
# +termB+:: second term
# +lca_index+:: if true, look up the precomputed @lca_index instead of intersecting ancestor sets
# ===== Returns
# array with the shared ancestors (the terms themselves included), [] when none
def get_LCA(termA, termB, lca_index: false)
  shared = []
  if lca_index
    hit = @lca_index.dig(termA, termB)
    shared = [hit] unless hit.nil?
  else # Obtain ancestors (include the terms themselves too)
    anc_a = get_ancestors(termA)
    anc_b = get_ancestors(termB)
    unless anc_a.empty? && anc_b.empty?
      anc_a << termA
      anc_b << termB
      shared = anc_a & anc_b
    end
  end
  shared
end
838
521
 
839
-
840
522
  # Find the Most Index Content shared Ancestor (MICA) of two given terms
841
523
  # ===== Parameters
842
524
  # +termA+:: term to be cheked
@@ -844,30 +526,31 @@ class Ontology
844
526
  # +ic_type+:: IC formula to be used
845
527
  # ===== Returns
846
528
  # the MICA(termA,termB) and it's IC
847
- def get_MICA(termA, termB, ic_type = :resnik)
848
- termA = @alternatives_index[termA] if @alternatives_index.include?(termA)
849
- termB = @alternatives_index[termB] if @alternatives_index.include?(termB)
529
# Find the Most Informative Common Ancestor (MICA) of two given terms.
# ===== Parameters
# +termA+:: term to be checked
# +termB+:: term to be checked
# +ic_type+:: IC formula to be used
# +lca_index+:: forwarded to get_LCA to use the precomputed LCA index
# ===== Returns
# the MICA(termA,termB) and its IC as [term, ic]
def get_MICA(termA, termB, ic_type = :resnik, lca_index = false)
  best = [nil, -1.0]
  if termA.eql?(termB) # Special case: a term is its own MICA
    best = [termA, get_IC(termA, type: ic_type)]
  else
    get_LCA(termA, termB, lca_index: lca_index).each do |shared_anc| # Find MICA among shared ancestors
      candidate_ic = get_IC(shared_anc, type: ic_type)
      best = [shared_anc, candidate_ic] if candidate_ic > best[1]
    end
  end
  best
end
870
542
 
543
# Find the IC of the Most Informative Common Ancestor (MICA) of two given terms.
# ===== Parameters
# +termA+:: term to be checked
# +termB+:: term to be checked
# +ic_type+:: IC formula to be used
# ===== Returns
# the IC of the MICA(termA,termB), or nil when no MICA exists
def get_ICMICA(termA, termB, ic_type = :resnik)
  mica_term, mica_ic = get_MICA(termA, termB, ic_type)
  mica_term.nil? ? nil : mica_ic
end
871
554
 
872
555
  # Calculate similarity between two given terms
873
556
  # ===== Parameters
@@ -877,11 +560,10 @@ class Ontology
877
560
  # +ic_type+:: IC formula to be used
878
561
  # ===== Returns
879
562
  # the similarity between both sets or false if frequencies are not available yet
880
- def get_similarity(termA, termB, type: :resnik, ic_type: :resnik)
881
- # Check
563
+ def get_similarity(termA, termB, type: :resnik, ic_type: :resnik, lca_index: false)
882
564
  raise ArgumentError, "SIM type specified (#{type}) is not allowed" if !@@allowed_calcs[:sims].include?(type)
883
565
  sim = nil
884
- mica, sim_res = get_MICA(termA, termB, ic_type)
566
+ mica, sim_res = get_MICA(termA, termB, ic_type, lca_index)
885
567
  if !mica.nil?
886
568
  case type
887
569
  when :resnik
@@ -895,1631 +577,1027 @@ class Ontology
895
577
  return sim
896
578
  end
897
579
 
580
+ # Checking valid terms
581
+ ####################################
898
582
 
899
- # Method used to load information stored into an OBO file and store it into this object.
900
- # If a file is specified by input parameter, current @file value is updated
901
- # ===== Parameters
902
- # +file+:: optional file to update object stored file
903
- def load(file, build: true)
904
- _, header, stanzas = self.class.load_obo(file)
905
- @header = header
906
- @stanzas = stanzas
907
- self.remove_removable()
908
- # @removable_terms.each{|removableID| @stanzas[:terms].delete(removableID)} if !@removable_terms.empty? # Remove if proceed
909
- self.build_index() if build
583
# Check if a given ID is stored as a term in this ontology.
# ===== Parameters
# +id+:: ID to be checked
# ===== Returns
# true if the ID is a known term, false otherwise
def term_exist?(id)
  @terms.include?(id)
end
911
586
 
912
- #
913
- def remove_removable()
914
- @removable_terms.each{|removableID| @stanzas[:terms].delete(removableID)} if !@removable_terms.empty? # Remove if proceed
587
# Check if a given term is marked as obsolete.
# ===== Parameters
# +term+:: term to be checked
# ===== Returns
# true when the term is registered in @obsoletes
def is_obsolete?(term)
  @obsoletes.include?(term)
end
916
591
 
592
+ #############################################
593
+ # ITEMS METHODS
594
+ #############################################
595
+
596
+ # I/O Items
597
+ ####################################
917
598
 
918
- # Exports an OBO_Handler object in json format
599
+ # Store specific relations hash given into ITEMS structure
919
600
  # ===== Parameters
920
- # +file+:: where info will be stored
921
- def write(file)
922
- # Take object stored info
923
- obj_info = {header: @header,
924
- stanzas: @stanzas,
925
- ancestors_index: @ancestors_index,
926
- descendants_index: @descendants_index,
927
- alternatives_index: @alternatives_index,
928
- obsoletes_index: @obsoletes_index,
929
- structureType: @structureType,
930
- ics: @ics,
931
- meta: @meta,
932
- special_tags: @special_tags,
933
- max_freqs: @max_freqs,
934
- dicts: @dicts,
935
- profiles: @profiles,
936
- profilesDict: @profilesDict,
937
- items: @items,
938
- removable_terms: @removable_terms,
939
- term_paths: @term_paths}
940
- # Convert to JSON format & write
941
- File.open(file, "w") { |f| f.write obj_info.to_json }
942
- end
601
# Store a specific relations hash into the ITEMS structure.
# ===== Parameters
# +relations+:: hash (term => items) to be stored
# +remove_old_relations+:: substitute the ITEMS structure instead of merging the new relations
# +expand+:: if true, already stored keys will be updated with the unique union of both sets
def load_item_relations_to_terms(relations, remove_old_relations = false, expand = false)
  @items = {} if remove_old_relations
  # Warn (once) when any key is not a known term; unknown keys are stored anyway
  unless relations.keys.all? { |term| term_exist?(term) }
    warn('Some terms specified are not stored into this ontology. These not correct terms will be stored too')
  end
  if expand
    @items = concatItems(@items, relations)
  else
    @items.merge!(relations)
  end
end
943
618
 
619
+ # Defining Items from instance variables
620
+ ########################################
944
621
 
945
- def is_number? string
946
- true if Float(string) rescue false
622
# Assign an already calculated dictionary as an items set.
# ===== Parameters
# +dictID+:: dictionary ID to be stored (:byTerm will be used)
# +remove_old_relations+:: if true, reset ITEMS before loading the dictionary
def set_items_from_dict(dictID, remove_old_relations = false)
  @items = {} if remove_old_relations
  stored_dict = @dicts[dictID]
  if stored_dict.nil?
    warn('Specified ID is not calculated. Dict will not be added as a items set')
  else
    @items.merge!(stored_dict[:byTerm])
  end
end
948
634
 
949
-
950
- # Read a JSON file with an OBO_Handler object stored
635
+ # Get related profiles to a given term
951
636
  # ===== Parameters
952
- # +file+:: with object info
953
- # +file+:: if true, calculate indexes. Default: true
954
- # ===== Return
955
- # OBO_Handler internal fields
956
- def read(file, build: true)
957
- # Read file
958
- jsonFile = File.open(file)
959
- jsonInfo = JSON.parse(jsonFile.read, :symbolize_names => true)
960
- # Pre-process (Symbolize some hashs values)
961
- if !jsonInfo[:header].nil?
962
- aux = jsonInfo[:header].map do |entry,info|
963
- if info.kind_of?(Array) && @@symbolizable_ids.include?(entry)
964
- [entry,info.map{|item| item.to_sym}]
965
- else
966
- [entry,info]
967
- end
968
- end
969
- jsonInfo[:header] = aux.to_h
970
- end
971
- jsonInfo[:stanzas][:terms].map{|id,info| self.class.symbolize_ids(info)} # STANZAS
972
- jsonInfo[:stanzas][:typedefs].map{|id,info| self.class.symbolize_ids(info)}
973
- jsonInfo[:stanzas][:instances].map{|id,info| self.class.symbolize_ids(info)}
974
- # Optional
975
- jsonInfo[:alternatives_index] = jsonInfo[:alternatives_index].map{|id,value| [id, value.to_sym]}.to_h unless jsonInfo[:alternatives_index].nil?
976
- jsonInfo[:ancestors_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}} unless jsonInfo[:ancestors_index].nil?
977
- jsonInfo[:descendants_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}} unless jsonInfo[:descendants_index].nil?
978
- jsonInfo[:obsoletes_index] = jsonInfo[:obsoletes_index].map{|id,value| [id, value.to_sym]}.to_h unless jsonInfo[:obsoletes_index].nil?
979
- jsonInfo[:dicts] = jsonInfo[:dicts].each do |flag, dictionaries|
980
- next if dictionaries.nil?
981
- # Special case: byTerm
982
- dictionaries[:byTerm] = dictionaries[:byTerm].map do |term, value|
983
- if !term.to_s.scan(/\A[-+]?[0-9]*\.?[0-9]+\Z/).empty? # Numeric dictionary
984
- [term.to_s.to_i, value.map{|term| term.to_sym}]
985
- elsif value.is_a? Numeric # Numeric dictionary
986
- [term.to_sym, value]
987
- elsif value.kind_of?(Array) && flag == :is_a
988
- [term.to_sym, value.map{|v| v.to_sym}]
989
- else
990
- [term.to_sym, value]
991
- end
992
- end
993
- dictionaries[:byTerm] = dictionaries[:byTerm].to_h
994
- # By value
995
- dictionaries[:byValue] = dictionaries[:byValue].map do |value, term|
996
- if value.is_a? Numeric # Numeric dictionary
997
- [value, term.to_sym]
998
- elsif term.is_a? Numeric # Numeric dictionary
999
- [value.to_s.to_sym, term]
1000
- elsif flag == :is_a
1001
- [value.to_sym, term.map{|v| v.to_sym}]
1002
- elsif term.kind_of?(Array)
1003
- [value.to_sym, term.map{|t| t.to_sym}]
1004
- else
1005
- [value.to_s, term.to_sym]
1006
- end
1007
- end
1008
- dictionaries[:byValue] = dictionaries[:byValue].to_h
1009
- end
1010
- if !jsonInfo[:profiles].nil?
1011
- jsonInfo[:profiles].map{|id,terms| terms.map!{|term| term.to_sym}}
1012
- jsonInfo[:profiles].keys.map{|id| jsonInfo[:profiles][id.to_s.to_i] = jsonInfo[:profiles].delete(id) if self.is_number?(id.to_s)}
1013
- end
1014
- jsonInfo[:profilesDict].map{|term,ids| ids.map!{|id| id.to_sym if !id.is_a?(Numeric)}} unless jsonInfo[:profilesDict].nil?
1015
- jsonInfo[:removable_terms] = jsonInfo[:removable_terms].map{|term| term.to_sym} unless jsonInfo[:removable_terms].nil?
1016
- jsonInfo[:special_tags] = jsonInfo[:special_tags].each do |k, v|
1017
- next if v.nil?
1018
- if v.kind_of?(Array)
1019
- jsonInfo[:special_tags][k] = v.map{|tag| tag.to_sym}
1020
- else
1021
- jsonInfo[:special_tags][k] = v.to_sym
1022
- end
637
# Get the items related to a given term.
# ===== Parameters
# +term+:: term to be checked
# ===== Returns
# items (e.g. profile IDs) linked to the given term, nil when none are stored
def get_items_from_term(term)
  @items[term]
end
643
+
644
+ # For each term in profiles add the ids in the items term-id dictionary
645
+ def get_items_from_profiles
646
+ @profiles.each do |id, terms|
647
+ terms.each {|term| add2hash(@items, term, id) }
1023
648
  end
1024
- jsonInfo[:items].each{|k,v| jsonInfo[:items][k] = v.map{|item| item.to_sym}} unless jsonInfo[:items].nil?
1025
- jsonInfo[:term_paths].each{|term,info| jsonInfo[:term_paths][term][:paths] = info[:paths].map{|path| path.map{|t| t.to_sym}}} unless jsonInfo[:term_paths].nil?
1026
-
1027
- # Store info
1028
- @header = jsonInfo[:header]
1029
- @stanzas = jsonInfo[:stanzas]
1030
- @ancestors_index = jsonInfo[:ancestors_index]
1031
- @descendants_index = jsonInfo[:descendants_index]
1032
- @alternatives_index = jsonInfo[:alternatives_index]
1033
- @obsoletes_index = jsonInfo[:obsoletes_index]
1034
- jsonInfo[:structureType] = jsonInfo[:structureType].to_sym unless jsonInfo[:structureType].nil?
1035
- @structureType = jsonInfo[:structureType]
1036
- @ics = jsonInfo[:ics]
1037
- @meta = jsonInfo[:meta]
1038
- @special_tags = jsonInfo[:special_tags]
1039
- @max_freqs = jsonInfo[:max_freqs]
1040
- @dicts = jsonInfo[:dicts]
1041
- @profiles = jsonInfo[:profiles]
1042
- @profilesDict = jsonInfo[:profilesDict]
1043
- @items = jsonInfo[:items]
1044
- @removable_terms = jsonInfo[:removable_terms]
1045
- @term_paths = jsonInfo[:term_paths]
1046
-
1047
- self.build_index() if build
1048
- end
1049
-
1050
-
1051
- # Check if a given ID is stored as term into this object
1052
- # ===== Parameters
1053
- # +id+:: to be checked
1054
- # ===== Return
1055
- # True if term is allowed or false in other cases
1056
- def exists? id
1057
- return stanzas[:terms].include?(id)
1058
649
  end
1059
650
 
651
+ # Defining instance variables from items
652
+ ########################################
1060
653
 
1061
- # This method assumes that a text given contains an allowed ID. And will try to obtain it splitting it
1062
- # ===== Parameters
1063
- # +text+:: to be checked
1064
- # ===== Return
1065
- # The correct ID if it can be found or nil in other cases
1066
- def extract_id(text, splitBy: ' ')
1067
- if self.exists?(text)
1068
- return text
1069
- else
1070
- splittedText = text.to_s.split(splitBy).first.to_sym
1071
- return self.exists?(splittedText) ? splittedText : nil
654
# Rebuild @profiles by inverting the ITEMS (term => ids) mapping.
def get_profiles_from_items
  rebuilt = {}
  @items.each do |term, ids|
    ids.each { |id| add2hash(rebuilt, id, term) }
  end
  @profiles = rebuilt
end
1074
661
 
662
+ # Expanding items
663
+ ####################################
1075
664
 
1076
- # Generate a bidirectinal dictionary set using a specific tag and terms stanzas set
1077
- # This functions stores calculated dictionary into @dicts field.
1078
- # This functions stores first value for multivalue tags
1079
- # This function does not handle synonyms for byValue dictionaries
665
+ # This method computes childs similarity and impute items to it parentals. To do that Item keys must be this ontology allowed terms.
666
+ # Similarity will be calculated by text extact similarity unless an ontology object will be provided. In this case, MICAs will be used
1080
667
  # ===== Parameters
1081
- # +tag+:: to be used to calculate dictionary
1082
- # +select_regex+:: gives a regfex that can be used to modify value to be stored
1083
- # +substitute_alternatives+:: flag used to indicate if alternatives must, or not, be replaced by it official ID
1084
- # +store_tag+:: flag used to store dictionary. If nil, mandatory tag given will be used
1085
- # +multiterm+:: if true, byValue will allows multi-term linkage (array)
1086
- # +self_type_references+:: if true, program assumes that refrences will be between Ontology terms, and it term IDs will be checked
1087
- # ===== Return
1088
- # void. And stores calcualted bidirectional dictonary into dictionaries main container
1089
- def calc_dictionary(tag, select_regex: nil, substitute_alternatives: true, store_tag: nil, multiterm: false, self_type_references: false)
1090
- tag = tag.to_sym
1091
- store_tag = tag if store_tag.nil?
1092
- if @stanzas[:terms].empty?
1093
- warn('Terms are not already loaded. Aborting dictionary calc')
1094
- else
1095
- byTerm = {}
1096
- byValue = {}
1097
- # Calc per term
1098
- @stanzas[:terms].each do |term, tags|
1099
- referenceTerm = term
1100
- if @alternatives_index.include?(term) && substitute_alternatives # Special case
1101
- referenceTerm = @alternatives_index[term] if !@obsoletes_index.include?(@alternatives_index[term])
1102
- end
1103
- queryTag = tags[tag]
1104
- if !queryTag.nil?
1105
- # Pre-process
1106
- if !select_regex.nil?
1107
- if queryTag.kind_of?(Array)
1108
- queryTag = queryTag.map{|value| value.scan(select_regex).first}
1109
- queryTag.flatten!
1110
- else
1111
- queryTag = queryTag.scan(select_regex).first
1112
- end
1113
- queryTag.compact!
1114
- end
1115
- if queryTag.kind_of?(Array) # Store
1116
- if !queryTag.empty?
1117
- if byTerm.include?(referenceTerm)
1118
- byTerm[referenceTerm] = (byTerm[referenceTerm] + queryTag).uniq
1119
- else
1120
- byTerm[referenceTerm] = queryTag
668
+ # +ontology+:: (Optional) ontology object which items given belongs
669
+ # +minimum_childs+:: minimum of childs needed to infer relations to parental. Default: 2
670
+ # +clean_profiles+:: if true, clena_profiles ontology method will be used over inferred profiles. Only if an ontology object is provided
671
+ # ===== Returns
672
+ # void and update items object
673
+ def expand_items_to_parentals(ontology: nil, minimum_childs: 2, clean_profiles: true)
674
+ targetKeys = expand_profile_with_parents(@items.keys)
675
+ terms_per_level = list_terms_per_level(targetKeys)
676
+ terms_per_level = terms_per_level.to_a.sort{|l1, l2| l1.first <=> l2.first} # Obtain sorted levels
677
+ terms_per_level.pop # Leaves are not expandable # FRED: Thats comment could be not true
678
+
679
+ terms_per_level.reverse_each do |lvl, terms| # Expand from leaves to roots
680
+ terms.each do |term|
681
+ childs = self.get_descendants(term).select{|t| @items.include?(t)} # Get child with items
682
+ next if childs.length < minimum_childs
683
+ propagated_item_count = Hash.new(0)
684
+ if ontology.nil? # Count how many times is presented an item in childs
685
+ childs.each do |child|
686
+ @items[child].each{|i| propagated_item_count[i] += 1}
687
+ end
688
+ else # Count take into account similarity between terms in other ontology. Not pretty clear the full logic
689
+ while childs.length > 1
690
+ curr_term = childs.shift
691
+ childs.each do |child|
692
+ maxmica_counts = Hash.new(0)
693
+ curr_items = @items[curr_term]
694
+ child_items = @items[child]
695
+ curr_items.each do |item|
696
+ maxmica = ontology.get_maxmica_term2profile(item, child_items)
697
+ maxmica_counts[maxmica.first] += 1
1121
698
  end
1122
- if multiterm
1123
- queryTag.each do |value|
1124
- byValue[value] = [] if byValue[value].nil?
1125
- byValue[value] << referenceTerm
1126
- end
1127
- else
1128
- queryTag.each{|value| byValue[value] = referenceTerm}
699
+ child_items.each do |item|
700
+ maxmica = ontology.get_maxmica_term2profile(item, curr_items)
701
+ maxmica_counts[maxmica.first] += 1
1129
702
  end
1130
- end
1131
- else
1132
- if byTerm.include?(referenceTerm)
1133
- byTerm[referenceTerm] = (byTerm[referenceTerm] + [queryTag]).uniq
1134
- else
1135
- byTerm[referenceTerm] = [queryTag]
1136
- end
1137
- if multiterm
1138
- byValue[queryTag] = [] if byValue[queryTag].nil?
1139
- byValue[queryTag] << referenceTerm
1140
- else
1141
- byValue[queryTag] = referenceTerm
1142
- end
1143
- end
1144
- end
1145
- end
1146
-
1147
- # Check self-references
1148
- if self_type_references
1149
- byTerm.map do |term, references|
1150
- corrected_references = references.map do |t|
1151
- checked = self.extract_id(t)
1152
- if checked.nil?
1153
- t
1154
- else
1155
- byValue[checked] = byValue.delete(t) if checked != t && byValue[checked].nil? # Update in byValue
1156
- checked
703
+ maxmica_counts.each{|t,freq| propagated_item_count[t] += freq if freq >= 2} #TODO: Maybe need Division by 2 due to the calculation of mica two times but test fails.
704
+ # FRED: Maybe for the childs.shift there is uniqueness
1157
705
  end
1158
706
  end
1159
- byTerm[term] = corrected_references.uniq
1160
707
  end
1161
- end
1162
-
1163
- # Check order
1164
- byTerm.map do |term,values|
1165
- if self.exists?(term)
1166
- referenceValue = @stanzas[:terms][term][tag]
1167
- if !referenceValue.nil?
1168
- if !select_regex.nil?
1169
- if referenceValue.kind_of?(Array)
1170
- referenceValue = referenceValue.map{|value| value.scan(select_regex).first}
1171
- referenceValue.flatten!
1172
- else
1173
- referenceValue = referenceValue.scan(select_regex).first
1174
- end
1175
- referenceValue.compact!
1176
- end
1177
- if self_type_references
1178
- if referenceValue.kind_of?(Array)
1179
- aux = referenceValue.map{|t| self.extract_id(t)}
1180
- else
1181
- aux = self.extract_id(referenceValue)
1182
- end
1183
- aux.compact! unless aux.nil?
1184
- referenceValue = aux unless aux.nil?
1185
- end
1186
- referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
1187
- byTerm[term] = referenceValue + (values - referenceValue)
708
+ propagated_items = propagated_item_count.select{|k,v| v >= minimum_childs}.keys
709
+ if propagated_items.length > 0
710
+ query = @items[term]
711
+ if query.nil?
712
+ @items[term] = propagated_items
713
+ else
714
+ terms = @items[term] | propagated_items
715
+ terms = ontology.clean_profile(terms) if clean_profiles && !ontology.nil?
716
+ @items[term] = terms
1188
717
  end
1189
718
  end
1190
719
  end
1191
-
1192
- # Store
1193
- @dicts[store_tag] = {byTerm: byTerm, byValue: byValue}
1194
720
  end
1195
721
  end
1196
722
 
723
+ # Compute modified fisher between terms and items based on topgo methodology. Refactor to use all the possible methods of this class
724
+ #-------------------------------------------------------------------------------------------------------------------------------------
1197
725
 
1198
- # Calculates :is_a dictionary without alternatives substitution
1199
- def calc_ancestors_dictionary
1200
- self.calc_dictionary(:is_a, substitute_alternatives: false, self_type_references: true, multiterm: true)
726
+ def compute_relations_to_items(external_item_list, total_items, mode, thresold) # NEED TEST, check with PSZ how to maintain these methods
727
+ terms_levels = list_terms_per_level_from_items
728
+ connect_familiars!(terms_levels)
729
+ item_list_with_transf_parental = get_item_list_parental(terms_levels)
730
+ results = []
731
+ if mode == :elim
732
+ results = compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list_with_transf_parental)
733
+ elsif mode == :weight
734
+ results = compute_relations_weight(terms_levels, external_item_list, total_items, item_list_with_transf_parental)
735
+ end
736
+ return results
1201
737
  end
1202
738
 
739
+ def list_terms_per_level_from_items
740
+ return list_terms_per_level(@items.keys)
741
+ end
1203
742
 
1204
- # Translate a given value using an already calcualted dictionary
1205
- # ===== Parameters
1206
- # +toTranslate+:: value to be translated using dictiontionary
1207
- # +tag+:: used to generate the dictionary
1208
- # +byValue+:: boolean flag to indicate if dictionary must be used values as keys or terms as keys. Default: values as keys = true
1209
- # ===== Return
1210
- # translation
1211
- def translate(toTranslate, tag, byValue: true)
1212
- dict = byValue ? @dicts[tag][:byValue] : @dicts[tag][:byTerm]
1213
- toTranslate = get_main_id(toTranslate) if !byValue
1214
- return dict[toTranslate]
1215
- end
1216
-
1217
-
1218
- # Translate a name given
1219
- # ===== Parameters
1220
- # +name+:: to be translated
1221
- # ===== Return
1222
- # translated name or nil if it's not stored into this ontology
1223
- def translate_name(name)
1224
- term = self.translate(name, :name)
1225
- term = self.translate(name, :synonym) if term.nil?
1226
- return term
1227
- end
1228
-
1229
-
1230
- # Translate several names and return translations and a list of names which couldn't be translated
1231
- # ===== Parameters
1232
- # +names+:: array to be translated
1233
- # ===== Return
1234
- # two arrays with translations and names which couldn't be translated respectively
1235
- def translate_names(names)
1236
- translated = []
1237
- rejected = []
1238
- names.each do |name|
1239
- tr = self.translate_name(name)
1240
- if tr.nil?
1241
- rejected << name
1242
- else
1243
- translated << tr
1244
- end
1245
- end
1246
- return translated, rejected
1247
- end
1248
-
1249
-
1250
- # Translates a given ID to it assigned name
1251
- # ===== Parameters
1252
- # +id+:: to be translated
1253
- # ===== Return
1254
- # main name or nil if it's not included into this ontology
1255
- def translate_id(id)
1256
- name = self.translate(id, :name, byValue: false)
1257
- return name.nil? ? nil : name.first
1258
- end
1259
-
1260
-
1261
- # Translates several IDs and returns translations and not allowed IDs list
1262
- # ===== Parameters
1263
- # +ids+:: to be translated
1264
- # ===== Return
1265
- # two arrays with translations and names which couldn't be translated respectively
1266
- def translate_ids(ids)
1267
- translated = []
1268
- rejected = []
1269
- ids.each do |term_id|
1270
- tr = self.translate_id(term_id.to_sym)
1271
- if !tr.nil?
1272
- translated << tr
1273
- else
1274
- rejected << tr
1275
- end
1276
- end
1277
- return translated, rejected
1278
- end
1279
-
1280
-
1281
- # ===== Returns
1282
- # the main ID assigned to a given ID. If it's a non alternative/obsolete ID itself will be returned
1283
- # ===== Parameters
1284
- # +id+:: to be translated
1285
- # ===== Return
1286
- # main ID related to a given ID. Returns nil if given ID is not an allowed ID
1287
- def get_main_id(id)
1288
- return nil if !@stanzas[:terms].include? id
1289
- new_id = id
1290
- mainID = @alternatives_index[id]
1291
- new_id = mainID if !mainID.nil? & !@obsoletes_index.include?(mainID)
1292
- return new_id
1293
- end
1294
-
1295
-
1296
- # Check a pull of IDs and return allowed IDs removing which are not official terms on this ontology
1297
- # ===== Parameters
1298
- # +ids+:: to be checked
1299
- # ===== Return
1300
- # two arrays whit allowed and rejected IDs respectively
1301
- def check_ids(ids, substitute: true)
1302
- checked_codes = []
1303
- rejected_codes = []
1304
- ids.each do |id|
1305
- if @stanzas[:terms].include? id
1306
- if substitute
1307
- checked_codes << self.get_main_id(id)
1308
- else
1309
- checked_codes << id
1310
- end
1311
- else
1312
- rejected_codes << id
1313
- end
1314
- end
1315
- return checked_codes, rejected_codes
1316
- end
1317
-
1318
-
1319
- # Stores a given profile with an specific ID. If ID is already assigend to a profile, it will be replaced
1320
- # ===== Parameters
1321
- # +id+:: assigned to profile
1322
- # +terms+:: array of terms
1323
- # +substitute+:: subsstitute flag from check_ids
1324
- def add_profile(id, terms, substitute: true)
1325
- warn("Profile assigned to ID (#{id}) is going to be replaced") if @profiles.include? id
1326
- correct_terms, rejected_terms = self.check_ids(terms, substitute: substitute)
1327
- if !rejected_terms.empty?
1328
- warn('Given terms contains erroneus IDs. These IDs will be removed')
1329
- end
1330
- if id.is_a? Numeric
1331
- @profiles[id] = correct_terms
1332
- else
1333
- @profiles[id.to_sym] = correct_terms
1334
- end
1335
- end
1336
-
1337
-
1338
- # Method used to store a pull of profiles
1339
- # ===== Parameters
1340
- # +profiles+:: array/hash of profiles to be stored. If it's an array, numerical IDs will be assigned starting at 1
1341
- # +calc_metadata+:: if true, launch calc_profiles_dictionary process
1342
- # +reset_stored+:: if true, remove already stored profiles
1343
- # +substitute+:: subsstitute flag from check_ids
1344
- def load_profiles(profiles, calc_metadata: true, reset_stored: false, substitute: false)
1345
- self.reset_profiles if reset_stored
1346
- # Check
1347
- if profiles.kind_of?(Array)
1348
- profiles.each_with_index do |items, i|
1349
- self.add_profile(i, items.map {|item| item.to_sym}, substitute: substitute)
1350
- end
1351
- else # Hash
1352
- if !profiles.keys.select{|id| @profiles.include?(id)}.empty?
1353
- warn('Some profiles given are already stored. Stored version will be replaced')
1354
- end
1355
- profiles.each{|id, prof| self.add_profile(id, prof, substitute: substitute)}
1356
- end
1357
-
1358
- self.add_observed_terms_from_profiles(reset: true)
1359
-
1360
- if calc_metadata
1361
- self.calc_profiles_dictionary
1362
- end
1363
- end
1364
-
1365
-
1366
- # Internal method used to remove already stored profiles and restore observed frequencies
1367
- def reset_profiles
1368
- # Clean profiles storage
1369
- @profiles = {}
1370
- # Reset frequency observed
1371
- @meta.each{|term,info| info[:observed_freq] = 0}
1372
- @max_freqs[:observed_freq] = 0
1373
- end
1374
-
1375
-
1376
- # ===== Returns
1377
- # profiles assigned to a given ID
1378
- # ===== Parameters
1379
- # +id+:: profile ID
1380
- # ===== Return
1381
- # specific profile or nil if it's not stored
1382
- def get_profile(id)
1383
- return @profiles[id]
1384
- end
1385
-
1386
-
1387
- # ===== Returns
1388
- # an array of sizes for all stored profiles
1389
- # ===== Return
1390
- # array of profile sizes
1391
- def get_profiles_sizes()
1392
- return @profiles.map{|id,terms| terms.length}
1393
- end
1394
-
1395
-
1396
- # ===== Returns
1397
- # mean size of stored profiles
1398
- # ===== Parameters
1399
- # +round_digits+:: number of digits to round result. Default: 4
1400
- # ===== Returns
1401
- # mean size of stored profiles
1402
- def get_profiles_mean_size(round_digits: 4)
1403
- sizes = self.get_profiles_sizes
1404
- return sizes.inject(0){|sum, n| sum + n}.fdiv(@profiles.length).round(round_digits)
1405
- end
1406
-
1407
-
1408
- # Calculates profiles sizes and returns size assigned to percentile given
1409
- # ===== Parameters
1410
- # +perc+:: percentile to be returned
1411
- # +increasing_sort+:: flag to indicate if sizes order must be increasing. Default: false
1412
- # ===== Returns
1413
- # values assigned to percentile asked
1414
- def get_profile_length_at_percentile(perc=50, increasing_sort: false)
1415
- prof_lengths = self.get_profiles_sizes.sort
1416
- prof_lengths.reverse! if !increasing_sort
1417
- n_profiles = prof_lengths.length
1418
- percentile_index = ((perc * (n_profiles - 1)).fdiv(100) - 0.5).round # Take length which not overpass percentile selected
1419
- percentile_index = 0 if percentile_index < 0 # Special case (caused by literal calc)
1420
- return prof_lengths[percentile_index]
1421
- end
1422
-
1423
-
1424
- # Translate a given profile to terms names
1425
- # ===== Parameters
1426
- # +prof+:: array of terms to be translated
1427
- # ===== Returns
1428
- # array of translated terms. Can include nils if some IDs are not allowed
1429
- def profile_names(prof)
1430
- return prof.map{|term| self.translate_id(term)}
1431
- end
1432
-
1433
-
1434
- # Trnaslates a bunch of profiles to it sets of term names
1435
- # ===== Parameters
1436
- # +profs+:: array of profiles
1437
- # +asArray+:: flag to indicate if results must be returned as: true => an array of tuples [ProfID, ArrayOdNames] or ; false => hashs of translations
1438
- # ===== Returns
1439
- # translated profiles
1440
- def translate_profiles_ids(profs = [], asArray: true)
1441
- profs = @profiles if profs.empty?
1442
- profs = profs.each_with_index.map{|terms, index| [index, terms]}.to_h if profs.kind_of?(Array)
1443
- profs_names = profs.map{|id, terms| [id, self.profile_names(terms)]}.to_h
1444
- return asArray ? profs_names.values : profs_names
1445
- end
1446
-
1447
-
1448
- # Includes as "observed_terms" all terms included into stored profiles
1449
- # ===== Parameters
1450
- # +reset+:: if true, reset observed freqs alreeady stored befor re-calculate
1451
- def add_observed_terms_from_profiles(reset: false)
1452
- @meta.each{|term, freqs| freqs[:observed_freq] = -1} if reset
1453
- @profiles.each{|id, terms| self.add_observed_terms(terms: terms)}
1454
- end
1455
-
1456
-
1457
- # Get a term frequency
1458
- # ===== Parameters
1459
- # +term+:: term to be checked
1460
- # +type+:: type of frequency to be returned. Allowed: [:struct_freq, :observed_freq]
1461
- # ===== Returns
1462
- # frequency of term given or nil if term is not allowed
1463
- def get_frequency(term, type: :struct_freq)
1464
- queryFreq = @meta[term]
1465
- return queryFreq.nil? ? nil : queryFreq[type]
1466
- end
1467
-
1468
-
1469
- # Geys structural frequency of a term given
1470
- # ===== Parameters
1471
- # +term+:: to be checked
1472
- # ===== Returns
1473
- # structural frequency of given term or nil if term is not allowed
1474
- def get_structural_frequency(term)
1475
- return self.get_frequency(term, type: :struct_freq)
1476
- end
1477
-
1478
-
1479
- # Gets observed frequency of a term given
1480
- # ===== Parameters
1481
- # +term+:: to be checked
1482
- # ===== Returns
1483
- # observed frequency of given term or nil if term is not allowed
1484
- def get_observed_frequency(term)
1485
- return self.get_frequency(term, type: :observed_freq)
1486
- end
1487
-
1488
-
1489
- # Calculates frequencies of stored profiles terms
1490
- # ===== Parameters
1491
- # +ratio+:: if true, frequencies will be returned as ratios between 0 and 1.
1492
- # +literal+:: if true, literal terms will be used to calculate frequencies instead translate alternative terms
1493
- # +asArray+:: used to transform returned structure format from hash of Term-Frequency to an array of tuples [Term, Frequency]
1494
- # +translate+:: if true, term IDs will be translated to
1495
- # ===== Returns
1496
- # stored profiles terms frequencies
1497
- def get_profiles_terms_frequency(ratio: true, literal: true, asArray: true, translate: true)
1498
- n_profiles = @profiles.length
1499
- if literal
1500
- freqs = {}
1501
- @profiles.each do |id, terms|
1502
- terms.each do |literalTerm|
1503
- if freqs.include?(literalTerm)
1504
- freqs[literalTerm] += 1
1505
- else
1506
- freqs[literalTerm] = 1
1507
- end
1508
- end
1509
- end
1510
- if (ratio || translate)
1511
- aux_keys = freqs.keys
1512
- aux_keys.each do |term|
1513
- freqs[term] = freqs[term].fdiv(n_profiles) if ratio
1514
- if translate
1515
- tr = self.translate_id(term)
1516
- freqs[tr] = freqs.delete(term) if !tr.nil?
1517
- end
1518
- end
1519
- end
1520
- if asArray
1521
- freqs = freqs.map{|term, freq| [term, freq]}
1522
- freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
1523
- end
1524
- else # Freqs translating alternatives
1525
- freqs = @meta.select{|id, freqs| freqs[:observed_freq] > 0}.map{|id, freqs| [id, ratio ? freqs[:observed_freq].fdiv(n_profiles) : freqs[:observed_freq]]}
1526
- freqs = freqs.to_h if !asArray
1527
- if translate
1528
- freqs = freqs.map do |term, freq|
1529
- tr = self.translate_id(term)
1530
- tr.nil? ? [term, freq] : [tr, freq]
1531
- end
1532
- end
1533
- if asArray
1534
- freqs = freqs.map{|term, freq| [term, freq]}
1535
- freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
1536
- else
1537
- freqs = freqs.to_h
1538
- end
1539
- end
1540
- return freqs
1541
- end
1542
-
1543
-
1544
- # Clean a given profile returning cleaned set of terms and removed ancestors term.
1545
- # ===== Parameters
1546
- # +prof+:: array of terms to be checked
1547
- # ===== Returns
1548
- # two arrays, first is the cleaned profile and second is the removed elements array
1549
- def remove_ancestors_from_profile(prof)
1550
- ancestors = prof.map{|term| self.get_ancestors(term)}.flatten.uniq
1551
- redundant = prof.select{|term| ancestors.include?(term)}
1552
- return prof - redundant, redundant
1553
- end
1554
-
1555
-
1556
- # Remove alternative IDs if official ID is present. DOES NOT REMOVE synonyms or alternative IDs of the same official ID
1557
- # ===== Parameters
1558
- # +prof+:: array of terms to be checked
1559
- # ===== Returns
1560
- # two arrays, first is the cleaned profile and second is the removed elements array
1561
- def remove_alternatives_from_profile(prof)
1562
- alternatives = prof.select{|term| @alternatives_index.include?(term)}
1563
- redundant = alternatives.select{|alt_id| prof.include?(@alternatives_index[alt_id])}
1564
- return prof - redundant, redundant
1565
- end
1566
-
1567
-
1568
- # Remove alternatives (if official term is present) and ancestors terms of a given profile
1569
- # ===== Parameters
1570
- # +profile+:: profile to be cleaned
1571
- # +remove_alternatives+:: if true, clenaed profiles will replace already stored profiles
1572
- # ===== Returns
1573
- # cleaned profile
1574
- def clean_profile(profile, remove_alternatives: true)
1575
- warn('Estructure is circular, behaviour could not be which is expected') if @structureType == :circular
1576
- terms_without_ancestors, _ = self.remove_ancestors_from_profile(profile)
1577
- if remove_alternatives
1578
- terms_without_ancestors_and_alternatices, _ = self.remove_alternatives_from_profile(terms_without_ancestors)
1579
- else
1580
- terms_without_ancestors_and_alternatices = terms_without_ancestors
743
+ def list_terms_per_level(terms)
744
+ terms_levels = {}
745
+ terms.each do |term|
746
+ level = self.get_term_level(term)
747
+ add2hash(terms_levels, level, term)
1581
748
  end
1582
- return terms_without_ancestors_and_alternatices
1583
- end
1584
-
1585
- def clean_profile_hard(profile, options = {})
1586
- profile, _ = check_ids(profile)
1587
- profile = profile.select{|t| !is_obsolete?(t)}
1588
- if !options[:term_filter].nil?
1589
- profile.select! {|term| get_ancestors(term).include?(options[:term_filter])}
1590
- end
1591
- profile = clean_profile(profile.uniq)
1592
- return profile
749
+ return terms_levels
1593
750
  end
1594
751
 
1595
- # Remove terms from a given profile using hierarchical info and scores set given
1596
- # ===== Parameters
1597
- # +profile+:: profile to be cleaned
1598
- # +scores+:: hash with terms by keys and numerical values (scores)
1599
- # +byMax+:: if true, maximum scored term will be keeped, if false, minimum will be keeped
1600
- # +remove_without_score+:: if true, terms without score will be removed. Default: true
1601
- # ===== Returns
1602
- # cleaned profile
1603
- def clean_profile_by_score(profile, scores, byMax: true, remove_without_score: true)
1604
- scores = scores.sort_by{|term,score| score}.to_h
1605
- keep = profile.map do |term|
1606
- if scores.include?(term)
1607
- parentals = [self.get_ancestors(term), self.get_descendants(term)].flatten
1608
- targetable = parentals.select{|parent| profile.include?(parent)}
1609
- if targetable.empty?
1610
- term
1611
- else
1612
- targetable << term
1613
- targets = scores.select{|term,score| targetable.include?(term)}.to_h
1614
- byMax ? targets.keys.last : targets.keys.first
1615
- end
1616
- elsif remove_without_score
1617
- nil
1618
- else
1619
- term
752
+ def connect_familiars!(terms_levels)
753
+ levels = terms_levels.keys.sort
754
+ while levels.length > 1 # Process when current level has a parental level
755
+ level = levels.pop
756
+ parental_level = level - 1
757
+ parental_terms = terms_levels[parental_level]
758
+ if parental_terms.nil? # The queried parent level not exists but there is a parental level above of the non existant
759
+ parental_terms = [] # Initialize required parental level
760
+ terms_levels[parental_level] = parental_terms
761
+ levels << parental_level
1620
762
  end
1621
- end
1622
- return keep.compact.uniq
1623
- end
1624
-
1625
-
1626
- # Remove alternatives (if official term is present) and ancestors terms of stored profiles
1627
- # ===== Parameters
1628
- # +store+:: if true, clenaed profiles will replace already stored profiles
1629
- # +remove_alternatives+:: if true, clenaed profiles will replace already stored profiles
1630
- # ===== Returns
1631
- # a hash with cleaned profiles
1632
- def clean_profiles(store: false, remove_alternatives: true)
1633
- cleaned_profiles = {}
1634
- @profiles.each{ |id, terms| cleaned_profiles[id] = self.clean_profile(terms, remove_alternatives: remove_alternatives)}
1635
- @profiles = cleaned_profiles if store
1636
- return cleaned_profiles
1637
- end
1638
-
1639
-
1640
- # Calculates number of ancestors present (redundant) in each profile stored
1641
- # ===== Returns
1642
- # array of parentals for each profile
1643
- def parentals_per_profile
1644
- cleaned_profiles = self.clean_profiles(remove_alternatives: false)
1645
- parentals = @profiles.map{ |id, terms| terms.length - cleaned_profiles[id].length}
1646
- return parentals
1647
- end
1648
-
1649
-
1650
- def get_profile_redundancy()
1651
- profile_sizes = self.get_profiles_sizes
1652
- parental_terms_per_profile = self.parentals_per_profile# clean_profiles
1653
- parental_terms_per_profile = parental_terms_per_profile.map{|item| item[0]}
1654
- profile_sizes, parental_terms_per_profile = profile_sizes.zip(parental_terms_per_profile).sort_by{|i| i.first}.reverse.transpose
1655
- return profile_sizes, parental_terms_per_profile
1656
- end
1657
-
1658
- def compute_term_list_and_childs()
1659
- suggested_childs = {}
1660
- total_terms = 0
1661
- terms_with_more_specific_childs = 0
1662
- @profiles.each do |id, terms|
1663
- total_terms += terms.length
1664
- more_specific_childs = self.get_childs_table(terms, true)
1665
- terms_with_more_specific_childs += more_specific_childs.select{|hpo_record| !hpo_record.last.empty?}.length #Exclude phenotypes with no childs
1666
- suggested_childs[id] = more_specific_childs
1667
- end
1668
- return suggested_childs, terms_with_more_specific_childs.fdiv(total_terms)
1669
- end
1670
-
1671
- # Calculates mean IC of a given profile
1672
- # ===== Parameters
1673
- # +prof+:: profile to be checked
1674
- # +ic_type+:: ic_type to be used
1675
- # +zhou_k+:: special coeficient for Zhou IC method
1676
- # ===== Returns
1677
- # mean IC for a given profile
1678
- def get_profile_mean_IC(prof, ic_type: :resnik, zhou_k: 0.5)
1679
- return prof.map{|term| self.get_IC(term, type: ic_type, zhou_k: zhou_k)}.inject(0){|sum,x| sum + x}.fdiv(prof.length)
1680
- end
1681
-
1682
-
1683
- # Calculates resnik ontology, and resnik observed mean ICs for all profiles stored
1684
- # ===== Returns
1685
- # two hashes with Profiles and IC calculated for resnik and observed resnik respectively
1686
- def get_profiles_resnik_dual_ICs
1687
- struct_ics = {}
1688
- observ_ics = {}
1689
- @profiles.each do |id, terms|
1690
- struct_ics[id] = self.get_profile_mean_IC(terms, ic_type: :resnik)
1691
- observ_ics[id] = self.get_profile_mean_IC(terms, ic_type: :resnik_observed)
1692
- end
1693
- return struct_ics.clone, observ_ics.clone
1694
- end
1695
-
1696
-
1697
- # Calculates ontology structural levels for all ontology terms
1698
- # ===== Parameters
1699
- # +calc_paths+:: calculates term paths if it's not already calculated
1700
- # +shortest_path+:: if true, level is calculated with shortest path, largest path will be used in other cases
1701
- def calc_term_levels(calc_paths: false, shortest_path: true)
1702
- if @term_paths.empty?
1703
- if calc_paths
1704
- self.calc_term_paths
1705
- else
1706
- warn('Term paths are not already loaded. Aborting dictionary calc')
763
+ terms_levels[level].each do |term|
764
+ path_info = @term_paths[term]
765
+ shortest_path_length = path_info[:shortest_path]
766
+ path = path_info[:paths].select{|p| p.length == shortest_path_length}.first
767
+ parental = path[1] # the first elements is the term itself
768
+ parental_terms << parental if !parental_terms.include?(parental)
1707
769
  end
1708
770
  end
1709
- if !@term_paths.empty?
1710
- byTerm = {}
1711
- byValue = {}
1712
- # Calc per term
1713
- @term_paths.each do |term, info|
1714
- level = shortest_path ? info[:shortest_path] : info[:largest_path]
1715
- if level.nil?
1716
- level = -1
1717
- else
1718
- level = level.round(0)
1719
- end
1720
- byTerm[term] = level
1721
- queryLevels = byValue[level]
1722
- if queryLevels.nil?
1723
- byValue[level] = [term]
771
+ end
772
+
773
+ def get_item_list_parental(terms_levels)
774
+ transfered_list = {}
775
+ parent_dict = @dicts[:is_a][:byTerm]
776
+ levels = terms_levels.keys.sort
777
+ while levels.length > 1
778
+ level = levels.pop
779
+ terms_levels[level].each do |term|
780
+ parents = parent_dict[term]
781
+ if parents.nil?
782
+ next
783
+ elsif parents.length == 1
784
+ parent = parents.first
1724
785
  else
1725
- byValue[level] << term
786
+ parent = (parents | terms_levels[level - 1]).first
1726
787
  end
788
+ term_it = @items[term]
789
+ parent_it = @items[parent]
790
+ curr_it = transfered_list[term]
791
+ parent_all_items = merge_groups([term_it, parent_it, curr_it])
792
+ transfered_list[parent] = parent_all_items if !parent_all_items.empty?
793
+ term_all_items = merge_groups([term_it, curr_it])
794
+ transfered_list[term] = term_all_items if !term_all_items.empty?
1727
795
  end
1728
- @dicts[:level] = {byTerm: byValue, byValue: byTerm} # Note: in this case, value has multiplicity and term is unique value
1729
- # Update maximum depth
1730
- @max_freqs[:max_depth] = byValue.keys.max
1731
796
  end
797
+ terms_levels[levels.first].each do |term| # Rescue lower level terms that not have children so they cannot receive items
798
+ transfered_list[term] = @items[term] if transfered_list[term].nil?
799
+ end
800
+ return transfered_list
1732
801
  end
1733
802
 
1734
-
1735
- # Check if a term given is marked as obsolete
1736
- def is_obsolete? term
1737
- return @obsoletes_index.include?(term)
803
+ def merge_groups(groups)
804
+ return groups.compact.inject([ ]){|it, a| it | a}
1738
805
  end
1739
806
 
1740
- # Check if a term given is marked as alternative
1741
- def is_alternative? term
1742
- return @alternatives_index.include?(term)
807
+ def compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list)
808
+ results = []
809
+ penalized_terms = {}
810
+ levels = terms_levels.keys.sort
811
+ levels.reverse_each do |level|
812
+ terms_levels[level].each do |term|
813
+ associated_items = item_list[term]
814
+ items_to_remove = penalized_terms[term]
815
+ items_to_remove = [] if items_to_remove.nil?
816
+ pval = get_fisher_exact_test(
817
+ external_item_list - items_to_remove,
818
+ associated_items - items_to_remove,
819
+ #((associated_items | external_item_list) - items_to_remove).length
820
+ total_items
821
+ )
822
+ if pval <= thresold
823
+ parents = get_ancestors(term) # Save the items for each parent term to remove them later in the fisher test
824
+ parents.each do |prnt|
825
+ query = penalized_terms[prnt]
826
+ if query.nil?
827
+ penalized_terms[prnt] = item_list[term].clone # We need a new array to store the following iterations
828
+ else
829
+ query.concat(item_list[term])
830
+ end
831
+ end
832
+ end
833
+ results << [term, pval]
834
+ end
835
+ end
836
+ return results
1743
837
  end
1744
838
 
1745
- # Find paths of a term following it ancestors and stores all possible paths for it and it's parentals.
1746
- # Also calculates paths metadata and stores into @term_paths
1747
- def calc_term_paths(only_main_terms=false)
1748
- self.calc_ancestors_dictionary if @dicts[:is_a].nil? # Calculate direct parentals dictionary if it's not already calculated
1749
- visited_terms = {} # PEDRO: To keep track of visited data, hash accesions are fast than array includes. I don't understant why use this variable instead of check @term_paths to see if the data is calculated
1750
- @term_paths = {}
1751
- if [:hierarchical, :sparse].include? @structureType
1752
- @stanzas[:terms].each do |term, t_attributes|
1753
- if !only_main_terms && (self.is_obsolete?(term) || self.is_alternative?(term)) # Special case (obsoletes)
1754
- special_term = term
1755
- term = self.is_obsolete?(term) ? @obsoletes_index[term] : @alternatives_index[term]
1756
- @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
1757
- @term_paths[special_term] = @term_paths[term]
1758
- visited_terms[special_term] = true
1759
- end
1760
- if !visited_terms.include?(term)
1761
- # PEDRO: This code is very similar to expand_path method, but cannot be replaced by it (test fail). We must work to use this method here
1762
- path_attr = @term_paths[term]
1763
- if path_attr.nil?
1764
- path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} # create new path data
1765
- @term_paths[term] = path_attr #save path data container
1766
- end
1767
- parentals = @dicts[:is_a][:byTerm][term]
1768
- if parentals.nil?
1769
- path_attr[:paths] << [term]
1770
- else
1771
- parentals.each do |direct_parental|
1772
- self.expand_path(direct_parental)
1773
- new_paths = @term_paths[direct_parental][:paths]
1774
- path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(term)})
1775
- end
1776
- end
1777
- anc = @ancestors_index[term].each{|anc| visited_terms[anc] = true} if @ancestors_index.include?(term)
1778
- visited_terms[term] = true
839
+ def compute_relations_weight(terms_levels, external_item_list, total_items, item_list)
840
+ pvals = {}
841
+ item_weigths_per_term = Hash.new { |hash, key| Hash.new(1) } #https://mensfeld.pl/2016/09/ruby-hash-default-value-be-cautious-when-you-use-it/
842
+ levels = terms_levels.keys.sort
843
+ levels.reverse_each do |level|
844
+ terms_levels[level].each do |term|
845
+ associated_items = item_list[term]
846
+ #initialize observed items in item_weigths_per_term list
847
+ add_items_to_weigthed_list(term, associated_items, item_weigths_per_term) if !associated_items.nil?
848
+ children = @dicts[:is_a][:byValue][term]
849
+ if children.nil?
850
+ children = []
851
+ else
852
+ children = children.select{|ch| item_weigths_per_term[ch].length > 0} # Only use children with items associated to them OR transfered to them
1779
853
  end
1780
- # Update metadata
1781
- path_attr = @term_paths[term]
1782
- path_attr[:total_paths] = path_attr[:paths].length
1783
- paths_sizes = path_attr[:paths].map{|path| path.length}
1784
- path_attr[:largest_path] = paths_sizes.max
1785
- path_attr[:shortest_path] = paths_sizes.min
854
+ computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
1786
855
  end
1787
- else
1788
- warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
1789
856
  end
857
+ return pvals.to_a
1790
858
  end
1791
859
 
860
+ def add_items_to_weigthed_list(term, associated_items, weigthed_list)
861
+ term_weigthing = weigthed_list[term]
862
+ associated_items.each{|ai| term_weigthing[ai] = 1}
863
+ weigthed_list[term] = term_weigthing
864
+ end
1792
865
 
1793
- # Recursive function whic finds paths of a term following it ancestors and stores all possible paths for it and it's parentals
1794
- # ===== Parameters
1795
- # +curr_term+:: current visited term
1796
- # +visited_terms+:: already expanded terms
1797
- def expand_path(curr_term)
1798
- if !@term_paths.include?(curr_term)
1799
- path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []}
1800
- @term_paths[curr_term] = path_attr
1801
- direct_parentals = @dicts[:is_a][:byTerm][curr_term]
1802
- if direct_parentals.nil? # No parents :: End of recurrence
1803
- path_attr[:paths] << [curr_term]
1804
- else # Expand and concat
1805
- direct_parentals.each do |ancestor|
1806
- path_attr_parental = @term_paths[ancestor]
1807
- if path_attr_parental.nil? # Calculate new paths
1808
- self.expand_path(ancestor)
1809
- new_paths = @term_paths[ancestor][:paths]
1810
- else # Use direct_parental paths already calculated
1811
- new_paths = path_attr_parental[:paths]
866
+ def computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
867
+ #puts term.to_s.red
868
+ #puts @term_paths[term].inspect
869
+ #puts @dicts[:is_a][:byValue][term].inspect.light_blue
870
+ associated_items = item_weigths_per_term[term].keys
871
+ pval = get_fisher_exact_test(external_item_list, associated_items, total_items,
872
+ 'two_sided', item_weigths_per_term[term], true)
873
+ pvals[term] = pval
874
+ if children.length > 0
875
+ rates = {}
876
+ sig_child = 0
877
+ children.each do |child|
878
+ ratio = sigRatio(pvals[child], pval)
879
+ rates[child] = ratio
880
+ sig_child += 1 if ratio >= 1
881
+ end
882
+ if sig_child == 0 # CASE 1
883
+ children.each do |child|
884
+ current_ratio = rates[child]
885
+ query_child = item_weigths_per_term[child]
886
+ query_child.transform_values!{|weight| weight * current_ratio}
887
+ pvals[child] = get_fisher_exact_test(external_item_list, item_weigths_per_term[child].keys, total_items,
888
+ 'two_sided', item_weigths_per_term[child], true)
889
+ end
890
+ else
891
+ ancs = get_ancestors(term)
892
+ ancs << term
893
+ rates.each do |ch, ratio|# CASE 2
894
+ if ratio >= 1 # The child is better than parent
895
+ ancs.each do |anc|
896
+ query_anc = item_weigths_per_term[anc]
897
+ associated_items.each do |item|
898
+ query_anc[item] /= ratio # /= --> query_anc[item]/ratio
899
+ end
900
+ end
1812
901
  end
1813
- path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(curr_term)})
1814
902
  end
903
+ computeTermSig(term, children - rates.keys, external_item_list, total_items, pvals, item_weigths_per_term)
1815
904
  end
1816
905
  end
1817
906
  end
1818
907
 
1819
-
1820
- # Gets ontology levels calculated
1821
- # ===== Returns
1822
- # ontology levels calculated
1823
- def get_ontology_levels
1824
- return @dicts[:level][:byTerm].clone # By term, in this case, is Key::Level, Value::Terms
908
+ def sigRatio(pvalA, pvalB)
909
+ return Math.log(pvalA)/Math.log(pvalB)
1825
910
  end
1826
911
 
912
+ # END of methods involved with compute_relations_to_items
913
+ #-----------------------------------------------------------------------------------
914
+
915
+ #############################################
916
+ # PROFILE EXTERNAL METHODS
917
+ #############################################
1827
918
 
1828
- # Gets ontology level of a specific term
1829
- # ===== Returns
1830
- # Term level
1831
- def get_term_level(term)
1832
- return @dicts[:level][:byValue][term]
919
+ # I/O profile
920
+ ####################################
921
+
922
+ # Increase the arbitrary frequency of a given term set
923
+ # ===== Parameters
924
+ # +terms+:: set of terms to be updated
925
+ # +increase+:: amount to be increased
926
+ # +transform_to_sym+:: if true, transform observed terms to symbols. Default: false
927
+ # ===== Return
928
+ # true if process ends without errors and false in other cases
929
+ def add_observed_terms(terms:, increase: 1.0, transform_to_sym: false, expand2parentals: true)
930
+ terms = terms.map{|term| [term] + get_ancestors(term.to_sym)}.flatten if expand2parentals
931
+ return terms.map{|id| self.add_observed_term(
932
+ term: transform_to_sym ? id.to_sym : id,
933
+ increase: increase)} # FRED: It is necessary the return?
1833
934
  end
1834
935
 
1835
- # nil, term not found, [] term exists but not has parents
1836
- def get_parental_path(term, which_path = :shortest_path, level = 0)
1837
- path = nil
1838
- path_attr = @term_paths[term]
1839
- if !path_attr.nil?
1840
- path_length = path_attr[which_path]
1841
- all_paths = path_attr[:paths]
1842
- if all_paths.empty?
1843
- path = []
1844
- else
1845
- path = all_paths.select{|pt| pt.length == path_length}.first.clone
1846
- if level > 0 # we want the term and his ascendants until a specific level
1847
- n_parents = path_length - level
1848
- path = path[0..n_parents]
1849
- end
1850
- path.shift # Discard the term itself
1851
- end
936
+ # Modifying Profile
937
+ ####################################
938
+
939
+ def expand_profile_with_parents(profile)
940
+ new_terms = []
941
+ profile.each do |term|
942
+ new_terms = new_terms | get_ancestors(term)
1852
943
  end
1853
- return path
1854
- end
944
+ return new_terms | profile
945
+ end
1855
946
 
1856
- # Return ontology levels from profile terms
947
+ # Clean a given profile returning cleaned set of terms and removed ancestors term.
948
+ # ===== Parameters
949
+ # +prof+:: array of terms to be checked
1857
950
  # ===== Returns
1858
- # hash of term levels (Key: level; Value: array of term IDs)
1859
- def get_ontology_levels_from_profiles(uniq = true) # TODO: remove uniq and check dependencies
1860
- profiles_terms = @profiles.values.flatten
1861
- profiles_terms.uniq! if uniq
1862
- term_freqs_byProfile = {}
1863
- profiles_terms.each do |term|
1864
- query = term_freqs_byProfile[term]
1865
- if query.nil?
1866
- term_freqs_byProfile[term] = 1
1867
- else
1868
- term_freqs_byProfile[term] += 1
1869
- end
1870
- end
1871
- levels_filtered = @dicts[:level][:byTerm].map{|level, terms| [level,terms.map{|t| profiles_terms.include?(t) ? Array.new(term_freqs_byProfile[t], t) : nil}.flatten.compact]}.select{|level, filteredTerms| !filteredTerms.empty?}.to_h
1872
- return levels_filtered
951
+ # two arrays, first is the cleaned profile and second is the removed elements array
952
+ def remove_ancestors_from_profile(prof)
953
+ ancestors = prof.map{|term| self.get_ancestors(term)}.flatten.uniq
954
+ redundant = prof & ancestors
955
+ return prof - redundant, redundant
1873
956
  end
1874
957
 
1875
- def get_profile_ontology_distribution_tables
1876
- cohort_ontology_levels = get_ontology_levels_from_profiles(uniq=false)
1877
- uniq_cohort_ontology_levels = get_ontology_levels_from_profiles
1878
- hpo_ontology_levels = get_ontology_levels
1879
- total_ontology_terms = hpo_ontology_levels.values.flatten.length
1880
- total_cohort_terms = cohort_ontology_levels.values.flatten.length
1881
- total_uniq_cohort_terms = uniq_cohort_ontology_levels.values.flatten.length
958
+ # Remove alternative IDs if official ID is present. DOES NOT REMOVE synonyms or alternative IDs of the same official ID
959
+ # ===== Parameters
960
+ # +prof+:: array of terms to be checked
961
+ # ===== Returns
962
+ # two arrays, first is the cleaned profile and second is the removed elements array
963
+ def remove_alternatives_from_profile(prof)
964
+ alternatives = prof.select{|term| @alternatives_index.include?(term)}
965
+ redundant = alternatives.select{|alt_id| prof.include?(@alternatives_index[alt_id])}
966
+ return prof - redundant, redundant
967
+ end
1882
968
 
1883
- ontology_levels = []
1884
- distribution_percentage = []
1885
- hpo_ontology_levels.each do |level, terms|
1886
- cohort_terms = cohort_ontology_levels[level]
1887
- uniq_cohort_terms = uniq_cohort_ontology_levels[level]
1888
- if cohort_terms.nil? || uniq_cohort_terms.nil?
1889
- num = 0
1890
- u_num = 0
1891
- else
1892
- num = cohort_terms.length
1893
- u_num = uniq_cohort_terms.length
1894
- end
1895
- ontology_levels << [level, terms.length, num]
1896
- distribution_percentage << [
1897
- level,
1898
- (terms.length.fdiv(total_ontology_terms)*100).round(3),
1899
- (num.fdiv(total_cohort_terms)*100).round(3),
1900
- (u_num.fdiv(total_uniq_cohort_terms)*100).round(3)
1901
- ]
1902
- end
1903
- ontology_levels.sort! { |x,y| x.first <=> y.first }
1904
- distribution_percentage.sort! { |x,y| x.first <=> y.first }
1905
- return ontology_levels, distribution_percentage
969
+ # Remove alternatives (if official term is present) and ancestors terms of a given profile
970
+ # ===== Parameters
971
+ # +profile+:: profile to be cleaned
972
+ # +remove_alternatives+:: if true, clenaed profiles will replace already stored profiles
973
+ # ===== Returns
974
+ # cleaned profile
975
+ def clean_profile(profile, remove_alternatives: true)
976
+ warn('Estructure is circular, behaviour could not be which is expected') if @structureType == :circular
977
+ terms_without_ancestors, _ = remove_ancestors_from_profile(profile)
978
+ terms_without_ancestors, _ = remove_alternatives_from_profile(terms_without_ancestors) if remove_alternatives
979
+ return terms_without_ancestors
1906
980
  end
1907
981
 
1908
- def get_dataset_specifity_index(mode)
1909
- ontology_levels, distribution_percentage = get_profile_ontology_distribution_tables
1910
- if mode == 'uniq'
1911
- observed_distribution = 3
1912
- elsif mode == 'weigthed'
1913
- observed_distribution = 2
1914
- end
1915
- max_terms = distribution_percentage.map{|row| row[1]}.max
1916
- maxL = nil
1917
- distribution_percentage.each do |level_info|
1918
- maxL = level_info.first if level_info[1] == max_terms
1919
- end
1920
- diffL = distribution_percentage.map{|l| [l[0], l[observed_distribution] - l[1]]}
1921
- diffL.select!{|dL| dL.last > 0}
1922
- lowSection = diffL.select{|dL| dL.first <= maxL}
1923
- highSection = diffL.select{|dL| dL.first > maxL}
1924
- dsi = nil
1925
- if highSection.empty?
1926
- dsi = 0
1927
- else
1928
- accumulated_weigth = 0
1929
- accumulated_weigthed_diffL = 0
1930
- hss = get_weigthed_level_contribution(highSection, maxL, ontology_levels.length - maxL)
1931
- lss = get_weigthed_level_contribution(lowSection, maxL, maxL)
1932
- dsi = hss.fdiv(lss)
1933
- end
1934
- return dsi
982
+ def clean_profile_hard(profile, options = {})
983
+ profile, _ = check_ids(profile)
984
+ profile = profile.select{|t| !is_obsolete?(t)}
985
+ if !options[:term_filter].nil?
986
+ profile.select! {|term| get_ancestors(term).include?(options[:term_filter])}
987
+ end
988
+ profile = clean_profile(profile.uniq)
989
+ return profile
1935
990
  end
1936
991
 
1937
- def get_weigthed_level_contribution(section, maxL, nLevels)
1938
- accumulated_weigthed_diffL = 0
1939
- section.each do |level, diff|
1940
- weightL = maxL - level
1941
- if weightL >= 0
1942
- weightL += 1
992
+ # Remove terms from a given profile using hierarchical info and scores set given
993
+ # ===== Parameters
994
+ # +profile+:: profile to be cleaned
995
+ # +scores+:: hash with terms by keys and numerical values (scores)
996
+ # +byMax+:: if true, maximum scored term will be keeped, if false, minimum will be keeped
997
+ # +remove_without_score+:: if true, terms without score will be removed. Default: true
998
+ # ===== Returns
999
+ # cleaned profile
1000
+ def clean_profile_by_score(profile, scores, byMax: true, remove_without_score: true)
1001
+ scores = scores.sort_by{|term,score| score}.to_h
1002
+ keep = profile.map do |term|
1003
+ if scores.include?(term)
1004
+ parentals = [self.get_ancestors(term), self.get_descendants(term)].flatten
1005
+ targetable = parentals.select{|parent| profile.include?(parent)}
1006
+ if targetable.empty?
1007
+ term
1008
+ else
1009
+ targetable << term
1010
+ targets = scores.select{|term,score| targetable.include?(term)}.to_h
1011
+ byMax ? targets.keys.last : targets.keys.first
1012
+ end
1013
+ elsif remove_without_score
1014
+ nil
1943
1015
  else
1944
- weightL = weightL.abs
1016
+ term
1945
1017
  end
1946
- accumulated_weigthed_diffL += diff * weightL
1947
1018
  end
1948
- weigthed_contribution = accumulated_weigthed_diffL.fdiv(nLevels)
1949
- return weigthed_contribution
1019
+ return keep.compact.uniq
1950
1020
  end
1951
1021
 
1022
+ # ID Handlers
1023
+ ####################################
1952
1024
 
1953
- # Calculate profiles dictionary with Key= Term; Value = Profiles
1954
- def calc_profiles_dictionary
1955
- if @profiles.empty?
1956
- warn('Profiles are not already loaded. Aborting dictionary calc')
1957
- else
1958
- byTerm = {} # Key: Terms
1959
- # byValue -- Key: Profile == @profiles
1960
- @profiles.each do |id, terms|
1961
- terms.each do |term|
1962
- if byTerm.include?(term)
1963
- byTerm[term] << id
1964
- else
1965
- byTerm[term] = [id]
1966
- end
1025
+ # Check a set of IDs and return allowed IDs removing which are not official terms on this ontology
1026
+ # ===== Parameters
1027
+ # +ids+:: to be checked
1028
+ # ===== Return
1029
+ # two arrays whit allowed and rejected IDs respectively
1030
+ def check_ids(ids, substitute: true)
1031
+ checked_codes = []
1032
+ rejected_codes = []
1033
+ ids.each do |id|
1034
+ new_id = get_main_id(id)
1035
+ if new_id.nil?
1036
+ rejected_codes << id
1037
+ else
1038
+ if substitute
1039
+ checked_codes << new_id
1040
+ else
1041
+ checked_codes << id
1967
1042
  end
1968
1043
  end
1969
- @profilesDict = byTerm
1970
1044
  end
1045
+ return checked_codes, rejected_codes
1971
1046
  end
1972
1047
 
1973
1048
 
1974
- # Gets profiles dictionary calculated
1049
+ # Translates several IDs and returns translations and not allowed IDs list
1050
+ # ===== Parameters
1051
+ # +ids+:: to be translated
1975
1052
  # ===== Return
1976
- # profiles dictionary (clone)
1977
- def get_terms_linked_profiles
1978
- return @profilesDict.clone
1979
- end
1980
-
1053
+ # two arrays with translations and ids which couldn't be translated respectively
1054
+ def translate_ids(ids)
1055
+ translated = []
1056
+ rejected = []
1057
+ ids.each do |term_id|
1058
+ tr = self.translate_id(term_id.to_sym)
1059
+ if !tr.nil?
1060
+ translated << tr # FRED: Why have this a different behaviour from ...->
1061
+ else
1062
+ rejected << tr
1063
+ end
1064
+ end
1065
+ return translated, rejected
1066
+ end
1981
1067
 
1982
- # Get related profiles to a given term
1068
+ # Translate several names and return translations and a list of names which couldn't be translated
1983
1069
  # ===== Parameters
1984
- # +term+:: to be checked
1985
- # ===== Returns
1986
- # profiles which contains given term
1987
- def get_term_linked_profiles(term)
1988
- return @profilesDict[term]
1070
+ # +names+:: array to be translated
1071
+ # ===== Return
1072
+ # two arrays with translations and names which couldn't be translated respectively
1073
+ def translate_names(names)
1074
+ translated = []
1075
+ rejected = []
1076
+ names.each do |name|
1077
+ tr = self.translate_name(name)
1078
+ if tr.nil?
1079
+ rejected << name # FRED: <-... this?
1080
+ else
1081
+ translated << tr
1082
+ end
1083
+ end
1084
+ return translated, rejected
1989
1085
  end
1990
1086
 
1087
+ # Description of profile's terms
1088
+ ####################################
1991
1089
 
1992
1090
  # Gets metainfo table from a set of terms
1993
1091
  # ===== Parameters
1994
1092
  # +terms+:: IDs to be expanded
1995
- # +filter_alternatives+:: flag to be used in get_descendants method
1996
1093
  # ===== Returns
1997
1094
  # an array with triplets [TermID, TermName, DescendantsNames]
1998
- def get_childs_table(terms, filter_alternatives = false)
1999
- expanded_terms = []
2000
- terms.each do |t|
2001
- expanded_terms << [[t, self.translate_id(t)], self.get_descendants(t, filter_alternatives).map{|child| [child, self.translate_id(child)]}]
1095
+ def get_childs_table(profile)
1096
+ expanded_profile = []
1097
+ profile.each do |t|
1098
+ expanded_profile << [[t, translate_id(t)], get_descendants(t).map{|child| [child, translate_id(child)]}]
2002
1099
  end
2003
- return expanded_terms
1100
+ return expanded_profile
2004
1101
  end
2005
1102
 
1103
+ def get_terms_levels(profile)
1104
+ termsAndLevels = []
1105
+ profile.each do |term|
1106
+ termsAndLevels << [term, get_term_level(term)]
1107
+ end
1108
+ return termsAndLevels
1109
+ end
2006
1110
 
2007
- # Store specific relations hash given into ITEMS structure
1111
+ # IC data
1112
+ ####################################
1113
+
1114
+ # Get information coefficient from profiles #
1115
+
1116
+ # Calculates mean IC of a given profile
2008
1117
  # ===== Parameters
2009
- # +relations+:: hash to be stored
2010
- # +remove_old_relations+:: substitute ITEMS structure instead of merge new relations
2011
- # +expand+:: if true, already stored keys will be updated with the unique union of both sets
2012
- def load_item_relations_to_terms(relations, remove_old_relations = false, expand = false)
2013
- @items = {} if remove_old_relations
2014
- if !relations.select{|term, items| !@stanzas[:terms].include?(term)}.empty?
2015
- warn('Some terms specified are not stored into this ontology. These not correct terms will be stored too')
2016
- end
2017
- if !remove_old_relations
2018
- if !relations.select{|term, items| @items.include?(term)}.empty? && !expand
2019
- warn('Some terms given are already stored. Stored version will be replaced')
1118
+ # +prof+:: profile to be checked
1119
+ # +ic_type+:: ic_type to be used
1120
+ # +zhou_k+:: special coeficient for Zhou IC method
1121
+ # ===== Returns
1122
+ # mean IC for a given profile
1123
+ def get_profile_mean_IC(prof, ic_type: :resnik, zhou_k: 0.5)
1124
+ return prof.map{|term| self.get_IC(term, type: ic_type, zhou_k: zhou_k)}.sum.fdiv(prof.length)
1125
+ end
1126
+
1127
+ # Term ref vs profile #
1128
+
1129
+ def get_maxmica_term2profile(ref_term, profile)
1130
+ micas = profile.map{|term| get_MICA(ref_term, term)}
1131
+ maxmica = micas.first
1132
+ micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
1133
+ return maxmica
1134
+ end
1135
+
1136
+ # Profile vs Profile #
1137
+
1138
+ # Get semantic similarity from two term sets
1139
+ # ===== Parameters
1140
+ # +termsA+:: set to be compared
1141
+ # +termsB+:: set to be compared
1142
+ # +sim_type+:: similitude method to be used. Default: resnik
1143
+ # +ic_type+:: ic type to be used. Default: resnik
1144
+ # +bidirectional+:: calculate bidirectional similitude. Default: false
1145
+ # ===== Return
1146
+ # similitude calculated
1147
+ def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true, store_mica: false)
1148
+ # Check
1149
+ raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
1150
+ raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
1151
+ micasA = []
1152
+ # Compare A -> B
1153
+ termsA.each do |tA|
1154
+ micas = []
1155
+ termsB.each do |tB|
1156
+ if store_mica
1157
+ value = @mica_index[tA][tB]
1158
+ else
1159
+ value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)
1160
+ end
1161
+ micas << value if value.class == Float
2020
1162
  end
1163
+ !micas.empty? ? micasA << micas.max : micasA << 0
2021
1164
  end
2022
- if expand
2023
- @items = self.concatItems(@items,relations)
2024
- # relations.each do |k,v| # MUST UPDATE THIS USING A CONCAT SPECIFIC FUNCTION
2025
- # if @items.keys.include?(k)
2026
- # if v.kind_of?(Array)
2027
- # @items[k] = (@items[k] + v).uniq
2028
- # elsif v.kind_of?(Hash)
2029
- # @items.merge!(relations) do |k, oldV, newV|
2030
- # if oldV.kind_of?(Array)
2031
- # return (oldV + newV).uniq
2032
- # else
2033
- # oldV = [oldV,newV]
2034
- # end
2035
- # end
2036
- # elsif @items[k].kind_of?(Array) # We suppose a single value/object from here
2037
- # @items[k] = (@items[k] + [v]).uniq
2038
- # else
2039
- # @items[k] = [@items[k],v]
2040
- # end
2041
- # else
2042
- # @items[k] = v
2043
- # end
2044
- # end
2045
- else
2046
- @items.merge!(relations)
1165
+ means_sim = micasA.sum.fdiv(micasA.size)
1166
+ # Compare B -> A
1167
+ if bidirectional
1168
+ means_simA = means_sim * micasA.size
1169
+ means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false, store_mica: store_mica) * termsB.size
1170
+ means_sim = (means_simA + means_simB).fdiv(termsA.size + termsB.size)
2047
1171
  end
2048
- end
1172
+ # Return
1173
+ return means_sim
1174
+ end
2049
1175
 
2050
- # Internal function to concat two elements.
1176
+
1177
+ #############################################
1178
+ # PROFILE INTERNAL METHODS
1179
+ #############################################
1180
+
1181
+ # I/O profiles
1182
+ ####################################
1183
+
1184
+ # Method used to store a pool of profiles
2051
1185
  # ===== Parameters
2052
- # +itemA+:: item to be concatenated
2053
- # +itemB+:: item to be concatenated
2054
- # ===== Returns
2055
- # Concatenated objects
2056
- def concatItems(itemA,itemB)
2057
- # A is Array :: RETURN ARRAY
2058
- # A_array : B_array
2059
- # A_array : B_hash => NOT ALLOWED
2060
- # A_array : B_single => NOT ALLOWED
2061
- # A is Hash :: RETURN HASH
2062
- # A_hash : B_array => NOT ALLOWED
2063
- # A_hash : B_hash
2064
- # A_hash : B_single => NOT ALLOWED
2065
- # A is single element => RETURN ARRAY
2066
- # A_single : B_array
2067
- # A_single : B_hash => NOT ALLOWED
2068
- # A_single : B_single
2069
- concatenated = nil
2070
- if itemA.kind_of?(Array) && itemB.kind_of?(Array)
2071
- concatenated = (itemA + itemB).uniq
2072
- elsif itemA.kind_of?(Hash) && itemB.kind_of?(Hash)
2073
- concatenated = itemA.merge(itemB) do |k, oldV, newV|
2074
- self.concatItems(oldV,newV)
1186
+ # +profiles+:: array/hash of profiles to be stored. If it's an array, numerical IDs will be assigned starting at 1
1187
+ # +calc_metadata+:: if true, launch get_items_from_profiles process
1188
+ # +reset_stored+:: if true, remove already stored profiles
1189
+ # +substitute+:: subsstitute flag from check_ids
1190
+ def load_profiles(profiles, calc_metadata: true, reset_stored: false, substitute: false)
1191
+ self.reset_profiles if reset_stored
1192
+ # Check
1193
+ if profiles.kind_of?(Array)
1194
+ profiles.each_with_index do |items, i|
1195
+ self.add_profile(i, items.map {|item| item.to_sym}, substitute: substitute)
2075
1196
  end
2076
- elsif itemB.kind_of?(Array)
2077
- concatenated = ([itemA] + itemB).uniq
2078
- elsif ![Array, Hash].include?(itemB.class)
2079
- concatenated = [itemA,itemB].uniq
1197
+ else # Hash
1198
+ if !profiles.keys.select{|id| @profiles.include?(id)}.empty?
1199
+ warn('Some profiles given are already stored. Stored version will be replaced')
1200
+ end
1201
+ profiles.each{|id, prof| self.add_profile(id, prof, substitute: substitute)}
2080
1202
  end
2081
- return concatenated
2082
- end
2083
1203
 
1204
+ self.add_observed_terms_from_profiles(reset: true)
2084
1205
 
2085
- # Assign a dictionary already calculated as a items set.
2086
- # ===== Parameters
2087
- # +dictID+:: dictionary ID to be stored (:byTerm will be used)
2088
- def set_items_from_dict(dictID, remove_old_relations = false)
2089
- @items = {} if remove_old_relations
2090
- if !@dicts[dictID].nil?
2091
- @items.merge(@dicts[dictID][:byTerm])
2092
- else
2093
- warn('Specified ID is not calculated. Dict will not be added as a items set')
1206
+ if calc_metadata
1207
+ self.get_items_from_profiles
2094
1208
  end
2095
1209
  end
2096
1210
 
2097
-
2098
- # This method computes childs similarity and impute items to it parentals. To do that Item keys must be this ontology allowed terms.
2099
- # Similarity will be calculated by text extact similarity unless an ontology object will be provided. In this case, MICAs will be used
1211
+ # Stores a given profile with an specific ID. If ID is already assigend to a profile, it will be replaced
2100
1212
  # ===== Parameters
2101
- # +ontology+:: (Optional) ontology object which items given belongs
2102
- # +minimum_childs+:: minimum of childs needed to infer relations to parental. Default: 2
2103
- # +clean_profiles+:: if true, clena_profiles ontology method will be used over inferred profiles. Only if an ontology object is provided
2104
- # ===== Returns
2105
- # void and update items object
2106
- def expand_items_to_parentals(ontology: nil, minimum_childs: 2, clean_profiles: true)
2107
- # Check item keys
2108
- if @items.empty?
2109
- warn('Items have been not provided yet')
2110
- return nil
2111
- end
2112
- targetKeys = @items.keys.select{|k| self.exists?(k)}
2113
- if targetKeys.length == 0
2114
- warn('Any item key is allowed')
2115
- return nil
2116
- elsif targetKeys.length < @items.keys.length
2117
- warn('Some item keys are not allowed')
2118
- end
2119
-
2120
- # Expand to parentals
2121
- targetKeys << targetKeys.map{|t| self.get_ancestors(t, true)}
2122
- targetKeys.flatten!
2123
- targetKeys.uniq!
2124
-
2125
- # Obtain levels (go from leaves to roots)
2126
- levels = targetKeys.map{|term| self.get_term_level(term)}
2127
- levels.compact!
2128
- levels.uniq!
2129
- levels.sort!
2130
- levels.reverse!
2131
- levels.shift # Leaves are not expandable
2132
-
2133
- # Expand from leaves to roots
2134
- levels.map do |lvl|
2135
- curr_keys = targetKeys.select{|k| self.get_term_level(k) == lvl}
2136
- curr_keys.map do |term_expand|
2137
- to_infer = []
2138
- # Obtain childs
2139
- childs = self.get_descendants(term_expand,true).select{|t| !@items[t].nil?}
2140
- # Expand
2141
- if childs.length > 0 && minimum_childs == 1 # Special case
2142
- to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
2143
- elsif childs.length >= minimum_childs
2144
- to_infer = Hash.new(0)
2145
- # Compare
2146
- while childs.length > 1
2147
- curr_term = childs.shift
2148
- childs.each do |compare_term|
2149
- pivot_items = @items[curr_term]
2150
- compare_items = @items[compare_term]
2151
- if ontology.nil? # Exact match
2152
- pivot_items.map do |pitem|
2153
- if compare_items.include?(pitem)
2154
- to_infer[pitem] += 2
2155
- end
2156
- end
2157
- else # Find MICAs
2158
- local_infer = Hash.new(0)
2159
- pivot_items.map do |pitem|
2160
- micas = compare_items.map{|citem| ontology.get_MICA(pitem, citem)}
2161
- maxmica = micas[0]
2162
- micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
2163
- local_infer[maxmica.first] += 1
2164
- end
2165
- compare_items.map do |citem|
2166
- micas = pivot_items.map{|pitem| ontology.get_MICA(pitem, citem)}
2167
- maxmica = micas[0]
2168
- micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
2169
- local_infer[maxmica.first] += 1
2170
- end
2171
- local_infer.each{|t,freq| to_infer[t] += freq if freq >= 2}
2172
- end
2173
- end
2174
- end
2175
- # Filter infer
2176
- to_infer = to_infer.select{|k,v| v >= minimum_childs}
2177
- end
2178
- # Infer
2179
- if to_infer.length > 0
2180
- @items[term_expand] = [] if @items[term_expand].nil?
2181
- if to_infer.kind_of?(Array)
2182
- @items[term_expand] = (@items[term_expand] + to_infer).uniq
2183
- else
2184
- @items[term_expand] = (@items[term_expand] + to_infer.keys).uniq
2185
- end
2186
- @items[term_expand] = ontology.clean_profile(@items[term_expand]) if clean_profiles && !ontology.nil?
2187
- elsif !@items.include?(term_expand)
2188
- targetKeys.delete(term_expand)
2189
- end
2190
- end
1213
+ # +id+:: assigned to profile
1214
+ # +terms+:: array of terms
1215
+ # +substitute+:: subsstitute flag from check_ids
1216
+ def add_profile(id, terms, substitute: true) # FRED: Talk with PSZ about the uniqness of IDs translated
1217
+ warn("Profile assigned to ID (#{id}) is going to be replaced") if @profiles.include? id
1218
+ correct_terms, rejected_terms = self.check_ids(terms, substitute: substitute)
1219
+ if !rejected_terms.empty?
1220
+ warn("Given terms contains erroneus IDs: #{rejected_terms.join(",")}. These IDs will be removed")
2191
1221
  end
2192
- end
1222
+ if id.is_a? Numeric
1223
+ @profiles[id] = correct_terms
1224
+ else
1225
+ @profiles[id.to_sym] = correct_terms
1226
+ end
1227
+ end
2193
1228
 
2194
1229
 
2195
- # Return direct ancestors/descendants of a given term
1230
+ # Includes as "observed_terms" all terms included into stored profiles
2196
1231
  # ===== Parameters
2197
- # +term+:: which are requested
2198
- # +relation+:: can be :ancestor or :descendant
2199
- # +remove_alternatives+:: if true, alternatives will be removed
2200
- # ===== Returns
2201
- # Direct ancestors/descendants of given term or nil if any error occurs
2202
- def get_direct_related(term, relation, remove_alternatives: false)
2203
- if @dicts[:is_a].nil?
2204
- warn("Hierarchy dictionary is not already calculated. Returning nil")
2205
- return nil
2206
- end
2207
- target = nil
2208
- case relation
2209
- when :ancestor
2210
- target = :byTerm
2211
- when :descendant
2212
- target = :byValue
2213
- else
2214
- warn('Relation type not allowed. Returning nil')
2215
- end
2216
- return nil if target.nil?
2217
- query = @dicts[:is_a][target][term]
2218
- return query if query.nil?
2219
- query, _ = remove_alternatives_from_profile(query) if remove_alternatives
2220
- return query
1232
+ # +reset+:: if true, reset observed freqs alreeady stored befor re-calculate
1233
+ def add_observed_terms_from_profiles(reset: false)
1234
+ @meta.each{|term, freqs| freqs[:observed_freq] = -1} if reset
1235
+ @profiles.each{|id, terms| self.add_observed_terms(terms: terms)}
2221
1236
  end
2222
1237
 
2223
-
2224
- # Return direct ancestors of a given term
1238
+ # ===== Returns
1239
+ # profiles assigned to a given ID
2225
1240
  # ===== Parameters
2226
- # +term+:: which ancestors are requested
2227
- # +remove_alternatives+:: if true, alternatives will be removed
2228
- # ===== Returns
2229
- # Direct ancestors of given term or nil if any error occurs
2230
- def get_direct_ancentors(term, remove_alternatives: false)
2231
- return self.get_direct_related(term, :ancestor, remove_alternatives: remove_alternatives)
1241
+ # +id+:: profile ID
1242
+ # ===== Return
1243
+ # specific profile or nil if it's not stored
1244
+ def get_profile(id)
1245
+ return @profiles[id]
2232
1246
  end
2233
1247
 
2234
- # Return direct descendants of a given term
2235
- # ===== Parameters
2236
- # +term+:: which descendants are requested
2237
- # +remove_alternatives+:: if true, alternatives will be removed
2238
- # ===== Returns
2239
- # Direct descendants of given term or nil if any error occurs
2240
- def get_direct_descendants(term, remove_alternatives: false)
2241
- return self.get_direct_related(term, :descendant, remove_alternatives: remove_alternatives)
1248
+ # Modifying profiles
1249
+ ####################################
1250
+
1251
+ def reset_profiles # Internal method used to remove already stored profiles and restore observed frequencies #TODO FRED: Modify test for this method.
1252
+ @profiles = {} # Clean profiles storage
1253
+ # Reset frequency observed
1254
+ @meta.each{|term,info| info[:observed_freq] = 0}
1255
+ @max_freqs[:observed_freq] = 0
1256
+ @items = {}
2242
1257
  end
2243
1258
 
2244
- def each(att = false)
2245
- @stanzas[:terms].each do |id, tags|
2246
- next if @alternatives_index.include?(id)
2247
- if att
2248
- yield(id, tags)
2249
- else
2250
- yield(id)
1259
+ def expand_profiles(meth, unwanted_terms: [], calc_metadata: true, ontology: nil, minimum_childs: 1, clean_profiles: true)
1260
+ if meth == 'parental'
1261
+ @profiles.each do |id, terms|
1262
+ @profiles[id] = expand_profile_with_parents(terms) - unwanted_terms
2251
1263
  end
1264
+ get_items_from_profiles if calc_metadata
1265
+ elsif meth == 'propagate'
1266
+ get_items_from_profiles
1267
+ expand_items_to_parentals(ontology: ontology, minimum_childs: minimum_childs, clean_profiles: clean_profiles)
1268
+ get_profiles_from_items
2252
1269
  end
1270
+ add_observed_terms_from_profiles(reset: true)
2253
1271
  end
2254
1272
 
2255
- def list_term_attributes
2256
- terms = []
2257
- each do |code|
2258
- terms << [code, translate_id(code), get_term_level(code)]
2259
- end
2260
- return terms
1273
+ # Remove alternatives (if official term is present) and ancestors terms of stored profiles
1274
+ # ===== Parameters
1275
+ # +store+:: if true, clenaed profiles will replace already stored profiles
1276
+ # +remove_alternatives+:: if true, clenaed profiles will replace already stored profiles
1277
+ # ===== Returns
1278
+ # a hash with cleaned profiles
1279
+ def clean_profiles(store: false, remove_alternatives: true)
1280
+ cleaned_profiles = {}
1281
+ @profiles.each{ |id, terms| cleaned_profiles[id] = self.clean_profile(terms, remove_alternatives: remove_alternatives)}
1282
+ @profiles = cleaned_profiles if store
1283
+ return cleaned_profiles
2261
1284
  end
2262
1285
 
2263
- #============================================================================
2264
- #============================================================================
1286
+ # ID Handlers
1287
+ ####################################
2265
1288
 
2266
- # NO IDEA WHAT THIS DOES. DON'T USE THIS METHODS IS NOT CHECKED
1289
+ # Trnaslates a bunch of profiles to it sets of term names
2267
1290
  # ===== Parameters
2268
- # ++::
2269
- # ===== Returns
2270
- # ...
2271
- def compute_relations_to_items(external_item_list, total_items, mode, thresold)
2272
- terms_levels = list_terms_per_level_from_items
2273
- #puts terms_levels.inspect.yellow
2274
- connect_familiars!(terms_levels)
2275
- #puts terms_levels.inspect.blue
2276
- item_list_with_transf_parental = get_item_list_parental(terms_levels)
2277
- results = []
2278
- if mode == :elim
2279
- results = compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list_with_transf_parental)
2280
- elsif mode == :weight
2281
- results = compute_relations_weight(terms_levels, external_item_list, total_items, item_list_with_transf_parental)
2282
- end
2283
- return results
2284
- end
2285
-
2286
- def get_item_list_parental(terms_levels)
2287
- transfered_list = {}
2288
- parent_dict = @dicts[:is_a][:byTerm]
2289
- levels = terms_levels.keys.sort
2290
- while levels.length > 1
2291
- level = levels.pop
2292
- terms_levels[level].each do |term|
2293
- parents = parent_dict[term]
2294
- if parents.nil?
2295
- next
2296
- elsif parents.length == 1
2297
- parent = parents.first
2298
- else
2299
- parent = (parents | terms_levels[level - 1]).first
2300
- end
2301
- term_it = @items[term]
2302
- parent_it = @items[parent]
2303
- curr_it = transfered_list[term]
2304
- parent_all_items = merge_groups([term_it, parent_it, curr_it])
2305
- transfered_list[parent] = parent_all_items if !parent_all_items.empty?
2306
- term_all_items = merge_groups([term_it, curr_it])
2307
- transfered_list[term] = term_all_items if !term_all_items.empty?
2308
- end
1291
+ # +profs+:: array of profiles
1292
+ # +asArray+:: flag to indicate if results must be returned as: true => an array of tuples [ProfID, ArrayOdNames] or ; false => hashs of translations
1293
+ # ===== Returns
1294
+ # translated profiles
1295
+ def translate_profiles_ids(profs = [], asArray: true)
1296
+ profs2proc = {}
1297
+ if profs.empty?
1298
+ profs2proc = @profiles
1299
+ else
1300
+ profs.each_with_index{|terms, index| profs2proc[index] = terms} if profs.kind_of?(Array)
2309
1301
  end
2310
- terms_levels[levels.first].each do |term| # Rescue lower level terms that not have children so they cannot receive items
2311
- transfered_list[term] = @items[term] if transfered_list[term].nil?
1302
+ profs_names = {}
1303
+ profs2proc.each do |id, terms|
1304
+ names, _ = translate_ids(terms)
1305
+ profs_names[id] = names
2312
1306
  end
2313
- return transfered_list
1307
+ return asArray ? profs_names.values : profs_names
2314
1308
  end
2315
1309
 
2316
- def merge_groups(groups)
2317
- return groups.compact.inject([]){|it, a| it | a}
1310
+ # Description of profile size
1311
+ ####################################
1312
+
1313
+ def profile_stats
1314
+ stats = Hash.new(0)
1315
+ data = get_profiles_sizes
1316
+ stats[:average] = data.sum().fdiv(data.size)
1317
+ sum_devs = data.sum{|element| (element - stats[:average]) ** 2}
1318
+ stats[:variance] = sum_devs.fdiv(data.size)
1319
+ stats[:standardDeviation] = stats[:variance] ** 0.5
1320
+ stats[:max] = data.max
1321
+ stats[:min] = data.min
1322
+
1323
+ stats[:count] = data.size
1324
+ data.each do |value|
1325
+ stats[:countNonZero] += 1 if value != 0
1326
+ end
1327
+
1328
+ stats[:q1] = data.get_quantiles(0.25)
1329
+ stats[:median] = data.get_quantiles(0.5)
1330
+ stats[:q3] = data.get_quantiles(0.75)
1331
+ return stats
1332
+
2318
1333
  end
2319
1334
 
2320
- def list_terms_per_level_from_items
2321
- terms_levels = {}
2322
- @items.each do |term, items|
2323
- level = self.get_term_level(term)
2324
- query = terms_levels[level]
2325
- if query.nil?
2326
- terms_levels[level] = [term]
2327
- else
2328
- query << term
2329
- end
2330
- end
2331
- return terms_levels
1335
+ # ===== Returns
1336
+ # mean size of stored profiles
1337
+ # ===== Parameters
1338
+ # +round_digits+:: number of digits to round result. Default: 4
1339
+ # ===== Returns
1340
+ # mean size of stored profiles
1341
+ def get_profiles_mean_size(round_digits: 4)
1342
+ sizes = self.get_profiles_sizes
1343
+ return sizes.sum.fdiv(@profiles.length).round(round_digits)
2332
1344
  end
2333
1345
 
2334
- def connect_familiars!(terms_levels)
2335
- levels = terms_levels.keys.sort
2336
- while levels.length > 1 # Process when current level has a parental level
2337
- level = levels.pop
2338
- parental_level = level - 1
2339
- parental_terms = terms_levels[parental_level]
2340
- if parental_terms.nil? # The queried parent level not exists but there is a parental level above of the non existant
2341
- parental_terms = [] # Initialize required parental level
2342
- terms_levels[parental_level] = parental_terms
2343
- levels << parental_level
2344
- end
2345
- terms_levels[level].each do |term|
2346
- path_info = @term_paths[term]
2347
- shortest_path_length = path_info[:shortest_path]
2348
- path = path_info[:paths].select{|p| p.length == shortest_path_length}.first
2349
- parental = path[1] # the first elements is the term itself
2350
- parental_terms << parental if !parental_terms.include?(parental)
2351
- end
2352
- end
1346
+ # ===== Returns
1347
+ # an array of sizes for all stored profiles
1348
+ # ===== Return
1349
+ # array of profile sizes
1350
+ def get_profiles_sizes()
1351
+ return @profiles.map{|id,terms| terms.length}
2353
1352
  end
2354
1353
 
2355
- def compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list)
2356
- results = []
2357
- penalized_terms = {}
2358
- levels = terms_levels.keys.sort
2359
- levels.reverse_each do |level|
2360
- terms_levels[level].each do |term|
2361
- associated_items = item_list[term]
2362
- items_to_remove = penalized_terms[term]
2363
- items_to_remove = [] if items_to_remove.nil?
2364
- pval = get_fisher_exact_test(
2365
- external_item_list - items_to_remove,
2366
- associated_items - items_to_remove,
2367
- #((associated_items | external_item_list) - items_to_remove).length
2368
- total_items
2369
- )
2370
- if pval <= thresold
2371
- parents = get_ancestors(term) # Save the items for each parent term to remove them later in the fisher test
2372
- parents.each do |prnt|
2373
- query = penalized_terms[prnt]
2374
- if query.nil?
2375
- penalized_terms[prnt] = item_list[term].clone # We need a new array to store the following iterations
2376
- else
2377
- query.concat(item_list[term])
2378
- end
2379
- end
2380
- end
2381
- results << [term, pval]
2382
- end
2383
- end
2384
- return results
1354
+ # Calculates profiles sizes and returns size assigned to percentile given
1355
+ # ===== Parameters
1356
+ # +perc+:: percentile to be returned
1357
+ # +increasing_sort+:: flag to indicate if sizes order must be increasing. Default: false
1358
+ # ===== Returns
1359
+ # values assigned to percentile asked
1360
+ def get_profile_length_at_percentile(perc=50, increasing_sort: false)
1361
+ prof_lengths = self.get_profiles_sizes
1362
+ percentile_profile = prof_lengths.get_quantiles(perc.fdiv(100), decreasing_sort = !increasing_sort)
1363
+ return percentile_profile
2385
1364
  end
2386
1365
 
2387
- def compute_relations_weight(terms_levels, external_item_list, total_items, item_list)
2388
- pvals = {}
2389
- item_weigths_per_term = Hash.new { |hash, key| Hash.new(1) } #https://mensfeld.pl/2016/09/ruby-hash-default-value-be-cautious-when-you-use-it/
2390
- levels = terms_levels.keys.sort
2391
- levels.reverse_each do |level|
2392
- terms_levels[level].each do |term|
2393
- associated_items = item_list[term]
2394
- #initialize observed items in item_weigths_per_term list
2395
- add_items_to_weigthed_list(term, associated_items, item_weigths_per_term) if !associated_items.nil?
2396
- children = @dicts[:is_a][:byValue][term]
2397
- if children.nil?
2398
- children = []
2399
- else
2400
- children = children.select{|ch| item_weigths_per_term[ch].length > 0} # Only use children with items associated to them OR transfered to them
2401
- end
2402
- computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
1366
+ # IC data
1367
+ ####################################
1368
+
1369
+ # Get frequency terms and information coefficient from profiles #
1370
+
1371
+ # Calculates frequencies of stored profiles terms
1372
+ # ===== Parameters
1373
+ # +ratio+:: if true, frequencies will be returned as ratios between 0 and 1.
1374
+ # +asArray+:: used to transform returned structure format from hash of Term-Frequency to an array of tuples [Term, Frequency]
1375
+ # +translate+:: if true, term IDs will be translated to
1376
+ # ===== Returns
1377
+ # stored profiles terms frequencies
1378
+ def get_profiles_terms_frequency(ratio: true, asArray: true, translate: true)
1379
+ freqs = Hash.new(0)
1380
+ @profiles.each do |id, terms|
1381
+ terms.each{|term| freqs[term] += 1}
1382
+ end
1383
+ if translate
1384
+ translated_freqs = {}
1385
+ freqs.each do |term, freq|
1386
+ tr = self.translate_id(term)
1387
+ translated_freqs[tr] = freq if !tr.nil?
2403
1388
  end
1389
+ freqs = translated_freqs
2404
1390
  end
2405
- return pvals.to_a
1391
+ n_profiles = @profiles.length
1392
+ freqs.transform_values!{|freq| freq.fdiv(n_profiles)} if ratio
1393
+ if asArray
1394
+ freqs = freqs.to_a
1395
+ freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
1396
+ end
1397
+ return freqs
2406
1398
  end
2407
1399
 
2408
- def add_items_to_weigthed_list(term, associated_items, weigthed_list)
2409
- term_weigthing = weigthed_list[term]
2410
- associated_items.each{|ai| term_weigthing[ai] = 1}
2411
- weigthed_list[term] = term_weigthing
1400
+ # Calculates resnik ontology, and resnik observed mean ICs for all profiles stored
1401
+ # ===== Returns
1402
+ # two hashes with Profiles and IC calculated for resnik and observed resnik respectively
1403
+ def get_profiles_resnik_dual_ICs(struct: :resnik, observ: :resnik_observed) # Maybe change name during migration to get_profiles_dual_ICs
1404
+ struct_ics = {}
1405
+ observ_ics = {}
1406
+ @profiles.each do |id, terms|
1407
+ struct_ics[id] = self.get_profile_mean_IC(terms, ic_type: struct)
1408
+ observ_ics[id] = self.get_profile_mean_IC(terms, ic_type: observ)
1409
+ end
1410
+ return struct_ics, observ_ics
2412
1411
  end
2413
1412
 
2414
- def computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
2415
- #puts term.to_s.red
2416
- #puts @term_paths[term].inspect
2417
- #puts @dicts[:is_a][:byValue][term].inspect.light_blue
2418
- associated_items = item_weigths_per_term[term].keys
2419
- pval = get_fisher_exact_test(external_item_list, associated_items, total_items,
2420
- 'two_sided', item_weigths_per_term[term], true)
2421
- pvals[term] = pval
2422
- if children.length > 0
2423
- rates = {}
2424
- sig_child = 0
2425
- children.each do |child|
2426
- ratio = sigRatio(pvals[child], pval)
2427
- rates[child] = ratio
2428
- sig_child += 1 if ratio >= 1
2429
- end
2430
- if sig_child == 0 # CASE 1
2431
- children.each do |child|
2432
- current_ratio = rates[child]
2433
- query_child = item_weigths_per_term[child]
2434
- query_child.transform_values!{|weight| weight * current_ratio}
2435
- pvals[child] = get_fisher_exact_test(external_item_list, item_weigths_per_term[child].keys, total_items,
2436
- 'two_sided', item_weigths_per_term[child], true)
2437
- end
2438
- else
2439
- ancs = get_ancestors(term, filter_alternatives = true)
2440
- ancs << term
2441
- rates.each do |ch, ratio|# CASE 2
2442
- if ratio >= 1 # The child is better than parent
2443
- ancs.each do |anc|
2444
- query_anc = item_weigths_per_term[anc]
2445
- associated_items.each do |item|
2446
- query_anc[item] /= ratio # /= --> query_anc[item]/ratio
2447
- end
2448
- end
1413
+
1414
+ # Calculates and return resnik ICs (by ontology and observed frequency) for observed terms
1415
+ # ===== Returns
1416
+ # two hashes with resnik and resnik_observed ICs for observed terms
1417
+ def get_observed_ics_by_onto_and_freq()
1418
+ ic_ont = {}
1419
+ resnik_observed = {}
1420
+ observed_terms = @profiles.values.flatten.uniq
1421
+ observed_terms.each do |term|
1422
+ ic_ont[term] = get_IC(term)
1423
+ resnik_observed[term] = get_IC(term, type: :resnik_observed)
1424
+ end
1425
+ return ic_ont, resnik_observed
1426
+ end
1427
+
1428
+ # Profiles vs Profiles #
1429
+
1430
+ def get_pair_index(profiles_A, profiles_B)
1431
+ pair_index = {}
1432
+ profiles_A.each do |curr_id, profile_A|
1433
+ profiles_B.each do |id, profile_B|
1434
+ profile_A.each do |term_A|
1435
+ profile_B.each do |term_B|
1436
+ pair_index[[term_A, term_B].sort] = true
2449
1437
  end
2450
1438
  end
2451
- computeTermSig(term, children - rates.keys, external_item_list, total_items, pvals, item_weigths_per_term)
2452
- end
1439
+ end
2453
1440
  end
1441
+ return pair_index
2454
1442
  end
2455
1443
 
2456
- def sigRatio(pvalA, pvalB)
2457
- return Math.log(pvalA)/Math.log(pvalB)
1444
+ def get_mica_index_from_profiles(pair_index, sim_type: :resnik, ic_type: :resnik, lca_index: true)
1445
+ pair_index.each do |pair, val|
1446
+ tA, tB = pair
1447
+ value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type, lca_index: lca_index)
1448
+ value = true if value.nil? # We use true to save that the operation was made but there is not mica value
1449
+ add2nestHash(@mica_index, tA, tB, value)
1450
+ add2nestHash(@mica_index, tB, tA, value)
1451
+ end
2458
1452
  end
2459
1453
 
2460
- def profile_stats
2461
- stats = Hash.new(0)
2462
- data = @profiles.values.map{|ont_ids| ont_ids.size}
2463
- stats[:average] = data.sum().fdiv(data.size)
2464
- sum_devs = data.sum{|element| (element - stats[:avg]) ** 2}
2465
- stats[:variance] = sum_devs.fdiv(data.size)
2466
- stats[:standardDeviation] = stats[:variance] ** 0.5
2467
- stats[:max] = data.max
2468
- stats[:min] = data.min
2469
-
2470
- stats[:count] = data.size
2471
- data.each do |value|
2472
- stats[:countNonZero] += 1 if value != 0
2473
- end
1454
+ # Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with itself
1455
+ # ===== Parameters
1456
+ # +external_profiles+:: set of external profiles. If nil, internal profiles will be compared with itself
1457
+ # +sim_type+:: similitude method to be used. Default: resnik
1458
+ # +ic_type+:: ic type to be used. Default: resnik
1459
+ # +bidirectional+:: calculate bidirectional similitude. Default: false
1460
+ # ===== Return
1461
+ # Similitudes calculated
1462
+ def compare_profiles(external_profiles: nil, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
1463
+ profiles_similarity = {} #calculate similarity between patients profile
1464
+ if external_profiles.nil?
1465
+ comp_profiles = @profiles
1466
+ main_profiles = comp_profiles
1467
+ else
1468
+ comp_profiles = external_profiles
1469
+ main_profiles = @profiles
1470
+ end
1471
+ # Compare
1472
+ pair_index = get_pair_index(main_profiles, comp_profiles)
1473
+ @mica_index = {}
1474
+ get_mica_index_from_profiles(pair_index, sim_type: sim_type, ic_type: ic_type, lca_index: false)
1475
+ main_profiles.each do |curr_id, current_profile|
1476
+ comp_profiles.each do |id, profile|
1477
+ value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional, store_mica: true)
1478
+ add2nestHash(profiles_similarity, curr_id, id, value)
1479
+ end
1480
+ end
1481
+ return profiles_similarity
1482
+ end
2474
1483
 
2475
- stats[:q1] = data.get_quantiles(0.25)
2476
- stats[:median] = data.get_quantiles(0.5)
2477
- stats[:q3] = data.get_quantiles(0.75)
2478
- return stats
1484
+ # specifity_index related methods
1485
+ ####################################
2479
1486
 
1487
+ # Return ontology levels from profile terms
1488
+ # ===== Returns
1489
+ # hash of term levels (Key: level; Value: array of term IDs)
1490
+ def get_ontology_levels_from_profiles(uniq = true)
1491
+ profiles_terms = @profiles.values.flatten
1492
+ profiles_terms.uniq! if uniq
1493
+ term_freqs_byProfile = Hash.new(0)
1494
+ profiles_terms.each do |term|
1495
+ term_freqs_byProfile[term] += 1
1496
+ end
1497
+ levels_filtered = {}
1498
+ terms_levels = @dicts[:level][:byValue]
1499
+ term_freqs_byProfile.each do |term, count|
1500
+ level = terms_levels[term]
1501
+ term_repeat = Array.new(count, term)
1502
+ query = levels_filtered[level]
1503
+ if query.nil?
1504
+ levels_filtered[level] = term_repeat
1505
+ else
1506
+ query.concat(term_repeat)
1507
+ end
1508
+ end
1509
+ return levels_filtered
2480
1510
  end
2481
1511
 
2482
- #============================================================================
2483
- #============================================================================
1512
+ def get_profile_ontology_distribution_tables
1513
+ cohort_ontology_levels = get_ontology_levels_from_profiles(uniq=false)
1514
+ uniq_cohort_ontology_levels = get_ontology_levels_from_profiles
1515
+ ontology_levels = get_ontology_levels
1516
+ total_ontology_terms = ontology_levels.values.flatten.length
1517
+ total_cohort_terms = cohort_ontology_levels.values.flatten.length
1518
+ total_uniq_cohort_terms = uniq_cohort_ontology_levels.values.flatten.length
2484
1519
 
2485
- # Check if a given ID is a removable (blacklist) term.
2486
- # +DEPRECATED+ use is_removable? instead
2487
- # ===== Parameters
2488
- # +id+:: to be checked
2489
- # ===== Returns
2490
- # true if given term is a removable (blacklist) term or false in other cases
2491
- def is_removable(id)
2492
- warn "[DEPRECATION] `is_removable` is deprecated. Please use `is_removable?` instead."
2493
- return @removable_terms.include?(id.to_sym)
1520
+ distribution_ontology_levels = []
1521
+ distribution_percentage = []
1522
+ ontology_levels.each do |level, terms|
1523
+ cohort_terms = cohort_ontology_levels[level]
1524
+ uniq_cohort_terms = uniq_cohort_ontology_levels[level]
1525
+ if cohort_terms.nil? || uniq_cohort_terms.nil?
1526
+ num = 0
1527
+ u_num = 0
1528
+ else
1529
+ num = cohort_terms.length
1530
+ u_num = uniq_cohort_terms.length
1531
+ end
1532
+ distribution_ontology_levels << [level, terms.length, num]
1533
+ distribution_percentage << [
1534
+ level,
1535
+ (terms.length.fdiv(total_ontology_terms)*100).round(3),
1536
+ (num.fdiv(total_cohort_terms)*100).round(3),
1537
+ (u_num.fdiv(total_uniq_cohort_terms)*100).round(3)
1538
+ ]
1539
+ end
1540
+ distribution_ontology_levels.sort! { |x,y| x.first <=> y.first }
1541
+ distribution_percentage.sort! { |x,y| x.first <=> y.first }
1542
+ return distribution_ontology_levels, distribution_percentage
2494
1543
  end
2495
1544
 
2496
- # Check if a given ID is a removable (blacklist) term
2497
- # ===== Parameters
2498
- # +id+:: to be checked
2499
- # ===== Returns
2500
- # true if given term is a removable (blacklist) term or false in other cases
2501
- def is_removable? id
2502
- return @removable_terms.include?(id.to_sym)
1545
+ def get_dataset_specifity_index(mode)
1546
+ ontology_levels, distribution_percentage = get_profile_ontology_distribution_tables
1547
+ if mode == 'uniq'
1548
+ observed_distribution = 3
1549
+ elsif mode == 'weigthed'
1550
+ observed_distribution = 2
1551
+ end
1552
+ max_terms = distribution_percentage.map{|row| row[1]}.max
1553
+ maxL = nil
1554
+ distribution_percentage.each do |level_info|
1555
+ maxL = level_info.first if level_info[1] == max_terms
1556
+ end
1557
+ diffL = distribution_percentage.map{|l| [l[0], l[observed_distribution] - l[1]]}
1558
+ diffL.select!{|dL| dL.last > 0}
1559
+ highSection = diffL.select{|dL| dL.first > maxL}
1560
+ lowSection = diffL.select{|dL| dL.first <= maxL}
1561
+ dsi = nil
1562
+ if highSection.empty?
1563
+ dsi = 0
1564
+ else
1565
+ hss = get_weigthed_level_contribution(highSection, maxL, ontology_levels.length - maxL)
1566
+ lss = get_weigthed_level_contribution(lowSection, maxL, maxL)
1567
+ dsi = hss.fdiv(lss)
1568
+ end
1569
+ return dsi
2503
1570
  end
2504
1571
 
2505
- ############################################
2506
- # SPECIAL METHODS
2507
- #############################################
1572
+ def get_weigthed_level_contribution(section, maxL, nLevels)
1573
+ accumulated_weigthed_diffL = 0
1574
+ section.each do |level, diff|
1575
+ weightL = maxL - level
1576
+ if weightL >= 0
1577
+ weightL += 1
1578
+ else
1579
+ weightL = weightL.abs
1580
+ end
1581
+ accumulated_weigthed_diffL += diff * weightL
1582
+ end
1583
+ weigthed_contribution = accumulated_weigthed_diffL.fdiv(nLevels)
1584
+ return weigthed_contribution
1585
+ end
1586
+
1587
+ ########################################
1588
+ ## GENERAL ONTOLOGY METHODS
1589
+ ########################################
1590
+
2508
1591
  def ==(other)
2509
- self.header == other.header &&
2510
- self.stanzas == other.stanzas &&
1592
+ self.terms == other.terms &&
2511
1593
  self.ancestors_index == other.ancestors_index &&
2512
1594
  self.alternatives_index == other.alternatives_index &&
2513
- self.obsoletes_index == other.obsoletes_index &&
2514
1595
  self.structureType == other.structureType &&
2515
1596
  self.ics == other.ics &&
2516
1597
  self.meta == other.meta &&
2517
1598
  self.dicts == other.dicts &&
2518
1599
  self.profiles == other.profiles &&
2519
- self.profilesDict == other.profilesDict &&
2520
1600
  (self.items.keys - other.items.keys).empty? &&
2521
- self.removable_terms == other.removable_terms &&
2522
- self.special_tags == other.special_tags &&
2523
1601
  self.items == other.items &&
2524
1602
  self.term_paths == other.term_paths &&
2525
1603
  self.max_freqs == other.max_freqs
@@ -2528,32 +1606,128 @@ class Ontology
2528
1606
 
2529
1607
  def clone
2530
1608
  copy = Ontology.new
2531
- copy.header = self.header.clone
2532
- copy.stanzas[:terms] = self.stanzas[:terms].clone
2533
- copy.stanzas[:typedefs] = self.stanzas[:typedefs].clone
2534
- copy.stanzas[:instances] = self.stanzas[:instances].clone
1609
+ copy.terms = self.terms.clone
2535
1610
  copy.ancestors_index = self.ancestors_index.clone
2536
1611
  copy.descendants_index = self.descendants_index.clone
2537
1612
  copy.alternatives_index = self.alternatives_index.clone
2538
- copy.obsoletes_index = self.obsoletes_index.clone
2539
1613
  copy.structureType = self.structureType.clone
2540
1614
  copy.ics = self.ics.clone
2541
1615
  copy.meta = self.meta.clone
2542
1616
  copy.dicts = self.dicts.clone
2543
1617
  copy.profiles = self.profiles.clone
2544
- copy.profilesDict = self.profilesDict.clone
2545
1618
  copy.items = self.items.clone
2546
- copy.removable_terms = self.removable_terms.clone
2547
1619
  copy.term_paths = self.term_paths.clone
2548
1620
  copy.max_freqs = self.max_freqs.clone
2549
1621
  return copy
2550
1622
  end
2551
1623
 
1624
+ # Exports an OBO_Handler object in json format
1625
+ # ===== Parameters
1626
+ # +file+:: where info will be stored
1627
+ def write(file)
1628
+ # Take object stored info
1629
+ obj_info = {terms: @terms,
1630
+ ancestors_index: @ancestors_index,
1631
+ descendants_index: @descendants_index,
1632
+ alternatives_index: @alternatives_index,
1633
+ structureType: @structureType,
1634
+ ics: @ics,
1635
+ meta: @meta,
1636
+ max_freqs: @max_freqs,
1637
+ dicts: @dicts,
1638
+ profiles: @profiles,
1639
+ items: @items,
1640
+ term_paths: @term_paths}
1641
+ # Convert to JSON format & write
1642
+ File.open(file, "w") { |f| f.write obj_info.to_json }
1643
+ end
1644
+
1645
+
1646
+ def each(att = false)
1647
+ warn('terms empty') if @terms.empty?
1648
+ @terms.each do |id, tags|
1649
+ if att
1650
+ yield(id, tags)
1651
+ else
1652
+ yield(id)
1653
+ end
1654
+ end
1655
+ end
1656
+
1657
+ def get_root
1658
+ roots = []
1659
+ each do |term|
1660
+ roots << term if @ancestors_index[term].nil?
1661
+ end
1662
+ return roots
1663
+ end
1664
+
1665
+ def list_term_attributes
1666
+ terms = []
1667
+ each do |code|
1668
+ terms << [code, translate_id(code), get_term_level(code)]
1669
+ end
1670
+ return terms
1671
+ end
1672
+
1673
+ # Gets ontology levels calculated
1674
+ # ===== Returns
1675
+ # ontology levels calculated
1676
+ def get_ontology_levels
1677
+ return @dicts[:level][:byTerm].clone # By term, in this case, is Key::Level, Value::Terms
1678
+ end
2552
1679
 
2553
- #############################################
2554
- # ACCESS CONTROL
2555
- #############################################
1680
+ private
1681
+
1682
+ def add2hash(hash, key, val)
1683
+ query = hash[key]
1684
+ if query.nil?
1685
+ hash[key] = [val]
1686
+ else
1687
+ query << val
1688
+ end
1689
+ end
1690
+
1691
+ def add2nestHash(h, key1, key2, val)
1692
+ query1 = h[key1]
1693
+ if query1.nil?
1694
+ h[key1] = {key2 => val}
1695
+ else
1696
+ query1[key2] = val
1697
+ end
1698
+ end
2556
1699
 
2557
- attr_reader :file, :header, :stanzas, :ancestors_index, :descendants_index, :special_tags, :alternatives_index, :obsoletes_index, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :profilesDict, :items, :removable_terms, :term_paths
2558
- attr_writer :file, :header, :stanzas, :ancestors_index, :descendants_index, :special_tags, :alternatives_index, :obsoletes_index, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :profilesDict, :items, :removable_terms, :term_paths
1700
+ # Internal function to concat two elements.
1701
+ # ===== Parameters
1702
+ # +itemA+:: item to be concatenated
1703
+ # +itemB+:: item to be concatenated
1704
+ # ===== Returns
1705
+ # Concatenated objects
1706
+ def concatItems(itemA,itemB) # NEED TEST, CHECK WITH PSZ THIS METHOD
1707
+ # A is Array :: RETURN ARRAY
1708
+ # A_array : B_array
1709
+ # A_array : B_hash => NOT ALLOWED
1710
+ # A_array : B_single => NOT ALLOWED
1711
+ # A is Hash :: RETURN HASH
1712
+ # A_hash : B_array => NOT ALLOWED
1713
+ # A_hash : B_hash
1714
+ # A_hash : B_single => NOT ALLOWED
1715
+ # A is single element => RETURN ARRAY
1716
+ # A_single : B_array
1717
+ # A_single : B_hash => NOT ALLOWED
1718
+ # A_single : B_single
1719
+ concatenated = nil
1720
+ if itemA.kind_of?(Array) && itemB.kind_of?(Array)
1721
+ concatenated = itemA | itemB
1722
+ elsif itemA.kind_of?(Hash) && itemB.kind_of?(Hash)
1723
+ concatenated = itemA.merge(itemB) do |k, oldV, newV|
1724
+ self.concatItems(oldV,newV)
1725
+ end
1726
+ elsif itemB.kind_of?(Array)
1727
+ concatenated = ([itemA] + itemB).uniq
1728
+ elsif ![Array, Hash].include?(itemB.class)
1729
+ concatenated = [itemA,itemB].uniq
1730
+ end
1731
+ return concatenated
1732
+ end
2559
1733
  end