semtools 0.1.8 → 0.1.91
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -0
- data/bin/semtools.rb +140 -65
- data/lib/semtools/ontology.rb +1235 -2061
- data/lib/semtools/parsers/file_parser.rb +32 -0
- data/lib/semtools/parsers/json_parser.rb +84 -0
- data/lib/semtools/parsers/oboparser.rb +511 -0
- data/lib/semtools/version.rb +1 -1
- data/lib/semtools.rb +3 -0
- data/semtools.gemspec +1 -1
- metadata +9 -6
- data/lib/semtools/math_methods.rb +0 -148
data/lib/semtools/ontology.rb
CHANGED
@@ -8,45 +8,30 @@ class Ontology
 # AUTHOR NOTES
 #########################################################
 
-# 1 - Store @profiles as @stanzas[:instances]
 # 2 - Items values (not keys) are imported as strings, not as symbols (maybe add a flag which indicates if values are, or not, symbols?)
 
-
 #############################################
 # FIELDS
 #############################################
-# Handled class variables
-# => @@basic_tags :: hash with main OBO structure tags
-# => @@allowed_calcs :: hash with allowed ICs and similaritites calcs
-# => @@symbolizable_ids :: tags which can be symbolized
-# => @@tags_with_trailing_modifiers :: tags which can include extra info after specific text modifiers
-#
 # Handled object variables
-# => @
-# => @stanzas :: OBO stanzas {:terms,:typedefs,:instances}
+# => @terms :: OBO terms descriptions
 # => @ancestors_index :: hash of ancestors per each term handled with any structure relationships
 # => @descendants_index :: hash of descendants per each term handled with any structure relationships
 # => @alternatives_index :: has of alternative IDs (include alt_id and obsoletes)
-# => @obsoletes_index :: hash of obsoletes and it's new ids
-# => @special_tags :: set of special tags to be expanded (:is_a, :obsolete, :alt_id)
 # => @structureType :: type of ontology structure depending on ancestors relationship. Allowed: {atomic, sparse, circular, hierarchical}
-# => @ics :: already calculated ICs for handled terms and IC types
-# => @meta :: meta_information about handled terms like [ancestors, descendants, struct_freq, observed_freq]
-# => @max_freqs :: maximum freqs found for structural and observed freqs
 # => @dicts :: bidirectional dictionaries with three levels <key|value>: 1º) <tag|hash2>; 2º) <(:byTerm/:byValue)|hash3>; 3º) dictionary <k|v>
-# => @profiles :: set of terms assigned to an ID
-# => @profilesDict :: set of profile IDs assigned to a term
-# => @items :: hash with items relations to terms
 # => @removable_terms :: array of terms to not be considered
+# => @meta :: meta_information about handled terms like [ancestors, descendants, struct_freq, observed_freq]
+# => @ics :: already calculated ICs for handled terms and IC types
 # => @term_paths :: metainfo about parental paths of each term
+# => @max_freqs :: maximum freqs found for structural and observed freqs
+# => @items :: hash with items relations to terms
+# => @profiles :: set of terms assigned to an ID
 
-@@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:replaced_by,:consider,:alt_id]}
 @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
-@@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider]
-@@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
-@@multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider, :subsetdef, :synonymtypedef, :property_value, :remark]
-@@symbolizable_ids.concat(@@tags_with_trailing_modifiers)
 
+attr_accessor :terms, :ancestors_index, :descendants_index, :alternatives_index, :obsoletes, :structureType, :ics, :max_freqs, :meta, :dicts, :profiles, :items, :term_paths, :reroot
+
 #############################################
 # CONSTRUCTOR
 #############################################
@@ -58,266 +43,138 @@ class Ontology
 # +removable_terms+: term to be removed from calcs
 # +build+: flag to launch metainfo calculation
 # +file_format+: force format type despite file extension. Can be :obo or :json
-def initialize(file: nil, load_file: false, removable_terms: [], build: true, file_format: nil)
-
-@header = nil
-@stanzas = {terms: {}, typedefs: {}, instances: {}}
+def initialize(file: nil, load_file: false, removable_terms: [], build: true, file_format: nil, extra_dicts: [])
+@terms = {}
 @ancestors_index = {}
 @descendants_index = {}
 @alternatives_index = {}
-@
+@obsoletes = {} # id is obsolete but it could or not have an alt id
 @structureType = nil
 @ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
 @meta = {}
-@special_tags = @@basic_tags.clone
 @max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
 @dicts = {}
 @profiles = {}
-@profilesDict = {}
 @items = {}
-@removable_terms = []
 @term_paths = {}
-
+@reroot = false
 load_file = true unless file.nil? # This should remove load_file argument, keep it for old scripts
 # Load if proceeds
 if load_file
 fformat = file_format
 fformat = File.extname(file) if fformat.nil? && !file.nil?
 if fformat == :obo || fformat == ".obo"
-load(file, build: build)
+OboParser.load(self, file, build: build, black_list: removable_terms, extra_dicts: extra_dicts)
 elsif fformat == :json || fformat == ".json"
-
+JsonParser.load(self, file, build: build)
 elsif !fformat.nil?
 warn 'Format not allowed. Loading process will not be performed'
 end
+precompute if build
 end
 end
 
-
 #############################################
-#
+# GENERATE METADATA FOR ALL TERMS
 #############################################
 
-
-
-
-# +Note+: we extremly recomend use get_related_ids_by_tag function instead of it (directly)
-# ===== Parameters
-# +start+:: term where start to expand
-# +terms+:: set to be used to expand
-# +target_tag+:: tag used to expand
-# +eexpansion+:: already expanded info
-# +split_info_char+:: special regex used to split info (if it is necessary)
-# +split_info_indx+:: special index to take splitted info (if it is necessary)
-# +alt_ids+:: set of alternative IDs
-# ===== Returns
-# A vector with the observed structure (string) and the array with extended terms.
-def self.get_related_ids(start_id, terms, target_tag, related_ids = {}, alt_ids = {})
-# Take start_id term available info and already accumulated info
-current_associations = related_ids[start_id]
-current_associations = [] if current_associations.nil?
-return [:no_term,[]] if terms[start_id].nil?
-id_relations = terms[start_id][target_tag]
-return [:source,[]] if id_relations.nil?
-
-# Prepare auxiliar variables
-struct = :hierarchical
-
-# Study direct extensions
-id_relations = id_relations.clone
-while id_relations.length > 0
-id = id_relations.shift
-id = alt_ids[id].first if alt_ids.include?(id) # NOTE: if you want to persist current ID instead source ID, re-implement this
-
-# Handle
-if current_associations.include?(id) # Check if already have been included into this expansion
-struct = :circular
-else
-current_associations << id
-if related_ids.include?(id) # Check if current already has been expanded
-current_associations = current_associations | related_ids[id]
-if current_associations.include?(start_id) # Check circular case
-struct = :circular
-[id, start_id].each{|repeated| current_associations.delete(repeated)}
-end
-else # Expand
-related_ids[start_id] = current_associations
-structExp, current_related_ids = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids) # Expand current
-current_associations = current_associations | current_related_ids
-struct = :circular if structExp == :circular # Check struct
-if current_associations.include?(start_id) # Check circular case
-struct = :circular
-current_associations.delete(start_id)
-end
-end
-end
-end
-related_ids[start_id] = current_associations
-
-return struct, current_associations
-end
-
-
-# Expand terms using a specific tag and return all extended terms into an array and
-# the relationship structuture observed (hierarchical or circular). If circular structure is
-# foumd, extended array will be an unique vector without starting term (no loops)
-# ===== Parameters
-# +terms+:: set to be used to expand
-# +target_tag+:: tag used to expand
-# +split_info_char+:: special regex used to split info (if it is necessary)
-# +split_info_indx+:: special index to take splitted info (if it is necessary)
-# +alt_ids+:: set of alternative IDs
-# +obsoletes+:: integer with the number of obsolete IDs. used to calculate structure type.
-# ===== Returns
-# A vector with the observed structure (string) and the hash with extended terms
-def self.get_related_ids_by_tag(terms:,target_tag:, alt_ids: {}, obsoletes: 0)
-# Define structure type
-structType = :hierarchical
-related_ids = {}
-terms.each do |id, tags|
-# Check if target tag is defined
-if !tags[target_tag].nil?
-# Obtain related terms
-set_structure, _ = self.get_related_ids(id, terms, target_tag, related_ids, alt_ids)
-# Check structure
-structType = :circular if set_structure == :circular
-end
-end
-
-# Check special case
-structType = :atomic if related_ids.length <= 0
-structType = :sparse if related_ids.length > 0 && ((terms.length - related_ids.length - obsoletes) >= 2)
-# Return type and hash with related_ids
-return structType, related_ids
+def precompute
+get_index_frequencies
+calc_term_levels(calc_paths: true)
 end
 
-
-# Class method to transform string with <tag : info> into hash structure
-# ===== Parameters
-# +attributes+:: array tuples with info to be transformed into hash format
+# Calculates regular frequencies based on ontology structure (using parentals)
 # ===== Returns
-#
-def
-
-
-
-
-
-
-
-
-# Prepare
-tag = tag.lstrip.to_sym
-value.lstrip!
-value = value.split(split_char)[selected_field].to_sym if @@tags_with_trailing_modifiers.include?(tag)
-
-# Store
-query = info_hash[tag]
-if !query.nil? # Tag already exists
-if !query.kind_of?(Array) # Check that tag is multivalue
-raise('Attempt to concatenate plain text with another. The tag is not declared as multivalue. [' + tag.to_s + '](' + query + ')')
-else
-query << value # Add new value to tag
-end
-else # New entry
-if @@multivalue_tags.include?(tag)
-info_hash[tag] = [value]
-else
-info_hash[tag] = value
+# true if everything end without errors and false in other cases
+def get_index_frequencies() # Per each term, add frequencies
+if @ancestors_index.empty?
+warn('ancestors_index object is empty')
+else
+each(att = true) do |id, tags|
+query = @meta[id]
+if query.nil?
+query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
+@meta[id] = query
 end
+query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].length.to_f : 0.0
+query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].length.to_f : 0.0
+query[:struct_freq] = query[:descendants] + 1.0
+# Update maximums
+@max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
+@max_freqs[:max_depth] = query[:descendants] if @max_freqs[:max_depth] < query[:descendants]
 end
 end
-self.symbolize_ids(info_hash)
-return info_hash
 end
 
-
-# Class method to load an OBO format file (based on OBO 1.4 format). Specially focused on load
-# the Header, the Terms, the Typedefs and the Instances.
+# Calculates ontology structural levels for all ontology terms
 # ===== Parameters
-# +
-#
-
-
-
-
-
-
-
-
-
-
-# Read file
-File.open(file).each do |line|
-line.chomp!
-next if line.empty?
-fields = line.split(':', 2)
-# Check if new instance is found
-if stanzas_flags.include?(line)
-header = self.process_entity(header, infoType, stanzas, currInfo)
-# Update info variables
-currInfo = []
-infoType = line.gsub!(/[\[\]]/, '')
-next
+# +calc_paths+:: calculates term paths if it's not already calculated
+# +shortest_path+:: if true, level is calculated with shortest path, largest path will be used in other cases
+def calc_term_levels(calc_paths: false, shortest_path: true)
+self.calc_term_paths if @term_paths.empty? && calc_paths
+if !@term_paths.empty?
+byTerm = {}
+byValue = {}
+@term_paths.each do |term, info|
+level = shortest_path ? info[:shortest_path] : info[:largest_path]
+level = level.nil? ? -1 : level.round(0)
+byTerm[term] = level
+add2hash(byValue, level, term)
 end
-#
-
+@dicts[:level] = {byTerm: byValue, byValue: byTerm} # Note: in this case, value has multiplicity and term is unique value
+@max_freqs[:max_depth] = byValue.keys.max # Update maximum depth
 end
-# Store last loaded info
-header = self.process_entity(header, infoType, stanzas, currInfo) if !currInfo.empty?
-
-# Prepare to return
-finfo = {:file => file, :name => File.basename(file, File.extname(file))}
-return finfo, header, stanzas
 end
 
-
-#
-
-
-
-
-
-
-
-
-
-
-header = info
-else
-id = info[:id]
-case infoType
-when 'Term'
-stanzas[:terms][id] = info
-when 'Typedef'
-stanzas[:typedefs][id] = info
-when 'Instance'
-stanzas[:instances][id] = info
+# Find paths of a term following it ancestors and stores all possible paths for it and it's parentals.
+# Also calculates paths metadata and stores into @term_paths
+def calc_term_paths
+@term_paths = {}
+if [:hierarchical, :sparse].include? @structureType
+each do |term|
+expand_path(term)
+path_attr = @term_paths[term]
+# expand_path is arecursive function so these pat attributes must be calculated once the recursion is finished
+path_attr[:total_paths] = path_attr[:paths].length
+paths_sizes = path_attr[:paths].map{|path| path.length}
+path_attr[:largest_path] = paths_sizes.max
+path_attr[:shortest_path] = paths_sizes.min
 end
+else
+warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
 end
-return header
 end
 
-
-# Symboliza all values into hashs using symbolizable tags as keys
+# Recursive function whic finds paths of a term following it ancestors and stores all possible paths for it and it's parentals
 # ===== Parameters
-# +
-
-
-
-
-
-
-
-
+# +curr_term+:: current visited term
+# +visited_terms+:: already expanded terms
+def expand_path(curr_term)
+if !@term_paths.include?(curr_term)
+path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []}
+@term_paths[curr_term] = path_attr
+direct_parentals = @dicts[:is_a][:byTerm][curr_term]
+if direct_parentals.nil? # No parents :: End of recurrence
+path_attr[:paths] << [curr_term]
+else # Expand and concat
+direct_parentals.each do |ancestor|
+path_attr_parental = @term_paths[ancestor]
+if path_attr_parental.nil? # Calculate new paths
+self.expand_path(ancestor)
+new_paths = @term_paths[ancestor][:paths]
+else # Use direct_parental paths already calculated
+new_paths = path_attr_parental[:paths]
+end
+path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(curr_term)})
 end
 end
 end
 end
 
+#############################################
+# CLASS METHODS (TODO: TO BE TRANFORMED IN INSTANCE METHODS)
+#############################################
 
-#
 # ===== Parameters
 # +root+:: main term to expand
 # +ontology+:: to be cutted
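Note: the hunk above replaces the old class-level OBO expansion helpers with the instance-level precompute, get_index_frequencies, calc_term_levels and calc_term_paths methods. A minimal usage sketch, not taken from the package itself; the file name and term ID are hypothetical, and an OBO file with is_a relations is assumed:

  require 'semtools'
  ont = Ontology.new(file: 'my_ontology.obo')   # build: true by default, so precompute runs at load time
  ont.calc_term_levels(calc_paths: true)        # redundant after precompute; shown for clarity
  paths = ont.term_paths[:"TERM:0000001"]       # paths metadata filled by calc_term_paths/expand_path
  puts paths[:shortest_path] unless paths.nil?  # the metric calc_term_levels uses as the term level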
@@ -325,18 +182,32 @@ class Ontology
 # +remove_up+:: if true, stores only the root term given an it descendants. If false, only root ancestors will be stored
 # ===== Returns
 # An Ontology object with terms after cut the ontology.
-def self.mutate(root, ontology, clone: true, remove_up: true)
+def self.mutate(root, ontology, clone: true, remove_up: true) #TODO, pending to fix and pass to instance method
 ontology = ontology.clone if clone
 # Obtain affected IDs
 descendants = ontology.descendants_index[root]
 descendants << root # Store itself to do not remove it
 # Remove unnecesary terms
-
+terms = ontology.terms.select{|id,v| remove_up ? descendants.include?(id) : !descendants.include?(id)}
+ids = terms.keys
+terms.each do |id, term|
+term[:is_a] = term[:is_a] & ids # Clean parental relations to keep only whose that exist between selected terms
+end
+ontology.terms = terms
 ontology.ics = Hash[@@allowed_calcs[:ics].map{|ictype| [ictype, {}]}]
 ontology.max_freqs = {:struct_freq => -1.0, :observed_freq => -1.0, :max_depth => -1.0}
 ontology.dicts = {}
-ontology.removable_terms = []
 ontology.term_paths = {}
+ontology.reroot = true
+
+ontology.ancestors_index = {}
+ontology.descendants_index = {}
+ontology.alternatives_index = {}
+ontology.meta = {}
+ontology.profiles = {}
+ontology.items = {}
+
+
 # Recalculate metadata
 ontology.build_index
 ontology.add_observed_terms_from_profiles
@@ -344,33 +215,13 @@ class Ontology
 return ontology
 end
 
-
-
 #############################################
-#
+# TERM METHODS
 #############################################
 
-#
-
-# +terms+:: terms array to be concatenated
-def add_removable_terms(terms)
-terms = terms.map{|term| term.to_sym}
-@removable_terms.concat(terms)
-end
-
-
-# Include removable terms to current removable terms list loading new
-# terms from a one column plain text file
-# ===== Parameters
-# +file+:: to be loaded
-def add_removable_terms_from_file(file)
-File.open(excluded_codes_file).each do |line|
-line.chomp!
-@removable_terms << line.to_sym
-end
-end
+# I/O observed term from data
+####################################
 
-
 # Increase observed frequency for a specific term
 # ===== Parameters
 # +term+:: term which frequency is going to be increased
@@ -378,15 +229,7 @@ class Ontology
 # ===== Return
 # true if process ends without errors, false in other cases
 def add_observed_term(term:,increase: 1.0)
-
-raise ArgumentError, "Term given is NIL" if term.nil?
-return false unless @stanzas[:terms].include?(term)
-return false if @removable_terms.include?(term)
-if @alternatives_index.include?(term)
-alt_id = @alternatives_index[term]
-@meta[alt_id] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[alt_id].nil?
-@meta[term] = @meta[alt_id]
-end
+return false unless term_exist?(term)
 # Check if exists
 @meta[term] = {:ancestors => -1.0,:descendants => -1.0,:struct_freq => 0.0,:observed_freq => 0.0} if @meta[term].nil?
 # Add frequency
@@ -397,345 +240,199 @@ class Ontology
 return true
 end
 
+# Obtain level and term relations
+####################################
 
-# Increase the arbitrary frequency of a given term set
 # ===== Parameters
-# +
-# +
-#
-#
-
-
-
-
-
-
-
-
-
-checks = terms.map{|id| self.add_observed_term(term: id,increase: increase)}
+# +term+:: which are requested
+# +relation+:: can be :ancestor or :descendant
+# ===== Returns
+# Direct ancestors/descendants of given term or nil if any error occurs
+def get_direct_related(term, relation)
+target = nil
+case relation
+when :ancestor
+target = :byTerm
+when :descendant
+target = :byValue
+else
+warn('Relation type not allowed. Returning nil')
 end
-
+query = @dicts.dig(:is_a, target, term)
+return query
 end
 
-
-#
+# Return direct ancestors/descendants of a given term
+# Return direct ancestors of a given term
 # ===== Parameters
-# +
-#
-#
-
-
-# ===== Return
-# similitude calculated
-def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true, store_mica: false)
-# Check
-raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
-raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
-micasA = []
-# Compare A -> B
-termsA.each do |tA|
-micas = []
-termsB.each do |tB|
-if store_mica
-value = @mica_index.dig(tA, tB)
-else
-value = nil
-end
-if value.nil?
-value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)
-if store_mica
-value = true if value.nil? # We use true to save that the operation was made but there is not mica value
-add2nestHash(@mica_index, tA, tB, value)
-end
-end
-micas << value if value.class == Float
-end
-if !micas.empty?
-micasA << micas.max # Obtain maximum value
-else
-micasA << 0
-end
-end
-means_sim = micasA.inject{ |sum, el| sum + el }.fdiv(micasA.size)
-# Compare B -> A
-if bidirectional
-means_simA = means_sim * micasA.size
-means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false, store_mica: store_mica) * termsB.size
-means_sim = (means_simA + means_simB).fdiv(termsA.size + termsB.size)
-end
-# Return
-return means_sim
+# +term+:: which ancestors are requested
+# ===== Returns
+# Direct ancestors of given term or nil if any error occurs
+def get_direct_ancentors(term)
+return self.get_direct_related(term, :ancestor)
 end
 
-
-
-
-
-
-
-
+# Return direct descendants of a given term
+# ===== Parameters
+# +term+:: which descendants are requested
+# ===== Returns
+# Direct descendants of given term or nil if any error occurs
+def get_direct_descendants(term)
+return self.get_direct_related(term, :descendant)
 end
 
-#
+# Find ancestors/descendants of a given term
 # ===== Parameters
-# +
-# +
-#
-#
-
-
-
-
-profiles_ids = @profiles.keys
-if external_profiles.nil?
-comp_ids = profiles_ids
-comp_profiles = @profiles
-main_ids = comp_ids
-main_profiles = comp_profiles
+# +term+:: to be checked
+# +return_ancestors+:: return ancestors if true or descendants if false
+# ===== Returns
+# an array with all ancestors/descendants of given term or nil if parents are not available yet
+def get_familiar(term, return_ancestors = true)
+familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
+if !familiars.nil?
+familiars = familiars.clone
 else
-
-comp_profiles = external_profiles
-main_ids = profiles_ids
-main_profiles = @profiles
-end
-# Compare
-@mica_index = {}
-while !main_ids.empty?
-curr_id = main_ids.shift
-current_profile = main_profiles[curr_id]
-comp_ids.each do |id|
-profile = comp_profiles[id]
-value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional, store_mica: true)
-query = profiles_similarity[curr_id]
-if query.nil?
-profiles_similarity[curr_id] = {id => value}
-else
-query[id] = value
-end
-end
+familiars = []
 end
-return
+return familiars
 end
 
+# Find ancestors of a given term
+# ===== Parameters
+# +term+:: to be checked
+# ===== Returns
+# an array with all ancestors of given term or false if parents are not available yet
+def get_ancestors(term)
+return self.get_familiar(term, true)
+end
 
-#
+# Find descendants of a given term
 # ===== Parameters
-# +
+# +term+:: to be checked
 # ===== Returns
-#
-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# an array with all descendants of given term or false if parents are not available yet
+def get_descendants(term)
+return self.get_familiar(term, false)
+end
+
+# Gets ontology level of a specific term
+# ===== Returns
+# Term level
+def get_term_level(term)
+return @dicts[:level][:byValue][term]
+end
+
+# nil, term not found, [] term exists but not has parents
+def get_parental_path(term, which_path = :shortest_path, level = 0)
+path = nil
+path_attr = @term_paths[term]
+if !path_attr.nil?
+path_length = path_attr[which_path]
+all_paths = path_attr[:paths]
+if all_paths.empty?
+path = []
+else
+path = all_paths.select{|pt| pt.length == path_length}.first.clone
+if level > 0 # we want the term and his ascendants until a specific level
+n_parents = path_length - level
+path = path[0..n_parents]
 end
+path.shift # Discard the term itself
 end
 end
-
+return path
 end
 
+# ID Handlers
+####################################
 
-# Executes basic expansions of tags (alternatives, obsoletes and parentals) with default values
 # ===== Returns
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-self.calc_dictionary(:synonym, select_regex: /\"(.*)\"/)
-self.calc_term_levels(calc_paths: true)
+# the main ID assigned to a given ID. If it's a non alternative/obsolete ID itself will be returned
+# ===== Parameters
+# +id+:: to be translated
+# ===== Return
+# main ID related to a given ID. Returns nil if given ID is not an allowed ID
+def get_main_id(id)
+mainID = @alternatives_index[id]
+return nil if !term_exist?(id) && mainID.nil?
+if !mainID.nil? # Recursive code to get the definitive final term id if there are several alt_id in chain
+new_id = get_main_id(mainID)
+if new_id != mainID
+new_id = get_main_id(new_id)
+end
+id = new_id
+end
+return id
 end
 
-
-#
-#
-#
-
-
-
-
-
-
-
-if @alternatives_index.include?(id)
-alt_id = @alternatives_index[id]
-query = @meta[alt_id] # Check if exist
-if query.nil?
-query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
-@meta[alt_id] = query
-end
-@meta[id] = query
-# Note: alternative terms do not increase structural frequencies
-else # Official term
-query = @meta[id] # Check if exist
-if query.nil?
-query = {ancestors: 0.0, descendants: 0.0, struct_freq: 0.0, observed_freq: 0.0}
-@meta[id] = query
-end
-# Store metadata
-query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !@alternatives_index.include?(anc)}.to_f : 0.0
-query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !@alternatives_index.include?(desc)}.to_f : 0.0
-query[:struct_freq] = query[:descendants] + 1.0
-# Update maximums
-@max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
-@max_freqs[:max_depth] = query[:descendants] if @max_freqs[:max_depth] < query[:descendants]
-end
-end
-end
+# Translate a given value using an already calcualted dictionary
+# ===== Parameters
+# +toTranslate+:: value to be translated using dictiontionary
+# +tag+:: used to generate the dictionary
+# +byValue+:: boolean flag to indicate if dictionary must be used values as keys or terms as keys. Default: values as keys = true
+# ===== Return
+# translation
+def translate(toTranslate, tag, byValue: true)
+dict = byValue ? @dicts[tag][:byValue] : @dicts[tag][:byTerm]
+toTranslate = get_main_id(toTranslate) if !byValue
+return dict[toTranslate]
 end
 
-
-# Expand obsoletes set and link info to their alternative IDs
+# Translate a name given
 # ===== Parameters
-# +
-#
-#
-
-
-
-
-warn('stanzas terms empty')
-else
-# Check obsoletes
-@stanzas[:terms].each do |id, term_tags|
-next if term_tags.nil?
-next if self.is_alternative?(id)
-query = term_tags[obs_tag]
-if !query.nil? && query == 'true' # Obsolete tag presence
-next if !@obsoletes_index[id].nil? # Already stored
-# Check if alternative value is available
-alt_ids = alt_tags.map{|alt| term_tags[alt]}.compact
-if !alt_ids.empty?
-alt_id = alt_ids.first.first #FIRST tag, FIRST id
-# Store
-@alternatives_index[id] = alt_id
-@obsoletes_index[id] = alt_id
-end
-end
-end
-end
+# +name+:: to be translated
+# ===== Return
+# translated name or nil if it's not stored into this ontology
+def translate_name(name)
+term = self.translate(name, :name)
+term = self.translate(name, :synonym) if term.nil?
+return term
 end
 
-
-# Expand parentals set and link all info to their alternative IDs. Also launch frequencies process
+# Translates a given ID to it assigned name
 # ===== Parameters
-# +
-#
-#
-
-
-
-# Check
-if @stanzas[:terms].nil?
-warn('stanzas terms empty')
-else
-# Expand
-structType, parentals = self.class.get_related_ids_by_tag(terms: @stanzas[:terms],
-target_tag: tag,
-alt_ids: @alternatives_index,
-obsoletes: @obsoletes_index.length)
-# Check
-raise('Error expanding parentals') if (structType.nil?) || parentals.nil?
-# Prepare ancestors structure
-anc = {}
-des = {}
-parentals.each do |id, parents|
-parents = parents - @removable_terms
-anc[id] = parents
-parents.each do |anc_id| # Add descendants
-if !des.include?(anc_id)
-des[anc_id] = [id]
-else
-des[anc_id] << id
-end
-end
-end
-# Store alternatives
-# @alternatives_index.each do |id,alt|
-# anc[id] = anc[alt] if anc.include?(alt)
-# des[id] = des[alt] if des.include?(alt)
-# end
-# Check structure
-if ![:atomic,:sparse].include? structType
-structType = structType == :circular ? :circular : :hierarchical
-end
-# Store
-@ancestors_index = anc
-@descendants_index = des
-@structureType = structType
-end
-# Finish
+# +id+:: to be translated
+# ===== Return
+# main name or nil if it's not included into this ontology
+def translate_id(id)
+name = self.translate(id, :name, byValue: false)
+return name.nil? ? nil : name.first
 end
 
+# Get term frequency and information
+####################################
 
-#
+# One single term #
+
+# Get a term frequency
 # ===== Parameters
-# +term+:: to be checked
-# +
+# +term+:: term to be checked
+# +type+:: type of frequency to be returned. Allowed: [:struct_freq, :observed_freq]
 # ===== Returns
-#
-def
-
+# frequency of term given or nil if term is not allowed
+def get_frequency(term, type: :struct_freq)
+queryFreq = @meta[term]
+return queryFreq.nil? ? nil : queryFreq[type]
 end
 
-
-# Find descendants of a given term
+# Geys structural frequency of a term given
 # ===== Parameters
 # +term+:: to be checked
-# +filter_alternatives+:: if true, remove alternatives from final results
 # ===== Returns
-#
-def
-return self.
+# structural frequency of given term or nil if term is not allowed
+def get_structural_frequency(term)
+return self.get_frequency(term, type: :struct_freq)
 end
 
-
-# Find ancestors/descendants of a given term
+# Gets observed frequency of a term given
 # ===== Parameters
 # +term+:: to be checked
-# +return_ancestors+:: return ancestors if true or descendants if false
-# +filter_alternatives+:: if true, remove alternatives from final results
 # ===== Returns
-#
-def
-
-familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
-if !familiars.nil?
-familiars = familiars.clone
-if filter_alternatives
-familiars.reject!{|fm| @alternatives_index.include?(fm)}
-end
-else
-familiars = []
-end
-return familiars
+# observed frequency of given term or nil if term is not allowed
+def get_observed_frequency(term)
+return self.get_frequency(term, type: :observed_freq)
 end
 
-
 # Obtain IC of an specific term
 # ===== Parameters
 # +term+:: which IC will be calculated
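Sketch of the traversal/translation instance API added in the hunk above, assuming the :name and :synonym dictionaries were built at load time; the ont object and GO-style IDs are placeholders (get_direct_ancentors is the method name exactly as spelled in the source):

  parents = ont.get_direct_ancentors(:"GO:0000002")      # direct :is_a parents via @dicts
  ancestors = ont.get_ancestors(:"GO:0000002")           # full closure from @ancestors_index
  name = ont.translate_id(:"GO:0000002")                 # ID -> first stored name
  id = ont.translate_name('mitochondrion organization')  # name (or synonym) -> ID
  freq = ont.get_structural_frequency(:"GO:0000002")     # descendants + 1, read from @meta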
@@ -789,7 +486,7 @@ class Ontology
 ###########################################
 when :seco, :zhou # SECO:: An intrinsic information content metric for semantic similarity in WordNet
 # 1 - ( log(hypo(x) + 1) / log(max_nodes) )
-ic = 1 - Math.log10(term_meta[:struct_freq]).fdiv(Math.log10(@
+ic = 1 - Math.log10(term_meta[:struct_freq]).fdiv(Math.log10(@terms.length))
 if :zhou # New Model of Semantic Similarity Measuring in Wordnet
 # k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
 @ics[:seco][term] = ic # Special store
@@ -803,40 +500,25 @@ class Ontology
 return ic
 end
 
+# Term vs Term #
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-resnik = @ics[:resnik].select{|k,v| observed_terms.include?(k)}
-resnik_observed = @ics[:resnik_observed].select{|k,v| observed_terms.include?(k)}
+def get_LCA(termA, termB, lca_index: false)
+lca = []
+if lca_index
+res = @lca_index.dig(termA, termB)
+lca = [res] if !res.nil?
+else # Obtain ancestors (include itselfs too)
+anc_A = self.get_ancestors(termA)
+anc_B = self.get_ancestors(termB)
+if !(anc_A.empty? && anc_B.empty?)
+anc_A << termA
+anc_B << termB
+lca = anc_A & anc_B
+end
 end
-return
-end
-
-
-# Find the IC of the Most Index Content shared Ancestor (MICA) of two given terms
-# ===== Parameters
-# +termA+:: term to be cheked
-# +termB+:: term to be checked
-# +ic_type+:: IC formula to be used
-# ===== Returns
-# the IC of the MICA(termA,termB)
-def get_ICMICA(termA, termB, ic_type = :resnik)
-term, ic = self.get_MICA(termA, termB, ic_type)
-return term.nil? ? nil : ic
+return lca
 end
 
-
 # Find the Most Index Content shared Ancestor (MICA) of two given terms
 # ===== Parameters
 # +termA+:: term to be cheked
@@ -844,30 +526,31 @@ class Ontology
 # +ic_type+:: IC formula to be used
 # ===== Returns
 # the MICA(termA,termB) and it's IC
-def get_MICA(termA, termB, ic_type = :resnik)
-termA = @alternatives_index[termA] if @alternatives_index.include?(termA)
-termB = @alternatives_index[termB] if @alternatives_index.include?(termB)
+def get_MICA(termA, termB, ic_type = :resnik, lca_index = false)
 mica = [nil,-1.0]
-# Special case
-if termA.eql?(termB)
+if termA.eql?(termB) # Special case
 ic = self.get_IC(termA, type: ic_type)
 mica = [termA, ic]
-else
-#
-
-
-if !(anc_A.empty? && anc_B.empty?)
-anc_A << termA
-anc_B << termB
-(anc_A & anc_B).each do |anc| # Find MICA in shared ancestors
-ic = self.get_IC(anc, type: ic_type)
-mica = [anc,ic] if ic > mica[1]
-end
+else
+get_LCA(termA, termB, lca_index: lca_index).each do |lca| # Find MICA in shared ancestors
+ic = self.get_IC(lca, type: ic_type)
+mica = [lca, ic] if ic > mica[1]
 end
 end
 return mica
 end
 
+# Find the IC of the Most Index Content shared Ancestor (MICA) of two given terms
+# ===== Parameters
+# +termA+:: term to be cheked
+# +termB+:: term to be checked
+# +ic_type+:: IC formula to be used
+# ===== Returns
+# the IC of the MICA(termA,termB)
+def get_ICMICA(termA, termB, ic_type = :resnik)
+term, ic = self.get_MICA(termA, termB, ic_type)
+return term.nil? ? nil : ic
+end
 
 # Calculate similarity between two given terms
 # ===== Parameters
@@ -877,11 +560,10 @@ class Ontology
 # +ic_type+:: IC formula to be used
 # ===== Returns
 # the similarity between both sets or false if frequencies are not available yet
-def get_similarity(termA, termB, type: :resnik, ic_type: :resnik)
-# Check
+def get_similarity(termA, termB, type: :resnik, ic_type: :resnik, lca_index: false)
 raise ArgumentError, "SIM type specified (#{type}) is not allowed" if !@@allowed_calcs[:sims].include?(type)
 sim = nil
-mica, sim_res = get_MICA(termA, termB, ic_type)
+mica, sim_res = get_MICA(termA, termB, ic_type, lca_index)
 if !mica.nil?
 case type
 when :resnik
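Hedged sketch of the refactored MICA/similarity flow (term IDs hypothetical; :resnik appears in @@allowed_calcs for both ICs and sims):

  mica, ic = ont.get_MICA(:"GO:0000002", :"GO:0007005")  # shared-ancestor set now comes from get_LCA
  sim = ont.get_similarity(:"GO:0000002", :"GO:0007005", type: :resnik, ic_type: :resnik)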
@@ -895,1631 +577,1027 @@ class Ontology
|
|
895
577
|
return sim
|
896
578
|
end
|
897
579
|
|
580
|
+
# Checking valid terms
|
581
|
+
####################################
|
898
582
|
|
899
|
-
|
900
|
-
|
901
|
-
# ===== Parameters
|
902
|
-
# +file+:: optional file to update object stored file
|
903
|
-
def load(file, build: true)
|
904
|
-
_, header, stanzas = self.class.load_obo(file)
|
905
|
-
@header = header
|
906
|
-
@stanzas = stanzas
|
907
|
-
self.remove_removable()
|
908
|
-
# @removable_terms.each{|removableID| @stanzas[:terms].delete(removableID)} if !@removable_terms.empty? # Remove if proceed
|
909
|
-
self.build_index() if build
|
583
|
+
def term_exist?(id)
|
584
|
+
return @terms.include?(id)
|
910
585
|
end
|
911
586
|
|
912
|
-
#
|
913
|
-
def
|
914
|
-
|
587
|
+
# Check if a term given is marked as obsolete
|
588
|
+
def is_obsolete?(term)
|
589
|
+
return @obsoletes.include?(term)
|
915
590
|
end
|
916
591
|
|
592
|
+
#############################################
|
593
|
+
# ITEMS METHODS
|
594
|
+
#############################################
|
595
|
+
|
596
|
+
# I/O Items
|
597
|
+
####################################
|
917
598
|
|
918
|
-
#
|
599
|
+
# Store specific relations hash given into ITEMS structure
|
919
600
|
# ===== Parameters
|
920
|
-
# +
|
921
|
-
|
922
|
-
|
923
|
-
|
924
|
-
|
925
|
-
|
926
|
-
|
927
|
-
|
928
|
-
|
929
|
-
|
930
|
-
|
931
|
-
|
932
|
-
|
933
|
-
|
934
|
-
|
935
|
-
|
936
|
-
|
937
|
-
items: @items,
|
938
|
-
removable_terms: @removable_terms,
|
939
|
-
term_paths: @term_paths}
|
940
|
-
# Convert to JSON format & write
|
941
|
-
File.open(file, "w") { |f| f.write obj_info.to_json }
|
942
|
-
end
|
601
|
+
# +relations+:: hash to be stored
|
602
|
+
# +remove_old_relations+:: substitute ITEMS structure instead of merge new relations
|
603
|
+
# +expand+:: if true, already stored keys will be updated with the unique union of both sets
|
604
|
+
def load_item_relations_to_terms(relations, remove_old_relations = false, expand = false)
|
605
|
+
@items = {} if remove_old_relations
|
606
|
+
relations.each do |term, items|
|
607
|
+
if !term_exist?(term)
|
608
|
+
warn('Some terms specified are not stored into this ontology. These not correct terms will be stored too')
|
609
|
+
break
|
610
|
+
end
|
611
|
+
end
|
612
|
+
if expand
|
613
|
+
@items = self.concatItems(@items, relations)
|
614
|
+
else
|
615
|
+
@items.merge!(relations)
|
616
|
+
end
|
617
|
+
end
|
943
618
|
|
619
|
+
# Defining Items from instance variables
|
620
|
+
########################################
|
944
621
|
|
945
|
-
|
946
|
-
|
622
|
+
# Assign a dictionary already calculated as a items set.
|
623
|
+
# ===== Parameters
|
624
|
+
# +dictID+:: dictionary ID to be stored (:byTerm will be used)
|
625
|
+
def set_items_from_dict(dictID, remove_old_relations = false)
|
626
|
+
@items = {} if remove_old_relations
|
627
|
+
query = @dicts[dictID]
|
628
|
+
if !query.nil?
|
629
|
+
@items.merge!(query[:byTerm])
|
630
|
+
else
|
631
|
+
warn('Specified ID is not calculated. Dict will not be added as a items set')
|
632
|
+
end
|
947
633
|
end
|
948
634
|
|
949
|
-
|
950
|
-
# Read a JSON file with an OBO_Handler object stored
|
635
|
+
# Get related profiles to a given term
|
951
636
|
# ===== Parameters
|
952
|
-
# +
|
953
|
-
#
|
954
|
-
#
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
959
|
-
|
960
|
-
|
961
|
-
|
962
|
-
|
963
|
-
if info.kind_of?(Array) && @@symbolizable_ids.include?(entry)
|
964
|
-
[entry,info.map{|item| item.to_sym}]
|
965
|
-
else
|
966
|
-
[entry,info]
|
967
|
-
end
|
968
|
-
end
|
969
|
-
jsonInfo[:header] = aux.to_h
|
970
|
-
end
|
971
|
-
jsonInfo[:stanzas][:terms].map{|id,info| self.class.symbolize_ids(info)} # STANZAS
|
972
|
-
jsonInfo[:stanzas][:typedefs].map{|id,info| self.class.symbolize_ids(info)}
|
973
|
-
jsonInfo[:stanzas][:instances].map{|id,info| self.class.symbolize_ids(info)}
|
974
|
-
# Optional
|
975
|
-
jsonInfo[:alternatives_index] = jsonInfo[:alternatives_index].map{|id,value| [id, value.to_sym]}.to_h unless jsonInfo[:alternatives_index].nil?
|
976
|
-
jsonInfo[:ancestors_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}} unless jsonInfo[:ancestors_index].nil?
|
977
|
-
jsonInfo[:descendants_index].map {|id,family_arr| family_arr.map!{|item| item.to_sym}} unless jsonInfo[:descendants_index].nil?
|
978
|
-
jsonInfo[:obsoletes_index] = jsonInfo[:obsoletes_index].map{|id,value| [id, value.to_sym]}.to_h unless jsonInfo[:obsoletes_index].nil?
|
979
|
-
jsonInfo[:dicts] = jsonInfo[:dicts].each do |flag, dictionaries|
|
980
|
-
next if dictionaries.nil?
|
981
|
-
# Special case: byTerm
|
982
|
-
dictionaries[:byTerm] = dictionaries[:byTerm].map do |term, value|
|
983
|
-
if !term.to_s.scan(/\A[-+]?[0-9]*\.?[0-9]+\Z/).empty? # Numeric dictionary
|
984
|
-
[term.to_s.to_i, value.map{|term| term.to_sym}]
|
985
|
-
elsif value.is_a? Numeric # Numeric dictionary
|
986
|
-
[term.to_sym, value]
|
987
|
-
elsif value.kind_of?(Array) && flag == :is_a
|
988
|
-
[term.to_sym, value.map{|v| v.to_sym}]
|
989
|
-
else
|
990
|
-
[term.to_sym, value]
|
991
|
-
end
|
992
|
-
end
|
993
|
-
dictionaries[:byTerm] = dictionaries[:byTerm].to_h
|
994
|
-
# By value
|
995
|
-
dictionaries[:byValue] = dictionaries[:byValue].map do |value, term|
|
996
|
-
if value.is_a? Numeric # Numeric dictionary
|
997
|
-
[value, term.to_sym]
|
998
|
-
elsif term.is_a? Numeric # Numeric dictionary
|
999
|
-
[value.to_s.to_sym, term]
|
1000
|
-
elsif flag == :is_a
|
1001
|
-
[value.to_sym, term.map{|v| v.to_sym}]
|
1002
|
-
elsif term.kind_of?(Array)
|
1003
|
-
[value.to_sym, term.map{|t| t.to_sym}]
|
1004
|
-
else
|
1005
|
-
[value.to_s, term.to_sym]
|
1006
|
-
end
|
1007
|
-
end
|
1008
|
-
dictionaries[:byValue] = dictionaries[:byValue].to_h
|
1009
|
-
end
|
1010
|
-
if !jsonInfo[:profiles].nil?
|
1011
|
-
jsonInfo[:profiles].map{|id,terms| terms.map!{|term| term.to_sym}}
|
1012
|
-
jsonInfo[:profiles].keys.map{|id| jsonInfo[:profiles][id.to_s.to_i] = jsonInfo[:profiles].delete(id) if self.is_number?(id.to_s)}
|
1013
|
-
end
|
1014
|
-
jsonInfo[:profilesDict].map{|term,ids| ids.map!{|id| id.to_sym if !id.is_a?(Numeric)}} unless jsonInfo[:profilesDict].nil?
|
1015
|
-
jsonInfo[:removable_terms] = jsonInfo[:removable_terms].map{|term| term.to_sym} unless jsonInfo[:removable_terms].nil?
|
1016
|
-
jsonInfo[:special_tags] = jsonInfo[:special_tags].each do |k, v|
|
1017
|
-
next if v.nil?
|
1018
|
-
if v.kind_of?(Array)
|
1019
|
-
jsonInfo[:special_tags][k] = v.map{|tag| tag.to_sym}
|
1020
|
-
else
|
1021
|
-
jsonInfo[:special_tags][k] = v.to_sym
|
1022
|
-
end
|
637
|
+
# +term+:: to be checked
|
638
|
+
# ===== Returns
|
639
|
+
# profiles which contains given term
|
640
|
+
def get_items_from_term(term)
|
641
|
+
return @items[term]
|
642
|
+
end
|
643
|
+
|
644
|
+
# For each term in profiles add the ids in the items term-id dictionary
|
645
|
+
def get_items_from_profiles
|
646
|
+
@profiles.each do |id, terms|
|
647
|
+
terms.each {|term| add2hash(@items, term, id) }
|
1023
648
|
end
|
1024
|
-
jsonInfo[:items].each{|k,v| jsonInfo[:items][k] = v.map{|item| item.to_sym}} unless jsonInfo[:items].nil?
|
1025
|
-
jsonInfo[:term_paths].each{|term,info| jsonInfo[:term_paths][term][:paths] = info[:paths].map{|path| path.map{|t| t.to_sym}}} unless jsonInfo[:term_paths].nil?
|
1026
|
-
|
1027
|
-
# Store info
|
1028
|
-
@header = jsonInfo[:header]
|
1029
|
-
@stanzas = jsonInfo[:stanzas]
|
1030
|
-
@ancestors_index = jsonInfo[:ancestors_index]
|
1031
|
-
@descendants_index = jsonInfo[:descendants_index]
|
1032
|
-
@alternatives_index = jsonInfo[:alternatives_index]
|
1033
|
-
@obsoletes_index = jsonInfo[:obsoletes_index]
|
1034
|
-
jsonInfo[:structureType] = jsonInfo[:structureType].to_sym unless jsonInfo[:structureType].nil?
|
1035
|
-
@structureType = jsonInfo[:structureType]
|
1036
|
-
@ics = jsonInfo[:ics]
|
1037
|
-
@meta = jsonInfo[:meta]
|
1038
|
-
@special_tags = jsonInfo[:special_tags]
|
1039
|
-
@max_freqs = jsonInfo[:max_freqs]
|
1040
|
-
@dicts = jsonInfo[:dicts]
|
1041
|
-
@profiles = jsonInfo[:profiles]
|
1042
|
-
@profilesDict = jsonInfo[:profilesDict]
|
1043
|
-
@items = jsonInfo[:items]
|
1044
|
-
@removable_terms = jsonInfo[:removable_terms]
|
1045
|
-
@term_paths = jsonInfo[:term_paths]
|
1046
|
-
|
1047
|
-
self.build_index() if build
|
1048
|
-
end
|
1049
|
-
|
1050
|
-
|
1051
|
-
# Check if a given ID is stored as term into this object
|
1052
|
-
# ===== Parameters
|
1053
|
-
# +id+:: to be checked
|
1054
|
-
# ===== Return
|
1055
|
-
# True if term is allowed or false in other cases
|
1056
|
-
def exists? id
|
1057
|
-
return stanzas[:terms].include?(id)
|
1058
649
|
end
|
1059
650
|
|
651
|
+
# Defining instance variables from items
|
652
|
+
########################################
|
1060
653
|
|
1061
|
-
|
1062
|
-
|
1063
|
-
|
1064
|
-
|
1065
|
-
# The correct ID if it can be found or nil in other cases
|
1066
|
-
def extract_id(text, splitBy: ' ')
|
1067
|
-
if self.exists?(text)
|
1068
|
-
return text
|
1069
|
-
else
|
1070
|
-
splittedText = text.to_s.split(splitBy).first.to_sym
|
1071
|
-
return self.exists?(splittedText) ? splittedText : nil
|
654
|
+
def get_profiles_from_items
|
655
|
+
new_profiles = {}
|
656
|
+
@items.each do |term, ids|
|
657
|
+
ids.each{|id| add2hash(new_profiles, id, term) }
|
1072
658
|
end
|
659
|
+
@profiles = new_profiles
|
1073
660
|
end
|
1074
661
|
|
662
|
+
# Expanding items
|
663
|
+
 ####################################
 
- #
- #
- # This functions stores first value for multivalue tags
- # This function does not handle synonyms for byValue dictionaries
+ # This method computes child similarity and imputes items to their parentals. To do that, item keys must be terms allowed by this ontology.
+ # Similarity is calculated by exact text matching unless an ontology object is provided; in that case, MICAs are used.
  # ===== Parameters
- # +
- # +
- # +
- #
- #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- queryTag = queryTag.scan(select_regex).first
- end
- queryTag.compact!
- end
- if queryTag.kind_of?(Array) # Store
- if !queryTag.empty?
- if byTerm.include?(referenceTerm)
- byTerm[referenceTerm] = (byTerm[referenceTerm] + queryTag).uniq
- else
- byTerm[referenceTerm] = queryTag
+ # +ontology+:: (optional) ontology object to which the given items belong
+ # +minimum_childs+:: minimum number of children needed to infer relations to a parental. Default: 2
+ # +clean_profiles+:: if true, the clean_profile ontology method is applied over inferred profiles. Only used if an ontology object is provided
+ # ===== Returns
+ # void; updates the items object
+ def expand_items_to_parentals(ontology: nil, minimum_childs: 2, clean_profiles: true)
+ targetKeys = expand_profile_with_parents(@items.keys)
+ terms_per_level = list_terms_per_level(targetKeys)
+ terms_per_level = terms_per_level.to_a.sort{|l1, l2| l1.first <=> l2.first} # Obtain sorted levels
+ terms_per_level.pop # Leaves are not expandable # FRED: that comment may not be true
+
+ terms_per_level.reverse_each do |lvl, terms| # Expand from leaves to roots
+ terms.each do |term|
+ childs = self.get_descendants(term).select{|t| @items.include?(t)} # Get children with items
+ next if childs.length < minimum_childs
+ propagated_item_count = Hash.new(0)
+ if ontology.nil? # Count how many times an item is present in the children
+ childs.each do |child|
+ @items[child].each{|i| propagated_item_count[i] += 1}
+ end
+ else # Counting takes into account similarity between terms in the other ontology; the full logic is not entirely clear
+ while childs.length > 1
+ curr_term = childs.shift
+ childs.each do |child|
+ maxmica_counts = Hash.new(0)
+ curr_items = @items[curr_term]
+ child_items = @items[child]
+ curr_items.each do |item|
+ maxmica = ontology.get_maxmica_term2profile(item, child_items)
+ maxmica_counts[maxmica.first] += 1
  end
-
-
-
- byValue[value] << referenceTerm
- end
- else
- queryTag.each{|value| byValue[value] = referenceTerm}
+ child_items.each do |item|
+ maxmica = ontology.get_maxmica_term2profile(item, curr_items)
+ maxmica_counts[maxmica.first] += 1
  end
-
-
- if byTerm.include?(referenceTerm)
- byTerm[referenceTerm] = (byTerm[referenceTerm] + [queryTag]).uniq
- else
- byTerm[referenceTerm] = [queryTag]
- end
- if multiterm
- byValue[queryTag] = [] if byValue[queryTag].nil?
- byValue[queryTag] << referenceTerm
- else
- byValue[queryTag] = referenceTerm
- end
- end
- end
- end
-
- # Check self-references
- if self_type_references
- byTerm.map do |term, references|
- corrected_references = references.map do |t|
- checked = self.extract_id(t)
- if checked.nil?
- t
- else
- byValue[checked] = byValue.delete(t) if checked != t && byValue[checked].nil? # Update in byValue
- checked
+ maxmica_counts.each{|t,freq| propagated_item_count[t] += freq if freq >= 2} # TODO: may need division by 2 because the MICA is computed twice, but the test fails
+ # FRED: maybe the childs.shift gives uniqueness
  end
  end
- byTerm[term] = corrected_references.uniq
  end
-
-
-
-
-
-
-
- if !
-
- referenceValue = referenceValue.map{|value| value.scan(select_regex).first}
- referenceValue.flatten!
- else
- referenceValue = referenceValue.scan(select_regex).first
- end
- referenceValue.compact!
- end
- if self_type_references
- if referenceValue.kind_of?(Array)
- aux = referenceValue.map{|t| self.extract_id(t)}
- else
- aux = self.extract_id(referenceValue)
- end
- aux.compact! unless aux.nil?
- referenceValue = aux unless aux.nil?
- end
- referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
- byTerm[term] = referenceValue + (values - referenceValue)
+ propagated_items = propagated_item_count.select{|k,v| v >= minimum_childs}.keys
+ if propagated_items.length > 0
+ query = @items[term]
+ if query.nil?
+ @items[term] = propagated_items
+ else
+ terms = @items[term] | propagated_items
+ terms = ontology.clean_profile(terms) if clean_profiles && !ontology.nil?
+ @items[term] = terms
  end
  end
  end
-
- # Store
- @dicts[store_tag] = {byTerm: byTerm, byValue: byValue}
  end
  end
 
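Note: when no second ontology is given, expand_items_to_parentals reduces to a vote count: an item is imputed to a parent term only when at least minimum_childs of that parent's children already carry it. A minimal sketch of that rule, assuming plain hashes in place of the Ontology state (leaf_a/leaf_b/leaf_c and their items are hypothetical):

    items = { leaf_a: ['x', 'y'], leaf_b: ['x'], leaf_c: ['x', 'z'] } # children of one parent
    minimum_childs = 2
    counts = Hash.new(0)
    items.each_value { |its| its.each { |i| counts[i] += 1 } }
    propagated = counts.select { |_item, n| n >= minimum_childs }.keys
    # => ["x"]; only 'x' appears in at least 2 children, so only 'x' climbs to the parent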
+ # Compute a modified Fisher test between terms and items, based on the topGO methodology. Refactor to use all the possible methods of this class.
+ #-------------------------------------------------------------------------------------------------------------------------------------
 
- #
-
-
+ def compute_relations_to_items(external_item_list, total_items, mode, thresold) # NEED TEST, check with PSZ how to maintain these methods
+ terms_levels = list_terms_per_level_from_items
+ connect_familiars!(terms_levels)
+ item_list_with_transf_parental = get_item_list_parental(terms_levels)
+ results = []
+ if mode == :elim
+ results = compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list_with_transf_parental)
+ elsif mode == :weight
+ results = compute_relations_weight(terms_levels, external_item_list, total_items, item_list_with_transf_parental)
+ end
+ return results
  end
 
+ def list_terms_per_level_from_items
+ return list_terms_per_level(@items.keys)
+ end
 
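Note: compute_relations_to_items is the entry point of this enrichment machinery; a hedged usage sketch (ont is an assumed Ontology instance whose @items has already been populated, e.g. via get_items_from_profiles; gene_list and total_genes are hypothetical):

    results = ont.compute_relations_to_items(gene_list, total_genes, :elim, 0.01) # mode is :elim or :weight; 0.01 is the p-value cutoff used by :elim
    results.each { |term, pval| puts "#{term}\t#{pval}" }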
-
-
-
-
-
- # ===== Return
- # translation
- def translate(toTranslate, tag, byValue: true)
- dict = byValue ? @dicts[tag][:byValue] : @dicts[tag][:byTerm]
- toTranslate = get_main_id(toTranslate) if !byValue
- return dict[toTranslate]
- end
-
-
- # Translate a name given
- # ===== Parameters
- # +name+:: to be translated
- # ===== Return
- # translated name or nil if it's not stored into this ontology
- def translate_name(name)
- term = self.translate(name, :name)
- term = self.translate(name, :synonym) if term.nil?
- return term
- end
-
-
- # Translate several names and return translations and a list of names which couldn't be translated
- # ===== Parameters
- # +names+:: array to be translated
- # ===== Return
- # two arrays with translations and names which couldn't be translated respectively
- def translate_names(names)
- translated = []
- rejected = []
- names.each do |name|
- tr = self.translate_name(name)
- if tr.nil?
- rejected << name
- else
- translated << tr
- end
- end
- return translated, rejected
- end
-
-
- # Translates a given ID to it assigned name
- # ===== Parameters
- # +id+:: to be translated
- # ===== Return
- # main name or nil if it's not included into this ontology
- def translate_id(id)
- name = self.translate(id, :name, byValue: false)
- return name.nil? ? nil : name.first
- end
-
-
- # Translates several IDs and returns translations and not allowed IDs list
- # ===== Parameters
- # +ids+:: to be translated
- # ===== Return
- # two arrays with translations and names which couldn't be translated respectively
- def translate_ids(ids)
- translated = []
- rejected = []
- ids.each do |term_id|
- tr = self.translate_id(term_id.to_sym)
- if !tr.nil?
- translated << tr
- else
- rejected << tr
- end
- end
- return translated, rejected
- end
-
-
- # ===== Returns
- # the main ID assigned to a given ID. If it's a non alternative/obsolete ID itself will be returned
- # ===== Parameters
- # +id+:: to be translated
- # ===== Return
- # main ID related to a given ID. Returns nil if given ID is not an allowed ID
- def get_main_id(id)
- return nil if !@stanzas[:terms].include? id
- new_id = id
- mainID = @alternatives_index[id]
- new_id = mainID if !mainID.nil? & !@obsoletes_index.include?(mainID)
- return new_id
- end
-
-
- # Check a pull of IDs and return allowed IDs removing which are not official terms on this ontology
- # ===== Parameters
- # +ids+:: to be checked
- # ===== Return
- # two arrays whit allowed and rejected IDs respectively
- def check_ids(ids, substitute: true)
- checked_codes = []
- rejected_codes = []
- ids.each do |id|
- if @stanzas[:terms].include? id
- if substitute
- checked_codes << self.get_main_id(id)
- else
- checked_codes << id
- end
- else
- rejected_codes << id
- end
- end
- return checked_codes, rejected_codes
- end
-
-
- # Stores a given profile with an specific ID. If ID is already assigend to a profile, it will be replaced
- # ===== Parameters
- # +id+:: assigned to profile
- # +terms+:: array of terms
- # +substitute+:: subsstitute flag from check_ids
- def add_profile(id, terms, substitute: true)
- warn("Profile assigned to ID (#{id}) is going to be replaced") if @profiles.include? id
- correct_terms, rejected_terms = self.check_ids(terms, substitute: substitute)
- if !rejected_terms.empty?
- warn('Given terms contains erroneus IDs. These IDs will be removed')
- end
- if id.is_a? Numeric
- @profiles[id] = correct_terms
- else
- @profiles[id.to_sym] = correct_terms
- end
- end
-
-
- # Method used to store a pull of profiles
- # ===== Parameters
- # +profiles+:: array/hash of profiles to be stored. If it's an array, numerical IDs will be assigned starting at 1
- # +calc_metadata+:: if true, launch calc_profiles_dictionary process
- # +reset_stored+:: if true, remove already stored profiles
- # +substitute+:: subsstitute flag from check_ids
- def load_profiles(profiles, calc_metadata: true, reset_stored: false, substitute: false)
- self.reset_profiles if reset_stored
- # Check
- if profiles.kind_of?(Array)
- profiles.each_with_index do |items, i|
- self.add_profile(i, items.map {|item| item.to_sym}, substitute: substitute)
- end
- else # Hash
- if !profiles.keys.select{|id| @profiles.include?(id)}.empty?
- warn('Some profiles given are already stored. Stored version will be replaced')
- end
- profiles.each{|id, prof| self.add_profile(id, prof, substitute: substitute)}
- end
-
- self.add_observed_terms_from_profiles(reset: true)
-
- if calc_metadata
- self.calc_profiles_dictionary
- end
- end
-
-
- # Internal method used to remove already stored profiles and restore observed frequencies
- def reset_profiles
- # Clean profiles storage
- @profiles = {}
- # Reset frequency observed
- @meta.each{|term,info| info[:observed_freq] = 0}
- @max_freqs[:observed_freq] = 0
- end
-
-
- # ===== Returns
- # profiles assigned to a given ID
- # ===== Parameters
- # +id+:: profile ID
- # ===== Return
- # specific profile or nil if it's not stored
- def get_profile(id)
- return @profiles[id]
- end
-
-
- # ===== Returns
- # an array of sizes for all stored profiles
- # ===== Return
- # array of profile sizes
- def get_profiles_sizes()
- return @profiles.map{|id,terms| terms.length}
- end
-
-
- # ===== Returns
- # mean size of stored profiles
- # ===== Parameters
- # +round_digits+:: number of digits to round result. Default: 4
- # ===== Returns
- # mean size of stored profiles
- def get_profiles_mean_size(round_digits: 4)
- sizes = self.get_profiles_sizes
- return sizes.inject(0){|sum, n| sum + n}.fdiv(@profiles.length).round(round_digits)
- end
-
-
- # Calculates profiles sizes and returns size assigned to percentile given
- # ===== Parameters
- # +perc+:: percentile to be returned
- # +increasing_sort+:: flag to indicate if sizes order must be increasing. Default: false
- # ===== Returns
- # values assigned to percentile asked
- def get_profile_length_at_percentile(perc=50, increasing_sort: false)
- prof_lengths = self.get_profiles_sizes.sort
- prof_lengths.reverse! if !increasing_sort
- n_profiles = prof_lengths.length
- percentile_index = ((perc * (n_profiles - 1)).fdiv(100) - 0.5).round # Take length which not overpass percentile selected
- percentile_index = 0 if percentile_index < 0 # Special case (caused by literal calc)
- return prof_lengths[percentile_index]
- end
-
-
- # Translate a given profile to terms names
- # ===== Parameters
- # +prof+:: array of terms to be translated
- # ===== Returns
- # array of translated terms. Can include nils if some IDs are not allowed
- def profile_names(prof)
- return prof.map{|term| self.translate_id(term)}
- end
-
-
- # Trnaslates a bunch of profiles to it sets of term names
- # ===== Parameters
- # +profs+:: array of profiles
- # +asArray+:: flag to indicate if results must be returned as: true => an array of tuples [ProfID, ArrayOdNames] or ; false => hashs of translations
- # ===== Returns
- # translated profiles
- def translate_profiles_ids(profs = [], asArray: true)
- profs = @profiles if profs.empty?
- profs = profs.each_with_index.map{|terms, index| [index, terms]}.to_h if profs.kind_of?(Array)
- profs_names = profs.map{|id, terms| [id, self.profile_names(terms)]}.to_h
- return asArray ? profs_names.values : profs_names
- end
-
-
- # Includes as "observed_terms" all terms included into stored profiles
- # ===== Parameters
- # +reset+:: if true, reset observed freqs alreeady stored befor re-calculate
- def add_observed_terms_from_profiles(reset: false)
- @meta.each{|term, freqs| freqs[:observed_freq] = -1} if reset
- @profiles.each{|id, terms| self.add_observed_terms(terms: terms)}
- end
-
-
- # Get a term frequency
- # ===== Parameters
- # +term+:: term to be checked
- # +type+:: type of frequency to be returned. Allowed: [:struct_freq, :observed_freq]
- # ===== Returns
- # frequency of term given or nil if term is not allowed
- def get_frequency(term, type: :struct_freq)
- queryFreq = @meta[term]
- return queryFreq.nil? ? nil : queryFreq[type]
- end
-
-
- # Geys structural frequency of a term given
- # ===== Parameters
- # +term+:: to be checked
- # ===== Returns
- # structural frequency of given term or nil if term is not allowed
- def get_structural_frequency(term)
- return self.get_frequency(term, type: :struct_freq)
- end
-
-
- # Gets observed frequency of a term given
- # ===== Parameters
- # +term+:: to be checked
- # ===== Returns
- # observed frequency of given term or nil if term is not allowed
- def get_observed_frequency(term)
- return self.get_frequency(term, type: :observed_freq)
- end
-
-
- # Calculates frequencies of stored profiles terms
- # ===== Parameters
- # +ratio+:: if true, frequencies will be returned as ratios between 0 and 1.
- # +literal+:: if true, literal terms will be used to calculate frequencies instead translate alternative terms
- # +asArray+:: used to transform returned structure format from hash of Term-Frequency to an array of tuples [Term, Frequency]
- # +translate+:: if true, term IDs will be translated to
- # ===== Returns
- # stored profiles terms frequencies
- def get_profiles_terms_frequency(ratio: true, literal: true, asArray: true, translate: true)
- n_profiles = @profiles.length
- if literal
- freqs = {}
- @profiles.each do |id, terms|
- terms.each do |literalTerm|
- if freqs.include?(literalTerm)
- freqs[literalTerm] += 1
- else
- freqs[literalTerm] = 1
- end
- end
- end
- if (ratio || translate)
- aux_keys = freqs.keys
- aux_keys.each do |term|
- freqs[term] = freqs[term].fdiv(n_profiles) if ratio
- if translate
- tr = self.translate_id(term)
- freqs[tr] = freqs.delete(term) if !tr.nil?
- end
- end
- end
- if asArray
- freqs = freqs.map{|term, freq| [term, freq]}
- freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
- end
- else # Freqs translating alternatives
- freqs = @meta.select{|id, freqs| freqs[:observed_freq] > 0}.map{|id, freqs| [id, ratio ? freqs[:observed_freq].fdiv(n_profiles) : freqs[:observed_freq]]}
- freqs = freqs.to_h if !asArray
- if translate
- freqs = freqs.map do |term, freq|
- tr = self.translate_id(term)
- tr.nil? ? [term, freq] : [tr, freq]
- end
- end
- if asArray
- freqs = freqs.map{|term, freq| [term, freq]}
- freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
- else
- freqs = freqs.to_h
- end
- end
- return freqs
- end
-
-
- # Clean a given profile returning cleaned set of terms and removed ancestors term.
- # ===== Parameters
- # +prof+:: array of terms to be checked
- # ===== Returns
- # two arrays, first is the cleaned profile and second is the removed elements array
- def remove_ancestors_from_profile(prof)
- ancestors = prof.map{|term| self.get_ancestors(term)}.flatten.uniq
- redundant = prof.select{|term| ancestors.include?(term)}
- return prof - redundant, redundant
- end
-
-
- # Remove alternative IDs if official ID is present. DOES NOT REMOVE synonyms or alternative IDs of the same official ID
- # ===== Parameters
- # +prof+:: array of terms to be checked
- # ===== Returns
- # two arrays, first is the cleaned profile and second is the removed elements array
- def remove_alternatives_from_profile(prof)
- alternatives = prof.select{|term| @alternatives_index.include?(term)}
- redundant = alternatives.select{|alt_id| prof.include?(@alternatives_index[alt_id])}
- return prof - redundant, redundant
- end
-
-
- # Remove alternatives (if official term is present) and ancestors terms of a given profile
- # ===== Parameters
- # +profile+:: profile to be cleaned
- # +remove_alternatives+:: if true, clenaed profiles will replace already stored profiles
- # ===== Returns
- # cleaned profile
- def clean_profile(profile, remove_alternatives: true)
- warn('Estructure is circular, behaviour could not be which is expected') if @structureType == :circular
- terms_without_ancestors, _ = self.remove_ancestors_from_profile(profile)
- if remove_alternatives
- terms_without_ancestors_and_alternatices, _ = self.remove_alternatives_from_profile(terms_without_ancestors)
- else
- terms_without_ancestors_and_alternatices = terms_without_ancestors
+ def list_terms_per_level(terms)
+ terms_levels = {}
+ terms.each do |term|
+ level = self.get_term_level(term)
+ add2hash(terms_levels, level, term)
  end
- return
- end
-
- def clean_profile_hard(profile, options = {})
- profile, _ = check_ids(profile)
- profile = profile.select{|t| !is_obsolete?(t)}
- if !options[:term_filter].nil?
- profile.select! {|term| get_ancestors(term).include?(options[:term_filter])}
- end
- profile = clean_profile(profile.uniq)
- return profile
+ return terms_levels
  end
 
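Note: list_terms_per_level just buckets terms by structural depth via the add2hash helper; assuming get_term_level is available in scope, the same grouping can be written as:

    terms_levels = terms.group_by { |term| get_term_level(term) }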
-
-
-
-
-
-
-
-
-
-
- keep = profile.map do |term|
- if scores.include?(term)
- parentals = [self.get_ancestors(term), self.get_descendants(term)].flatten
- targetable = parentals.select{|parent| profile.include?(parent)}
- if targetable.empty?
- term
- else
- targetable << term
- targets = scores.select{|term,score| targetable.include?(term)}.to_h
- byMax ? targets.keys.last : targets.keys.first
- end
- elsif remove_without_score
- nil
- else
- term
+ def connect_familiars!(terms_levels)
+ levels = terms_levels.keys.sort
+ while levels.length > 1 # Process while the current level has a parental level
+ level = levels.pop
+ parental_level = level - 1
+ parental_terms = terms_levels[parental_level]
+ if parental_terms.nil? # The queried parent level does not exist, but there is a parental level above the missing one
+ parental_terms = [] # Initialize the required parental level
+ terms_levels[parental_level] = parental_terms
+ levels << parental_level
  end
-
-
-
-
-
-
- # ===== Parameters
- # +store+:: if true, clenaed profiles will replace already stored profiles
- # +remove_alternatives+:: if true, clenaed profiles will replace already stored profiles
- # ===== Returns
- # a hash with cleaned profiles
- def clean_profiles(store: false, remove_alternatives: true)
- cleaned_profiles = {}
- @profiles.each{ |id, terms| cleaned_profiles[id] = self.clean_profile(terms, remove_alternatives: remove_alternatives)}
- @profiles = cleaned_profiles if store
- return cleaned_profiles
- end
-
-
- # Calculates number of ancestors present (redundant) in each profile stored
- # ===== Returns
- # array of parentals for each profile
- def parentals_per_profile
- cleaned_profiles = self.clean_profiles(remove_alternatives: false)
- parentals = @profiles.map{ |id, terms| terms.length - cleaned_profiles[id].length}
- return parentals
- end
-
-
- def get_profile_redundancy()
- profile_sizes = self.get_profiles_sizes
- parental_terms_per_profile = self.parentals_per_profile# clean_profiles
- parental_terms_per_profile = parental_terms_per_profile.map{|item| item[0]}
- profile_sizes, parental_terms_per_profile = profile_sizes.zip(parental_terms_per_profile).sort_by{|i| i.first}.reverse.transpose
- return profile_sizes, parental_terms_per_profile
- end
-
- def compute_term_list_and_childs()
- suggested_childs = {}
- total_terms = 0
- terms_with_more_specific_childs = 0
- @profiles.each do |id, terms|
- total_terms += terms.length
- more_specific_childs = self.get_childs_table(terms, true)
- terms_with_more_specific_childs += more_specific_childs.select{|hpo_record| !hpo_record.last.empty?}.length #Exclude phenotypes with no childs
- suggested_childs[id] = more_specific_childs
- end
- return suggested_childs, terms_with_more_specific_childs.fdiv(total_terms)
- end
-
- # Calculates mean IC of a given profile
- # ===== Parameters
- # +prof+:: profile to be checked
- # +ic_type+:: ic_type to be used
- # +zhou_k+:: special coeficient for Zhou IC method
- # ===== Returns
- # mean IC for a given profile
- def get_profile_mean_IC(prof, ic_type: :resnik, zhou_k: 0.5)
- return prof.map{|term| self.get_IC(term, type: ic_type, zhou_k: zhou_k)}.inject(0){|sum,x| sum + x}.fdiv(prof.length)
- end
-
-
- # Calculates resnik ontology, and resnik observed mean ICs for all profiles stored
- # ===== Returns
- # two hashes with Profiles and IC calculated for resnik and observed resnik respectively
- def get_profiles_resnik_dual_ICs
- struct_ics = {}
- observ_ics = {}
- @profiles.each do |id, terms|
- struct_ics[id] = self.get_profile_mean_IC(terms, ic_type: :resnik)
- observ_ics[id] = self.get_profile_mean_IC(terms, ic_type: :resnik_observed)
- end
- return struct_ics.clone, observ_ics.clone
- end
-
-
- # Calculates ontology structural levels for all ontology terms
- # ===== Parameters
- # +calc_paths+:: calculates term paths if it's not already calculated
- # +shortest_path+:: if true, level is calculated with shortest path, largest path will be used in other cases
- def calc_term_levels(calc_paths: false, shortest_path: true)
- if @term_paths.empty?
- if calc_paths
- self.calc_term_paths
- else
- warn('Term paths are not already loaded. Aborting dictionary calc')
+ terms_levels[level].each do |term|
+ path_info = @term_paths[term]
+ shortest_path_length = path_info[:shortest_path]
+ path = path_info[:paths].select{|p| p.length == shortest_path_length}.first
+ parental = path[1] # the first element is the term itself
+ parental_terms << parental if !parental_terms.include?(parental)
  end
  end
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- byValue[level] = [term]
+ end
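Note: connect_familiars! fills the gaps in the level index: whenever level - 1 is missing it creates it, pushes it back onto the work list, and registers each term's parent from its shortest path there, so every level between the deepest terms and the root ends up populated. A toy trace of the level-filling branch, with hypothetical levels:

    terms_levels = { 4 => [:t1], 2 => [:t2] }
    levels = terms_levels.keys.sort # => [2, 4]
    # popping 4 finds no level 3, so an empty level 3 is created and re-queued,
    # exactly as the parental_terms.nil? branch above does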
+
+ def get_item_list_parental(terms_levels)
+ transfered_list = {}
+ parent_dict = @dicts[:is_a][:byTerm]
+ levels = terms_levels.keys.sort
+ while levels.length > 1
+ level = levels.pop
+ terms_levels[level].each do |term|
+ parents = parent_dict[term]
+ if parents.nil?
+ next
+ elsif parents.length == 1
+ parent = parents.first
  else
+ parent = (parents | terms_levels[level - 1]).first
  end
+ term_it = @items[term]
+ parent_it = @items[parent]
+ curr_it = transfered_list[term]
+ parent_all_items = merge_groups([term_it, parent_it, curr_it])
+ transfered_list[parent] = parent_all_items if !parent_all_items.empty?
+ term_all_items = merge_groups([term_it, curr_it])
+ transfered_list[term] = term_all_items if !term_all_items.empty?
  end
- @dicts[:level] = {byTerm: byValue, byValue: byTerm} # Note: in this case, value has multiplicity and term is unique value
- # Update maximum depth
- @max_freqs[:max_depth] = byValue.keys.max
  end
+ terms_levels[levels.first].each do |term| # Rescue lowest-level terms that have no children and so cannot receive items
+ transfered_list[term] = @items[term] if transfered_list[term].nil?
+ end
+ return transfered_list
  end
 
-
-
- def is_obsolete? term
- return @obsoletes_index.include?(term)
+ def merge_groups(groups)
+ return groups.compact.inject([ ]){|it, a| it | a}
  end
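Note: merge_groups is a nil-safe set union; compact drops the nil groups and | deduplicates, e.g.:

    merge_groups([['a', 'b'], nil, ['b', 'c']]) # => ["a", "b", "c"]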
 
-
-
-
+ def compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list)
+ results = []
+ penalized_terms = {}
+ levels = terms_levels.keys.sort
+ levels.reverse_each do |level|
+ terms_levels[level].each do |term|
+ associated_items = item_list[term]
+ items_to_remove = penalized_terms[term]
+ items_to_remove = [] if items_to_remove.nil?
+ pval = get_fisher_exact_test(
+ external_item_list - items_to_remove,
+ associated_items - items_to_remove,
+ #((associated_items | external_item_list) - items_to_remove).length
+ total_items
+ )
+ if pval <= thresold
+ parents = get_ancestors(term) # Save the items for each parent term to remove them later in the fisher test
+ parents.each do |prnt|
+ query = penalized_terms[prnt]
+ if query.nil?
+ penalized_terms[prnt] = item_list[term].clone # We need a new array to store the following iterations
+ else
+ query.concat(item_list[term])
+ end
+ end
+ end
+ results << [term, pval]
+ end
+ end
+ return results
  end
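Note: this is the "elim" decorrelation strategy of topGO: once a term tests significant (pval <= thresold), its items are recorded against every ancestor and subtracted from the ancestors' later Fisher tests, so a parent is not called significant merely because one of its children is. A toy illustration of that penalty with hypothetical item lists:

    external   = ['g1', 'g2', 'g3']
    associated = ['g2', 'g3', 'g4']
    removed    = ['g3']    # items of an already significant child
    external - removed     # => ["g1", "g2"]
    associated - removed   # => ["g2", "g4"]
    # the overlap shrinks from 2 items to 1, weakening the parent's signal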
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- end
- if !visited_terms.include?(term)
- # PEDRO: This code is very similar to expand_path method, but cannot be replaced by it (test fail). We must work to use this method here
- path_attr = @term_paths[term]
- if path_attr.nil?
- path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} # create new path data
- @term_paths[term] = path_attr #save path data container
- end
- parentals = @dicts[:is_a][:byTerm][term]
- if parentals.nil?
- path_attr[:paths] << [term]
- else
- parentals.each do |direct_parental|
- self.expand_path(direct_parental)
- new_paths = @term_paths[direct_parental][:paths]
- path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(term)})
- end
- end
- anc = @ancestors_index[term].each{|anc| visited_terms[anc] = true} if @ancestors_index.include?(term)
- visited_terms[term] = true
+ def compute_relations_weight(terms_levels, external_item_list, total_items, item_list)
+ pvals = {}
+ item_weigths_per_term = Hash.new { |hash, key| Hash.new(1) } #https://mensfeld.pl/2016/09/ruby-hash-default-value-be-cautious-when-you-use-it/
+ levels = terms_levels.keys.sort
+ levels.reverse_each do |level|
+ terms_levels[level].each do |term|
+ associated_items = item_list[term]
+ #initialize observed items in item_weigths_per_term list
+ add_items_to_weigthed_list(term, associated_items, item_weigths_per_term) if !associated_items.nil?
+ children = @dicts[:is_a][:byValue][term]
+ if children.nil?
+ children = []
+ else
+ children = children.select{|ch| item_weigths_per_term[ch].length > 0} # Only use children with items associated to them OR transfered to them
  end
-
- path_attr = @term_paths[term]
- path_attr[:total_paths] = path_attr[:paths].length
- paths_sizes = path_attr[:paths].map{|path| path.length}
- path_attr[:largest_path] = paths_sizes.max
- path_attr[:shortest_path] = paths_sizes.min
+ computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
  end
- else
- warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
  end
+ return pvals.to_a
  end
 
+ def add_items_to_weigthed_list(term, associated_items, weigthed_list)
+ term_weigthing = weigthed_list[term]
+ associated_items.each{|ai| term_weigthing[ai] = 1}
+ weigthed_list[term] = term_weigthing
+ end
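Note: Hash.new { |hash, key| Hash.new(1) } (see the linked post above) builds a fresh inner hash on every miss but never stores it back, which is why add_items_to_weigthed_list must end with the explicit weigthed_list[term] = term_weigthing. A quick demonstration of the pitfall:

    h = Hash.new { |hash, key| Hash.new(1) }
    h[:a][:x] = 5
    h[:a][:x]        # => 1; the write went to a temporary hash that was discarded
    h[:a] = { x: 5 } # storing the inner hash explicitly makes it stick
    h[:a][:x]        # => 5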
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+ def computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
+ #puts term.to_s.red
+ #puts @term_paths[term].inspect
+ #puts @dicts[:is_a][:byValue][term].inspect.light_blue
+ associated_items = item_weigths_per_term[term].keys
+ pval = get_fisher_exact_test(external_item_list, associated_items, total_items,
+ 'two_sided', item_weigths_per_term[term], true)
+ pvals[term] = pval
+ if children.length > 0
+ rates = {}
+ sig_child = 0
+ children.each do |child|
+ ratio = sigRatio(pvals[child], pval)
+ rates[child] = ratio
+ sig_child += 1 if ratio >= 1
+ end
+ if sig_child == 0 # CASE 1
+ children.each do |child|
+ current_ratio = rates[child]
+ query_child = item_weigths_per_term[child]
+ query_child.transform_values!{|weight| weight * current_ratio}
+ pvals[child] = get_fisher_exact_test(external_item_list, item_weigths_per_term[child].keys, total_items,
+ 'two_sided', item_weigths_per_term[child], true)
+ end
+ else
+ ancs = get_ancestors(term)
+ ancs << term
+ rates.each do |ch, ratio|# CASE 2
+ if ratio >= 1 # The child is better than the parent
+ ancs.each do |anc|
+ query_anc = item_weigths_per_term[anc]
+ associated_items.each do |item|
+ query_anc[item] /= ratio # /= --> query_anc[item]/ratio
+ end
+ end
  end
- path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(curr_term)})
  end
+ computeTermSig(term, children - rates.keys, external_item_list, total_items, pvals, item_weigths_per_term)
  end
  end
  end
 
-
-
- # ===== Returns
- # ontology levels calculated
- def get_ontology_levels
- return @dicts[:level][:byTerm].clone # By term, in this case, is Key::Level, Value::Terms
+ def sigRatio(pvalA, pvalB)
+ return Math.log(pvalA)/Math.log(pvalB)
  end
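Note: sigRatio compares two p-values on a log scale. Both logs are negative for p-values in (0, 1), so a ratio >= 1 means pvalA <= pvalB, i.e. the child is at least as significant as its parent; this is the test computeTermSig uses to choose between CASE 1 and CASE 2. For example:

    Math.log(0.001) / Math.log(0.01) # => 1.5; p = 0.001 beats p = 0.01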
 
+ # END of methods involved with compute_relations_to_items
+ #-----------------------------------------------------------------------------------
+
+ #############################################
+ # PROFILE EXTERNAL METHODS
+ #############################################
 
- #
-
-
-
-
+ # I/O profile
+ ####################################
+
+ # Increase the arbitrary frequency of a given term set
+ # ===== Parameters
+ # +terms+:: set of terms to be updated
+ # +increase+:: amount to be increased
+ # +transform_to_sym+:: if true, transform observed terms to symbols. Default: false
+ # ===== Return
+ # true if the process ends without errors and false in other cases
+ def add_observed_terms(terms:, increase: 1.0, transform_to_sym: false, expand2parentals: true)
+ terms = terms.map{|term| [term] + get_ancestors(term.to_sym)}.flatten if expand2parentals
+ return terms.map{|id| self.add_observed_term(
+ term: transform_to_sym ? id.to_sym : id,
+ increase: increase)} # FRED: is the return necessary?
  end
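Note: with expand2parentals left true, one observation of a term is also counted for every ancestor, keeping observed frequencies monotone up the hierarchy. A hedged usage sketch (ont is a loaded Ontology; :A and :B are hypothetical term IDs):

    ont.add_observed_terms(terms: [:A, :B], increase: 2.0)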
 
- #
-
-
-
-
-
-
- if all_paths.empty?
- path = []
- else
- path = all_paths.select{|pt| pt.length == path_length}.first.clone
- if level > 0 # we want the term and his ascendants until a specific level
- n_parents = path_length - level
- path = path[0..n_parents]
- end
- path.shift # Discard the term itself
- end
+ # Modifying Profile
+ ####################################
+
+ def expand_profile_with_parents(profile)
+ new_terms = []
+ profile.each do |term|
+ new_terms = new_terms | get_ancestors(term)
  end
- return
- end
+ return new_terms | profile
+ end
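Note: expand_profile_with_parents is a pure set union of a profile with every ancestor of its terms; assuming get_ancestors, the same membership (element order aside) is:

    profile.reduce(profile) { |acc, term| acc | get_ancestors(term) }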
 
- #
+ # Clean a given profile, returning the cleaned set of terms and the removed ancestor terms
+ # ===== Parameters
+ # +prof+:: array of terms to be checked
  # ===== Returns
- #
- def
-
-
-
- profiles_terms.each do |term|
- query = term_freqs_byProfile[term]
- if query.nil?
- term_freqs_byProfile[term] = 1
- else
- term_freqs_byProfile[term] += 1
- end
- end
- levels_filtered = @dicts[:level][:byTerm].map{|level, terms| [level,terms.map{|t| profiles_terms.include?(t) ? Array.new(term_freqs_byProfile[t], t) : nil}.flatten.compact]}.select{|level, filteredTerms| !filteredTerms.empty?}.to_h
- return levels_filtered
+ # two arrays, first is the cleaned profile and second is the removed elements array
+ def remove_ancestors_from_profile(prof)
+ ancestors = prof.map{|term| self.get_ancestors(term)}.flatten.uniq
+ redundant = prof & ancestors
+ return prof - redundant, redundant
  end
 
-
-
-
-
-
-
-
+ # Remove alternative IDs if the official ID is present. DOES NOT REMOVE synonyms or alternative IDs of the same official ID
+ # ===== Parameters
+ # +prof+:: array of terms to be checked
+ # ===== Returns
+ # two arrays, first is the cleaned profile and second is the removed elements array
+ def remove_alternatives_from_profile(prof)
+ alternatives = prof.select{|term| @alternatives_index.include?(term)}
+ redundant = alternatives.select{|alt_id| prof.include?(@alternatives_index[alt_id])}
+ return prof - redundant, redundant
+ end
 
-
-
-
-
-
-
-
-
-
-
-
- end
- ontology_levels << [level, terms.length, num]
- distribution_percentage << [
- level,
- (terms.length.fdiv(total_ontology_terms)*100).round(3),
- (num.fdiv(total_cohort_terms)*100).round(3),
- (u_num.fdiv(total_uniq_cohort_terms)*100).round(3)
- ]
- end
- ontology_levels.sort! { |x,y| x.first <=> y.first }
- distribution_percentage.sort! { |x,y| x.first <=> y.first }
- return ontology_levels, distribution_percentage
+ # Remove alternatives (if the official term is present) and ancestor terms of a given profile
+ # ===== Parameters
+ # +profile+:: profile to be cleaned
+ # +remove_alternatives+:: if true, alternative IDs are removed as well
+ # ===== Returns
+ # cleaned profile
+ def clean_profile(profile, remove_alternatives: true)
+ warn('Estructure is circular, behaviour could not be which is expected') if @structureType == :circular
+ terms_without_ancestors, _ = remove_ancestors_from_profile(profile)
+ terms_without_ancestors, _ = remove_alternatives_from_profile(terms_without_ancestors) if remove_alternatives
+ return terms_without_ancestors
  end
 
- def
-
-
-
-
-
-
-
- maxL = nil
- distribution_percentage.each do |level_info|
- maxL = level_info.first if level_info[1] == max_terms
- end
- diffL = distribution_percentage.map{|l| [l[0], l[observed_distribution] - l[1]]}
- diffL.select!{|dL| dL.last > 0}
- lowSection = diffL.select{|dL| dL.first <= maxL}
- highSection = diffL.select{|dL| dL.first > maxL}
- dsi = nil
- if highSection.empty?
- dsi = 0
- else
- accumulated_weigth = 0
- accumulated_weigthed_diffL = 0
- hss = get_weigthed_level_contribution(highSection, maxL, ontology_levels.length - maxL)
- lss = get_weigthed_level_contribution(lowSection, maxL, maxL)
- dsi = hss.fdiv(lss)
- end
- return dsi
+ def clean_profile_hard(profile, options = {})
+ profile, _ = check_ids(profile)
+ profile = profile.select{|t| !is_obsolete?(t)}
+ if !options[:term_filter].nil?
+ profile.select! {|term| get_ancestors(term).include?(options[:term_filter])}
+ end
+ profile = clean_profile(profile.uniq)
+ return profile
  end
 
-
-
-
-
-
-
+ # Remove terms from a given profile using hierarchical info and the given score set
+ # ===== Parameters
+ # +profile+:: profile to be cleaned
+ # +scores+:: hash with terms as keys and numerical values (scores)
+ # +byMax+:: if true, the maximum-scored term will be kept; if false, the minimum
+ # +remove_without_score+:: if true, terms without score will be removed. Default: true
+ # ===== Returns
+ # cleaned profile
+ def clean_profile_by_score(profile, scores, byMax: true, remove_without_score: true)
+ scores = scores.sort_by{|term,score| score}.to_h
+ keep = profile.map do |term|
+ if scores.include?(term)
+ parentals = [self.get_ancestors(term), self.get_descendants(term)].flatten
+ targetable = parentals.select{|parent| profile.include?(parent)}
+ if targetable.empty?
+ term
+ else
+ targetable << term
+ targets = scores.select{|term,score| targetable.include?(term)}.to_h
+ byMax ? targets.keys.last : targets.keys.first
+ end
+ elsif remove_without_score
+ nil
  else
-
+ term
  end
- accumulated_weigthed_diffL += diff * weightL
  end
-
- return weigthed_contribution
+ return keep.compact.uniq
  end
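Note: scores is re-sorted ascending by value up front, so among a term and its in-profile relatives targets.keys.last is the best-scored candidate and targets.keys.first the worst; byMax only picks which end survives. For example:

    scores = { b: 0.2, a: 0.9, c: 0.5 }.sort_by { |_t, s| s }.to_h # => {:b=>0.2, :c=>0.5, :a=>0.9}
    scores.keys.last # => :a, the top-scored term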
 
+ # ID Handlers
+ ####################################
 
- #
-
-
-
-
-
-
-
-
-
-
-
-
-
+ # Check a set of IDs and return allowed IDs, removing those which are not official terms of this ontology
+ # ===== Parameters
+ # +ids+:: to be checked
+ # ===== Return
+ # two arrays with allowed and rejected IDs respectively
+ def check_ids(ids, substitute: true)
+ checked_codes = []
+ rejected_codes = []
+ ids.each do |id|
+ new_id = get_main_id(id)
+ if new_id.nil?
+ rejected_codes << id
+ else
+ if substitute
+ checked_codes << new_id
+ else
+ checked_codes << id
  end
  end
- @profilesDict = byTerm
  end
+ return checked_codes, rejected_codes
  end
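Note: check_ids now delegates entirely to get_main_id, so alternative IDs resolve to their main ID when substitute is true. A hedged sketch (:MAIN and :ALT are hypothetical IDs, with :ALT registered as an alternative of :MAIN and :BOGUS unknown to the ontology):

    checked, rejected = ont.check_ids([:MAIN, :ALT, :BOGUS])
    # => checked == [:MAIN, :MAIN], rejected == [:BOGUS]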
 
 
- #
+ # Translates several IDs and returns translations and a list of not-allowed IDs
+ # ===== Parameters
+ # +ids+:: to be translated
  # ===== Return
- #
- def
-
-
-
+ # two arrays with translations and IDs which couldn't be translated respectively
+ def translate_ids(ids)
+ translated = []
+ rejected = []
+ ids.each do |term_id|
+ tr = self.translate_id(term_id.to_sym)
+ if !tr.nil?
+ translated << tr # FRED: why does this behave differently from ...->
+ else
+ rejected << tr
+ end
+ end
+ return translated, rejected
+ end
 
- #
+ # Translate several names and return translations and a list of names which couldn't be translated
  # ===== Parameters
- # +
- # =====
- #
- def
-
+ # +names+:: array to be translated
+ # ===== Return
+ # two arrays with translations and names which couldn't be translated respectively
+ def translate_names(names)
+ translated = []
+ rejected = []
+ names.each do |name|
+ tr = self.translate_name(name)
+ if tr.nil?
+ rejected << name # FRED: <-... this?
+ else
+ translated << tr
+ end
+ end
+ return translated, rejected
  end
 
+ # Description of profile's terms
+ ####################################
 
  # Gets metainfo table from a set of terms
  # ===== Parameters
  # +terms+:: IDs to be expanded
- # +filter_alternatives+:: flag to be used in get_descendants method
  # ===== Returns
  # an array with triplets [TermID, TermName, DescendantsNames]
- def get_childs_table(
-
-
-
+ def get_childs_table(profile)
+ expanded_profile = []
+ profile.each do |t|
+ expanded_profile << [[t, translate_id(t)], get_descendants(t).map{|child| [child, translate_id(child)]}]
  end
- return
+ return expanded_profile
  end
 
+ def get_terms_levels(profile)
+ termsAndLevels = []
+ profile.each do |term|
+ termsAndLevels << [term, get_term_level(term)]
+ end
+ return termsAndLevels
+ end
 
- #
+ # IC data
+ ####################################
+
+ # Get information content from profiles #
+
+ # Calculates mean IC of a given profile
  # ===== Parameters
- # +
- # +
- # +
-
-
-
-
-
-
-
-
+ # +prof+:: profile to be checked
+ # +ic_type+:: ic_type to be used
+ # +zhou_k+:: special coefficient for Zhou IC method
+ # ===== Returns
+ # mean IC for a given profile
+ def get_profile_mean_IC(prof, ic_type: :resnik, zhou_k: 0.5)
+ return prof.map{|term| self.get_IC(term, type: ic_type, zhou_k: zhou_k)}.sum.fdiv(prof.length)
+ end
+
+ # Term ref vs profile #
+
+ def get_maxmica_term2profile(ref_term, profile)
+ micas = profile.map{|term| get_MICA(ref_term, term)}
+ maxmica = micas.first
+ micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
+ return maxmica
+ end
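Note: get_maxmica_term2profile scans the MICAs of ref_term against each profile member and keeps the pair with the highest IC; it is equivalent to:

    profile.map { |term| get_MICA(ref_term, term) }.max_by(&:last)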
+
+ # Profile vs Profile #
+
+ # Get semantic similarity from two term sets
+ # ===== Parameters
+ # +termsA+:: set to be compared
+ # +termsB+:: set to be compared
+ # +sim_type+:: similarity method to be used. Default: resnik
+ # +ic_type+:: IC type to be used. Default: resnik
+ # +bidirectional+:: calculate bidirectional similarity. Default: true
+ # ===== Return
+ # similarity calculated
+ def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true, store_mica: false)
+ # Check
+ raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
+ raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
+ micasA = []
+ # Compare A -> B
+ termsA.each do |tA|
+ micas = []
+ termsB.each do |tB|
+ if store_mica
+ value = @mica_index[tA][tB]
+ else
+ value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)
+ end
+ micas << value if value.class == Float
  end
+ !micas.empty? ? micasA << micas.max : micasA << 0
  end
-
-
-
-
-
-
- # elsif v.kind_of?(Hash)
- # @items.merge!(relations) do |k, oldV, newV|
- # if oldV.kind_of?(Array)
- # return (oldV + newV).uniq
- # else
- # oldV = [oldV,newV]
- # end
- # end
- # elsif @items[k].kind_of?(Array) # We suppose a single value/object from here
- # @items[k] = (@items[k] + [v]).uniq
- # else
- # @items[k] = [@items[k],v]
- # end
- # else
- # @items[k] = v
- # end
- # end
- else
- @items.merge!(relations)
+ means_sim = micasA.sum.fdiv(micasA.size)
+ # Compare B -> A
+ if bidirectional
+ means_simA = means_sim * micasA.size
+ means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false, store_mica: store_mica) * termsB.size
+ means_sim = (means_simA + means_simB).fdiv(termsA.size + termsB.size)
  end
-
+ # Return
+ return means_sim
+ end
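Note: the bidirectional score is the size-weighted average of the two directed comparisons, sim(A,B) = (|A| * sim(A->B) + |B| * sim(B->A)) / (|A| + |B|). A hedged usage sketch (ont is an Ontology with ICs already computed; the term symbols are hypothetical):

    ont.compare([:t1, :t2], [:t3], sim_type: :resnik, ic_type: :resnik, bidirectional: true)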
|
2049
1175
|
|
2050
|
-
|
1176
|
+
|
1177
|
+
#############################################
|
1178
|
+
# PROFILE INTERNAL METHODS
|
1179
|
+
#############################################
|
1180
|
+
|
1181
|
+
# I/O profiles
|
1182
|
+
####################################
|
1183
|
+
|
1184
|
+
# Method used to store a pool of profiles
|
2051
1185
|
# ===== Parameters
|
2052
|
-
# +
|
2053
|
-
# +
|
2054
|
-
#
|
2055
|
-
#
|
2056
|
-
def
|
2057
|
-
|
2058
|
-
|
2059
|
-
|
2060
|
-
|
2061
|
-
|
2062
|
-
# A_hash : B_array => NOT ALLOWED
|
2063
|
-
# A_hash : B_hash
|
2064
|
-
# A_hash : B_single => NOT ALLOWED
|
2065
|
-
# A is single element => RETURN ARRAY
|
2066
|
-
# A_single : B_array
|
2067
|
-
# A_single : B_hash => NOT ALLOWED
|
2068
|
-
# A_single : B_single
|
2069
|
-
concatenated = nil
|
2070
|
-
if itemA.kind_of?(Array) && itemB.kind_of?(Array)
|
2071
|
-
concatenated = (itemA + itemB).uniq
|
2072
|
-
elsif itemA.kind_of?(Hash) && itemB.kind_of?(Hash)
|
2073
|
-
concatenated = itemA.merge(itemB) do |k, oldV, newV|
|
2074
|
-
self.concatItems(oldV,newV)
|
1186
|
+
# +profiles+:: array/hash of profiles to be stored. If it's an array, numerical IDs will be assigned starting at 1
|
1187
|
+
# +calc_metadata+:: if true, launch get_items_from_profiles process
|
1188
|
+
# +reset_stored+:: if true, remove already stored profiles
|
1189
|
+
# +substitute+:: subsstitute flag from check_ids
|
1190
|
+
def load_profiles(profiles, calc_metadata: true, reset_stored: false, substitute: false)
|
1191
|
+
self.reset_profiles if reset_stored
|
1192
|
+
# Check
|
1193
|
+
if profiles.kind_of?(Array)
|
1194
|
+
profiles.each_with_index do |items, i|
|
1195
|
+
self.add_profile(i, items.map {|item| item.to_sym}, substitute: substitute)
|
2075
1196
|
end
|
2076
|
-
|
2077
|
-
|
2078
|
-
|
2079
|
-
|
1197
|
+
else # Hash
|
1198
|
+
if !profiles.keys.select{|id| @profiles.include?(id)}.empty?
|
1199
|
+
warn('Some profiles given are already stored. Stored version will be replaced')
|
1200
|
+
end
|
1201
|
+
profiles.each{|id, prof| self.add_profile(id, prof, substitute: substitute)}
|
2080
1202
|
end
|
2081
|
-
return concatenated
|
2082
|
-
end
|
2083
1203
|
|
1204
|
+
self.add_observed_terms_from_profiles(reset: true)
|
2084
1205
|
|
2085
|
-
|
2086
|
-
|
2087
|
-
# +dictID+:: dictionary ID to be stored (:byTerm will be used)
|
2088
|
-
def set_items_from_dict(dictID, remove_old_relations = false)
|
2089
|
-
@items = {} if remove_old_relations
|
2090
|
-
if !@dicts[dictID].nil?
|
2091
|
-
@items.merge(@dicts[dictID][:byTerm])
|
2092
|
-
else
|
2093
|
-
warn('Specified ID is not calculated. Dict will not be added as a items set')
|
1206
|
+
if calc_metadata
|
1207
|
+
self.get_items_from_profiles
|
2094
1208
|
end
|
2095
1209
|
end
|
2096
1210
|
|
2097
|
-
|
2098
|
-
# This method computes childs similarity and impute items to it parentals. To do that Item keys must be this ontology allowed terms.
|
2099
|
-
# Similarity will be calculated by text extact similarity unless an ontology object will be provided. In this case, MICAs will be used
|
1211
|
+
# Stores a given profile with an specific ID. If ID is already assigend to a profile, it will be replaced
|
2100
1212
|
# ===== Parameters
|
2101
|
-
# +
|
2102
|
-
# +
|
2103
|
-
# +
|
2104
|
-
#
|
2105
|
-
|
2106
|
-
|
2107
|
-
|
2108
|
-
|
2109
|
-
warn('Items have been not provided yet')
|
2110
|
-
return nil
|
2111
|
-
end
|
2112
|
-
targetKeys = @items.keys.select{|k| self.exists?(k)}
|
2113
|
-
if targetKeys.length == 0
|
2114
|
-
warn('Any item key is allowed')
|
2115
|
-
return nil
|
2116
|
-
elsif targetKeys.length < @items.keys.length
|
2117
|
-
warn('Some item keys are not allowed')
|
2118
|
-
end
|
2119
|
-
|
2120
|
-
# Expand to parentals
|
2121
|
-
targetKeys << targetKeys.map{|t| self.get_ancestors(t, true)}
|
2122
|
-
targetKeys.flatten!
|
2123
|
-
targetKeys.uniq!
|
2124
|
-
|
2125
|
-
# Obtain levels (go from leaves to roots)
|
2126
|
-
levels = targetKeys.map{|term| self.get_term_level(term)}
|
2127
|
-
levels.compact!
|
2128
|
-
levels.uniq!
|
2129
|
-
levels.sort!
|
2130
|
-
levels.reverse!
|
2131
|
-
levels.shift # Leaves are not expandable
|
2132
|
-
|
2133
|
-
# Expand from leaves to roots
|
2134
|
-
levels.map do |lvl|
|
2135
|
-
curr_keys = targetKeys.select{|k| self.get_term_level(k) == lvl}
|
2136
|
-
curr_keys.map do |term_expand|
|
2137
|
-
to_infer = []
|
2138
|
-
# Obtain childs
|
2139
|
-
childs = self.get_descendants(term_expand,true).select{|t| !@items[t].nil?}
|
2140
|
-
# Expand
|
2141
|
-
if childs.length > 0 && minimum_childs == 1 # Special case
|
2142
|
-
to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
|
2143
|
-
elsif childs.length >= minimum_childs
|
2144
|
-
to_infer = Hash.new(0)
|
2145
|
-
# Compare
|
2146
|
-
while childs.length > 1
|
2147
|
-
curr_term = childs.shift
|
2148
|
-
childs.each do |compare_term|
|
2149
|
-
pivot_items = @items[curr_term]
|
2150
|
-
compare_items = @items[compare_term]
|
2151
|
-
if ontology.nil? # Exact match
|
2152
|
-
pivot_items.map do |pitem|
|
2153
|
-
if compare_items.include?(pitem)
|
2154
|
-
to_infer[pitem] += 2
|
2155
|
-
end
|
2156
|
-
end
|
2157
|
-
else # Find MICAs
|
2158
|
-
local_infer = Hash.new(0)
|
2159
|
-
pivot_items.map do |pitem|
|
2160
|
-
micas = compare_items.map{|citem| ontology.get_MICA(pitem, citem)}
|
2161
|
-
maxmica = micas[0]
|
2162
|
-
micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
|
2163
|
-
local_infer[maxmica.first] += 1
|
2164
|
-
end
|
2165
|
-
compare_items.map do |citem|
|
2166
|
-
micas = pivot_items.map{|pitem| ontology.get_MICA(pitem, citem)}
|
2167
|
-
maxmica = micas[0]
|
2168
|
-
micas.each{|mica| maxmica = mica if mica.last > maxmica.last}
|
2169
|
-
local_infer[maxmica.first] += 1
|
2170
|
-
end
|
2171
|
-
local_infer.each{|t,freq| to_infer[t] += freq if freq >= 2}
|
2172
|
-
end
|
2173
|
-
end
|
2174
|
-
end
|
2175
|
-
# Filter infer
|
2176
|
-
to_infer = to_infer.select{|k,v| v >= minimum_childs}
|
2177
|
-
end
|
2178
|
-
# Infer
|
2179
|
-
if to_infer.length > 0
|
2180
|
-
@items[term_expand] = [] if @items[term_expand].nil?
|
2181
|
-
if to_infer.kind_of?(Array)
|
2182
|
-
@items[term_expand] = (@items[term_expand] + to_infer).uniq
|
2183
|
-
else
|
2184
|
-
@items[term_expand] = (@items[term_expand] + to_infer.keys).uniq
|
2185
|
-
end
|
2186
|
-
@items[term_expand] = ontology.clean_profile(@items[term_expand]) if clean_profiles && !ontology.nil?
|
2187
|
-
elsif !@items.include?(term_expand)
|
2188
|
-
targetKeys.delete(term_expand)
|
2189
|
-
end
|
2190
|
-
end
|
1213
|
+
# +id+:: assigned to profile
|
1214
|
+
# +terms+:: array of terms
|
1215
|
+
# +substitute+:: subsstitute flag from check_ids
|
1216
|
+
def add_profile(id, terms, substitute: true) # FRED: Talk with PSZ about the uniqness of IDs translated
|
1217
|
+
warn("Profile assigned to ID (#{id}) is going to be replaced") if @profiles.include? id
|
1218
|
+
correct_terms, rejected_terms = self.check_ids(terms, substitute: substitute)
|
1219
|
+
if !rejected_terms.empty?
|
1220
|
+
warn("Given terms contains erroneus IDs: #{rejected_terms.join(",")}. These IDs will be removed")
|
2191
1221
|
end
|
2192
|
-
|
1222
|
+
if id.is_a? Numeric
|
1223
|
+
@profiles[id] = correct_terms
|
1224
|
+
else
|
1225
|
+
@profiles[id.to_sym] = correct_terms
|
1226
|
+
end
|
1227
|
+
end
 
 
-  #
+  # Includes as "observed_terms" all terms included into stored profiles
   # ===== Parameters
-  # +
-
-
-
-  # Direct ancestors/descendants of given term or nil if any error occurs
-  def get_direct_related(term, relation, remove_alternatives: false)
-    if @dicts[:is_a].nil?
-      warn("Hierarchy dictionary is not already calculated. Returning nil")
-      return nil
-    end
-    target = nil
-    case relation
-      when :ancestor
-        target = :byTerm
-      when :descendant
-        target = :byValue
-      else
-        warn('Relation type not allowed. Returning nil')
-    end
-    return nil if target.nil?
-    query = @dicts[:is_a][target][term]
-    return query if query.nil?
-    query, _ = remove_alternatives_from_profile(query) if remove_alternatives
-    return query
+  # +reset+:: if true, reset observed freqs already stored before re-calculating
+  def add_observed_terms_from_profiles(reset: false)
+    @meta.each{|term, freqs| freqs[:observed_freq] = -1} if reset
+    @profiles.each{|id, terms| self.add_observed_terms(terms: terms)}
   end
 
-
-  #
+  # ===== Returns
+  # profiles assigned to a given ID
   # ===== Parameters
-  # +
-  #
-  #
-
-
-    return self.get_direct_related(term, :ancestor, remove_alternatives: remove_alternatives)
+  # +id+:: profile ID
+  # ===== Return
+  # specific profile or nil if it's not stored
+  def get_profile(id)
+    return @profiles[id]
   end
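Sketch of refreshing observed frequencies from stored profiles and retrieving one of them (same `onto` assumption as above):

  onto.add_observed_terms_from_profiles(reset: true)  # recounts :observed_freq for every term used in profiles
  onto.get_profile(:patient1)                         # => array of term IDs, or nil if the ID is not stored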
 
-  #
-
-
-  #
-
-
-
-
+  # Modifying profiles
+  ####################################
+
+  def reset_profiles # Internal method used to remove already stored profiles and restore observed frequencies #TODO FRED: Modify test for this method.
+    @profiles = {} # Clean profiles storage
+    # Reset frequency observed
+    @meta.each{|term,info| info[:observed_freq] = 0}
+    @max_freqs[:observed_freq] = 0
+    @items = {}
   end
 
-  def
-
-
-
-        yield(id, tags)
-      else
-        yield(id)
+  def expand_profiles(meth, unwanted_terms: [], calc_metadata: true, ontology: nil, minimum_childs: 1, clean_profiles: true)
+    if meth == 'parental'
+      @profiles.each do |id, terms|
+        @profiles[id] = expand_profile_with_parents(terms) - unwanted_terms
       end
+      get_items_from_profiles if calc_metadata
+    elsif meth == 'propagate'
+      get_items_from_profiles
+      expand_items_to_parentals(ontology: ontology, minimum_childs: minimum_childs, clean_profiles: clean_profiles)
+      get_profiles_from_items
     end
+    add_observed_terms_from_profiles(reset: true)
   end
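Usage sketch for the two expansion modes (assuming `onto` as above; `other_onto` is an illustrative second Ontology used for MICA-based propagation):

  onto.expand_profiles('parental', unwanted_terms: [:'HP:0000001'])
  onto.expand_profiles('propagate', ontology: other_onto, minimum_childs: 2)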
 
-
-
-
-
-
-
+  # Remove alternatives (if official term is present) and ancestor terms of stored profiles
+  # ===== Parameters
+  # +store+:: if true, cleaned profiles will replace already stored profiles
+  # +remove_alternatives+:: if true, alternative term IDs are removed during cleaning
+  # ===== Returns
+  # a hash with cleaned profiles
+  def clean_profiles(store: false, remove_alternatives: true)
+    cleaned_profiles = {}
+    @profiles.each{ |id, terms| cleaned_profiles[id] = self.clean_profile(terms, remove_alternatives: remove_alternatives)}
+    @profiles = cleaned_profiles if store
+    return cleaned_profiles
   end
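Cleaning sketch (same `onto` assumption):

  cleaned = onto.clean_profiles      # returns cleaned copies; @profiles is untouched
  onto.clean_profiles(store: true)   # replaces the stored profiles with the cleaned ones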
 
-
-
+  # ID Handlers
+  ####################################
 
-  #
+  # Translates a set of profiles to their sets of term names
   # ===== Parameters
-  #
-  #
-  #
-
-
-
-
-
-
-
-    if mode == :elim
-      results = compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list_with_transf_parental)
-    elsif mode == :weight
-      results = compute_relations_weight(terms_levels, external_item_list, total_items, item_list_with_transf_parental)
-    end
-    return results
-  end
-
-  def get_item_list_parental(terms_levels)
-    transfered_list = {}
-    parent_dict = @dicts[:is_a][:byTerm]
-    levels = terms_levels.keys.sort
-    while levels.length > 1
-      level = levels.pop
-      terms_levels[level].each do |term|
-        parents = parent_dict[term]
-        if parents.nil?
-          next
-        elsif parents.length == 1
-          parent = parents.first
-        else
-          parent = (parents | terms_levels[level - 1]).first
-        end
-        term_it = @items[term]
-        parent_it = @items[parent]
-        curr_it = transfered_list[term]
-        parent_all_items = merge_groups([term_it, parent_it, curr_it])
-        transfered_list[parent] = parent_all_items if !parent_all_items.empty?
-        term_all_items = merge_groups([term_it, curr_it])
-        transfered_list[term] = term_all_items if !term_all_items.empty?
-      end
+  # +profs+:: array of profiles
+  # +asArray+:: flag to indicate if results must be returned as: true => an array of tuples [ProfID, ArrayOfNames]; false => a hash of translations
+  # ===== Returns
+  # translated profiles
+  def translate_profiles_ids(profs = [], asArray: true)
+    profs2proc = {}
+    if profs.empty?
+      profs2proc = @profiles
+    else
+      profs.each_with_index{|terms, index| profs2proc[index] = terms} if profs.kind_of?(Array)
     end
-
-
+    profs_names = {}
+    profs2proc.each do |id, terms|
+      names, _ = translate_ids(terms)
+      profs_names[id] = names
     end
-    return
+    return asArray ? profs_names.values : profs_names
   end
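Translation sketch (assuming the stored term IDs can be resolved by translate_ids; the names shown are illustrative):

  onto.translate_profiles_ids(asArray: false)  # => {patient1: ['Seizure', ...], ...}
  onto.translate_profiles_ids                  # => [['Seizure', ...], ...] in profile order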
 
-
-
+  # Description of profile size
+  ####################################
+
+  def profile_stats
+    stats = Hash.new(0)
+    data = get_profiles_sizes
+    stats[:average] = data.sum().fdiv(data.size)
+    sum_devs = data.sum{|element| (element - stats[:average]) ** 2}
+    stats[:variance] = sum_devs.fdiv(data.size)
+    stats[:standardDeviation] = stats[:variance] ** 0.5
+    stats[:max] = data.max
+    stats[:min] = data.min
+
+    stats[:count] = data.size
+    data.each do |value|
+      stats[:countNonZero] += 1 if value != 0
+    end
+
+    stats[:q1] = data.get_quantiles(0.25)
+    stats[:median] = data.get_quantiles(0.5)
+    stats[:q3] = data.get_quantiles(0.75)
+    return stats
+
   end
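Sketch of the returned descriptor (keys as computed above; all numbers invented for illustration):

  stats = onto.profile_stats
  # => {:average=>3.5, :variance=>1.25, :standardDeviation=>1.118, :max=>5, :min=>2,
  #     :count=>4, :countNonZero=>4, :q1=>2.5, :median=>3.5, :q3=>4.5}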
 
-
-
-
-
-
-
-
-
-
-      end
-    end
-    return terms_levels
+  # ===== Parameters
+  # +round_digits+:: number of digits to round result. Default: 4
+  # ===== Returns
+  # mean size of stored profiles
+  def get_profiles_mean_size(round_digits: 4)
+    sizes = self.get_profiles_sizes
+    return sizes.sum.fdiv(@profiles.length).round(round_digits)
   end
 
-
-
-
-
-
-
-      if parental_terms.nil? # The queried parent level not exists but there is a parental level above of the non existant
-        parental_terms = [] # Initialize required parental level
-        terms_levels[parental_level] = parental_terms
-        levels << parental_level
-      end
-      terms_levels[level].each do |term|
-        path_info = @term_paths[term]
-        shortest_path_length = path_info[:shortest_path]
-        path = path_info[:paths].select{|p| p.length == shortest_path_length}.first
-        parental = path[1] # the first elements is the term itself
-        parental_terms << parental if !parental_terms.include?(parental)
-      end
-    end
+  # ===== Returns
+  # an array of sizes for all stored profiles
+  def get_profiles_sizes()
+    return @profiles.map{|id,terms| terms.length}
   end
 
-
-
-
-
-
-
-
-
-
-
-        external_item_list - items_to_remove,
-        associated_items - items_to_remove,
-        #((associated_items | external_item_list) - items_to_remove).length
-        total_items
-      )
-      if pval <= thresold
-        parents = get_ancestors(term) # Save the items for each parent term to remove them later in the fisher test
-        parents.each do |prnt|
-          query = penalized_terms[prnt]
-          if query.nil?
-            penalized_terms[prnt] = item_list[term].clone # We need a new array to store the following iterations
-          else
-            query.concat(item_list[term])
-          end
-        end
-      end
-      results << [term, pval]
-    end
-  end
-  return results
+  # Calculates profile sizes and returns the size at the given percentile
+  # ===== Parameters
+  # +perc+:: percentile to be returned
+  # +increasing_sort+:: flag to indicate if sizes order must be increasing. Default: false
+  # ===== Returns
+  # value assigned to the percentile asked
+  def get_profile_length_at_percentile(perc=50, increasing_sort: false)
+    prof_lengths = self.get_profiles_sizes
+    percentile_profile = prof_lengths.get_quantiles(perc.fdiv(100), decreasing_sort = !increasing_sort)
+    return percentile_profile
   end
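Percentile sketch (get_quantiles is assumed to be the Array extension this gem ships, as used above; perc is given on a 0-100 scale and converted with fdiv(100)):

  onto.get_profile_length_at_percentile(90)                         # size at the 90th percentile, decreasing order by default
  onto.get_profile_length_at_percentile(50, increasing_sort: true)  # median size with increasing ordering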
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  # IC data
+  ####################################
+
+  # Get frequency terms and information coefficient from profiles #
+
+  # Calculates frequencies of stored profiles terms
+  # ===== Parameters
+  # +ratio+:: if true, frequencies will be returned as ratios between 0 and 1.
+  # +asArray+:: used to transform returned structure format from hash of Term-Frequency to an array of tuples [Term, Frequency]
+  # +translate+:: if true, term IDs will be translated to their term names
+  # ===== Returns
+  # stored profiles terms frequencies
+  def get_profiles_terms_frequency(ratio: true, asArray: true, translate: true)
+    freqs = Hash.new(0)
+    @profiles.each do |id, terms|
+      terms.each{|term| freqs[term] += 1}
+    end
+    if translate
+      translated_freqs = {}
+      freqs.each do |term, freq|
+        tr = self.translate_id(term)
+        translated_freqs[tr] = freq if !tr.nil?
       end
+      freqs = translated_freqs
     end
-
+    n_profiles = @profiles.length
+    freqs.transform_values!{|freq| freq.fdiv(n_profiles)} if ratio
+    if asArray
+      freqs = freqs.to_a
+      freqs.sort!{|h1, h2| h2[1] <=> h1[1]}
+    end
+    return freqs
   end
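Frequency sketch (same `onto` assumption; the tuple shown is illustrative):

  onto.get_profiles_terms_frequency.first  # => ['Seizure', 0.75], i.e. the term found in 75% of profiles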
 
-
-
-
-
+  # Calculates Resnik (ontology) and Resnik observed mean ICs for all stored profiles
+  # ===== Returns
+  # two hashes with Profiles and IC calculated for resnik and observed resnik respectively
+  def get_profiles_resnik_dual_ICs(struct: :resnik, observ: :resnik_observed) # Maybe change name during migration to get_profiles_dual_ICs
+    struct_ics = {}
+    observ_ics = {}
+    @profiles.each do |id, terms|
+      struct_ics[id] = self.get_profile_mean_IC(terms, ic_type: struct)
+      observ_ics[id] = self.get_profile_mean_IC(terms, ic_type: observ)
+    end
+    return struct_ics, observ_ics
   end
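Dual-IC sketch (assumes ICs are computable via get_profile_mean_IC for the default types):

  struct_ics, observ_ics = onto.get_profiles_resnik_dual_ICs
  struct_ics[:patient1]  # mean structural Resnik IC of that profile
  observ_ics[:patient1]  # mean observed-frequency Resnik IC of the same profile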
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    else
-      ancs = get_ancestors(term, filter_alternatives = true)
-      ancs << term
-      rates.each do |ch, ratio|# CASE 2
-        if ratio >= 1 # The child is better than parent
-          ancs.each do |anc|
-            query_anc = item_weigths_per_term[anc]
-            associated_items.each do |item|
-              query_anc[item] /= ratio # /= --> query_anc[item]/ratio
-            end
-          end
+
+  # Calculates and returns Resnik ICs (by ontology and observed frequency) for observed terms
+  # ===== Returns
+  # two hashes with resnik and resnik_observed ICs for observed terms
+  def get_observed_ics_by_onto_and_freq()
+    ic_ont = {}
+    resnik_observed = {}
+    observed_terms = @profiles.values.flatten.uniq
+    observed_terms.each do |term|
+      ic_ont[term] = get_IC(term)
+      resnik_observed[term] = get_IC(term, type: :resnik_observed)
+    end
+    return ic_ont, resnik_observed
+  end
+
+  # Profiles vs Profiles #
+
+  def get_pair_index(profiles_A, profiles_B)
+    pair_index = {}
+    profiles_A.each do |curr_id, profile_A|
+      profiles_B.each do |id, profile_B|
+        profile_A.each do |term_A|
+          profile_B.each do |term_B|
+            pair_index[[term_A, term_B].sort] = true
           end
         end
-
-    end
+      end
     end
+    return pair_index
   end
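get_pair_index enumerates each cross-profile term pair exactly once, keyed by the sorted pair; a worked example (term symbols illustrative):

  idx = onto.get_pair_index({a: [:t1, :t2]}, {b: [:t2, :t3]})
  idx.keys  # => [[:t1, :t2], [:t1, :t3], [:t2, :t2], [:t2, :t3]]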
 
-  def
-
+  def get_mica_index_from_profiles(pair_index, sim_type: :resnik, ic_type: :resnik, lca_index: true)
+    pair_index.each do |pair, val|
+      tA, tB = pair
+      value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type, lca_index: lca_index)
+      value = true if value.nil? # We use true to record that the operation was performed but there is no MICA value
+      add2nestHash(@mica_index, tA, tB, value)
+      add2nestHash(@mica_index, tB, tA, value)
+    end
   end
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  # Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with themselves
+  # ===== Parameters
+  # +external_profiles+:: set of external profiles. If nil, internal profiles will be compared with themselves
+  # +sim_type+:: similarity method to be used. Default: resnik
+  # +ic_type+:: ic type to be used. Default: resnik
+  # +bidirectional+:: calculate bidirectional similarity. Default: true
+  # ===== Return
+  # Calculated similarities
+  def compare_profiles(external_profiles: nil, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
+    profiles_similarity = {} # calculate similarity between profiles
+    if external_profiles.nil?
+      comp_profiles = @profiles
+      main_profiles = comp_profiles
+    else
+      comp_profiles = external_profiles
+      main_profiles = @profiles
+    end
+    # Compare
+    pair_index = get_pair_index(main_profiles, comp_profiles)
+    @mica_index = {}
+    get_mica_index_from_profiles(pair_index, sim_type: sim_type, ic_type: ic_type, lca_index: false)
+    main_profiles.each do |curr_id, current_profile|
+      comp_profiles.each do |id, profile|
+        value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional, store_mica: true)
+        add2nestHash(profiles_similarity, curr_id, id, value)
+      end
+    end
+    return profiles_similarity
+  end
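Comparison sketch (assuming `onto` holds an internal cohort and `external` is a hash of profiles keyed by ID):

  sims = onto.compare_profiles(external_profiles: external)
  sims[:patient1][:ref3]  # similarity between internal :patient1 and external :ref3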
 
-
-
-    stats[:q3] = data.get_quantiles(0.75)
-    return stats
+  # specifity_index related methods
+  ####################################
 
+  # Return ontology levels from profile terms
+  # ===== Returns
+  # hash of term levels (Key: level; Value: array of term IDs)
+  def get_ontology_levels_from_profiles(uniq = true)
+    profiles_terms = @profiles.values.flatten
+    profiles_terms.uniq! if uniq
+    term_freqs_byProfile = Hash.new(0)
+    profiles_terms.each do |term|
+      term_freqs_byProfile[term] += 1
+    end
+    levels_filtered = {}
+    terms_levels = @dicts[:level][:byValue]
+    term_freqs_byProfile.each do |term, count|
+      level = terms_levels[term]
+      term_repeat = Array.new(count, term)
+      query = levels_filtered[level]
+      if query.nil?
+        levels_filtered[level] = term_repeat
+      else
+        query.concat(term_repeat)
+      end
+    end
+    return levels_filtered
   end
 
-
-
+  def get_profile_ontology_distribution_tables
+    cohort_ontology_levels = get_ontology_levels_from_profiles(uniq=false)
+    uniq_cohort_ontology_levels = get_ontology_levels_from_profiles
+    ontology_levels = get_ontology_levels
+    total_ontology_terms = ontology_levels.values.flatten.length
+    total_cohort_terms = cohort_ontology_levels.values.flatten.length
+    total_uniq_cohort_terms = uniq_cohort_ontology_levels.values.flatten.length
 
-
-
-
-
-
-
-
-
-
+    distribution_ontology_levels = []
+    distribution_percentage = []
+    ontology_levels.each do |level, terms|
+      cohort_terms = cohort_ontology_levels[level]
+      uniq_cohort_terms = uniq_cohort_ontology_levels[level]
+      if cohort_terms.nil? || uniq_cohort_terms.nil?
+        num = 0
+        u_num = 0
+      else
+        num = cohort_terms.length
+        u_num = uniq_cohort_terms.length
+      end
+      distribution_ontology_levels << [level, terms.length, num]
+      distribution_percentage << [
+        level,
+        (terms.length.fdiv(total_ontology_terms)*100).round(3),
+        (num.fdiv(total_cohort_terms)*100).round(3),
+        (u_num.fdiv(total_uniq_cohort_terms)*100).round(3)
+      ]
+    end
+    distribution_ontology_levels.sort! { |x,y| x.first <=> y.first }
+    distribution_percentage.sort! { |x,y| x.first <=> y.first }
+    return distribution_ontology_levels, distribution_percentage
   end
 
-
-
-
-
-
-
-
+  def get_dataset_specifity_index(mode)
+    ontology_levels, distribution_percentage = get_profile_ontology_distribution_tables
+    if mode == 'uniq'
+      observed_distribution = 3
+    elsif mode == 'weigthed'
+      observed_distribution = 2
+    end
+    max_terms = distribution_percentage.map{|row| row[1]}.max
+    maxL = nil
+    distribution_percentage.each do |level_info|
+      maxL = level_info.first if level_info[1] == max_terms
+    end
+    diffL = distribution_percentage.map{|l| [l[0], l[observed_distribution] - l[1]]}
+    diffL.select!{|dL| dL.last > 0}
+    highSection = diffL.select{|dL| dL.first > maxL}
+    lowSection = diffL.select{|dL| dL.first <= maxL}
+    dsi = nil
+    if highSection.empty?
+      dsi = 0
+    else
+      hss = get_weigthed_level_contribution(highSection, maxL, ontology_levels.length - maxL)
+      lss = get_weigthed_level_contribution(lowSection, maxL, maxL)
+      dsi = hss.fdiv(lss)
+    end
+    return dsi
   end
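DSI sketch ('uniq' reads the unique-terms percentage column, 'weigthed' the weighted one, per the column indices above; the mode strings are spelled exactly as in the code):

  onto.get_dataset_specifity_index('uniq')
  onto.get_dataset_specifity_index('weigthed')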
 
-
-
-
+  def get_weigthed_level_contribution(section, maxL, nLevels)
+    accumulated_weigthed_diffL = 0
+    section.each do |level, diff|
+      weightL = maxL - level
+      if weightL >= 0
+        weightL += 1
+      else
+        weightL = weightL.abs
+      end
+      accumulated_weigthed_diffL += diff * weightL
+    end
+    weigthed_contribution = accumulated_weigthed_diffL.fdiv(nLevels)
+    return weigthed_contribution
+  end
+
+  ########################################
+  ## GENERAL ONTOLOGY METHODS
+  ########################################
+
   def ==(other)
-    self.
-    self.stanzas == other.stanzas &&
+    self.terms == other.terms &&
     self.ancestors_index == other.ancestors_index &&
     self.alternatives_index == other.alternatives_index &&
-    self.obsoletes_index == other.obsoletes_index &&
     self.structureType == other.structureType &&
     self.ics == other.ics &&
     self.meta == other.meta &&
     self.dicts == other.dicts &&
     self.profiles == other.profiles &&
-    self.profilesDict == other.profilesDict &&
     (self.items.keys - other.items.keys).empty? &&
-    self.removable_terms == other.removable_terms &&
-    self.special_tags == other.special_tags &&
     self.items == other.items &&
     self.term_paths == other.term_paths &&
     self.max_freqs == other.max_freqs
@@ -2528,32 +1606,128 @@ class Ontology
 
   def clone
     copy = Ontology.new
-    copy.
-    copy.stanzas[:terms] = self.stanzas[:terms].clone
-    copy.stanzas[:typedefs] = self.stanzas[:typedefs].clone
-    copy.stanzas[:instances] = self.stanzas[:instances].clone
+    copy.terms = self.terms.clone
     copy.ancestors_index = self.ancestors_index.clone
     copy.descendants_index = self.descendants_index.clone
     copy.alternatives_index = self.alternatives_index.clone
-    copy.obsoletes_index = self.obsoletes_index.clone
     copy.structureType = self.structureType.clone
     copy.ics = self.ics.clone
     copy.meta = self.meta.clone
     copy.dicts = self.dicts.clone
     copy.profiles = self.profiles.clone
-    copy.profilesDict = self.profilesDict.clone
     copy.items = self.items.clone
-    copy.removable_terms = self.removable_terms.clone
     copy.term_paths = self.term_paths.clone
     copy.max_freqs = self.max_freqs.clone
     return copy
   end
 
+  # Exports an OBO_Handler object in JSON format
+  # ===== Parameters
+  # +file+:: where info will be stored
+  def write(file)
+    # Take object stored info
+    obj_info = {terms: @terms,
+                ancestors_index: @ancestors_index,
+                descendants_index: @descendants_index,
+                alternatives_index: @alternatives_index,
+                structureType: @structureType,
+                ics: @ics,
+                meta: @meta,
+                max_freqs: @max_freqs,
+                dicts: @dicts,
+                profiles: @profiles,
+                items: @items,
+                term_paths: @term_paths}
+    # Convert to JSON format & write
+    File.open(file, "w") { |f| f.write obj_info.to_json }
+  end
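Serialization sketch (the file name is illustrative; the method dumps the instance variables listed above as one JSON object):

  onto.write('ontology_index.json')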
+
+
+  def each(att = false)
+    warn('terms empty') if @terms.empty?
+    @terms.each do |id, tags|
+      if att
+        yield(id, tags)
+      else
+        yield(id)
+      end
+    end
+  end
+
+  def get_root
+    roots = []
+    each do |term|
+      roots << term if @ancestors_index[term].nil?
+    end
+    return roots
+  end
+
+  def list_term_attributes
+    terms = []
+    each do |code|
+      terms << [code, translate_id(code), get_term_level(code)]
+    end
+    return terms
+  end
+
+  # Gets ontology levels calculated
+  # ===== Returns
+  # ontology levels calculated
+  def get_ontology_levels
+    return @dicts[:level][:byTerm].clone # By term, in this case, is Key::Level, Value::Terms
+  end
 
-
-
-
+  private
+
+  def add2hash(hash, key, val)
+    query = hash[key]
+    if query.nil?
+      hash[key] = [val]
+    else
+      query << val
+    end
+  end
+
+  def add2nestHash(h, key1, key2, val)
+    query1 = h[key1]
+    if query1.nil?
+      h[key1] = {key2 => val}
+    else
+      query1[key2] = val
+    end
+  end
 
-
-
+  # Internal function to concat two elements.
+  # ===== Parameters
+  # +itemA+:: item to be concatenated
+  # +itemB+:: item to be concatenated
+  # ===== Returns
+  # Concatenated objects
+  def concatItems(itemA,itemB) # NEED TEST, CHECK WITH PSZ THIS METHOD
+    # A is Array :: RETURN ARRAY
+    #   A_array : B_array
+    #   A_array : B_hash => NOT ALLOWED
+    #   A_array : B_single => NOT ALLOWED
+    # A is Hash :: RETURN HASH
+    #   A_hash : B_array => NOT ALLOWED
+    #   A_hash : B_hash
+    #   A_hash : B_single => NOT ALLOWED
+    # A is single element => RETURN ARRAY
+    #   A_single : B_array
+    #   A_single : B_hash => NOT ALLOWED
+    #   A_single : B_single
+    concatenated = nil
+    if itemA.kind_of?(Array) && itemB.kind_of?(Array)
+      concatenated = itemA | itemB
+    elsif itemA.kind_of?(Hash) && itemB.kind_of?(Hash)
+      concatenated = itemA.merge(itemB) do |k, oldV, newV|
+        self.concatItems(oldV,newV)
+      end
+    elsif itemB.kind_of?(Array)
+      concatenated = ([itemA] + itemB).uniq
+    elsif ![Array, Hash].include?(itemB.class)
+      concatenated = [itemA,itemB].uniq
+    end
+    return concatenated
+  end
 end
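Behavior sketch for the private concatItems, following its combination table (results derivable from the branches above):

  concatItems([1, 2], [2, 3])      # => [1, 2, 3]    (array union)
  concatItems({a: [1]}, {a: [2]})  # => {a: [1, 2]}  (hash values merged recursively)
  concatItems(1, [2, 3])           # => [1, 2, 3]    (single element prepended)
  concatItems(1, 2)                # => [1, 2]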