semtools 0.1.6 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +4 -1
- data/README.md +2 -0
- data/bin/semtools.rb +521 -0
- data/bin/strsimnet.rb +1 -2
- data/external_data/ontologies.txt +4 -0
- data/lib/semtools/ontology.rb +1241 -2002
- data/lib/semtools/parsers/file_parser.rb +32 -0
- data/lib/semtools/parsers/json_parser.rb +84 -0
- data/lib/semtools/parsers/oboparser.rb +511 -0
- data/lib/semtools/sim_handler.rb +1 -1
- data/lib/semtools/version.rb +1 -1
- data/lib/semtools.rb +3 -1
- data/semtools.gemspec +3 -1
- metadata +40 -6
- data/lib/semtools/math_methods.rb +0 -148
@@ -0,0 +1,32 @@
|
|
1
|
+
class FileParser
  #############################################
  # FIELDS
  #############################################
  # Class variables shared with every parser that inherits from FileParser
  # => @@basic_tags :: hash with the main OBO structure tags
  # => @@tags_with_trailing_modifiers :: tags whose value can carry extra info after a text modifier
  # => @@multivalue_tags :: tags that can appear several times in one stanza
  # => @@symbolizable_ids :: tags whose values can be symbolized (includes the trailing-modifier tags)

  @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:replaced_by, :consider, :alt_id]}
  @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
  @@multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider, :subsetdef, :synonymtypedef, :property_value, :remark]
  @@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider] + @@tags_with_trailing_modifiers

  # Symbolize, in place, the values stored under every symbolizable tag of a hash.
  # Array values have each element converted; any other non-nil value is converted directly.
  # ===== Parameters
  # +item_hash+:: hash to be checked (it is modified)
  # ===== Returns
  # The list of symbolizable tags that was iterated
  def self.symbolize_ids(item_hash)
    @@symbolizable_ids.each do |tag|
      current = item_hash[tag]
      next if current.nil?

      if current.is_a?(Array)
        current.map!(&:to_sym)
      else
        item_hash[tag] = current.to_sym
      end
    end
  end

end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
class JsonParser < FileParser

  # Load an ontology stored as JSON into the given ontology object.
  # ===== Parameters
  # +ontology+:: object that receives the loaded fields
  # +file+:: path of the JSON file
  # +build+:: if true, ontology.precompute is called after loading. Default: true
  def self.load(ontology, file, build: true)
    read(ontology, file, build: build) # forward the flag (it used to be silently dropped)
  end

  # Read a JSON file holding a dumped ontology, convert the ID-like strings back
  # to symbols and store every section in the ontology object.
  # ===== Parameters
  # +ontology+:: object that receives the loaded fields through its writers
  # +file+:: path of the JSON file with the object info
  # +build+:: if true, ontology.precompute is called at the end. Default: true
  # ===== Return
  # Result of ontology.precompute when +build+ is true, nil otherwise
  def self.read(ontology, file, build: true)
    # Read file (File.read closes the handle for us)
    jsonInfo = JSON.parse(File.read(file), :symbolize_names => true)

    # Pre-process (symbolize some hash values)
    jsonInfo[:terms].each_value{|info| symbolize_ids(info)} # STANZAS

    # Optional sections
    unless jsonInfo[:alternatives_index].nil?
      jsonInfo[:alternatives_index] = jsonInfo[:alternatives_index].map{|id, value| [id, value.to_sym]}.to_h
    end
    unless jsonInfo[:ancestors_index].nil?
      jsonInfo[:ancestors_index].each_value{|family_arr| family_arr.map!{|item| item.to_sym}}
    end
    unless jsonInfo[:descendants_index].nil?
      jsonInfo[:descendants_index].each_value{|family_arr| family_arr.map!{|item| item.to_sym}}
    end

    unless jsonInfo[:dicts].nil? # dicts is optional too; without this guard a missing section raised NoMethodError
      jsonInfo[:dicts].each do |flag, dictionaries|
        next if dictionaries.nil?
        # Special case: byTerm
        by_term = dictionaries[:byTerm].map do |term, value|
          if !term.to_s.scan(/\A[-+]?[0-9]*\.?[0-9]+\Z/).empty? # Numeric dictionary
            [term.to_s.to_i, value.map{|v| v.to_sym}]
          elsif value.kind_of?(Array) && flag == :is_a
            [term.to_sym, value.map{|v| v.to_sym}]
          else # Numeric values and any other value are stored untouched
            [term.to_sym, value]
          end
        end
        dictionaries[:byTerm] = by_term.to_h
        # By value
        by_value = dictionaries[:byValue].map do |value, term|
          if value.is_a? Numeric # Numeric dictionary
            [value, term.to_sym]
          elsif term.is_a? Numeric # Numeric dictionary
            [value.to_s.to_sym, term]
          elsif flag == :is_a
            [value.to_sym, term.map{|v| v.to_sym}]
          elsif term.kind_of?(Array)
            [value.to_sym, term.map{|t| t.to_sym}]
          else
            [value.to_s, term.to_sym]
          end
        end
        dictionaries[:byValue] = by_value.to_h
      end
    end

    if !jsonInfo[:profiles].nil?
      profiles = jsonInfo[:profiles]
      profiles.each_value{|terms| terms.map!{|term| term.to_sym}}
      # Numeric-looking profile IDs are re-keyed as integers (keys is a snapshot, so the hash can be edited)
      profiles.keys.each do |id|
        profiles[id.to_s.to_i] = profiles.delete(id) if self.is_number?(id.to_s)
      end
    end
    jsonInfo[:removable_terms] = jsonInfo[:removable_terms].map{|term| term.to_sym} unless jsonInfo[:removable_terms].nil?
    jsonInfo[:items].each{|k, v| jsonInfo[:items][k] = v.map{|item| item.to_sym}} unless jsonInfo[:items].nil?
    unless jsonInfo[:term_paths].nil?
      jsonInfo[:term_paths].each do |term, info|
        jsonInfo[:term_paths][term][:paths] = info[:paths].map{|path| path.map{|t| t.to_sym}}
      end
    end

    # Store info
    ontology.terms = jsonInfo[:terms]
    ontology.ancestors_index = jsonInfo[:ancestors_index]
    ontology.descendants_index = jsonInfo[:descendants_index]
    ontology.alternatives_index = jsonInfo[:alternatives_index]
    jsonInfo[:structureType] = jsonInfo[:structureType].to_sym unless jsonInfo[:structureType].nil?
    ontology.structureType = jsonInfo[:structureType]
    ontology.ics = jsonInfo[:ics]
    ontology.meta = jsonInfo[:meta]
    ontology.max_freqs = jsonInfo[:max_freqs]
    ontology.dicts = jsonInfo[:dicts]
    ontology.profiles = jsonInfo[:profiles]
    ontology.items = jsonInfo[:items]
    ontology.term_paths = jsonInfo[:term_paths]

    ontology.precompute() if build
  end

  # Check if a string can be parsed as a number by Kernel#Float
  # ===== Parameters
  # +string+:: text to be checked
  # ===== Return
  # true if Float accepts the text, false in other cases
  def self.is_number? string
    Float(string)
    true
  rescue StandardError # Float raises ArgumentError/TypeError on invalid input
    false
  end

end
|
@@ -0,0 +1,511 @@
|
|
1
|
+
class OboParser < FileParser

  #############################################
  # FIELDS
  #############################################
  # Parser state is kept in class variables, so a single OBO file is handled at a time
  # (OboParser.load calls OboParser.reset before reading).
  # => @@header :: file header (if it is available)
  # => @@stanzas :: OBO stanzas {:terms,:typedefs,:instances}
  # => @@removable_terms :: array of terms to not be considered
  # => @@alternatives_index :: hash of alternative IDs (includes alt_id and obsoletes)
  # => @@obsoletes :: hash whose keys are the terms flagged as obsolete
  # => @@structureType :: type of ontology structure depending on ancestors relationship. Allowed: {atomic, sparse, circular, hierarchical}
  # => @@ancestors_index :: hash of ancestors per each term handled with any structure relationships
  # => @@descendants_index :: hash of descendants per each term handled with any structure relationships
  # => @@reroot :: flag handed to the ontology; when true the structure is reported as sparse
  # => @@dicts :: bidirectional dictionaries with three levels <key|value>: 1) <tag|hash2>; 2) <(:byTerm/:byValue)|hash3>; 3) dictionary <k|v>

  @@header = nil
  @@stanzas = {terms: {}, typedefs: {}, instances: {}}
  @@removable_terms = []
  @@alternatives_index = {}
  @@obsoletes = {}
  @@structureType = nil
  @@ancestors_index = {}
  @@descendants_index = {}
  @@reroot = false
  @@dicts = {}

  # Restore every class variable to its initial (empty) value
  def self.reset
    @@header = nil
    @@stanzas = {terms: {}, typedefs: {}, instances: {}}
    @@removable_terms = []
    @@alternatives_index = {}
    @@obsoletes = {}
    @@structureType = nil
    @@ancestors_index = {}
    @@descendants_index = {}
    @@reroot = false
    @@dicts = {}
  end

  # Iterate over the loaded term stanzas
  # ===== Parameters
  # +att+:: if true, yields (id, tags); if false, yields only the id
  # +only_main+:: if true, terms stored as alternative or obsolete are skipped
  # ===== Returns
  # An Enumerator when no block is given; the terms hash in other cases
  def self.each(att = false, only_main = true)
    return enum_for(:each, att, only_main) unless block_given?
    warn('stanzas terms empty') if @@stanzas[:terms].empty?
    @@stanzas[:terms].each do |id, tags|
      next if only_main && (@@alternatives_index.include?(id) || @@obsoletes.include?(id))
      if att
        yield(id, tags)
      else
        yield(id)
      end
    end
  end

  # Load an OBO file and, optionally, build the indexes and store them into the ontology
  # ===== Parameters
  # +ontology+:: object that receives the calculated info (only used when +build+ is true)
  # +file+:: OBO file to be loaded
  # +build+:: if true, indexes and dictionaries are calculated. Default: true
  # +black_list+:: term IDs to be removed from the loaded terms
  # +extra_dicts+:: list of [tag, keyword-parameters hash] pairs forwarded to calc_dictionary
  def self.load(ontology, file, build: true, black_list: [], extra_dicts: [])
    reset # Clean class variables to avoid the mix of several obo loads
    @@removable_terms = black_list
    _, header, stanzas = self.load_obo(file)
    @@header = header
    @@stanzas = stanzas
    self.remove_black_list_terms() if !@@removable_terms.empty?
    self.build_index(ontology, extra_dicts: extra_dicts) if build
  end

  # Class method to load an OBO format file (based on OBO 1.4 format). Specially focused on loading
  # the Header, the Terms, the Typedefs and the Instances.
  # ===== Parameters
  # +file+:: OBO file to be loaded
  # ===== Returns
  # Three values: hash with FILE info (:file, :name), HEADER and STANZAS
  def self.load_obo(file)
    raise("File is not defined") if file.nil?
    # Data variables
    header = ''
    stanzas = {terms: {}, typedefs: {}, instances: {}}
    # Auxiliary variables
    infoType = 'Header'
    currInfo = []
    stanzas_flags = %w[[Term] [Typedef] [Instance]]
    # Read file (File.foreach closes the file when the block ends)
    File.foreach(file) do |line|
      line.chomp!
      next if line.empty?
      # Check if a new stanza is found
      if stanzas_flags.include?(line)
        header = self.process_entity(header, infoType, stanzas, currInfo)
        # Update info variables
        currInfo = []
        infoType = line.gsub(/[\[\]]/, '')
        next
      end
      # Concat info as a [tag, value] pair
      currInfo << line.split(':', 2)
    end
    # Store last loaded info
    header = self.process_entity(header, infoType, stanzas, currInfo) if !currInfo.empty?
    # Prepare to return
    finfo = {:file => file, :name => File.basename(file, File.extname(file))}
    return finfo, header, stanzas
  end

  # Handle OBO loaded info and store it into the correct container and format
  # ===== Parameters
  # +header+:: container
  # +infoType+:: current ontology item type detected ('Header', 'Term', 'Typedef' or 'Instance')
  # +stanzas+:: container
  # +currInfo+:: info to be stored
  # ===== Returns
  # header newly/already stored
  def self.process_entity(header, infoType, stanzas, currInfo)
    info = self.info2hash(currInfo)
    # Store current info
    if infoType.eql?('Header')
      header = info
    else
      id = info[:id]
      case infoType
      when 'Term'
        stanzas[:terms][id] = info
      when 'Typedef'
        stanzas[:typedefs][id] = info
      when 'Instance'
        stanzas[:instances][id] = info
      end
    end
    return header
  end

  # Class method to transform <tag : info> tuples into a hash structure
  # ===== Parameters
  # +attributes+:: array of [tag, value] tuples to be transformed into hash format (value strings are modified in place)
  # +split_char+:: text that separates a value from its trailing modifier
  # +selected_field+:: index of the field kept after splitting by +split_char+
  # ===== Returns
  # Attributes stored into hash structure
  def self.info2hash(attributes, split_char = " ! ", selected_field = 0)
    # Load info
    info_hash = {}
    # Only TERMS multivalue tags (future add Typedefs and Instance)
    attributes.each do |tag, value|
      # Check (done before touching the value, so a malformed line gives the intended error)
      raise EncodingError, 'Info element incorrect format' if (tag.nil?) || (value.nil?)
      value.gsub!(/{[\\\":A-Za-z0-9\/\.\-, =?&_]+} /, '') if tag == 'is_a' # To delete extra attributes (source, xref) in is_a tag of MONDO ontology
      # Prepare
      tag = tag.lstrip.to_sym
      value.lstrip!
      value = value.split(split_char)[selected_field].to_sym if @@tags_with_trailing_modifiers.include?(tag)

      # Store
      query = info_hash[tag]
      if !query.nil? # Tag already exists
        if !query.kind_of?(Array) # Check that tag is multivalue
          raise("Attempt to concatenate plain text with another. The tag is not declared as multivalue. [#{tag}](#{query})")
        else
          query << value # Add new value to tag
        end
      else # New entry
        if @@multivalue_tags.include?(tag)
          info_hash[tag] = [value]
        else
          info_hash[tag] = value
        end
      end
    end
    self.symbolize_ids(info_hash)
    return info_hash
  end

  # Delete the black-listed terms from the loaded term stanzas
  def self.remove_black_list_terms()
    @@removable_terms.each{|removableID| @@stanzas[:terms].delete(removableID)}
  end

  # Executes basic expansions of tags (alternatives, obsoletes and parentals) with default values,
  # calculates the :name, :synonym and :is_a dictionaries plus the extra ones requested,
  # and stores the results into the ontology object.
  # ===== Parameters
  # +ontology+:: object that receives the calculated info through its writers
  # +extra_dicts+:: list of [tag, keyword-parameters hash] pairs forwarded to calc_dictionary
  def self.build_index(ontology, extra_dicts: [])
    self.get_index_obsoletes
    self.get_index_alternatives
    self.remove_obsoletes_in_terms
    self.get_index_child_parent_relations
    # Keep only references that point to loaded terms
    @@alternatives_index.transform_values!{|v| self.extract_id(v)}
    @@alternatives_index.compact!
    @@ancestors_index.each{|k,v| @@ancestors_index[k] = v.map{|t| self.extract_id(t)}.compact}
    @@descendants_index.each{|k,v| @@descendants_index[k] = v.map{|t| self.extract_id(t)}.compact}
    self.calc_dictionary(:name)
    self.calc_dictionary(:synonym, select_regex: /\"(.*)\"/)
    self.calc_ancestors_dictionary
    extra_dicts.each do |dict_tag, extra_parameters|
      self.calc_dictionary(dict_tag, **extra_parameters) # https://www.justinweiss.com/articles/fun-with-keyword-arguments/
    end
    ontology.terms = @@stanzas[:terms]
    ontology.alternatives_index = @@alternatives_index
    ontology.obsoletes = @@obsoletes
    ontology.ancestors_index = @@ancestors_index
    ontology.descendants_index = @@descendants_index
    ontology.reroot = @@reroot
    ontology.structureType = @@structureType
    ontology.dicts = @@dicts
  end

  # Once alternative and obsolete indexes are loaded, use this to keep only working terms
  def self.remove_obsoletes_in_terms()
    terms = @@stanzas[:terms]
    @@obsoletes.each_key{|term| terms.delete(term)}
  end

  # Expand obsoletes set and link info to their alternative IDs
  # ===== Parameters
  # +obs_tag+:: tag to be used to find obsoletes
  # +alt_tags+:: tags to find alternative IDs (if they are available)
  def self.get_index_obsoletes(obs_tag: @@basic_tags[:obsolete], alt_tags: @@basic_tags[:alternative])
    each(true) do |id, term_tags|
      obs_value = term_tags[obs_tag]
      if obs_value == 'true' # Obsolete tag presence, must be checked as string
        alt_ids = alt_tags.map{|alt| term_tags[alt]}.compact # Check if alternative value is available
        if !alt_ids.empty?
          alt_id = alt_ids.first.first # FIRST tag, FIRST id
          @@alternatives_index[id] = alt_id
        end
        @@obsoletes[id] = true
      end
    end
  end

  # Expand alternative IDs around all already stored terms
  # ===== Parameters
  # +alt_tag+:: tag used to expand alternative IDs
  def self.get_index_alternatives(alt_tag: @@basic_tags[:alternative].last)
    each(true) do |id, tags|
      alt_ids = tags[alt_tag]
      if !alt_ids.nil?
        alt_ids = alt_ids - @@removable_terms - [id]
        alt_ids.each do |alt_term|
          @@alternatives_index[alt_term] = id
        end
      end
    end
  end

  # Expand parentals set: stores the structure type, the ancestors index and the descendants index
  # ===== Parameters
  # +tag+:: tag used to expand parentals
  def self.get_index_child_parent_relations(tag: @@basic_tags[:ancestors][0])
    structType, parentals = self.get_related_ids_by_tag(terms: @@stanzas[:terms],
                                                        target_tag: tag,
                                                        reroot: @@reroot)
    if structType.nil? || parentals.nil?
      raise('Error expanding parentals')
    elsif ![:atomic,:sparse].include?(structType) # Check structure
      structType = structType == :circular ? :circular : :hierarchical
    end
    @@structureType = structType

    parentals.each do |id, parents|
      parents = parents - @@removable_terms
      @@ancestors_index[id] = parents
      parents.each{|anc_id| self.add2hash(@@descendants_index, anc_id, id)}
    end
  end

  # Expand terms using a specific tag and return all extended terms into a hash and
  # the relationship structure observed. If a circular structure is
  # found, the extended array will be a unique vector without the starting term (no loops)
  # ===== Parameters
  # +terms+:: set to be used to expand
  # +target_tag+:: tag used to expand
  # +reroot+:: if true, the structure is reported as :sparse
  # ===== Returns
  # The observed structure (symbol) and the hash with extended terms
  def self.get_related_ids_by_tag(terms:, target_tag:, reroot: false)
    structType = :hierarchical
    related_ids = {}
    terms.each do |id, tags|
      if !tags[target_tag].nil?
        set_structure, _ = self.get_related_ids(id, terms, target_tag, related_ids)
        structType = :circular if set_structure == :circular # Check structure
      end
    end

    # Check special cases
    structType = :atomic if related_ids.length <= 0
    structType = :sparse if reroot || (related_ids.length > 0 && ((terms.length - related_ids.length ) >= 2) )
    return structType, related_ids
  end

  # Expand a (starting) term using a specific tag and return all extended terms into an array and
  # the relationship structure observed (hierarchical or circular). If a circular structure is
  # found, the extended array will be a unique vector without the starting term (no loops).
  # +Note+: we strongly recommend using get_related_ids_by_tag instead of calling this directly
  # ===== Parameters
  # +start_id+:: term where the expansion starts
  # +terms+:: set to be used to expand
  # +target_tag+:: tag used to expand
  # +related_ids+:: already expanded info (it is updated by this method)
  # ===== Returns
  # A vector with the observed structure (symbol) and the array with extended terms.
  def self.get_related_ids(start_id, terms, target_tag, related_ids = {})
    # Take start_id term available info and already accumulated info
    current_associations = related_ids[start_id]
    current_associations = [] if current_associations.nil?
    return [:no_term,[]] if terms[start_id].nil?
    id_relations = terms[start_id][target_tag]
    return [:source,[]] if id_relations.nil?

    struct = :hierarchical

    # Study direct extensions
    id_relations.each do |id|
      # Handle
      if current_associations.include?(id) # Check if already have been included into this expansion
        next
        # Old code marked this case as :circular, which gave circular status in real obo files.
        # TODO: CHECK CAREFULLY THIS METHOD.
      else
        current_associations << id
        if related_ids.include?(id) # Check if current already has been expanded
          current_associations = current_associations | related_ids[id]
          if current_associations.include?(start_id) # Check circular case
            struct = :circular
            current_associations = current_associations - [id, start_id]
          end
        else # Expand
          related_ids[start_id] = current_associations
          structExp, current_related_ids = self.get_related_ids(id, terms, target_tag, related_ids) # Expand current
          current_associations = current_associations | current_related_ids
          struct = :circular if structExp == :circular # Check struct
          if current_associations.include?(start_id) # Check circular case
            struct = :circular
            current_associations.delete(start_id)
          end
        end
      end
    end
    related_ids[start_id] = current_associations

    return struct, current_associations
  end

  # Calculates :is_a dictionary
  def self.calc_ancestors_dictionary
    self.calc_dictionary(:is_a, self_type_references: true, multiterm: true)
  end

  # Generate a bidirectional dictionary set using a specific tag and the terms stanzas set.
  # byTerm links each term to the array of values found for the tag; byValue links each value
  # to its term (or to an array of terms when +multiterm+ is true).
  # This function stores the calculated dictionary into the @@dicts field.
  # ===== Parameters
  # +tag+:: to be used to calculate dictionary
  # +select_regex+:: gives a regex that can be used to modify the value to be stored
  # +store_tag+:: flag used to store dictionary. If nil, mandatory tag given will be used
  # +multiterm+:: if true, byValue will allow multi-term linkage (array)
  # +self_type_references+:: if true, program assumes that references will be between Ontology terms, and their term IDs will be checked
  # ===== Return
  # hash with dict data. And stores the calculated bidirectional dictionary into the dictionaries main container
  def self.calc_dictionary(tag, select_regex: nil, store_tag: nil, multiterm: false, self_type_references: false)
    tag = tag.to_sym
    store_tag = tag if store_tag.nil?

    byTerm = {}
    byValue = {}
    # Calc per term
    each(true, false) do |term, tags|
      referenceTerm = term
      queryTag = tags[tag]
      next if queryTag.nil?
      # Pre-process
      queryTag = apply_select_regex(queryTag, select_regex)
      if queryTag.kind_of?(Array) # Store
        if !queryTag.empty?
          if byTerm.include?(referenceTerm)
            byTerm[referenceTerm] = (byTerm[referenceTerm] + queryTag).uniq
          else
            byTerm[referenceTerm] = queryTag
          end
          if multiterm
            queryTag.each do |value|
              byValue[value] = [] if byValue[value].nil?
              byValue[value] << referenceTerm
            end
          else
            queryTag.each{|value| byValue[value] = referenceTerm}
          end
        end
      else
        if byTerm.include?(referenceTerm)
          byTerm[referenceTerm] = (byTerm[referenceTerm] + [queryTag]).uniq
        else
          byTerm[referenceTerm] = [queryTag]
        end
        if multiterm
          byValue[queryTag] = [] if byValue[queryTag].nil?
          byValue[queryTag] << referenceTerm
        else
          byValue[queryTag] = referenceTerm
        end
      end
    end

    # Check self-references
    if self_type_references
      byTerm.each do |term, references|
        corrected_references = references.map do |t|
          checked = self.extract_id(t)
          if checked.nil?
            t
          else
            byValue[checked] = byValue.delete(t) if checked != t && byValue[checked].nil? # Update in byValue
            checked
          end
        end
        byTerm[term] = corrected_references.uniq
      end
    end

    # Check order: values read from the term stanza go first, any other value is appended after them
    byTerm.each do |term, values|
      next unless self.exists?(term)
      referenceValue = @@stanzas[:terms][term][tag]
      next if referenceValue.nil?
      referenceValue = apply_select_regex(referenceValue, select_regex)
      if self_type_references
        if referenceValue.kind_of?(Array)
          referenceValue = referenceValue.map{|t| self.extract_id(t)}.compact
        else
          checked = self.extract_id(referenceValue)
          referenceValue = checked unless checked.nil?
        end
      end
      referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
      byTerm[term] = referenceValue + (values - referenceValue)
    end

    # Store
    dict = {byTerm: byTerm, byValue: byValue}
    @@dicts[store_tag] = dict
    return dict
  end

  # Check if a given ID is stored as term into this object
  # ===== Parameters
  # +id+:: to be checked
  # ===== Return
  # True if term is stored or false in other cases
  def self.exists? id
    return @@stanzas[:terms].include?(id)
  end

  # Check if a term given is marked as obsolete
  def self.is_obsolete? term
    return @@obsoletes.include?(term)
  end

  # Check if a term given is marked as alternative
  def self.is_alternative? term
    return @@alternatives_index.include?(term)
  end

  # This method assumes that a text given contains an allowed ID. And will try to obtain it by splitting it
  # ===== Parameters
  # +text+:: to be checked
  # +splitBy+:: separator used to split the text; the first field is taken as ID candidate
  # ===== Return
  # The correct ID if it can be found or nil in other cases (including nil or blank text)
  def self.extract_id(text, splitBy: ' ')
    return text if self.exists?(text)
    candidate = text.to_s.split(splitBy).first
    return nil if candidate.nil? # nil or blank text: there is nothing to look for
    splittedText = candidate.to_sym
    return self.exists?(splittedText) ? splittedText : nil
  end

  # Add a value to the array stored under a key, creating the array if the key is new.
  # NOTE: a bare `private` keyword does not apply to `def self.` methods, so this helper
  # has always been publicly callable; it is kept public to not break external callers.
  def self.add2hash(hash, key, val)
    query = hash[key]
    if query.nil?
      hash[key] = [val]
    else
      query << val
    end
  end

  # Apply the selection regex to a tag value (scalar or array).
  # Returns the value untouched when no regex is given; in other cases returns an array
  # with the first match (or its capture groups) of every value, without nils.
  def self.apply_select_regex(raw_value, select_regex)
    return raw_value if select_regex.nil?
    Array(raw_value).flat_map{|value| Array(value.scan(select_regex).first)}.compact
  end
  private_class_method :apply_select_regex

end
|
data/lib/semtools/sim_handler.rb
CHANGED
@@ -92,7 +92,7 @@ end
|
|
92
92
|
# +charsToRemove+:: char (or chars set) to be removed from texts to be compared
|
93
93
|
# +unique+:: boolean flag which indicates if repeated elements must be removed
|
94
94
|
# Returns the similarity percentage for all elements into array
|
95
|
-
def similitude_network(items_array, splitChar
|
95
|
+
def similitude_network(items_array, splitChar: ";", charsToRemove: "", unique: false)
|
96
96
|
# Special cases
|
97
97
|
return nil if items_array.nil?
|
98
98
|
return nil if !items_array.is_a? Array
|
data/lib/semtools/version.rb
CHANGED
data/lib/semtools.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
require "semtools/version"
|
2
2
|
require "semtools/sim_handler"
|
3
|
-
require "semtools/math_methods"
|
4
3
|
require "semtools/ontology"
|
4
|
+
require "semtools/parsers/file_parser"
|
5
|
+
require "semtools/parsers/json_parser"
|
6
|
+
require "semtools/parsers/oboparser"
|
5
7
|
|
6
8
|
module Semtools
|
7
9
|
# Your code goes here...
|
data/semtools.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.authors = ["seoanezonjic", "fmjabato"]
|
10
10
|
spec.email = ["seoanezonjic@hotmail.com", "fmjabato@gmail.com"]
|
11
11
|
|
12
|
-
spec.summary = %q{Gem to handle semantic based calculations in text and defined ontologies as GO or HPO.}
|
12
|
+
spec.summary = %q{DEPRECATED PROJECT. MIGRATED TO PYTHON: https://github.com/seoanezonjic/py_semtools. Gem to handle semantic based calculations in text and defined ontologies as GO or HPO.}
|
13
13
|
spec.description = %q{This gem allows to perform ontology based operations and calculation of Semantic similarity and information coefficient using different implementations.}
|
14
14
|
spec.homepage = "https://github.com/seoanezonjic/semtools"
|
15
15
|
spec.license = "MIT"
|
@@ -31,6 +31,8 @@ Gem::Specification.new do |spec|
|
|
31
31
|
spec.require_paths = ["lib"]
|
32
32
|
|
33
33
|
spec.add_dependency "text"
|
34
|
+
spec.add_dependency "down"
|
35
|
+
spec.add_dependency "expcalc"
|
34
36
|
|
35
37
|
spec.add_development_dependency "rake"
|
36
38
|
spec.add_development_dependency "rspec"
|