ovos-adapt-parser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ovos_adapt/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ # Copyright 2018 Mycroft AI Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+
16
__author__ = 'seanfitz'
import os

# Use the README as the package docstring when a README.md is present.
# The path is relative, so the lookup happens in the current working
# directory of the importing process.
if os.path.exists('README.md'):
    import codecs
    # Read through a context manager so the file handle is closed as soon as
    # the text has been loaded instead of lingering until garbage collection.
    with codecs.open('README.md', encoding='utf-8', mode='r') as readme_file:
        __doc__ = readme_file.read()
21
+
ovos_adapt/context.py ADDED
@@ -0,0 +1,154 @@
1
+ # Copyright 2018 Mycroft AI Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+
16
+ """
17
+ Context Management code for Adapt (where context ~= persistent session state).
18
+ """
19
+ from six.moves import xrange
20
+
21
+ __author__ = "seanfitz, Art McGee"
22
+
23
+
24
class ContextManagerFrame(object):
    """
    One frame of conversational context.

    A frame keeps the entities collected for it together with a metadata
    mapping that can be compared against a query for equality.

    Attributes:
        entities(list): entities stored in this frame
        metadata(object): mapping describing the context of this frame
    """
    def __init__(self, entities=None, metadata=None):
        """
        Create a frame.

        Args:
            entities(list): initial entities; any falsy value is replaced
                by a new empty list
            metadata(object): initial metadata; any falsy value is replaced
                by a new empty dict
        """
        self.entities = entities if entities else []
        self.metadata = metadata if metadata else {}

    def metadata_matches(self, query=None):
        """
        Check whether query is a non-empty subset of this frame's metadata.

        Args:
            query(object): mapping of metadata keys to the values they are
                expected to have

        Returns:
            bool:
                True: query has at least one key and every value in query
                    equals the value stored under the same key in
                    self.metadata (a missing key is read as None)
                False: query is empty/None or at least one value differs
        """
        if not query:
            return False
        return all(query[name] == self.metadata.get(name)
                   for name in query.keys())

    def merge_context(self, tag, metadata):
        """
        Fold a new entity and its metadata into this frame.

        The tag is appended to self.entities. Metadata keys not yet present
        in self.metadata are added; keys that already exist keep their
        current value.

        Args:
            tag(str): entity to append to self.entities
            metadata(object): mapping whose unseen keys are copied into
                self.metadata
        """
        self.entities.append(tag)
        for name, value in metadata.items():
            # setdefault never overwrites an existing key
            self.metadata.setdefault(name, value)
83
+
84
+
85
class ContextManager(object):
    """
    ContextManager
    Tracks context across a conversational session as a stack of frames,
    with the most recent frame stored at index 0.
    Managing the lifecycle of the session itself is not handled here.
    """
    def __init__(self):
        self.frame_stack = []

    def inject_context(self, entity, metadata=None):
        """
        Add an entity to the current context.

        When the newest frame's metadata matches the given metadata the
        entity is merged into that frame; otherwise a new frame holding the
        entity is pushed to the front of the stack.

        Args:
            entity(object):
                format {'data': 'Entity tag as <str>',
                        'key': 'entity proper name as <str>',
                        'confidence': <float>'
                        }
            metadata(object): dict, arbitrary metadata about the entity being added
        """
        metadata = metadata or {}
        newest = self.frame_stack[0] if self.frame_stack else None
        if newest and newest.metadata_matches(metadata):
            newest.merge_context(entity, metadata)
            return
        # The new frame gets its own copy of the metadata mapping.
        fresh_frame = ContextManagerFrame(entities=[entity],
                                          metadata=metadata.copy())
        self.frame_stack.insert(0, fresh_frame)

    def get_context(self, max_frames=None, missing_entities=None):
        """
        Collect context entities, weighting confidence by stack depth.

        Each returned entity is a copy whose confidence (1.0 when absent)
        is divided by 2.0 plus the index of the frame it came from.

        Args:
            max_frames(int): maximum number of frames to look back; a falsy
                value or one larger than the stack means every frame
            missing_entities(list of str): a list or set of tag names, as strings

        Returns:
            list: a list of entities; when missing_entities is non-empty,
                only entities whose 'data' is requested, at most once per
                occurrence of the tag in missing_entities
        """
        wanted = list(missing_entities or [])
        depth = len(self.frame_stack)
        if not max_frames or max_frames > depth:
            max_frames = depth

        context = []
        # Clamp at zero so a negative max_frames visits no frames.
        for position, frame in enumerate(self.frame_stack[:max(max_frames, 0)]):
            for stored in frame.entities:
                entity = stored.copy()
                entity['confidence'] = entity.get('confidence', 1.0) / (2.0 + position)
                context.append(entity)

        if not wanted:
            return context

        selected = []
        for entity in context:
            tag = entity.get('data')
            if tag in wanted:
                selected.append(entity)
                # NOTE: consuming the tag means only one entity of a kind is
                # returned from context unless the tag is listed several
                # times in missing_entities.
                wanted.remove(tag)
        return selected
152
+
153
+
154
+
ovos_adapt/engine.py ADDED
@@ -0,0 +1,490 @@
1
+ # Copyright 2018 Mycroft AI Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+
16
+ import re
17
+ import heapq
18
+ from ovos_adapt.entity_tagger import EntityTagger
19
+ from ovos_adapt.parser import Parser
20
+ from ovos_adapt.tools.text.tokenizer import EnglishTokenizer
21
+ from ovos_adapt.tools.text.trie import Trie
22
+
23
+ __author__ = 'seanfitz'
24
+
25
+
26
class IntentDeterminationEngine(object):
    """
    IntentDeterminationEngine

    The IntentDeterminationEngine is a greedy and naive implementation of intent determination. Given an utterance,
    it uses the Adapt parsing tools to come up with a sorted collection of tagged parses. A valid parse result contains
    no overlapping tagged entities, and its confidence is the sum of the tagged entity confidences, which are
    weighted based on the percentage of the utterance (per character) that the entity match represents.

    This system makes heavy use of generators to enable greedy algorithms to short circuit large portions of
    computation, however making use of context or regular expressions prevents these optimizations.
    """
    def __init__(self, tokenizer=None, trie=None):
        """
        Initialize the IntentDeterminationEngine

        Args:
            tokenizer(tokenizer) : tokenizer used to break up spoken text
                example EnglishTokenizer(); a falsy value creates a new
                EnglishTokenizer
            trie(Trie): tree of matches to Entities; a falsy value creates
                a new Trie
        """
        self.tokenizer = tokenizer or EnglishTokenizer()
        self.trie = trie or Trie()
        self.regular_expressions_entities = []
        self._regex_strings = set()
        self.intent_parsers = []

    def __best_intent(self, parse_result, context=None):
        """
        For the specified parse_result, find the intent parser with the
        highest confidence match.

        Args:
            parse_result(dict): a parse result providing 'tags' and
                'confidence'
            context(list): context entities (dicts with a 'key') that are
                offered to the intent parsers in addition to the parsed
                tags; defaults to no context

        Returns:
            best_intent, best_tags:
                best_intent : The best intent for given results
                best_tags : The Tags for result
        """
        # A None default avoids sharing one mutable list between calls.
        context = context or []
        best_intent = None
        best_tags = None
        # TODO: there's a bunch of subtlety here around what the values of `match` and `key` should be
        # Longer term, this should probably be typed, barring any performance regressions.
        context_as_entities = [
            {
                'key': c['key'],
                'entities': [c],
                'from_context': True
            } for c in context
        ]
        parsed_tags = parse_result.get('tags')
        confidence = parse_result.get('confidence')
        for intent in self.intent_parsers:
            # Each parser receives its own freshly concatenated tag list.
            i, tags = intent.validate_with_tags(parsed_tags + context_as_entities, confidence)
            # The first parser's result is always taken; later ones only
            # replace it when they report a strictly higher confidence.
            if not best_intent or (i and i.get('confidence') > best_intent.get('confidence')):
                best_intent = i
                best_tags = tags

        return best_intent, best_tags

    def __get_unused_context(self, parse_result, context):
        """ Used to get unused context from context. Any keys not in
        parse_result

        Args:
            parse_result(dict): parse result whose 'tags' identify which
                context keys have been used (tags flagged 'from_context')
            context(list): this is the context used to match with parsed results
                keys missing in the parsed results are the unused context

        Returns:
            list: A list of the unused context results.
        """
        tags_keys = {t['key'] for t in parse_result['tags'] if t['from_context']}
        return [c for c in context if c['key'] not in tags_keys]

    @property
    def tagger(self):
        """Return a new EntityTagger built from the current trie, tokenizer
        and registered regular expressions."""
        return EntityTagger(self.trie, self.tokenizer,
                            self.regular_expressions_entities)

    def determine_intent(self, utterance, num_results=1, include_tags=False, context_manager=None):
        """
        Given an utterance, provide a valid intent.

        Args:
            utterance(str): an ascii or unicode string representing natural language speech
            include_tags(bool): includes the parsed tags (including position and confidence)
                as part of result (stored under the '__tags__' key)
            context_manager: an object whose get_context() supplies the
                context entities for the utterance
            num_results(int): a maximum number of results to be returned.

        Returns: A generator that yields dictionaries.
        """
        parser = Parser(self.tokenizer, self.tagger)

        context = []
        if context_manager:
            context = context_manager.get_context()

        # Adapt consumers assume that results are sorted by confidence. parser
        # will yield results sorted by utterance coverage, but regex
        # and context entities will have different weights, and
        # can influence final sorting.
        requires_final_sort = self.regular_expressions_entities or context

        def generate_intents():
            for result in parser.parse(utterance, N=num_results, context=context):
                # create a context without entities used in result
                remaining_context = self.__get_unused_context(result, context)
                best_intent, tags = self.__best_intent(result, remaining_context)
                if best_intent and best_intent.get('confidence', 0.0) > 0:
                    if include_tags:
                        best_intent['__tags__'] = tags
                    yield best_intent

        if requires_final_sort:
            # Sorting has to consume the whole generator up front.
            sorted_iterable = sorted(generate_intents(),
                                     key=lambda x: -x.get('confidence', 0.0))
        else:
            sorted_iterable = generate_intents()

        for intent in sorted_iterable:
            yield intent

    def register_entity(self, entity_value, entity_type, alias_of=None):
        """
        Register an entity to be tagged in potential parse results

        The entity type itself is also registered as a 'Concept' entity.

        Args:
            entity_value(str): the value/proper name of an entity instance (Ex: "The Big Bang Theory")
            entity_type(str): the type/tag of an entity instance (Ex: "Television Show")
            alias_of(str): when given, entity_value is stored as an alias
                that resolves to this value instead of to itself
        """
        resolved_value = alias_of if alias_of else entity_value
        self.trie.insert(entity_value.lower(), data=(resolved_value, entity_type))
        if not alias_of:
            self.trie.insert(entity_type.lower(), data=(entity_type, 'Concept'))

    def register_regex_entity(self, regex_str):
        """
        A regular expression making use of python named group expressions.

        Example: (?P<Artist>.*)

        Empty strings and strings that were already registered are ignored.
        Patterns are compiled case-insensitively.

        Args:
            regex_str(str): a string representing a regular expression as defined above
        """
        if regex_str and regex_str not in self._regex_strings:
            self._regex_strings.add(regex_str)
            self.regular_expressions_entities.append(re.compile(regex_str, re.IGNORECASE))

    def register_intent_parser(self, intent_parser):
        """
        "Enforce" the intent parser interface at registration time.

        Only the presence of a callable `validate` attribute is checked;
        intent determination itself calls `validate_with_tags` on the
        registered parsers.

        Args:
            intent_parser(intent): Intent to be registered.

        Raises:
            ValueError: on invalid intent
        """
        if hasattr(intent_parser, 'validate') and callable(intent_parser.validate):
            self.intent_parsers.append(intent_parser)
        else:
            raise ValueError("%s is not an intent parser" % str(intent_parser))

    def drop_intent_parser(self, parser_names):
        """Drop a registered intent parser.

        Arguments:
            parser_names (str or iterable): parser name to drop or list of
                                            names

        Returns:
            (bool) True if a parser was dropped else False
        """
        if isinstance(parser_names, str):
            parser_names = [parser_names]

        num_original_parsers = len(self.intent_parsers)
        self.intent_parsers = [p for p in self.intent_parsers
                               if p.name not in parser_names]

        return len(self.intent_parsers) != num_original_parsers

    def drop_entity(self, entity_type=None, match_func=None):
        """Drop all entities matching the given entity type or match function

        Arguments:
            entity_type (str): entity name to match against
            match_func (callable): match function to find entities; it is
                passed the data stored in the trie and takes precedence
                over entity_type

        Returns:
            (bool) True if vocab was found and removed otherwise False.
        """
        def default_match_func(data):
            return data and data[1] == entity_type

        ent_tuples = self.trie.scan(match_func or default_match_func)
        for entity in ent_tuples:
            self.trie.remove(*entity)

        return len(ent_tuples) != 0

    def drop_regex_entity(self, entity_type=None, match_func=None):
        """Remove registered regex entity.

        Arguments:
            entity_type (str): entity name to match against (a named group
                of the registered pattern)
            match_func (callable): match function to find entities; it is
                passed each compiled pattern and takes precedence over
                entity_type

        Returns:
            (bool) True if vocab was found and removed otherwise False.
        """
        def default_match_func(regexp):
            return entity_type in regexp.groupindex

        match_func = match_func or default_match_func
        matches = [r for r in self.regular_expressions_entities
                   if match_func(r)]
        matching_patterns = {r.pattern for r in matches}

        # Remove in place so other holders of the list see the change.
        for match in matches:
            self.regular_expressions_entities.remove(match)

        # Forget the source strings too so they can be registered again.
        self._regex_strings = {
            r for r in self._regex_strings if r not in matching_patterns
        }

        return len(matches) != 0
262
+
263
+
264
class DomainIntentDeterminationEngine(object):
    """
    DomainIntentDeterminationEngine.

    The DomainIntentDeterminationEngine is a greedy and naive implementation of intent
    determination. Given an utterance, it uses the Adapt parsing tools to come up with a
    sorted collection of tagged parses. A valid parse result contains no overlapping
    tagged entities in a single domain, and its confidence is the sum of the tagged
    entity confidences, which are weighted based on the percentage of the utterance
    (per character) that the entity match represents.

    This system makes heavy use of generators to enable greedy algorithms to short circuit
    large portions of computation.

    Each domain is backed by its own IntentDeterminationEngine stored in
    self.domains; the key 0 is the default domain.
    """

    def __init__(self):
        """
        Initialize DomainIntentDeterminationEngine.
        """
        self.domains = {}

    def _default_engine(self):
        """Return the engine of the default domain (0), registering it first if needed."""
        domain = 0
        if domain not in self.domains:
            self.register_domain(domain=domain)
        return self.domains[domain]

    @property
    def tokenizer(self):
        """
        A property to link into IntentEngine's tokenizer.

        Warning: this is only for backwards compatibility and should not be used if you
        intend on using domains.

        Return: the tokenizer of the default domain's IntentEngine
        """
        return self._default_engine().tokenizer

    @property
    def trie(self):
        """
        A property to link into IntentEngine's trie.

        Warning: this is only for backwards compatibility and should not be used if you
        intend on using domains.

        Return: the trie of the default domain's IntentEngine
        """
        return self._default_engine().trie

    @property
    def tagger(self):
        """
        A property to link into IntentEngine's tagger.

        Warning: this is only for backwards compatibility and should not be used if you
        intend on using domains.

        Return: the tagger of the default domain's IntentEngine
        """
        return self._default_engine().tagger

    @property
    def intent_parsers(self):
        """
        A property to link into IntentEngine's intent_parsers.

        Warning: this is only for backwards compatibility and should not be used if you
        intend on using domains.

        Returns: the intent_parsers of the default domain's IntentEngine
        """
        return self._default_engine().intent_parsers

    @property
    def _regex_strings(self):
        """
        A property to link into IntentEngine's _regex_strings.

        Warning: this is only for backwards compatibility and should not be used if you
        intend on using domains.

        Returns: the _regex_strings of the default domain's IntentEngine
        """
        return self._default_engine()._regex_strings

    @property
    def regular_expressions_entities(self):
        """
        A property to link into IntentEngine's regular_expressions_entities.

        Warning: this is only for backwards compatibility and should not be used if you
        intend on using domains.

        Returns: the regular_expressions_entities of the default domain's IntentEngine
        """
        return self._default_engine().regular_expressions_entities

    def register_domain(self, domain=0, tokenizer=None, trie=None):
        """
        Register a domain with the intent engine.

        An engine already stored under the same domain key is replaced.

        Args:
            tokenizer(tokenizer): The tokenizer you wish to use.
            trie(Trie): the Trie() you wish to use.
            domain(str): key representing the domain you wish to add
                (defaults to 0, the default domain)
        """
        self.domains[domain] = IntentDeterminationEngine(
            tokenizer=tokenizer, trie=trie)

    def register_entity(self, entity_value, entity_type, alias_of=None, domain=0):
        """
        Register an entity to be tagged in potential parse results.

        The domain is registered on the fly when it does not exist yet.

        Args:
            entity_value(str): the value/proper name of an entity instance
                               (Ex: "The Big Bang Theory")
            entity_type(str): the type/tag of an entity instance (Ex: "Television Show")
            alias_of(str): forwarded unchanged to the domain engine's
                register_entity
            domain(str): a string representing the domain you wish to add the entity to
        """
        if domain not in self.domains:
            self.register_domain(domain=domain)
        self.domains[domain].register_entity(entity_value=entity_value,
                                             entity_type=entity_type,
                                             alias_of=alias_of)

    def register_regex_entity(self, regex_str, domain=0):
        """
        A regular expression making use of python named group expressions.

        Example: (?P<Artist>.*)

        The domain is registered on the fly when it does not exist yet.

        Args:
            regex_str(str): a string representing a regular expression as defined above
            domain(str): a string representing the domain you wish to add the entity to
        """
        if domain not in self.domains:
            self.register_domain(domain=domain)
        self.domains[domain].register_regex_entity(regex_str=regex_str)

    def determine_intent(self, utterance, num_results=1):
        """
        Given an utterance, provide a valid intent.

        Every registered domain is queried with num_results=1 and the
        collected intents are ranked by their 'confidence'.

        Args:
            utterance(str): an ascii or unicode string representing natural language speech
            num_results(int): a maximum number of results to be returned.

        Returns: A generator that yields dictionaries, highest confidence
            first, at most num_results of them.
        """
        intents = []
        for engine in self.domains.values():
            intents.extend(engine.determine_intent(utterance=utterance,
                                                   num_results=1))

        # nlargest returns a new list; iterate over that result so the
        # ranking and the num_results limit actually take effect.
        best_intents = heapq.nlargest(
            num_results, intents, key=lambda intent: intent['confidence'])
        for intent in best_intents:
            yield intent

    def register_intent_parser(self, intent_parser, domain=0):
        """
        Register a intent parser with a domain.

        The domain is registered on the fly when it does not exist yet.

        Args:
            intent_parser(intent): The intent parser you wish to register.
            domain(str): a string representing the domain you wish register the intent
                         parser to.
        """
        if domain not in self.domains:
            self.register_domain(domain=domain)
        self.domains[domain].register_intent_parser(
            intent_parser=intent_parser)

    def drop_intent_parser(self, parser_names, domain):
        """Drop a registered intent parser.

        Arguments:
            parser_names (list, str): parser names to drop.
            domain (str): domain to drop from

        Returns:
            (bool) True if an intent parser was dropped else false.

        Raises:
            KeyError: if the domain has not been registered
        """
        return self.domains[domain].drop_intent_parser(parser_names)

    def drop_entity(self, domain, entity_type=None, match_func=None):
        """Drop all entities matching the given entity type or match function.

        Arguments:
            domain (str): intent domain
            entity_type (str): entity name to match against
            match_func (callable): match function to find entities

        Returns:
            (bool) True if vocab was found and removed otherwise False.

        Raises:
            KeyError: if the domain has not been registered
        """
        return self.domains[domain].drop_entity(entity_type=entity_type,
                                                match_func=match_func)

    def drop_regex_entity(self, domain, entity_type=None, match_func=None):
        """Remove registered regex entity.

        Arguments:
            domain (str): intent domain
            entity_type (str): entity name to match against
            match_func (callable): match function to find entities

        Returns:
            (bool) True if vocab was found and removed otherwise False.

        Raises:
            KeyError: if the domain has not been registered
        """
        return self.domains[domain].drop_regex_entity(entity_type=entity_type,
                                                      match_func=match_func)