typeagent-py 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- typeagent/aitools/auth.py +61 -0
- typeagent/aitools/embeddings.py +232 -0
- typeagent/aitools/utils.py +244 -0
- typeagent/aitools/vectorbase.py +175 -0
- typeagent/knowpro/answer_context_schema.py +49 -0
- typeagent/knowpro/answer_response_schema.py +34 -0
- typeagent/knowpro/answers.py +577 -0
- typeagent/knowpro/collections.py +759 -0
- typeagent/knowpro/common.py +9 -0
- typeagent/knowpro/convknowledge.py +112 -0
- typeagent/knowpro/convsettings.py +94 -0
- typeagent/knowpro/convutils.py +49 -0
- typeagent/knowpro/date_time_schema.py +32 -0
- typeagent/knowpro/field_helpers.py +87 -0
- typeagent/knowpro/fuzzyindex.py +144 -0
- typeagent/knowpro/interfaces.py +818 -0
- typeagent/knowpro/knowledge.py +88 -0
- typeagent/knowpro/kplib.py +125 -0
- typeagent/knowpro/query.py +1128 -0
- typeagent/knowpro/search.py +628 -0
- typeagent/knowpro/search_query_schema.py +165 -0
- typeagent/knowpro/searchlang.py +729 -0
- typeagent/knowpro/searchlib.py +345 -0
- typeagent/knowpro/secindex.py +100 -0
- typeagent/knowpro/serialization.py +390 -0
- typeagent/knowpro/textlocindex.py +179 -0
- typeagent/knowpro/utils.py +17 -0
- typeagent/mcp/server.py +139 -0
- typeagent/podcasts/podcast.py +473 -0
- typeagent/podcasts/podcast_import.py +105 -0
- typeagent/storage/__init__.py +25 -0
- typeagent/storage/memory/__init__.py +13 -0
- typeagent/storage/memory/collections.py +68 -0
- typeagent/storage/memory/convthreads.py +81 -0
- typeagent/storage/memory/messageindex.py +178 -0
- typeagent/storage/memory/propindex.py +289 -0
- typeagent/storage/memory/provider.py +84 -0
- typeagent/storage/memory/reltermsindex.py +318 -0
- typeagent/storage/memory/semrefindex.py +660 -0
- typeagent/storage/memory/timestampindex.py +176 -0
- typeagent/storage/sqlite/__init__.py +31 -0
- typeagent/storage/sqlite/collections.py +362 -0
- typeagent/storage/sqlite/messageindex.py +382 -0
- typeagent/storage/sqlite/propindex.py +119 -0
- typeagent/storage/sqlite/provider.py +293 -0
- typeagent/storage/sqlite/reltermsindex.py +328 -0
- typeagent/storage/sqlite/schema.py +248 -0
- typeagent/storage/sqlite/semrefindex.py +156 -0
- typeagent/storage/sqlite/timestampindex.py +146 -0
- typeagent/storage/utils.py +41 -0
- typeagent_py-0.1.0.dist-info/METADATA +28 -0
- typeagent_py-0.1.0.dist-info/RECORD +55 -0
- typeagent_py-0.1.0.dist-info/WHEEL +5 -0
- typeagent_py-0.1.0.dist-info/licenses/LICENSE +21 -0
- typeagent_py-0.1.0.dist-info/top_level.txt +1 -0
typeagent/storage/memory/propindex.py (new file, @@ -0,0 +1,289 @@):

```python
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import enum
from typing import assert_never

from ...knowpro.collections import TextRangesInScope
from ...knowpro.interfaces import (
    IConversation,
    IPropertyToSemanticRefIndex,
    ISemanticRefCollection,
    ScoredSemanticRefOrdinal,
    SemanticRefOrdinal,
    Tag,
    Topic,
)
from ...knowpro import kplib


class PropertyNames(enum.Enum):
    EntityName = "name"
    EntityType = "type"
    FacetName = "facet.name"
    FacetValue = "facet.value"
    Verb = "verb"
    Subject = "subject"
    Object = "object"
    IndirectObject = "indirectObject"
    Tag = "tag"
    Topic = "topic"


async def add_facet(
    facet: kplib.Facet | None,
    property_index: IPropertyToSemanticRefIndex,
    semantic_ref_ordinal: SemanticRefOrdinal,
) -> None:
    if facet is not None:
        await property_index.add_property(
            PropertyNames.FacetName.value,
            facet.name,
            semantic_ref_ordinal,
        )
        value = facet.value
        if value is not None:
            # If the value is a nonzero float, format it with ":g" before
            # storing it as a string.
            if isinstance(value, float) and value:
                value = f"{value:g}"
            await property_index.add_property(
                PropertyNames.FacetValue.value,
                str(value),
                semantic_ref_ordinal,
            )


async def add_entity_properties_to_index(
    entity: kplib.ConcreteEntity,
    property_index: IPropertyToSemanticRefIndex,
    semantic_ref_ordinal: SemanticRefOrdinal,
) -> None:
    await property_index.add_property(
        PropertyNames.EntityName.value,
        entity.name,
        semantic_ref_ordinal,
    )
    for type in entity.type:
        await property_index.add_property(
            PropertyNames.EntityType.value,
            type,
            semantic_ref_ordinal,
        )
    # Add every facet name as a separate term.
    if entity.facets:
        for facet in entity.facets:
            await add_facet(facet, property_index, semantic_ref_ordinal)


async def add_action_properties_to_index(
    action: kplib.Action,
    property_index: IPropertyToSemanticRefIndex,
    semantic_ref_ordinal: SemanticRefOrdinal,
) -> None:
    await property_index.add_property(
        PropertyNames.Verb.value,
        " ".join(action.verbs),
        semantic_ref_ordinal,
    )
    if action.subject_entity_name != "none":
        await property_index.add_property(
            PropertyNames.Subject.value,
            action.subject_entity_name,
            semantic_ref_ordinal,
        )
    if action.object_entity_name != "none":
        await property_index.add_property(
            PropertyNames.Object.value,
            action.object_entity_name,
            semantic_ref_ordinal,
        )
    if action.indirect_object_entity_name != "none":
        await property_index.add_property(
            PropertyNames.IndirectObject.value,
            action.indirect_object_entity_name,
            semantic_ref_ordinal,
        )
```
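As an illustration of the helpers above, here is a minimal sketch that indexes one action and looks it up, using the `PropertyIndex` class defined later in this file. The attribute names on `kplib.Action` are exactly the ones the code above reads; whether `Action` accepts them as keyword constructor arguments (or has additional required fields) is an assumption, not something this diff shows.

```python
import asyncio

from typeagent.knowpro import kplib
from typeagent.storage.memory.propindex import (
    PropertyIndex,
    add_action_properties_to_index,
)


async def demo() -> None:
    index = PropertyIndex()
    # Assumed constructor signature; "none" marks a missing participant.
    action = kplib.Action(
        verbs=["wrote"],
        subject_entity_name="J.R.R. Tolkien",
        object_entity_name="The Hobbit",
        indirect_object_entity_name="none",
    )
    await add_action_properties_to_index(action, index, 0)
    # "verb", "subject", and "object" entries were added; the indirect
    # object was skipped because it is the "none" placeholder.
    print(await index.lookup_property("subject", "j.r.r. tolkien"))


asyncio.run(demo())
```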
```python
async def build_property_index(conversation: IConversation) -> None:
    await add_to_property_index(conversation, 0)


async def add_to_property_index(
    conversation: IConversation,
    start_at_ordinal: SemanticRefOrdinal,
) -> None:
    """Add semantic references from a conversation to the property index, starting at a specific ordinal."""
    if (
        csi := conversation.secondary_indexes
    ) and conversation.semantic_refs is not None:
        # Nothing to do if the semantic_refs collection is empty.
        if await conversation.semantic_refs.size() == 0:
            return

        if (property_index := csi.property_to_semantic_ref_index) is None:
            property_index = csi.property_to_semantic_ref_index = PropertyIndex()

        semantic_refs = conversation.semantic_refs
        size = await semantic_refs.size()

        for semantic_ref_ordinal, semantic_ref in enumerate(
            await semantic_refs.get_slice(start_at_ordinal, size),
            start_at_ordinal,
        ):
            assert semantic_ref.semantic_ref_ordinal == semantic_ref_ordinal
            if isinstance(semantic_ref.knowledge, kplib.Action):
                await add_action_properties_to_index(
                    semantic_ref.knowledge, property_index, semantic_ref_ordinal
                )
            elif isinstance(semantic_ref.knowledge, kplib.ConcreteEntity):
                await add_entity_properties_to_index(
                    semantic_ref.knowledge, property_index, semantic_ref_ordinal
                )
            elif isinstance(semantic_ref.knowledge, Tag):
                tag = semantic_ref.knowledge
                await property_index.add_property(
                    PropertyNames.Tag.value, tag.text, semantic_ref_ordinal
                )
            elif isinstance(semantic_ref.knowledge, Topic):
                pass
            else:
                assert_never(semantic_ref.knowledge)


class PropertyIndex(IPropertyToSemanticRefIndex):
    def __init__(self):
        self._map: dict[str, list[ScoredSemanticRefOrdinal]] = {}

    async def size(self) -> int:
        return len(self._map)

    async def get_values(self) -> list[str]:
        terms: list[str] = []
        for key in self._map.keys():
            nv = split_property_term_text(key)
            terms.append(nv[1])
        return terms

    async def add_property(
        self,
        property_name: str,
        value: str,
        semantic_ref_ordinal: SemanticRefOrdinal | ScoredSemanticRefOrdinal,
    ) -> None:
        term_text = make_property_term_text(property_name, value)
        if isinstance(semantic_ref_ordinal, int):
            semantic_ref_ordinal = ScoredSemanticRefOrdinal(
                semantic_ref_ordinal,
                1.0,
            )
        term_text = self._prepare_term_text(term_text)
        if term_text in self._map:
            self._map[term_text].append(semantic_ref_ordinal)
        else:
            self._map[term_text] = [semantic_ref_ordinal]

    async def clear(self) -> None:
        self._map = {}

    async def lookup_property(
        self,
        property_name: str,
        value: str,
    ) -> list[ScoredSemanticRefOrdinal] | None:
        term_text = make_property_term_text(property_name, value)
        return self._map.get(self._prepare_term_text(term_text))

    async def remove_property(self, prop_name: str, semref_id: int) -> None:
        """Remove all properties for a specific property name and semantic ref."""
        # Find and remove entries matching both the property name and semref_id.
        keys_to_remove = []
        for term_text, scored_refs in self._map.items():
            prop_name_from_term, _ = split_property_term_text(term_text)
            # Strip the "prop." prefix.
            if prop_name_from_term.startswith("prop."):
                prop_name_from_term = prop_name_from_term[5:]

            if prop_name_from_term == prop_name:
                # Filter out entries with a matching semref_id.
                filtered_refs = [
                    ref for ref in scored_refs if ref.semantic_ref_ordinal != semref_id
                ]
                if filtered_refs:
                    self._map[term_text] = filtered_refs
                else:
                    keys_to_remove.append(term_text)

        # Remove empty entries.
        for key in keys_to_remove:
            del self._map[key]

    async def remove_all_for_semref(self, semref_id: int) -> None:
        """Remove all properties for a specific semantic ref."""
        keys_to_remove = []
        for term_text, scored_refs in self._map.items():
            # Filter out entries with a matching semref_id.
            filtered_refs = [
                ref for ref in scored_refs if ref.semantic_ref_ordinal != semref_id
            ]
            if filtered_refs:
                self._map[term_text] = filtered_refs
            else:
                keys_to_remove.append(term_text)

        # Remove empty entries.
        for key in keys_to_remove:
            del self._map[key]

    def _prepare_term_text(self, term_text: str) -> str:
        """Do any pre-processing of the term."""
        return term_text.lower()


async def lookup_property_in_property_index(
    property_index: IPropertyToSemanticRefIndex,
    property_name: str,
    property_value: str,
    semantic_refs: ISemanticRefCollection,
    ranges_in_scope: TextRangesInScope | None = None,
) -> list[ScoredSemanticRefOrdinal] | None:
    scored_refs = await property_index.lookup_property(
        property_name,
        property_value,
    )
    if ranges_in_scope is not None and scored_refs:
        filtered_refs = []
        for sr in scored_refs:
            semantic_ref = await semantic_refs.get_item(sr.semantic_ref_ordinal)
            if ranges_in_scope.is_range_in_scope(semantic_ref.range):
                filtered_refs.append(sr)
        scored_refs = filtered_refs

    return scored_refs or None  # Return None if there are no results.


async def is_known_property(
    property_index: IPropertyToSemanticRefIndex | None,
    property_name: PropertyNames,
    property_value: str,
) -> bool:
    if property_index is not None:
        semantic_refs_with_name = await property_index.lookup_property(
            property_name.value,
            property_value,
        )
        return semantic_refs_with_name is not None and len(semantic_refs_with_name) > 0
    else:
        return False


PROPERTY_DELIMITER = "@@"


def make_property_term_text(name: str, value: str) -> str:
    return f"prop.{name}{PROPERTY_DELIMITER}{value}"


def split_property_term_text(term_text: str) -> tuple[str, str]:
    parts = term_text.split(PROPERTY_DELIMITER, 1)
    return parts[0], parts[1]
```
typeagent/storage/memory/provider.py (new file, @@ -0,0 +1,84 @@):

```python
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Memory storage provider implementation."""

from .collections import MemoryMessageCollection, MemorySemanticRefCollection
from .semrefindex import TermToSemanticRefIndex
from .convthreads import ConversationThreads
from .messageindex import MessageTextIndex
from .reltermsindex import RelatedTermsIndex
from .propindex import PropertyIndex
from .timestampindex import TimestampToTextRangeIndex
from ...knowpro.convsettings import MessageTextIndexSettings, RelatedTermIndexSettings
from ...knowpro.interfaces import (
    IConversationThreads,
    IMessage,
    IMessageTextIndex,
    IPropertyToSemanticRefIndex,
    IStorageProvider,
    ITermToRelatedTermsIndex,
    ITermToSemanticRefIndex,
    ITimestampToTextRangeIndex,
)


class MemoryStorageProvider[TMessage: IMessage](IStorageProvider[TMessage]):
    """A storage provider that operates in memory."""

    _message_collection: MemoryMessageCollection[TMessage]
    _semantic_ref_collection: MemorySemanticRefCollection

    _conversation_index: TermToSemanticRefIndex
    _property_index: PropertyIndex
    _timestamp_index: TimestampToTextRangeIndex
    _message_text_index: MessageTextIndex
    _related_terms_index: RelatedTermsIndex
    _conversation_threads: ConversationThreads

    def __init__(
        self,
        message_text_settings: MessageTextIndexSettings,
        related_terms_settings: RelatedTermIndexSettings,
    ) -> None:
        """Create and initialize a MemoryStorageProvider with all indexes."""
        self._message_collection = MemoryMessageCollection[TMessage]()
        self._semantic_ref_collection = MemorySemanticRefCollection()

        self._conversation_index = TermToSemanticRefIndex()
        self._property_index = PropertyIndex()
        self._timestamp_index = TimestampToTextRangeIndex()
        self._message_text_index = MessageTextIndex(message_text_settings)
        self._related_terms_index = RelatedTermsIndex(related_terms_settings)
        thread_settings = message_text_settings.embedding_index_settings
        self._conversation_threads = ConversationThreads(thread_settings)

    async def get_semantic_ref_index(self) -> ITermToSemanticRefIndex:
        return self._conversation_index

    async def get_property_index(self) -> IPropertyToSemanticRefIndex:
        return self._property_index

    async def get_timestamp_index(self) -> ITimestampToTextRangeIndex:
        return self._timestamp_index

    async def get_message_text_index(self) -> IMessageTextIndex[TMessage]:
        return self._message_text_index

    async def get_related_terms_index(self) -> ITermToRelatedTermsIndex:
        return self._related_terms_index

    async def get_conversation_threads(self) -> IConversationThreads:
        return self._conversation_threads

    async def get_message_collection(
        self, message_type: type[TMessage] | None = None
    ) -> MemoryMessageCollection[TMessage]:
        return self._message_collection

    async def get_semantic_ref_collection(self) -> MemorySemanticRefCollection:
        return self._semantic_ref_collection

    async def close(self) -> None:
        """Close the storage provider."""
        pass
```
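For reference, a minimal construction sketch. The two settings classes come from `knowpro/convsettings.py`, which is part of this package but not shown in this diff; that they can be built with default arguments, and that the collections expose the async `size()` used elsewhere in the package, are assumptions.

```python
import asyncio

from typeagent.knowpro.convsettings import (
    MessageTextIndexSettings,
    RelatedTermIndexSettings,
)
from typeagent.storage.memory.provider import MemoryStorageProvider


async def demo() -> None:
    # Assumption: both settings classes are constructible with defaults.
    provider = MemoryStorageProvider(
        message_text_settings=MessageTextIndexSettings(),
        related_terms_settings=RelatedTermIndexSettings(),
    )
    messages = await provider.get_message_collection()
    semrefs = await provider.get_semantic_ref_collection()
    print(await messages.size(), await semrefs.size())  # Expect: 0 0
    await provider.close()


asyncio.run(demo())
```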
typeagent/storage/memory/reltermsindex.py (new file, @@ -0,0 +1,318 @@):

```python
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from collections.abc import Callable
from dataclasses import dataclass
from typing import Protocol, TYPE_CHECKING

from typeagent.aitools.vectorbase import (
    ScoredInt,
    TextEmbeddingIndexSettings,
    VectorBase,
)
from typeagent.knowpro.collections import TermSet
from typeagent.knowpro.common import is_search_term_wildcard
from typeagent.knowpro.convsettings import RelatedTermIndexSettings
from typeagent.knowpro.interfaces import (
    IConversation,
    IMessage,
    ITermToRelatedTerms,
    ITermToRelatedTermsFuzzy,
    ITermToRelatedTermsIndex,
    ITermToSemanticRefIndex,
    SearchTerm,
    Term,
    TermToRelatedTermsData,
    TermsToRelatedTermsDataItem,
    TermsToRelatedTermsIndexData,
    TextEmbeddingIndexData,
)

if TYPE_CHECKING:
    from typeagent.knowpro.query import CompiledSearchTerm, CompiledTermGroup


class TermToRelatedTermsMap(ITermToRelatedTerms):
    def __init__(self):
        # The inner dict represents a set of terms, disregarding their weights.
        self.map: dict[str, dict[str, Term]] = {}

    async def add_related_term(
        self, text: str, related_terms: Term | list[Term]
    ) -> None:
        if not isinstance(related_terms, list):
            related_terms = [related_terms]
        terms: dict[str, Term] = self.map.setdefault(text, {})
        for related in related_terms:
            terms.setdefault(related.text, related)

    async def lookup_term(self, text: str) -> list[Term] | None:
        result = self.map.get(text)
        if result:
            return list(result.values())
        else:
            return None

    async def remove_term(self, text: str) -> None:
        self.map.pop(text, None)

    async def clear(self) -> None:
        self.map.clear()

    async def size(self) -> int:
        return len(self.map)

    async def is_empty(self) -> bool:
        return len(self.map) == 0

    async def serialize(self) -> TermToRelatedTermsData:
        related_terms: list[TermsToRelatedTermsDataItem] = []
        for key, value in self.map.items():
            related_terms.append(
                TermsToRelatedTermsDataItem(
                    termText=key,
                    relatedTerms=[term.serialize() for term in value.values()],
                )
            )
        return TermToRelatedTermsData(relatedTerms=related_terms)

    async def deserialize(self, data: TermToRelatedTermsData | None) -> None:
        self.map.clear()
        if data is None:
            return
        related_terms_data = data.get("relatedTerms")
        if related_terms_data is None:
            return
        for item in related_terms_data:
            term_text = item["termText"]
            related_terms_data = item["relatedTerms"]
            related_terms: list[Term] = [
                Term(term_data["text"], weight=term_data.get("weight"))
                for term_data in related_terms_data
            ]
            await self.add_related_term(term_text, related_terms)
```
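`TermToRelatedTermsMap` is a plain in-memory alias table; the inner dict dedupes related terms by text, keeping the first `Term` seen for each. A short sketch of adding, looking up, and round-tripping through `serialize()`/`deserialize()`, using only names shown above:

```python
import asyncio

from typeagent.knowpro.interfaces import Term
from typeagent.storage.memory.reltermsindex import TermToRelatedTermsMap


async def demo() -> None:
    aliases = TermToRelatedTermsMap()
    # The duplicate "book" is dropped: the inner dict keys by text,
    # and setdefault keeps the first Term seen.
    await aliases.add_related_term(
        "novel", [Term("book"), Term("book", weight=0.5)]
    )
    print(await aliases.lookup_term("novel"))  # One Term: "book"

    # Round-trip through the plain-dict TermToRelatedTermsData form.
    data = await aliases.serialize()
    restored = TermToRelatedTermsMap()
    await restored.deserialize(data)
    assert await restored.size() == 1


asyncio.run(demo())
```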
```python
async def build_related_terms_index[
    TMessage: IMessage,
    TTermToSemanticRefIndex: ITermToSemanticRefIndex,
](
    conversation: IConversation[TMessage, TTermToSemanticRefIndex],
    settings: RelatedTermIndexSettings,
) -> None:
    csr = conversation.semantic_ref_index
    assert csr is not None
    csi = conversation.secondary_indexes
    if csr is not None and csi is not None:
        if csi.term_to_related_terms_index is None:
            csi.term_to_related_terms_index = RelatedTermsIndex(settings)
        fuzzy_index = csi.term_to_related_terms_index.fuzzy_index
        if fuzzy_index is not None:
            all_terms = await csr.get_terms()
            if all_terms:
                await fuzzy_index.add_terms(all_terms)


class RelatedTermsIndex(ITermToRelatedTermsIndex):
    def __init__(self, settings: RelatedTermIndexSettings):
        self.settings = settings
        self._alias_map = TermToRelatedTermsMap()
        self._term_index = TermEmbeddingIndex(settings.embedding_index_settings)

    @property
    def aliases(self) -> TermToRelatedTermsMap:
        return self._alias_map

    @property
    def fuzzy_index(self) -> ITermToRelatedTermsFuzzy | None:
        return self._term_index

    async def serialize(self) -> TermsToRelatedTermsIndexData:
        return TermsToRelatedTermsIndexData(
            aliasData=await self._alias_map.serialize(),
            textEmbeddingData=self._term_index.serialize(),
        )

    async def deserialize(self, data: TermsToRelatedTermsIndexData) -> None:
        await self._alias_map.clear()
        self._term_index.clear()
        await self._alias_map.deserialize(data.get("aliasData"))
        text_embedding_data = data.get("textEmbeddingData")
        if text_embedding_data is not None:
            self._term_index.deserialize(text_embedding_data)


async def resolve_related_terms(
    related_terms_index: ITermToRelatedTermsIndex,
    compiled_terms: list["CompiledTermGroup"],
    ensure_single_occurrence: bool = True,
    should_resolve_fuzzy: Callable[[SearchTerm], bool] | None = None,
) -> None:
    """Resolves related terms for those search terms that don't already have them.

    NOTE: This modifies SearchTerm().related_terms in place.

    Optionally ensures that related terms are not duplicated across search
    terms, because duplicates can skew how semantic references are scored
    during search (over-counting).

    SUBTLE: If a search term has related_terms == [], don't touch it;
    only set related_terms if it is None.
    """
    all_search_terms = [term for ct in compiled_terms for term in ct.terms]
    searchable_terms = TermSet()
    search_terms_needing_related: list[SearchTerm] = []

    for search_term in all_search_terms:
        if is_search_term_wildcard(search_term):
            continue
        searchable_terms.add_or_union(search_term.term)
        term_text = search_term.term.text
        # Resolve any specific term-to-related-term mappings (aliases).
        if search_term.related_terms is None:
            search_term.related_terms = await related_terms_index.aliases.lookup_term(
                term_text
            )
        # If there were no alias mappings, add the term to the fuzzy retrieval list.
        if search_term.related_terms is None:
            if should_resolve_fuzzy is None or should_resolve_fuzzy(search_term):
                search_terms_needing_related.append(search_term)

    if related_terms_index.fuzzy_index is not None and search_terms_needing_related:
        related_terms_for_search_terms = (
            await related_terms_index.fuzzy_index.lookup_terms(
                [st.term.text for st in search_terms_needing_related]
            )
        )
        for search_term, related_terms in zip(
            search_terms_needing_related, related_terms_for_search_terms
        ):
            search_term.related_terms = related_terms

    # Due to fuzzy matching, a search term may end up with related terms that
    # overlap with those of other search terms. This causes scoring problems:
    # duplicate/redundant scoring can make items seem more relevant than they are.
    # - The same related term can show up for different search terms but with
    #   different weights.
    # - Related terms may also already be present as search terms.
    for ct in compiled_terms:
        dedupe_related_terms(
            ct.terms, ensure_single_occurrence and ct.boolean_op != "and"
        )


def dedupe_related_terms(
    compiled_terms: list["CompiledSearchTerm"],
    ensure_single_occurrence: bool,
) -> None:
    all_search_terms = TermSet()
    all_related_terms: TermSet | None = None

    # Collect all unique search and related terms.
    # We end up with (term, maximum weight for term) pairs.
    for st in compiled_terms:
        all_search_terms.add(st.term)
    if ensure_single_occurrence:
        all_related_terms = TermSet()
        for st in compiled_terms:
            all_related_terms.add_or_union(st.related_terms)

    for search_term in compiled_terms:
        required = search_term.related_terms_required
        if required:
            continue
        if search_term.related_terms:
            unique_related_for_search_term: list[Term] = []
            for candidate_related_term in search_term.related_terms:
                if candidate_related_term in all_search_terms:
                    # This related term is already a search term.
                    continue
                if ensure_single_occurrence and all_related_terms is not None:
                    # Each unique related term should be searched for only once,
                    # and (if there were duplicates) assigned the maximum weight
                    # assigned to that term.
                    term_with_max_weight = all_related_terms.get(candidate_related_term)
                    if (
                        term_with_max_weight is not None
                        and term_with_max_weight.weight == candidate_related_term.weight
                    ):
                        # Associate this related term with the current search term.
                        unique_related_for_search_term.append(term_with_max_weight)
                        all_related_terms.remove(candidate_related_term)
                else:
                    unique_related_for_search_term.append(candidate_related_term)
            search_term.related_terms = unique_related_for_search_term
```
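To make the dedupe behavior concrete, here is a sketch with a duck-typed stand-in for `CompiledSearchTerm`: the function only reads `term`, `related_terms`, and `related_terms_required`. It assumes `TermSet` keys terms by their text and keeps the maximum weight per term, as the comment in the code states; `FakeCompiledTerm` is a hypothetical helper, not part of the package.

```python
from dataclasses import dataclass

from typeagent.knowpro.interfaces import Term
from typeagent.storage.memory.reltermsindex import dedupe_related_terms


@dataclass
class FakeCompiledTerm:
    # Stand-in for knowpro.query.CompiledSearchTerm.
    term: Term
    related_terms: list[Term] | None
    related_terms_required: bool = False


novel = FakeCompiledTerm(
    Term("novel"), [Term("book", weight=0.8), Term("paperback", weight=0.8)]
)
book = FakeCompiledTerm(Term("book"), [Term("paperback", weight=0.9)])

dedupe_related_terms([novel, book], ensure_single_occurrence=True)  # type: ignore[list-item]

# "book" is dropped from novel's related terms (it is already a search
# term), and "paperback" survives exactly once, attached to the search
# term that carried its maximum weight (0.9).
print(novel.related_terms)  # []
print(book.related_terms)   # [Term("paperback", weight=0.9)]
```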
```python
class ITermEmbeddingIndex(ITermToRelatedTermsFuzzy, Protocol):
    def serialize(self) -> TextEmbeddingIndexData: ...

    def deserialize(self, data: TextEmbeddingIndexData) -> None: ...


# TODO: Inherit from TextEmbeddingCache too.
class TermEmbeddingIndex(ITermEmbeddingIndex):
    # The Python version wraps a VectorBase.

    settings: TextEmbeddingIndexSettings
    _vectorbase: VectorBase
    _texts: list[str]

    def __init__(
        self,
        settings: TextEmbeddingIndexSettings,
        data: TextEmbeddingIndexData | None = None,
    ):
        self.settings = settings
        self._vectorbase = VectorBase(settings)
        self._texts: list[str] = []
        if data:
            self.deserialize(data)

    def clear(self) -> None:
        self._vectorbase.clear()
        self._texts.clear()

    def serialize(self) -> TextEmbeddingIndexData:
        return TextEmbeddingIndexData(
            textItems=self._texts,
            embeddings=self._vectorbase.serialize(),
        )

    def deserialize(self, data: TextEmbeddingIndexData | None) -> None:
        self.clear()
        if data is not None:
            self._texts = data.get("textItems", [])
            self._vectorbase.deserialize(data.get("embeddings"))

    async def size(self) -> int:
        return len(self._vectorbase)

    async def add_terms(self, texts: list[str]) -> None:
        await self._vectorbase.add_keys(texts)
        self._texts.extend(texts)

    async def lookup_term(
        self, text: str, max_hits: int | None = None, min_score: float | None = None
    ) -> list[Term]:
        matches = await self._vectorbase.fuzzy_lookup(
            text, max_hits=max_hits, min_score=min_score
        )
        return self.matches_to_terms(matches)

    async def lookup_terms(
        self,
        texts: list[str],
        max_hits: int | None = None,
        min_score: float | None = None,
    ) -> list[list[Term]]:
        matches = [
            await self._vectorbase.fuzzy_lookup(
                text, max_hits=max_hits, min_score=min_score
            )
            for text in texts
        ]
        return [self.matches_to_terms(m) for m in matches]

    def matches_to_terms(self, matches: list[ScoredInt]) -> list[Term]:
        return [
            Term(text=self._texts[match.item], weight=match.score) for match in matches
        ]
```