typeagent_py-0.1.0-py3-none-any.whl

Files changed (55)
  1. typeagent/aitools/auth.py +61 -0
  2. typeagent/aitools/embeddings.py +232 -0
  3. typeagent/aitools/utils.py +244 -0
  4. typeagent/aitools/vectorbase.py +175 -0
  5. typeagent/knowpro/answer_context_schema.py +49 -0
  6. typeagent/knowpro/answer_response_schema.py +34 -0
  7. typeagent/knowpro/answers.py +577 -0
  8. typeagent/knowpro/collections.py +759 -0
  9. typeagent/knowpro/common.py +9 -0
  10. typeagent/knowpro/convknowledge.py +112 -0
  11. typeagent/knowpro/convsettings.py +94 -0
  12. typeagent/knowpro/convutils.py +49 -0
  13. typeagent/knowpro/date_time_schema.py +32 -0
  14. typeagent/knowpro/field_helpers.py +87 -0
  15. typeagent/knowpro/fuzzyindex.py +144 -0
  16. typeagent/knowpro/interfaces.py +818 -0
  17. typeagent/knowpro/knowledge.py +88 -0
  18. typeagent/knowpro/kplib.py +125 -0
  19. typeagent/knowpro/query.py +1128 -0
  20. typeagent/knowpro/search.py +628 -0
  21. typeagent/knowpro/search_query_schema.py +165 -0
  22. typeagent/knowpro/searchlang.py +729 -0
  23. typeagent/knowpro/searchlib.py +345 -0
  24. typeagent/knowpro/secindex.py +100 -0
  25. typeagent/knowpro/serialization.py +390 -0
  26. typeagent/knowpro/textlocindex.py +179 -0
  27. typeagent/knowpro/utils.py +17 -0
  28. typeagent/mcp/server.py +139 -0
  29. typeagent/podcasts/podcast.py +473 -0
  30. typeagent/podcasts/podcast_import.py +105 -0
  31. typeagent/storage/__init__.py +25 -0
  32. typeagent/storage/memory/__init__.py +13 -0
  33. typeagent/storage/memory/collections.py +68 -0
  34. typeagent/storage/memory/convthreads.py +81 -0
  35. typeagent/storage/memory/messageindex.py +178 -0
  36. typeagent/storage/memory/propindex.py +289 -0
  37. typeagent/storage/memory/provider.py +84 -0
  38. typeagent/storage/memory/reltermsindex.py +318 -0
  39. typeagent/storage/memory/semrefindex.py +660 -0
  40. typeagent/storage/memory/timestampindex.py +176 -0
  41. typeagent/storage/sqlite/__init__.py +31 -0
  42. typeagent/storage/sqlite/collections.py +362 -0
  43. typeagent/storage/sqlite/messageindex.py +382 -0
  44. typeagent/storage/sqlite/propindex.py +119 -0
  45. typeagent/storage/sqlite/provider.py +293 -0
  46. typeagent/storage/sqlite/reltermsindex.py +328 -0
  47. typeagent/storage/sqlite/schema.py +248 -0
  48. typeagent/storage/sqlite/semrefindex.py +156 -0
  49. typeagent/storage/sqlite/timestampindex.py +146 -0
  50. typeagent/storage/utils.py +41 -0
  51. typeagent_py-0.1.0.dist-info/METADATA +28 -0
  52. typeagent_py-0.1.0.dist-info/RECORD +55 -0
  53. typeagent_py-0.1.0.dist-info/WHEEL +5 -0
  54. typeagent_py-0.1.0.dist-info/licenses/LICENSE +21 -0
  55. typeagent_py-0.1.0.dist-info/top_level.txt +1 -0
typeagent/storage/memory/propindex.py
@@ -0,0 +1,289 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import enum
+from typing import assert_never
+
+from ...knowpro.collections import TextRangesInScope
+from ...knowpro.interfaces import (
+    IConversation,
+    IPropertyToSemanticRefIndex,
+    ISemanticRefCollection,
+    ScoredSemanticRefOrdinal,
+    SemanticRefOrdinal,
+    Tag,
+    Topic,
+)
+from ...knowpro import kplib
+
+
+class PropertyNames(enum.Enum):
+    EntityName = "name"
+    EntityType = "type"
+    FacetName = "facet.name"
+    FacetValue = "facet.value"
+    Verb = "verb"
+    Subject = "subject"
+    Object = "object"
+    IndirectObject = "indirectObject"
+    Tag = "tag"
+    Topic = "topic"
+
+
+async def add_facet(
+    facet: kplib.Facet | None,
+    property_index: IPropertyToSemanticRefIndex,
+    semantic_ref_ordinal: SemanticRefOrdinal,
+) -> None:
+    if facet is not None:
+        await property_index.add_property(
+            PropertyNames.FacetName.value,
+            facet.name,
+            semantic_ref_ordinal,
+        )
+        value = facet.value
+        if value is not None:
+            # If the value is a nonzero float, store it as a string using the :g format.
+            if isinstance(value, float) and value:
+                value = f"{value:g}"
+            await property_index.add_property(
+                PropertyNames.FacetValue.value,
+                str(value),
+                semantic_ref_ordinal,
+            )
+
+
+async def add_entity_properties_to_index(
+    entity: kplib.ConcreteEntity,
+    property_index: IPropertyToSemanticRefIndex,
+    semantic_ref_ordinal: SemanticRefOrdinal,
+) -> None:
+    await property_index.add_property(
+        PropertyNames.EntityName.value,
+        entity.name,
+        semantic_ref_ordinal,
+    )
+    for type in entity.type:
+        await property_index.add_property(
+            PropertyNames.EntityType.value,
+            type,
+            semantic_ref_ordinal,
+        )
+    # Add every facet name as a separate term.
+    if entity.facets:
+        for facet in entity.facets:
+            await add_facet(facet, property_index, semantic_ref_ordinal)
+
+
+async def add_action_properties_to_index(
+    action: kplib.Action,
+    property_index: IPropertyToSemanticRefIndex,
+    semantic_ref_ordinal: SemanticRefOrdinal,
+) -> None:
+    await property_index.add_property(
+        PropertyNames.Verb.value,
+        " ".join(action.verbs),
+        semantic_ref_ordinal,
+    )
+    if action.subject_entity_name != "none":
+        await property_index.add_property(
+            PropertyNames.Subject.value,
+            action.subject_entity_name,
+            semantic_ref_ordinal,
+        )
+    if action.object_entity_name != "none":
+        await property_index.add_property(
+            PropertyNames.Object.value,
+            action.object_entity_name,
+            semantic_ref_ordinal,
+        )
+    if action.indirect_object_entity_name != "none":
+        await property_index.add_property(
+            PropertyNames.IndirectObject.value,
+            action.indirect_object_entity_name,
+            semantic_ref_ordinal,
+        )
+
+
+async def build_property_index(conversation: IConversation) -> None:
+    await add_to_property_index(conversation, 0)
+
+
+async def add_to_property_index(
+    conversation: IConversation,
+    start_at_ordinal: SemanticRefOrdinal,
+) -> None:
+    """Add semantic references from a conversation to the property index, starting at a specific ordinal."""
+    if (
+        csi := conversation.secondary_indexes
+    ) and conversation.semantic_refs is not None:
+        # Nothing to do if the semantic_refs collection is empty.
+        if await conversation.semantic_refs.size() == 0:
+            return
+
+        if (property_index := csi.property_to_semantic_ref_index) is None:
+            property_index = csi.property_to_semantic_ref_index = PropertyIndex()
+
+        semantic_refs = conversation.semantic_refs
+        size = await semantic_refs.size()
+
+        for semantic_ref_ordinal, semantic_ref in enumerate(
+            await semantic_refs.get_slice(start_at_ordinal, size),
+            start_at_ordinal,
+        ):
+            assert semantic_ref.semantic_ref_ordinal == semantic_ref_ordinal
+            if isinstance(semantic_ref.knowledge, kplib.Action):
+                await add_action_properties_to_index(
+                    semantic_ref.knowledge, property_index, semantic_ref_ordinal
+                )
+            elif isinstance(semantic_ref.knowledge, kplib.ConcreteEntity):
+                await add_entity_properties_to_index(
+                    semantic_ref.knowledge, property_index, semantic_ref_ordinal
+                )
+            elif isinstance(semantic_ref.knowledge, Tag):
+                tag = semantic_ref.knowledge
+                await property_index.add_property(
+                    PropertyNames.Tag.value, tag.text, semantic_ref_ordinal
+                )
+            elif isinstance(semantic_ref.knowledge, Topic):
+                pass
+            else:
+                assert_never(semantic_ref.knowledge)
+
+
+class PropertyIndex(IPropertyToSemanticRefIndex):
+    def __init__(self):
+        self._map: dict[str, list[ScoredSemanticRefOrdinal]] = {}
+
+    async def size(self) -> int:
+        return len(self._map)
+
+    async def get_values(self) -> list[str]:
+        terms: list[str] = []
+        for key in self._map.keys():
+            nv = split_property_term_text(key)
+            terms.append(nv[1])
+        return terms
+
+    async def add_property(
+        self,
+        property_name: str,
+        value: str,
+        semantic_ref_ordinal: SemanticRefOrdinal | ScoredSemanticRefOrdinal,
+    ) -> None:
+        term_text = make_property_term_text(property_name, value)
+        if isinstance(semantic_ref_ordinal, int):
+            semantic_ref_ordinal = ScoredSemanticRefOrdinal(
+                semantic_ref_ordinal,
+                1.0,
+            )
+        term_text = self._prepare_term_text(term_text)
+        if term_text in self._map:
+            self._map[term_text].append(semantic_ref_ordinal)
+        else:
+            self._map[term_text] = [semantic_ref_ordinal]
+
+    async def clear(self) -> None:
+        self._map = {}
+
+    async def lookup_property(
+        self,
+        property_name: str,
+        value: str,
+    ) -> list[ScoredSemanticRefOrdinal] | None:
+        term_text = make_property_term_text(property_name, value)
+        return self._map.get(self._prepare_term_text(term_text))
+
+    async def remove_property(self, prop_name: str, semref_id: int) -> None:
+        """Remove all properties for a specific property name and semantic ref."""
+        # Find and remove entries matching both the property name and semref_id.
+        keys_to_remove = []
+        for term_text, scored_refs in self._map.items():
+            prop_name_from_term, _ = split_property_term_text(term_text)
+            # Remove the "prop." prefix.
+            if prop_name_from_term.startswith("prop."):
+                prop_name_from_term = prop_name_from_term[5:]
+
+            if prop_name_from_term == prop_name:
+                # Filter out entries with a matching semref_id.
+                filtered_refs = [
+                    ref for ref in scored_refs if ref.semantic_ref_ordinal != semref_id
+                ]
+                if filtered_refs:
+                    self._map[term_text] = filtered_refs
+                else:
+                    keys_to_remove.append(term_text)
+
+        # Remove empty entries.
+        for key in keys_to_remove:
+            del self._map[key]
+
+    async def remove_all_for_semref(self, semref_id: int) -> None:
+        """Remove all properties for a specific semantic ref."""
+        keys_to_remove = []
+        for term_text, scored_refs in self._map.items():
+            # Filter out entries with a matching semref_id.
+            filtered_refs = [
+                ref for ref in scored_refs if ref.semantic_ref_ordinal != semref_id
+            ]
+            if filtered_refs:
+                self._map[term_text] = filtered_refs
+            else:
+                keys_to_remove.append(term_text)
+
+        # Remove empty entries.
+        for key in keys_to_remove:
+            del self._map[key]
+
+    def _prepare_term_text(self, term_text: str) -> str:
+        """Do any pre-processing of the term."""
+        return term_text.lower()
+
+
+async def lookup_property_in_property_index(
+    property_index: IPropertyToSemanticRefIndex,
+    property_name: str,
+    property_value: str,
+    semantic_refs: ISemanticRefCollection,
+    ranges_in_scope: TextRangesInScope | None = None,
+) -> list[ScoredSemanticRefOrdinal] | None:
+    scored_refs = await property_index.lookup_property(
+        property_name,
+        property_value,
+    )
+    if ranges_in_scope is not None and scored_refs:
+        filtered_refs = []
+        for sr in scored_refs:
+            semantic_ref = await semantic_refs.get_item(sr.semantic_ref_ordinal)
+            if ranges_in_scope.is_range_in_scope(semantic_ref.range):
+                filtered_refs.append(sr)
+        scored_refs = filtered_refs
+
+    return scored_refs or None  # Return None if there are no results.
+
+
+async def is_known_property(
+    property_index: IPropertyToSemanticRefIndex | None,
+    property_name: PropertyNames,
+    property_value: str,
+) -> bool:
+    if property_index is not None:
+        semantic_refs_with_name = await property_index.lookup_property(
+            property_name.value,
+            property_value,
+        )
+        return semantic_refs_with_name is not None and len(semantic_refs_with_name) > 0
+    else:
+        return False
+
+
+PROPERTY_DELIMITER = "@@"
+
+
+def make_property_term_text(name: str, value: str) -> str:
+    return f"prop.{name}{PROPERTY_DELIMITER}{value}"
+
+
+def split_property_term_text(term_text: str) -> tuple[str, str]:
+    parts = term_text.split(PROPERTY_DELIMITER, 1)
+    return parts[0], parts[1]
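
For orientation, a minimal usage sketch of the PropertyIndex above (not part of the package). It assumes PropertyIndex and PropertyNames are importable from typeagent.storage.memory.propindex as listed in this wheel, and that ScoredSemanticRefOrdinal exposes .semantic_ref_ordinal as used in remove_property; the asyncio driver is illustrative only.

import asyncio

from typeagent.storage.memory.propindex import PropertyIndex, PropertyNames

async def demo() -> None:
    index = PropertyIndex()
    # Stored under the term text "prop.name@@jane austen" (lowercased by _prepare_term_text).
    await index.add_property(PropertyNames.EntityName.value, "Jane Austen", 0)
    await index.add_property(PropertyNames.EntityType.value, "author", 0)

    # Lookup is case-insensitive for the same reason.
    hits = await index.lookup_property(PropertyNames.EntityName.value, "JANE AUSTEN")
    assert hits is not None and hits[0].semantic_ref_ordinal == 0

    # Removing everything for a semantic ref prunes now-empty term entries.
    await index.remove_all_for_semref(0)
    assert await index.size() == 0

asyncio.run(demo())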
typeagent/storage/memory/provider.py
@@ -0,0 +1,84 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Memory storage provider implementation."""
+
+from .collections import MemoryMessageCollection, MemorySemanticRefCollection
+from .semrefindex import TermToSemanticRefIndex
+from .convthreads import ConversationThreads
+from .messageindex import MessageTextIndex
+from .reltermsindex import RelatedTermsIndex
+from .propindex import PropertyIndex
+from .timestampindex import TimestampToTextRangeIndex
+from ...knowpro.convsettings import MessageTextIndexSettings, RelatedTermIndexSettings
+from ...knowpro.interfaces import (
+    IConversationThreads,
+    IMessage,
+    IMessageTextIndex,
+    IPropertyToSemanticRefIndex,
+    IStorageProvider,
+    ITermToRelatedTermsIndex,
+    ITermToSemanticRefIndex,
+    ITimestampToTextRangeIndex,
+)
+
+
+class MemoryStorageProvider[TMessage: IMessage](IStorageProvider[TMessage]):
+    """A storage provider that operates in memory."""
+
+    _message_collection: MemoryMessageCollection[TMessage]
+    _semantic_ref_collection: MemorySemanticRefCollection
+
+    _conversation_index: TermToSemanticRefIndex
+    _property_index: PropertyIndex
+    _timestamp_index: TimestampToTextRangeIndex
+    _message_text_index: MessageTextIndex
+    _related_terms_index: RelatedTermsIndex
+    _conversation_threads: ConversationThreads
+
+    def __init__(
+        self,
+        message_text_settings: MessageTextIndexSettings,
+        related_terms_settings: RelatedTermIndexSettings,
+    ) -> None:
+        """Create and initialize a MemoryStorageProvider with all indexes."""
+        self._message_collection = MemoryMessageCollection[TMessage]()
+        self._semantic_ref_collection = MemorySemanticRefCollection()
+
+        self._conversation_index = TermToSemanticRefIndex()
+        self._property_index = PropertyIndex()
+        self._timestamp_index = TimestampToTextRangeIndex()
+        self._message_text_index = MessageTextIndex(message_text_settings)
+        self._related_terms_index = RelatedTermsIndex(related_terms_settings)
+        thread_settings = message_text_settings.embedding_index_settings
+        self._conversation_threads = ConversationThreads(thread_settings)
+
+    async def get_semantic_ref_index(self) -> ITermToSemanticRefIndex:
+        return self._conversation_index
+
+    async def get_property_index(self) -> IPropertyToSemanticRefIndex:
+        return self._property_index
+
+    async def get_timestamp_index(self) -> ITimestampToTextRangeIndex:
+        return self._timestamp_index
+
+    async def get_message_text_index(self) -> IMessageTextIndex[TMessage]:
+        return self._message_text_index
+
+    async def get_related_terms_index(self) -> ITermToRelatedTermsIndex:
+        return self._related_terms_index
+
+    async def get_conversation_threads(self) -> IConversationThreads:
+        return self._conversation_threads
+
+    async def get_message_collection(
+        self, message_type: type[TMessage] | None = None
+    ) -> MemoryMessageCollection[TMessage]:
+        return self._message_collection
+
+    async def get_semantic_ref_collection(self) -> MemorySemanticRefCollection:
+        return self._semantic_ref_collection
+
+    async def close(self) -> None:
+        """Close the storage provider."""
+        pass
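
A hypothetical wiring sketch for the in-memory provider above. The constructors of MessageTextIndexSettings and RelatedTermIndexSettings live in typeagent/knowpro/convsettings.py (not shown in this diff), so the no-argument calls here are an assumption to be checked against that file.

import asyncio

from typeagent.knowpro.convsettings import (
    MessageTextIndexSettings,
    RelatedTermIndexSettings,
)
from typeagent.storage.memory.provider import MemoryStorageProvider

async def demo() -> None:
    # Assumption: both settings classes can be default-constructed.
    provider = MemoryStorageProvider(
        message_text_settings=MessageTextIndexSettings(),
        related_terms_settings=RelatedTermIndexSettings(),
    )
    # Each getter returns the single in-memory instance built in __init__.
    prop_index = await provider.get_property_index()
    assert await prop_index.size() == 0
    await provider.close()  # A no-op for the in-memory provider.

asyncio.run(demo())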
typeagent/storage/memory/reltermsindex.py
@@ -0,0 +1,318 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import Protocol, TYPE_CHECKING
+
+from typeagent.aitools.vectorbase import (
+    ScoredInt,
+    TextEmbeddingIndexSettings,
+    VectorBase,
+)
+
+from typeagent.knowpro.collections import TermSet
+from typeagent.knowpro.common import is_search_term_wildcard
+from typeagent.knowpro.convsettings import RelatedTermIndexSettings
+from typeagent.knowpro.interfaces import (
+    IConversation,
+    IMessage,
+    ITermToRelatedTerms,
+    ITermToRelatedTermsFuzzy,
+    ITermToRelatedTermsIndex,
+    ITermToSemanticRefIndex,
+    SearchTerm,
+    Term,
+    TermToRelatedTermsData,
+    TermsToRelatedTermsDataItem,
+    TermsToRelatedTermsIndexData,
+    TextEmbeddingIndexData,
+)
+
+if TYPE_CHECKING:
+    from typeagent.knowpro.query import CompiledSearchTerm, CompiledTermGroup
+
+
+class TermToRelatedTermsMap(ITermToRelatedTerms):
+    def __init__(self):
+        # The inner dict represents a set of terms, disregarding their weights.
+        self.map: dict[str, dict[str, Term]] = {}
+
+    async def add_related_term(
+        self, text: str, related_terms: Term | list[Term]
+    ) -> None:
+        if not isinstance(related_terms, list):
+            related_terms = [related_terms]
+        terms: dict[str, Term] = self.map.setdefault(text, {})
+        for related in related_terms:
+            terms.setdefault(related.text, related)
+
+    async def lookup_term(self, text: str) -> list[Term] | None:
+        result = self.map.get(text)
+        if result:
+            return list(result.values())
+        else:
+            return None
+
+    async def remove_term(self, text: str) -> None:
+        self.map.pop(text, None)
+
+    async def clear(self) -> None:
+        self.map.clear()
+
+    async def size(self) -> int:
+        return len(self.map)
+
+    async def is_empty(self) -> bool:
+        return len(self.map) == 0
+
+    async def serialize(self) -> TermToRelatedTermsData:
+        related_terms: list[TermsToRelatedTermsDataItem] = []
+        for key, value in self.map.items():
+            related_terms.append(
+                TermsToRelatedTermsDataItem(
+                    termText=key,
+                    relatedTerms=[term.serialize() for term in value.values()],
+                )
+            )
+        return TermToRelatedTermsData(relatedTerms=related_terms)
+
+    async def deserialize(self, data: TermToRelatedTermsData | None) -> None:
+        self.map.clear()
+        if data is None:
+            return
+        related_terms_data = data.get("relatedTerms")
+        if related_terms_data is None:
+            return
+        for item in related_terms_data:
+            term_text = item["termText"]
+            item_terms_data = item["relatedTerms"]
+            related_terms: list[Term] = [
+                Term(term_data["text"], weight=term_data.get("weight"))
+                for term_data in item_terms_data
+            ]
+            await self.add_related_term(term_text, related_terms)
+
+
+async def build_related_terms_index[
+    TMessage: IMessage,
+    TTermToSemanticRefIndex: ITermToSemanticRefIndex,
+](
+    conversation: IConversation[TMessage, TTermToSemanticRefIndex],
+    settings: RelatedTermIndexSettings,
+) -> None:
+    csr = conversation.semantic_ref_index
+    assert csr is not None
+    csi = conversation.secondary_indexes
+    if csr is not None and csi is not None:
+        if csi.term_to_related_terms_index is None:
+            csi.term_to_related_terms_index = RelatedTermsIndex(settings)
+        fuzzy_index = csi.term_to_related_terms_index.fuzzy_index
+        if fuzzy_index is not None:
+            all_terms = await csr.get_terms()
+            if all_terms:
+                await fuzzy_index.add_terms(all_terms)
+
+
+class RelatedTermsIndex(ITermToRelatedTermsIndex):
+    def __init__(self, settings: RelatedTermIndexSettings):
+        self.settings = settings
+        self._alias_map = TermToRelatedTermsMap()
+        self._term_index = TermEmbeddingIndex(settings.embedding_index_settings)
+
+    @property
+    def aliases(self) -> TermToRelatedTermsMap:
+        return self._alias_map
+
+    @property
+    def fuzzy_index(self) -> ITermToRelatedTermsFuzzy | None:
+        return self._term_index
+
+    async def serialize(self) -> TermsToRelatedTermsIndexData:
+        return TermsToRelatedTermsIndexData(
+            aliasData=await self._alias_map.serialize(),
+            textEmbeddingData=self._term_index.serialize(),
+        )
+
+    async def deserialize(self, data: TermsToRelatedTermsIndexData) -> None:
+        await self._alias_map.clear()
+        self._term_index.clear()
+        await self._alias_map.deserialize(data.get("aliasData"))
+        text_embedding_data = data.get("textEmbeddingData")
+        if text_embedding_data is not None:
+            self._term_index.deserialize(text_embedding_data)
+
+
+async def resolve_related_terms(
+    related_terms_index: ITermToRelatedTermsIndex,
+    compiled_terms: list["CompiledTermGroup"],
+    ensure_single_occurrence: bool = True,
+    should_resolve_fuzzy: Callable[[SearchTerm], bool] | None = None,
+) -> None:
+    """Resolve related terms for those search terms that don't already have them.
+
+    NOTE: This modifies SearchTerm().related_terms in place.
+
+    Optionally ensures that related terms are not duplicated across search terms,
+    because duplicates can skew how semantic references are scored during search
+    (over-counting).
+
+    SUBTLE: If a search term has related_terms == [], don't touch it;
+    only set related_terms if it is None.
+    """
+    all_search_terms = [term for ct in compiled_terms for term in ct.terms]
+    searchable_terms = TermSet()
+    search_terms_needing_related: list[SearchTerm] = []
+
+    for search_term in all_search_terms:
+        if is_search_term_wildcard(search_term):
+            continue
+        searchable_terms.add_or_union(search_term.term)
+        term_text = search_term.term.text
+        # Resolve any specific term-to-related-term mappings.
+        if search_term.related_terms is None:
+            search_term.related_terms = await related_terms_index.aliases.lookup_term(
+                term_text
+            )
+        # If there are no mappings to aliases, add to the fuzzy retrieval list.
+        if search_term.related_terms is None:
+            if should_resolve_fuzzy is None or should_resolve_fuzzy(search_term):
+                search_terms_needing_related.append(search_term)
+
+    if related_terms_index.fuzzy_index is not None and search_terms_needing_related:
+        related_terms_for_search_terms = (
+            await related_terms_index.fuzzy_index.lookup_terms(
+                [st.term.text for st in search_terms_needing_related]
+            )
+        )
+        for search_term, related_terms in zip(
+            search_terms_needing_related, related_terms_for_search_terms
+        ):
+            search_term.related_terms = related_terms
+
+    # Due to fuzzy matching, a search term may end up with related terms that overlap
+    # with those of other search terms. This causes scoring problems: duplicate or
+    # redundant scoring can make items seem more relevant than they are.
+    # - The same related term can show up for different search terms but with different weights.
+    # - Related terms may also already be present as search terms.
+    for ct in compiled_terms:
+        dedupe_related_terms(
+            ct.terms, ensure_single_occurrence and ct.boolean_op != "and"
+        )
+
+
+def dedupe_related_terms(
+    compiled_terms: list["CompiledSearchTerm"],
+    ensure_single_occurrence: bool,
+) -> None:
+    all_search_terms = TermSet()
+    all_related_terms: TermSet | None = None
+
+    # Collect all unique search and related terms.
+    # We end up with (term, maximum weight for term) pairs.
+    for st in compiled_terms:
+        all_search_terms.add(st.term)
+    if ensure_single_occurrence:
+        all_related_terms = TermSet()
+        for st in compiled_terms:
+            all_related_terms.add_or_union(st.related_terms)
+
+    for search_term in compiled_terms:
+        required = search_term.related_terms_required
+        if required:
+            continue
+        if search_term.related_terms:
+            unique_related_for_search_term: list[Term] = []
+            for candidate_related_term in search_term.related_terms:
+                if candidate_related_term in all_search_terms:
+                    # This related term is already a search term.
+                    continue
+                if ensure_single_occurrence and all_related_terms is not None:
+                    # Each unique related term should be searched for only once,
+                    # and (if there were duplicates) assigned the maximum weight
+                    # assigned to that term.
+                    term_with_max_weight = all_related_terms.get(candidate_related_term)
+                    if (
+                        term_with_max_weight is not None
+                        and term_with_max_weight.weight == candidate_related_term.weight
+                    ):
+                        # Associate this related term with the current search term.
+                        unique_related_for_search_term.append(term_with_max_weight)
+                        all_related_terms.remove(candidate_related_term)
+                else:
+                    unique_related_for_search_term.append(candidate_related_term)
+            search_term.related_terms = unique_related_for_search_term
+
+
+class ITermEmbeddingIndex(ITermToRelatedTermsFuzzy, Protocol):
+    def serialize(self) -> TextEmbeddingIndexData: ...
+
+    def deserialize(self, data: TextEmbeddingIndexData) -> None: ...
+
+
+# TODO: Inherit from TextEmbeddingCache too.
+class TermEmbeddingIndex(ITermEmbeddingIndex):
+    # The Python version wraps a VectorBase.
+
+    settings: TextEmbeddingIndexSettings
+    _vectorbase: VectorBase
+    _texts: list[str]
+
+    def __init__(
+        self,
+        settings: TextEmbeddingIndexSettings,
+        data: TextEmbeddingIndexData | None = None,
+    ):
+        self.settings = settings
+        self._vectorbase = VectorBase(settings)
+        self._texts: list[str] = []
+        if data:
+            self.deserialize(data)
+
+    def clear(self) -> None:
+        self._vectorbase.clear()
+        self._texts.clear()
+
+    def serialize(self) -> TextEmbeddingIndexData:
+        return TextEmbeddingIndexData(
+            textItems=self._texts,
+            embeddings=self._vectorbase.serialize(),
+        )
+
+    def deserialize(self, data: TextEmbeddingIndexData | None) -> None:
+        self.clear()
+        if data is not None:
+            self._texts = data.get("textItems", [])
+            self._vectorbase.deserialize(data.get("embeddings"))
+
+    async def size(self) -> int:
+        return len(self._vectorbase)
+
+    async def add_terms(self, texts: list[str]) -> None:
+        await self._vectorbase.add_keys(texts)
+        self._texts.extend(texts)
+
+    async def lookup_term(
+        self, text: str, max_hits: int | None = None, min_score: float | None = None
+    ) -> list[Term]:
+        matches = await self._vectorbase.fuzzy_lookup(
+            text, max_hits=max_hits, min_score=min_score
+        )
+        return self.matches_to_terms(matches)
+
+    async def lookup_terms(
+        self,
+        texts: list[str],
+        max_hits: int | None = None,
+        min_score: float | None = None,
+    ) -> list[list[Term]]:
+        matches = [
+            await self._vectorbase.fuzzy_lookup(
+                text, max_hits=max_hits, min_score=min_score
+            )
+            for text in texts
+        ]
+        return [self.matches_to_terms(m) for m in matches]
+
+    def matches_to_terms(self, matches: list[ScoredInt]) -> list[Term]:
+        return [
+            Term(text=self._texts[match.item], weight=match.score) for match in matches
+        ]
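
A minimal sketch of the alias map above, which needs no embedding model (unlike TermEmbeddingIndex). It assumes Term can be built as in this file's own deserialize code: positional text plus a weight keyword.

import asyncio

from typeagent.knowpro.interfaces import Term
from typeagent.storage.memory.reltermsindex import TermToRelatedTermsMap

async def demo() -> None:
    aliases = TermToRelatedTermsMap()
    await aliases.add_related_term(
        "car", [Term("automobile", weight=0.9), Term("auto", weight=None)]
    )
    # Re-adding the same text is a no-op: the inner dict uses setdefault.
    await aliases.add_related_term("car", Term("automobile", weight=0.5))

    related = await aliases.lookup_term("car")
    assert related is not None
    assert [t.text for t in related] == ["automobile", "auto"]
    assert await aliases.lookup_term("bike") is None  # Unknown terms yield None.

asyncio.run(demo())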