typeagent-py 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- typeagent/aitools/auth.py +61 -0
- typeagent/aitools/embeddings.py +232 -0
- typeagent/aitools/utils.py +244 -0
- typeagent/aitools/vectorbase.py +175 -0
- typeagent/knowpro/answer_context_schema.py +49 -0
- typeagent/knowpro/answer_response_schema.py +34 -0
- typeagent/knowpro/answers.py +577 -0
- typeagent/knowpro/collections.py +759 -0
- typeagent/knowpro/common.py +9 -0
- typeagent/knowpro/convknowledge.py +112 -0
- typeagent/knowpro/convsettings.py +94 -0
- typeagent/knowpro/convutils.py +49 -0
- typeagent/knowpro/date_time_schema.py +32 -0
- typeagent/knowpro/field_helpers.py +87 -0
- typeagent/knowpro/fuzzyindex.py +144 -0
- typeagent/knowpro/interfaces.py +818 -0
- typeagent/knowpro/knowledge.py +88 -0
- typeagent/knowpro/kplib.py +125 -0
- typeagent/knowpro/query.py +1128 -0
- typeagent/knowpro/search.py +628 -0
- typeagent/knowpro/search_query_schema.py +165 -0
- typeagent/knowpro/searchlang.py +729 -0
- typeagent/knowpro/searchlib.py +345 -0
- typeagent/knowpro/secindex.py +100 -0
- typeagent/knowpro/serialization.py +390 -0
- typeagent/knowpro/textlocindex.py +179 -0
- typeagent/knowpro/utils.py +17 -0
- typeagent/mcp/server.py +139 -0
- typeagent/podcasts/podcast.py +473 -0
- typeagent/podcasts/podcast_import.py +105 -0
- typeagent/storage/__init__.py +25 -0
- typeagent/storage/memory/__init__.py +13 -0
- typeagent/storage/memory/collections.py +68 -0
- typeagent/storage/memory/convthreads.py +81 -0
- typeagent/storage/memory/messageindex.py +178 -0
- typeagent/storage/memory/propindex.py +289 -0
- typeagent/storage/memory/provider.py +84 -0
- typeagent/storage/memory/reltermsindex.py +318 -0
- typeagent/storage/memory/semrefindex.py +660 -0
- typeagent/storage/memory/timestampindex.py +176 -0
- typeagent/storage/sqlite/__init__.py +31 -0
- typeagent/storage/sqlite/collections.py +362 -0
- typeagent/storage/sqlite/messageindex.py +382 -0
- typeagent/storage/sqlite/propindex.py +119 -0
- typeagent/storage/sqlite/provider.py +293 -0
- typeagent/storage/sqlite/reltermsindex.py +328 -0
- typeagent/storage/sqlite/schema.py +248 -0
- typeagent/storage/sqlite/semrefindex.py +156 -0
- typeagent/storage/sqlite/timestampindex.py +146 -0
- typeagent/storage/utils.py +41 -0
- typeagent_py-0.1.0.dist-info/METADATA +28 -0
- typeagent_py-0.1.0.dist-info/RECORD +55 -0
- typeagent_py-0.1.0.dist-info/WHEEL +5 -0
- typeagent_py-0.1.0.dist-info/licenses/LICENSE +21 -0
- typeagent_py-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,818 @@
|
|
1
|
+
# Copyright (c) Microsoft Corporation.
|
2
|
+
# Licensed under the MIT License.
|
3
|
+
|
4
|
+
from abc import ABC, abstractmethod
|
5
|
+
from collections.abc import AsyncIterable, Iterable, Sequence
|
6
|
+
from datetime import (
|
7
|
+
datetime as Datetime, # For export.
|
8
|
+
timedelta as Timedelta, # For export.
|
9
|
+
)
|
10
|
+
from typing import (
|
11
|
+
Any,
|
12
|
+
ClassVar,
|
13
|
+
Literal,
|
14
|
+
NotRequired,
|
15
|
+
Protocol,
|
16
|
+
Self,
|
17
|
+
TypedDict,
|
18
|
+
runtime_checkable,
|
19
|
+
)
|
20
|
+
|
21
|
+
from pydantic.dataclasses import dataclass
|
22
|
+
from pydantic import Field, AliasChoices
|
23
|
+
import typechat
|
24
|
+
|
25
|
+
from ..aitools.embeddings import NormalizedEmbeddings
|
26
|
+
from . import kplib
|
27
|
+
from .field_helpers import CamelCaseField
|
28
|
+
|
29
|
+
|
30
|
+
class IKnowledgeSource(Protocol):
    """A Knowledge Source is any object that returns knowledge.

    Structural protocol: any object with a matching ``get_knowledge``
    method satisfies it (see IMessage, which inherits this).
    """

    def get_knowledge(self) -> kplib.KnowledgeResponse:
        """Retrieves knowledge from the source."""
        ...
|
36
|
+
|
37
|
+
|
38
|
+
class IKnowledgeExtractor(Protocol):
    """Interface for extracting knowledge from messages."""

    async def extract(self, message: str) -> typechat.Result[kplib.KnowledgeResponse]:
        """Extract knowledge from a message.

        Returns a typechat Result wrapping the response, so failures are
        reported as a Failure value rather than an exception.
        """
        ...
|
44
|
+
|
45
|
+
|
46
|
+
@dataclass
class DeletionInfo:
    """Records that (and why) a message was deleted."""

    # When the deletion happened.
    timestamp: str
    # Optional human-readable reason for the deletion.
    reason: str | None = None
|
50
|
+
|
51
|
+
|
52
|
+
# Messages are referenced by their sequential ordinal numbers.
type MessageOrdinal = int
|
54
|
+
|
55
|
+
|
56
|
+
class IMessageMetadata(Protocol):
    """Metadata associated with a message."""

    # The source ("senders") of the message
    source: str | list[str] | None = None

    # The dest ("recipients") of the message
    dest: str | list[str] | None = None
|
64
|
+
|
65
|
+
|
66
|
+
class IMessage[TMetadata: IMessageMetadata](IKnowledgeSource, Protocol):
    """A message in a conversation.

    A Message contains one or more text chunks.
    As an IKnowledgeSource it must also implement get_knowledge().
    """

    # The text of the message, split into chunks.
    text_chunks: list[str]

    # (Optional) tags associated with the message.
    tags: list[str]

    # The (optional) timestamp of the message.
    # NOTE(review): presumably an ISO-8601 string, to satisfy the
    # ITimestampToTextRangeIndex contract below — confirm with callers.
    timestamp: str | None = None

    # (Future) Information about the deletion of the message.
    deletion_info: DeletionInfo | None = None

    # Metadata associated with the message such as its source.
    metadata: TMetadata | None = None
|
86
|
+
|
87
|
+
|
88
|
+
# Semantic refs are referenced by their sequential ordinal numbers.
type SemanticRefOrdinal = int
|
89
|
+
|
90
|
+
|
91
|
+
@dataclass
class ScoredSemanticRefOrdinal:
    """A semantic-ref ordinal paired with a relevance score."""

    semantic_ref_ordinal: SemanticRefOrdinal = CamelCaseField(
        "The ordinal of the semantic reference"
    )
    score: float = CamelCaseField("The relevance score")

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.semantic_ref_ordinal}, {self.score})"

    def serialize(self) -> "ScoredSemanticRefOrdinalData":
        """Serialize to a camelCase dict (via the pydantic serializer)."""
        return self.__pydantic_serializer__.to_python(self, by_alias=True)  # type: ignore

    @staticmethod
    def deserialize(data: "ScoredSemanticRefOrdinalData") -> "ScoredSemanticRefOrdinal":
        """Reconstruct from the camelCase dict produced by serialize()."""
        return ScoredSemanticRefOrdinal.__pydantic_validator__.validate_python(data)  # type: ignore
|
107
|
+
|
108
|
+
|
109
|
+
@dataclass
class ScoredMessageOrdinal:
    """A message ordinal paired with a relevance score."""

    message_ordinal: MessageOrdinal
    score: float
|
113
|
+
|
114
|
+
|
115
|
+
class ITermToSemanticRefIndex(Protocol):
    """Maps terms to the semantic refs in which they occur."""

    async def size(self) -> int: ...

    async def get_terms(self) -> list[str]: ...

    async def add_term(
        self,
        term: str,
        semantic_ref_ordinal: SemanticRefOrdinal | ScoredSemanticRefOrdinal,
    ) -> str: ...

    async def remove_term(
        self, term: str, semantic_ref_ordinal: SemanticRefOrdinal
    ) -> None: ...

    async def lookup_term(self, term: str) -> list[ScoredSemanticRefOrdinal] | None: ...

    async def clear(self) -> None: ...

    # Serialization round-trip; the payload shape is provider-defined
    # (see TermToSemanticRefIndexData for the persistent form).
    async def serialize(self) -> Any: ...

    async def deserialize(self, data: Any) -> None: ...
|
137
|
+
|
138
|
+
|
139
|
+
# The discriminator for the kinds of knowledge a SemanticRef may carry.
type KnowledgeType = Literal["entity", "action", "topic", "tag"]
|
140
|
+
|
141
|
+
|
142
|
+
@dataclass
class Topic:
    """A topic extracted from a conversation."""

    # Class-level discriminator (not a field; excluded from serialization).
    knowledge_type: ClassVar[Literal["topic"]] = "topic"
    text: str
|
146
|
+
|
147
|
+
|
148
|
+
@dataclass
class Tag:
    """A tag attached to conversation content."""

    # Class-level discriminator (not a field; excluded from serialization).
    knowledge_type: ClassVar[Literal["tag"]] = "tag"
    text: str
|
152
|
+
|
153
|
+
|
154
|
+
# Union of all knowledge payload types a SemanticRef may hold.
type Knowledge = kplib.ConcreteEntity | kplib.Action | Topic | Tag
|
155
|
+
|
156
|
+
|
157
|
+
class TextLocationData(TypedDict):
    """Serialized form of TextLocation (camelCase keys)."""

    messageOrdinal: MessageOrdinal
    chunkOrdinal: int
|
160
|
+
|
161
|
+
|
162
|
+
@dataclass(order=True)
class TextLocation:
    """A (message, chunk) position within a conversation.

    Ordered lexicographically by (message_ordinal, chunk_ordinal)
    via the generated dataclass comparisons (order=True).
    """

    # The ordinal of the message.
    message_ordinal: MessageOrdinal = CamelCaseField("The ordinal of the message")
    # The ordinal of the chunk.
    # In the end of a TextRange, 1 + ordinal of the last chunk in the range.
    chunk_ordinal: int = CamelCaseField(
        "The ordinal of the chunk; in the end of a TextRange, 1 + ordinal of the last chunk in the range",
        default=0,
    )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}({self.message_ordinal}, {self.chunk_ordinal})"
        )

    def serialize(self) -> TextLocationData:
        """Serialize to a camelCase dict (via the pydantic serializer)."""
        return self.__pydantic_serializer__.to_python(self, by_alias=True)  # type: ignore

    @staticmethod
    def deserialize(data: TextLocationData) -> "TextLocation":
        """Reconstruct from the camelCase dict produced by serialize()."""
        return TextLocation.__pydantic_validator__.validate_python(data)  # type: ignore
|
184
|
+
|
185
|
+
|
186
|
+
class TextRangeData(TypedDict):
    """Serialized form of TextRange; `end` is omitted for point ranges."""

    start: TextLocationData
    end: NotRequired[TextLocationData | None]
|
189
|
+
|
190
|
+
|
191
|
+
# A text range within a session.
# TODO: Are TextRanges totally ordered?
@dataclass
class TextRange:
    """A half-open range [start, end) of text within a conversation.

    If `end` is None the range is a single point: it covers exactly the
    one chunk at `start`. All comparisons below normalize a missing `end`
    to start + one chunk so that point ranges and explicit one-chunk
    ranges compare equal.
    """

    # The start of the range.
    start: TextLocation
    # The end of the range (exclusive). If None, the range is a single point.
    end: TextLocation | None = None

    def _effective_end(self) -> TextLocation:
        """Return the exclusive end, defaulting to one chunk past `start`.

        Factored out because every comparison needs the same
        normalization of a None `end`.
        """
        return self.end or TextLocation(
            self.start.message_ordinal, self.start.chunk_ordinal + 1
        )

    def __repr__(self) -> str:
        if self.end is None:
            return f"{self.__class__.__name__}({self.start})"
        else:
            return f"{self.__class__.__name__}({self.start}, {self.end})"

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, TextRange):
            return NotImplemented

        if self.start != other.start:
            return False

        # Compare the normalized (exclusive) ends so that end=None and an
        # explicit one-chunk end are considered equal.
        return self._effective_end() == other._effective_end()

    def __lt__(self, other: Self) -> bool:
        if self.start != other.start:
            return self.start < other.start
        return self._effective_end() < other._effective_end()

    def __gt__(self, other: Self) -> bool:
        return other.__lt__(self)

    def __ge__(self, other: Self) -> bool:
        return not self.__lt__(other)

    def __le__(self, other: Self) -> bool:
        return not other.__lt__(self)

    def __contains__(self, other: Self) -> bool:
        """True if `other` lies entirely within this range."""
        return (
            self.start <= other.start
            and other._effective_end() <= self._effective_end()
        )

    def serialize(self) -> TextRangeData:
        """Serialize to a camelCase dict; a None `end` is omitted."""
        return self.__pydantic_serializer__.to_python(self, by_alias=True, exclude_none=True)  # type: ignore

    @staticmethod
    def deserialize(data: TextRangeData) -> "TextRange":
        """Reconstruct from the dict produced by serialize()."""
        return TextRange.__pydantic_validator__.validate_python(data)  # type: ignore
|
258
|
+
|
259
|
+
|
260
|
+
# TODO: Implement serializing KnowledgeData (or import from kplib).
class KnowledgeData(TypedDict):
    """Placeholder for the serialized knowledge payload (shape TBD)."""

    pass
|
263
|
+
|
264
|
+
|
265
|
+
class SemanticRefData(TypedDict):
    """Serialized form of SemanticRef (camelCase keys)."""

    semanticRefOrdinal: SemanticRefOrdinal
    range: TextRangeData
    knowledgeType: KnowledgeType
    knowledge: KnowledgeData
|
270
|
+
|
271
|
+
|
272
|
+
@dataclass
class SemanticRef:
    """A piece of knowledge anchored to a text range in the conversation."""

    semantic_ref_ordinal: SemanticRefOrdinal = CamelCaseField(
        "The ordinal of the semantic reference"
    )
    range: TextRange = CamelCaseField("The text range of the semantic reference")
    knowledge: Knowledge = CamelCaseField(
        "The knowledge associated with this semantic reference"
    )

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.semantic_ref_ordinal}, {self.range}, {self.knowledge.knowledge_type!r}, {self.knowledge})"

    def serialize(self) -> SemanticRefData:
        """Serialize to a camelCase dict.

        The knowledge payload is dispatched to the serialization module
        (imported lazily here to avoid a circular import).
        """
        from . import serialization

        return SemanticRefData(
            semanticRefOrdinal=self.semantic_ref_ordinal,
            range=self.range.serialize(),
            knowledgeType=self.knowledge.knowledge_type,
            knowledge=serialization.serialize_object(self.knowledge),
        )

    @staticmethod
    def deserialize(data: SemanticRefData) -> "SemanticRef":
        """Reconstruct from the dict produced by serialize()."""
        from . import serialization

        knowledge = serialization.deserialize_knowledge(
            data["knowledgeType"], data["knowledge"]
        )
        return SemanticRef(
            semantic_ref_ordinal=data["semanticRefOrdinal"],
            range=TextRange.deserialize(data["range"]),
            knowledge=knowledge,
        )
|
307
|
+
|
308
|
+
|
309
|
+
@dataclass
class DateRange:
    """A datetime range; membership is start-inclusive and end-inclusive."""

    start: Datetime
    # Inclusive. If None, the range is unbounded.
    end: Datetime | None = None

    def __repr__(self) -> str:
        if self.end is None:
            return f"{self.__class__.__name__}({self.start!r})"
        else:
            return f"{self.__class__.__name__}({self.start!r}, {self.end!r})"

    def __contains__(self, datetime: Datetime) -> bool:
        # With no end, the range extends indefinitely past start.
        if self.end is None:
            return self.start <= datetime
        return self.start <= datetime <= self.end
|
325
|
+
|
326
|
+
|
327
|
+
# Term must be hashable to allow using it as a dict key or set member.
@dataclass(unsafe_hash=True)
class Term:
    """A search term with an optional match weight."""

    text: str
    # Optional weighting for these matches.
    weight: float | None = None

    def __repr__(self) -> str:
        if self.weight is None:
            return f"{self.__class__.__name__}({self.text!r})"
        else:
            return f"{self.__class__.__name__}({self.text!r}, {self.weight:.4g})"

    def serialize(self) -> "TermData":
        """Serialize to a camelCase dict; a None weight is omitted."""
        return self.__pydantic_serializer__.to_python(self, by_alias=True, exclude_none=True)  # type: ignore
|
342
|
+
|
343
|
+
|
344
|
+
# Allows for faster retrieval of name, value properties
@runtime_checkable
class IPropertyToSemanticRefIndex(Protocol):
    """Maps (property name, value) pairs to semantic refs."""

    async def size(self) -> int: ...

    async def get_values(self) -> list[str]: ...

    async def add_property(
        self,
        property_name: str,
        value: str,
        semantic_ref_ordinal: SemanticRefOrdinal | ScoredSemanticRefOrdinal,
    ) -> None: ...

    async def lookup_property(
        self, property_name: str, value: str
    ) -> list[ScoredSemanticRefOrdinal] | None: ...

    async def clear(self) -> None: ...

    async def remove_property(self, prop_name: str, semref_id: int) -> None: ...

    async def remove_all_for_semref(self, semref_id: int) -> None: ...
|
367
|
+
|
368
|
+
|
369
|
+
@dataclass
class TimestampedTextRange:
    """A text range paired with its timestamp."""

    timestamp: str
    range: TextRange
|
373
|
+
|
374
|
+
|
375
|
+
# Return text ranges in the given date range.
class ITimestampToTextRangeIndex(Protocol):
    """Maps message timestamps to text ranges, queryable by date range."""

    # Contract (stable across providers):
    # - Timestamps must be ISO-8601 strings sortable lexicographically.
    # - lookup_range(DateRange) returns items with start <= t < end (end exclusive).
    #   If end is None, treat as a point query with end = start + epsilon.
    async def size(self) -> int: ...

    async def add_timestamp(
        self, message_ordinal: MessageOrdinal, timestamp: str
    ) -> bool: ...

    async def add_timestamps(
        self, message_timestamps: list[tuple[MessageOrdinal, str]]
    ) -> None: ...

    async def lookup_range(
        self, date_range: DateRange
    ) -> list[TimestampedTextRange]: ...
|
394
|
+
|
395
|
+
|
396
|
+
class ITermToRelatedTerms(Protocol):
    """Exact-match mapping from a term to its related terms (aliases)."""

    async def lookup_term(self, text: str) -> list[Term] | None: ...

    async def size(self) -> int: ...

    async def is_empty(self) -> bool: ...

    async def clear(self) -> None: ...

    async def add_related_term(
        self, text: str, related_terms: Term | list[Term]
    ) -> None: ...

    async def remove_term(self, text: str) -> None: ...

    async def serialize(self) -> "TermToRelatedTermsData": ...

    async def deserialize(self, data: "TermToRelatedTermsData | None") -> None: ...
|
414
|
+
|
415
|
+
|
416
|
+
class ITermToRelatedTermsFuzzy(Protocol):
    """Fuzzy (similarity-based) lookup of related terms."""

    async def size(self) -> int: ...

    async def add_terms(self, texts: list[str]) -> None: ...

    async def lookup_term(
        self,
        text: str,
        max_hits: int | None = None,
        min_score: float | None = None,
    ) -> list[Term]: ...

    # Batch form of lookup_term; result list parallels `texts`.
    async def lookup_terms(
        self,
        texts: list[str],
        max_hits: int | None = None,
        min_score: float | None = None,
    ) -> list[list[Term]]: ...
|
434
|
+
|
435
|
+
|
436
|
+
class ITermToRelatedTermsIndex(Protocol):
    """Combined exact (aliases) and fuzzy related-terms index."""

    # Providers may implement aliases and fuzzy via separate tables, but must
    # expose them through these properties.
    @property
    def aliases(self) -> ITermToRelatedTerms: ...

    @property
    def fuzzy_index(self) -> ITermToRelatedTermsFuzzy | None: ...

    async def serialize(self) -> "TermsToRelatedTermsIndexData": ...

    async def deserialize(self, data: "TermsToRelatedTermsIndexData") -> None: ...
|
448
|
+
|
449
|
+
|
450
|
+
class ThreadData(TypedDict):
    """Serialized form of Thread."""

    description: str
    ranges: list[TextRangeData]
|
453
|
+
|
454
|
+
|
455
|
+
# A Thread is a set of text ranges in a conversation.
@dataclass
class Thread:
    """A described set of text ranges in a conversation."""

    description: str
    ranges: Sequence[TextRange]

    def serialize(self) -> ThreadData:
        """Serialize to a camelCase dict (via the pydantic serializer)."""
        return self.__pydantic_serializer__.to_python(self, by_alias=True)  # type: ignore

    @staticmethod
    def deserialize(data: ThreadData) -> "Thread":
        """Reconstruct from the dict produced by serialize()."""
        return Thread.__pydantic_validator__.validate_python(data)  # type: ignore
|
467
|
+
|
468
|
+
|
469
|
+
# Threads are referenced by their sequential ordinal numbers.
type ThreadOrdinal = int
|
470
|
+
|
471
|
+
|
472
|
+
@dataclass
class ScoredThreadOrdinal:
    """A thread ordinal paired with a relevance score."""

    thread_ordinal: ThreadOrdinal
    score: float
|
476
|
+
|
477
|
+
|
478
|
+
class IConversationThreads(Protocol):
    """Stores conversation threads and supports fuzzy lookup by description."""

    threads: list[Thread]

    async def add_thread(self, thread: Thread) -> None: ...

    async def lookup_thread(
        self,
        thread_description: str,
        max_matches: int | None = None,
        threshold_score: float | None = None,
    ) -> list[ScoredThreadOrdinal] | None: ...

    # NOTE: serialization here is synchronous, unlike most other indexes.
    def serialize(self) -> "ConversationThreadData[ThreadDataItem]": ...

    def deserialize(self, data: "ConversationThreadData[ThreadDataItem]") -> None: ...
|
493
|
+
|
494
|
+
|
495
|
+
@runtime_checkable
class IMessageTextIndex[TMessage: IMessage](Protocol):
    """Similarity index over message text, returning scored message ordinals."""

    async def add_messages(
        self,
        messages: Iterable[TMessage],
    ) -> None: ...

    # Like add_messages, but the caller supplies the ordinal of the first message.
    async def add_messages_starting_at(
        self,
        start_message_ordinal: int,
        messages: list[TMessage],
    ) -> None: ...

    async def lookup_messages(
        self,
        message_text: str,
        max_matches: int | None = None,
        threshold_score: float | None = None,
    ) -> list[ScoredMessageOrdinal]: ...

    # Like lookup_messages, restricted to the given candidate ordinals.
    async def lookup_messages_in_subset(
        self,
        message_text: str,
        ordinals_to_search: list[MessageOrdinal],
        max_matches: int | None = None,
        threshold_score: float | None = None,
    ) -> list[ScoredMessageOrdinal]: ...

    # Async alternatives to __len__ and __bool__
    async def size(self) -> int: ...

    async def is_empty(self) -> bool: ...

    # TODO: Others?

    async def serialize(self) -> "MessageTextIndexData": ...

    async def deserialize(self, data: "MessageTextIndexData") -> None: ...
|
534
|
+
|
535
|
+
|
536
|
+
class IConversationSecondaryIndexes[TMessage: IMessage](Protocol):
    """Bundle of optional secondary indexes attached to a conversation."""

    property_to_semantic_ref_index: IPropertyToSemanticRefIndex | None
    timestamp_index: ITimestampToTextRangeIndex | None
    term_to_related_terms_index: ITermToRelatedTermsIndex | None
    threads: IConversationThreads | None = None
    message_index: IMessageTextIndex[TMessage] | None = None
|
542
|
+
|
543
|
+
|
544
|
+
class IConversation[
    TMessage: IMessage,
    TTermToSemanticRefIndex: ITermToSemanticRefIndex,
](Protocol):
    """A conversation: messages, extracted semantic refs, and their indexes."""

    name_tag: str
    tags: list[str]
    messages: "IMessageCollection[TMessage]"
    semantic_refs: "ISemanticRefCollection"
    semantic_ref_index: TTermToSemanticRefIndex
    secondary_indexes: IConversationSecondaryIndexes[TMessage] | None
|
554
|
+
|
555
|
+
|
556
|
+
# -------------
|
557
|
+
# Search Types
|
558
|
+
# -------------
|
559
|
+
|
560
|
+
|
561
|
+
@dataclass
class SearchTerm:
    """Represents a term being searched for.

    Attributes:
        term: The term being searched for.
        related_terms: Additional terms related to the term. These can be supplied
            from synonym tables and so on.
            - An empty list indicates no related matches for this term.
            - `None` indicates that the search processor may try to resolve related
              terms from any available secondary indexes (e.g., ITermToRelatedTermsIndex).
    """

    term: Term
    related_terms: list[Term] | None = CamelCaseField(
        "Additional terms related to the term. These can be supplied from synonym tables and so on",
        default=None,
    )
|
579
|
+
|
580
|
+
|
581
|
+
# Well-known knowledge properties.
type KnowledgePropertyName = Literal[
    "name",  # the name of an entity
    "type",  # the type of an entity
    "verb",  # the verb of an action
    "subject",  # the subject of an action
    "object",  # the object of an action
    "indirectObject",  # the indirect object of an action
    "tag",  # tag
    "topic",  # topic
]
|
592
|
+
|
593
|
+
|
594
|
+
@dataclass
class PropertySearchTerm:
    """PropertySearch terms let you match named property values.

    - You can match a well-known property name (e.g., name("Bach"), type("book")).
    - Or you can provide a SearchTerm as a propertyName.
      For example, to match hue(red):
      - propertyName as SearchTerm, set to 'hue'
      - propertyValue as SearchTerm, set to 'red'
      We also want hue(red) to match any facets called color(red).

    SearchTerms can include related terms:
    - For example, you could include "color" as a related term for the
      propertyName "hue", or 'crimson' for red.

    The query processor can also resolve related terms using a
    related terms secondary index, if one is available.
    """

    property_name: KnowledgePropertyName | SearchTerm = CamelCaseField(
        "The property name to search for"
    )
    property_value: SearchTerm = CamelCaseField("The property value to search for")
|
617
|
+
|
618
|
+
|
619
|
+
@dataclass
class SearchTermGroup:
    """A group of search terms combined with a boolean operation.

    Groups may nest: `terms` can itself contain SearchTermGroups.
    """

    boolean_op: Literal["and", "or", "or_max"] = CamelCaseField(
        "The boolean operation to apply to the terms"
    )
    terms: list["SearchTermGroupTypes"] = CamelCaseField(
        "The list of search terms in this group", default_factory=list
    )
|
629
|
+
|
630
|
+
|
631
|
+
# Any member of a SearchTermGroup (including nested groups).
type SearchTermGroupTypes = SearchTerm | PropertySearchTerm | SearchTermGroup
|
632
|
+
|
633
|
+
|
634
|
+
@dataclass
class WhenFilter:
    """Additional constraints on when a SemanticRef is considered a match.

    A SemanticRef matching a term is actually considered a match
    when the following optional conditions are met (if present, must match):
        knowledgeType matches, e.g. knowledgeType == 'entity'
        dateRange matches, e.g. (Jan 3rd to Jan 10th)
        Semantic Refs are within supplied SCOPE,
        i.e. only Semantic Refs from a 'scoping' set of text ranges will match
    """

    knowledge_type: KnowledgeType | None = None
    date_range: DateRange | None = None
    thread_description: str | None = None
    tags: list[str] | None = None

    # SCOPE DEFINITION

    # Search terms whose matching text ranges supply the scope for this query
    scope_defining_terms: SearchTermGroup | None = None
    # Additional scoping ranges separately computed by caller
    text_ranges_in_scope: list[TextRange] | None = None
|
657
|
+
|
658
|
+
|
659
|
+
@dataclass
class SearchSelectExpr:
    """An expression used to select structured contents of a conversation."""

    # Term group that matches information.
    search_term_group: SearchTermGroup = CamelCaseField(
        "Term group that matches information"
    )
    # Filter that scopes what information to match.
    when: WhenFilter | None = None
|
667
|
+
|
668
|
+
|
669
|
+
@dataclass
class SemanticRefSearchResult:
    """Result of a semantic reference search."""

    # The term texts that produced matches.
    term_matches: set[str]
    # The matching semantic refs with their scores.
    semantic_ref_matches: list[ScoredSemanticRefOrdinal]
|
675
|
+
|
676
|
+
|
677
|
+
# --------------------------------------------------
|
678
|
+
# Serialization formats use TypedDict and camelCase
|
679
|
+
# --------------------------------------------------
|
680
|
+
|
681
|
+
|
682
|
+
class ThreadDataItem(TypedDict):
    """A serialized thread plus its (optional) embedding."""

    thread: ThreadData
    embedding: list[float] | None  # TODO: Why not NormalizedEmbedding?
|
685
|
+
|
686
|
+
|
687
|
+
class ConversationThreadData[TThreadDataItem: ThreadDataItem](TypedDict):
    """Serialized form of IConversationThreads."""

    threads: list[TThreadDataItem] | None
|
689
|
+
|
690
|
+
|
691
|
+
class TermData(TypedDict):
    """Serialized form of Term; `weight` is omitted when None."""

    text: str
    weight: NotRequired[float | None]
|
694
|
+
|
695
|
+
|
696
|
+
class TermsToRelatedTermsDataItem(TypedDict):
    """One term-to-related-terms mapping entry."""

    termText: str
    relatedTerms: list[TermData]
|
699
|
+
|
700
|
+
|
701
|
+
class TermToRelatedTermsData(TypedDict):
    """Serialized form of ITermToRelatedTerms."""

    relatedTerms: NotRequired[list[TermsToRelatedTermsDataItem] | None]
|
703
|
+
|
704
|
+
|
705
|
+
class TextEmbeddingIndexData(TypedDict):
    """Serialized text-embedding index: texts and their embeddings, aligned."""

    textItems: list[str]
    embeddings: NormalizedEmbeddings | None
|
708
|
+
|
709
|
+
|
710
|
+
class TermsToRelatedTermsIndexData(TypedDict):
    """Serialized form of ITermToRelatedTermsIndex (aliases + fuzzy)."""

    aliasData: NotRequired[TermToRelatedTermsData]
    textEmbeddingData: NotRequired[TextEmbeddingIndexData]
|
713
|
+
|
714
|
+
|
715
|
+
class ScoredSemanticRefOrdinalData(TypedDict):
    """Serialized form of ScoredSemanticRefOrdinal."""

    semanticRefOrdinal: SemanticRefOrdinal
    score: float
|
718
|
+
|
719
|
+
|
720
|
+
class TermToSemanticRefIndexItemData(TypedDict):
    """One term's entry in the persistent term index."""

    term: str
    semanticRefOrdinals: list[ScoredSemanticRefOrdinalData]
|
723
|
+
|
724
|
+
|
725
|
+
# Persistent form of a term index.
class TermToSemanticRefIndexData(TypedDict):
    """Serialized form of ITermToSemanticRefIndex."""

    items: list[TermToSemanticRefIndexItemData]
|
728
|
+
|
729
|
+
|
730
|
+
class ConversationData[TMessageData](TypedDict):
    """Serialized core conversation: messages, semantic refs, term index."""

    nameTag: str
    messages: list[TMessageData]
    tags: list[str]
    semanticRefs: list[SemanticRefData] | None
    semanticIndexData: NotRequired[TermToSemanticRefIndexData | None]
|
736
|
+
|
737
|
+
|
738
|
+
class TextToTextLocationIndexData(TypedDict):
    """Serialized mapping of text locations and their embeddings, aligned."""

    textLocations: list[TextLocationData]
    embeddings: NormalizedEmbeddings | None
|
741
|
+
|
742
|
+
|
743
|
+
class MessageTextIndexData(TypedDict):
    """Serialized form of IMessageTextIndex."""

    indexData: NotRequired[TextToTextLocationIndexData | None]
|
745
|
+
|
746
|
+
|
747
|
+
class ConversationDataWithIndexes[TMessageData](ConversationData[TMessageData]):
    """ConversationData plus optional serialized secondary indexes."""

    relatedTermsIndexData: NotRequired[TermsToRelatedTermsIndexData | None]
    threadData: NotRequired[ConversationThreadData[ThreadDataItem] | None]
    messageIndexData: NotRequired[MessageTextIndexData | None]
|
751
|
+
|
752
|
+
|
753
|
+
# --------------------------------
|
754
|
+
# Indexing helper data structures
|
755
|
+
# --------------------------------
|
756
|
+
|
757
|
+
|
758
|
+
# --------
|
759
|
+
# Storage
|
760
|
+
# --------
|
761
|
+
|
762
|
+
|
763
|
+
class IReadonlyCollection[T, TOrdinal](AsyncIterable[T], Protocol):
    """Read-only async collection addressed by ordinal."""

    async def size(self) -> int: ...

    async def get_item(self, arg: TOrdinal) -> T: ...

    # Half-open slice [start, stop), like Python slicing.
    async def get_slice(self, start: int, stop: int) -> list[T]: ...

    async def get_multiple(self, arg: list[TOrdinal]) -> list[T]: ...
|
771
|
+
|
772
|
+
|
773
|
+
class ICollection[T, TOrdinal](IReadonlyCollection[T, TOrdinal], Protocol):
|
774
|
+
"""An APPEND-ONLY collection."""
|
775
|
+
|
776
|
+
@property
|
777
|
+
def is_persistent(self) -> bool: ...
|
778
|
+
|
779
|
+
async def append(self, item: T) -> None: ...
|
780
|
+
|
781
|
+
async def extend(self, items: Iterable[T]) -> None:
|
782
|
+
"""Append multiple items to the collection."""
|
783
|
+
# The default implementation just calls append for each item.
|
784
|
+
for item in items:
|
785
|
+
await self.append(item)
|
786
|
+
|
787
|
+
|
788
|
+
class IMessageCollection[TMessage: IMessage](
    ICollection[TMessage, MessageOrdinal], Protocol
):
    """A collection of Messages."""
|
792
|
+
|
793
|
+
|
794
|
+
class ISemanticRefCollection(ICollection[SemanticRef, SemanticRefOrdinal], Protocol):
    """A collection of SemanticRefs."""
|
796
|
+
|
797
|
+
|
798
|
+
class IStorageProvider[TMessage: IMessage](Protocol):
    """API spec for storage providers -- maybe in-memory or persistent."""

    async def get_message_collection(self) -> IMessageCollection[TMessage]: ...

    async def get_semantic_ref_collection(self) -> ISemanticRefCollection: ...

    # Index getters - ALL 6 index types for this conversation
    async def get_semantic_ref_index(self) -> ITermToSemanticRefIndex: ...

    async def get_property_index(self) -> IPropertyToSemanticRefIndex: ...

    async def get_timestamp_index(self) -> ITimestampToTextRangeIndex: ...

    async def get_message_text_index(self) -> IMessageTextIndex[TMessage]: ...

    async def get_related_terms_index(self) -> ITermToRelatedTermsIndex: ...

    async def get_conversation_threads(self) -> IConversationThreads: ...

    # Release any underlying resources (e.g. database connections).
    async def close(self) -> None: ...
|