typeagent-py 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. typeagent/aitools/auth.py +61 -0
  2. typeagent/aitools/embeddings.py +232 -0
  3. typeagent/aitools/utils.py +244 -0
  4. typeagent/aitools/vectorbase.py +175 -0
  5. typeagent/knowpro/answer_context_schema.py +49 -0
  6. typeagent/knowpro/answer_response_schema.py +34 -0
  7. typeagent/knowpro/answers.py +577 -0
  8. typeagent/knowpro/collections.py +759 -0
  9. typeagent/knowpro/common.py +9 -0
  10. typeagent/knowpro/convknowledge.py +112 -0
  11. typeagent/knowpro/convsettings.py +94 -0
  12. typeagent/knowpro/convutils.py +49 -0
  13. typeagent/knowpro/date_time_schema.py +32 -0
  14. typeagent/knowpro/field_helpers.py +87 -0
  15. typeagent/knowpro/fuzzyindex.py +144 -0
  16. typeagent/knowpro/interfaces.py +818 -0
  17. typeagent/knowpro/knowledge.py +88 -0
  18. typeagent/knowpro/kplib.py +125 -0
  19. typeagent/knowpro/query.py +1128 -0
  20. typeagent/knowpro/search.py +628 -0
  21. typeagent/knowpro/search_query_schema.py +165 -0
  22. typeagent/knowpro/searchlang.py +729 -0
  23. typeagent/knowpro/searchlib.py +345 -0
  24. typeagent/knowpro/secindex.py +100 -0
  25. typeagent/knowpro/serialization.py +390 -0
  26. typeagent/knowpro/textlocindex.py +179 -0
  27. typeagent/knowpro/utils.py +17 -0
  28. typeagent/mcp/server.py +139 -0
  29. typeagent/podcasts/podcast.py +473 -0
  30. typeagent/podcasts/podcast_import.py +105 -0
  31. typeagent/storage/__init__.py +25 -0
  32. typeagent/storage/memory/__init__.py +13 -0
  33. typeagent/storage/memory/collections.py +68 -0
  34. typeagent/storage/memory/convthreads.py +81 -0
  35. typeagent/storage/memory/messageindex.py +178 -0
  36. typeagent/storage/memory/propindex.py +289 -0
  37. typeagent/storage/memory/provider.py +84 -0
  38. typeagent/storage/memory/reltermsindex.py +318 -0
  39. typeagent/storage/memory/semrefindex.py +660 -0
  40. typeagent/storage/memory/timestampindex.py +176 -0
  41. typeagent/storage/sqlite/__init__.py +31 -0
  42. typeagent/storage/sqlite/collections.py +362 -0
  43. typeagent/storage/sqlite/messageindex.py +382 -0
  44. typeagent/storage/sqlite/propindex.py +119 -0
  45. typeagent/storage/sqlite/provider.py +293 -0
  46. typeagent/storage/sqlite/reltermsindex.py +328 -0
  47. typeagent/storage/sqlite/schema.py +248 -0
  48. typeagent/storage/sqlite/semrefindex.py +156 -0
  49. typeagent/storage/sqlite/timestampindex.py +146 -0
  50. typeagent/storage/utils.py +41 -0
  51. typeagent_py-0.1.0.dist-info/METADATA +28 -0
  52. typeagent_py-0.1.0.dist-info/RECORD +55 -0
  53. typeagent_py-0.1.0.dist-info/WHEEL +5 -0
  54. typeagent_py-0.1.0.dist-info/licenses/LICENSE +21 -0
  55. typeagent_py-0.1.0.dist-info/top_level.txt +1 -0
typeagent/podcasts/podcast_import.py
@@ -0,0 +1,105 @@
+ # Copyright (c) Microsoft Corporation.
+ # Licensed under the MIT License.
+
+ import os
+ import re
+
+ from ..knowpro.convsettings import ConversationSettings
+ from ..knowpro.interfaces import Datetime
+ from ..storage.utils import create_storage_provider
+ from .podcast import Podcast, PodcastMessage, PodcastMessageMeta
+
+
+ async def import_podcast(
+     transcript_file_path: str,
+     settings: ConversationSettings,
+     podcast_name: str | None = None,
+     start_date: Datetime | None = None,
+     length_minutes: float = 60.0,
+     dbname: str | None = None,
+ ) -> Podcast:
+     with open(transcript_file_path, "r") as f:
+         transcript_lines = f.readlines()
+     if not podcast_name:
+         podcast_name = os.path.splitext(os.path.basename(transcript_file_path))[0]
+     # TODO: Don't use a regex, just basic string stuff
+     regex = r"""(?x)                  # Enable verbose regex syntax
+         ^
+         (?:                           # Optional speaker part
+             \s*                       # Optional leading whitespace
+             (?P<speaker>              # Capture group for speaker
+                 [A-Z0-9]+             # One or more uppercase letters/digits
+                 (?:\s+[A-Z0-9]+)*     # Optional additional words
+             )
+             \s*                       # Optional whitespace after speaker
+             :                         # Colon separator
+             \s*                       # Optional whitespace after colon
+         )?
+         (?P<speech>(?:.*\S)?)         # Capture the rest as speech (ending in non-whitespace)
+         \s*                           # Optional trailing whitespace
+         $
+         """
+     turn_parse_regex = re.compile(regex)
+     participants: set[str] = set()
+
+     cur_msg: PodcastMessage | None = None
+     msgs: list[PodcastMessage] = []
+     for line in transcript_lines:
+         match = turn_parse_regex.match(line)
+         if match:
+             speaker = match.group("speaker")
+             if speaker:
+                 speaker = speaker.lower()
+             speech = match.group("speech")
+             if not (speaker or speech):
+                 continue
+             if cur_msg:
+                 if not speaker:
+                     cur_msg.add_content("\n" + speech)
+                 else:
+                     msgs.append(cur_msg)
+                     cur_msg = None
+             if not cur_msg:
+                 if speaker:
+                     participants.add(speaker)
+                 metadata = PodcastMessageMeta(speaker)
+                 cur_msg = PodcastMessage([speech], metadata)
+     if cur_msg:
+         msgs.append(cur_msg)
+
+     assign_message_listeners(msgs, participants)
+
+     provider = await create_storage_provider(
+         settings.message_text_index_settings,
+         settings.related_term_index_settings,
+         dbname,
+         PodcastMessage,
+     )
+     msg_coll = await provider.get_message_collection()
+     semref_coll = await provider.get_semantic_ref_collection()
+     if await msg_coll.size() or await semref_coll.size():
+         raise RuntimeError(f"{dbname!r} already has messages or semantic refs.")
+
+     await msg_coll.extend(msgs)
+
+     pod = await Podcast.create(
+         settings,
+         name_tag=podcast_name,
+         messages=msg_coll,
+         tags=[podcast_name],
+         semantic_refs=semref_coll,
+     )
+     if start_date:
+         await pod.generate_timestamps(start_date, length_minutes)
+     # TODO: Add more tags.
+     return pod
+
+
+ def assign_message_listeners(
+     msgs: list[PodcastMessage],
+     participants: set[str],
+ ) -> None:
+     for msg in msgs:
+         if msg.metadata.speaker:
+             listeners = [p for p in participants if p != msg.metadata.speaker]
+             msg.metadata.listeners = listeners
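For orientation, a minimal usage sketch of import_podcast follows (not taken from the package's documentation). It assumes ConversationSettings can be constructed with defaults, that embedding-model credentials are already configured in the environment, and that the transcript path, podcast name, and start date are placeholders.

    import asyncio

    from typeagent.knowpro.convsettings import ConversationSettings
    from typeagent.knowpro.interfaces import Datetime
    from typeagent.podcasts.podcast_import import import_podcast

    async def main() -> None:
        settings = ConversationSettings()  # assumption: default construction is sufficient
        pod = await import_podcast(
            "episode_042_transcript.txt",  # hypothetical transcript file
            settings,
            podcast_name="episode_042",
            start_date=Datetime(2025, 1, 1, 9, 0),  # assumes Datetime behaves like datetime.datetime
            length_minutes=45.0,
        )
        print(await pod.messages.size())  # assumes Podcast exposes its message collection as `messages`

    asyncio.run(main())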
typeagent/storage/__init__.py
@@ -0,0 +1,25 @@
+ # Copyright (c) Microsoft Corporation.
+ # Licensed under the MIT License.
+
+ """Storage providers and implementations."""
+
+ # Import from new organized structure
+ from .memory import (
+     MemoryStorageProvider,
+     MemoryMessageCollection,
+     MemorySemanticRefCollection,
+ )
+ from .sqlite import (
+     SqliteStorageProvider,
+     SqliteMessageCollection,
+     SqliteSemanticRefCollection,
+ )
+
+ __all__ = [
+     "MemoryStorageProvider",
+     "MemoryMessageCollection",
+     "MemorySemanticRefCollection",
+     "SqliteStorageProvider",
+     "SqliteMessageCollection",
+     "SqliteSemanticRefCollection",
+ ]
typeagent/storage/memory/__init__.py
@@ -0,0 +1,13 @@
+ # Copyright (c) Microsoft Corporation.
+ # Licensed under the MIT License.
+
+ """Memory-based storage implementations."""
+
+ from .collections import MemoryMessageCollection, MemorySemanticRefCollection
+ from .provider import MemoryStorageProvider
+
+ __all__ = [
+     "MemoryMessageCollection",
+     "MemorySemanticRefCollection",
+     "MemoryStorageProvider",
+ ]
typeagent/storage/memory/collections.py
@@ -0,0 +1,68 @@
+ # Copyright (c) Microsoft Corporation.
+ # Licensed under the MIT License.
+
+ """Memory-based collection implementations."""
+
+ from typing import Iterable
+ from ...knowpro.interfaces import (
+     ICollection,
+     IMessage,
+     ISemanticRefCollection,
+     IMessageCollection,
+     MessageOrdinal,
+     SemanticRef,
+     SemanticRefOrdinal,
+ )
+
+
+ class MemoryCollection[T, TOrdinal: int](ICollection[T, TOrdinal]):
+     """A generic in-memory (non-persistent) collection class."""
+
+     def __init__(self, items: list[T] | None = None):
+         self.items: list[T] = items or []
+
+     async def size(self) -> int:
+         return len(self.items)
+
+     def __aiter__(self):
+         """Return an async iterator over the collection."""
+         return self._async_iterator()
+
+     async def _async_iterator(self):
+         """Async generator that yields items from the collection."""
+         for item in self.items:
+             yield item
+
+     async def get_item(self, arg: int) -> T:
+         """Retrieve an item by its ordinal."""
+         return self.items[arg]
+
+     async def get_slice(self, start: int, stop: int) -> list[T]:
+         """Retrieve a slice of items."""
+         return self.items[start:stop]
+
+     async def get_multiple(self, arg: list[TOrdinal]) -> list[T]:
+         """Retrieve multiple items by their ordinals."""
+         return [await self.get_item(ordinal) for ordinal in arg]
+
+     @property
+     def is_persistent(self) -> bool:
+         return False
+
+     async def append(self, item: T) -> None:
+         """Append an item to the collection."""
+         self.items.append(item)
+
+     async def extend(self, items: Iterable[T]) -> None:
+         """Extend the collection with multiple items."""
+         self.items.extend(items)
+
+
+ class MemorySemanticRefCollection(MemoryCollection[SemanticRef, SemanticRefOrdinal]):
+     """A collection of semantic references."""
+
+
+ class MemoryMessageCollection[TMessage: IMessage](
+     MemoryCollection[TMessage, MessageOrdinal]
+ ):
+     """A collection of messages."""
typeagent/storage/memory/convthreads.py
@@ -0,0 +1,81 @@
+ # Copyright (c) Microsoft Corporation.
+ # Licensed under the MIT License.
+
+ from ...knowpro.interfaces import (
+     ConversationThreadData,
+     IConversationThreads,
+     ThreadDataItem,
+     ScoredThreadOrdinal,
+     Thread,
+ )
+ from ...aitools.vectorbase import TextEmbeddingIndexSettings, VectorBase
+
+
+ class ConversationThreads(IConversationThreads):
+     threads: list[Thread]
+     vector_base: VectorBase
+
+     def __init__(self, settings: TextEmbeddingIndexSettings):
+         self.threads = []
+         self.vector_base = VectorBase(settings)
+
+     async def add_thread(self, thread: Thread) -> None:
+         assert len(self.threads) == len(self.vector_base)
+         await self.vector_base.add_key(thread.description, cache=False)
+         self.threads.append(thread)
+
+     async def lookup_thread(
+         self,
+         thread_description: str,
+         max_matches: int | None = None,
+         threshold_score: float | None = None,
+     ) -> list[ScoredThreadOrdinal]:
+         matches = await self.vector_base.fuzzy_lookup(
+             thread_description,
+             max_matches,
+             threshold_score,
+         )
+         return [
+             ScoredThreadOrdinal(
+                 match.item,
+                 match.score,
+             )
+             for match in matches
+         ]
+
+     def clear(self) -> None:
+         self.threads = []
+         self.vector_base.clear()
+
+     async def build_index(self) -> None:
+         self.vector_base.clear()  # Just in case
+         await self.vector_base.add_keys(
+             [t.description for t in self.threads], cache=False
+         )
+
+     def serialize(self) -> ConversationThreadData[ThreadDataItem]:
+         thread_data: list[ThreadDataItem] = []
+         for i, thread in enumerate(self.threads):
+             emb = self.vector_base.serialize_embedding_at(i)
+             thread_data.append(
+                 ThreadDataItem(
+                     thread=thread.serialize(),
+                     embedding=list(emb) if emb is not None else None,
+                 )
+             )
+
+         return ConversationThreadData(threads=thread_data)
+
+     def deserialize(self, data: ConversationThreadData[ThreadDataItem]) -> None:
+         self.clear()
+         thread_data = data.get("threads")
+         if thread_data is None:
+             return
+         for item in thread_data:
+             thread_data = item["thread"]
+             embedding = item["embedding"]
+             thread = Thread.deserialize(thread_data)
+             self.threads.append(thread)
+             if embedding is not None:
+                 # assert isinstance(embedding, list), "Expected embedding to be a list"
+                 self.vector_base.add_embedding(thread_data["description"], embedding)
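A small sketch of the persistence round trip above, assuming a ConversationThreads instance that was already populated (building one requires an embedding model, which is out of scope here):

    # Round-trip through serialize()/deserialize(); the stored embeddings are
    # carried in the data, so the descriptions are not re-embedded on load.
    def roundtrip(
        threads: ConversationThreads,
        settings: TextEmbeddingIndexSettings,
    ) -> ConversationThreads:
        data = threads.serialize()            # each item: serialized thread + its embedding
        restored = ConversationThreads(settings)
        restored.deserialize(data)            # clear(), then rebuild threads and vector_base
        assert len(restored.threads) == len(threads.threads)
        return restored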
typeagent/storage/memory/messageindex.py
@@ -0,0 +1,178 @@
+ # Copyright (c) Microsoft Corporation.
+ # Licensed under the MIT License.
+
+ from collections.abc import Callable, Iterable
+ from dataclasses import dataclass
+
+ from ...aitools.embeddings import NormalizedEmbedding
+ from ...aitools.vectorbase import TextEmbeddingIndexSettings
+ from ...knowpro.convsettings import MessageTextIndexSettings
+ from ...knowpro.interfaces import (
+     IConversation,
+     IMessage,
+     IMessageTextIndex,
+     IStorageProvider,
+     MessageTextIndexData,
+     ITermToSemanticRefIndex,
+     MessageOrdinal,
+     ScoredMessageOrdinal,
+     TextLocation,
+ )
+ from ...knowpro.textlocindex import ScoredTextLocation, TextToTextLocationIndex
+
+
+ async def build_message_index[
+     TMessage: IMessage,
+     TTermToSemanticRefIndex: ITermToSemanticRefIndex,
+ ](
+     conversation: IConversation[TMessage, TTermToSemanticRefIndex],
+     storage_provider: IStorageProvider[TMessage],
+ ) -> None:
+     csi = conversation.secondary_indexes
+     if csi is None:
+         return
+     if csi.message_index is None:
+         csi.message_index = await storage_provider.get_message_text_index()
+     messages = conversation.messages
+     # Convert collection to list for add_messages
+     messages_list = await messages.get_slice(0, await messages.size())
+     await csi.message_index.add_messages(messages_list)
+
+
+ class IMessageTextEmbeddingIndex(IMessageTextIndex):
+     async def generate_embedding(self, text: str) -> NormalizedEmbedding: ...
+
+     def lookup_by_embedding(
+         self,
+         text_embedding: NormalizedEmbedding,
+         max_matches: int | None = None,
+         threshold_score: float | None = None,
+         predicate: Callable[[MessageOrdinal], bool] | None = None,
+     ) -> list[ScoredMessageOrdinal]: ...
+
+     def lookup_in_subset_by_embedding(
+         self,
+         text_embedding: NormalizedEmbedding,
+         ordinals_to_search: list[MessageOrdinal],
+         max_matches: int | None = None,
+         threshold_score: float | None = None,
+     ) -> list[ScoredMessageOrdinal]: ...
+
+
+ class MessageTextIndex(IMessageTextEmbeddingIndex):
+     def __init__(self, settings: MessageTextIndexSettings):
+         self.settings = settings
+         self.text_location_index = TextToTextLocationIndex(
+             settings.embedding_index_settings
+         )
+
+     async def size(self) -> int:
+         return await self.text_location_index.size()
+
+     async def is_empty(self) -> bool:
+         return await self.text_location_index.is_empty()
+
+     async def add_messages[TMessage: IMessage](
+         self,
+         messages: Iterable[TMessage],
+     ) -> None:
+         base_message_ordinal: MessageOrdinal = await self.text_location_index.size()
+         all_chunks: list[tuple[str, TextLocation]] = []
+         # Collect everything so we can batch efficiently.
+         for message_ordinal, message in enumerate(messages, base_message_ordinal):
+             for chunk_ordinal, chunk in enumerate(message.text_chunks):
+                 all_chunks.append((chunk, TextLocation(message_ordinal, chunk_ordinal)))
+         await self.text_location_index.add_text_locations(all_chunks)
+
+     async def add_messages_starting_at(
+         self,
+         start_message_ordinal: int,
+         messages: list[IMessage],
+     ) -> None:
+         """Add messages to the index starting at the given ordinal."""
+         all_chunks: list[tuple[str, TextLocation]] = []
+         for idx, message in enumerate(messages):
+             msg_ord = start_message_ordinal + idx
+             for chunk_ord, chunk in enumerate(message.text_chunks):
+                 all_chunks.append((chunk, TextLocation(msg_ord, chunk_ord)))
+         await self.text_location_index.add_text_locations(all_chunks)
+
+     async def lookup_messages(
+         self,
+         message_text: str,
+         max_matches: int | None = None,
+         threshold_score: float | None = None,
+     ) -> list[ScoredMessageOrdinal]:
+         max_matches = max_matches or self.settings.embedding_index_settings.max_matches
+         threshold_score = (
+             threshold_score or self.settings.embedding_index_settings.min_score
+         )
+         scored_text_locations = await self.text_location_index.lookup_text(
+             message_text, max_matches, threshold_score
+         )
+         return self.to_scored_message_ordinals(scored_text_locations)
+
+     async def lookup_messages_in_subset(
+         self,
+         message_text: str,
+         ordinals_to_search: list[MessageOrdinal],
+         max_matches: int | None = None,
+         threshold_score: float | None = None,
+     ) -> list[ScoredMessageOrdinal]:
+         scored_text_locations = await self.text_location_index.lookup_text_in_subset(
+             message_text, ordinals_to_search, max_matches, threshold_score
+         )
+         return self.to_scored_message_ordinals(scored_text_locations)
+
+     async def generate_embedding(self, text: str) -> NormalizedEmbedding:
+         # Note: if you rename generate_embedding, be sure to also fix is_message_text_embedding_index.
+         # TODO: Retries?
+         # TODO: Find a prettier API to get an embedding rather than using _vector_base?
+         return await self.text_location_index.generate_embedding(text)
+
+     def lookup_in_subset_by_embedding(
+         self,
+         text_embedding: NormalizedEmbedding,
+         ordinals_to_search: list[MessageOrdinal],
+         max_matches: int | None = None,
+         threshold_score: float | None = None,
+     ) -> list[ScoredMessageOrdinal]:
+         scored_text_locations = self.text_location_index.lookup_in_subset_by_embedding(
+             text_embedding, ordinals_to_search, max_matches, threshold_score
+         )
+         return self.to_scored_message_ordinals(scored_text_locations)
+
+     def to_scored_message_ordinals(
+         self, scored_locations: list[ScoredTextLocation]
+     ) -> list[ScoredMessageOrdinal]:
+         matches: dict[MessageOrdinal, ScoredMessageOrdinal] = {}
+
+         for sl in scored_locations:
+             value = sl.text_location.message_ordinal
+             score = sl.score
+             match = matches.get(value)
+             if match is None:
+                 matches[value] = ScoredMessageOrdinal(value, score)
+             else:
+                 match.score = max(score, match.score)
+
+         return [
+             ScoredMessageOrdinal(
+                 match.message_ordinal,
+                 match.score,
+             )
+             for match in sorted(
+                 matches.values(), key=lambda match: match.score, reverse=True
+             )
+         ]
+
+     async def serialize(self) -> MessageTextIndexData:
+         return MessageTextIndexData(
+             indexData=self.text_location_index.serialize(),
+         )
+
+     async def deserialize(self, data: MessageTextIndexData) -> None:
+         index_data = data.get("indexData")
+         if index_data is None:
+             return
+         self.text_location_index.deserialize(index_data)
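Finally, a hedged sketch of how MessageTextIndex is typically used: index every chunk of every message, then look up by fuzzy text match. It assumes a MessageTextIndexSettings instance and a list of IMessage objects (i.e., objects exposing text_chunks) are available from elsewhere in the package.

    async def index_and_search(
        settings: MessageTextIndexSettings,
        messages: list[IMessage],
        query: str,
    ) -> list[ScoredMessageOrdinal]:
        index = MessageTextIndex(settings)
        await index.add_messages(messages)   # one TextLocation per (message, chunk), batched in a single call
        hits = await index.lookup_messages(query, max_matches=5)
        for hit in hits:                     # deduplicated per message; the best chunk score wins
            print(hit.message_ordinal, hit.score)
        return hits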