typeagent-py 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. typeagent/aitools/auth.py +61 -0
  2. typeagent/aitools/embeddings.py +232 -0
  3. typeagent/aitools/utils.py +244 -0
  4. typeagent/aitools/vectorbase.py +175 -0
  5. typeagent/knowpro/answer_context_schema.py +49 -0
  6. typeagent/knowpro/answer_response_schema.py +34 -0
  7. typeagent/knowpro/answers.py +577 -0
  8. typeagent/knowpro/collections.py +759 -0
  9. typeagent/knowpro/common.py +9 -0
  10. typeagent/knowpro/convknowledge.py +112 -0
  11. typeagent/knowpro/convsettings.py +94 -0
  12. typeagent/knowpro/convutils.py +49 -0
  13. typeagent/knowpro/date_time_schema.py +32 -0
  14. typeagent/knowpro/field_helpers.py +87 -0
  15. typeagent/knowpro/fuzzyindex.py +144 -0
  16. typeagent/knowpro/interfaces.py +818 -0
  17. typeagent/knowpro/knowledge.py +88 -0
  18. typeagent/knowpro/kplib.py +125 -0
  19. typeagent/knowpro/query.py +1128 -0
  20. typeagent/knowpro/search.py +628 -0
  21. typeagent/knowpro/search_query_schema.py +165 -0
  22. typeagent/knowpro/searchlang.py +729 -0
  23. typeagent/knowpro/searchlib.py +345 -0
  24. typeagent/knowpro/secindex.py +100 -0
  25. typeagent/knowpro/serialization.py +390 -0
  26. typeagent/knowpro/textlocindex.py +179 -0
  27. typeagent/knowpro/utils.py +17 -0
  28. typeagent/mcp/server.py +139 -0
  29. typeagent/podcasts/podcast.py +473 -0
  30. typeagent/podcasts/podcast_import.py +105 -0
  31. typeagent/storage/__init__.py +25 -0
  32. typeagent/storage/memory/__init__.py +13 -0
  33. typeagent/storage/memory/collections.py +68 -0
  34. typeagent/storage/memory/convthreads.py +81 -0
  35. typeagent/storage/memory/messageindex.py +178 -0
  36. typeagent/storage/memory/propindex.py +289 -0
  37. typeagent/storage/memory/provider.py +84 -0
  38. typeagent/storage/memory/reltermsindex.py +318 -0
  39. typeagent/storage/memory/semrefindex.py +660 -0
  40. typeagent/storage/memory/timestampindex.py +176 -0
  41. typeagent/storage/sqlite/__init__.py +31 -0
  42. typeagent/storage/sqlite/collections.py +362 -0
  43. typeagent/storage/sqlite/messageindex.py +382 -0
  44. typeagent/storage/sqlite/propindex.py +119 -0
  45. typeagent/storage/sqlite/provider.py +293 -0
  46. typeagent/storage/sqlite/reltermsindex.py +328 -0
  47. typeagent/storage/sqlite/schema.py +248 -0
  48. typeagent/storage/sqlite/semrefindex.py +156 -0
  49. typeagent/storage/sqlite/timestampindex.py +146 -0
  50. typeagent/storage/utils.py +41 -0
  51. typeagent_py-0.1.0.dist-info/METADATA +28 -0
  52. typeagent_py-0.1.0.dist-info/RECORD +55 -0
  53. typeagent_py-0.1.0.dist-info/WHEEL +5 -0
  54. typeagent_py-0.1.0.dist-info/licenses/LICENSE +21 -0
  55. typeagent_py-0.1.0.dist-info/top_level.txt +1 -0
typeagent/podcasts/podcast_import.py
@@ -0,0 +1,105 @@
+ # Copyright (c) Microsoft Corporation.
+ # Licensed under the MIT License.
+
+ import os
+ import re
+
+ from ..knowpro.convsettings import ConversationSettings
+ from ..knowpro.interfaces import Datetime
+ from ..storage.utils import create_storage_provider
+ from .podcast import Podcast, PodcastMessage, PodcastMessageMeta
+
+
+ async def import_podcast(
+     transcript_file_path: str,
+     settings: ConversationSettings,
+     podcast_name: str | None = None,
+     start_date: Datetime | None = None,
+     length_minutes: float = 60.0,
+     dbname: str | None = None,
+ ) -> Podcast:
+     with open(transcript_file_path, "r") as f:
+         transcript_lines = f.readlines()
+     if not podcast_name:
+         podcast_name = os.path.splitext(os.path.basename(transcript_file_path))[0]
+     # TODO: Don't use a regex, just basic string stuff
+     regex = r"""(?x)                  # Enable verbose regex syntax
+         ^
+         (?:                           # Optional speaker part
+             \s*                       # Optional leading whitespace
+             (?P<speaker>              # Capture group for speaker
+                 [A-Z0-9]+             # One or more uppercase letters/digits
+                 (?:\s+[A-Z0-9]+)*     # Optional additional words
+             )
+             \s*                       # Optional whitespace after speaker
+             :                         # Colon separator
+             \s*                       # Optional whitespace after colon
+         )?
+         (?P<speech>(?:.*\S)?)         # Capture the rest as speech (ending in non-whitespace)
+         \s*                           # Optional trailing whitespace
+         $
+         """
+     turn_parse_regex = re.compile(regex)
+     participants: set[str] = set()
+
+     cur_msg: PodcastMessage | None = None
+     msgs: list[PodcastMessage] = []
+     for line in transcript_lines:
+         match = turn_parse_regex.match(line)
+         if match:
+             speaker = match.group("speaker")
+             if speaker:
+                 speaker = speaker.lower()
+             speech = match.group("speech")
+             if not (speaker or speech):
+                 continue
+             if cur_msg:
+                 if not speaker:
+                     cur_msg.add_content("\n" + speech)
+                 else:
+                     msgs.append(cur_msg)
+                     cur_msg = None
+             if not cur_msg:
+                 if speaker:
+                     participants.add(speaker)
+                 metadata = PodcastMessageMeta(speaker)
+                 cur_msg = PodcastMessage([speech], metadata)
+     if cur_msg:
+         msgs.append(cur_msg)
+
+     assign_message_listeners(msgs, participants)
+
+     provider = await create_storage_provider(
+         settings.message_text_index_settings,
+         settings.related_term_index_settings,
+         dbname,
+         PodcastMessage,
+     )
+     msg_coll = await provider.get_message_collection()
+     semref_coll = await provider.get_semantic_ref_collection()
+     if await msg_coll.size() or await semref_coll.size():
+         raise RuntimeError(f"{dbname!r} already has messages or semantic refs.")
+
+     await msg_coll.extend(msgs)
+
+     pod = await Podcast.create(
+         settings,
+         name_tag=podcast_name,
+         messages=msg_coll,
+         tags=[podcast_name],
+         semantic_refs=semref_coll,
+     )
+     if start_date:
+         await pod.generate_timestamps(start_date, length_minutes)
+     # TODO: Add more tags.
+     return pod
+
+
+ def assign_message_listeners(
+     msgs: list[PodcastMessage],
+     participants: set[str],
+ ) -> None:
+     for msg in msgs:
+         if msg.metadata.speaker:
+             listeners = [p for p in participants if p != msg.metadata.speaker]
+             msg.metadata.listeners = listeners
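For orientation, a minimal usage sketch of import_podcast follows (not taken from the package's documentation). It assumes ConversationSettings can be constructed with defaults, that embedding-model credentials are already configured in the environment, and that the transcript path, podcast name, and start date are placeholders.

    import asyncio

    from typeagent.knowpro.convsettings import ConversationSettings
    from typeagent.knowpro.interfaces import Datetime
    from typeagent.podcasts.podcast_import import import_podcast

    async def main() -> None:
        settings = ConversationSettings()  # assumption: default construction is sufficient
        pod = await import_podcast(
            "episode_042_transcript.txt",  # hypothetical transcript file
            settings,
            podcast_name="episode_042",
            start_date=Datetime(2025, 1, 1, 9, 0),  # assumes Datetime behaves like datetime.datetime
            length_minutes=45.0,
        )
        print(await pod.messages.size())  # assumes Podcast exposes its message collection as `messages`

    asyncio.run(main())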
typeagent/storage/__init__.py
@@ -0,0 +1,25 @@
+ # Copyright (c) Microsoft Corporation.
+ # Licensed under the MIT License.
+
+ """Storage providers and implementations."""
+
+ # Import from new organized structure
+ from .memory import (
+     MemoryStorageProvider,
+     MemoryMessageCollection,
+     MemorySemanticRefCollection,
+ )
+ from .sqlite import (
+     SqliteStorageProvider,
+     SqliteMessageCollection,
+     SqliteSemanticRefCollection,
+ )
+
+ __all__ = [
+     "MemoryStorageProvider",
+     "MemoryMessageCollection",
+     "MemorySemanticRefCollection",
+     "SqliteStorageProvider",
+     "SqliteMessageCollection",
+     "SqliteSemanticRefCollection",
+ ]
typeagent/storage/memory/__init__.py
@@ -0,0 +1,13 @@
+ # Copyright (c) Microsoft Corporation.
+ # Licensed under the MIT License.
+
+ """Memory-based storage implementations."""
+
+ from .collections import MemoryMessageCollection, MemorySemanticRefCollection
+ from .provider import MemoryStorageProvider
+
+ __all__ = [
+     "MemoryMessageCollection",
+     "MemorySemanticRefCollection",
+     "MemoryStorageProvider",
+ ]
typeagent/storage/memory/collections.py
@@ -0,0 +1,68 @@
+ # Copyright (c) Microsoft Corporation.
+ # Licensed under the MIT License.
+
+ """Memory-based collection implementations."""
+
+ from typing import Iterable
+ from ...knowpro.interfaces import (
+     ICollection,
+     IMessage,
+     ISemanticRefCollection,
+     IMessageCollection,
+     MessageOrdinal,
+     SemanticRef,
+     SemanticRefOrdinal,
+ )
+
+
+ class MemoryCollection[T, TOrdinal: int](ICollection[T, TOrdinal]):
+     """A generic in-memory (non-persistent) collection class."""
+
+     def __init__(self, items: list[T] | None = None):
+         self.items: list[T] = items or []
+
+     async def size(self) -> int:
+         return len(self.items)
+
+     def __aiter__(self):
+         """Return an async iterator over the collection."""
+         return self._async_iterator()
+
+     async def _async_iterator(self):
+         """Async generator that yields items from the collection."""
+         for item in self.items:
+             yield item
+
+     async def get_item(self, arg: int) -> T:
+         """Retrieve an item by its ordinal."""
+         return self.items[arg]
+
+     async def get_slice(self, start: int, stop: int) -> list[T]:
+         """Retrieve a slice of items."""
+         return self.items[start:stop]
+
+     async def get_multiple(self, arg: list[TOrdinal]) -> list[T]:
+         """Retrieve multiple items by their ordinals."""
+         return [await self.get_item(ordinal) for ordinal in arg]
+
+     @property
+     def is_persistent(self) -> bool:
+         return False
+
+     async def append(self, item: T) -> None:
+         """Append an item to the collection."""
+         self.items.append(item)
+
+     async def extend(self, items: Iterable[T]) -> None:
+         """Extend the collection with multiple items."""
+         self.items.extend(items)
+
+
+ class MemorySemanticRefCollection(MemoryCollection[SemanticRef, SemanticRefOrdinal]):
+     """A collection of semantic references."""
+
+
+ class MemoryMessageCollection[TMessage: IMessage](
+     MemoryCollection[TMessage, MessageOrdinal]
+ ):
+     """A collection of messages."""
typeagent/storage/memory/convthreads.py
@@ -0,0 +1,81 @@
+ # Copyright (c) Microsoft Corporation.
+ # Licensed under the MIT License.
+
+ from ...knowpro.interfaces import (
+     ConversationThreadData,
+     IConversationThreads,
+     ThreadDataItem,
+     ScoredThreadOrdinal,
+     Thread,
+ )
+ from ...aitools.vectorbase import TextEmbeddingIndexSettings, VectorBase
+
+
+ class ConversationThreads(IConversationThreads):
+     threads: list[Thread]
+     vector_base: VectorBase
+
+     def __init__(self, settings: TextEmbeddingIndexSettings):
+         self.threads = []
+         self.vector_base = VectorBase(settings)
+
+     async def add_thread(self, thread: Thread) -> None:
+         assert len(self.threads) == len(self.vector_base)
+         await self.vector_base.add_key(thread.description, cache=False)
+         self.threads.append(thread)
+
+     async def lookup_thread(
+         self,
+         thread_description: str,
+         max_matches: int | None = None,
+         threshold_score: float | None = None,
+     ) -> list[ScoredThreadOrdinal]:
+         matches = await self.vector_base.fuzzy_lookup(
+             thread_description,
+             max_matches,
+             threshold_score,
+         )
+         return [
+             ScoredThreadOrdinal(
+                 match.item,
+                 match.score,
+             )
+             for match in matches
+         ]
+
+     def clear(self) -> None:
+         self.threads = []
+         self.vector_base.clear()
+
+     async def build_index(self) -> None:
+         self.vector_base.clear()  # Just in case
+         await self.vector_base.add_keys(
+             [t.description for t in self.threads], cache=False
+         )
+
+     def serialize(self) -> ConversationThreadData[ThreadDataItem]:
+         thread_data: list[ThreadDataItem] = []
+         for i, thread in enumerate(self.threads):
+             emb = self.vector_base.serialize_embedding_at(i)
+             thread_data.append(
+                 ThreadDataItem(
+                     thread=thread.serialize(),
+                     embedding=list(emb) if emb is not None else None,
+                 )
+             )
+
+         return ConversationThreadData(threads=thread_data)
+
+     def deserialize(self, data: ConversationThreadData[ThreadDataItem]) -> None:
+         self.clear()
+         thread_data = data.get("threads")
+         if thread_data is None:
+             return
+         for item in thread_data:
+             thread_data = item["thread"]
+             embedding = item["embedding"]
+             thread = Thread.deserialize(thread_data)
+             self.threads.append(thread)
+             if embedding is not None:
+                 # assert isinstance(embedding, list), "Expected embedding to be a list"
+                 self.vector_base.add_embedding(thread_data["description"], embedding)
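A small sketch of the persistence round trip above, assuming a ConversationThreads instance that was already populated (building one requires an embedding model, which is out of scope here):

    # Round-trip through serialize()/deserialize(); the stored embeddings are
    # carried in the data, so the descriptions are not re-embedded on load.
    def roundtrip(
        threads: ConversationThreads,
        settings: TextEmbeddingIndexSettings,
    ) -> ConversationThreads:
        data = threads.serialize()            # each item: serialized thread + its embedding
        restored = ConversationThreads(settings)
        restored.deserialize(data)            # clear(), then rebuild threads and vector_base
        assert len(restored.threads) == len(threads.threads)
        return restored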
typeagent/storage/memory/messageindex.py
@@ -0,0 +1,178 @@
+ # Copyright (c) Microsoft Corporation.
+ # Licensed under the MIT License.
+
+ from collections.abc import Callable, Iterable
+ from dataclasses import dataclass
+
+ from ...aitools.embeddings import NormalizedEmbedding
+ from ...aitools.vectorbase import TextEmbeddingIndexSettings
+ from ...knowpro.convsettings import MessageTextIndexSettings
+ from ...knowpro.interfaces import (
+     IConversation,
+     IMessage,
+     IMessageTextIndex,
+     IStorageProvider,
+     MessageTextIndexData,
+     ITermToSemanticRefIndex,
+     MessageOrdinal,
+     ScoredMessageOrdinal,
+     TextLocation,
+ )
+ from ...knowpro.textlocindex import ScoredTextLocation, TextToTextLocationIndex
+
+
+ async def build_message_index[
+     TMessage: IMessage,
+     TTermToSemanticRefIndex: ITermToSemanticRefIndex,
+ ](
+     conversation: IConversation[TMessage, TTermToSemanticRefIndex],
+     storage_provider: IStorageProvider[TMessage],
+ ) -> None:
+     csi = conversation.secondary_indexes
+     if csi is None:
+         return
+     if csi.message_index is None:
+         csi.message_index = await storage_provider.get_message_text_index()
+     messages = conversation.messages
+     # Convert collection to list for add_messages
+     messages_list = await messages.get_slice(0, await messages.size())
+     await csi.message_index.add_messages(messages_list)
+
+
+ class IMessageTextEmbeddingIndex(IMessageTextIndex):
+     async def generate_embedding(self, text: str) -> NormalizedEmbedding: ...
+
+     def lookup_by_embedding(
+         self,
+         text_embedding: NormalizedEmbedding,
+         max_matches: int | None = None,
+         threshold_score: float | None = None,
+         predicate: Callable[[MessageOrdinal], bool] | None = None,
+     ) -> list[ScoredMessageOrdinal]: ...
+
+     def lookup_in_subset_by_embedding(
+         self,
+         text_embedding: NormalizedEmbedding,
+         ordinals_to_search: list[MessageOrdinal],
+         max_matches: int | None = None,
+         threshold_score: float | None = None,
+     ) -> list[ScoredMessageOrdinal]: ...
+
+
+ class MessageTextIndex(IMessageTextEmbeddingIndex):
+     def __init__(self, settings: MessageTextIndexSettings):
+         self.settings = settings
+         self.text_location_index = TextToTextLocationIndex(
+             settings.embedding_index_settings
+         )
+
+     async def size(self) -> int:
+         return await self.text_location_index.size()
+
+     async def is_empty(self) -> bool:
+         return await self.text_location_index.is_empty()
+
+     async def add_messages[TMessage: IMessage](
+         self,
+         messages: Iterable[TMessage],
+     ) -> None:
+         base_message_ordinal: MessageOrdinal = await self.text_location_index.size()
+         all_chunks: list[tuple[str, TextLocation]] = []
+         # Collect everything so we can batch efficiently.
+         for message_ordinal, message in enumerate(messages, base_message_ordinal):
+             for chunk_ordinal, chunk in enumerate(message.text_chunks):
+                 all_chunks.append((chunk, TextLocation(message_ordinal, chunk_ordinal)))
+         await self.text_location_index.add_text_locations(all_chunks)
+
+     async def add_messages_starting_at(
+         self,
+         start_message_ordinal: int,
+         messages: list[IMessage],
+     ) -> None:
+         """Add messages to the index starting at the given ordinal."""
+         all_chunks: list[tuple[str, TextLocation]] = []
+         for idx, message in enumerate(messages):
+             msg_ord = start_message_ordinal + idx
+             for chunk_ord, chunk in enumerate(message.text_chunks):
+                 all_chunks.append((chunk, TextLocation(msg_ord, chunk_ord)))
+         await self.text_location_index.add_text_locations(all_chunks)
+
+     async def lookup_messages(
+         self,
+         message_text: str,
+         max_matches: int | None = None,
+         threshold_score: float | None = None,
+     ) -> list[ScoredMessageOrdinal]:
+         max_matches = max_matches or self.settings.embedding_index_settings.max_matches
+         threshold_score = (
+             threshold_score or self.settings.embedding_index_settings.min_score
+         )
+         scored_text_locations = await self.text_location_index.lookup_text(
+             message_text, max_matches, threshold_score
+         )
+         return self.to_scored_message_ordinals(scored_text_locations)
+
+     async def lookup_messages_in_subset(
+         self,
+         message_text: str,
+         ordinals_to_search: list[MessageOrdinal],
+         max_matches: int | None = None,
+         threshold_score: float | None = None,
+     ) -> list[ScoredMessageOrdinal]:
+         scored_text_locations = await self.text_location_index.lookup_text_in_subset(
+             message_text, ordinals_to_search, max_matches, threshold_score
+         )
+         return self.to_scored_message_ordinals(scored_text_locations)
+
+     async def generate_embedding(self, text: str) -> NormalizedEmbedding:
+         # Note: if you rename generate_embedding, be sure to also fix is_message_text_embedding_index.
+         # TODO: Retries?
+         # TODO: Find a prettier API to get an embedding rather than using _vector_base?
+         return await self.text_location_index.generate_embedding(text)
+
+     def lookup_in_subset_by_embedding(
+         self,
+         text_embedding: NormalizedEmbedding,
+         ordinals_to_search: list[MessageOrdinal],
+         max_matches: int | None = None,
+         threshold_score: float | None = None,
+     ) -> list[ScoredMessageOrdinal]:
+         scored_text_locations = self.text_location_index.lookup_in_subset_by_embedding(
+             text_embedding, ordinals_to_search, max_matches, threshold_score
+         )
+         return self.to_scored_message_ordinals(scored_text_locations)
+
+     def to_scored_message_ordinals(
+         self, scored_locations: list[ScoredTextLocation]
+     ) -> list[ScoredMessageOrdinal]:
+         matches: dict[MessageOrdinal, ScoredMessageOrdinal] = {}
+
+         for sl in scored_locations:
+             value = sl.text_location.message_ordinal
+             score = sl.score
+             match = matches.get(value)
+             if match is None:
+                 matches[value] = ScoredMessageOrdinal(value, score)
+             else:
+                 match.score = max(score, match.score)
+
+         return [
+             ScoredMessageOrdinal(
+                 match.message_ordinal,
+                 match.score,
+             )
+             for match in sorted(
+                 matches.values(), key=lambda match: match.score, reverse=True
+             )
+         ]
+
+     async def serialize(self) -> MessageTextIndexData:
+         return MessageTextIndexData(
+             indexData=self.text_location_index.serialize(),
+         )
+
+     async def deserialize(self, data: MessageTextIndexData) -> None:
+         index_data = data.get("indexData")
+         if index_data is None:
+             return
+         self.text_location_index.deserialize(index_data)
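Finally, a hedged sketch of how MessageTextIndex is typically used: index every chunk of every message, then look up by fuzzy text match. It assumes a MessageTextIndexSettings instance and a list of IMessage objects (i.e., objects exposing text_chunks) are available from elsewhere in the package.

    async def index_and_search(
        settings: MessageTextIndexSettings,
        messages: list[IMessage],
        query: str,
    ) -> list[ScoredMessageOrdinal]:
        index = MessageTextIndex(settings)
        await index.add_messages(messages)   # one TextLocation per (message, chunk), batched in a single call
        hits = await index.lookup_messages(query, max_matches=5)
        for hit in hits:                     # deduplicated per message; the best chunk score wins
            print(hit.message_ordinal, hit.score)
        return hits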