gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. gnosisllm_knowledge/__init__.py +91 -39
  2. gnosisllm_knowledge/api/__init__.py +3 -2
  3. gnosisllm_knowledge/api/knowledge.py +502 -32
  4. gnosisllm_knowledge/api/memory.py +966 -0
  5. gnosisllm_knowledge/backends/__init__.py +14 -5
  6. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  7. gnosisllm_knowledge/backends/memory/searcher.py +111 -10
  8. gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
  9. gnosisllm_knowledge/backends/opensearch/config.py +49 -28
  10. gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
  11. gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
  12. gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
  13. gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
  14. gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
  15. gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
  16. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  17. gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
  18. gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
  19. gnosisllm_knowledge/cli/app.py +436 -31
  20. gnosisllm_knowledge/cli/commands/agentic.py +26 -9
  21. gnosisllm_knowledge/cli/commands/load.py +169 -19
  22. gnosisllm_knowledge/cli/commands/memory.py +733 -0
  23. gnosisllm_knowledge/cli/commands/search.py +9 -10
  24. gnosisllm_knowledge/cli/commands/setup.py +49 -23
  25. gnosisllm_knowledge/cli/display/service.py +43 -0
  26. gnosisllm_knowledge/cli/utils/config.py +62 -4
  27. gnosisllm_knowledge/core/domain/__init__.py +54 -0
  28. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  29. gnosisllm_knowledge/core/domain/document.py +19 -19
  30. gnosisllm_knowledge/core/domain/memory.py +440 -0
  31. gnosisllm_knowledge/core/domain/result.py +11 -3
  32. gnosisllm_knowledge/core/domain/search.py +12 -25
  33. gnosisllm_knowledge/core/domain/source.py +11 -12
  34. gnosisllm_knowledge/core/events/__init__.py +8 -0
  35. gnosisllm_knowledge/core/events/types.py +198 -5
  36. gnosisllm_knowledge/core/exceptions.py +227 -0
  37. gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
  38. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  39. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  40. gnosisllm_knowledge/core/interfaces/memory.py +524 -0
  41. gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
  42. gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
  43. gnosisllm_knowledge/core/streaming/__init__.py +36 -0
  44. gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
  45. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  46. gnosisllm_knowledge/fetchers/config.py +27 -0
  47. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  48. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  49. gnosisllm_knowledge/loaders/__init__.py +5 -1
  50. gnosisllm_knowledge/loaders/base.py +3 -4
  51. gnosisllm_knowledge/loaders/discovery.py +338 -0
  52. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  53. gnosisllm_knowledge/loaders/factory.py +46 -0
  54. gnosisllm_knowledge/loaders/sitemap.py +129 -1
  55. gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
  56. gnosisllm_knowledge/services/indexing.py +100 -93
  57. gnosisllm_knowledge/services/search.py +84 -31
  58. gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
  59. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
  60. gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
  62. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
  63. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/core/interfaces/memory.py
@@ -0,0 +1,524 @@
+"""Memory protocols - Interface Segregation Principle."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
+
+if TYPE_CHECKING:
+    from datetime import datetime
+
+    from gnosisllm_knowledge.core.domain.memory import (
+        ContainerConfig,
+        ContainerInfo,
+        HistoryEntry,
+        MemoryEntry,
+        MemoryStats,
+        MemoryStrategy,
+        MemoryType,
+        Message,
+        Namespace,
+        RecallResult,
+        SessionInfo,
+        StoreRequest,
+        StoreResult,
+    )
+
+
+@runtime_checkable
+class IMemoryContainerManager(Protocol):
+    """Protocol for memory container management.
+
+    Responsible for CRUD operations on memory containers.
+    """
+
+    async def create_container(
+        self,
+        config: ContainerConfig,
+        **options: Any,
+    ) -> ContainerInfo:
+        """Create a new memory container.
+
+        Args:
+            config: Container configuration.
+            **options: Backend-specific options.
+
+        Returns:
+            Created container info.
+        """
+        ...
+
+    async def get_container(
+        self,
+        container_id: str,
+        **options: Any,
+    ) -> ContainerInfo | None:
+        """Get container by ID.
+
+        Args:
+            container_id: Container ID.
+            **options: Backend-specific options.
+
+        Returns:
+            Container info or None if not found.
+        """
+        ...
+
+    async def list_containers(
+        self,
+        limit: int = 100,
+        **options: Any,
+    ) -> list[ContainerInfo]:
+        """List all containers.
+
+        Args:
+            limit: Maximum number to return.
+            **options: Backend-specific options.
+
+        Returns:
+            List of container info.
+        """
+        ...
+
+    async def update_container(
+        self,
+        container_id: str,
+        config: ContainerConfig,
+        **options: Any,
+    ) -> ContainerInfo:
+        """Update container configuration.
+
+        Args:
+            container_id: Container ID.
+            config: Updated configuration.
+            **options: Backend-specific options.
+
+        Returns:
+            Updated container info.
+        """
+        ...
+
+    async def delete_container(
+        self,
+        container_id: str,
+        **options: Any,
+    ) -> bool:
+        """Delete a container.
+
+        Args:
+            container_id: Container ID.
+            **options: Backend-specific options.
+
+        Returns:
+            True if deleted.
+        """
+        ...
+
+
+@runtime_checkable
+class IMemoryStore(Protocol):
+    """Protocol for storing memories.
+
+    Responsible for adding memories to containers.
+    """
+
+    async def store(
+        self,
+        container_id: str,
+        request: StoreRequest,
+        **options: Any,
+    ) -> StoreResult:
+        """Store memory in container.
+
+        Args:
+            container_id: Target container ID.
+            request: Store request with messages/data.
+            **options: Backend-specific options.
+
+        Returns:
+            Store result with IDs and counts.
+        """
+        ...
+
+    async def get_working_memory(
+        self,
+        container_id: str,
+        session_id: str | None = None,
+        namespace: Namespace | None = None,
+        limit: int = 50,
+        offset: int = 0,
+        **options: Any,
+    ) -> list[Message]:
+        """Get working memory messages.
+
+        Args:
+            container_id: Container ID.
+            session_id: Optional session filter.
+            namespace: Optional namespace filter.
+            limit: Maximum messages.
+            offset: Skip count.
+            **options: Backend-specific options.
+
+        Returns:
+            List of messages.
+        """
+        ...
+
+    async def clear_working_memory(
+        self,
+        container_id: str,
+        session_id: str | None = None,
+        namespace: Namespace | None = None,
+        **options: Any,
+    ) -> int:
+        """Clear working memory.
+
+        Args:
+            container_id: Container ID.
+            session_id: Optional session filter.
+            namespace: Optional namespace filter.
+            **options: Backend-specific options.
+
+        Returns:
+            Number of messages deleted.
+        """
+        ...
+
+
+@runtime_checkable
+class IMemoryRetriever(Protocol):
+    """Protocol for retrieving/searching memories.
+
+    Responsible for semantic search over long-term memory.
+    """
+
+    async def recall(
+        self,
+        container_id: str,
+        query: str,
+        namespace: Namespace | None = None,
+        strategies: list[MemoryStrategy] | None = None,
+        min_score: float | None = None,
+        limit: int = 10,
+        after: datetime | None = None,
+        before: datetime | None = None,
+        **options: Any,
+    ) -> RecallResult:
+        """Semantic search over long-term memories.
+
+        Args:
+            container_id: Container ID.
+            query: Search query.
+            namespace: Optional namespace filter.
+            strategies: Filter by strategies.
+            min_score: Minimum similarity score.
+            limit: Maximum results.
+            after: Filter by created after.
+            before: Filter by created before.
+            **options: Backend-specific options.
+
+        Returns:
+            Recall result with memory entries.
+        """
+        ...
+
+    async def get_memory(
+        self,
+        container_id: str,
+        memory_id: str,
+        memory_type: MemoryType,
+        **options: Any,
+    ) -> MemoryEntry | None:
+        """Get specific memory by ID.
+
+        For sessions, use ISessionManager.get_session().
+        For history, use IHistoryRetriever.get_history_entry().
+
+        Args:
+            container_id: Container ID.
+            memory_id: Memory document ID.
+            memory_type: Memory type (WORKING or LONG_TERM).
+            **options: Backend-specific options.
+
+        Returns:
+            Memory entry or None.
+        """
+        ...
+
+    async def delete_memory(
+        self,
+        container_id: str,
+        memory_id: str,
+        memory_type: MemoryType,
+        **options: Any,
+    ) -> bool:
+        """Delete specific memory.
+
+        Args:
+            container_id: Container ID.
+            memory_id: Memory document ID.
+            memory_type: Memory type (WORKING or LONG_TERM).
+            **options: Backend-specific options.
+
+        Returns:
+            True if deleted.
+        """
+        ...
+
+    async def delete_memories(
+        self,
+        container_id: str,
+        session_id: str | None = None,
+        namespace: Namespace | None = None,
+        before: datetime | None = None,
+        **options: Any,
+    ) -> int:
+        """Delete memories by filter.
+
+        Args:
+            container_id: Container ID.
+            session_id: Filter by session.
+            namespace: Filter by namespace.
+            before: Delete before timestamp.
+            **options: Backend-specific options.
+
+        Returns:
+            Number deleted.
+        """
+        ...
+
+    async def update_memory(
+        self,
+        container_id: str,
+        memory_id: str,
+        memory_type: MemoryType,
+        *,
+        memory: str | None = None,
+        tags: dict[str, str] | None = None,
+        **options: Any,
+    ) -> MemoryEntry:
+        """Update a specific memory.
+
+        Note: History memory type does NOT support updates.
+
+        Args:
+            container_id: Container ID.
+            memory_id: Memory document ID.
+            memory_type: Memory type (working, long-term, sessions).
+            memory: Updated memory content (for long-term).
+            tags: Updated tags.
+            **options: Backend-specific options.
+
+        Returns:
+            Updated memory entry.
+        """
+        ...
+
+    async def delete_by_query(
+        self,
+        container_id: str,
+        memory_type: MemoryType,
+        query: dict[str, Any],
+        **options: Any,
+    ) -> int:
+        """Delete memories matching an OpenSearch Query DSL query.
+
+        Provides full flexibility for complex deletion criteria.
+
+        Args:
+            container_id: Container ID.
+            memory_type: Memory type to delete from.
+            query: OpenSearch Query DSL query.
+            **options: Backend-specific options.
+
+        Returns:
+            Number of documents deleted.
+        """
+        ...
+
+
+@runtime_checkable
+class IHistoryRetriever(Protocol):
+    """Protocol for retrieving memory history (audit trail). READ-ONLY.
+
+    History is READ-ONLY. Updates and deletes are NOT supported.
+    """
+
+    async def get_history_entry(
+        self,
+        container_id: str,
+        history_id: str,
+        **options: Any,
+    ) -> HistoryEntry | None:
+        """Get a specific history entry by ID.
+
+        Args:
+            container_id: Container ID.
+            history_id: History entry ID.
+            **options: Backend-specific options.
+
+        Returns:
+            History entry or None.
+        """
+        ...
+
+    async def list_history(
+        self,
+        container_id: str,
+        memory_id: str | None = None,
+        namespace: Namespace | None = None,
+        limit: int = 100,
+        **options: Any,
+    ) -> list[HistoryEntry]:
+        """List history entries.
+
+        Args:
+            container_id: Container ID.
+            memory_id: Filter by specific memory ID.
+            namespace: Filter by namespace.
+            limit: Maximum entries to return.
+            **options: Backend-specific options.
+
+        Returns:
+            List of history entries.
+        """
+        ...
+
+
+@runtime_checkable
+class ISessionManager(Protocol):
+    """Protocol for session management.
+
+    Responsible for session lifecycle operations.
+    """
+
+    async def create_session(
+        self,
+        container_id: str,
+        *,
+        session_id: str | None = None,
+        summary: str | None = None,
+        namespace: Namespace | None = None,
+        metadata: dict[str, Any] | None = None,
+        **options: Any,
+    ) -> SessionInfo:
+        """Create a new session.
+
+        Args:
+            container_id: Container ID.
+            session_id: Custom session ID (auto-generated if not provided).
+            summary: Session summary text.
+            namespace: Session namespace.
+            metadata: Custom metadata (stored as additional_info).
+            **options: Backend-specific options.
+
+        Returns:
+            Created session info.
+        """
+        ...
+
+    async def get_session(
+        self,
+        container_id: str,
+        session_id: str,
+        include_messages: bool = False,
+        message_limit: int = 50,
+        **options: Any,
+    ) -> SessionInfo | None:
+        """Get session by ID.
+
+        Args:
+            container_id: Container ID.
+            session_id: Session ID.
+            include_messages: Include session messages.
+            message_limit: Max messages to include.
+            **options: Backend-specific options.
+
+        Returns:
+            Session info or None.
+        """
+        ...
+
+    async def list_sessions(
+        self,
+        container_id: str,
+        namespace: Namespace | None = None,
+        limit: int = 100,
+        **options: Any,
+    ) -> list[SessionInfo]:
+        """List sessions.
+
+        Args:
+            container_id: Container ID.
+            namespace: Filter by namespace.
+            limit: Maximum to return.
+            **options: Backend-specific options.
+
+        Returns:
+            List of session info.
+        """
+        ...
+
+    async def update_session(
+        self,
+        container_id: str,
+        session_id: str,
+        *,
+        summary: str | None = None,
+        metadata: dict[str, Any] | None = None,
+        **options: Any,
+    ) -> SessionInfo:
+        """Update a session.
+
+        Use this to update session summary or metadata.
+        Note: There is no explicit "end session" API in OpenSearch.
+
+        Args:
+            container_id: Container ID.
+            session_id: Session ID.
+            summary: Updated summary text.
+            metadata: Updated metadata (additional_info).
+            **options: Backend-specific options.
+
+        Returns:
+            Updated session info.
+        """
+        ...
+
+    async def delete_session(
+        self,
+        container_id: str,
+        session_id: str,
+        **options: Any,
+    ) -> bool:
+        """Delete a session.
+
+        Args:
+            container_id: Container ID.
+            session_id: Session ID.
+            **options: Backend-specific options.
+
+        Returns:
+            True if deleted.
+        """
+        ...
+
+
+@runtime_checkable
+class IMemoryStats(Protocol):
+    """Protocol for memory statistics."""
+
+    async def get_stats(
+        self,
+        container_id: str,
+        **options: Any,
+    ) -> MemoryStats:
+        """Get container statistics.
+
+        Args:
+            container_id: Container ID.
+            **options: Backend-specific options.
+
+        Returns:
+            Memory statistics.
+        """
+        ...
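
For orientation, here is a minimal sketch (not part of the package) of how these `runtime_checkable` protocols are meant to be consumed: a backend class satisfies a protocol structurally, without inheriting from it, and `isinstance()` can verify the method surface at runtime. `FakeStatsBackend` is invented for illustration; only the import path and `IMemoryStats` come from the diff above, and the snippet assumes the 0.4.0 wheel is installed.

```python
from typing import Any

# Import path as introduced by this release (see the new module above).
from gnosisllm_knowledge.core.interfaces.memory import IMemoryStats


class FakeStatsBackend:
    """Hypothetical backend; satisfies IMemoryStats structurally, no inheritance."""

    async def get_stats(self, container_id: str, **options: Any):
        # A real backend would aggregate counts from its store; this is a stub.
        raise NotImplementedError("illustrative stub")


# @runtime_checkable protocols allow duck-typed isinstance() checks
# (method presence only, not signatures).
assert isinstance(FakeStatsBackend(), IMemoryStats)
```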
gnosisllm_knowledge/core/interfaces/searcher.py
@@ -1,4 +1,10 @@
-"""Knowledge searcher protocol - Interface Segregation Principle."""
+"""Knowledge searcher protocol - Interface Segregation Principle.
+
+Note:
+    This library is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation (e.g., `knowledge-{account_id}`). Searcher implementations should
+    not include tenant filtering logic - callers should use tenant-specific indices.
+"""
 
 from __future__ import annotations
 
@@ -12,6 +18,9 @@ if TYPE_CHECKING:
 class IKnowledgeSearcher(Protocol):
     """Protocol for searching documents in a search backend.
 
+    This protocol is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation by using tenant-specific index names.
+
     Knowledge searchers are responsible for:
     - Executing different search modes (semantic, keyword, hybrid)
     - Generating embeddings for queries
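
The tenant-isolation note above leaves index naming to the caller. A hypothetical helper (not in the package) following the `knowledge-{account_id}` convention from the docstring might look like this:

```python
# Hypothetical helper, not part of gnosisllm-knowledge: derive a per-tenant
# index name so multi-tenancy stays at the index level, as the note advises.
def tenant_index(account_id: str, prefix: str = "knowledge") -> str:
    return f"{prefix}-{account_id}"


assert tenant_index("acme") == "knowledge-acme"
```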
gnosisllm_knowledge/core/interfaces/streaming.py
@@ -0,0 +1,133 @@
+"""Streaming interfaces for memory-efficient processing.
+
+These protocols define contracts for streaming operations that process
+data in bounded batches rather than loading everything into memory.
+
+Note:
+    This library is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation (e.g., `knowledge-{account_id}`). Streaming implementations should
+    not include tenant filtering logic - callers should use tenant-specific indices.
+"""
+
+from __future__ import annotations
+
+from collections.abc import AsyncIterator, Awaitable, Callable
+from typing import TYPE_CHECKING, Any, Protocol
+
+if TYPE_CHECKING:
+    from gnosisllm_knowledge.core.domain.document import Document
+    from gnosisllm_knowledge.core.domain.result import IndexResult
+
+
+class IStreamingUrlDiscoverer(Protocol):
+    """Protocol for streaming URL discovery.
+
+    Implementations yield URLs as they are discovered rather than
+    collecting all URLs first. This enables processing to begin
+    immediately and keeps memory usage bounded.
+
+    Example:
+        ```python
+        discoverer: IStreamingUrlDiscoverer = StreamingSitemapDiscoverer()
+
+        async for url_batch in discoverer.discover_urls_streaming(
+            source="https://example.com/sitemap.xml",
+            batch_size=50,
+        ):
+            for url in url_batch:
+                await process(url)
+        ```
+    """
+
+    async def discover_urls_streaming(
+        self,
+        source: str,
+        batch_size: int = 100,
+        **options: Any,
+    ) -> AsyncIterator[list[str]]:
+        """Yield batches of discovered URLs.
+
+        Args:
+            source: The sitemap or source URL.
+            batch_size: Number of URLs per batch.
+            **options: Discoverer-specific options (max_urls, patterns, etc.)
+
+        Yields:
+            Batches of discovered URLs as they're found.
+        """
+        ...
+
+
+class IStreamingLoader(Protocol):
+    """Protocol for streaming content loading.
+
+    Processes URLs in bounded batches with immediate indexing,
+    preventing memory accumulation.
+    """
+
+    async def load_streaming_with_indexing(
+        self,
+        source: str,
+        index_callback: Callable[[list[Document]], Awaitable[IndexResult]],
+        url_batch_size: int = 50,
+        doc_batch_size: int = 100,
+        **options: Any,
+    ) -> IndexResult:
+        """Load and index with streaming, calling callback for each batch.
+
+        This method:
+        1. Discovers URLs in batches (not all at once)
+        2. Fetches content for each URL batch
+        3. Indexes documents immediately after fetching
+        4. Moves to next batch only after indexing completes
+
+        Memory usage is bounded by:
+        - url_batch_size * avg_url_length (URL strings)
+        - doc_batch_size * avg_doc_size (document content)
+        - fetch_concurrency * avg_page_size (in-flight fetches)
+
+        Args:
+            source: Source URL.
+            index_callback: Called with each batch of documents to index.
+            url_batch_size: URLs to process per iteration.
+            doc_batch_size: Documents per index batch.
+            **options: Additional options.
+
+        Returns:
+            Aggregated index result.
+        """
+        ...
+
+
+class IStreamingPipeline(Protocol):
+    """Protocol for streaming indexing pipelines.
+
+    This protocol is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation by using tenant-specific index names.
+
+    Orchestrates the full streaming load -> index pipeline with
+    bounded memory guarantees.
+    """
+
+    async def execute(
+        self,
+        source: str,
+        index_name: str,
+        *,
+        collection_id: str | None = None,
+        source_id: str | None = None,
+        **options: Any,
+    ) -> IndexResult:
+        """Execute the streaming pipeline.
+
+        Args:
+            source: Sitemap URL.
+            index_name: Target OpenSearch index (use tenant-specific name).
+            collection_id: Collection within the index.
+            source_id: Source identifier.
+            **options: Additional loader options.
+
+        Returns:
+            Aggregated index result.
+        """
+        ...
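
As a rough illustration of the streaming contract, the self-contained sketch below satisfies `IStreamingUrlDiscoverer` structurally by yielding bounded batches from a fixed URL list; a real implementation, like the sitemap discoverer referenced in the docstring example, would parse the sitemap at `source`. `ListUrlDiscoverer` and its URL list are invented for this example.

```python
import asyncio
from collections.abc import AsyncIterator
from typing import Any


class ListUrlDiscoverer:
    """Toy discoverer: batches a fixed URL list instead of parsing a sitemap."""

    def __init__(self, urls: list[str]) -> None:
        self._urls = urls

    async def discover_urls_streaming(
        self,
        source: str,
        batch_size: int = 100,
        **options: Any,
    ) -> AsyncIterator[list[str]]:
        # Yield bounded batches so the consumer can start indexing immediately.
        for start in range(0, len(self._urls), batch_size):
            yield self._urls[start : start + batch_size]


async def main() -> None:
    discoverer = ListUrlDiscoverer([f"https://example.com/page/{i}" for i in range(5)])
    async for batch in discoverer.discover_urls_streaming("unused-source", batch_size=2):
        print(batch)  # two URLs per batch, final batch holds the remainder


asyncio.run(main())
```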
gnosisllm_knowledge/core/streaming/__init__.py
@@ -0,0 +1,36 @@
+"""Streaming pipeline configuration and utilities.
+
+This module provides infrastructure for memory-efficient streaming
+pipelines with bounded queues and backpressure support.
+
+Example:
+    ```python
+    from gnosisllm_knowledge.core.streaming import (
+        PipelineConfig,
+        BoundedQueue,
+        BatchCollector,
+    )
+
+    # Configure pipeline for large sitemap processing
+    config = PipelineConfig(
+        url_batch_size=50,
+        fetch_concurrency=10,
+        index_batch_size=100,
+    )
+
+    # Use bounded queue for backpressure
+    queue: BoundedQueue[str] = BoundedQueue(maxsize=100)
+    ```
+"""
+
+from gnosisllm_knowledge.core.streaming.pipeline import (
+    BatchCollector,
+    BoundedQueue,
+    PipelineConfig,
+)
+
+__all__ = [
+    "BatchCollector",
+    "BoundedQueue",
+    "PipelineConfig",
+]
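
The `BoundedQueue` and `BatchCollector` implementations live in `gnosisllm_knowledge/core/streaming/pipeline.py`, which is not shown in this diff. Purely to illustrate the backpressure idea the docstring describes, the sketch below uses a plain `asyncio.Queue` with `maxsize` so a fast URL producer blocks until the slower indexing consumer catches up; it does not use the package's classes, and the producer/consumer names are made up.

```python
import asyncio

STOP = ""  # sentinel for this sketch only


async def produce_urls(queue: asyncio.Queue, urls: list[str]) -> None:
    for url in urls:
        await queue.put(url)  # blocks once the queue is full -> backpressure
    await queue.put(STOP)


async def index_urls(queue: asyncio.Queue) -> None:
    while True:
        url = await queue.get()
        if url == STOP:
            break
        await asyncio.sleep(0.01)  # stand-in for fetch + index work
        print("indexed", url)


async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue(maxsize=100)  # bounded queue
    await asyncio.gather(
        produce_urls(queue, [f"https://example.com/p/{i}" for i in range(10)]),
        index_urls(queue),
    )


asyncio.run(main())
```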