gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. gnosisllm_knowledge/__init__.py +91 -39
  2. gnosisllm_knowledge/api/__init__.py +3 -2
  3. gnosisllm_knowledge/api/knowledge.py +287 -7
  4. gnosisllm_knowledge/api/memory.py +966 -0
  5. gnosisllm_knowledge/backends/__init__.py +14 -5
  6. gnosisllm_knowledge/backends/opensearch/agentic.py +341 -39
  7. gnosisllm_knowledge/backends/opensearch/config.py +49 -28
  8. gnosisllm_knowledge/backends/opensearch/indexer.py +1 -0
  9. gnosisllm_knowledge/backends/opensearch/mappings.py +2 -1
  10. gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
  11. gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
  12. gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
  13. gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +235 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
  16. gnosisllm_knowledge/cli/app.py +378 -12
  17. gnosisllm_knowledge/cli/commands/agentic.py +11 -0
  18. gnosisllm_knowledge/cli/commands/memory.py +723 -0
  19. gnosisllm_knowledge/cli/commands/setup.py +24 -22
  20. gnosisllm_knowledge/cli/display/service.py +43 -0
  21. gnosisllm_knowledge/cli/utils/config.py +58 -0
  22. gnosisllm_knowledge/core/domain/__init__.py +41 -0
  23. gnosisllm_knowledge/core/domain/document.py +5 -0
  24. gnosisllm_knowledge/core/domain/memory.py +440 -0
  25. gnosisllm_knowledge/core/domain/result.py +11 -3
  26. gnosisllm_knowledge/core/domain/search.py +2 -0
  27. gnosisllm_knowledge/core/events/types.py +76 -0
  28. gnosisllm_knowledge/core/exceptions.py +134 -0
  29. gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
  30. gnosisllm_knowledge/core/interfaces/memory.py +524 -0
  31. gnosisllm_knowledge/core/interfaces/streaming.py +127 -0
  32. gnosisllm_knowledge/core/streaming/__init__.py +36 -0
  33. gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
  34. gnosisllm_knowledge/loaders/base.py +3 -4
  35. gnosisllm_knowledge/loaders/sitemap.py +129 -1
  36. gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
  37. gnosisllm_knowledge/services/indexing.py +67 -75
  38. gnosisllm_knowledge/services/search.py +47 -11
  39. gnosisllm_knowledge/services/streaming_pipeline.py +302 -0
  40. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/METADATA +44 -1
  41. gnosisllm_knowledge-0.3.0.dist-info/RECORD +77 -0
  42. gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
  43. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/WHEEL +0 -0
  44. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/entry_points.txt +0 -0
@@ -38,7 +38,7 @@ Features:
38
38
  - SOLID principles throughout
39
39
  """
40
40
 
41
- from gnosisllm_knowledge.api import Knowledge
41
+ from gnosisllm_knowledge.api import Knowledge, Memory
42
42
  from gnosisllm_knowledge.backends import (
43
43
  AgenticSearchFallback,
44
44
  MemoryIndexer,
@@ -51,6 +51,20 @@ from gnosisllm_knowledge.backends import (
51
51
  )
52
52
  from gnosisllm_knowledge.chunking import FixedSizeChunker, SentenceChunker
53
53
  from gnosisllm_knowledge.core.domain.document import Document, DocumentStatus, TextChunk
54
+ from gnosisllm_knowledge.core.domain.memory import (
55
+ ContainerConfig,
56
+ ContainerInfo,
57
+ HistoryEntry,
58
+ MemoryEntry,
59
+ MemoryStats,
60
+ MemoryStrategy,
61
+ MemoryType,
62
+ Message,
63
+ Namespace,
64
+ RecallResult,
65
+ SessionInfo,
66
+ StrategyConfig,
67
+ )
54
68
  from gnosisllm_knowledge.core.domain.result import (
55
69
  BatchResult,
56
70
  IndexResult,
@@ -58,9 +72,9 @@ from gnosisllm_knowledge.core.domain.result import (
58
72
  ValidationResult,
59
73
  )
60
74
  from gnosisllm_knowledge.core.domain.search import (
61
- AgentType,
62
75
  AgenticSearchQuery,
63
76
  AgenticSearchResult,
77
+ AgentType,
64
78
  ReasoningStep,
65
79
  SearchMode,
66
80
  SearchQuery,
@@ -72,15 +86,27 @@ from gnosisllm_knowledge.core.exceptions import (
72
86
  AgenticSearchError,
73
87
  ConfigurationError,
74
88
  ConnectionError,
89
+ ContainerExistsError,
90
+ ContainerNotFoundError,
75
91
  IndexError,
92
+ InferenceError,
93
+ InferenceTimeoutError,
76
94
  KnowledgeError,
77
95
  LoadError,
96
+ MemoryConfigurationError,
97
+ MemoryError,
78
98
  SearchError,
99
+ SessionNotFoundError,
100
+ )
101
+ from gnosisllm_knowledge.core.streaming import (
102
+ BatchCollector,
103
+ BoundedQueue,
104
+ PipelineConfig,
79
105
  )
80
106
  from gnosisllm_knowledge.fetchers import (
81
107
  HTTPContentFetcher,
82
- NeoreaderContentFetcher,
83
108
  NeoreaderConfig,
109
+ NeoreaderContentFetcher,
84
110
  )
85
111
  from gnosisllm_knowledge.loaders import (
86
112
  LoaderFactory,
@@ -95,58 +121,84 @@ from gnosisllm_knowledge.services import (
95
121
  __version__ = "0.2.0"
96
122
 
97
123
  __all__ = [
98
- # Main API
99
- "Knowledge",
100
- # Domain Models
101
- "Document",
102
- "DocumentStatus",
103
- "TextChunk",
104
- "SearchQuery",
105
- "SearchResult",
106
- "SearchResultItem",
107
- "SearchMode",
108
124
  "AgentType",
125
+ "AgenticSearchError",
126
+ "AgenticSearchFallback",
109
127
  "AgenticSearchQuery",
110
128
  "AgenticSearchResult",
111
- "ReasoningStep",
112
- "LoadResult",
113
- "IndexResult",
129
+ "BatchCollector",
114
130
  "BatchResult",
115
- "ValidationResult",
131
+ "BoundedQueue",
132
+ "ConfigurationError",
133
+ "ConnectionError",
134
+ "ContainerConfig",
135
+ "ContainerExistsError",
136
+ "ContainerInfo",
137
+ "ContainerNotFoundError",
138
+ # Domain Models
139
+ "Document",
140
+ "DocumentStatus",
116
141
  # Events
117
142
  "Event",
118
- "EventType",
119
143
  "EventEmitter",
144
+ "EventType",
145
+ "FixedSizeChunker",
146
+ # Fetchers
147
+ "HTTPContentFetcher",
148
+ "HistoryEntry",
149
+ "IndexError",
150
+ "IndexResult",
151
+ "InferenceError",
152
+ "InferenceTimeoutError",
153
+ # Main API
154
+ "Knowledge",
120
155
  # Exceptions
121
156
  "KnowledgeError",
122
- "ConfigurationError",
123
- "ConnectionError",
157
+ # Services
158
+ "KnowledgeIndexingService",
159
+ "KnowledgeSearchService",
124
160
  "LoadError",
125
- "IndexError",
126
- "SearchError",
127
- "AgenticSearchError",
161
+ "LoadResult",
128
162
  # Loaders
129
163
  "LoaderFactory",
130
- "WebsiteLoader",
131
- "SitemapLoader",
132
- # Fetchers
133
- "HTTPContentFetcher",
134
- "NeoreaderContentFetcher",
164
+ "Memory",
165
+ "MemoryConfigurationError",
166
+ "MemoryEntry",
167
+ # Memory Exceptions
168
+ "MemoryError",
169
+ # Memory Backend (for testing)
170
+ "MemoryIndexer",
171
+ "MemorySearcher",
172
+ "MemoryStats",
173
+ # Memory Domain Models
174
+ "MemoryStrategy",
175
+ "MemoryType",
176
+ "Message",
177
+ "Namespace",
135
178
  "NeoreaderConfig",
136
- # Chunkers
137
- "SentenceChunker",
138
- "FixedSizeChunker",
179
+ "NeoreaderContentFetcher",
180
+ "OpenSearchAgenticSearcher",
139
181
  # OpenSearch Backend
140
182
  "OpenSearchConfig",
141
183
  "OpenSearchIndexer",
142
184
  "OpenSearchKnowledgeSearcher",
143
185
  "OpenSearchSetupAdapter",
144
- "OpenSearchAgenticSearcher",
145
- "AgenticSearchFallback",
146
- # Memory Backend (for testing)
147
- "MemoryIndexer",
148
- "MemorySearcher",
149
- # Services
150
- "KnowledgeIndexingService",
151
- "KnowledgeSearchService",
186
+ # Streaming Pipeline
187
+ "PipelineConfig",
188
+ "ReasoningStep",
189
+ "RecallResult",
190
+ "SearchError",
191
+ "SearchMode",
192
+ "SearchQuery",
193
+ "SearchResult",
194
+ "SearchResultItem",
195
+ # Chunkers
196
+ "SentenceChunker",
197
+ "SessionInfo",
198
+ "SessionNotFoundError",
199
+ "SitemapLoader",
200
+ "StrategyConfig",
201
+ "TextChunk",
202
+ "ValidationResult",
203
+ "WebsiteLoader",
152
204
  ]
@@ -1,5 +1,6 @@
1
- """High-level API for knowledge operations."""
1
+ """High-level API for knowledge and memory operations."""
2
2
 
3
3
  from gnosisllm_knowledge.api.knowledge import Knowledge
4
+ from gnosisllm_knowledge.api.memory import Memory
4
5
 
5
- __all__ = ["Knowledge"]
6
+ __all__ = ["Knowledge", "Memory"]
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import logging
6
+ from collections.abc import Callable
6
7
  from typing import TYPE_CHECKING, Any
7
8
 
8
9
  from gnosisllm_knowledge.backends.opensearch import (
@@ -11,15 +12,24 @@ from gnosisllm_knowledge.backends.opensearch import (
11
12
  OpenSearchKnowledgeSearcher,
12
13
  OpenSearchSetupAdapter,
13
14
  )
15
+ from gnosisllm_knowledge.backends.opensearch.agentic import OpenSearchAgenticSearcher
14
16
  from gnosisllm_knowledge.chunking import SentenceChunker
15
17
  from gnosisllm_knowledge.core.domain.result import IndexResult
16
- from gnosisllm_knowledge.core.domain.search import SearchMode, SearchResult
18
+ from gnosisllm_knowledge.core.domain.search import (
19
+ AgentType,
20
+ AgenticSearchQuery,
21
+ AgenticSearchResult,
22
+ SearchMode,
23
+ SearchResult,
24
+ )
17
25
  from gnosisllm_knowledge.core.events.emitter import EventEmitter
18
26
  from gnosisllm_knowledge.core.interfaces.setup import DiagnosticReport, HealthReport
27
+ from gnosisllm_knowledge.core.streaming.pipeline import PipelineConfig
19
28
  from gnosisllm_knowledge.fetchers import NeoreaderContentFetcher
20
29
  from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
21
30
  from gnosisllm_knowledge.loaders import LoaderFactory
22
31
  from gnosisllm_knowledge.services import KnowledgeIndexingService, KnowledgeSearchService
32
+ from gnosisllm_knowledge.services.streaming_pipeline import StreamingIndexingPipeline
23
33
 
24
34
  if TYPE_CHECKING:
25
35
  from opensearchpy import AsyncOpenSearch
@@ -159,11 +169,12 @@ class Knowledge:
159
169
  **kwargs,
160
170
  )
161
171
 
162
- # Create client
172
+ # Create client with proper timeout settings
163
173
  client_kwargs: dict[str, Any] = {
164
174
  "hosts": [{"host": config.host, "port": config.port}],
165
175
  "use_ssl": config.use_ssl,
166
176
  "verify_certs": config.verify_certs,
177
+ "timeout": max(config.read_timeout, config.agentic_timeout_seconds),
167
178
  }
168
179
 
169
180
  if config.username and config.password:
@@ -181,11 +192,16 @@ class Knowledge:
181
192
  # Create fetcher
182
193
  fetcher = None
183
194
  if neoreader_url:
184
- neoreader_config = NeoreaderConfig(base_url=neoreader_url)
195
+ neoreader_config = NeoreaderConfig(host=neoreader_url)
185
196
  fetcher = NeoreaderContentFetcher(neoreader_config)
186
197
 
187
- # Create loader factory
188
- loader_factory = LoaderFactory(default_fetcher=fetcher)
198
+ # Create chunker
199
+ chunker = SentenceChunker()
200
+
201
+ # Create loader factory (fetcher is optional, defaults will be used if None)
202
+ loader_factory = None
203
+ if fetcher:
204
+ loader_factory = LoaderFactory(fetcher=fetcher, chunker=chunker)
189
205
 
190
206
  return cls(
191
207
  indexer=indexer,
@@ -335,9 +351,9 @@ class Knowledge:
335
351
 
336
352
  # Auto-detect or use explicit source type
337
353
  if source_type:
338
- loader = self._loader_factory.create(source_type, self._fetcher)
354
+ loader = self._loader_factory.create(source_type)
339
355
  else:
340
- loader = self._loader_factory.create_for_source(source, self._fetcher)
356
+ loader = self._loader_factory.create_for_source(source)
341
357
 
342
358
  # Create service for this load operation
343
359
  service = KnowledgeIndexingService(
@@ -356,6 +372,95 @@ class Knowledge:
356
372
  **options,
357
373
  )
358
374
 
375
+ async def load_streaming(
376
+ self,
377
+ source: str,
378
+ *,
379
+ index_name: str | None = None,
380
+ account_id: str | None = None,
381
+ collection_id: str | None = None,
382
+ collection_name: str | None = None,
383
+ source_id: str | None = None,
384
+ url_batch_size: int = 50,
385
+ fetch_concurrency: int = 10,
386
+ index_batch_size: int = 100,
387
+ on_progress: Callable[[int, int], None] | None = None,
388
+ **options: Any,
389
+ ) -> IndexResult:
390
+ """Load and index content using streaming pipeline with bounded memory.
391
+
392
+ This method is optimized for large sitemaps (10,000+ URLs) that would
393
+ otherwise exhaust memory. It processes URLs in batches, indexing
394
+ documents immediately rather than loading all content first.
395
+
396
+ Memory usage is bounded and independent of sitemap size:
397
+ - URL storage: O(url_batch_size)
398
+ - Document storage: O(index_batch_size)
399
+ - In-flight fetches: O(fetch_concurrency * avg_page_size)
400
+
401
+ Args:
402
+ source: Sitemap URL.
403
+ index_name: Target index (uses default if not provided).
404
+ account_id: Account ID for multi-tenancy.
405
+ collection_id: Collection ID.
406
+ collection_name: Collection name for display.
407
+ source_id: Source ID (auto-generated if not provided).
408
+ url_batch_size: URLs to discover per batch (default 50).
409
+ fetch_concurrency: Parallel URL fetches (default 10).
410
+ index_batch_size: Documents per index batch (default 100).
411
+ on_progress: Optional progress callback (urls_processed, docs_indexed).
412
+ **options: Additional loading options (max_urls, patterns, etc.).
413
+
414
+ Returns:
415
+ Index result with counts.
416
+
417
+ Example:
418
+ ```python
419
+ # Efficiently load 100k+ URL sitemap
420
+ result = await knowledge.load_streaming(
421
+ "https://large-site.com/sitemap.xml",
422
+ url_batch_size=100,
423
+ fetch_concurrency=20,
424
+ max_urls=50000,
425
+ )
426
+ print(f"Indexed {result.indexed_count} documents")
427
+ ```
428
+ """
429
+ if self._loader_factory is None:
430
+ raise ValueError("Loader factory not configured")
431
+
432
+ index = index_name or self._default_index
433
+ if not index:
434
+ raise ValueError("No index specified and no default index configured")
435
+
436
+ # Create sitemap loader specifically for streaming
437
+ loader = self._loader_factory.create("sitemap")
438
+
439
+ # Configure pipeline
440
+ config = PipelineConfig(
441
+ url_batch_size=url_batch_size,
442
+ fetch_concurrency=fetch_concurrency,
443
+ index_batch_size=index_batch_size,
444
+ )
445
+
446
+ # Create streaming pipeline
447
+ pipeline = StreamingIndexingPipeline(
448
+ loader=loader,
449
+ indexer=self._indexer,
450
+ config=config,
451
+ events=self._events,
452
+ )
453
+
454
+ return await pipeline.execute(
455
+ source=source,
456
+ index_name=index,
457
+ account_id=account_id,
458
+ collection_id=collection_id,
459
+ collection_name=collection_name,
460
+ source_id=source_id,
461
+ **options,
462
+ )
463
+
359
464
  # === Search Methods ===
360
465
 
361
466
  async def search(
@@ -542,6 +647,181 @@ class Knowledge:
542
647
  collection_id=collection_id,
543
648
  )
544
649
 
650
+ # === Collection and Stats Methods ===
651
+
652
+ async def get_collections(self) -> list[dict[str, Any]]:
653
+ """Get all collections with document counts.
654
+
655
+ Aggregates unique collection_ids from indexed documents.
656
+
657
+ Returns:
658
+ List of collection dictionaries with id, name, and document_count.
659
+ """
660
+ return await self.search_service.get_collections()
661
+
662
+ async def get_stats(self) -> dict[str, Any]:
663
+ """Get index statistics.
664
+
665
+ Returns:
666
+ Dictionary with document_count, index_name, and other stats.
667
+ """
668
+ return await self.search_service.get_stats()
669
+
670
+ async def list_documents(
671
+ self,
672
+ *,
673
+ source_id: str | None = None,
674
+ collection_id: str | None = None,
675
+ limit: int = 50,
676
+ offset: int = 0,
677
+ ) -> dict[str, Any]:
678
+ """List documents with optional filters.
679
+
680
+ Args:
681
+ source_id: Optional source ID filter.
682
+ collection_id: Optional collection ID filter.
683
+ limit: Maximum documents to return (max 100).
684
+ offset: Number of documents to skip.
685
+
686
+ Returns:
687
+ Dictionary with documents, total, limit, offset.
688
+ """
689
+ index = self._default_index
690
+ if not index:
691
+ raise ValueError("No default index configured")
692
+
693
+ # Clamp limit to reasonable bounds
694
+ limit = min(max(1, limit), 100)
695
+ offset = max(0, offset)
696
+
697
+ return await self._searcher.list_documents(
698
+ index_name=index,
699
+ source_id=source_id,
700
+ collection_id=collection_id,
701
+ limit=limit,
702
+ offset=offset,
703
+ )
704
+
705
+ # === Agentic Search Status ===
706
+
707
+ @property
708
+ def is_agentic_configured(self) -> bool:
709
+ """Check if agentic search is configured.
710
+
711
+ Returns:
712
+ True if at least one agent type is configured.
713
+ """
714
+ if not hasattr(self, '_searcher') or not hasattr(self._searcher, '_config'):
715
+ return False
716
+ config = self._searcher._config
717
+ return bool(config.flow_agent_id or config.conversational_agent_id)
718
+
719
+ async def get_agentic_status(self) -> dict[str, Any]:
720
+ """Get status of agentic search configuration.
721
+
722
+ Returns:
723
+ Dictionary with agent availability status:
724
+ - available: True if any agent is configured
725
+ - flow_agent: True if flow agent is configured
726
+ - conversational_agent: True if conversational agent is configured
727
+ """
728
+ if not hasattr(self, '_searcher') or not hasattr(self._searcher, '_config'):
729
+ return {
730
+ "available": False,
731
+ "flow_agent": False,
732
+ "conversational_agent": False,
733
+ }
734
+
735
+ config = self._searcher._config
736
+ return {
737
+ "available": bool(config.flow_agent_id or config.conversational_agent_id),
738
+ "flow_agent": bool(config.flow_agent_id),
739
+ "conversational_agent": bool(config.conversational_agent_id),
740
+ }
741
+
742
+ async def agentic_search(
743
+ self,
744
+ query: str,
745
+ *,
746
+ agent_type: AgentType = AgentType.FLOW,
747
+ index_name: str | None = None,
748
+ collection_ids: list[str] | None = None,
749
+ source_ids: list[str] | None = None,
750
+ conversation_id: str | None = None,
751
+ include_reasoning: bool = True,
752
+ limit: int = 10,
753
+ **options: Any,
754
+ ) -> AgenticSearchResult:
755
+ """Execute agentic search with AI-powered reasoning.
756
+
757
+ Uses OpenSearch ML agents to understand queries, retrieve relevant
758
+ documents, and generate natural language answers.
759
+
760
+ Args:
761
+ query: Search query text.
762
+ agent_type: Type of agent (FLOW for fast RAG, CONVERSATIONAL for multi-turn).
763
+ index_name: Index to search (uses default if not provided).
764
+ collection_ids: Filter by collection IDs.
765
+ source_ids: Filter by source IDs.
766
+ conversation_id: Conversation ID for multi-turn (conversational agent).
767
+ include_reasoning: Include reasoning steps in response.
768
+ limit: Maximum source documents to retrieve.
769
+ **options: Additional agent options.
770
+
771
+ Returns:
772
+ AgenticSearchResult with answer, reasoning steps, and sources.
773
+
774
+ Raises:
775
+ AgenticSearchError: If agent execution fails.
776
+ ValueError: If agentic search is not configured.
777
+
778
+ Example:
779
+ ```python
780
+ result = await knowledge.agentic_search(
781
+ "How does authentication work?",
782
+ agent_type=AgentType.FLOW,
783
+ )
784
+ print(result.answer)
785
+ for source in result.items:
786
+ print(f"- {source.title}")
787
+ ```
788
+ """
789
+ # Check if agentic search is configured
790
+ if not self.is_agentic_configured:
791
+ raise ValueError(
792
+ "Agentic search is not configured. "
793
+ "Run 'gnosisllm-knowledge agentic setup' and set agent IDs in environment."
794
+ )
795
+
796
+ # Get client and config from the searcher
797
+ if not hasattr(self._searcher, '_client') or not hasattr(self._searcher, '_config'):
798
+ raise ValueError("Searcher does not have OpenSearch client/config")
799
+
800
+ client = self._searcher._client
801
+ config = self._searcher._config
802
+
803
+ # Create agentic searcher
804
+ agentic_searcher = OpenSearchAgenticSearcher(client, config)
805
+
806
+ # Build agentic query
807
+ agentic_query = AgenticSearchQuery(
808
+ text=query,
809
+ agent_type=agent_type,
810
+ collection_ids=collection_ids,
811
+ source_ids=source_ids,
812
+ conversation_id=conversation_id,
813
+ include_reasoning=include_reasoning,
814
+ limit=limit,
815
+ )
816
+
817
+ # Determine index name
818
+ index = index_name or self._default_index
819
+ if not index:
820
+ raise ValueError("No index specified and no default index configured")
821
+
822
+ # Execute agentic search
823
+ return await agentic_searcher.agentic_search(agentic_query, index, **options)
824
+
545
825
  async def close(self) -> None:
546
826
  """Close connections and clean up resources."""
547
827
  # Subclasses or future implementations can override this