PyPI - gnosisllm-knowledge - Versions diffs - 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

gnosisllm_knowledge/api/knowledge.py +225 -35
gnosisllm_knowledge/backends/memory/indexer.py +27 -2
gnosisllm_knowledge/backends/memory/searcher.py +111 -10
gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
gnosisllm_knowledge/backends/opensearch/searcher.py +9 -6
gnosisllm_knowledge/cli/app.py +58 -19
gnosisllm_knowledge/cli/commands/agentic.py +15 -9
gnosisllm_knowledge/cli/commands/load.py +169 -19
gnosisllm_knowledge/cli/commands/memory.py +10 -0
gnosisllm_knowledge/cli/commands/search.py +9 -10
gnosisllm_knowledge/cli/commands/setup.py +25 -1
gnosisllm_knowledge/cli/utils/config.py +4 -4
gnosisllm_knowledge/core/domain/__init__.py +13 -0
gnosisllm_knowledge/core/domain/discovery.py +166 -0
gnosisllm_knowledge/core/domain/document.py +14 -19
gnosisllm_knowledge/core/domain/search.py +10 -25
gnosisllm_knowledge/core/domain/source.py +11 -12
gnosisllm_knowledge/core/events/__init__.py +8 -0
gnosisllm_knowledge/core/events/types.py +122 -5
gnosisllm_knowledge/core/exceptions.py +93 -0
gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
gnosisllm_knowledge/fetchers/__init__.py +8 -0
gnosisllm_knowledge/fetchers/config.py +27 -0
gnosisllm_knowledge/fetchers/neoreader.py +31 -3
gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
gnosisllm_knowledge/loaders/__init__.py +5 -1
gnosisllm_knowledge/loaders/discovery.py +338 -0
gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
gnosisllm_knowledge/loaders/factory.py +46 -0
gnosisllm_knowledge/services/indexing.py +35 -20
gnosisllm_knowledge/services/search.py +37 -20
gnosisllm_knowledge/services/streaming_pipeline.py +39 -7
{gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +30 -10
gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
{gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
{gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0

gnosisllm_knowledge/api/knowledge.py CHANGED Viewed

@@ -1,4 +1,39 @@
-"""High-level Knowledge API facade."""
+"""High-level Knowledge API facade.
+This module provides the main entry point for the gnosisllm-knowledge library.
+The Knowledge class is a high-level facade that abstracts the complexity of
+loading, indexing, and searching knowledge documents.
+Note:
+    This library is tenant-agnostic. Multi-tenancy should be handled at the
+    API layer by using separate indices per account (e.g.,
+    `knowledge-{account_id}`) rather than filtering by account_id.
+Example:
+    ```python
+    # Create Knowledge instance for a specific tenant
+    knowledge = Knowledge.from_opensearch(
+        host="localhost",
+        port=9200,
+    )
+    # Use a tenant-specific index
+    tenant_index = f"knowledge-{account_id}"
+    # Load content
+    await knowledge.load(
+        "https://docs.example.com/sitemap.xml",
+        index_name=tenant_index,
+        collection_id="docs",
+    )
+    # Search (tenant isolation via index name)
+    results = await knowledge.search(
+        "how to configure",
+        index_name=tenant_index,
+    )
+    ```
+"""
 from __future__ import annotations
@@ -130,6 +165,10 @@ class Knowledge:
     ) -> Knowledge:
         """Create Knowledge instance with OpenSearch backend.
+        This factory creates a Knowledge instance configured for OpenSearch.
+        The returned instance is tenant-agnostic - multi-tenancy should be
+        handled by using separate indices per account.
         Args:
             host: OpenSearch host.
             port: OpenSearch port.
@@ -147,6 +186,19 @@ class Knowledge:
         Note:
             Embeddings are generated automatically by OpenSearch ingest pipeline.
             Run 'gnosisllm-knowledge setup' to configure the ML model.
+        Example:
+            ```python
+            # Create a Knowledge instance
+            knowledge = Knowledge.from_opensearch(
+                host="localhost",
+                port=9200,
+            )
+            # Use tenant-specific index for isolation
+            tenant_index = f"gnosisllm-{account_id}-knowledge"
+            await knowledge.load(source, index_name=tenant_index)
+            ```
         """
         # Import OpenSearch client
         try:
@@ -216,15 +268,29 @@ class Knowledge:
     def from_env(cls) -> Knowledge:
         """Create Knowledge instance from environment variables.
+        This factory creates a Knowledge instance using configuration from
+        environment variables. The returned instance is tenant-agnostic -
+        multi-tenancy should be handled by using separate indices per account.
         Returns:
             Configured Knowledge instance.
+        Example:
+            ```python
+            # Create from environment
+            knowledge = Knowledge.from_env()
+            # Use tenant-specific index for isolation
+            tenant_index = f"gnosisllm-{account_id}-knowledge"
+            await knowledge.search("query", index_name=tenant_index)
+            ```
         """
         config = OpenSearchConfig.from_env()
         neoreader_config = NeoreaderConfig.from_env()
         return cls.from_opensearch(
             config=config,
-            neoreader_url=neoreader_config.base_url if neoreader_config.base_url else None,
+            neoreader_url=neoreader_config.host if neoreader_config.host else None,
         )
     @property
@@ -318,7 +384,6 @@ class Knowledge:
         source: str,
         *,
         index_name: str | None = None,
-        account_id: str | None = None,
         collection_id: str | None = None,
         source_id: str | None = None,
         source_type: str | None = None,
@@ -329,10 +394,13 @@ class Knowledge:
         Automatically detects source type (sitemap, website, etc.).
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            by using separate indices per account.
         Args:
             source: Source URL or path.
-            index_name: Target index (uses default if not provided).
-            account_id: Account ID for multi-tenancy.
+            index_name: Target index (use tenant-specific name for isolation).
             collection_id: Collection ID.
             source_id: Source ID (auto-generated if not provided).
             source_type: Explicit source type (auto-detected if not provided).
@@ -366,7 +434,6 @@ class Knowledge:
         return await service.load_and_index(
             source=source,
             index_name=index,
-            account_id=account_id,
             collection_id=collection_id,
             source_id=source_id,
             **options,
@@ -377,7 +444,6 @@ class Knowledge:
         source: str,
         *,
         index_name: str | None = None,
-        account_id: str | None = None,
         collection_id: str | None = None,
         collection_name: str | None = None,
         source_id: str | None = None,
@@ -398,10 +464,13 @@ class Knowledge:
         - Document storage: O(index_batch_size)
         - In-flight fetches: O(fetch_concurrency * avg_page_size)
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            by using separate indices per account.
         Args:
             source: Sitemap URL.
-            index_name: Target index (uses default if not provided).
-            account_id: Account ID for multi-tenancy.
+            index_name: Target index (use tenant-specific name for isolation).
             collection_id: Collection ID.
             collection_name: Collection name for display.
             source_id: Source ID (auto-generated if not provided).
@@ -419,6 +488,7 @@ class Knowledge:
             # Efficiently load 100k+ URL sitemap
             result = await knowledge.load_streaming(
                 "https://large-site.com/sitemap.xml",
+                index_name="knowledge-account123",  # Tenant-specific
                 url_batch_size=100,
                 fetch_concurrency=20,
                 max_urls=50000,
@@ -454,7 +524,6 @@ class Knowledge:
         return await pipeline.execute(
             source=source,
             index_name=index,
-            account_id=account_id,
             collection_id=collection_id,
             collection_name=collection_name,
             source_id=source_id,
@@ -471,7 +540,6 @@ class Knowledge:
         mode: SearchMode = SearchMode.HYBRID,
         limit: int = 10,
         offset: int = 0,
-        account_id: str | None = None,
         collection_ids: list[str] | None = None,
         source_ids: list[str] | None = None,
         min_score: float | None = None,
@@ -479,13 +547,16 @@ class Knowledge:
     ) -> SearchResult:
         """Search for knowledge documents.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            by using separate indices per account.
         Args:
             query: Search query text.
-            index_name: Index to search (uses default if not provided).
+            index_name: Index to search (use tenant-specific name for isolation).
             mode: Search mode (semantic, keyword, hybrid).
             limit: Maximum results.
             offset: Result offset for pagination.
-            account_id: Account ID for multi-tenancy.
             collection_ids: Filter by collection IDs.
             source_ids: Filter by source IDs.
             min_score: Minimum score threshold.
@@ -500,7 +571,6 @@ class Knowledge:
             mode=mode,
             limit=limit,
             offset=offset,
-            account_id=account_id,
             collection_ids=collection_ids,
             source_ids=source_ids,
             min_score=min_score,
@@ -578,19 +648,73 @@ class Knowledge:
     # === Management Methods ===
+    async def get_document(
+        self,
+        document_id: str,
+        *,
+        index_name: str | None = None,
+    ) -> dict[str, Any] | None:
+        """Get a single document by ID.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            by using separate indices per account.
+        Args:
+            document_id: Document ID to retrieve.
+            index_name: Index name (use tenant-specific name for isolation).
+                Uses default index if not provided.
+        Returns:
+            Document dict with all fields (excluding embeddings) or None if not found.
+        """
+        index = index_name or self._default_index
+        if not index:
+            raise ValueError("No index specified and no default index configured")
+        return await self._indexer.get(document_id, index)
+    async def delete_document(
+        self,
+        document_id: str,
+        *,
+        index_name: str | None = None,
+    ) -> bool:
+        """Delete a single document by ID.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            by using separate indices per account.
+        Args:
+            document_id: Document ID to delete.
+            index_name: Index name (use tenant-specific name for isolation).
+                Uses default index if not provided.
+        Returns:
+            True if deleted, False if not found.
+        """
+        index = index_name or self._default_index
+        if not index:
+            raise ValueError("No index specified and no default index configured")
+        return await self._indexer.delete(document_id, index)
     async def delete_source(
         self,
         source_id: str,
         *,
         index_name: str | None = None,
-        account_id: str | None = None,
     ) -> int:
         """Delete all documents from a source.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            by using separate indices per account.
         Args:
             source_id: Source ID to delete.
-            index_name: Index name.
-            account_id: Account ID for multi-tenancy.
+            index_name: Index name (use tenant-specific name for isolation).
         Returns:
             Count of deleted documents.
@@ -599,21 +723,23 @@ class Knowledge:
         if not index:
             raise ValueError("No index specified")
-        return await self.indexing.delete_source(source_id, index, account_id)
+        return await self.indexing.delete_source(source_id, index)
     async def delete_collection(
         self,
         collection_id: str,
         *,
         index_name: str | None = None,
-        account_id: str | None = None,
     ) -> int:
         """Delete all documents from a collection.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            by using separate indices per account.
         Args:
             collection_id: Collection ID to delete.
-            index_name: Index name.
-            account_id: Account ID for multi-tenancy.
+            index_name: Index name (use tenant-specific name for isolation).
         Returns:
             Count of deleted documents.
@@ -622,54 +748,85 @@ class Knowledge:
         if not index:
             raise ValueError("No index specified")
-        return await self.indexing.delete_collection(collection_id, index, account_id)
+        return await self.indexing.delete_collection(collection_id, index)
     async def count(
         self,
         *,
         index_name: str | None = None,
-        account_id: str | None = None,
         collection_id: str | None = None,
+        source_id: str | None = None,
     ) -> int:
         """Count documents.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            by using separate indices per account.
         Args:
-            index_name: Index to count.
-            account_id: Filter by account.
+            index_name: Index to count (use tenant-specific name for isolation).
             collection_id: Filter by collection.
+            source_id: Filter by source (for source deletion confirmation).
         Returns:
             Document count.
         """
         return await self.search_service.count(
             index_name=index_name,
-            account_id=account_id,
             collection_id=collection_id,
+            source_id=source_id,
         )
     # === Collection and Stats Methods ===
-    async def get_collections(self) -> list[dict[str, Any]]:
+    async def get_collections(
+        self,
+        *,
+        index_name: str | None = None,
+    ) -> list[dict[str, Any]]:
         """Get all collections with document counts.
         Aggregates unique collection_ids from indexed documents.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            by using separate indices per account.
+        Args:
+            index_name: Index to query (use tenant-specific name for isolation).
+                Uses default index if not provided.
         Returns:
             List of collection dictionaries with id, name, and document_count.
         """
-        return await self.search_service.get_collections()
+        index = index_name or self._default_index
+        return await self.search_service.get_collections(index_name=index)
-    async def get_stats(self) -> dict[str, Any]:
+    async def get_stats(
+        self,
+        *,
+        index_name: str | None = None,
+    ) -> dict[str, Any]:
         """Get index statistics.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            by using separate indices per account.
+        Args:
+            index_name: Index to query (use tenant-specific name for isolation).
+                Uses default index if not provided.
         Returns:
             Dictionary with document_count, index_name, and other stats.
         """
-        return await self.search_service.get_stats()
+        index = index_name or self._default_index
+        return await self.search_service.get_stats(index_name=index)
     async def list_documents(
         self,
         *,
+        index_name: str | None = None,
         source_id: str | None = None,
         collection_id: str | None = None,
         limit: int = 50,
@@ -677,7 +834,13 @@ class Knowledge:
     ) -> dict[str, Any]:
         """List documents with optional filters.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            by using separate indices per account.
         Args:
+            index_name: Index to query (use tenant-specific name for isolation).
+                Uses default index if not provided.
             source_id: Optional source ID filter.
             collection_id: Optional collection ID filter.
             limit: Maximum documents to return (max 100).
@@ -686,9 +849,9 @@ class Knowledge:
         Returns:
             Dictionary with documents, total, limit, offset.
         """
-        index = self._default_index
+        index = index_name or self._default_index
         if not index:
-            raise ValueError("No default index configured")
+            raise ValueError("No index specified and no default index configured")
         # Clamp limit to reasonable bounds
         limit = min(max(1, limit), 100)
@@ -823,6 +986,33 @@ class Knowledge:
         return await agentic_searcher.agentic_search(agentic_query, index, **options)
     async def close(self) -> None:
-        """Close connections and clean up resources."""
-        # Subclasses or future implementations can override this
-        pass
+        """Close connections and clean up resources.
+        Closes the underlying AsyncOpenSearch client to prevent
+        unclosed aiohttp session warnings. Properly handles
+        CancelledError during event loop shutdown.
+        """
+        import asyncio
+        # Close the OpenSearch client via the searcher
+        # Note: indexer, searcher, and setup share the same client instance,
+        # so closing via searcher is sufficient
+        if hasattr(self._searcher, '_client') and self._searcher._client is not None:
+            client = self._searcher._client
+            try:
+                await client.close()
+                logger.debug("Closed OpenSearch client connection")
+            except asyncio.CancelledError:
+                # Event loop is shutting down - this is expected during cleanup
+                logger.debug("OpenSearch client close cancelled (event loop shutting down)")
+            except Exception as e:
+                logger.warning(f"Error closing OpenSearch client: {e}")
+            finally:
+                # Clear client reference on all components that share it
+                # This prevents any accidental reuse after close
+                if hasattr(self._searcher, '_client'):
+                    self._searcher._client = None
+                if hasattr(self._indexer, '_client'):
+                    self._indexer._client = None
+                if self._setup and hasattr(self._setup, '_client'):
+                    self._setup._client = None

gnosisllm_knowledge/backends/memory/indexer.py CHANGED Viewed

@@ -1,4 +1,10 @@
-"""In-memory document indexer for testing."""
+"""In-memory document indexer for testing.
+Note:
+    This library is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation (e.g., `knowledge-{account_id}`). The memory indexer does not
+    include tenant filtering logic - use separate index names per tenant.
+"""
 from __future__ import annotations
@@ -14,6 +20,9 @@ from gnosisllm_knowledge.core.domain.result import BatchResult, IndexResult
 class MemoryIndexer:
     """In-memory document indexer for testing.
+    This indexer is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation by using tenant-specific index names.
     Stores documents in a dictionary for fast testing without
     requiring an external OpenSearch instance.
@@ -185,6 +194,22 @@ class MemoryIndexer:
         """
         return await self.index(document, index_name)
+    async def get(
+        self,
+        doc_id: str,
+        index_name: str,
+    ) -> dict[str, Any] | None:
+        """Get a document by ID (async interface).
+        Args:
+            doc_id: Document ID.
+            index_name: Index name.
+        Returns:
+            Document dictionary or None if not found.
+        """
+        return self._indices.get(index_name, {}).get(doc_id)
     async def delete(
         self,
         doc_id: str,
@@ -365,8 +390,8 @@ class MemoryIndexer:
             "url": document.url,
             "title": document.title,
             "source": document.source,
-            "account_id": document.account_id,
             "collection_id": document.collection_id,
+            "collection_name": document.collection_name,
             "source_id": document.source_id,
             "chunk_index": document.chunk_index,
             "total_chunks": document.total_chunks,

gnosisllm_knowledge/backends/memory/searcher.py CHANGED Viewed

@@ -1,10 +1,16 @@
-"""In-memory document searcher for testing."""
+"""In-memory document searcher for testing.
+Note: This module is tenant-agnostic. Multi-tenancy should be handled
+at the API layer by using separate indices per account (e.g.,
+gnosisllm-{account_id}-knowledge) rather than filtering by account_id.
+"""
 from __future__ import annotations
 import math
 import re
 import time
+import warnings
 from typing import Any, Callable
 from gnosisllm_knowledge.backends.memory.indexer import MemoryIndexer
@@ -147,7 +153,7 @@ class MemorySearcher:
         for doc in filtered_docs:
             content = doc.get("content", "").lower()
-            title = doc.get("title", "").lower()
+            title = (doc.get("title") or "").lower()
             # Simple TF scoring
             content_score = sum(
@@ -209,7 +215,7 @@ class MemorySearcher:
         for doc in filtered_docs:
             # Keyword score
             content = doc.get("content", "").lower()
-            title = doc.get("title", "").lower()
+            title = (doc.get("title") or "").lower()
             keyword_score = sum(content.count(term) for term in query_terms)
             keyword_score += sum(title.count(term) for term in query_terms) * 2
@@ -348,6 +354,101 @@ class MemorySearcher:
             results.append(result)
         return results
+    async def list_documents(
+        self,
+        index_name: str,
+        *,
+        source_id: str | None = None,
+        collection_id: str | None = None,
+        limit: int = 50,
+        offset: int = 0,
+    ) -> dict[str, Any]:
+        """List documents with optional filters.
+        Args:
+            index_name: Index to query.
+            source_id: Optional source ID filter.
+            collection_id: Optional collection ID filter.
+            limit: Maximum documents to return.
+            offset: Number of documents to skip.
+        Returns:
+            Dictionary with documents, total, limit, offset.
+        """
+        documents = self._indexer.get_all(index_name)
+        # Apply filters
+        if source_id:
+            documents = [d for d in documents if d.get("source_id") == source_id]
+        if collection_id:
+            documents = [d for d in documents if d.get("collection_id") == collection_id]
+        total = len(documents)
+        # Apply pagination
+        paginated = documents[offset : offset + limit]
+        return {
+            "documents": paginated,
+            "total": total,
+            "limit": limit,
+            "offset": offset,
+        }
+    def count(self, index_name: str) -> int:
+        """Count documents in index.
+        Args:
+            index_name: Index to count.
+        Returns:
+            Document count.
+        """
+        return self._indexer.count(index_name)
+    async def get_collections(self, index_name: str) -> list[dict[str, Any]]:
+        """Get unique collections with document counts.
+        Args:
+            index_name: Index to query.
+        Returns:
+            List of collections with id, name, and document_count.
+        """
+        documents = self._indexer.get_all(index_name)
+        collections: dict[str, dict[str, Any]] = {}
+        for doc in documents:
+            col_id = doc.get("collection_id")
+            if not col_id:
+                continue
+            if col_id not in collections:
+                collections[col_id] = {
+                    "id": col_id,
+                    "name": doc.get("collection_name") or col_id,
+                    "document_count": 0,
+                }
+            collections[col_id]["document_count"] += 1
+        return list(collections.values())
+    async def get_stats(self, index_name: str) -> dict[str, Any]:
+        """Get index statistics.
+        Args:
+            index_name: Index to query.
+        Returns:
+            Dictionary with document_count and index info.
+        """
+        count = self._indexer.count(index_name)
+        return {
+            "document_count": count,
+            "index_name": index_name,
+            "exists": count > 0 or index_name in self._indexer._indices,
+        }
     def _apply_filters(
         self,
         documents: list[dict[str, Any]],
@@ -355,6 +456,10 @@ class MemorySearcher:
     ) -> list[dict[str, Any]]:
         """Apply query filters to documents.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             documents: Documents to filter.
             query: Query with filter parameters.
@@ -364,10 +469,6 @@ class MemorySearcher:
         """
         filtered = documents
-        # Account filter
-        if query.account_id:
-            filtered = [d for d in filtered if d.get("account_id") == query.account_id]
         # Collection filter
         if query.collection_ids:
             filtered = [
@@ -378,9 +479,9 @@ class MemorySearcher:
         if query.source_ids:
             filtered = [d for d in filtered if d.get("source_id") in query.source_ids]
-        # Custom filters
-        if query.filters:
-            for field, value in query.filters.items():
+        # Custom metadata filters
+        if query.metadata_filters:
+            for field, value in query.metadata_filters.items():
                 if isinstance(value, list):
                     filtered = [d for d in filtered if d.get(field) in value]
                 else:

gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl