PyPI - gnosisllm-knowledge - Versions diffs - 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl - Mend

gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)

gnosisllm_knowledge/api/knowledge.py +233 -35
gnosisllm_knowledge/backends/memory/indexer.py +27 -2
gnosisllm_knowledge/backends/memory/searcher.py +132 -10
gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
gnosisllm_knowledge/backends/opensearch/config.py +7 -0
gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
gnosisllm_knowledge/backends/opensearch/searcher.py +64 -6
gnosisllm_knowledge/backends/opensearch/setup.py +29 -33
gnosisllm_knowledge/cli/app.py +58 -19
gnosisllm_knowledge/cli/commands/agentic.py +15 -9
gnosisllm_knowledge/cli/commands/load.py +169 -19
gnosisllm_knowledge/cli/commands/memory.py +10 -0
gnosisllm_knowledge/cli/commands/search.py +9 -10
gnosisllm_knowledge/cli/commands/setup.py +25 -1
gnosisllm_knowledge/cli/utils/config.py +4 -4
gnosisllm_knowledge/core/domain/__init__.py +13 -0
gnosisllm_knowledge/core/domain/discovery.py +166 -0
gnosisllm_knowledge/core/domain/document.py +14 -19
gnosisllm_knowledge/core/domain/search.py +10 -25
gnosisllm_knowledge/core/domain/source.py +11 -12
gnosisllm_knowledge/core/events/__init__.py +8 -0
gnosisllm_knowledge/core/events/types.py +122 -5
gnosisllm_knowledge/core/exceptions.py +93 -0
gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
gnosisllm_knowledge/core/interfaces/searcher.py +30 -1
gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
gnosisllm_knowledge/fetchers/__init__.py +8 -0
gnosisllm_knowledge/fetchers/config.py +27 -0
gnosisllm_knowledge/fetchers/neoreader.py +31 -3
gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
gnosisllm_knowledge/loaders/__init__.py +5 -1
gnosisllm_knowledge/loaders/discovery.py +338 -0
gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
gnosisllm_knowledge/loaders/factory.py +46 -0
gnosisllm_knowledge/services/indexing.py +51 -21
gnosisllm_knowledge/services/search.py +42 -28
gnosisllm_knowledge/services/streaming_pipeline.py +45 -7
{gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/METADATA +30 -10
gnosisllm_knowledge-0.4.3.dist-info/RECORD +81 -0
gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
{gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/WHEEL +0 -0
{gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/entry_points.txt +0 -0

gnosisllm_knowledge/backends/memory/searcher.py CHANGED Viewed

@@ -1,10 +1,16 @@
-"""In-memory document searcher for testing."""
+"""In-memory document searcher for testing.
+Note: This module is tenant-agnostic. Multi-tenancy should be handled
+at the API layer by using separate indices per account (e.g.,
+gnosisllm-{account_id}-knowledge) rather than filtering by account_id.
+"""
 from __future__ import annotations
 import math
 import re
 import time
+import warnings
 from typing import Any, Callable
 from gnosisllm_knowledge.backends.memory.indexer import MemoryIndexer
@@ -147,7 +153,7 @@ class MemorySearcher:
         for doc in filtered_docs:
             content = doc.get("content", "").lower()
-            title = doc.get("title", "").lower()
+            title = (doc.get("title") or "").lower()
             # Simple TF scoring
             content_score = sum(
@@ -209,7 +215,7 @@ class MemorySearcher:
         for doc in filtered_docs:
             # Keyword score
             content = doc.get("content", "").lower()
-            title = doc.get("title", "").lower()
+            title = (doc.get("title") or "").lower()
             keyword_score = sum(content.count(term) for term in query_terms)
             keyword_score += sum(title.count(term) for term in query_terms) * 2
@@ -348,6 +354,122 @@ class MemorySearcher:
             results.append(result)
         return results
+    async def list_documents(
+        self,
+        index_name: str,
+        *,
+        source_id: str | None = None,
+        collection_id: str | None = None,
+        limit: int = 50,
+        offset: int = 0,
+    ) -> dict[str, Any]:
+        """List documents with optional filters.
+        Args:
+            index_name: Index to query.
+            source_id: Optional source ID filter.
+            collection_id: Optional collection ID filter.
+            limit: Maximum documents to return.
+            offset: Number of documents to skip.
+        Returns:
+            Dictionary with documents, total, limit, offset.
+        """
+        documents = self._indexer.get_all(index_name)
+        # Apply filters
+        if source_id:
+            documents = [d for d in documents if d.get("source_id") == source_id]
+        if collection_id:
+            documents = [d for d in documents if d.get("collection_id") == collection_id]
+        total = len(documents)
+        # Apply pagination
+        paginated = documents[offset : offset + limit]
+        return {
+            "documents": paginated,
+            "total": total,
+            "limit": limit,
+            "offset": offset,
+        }
+    async def count(
+        self,
+        index_name: str,
+        collection_id: str | None = None,
+        source_id: str | None = None,
+    ) -> int:
+        """Count documents in index with optional filters.
+        Args:
+            index_name: Index to count.
+            collection_id: Filter by collection.
+            source_id: Filter by source.
+        Returns:
+            Document count.
+        """
+        # Use efficient O(1) count when no filters
+        if not collection_id and not source_id:
+            return self._indexer.count(index_name)
+        # With filters, iterate over index values (memory backend is for testing only)
+        index_data = self._indexer._indices.get(index_name, {})
+        count = 0
+        for doc in index_data.values():
+            if collection_id and doc.get("collection_id") != collection_id:
+                continue
+            if source_id and doc.get("source_id") != source_id:
+                continue
+            count += 1
+        return count
+    async def get_collections(self, index_name: str) -> list[dict[str, Any]]:
+        """Get unique collections with document counts.
+        Args:
+            index_name: Index to query.
+        Returns:
+            List of collections with id, name, and document_count.
+        """
+        documents = self._indexer.get_all(index_name)
+        collections: dict[str, dict[str, Any]] = {}
+        for doc in documents:
+            col_id = doc.get("collection_id")
+            if not col_id:
+                continue
+            if col_id not in collections:
+                collections[col_id] = {
+                    "id": col_id,
+                    "name": doc.get("collection_name") or col_id,
+                    "document_count": 0,
+                }
+            collections[col_id]["document_count"] += 1
+        return list(collections.values())
+    async def get_stats(self, index_name: str) -> dict[str, Any]:
+        """Get index statistics.
+        Args:
+            index_name: Index to query.
+        Returns:
+            Dictionary with document_count and index info.
+        """
+        count = self._indexer.count(index_name)
+        return {
+            "document_count": count,
+            "index_name": index_name,
+            "exists": count > 0 or index_name in self._indexer._indices,
+        }
     def _apply_filters(
         self,
         documents: list[dict[str, Any]],
@@ -355,6 +477,10 @@ class MemorySearcher:
     ) -> list[dict[str, Any]]:
         """Apply query filters to documents.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             documents: Documents to filter.
             query: Query with filter parameters.
@@ -364,10 +490,6 @@ class MemorySearcher:
         """
         filtered = documents
-        # Account filter
-        if query.account_id:
-            filtered = [d for d in filtered if d.get("account_id") == query.account_id]
         # Collection filter
         if query.collection_ids:
             filtered = [
@@ -378,9 +500,9 @@ class MemorySearcher:
         if query.source_ids:
             filtered = [d for d in filtered if d.get("source_id") in query.source_ids]
-        # Custom filters
-        if query.filters:
-            for field, value in query.filters.items():
+        # Custom metadata filters
+        if query.metadata_filters:
+            for field, value in query.metadata_filters.items():
                 if isinstance(value, list):
                     filtered = [d for d in filtered if d.get(field) in value]
                 else:

gnosisllm_knowledge/backends/opensearch/agentic.py CHANGED Viewed

@@ -2,6 +2,12 @@
 Uses OpenSearch ML agents for AI-powered search with reasoning capabilities.
 Supports flow agents (fast RAG) and conversational agents (multi-turn with memory).
+Note:
+    This module is **tenant-agnostic**. Multi-tenancy is achieved through index isolation:
+    each tenant's data resides in a separate OpenSearch index. The caller (e.g., gnosisllm-api)
+    is responsible for constructing the appropriate index name (e.g., `knowledge-{account_id}`).
+    The library operates on the provided index without any tenant-specific filtering logic.
 """
 from __future__ import annotations
@@ -9,7 +15,6 @@ from __future__ import annotations
 import asyncio
 import json
 import logging
-import uuid
 from datetime import UTC, datetime
 from typing import TYPE_CHECKING, Any
@@ -297,13 +302,15 @@ class OpenSearchAgenticSearcher:
     async def list_conversations(
         self,
-        account_id: str | None = None,
         limit: int = 100,
     ) -> list[dict[str, Any]]:
         """List active conversations.
+        Note:
+            This library is tenant-agnostic. Multi-tenancy is achieved through
+            index isolation (separate index per account).
         Args:
-            account_id: Filter by account (multi-tenant).
             limit: Maximum number of conversations.
         Returns:
@@ -311,8 +318,6 @@ class OpenSearchAgenticSearcher:
         """
         try:
             body: dict[str, Any] = {"size": limit}
-            if account_id:
-                body["query"] = {"term": {"account_id": account_id}}
             response = await self._client.transport.perform_request(
                 "POST",
@@ -365,16 +370,18 @@ class OpenSearchAgenticSearcher:
     async def create_conversation(
         self,
         name: str | None = None,
-        account_id: str | None = None,
     ) -> str | None:
         """Create a new conversation memory.
         Uses the OpenSearch Memory API to create a conversation memory.
         The endpoint is POST /_plugins/_ml/memory (introduced in 2.12).
+        Note:
+            This library is tenant-agnostic. Multi-tenancy is achieved through
+            index isolation (separate index per account).
         Args:
             name: Optional name for the conversation.
-            account_id: Optional account ID for multi-tenancy.
         Returns:
             The new conversation/memory ID, or None if creation fails.
@@ -382,8 +389,6 @@ class OpenSearchAgenticSearcher:
         body: dict[str, Any] = {}
         if name:
             body["name"] = name
-        if account_id:
-            body["account_id"] = account_id
         try:
             # POST /_plugins/_ml/memory creates a new memory (OpenSearch 2.12+)

gnosisllm_knowledge/backends/opensearch/config.py CHANGED Viewed

@@ -109,6 +109,11 @@ class OpenSearchConfig:
     bulk_batch_size: int = 500
     bulk_max_concurrent: int = 3
+    # === Indexing Service ===
+    # Batch size for progressive indexing during load operations
+    # Documents are indexed in batches of this size as they stream in
+    indexing_batch_size: int = 10
     @property
     def url(self) -> str:
         """Get the full OpenSearch URL."""
@@ -213,4 +218,6 @@ class OpenSearchConfig:
             # === Bulk Indexing ===
             bulk_batch_size=int(os.getenv("OPENSEARCH_BULK_BATCH_SIZE", "500")),
             bulk_max_concurrent=int(os.getenv("OPENSEARCH_BULK_MAX_CONCURRENT", "3")),
+            # === Indexing Service ===
+            indexing_batch_size=int(os.getenv("GNOSISLLM_INDEXING_BATCH_SIZE", "10")),
         )

gnosisllm_knowledge/backends/opensearch/indexer.py CHANGED Viewed

@@ -87,13 +87,15 @@ class OpenSearchIndexer:
             # Embeddings are generated by OpenSearch ingest pipeline
             doc_body = self._prepare_document(document)
-            # Index the document
+            # Index the document with ingest pipeline for embedding generation
             refresh = options.get("refresh", False)
+            pipeline = self._config.ingest_pipeline_name
             await self._client.index(
                 index=index_name,
                 id=document.doc_id,
                 body=doc_body,
                 refresh=refresh,
+                pipeline=pipeline,
             )
             return IndexResult(
@@ -272,6 +274,43 @@ class OpenSearchIndexer:
             failed_count=0,
         )
+    async def get(
+        self,
+        doc_id: str,
+        index_name: str,
+    ) -> dict[str, Any] | None:
+        """Get a document by ID.
+        Uses OpenSearch client's direct get() API (CRUD operation, not search).
+        Args:
+            doc_id: Document ID to retrieve.
+            index_name: Index name.
+        Returns:
+            Document dict (source fields) or None if not found.
+            Excludes embeddings from response for efficiency.
+        """
+        try:
+            response = await self._client.get(
+                index=index_name,
+                id=doc_id,
+                _source_excludes=["content_embedding"],
+            )
+            source = response.get("_source", {})
+            # Include the document ID in the response
+            source["id"] = response.get("_id", doc_id)
+            return source
+        except Exception as e:
+            if "not_found" in str(e).lower():
+                return None
+            logger.error(f"Failed to get document {doc_id}: {e}")
+            raise IndexError(
+                message=f"Failed to get document: {e}",
+                details={"document_id": doc_id},
+                cause=e,
+            ) from e
     async def delete(
         self,
         doc_id: str,
@@ -434,7 +473,9 @@ class OpenSearchIndexer:
         if not actions:
             return IndexResult(success=True, index_name=index_name, indexed_count=0, failed_count=0)
-        response = await self._client.bulk(body=actions)
+        # Use ingest pipeline for embedding generation
+        pipeline = self._config.ingest_pipeline_name
+        response = await self._client.bulk(body=actions, pipeline=pipeline)
         indexed = 0
         failed = 0
@@ -460,6 +501,11 @@ class OpenSearchIndexer:
     def _prepare_document(self, document: Document) -> dict[str, Any]:
         """Prepare document for indexing.
+        Note:
+            This library is tenant-agnostic. Multi-tenancy is achieved through index
+            isolation. Tenant information should be passed in document.metadata if
+            needed for audit purposes.
         Args:
             document: Document to prepare.
@@ -479,7 +525,6 @@ class OpenSearchIndexer:
             "url": document.url,
             "title": document.title,
             "source": document.source,
-            "account_id": document.account_id,
             "collection_id": document.collection_id,
             "collection_name": document.collection_name,
             "source_id": document.source_id,

gnosisllm_knowledge/backends/opensearch/mappings.py CHANGED Viewed

@@ -1,4 +1,10 @@
-"""OpenSearch index mappings for knowledge documents."""
+"""OpenSearch index mappings for knowledge documents.
+Note:
+    This library is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation (e.g., `knowledge-{account_id}`). Index mappings do not include
+    tenant-specific fields like account_id.
+"""
 from __future__ import annotations
@@ -56,8 +62,7 @@ def get_knowledge_index_mappings(config: OpenSearchConfig) -> dict[str, Any]:
                 "fields": {"keyword": {"type": "keyword", "ignore_above": 512}},
             },
             "source": {"type": "keyword"},
-            # === Multi-tenant Fields ===
-            "account_id": {"type": "keyword"},
+            # === Collection Fields ===
             "collection_id": {"type": "keyword"},
             "collection_name": {"type": "keyword"},  # For aggregation display
             "source_id": {"type": "keyword"},
@@ -129,13 +134,16 @@ def get_memory_index_settings(config: OpenSearchConfig) -> dict[str, Any]:
 def get_memory_index_mappings() -> dict[str, Any]:
     """Get index mappings for conversation memory.
+    Note:
+        This library is tenant-agnostic. Multi-tenancy is achieved through index
+        isolation. Use tenant-specific index names for conversation memory.
     Returns:
         Index mappings dictionary.
     """
     return {
         "properties": {
             "conversation_id": {"type": "keyword"},
-            "account_id": {"type": "keyword"},
             "user_id": {"type": "keyword"},
             "message_index": {"type": "integer"},
             "role": {"type": "keyword"},  # user, assistant, system

gnosisllm_knowledge/backends/opensearch/queries.py CHANGED Viewed

@@ -2,6 +2,10 @@
 Uses OpenSearch neural search - embeddings are generated automatically
 via the deployed model. No Python-side embedding generation needed.
+Note: This module is tenant-agnostic. Multi-tenancy should be handled
+at the API layer by using separate indices per account (e.g.,
+`knowledge-{account_id}`) rather than filtering by account_id.
 """
 from __future__ import annotations
@@ -18,9 +22,13 @@ class QueryBuilder:
     model handles embedding generation automatically via ingest and
     search pipelines.
+    Note:
+        This builder is tenant-agnostic. Multi-tenancy should be handled
+        by using separate indices per account.
     Example:
         ```python
-        query = SearchQuery(text="how to configure", account_id="acc123")
+        query = SearchQuery(text="how to configure", collection_ids=["col-1"])
         builder = QueryBuilder(query, model_id="abc123")
         os_query = builder.build_hybrid_query()
         ```
@@ -204,12 +212,12 @@ class QueryBuilder:
             },
         }
-        # Apply filters at top level for hybrid
+        # Apply filters using post_filter for hybrid queries
+        # Hybrid queries cannot be wrapped in bool - they must be top-level
         filters = self._build_filters()
         if filters:
-            query["query"] = {
+            query["post_filter"] = {
                 "bool": {
-                    "must": [query["query"]],
                     "filter": filters,
                 }
             }
@@ -270,15 +278,15 @@ class QueryBuilder:
     def _build_filters(self) -> list[dict[str, Any]]:
         """Build filter clauses from query parameters.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Returns:
-            List of filter clauses.
+            List of filter clauses for collection, source, and metadata filters.
         """
         filters: list[dict[str, Any]] = []
-        # Multi-tenant filter (required for security)
-        if self._query.account_id:
-            filters.append({"term": {"account_id": self._query.account_id}})
         # Collection filter
         if self._query.collection_ids:
             filters.append({"terms": {"collection_id": self._query.collection_ids}})
@@ -357,67 +365,61 @@ class QueryBuilder:
         ]
-def build_delete_by_source_query(
-    source_id: str,
-    account_id: str | None = None,
-) -> dict[str, Any]:
+def build_delete_by_source_query(source_id: str) -> dict[str, Any]:
     """Build query to delete documents by source.
+    Note:
+        This function is tenant-agnostic. Multi-tenancy should be handled
+        at the API layer by using separate indices per account.
     Args:
         source_id: Source ID to delete.
-        account_id: Optional account filter for multi-tenancy.
     Returns:
         Delete-by-query dictionary.
     """
-    filters = [{"term": {"source_id": source_id}}]
-    if account_id:
-        filters.append({"term": {"account_id": account_id}})
     return {
         "query": {
             "bool": {
-                "filter": filters,
+                "filter": [{"term": {"source_id": source_id}}],
             }
         }
     }
-def build_delete_by_collection_query(
-    collection_id: str,
-    account_id: str | None = None,
-) -> dict[str, Any]:
+def build_delete_by_collection_query(collection_id: str) -> dict[str, Any]:
     """Build query to delete documents by collection.
+    Note:
+        This function is tenant-agnostic. Multi-tenancy should be handled
+        at the API layer by using separate indices per account.
     Args:
         collection_id: Collection ID to delete.
-        account_id: Optional account filter for multi-tenancy.
     Returns:
         Delete-by-query dictionary.
     """
-    filters = [{"term": {"collection_id": collection_id}}]
-    if account_id:
-        filters.append({"term": {"account_id": account_id}})
     return {
         "query": {
             "bool": {
-                "filter": filters,
+                "filter": [{"term": {"collection_id": collection_id}}],
             }
         }
     }
 def build_count_query(
-    account_id: str | None = None,
     collection_id: str | None = None,
     source_id: str | None = None,
 ) -> dict[str, Any]:
     """Build query to count documents.
+    Note:
+        This function is tenant-agnostic. Multi-tenancy should be handled
+        at the API layer by using separate indices per account.
     Args:
-        account_id: Optional account filter.
         collection_id: Optional collection filter.
         source_id: Optional source filter.
@@ -426,8 +428,6 @@ def build_count_query(
     """
     filters: list[dict[str, Any]] = []
-    if account_id:
-        filters.append({"term": {"account_id": account_id}})
     if collection_id:
         filters.append({"term": {"collection_id": collection_id}})
     if source_id:

gnosisllm_knowledge/backends/opensearch/searcher.py CHANGED Viewed

@@ -2,6 +2,10 @@
 Uses OpenSearch neural search - embeddings are generated automatically
 by the deployed ML model. No Python-side embedding generation needed.
+Note: This module is tenant-agnostic. Multi-tenancy should be handled
+at the API layer by using separate indices per account (e.g.,
+`knowledge-{account_id}`) rather than filtering by account_id.
 """
 from __future__ import annotations
@@ -502,11 +506,65 @@ class OpenSearchKnowledgeSearcher:
                 "error": str(e),
             }
+    async def count(
+        self,
+        index_name: str,
+        collection_id: str | None = None,
+        source_id: str | None = None,
+    ) -> int:
+        """Count documents in index with optional filters.
+        Uses native _count API instead of search for efficiency and to avoid
+        hybrid search issues with empty queries.
+        Args:
+            index_name: Index to query.
+            collection_id: Filter by collection.
+            source_id: Filter by source.
+        Returns:
+            Document count.
+        """
+        try:
+            # Check if index exists first
+            exists = await self._client.indices.exists(index=index_name)
+            if not exists:
+                logger.debug(f"Index {index_name} does not exist, returning count 0")
+                return 0
+            # Build query with optional filters
+            query: dict[str, Any] = {"match_all": {}}
+            filters = []
+            if collection_id:
+                filters.append({"term": {"collection_id": collection_id}})
+            if source_id:
+                filters.append({"term": {"source_id": source_id}})
+            if filters:
+                query = {"bool": {"filter": filters}}
+            # Use native _count API
+            response = await self._client.count(
+                index=index_name,
+                body={"query": query},
+            )
+            count = response.get("count", 0)
+            logger.debug(f"Count for {index_name}: {count} (collection={collection_id}, source={source_id})")
+            return count
+        except Exception as e:
+            logger.error(f"Failed to count documents in {index_name}: {e}")
+            raise SearchError(
+                message=f"Count failed: {e}",
+                details={"index": index_name, "collection_id": collection_id, "source_id": source_id},
+            ) from e
     async def list_documents(
         self,
         index_name: str,
         *,
-        account_id: str | None = None,
         source_id: str | None = None,
         collection_id: str | None = None,
         limit: int = 50,
@@ -514,9 +572,12 @@ class OpenSearchKnowledgeSearcher:
     ) -> dict[str, Any]:
         """List documents with optional filters.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
-            index_name: Index to query.
-            account_id: Optional account ID filter.
+            index_name: Index to query (use tenant-specific name for isolation).
             source_id: Optional source ID filter.
             collection_id: Optional collection ID filter.
             limit: Maximum documents to return.
@@ -540,9 +601,6 @@ class OpenSearchKnowledgeSearcher:
             # Build filter clauses
             filters: list[dict[str, Any]] = []
-            if account_id:
-                filters.append({"term": {"account_id": account_id}})
             if source_id:
                 filters.append({"term": {"source_id": source_id}})

gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl