PyPI - gnosisllm-knowledge - Versions diffs - 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl - Mend

gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)

gnosisllm_knowledge/api/knowledge.py +233 -35
gnosisllm_knowledge/backends/memory/indexer.py +27 -2
gnosisllm_knowledge/backends/memory/searcher.py +132 -10
gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
gnosisllm_knowledge/backends/opensearch/config.py +7 -0
gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
gnosisllm_knowledge/backends/opensearch/searcher.py +64 -6
gnosisllm_knowledge/backends/opensearch/setup.py +29 -33
gnosisllm_knowledge/cli/app.py +58 -19
gnosisllm_knowledge/cli/commands/agentic.py +15 -9
gnosisllm_knowledge/cli/commands/load.py +169 -19
gnosisllm_knowledge/cli/commands/memory.py +10 -0
gnosisllm_knowledge/cli/commands/search.py +9 -10
gnosisllm_knowledge/cli/commands/setup.py +25 -1
gnosisllm_knowledge/cli/utils/config.py +4 -4
gnosisllm_knowledge/core/domain/__init__.py +13 -0
gnosisllm_knowledge/core/domain/discovery.py +166 -0
gnosisllm_knowledge/core/domain/document.py +14 -19
gnosisllm_knowledge/core/domain/search.py +10 -25
gnosisllm_knowledge/core/domain/source.py +11 -12
gnosisllm_knowledge/core/events/__init__.py +8 -0
gnosisllm_knowledge/core/events/types.py +122 -5
gnosisllm_knowledge/core/exceptions.py +93 -0
gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
gnosisllm_knowledge/core/interfaces/searcher.py +30 -1
gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
gnosisllm_knowledge/fetchers/__init__.py +8 -0
gnosisllm_knowledge/fetchers/config.py +27 -0
gnosisllm_knowledge/fetchers/neoreader.py +31 -3
gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
gnosisllm_knowledge/loaders/__init__.py +5 -1
gnosisllm_knowledge/loaders/discovery.py +338 -0
gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
gnosisllm_knowledge/loaders/factory.py +46 -0
gnosisllm_knowledge/services/indexing.py +51 -21
gnosisllm_knowledge/services/search.py +42 -28
gnosisllm_knowledge/services/streaming_pipeline.py +45 -7
{gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/METADATA +30 -10
gnosisllm_knowledge-0.4.3.dist-info/RECORD +81 -0
gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
{gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/WHEEL +0 -0
{gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/entry_points.txt +0 -0

gnosisllm_knowledge/loaders/factory.py CHANGED Viewed

@@ -9,7 +9,11 @@ from typing import Any
 from gnosisllm_knowledge.core.events.emitter import EventEmitter
 from gnosisllm_knowledge.core.interfaces.chunker import ITextChunker
 from gnosisllm_knowledge.core.interfaces.fetcher import IContentFetcher
+from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
+from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher
+from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
 from gnosisllm_knowledge.loaders.base import BaseLoader
+from gnosisllm_knowledge.loaders.discovery import DiscoveryLoader
 from gnosisllm_knowledge.loaders.sitemap import SitemapLoader
 from gnosisllm_knowledge.loaders.website import WebsiteLoader
@@ -20,6 +24,43 @@ LoaderCreator = Callable[
 ]
+def _create_discovery_loader(
+    fetcher: IContentFetcher,
+    chunker: ITextChunker,
+    config: dict[str, Any] | None,
+    event_emitter: EventEmitter | None,
+) -> DiscoveryLoader:
+    """Factory function for creating DiscoveryLoader instances.
+    Creates a DiscoveryLoader with a NeoreaderDiscoveryClient. If the fetcher
+    is a NeoreaderContentFetcher, reuses its config to ensure consistency.
+    Otherwise, creates config from environment variables.
+    Args:
+        fetcher: Content fetcher for retrieving URL content.
+        chunker: Text chunker for splitting content.
+        config: Optional configuration dictionary.
+        event_emitter: Optional event emitter for progress events.
+    Returns:
+        Configured DiscoveryLoader instance.
+    """
+    # Get config from fetcher if it's NeoreaderContentFetcher, otherwise use env
+    if isinstance(fetcher, NeoreaderContentFetcher):
+        neoreader_config = fetcher.config
+    else:
+        neoreader_config = NeoreaderConfig.from_env()
+    discovery_client = NeoreaderDiscoveryClient(neoreader_config)
+    return DiscoveryLoader(
+        fetcher=fetcher,
+        chunker=chunker,
+        discovery_client=discovery_client,
+        config=config,
+        event_emitter=event_emitter,
+    )
 class LoaderFactory:
     """Factory for creating content loaders (Registry Pattern).
@@ -29,6 +70,7 @@ class LoaderFactory:
     Built-in loaders:
     - website: Single URL loading
     - sitemap: Sitemap XML with recursive discovery
+    - discovery: Website crawling via Neo Reader Discovery API
     Example:
         ```python
@@ -40,6 +82,9 @@ class LoaderFactory:
         # Explicit type
         loader = factory.create("sitemap", config={"max_urls": 500})
+        # Discovery loader for full website crawling
+        loader = factory.create("discovery", config={"max_depth": 3, "max_pages": 100})
         # Register custom loader
         factory.register("custom", MyCustomLoader)
         ```
@@ -76,6 +121,7 @@ class LoaderFactory:
         """Register built-in loader types."""
         self.register("website", lambda f, c, cfg, e: WebsiteLoader(f, c, cfg, e))
         self.register("sitemap", lambda f, c, cfg, e: SitemapLoader(f, c, cfg, e))
+        self.register("discovery", _create_discovery_loader)
     def register(self, name: str, creator: LoaderCreator) -> None:
         """Register a loader type.

gnosisllm_knowledge/services/indexing.py CHANGED Viewed

@@ -1,4 +1,13 @@
-"""Knowledge indexing service."""
+"""Knowledge indexing service.
+This service orchestrates the document ingestion pipeline from source to index,
+including loading, chunking, and indexing.
+Note:
+    This service is tenant-agnostic. Multi-tenancy should be handled at the
+    API layer by using separate indices per account (e.g.,
+    `knowledge-{account_id}`) rather than filtering by account_id.
+"""
 from __future__ import annotations
@@ -82,7 +91,6 @@ class KnowledgeIndexingService:
         source: str,
         index_name: str,
         *,
-        account_id: str | None = None,
         collection_id: str | None = None,
         source_id: str | None = None,
         batch_size: int = 100,
@@ -93,10 +101,13 @@ class KnowledgeIndexingService:
         Uses streaming to process and index documents as they're fetched,
         avoiding memory issues with large sitemaps.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             source: Source URL or path.
-            index_name: Target index name.
-            account_id: Account ID for multi-tenancy.
+            index_name: Target index name (use tenant-specific name for isolation).
             collection_id: Collection ID.
             source_id: Source ID (auto-generated if not provided).
             batch_size: Documents per batch for indexing.
@@ -108,6 +119,16 @@ class KnowledgeIndexingService:
         source_id = source_id or str(uuid.uuid4())
         document_defaults = options.pop("document_defaults", {})
+        # Extract metadata from document_defaults to merge with doc.metadata later
+        # This allows callers to pass custom metadata (e.g., parent_collection_id)
+        # without conflicting with the explicit metadata= parameter
+        extra_metadata = document_defaults.pop("metadata", {})
+        # Ensure index exists with correct mappings before indexing
+        # This prevents OpenSearch from auto-creating the index with dynamic mapping
+        # which would map keyword fields (like collection_id) as text fields
+        await self._indexer.ensure_index(index_name)
         # Emit batch started event
         await self._events.emit_async(
             BatchStartedEvent(
@@ -127,21 +148,22 @@ class KnowledgeIndexingService:
             # Stream documents and index in batches as they arrive
             # Note: Loader already chunks content, so we don't re-chunk here
             async for doc in self._loader.load_streaming(source, **options):
-                # Enrich document with tenant info
+                # Enrich document with collection info
+                # Merge doc.metadata with extra_metadata from document_defaults
+                merged_metadata = {**doc.metadata, **extra_metadata}
                 enriched_doc = Document(
                     content=doc.content,
                     source=source,
                     doc_id=doc.doc_id,
                     url=doc.url,
                     title=doc.title,
-                    account_id=account_id,
                     collection_id=collection_id,
                     source_id=source_id,
                     chunk_index=doc.chunk_index,
                     total_chunks=doc.total_chunks,
                     parent_doc_id=doc.parent_doc_id,
                     status=DocumentStatus.INDEXED,
-                    metadata=doc.metadata,
+                    metadata=merged_metadata,
                     **document_defaults,
                 )
@@ -213,6 +235,9 @@ class KnowledgeIndexingService:
         Returns:
             Index result.
         """
+        # Ensure index exists with correct mappings before indexing
+        await self._indexer.ensure_index(index_name)
         total_indexed = 0
         total_failed = 0
         errors: list[str] = []
@@ -230,8 +255,8 @@ class KnowledgeIndexingService:
                         doc_id=f"{doc.doc_id}-chunk-{i}",
                         url=doc.url,
                         title=doc.title,
-                        account_id=doc.account_id,
                         collection_id=doc.collection_id,
+                        collection_name=doc.collection_name,
                         source_id=doc.source_id,
                         chunk_index=i,
                         total_chunks=len(chunks),
@@ -271,14 +296,16 @@ class KnowledgeIndexingService:
         self,
         source_id: str,
         index_name: str,
-        account_id: str | None = None,
     ) -> int:
         """Delete all documents from a source.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             source_id: Source ID to delete.
-            index_name: Index name.
-            account_id: Optional account filter.
+            index_name: Index name (use tenant-specific name for isolation).
         Returns:
             Count of deleted documents.
@@ -287,21 +314,23 @@ class KnowledgeIndexingService:
             build_delete_by_source_query,
         )
-        query = build_delete_by_source_query(source_id, account_id)
+        query = build_delete_by_source_query(source_id)
         return await self._indexer.delete_by_query(query, index_name)
     async def delete_collection(
         self,
         collection_id: str,
         index_name: str,
-        account_id: str | None = None,
     ) -> int:
         """Delete all documents from a collection.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             collection_id: Collection ID to delete.
-            index_name: Index name.
-            account_id: Optional account filter.
+            index_name: Index name (use tenant-specific name for isolation).
         Returns:
             Count of deleted documents.
@@ -310,7 +339,7 @@ class KnowledgeIndexingService:
             build_delete_by_collection_query,
         )
-        query = build_delete_by_collection_query(collection_id, account_id)
+        query = build_delete_by_collection_query(collection_id)
         return await self._indexer.delete_by_query(query, index_name)
     async def reindex_source(
@@ -319,17 +348,19 @@ class KnowledgeIndexingService:
         source_id: str,
         index_name: str,
         *,
-        account_id: str | None = None,
         collection_id: str | None = None,
         **options: Any,
     ) -> IndexResult:
         """Reindex a source by deleting and re-loading.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             source: Source URL or path.
             source_id: Existing source ID.
-            index_name: Index name.
-            account_id: Account ID.
+            index_name: Index name (use tenant-specific name for isolation).
             collection_id: Collection ID.
             **options: Additional options.
@@ -337,13 +368,12 @@ class KnowledgeIndexingService:
             Index result.
         """
         # Delete existing documents
-        await self.delete_source(source_id, index_name, account_id)
+        await self.delete_source(source_id, index_name)
         # Re-index
         return await self.load_and_index(
             source=source,
             index_name=index_name,
-            account_id=account_id,
             collection_id=collection_id,
             source_id=source_id,
             **options,

gnosisllm_knowledge/services/search.py CHANGED Viewed

@@ -1,4 +1,12 @@
-"""Knowledge search service."""
+"""Knowledge search service.
+This service provides a high-level interface for searching knowledge documents
+using semantic, keyword, and hybrid search modes.
+Note:
+    This service is tenant-agnostic. Multi-tenancy should be handled at the
+    API layer by using separate indices per account (e.g., knowledge-{account_id}).
+"""
 from __future__ import annotations
@@ -70,7 +78,6 @@ class KnowledgeSearchService:
         mode: SearchMode = SearchMode.HYBRID,
         limit: int = 10,
         offset: int = 0,
-        account_id: str | None = None,
         collection_ids: list[str] | None = None,
         source_ids: list[str] | None = None,
         min_score: float | None = None,
@@ -78,13 +85,16 @@ class KnowledgeSearchService:
     ) -> SearchResult:
         """Search for knowledge documents.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             query: Search query text.
             index_name: Index to search (uses default if not provided).
             mode: Search mode (semantic, keyword, hybrid).
             limit: Maximum results.
             offset: Result offset for pagination.
-            account_id: Account ID for multi-tenancy.
             collection_ids: Filter by collection IDs.
             source_ids: Filter by source IDs.
             min_score: Minimum score threshold.
@@ -105,7 +115,6 @@ class KnowledgeSearchService:
             mode=mode,
             limit=limit,
             offset=offset,
-            account_id=account_id,
             collection_ids=collection_ids,
             source_ids=source_ids,
             min_score=min_score,
@@ -133,17 +142,19 @@ class KnowledgeSearchService:
         *,
         index_name: str | None = None,
         limit: int = 10,
-        account_id: str | None = None,
         collection_ids: list[str] | None = None,
         **options: Any,
     ) -> SearchResult:
         """Execute semantic (vector) search.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             query: Search query text.
             index_name: Index to search.
             limit: Maximum results.
-            account_id: Account ID for multi-tenancy.
             collection_ids: Filter by collection IDs.
             **options: Additional options.
@@ -155,7 +166,6 @@ class KnowledgeSearchService:
             index_name=index_name,
             mode=SearchMode.SEMANTIC,
             limit=limit,
-            account_id=account_id,
             collection_ids=collection_ids,
             **options,
         )
@@ -166,17 +176,19 @@ class KnowledgeSearchService:
         *,
         index_name: str | None = None,
         limit: int = 10,
-        account_id: str | None = None,
         collection_ids: list[str] | None = None,
         **options: Any,
     ) -> SearchResult:
         """Execute keyword (BM25) search.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             query: Search query text.
             index_name: Index to search.
             limit: Maximum results.
-            account_id: Account ID for multi-tenancy.
             collection_ids: Filter by collection IDs.
             **options: Additional options.
@@ -188,7 +200,6 @@ class KnowledgeSearchService:
             index_name=index_name,
             mode=SearchMode.KEYWORD,
             limit=limit,
-            account_id=account_id,
             collection_ids=collection_ids,
             **options,
         )
@@ -199,7 +210,6 @@ class KnowledgeSearchService:
         *,
         index_name: str | None = None,
         limit: int = 10,
-        account_id: str | None = None,
         collection_ids: list[str] | None = None,
         semantic_weight: float = 0.7,
         keyword_weight: float = 0.3,
@@ -207,11 +217,14 @@ class KnowledgeSearchService:
     ) -> SearchResult:
         """Execute hybrid search (semantic + keyword).
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             query: Search query text.
             index_name: Index to search.
             limit: Maximum results.
-            account_id: Account ID for multi-tenancy.
             collection_ids: Filter by collection IDs.
             semantic_weight: Weight for semantic score.
             keyword_weight: Weight for keyword score.
@@ -225,7 +238,6 @@ class KnowledgeSearchService:
             index_name=index_name,
             mode=SearchMode.HYBRID,
             limit=limit,
-            account_id=account_id,
             collection_ids=collection_ids,
             semantic_weight=semantic_weight,
             keyword_weight=keyword_weight,
@@ -264,17 +276,19 @@ class KnowledgeSearchService:
         index_name: str | None = None,
         mode: SearchMode = SearchMode.HYBRID,
         limit: int = 10,
-        account_id: str | None = None,
         **options: Any,
     ) -> list[SearchResult]:
         """Execute multiple searches in parallel.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             queries: List of query texts.
             index_name: Index to search.
             mode: Search mode.
             limit: Maximum results per query.
-            account_id: Account ID for multi-tenancy.
             **options: Additional options.
         Returns:
@@ -289,7 +303,6 @@ class KnowledgeSearchService:
                 text=query,
                 mode=mode,
                 limit=limit,
-                account_id=account_id,
             )
             for query in queries
         ]
@@ -310,15 +323,22 @@ class KnowledgeSearchService:
     async def count(
         self,
         index_name: str | None = None,
-        account_id: str | None = None,
         collection_id: str | None = None,
+        source_id: str | None = None,
     ) -> int:
         """Count documents in index.
+        Uses native count API instead of search for efficiency and to avoid
+        hybrid search issues with empty queries.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             index_name: Index to count.
-            account_id: Filter by account.
             collection_id: Filter by collection.
+            source_id: Filter by source (for source deletion confirmation).
         Returns:
             Document count.
@@ -327,18 +347,12 @@ class KnowledgeSearchService:
         if not index:
             raise SearchError(message="No index specified")
-        # Build count query
-        query = SearchQuery(
-            text="",
-            limit=0,
-            account_id=account_id,
-            collection_ids=[collection_id] if collection_id else None,
+        return await self._searcher.count(
+            index_name=index,
+            collection_id=collection_id,
+            source_id=source_id,
         )
-        # Use a simple match_all to get total count
-        result = await self._searcher.search(query, index)
-        return result.total_hits
     async def get_collections(
         self,
         index_name: str | None = None,

gnosisllm_knowledge/services/streaming_pipeline.py CHANGED Viewed

@@ -2,12 +2,19 @@
 This module provides the StreamingIndexingPipeline that orchestrates
 the load -> index pipeline with guaranteed bounded memory usage.
+Note:
+    This module is tenant-agnostic. Multi-tenancy should be handled at the
+    API layer by using separate indices per account (e.g.,
+    gnosisllm-{account_id}-knowledge) rather than filtering by account_id.
+    The account_id parameters are deprecated and will be ignored.
 """
 from __future__ import annotations
 import logging
 import time
+import warnings
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any
@@ -141,10 +148,16 @@ class StreamingIndexingPipeline:
     ) -> IndexResult:
         """Execute the streaming pipeline.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account. The
+            account_id parameter is deprecated and will be ignored.
         Args:
             source: Sitemap URL.
             index_name: Target OpenSearch index.
-            account_id: For multi-tenancy filtering.
+            account_id: Deprecated. This parameter is ignored.
+                Use index isolation (separate index per account) instead.
             collection_id: Collection within account.
             collection_name: Collection name for display.
             source_id: Source identifier.
@@ -153,6 +166,19 @@ class StreamingIndexingPipeline:
         Returns:
             Aggregated index result.
         """
+        if account_id is not None:
+            warnings.warn(
+                "account_id parameter is deprecated and will be ignored. "
+                "Use index isolation (separate index per account) instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+        # Ensure index exists with correct mappings before indexing
+        # This prevents OpenSearch from auto-creating the index with dynamic mapping
+        # which would map keyword fields (like collection_id) as text fields
+        await self._indexer.ensure_index(index_name)
         start_time = time.time()
         self._progress = StreamingProgress(current_phase="starting")
         await self._emit_progress()
@@ -167,7 +193,6 @@ class StreamingIndexingPipeline:
                 self._enrich_document(
                     doc,
                     source=source,
-                    account_id=account_id,
                     collection_id=collection_id,
                     collection_name=collection_name,
                     source_id=source_id,
@@ -248,31 +273,44 @@ class StreamingIndexingPipeline:
         self,
         doc: Document,
         source: str,
-        account_id: str | None,
         collection_id: str | None,
         collection_name: str | None,
         source_id: str | None,
+        account_id: str | None = None,
     ) -> Document:
-        """Add tenant and source info to document.
+        """Add source info to document.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account. The
+            account_id parameter is deprecated and will be ignored.
         Args:
             doc: Original document.
             source: Source URL.
-            account_id: Account identifier.
             collection_id: Collection identifier.
             collection_name: Collection name for display.
             source_id: Source identifier.
+            account_id: Deprecated. This parameter is ignored.
+                Use index isolation (separate index per account) instead.
         Returns:
-            New Document with tenant info.
+            New Document with source info.
         """
+        if account_id is not None:
+            warnings.warn(
+                "account_id parameter is deprecated and will be ignored. "
+                "Use index isolation (separate index per account) instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
         return Document(
             content=doc.content,
             source=source,
             doc_id=doc.doc_id,
             url=doc.url,
             title=doc.title,
-            account_id=account_id,
             collection_id=collection_id,
             collection_name=collection_name,
             source_id=source_id,

gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl