PyPI - gnosisllm-knowledge - Versions diffs - 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (63) hide show

gnosisllm_knowledge/__init__.py +91 -39
gnosisllm_knowledge/api/__init__.py +3 -2
gnosisllm_knowledge/api/knowledge.py +502 -32
gnosisllm_knowledge/api/memory.py +966 -0
gnosisllm_knowledge/backends/__init__.py +14 -5
gnosisllm_knowledge/backends/memory/indexer.py +27 -2
gnosisllm_knowledge/backends/memory/searcher.py +111 -10
gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
gnosisllm_knowledge/backends/opensearch/config.py +49 -28
gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
gnosisllm_knowledge/cli/app.py +436 -31
gnosisllm_knowledge/cli/commands/agentic.py +26 -9
gnosisllm_knowledge/cli/commands/load.py +169 -19
gnosisllm_knowledge/cli/commands/memory.py +733 -0
gnosisllm_knowledge/cli/commands/search.py +9 -10
gnosisllm_knowledge/cli/commands/setup.py +49 -23
gnosisllm_knowledge/cli/display/service.py +43 -0
gnosisllm_knowledge/cli/utils/config.py +62 -4
gnosisllm_knowledge/core/domain/__init__.py +54 -0
gnosisllm_knowledge/core/domain/discovery.py +166 -0
gnosisllm_knowledge/core/domain/document.py +19 -19
gnosisllm_knowledge/core/domain/memory.py +440 -0
gnosisllm_knowledge/core/domain/result.py +11 -3
gnosisllm_knowledge/core/domain/search.py +12 -25
gnosisllm_knowledge/core/domain/source.py +11 -12
gnosisllm_knowledge/core/events/__init__.py +8 -0
gnosisllm_knowledge/core/events/types.py +198 -5
gnosisllm_knowledge/core/exceptions.py +227 -0
gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
gnosisllm_knowledge/core/interfaces/memory.py +524 -0
gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
gnosisllm_knowledge/core/streaming/__init__.py +36 -0
gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
gnosisllm_knowledge/fetchers/__init__.py +8 -0
gnosisllm_knowledge/fetchers/config.py +27 -0
gnosisllm_knowledge/fetchers/neoreader.py +31 -3
gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
gnosisllm_knowledge/loaders/__init__.py +5 -1
gnosisllm_knowledge/loaders/base.py +3 -4
gnosisllm_knowledge/loaders/discovery.py +338 -0
gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
gnosisllm_knowledge/loaders/factory.py +46 -0
gnosisllm_knowledge/loaders/sitemap.py +129 -1
gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
gnosisllm_knowledge/services/indexing.py +100 -93
gnosisllm_knowledge/services/search.py +84 -31
gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
{gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
{gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
{gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0

gnosisllm_knowledge/services/indexing.py CHANGED Viewed

@@ -1,4 +1,13 @@
-"""Knowledge indexing service."""
+"""Knowledge indexing service.
+This service orchestrates the document ingestion pipeline from source to index,
+including loading, chunking, and indexing.
+Note:
+    This service is tenant-agnostic. Multi-tenancy should be handled at the
+    API layer by using separate indices per account (e.g.,
+    `knowledge-{account_id}`) rather than filtering by account_id.
+"""
 from __future__ import annotations
@@ -82,112 +91,107 @@ class KnowledgeIndexingService:
         source: str,
         index_name: str,
         *,
-        account_id: str | None = None,
         collection_id: str | None = None,
         source_id: str | None = None,
         batch_size: int = 100,
         **options: Any,
     ) -> IndexResult:
-        """Load content from source and index it.
+        """Load content from source and index it with streaming.
+        Uses streaming to process and index documents as they're fetched,
+        avoiding memory issues with large sitemaps.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             source: Source URL or path.
-            index_name: Target index name.
-            account_id: Account ID for multi-tenancy.
+            index_name: Target index name (use tenant-specific name for isolation).
             collection_id: Collection ID.
             source_id: Source ID (auto-generated if not provided).
-            batch_size: Documents per batch.
+            batch_size: Documents per batch for indexing.
             **options: Additional loader/indexer options.
         Returns:
             Index result with counts.
         """
         source_id = source_id or str(uuid.uuid4())
+        document_defaults = options.pop("document_defaults", {})
         # Emit batch started event
         await self._events.emit_async(
-            EventType.BATCH_STARTED,
             BatchStartedEvent(
-                source=source,
-                source_id=source_id,
+                batch_index=0,
+                batch_size=batch_size,
+                total_batches=0,  # Unknown for streaming
             ),
         )
-        try:
-            # Load documents
-            load_result = await self._loader.load(source, **options)
+        total_indexed = 0
+        total_failed = 0
+        errors: list[str] = []
+        batch: list[Document] = []
+        batch_index = 0
-            if not load_result.success:
-                raise LoadError(
-                    message=f"Failed to load from {source}",
-                    details={"errors": load_result.errors},
+        try:
+            # Stream documents and index in batches as they arrive
+            # Note: Loader already chunks content, so we don't re-chunk here
+            async for doc in self._loader.load_streaming(source, **options):
+                # Enrich document with collection info
+                enriched_doc = Document(
+                    content=doc.content,
+                    source=source,
+                    doc_id=doc.doc_id,
+                    url=doc.url,
+                    title=doc.title,
+                    collection_id=collection_id,
+                    source_id=source_id,
+                    chunk_index=doc.chunk_index,
+                    total_chunks=doc.total_chunks,
+                    parent_doc_id=doc.parent_doc_id,
+                    status=DocumentStatus.INDEXED,
+                    metadata=doc.metadata,
+                    **document_defaults,
                 )
-            # Process and index documents
-            total_indexed = 0
-            total_failed = 0
-            errors: list[str] = []
-            batch: list[Document] = []
+                batch.append(enriched_doc)
-            for doc in load_result.documents:
-                # Chunk the document
-                chunks = self._chunker.chunk(doc.content)
-                for i, chunk in enumerate(chunks):
-                    # Create chunk document
-                    chunk_doc = Document(
-                        id=f"{doc.id}-chunk-{i}",
-                        content=chunk.content,
-                        url=doc.url,
-                        title=doc.title,
-                        source=source,
-                        account_id=account_id,
-                        collection_id=collection_id,
-                        source_id=source_id,
-                        chunk_index=i,
-                        total_chunks=len(chunks),
-                        parent_doc_id=doc.id,
-                        status=DocumentStatus.INDEXED,
-                        metadata=doc.metadata,
-                    )
-                    batch.append(chunk_doc)
-                    # Index batch when full
-                    if len(batch) >= batch_size:
-                        result = await self._index_batch(batch, index_name)
-                        total_indexed += result.documents_indexed
-                        total_failed += result.documents_failed
-                        if result.errors:
-                            errors.extend(result.errors)
-                        batch = []
+                # Index batch when full
+                if len(batch) >= batch_size:
+                    result = await self._index_batch(batch, index_name)
+                    total_indexed += result.indexed_count
+                    total_failed += result.failed_count
+                    if result.errors:
+                        errors.extend(result.errors)
+                    batch = []
+                    batch_index += 1
+                    logger.info(f"Indexed batch {batch_index}: {total_indexed} total documents")
             # Index remaining documents
             if batch:
                 result = await self._index_batch(batch, index_name)
-                total_indexed += result.documents_indexed
-                total_failed += result.documents_failed
+                total_indexed += result.indexed_count
+                total_failed += result.failed_count
                 if result.errors:
                     errors.extend(result.errors)
             # Emit batch completed event
             await self._events.emit_async(
-                EventType.BATCH_COMPLETED,
                 BatchCompletedEvent(
-                    source=source,
-                    source_id=source_id,
-                    documents_indexed=total_indexed,
-                    documents_failed=total_failed,
-                    success=total_failed == 0,
+                    batch_index=batch_index,
+                    success_count=total_indexed,
+                    failure_count=total_failed,
                 ),
             )
+            logger.info(f"Completed indexing from {source}: {total_indexed} documents")
             return IndexResult(
                 success=total_failed == 0,
-                documents_indexed=total_indexed,
-                documents_failed=total_failed,
-                errors=errors if errors else None,
+                indexed_count=total_indexed,
+                failed_count=total_failed,
+                errors=errors if errors else [],
             )
         except Exception as e:
@@ -231,17 +235,17 @@ class KnowledgeIndexingService:
                 for i, chunk_obj in enumerate(chunks):
                     chunk_doc = Document(
-                        id=f"{doc.id}-chunk-{i}",
                         content=chunk_obj.content,
+                        source=doc.source,
+                        doc_id=f"{doc.doc_id}-chunk-{i}",
                         url=doc.url,
                         title=doc.title,
-                        source=doc.source,
-                        account_id=doc.account_id,
                         collection_id=doc.collection_id,
+                        collection_name=doc.collection_name,
                         source_id=doc.source_id,
                         chunk_index=i,
                         total_chunks=len(chunks),
-                        parent_doc_id=doc.id,
+                        parent_doc_id=doc.doc_id,
                         status=DocumentStatus.INDEXED,
                         metadata=doc.metadata,
                     )
@@ -252,8 +256,8 @@ class KnowledgeIndexingService:
             # Index batch when full
             if len(batch) >= batch_size:
                 result = await self._index_batch(batch, index_name)
-                total_indexed += result.documents_indexed
-                total_failed += result.documents_failed
+                total_indexed += result.indexed_count
+                total_failed += result.failed_count
                 if result.errors:
                     errors.extend(result.errors)
                 batch = []
@@ -261,30 +265,32 @@ class KnowledgeIndexingService:
         # Index remaining
         if batch:
             result = await self._index_batch(batch, index_name)
-            total_indexed += result.documents_indexed
-            total_failed += result.documents_failed
+            total_indexed += result.indexed_count
+            total_failed += result.failed_count
             if result.errors:
                 errors.extend(result.errors)
         return IndexResult(
             success=total_failed == 0,
-            documents_indexed=total_indexed,
-            documents_failed=total_failed,
-            errors=errors if errors else None,
+            indexed_count=total_indexed,
+            failed_count=total_failed,
+            errors=errors if errors else [],
         )
     async def delete_source(
         self,
         source_id: str,
         index_name: str,
-        account_id: str | None = None,
     ) -> int:
         """Delete all documents from a source.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             source_id: Source ID to delete.
-            index_name: Index name.
-            account_id: Optional account filter.
+            index_name: Index name (use tenant-specific name for isolation).
         Returns:
             Count of deleted documents.
@@ -293,21 +299,23 @@ class KnowledgeIndexingService:
             build_delete_by_source_query,
         )
-        query = build_delete_by_source_query(source_id, account_id)
+        query = build_delete_by_source_query(source_id)
         return await self._indexer.delete_by_query(query, index_name)
     async def delete_collection(
         self,
         collection_id: str,
         index_name: str,
-        account_id: str | None = None,
     ) -> int:
         """Delete all documents from a collection.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             collection_id: Collection ID to delete.
-            index_name: Index name.
-            account_id: Optional account filter.
+            index_name: Index name (use tenant-specific name for isolation).
         Returns:
             Count of deleted documents.
@@ -316,7 +324,7 @@ class KnowledgeIndexingService:
             build_delete_by_collection_query,
         )
-        query = build_delete_by_collection_query(collection_id, account_id)
+        query = build_delete_by_collection_query(collection_id)
         return await self._indexer.delete_by_query(query, index_name)
     async def reindex_source(
@@ -325,17 +333,19 @@ class KnowledgeIndexingService:
         source_id: str,
         index_name: str,
         *,
-        account_id: str | None = None,
         collection_id: str | None = None,
         **options: Any,
     ) -> IndexResult:
         """Reindex a source by deleting and re-loading.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             source: Source URL or path.
             source_id: Existing source ID.
-            index_name: Index name.
-            account_id: Account ID.
+            index_name: Index name (use tenant-specific name for isolation).
             collection_id: Collection ID.
             **options: Additional options.
@@ -343,13 +353,12 @@ class KnowledgeIndexingService:
             Index result.
         """
         # Delete existing documents
-        await self.delete_source(source_id, index_name, account_id)
+        await self.delete_source(source_id, index_name)
         # Re-index
         return await self.load_and_index(
             source=source,
             index_name=index_name,
-            account_id=account_id,
             collection_id=collection_id,
             source_id=source_id,
             **options,
@@ -375,12 +384,10 @@ class KnowledgeIndexingService:
         for doc in documents:
             if result.success:
                 await self._events.emit_async(
-                    EventType.DOCUMENT_INDEXED,
                     DocumentIndexedEvent(
-                        document_id=doc.id,
+                        doc_id=doc.doc_id,
                         index_name=index_name,
-                        chunk_index=doc.chunk_index,
-                        total_chunks=doc.total_chunks,
+                        success=True,
                     ),
                 )

gnosisllm_knowledge/services/search.py CHANGED Viewed

@@ -1,4 +1,12 @@
-"""Knowledge search service."""
+"""Knowledge search service.
+This service provides a high-level interface for searching knowledge documents
+using semantic, keyword, and hybrid search modes.
+Note:
+    This service is tenant-agnostic. Multi-tenancy should be handled at the
+    API layer by using separate indices per account (e.g., knowledge-{account_id}).
+"""
 from __future__ import annotations
@@ -70,7 +78,6 @@ class KnowledgeSearchService:
         mode: SearchMode = SearchMode.HYBRID,
         limit: int = 10,
         offset: int = 0,
-        account_id: str | None = None,
         collection_ids: list[str] | None = None,
         source_ids: list[str] | None = None,
         min_score: float | None = None,
@@ -78,13 +85,16 @@ class KnowledgeSearchService:
     ) -> SearchResult:
         """Search for knowledge documents.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             query: Search query text.
             index_name: Index to search (uses default if not provided).
             mode: Search mode (semantic, keyword, hybrid).
             limit: Maximum results.
             offset: Result offset for pagination.
-            account_id: Account ID for multi-tenancy.
             collection_ids: Filter by collection IDs.
             source_ids: Filter by source IDs.
             min_score: Minimum score threshold.
@@ -105,7 +115,6 @@ class KnowledgeSearchService:
             mode=mode,
             limit=limit,
             offset=offset,
-            account_id=account_id,
             collection_ids=collection_ids,
             source_ids=source_ids,
             min_score=min_score,
@@ -114,17 +123,8 @@ class KnowledgeSearchService:
         try:
             result = await self._searcher.search(search_query, index, **options)
-            # Emit search event
-            await self._events.emit_async(
-                EventType.SEARCH_COMPLETED,
-                {
-                    "query": query,
-                    "mode": mode.value,
-                    "results_count": len(result.items),
-                    "total_hits": result.total_hits,
-                    "duration_ms": result.duration_ms,
-                },
-            )
+            # TODO: Emit search event when SearchCompletedEvent is defined
+            # await self._events.emit_async(SearchCompletedEvent(...))
             return result
@@ -142,17 +142,19 @@ class KnowledgeSearchService:
         *,
         index_name: str | None = None,
         limit: int = 10,
-        account_id: str | None = None,
         collection_ids: list[str] | None = None,
         **options: Any,
     ) -> SearchResult:
         """Execute semantic (vector) search.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             query: Search query text.
             index_name: Index to search.
             limit: Maximum results.
-            account_id: Account ID for multi-tenancy.
             collection_ids: Filter by collection IDs.
             **options: Additional options.
@@ -164,7 +166,6 @@ class KnowledgeSearchService:
             index_name=index_name,
             mode=SearchMode.SEMANTIC,
             limit=limit,
-            account_id=account_id,
             collection_ids=collection_ids,
             **options,
         )
@@ -175,17 +176,19 @@ class KnowledgeSearchService:
         *,
         index_name: str | None = None,
         limit: int = 10,
-        account_id: str | None = None,
         collection_ids: list[str] | None = None,
         **options: Any,
     ) -> SearchResult:
         """Execute keyword (BM25) search.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             query: Search query text.
             index_name: Index to search.
             limit: Maximum results.
-            account_id: Account ID for multi-tenancy.
             collection_ids: Filter by collection IDs.
             **options: Additional options.
@@ -197,7 +200,6 @@ class KnowledgeSearchService:
             index_name=index_name,
             mode=SearchMode.KEYWORD,
             limit=limit,
-            account_id=account_id,
             collection_ids=collection_ids,
             **options,
         )
@@ -208,7 +210,6 @@ class KnowledgeSearchService:
         *,
         index_name: str | None = None,
         limit: int = 10,
-        account_id: str | None = None,
         collection_ids: list[str] | None = None,
         semantic_weight: float = 0.7,
         keyword_weight: float = 0.3,
@@ -216,11 +217,14 @@ class KnowledgeSearchService:
     ) -> SearchResult:
         """Execute hybrid search (semantic + keyword).
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             query: Search query text.
             index_name: Index to search.
             limit: Maximum results.
-            account_id: Account ID for multi-tenancy.
             collection_ids: Filter by collection IDs.
             semantic_weight: Weight for semantic score.
             keyword_weight: Weight for keyword score.
@@ -234,7 +238,6 @@ class KnowledgeSearchService:
             index_name=index_name,
             mode=SearchMode.HYBRID,
             limit=limit,
-            account_id=account_id,
             collection_ids=collection_ids,
             semantic_weight=semantic_weight,
             keyword_weight=keyword_weight,
@@ -273,17 +276,19 @@ class KnowledgeSearchService:
         index_name: str | None = None,
         mode: SearchMode = SearchMode.HYBRID,
         limit: int = 10,
-        account_id: str | None = None,
         **options: Any,
     ) -> list[SearchResult]:
         """Execute multiple searches in parallel.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             queries: List of query texts.
             index_name: Index to search.
             mode: Search mode.
             limit: Maximum results per query.
-            account_id: Account ID for multi-tenancy.
             **options: Additional options.
         Returns:
@@ -298,7 +303,6 @@ class KnowledgeSearchService:
                 text=query,
                 mode=mode,
                 limit=limit,
-                account_id=account_id,
             )
             for query in queries
         ]
@@ -319,15 +323,19 @@ class KnowledgeSearchService:
     async def count(
         self,
         index_name: str | None = None,
-        account_id: str | None = None,
         collection_id: str | None = None,
+        source_id: str | None = None,
     ) -> int:
         """Count documents in index.
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
         Args:
             index_name: Index to count.
-            account_id: Filter by account.
             collection_id: Filter by collection.
+            source_id: Filter by source (for source deletion confirmation).
         Returns:
             Document count.
@@ -336,14 +344,59 @@ class KnowledgeSearchService:
         if not index:
             raise SearchError(message="No index specified")
-        # Build count query
+        # Build count query with optional filters
         query = SearchQuery(
             text="",
             limit=0,
-            account_id=account_id,
             collection_ids=[collection_id] if collection_id else None,
+            source_ids=[source_id] if source_id else None,
         )
         # Use a simple match_all to get total count
         result = await self._searcher.search(query, index)
         return result.total_hits
+    async def get_collections(
+        self,
+        index_name: str | None = None,
+    ) -> list[dict[str, Any]]:
+        """Get all collections with document counts.
+        Args:
+            index_name: Index to query (uses default if not provided).
+        Returns:
+            List of collections with id, name, and document_count.
+        """
+        index = index_name or self._default_index
+        if not index:
+            logger.warning("No index specified for get_collections")
+            return []
+        try:
+            return await self._searcher.get_collections(index)
+        except Exception as e:
+            logger.error(f"Failed to get collections: {e}")
+            return []
+    async def get_stats(
+        self,
+        index_name: str | None = None,
+    ) -> dict[str, Any]:
+        """Get index statistics.
+        Args:
+            index_name: Index to query (uses default if not provided).
+        Returns:
+            Dictionary with document_count, index_name, and other stats.
+        """
+        index = index_name or self._default_index
+        if not index:
+            return {"document_count": 0, "index_name": "", "exists": False}
+        try:
+            return await self._searcher.get_stats(index)
+        except Exception as e:
+            logger.error(f"Failed to get stats: {e}")
+            return {"document_count": 0, "index_name": index, "error": str(e)}

gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl