gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. gnosisllm_knowledge/__init__.py +91 -39
  2. gnosisllm_knowledge/api/__init__.py +3 -2
  3. gnosisllm_knowledge/api/knowledge.py +287 -7
  4. gnosisllm_knowledge/api/memory.py +966 -0
  5. gnosisllm_knowledge/backends/__init__.py +14 -5
  6. gnosisllm_knowledge/backends/opensearch/agentic.py +341 -39
  7. gnosisllm_knowledge/backends/opensearch/config.py +49 -28
  8. gnosisllm_knowledge/backends/opensearch/indexer.py +1 -0
  9. gnosisllm_knowledge/backends/opensearch/mappings.py +2 -1
  10. gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
  11. gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
  12. gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
  13. gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +235 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
  16. gnosisllm_knowledge/cli/app.py +378 -12
  17. gnosisllm_knowledge/cli/commands/agentic.py +11 -0
  18. gnosisllm_knowledge/cli/commands/memory.py +723 -0
  19. gnosisllm_knowledge/cli/commands/setup.py +24 -22
  20. gnosisllm_knowledge/cli/display/service.py +43 -0
  21. gnosisllm_knowledge/cli/utils/config.py +58 -0
  22. gnosisllm_knowledge/core/domain/__init__.py +41 -0
  23. gnosisllm_knowledge/core/domain/document.py +5 -0
  24. gnosisllm_knowledge/core/domain/memory.py +440 -0
  25. gnosisllm_knowledge/core/domain/result.py +11 -3
  26. gnosisllm_knowledge/core/domain/search.py +2 -0
  27. gnosisllm_knowledge/core/events/types.py +76 -0
  28. gnosisllm_knowledge/core/exceptions.py +134 -0
  29. gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
  30. gnosisllm_knowledge/core/interfaces/memory.py +524 -0
  31. gnosisllm_knowledge/core/interfaces/streaming.py +127 -0
  32. gnosisllm_knowledge/core/streaming/__init__.py +36 -0
  33. gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
  34. gnosisllm_knowledge/loaders/base.py +3 -4
  35. gnosisllm_knowledge/loaders/sitemap.py +129 -1
  36. gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
  37. gnosisllm_knowledge/services/indexing.py +67 -75
  38. gnosisllm_knowledge/services/search.py +47 -11
  39. gnosisllm_knowledge/services/streaming_pipeline.py +302 -0
  40. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/METADATA +44 -1
  41. gnosisllm_knowledge-0.3.0.dist-info/RECORD +77 -0
  42. gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
  43. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/WHEEL +0 -0
  44. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,258 @@
+"""Streaming sitemap discovery with bounded memory.
+
+This module provides streaming URL discovery from sitemaps, yielding
+batches of URLs as they're discovered rather than collecting all URLs
+first. This enables immediate processing and keeps memory bounded.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import fnmatch
+import logging
+import re
+from collections.abc import AsyncIterator
+from typing import Any
+from xml.etree import ElementTree
+
+import httpx
+
+from gnosisllm_knowledge.core.streaming.pipeline import BoundedQueue, PipelineConfig
+
+# XML namespace for sitemaps
+SITEMAP_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
+
+
+class StreamingSitemapDiscoverer:
+    """Discovers sitemap URLs in a streaming fashion.
+
+    Instead of collecting all URLs before processing, this yields
+    batches of URLs as they're discovered, enabling immediate processing.
+
+    Key differences from SitemapLoader._get_urls():
+    - Yields batches instead of returning complete list
+    - Uses bounded queue for backpressure
+    - Memory usage is O(batch_size) not O(total_urls)
+
+    Example:
+        ```python
+        discoverer = StreamingSitemapDiscoverer()
+
+        async for url_batch in discoverer.discover_urls_streaming(
+            sitemap_url="https://example.com/sitemap.xml",
+            batch_size=50,
+            max_urls=1000,
+        ):
+            # Process batch immediately
+            for url in url_batch:
+                await fetch_and_process(url)
+        ```
+    """
+
+    def __init__(
+        self,
+        config: PipelineConfig | None = None,
+    ) -> None:
+        """Initialize the streaming sitemap discoverer.
+
+        Args:
+            config: Pipeline configuration with batch sizes and concurrency.
+        """
+        self._config = config or PipelineConfig()
+        self._logger = logging.getLogger(__name__)
+
+    async def discover_urls_streaming(
+        self,
+        sitemap_url: str,
+        batch_size: int | None = None,
+        max_urls: int = 10000,
+        max_depth: int = 3,
+        allowed_patterns: list[str] | None = None,
+        blocked_patterns: list[str] | None = None,
+        **options: Any,
+    ) -> AsyncIterator[list[str]]:
+        """Yield batches of URLs as they're discovered.
+
+        Args:
+            sitemap_url: Root sitemap URL.
+            batch_size: URLs per batch (default from config).
+            max_urls: Maximum total URLs to discover.
+            max_depth: Maximum sitemap recursion depth.
+            allowed_patterns: URL patterns to include.
+            blocked_patterns: URL patterns to exclude.
+            **options: Additional options (unused, for compatibility).
+
+        Yields:
+            Lists of discovered URLs, batch_size at a time.
+        """
+        batch_size = batch_size or self._config.url_batch_size
+        allowed_patterns = allowed_patterns or []
+        blocked_patterns = blocked_patterns or []
+
+        # Use bounded queue for discovered URLs
+        # Queue size is 2x batch_size to allow producer to stay ahead
+        url_queue: BoundedQueue[str] = BoundedQueue(maxsize=batch_size * 2)
+
+        # Tracking state
+        discovered_count = 0
+        seen_urls: set[str] = set()
+
+        async def discover_recursive(url: str, depth: int) -> None:
+            """Recursively discover URLs, pushing to queue."""
+            nonlocal discovered_count
+
+            if depth > max_depth or discovered_count >= max_urls:
+                return
+
+            content = await self._fetch_sitemap(url)
+            if not content:
+                return
+
+            try:
+                root = ElementTree.fromstring(content)
+            except ElementTree.ParseError as e:
+                self._logger.error(f"Failed to parse sitemap {url}: {e}")
+                return
+
+            # Check for sitemap index
+            sitemap_refs = root.findall(".//sm:sitemap/sm:loc", SITEMAP_NS)
+            if sitemap_refs:
+                self._logger.info(
+                    f"Found sitemap index with {len(sitemap_refs)} sitemaps at depth {depth}"
+                )
+                # Process nested sitemaps (limited parallelism to avoid overwhelming)
+                tasks = []
+                for ref in sitemap_refs[:10]:  # Limit parallel sitemap fetches
+                    if ref.text and discovered_count < max_urls:
+                        tasks.append(discover_recursive(ref.text.strip(), depth + 1))
+                await asyncio.gather(*tasks, return_exceptions=True)
+                return
+
+            # Process URL entries
+            url_elements = root.findall(".//sm:url/sm:loc", SITEMAP_NS)
+            for url_elem in url_elements:
+                if url_elem.text and discovered_count < max_urls:
+                    page_url = url_elem.text.strip()
+
+                    if page_url in seen_urls:
+                        continue
+
+                    if not self._should_include_url(
+                        page_url, allowed_patterns, blocked_patterns
+                    ):
+                        continue
+
+                    seen_urls.add(page_url)
+                    await url_queue.put(page_url)  # Backpressure if queue full
+                    discovered_count += 1
+
+                    if discovered_count % 100 == 0:
+                        self._logger.debug(f"Discovered {discovered_count} URLs so far")
+
+        async def discover_and_close() -> None:
+            """Run discovery and close queue when done."""
+            try:
+                await discover_recursive(sitemap_url, depth=0)
+            except Exception as e:
+                self._logger.error(f"Discovery error: {e}")
+            finally:
+                url_queue.close()
+
+        # Start discovery in background task
+        discovery_task = asyncio.create_task(discover_and_close())
+
+        # Yield batches from queue
+        batch: list[str] = []
+        try:
+            async for url in url_queue:
+                batch.append(url)
+                if len(batch) >= batch_size:
+                    yield batch
+                    batch = []
+
+            # Yield remaining URLs
+            if batch:
+                yield batch
+
+        finally:
+            # Ensure discovery task is complete or cancelled
+            if not discovery_task.done():
+                discovery_task.cancel()
+                try:
+                    await discovery_task
+                except asyncio.CancelledError:
+                    pass
+
+    async def _fetch_sitemap(self, url: str) -> str | None:
+        """Fetch sitemap XML content.
+
+        Args:
+            url: The sitemap URL to fetch.
+
+        Returns:
+            Sitemap XML content or None if fetch failed.
+        """
+        try:
+            async with httpx.AsyncClient(
+                timeout=self._config.fetch_timeout_seconds
+            ) as client:
+                response = await client.get(
+                    url,
+                    headers={"Accept": "application/xml, text/xml, */*"},
+                    follow_redirects=True,
+                )
+                response.raise_for_status()
+                return response.text
+        except Exception as e:
+            self._logger.error(f"Failed to fetch sitemap {url}: {e}")
+            return None
+
+    def _should_include_url(
+        self,
+        url: str,
+        allowed_patterns: list[str],
+        blocked_patterns: list[str],
+    ) -> bool:
+        """Check if a URL should be included based on patterns.
+
+        Args:
+            url: The URL to check.
+            allowed_patterns: Patterns that must match (if any).
+            blocked_patterns: Patterns that must not match.
+
+        Returns:
+            True if URL should be included.
+        """
+        # Check blocked patterns first
+        for pattern in blocked_patterns:
+            if self._matches_pattern(url, pattern):
+                return False
+
+        # If allowed patterns specified, at least one must match
+        if allowed_patterns:
+            return any(self._matches_pattern(url, p) for p in allowed_patterns)
+
+        return True
+
+    def _matches_pattern(self, url: str, pattern: str) -> bool:
+        """Check if URL matches a pattern.
+
+        Supports both glob patterns (with *) and regex patterns.
+
+        Args:
+            url: The URL to check.
+            pattern: The pattern to match against.
+
+        Returns:
+            True if URL matches the pattern.
+        """
+        # Try fnmatch for glob patterns
+        if "*" in pattern or "?" in pattern:
+            return fnmatch.fnmatch(url, pattern)
+
+        # Try regex
+        try:
+            return bool(re.search(pattern, url))
+        except re.error:
+            # Invalid regex, try substring match
+            return pattern in url
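
The discoverer above leans on `BoundedQueue` from `gnosisllm_knowledge/core/streaming/pipeline.py` (added in this release but not shown in this diff) for backpressure: `put()` blocks once the queue is full, and the consumer iterates with `async for` until the producer calls `close()`. A minimal sketch of that contract, assuming an asyncio-based queue with a closed flag — an illustration only, not the package's actual implementation:

```python
import asyncio
from typing import Generic, TypeVar

T = TypeVar("T")


class BoundedQueueSketch(Generic[T]):
    """Illustrative stand-in for core.streaming.pipeline.BoundedQueue (assumed semantics)."""

    def __init__(self, maxsize: int) -> None:
        self._items: asyncio.Queue[T] = asyncio.Queue(maxsize=maxsize)
        self._closed = asyncio.Event()

    async def put(self, item: T) -> None:
        # Blocks while the queue is full -- this is the backpressure that
        # keeps discovery from racing ahead of the consumer.
        await self._items.put(item)

    def close(self) -> None:
        # Signal that no more items will arrive.
        self._closed.set()

    def __aiter__(self) -> "BoundedQueueSketch[T]":
        return self

    async def __anext__(self) -> T:
        # Drain remaining items, then stop once the producer has closed.
        while True:
            try:
                return self._items.get_nowait()
            except asyncio.QueueEmpty:
                if self._closed.is_set():
                    raise StopAsyncIteration
                await asyncio.sleep(0.01)  # crude wait; the real queue presumably blocks properly
```

Raising `StopAsyncIteration` after `close()` is what lets the `async for url in url_queue` loop in `discover_urls_streaming` terminate cleanly once the background discovery task finishes.
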
@@ -88,7 +88,10 @@ class KnowledgeIndexingService:
         batch_size: int = 100,
         **options: Any,
     ) -> IndexResult:
-        """Load content from source and index it.
+        """Load content from source and index it with streaming.
+
+        Uses streaming to process and index documents as they're fetched,
+        avoiding memory issues with large sitemaps.
 
         Args:
             source: Source URL or path.
@@ -96,98 +99,89 @@ class KnowledgeIndexingService:
             account_id: Account ID for multi-tenancy.
             collection_id: Collection ID.
             source_id: Source ID (auto-generated if not provided).
-            batch_size: Documents per batch.
+            batch_size: Documents per batch for indexing.
             **options: Additional loader/indexer options.
 
         Returns:
             Index result with counts.
         """
         source_id = source_id or str(uuid.uuid4())
+        document_defaults = options.pop("document_defaults", {})
 
         # Emit batch started event
         await self._events.emit_async(
-            EventType.BATCH_STARTED,
             BatchStartedEvent(
-                source=source,
-                source_id=source_id,
+                batch_index=0,
+                batch_size=batch_size,
+                total_batches=0,  # Unknown for streaming
             ),
         )
 
-        try:
-            # Load documents
-            load_result = await self._loader.load(source, **options)
+        total_indexed = 0
+        total_failed = 0
+        errors: list[str] = []
+        batch: list[Document] = []
+        batch_index = 0
 
-            if not load_result.success:
-                raise LoadError(
-                    message=f"Failed to load from {source}",
-                    details={"errors": load_result.errors},
+        try:
+            # Stream documents and index in batches as they arrive
+            # Note: Loader already chunks content, so we don't re-chunk here
+            async for doc in self._loader.load_streaming(source, **options):
+                # Enrich document with tenant info
+                enriched_doc = Document(
+                    content=doc.content,
+                    source=source,
+                    doc_id=doc.doc_id,
+                    url=doc.url,
+                    title=doc.title,
+                    account_id=account_id,
+                    collection_id=collection_id,
+                    source_id=source_id,
+                    chunk_index=doc.chunk_index,
+                    total_chunks=doc.total_chunks,
+                    parent_doc_id=doc.parent_doc_id,
+                    status=DocumentStatus.INDEXED,
+                    metadata=doc.metadata,
+                    **document_defaults,
                 )
 
-            # Process and index documents
-            total_indexed = 0
-            total_failed = 0
-            errors: list[str] = []
-
-            batch: list[Document] = []
-
-            for doc in load_result.documents:
-                # Chunk the document
-                chunks = self._chunker.chunk(doc.content)
-
-                for i, chunk in enumerate(chunks):
-                    # Create chunk document
-                    chunk_doc = Document(
-                        id=f"{doc.id}-chunk-{i}",
-                        content=chunk.content,
-                        url=doc.url,
-                        title=doc.title,
-                        source=source,
-                        account_id=account_id,
-                        collection_id=collection_id,
-                        source_id=source_id,
-                        chunk_index=i,
-                        total_chunks=len(chunks),
-                        parent_doc_id=doc.id,
-                        status=DocumentStatus.INDEXED,
-                        metadata=doc.metadata,
-                    )
+                batch.append(enriched_doc)
 
-                    batch.append(chunk_doc)
-
-                    # Index batch when full
-                    if len(batch) >= batch_size:
-                        result = await self._index_batch(batch, index_name)
-                        total_indexed += result.documents_indexed
-                        total_failed += result.documents_failed
-                        if result.errors:
-                            errors.extend(result.errors)
-                        batch = []
+                # Index batch when full
+                if len(batch) >= batch_size:
+                    result = await self._index_batch(batch, index_name)
+                    total_indexed += result.indexed_count
+                    total_failed += result.failed_count
+                    if result.errors:
+                        errors.extend(result.errors)
+                    batch = []
+                    batch_index += 1
+                    logger.info(f"Indexed batch {batch_index}: {total_indexed} total documents")
 
             # Index remaining documents
             if batch:
                 result = await self._index_batch(batch, index_name)
-                total_indexed += result.documents_indexed
-                total_failed += result.documents_failed
+                total_indexed += result.indexed_count
+                total_failed += result.failed_count
                 if result.errors:
                     errors.extend(result.errors)
 
             # Emit batch completed event
             await self._events.emit_async(
-                EventType.BATCH_COMPLETED,
                 BatchCompletedEvent(
-                    source=source,
-                    source_id=source_id,
-                    documents_indexed=total_indexed,
-                    documents_failed=total_failed,
-                    success=total_failed == 0,
+                    batch_index=batch_index,
+                    success_count=total_indexed,
+                    failure_count=total_failed,
                 ),
             )
 
+            logger.info(f"Completed indexing from {source}: {total_indexed} documents")
+
             return IndexResult(
                 success=total_failed == 0,
-                documents_indexed=total_indexed,
-                documents_failed=total_failed,
-                errors=errors if errors else None,
+                indexed_count=total_indexed,
+                failed_count=total_failed,
+                errors=errors if errors else [],
            )
 
        except Exception as e:
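
The new loop above is the standard "batch an async stream" pattern: append to a list, flush at `batch_size`, and flush the remainder once the stream ends. Shown in isolation with a hypothetical helper name (`abatch` is not part of the package), the same pattern looks like this:

```python
from collections.abc import AsyncIterable, AsyncIterator
from typing import TypeVar

T = TypeVar("T")


async def abatch(items: AsyncIterable[T], size: int) -> AsyncIterator[list[T]]:
    """Group an async stream into lists of at most `size` items."""
    batch: list[T] = []
    async for item in items:
        batch.append(item)
        if len(batch) >= size:
            yield batch  # flush a full batch without waiting for the stream to end
            batch = []
    if batch:
        yield batch  # flush the final partial batch
```

Memory stays bounded by `size` no matter how many documents the loader streams, which is the point of replacing the old load-everything-then-chunk path.
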
@@ -231,17 +225,17 @@ class KnowledgeIndexingService:
 
             for i, chunk_obj in enumerate(chunks):
                 chunk_doc = Document(
-                    id=f"{doc.id}-chunk-{i}",
                     content=chunk_obj.content,
+                    source=doc.source,
+                    doc_id=f"{doc.doc_id}-chunk-{i}",
                     url=doc.url,
                     title=doc.title,
-                    source=doc.source,
                     account_id=doc.account_id,
                     collection_id=doc.collection_id,
                     source_id=doc.source_id,
                     chunk_index=i,
                     total_chunks=len(chunks),
-                    parent_doc_id=doc.id,
+                    parent_doc_id=doc.doc_id,
                     status=DocumentStatus.INDEXED,
                     metadata=doc.metadata,
                 )
@@ -252,8 +246,8 @@ class KnowledgeIndexingService:
                 # Index batch when full
                 if len(batch) >= batch_size:
                     result = await self._index_batch(batch, index_name)
-                    total_indexed += result.documents_indexed
-                    total_failed += result.documents_failed
+                    total_indexed += result.indexed_count
+                    total_failed += result.failed_count
                     if result.errors:
                         errors.extend(result.errors)
                     batch = []
@@ -261,16 +255,16 @@ class KnowledgeIndexingService:
         # Index remaining
         if batch:
             result = await self._index_batch(batch, index_name)
-            total_indexed += result.documents_indexed
-            total_failed += result.documents_failed
+            total_indexed += result.indexed_count
+            total_failed += result.failed_count
             if result.errors:
                 errors.extend(result.errors)
 
         return IndexResult(
             success=total_failed == 0,
-            documents_indexed=total_indexed,
-            documents_failed=total_failed,
-            errors=errors if errors else None,
+            indexed_count=total_indexed,
+            failed_count=total_failed,
+            errors=errors if errors else [],
         )
 
     async def delete_source(
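
Both hunks above apply the same rename on `IndexResult`: `documents_indexed`/`documents_failed` become `indexed_count`/`failed_count`, and `errors` now defaults to an empty list rather than `None`. The updated model lives in `core/domain/result.py` (+11 -3 in the file list) and is not shown here; a minimal sketch consistent with how the service constructs and reads the result:

```python
from dataclasses import dataclass, field


@dataclass
class IndexResultSketch:
    """Field names taken from the calls in this diff; the real IndexResult
    in core/domain/result.py may carry additional fields."""

    success: bool
    indexed_count: int = 0
    failed_count: int = 0
    errors: list[str] = field(default_factory=list)
```
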
@@ -375,12 +369,10 @@ class KnowledgeIndexingService:
         for doc in documents:
             if result.success:
                 await self._events.emit_async(
-                    EventType.DOCUMENT_INDEXED,
                     DocumentIndexedEvent(
-                        document_id=doc.id,
+                        doc_id=doc.doc_id,
                         index_name=index_name,
-                        chunk_index=doc.chunk_index,
-                        total_chunks=doc.total_chunks,
+                        success=True,
                     ),
                 )
 
@@ -114,17 +114,8 @@ class KnowledgeSearchService:
         try:
             result = await self._searcher.search(search_query, index, **options)
 
-            # Emit search event
-            await self._events.emit_async(
-                EventType.SEARCH_COMPLETED,
-                {
-                    "query": query,
-                    "mode": mode.value,
-                    "results_count": len(result.items),
-                    "total_hits": result.total_hits,
-                    "duration_ms": result.duration_ms,
-                },
-            )
+            # TODO: Emit search event when SearchCompletedEvent is defined
+            # await self._events.emit_async(SearchCompletedEvent(...))
 
             return result
 
@@ -347,3 +338,48 @@ class KnowledgeSearchService:
         # Use a simple match_all to get total count
         result = await self._searcher.search(query, index)
         return result.total_hits
+
+    async def get_collections(
+        self,
+        index_name: str | None = None,
+    ) -> list[dict[str, Any]]:
+        """Get all collections with document counts.
+
+        Args:
+            index_name: Index to query (uses default if not provided).
+
+        Returns:
+            List of collections with id, name, and document_count.
+        """
+        index = index_name or self._default_index
+        if not index:
+            logger.warning("No index specified for get_collections")
+            return []
+
+        try:
+            return await self._searcher.get_collections(index)
+        except Exception as e:
+            logger.error(f"Failed to get collections: {e}")
+            return []
+
+    async def get_stats(
+        self,
+        index_name: str | None = None,
+    ) -> dict[str, Any]:
+        """Get index statistics.
+
+        Args:
+            index_name: Index to query (uses default if not provided).
+
+        Returns:
+            Dictionary with document_count, index_name, and other stats.
+        """
+        index = index_name or self._default_index
+        if not index:
+            return {"document_count": 0, "index_name": "", "exists": False}
+
+        try:
+            return await self._searcher.get_stats(index)
+        except Exception as e:
+            logger.error(f"Failed to get stats: {e}")
+            return {"document_count": 0, "index_name": index, "error": str(e)}