PyPI - gnosisllm-knowledge - Versions diffs - 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl - Mend

gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

gnosisllm_knowledge/api/knowledge.py +233 -35
gnosisllm_knowledge/backends/memory/indexer.py +27 -2
gnosisllm_knowledge/backends/memory/searcher.py +132 -10
gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
gnosisllm_knowledge/backends/opensearch/config.py +7 -0
gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
gnosisllm_knowledge/backends/opensearch/searcher.py +64 -6
gnosisllm_knowledge/backends/opensearch/setup.py +29 -33
gnosisllm_knowledge/cli/app.py +58 -19
gnosisllm_knowledge/cli/commands/agentic.py +15 -9
gnosisllm_knowledge/cli/commands/load.py +169 -19
gnosisllm_knowledge/cli/commands/memory.py +10 -0
gnosisllm_knowledge/cli/commands/search.py +9 -10
gnosisllm_knowledge/cli/commands/setup.py +25 -1
gnosisllm_knowledge/cli/utils/config.py +4 -4
gnosisllm_knowledge/core/domain/__init__.py +13 -0
gnosisllm_knowledge/core/domain/discovery.py +166 -0
gnosisllm_knowledge/core/domain/document.py +14 -19
gnosisllm_knowledge/core/domain/search.py +10 -25
gnosisllm_knowledge/core/domain/source.py +11 -12
gnosisllm_knowledge/core/events/__init__.py +8 -0
gnosisllm_knowledge/core/events/types.py +122 -5
gnosisllm_knowledge/core/exceptions.py +93 -0
gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
gnosisllm_knowledge/core/interfaces/searcher.py +30 -1
gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
gnosisllm_knowledge/fetchers/__init__.py +8 -0
gnosisllm_knowledge/fetchers/config.py +27 -0
gnosisllm_knowledge/fetchers/neoreader.py +31 -3
gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
gnosisllm_knowledge/loaders/__init__.py +5 -1
gnosisllm_knowledge/loaders/discovery.py +338 -0
gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
gnosisllm_knowledge/loaders/factory.py +46 -0
gnosisllm_knowledge/services/indexing.py +51 -21
gnosisllm_knowledge/services/search.py +42 -28
gnosisllm_knowledge/services/streaming_pipeline.py +45 -7
{gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/METADATA +30 -10
gnosisllm_knowledge-0.4.3.dist-info/RECORD +81 -0
gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
{gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/WHEEL +0 -0
{gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/entry_points.txt +0 -0

gnosisllm_knowledge/core/interfaces/agentic.py CHANGED Viewed

@@ -1,4 +1,11 @@
-"""Agentic searcher protocol - Interface for AI-powered search operations."""
+"""Agentic searcher protocol - Interface for AI-powered search operations.
+Note:
+    This library is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation (e.g., `knowledge-{account_id}`). Agentic searcher implementations
+    should not include tenant filtering logic - callers should use tenant-specific
+    indices.
+"""
 from __future__ import annotations
@@ -15,6 +22,9 @@ if TYPE_CHECKING:
 class IAgenticSearcher(Protocol):
     """Protocol for agentic search operations using AI agents.
+    This protocol is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation by using tenant-specific index names.
     Agentic searchers are responsible for:
     - Understanding natural language queries
     - Automatically constructing optimal search strategies
@@ -107,13 +117,11 @@ class IAgenticSearcher(Protocol):
     async def list_conversations(
         self,
-        account_id: str | None = None,
         limit: int = 100,
     ) -> list[dict[str, Any]]:
         """List active conversations.
         Args:
-            account_id: Filter by account (multi-tenant).
             limit: Maximum number of conversations.
         Returns:

gnosisllm_knowledge/core/interfaces/indexer.py CHANGED Viewed

@@ -1,4 +1,10 @@
-"""Document indexer protocol - Interface Segregation Principle."""
+"""Document indexer protocol - Interface Segregation Principle.
+Note:
+    This library is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation (e.g., `knowledge-{account_id}`). Indexer implementations should
+    not include tenant filtering logic - callers should use tenant-specific indices.
+"""
 from __future__ import annotations
@@ -14,6 +20,9 @@ if TYPE_CHECKING:
 class IDocumentIndexer(Protocol):
     """Protocol for indexing documents into a search backend.
+    This protocol is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation by using tenant-specific index names.
     Document indexers are responsible for:
     - Generating embeddings for documents
     - Storing documents in the search backend

gnosisllm_knowledge/core/interfaces/searcher.py CHANGED Viewed

@@ -1,4 +1,10 @@
-"""Knowledge searcher protocol - Interface Segregation Principle."""
+"""Knowledge searcher protocol - Interface Segregation Principle.
+Note:
+    This library is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation (e.g., `knowledge-{account_id}`). Searcher implementations should
+    not include tenant filtering logic - callers should use tenant-specific indices.
+"""
 from __future__ import annotations
@@ -12,6 +18,9 @@ if TYPE_CHECKING:
 class IKnowledgeSearcher(Protocol):
     """Protocol for searching documents in a search backend.
+    This protocol is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation by using tenant-specific index names.
     Knowledge searchers are responsible for:
     - Executing different search modes (semantic, keyword, hybrid)
     - Generating embeddings for queries
@@ -176,3 +185,23 @@ class IKnowledgeSearcher(Protocol):
             List of SearchResults in same order as queries.
         """
         ...
+    async def count(
+        self,
+        index_name: str,
+        collection_id: str | None = None,
+        source_id: str | None = None,
+    ) -> int:
+        """Count documents in index with optional filters.
+        Uses native count API instead of search for efficiency.
+        Args:
+            index_name: Target index name.
+            collection_id: Filter by collection.
+            source_id: Filter by source.
+        Returns:
+            Document count.
+        """
+        ...

gnosisllm_knowledge/core/interfaces/streaming.py CHANGED Viewed

@@ -2,6 +2,11 @@
 These protocols define contracts for streaming operations that process
 data in bounded batches rather than loading everything into memory.
+Note:
+    This library is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation (e.g., `knowledge-{account_id}`). Streaming implementations should
+    not include tenant filtering logic - callers should use tenant-specific indices.
 """
 from __future__ import annotations
@@ -97,6 +102,9 @@ class IStreamingLoader(Protocol):
 class IStreamingPipeline(Protocol):
     """Protocol for streaming indexing pipelines.
+    This protocol is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation by using tenant-specific index names.
     Orchestrates the full streaming load -> index pipeline with
     bounded memory guarantees.
     """
@@ -106,7 +114,6 @@ class IStreamingPipeline(Protocol):
         source: str,
         index_name: str,
         *,
-        account_id: str | None = None,
         collection_id: str | None = None,
         source_id: str | None = None,
         **options: Any,
@@ -115,9 +122,8 @@ class IStreamingPipeline(Protocol):
         Args:
             source: Sitemap URL.
-            index_name: Target OpenSearch index.
-            account_id: For multi-tenancy filtering.
-            collection_id: Collection within account.
+            index_name: Target OpenSearch index (use tenant-specific name).
+            collection_id: Collection within the index.
             source_id: Source identifier.
             **options: Additional loader options.

gnosisllm_knowledge/fetchers/__init__.py CHANGED Viewed

@@ -1,12 +1,20 @@
 """Content fetchers for retrieving content from URLs."""
+from gnosisllm_knowledge.core.exceptions import (
+    DiscoveryJobFailedError,
+    DiscoveryTimeoutError,
+)
 from gnosisllm_knowledge.fetchers.config import FetcherConfig, NeoreaderConfig
 from gnosisllm_knowledge.fetchers.http import HTTPContentFetcher
 from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher
+from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
 __all__ = [
     "HTTPContentFetcher",
     "NeoreaderContentFetcher",
+    "NeoreaderDiscoveryClient",
     "FetcherConfig",
     "NeoreaderConfig",
+    "DiscoveryTimeoutError",
+    "DiscoveryJobFailedError",
 ]

gnosisllm_knowledge/fetchers/config.py CHANGED Viewed

@@ -40,6 +40,11 @@ class NeoreaderConfig:
         remove_selector: CSS selector for elements to remove.
         with_images: Whether to include image references.
         with_links: Whether to include link references.
+        discovery_enabled: Whether discovery loader is enabled.
+        discovery_poll_interval: Interval between status polls in seconds.
+        discovery_timeout: Maximum time to wait for discovery completion in seconds.
+        discovery_max_depth: Default maximum crawl depth for discovery.
+        discovery_max_pages: Default maximum pages to discover.
     """
     host: str = "http://localhost:3000"
@@ -50,6 +55,13 @@ class NeoreaderConfig:
     with_images: bool = False
     with_links: bool = True
+    # Discovery settings
+    discovery_enabled: bool = True
+    discovery_poll_interval: float = 2.0
+    discovery_timeout: float = 600.0
+    discovery_max_depth: int = 3
+    discovery_max_pages: int = 100
     @classmethod
     def from_env(cls) -> NeoreaderConfig:
         """Create configuration from environment variables.
@@ -62,6 +74,11 @@ class NeoreaderConfig:
         - NEOREADER_REMOVE_SELECTOR: CSS selector for removal
         - NEOREADER_WITH_IMAGES: Include images (true/false)
         - NEOREADER_WITH_LINKS: Include links (true/false)
+        - NEOREADER_DISCOVERY_ENABLED: Enable discovery loader (true/false)
+        - NEOREADER_DISCOVERY_POLL_INTERVAL: Discovery poll interval in seconds
+        - NEOREADER_DISCOVERY_TIMEOUT: Discovery timeout in seconds
+        - NEOREADER_DISCOVERY_MAX_DEPTH: Default max crawl depth
+        - NEOREADER_DISCOVERY_MAX_PAGES: Default max pages to discover
         Returns:
             NeoreaderConfig populated from environment.
@@ -74,4 +91,14 @@ class NeoreaderConfig:
             remove_selector=os.getenv("NEOREADER_REMOVE_SELECTOR"),
             with_images=os.getenv("NEOREADER_WITH_IMAGES", "").lower() == "true",
             with_links=os.getenv("NEOREADER_WITH_LINKS", "true").lower() == "true",
+            discovery_enabled=os.getenv("NEOREADER_DISCOVERY_ENABLED", "true").lower()
+            == "true",
+            discovery_poll_interval=float(
+                os.getenv("NEOREADER_DISCOVERY_POLL_INTERVAL", "2.0")
+            ),
+            discovery_timeout=float(
+                os.getenv("NEOREADER_DISCOVERY_TIMEOUT", "600.0")
+            ),
+            discovery_max_depth=int(os.getenv("NEOREADER_DISCOVERY_MAX_DEPTH", "3")),
+            discovery_max_pages=int(os.getenv("NEOREADER_DISCOVERY_MAX_PAGES", "100")),
         )

gnosisllm_knowledge/fetchers/neoreader.py CHANGED Viewed

@@ -43,6 +43,15 @@ class NeoreaderContentFetcher:
         self._config = config or NeoreaderConfig.from_env()
         self._logger = logging.getLogger(__name__)
+    @property
+    def config(self) -> NeoreaderConfig:
+        """Expose configuration for reuse by discovery client.
+        Returns:
+            The Neo Reader configuration used by this fetcher.
+        """
+        return self._config
     async def fetch(self, url: str, **options: Any) -> FetchResult:
         """Fetch content from a URL using Neoreader.
@@ -181,7 +190,7 @@ class NeoreaderContentFetcher:
     def _extract_title(self, content: str) -> str | None:
         """Extract title from markdown content.
-        Looks for the first H1 heading in the markdown.
+        Looks for the first H1 heading in various formats.
         Args:
             content: Markdown content.
@@ -189,14 +198,33 @@ class NeoreaderContentFetcher:
         Returns:
             Title string or None.
         """
-        # Look for first H1 heading
         lines = content.split("\n")
+        # Look for ATX-style H1 heading (# Title)
         for line in lines:
             line = line.strip()
             if line.startswith("# "):
                 return line[2:].strip()
-        # Try regex for H1
+        # Look for "Title: ..." prefix format (common in Neoreader output)
+        for line in lines:
+            line = line.strip()
+            if line.startswith("Title:"):
+                title = line[6:].strip()
+                # Stop at "URL" or "Source" if present on same line
+                for stop in [" URL", " Source"]:
+                    if stop in title:
+                        title = title[:title.index(stop)]
+                return title.strip() if title else None
+        # Look for Setext-style H1 (Title followed by === line)
+        for i, line in enumerate(lines[:-1]):
+            line = line.strip()
+            next_line = lines[i + 1].strip() if i + 1 < len(lines) else ""
+            if line and next_line and all(c == "=" for c in next_line) and len(next_line) >= 3:
+                return line
+        # Try regex for ATX H1
         match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
         if match:
             return match.group(1).strip()

gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl