gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. gnosisllm_knowledge/__init__.py +91 -39
  2. gnosisllm_knowledge/api/__init__.py +3 -2
  3. gnosisllm_knowledge/api/knowledge.py +502 -32
  4. gnosisllm_knowledge/api/memory.py +966 -0
  5. gnosisllm_knowledge/backends/__init__.py +14 -5
  6. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  7. gnosisllm_knowledge/backends/memory/searcher.py +111 -10
  8. gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
  9. gnosisllm_knowledge/backends/opensearch/config.py +49 -28
  10. gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
  11. gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
  12. gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
  13. gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
  14. gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
  15. gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
  16. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  17. gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
  18. gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
  19. gnosisllm_knowledge/cli/app.py +436 -31
  20. gnosisllm_knowledge/cli/commands/agentic.py +26 -9
  21. gnosisllm_knowledge/cli/commands/load.py +169 -19
  22. gnosisllm_knowledge/cli/commands/memory.py +733 -0
  23. gnosisllm_knowledge/cli/commands/search.py +9 -10
  24. gnosisllm_knowledge/cli/commands/setup.py +49 -23
  25. gnosisllm_knowledge/cli/display/service.py +43 -0
  26. gnosisllm_knowledge/cli/utils/config.py +62 -4
  27. gnosisllm_knowledge/core/domain/__init__.py +54 -0
  28. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  29. gnosisllm_knowledge/core/domain/document.py +19 -19
  30. gnosisllm_knowledge/core/domain/memory.py +440 -0
  31. gnosisllm_knowledge/core/domain/result.py +11 -3
  32. gnosisllm_knowledge/core/domain/search.py +12 -25
  33. gnosisllm_knowledge/core/domain/source.py +11 -12
  34. gnosisllm_knowledge/core/events/__init__.py +8 -0
  35. gnosisllm_knowledge/core/events/types.py +198 -5
  36. gnosisllm_knowledge/core/exceptions.py +227 -0
  37. gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
  38. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  39. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  40. gnosisllm_knowledge/core/interfaces/memory.py +524 -0
  41. gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
  42. gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
  43. gnosisllm_knowledge/core/streaming/__init__.py +36 -0
  44. gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
  45. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  46. gnosisllm_knowledge/fetchers/config.py +27 -0
  47. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  48. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  49. gnosisllm_knowledge/loaders/__init__.py +5 -1
  50. gnosisllm_knowledge/loaders/base.py +3 -4
  51. gnosisllm_knowledge/loaders/discovery.py +338 -0
  52. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  53. gnosisllm_knowledge/loaders/factory.py +46 -0
  54. gnosisllm_knowledge/loaders/sitemap.py +129 -1
  55. gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
  56. gnosisllm_knowledge/services/indexing.py +100 -93
  57. gnosisllm_knowledge/services/search.py +84 -31
  58. gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
  59. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
  60. gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
  62. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
  63. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
@@ -5,6 +5,10 @@ Supports multiple search modes:
5
5
  - keyword: Traditional BM25 text matching
6
6
  - hybrid: Combined semantic + keyword (default, best results)
7
7
  - agentic: AI-powered search with reasoning and answer generation
8
+
9
+ Note:
10
+ This library is tenant-agnostic. Multi-tenancy is achieved through index
11
+ isolation - each tenant should use a separate index (e.g., "knowledge-{account_id}").
8
12
  """
9
13
 
10
14
  from __future__ import annotations
@@ -44,7 +48,6 @@ async def search_command(
44
48
  index_name: str = "knowledge",
45
49
  limit: int = 5,
46
50
  offset: int = 0,
47
- account_id: str | None = None,
48
51
  collection_ids: str | None = None,
49
52
  source_ids: str | None = None,
50
53
  min_score: float = 0.0,
@@ -55,14 +58,17 @@ async def search_command(
55
58
  ) -> None:
56
59
  """Execute the search command.
57
60
 
61
+ Note:
62
+ Multi-tenancy is achieved through index isolation. Use tenant-specific
63
+ index names instead (e.g., --index knowledge-tenant-123).
64
+
58
65
  Args:
59
66
  display: Display service for output.
60
67
  query: Search query text.
61
68
  mode: Search mode (semantic, keyword, hybrid, agentic).
62
- index_name: Index to search.
69
+ index_name: Index to search (use tenant-specific name for isolation).
63
70
  limit: Maximum results to return.
64
71
  offset: Pagination offset.
65
- account_id: Filter by account ID.
66
72
  collection_ids: Filter by collection IDs (comma-separated).
67
73
  source_ids: Filter by source IDs (comma-separated).
68
74
  min_score: Minimum score threshold.
@@ -86,7 +92,6 @@ async def search_command(
86
92
  query=query or "",
87
93
  index_name=index_name,
88
94
  agent_type="flow", # Default to flow for single queries
89
- account_id=account_id,
90
95
  collection_ids=collection_ids,
91
96
  source_ids=source_ids,
92
97
  limit=limit,
@@ -117,7 +122,6 @@ async def search_command(
117
122
  index_name=index_name,
118
123
  mode=mode,
119
124
  limit=limit,
120
- account_id=account_id,
121
125
  collection_ids=collection_ids,
122
126
  source_ids=source_ids,
123
127
  min_score=min_score,
@@ -146,7 +150,6 @@ async def search_command(
146
150
  index_name=index_name,
147
151
  limit=limit,
148
152
  offset=offset,
149
- account_id=account_id,
150
153
  collection_ids=collection_ids,
151
154
  source_ids=source_ids,
152
155
  min_score=min_score,
@@ -167,7 +170,6 @@ async def _execute_search(
167
170
  index_name: str,
168
171
  limit: int,
169
172
  offset: int,
170
- account_id: str | None,
171
173
  collection_ids: str | None,
172
174
  source_ids: str | None,
173
175
  min_score: float,
@@ -214,7 +216,6 @@ async def _execute_search(
214
216
  mode=_get_search_mode(mode),
215
217
  limit=limit,
216
218
  offset=offset,
217
- account_id=account_id,
218
219
  collection_ids=collection_list,
219
220
  source_ids=source_list,
220
221
  min_score=min_score,
@@ -315,7 +316,6 @@ async def _interactive_search(
315
316
  index_name: str,
316
317
  mode: str,
317
318
  limit: int,
318
- account_id: str | None,
319
319
  collection_ids: str | None,
320
320
  source_ids: str | None,
321
321
  min_score: float,
@@ -396,7 +396,6 @@ async def _interactive_search(
396
396
  mode=_get_search_mode(mode),
397
397
  limit=limit,
398
398
  offset=0,
399
- account_id=account_id,
400
399
  collection_ids=collection_list,
401
400
  source_ids=source_list,
402
401
  min_score=min_score,
@@ -26,12 +26,12 @@ if TYPE_CHECKING:
26
26
 
27
27
  async def setup_command(
28
28
  display: RichDisplayService,
29
- host: str = "localhost",
30
- port: int = 9200,
29
+ host: str | None = None,
30
+ port: int | None = None,
31
31
  username: str | None = None,
32
32
  password: str | None = None,
33
- use_ssl: bool = False,
34
- verify_certs: bool = False,
33
+ use_ssl: bool | None = None,
34
+ verify_certs: bool | None = None,
35
35
  force: bool = False,
36
36
  no_sample_data: bool = False,
37
37
  no_hybrid: bool = False,
@@ -40,24 +40,26 @@ async def setup_command(
40
40
 
41
41
  Args:
42
42
  display: Display service for output.
43
- host: OpenSearch host.
44
- port: OpenSearch port.
45
- username: OpenSearch username.
46
- password: OpenSearch password.
47
- use_ssl: Enable SSL.
48
- verify_certs: Verify SSL certificates.
43
+ host: OpenSearch host (overrides env).
44
+ port: OpenSearch port (overrides env).
45
+ username: OpenSearch username (overrides env).
46
+ password: OpenSearch password (overrides env).
47
+ use_ssl: Enable SSL (overrides env).
48
+ verify_certs: Verify SSL certificates (overrides env).
49
49
  force: Clean up existing resources first.
50
50
  no_sample_data: Skip sample data ingestion.
51
51
  no_hybrid: Skip hybrid search pipeline.
52
52
  """
53
- # Load configuration
53
+ # Load configuration from environment
54
54
  cli_config = CliConfig.from_env()
55
55
 
56
- # Override with CLI arguments
57
- final_host = host or cli_config.opensearch_host
58
- final_port = port or cli_config.opensearch_port
59
- final_username = username or cli_config.opensearch_username
60
- final_password = password or cli_config.opensearch_password
56
+ # CLI arguments override environment variables (only if explicitly provided)
57
+ final_host = host if host is not None else cli_config.opensearch_host
58
+ final_port = port if port is not None else cli_config.opensearch_port
59
+ final_username = username if username is not None else cli_config.opensearch_username
60
+ final_password = password if password is not None else cli_config.opensearch_password
61
+ final_use_ssl = use_ssl if use_ssl is not None else cli_config.opensearch_use_ssl
62
+ final_verify_certs = verify_certs if verify_certs is not None else cli_config.opensearch_verify_certs
61
63
 
62
64
  # Validate required config
63
65
  if not cli_config.openai_api_key:
@@ -79,7 +81,7 @@ async def setup_command(
79
81
  "Configuration",
80
82
  [
81
83
  ("Host", f"{final_host}:{final_port}"),
82
- ("SSL", "Enabled" if use_ssl else "Disabled"),
84
+ ("SSL", "Enabled" if final_use_ssl else "Disabled"),
83
85
  ("Auth", "Configured" if final_username else "None"),
84
86
  ("Hybrid Search", "Disabled" if no_hybrid else "Enabled"),
85
87
  ("Force Recreate", "Yes" if force else "No"),
@@ -88,17 +90,41 @@ async def setup_command(
88
90
 
89
91
  display.newline()
90
92
 
91
- # Create OpenSearch config
93
+ # Create OpenSearch config from environment, then override with CLI args
94
+ # This ensures all env vars (including pipeline names) are respected
95
+ base_config = OpenSearchConfig.from_env()
92
96
  opensearch_config = OpenSearchConfig(
97
+ # CLI overrides (if provided)
93
98
  host=final_host,
94
99
  port=final_port,
95
100
  username=final_username,
96
101
  password=final_password,
97
- use_ssl=use_ssl,
98
- verify_certs=verify_certs,
102
+ use_ssl=final_use_ssl,
103
+ verify_certs=final_verify_certs,
99
104
  openai_api_key=cli_config.openai_api_key,
100
105
  embedding_model=cli_config.openai_embedding_model,
101
106
  embedding_dimension=cli_config.openai_embedding_dimension,
107
+ # Preserve env-based config for pipelines and other settings
108
+ ingest_pipeline_name=base_config.ingest_pipeline_name,
109
+ search_pipeline_name=base_config.search_pipeline_name,
110
+ index_prefix=base_config.index_prefix,
111
+ model_id=base_config.model_id,
112
+ model_group_id=base_config.model_group_id,
113
+ embedding_field=base_config.embedding_field,
114
+ # k-NN settings
115
+ knn_engine=base_config.knn_engine,
116
+ knn_space_type=base_config.knn_space_type,
117
+ knn_algo_param_ef_search=base_config.knn_algo_param_ef_search,
118
+ knn_algo_param_ef_construction=base_config.knn_algo_param_ef_construction,
119
+ knn_algo_param_m=base_config.knn_algo_param_m,
120
+ # Index settings
121
+ number_of_shards=base_config.number_of_shards,
122
+ number_of_replicas=base_config.number_of_replicas,
123
+ refresh_interval=base_config.refresh_interval,
124
+ # Agentic settings
125
+ agentic_llm_model=base_config.agentic_llm_model,
126
+ agentic_max_iterations=base_config.agentic_max_iterations,
127
+ agentic_timeout_seconds=base_config.agentic_timeout_seconds,
102
128
  )
103
129
 
104
130
  # Create OpenSearch client
@@ -109,8 +135,8 @@ async def setup_command(
109
135
  client = AsyncOpenSearch(
110
136
  hosts=[{"host": final_host, "port": final_port}],
111
137
  http_auth=http_auth,
112
- use_ssl=use_ssl,
113
- verify_certs=verify_certs,
138
+ use_ssl=final_use_ssl,
139
+ verify_certs=final_verify_certs,
114
140
  ssl_show_warn=False,
115
141
  )
116
142
 
@@ -124,7 +150,7 @@ async def setup_command(
124
150
  display.format_error_with_suggestion(
125
151
  error=f"Cannot connect to OpenSearch at {final_host}:{final_port}",
126
152
  suggestion="Ensure OpenSearch is running and accessible.",
127
- command=f"curl http{'s' if use_ssl else ''}://{final_host}:{final_port}",
153
+ command=f"curl http{'s' if final_use_ssl else ''}://{final_host}:{final_port}",
128
154
  )
129
155
  sys.exit(1)
130
156
 
@@ -553,3 +553,46 @@ class RichDisplayService:
553
553
  suggestion="Run agentic setup to create agents.",
554
554
  command="gnosisllm-knowledge agentic setup",
555
555
  )
556
+
557
+ def memory_status(
558
+ self,
559
+ llm_model_id: str | None,
560
+ embedding_model_id: str | None,
561
+ llm_model: str = "gpt-4o",
562
+ embedding_model: str = "text-embedding-3-small",
563
+ ) -> None:
564
+ """Display agentic memory configuration status.
565
+
566
+ Args:
567
+ llm_model_id: LLM model ID if configured.
568
+ embedding_model_id: Embedding model ID if configured.
569
+ llm_model: LLM model name for fact extraction.
570
+ embedding_model: Embedding model name.
571
+ """
572
+ status_rows = []
573
+
574
+ # LLM Model
575
+ if llm_model_id:
576
+ status_rows.append(("LLM Model", "[green]Configured[/green]"))
577
+ status_rows.append((" ID", f"[dim]{llm_model_id}[/dim]"))
578
+ status_rows.append((" Model", llm_model))
579
+ else:
580
+ status_rows.append(("LLM Model", "[red]Not configured[/red]"))
581
+
582
+ # Embedding Model
583
+ if embedding_model_id:
584
+ status_rows.append(("Embedding Model", "[green]Configured[/green]"))
585
+ status_rows.append((" ID", f"[dim]{embedding_model_id}[/dim]"))
586
+ status_rows.append((" Model", embedding_model))
587
+ else:
588
+ status_rows.append(("Embedding Model", "[red]Not configured[/red]"))
589
+
590
+ self.table("Agentic Memory Configuration", status_rows)
591
+
592
+ if not llm_model_id or not embedding_model_id:
593
+ self.newline()
594
+ self.format_error_with_suggestion(
595
+ error="Memory models not configured.",
596
+ suggestion="Run memory setup to create connectors and models.",
597
+ command="gnosisllm-knowledge memory setup --openai-key sk-...",
598
+ )
@@ -27,7 +27,7 @@ class CliConfig:
27
27
  opensearch_verify_certs: bool = False
28
28
  opensearch_model_id: str | None = None
29
29
  opensearch_index_name: str = "knowledge"
30
- opensearch_pipeline_name: str = "gnosisllm-ingest-pipeline"
30
+ opensearch_ingest_pipeline_name: str = "gnosisllm-ingest-pipeline"
31
31
  opensearch_search_pipeline_name: str = "gnosisllm-search-pipeline"
32
32
 
33
33
  # OpenAI
@@ -42,6 +42,13 @@ class CliConfig:
42
42
  agentic_max_iterations: int = 5
43
43
  agentic_timeout_seconds: int = 60
44
44
 
45
+ # Agentic Memory
46
+ memory_llm_model_id: str | None = None
47
+ memory_embedding_model_id: str | None = None
48
+ memory_llm_model: str = "gpt-4o"
49
+ memory_embedding_model: str = "text-embedding-3-small"
50
+ memory_embedding_dimension: int = 1536
51
+
45
52
  # Neoreader
46
53
  neoreader_host: str = "https://api.neoreader.dev"
47
54
 
@@ -71,11 +78,11 @@ class CliConfig:
71
78
  == "true",
72
79
  opensearch_model_id=os.getenv("OPENSEARCH_MODEL_ID"),
73
80
  opensearch_index_name=os.getenv("OPENSEARCH_INDEX_NAME", "knowledge"),
74
- opensearch_pipeline_name=os.getenv(
75
- "OPENSEARCH_PIPELINE_NAME", "gnosisllm-ingest-pipeline"
81
+ opensearch_ingest_pipeline_name=os.getenv(
82
+ "OPENSEARCH_INGEST_PIPELINE", "gnosisllm-ingest-pipeline"
76
83
  ),
77
84
  opensearch_search_pipeline_name=os.getenv(
78
- "OPENSEARCH_SEARCH_PIPELINE_NAME", "gnosisllm-search-pipeline"
85
+ "OPENSEARCH_SEARCH_PIPELINE", "gnosisllm-search-pipeline"
79
86
  ),
80
87
  openai_api_key=os.getenv("OPENAI_API_KEY"),
81
88
  openai_embedding_model=os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002"),
@@ -86,6 +93,12 @@ class CliConfig:
86
93
  agentic_llm_model=os.getenv("AGENTIC_LLM_MODEL", "gpt-4o"),
87
94
  agentic_max_iterations=int(os.getenv("AGENTIC_MAX_ITERATIONS", "5")),
88
95
  agentic_timeout_seconds=int(os.getenv("AGENTIC_TIMEOUT_SECONDS", "60")),
96
+ # Agentic Memory configuration
97
+ memory_llm_model_id=os.getenv("OPENSEARCH_MEMORY_LLM_MODEL_ID"),
98
+ memory_embedding_model_id=os.getenv("OPENSEARCH_MEMORY_EMBEDDING_MODEL_ID"),
99
+ memory_llm_model=os.getenv("MEMORY_LLM_MODEL", "gpt-4o"),
100
+ memory_embedding_model=os.getenv("MEMORY_EMBEDDING_MODEL", "text-embedding-3-small"),
101
+ memory_embedding_dimension=int(os.getenv("MEMORY_EMBEDDING_DIMENSION", "1536")),
89
102
  neoreader_host=os.getenv("NEOREADER_HOST", "https://api.neoreader.dev"),
90
103
  )
91
104
 
@@ -205,3 +218,48 @@ class CliConfig:
205
218
  def has_conversational_agent(self) -> bool:
206
219
  """Check if conversational agent is configured."""
207
220
  return bool(self.opensearch_conversational_agent_id)
221
+
222
+ # === Memory Configuration ===
223
+
224
+ def validate_for_memory(self) -> list[str]:
225
+ """Validate configuration for memory commands.
226
+
227
+ Returns:
228
+ List of validation errors (empty if valid).
229
+ """
230
+ errors = []
231
+ if not self.memory_llm_model_id:
232
+ errors.append(
233
+ "OPENSEARCH_MEMORY_LLM_MODEL_ID is required for memory operations. "
234
+ "Run 'gnosisllm-knowledge memory setup' first."
235
+ )
236
+ if not self.memory_embedding_model_id:
237
+ errors.append(
238
+ "OPENSEARCH_MEMORY_EMBEDDING_MODEL_ID is required for memory operations. "
239
+ "Run 'gnosisllm-knowledge memory setup' first."
240
+ )
241
+ return errors
242
+
243
+ def validate_for_memory_setup(self) -> list[str]:
244
+ """Validate configuration for memory setup command.
245
+
246
+ Returns:
247
+ List of validation errors (empty if valid).
248
+ """
249
+ errors = []
250
+ if not self.openai_api_key:
251
+ errors.append(
252
+ "OPENAI_API_KEY is required for memory setup. "
253
+ "Use --openai-key or set the environment variable."
254
+ )
255
+ return errors
256
+
257
+ @property
258
+ def has_memory_models(self) -> bool:
259
+ """Check if memory models are configured."""
260
+ return bool(self.memory_llm_model_id and self.memory_embedding_model_id)
261
+
262
+ @property
263
+ def memory_is_configured(self) -> bool:
264
+ """Check if memory is fully configured for operations."""
265
+ return self.has_memory_models
@@ -1,6 +1,34 @@
1
1
  """Domain models - Value objects and entities."""
2
2
 
3
+ from gnosisllm_knowledge.core.domain.discovery import (
4
+ DiscoveredURL,
5
+ DiscoveryConfig,
6
+ DiscoveryJobStatus,
7
+ DiscoveryProgress,
8
+ DiscoveryStats,
9
+ )
3
10
  from gnosisllm_knowledge.core.domain.document import Document, DocumentStatus, TextChunk
11
+ from gnosisllm_knowledge.core.domain.memory import (
12
+ ContainerConfig,
13
+ ContainerIndexSettings,
14
+ ContainerInfo,
15
+ EmbeddingModelType,
16
+ HistoryAction,
17
+ HistoryEntry,
18
+ IndexSettings,
19
+ MemoryEntry,
20
+ MemoryStats,
21
+ MemoryStrategy,
22
+ MemoryType,
23
+ Message,
24
+ Namespace,
25
+ PayloadType,
26
+ RecallResult,
27
+ SessionInfo,
28
+ StoreRequest,
29
+ StoreResult,
30
+ StrategyConfig,
31
+ )
4
32
  from gnosisllm_knowledge.core.domain.result import (
5
33
  BatchResult,
6
34
  IndexResult,
@@ -20,10 +48,36 @@ from gnosisllm_knowledge.core.domain.search import (
20
48
  from gnosisllm_knowledge.core.domain.source import SourceConfig
21
49
 
22
50
  __all__ = [
51
+ # Discovery
52
+ "DiscoveredURL",
53
+ "DiscoveryConfig",
54
+ "DiscoveryJobStatus",
55
+ "DiscoveryProgress",
56
+ "DiscoveryStats",
23
57
  # Document
24
58
  "Document",
25
59
  "DocumentStatus",
26
60
  "TextChunk",
61
+ # Memory
62
+ "MemoryStrategy",
63
+ "MemoryType",
64
+ "PayloadType",
65
+ "EmbeddingModelType",
66
+ "HistoryAction",
67
+ "StrategyConfig",
68
+ "IndexSettings",
69
+ "ContainerIndexSettings",
70
+ "ContainerConfig",
71
+ "ContainerInfo",
72
+ "Message",
73
+ "Namespace",
74
+ "StoreRequest",
75
+ "StoreResult",
76
+ "MemoryEntry",
77
+ "RecallResult",
78
+ "SessionInfo",
79
+ "HistoryEntry",
80
+ "MemoryStats",
27
81
  # Result
28
82
  "LoadResult",
29
83
  "IndexResult",
@@ -0,0 +1,166 @@
1
+ """Domain models for website discovery."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+
7
+
8
+ @dataclass
9
+ class DiscoveryConfig:
10
+ """Configuration for website discovery crawl.
11
+
12
+ Controls how the Neo Reader Discovery API crawls and discovers URLs.
13
+
14
+ Attributes:
15
+ max_depth: Maximum crawl depth from start URL.
16
+ max_pages: Maximum number of pages to crawl.
17
+ same_domain: Only crawl URLs on the same domain.
18
+ include_subdomains: Include subdomains when same_domain is True.
19
+ respect_robots: Respect robots.txt rules.
20
+ parse_sitemap: Also parse sitemap if available.
21
+ with_metadata: Include page metadata (title, etc.) in results.
22
+ crawl_timeout: Overall timeout for the crawl in seconds.
23
+ concurrent_requests: Number of concurrent crawl requests.
24
+ request_delay: Delay between requests in milliseconds.
25
+ include_pattern: Regex pattern for URLs to include.
26
+ exclude_pattern: Regex pattern for URLs to exclude.
27
+ path_prefix: Only crawl URLs with this path prefix.
28
+ """
29
+
30
+ max_depth: int = 3
31
+ max_pages: int = 100
32
+ same_domain: bool = True
33
+ include_subdomains: bool = True
34
+ respect_robots: bool = True
35
+ parse_sitemap: bool = False
36
+ with_metadata: bool = True
37
+ crawl_timeout: int = 300
38
+ concurrent_requests: int = 5
39
+ request_delay: int = 100
40
+ include_pattern: str | None = None
41
+ exclude_pattern: str | None = None
42
+ path_prefix: str | None = None
43
+
44
+ def to_headers(self) -> dict[str, str]:
45
+ """Convert config to HTTP headers for Neo Reader API.
46
+
47
+ Returns:
48
+ Dictionary of header name to value.
49
+ """
50
+ headers = {
51
+ "X-Max-Depth": str(self.max_depth),
52
+ "X-Max-Pages": str(self.max_pages),
53
+ "X-Same-Domain": str(self.same_domain).lower(),
54
+ "X-Include-Subdomains": str(self.include_subdomains).lower(),
55
+ "X-Respect-Robots": str(self.respect_robots).lower(),
56
+ "X-Parse-Sitemap": str(self.parse_sitemap).lower(),
57
+ "X-With-Metadata": str(self.with_metadata).lower(),
58
+ "X-Crawl-Timeout": str(self.crawl_timeout),
59
+ "X-Concurrent-Requests": str(self.concurrent_requests),
60
+ "X-Request-Delay": str(self.request_delay),
61
+ }
62
+ if self.include_pattern:
63
+ headers["X-Include-Pattern"] = self.include_pattern
64
+ if self.exclude_pattern:
65
+ headers["X-Exclude-Pattern"] = self.exclude_pattern
66
+ if self.path_prefix:
67
+ headers["X-Path-Prefix"] = self.path_prefix
68
+ return headers
69
+
70
+
71
+ @dataclass
72
+ class DiscoveryProgress:
73
+ """Progress information for a running discovery job.
74
+
75
+ Attributes:
76
+ percent: Completion percentage (0-100).
77
+ pages_crawled: Number of pages crawled so far.
78
+ urls_discovered: Number of URLs discovered so far.
79
+ current_depth: Current crawl depth.
80
+ message: Human-readable progress message.
81
+ """
82
+
83
+ percent: int = 0
84
+ pages_crawled: int = 0
85
+ urls_discovered: int = 0
86
+ current_depth: int = 0
87
+ message: str = ""
88
+
89
+
90
+ @dataclass
91
+ class DiscoveryStats:
92
+ """Statistics for a completed discovery job.
93
+
94
+ Attributes:
95
+ pages_crawled: Total pages crawled.
96
+ urls_found: Total URLs found during crawl.
97
+ urls_returned: URLs returned in results (after filtering).
98
+ urls_filtered: URLs excluded by filters.
99
+ errors: Number of errors during crawl.
100
+ duration_seconds: Total crawl duration.
101
+ """
102
+
103
+ pages_crawled: int = 0
104
+ urls_found: int = 0
105
+ urls_returned: int = 0
106
+ urls_filtered: int = 0
107
+ errors: int = 0
108
+ duration_seconds: float = 0.0
109
+
110
+
111
+ @dataclass
112
+ class DiscoveredURL:
113
+ """A URL discovered during crawl.
114
+
115
+ Attributes:
116
+ url: The discovered URL.
117
+ depth: Crawl depth at which URL was found.
118
+ title: Page title if available.
119
+ is_internal: Whether URL is internal to the domain.
120
+ """
121
+
122
+ url: str
123
+ depth: int = 0
124
+ title: str | None = None
125
+ is_internal: bool = True
126
+
127
+
128
+ @dataclass
129
+ class DiscoveryJobStatus:
130
+ """Status of a discovery job.
131
+
132
+ Represents the current state of an async discovery job.
133
+
134
+ Attributes:
135
+ job_id: Unique job identifier.
136
+ status: Job status (pending, queued, running, completed, failed, cancelled).
137
+ start_url: The URL that started the discovery.
138
+ progress: Progress information if job is running.
139
+ stats: Statistics if job is completed.
140
+ urls: Discovered URLs if job is completed.
141
+ error: Error message if job failed.
142
+ """
143
+
144
+ job_id: str
145
+ status: str
146
+ start_url: str
147
+ progress: DiscoveryProgress | None = None
148
+ stats: DiscoveryStats | None = None
149
+ urls: list[DiscoveredURL] = field(default_factory=list)
150
+ error: str | None = None
151
+
152
+ def is_terminal(self) -> bool:
153
+ """Check if job is in a terminal state.
154
+
155
+ Returns:
156
+ True if job is completed, failed, or cancelled.
157
+ """
158
+ return self.status in ("completed", "failed", "cancelled")
159
+
160
+ def is_running(self) -> bool:
161
+ """Check if job is currently running.
162
+
163
+ Returns:
164
+ True if job is pending, queued, or running.
165
+ """
166
+ return self.status in ("pending", "queued", "running")