gnosisllm-knowledge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. gnosisllm_knowledge/__init__.py +152 -0
  2. gnosisllm_knowledge/api/__init__.py +5 -0
  3. gnosisllm_knowledge/api/knowledge.py +548 -0
  4. gnosisllm_knowledge/backends/__init__.py +26 -0
  5. gnosisllm_knowledge/backends/memory/__init__.py +9 -0
  6. gnosisllm_knowledge/backends/memory/indexer.py +384 -0
  7. gnosisllm_knowledge/backends/memory/searcher.py +516 -0
  8. gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
  9. gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
  10. gnosisllm_knowledge/backends/opensearch/config.py +195 -0
  11. gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
  12. gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
  13. gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
  16. gnosisllm_knowledge/chunking/__init__.py +9 -0
  17. gnosisllm_knowledge/chunking/fixed.py +138 -0
  18. gnosisllm_knowledge/chunking/sentence.py +239 -0
  19. gnosisllm_knowledge/cli/__init__.py +18 -0
  20. gnosisllm_knowledge/cli/app.py +509 -0
  21. gnosisllm_knowledge/cli/commands/__init__.py +7 -0
  22. gnosisllm_knowledge/cli/commands/agentic.py +529 -0
  23. gnosisllm_knowledge/cli/commands/load.py +369 -0
  24. gnosisllm_knowledge/cli/commands/search.py +440 -0
  25. gnosisllm_knowledge/cli/commands/setup.py +228 -0
  26. gnosisllm_knowledge/cli/display/__init__.py +5 -0
  27. gnosisllm_knowledge/cli/display/service.py +555 -0
  28. gnosisllm_knowledge/cli/utils/__init__.py +5 -0
  29. gnosisllm_knowledge/cli/utils/config.py +207 -0
  30. gnosisllm_knowledge/core/__init__.py +87 -0
  31. gnosisllm_knowledge/core/domain/__init__.py +43 -0
  32. gnosisllm_knowledge/core/domain/document.py +240 -0
  33. gnosisllm_knowledge/core/domain/result.py +176 -0
  34. gnosisllm_knowledge/core/domain/search.py +327 -0
  35. gnosisllm_knowledge/core/domain/source.py +139 -0
  36. gnosisllm_knowledge/core/events/__init__.py +23 -0
  37. gnosisllm_knowledge/core/events/emitter.py +216 -0
  38. gnosisllm_knowledge/core/events/types.py +226 -0
  39. gnosisllm_knowledge/core/exceptions.py +407 -0
  40. gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
  41. gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
  42. gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
  43. gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
  44. gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
  45. gnosisllm_knowledge/core/interfaces/loader.py +102 -0
  46. gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
  47. gnosisllm_knowledge/core/interfaces/setup.py +164 -0
  48. gnosisllm_knowledge/fetchers/__init__.py +12 -0
  49. gnosisllm_knowledge/fetchers/config.py +77 -0
  50. gnosisllm_knowledge/fetchers/http.py +167 -0
  51. gnosisllm_knowledge/fetchers/neoreader.py +204 -0
  52. gnosisllm_knowledge/loaders/__init__.py +13 -0
  53. gnosisllm_knowledge/loaders/base.py +399 -0
  54. gnosisllm_knowledge/loaders/factory.py +202 -0
  55. gnosisllm_knowledge/loaders/sitemap.py +285 -0
  56. gnosisllm_knowledge/loaders/website.py +57 -0
  57. gnosisllm_knowledge/py.typed +0 -0
  58. gnosisllm_knowledge/services/__init__.py +9 -0
  59. gnosisllm_knowledge/services/indexing.py +387 -0
  60. gnosisllm_knowledge/services/search.py +349 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
  62. gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
  63. gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
  64. gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,176 @@
1
+ """Result domain models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any
7
+
8
+
9
@dataclass
class LoadResult:
    """Outcome of loading content from a single source.

    Attributes:
        source: The source that was loaded (URL, file path, etc.).
        source_type: Kind of source (website, sitemap, file, etc.).
        document_count: How many documents were produced by the load.
        success: Whether the load succeeded overall.
        error_message: Failure description, or None on success.
        duration_ms: Wall-clock duration of the load in milliseconds.
        metadata: Extra loader-specific details about the operation.
        urls_processed: URLs fetched successfully (multi-URL sources).
        urls_failed: URLs that could not be fetched.
        bytes_loaded: Total bytes of content retrieved.
    """

    source: str
    source_type: str
    document_count: int
    success: bool
    error_message: str | None = None
    duration_ms: float = 0.0
    metadata: dict[str, Any] = field(default_factory=dict)
    urls_processed: int = 0
    urls_failed: int = 0
    bytes_loaded: int = 0

    @property
    def success_rate(self) -> float:
        """Fraction of attempted URLs that loaded successfully."""
        attempted = self.urls_processed + self.urls_failed
        if not attempted:
            # No per-URL accounting recorded: fall back to the overall flag.
            return 1.0 if self.success else 0.0
        return self.urls_processed / attempted
44
+
45
+
46
@dataclass
class IndexResult:
    """Outcome of an indexing operation (single document or batch).

    Attributes:
        success: Whether the operation succeeded.
        document_id: ID of the indexed document (single-doc operations).
        index_name: Index where the documents were stored.
        indexed_count: Documents indexed successfully.
        failed_count: Documents that failed to index.
        error_message: Failure description when the operation failed outright.
        duration_ms: Wall-clock duration in milliseconds.
        failed_doc_ids: IDs of the documents that failed.
        errors: Per-document error details for the failures.
    """

    success: bool
    document_id: str | None = None
    index_name: str | None = None
    indexed_count: int = 0
    failed_count: int = 0
    error_message: str | None = None
    duration_ms: float = 0.0
    failed_doc_ids: list[str] = field(default_factory=list)
    errors: list[dict[str, Any]] = field(default_factory=list)

    @property
    def total_attempted(self) -> int:
        """Total number of documents this result accounts for."""
        return self.indexed_count + self.failed_count

    @property
    def success_rate(self) -> float:
        """Fraction of attempted documents that were indexed."""
        attempted = self.total_attempted
        if not attempted:
            # Nothing attempted: report 1.0 or 0.0 from the success flag.
            return 1.0 if self.success else 0.0
        return self.indexed_count / attempted

    def merge(self, other: IndexResult) -> IndexResult:
        """Combine this result with *other* into a new IndexResult.

        Useful for folding per-batch results into an aggregate. Counts,
        durations, and error lists are summed/concatenated; `success` is
        the conjunction of both flags; error messages are joined with
        "; " when both are present. `index_name` is taken from `self`.

        Args:
            other: Another IndexResult to merge with.

        Returns:
            New IndexResult combining both results.
        """
        if self.error_message and other.error_message:
            combined_error = f"{self.error_message}; {other.error_message}"
        else:
            combined_error = self.error_message or other.error_message
        return IndexResult(
            success=self.success and other.success,
            index_name=self.index_name,
            indexed_count=self.indexed_count + other.indexed_count,
            failed_count=self.failed_count + other.failed_count,
            error_message=combined_error,
            duration_ms=self.duration_ms + other.duration_ms,
            failed_doc_ids=self.failed_doc_ids + other.failed_doc_ids,
            errors=self.errors + other.errors,
        )
110
+
111
+
112
@dataclass
class BatchResult:
    """Outcome of a batch operation.

    Attributes:
        total: Total items processed.
        succeeded: Operations that succeeded.
        failed: Operations that failed.
        duration_ms: Wall-clock duration of the batch in milliseconds.
        errors: Messages for the errors that occurred.
    """

    total: int
    succeeded: int
    failed: int
    duration_ms: float = 0.0
    errors: list[str] = field(default_factory=list)

    @property
    def success_rate(self) -> float:
        """Fraction of items that succeeded (1.0 for an empty batch)."""
        if not self.total:
            return 1.0
        return self.succeeded / self.total
137
+
138
+
139
@dataclass
class ValidationResult:
    """Outcome of a validation check.

    Attributes:
        valid: Whether the content/source passed validation.
        message: Human-readable summary of the validation.
        errors: Validation errors, if any.
        warnings: Validation warnings, if any.
        metadata: Additional validation details.
    """

    valid: bool
    message: str = ""
    errors: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def success(cls, message: str = "Validation passed") -> ValidationResult:
        """Build a passing result with an optional summary message."""
        return cls(valid=True, message=message)

    @classmethod
    def failure(cls, message: str, errors: list[str] | None = None) -> ValidationResult:
        """Build a failing result with a message and optional error list."""
        return cls(valid=False, message=message, errors=errors or [])

    def add_error(self, error: str) -> ValidationResult:
        """Record an error, mark the result invalid, and return self."""
        self.errors.append(error)
        self.valid = False
        return self

    def add_warning(self, warning: str) -> ValidationResult:
        """Record a warning (does not affect validity) and return self."""
        self.warnings.append(warning)
        return self
@@ -0,0 +1,327 @@
1
+ """Search domain models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from enum import Enum
7
+ from typing import Any
8
+
9
+
10
class SearchMode(str, Enum):
    """Available strategies for executing a search.

    Members:
        SEMANTIC: Vector similarity search only (KNN).
        KEYWORD: BM25 full-text search only.
        HYBRID: Semantic and keyword search combined (the default).
        AGENTIC: AI-powered search with reasoning and answer generation.
    """

    SEMANTIC = "semantic"  # vector-only retrieval
    KEYWORD = "keyword"  # BM25-only retrieval
    HYBRID = "hybrid"  # fused vector + BM25
    AGENTIC = "agentic"  # LLM-agent orchestrated search
24
+
25
+
26
class AgentType(str, Enum):
    """Kinds of agent available for agentic search.

    Members:
        FLOW: Fast RAG — single query/response, no conversation memory.
        CONVERSATIONAL: Multi-turn conversations with memory support.
    """

    FLOW = "flow"  # stateless, single-shot
    CONVERSATIONAL = "conversational"  # stateful, multi-turn
36
+
37
+
38
@dataclass
class SearchQuery:
    """Search query with filters and options.

    Attributes:
        text: The search query text.
        mode: Search mode to use.
        limit: Maximum number of results to return.
        offset: Number of results to skip (for pagination).
        min_score: Minimum score threshold for results.

    Filters:
        collection_ids: Filter by collection IDs.
        source_ids: Filter by source IDs.
        account_id: Multi-tenant account filter.
        metadata_filters: Custom metadata filters.

    Advanced options:
        field_boosts: Field boosting weights.
        include_highlights: Whether to include highlights.
        include_fields: Fields to include in results.
        exclude_fields: Fields to exclude from results.
        explain: Whether to include score explanation.
    """

    text: str
    mode: SearchMode = SearchMode.HYBRID
    limit: int = 10
    offset: int = 0
    min_score: float | None = None

    # Filters
    collection_ids: list[str] | None = None
    source_ids: list[str] | None = None
    account_id: str | None = None
    metadata_filters: dict[str, Any] = field(default_factory=dict)

    # Advanced options
    field_boosts: dict[str, float] | None = None
    include_highlights: bool = True
    include_fields: list[str] | None = None
    exclude_fields: list[str] | None = None
    explain: bool = False

    def _copy(self, **overrides: Any) -> SearchQuery:
        """Clone this query, applying *overrides* to the copy.

        Shallow-copies the mutable dict fields (metadata_filters,
        field_boosts) so the copy cannot mutate this query's state;
        other fields are carried over by reference, matching the
        previous per-method copy behavior.
        """
        state: dict[str, Any] = {
            "text": self.text,
            "mode": self.mode,
            "limit": self.limit,
            "offset": self.offset,
            "min_score": self.min_score,
            "collection_ids": self.collection_ids,
            "source_ids": self.source_ids,
            "account_id": self.account_id,
            "metadata_filters": self.metadata_filters.copy(),
            "field_boosts": self.field_boosts.copy() if self.field_boosts else None,
            "include_highlights": self.include_highlights,
            "include_fields": self.include_fields,
            "exclude_fields": self.exclude_fields,
            "explain": self.explain,
        }
        state.update(overrides)
        return SearchQuery(**state)

    def with_mode(self, mode: SearchMode) -> SearchQuery:
        """Create a copy with a different search mode."""
        return self._copy(mode=mode)

    def with_tenant(self, account_id: str) -> SearchQuery:
        """Create a copy with tenant information."""
        return self._copy(account_id=account_id)
119
+
120
+
121
@dataclass
class SearchResultItem:
    """A single search result.

    Attributes:
        doc_id: Document identifier.
        content: Document content.
        score: Relevance score.
        title: Document title.
        url: Document URL.
        source: Source identifier.
        collection_id: Collection identifier.
        source_id: Source identifier within collection.
        chunk_index: Chunk index if document is chunked.
        total_chunks: Total chunks in parent document.
        metadata: Document metadata.
        highlights: Highlighted snippets from matching content.
        highlighted_title: Title with highlight markup applied, if any.
        explanation: Score explanation (when explain=True).
    """

    # Required fields: identity, body text, and relevance.
    doc_id: str
    content: str
    score: float
    # Optional descriptive fields.
    title: str | None = None
    url: str | None = None
    source: str | None = None
    # Multi-tenant / provenance fields.
    collection_id: str | None = None
    source_id: str | None = None
    # Chunking position within the parent document.
    chunk_index: int | None = None
    total_chunks: int | None = None
    metadata: dict[str, Any] | None = None
    # Highlighting and scoring extras (populated on request).
    highlights: list[str] | None = None
    highlighted_title: str | None = None
    explanation: dict[str, Any] | None = None
155
+
156
+
157
@dataclass
class SearchResult:
    """Complete search result with metadata.

    Attributes:
        query: The original search query text.
        mode: Search mode that was used.
        items: List of search result items.
        total_hits: Total number of matching documents.
        duration_ms: Search duration in milliseconds.
        max_score: Maximum score among results.
        from_cache: Whether results came from cache.
        cache_key: Cache key if results are cacheable.
        search_after_token: Opaque token for cursor-based pagination.
        has_more: Whether more results are available beyond this page.
    """

    query: str
    mode: SearchMode
    items: list[SearchResultItem]
    total_hits: int
    duration_ms: float
    max_score: float | None = None
    from_cache: bool = False
    cache_key: str | None = None
    search_after_token: Any | None = None  # For cursor-based pagination
    has_more: bool = False

    @property
    def has_results(self) -> bool:
        """True when this page contains at least one result."""
        return bool(self.items)

    @property
    def count(self) -> int:
        """Number of results in this page (not the total hit count)."""
        return len(self.items)
192
+
193
+
194
@dataclass
class ReasoningStep:
    """A single step in the agent's reasoning process.

    Attributes:
        tool: The tool that was used (e.g., "VectorDBTool", "MLModelTool").
        action: The action performed.
        input: Input provided to the tool.
        output: Output from the tool.
        duration_ms: Duration of this step in milliseconds.
        tokens_used: Number of tokens consumed by this step.
    """

    # NOTE(review): `input` shadows the builtin as a field name; harmless
    # on a dataclass attribute but worth knowing when reading the class.
    tool: str
    action: str
    input: str | None = None
    output: str | None = None
    duration_ms: float = 0.0
    tokens_used: int = 0
213
+
214
+
215
@dataclass
class AgenticSearchQuery:
    """Query for agentic search with conversation support.

    Attributes:
        text: The search query text.
        agent_type: Type of agent to use.
        conversation_id: ID for continuing a conversation.
        collection_ids: Filter by collection IDs.
        source_ids: Filter by source IDs.
        account_id: Multi-tenant account filter.
        limit: Maximum number of source documents to retrieve.
        include_reasoning: Whether to include reasoning steps.
        metadata_filters: Custom metadata filters.
        temperature: LLM temperature (0.0 to 1.0).
        max_iterations: Maximum agent iterations.
    """

    text: str
    agent_type: AgentType = AgentType.FLOW
    conversation_id: str | None = None
    collection_ids: list[str] | None = None
    source_ids: list[str] | None = None
    account_id: str | None = None
    limit: int = 10
    include_reasoning: bool = True
    metadata_filters: dict[str, Any] = field(default_factory=dict)
    temperature: float = 0.0
    max_iterations: int = 5

    def to_search_query(self) -> SearchQuery:
        """Build an equivalent plain SearchQuery (hybrid mode) for fallback.

        Only the retrieval-related fields carry over; agent settings
        (temperature, iterations, conversation) have no counterpart.
        """
        fallback = SearchQuery(
            text=self.text,
            mode=SearchMode.HYBRID,
            limit=self.limit,
            collection_ids=self.collection_ids,
            source_ids=self.source_ids,
            account_id=self.account_id,
            metadata_filters=self.metadata_filters.copy(),
        )
        return fallback
256
+
257
+
258
@dataclass
class AgenticSearchResult:
    """Search result with agentic enhancements.

    Extends SearchResult with AI-generated answer and reasoning.

    Attributes:
        query: The original search query text.
        mode: Search mode (always AGENTIC).
        items: Retrieved source documents.
        total_hits: Total number of matching documents.
        duration_ms: Total search duration in milliseconds.
        max_score: Maximum score among results.
        answer: AI-generated answer to the query.
        reasoning_steps: Reasoning steps taken by the agent.
        conversation_id: Conversation ID for multi-turn searches.
        agent_type: Type of agent that was used.
        citations: References to source documents used in answer.
        total_tokens: Total tokens consumed.
        prompt_tokens: Tokens used in prompts.
        completion_tokens: Tokens used in completions.
    """

    query: str
    mode: SearchMode
    items: list[SearchResultItem]
    total_hits: int
    duration_ms: float
    max_score: float | None = None
    answer: str | None = None
    reasoning_steps: list[ReasoningStep] = field(default_factory=list)
    conversation_id: str | None = None
    agent_type: AgentType = AgentType.FLOW
    citations: list[str] = field(default_factory=list)
    total_tokens: int = 0
    prompt_tokens: int = 0
    completion_tokens: int = 0

    @property
    def has_answer(self) -> bool:
        """True when a non-empty answer string was generated."""
        if self.answer is None:
            return False
        return len(self.answer) > 0

    @property
    def has_reasoning(self) -> bool:
        """True when at least one reasoning step was recorded."""
        return bool(self.reasoning_steps)

    @classmethod
    def from_search_result(
        cls,
        result: SearchResult,
        answer: str | None = None,
        reasoning_steps: list[ReasoningStep] | None = None,
        agent_type: AgentType = AgentType.FLOW,
        conversation_id: str | None = None,
    ) -> AgenticSearchResult:
        """Wrap a plain SearchResult, attaching agentic fields.

        Mode is forced to AGENTIC; token counters are left at their
        zero defaults since the source result carries no usage data.
        """
        return cls(
            query=result.query,
            mode=SearchMode.AGENTIC,
            items=result.items,
            total_hits=result.total_hits,
            duration_ms=result.duration_ms,
            max_score=result.max_score,
            answer=answer,
            reasoning_steps=reasoning_steps or [],
            conversation_id=conversation_id,
            agent_type=agent_type,
        )
@@ -0,0 +1,139 @@
1
+ """Source configuration domain model."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any
7
+
8
+
9
@dataclass
class SourceConfig:
    """Configuration for a content source.

    Attributes:
        url: The source URL or path.
        source_type: Type of source (website, sitemap, file, etc.).
        options: Additional loader-specific options.

    Sitemap-specific options:
        max_urls: Maximum number of URLs to process.
        max_depth: Maximum sitemap recursion depth.
        allowed_patterns: URL patterns to include.
        blocked_patterns: URL patterns to exclude.

    Fetcher options:
        target_selector: CSS selector for content extraction.
        remove_selector: CSS selector for elements to remove.
        timeout: Request timeout in seconds.

    Multi-tenancy:
        account_id: Account/tenant identifier.
        collection_id: Collection identifier.
        source_id: Source identifier within collection.
    """

    url: str
    source_type: str = "website"
    options: dict[str, Any] = field(default_factory=dict)

    # Sitemap-specific options
    max_urls: int | None = None
    max_depth: int | None = None
    allowed_patterns: list[str] = field(default_factory=list)
    blocked_patterns: list[str] = field(default_factory=list)

    # Fetcher options
    target_selector: str | None = None
    remove_selector: str | None = None
    timeout: int | None = None

    # Multi-tenancy
    account_id: str | None = None
    collection_id: str | None = None
    source_id: str | None = None

    def _clone(self, **overrides: Any) -> SourceConfig:
        """Return a copy with mutable fields shallow-copied, applying *overrides*.

        Central copy point so `with_options`/`with_tenant` cannot drift
        apart when fields are added to this dataclass.
        """
        state: dict[str, Any] = {
            "url": self.url,
            "source_type": self.source_type,
            "options": self.options.copy(),
            "max_urls": self.max_urls,
            "max_depth": self.max_depth,
            "allowed_patterns": self.allowed_patterns.copy(),
            "blocked_patterns": self.blocked_patterns.copy(),
            "target_selector": self.target_selector,
            "remove_selector": self.remove_selector,
            "timeout": self.timeout,
            "account_id": self.account_id,
            "collection_id": self.collection_id,
            "source_id": self.source_id,
        }
        state.update(overrides)
        return SourceConfig(**state)

    def with_options(self, **options: Any) -> SourceConfig:
        """Create a copy with additional options merged.

        Args:
            **options: Options to merge into the config (new keys win).

        Returns:
            New SourceConfig with merged options.
        """
        return self._clone(options={**self.options, **options})

    def with_tenant(
        self,
        account_id: str,
        collection_id: str | None = None,
        source_id: str | None = None,
    ) -> SourceConfig:
        """Create a copy with tenant information.

        Note: `collection_id`/`source_id` are always overwritten on the
        copy (with None when omitted), matching the original behavior.

        Args:
            account_id: Account/tenant identifier.
            collection_id: Collection identifier.
            source_id: Source identifier.

        Returns:
            New SourceConfig with tenant information.
        """
        return self._clone(
            account_id=account_id,
            collection_id=collection_id,
            source_id=source_id,
        )

    @property
    def is_sitemap(self) -> bool:
        """Check if this is a sitemap source (by type or URL suffix)."""
        return self.source_type == "sitemap" or self.url.endswith("sitemap.xml")

    @property
    def is_website(self) -> bool:
        """Check if this is a website source."""
        return self.source_type == "website"

    @classmethod
    def from_url(cls, url: str, **kwargs: Any) -> SourceConfig:
        """Create a SourceConfig from a URL, auto-detecting source type.

        Any URL containing "sitemap" (case-insensitive) or ending in
        ".xml" is treated as a sitemap; everything else is a website.

        Args:
            url: The source URL.
            **kwargs: Additional configuration options.

        Returns:
            SourceConfig with auto-detected source type.
        """
        source_type = "website"
        if "sitemap" in url.lower() or url.endswith(".xml"):
            source_type = "sitemap"

        return cls(url=url, source_type=source_type, **kwargs)
@@ -0,0 +1,23 @@
1
+ """Event system for decoupled communication (Observer pattern)."""
2
+
3
+ from gnosisllm_knowledge.core.events.emitter import EventEmitter
4
+ from gnosisllm_knowledge.core.events.types import (
5
+ BatchCompletedEvent,
6
+ BatchStartedEvent,
7
+ DocumentIndexedEvent,
8
+ DocumentLoadedEvent,
9
+ Event,
10
+ EventType,
11
+ SitemapDiscoveryEvent,
12
+ )
13
+
14
+ __all__ = [
15
+ "Event",
16
+ "EventType",
17
+ "EventEmitter",
18
+ "DocumentLoadedEvent",
19
+ "DocumentIndexedEvent",
20
+ "SitemapDiscoveryEvent",
21
+ "BatchStartedEvent",
22
+ "BatchCompletedEvent",
23
+ ]