gnosisllm_knowledge-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. gnosisllm_knowledge/__init__.py +152 -0
  2. gnosisllm_knowledge/api/__init__.py +5 -0
  3. gnosisllm_knowledge/api/knowledge.py +548 -0
  4. gnosisllm_knowledge/backends/__init__.py +26 -0
  5. gnosisllm_knowledge/backends/memory/__init__.py +9 -0
  6. gnosisllm_knowledge/backends/memory/indexer.py +384 -0
  7. gnosisllm_knowledge/backends/memory/searcher.py +516 -0
  8. gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
  9. gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
  10. gnosisllm_knowledge/backends/opensearch/config.py +195 -0
  11. gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
  12. gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
  13. gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
  16. gnosisllm_knowledge/chunking/__init__.py +9 -0
  17. gnosisllm_knowledge/chunking/fixed.py +138 -0
  18. gnosisllm_knowledge/chunking/sentence.py +239 -0
  19. gnosisllm_knowledge/cli/__init__.py +18 -0
  20. gnosisllm_knowledge/cli/app.py +509 -0
  21. gnosisllm_knowledge/cli/commands/__init__.py +7 -0
  22. gnosisllm_knowledge/cli/commands/agentic.py +529 -0
  23. gnosisllm_knowledge/cli/commands/load.py +369 -0
  24. gnosisllm_knowledge/cli/commands/search.py +440 -0
  25. gnosisllm_knowledge/cli/commands/setup.py +228 -0
  26. gnosisllm_knowledge/cli/display/__init__.py +5 -0
  27. gnosisllm_knowledge/cli/display/service.py +555 -0
  28. gnosisllm_knowledge/cli/utils/__init__.py +5 -0
  29. gnosisllm_knowledge/cli/utils/config.py +207 -0
  30. gnosisllm_knowledge/core/__init__.py +87 -0
  31. gnosisllm_knowledge/core/domain/__init__.py +43 -0
  32. gnosisllm_knowledge/core/domain/document.py +240 -0
  33. gnosisllm_knowledge/core/domain/result.py +176 -0
  34. gnosisllm_knowledge/core/domain/search.py +327 -0
  35. gnosisllm_knowledge/core/domain/source.py +139 -0
  36. gnosisllm_knowledge/core/events/__init__.py +23 -0
  37. gnosisllm_knowledge/core/events/emitter.py +216 -0
  38. gnosisllm_knowledge/core/events/types.py +226 -0
  39. gnosisllm_knowledge/core/exceptions.py +407 -0
  40. gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
  41. gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
  42. gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
  43. gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
  44. gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
  45. gnosisllm_knowledge/core/interfaces/loader.py +102 -0
  46. gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
  47. gnosisllm_knowledge/core/interfaces/setup.py +164 -0
  48. gnosisllm_knowledge/fetchers/__init__.py +12 -0
  49. gnosisllm_knowledge/fetchers/config.py +77 -0
  50. gnosisllm_knowledge/fetchers/http.py +167 -0
  51. gnosisllm_knowledge/fetchers/neoreader.py +204 -0
  52. gnosisllm_knowledge/loaders/__init__.py +13 -0
  53. gnosisllm_knowledge/loaders/base.py +399 -0
  54. gnosisllm_knowledge/loaders/factory.py +202 -0
  55. gnosisllm_knowledge/loaders/sitemap.py +285 -0
  56. gnosisllm_knowledge/loaders/website.py +57 -0
  57. gnosisllm_knowledge/py.typed +0 -0
  58. gnosisllm_knowledge/services/__init__.py +9 -0
  59. gnosisllm_knowledge/services/indexing.py +387 -0
  60. gnosisllm_knowledge/services/search.py +349 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
  62. gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
  63. gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
  64. gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
gnosisllm_knowledge/cli/utils/config.py
@@ -0,0 +1,207 @@
+"""CLI configuration provider."""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from dotenv import load_dotenv
+
+
+@dataclass
+class CliConfig:
+    """CLI configuration loaded from environment.
+
+    Provides a unified interface for accessing configuration
+    with sensible defaults for CLI operations.
+    """
+
+    # OpenSearch
+    opensearch_host: str = "localhost"
+    opensearch_port: int = 9200
+    opensearch_username: str | None = None
+    opensearch_password: str | None = None
+    opensearch_use_ssl: bool = False
+    opensearch_verify_certs: bool = False
+    opensearch_model_id: str | None = None
+    opensearch_index_name: str = "knowledge"
+    opensearch_pipeline_name: str = "gnosisllm-ingest-pipeline"
+    opensearch_search_pipeline_name: str = "gnosisllm-search-pipeline"
+
+    # OpenAI
+    openai_api_key: str | None = None
+    openai_embedding_model: str = "text-embedding-ada-002"
+    openai_embedding_dimension: int = 1536
+
+    # Agentic Search
+    opensearch_flow_agent_id: str | None = None
+    opensearch_conversational_agent_id: str | None = None
+    agentic_llm_model: str = "gpt-4o"
+    agentic_max_iterations: int = 5
+    agentic_timeout_seconds: int = 60
+
+    # Neoreader
+    neoreader_host: str = "https://api.neoreader.dev"
+
+    @classmethod
+    def from_env(cls, env_file: str | Path | None = None) -> CliConfig:
+        """Load configuration from environment.
+
+        Args:
+            env_file: Optional path to .env file.
+
+        Returns:
+            Configured CliConfig instance.
+        """
+        # Load .env file if it exists
+        if env_file:
+            load_dotenv(env_file)
+        else:
+            load_dotenv()
+
+        return cls(
+            opensearch_host=os.getenv("OPENSEARCH_HOST", "localhost"),
+            opensearch_port=int(os.getenv("OPENSEARCH_PORT", "9200")),
+            opensearch_username=os.getenv("OPENSEARCH_USERNAME"),
+            opensearch_password=os.getenv("OPENSEARCH_PASSWORD"),
+            opensearch_use_ssl=os.getenv("OPENSEARCH_USE_SSL", "false").lower() == "true",
+            opensearch_verify_certs=os.getenv("OPENSEARCH_VERIFY_CERTS", "false").lower()
+            == "true",
+            opensearch_model_id=os.getenv("OPENSEARCH_MODEL_ID"),
+            opensearch_index_name=os.getenv("OPENSEARCH_INDEX_NAME", "knowledge"),
+            opensearch_pipeline_name=os.getenv(
+                "OPENSEARCH_PIPELINE_NAME", "gnosisllm-ingest-pipeline"
+            ),
+            opensearch_search_pipeline_name=os.getenv(
+                "OPENSEARCH_SEARCH_PIPELINE_NAME", "gnosisllm-search-pipeline"
+            ),
+            openai_api_key=os.getenv("OPENAI_API_KEY"),
+            openai_embedding_model=os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002"),
+            openai_embedding_dimension=int(os.getenv("OPENAI_EMBEDDING_DIMENSION", "1536")),
+            # Agentic search configuration
+            opensearch_flow_agent_id=os.getenv("OPENSEARCH_FLOW_AGENT_ID"),
+            opensearch_conversational_agent_id=os.getenv("OPENSEARCH_CONVERSATIONAL_AGENT_ID"),
+            agentic_llm_model=os.getenv("AGENTIC_LLM_MODEL", "gpt-4o"),
+            agentic_max_iterations=int(os.getenv("AGENTIC_MAX_ITERATIONS", "5")),
+            agentic_timeout_seconds=int(os.getenv("AGENTIC_TIMEOUT_SECONDS", "60")),
+            neoreader_host=os.getenv("NEOREADER_HOST", "https://api.neoreader.dev"),
+        )
+
+    def get(self, key: str, default: Any = None) -> Any:
+        """Get configuration value by key.
+
+        Args:
+            key: Configuration key (e.g., "OPENSEARCH_HOST").
+            default: Default value if not found.
+
+        Returns:
+            Configuration value or default.
+        """
+        # Convert env-style key to attribute name
+        attr_name = key.lower()
+        return getattr(self, attr_name, default)
+
+    def require(self, key: str) -> str:
+        """Get required configuration value.
+
+        Args:
+            key: Configuration key.
+
+        Returns:
+            Configuration value.
+
+        Raises:
+            ValueError: If value is not set.
+        """
+        value = self.get(key)
+        if not value:
+            raise ValueError(f"{key} is required but not set")
+        return str(value)
+
+    @property
+    def opensearch_url(self) -> str:
+        """Get OpenSearch URL."""
+        protocol = "https" if self.opensearch_use_ssl else "http"
+        return f"{protocol}://{self.opensearch_host}:{self.opensearch_port}"
+
+    def validate_for_setup(self) -> list[str]:
+        """Validate configuration for setup command.
+
+        Returns:
+            List of validation errors (empty if valid).
+        """
+        errors = []
+        if not self.openai_api_key:
+            errors.append("OPENAI_API_KEY is required for setup")
+        return errors
+
+    def validate_for_search(self) -> list[str]:
+        """Validate configuration for search command.
+
+        Returns:
+            List of validation errors (empty if valid).
+        """
+        errors = []
+        if not self.opensearch_model_id:
+            errors.append(
+                "OPENSEARCH_MODEL_ID is required for semantic/hybrid search. "
+                "Run 'gnosisllm-knowledge setup' first."
+            )
+        return errors
+
+    def validate_for_agentic_search(self, agent_type: str = "flow") -> list[str]:
+        """Validate configuration for agentic search.
+
+        Args:
+            agent_type: Type of agent ('flow' or 'conversational').
+
+        Returns:
+            List of validation errors (empty if valid).
+        """
+        errors = self.validate_for_search()
+
+        if agent_type == "flow" and not self.opensearch_flow_agent_id:
+            errors.append(
+                "OPENSEARCH_FLOW_AGENT_ID is required for flow agent search. "
+                "Run 'gnosisllm-knowledge agentic setup' first."
+            )
+        elif agent_type == "conversational" and not self.opensearch_conversational_agent_id:
+            errors.append(
+                "OPENSEARCH_CONVERSATIONAL_AGENT_ID is required for conversational agent search. "
+                "Run 'gnosisllm-knowledge agentic setup' first."
+            )
+
+        return errors
+
+    def validate_for_agentic_setup(self) -> list[str]:
+        """Validate configuration for agentic setup command.
+
+        Returns:
+            List of validation errors (empty if valid).
+        """
+        errors = self.validate_for_setup()
+
+        if not self.opensearch_model_id:
+            errors.append(
+                "OPENSEARCH_MODEL_ID is required for agentic setup. "
+                "Run 'gnosisllm-knowledge setup' first to deploy the embedding model."
+            )
+
+        return errors
+
+    @property
+    def has_agentic_agents(self) -> bool:
+        """Check if any agentic agent is configured."""
+        return bool(self.opensearch_flow_agent_id or self.opensearch_conversational_agent_id)
+
+    @property
+    def has_flow_agent(self) -> bool:
+        """Check if flow agent is configured."""
+        return bool(self.opensearch_flow_agent_id)
+
+    @property
+    def has_conversational_agent(self) -> bool:
+        """Check if conversational agent is configured."""
+        return bool(self.opensearch_conversational_agent_id)
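
Usage sketch (annotation, not part of the packaged diff): CliConfig is loaded once from the environment and then validated per command. The snippet below uses only methods defined in the file above (from_env, validate_for_search, require, and the opensearch_url property); the ".env" path and the exit-on-error handling are illustrative assumptions.

from gnosisllm_knowledge.cli.utils.config import CliConfig

# Load settings from a .env file (falls back to load_dotenv() discovery when omitted).
config = CliConfig.from_env(".env")

# Each command validates only what it needs; errors come back as readable strings.
errors = config.validate_for_search()
if errors:
    raise SystemExit("\n".join(errors))

print(config.opensearch_url)                    # e.g. http://localhost:9200
print(config.require("OPENSEARCH_INDEX_NAME"))  # raises ValueError if unset or empty
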
gnosisllm_knowledge/core/__init__.py
@@ -0,0 +1,87 @@
+"""Core module - Foundation layer with domain models, interfaces, and events."""
+
+from gnosisllm_knowledge.core.domain import (
+    AgenticSearchQuery,
+    AgenticSearchResult,
+    AgentType,
+    BatchResult,
+    Document,
+    DocumentStatus,
+    IndexResult,
+    LoadResult,
+    ReasoningStep,
+    SearchMode,
+    SearchQuery,
+    SearchResult,
+    SearchResultItem,
+    SourceConfig,
+    TextChunk,
+    ValidationResult,
+)
+from gnosisllm_knowledge.core.events import Event, EventEmitter, EventType
+from gnosisllm_knowledge.core.exceptions import (
+    AuthenticationError,
+    AuthorizationError,
+    ConfigurationError,
+    ConnectionError,
+    EmbeddingError,
+    IndexError,
+    KnowledgeError,
+    LoadError,
+    SearchError,
+    SetupError,
+    TimeoutError,
+    ValidationError,
+)
+from gnosisllm_knowledge.core.interfaces import (
+    IAgenticSearcher,
+    IContentFetcher,
+    IContentLoader,
+    IDocumentIndexer,
+    IKnowledgeSearcher,
+    ITextChunker,
+)
+
+__all__ = [
+    # Domain models
+    "Document",
+    "DocumentStatus",
+    "TextChunk",
+    "LoadResult",
+    "IndexResult",
+    "BatchResult",
+    "ValidationResult",
+    "SearchQuery",
+    "SearchResult",
+    "SearchResultItem",
+    "SearchMode",
+    "AgenticSearchQuery",
+    "AgenticSearchResult",
+    "AgentType",
+    "ReasoningStep",
+    "SourceConfig",
+    # Events
+    "Event",
+    "EventType",
+    "EventEmitter",
+    # Exceptions
+    "KnowledgeError",
+    "ConfigurationError",
+    "ConnectionError",
+    "AuthenticationError",
+    "AuthorizationError",
+    "LoadError",
+    "ValidationError",
+    "IndexError",
+    "SearchError",
+    "EmbeddingError",
+    "SetupError",
+    "TimeoutError",
+    # Interfaces
+    "IContentLoader",
+    "IContentFetcher",
+    "ITextChunker",
+    "IDocumentIndexer",
+    "IKnowledgeSearcher",
+    "IAgenticSearcher",
+]
gnosisllm_knowledge/core/domain/__init__.py
@@ -0,0 +1,43 @@
+"""Domain models - Value objects and entities."""
+
+from gnosisllm_knowledge.core.domain.document import Document, DocumentStatus, TextChunk
+from gnosisllm_knowledge.core.domain.result import (
+    BatchResult,
+    IndexResult,
+    LoadResult,
+    ValidationResult,
+)
+from gnosisllm_knowledge.core.domain.search import (
+    AgenticSearchQuery,
+    AgenticSearchResult,
+    AgentType,
+    ReasoningStep,
+    SearchMode,
+    SearchQuery,
+    SearchResult,
+    SearchResultItem,
+)
+from gnosisllm_knowledge.core.domain.source import SourceConfig
+
+__all__ = [
+    # Document
+    "Document",
+    "DocumentStatus",
+    "TextChunk",
+    # Result
+    "LoadResult",
+    "IndexResult",
+    "BatchResult",
+    "ValidationResult",
+    # Search
+    "SearchQuery",
+    "SearchResult",
+    "SearchResultItem",
+    "SearchMode",
+    "AgenticSearchQuery",
+    "AgenticSearchResult",
+    "AgentType",
+    "ReasoningStep",
+    # Source
+    "SourceConfig",
+]
gnosisllm_knowledge/core/domain/document.py
@@ -0,0 +1,240 @@
+"""Document domain models."""
+
+from __future__ import annotations
+
+import hashlib
+from dataclasses import dataclass, field
+from datetime import UTC, datetime
+from enum import Enum
+from typing import Any
+
+
+class DocumentStatus(Enum):
+    """Document processing status."""
+
+    PENDING = "pending"
+    PROCESSING = "processing"
+    INDEXED = "indexed"
+    FAILED = "failed"
+    DELETED = "deleted"
+
+
+@dataclass
+class Document:
+    """Represents a document to be indexed.
+
+    This is the core domain object that flows through the knowledge pipeline.
+    Documents are created by loaders, processed by chunkers, and stored by indexers.
+
+    Attributes:
+        content: The main text content of the document.
+        source: Source identifier (URL, file path, etc.).
+        doc_id: Unique identifier. Auto-generated from content hash if not provided.
+        title: Optional document title.
+        url: URL where the document was fetched from.
+        metadata: Arbitrary metadata dictionary.
+
+    Multi-tenancy fields:
+        account_id: Account/tenant identifier.
+        collection_id: Collection the document belongs to.
+        source_id: Source identifier within the collection.
+
+    Chunking info:
+        chunk_index: Index of this chunk (0-based).
+        total_chunks: Total number of chunks for the parent document.
+        parent_doc_id: Reference to the original document ID.
+
+    Quality and validation:
+        quality_score: Quality score from 0.0 to 1.0.
+        language: Detected language code (ISO 639-1).
+        content_hash: SHA-256 hash for deduplication.
+        word_count: Number of words in content.
+
+    Status:
+        status: Current processing status.
+
+    PII handling:
+        pii_detected: Whether PII was detected.
+        pii_redacted: Whether PII was redacted.
+
+    Timestamps:
+        created_at: When the document was created.
+        updated_at: When the document was last updated.
+        indexed_at: When the document was indexed.
+    """
+
+    content: str
+    source: str
+    doc_id: str | None = None
+    title: str | None = None
+    url: str | None = None
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+    # Multi-tenancy fields
+    account_id: str | None = None
+    collection_id: str | None = None
+    source_id: str | None = None
+
+    # Chunking info
+    chunk_index: int | None = None
+    total_chunks: int | None = None
+    parent_doc_id: str | None = None
+
+    # Quality and validation
+    quality_score: float | None = None
+    language: str | None = None
+    content_hash: str | None = None
+    word_count: int | None = None
+
+    # Status
+    status: DocumentStatus = DocumentStatus.PENDING
+
+    # PII handling
+    pii_detected: bool = False
+    pii_redacted: bool = False
+
+    # Timestamps
+    created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
+    updated_at: datetime | None = None
+    indexed_at: datetime | None = None
+
+    def __post_init__(self) -> None:
+        """Generate doc_id and content_hash if not provided."""
+        if not self.content:
+            raise ValueError("Document content cannot be empty")
+
+        # Generate content hash for deduplication
+        if self.content_hash is None:
+            self.content_hash = hashlib.sha256(self.content.encode()).hexdigest()
+
+        # Generate doc_id from content hash if not provided
+        if self.doc_id is None:
+            self.doc_id = f"{self.source}#{self.content_hash[:16]}"
+
+        # Calculate word count
+        if self.word_count is None:
+            self.word_count = len(self.content.split())
+
+    def with_chunk_info(
+        self,
+        chunk_index: int,
+        total_chunks: int,
+        parent_doc_id: str | None = None,
+    ) -> Document:
+        """Create a new document with chunk information.
+
+        Args:
+            chunk_index: Index of this chunk (0-based).
+            total_chunks: Total number of chunks.
+            parent_doc_id: Reference to the original document ID.
+
+        Returns:
+            New Document instance with chunk information set.
+        """
+        return Document(
+            content=self.content,
+            source=self.source,
+            doc_id=None,  # Will be regenerated
+            title=self.title,
+            url=self.url,
+            metadata=self.metadata.copy(),
+            account_id=self.account_id,
+            collection_id=self.collection_id,
+            source_id=self.source_id,
+            chunk_index=chunk_index,
+            total_chunks=total_chunks,
+            parent_doc_id=parent_doc_id or self.doc_id,
+            quality_score=self.quality_score,
+            language=self.language,
+            status=self.status,
+            pii_detected=self.pii_detected,
+            pii_redacted=self.pii_redacted,
+            created_at=self.created_at,
+        )
+
+    def with_tenant(
+        self,
+        account_id: str,
+        collection_id: str | None = None,
+        source_id: str | None = None,
+    ) -> Document:
+        """Create a new document with tenant information.
+
+        Args:
+            account_id: Account/tenant identifier.
+            collection_id: Collection identifier.
+            source_id: Source identifier.
+
+        Returns:
+            New Document instance with tenant information set.
+        """
+        return Document(
+            content=self.content,
+            source=self.source,
+            doc_id=self.doc_id,
+            title=self.title,
+            url=self.url,
+            metadata=self.metadata.copy(),
+            account_id=account_id,
+            collection_id=collection_id or self.collection_id,
+            source_id=source_id or self.source_id,
+            chunk_index=self.chunk_index,
+            total_chunks=self.total_chunks,
+            parent_doc_id=self.parent_doc_id,
+            quality_score=self.quality_score,
+            language=self.language,
+            content_hash=self.content_hash,
+            word_count=self.word_count,
+            status=self.status,
+            pii_detected=self.pii_detected,
+            pii_redacted=self.pii_redacted,
+            created_at=self.created_at,
+            updated_at=self.updated_at,
+            indexed_at=self.indexed_at,
+        )
+
+    @property
+    def is_chunk(self) -> bool:
+        """Check if this document is a chunk of a larger document."""
+        return self.chunk_index is not None and self.total_chunks is not None
+
+    @property
+    def is_multi_tenant(self) -> bool:
+        """Check if this document has tenant information."""
+        return self.account_id is not None
+
+
+@dataclass
+class TextChunk:
+    """Represents a chunk of text from a document.
+
+    Text chunks are created by chunkers to split large documents into
+    smaller, embedding-friendly pieces.
+
+    Attributes:
+        content: The text content of the chunk.
+        index: Index of this chunk (0-based).
+        start_position: Start position in the original text.
+        end_position: End position in the original text.
+        metadata: Optional metadata for the chunk.
+    """
+
+    content: str
+    index: int
+    start_position: int
+    end_position: int
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+    @property
+    def length(self) -> int:
+        """Return the length of the chunk content."""
+        return len(self.content)
+
+    def __post_init__(self) -> None:
+        """Validate chunk data."""
+        if self.start_position < 0:
+            raise ValueError("start_position must be non-negative")
+        if self.end_position < self.start_position:
+            raise ValueError("end_position must be >= start_position")
+        if self.index < 0:
+            raise ValueError("index must be non-negative")