gnosisllm-knowledge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. gnosisllm_knowledge/__init__.py +152 -0
  2. gnosisllm_knowledge/api/__init__.py +5 -0
  3. gnosisllm_knowledge/api/knowledge.py +548 -0
  4. gnosisllm_knowledge/backends/__init__.py +26 -0
  5. gnosisllm_knowledge/backends/memory/__init__.py +9 -0
  6. gnosisllm_knowledge/backends/memory/indexer.py +384 -0
  7. gnosisllm_knowledge/backends/memory/searcher.py +516 -0
  8. gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
  9. gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
  10. gnosisllm_knowledge/backends/opensearch/config.py +195 -0
  11. gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
  12. gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
  13. gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
  16. gnosisllm_knowledge/chunking/__init__.py +9 -0
  17. gnosisllm_knowledge/chunking/fixed.py +138 -0
  18. gnosisllm_knowledge/chunking/sentence.py +239 -0
  19. gnosisllm_knowledge/cli/__init__.py +18 -0
  20. gnosisllm_knowledge/cli/app.py +509 -0
  21. gnosisllm_knowledge/cli/commands/__init__.py +7 -0
  22. gnosisllm_knowledge/cli/commands/agentic.py +529 -0
  23. gnosisllm_knowledge/cli/commands/load.py +369 -0
  24. gnosisllm_knowledge/cli/commands/search.py +440 -0
  25. gnosisllm_knowledge/cli/commands/setup.py +228 -0
  26. gnosisllm_knowledge/cli/display/__init__.py +5 -0
  27. gnosisllm_knowledge/cli/display/service.py +555 -0
  28. gnosisllm_knowledge/cli/utils/__init__.py +5 -0
  29. gnosisllm_knowledge/cli/utils/config.py +207 -0
  30. gnosisllm_knowledge/core/__init__.py +87 -0
  31. gnosisllm_knowledge/core/domain/__init__.py +43 -0
  32. gnosisllm_knowledge/core/domain/document.py +240 -0
  33. gnosisllm_knowledge/core/domain/result.py +176 -0
  34. gnosisllm_knowledge/core/domain/search.py +327 -0
  35. gnosisllm_knowledge/core/domain/source.py +139 -0
  36. gnosisllm_knowledge/core/events/__init__.py +23 -0
  37. gnosisllm_knowledge/core/events/emitter.py +216 -0
  38. gnosisllm_knowledge/core/events/types.py +226 -0
  39. gnosisllm_knowledge/core/exceptions.py +407 -0
  40. gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
  41. gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
  42. gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
  43. gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
  44. gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
  45. gnosisllm_knowledge/core/interfaces/loader.py +102 -0
  46. gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
  47. gnosisllm_knowledge/core/interfaces/setup.py +164 -0
  48. gnosisllm_knowledge/fetchers/__init__.py +12 -0
  49. gnosisllm_knowledge/fetchers/config.py +77 -0
  50. gnosisllm_knowledge/fetchers/http.py +167 -0
  51. gnosisllm_knowledge/fetchers/neoreader.py +204 -0
  52. gnosisllm_knowledge/loaders/__init__.py +13 -0
  53. gnosisllm_knowledge/loaders/base.py +399 -0
  54. gnosisllm_knowledge/loaders/factory.py +202 -0
  55. gnosisllm_knowledge/loaders/sitemap.py +285 -0
  56. gnosisllm_knowledge/loaders/website.py +57 -0
  57. gnosisllm_knowledge/py.typed +0 -0
  58. gnosisllm_knowledge/services/__init__.py +9 -0
  59. gnosisllm_knowledge/services/indexing.py +387 -0
  60. gnosisllm_knowledge/services/search.py +349 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
  62. gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
  63. gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
  64. gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,112 @@
1
+ """Content fetcher protocol - Single Responsibility Principle."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any, Protocol, runtime_checkable
7
+
8
+
9
@dataclass
class FetchResult:
    """Outcome of fetching content from a single URL.

    Attributes:
        content: The fetched payload (typically text or markdown).
        status_code: HTTP status code or an equivalent numeric status.
        content_type: MIME type reported for the payload.
        url: Final URL after any redirects were followed.
        title: Document title, when one could be extracted.
        metadata: Extra metadata gathered during the fetch.
        encoding: Character encoding of the payload, if known.
        headers: Response headers keyed by header name.
    """

    content: str
    status_code: int
    content_type: str
    url: str
    title: str | None = None
    metadata: dict[str, Any] = field(default_factory=dict)
    encoding: str | None = None
    headers: dict[str, str] = field(default_factory=dict)

    @property
    def is_success(self) -> bool:
        """True when the status code falls in the 2xx range."""
        return 200 <= self.status_code <= 299

    @property
    def is_html(self) -> bool:
        """True when the MIME type mentions HTML."""
        lowered = self.content_type.lower()
        return "html" in lowered

    @property
    def is_text(self) -> bool:
        """True when the MIME type mentions text."""
        lowered = self.content_type.lower()
        return "text" in lowered

    @property
    def content_length(self) -> int:
        """Number of characters in the fetched content."""
        return len(self.content)
54
@runtime_checkable
class IContentFetcher(Protocol):
    """Structural interface for retrieving raw content from URLs.

    A fetcher's job is narrowly scoped, per the Single Responsibility
    Principle: issue the HTTP request, normalize the payload to a
    standard format (e.g. markdown), apply authentication and headers,
    and surface metadata such as the page title. Parsing and chunking
    belong to other components.
    """

    async def fetch(self, url: str, **options: Any) -> FetchResult:
        """Retrieve the content behind *url*.

        Args:
            url: The URL to retrieve.
            **options: Implementation-specific knobs, for example:
                - target_selector: CSS selector for content extraction
                - remove_selector: CSS selector for elements to remove
                - timeout: Request timeout in seconds
                - headers: Additional HTTP headers

        Returns:
            A FetchResult carrying the payload and its metadata.

        Raises:
            ConnectionError: If the URL cannot be reached.
            TimeoutError: If the request times out.
        """
        ...

    async def health_check(self) -> bool:
        """Report whether the underlying fetch service is available.

        Returns:
            True when the service is healthy, False otherwise.
        """
        ...

    async def fetch_batch(
        self,
        urls: list[str],
        max_concurrent: int = 10,
        **options: Any,
    ) -> list[FetchResult | Exception]:
        """Retrieve several URLs concurrently.

        Args:
            urls: URLs to retrieve.
            max_concurrent: Upper bound on in-flight requests.
            **options: Forwarded to every individual fetch call.

        Returns:
            A list holding a FetchResult per successful fetch, or the
            Exception raised for a fetch that failed.
        """
        ...
@@ -0,0 +1,244 @@
1
+ """Document indexer protocol - Interface Segregation Principle."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import AsyncIterator, Callable, Sequence
6
+ from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
7
+
8
+ if TYPE_CHECKING:
9
+ from gnosisllm_knowledge.core.domain.document import Document
10
+ from gnosisllm_knowledge.core.domain.result import BatchResult, IndexResult
11
+
12
+
13
@runtime_checkable
class IDocumentIndexer(Protocol):
    """Structural interface for writing documents into a search backend.

    An indexer covers the write side of the backend:
    - embedding generation for stored documents
    - single, bulk, and streaming ingestion
    - deletion by id, by source, or by arbitrary query
    - index lifecycle management (create/verify, delete, refresh)

    Each operation is exposed as its own focused coroutine, in keeping
    with the Interface Segregation Principle.
    """

    async def index(
        self,
        document: Document,
        index_name: str,
        **options: Any,
    ) -> IndexResult:
        """Write one document to the given index.

        Args:
            document: Document to store.
            index_name: Name of the target index.
            **options: Backend-specific options.

        Returns:
            IndexResult describing success or failure.
        """
        ...

    async def bulk_index(
        self,
        documents: Sequence[Document],
        index_name: str,
        batch_size: int = 100,
        **options: Any,
    ) -> IndexResult:
        """Write many documents efficiently in batches.

        Args:
            documents: Documents to store.
            index_name: Name of the target index.
            batch_size: How many documents go into each batch.
            **options: Backend-specific options.

        Returns:
            A single IndexResult aggregated over every document.
        """
        ...

    async def bulk_index_streaming(
        self,
        documents: AsyncIterator[Document],
        index_name: str,
        batch_size: int = 100,
        max_concurrent_batches: int = 3,
        on_batch_complete: Callable[[BatchResult], None] | None = None,
        **options: Any,
    ) -> IndexResult:
        """Consume an async stream of documents and index it with backpressure.

        Suited to large inputs where materializing the whole document
        list would be too costly.

        Args:
            documents: Async iterator yielding documents to store.
            index_name: Name of the target index.
            batch_size: How many documents go into each batch.
            max_concurrent_batches: Cap on batches in flight at once.
            on_batch_complete: Invoked with each finished BatchResult.
            **options: Backend-specific options.

        Returns:
            A single IndexResult aggregated over every document.
        """
        ...

    async def upsert(
        self,
        document: Document,
        index_name: str,
        **options: Any,
    ) -> IndexResult:
        """Insert the document, or update it if it already exists.

        Args:
            document: Document to write.
            index_name: Name of the target index.
            **options: Backend-specific options.

        Returns:
            IndexResult describing the operation outcome.
        """
        ...

    async def delete(
        self,
        doc_id: str,
        index_name: str,
    ) -> bool:
        """Remove a single document by its id.

        Args:
            doc_id: Identifier of the document to remove.
            index_name: Name of the target index.

        Returns:
            True on deletion, False when no such document exists.
        """
        ...

    async def bulk_delete(
        self,
        doc_ids: Sequence[str],
        index_name: str,
    ) -> int:
        """Remove several documents by id.

        Args:
            doc_ids: Identifiers of the documents to remove.
            index_name: Name of the target index.

        Returns:
            How many documents were removed.
        """
        ...

    async def delete_by_source(
        self,
        source: str,
        index_name: str,
    ) -> int:
        """Remove every document that came from one source.

        Args:
            source: Source identifier to match.
            index_name: Name of the target index.

        Returns:
            How many documents were removed.
        """
        ...

    async def delete_by_query(
        self,
        query: dict[str, Any],
        index_name: str,
    ) -> int:
        """Remove every document matched by a backend query.

        Args:
            query: Query expressed in the backend's native format.
            index_name: Name of the target index.

        Returns:
            How many documents were removed.
        """
        ...

    async def ensure_index(
        self,
        index_name: str,
        **options: Any,
    ) -> bool:
        """Guarantee the index exists with a suitable mapping.

        Creates the index when absent; otherwise checks that the
        current mapping is compatible.

        Args:
            index_name: Name of the index to guarantee.
            **options: Index settings and mapping options.

        Returns:
            True when the index exists or was created successfully.
        """
        ...

    async def delete_index(self, index_name: str) -> bool:
        """Drop an entire index.

        Args:
            index_name: Name of the index to drop.

        Returns:
            True on deletion, False when no such index exists.
        """
        ...

    async def refresh_index(self, index_name: str) -> bool:
        """Refresh the index so newly written documents become searchable.

        Args:
            index_name: Name of the index to refresh.

        Returns:
            True when the refresh succeeded.
        """
        ...

    async def get_document(
        self,
        doc_id: str,
        index_name: str,
    ) -> Document | None:
        """Fetch one document by its id.

        Args:
            doc_id: Identifier of the document to fetch.
            index_name: Name of the index to read from.

        Returns:
            The Document when present, otherwise None.
        """
        ...

    async def document_exists(
        self,
        doc_id: str,
        index_name: str,
    ) -> bool:
        """Tell whether a document with the given id is stored.

        Args:
            doc_id: Identifier to look up.
            index_name: Name of the index to read from.

        Returns:
            True when the document exists.
        """
        ...
@@ -0,0 +1,102 @@
1
+ """Content loader protocol - Interface Segregation Principle."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import AsyncIterator, Callable
6
+ from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
7
+
8
+ if TYPE_CHECKING:
9
+ from gnosisllm_knowledge.core.domain.document import Document
10
+ from gnosisllm_knowledge.core.domain.result import LoadResult, ValidationResult
11
+
12
+
13
@runtime_checkable
class IContentLoader(Protocol):
    """Protocol for loading content from various sources.

    Content loaders are responsible for:
    - Fetching content from a source (URL, file, etc.)
    - Chunking content into documents
    - Supporting both batch and streaming loading

    Implementations should follow the Single Responsibility Principle
    and handle only content loading, not indexing.
    """

    @property
    def name(self) -> str:
        """Return the loader name for registry identification."""
        ...

    def supports(self, source: str) -> bool:
        """Check if this loader supports the given source.

        Args:
            source: The source URL or path.

        Returns:
            True if this loader can handle the source.
        """
        ...

    async def validate_source(self, source: str) -> ValidationResult:
        """Validate that the source is accessible and valid.

        Args:
            source: The source URL or path.

        Returns:
            ValidationResult with validation status and any errors.
        """
        ...

    async def load(self, source: str, **options: Any) -> LoadResult:
        """Load all documents from source.

        Args:
            source: The source URL or path.
            **options: Loader-specific options.

        Returns:
            LoadResult with loaded documents and metadata.
        """
        ...

    # NOTE: declared as a plain ``def`` (not ``async def``) on purpose.
    # An async-generator implementation (``async def`` containing ``yield``)
    # is a synchronous callable returning an AsyncIterator, so ``async def``
    # here would force implementers to return an *awaitable* of an iterator
    # and would make async-generator implementations fail structural type
    # checks. With this spelling callers simply write
    # ``async for doc in loader.load_streaming(source)``.
    def load_streaming(
        self,
        source: str,
        **options: Any,
    ) -> AsyncIterator[Document]:
        """Stream documents from source for memory-efficient processing.

        This method yields documents one at a time, which is more
        memory-efficient for large sources.

        Args:
            source: The source URL or path.
            **options: Loader-specific options.

        Yields:
            Document objects as they are loaded.
        """
        ...

    async def load_with_callback(
        self,
        source: str,
        callback: Callable[[list[Document]], Any],
        batch_size: int = 5,
        **options: Any,
    ) -> int:
        """Load documents with a callback for batch processing.

        Args:
            source: The source URL or path.
            callback: Callback function called with each batch of documents.
            batch_size: Number of documents per batch.
            **options: Loader-specific options.

        Returns:
            Total number of documents loaded.
        """
        ...
@@ -0,0 +1,178 @@
1
+ """Knowledge searcher protocol - Interface Segregation Principle."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
6
+
7
+ if TYPE_CHECKING:
8
+ from gnosisllm_knowledge.core.domain.search import SearchQuery, SearchResult
9
+
10
+
11
@runtime_checkable
class IKnowledgeSearcher(Protocol):
    """Structural interface for querying documents in a search backend.

    A searcher covers the read side of the backend:
    - the three retrieval modes: semantic, keyword, and hybrid
    - query embedding generation
    - result filtering, ranking, and pagination

    Each retrieval mode is exposed as its own focused coroutine, in
    keeping with the Interface Segregation Principle.
    """

    async def search(
        self,
        query: SearchQuery,
        index_name: str,
        **options: Any,
    ) -> SearchResult:
        """Run a search, dispatching on the query's mode.

        Picks the matching retrieval method according to the mode
        carried by the query itself.

        Args:
            query: Search query including filters and options.
            index_name: Name of the index to query.
            **options: Backend-specific options.

        Returns:
            SearchResult containing hits and metadata.
        """
        ...

    async def semantic_search(
        self,
        query: SearchQuery,
        index_name: str,
        **options: Any,
    ) -> SearchResult:
        """Run a pure vector (semantic) search.

        Ranks documents by embedding similarity to the query.

        Args:
            query: Search query.
            index_name: Name of the index to query.
            **options: Backend-specific options.

        Returns:
            SearchResult of semantically similar documents.
        """
        ...

    async def keyword_search(
        self,
        query: SearchQuery,
        index_name: str,
        **options: Any,
    ) -> SearchResult:
        """Run a pure keyword (BM25) search.

        Ranks documents by classic text matching.

        Args:
            query: Search query.
            index_name: Name of the index to query.
            **options: Backend-specific options.

        Returns:
            SearchResult of keyword-matching documents.
        """
        ...

    async def hybrid_search(
        self,
        query: SearchQuery,
        index_name: str,
        **options: Any,
    ) -> SearchResult:
        """Run a combined semantic-plus-keyword search.

        Blends vector similarity with text matching for best results.

        Args:
            query: Search query.
            index_name: Name of the index to query.
            **options: Backend-specific options.

        Returns:
            SearchResult using the combined ranking.
        """
        ...

    async def get_embedding(
        self,
        text: str,
        **options: Any,
    ) -> list[float]:
        """Compute the embedding vector for a piece of text.

        Args:
            text: Text to embed.
            **options: Embedding model options.

        Returns:
            The embedding as a list of floats.
        """
        ...

    async def get_embeddings_batch(
        self,
        texts: list[str],
        batch_size: int = 100,
        **options: Any,
    ) -> list[list[float]]:
        """Compute embeddings for many texts efficiently.

        Args:
            texts: Texts to embed.
            batch_size: How many texts go into each API call.
            **options: Embedding model options.

        Returns:
            One embedding vector per input text.
        """
        ...

    async def get_similar_documents(
        self,
        doc_id: str,
        index_name: str,
        limit: int = 10,
        **options: Any,
    ) -> SearchResult:
        """Look up documents that resemble an existing document.

        Args:
            doc_id: Identifier of the reference document.
            index_name: Name of the index to query.
            limit: Maximum number of results to return.
            **options: Backend-specific options.

        Returns:
            SearchResult of similar documents.
        """
        ...

    async def multi_search(
        self,
        queries: list[SearchQuery],
        index_name: str,
        **options: Any,
    ) -> list[SearchResult]:
        """Run several searches within one backend request.

        Cheaper than issuing each search individually.

        Args:
            queries: Queries to run.
            index_name: Name of the index to query.
            **options: Backend-specific options.

        Returns:
            SearchResults in the same order as the input queries.
        """
        ...