ragit 0.8.2__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff shows changes between publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
ragit/loaders.py CHANGED
@@ -6,15 +6,78 @@
  Document loading and chunking utilities.

  Provides simple functions to load documents from files and chunk text.
+
+ Includes ai4rag-inspired patterns:
+ - Auto-generated document IDs via SHA256 hash
+ - Sequence numbering for chunk ordering
+ - Deduplication via content hashing
  """

+ import hashlib
  import re
  from pathlib import Path
- from typing import Any

  from ragit.core.experiment.experiment import Chunk, Document


+ def generate_document_id(content: str) -> str:
+     """
+     Generate a unique document ID from content using SHA256 hash.
+
+     Pattern from ai4rag langchain_chunker.py.
+
+     Parameters
+     ----------
+     content : str
+         Document content to hash.
+
+     Returns
+     -------
+     str
+         16-character hex string (first 64 bits of SHA256).
+
+     Examples
+     --------
+     >>> doc_id = generate_document_id("Hello, world!")
+     >>> len(doc_id)
+     16
+     """
+     return hashlib.sha256(content.encode()).hexdigest()[:16]
+
+
+ def deduplicate_documents(documents: list[Document]) -> list[Document]:
+     """
+     Remove duplicate documents based on content hash.
+
+     Pattern from ai4rag chroma.py.
+
+     Parameters
+     ----------
+     documents : list[Document]
+         Documents to deduplicate.
+
+     Returns
+     -------
+     list[Document]
+         Unique documents (first occurrence kept).
+
+     Examples
+     --------
+     >>> unique_docs = deduplicate_documents(docs)
+     >>> print(f"Removed {len(docs) - len(unique_docs)} duplicates")
+     """
+     seen_hashes: set[str] = set()
+     unique_docs: list[Document] = []
+
+     for doc in documents:
+         content_hash = generate_document_id(doc.content)
+         if content_hash not in seen_hashes:
+             seen_hashes.add(content_hash)
+             unique_docs.append(doc)
+
+     return unique_docs
+
+
  def load_text(path: str | Path) -> Document:
      """
      Load a single text file as a Document.
@@ -77,11 +140,16 @@ def chunk_text(
      text: str,
      chunk_size: int = 512,
      chunk_overlap: int = 50,
-     doc_id: str = "doc",
-     metadata: dict[str, Any] | None = None,
+     doc_id: str | None = None,
+     include_metadata: bool = True,
  ) -> list[Chunk]:
      """
-     Split text into overlapping chunks.
+     Split text into overlapping chunks with rich metadata.
+
+     Includes ai4rag-inspired metadata:
+     - document_id: SHA256 hash for deduplication and window search
+     - sequence_number: Order within the document
+     - chunk_start/chunk_end: Character positions in original text

      Parameters
      ----------
@@ -91,37 +159,55 @@ def chunk_text(
          Maximum characters per chunk (default: 512).
      chunk_overlap : int
          Overlap between chunks (default: 50).
-     doc_id : str
-         Document ID for the chunks (default: "doc").
-     metadata : dict, optional
-         Metadata to attach to each chunk (default: None).
+     doc_id : str, optional
+         Document ID for the chunks. If None, generates from content hash.
+     include_metadata : bool
+         Include rich metadata in chunks (default: True).

      Returns
      -------
      list[Chunk]
-         List of text chunks.
+         List of text chunks with metadata.

      Examples
      --------
-     >>> chunks = chunk_text("Long document...", chunk_size=256, chunk_overlap=50)
+     >>> chunks = chunk_text("Long document...", chunk_size=256)
+     >>> print(chunks[0].metadata)
+     {'document_id': 'a1b2c3...', 'sequence_number': 0, 'chunk_start': 0, 'chunk_end': 256}
      """
      if chunk_overlap >= chunk_size:
          raise ValueError("chunk_overlap must be less than chunk_size")

+     # Generate document ID if not provided
+     effective_doc_id = doc_id or generate_document_id(text)
+
      chunks = []
      start = 0
-     chunk_idx = 0
-     chunk_metadata = metadata or {}
+     sequence_number = 0

      while start < len(text):
-         end = start + chunk_size
+         end = min(start + chunk_size, len(text))
          chunk_content = text[start:end].strip()

          if chunk_content:
+             metadata = {}
+             if include_metadata:
+                 metadata = {
+                     "document_id": effective_doc_id,
+                     "sequence_number": sequence_number,
+                     "chunk_start": start,
+                     "chunk_end": end,
+                 }
+
              chunks.append(
-                 Chunk(content=chunk_content, doc_id=doc_id, chunk_index=chunk_idx, metadata=chunk_metadata.copy())
+                 Chunk(
+                     content=chunk_content,
+                     doc_id=effective_doc_id,
+                     chunk_index=sequence_number,
+                     metadata=metadata,
+                 )
              )
-             chunk_idx += 1
+             sequence_number += 1

          start = end - chunk_overlap
          if start >= len(text) - chunk_overlap:
@@ -130,9 +216,14 @@ def chunk_text(
      return chunks


- def chunk_document(doc: Document, chunk_size: int = 512, chunk_overlap: int = 50) -> list[Chunk]:
+ def chunk_document(
+     doc: Document,
+     chunk_size: int = 512,
+     chunk_overlap: int = 50,
+     include_metadata: bool = True,
+ ) -> list[Chunk]:
      """
-     Split a Document into overlapping chunks.
+     Split a Document into overlapping chunks with rich metadata.

      Parameters
      ----------
@@ -142,17 +233,29 @@ def chunk_document(doc: Document, chunk_size: int = 512, chunk_overlap: int = 50
          Maximum characters per chunk.
      chunk_overlap : int
          Overlap between chunks.
+     include_metadata : bool
+         Include rich metadata in chunks (default: True).

      Returns
      -------
      list[Chunk]
-         List of chunks from the document.
+         List of chunks from the document with metadata.
      """
-     return chunk_text(doc.content, chunk_size, chunk_overlap, doc.id, metadata=doc.metadata)
+     chunks = chunk_text(doc.content, chunk_size, chunk_overlap, doc.id, include_metadata)
+
+     # Merge document metadata into chunk metadata
+     if doc.metadata and include_metadata:
+         for chunk in chunks:
+             chunk.metadata = {**doc.metadata, **chunk.metadata}
+
+     return chunks


  def chunk_by_separator(
-     text: str, separator: str = "\n\n", doc_id: str = "doc", metadata: dict[str, Any] | None = None
+     text: str,
+     separator: str = "\n\n",
+     doc_id: str | None = None,
+     include_metadata: bool = True,
  ) -> list[Chunk]:
      """
      Split text by a separator (e.g., paragraphs, sections).
@@ -163,64 +266,96 @@ def chunk_by_separator(
          Text to split.
      separator : str
          Separator string (default: double newline for paragraphs).
-     doc_id : str
-         Document ID for the chunks.
-     metadata : dict, optional
-         Metadata to attach to each chunk (default: None).
+     doc_id : str, optional
+         Document ID for the chunks. If None, generates from content hash.
+     include_metadata : bool
+         Include rich metadata in chunks (default: True).

      Returns
      -------
      list[Chunk]
-         List of chunks.
+         List of chunks with metadata.

      Examples
      --------
      >>> chunks = chunk_by_separator(text, separator="\\n---\\n")
      """
+     effective_doc_id = doc_id or generate_document_id(text)
      parts = text.split(separator)
-     chunks = []
-     chunk_metadata = metadata or {}
+     chunks: list[Chunk] = []
+     current_pos = 0

-     for idx, part in enumerate(parts):
+     for _idx, part in enumerate(parts):
          content = part.strip()
          if content:
-             chunks.append(Chunk(content=content, doc_id=doc_id, chunk_index=idx, metadata=chunk_metadata.copy()))
+             metadata = {}
+             if include_metadata:
+                 # Find actual position in original text
+                 part_start = text.find(part, current_pos)
+                 part_end = part_start + len(part) if part_start >= 0 else current_pos + len(part)
+                 metadata = {
+                     "document_id": effective_doc_id,
+                     "sequence_number": len(chunks),
+                     "chunk_start": part_start if part_start >= 0 else current_pos,
+                     "chunk_end": part_end,
+                 }
+                 current_pos = part_end
+
+             chunks.append(
+                 Chunk(
+                     content=content,
+                     doc_id=effective_doc_id,
+                     chunk_index=len(chunks),
+                     metadata=metadata,
+                 )
+             )

      return chunks


- def chunk_rst_sections(text: str, doc_id: str = "doc", metadata: dict[str, Any] | None = None) -> list[Chunk]:
+ def chunk_rst_sections(
+     text: str,
+     doc_id: str | None = None,
+     include_metadata: bool = True,
+ ) -> list[Chunk]:
      """
-     Split RST document by section headers.
+     Split RST document by section headers with rich metadata.

      Parameters
      ----------
      text : str
          RST document text.
-     doc_id : str
-         Document ID for the chunks.
-     metadata : dict, optional
-         Metadata to attach to each chunk (default: None).
+     doc_id : str, optional
+         Document ID for the chunks. If None, generates from content hash.
+     include_metadata : bool
+         Include rich metadata in chunks (default: True).

      Returns
      -------
      list[Chunk]
-         List of section chunks.
+         List of section chunks with metadata.
      """
+     effective_doc_id = doc_id or generate_document_id(text)
+
      # Match RST section headers (title followed by underline of =, -, ~, etc.)
      pattern = r"\n([^\n]+)\n([=\-~`\'\"^_*+#]+)\n"
-     chunk_metadata = metadata or {}

      # Find all section positions
      matches = list(re.finditer(pattern, text))

      if not matches:
          # No sections found, return whole text as one chunk
-         return (
-             [Chunk(content=text.strip(), doc_id=doc_id, chunk_index=0, metadata=chunk_metadata.copy())]
-             if text.strip()
-             else []
-         )
+         if text.strip():
+             metadata = {}
+             if include_metadata:
+                 metadata = {
+                     "document_id": effective_doc_id,
+                     "sequence_number": 0,
+                     "chunk_start": 0,
+                     "chunk_end": len(text),
+                 }
+             return [Chunk(content=text.strip(), doc_id=effective_doc_id, chunk_index=0, metadata=metadata)]
+         return []

      chunks = []

@@ -229,7 +364,15 @@ def chunk_rst_sections(text: str, doc_id: str = "doc", metadata: dict[str, Any]
      if first_pos > 0:
          pre_content = text[:first_pos].strip()
          if pre_content:
-             chunks.append(Chunk(content=pre_content, doc_id=doc_id, chunk_index=0, metadata=chunk_metadata.copy()))
+             metadata = {}
+             if include_metadata:
+                 metadata = {
+                     "document_id": effective_doc_id,
+                     "sequence_number": 0,
+                     "chunk_start": 0,
+                     "chunk_end": first_pos,
+                 }
+             chunks.append(Chunk(content=pre_content, doc_id=effective_doc_id, chunk_index=0, metadata=metadata))

      # Extract each section
      for i, match in enumerate(matches):
@@ -238,8 +381,21 @@ def chunk_rst_sections(text: str, doc_id: str = "doc", metadata: dict[str, Any]

          section_content = text[start:end].strip()
          if section_content:
+             metadata = {}
+             if include_metadata:
+                 metadata = {
+                     "document_id": effective_doc_id,
+                     "sequence_number": len(chunks),
+                     "chunk_start": start,
+                     "chunk_end": end,
+                 }
              chunks.append(
-                 Chunk(content=section_content, doc_id=doc_id, chunk_index=len(chunks), metadata=chunk_metadata.copy())
+                 Chunk(
+                     content=section_content,
+                     doc_id=effective_doc_id,
+                     chunk_index=len(chunks),
+                     metadata=metadata,
+                 )
              )

      return chunks
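
For orientation, a minimal usage sketch of the reworked chunking API above (not part of the diff; the file names and assertions are illustrative assumptions, and it presumes ragit 0.11.0 is installed):

from ragit.loaders import chunk_text, deduplicate_documents, generate_document_id, load_text

# Hypothetical input files; loading the same file twice yields duplicate content.
docs = [load_text("guide.rst"), load_text("guide.rst"), load_text("notes.txt")]
unique_docs = deduplicate_documents(docs)  # first occurrence per content hash is kept
print(f"Removed {len(docs) - len(unique_docs)} duplicates")

# doc_id is now optional; when omitted it falls back to the SHA256-based content hash.
chunks = chunk_text(unique_docs[0].content, chunk_size=256, chunk_overlap=50)
assert chunks[0].metadata["document_id"] == generate_document_id(unique_docs[0].content)
assert [c.metadata["sequence_number"] for c in chunks] == list(range(len(chunks)))
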
ragit/logging.py ADDED
@@ -0,0 +1,194 @@
+ #
+ # Copyright RODMENA LIMITED 2025
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ """
+ Structured logging for ragit.
+
+ Provides consistent logging across all ragit components with:
+ - Operation timing
+ - Context tracking
+ - Configurable log levels
+ """
+
+ import logging
+ import time
+ from collections.abc import Callable, Generator
+ from contextlib import contextmanager
+ from functools import wraps
+ from typing import Any, TypeVar
+
+ # Create ragit logger
+ logger = logging.getLogger("ragit")
+
+ # Type variable for decorated functions
+ F = TypeVar("F", bound=Callable[..., Any])
+
+
+ def setup_logging(level: str = "INFO", format_string: str | None = None) -> None:
+     """Configure ragit logging.
+
+     Parameters
+     ----------
+     level : str
+         Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL).
+     format_string : str, optional
+         Custom format string. If None, uses default format.
+
+     Examples
+     --------
+     >>> from ragit.logging import setup_logging
+     >>> setup_logging("DEBUG")
+     """
+     logger.setLevel(level.upper())
+
+     # Only add handler if none exist
+     if not logger.handlers:
+         handler = logging.StreamHandler()
+         handler.setLevel(level.upper())
+
+         if format_string is None:
+             format_string = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
+         formatter = logging.Formatter(format_string)
+         handler.setFormatter(formatter)
+         logger.addHandler(handler)
+
+
+ @contextmanager
+ def log_operation(operation: str, **context: Any) -> Generator[dict[str, Any], None, None]:
+     """Context manager for logging operations with timing.
+
+     Parameters
+     ----------
+     operation : str
+         Name of the operation being performed.
+     **context
+         Additional context to include in log messages.
+
+     Yields
+     ------
+     dict
+         Mutable dict to add additional context during the operation.
+
+     Examples
+     --------
+     >>> with log_operation("embed", model="nomic-embed-text") as ctx:
+     ...     result = provider.embed(text, model)
+     ...     ctx["dimensions"] = len(result.embedding)
+     """
+     start = time.perf_counter()
+     extra_context: dict[str, Any] = {}
+
+     # Build context string
+     ctx_str = ", ".join(f"{k}={v}" for k, v in context.items()) if context else ""
+
+     logger.debug(f"{operation}.start" + (f" [{ctx_str}]" if ctx_str else ""))
+
+     try:
+         yield extra_context
+         duration_ms = (time.perf_counter() - start) * 1000
+
+         # Combine original context with extra context
+         all_context = {**context, **extra_context, "duration_ms": f"{duration_ms:.2f}"}
+         ctx_str = ", ".join(f"{k}={v}" for k, v in all_context.items())
+
+         logger.info(f"{operation}.success [{ctx_str}]")
+     except Exception as e:
+         duration_ms = (time.perf_counter() - start) * 1000
+         all_context = {**context, **extra_context, "duration_ms": f"{duration_ms:.2f}", "error": str(e)}
+         ctx_str = ", ".join(f"{k}={v}" for k, v in all_context.items())
+
+         logger.error(f"{operation}.failed [{ctx_str}]", exc_info=True)
+         raise
+
+
+ def log_method(operation: str) -> Callable[[F], F]:
+     """Decorator for logging method calls with timing.
+
+     Parameters
+     ----------
+     operation : str
+         Name of the operation for logging.
+
+     Returns
+     -------
+     Callable
+         Decorated function.
+
+     Examples
+     --------
+     >>> class MyProvider:
+     ...     @log_method("embed")
+     ...     def embed(self, text: str, model: str):
+     ...         ...
+     """
+
+     def decorator(func: F) -> F:
+         @wraps(func)
+         def wrapper(*args: Any, **kwargs: Any) -> Any:
+             with log_operation(operation, method=func.__name__):
+                 return func(*args, **kwargs)
+
+         return wrapper  # type: ignore
+
+     return decorator
+
+
+ class LogContext:
+     """Context tracker for correlating related log messages.
+
+     Useful for tracing operations across multiple components.
+
+     Examples
+     --------
+     >>> ctx = LogContext("query-123")
+     >>> ctx.log("Starting retrieval", top_k=5)
+     >>> ctx.log("Retrieved chunks", count=3)
+     """
+
+     def __init__(self, request_id: str | None = None):
+         """Initialize log context.
+
+         Parameters
+         ----------
+         request_id : str, optional
+             Unique identifier for this context. Auto-generated if not provided.
+         """
+         self.request_id = request_id or f"req-{int(time.time() * 1000) % 100000}"
+         self._start_time = time.perf_counter()
+
+     def log(self, message: str, level: str = "INFO", **context: Any) -> None:
+         """Log a message with this context.
+
+         Parameters
+         ----------
+         message : str
+             Log message.
+         level : str
+             Log level (DEBUG, INFO, WARNING, ERROR).
+         **context
+             Additional context key-value pairs.
+         """
+         elapsed_ms = (time.perf_counter() - self._start_time) * 1000
+         ctx_str = ", ".join(f"{k}={v}" for k, v in context.items())
+         full_msg = f"[{self.request_id}] {message}" + (f" [{ctx_str}]" if ctx_str else "") + f" (+{elapsed_ms:.0f}ms)"
+
+         log_level = getattr(logging, level.upper(), logging.INFO)
+         logger.log(log_level, full_msg)
+
+     def debug(self, message: str, **context: Any) -> None:
+         """Log debug message."""
+         self.log(message, "DEBUG", **context)
+
+     def info(self, message: str, **context: Any) -> None:
+         """Log info message."""
+         self.log(message, "INFO", **context)
+
+     def warning(self, message: str, **context: Any) -> None:
+         """Log warning message."""
+         self.log(message, "WARNING", **context)
+
+     def error(self, message: str, **context: Any) -> None:
+         """Log error message."""
+         self.log(message, "ERROR", **context)
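
The new logging module is self-contained; the following minimal sketch (not part of the diff — the operation names, file name, and Retriever class are illustrative assumptions) shows the three entry points working together:

from ragit.logging import LogContext, log_method, log_operation, setup_logging

setup_logging("DEBUG")  # attaches a single StreamHandler to the "ragit" logger

# Timed block; extra context added inside the block appears in the success/failure line.
with log_operation("chunk", path="guide.rst") as ctx:
    ctx["n_chunks"] = 12

# Decorator form wraps a method call in the same log_operation machinery.
class Retriever:
    @log_method("retrieve")
    def retrieve(self, query: str) -> list[str]:
        return []

Retriever().retrieve("what is ragit?")

# Correlated messages share a request id and carry an elapsed-time suffix.
trace = LogContext("query-123")
trace.info("Starting retrieval", top_k=5)
trace.debug("Retrieved chunks", count=3)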