piragi-0.1.0-py3-none-any.whl

ragi/change_detection.py ADDED
@@ -0,0 +1,211 @@
+ """Change detection for automatic updates."""
+
+ import hashlib
+ import os
+ import time
+ from typing import Any, Dict, Optional
+ from urllib.parse import urlparse
+
+ import requests
+
+
+ class ChangeDetector:
+     """Detects changes in files and URLs for automatic updates."""
+
+     @staticmethod
+     def compute_content_hash(content: str) -> str:
+         """
+         Compute SHA256 hash of content.
+
+         Args:
+             content: Content to hash
+
+         Returns:
+             Hex digest of SHA256 hash
+         """
+         return hashlib.sha256(content.encode("utf-8")).hexdigest()
+
+     @staticmethod
+     def is_url(source: str) -> bool:
+         """Check if source is a URL."""
+         parsed = urlparse(source)
+         return parsed.scheme in ("http", "https")
+
+     @staticmethod
+     def check_file_changed(
+         source: str, stored_mtime: Optional[float], stored_hash: str
+     ) -> bool:
+         """
+         Check if a file has changed using mtime and content hash.
+
+         Args:
+             source: File path
+             stored_mtime: Previously stored modification time
+             stored_hash: Previously stored content hash
+
+         Returns:
+             True if file changed, False otherwise
+         """
+         if not os.path.exists(source):
+             return False
+
+         # Quick check: modification time
+         current_mtime = os.path.getmtime(source)
+         if stored_mtime and current_mtime == stored_mtime:
+             # File hasn't been touched, definitely not changed
+             return False
+
+         # Modification time changed, check actual content
+         try:
+             with open(source, "r", encoding="utf-8", errors="ignore") as f:
+                 content = f.read()
+                 current_hash = ChangeDetector.compute_content_hash(content)
+                 return current_hash != stored_hash
+         except Exception:
+             # If we can't read, assume changed to be safe
+             return True
+
+     @staticmethod
+     def check_url_changed(
+         source: str,
+         stored_etag: Optional[str],
+         stored_last_modified: Optional[str],
+         timeout: int = 10,
+     ) -> Dict[str, Any]:
+         """
+         Check if a URL has changed using HTTP headers.
+         Uses conditional requests for minimal latency.
+
+         Args:
+             source: URL
+             stored_etag: Previously stored ETag
+             stored_last_modified: Previously stored Last-Modified
+             timeout: Request timeout in seconds
+
+         Returns:
+             Dict with 'changed' bool and optional 'etag', 'last_modified'
+         """
+         try:
+             headers = {}
+
+             # Add conditional request headers
+             if stored_etag:
+                 headers["If-None-Match"] = stored_etag
+             if stored_last_modified:
+                 headers["If-Modified-Since"] = stored_last_modified
+
+             # Send HEAD request first (faster, no body download)
+             response = requests.head(source, headers=headers, timeout=timeout, allow_redirects=True)
+
+             # 304 Not Modified - content hasn't changed
+             if response.status_code == 304:
+                 return {"changed": False}
+
+             # If HEAD not supported, try GET with same conditional headers
+             if response.status_code == 405:  # Method Not Allowed
+                 response = requests.get(
+                     source, headers=headers, timeout=timeout, stream=True, allow_redirects=True
+                 )
+                 # Close connection immediately without downloading body
+                 response.close()
+
+             # 200 OK - content might have changed
+             if response.status_code == 200:
+                 new_etag = response.headers.get("ETag")
+                 new_last_modified = response.headers.get("Last-Modified")
+
+                 # If server provides ETag or Last-Modified, use them
+                 if new_etag and new_etag == stored_etag:
+                     return {"changed": False}
+                 if new_last_modified and new_last_modified == stored_last_modified:
+                     return {"changed": False}
+
+                 # Headers changed or not available, assume content changed
+                 return {
+                     "changed": True,
+                     "etag": new_etag,
+                     "last_modified": new_last_modified,
+                 }
+
+             # Other status codes - assume changed to be safe
+             return {"changed": True}
+
+         except Exception as e:
+             # Network error - can't verify, assume not changed
+             # This prevents errors from forcing unnecessary updates
+             return {"changed": False, "error": str(e)}
+
+     @staticmethod
+     def get_file_metadata(source: str, content: str) -> Dict[str, Any]:
+         """
+         Get metadata for a file source.
+
+         Args:
+             source: File path
+             content: File content
+
+         Returns:
+             Metadata dict with mtime and content_hash
+         """
+         mtime = os.path.getmtime(source) if os.path.exists(source) else None
+         content_hash = ChangeDetector.compute_content_hash(content)
+
+         return {
+             "source": source,
+             "last_checked": time.time(),
+             "content_hash": content_hash,
+             "mtime": mtime,
+             "etag": None,
+             "last_modified": None,
+             "check_interval": 300.0,  # 5 minutes default
+         }
+
+     @staticmethod
+     def get_url_metadata(
+         source: str, content: str, timeout: int = 10
+     ) -> Dict[str, Any]:
+         """
+         Get metadata for a URL source.
+
+         Args:
+             source: URL
+             content: URL content
+             timeout: Request timeout
+
+         Returns:
+             Metadata dict with etag, last_modified, and content_hash
+         """
+         content_hash = ChangeDetector.compute_content_hash(content)
+
+         # Fetch HTTP headers
+         try:
+             response = requests.head(source, timeout=timeout, allow_redirects=True)
+             etag = response.headers.get("ETag")
+             last_modified = response.headers.get("Last-Modified")
+         except Exception:
+             etag = None
+             last_modified = None
+
+         return {
+             "source": source,
+             "last_checked": time.time(),
+             "content_hash": content_hash,
+             "mtime": None,
+             "etag": etag,
+             "last_modified": last_modified,
+             "check_interval": 300.0,  # 5 minutes default for URLs
+         }
+
+     @staticmethod
+     def should_check_now(last_checked: float, check_interval: float) -> bool:
+         """
+         Determine if enough time has passed to check for updates.
+
+         Args:
+             last_checked: Unix timestamp of last check
+             check_interval: Seconds between checks
+
+         Returns:
+             True if should check now
+         """
+         return (time.time() - last_checked) >= check_interval
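
For orientation, a minimal usage sketch of ChangeDetector (the file name, URL, and stored validators below are illustrative, not part of the package):

    from ragi.change_detection import ChangeDetector

    # File source: capture metadata once, then poll cheaply (mtime first, hash only on change).
    content = open("notes.md", encoding="utf-8").read()  # hypothetical file
    meta = ChangeDetector.get_file_metadata("notes.md", content)
    if ChangeDetector.should_check_now(meta["last_checked"], meta["check_interval"]):
        changed = ChangeDetector.check_file_changed(
            "notes.md", meta["mtime"], meta["content_hash"]
        )

    # URL source: conditional request using stored ETag / Last-Modified validators.
    result = ChangeDetector.check_url_changed(
        "https://example.com/doc.html", stored_etag=None, stored_last_modified=None
    )
    print(result["changed"])

Note the asymmetric failure modes: an unreadable file is reported as changed to be safe, while a network error is reported as unchanged so transient outages do not force re-indexing.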
ragi/chunking.py ADDED
@@ -0,0 +1,150 @@
+ """Smart chunking strategies for documents."""
+
+ import re
+ from typing import List
+
+ from transformers import AutoTokenizer
+
+ from .types import Chunk, Document
+
+
+ class Chunker:
+     """Smart document chunker with markdown awareness."""
+
+     def __init__(
+         self,
+         chunk_size: int = 512,
+         chunk_overlap: int = 50,
+         tokenizer_name: str = "nvidia/llama-embed-nemotron-8b",
+     ) -> None:
+         """
+         Initialize the chunker.
+
+         Args:
+             chunk_size: Target chunk size in tokens
+             chunk_overlap: Number of tokens to overlap between chunks
+             tokenizer_name: Tokenizer to use (default: nvidia/llama-embed-nemotron-8b)
+         """
+         self.chunk_size = chunk_size
+         self.chunk_overlap = chunk_overlap
+         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+
+     def chunk_document(self, document: Document) -> List[Chunk]:
+         """
+         Chunk a document into smaller pieces.
+
+         Args:
+             document: Document to chunk
+
+         Returns:
+             List of chunks
+         """
+         # Split by markdown headers first to respect document structure
+         sections = self._split_by_headers(document.content)
+
+         chunks = []
+         chunk_index = 0
+
+         for section in sections:
+             section_chunks = self._chunk_text(section, document.source, chunk_index)
+             chunks.extend(section_chunks)
+             chunk_index += len(section_chunks)
+
+         # Add document metadata to all chunks
+         for chunk in chunks:
+             chunk.metadata.update(document.metadata)
+
+         return chunks
+
+     def _split_by_headers(self, text: str) -> List[str]:
+         """Split text by markdown headers while preserving structure."""
+         # Pattern to match markdown headers (# Header)
+         header_pattern = r"^(#{1,6}\s+.+)$"
+
+         lines = text.split("\n")
+         sections = []
+         current_section = []
+
+         for line in lines:
+             if re.match(header_pattern, line.strip()):
+                 # Save previous section if it exists
+                 if current_section:
+                     sections.append("\n".join(current_section))
+                 current_section = [line]
+             else:
+                 current_section.append(line)
+
+         # Add the last section
+         if current_section:
+             sections.append("\n".join(current_section))
+
+         return sections if sections else [text]
+
+     def _chunk_text(self, text: str, source: str, start_index: int) -> List[Chunk]:
+         """
+         Chunk text into token-sized pieces with overlap.
+
+         Args:
+             text: Text to chunk
+             source: Source identifier
+             start_index: Starting chunk index
+
+         Returns:
+             List of chunks
+         """
+         tokens = self.tokenizer.encode(text, add_special_tokens=False)
+
+         if len(tokens) <= self.chunk_size:
+             return [
+                 Chunk(
+                     text=text,
+                     source=source,
+                     chunk_index=start_index,
+                     metadata={},
+                 )
+             ]
+
+         chunks = []
+         start = 0
+         chunk_idx = start_index
+
+         while start < len(tokens):
+             end = start + self.chunk_size
+             chunk_tokens = tokens[start:end]
+
+             # Decode back to text
+             chunk_text = self.tokenizer.decode(chunk_tokens, skip_special_tokens=True)
+
+             # Try to break at sentence boundary if possible
+             if end < len(tokens):
+                 chunk_text = self._break_at_sentence(chunk_text)
+
+             chunks.append(
+                 Chunk(
+                     text=chunk_text.strip(),
+                     source=source,
+                     chunk_index=chunk_idx,
+                     metadata={},
+                 )
+             )
+
+             # Move start with overlap
+             start = end - self.chunk_overlap
+             chunk_idx += 1
+
+         return chunks
+
+     def _break_at_sentence(self, text: str) -> str:
+         """Try to break text at a sentence boundary."""
+         # Look for sentence endings
+         sentence_endings = [". ", ".\n", "? ", "?\n", "! ", "!\n"]
+
+         for ending in sentence_endings:
+             if ending in text:
+                 # Find the last occurrence
+                 idx = text.rfind(ending)
+                 if idx > len(text) * 0.5:  # Only if it's in the latter half
+                     return text[: idx + len(ending)]
+
+         # If no good break point, return as is
+         return text
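
A quick sketch of how the chunker is used on its own (the Document constructor fields are inferred from chunk_document, which reads .content, .source, and .metadata; the sample file and text are illustrative):

    from ragi.chunking import Chunker
    from ragi.types import Document

    # First use downloads the nvidia/llama-embed-nemotron-8b tokenizer from Hugging Face.
    chunker = Chunker(chunk_size=512, chunk_overlap=50)

    doc = Document(
        source="guide.md",  # hypothetical source
        content="# Install\n\npip install piragi\n\n# Usage\n\nAsk questions.",
        metadata={"type": "docs"},
    )
    for chunk in chunker.chunk_document(doc):
        print(chunk.chunk_index, chunk.source, len(chunk.text))

Because splitting happens per header section and overlap is applied in token space, a chunk never spans two markdown sections; a section that fits within chunk_size tokens simply becomes a single chunk.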
ragi/core.py ADDED
@@ -0,0 +1,318 @@
+ """Core Ragi class - the main interface."""
+
+ from typing import Any, Dict, List, Optional, Union
+
+ from .chunking import Chunker
+ from .embeddings import EmbeddingGenerator
+ from .loader import DocumentLoader
+ from .retrieval import Retriever
+ from .store import VectorStore
+ from .types import Answer, Document
+ from .async_updater import AsyncUpdater
+ from .change_detection import ChangeDetector
+
+
+ class Ragi:
+     """
+     Zero-setup RAG library with auto-chunking, embeddings, and smart citations.
+
+     Examples:
+         >>> from ragi import Ragi
+         >>>
+         >>> # Simple - uses free local models
+         >>> kb = Ragi("./docs")
+         >>>
+         >>> # Custom config
+         >>> kb = Ragi("./docs", config={
+         ...     "llm": {"model": "gpt-4o-mini"},
+         ...     "embedding": {"device": "cuda"}
+         ... })
+         >>>
+         >>> # Ask questions
+         >>> answer = kb.ask("How do I install this?")
+         >>> print(answer.text)
+         >>>
+         >>> # Callable shorthand
+         >>> answer = kb("What's the API?")
+     """
+
+     def __init__(
+         self,
+         sources: Union[str, List[str], None] = None,
+         persist_dir: str = ".ragi",
+         config: Optional[Dict[str, Any]] = None,
+     ) -> None:
+         """
+         Initialize Ragi with optional document sources.
+
+         Args:
+             sources: File paths, URLs, or glob patterns to load
+             persist_dir: Directory to persist vector database
+             config: Configuration dict with optional sections:
+                 - llm: LLM configuration
+                     - model: Model name (default: "llama3.2")
+                     - base_url: API base URL (default: "http://localhost:11434/v1")
+                     - api_key: API key (default: "not-needed")
+                 - embedding: Embedding configuration
+                     - model: Model name (default: "nvidia/llama-embed-nemotron-8b")
+                     - device: Device to use for local models (default: None for auto-detect)
+                     - base_url: API base URL for remote embeddings (optional)
+                     - api_key: API key for remote embeddings (optional)
+                 - chunk: Chunking configuration
+                     - size: Chunk size in tokens (default: 512)
+                     - overlap: Overlap in tokens (default: 50)
+                 - auto_update: Auto-update configuration (enabled by default)
+                     - enabled: Enable background updates (default: True)
+                     - interval: Check interval in seconds (default: 300)
+                     - workers: Number of background workers (default: 2)
+
+         Examples:
+             >>> # Use defaults
+             >>> kb = Ragi("./docs")
+             >>>
+             >>> # Custom LLM
+             >>> kb = Ragi("./docs", config={
+             ...     "llm": {"model": "gpt-4o-mini", "api_key": "sk-..."}
+             ... })
+             >>>
+             >>> # Full config
+             >>> kb = Ragi("./docs", config={
+             ...     "llm": {"model": "llama3.2"},
+             ...     "embedding": {"device": "cuda"},
+             ...     "chunk": {"size": 1024, "overlap": 200}
+             ... })
+         """
+         # Initialize config
+         cfg = config or {}
+
+         # Initialize components
+         self.loader = DocumentLoader()
+
+         # Chunking
+         chunk_cfg = cfg.get("chunk", {})
+         self.chunker = Chunker(
+             chunk_size=chunk_cfg.get("size", 512),
+             chunk_overlap=chunk_cfg.get("overlap", 50),
+         )
+
+         # Embeddings
+         embed_cfg = cfg.get("embedding", {})
+         self.embedder = EmbeddingGenerator(
+             model=embed_cfg.get("model", "nvidia/llama-embed-nemotron-8b"),
+             device=embed_cfg.get("device"),
+             base_url=embed_cfg.get("base_url"),
+             api_key=embed_cfg.get("api_key"),
+         )
+
+         # Vector store
+         self.store = VectorStore(persist_dir=persist_dir)
+
+         # LLM
+         llm_cfg = cfg.get("llm", {})
+         self.retriever = Retriever(
+             model=llm_cfg.get("model", "llama3.2"),
+             api_key=llm_cfg.get("api_key"),
+             base_url=llm_cfg.get("base_url"),
+         )
+
+         # State for filtering
+         self._filters: Optional[Dict[str, Any]] = None
+
+         # Auto-update setup
+         auto_update_cfg = cfg.get("auto_update", {})
+         self._auto_update_enabled = auto_update_cfg.get("enabled", True)
+         self._updater: Optional[AsyncUpdater] = None
+         self._tracked_sources: Dict[str, Document] = {}
+
+         if self._auto_update_enabled:
+             interval = auto_update_cfg.get("interval", 300.0)
+             workers = auto_update_cfg.get("workers", 2)
+
+             self._updater = AsyncUpdater(
+                 refresh_callback=self._background_refresh,
+                 check_interval=interval,
+                 max_workers=workers,
+             )
+             self._updater.start()
+
+         # Load initial sources if provided
+         if sources:
+             self.add(sources)
+
+     def add(self, sources: Union[str, List[str]]) -> "Ragi":
+         """
+         Add documents to the knowledge base.
+
+         Args:
+             sources: File paths, URLs, or glob patterns
+
+         Returns:
+             Self for chaining
+         """
+         # Load documents
+         documents = self.loader.load(sources)
+
+         # Chunk documents
+         all_chunks = []
+         for doc in documents:
+             chunks = self.chunker.chunk_document(doc)
+             all_chunks.extend(chunks)
+
+         # Generate embeddings
+         chunks_with_embeddings = self.embedder.embed_chunks(all_chunks)
+
+         # Store in vector database
+         self.store.add_chunks(chunks_with_embeddings)
+
+         # Register sources for auto-update
+         if self._auto_update_enabled and self._updater:
+             for doc in documents:
+                 self._tracked_sources[doc.source] = doc
+                 # Register with updater
+                 if ChangeDetector.is_url(doc.source):
+                     metadata = ChangeDetector.get_url_metadata(doc.source, doc.content)
+                 else:
+                     metadata = ChangeDetector.get_file_metadata(doc.source, doc.content)
+
+                 self._updater.register_source(
+                     doc.source, doc.content, check_interval=None
+                 )
+
+         return self
+
+     def _background_refresh(self, source: Union[str, List[str]]) -> None:
+         """
+         Internal method called by background updater.
+         Refreshes sources without user interaction.
+
+         Args:
+             source: Source(s) to refresh
+         """
+         # This is called from background thread, so be careful with state
+         self.refresh(source)
+
+     def ask(
+         self,
+         query: str,
+         top_k: int = 5,
+         system_prompt: Optional[str] = None,
+     ) -> Answer:
+         """
+         Ask a question and get an answer with citations.
+
+         Args:
+             query: Question to ask
+             top_k: Number of relevant chunks to retrieve
+             system_prompt: Optional custom system prompt for answer generation
+
+         Returns:
+             Answer with citations
+         """
+         # Generate query embedding
+         query_embedding = self.embedder.embed_query(query)
+
+         # Search for relevant chunks
+         citations = self.store.search(
+             query_embedding=query_embedding,
+             top_k=top_k,
+             filters=self._filters,
+         )
+
+         # Generate answer
+         answer = self.retriever.generate_answer(
+             query=query,
+             citations=citations,
+             system_prompt=system_prompt,
+         )
+
+         # Reset filters after use
+         self._filters = None
+
+         return answer
+
+     def filter(self, **kwargs: Any) -> "Ragi":
+         """
+         Filter documents by metadata for the next query.
+
+         Args:
+             **kwargs: Metadata key-value pairs to filter by
+
+         Returns:
+             Self for chaining
+
+         Examples:
+             >>> kb.filter(type="api").ask("How does auth work?")
+             >>> kb.filter(source="docs/guide.pdf").ask("What's in the guide?")
+         """
+         self._filters = kwargs
+         return self
+
+     def __call__(self, query: str, top_k: int = 5) -> Answer:
+         """
+         Callable shorthand for ask().
+
+         Args:
+             query: Question to ask
+             top_k: Number of relevant chunks to retrieve
+
+         Returns:
+             Answer with citations
+         """
+         return self.ask(query, top_k=top_k)
+
+     def count(self) -> int:
+         """Return the number of chunks in the knowledge base."""
+         return self.store.count()
+
+     def refresh(self, sources: Union[str, List[str]]) -> "Ragi":
+         """
+         Refresh specific sources by deleting old chunks and re-adding.
+         Useful when documents have been updated.
+
+         Args:
+             sources: File paths, URLs, or glob patterns to refresh
+
+         Returns:
+             Self for chaining
+
+         Examples:
+             >>> # Refresh a single file
+             >>> kb.refresh("./docs/api.md")
+             >>>
+             >>> # Refresh multiple files
+             >>> kb.refresh(["./docs/*.pdf", "./README.md"])
+         """
+         # Load documents to get their actual source paths
+         documents = self.loader.load(sources)
+
+         # Delete old chunks for each source
+         for doc in documents:
+             self.store.delete_by_source(doc.source)
+
+         # Re-add the documents
+         all_chunks = []
+         for doc in documents:
+             chunks = self.chunker.chunk_document(doc)
+             all_chunks.extend(chunks)
+
+         # Generate embeddings
+         chunks_with_embeddings = self.embedder.embed_chunks(all_chunks)
+
+         # Store in vector database
+         self.store.add_chunks(chunks_with_embeddings)
+
+         return self
+
+     def clear(self) -> None:
+         """Clear all data from the knowledge base."""
+         # Stop auto-updater if running
+         if self._updater:
+             self._updater.stop()
+             self._tracked_sources.clear()
+
+         self.store.clear()
+
+     def __del__(self):
+         """Cleanup on deletion."""
+         if hasattr(self, "_updater") and self._updater:
+             self._updater.stop()
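
Putting the three files together, an end-to-end sketch (the paths, URL, and questions are illustrative; the defaults assume an Ollama server at http://localhost:11434/v1 serving llama3.2):

    from ragi import Ragi

    # Index local markdown plus a web page; auto-update is on by default,
    # so a background AsyncUpdater re-checks sources every 300 seconds.
    kb = Ragi(["./docs/*.md", "https://example.com/changelog.html"])

    answer = kb.ask("How do I install this?", top_k=3)
    print(answer.text)

    # Metadata filters apply to the next query only, then reset.
    answer = kb.filter(type="api")("How does auth work?")

    print(kb.count())  # number of chunks currently stored
    kb.clear()         # stops the background updater and wipes the vector store

One caveat visible in the code: filter() stores state on the shared instance and ask() clears it, so filters apply exactly once and are not isolated across threads.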