ragit 0.8.1__py3-none-any.whl → 0.8.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- ragit/assistant.py +139 -4
- ragit/core/experiment/experiment.py +3 -4
- ragit/loaders.py +37 -11
- ragit/version.py +1 -1
- {ragit-0.8.1.dist-info → ragit-0.8.2.dist-info}/METADATA +1 -1
- {ragit-0.8.1.dist-info → ragit-0.8.2.dist-info}/RECORD +9 -9
- {ragit-0.8.1.dist-info → ragit-0.8.2.dist-info}/WHEEL +1 -1
- {ragit-0.8.1.dist-info → ragit-0.8.2.dist-info}/licenses/LICENSE +0 -0
- {ragit-0.8.1.dist-info → ragit-0.8.2.dist-info}/top_level.txt +0 -0
ragit/assistant.py
CHANGED
@@ -116,8 +116,7 @@ class RAGAssistant:
             # Use explicit provider
             if not isinstance(provider, BaseEmbeddingProvider):
                 raise ValueError(
-                    "Provider must implement BaseEmbeddingProvider for embeddings. "
-                    "Alternatively, provide embed_fn."
+                    "Provider must implement BaseEmbeddingProvider for embeddings. Alternatively, provide embed_fn."
                 )
             self._embedding_provider = provider
             if isinstance(provider, BaseLLMProvider):
@@ -156,7 +155,20 @@ class RAGAssistant:

         if path.is_dir():
             docs: list[Document] = []
-            for pattern in (
+            for pattern in (
+                "*.txt",
+                "*.md",
+                "*.rst",
+                "*.py",
+                "*.js",
+                "*.ts",
+                "*.go",
+                "*.java",
+                "*.c",
+                "*.cpp",
+                "*.h",
+                "*.hpp",
+            ):
                 docs.extend(load_directory(path, pattern))
             return docs
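This hunk widens the set of file types indexed from a directory to include common source-code extensions. For reference, a minimal sketch of the equivalent loading loop using load_directory from ragit.loaders; the docs/ path is hypothetical:

from pathlib import Path

from ragit.loaders import load_directory

path = Path("docs")  # hypothetical source tree
docs = []
# Mirrors the pattern tuple added above: plain text, markup, and now
# common source-code extensions.
for pattern in ("*.txt", "*.md", "*.rst", "*.py", "*.js", "*.ts",
                "*.go", "*.java", "*.c", "*.cpp", "*.h", "*.hpp"):
    docs.extend(load_directory(path, pattern))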
@@ -169,7 +181,7 @@ class RAGAssistant:
         for doc in self.documents:
             # Use RST section chunking for .rst files, otherwise regular chunking
             if doc.metadata.get("filename", "").endswith(".rst"):
-                chunks = chunk_rst_sections(doc.content, doc.id)
+                chunks = chunk_rst_sections(doc.content, doc.id, metadata=doc.metadata)
             else:
                 chunks = chunk_document(doc, self.chunk_size, self.chunk_overlap)
             all_chunks.extend(chunks)
@@ -194,6 +206,129 @@ class RAGAssistant:
         self._chunks = tuple(all_chunks)
         self._embedding_matrix = embedding_matrix / norms

+    def add_documents(self, documents: list[Document] | str | Path) -> int:
+        """Add documents to the existing index incrementally.
+
+        Args:
+            documents: Documents to add.
+
+        Returns:
+            Number of chunks added.
+        """
+        new_docs = self._load_documents(documents)
+        if not new_docs:
+            return 0
+
+        self.documents.extend(new_docs)
+
+        # Chunk new docs
+        new_chunks: list[Chunk] = []
+        for doc in new_docs:
+            if doc.metadata.get("filename", "").endswith(".rst"):
+                chunks = chunk_rst_sections(doc.content, doc.id, metadata=doc.metadata)
+            else:
+                chunks = chunk_document(doc, self.chunk_size, self.chunk_overlap)
+            new_chunks.extend(chunks)
+
+        if not new_chunks:
+            return 0
+
+        # Embed new chunks
+        texts = [chunk.content for chunk in new_chunks]
+        responses = self._embedding_provider.embed_batch(texts, self.embedding_model)
+
+        new_matrix = np.array([response.embedding for response in responses], dtype=np.float64)
+
+        # Normalize
+        norms = np.linalg.norm(new_matrix, axis=1, keepdims=True)
+        norms[norms == 0] = 1
+        new_matrix_norm = new_matrix / norms
+
+        # Update state
+        current_chunks = list(self._chunks)
+        current_chunks.extend(new_chunks)
+        self._chunks = tuple(current_chunks)
+
+        if self._embedding_matrix is None:
+            self._embedding_matrix = new_matrix_norm
+        else:
+            self._embedding_matrix = np.vstack((self._embedding_matrix, new_matrix_norm))
+
+        return len(new_chunks)
+
+    def remove_documents(self, source_path_pattern: str) -> int:
+        """Remove documents matching a source path pattern.
+
+        Args:
+            source_path_pattern: Glob pattern to match 'source' metadata.
+
+        Returns:
+            Number of chunks removed.
+        """
+        import fnmatch
+
+        if not self._chunks:
+            return 0
+
+        indices_to_keep = []
+        kept_chunks = []
+        removed_count = 0
+
+        for i, chunk in enumerate(self._chunks):
+            source = chunk.metadata.get("source", "")
+            if not source or not fnmatch.fnmatch(source, source_path_pattern):
+                indices_to_keep.append(i)
+                kept_chunks.append(chunk)
+            else:
+                removed_count += 1
+
+        if removed_count == 0:
+            return 0
+
+        self._chunks = tuple(kept_chunks)
+
+        if self._embedding_matrix is not None:
+            if not kept_chunks:
+                self._embedding_matrix = None
+            else:
+                self._embedding_matrix = self._embedding_matrix[indices_to_keep]
+
+        # Also remove from self.documents
+        self.documents = [
+            doc for doc in self.documents if not fnmatch.fnmatch(doc.metadata.get("source", ""), source_path_pattern)
+        ]
+
+        return removed_count
+
+    def update_documents(self, documents: list[Document] | str | Path) -> int:
+        """Update existing documents (remove old, add new).
+
+        Uses document source path to identify what to remove.
+
+        Args:
+            documents: New versions of documents.
+
+        Returns:
+            Number of chunks added.
+        """
+        new_docs = self._load_documents(documents)
+        if not new_docs:
+            return 0
+
+        # Identify sources to remove
+        sources_to_remove = set()
+        for doc in new_docs:
+            source = doc.metadata.get("source")
+            if source:
+                sources_to_remove.add(source)
+
+        # Remove old versions
+        for source in sources_to_remove:
+            self.remove_documents(source)
+
+        # Add new versions
+        return self.add_documents(new_docs)
+
     def retrieve(self, query: str, top_k: int = 3) -> list[tuple[Chunk, float]]:
         """
         Retrieve relevant chunks for a query.
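To show how the three methods added above compose, a minimal sketch; the constructor call and file paths are hypothetical, while the method names, argument types, and return values come from the diff:

from ragit.assistant import RAGAssistant

# Hypothetical construction; only the three method calls below are the API
# introduced in 0.8.2. Each returns a chunk count.
assistant = RAGAssistant("docs/")

assistant.add_documents("notes/")            # index new files incrementally
assistant.remove_documents("docs/old/*.md")  # fnmatch glob against chunk 'source' metadata
assistant.update_documents("docs/guide.md")  # remove old chunks for the source, then re-add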
ragit/core/experiment/experiment.py
CHANGED

@@ -51,6 +51,7 @@ class Chunk:
     doc_id: str
     chunk_index: int
     embedding: tuple[float, ...] | list[float] | None = None
+    metadata: dict[str, Any] = field(default_factory=dict)


 @dataclass
@@ -203,8 +204,7 @@ class RagitExperiment:
         elif provider is not None:
             if not isinstance(provider, BaseEmbeddingProvider):
                 raise ValueError(
-                    "Provider must implement BaseEmbeddingProvider for embeddings. "
-                    "Alternatively, provide embed_fn."
+                    "Provider must implement BaseEmbeddingProvider for embeddings. Alternatively, provide embed_fn."
                 )
             self._embedding_provider = provider
             if isinstance(provider, BaseLLMProvider):

@@ -220,8 +220,7 @@ class RagitExperiment:
         # LLM is required for evaluation
         if self._llm_provider is None:
             raise ValueError(
-                "RagitExperiment requires LLM for evaluation. "
-                "Provide generate_fn or a provider with LLM support."
+                "RagitExperiment requires LLM for evaluation. Provide generate_fn or a provider with LLM support."
             )

     @property
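For illustration, a small sketch of the Chunk dataclass with its new metadata field; the field names come from the hunks above, and the values are made up:

from ragit.core.experiment.experiment import Chunk

# Illustrative values; metadata defaults to an empty dict when omitted.
chunk = Chunk(
    content="Installation\n============\n...",
    doc_id="guide",
    chunk_index=0,
    metadata={"source": "docs/guide.rst", "filename": "guide.rst"},
)
print(chunk.metadata["source"])  # docs/guide.rst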
ragit/loaders.py
CHANGED
@@ -10,6 +10,7 @@ Provides simple functions to load documents from files and chunk text.

 import re
 from pathlib import Path
+from typing import Any

 from ragit.core.experiment.experiment import Chunk, Document

@@ -72,7 +73,13 @@ def load_directory(path: str | Path, pattern: str = "*.txt", recursive: bool = F
     return documents


-def chunk_text(text: str, chunk_size: int = 512, chunk_overlap: int = 50, doc_id: str = "doc") -> list[Chunk]:
+def chunk_text(
+    text: str,
+    chunk_size: int = 512,
+    chunk_overlap: int = 50,
+    doc_id: str = "doc",
+    metadata: dict[str, Any] | None = None,
+) -> list[Chunk]:
     """
     Split text into overlapping chunks.

@@ -86,6 +93,8 @@ def chunk_text(text: str, chunk_size: int = 512, chunk_overlap: int = 50, doc_id
         Overlap between chunks (default: 50).
     doc_id : str
         Document ID for the chunks (default: "doc").
+    metadata : dict, optional
+        Metadata to attach to each chunk (default: None).

     Returns
     -------
@@ -102,13 +111,16 @@ def chunk_text(text: str, chunk_size: int = 512, chunk_overlap: int = 50, doc_id
     chunks = []
     start = 0
     chunk_idx = 0
+    chunk_metadata = metadata or {}

     while start < len(text):
         end = start + chunk_size
-
+        chunk_content = text[start:end].strip()

-        if
-            chunks.append(
+        if chunk_content:
+            chunks.append(
+                Chunk(content=chunk_content, doc_id=doc_id, chunk_index=chunk_idx, metadata=chunk_metadata.copy())
+            )
             chunk_idx += 1

         start = end - chunk_overlap
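A short usage sketch of the updated chunk_text signature; the input text and metadata values are illustrative:

from ragit.loaders import chunk_text

text = "word " * 500  # illustrative input
chunks = chunk_text(
    text,
    chunk_size=512,
    chunk_overlap=50,
    doc_id="doc-1",
    metadata={"source": "notes/example.txt"},
)
# Windows advance by chunk_size - chunk_overlap = 462 characters, and every
# chunk receives its own copy of the metadata dict.
assert all(c.metadata["source"] == "notes/example.txt" for c in chunks)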
@@ -136,10 +148,12 @@ def chunk_document(doc: Document, chunk_size: int = 512, chunk_overlap: int = 50
     list[Chunk]
         List of chunks from the document.
     """
-    return chunk_text(doc.content, chunk_size, chunk_overlap, doc.id)
+    return chunk_text(doc.content, chunk_size, chunk_overlap, doc.id, metadata=doc.metadata)


-def chunk_by_separator(text: str, separator: str = "\n\n", doc_id: str = "doc") -> list[Chunk]:
+def chunk_by_separator(
+    text: str, separator: str = "\n\n", doc_id: str = "doc", metadata: dict[str, Any] | None = None
+) -> list[Chunk]:
     """
     Split text by a separator (e.g., paragraphs, sections).
@@ -151,6 +165,8 @@ def chunk_by_separator(text: str, separator: str = "\n\n", doc_id: str = "doc")
         Separator string (default: double newline for paragraphs).
     doc_id : str
         Document ID for the chunks.
+    metadata : dict, optional
+        Metadata to attach to each chunk (default: None).

     Returns
     -------
@@ -163,16 +179,17 @@ def chunk_by_separator(text: str, separator: str = "\n\n", doc_id: str = "doc")
     """
     parts = text.split(separator)
     chunks = []
+    chunk_metadata = metadata or {}

     for idx, part in enumerate(parts):
         content = part.strip()
         if content:
-            chunks.append(Chunk(content=content, doc_id=doc_id, chunk_index=idx))
+            chunks.append(Chunk(content=content, doc_id=doc_id, chunk_index=idx, metadata=chunk_metadata.copy()))

     return chunks


-def chunk_rst_sections(text: str, doc_id: str = "doc") -> list[Chunk]:
+def chunk_rst_sections(text: str, doc_id: str = "doc", metadata: dict[str, Any] | None = None) -> list[Chunk]:
     """
     Split RST document by section headers.
@@ -182,6 +199,8 @@ def chunk_rst_sections(text: str, doc_id: str = "doc") -> list[Chunk]:
         RST document text.
     doc_id : str
         Document ID for the chunks.
+    metadata : dict, optional
+        Metadata to attach to each chunk (default: None).

     Returns
     -------
@@ -190,13 +209,18 @@ def chunk_rst_sections(text: str, doc_id: str = "doc") -> list[Chunk]:
     """
     # Match RST section headers (title followed by underline of =, -, ~, etc.)
     pattern = r"\n([^\n]+)\n([=\-~`\'\"^_*+#]+)\n"
+    chunk_metadata = metadata or {}

     # Find all section positions
     matches = list(re.finditer(pattern, text))

     if not matches:
         # No sections found, return whole text as one chunk
-        return
+        return (
+            [Chunk(content=text.strip(), doc_id=doc_id, chunk_index=0, metadata=chunk_metadata.copy())]
+            if text.strip()
+            else []
+        )

     chunks = []
@@ -205,7 +229,7 @@ def chunk_rst_sections(text: str, doc_id: str = "doc") -> list[Chunk]:
     if first_pos > 0:
         pre_content = text[:first_pos].strip()
         if pre_content:
-            chunks.append(Chunk(content=pre_content, doc_id=doc_id, chunk_index=0))
+            chunks.append(Chunk(content=pre_content, doc_id=doc_id, chunk_index=0, metadata=chunk_metadata.copy()))

     # Extract each section
     for i, match in enumerate(matches):

@@ -214,6 +238,8 @@ def chunk_rst_sections(text: str, doc_id: str = "doc") -> list[Chunk]:

         section_content = text[start:end].strip()
         if section_content:
-            chunks.append(
+            chunks.append(
+                Chunk(content=section_content, doc_id=doc_id, chunk_index=len(chunks), metadata=chunk_metadata.copy())
+            )

     return chunks
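A sketch of the RST section chunker with metadata attached; the document text is illustrative:

from ragit.loaders import chunk_rst_sections

rst = """Intro paragraph before any section.

Install
=======
pip install ragit

Usage
-----
See the docs.
"""
chunks = chunk_rst_sections(rst, doc_id="guide", metadata={"filename": "guide.rst"})
# One chunk for the preamble, one per section; each carries its own copy
# of the metadata dict.
for c in chunks:
    print(c.chunk_index, c.metadata["filename"])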
ragit/version.py
CHANGED

{ragit-0.8.1.dist-info → ragit-0.8.2.dist-info}/RECORD
CHANGED

@@ -1,11 +1,11 @@
 ragit/__init__.py,sha256=JUkL7ivgr4o4nZak-96P1C-pzKdNuN3Tl0X0WvpeXBU,3142
-ragit/assistant.py,sha256=
+ragit/assistant.py,sha256=LNof1zJAQWLIfhd7aPmKCpPQDCShpt9ezeM2nQ8ouyQ,18777
 ragit/config.py,sha256=7XnueNO4h22ibeWd1akHnfVoGSD8xE5vuOCMYeQOOU4,1898
-ragit/loaders.py,sha256=
-ragit/version.py,sha256=
+ragit/loaders.py,sha256=1JXgDLorvmtaDaRpbnKEqQjbQ4O5yfZxlb4QRUdGr58,6415
+ragit/version.py,sha256=WCqbf2oV6eXhq3DvqECcVFop-dseJIExoMxZ4fCtkvs,97
 ragit/core/__init__.py,sha256=j53PFfoSMXwSbK1rRHpMbo8mX2i4R1LJ5kvTxBd7-0w,100
 ragit/core/experiment/__init__.py,sha256=4vAPOOYlY5Dcr2gOolyhBSPGIUxZKwEkgQffxS9BodA,452
-ragit/core/experiment/experiment.py,sha256=
+ragit/core/experiment/experiment.py,sha256=aANDJ-XlMB0ijT8SBsPkb2U-lM3cChOuRO3oP9u3XxA,19331
 ragit/core/experiment/results.py,sha256=KHpN3YSLJ83_JUfIMccRPS-q7LEt0S9p8ehDRawk_4k,3487
 ragit/providers/__init__.py,sha256=tKWjUV31OZprD8k9aUUidtDMg7C_dWBXN7igtxeB8Ec,1339
 ragit/providers/base.py,sha256=MJ8mVeXuGWhkX2XGTbkWIY3cVoTOPr4h5XBXw8rAX2Q,3434

@@ -13,8 +13,8 @@ ragit/providers/function_adapter.py,sha256=A-TQhBgBWbuO_w1sy795Dxep1FOCBpAlWpXCK
 ragit/providers/ollama.py,sha256=YJH5a9nQHnP0NrIK7G9PqjV5A53f9JxmEJDAJ6d297M,15410
 ragit/providers/sentence_transformers.py,sha256=tTkd4HpE1MyfFJAwur-a7w-GlBxe93HlyM_dRffDrdY,6996
 ragit/utils/__init__.py,sha256=-UsE5oJSnmEnBDswl-ph0A09Iu8yKNbPhd1-_7Lcb8Y,3051
-ragit-0.8.
-ragit-0.8.
-ragit-0.8.
-ragit-0.8.
-ragit-0.8.
+ragit-0.8.2.dist-info/licenses/LICENSE,sha256=tAkwu8-AdEyGxGoSvJ2gVmQdcicWw3j1ZZueVV74M-E,11357
+ragit-0.8.2.dist-info/METADATA,sha256=wlBpVj_aHxR7ZWy5yzpo2Wt-IoLcVlFGo4oBXGzMajY,4888
+ragit-0.8.2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+ragit-0.8.2.dist-info/top_level.txt,sha256=pkPbG7yrw61wt9_y_xcLE2vq2a55fzockASD0yq0g4s,6
+ragit-0.8.2.dist-info/RECORD,,
{ragit-0.8.1.dist-info → ragit-0.8.2.dist-info}/licenses/LICENSE
File without changes

{ragit-0.8.1.dist-info → ragit-0.8.2.dist-info}/top_level.txt
File without changes