PyPI - arabic-rag-kit - Versions diffs - 0.1.0__py3-none-any.whl - Mend

arabic-rag-kit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

arabic_rag_kit/__init__.py +38 -0
arabic_rag_kit/chunk.py +269 -0
arabic_rag_kit/loaders.py +65 -0
arabic_rag_kit/normalize.py +249 -0
arabic_rag_kit/search.py +197 -0
arabic_rag_kit-0.1.0.dist-info/METADATA +232 -0
arabic_rag_kit-0.1.0.dist-info/RECORD +9 -0
arabic_rag_kit-0.1.0.dist-info/WHEEL +4 -0
arabic_rag_kit-0.1.0.dist-info/licenses/LICENSE +21 -0

arabic_rag_kit/__init__.py ADDED Viewed

@@ -0,0 +1,38 @@
+"""arabic-rag-kit — prepare Arabic (and mixed Arabic/English) documents for RAG.
+A small, dependency-light toolkit for the unglamorous-but-critical first mile
+of an Arabic RAG or search pipeline: normalization, sentence-aware chunking,
+and a provider-agnostic vector index.
+Built by Hasan Odeh at Gulf Business Machines (GBM). MIT licensed.
+"""
+from __future__ import annotations
+from .chunk import Chunk, chunk_text, split_sentences
+from .normalize import Normalizer, NormalizerConfig, normalize
+__version__ = "0.1.0"
+__all__ = [
+    "__version__",
+    # normalize
+    "normalize",
+    "Normalizer",
+    "NormalizerConfig",
+    # chunk
+    "chunk_text",
+    "split_sentences",
+    "Chunk",
+    # search (imported lazily; see __getattr__)
+    "VectorIndex",
+]
+def __getattr__(name: str):
+    """Resolve package attributes that are deliberately not imported eagerly.
+    Only ``VectorIndex`` is handled: it is imported from the ``search``
+    submodule on first access instead of at package import time. Every other
+    name raises :class:`AttributeError`, as a normal missing attribute would.
+    """
+    if name != "VectorIndex":
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+    from .search import VectorIndex
+    return VectorIndex

arabic_rag_kit/chunk.py ADDED Viewed

@@ -0,0 +1,269 @@
+"""RAG-aware text chunking that respects Arabic sentence boundaries.
+Pure standard library — the only import is this package's own
+:mod:`arabic_rag_kit.normalize` (which is itself dependency-free).
+Two public entry points:
+* :func:`split_sentences` — split text into sentences on Arabic *and* Latin
+  punctuation, without breaking on decimals or common abbreviations.
+* :func:`chunk_text` — recursive character chunking that prefers to break on
+  sentence boundaries, then on whitespace, and finally mid-token only when a
+  single token is larger than ``chunk_size``. Returns :class:`Chunk` objects
+  carrying exact character offsets into the chunked text.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from .normalize import normalize as _normalize
+__all__ = ["Chunk", "split_sentences", "chunk_text"]
+# Sentence terminators: Arabic question mark (؟), Arabic semicolon (؛),
+# Arabic comma (،), Arabic full stop (۔) and the Latin . ! ?
+# Every character here closes a sentence in the splitter below; only the Latin
+# "." is conditional, being re-checked by _is_period_boundary first.
+_TERMINATORS = frozenset(".!?؟؛،۔")
+# Abbreviations whose trailing period should not end a sentence. Matched
+# case-insensitively against the word immediately preceding the period, which
+# is why every entry is stored in lowercase. "e.g" and "i.e" keep their inner
+# dot because the preceding-word scan includes dots and strips only the outer
+# ones.
+_ABBREVIATIONS = frozenset({
+    "dr", "mr", "mrs", "ms", "prof", "sr", "jr", "vs", "etc", "no", "st",
+    "mt", "fig", "al", "ph", "inc", "ltd", "co", "eg", "ie", "e.g", "i.e",
+})
+@dataclass(frozen=True)
+class Chunk:
+    """A single chunk of text with its position in the source.
+    Instances are immutable (the dataclass is frozen). ``chunk_text`` builds
+    them so that ``text`` is exactly the slice ``[start_char:end_char]`` of the
+    text that was chunked.
+    Attributes:
+        text: The chunk's text.
+        index: Zero-based position of this chunk in the returned list.
+        start_char: Inclusive start offset into the chunked text.
+        end_char: Exclusive end offset into the chunked text.
+    """
+    # The chunk's characters.
+    text: str
+    # Zero-based ordinal of the chunk in the list returned by chunk_text.
+    index: int
+    # Inclusive start offset into the chunked text.
+    start_char: int
+    # Exclusive end offset into the chunked text.
+    end_char: int
+# --------------------------------------------------------------------------- #
+# Sentence splitting.
+# --------------------------------------------------------------------------- #
+def _preceding_word(text: str, i: int) -> str:
+    """Return the token that ends just before index ``i``.
+    The token is the longest run of alphanumeric characters and dots directly
+    to the left of ``i``. Dots are part of the run so that dotted abbreviations
+    such as ``e.g`` stay in one piece; leading and trailing dots are then
+    stripped from the result. Returns ``""`` when nothing qualifies.
+    """
+    start = i
+    while start > 0 and (text[start - 1].isalnum() or text[start - 1] == "."):
+        start -= 1
+    return text[start:i].strip(".")
+def _is_period_boundary(text: str, i: int) -> bool:
+    """Tell whether the ``.`` at ``text[i]`` closes a sentence.
+    The period is *not* a boundary when any of these holds:
+    * it sits between two digits (a decimal such as ``3.14``);
+    * it is immediately followed by a lowercase ASCII letter (``e.g.``,
+      ``i.e.``, or any token that simply continues after the dot);
+    * the word right before it is a known abbreviation (``Dr.``, ``etc.``),
+      compared case-insensitively. What follows the period is not inspected
+      for this rule.
+    Otherwise it is treated as a sentence end.
+    """
+    before = text[i - 1] if i > 0 else ""
+    after = text[i + 1] if i + 1 < len(text) else ""
+    is_decimal_point = before.isdigit() and after.isdigit()
+    continues_token = "a" <= after <= "z"
+    if is_decimal_point or continues_token:
+        return False
+    candidate = _preceding_word(text, i)
+    return not (candidate and candidate.lower() in _ABBREVIATIONS)
+def _trim_span(text: str, start: int, end: int) -> tuple[int, int] | None:
+    """Shrink ``[start, end)`` so it neither begins nor ends with whitespace.
+    Returns the tightened ``(start, end)`` pair, or ``None`` when the span is
+    empty or contains nothing but whitespace.
+    """
+    lo, hi = start, end
+    while lo < hi and text[lo].isspace():
+        lo += 1
+    while hi > lo and text[hi - 1].isspace():
+        hi -= 1
+    if hi <= lo:
+        return None
+    return (lo, hi)
+def _iter_sentence_spans(text: str):
+    """Yield ``(start, end)`` offsets of each whitespace-trimmed sentence.
+    A sentence ends at a newline (which is excluded from the span) or at a
+    terminator character (which is included, together with any terminators
+    that directly follow it, e.g. ``؟!`` or ``...``). A ``.`` that
+    ``_is_period_boundary`` rejects is skipped. Spans that are empty after
+    trimming are not yielded; leftover text after the last break is yielded as
+    a final sentence.
+    """
+    length = len(text)
+    sentence_start = 0
+    pos = 0
+    while pos < length:
+        char = text[pos]
+        if char != "\n" and char not in _TERMINATORS:
+            pos += 1
+            continue
+        if char == "\n":
+            # The newline closes the sentence but does not belong to it.
+            cut = pos
+            resume = pos + 1
+        elif char == "." and not _is_period_boundary(text, pos):
+            # Decimal point or abbreviation: keep scanning the same sentence.
+            pos += 1
+            continue
+        else:
+            # Swallow the whole run of terminators into the sentence.
+            resume = pos + 1
+            while (
+                resume < length
+                and text[resume] != "\n"
+                and text[resume] in _TERMINATORS
+            ):
+                resume += 1
+            cut = resume
+        trimmed = _trim_span(text, sentence_start, cut)
+        if trimmed:
+            yield trimmed
+        pos = sentence_start = resume
+    tail = _trim_span(text, sentence_start, length)
+    if tail:
+        yield tail
+def split_sentences(text: str) -> list[str]:
+    """Break ``text`` into a list of trimmed sentences.
+    Breaks happen at Arabic punctuation (؟ ؛ ، ۔), at Latin ``.`` ``!`` ``?``
+    and at newlines. A period inside a decimal number (``3.14``) or after a
+    common abbreviation (``e.g.``, ``Dr.``) does not start a new sentence.
+    Empty input gives an empty list.
+    """
+    sentences: list[str] = []
+    if text:
+        for begin, stop in _iter_sentence_spans(text):
+            sentences.append(text[begin:stop])
+    return sentences
+# --------------------------------------------------------------------------- #
+# Chunking.
+# --------------------------------------------------------------------------- #
+def _iter_word_spans(text: str, start: int, end: int):
+    """Yield ``(start, end)`` offsets of each whitespace-free run in the range.
+    Only ``text[start:end]`` is scanned; a token still open when ``end`` is
+    reached is closed there.
+    """
+    token_start = None
+    for pos in range(start, end):
+        if text[pos].isspace():
+            if token_start is not None:
+                yield (token_start, pos)
+                token_start = None
+        elif token_start is None:
+            token_start = pos
+    if token_start is not None:
+        yield (token_start, end)
+def _unit_spans(text: str, chunk_size: int) -> list[tuple[int, int]]:
+    """Cut ``text`` into spans that each fit within ``chunk_size`` characters.
+    A sentence that fits is kept whole. A sentence that does not fit is
+    replaced by its whitespace-delimited words, and a word that still does not
+    fit is cut into consecutive slices of ``chunk_size`` characters (the last
+    slice may be shorter). ``chunk_size`` is expected to be positive;
+    ``chunk_text`` validates that before calling.
+    """
+    spans: list[tuple[int, int]] = []
+    for sent_start, sent_end in _iter_sentence_spans(text):
+        if sent_end - sent_start <= chunk_size:
+            spans.append((sent_start, sent_end))
+            continue
+        for word_start, word_end in _iter_word_spans(text, sent_start, sent_end):
+            if word_end - word_start <= chunk_size:
+                spans.append((word_start, word_end))
+                continue
+            # Oversized token: fall back to fixed-width slices.
+            spans.extend(
+                (offset, min(offset + chunk_size, word_end))
+                for offset in range(word_start, word_end, chunk_size)
+            )
+    return spans
+def _merge_units(
+    units: list[tuple[int, int]], chunk_size: int, chunk_overlap: int
+) -> list[tuple[int, int]]:
+    """Greedily pack consecutive units into chunk spans.
+    Args:
+        units: Ordered ``(start, end)`` spans, each expected to be no longer
+            than ``chunk_size``.
+        chunk_size: Maximum length of a chunk span (``end - start``).
+        chunk_overlap: Desired number of characters shared by consecutive
+            chunks; ``0`` disables overlap.
+    Returns:
+        Chunk spans in order. Each chunk starts and ends on a unit boundary,
+        and every chunk after the first contains at least one unit that the
+        previous chunk did not, so no chunk is a sub-span of its predecessor.
+    """
+    m = len(units)
+    spans: list[tuple[int, int]] = []
+    a = 0
+    while a < m:
+        # Extend the window while the contiguous span stays within budget.
+        b = a
+        while b + 1 < m and (units[b + 1][1] - units[a][0]) <= chunk_size:
+            b += 1
+        start, end = units[a][0], units[b][1]
+        spans.append((start, end))
+        if b == m - 1:
+            break
+        if chunk_overlap == 0:
+            a = b + 1
+            continue
+        # Prefer the first unit whose start keeps the overlap <= chunk_overlap.
+        target = end - chunk_overlap
+        next_a = next(
+            (k for k in range(a + 1, b + 1) if units[k][0] >= target),
+            None,
+        )
+        if next_a is None:
+            # No unit boundary is that close to the end: re-use the last unit
+            # of this chunk, or simply move on when the chunk has one unit.
+            next_a = b if b > a else a + 1
+        # A window opened at next_a is only useful if it can also hold unit
+        # b + 1; otherwise it would just repeat a tail of the chunk emitted
+        # above. Slide the start forward until the next unit fits.
+        while next_a <= b and units[b + 1][1] - units[next_a][0] > chunk_size:
+            next_a += 1
+        a = next_a
+    return spans
+def chunk_text(
+    text: str,
+    chunk_size: int = 1000,
+    chunk_overlap: int = 200,
+    normalize: bool = False,
+) -> list[Chunk]:
+    """Cut ``text`` into sentence-aware chunks that may overlap.
+    Args:
+        text: The text to chunk.
+        chunk_size: Upper bound on the number of characters in a chunk.
+        chunk_overlap: Approximate number of characters that consecutive
+            chunks share, for context continuity. Must be smaller than
+            ``chunk_size``.
+        normalize: When ``True`` the text is passed through
+            :func:`arabic_rag_kit.normalize` first, and all offsets refer to
+            the normalized text.
+    Returns:
+        The chunks in order; an empty list when the text is empty or only
+        whitespace. ``start_char``/``end_char`` index into the text that was
+        actually chunked (normalized or not).
+    Raises:
+        ValueError: If ``chunk_size <= 0``, ``chunk_overlap < 0``, or
+            ``chunk_overlap >= chunk_size``.
+    """
+    if chunk_size <= 0:
+        raise ValueError("chunk_size must be a positive integer")
+    if chunk_overlap < 0:
+        raise ValueError("chunk_overlap must be non-negative")
+    if chunk_overlap >= chunk_size:
+        raise ValueError("chunk_overlap must be smaller than chunk_size")
+    source = _normalize(text) if normalize else text
+    if not source or not source.strip():
+        return []
+    units = _unit_spans(source, chunk_size)
+    if not units:
+        return []
+    chunks: list[Chunk] = []
+    merged = _merge_units(units, chunk_size, chunk_overlap)
+    for position, (begin, stop) in enumerate(merged):
+        chunks.append(
+            Chunk(
+                text=source[begin:stop],
+                index=position,
+                start_char=begin,
+                end_char=stop,
+            )
+        )
+    return chunks

arabic_rag_kit/loaders.py ADDED Viewed

@@ -0,0 +1,65 @@
+"""Document loaders for common file types.
+``load_txt`` is pure standard library. ``load_pdf`` and ``load_docx`` rely on
+optional third-party packages that are imported only when the function is
+called, so importing this module never fails on a bare install.
+Install the extras with::
+    pip install "arabic-rag-kit[docs]"
+"""
+from __future__ import annotations
+from pathlib import Path
+__all__ = ["load_txt", "load_pdf", "load_docx"]
+def load_txt(path: str | Path, encoding: str = "utf-8") -> str:
+    """Return the full contents of a text file as a string.
+    Args:
+        path: Location of the file to read.
+        encoding: Encoding used to decode the file; ``utf-8`` unless given.
+    """
+    source = Path(path)
+    with source.open("r", encoding=encoding) as handle:
+        return handle.read()
+def load_pdf(path: str | Path) -> str:
+    """Return the text of a PDF, extracted page by page with ``pypdf``.
+    The text of consecutive pages is separated by a blank line; a page for
+    which extraction yields nothing contributes an empty string. ``pypdf`` is
+    imported on call and comes with the ``docs`` extra::
+        pip install "arabic-rag-kit[docs]"
+    Raises:
+        ImportError: If ``pypdf`` is not installed.
+    """
+    try:
+        from pypdf import PdfReader
+    except ImportError as exc:
+        raise ImportError(
+            "load_pdf requires pypdf. Install it with:\n"
+            '    pip install "arabic-rag-kit[docs]"'
+        ) from exc
+    page_texts: list[str] = []
+    for page in PdfReader(str(path)).pages:
+        page_texts.append(page.extract_text() or "")
+    return "\n\n".join(page_texts)
+def load_docx(path: str | Path) -> str:
+    """Return the text of a Word ``.docx`` file, read with ``python-docx``.
+    The text of each paragraph is placed on its own line. ``python-docx`` is
+    imported on call and comes with the ``docs`` extra::
+        pip install "arabic-rag-kit[docs]"
+    Raises:
+        ImportError: If ``python-docx`` is not installed.
+    """
+    try:
+        import docx  # python-docx
+    except ImportError as exc:
+        raise ImportError(
+            "load_docx requires python-docx. Install it with:\n"
+            '    pip install "arabic-rag-kit[docs]"'
+        ) from exc
+    paragraphs = docx.Document(str(path)).paragraphs
+    lines = [paragraph.text for paragraph in paragraphs]
+    return "\n".join(lines)

arabic_rag_kit/normalize.py ADDED Viewed

@@ -0,0 +1,249 @@
+"""Arabic text normalization for RAG and search pipelines.
+This module is **pure standard library** — it has no third-party dependencies.
+The public surface is:
+* :func:`normalize` — a one-shot convenience function.
+* :class:`NormalizerConfig` — a dataclass describing which operations to run.
+* :class:`Normalizer` — a reusable, pre-configured normalizer.
+Plus a set of small, composable helpers (``remove_diacritics``,
+``normalize_alef``, ``convert_digits`` …) that each do exactly one thing so you
+can build your own pipeline if the defaults do not fit.
+All operations are Unicode-aware and safe to run on mixed Arabic/English text:
+characters that are not targeted by a given operation are passed through
+untouched.
+"""
+from __future__ import annotations
+import re
+from dataclasses import dataclass
+__all__ = [
+    "NormalizerConfig",
+    "Normalizer",
+    "normalize",
+    "remove_diacritics",
+    "remove_tatweel",
+    "normalize_alef",
+    "normalize_hamza",
+    "normalize_ta_marbuta",
+    "normalize_alef_maqsura",
+    "convert_digits",
+    "strip_control_chars",
+    "collapse_whitespace",
+]
+# --------------------------------------------------------------------------- #
+# Character sets (documented with their Unicode code points for reviewers).
+# --------------------------------------------------------------------------- #
+# Tashkeel / harakat: U+064B..U+0652 (fathatan..sukun) + U+0670 superscript alef.
+_DIACRITICS = "".join(chr(c) for c in range(0x064B, 0x0653)) + "ٰ"
+_DIACRITICS_RE = re.compile("[" + re.escape(_DIACRITICS) + "]")
+# Tatweel / kashida.
+_TATWEEL = "ـ"
+# Alef variants -> plain alef (U+0627).
+#   أ U+0623, إ U+0625, آ U+0622, ٱ U+0671  ->  ا U+0627
+_ALEF_MAP = str.maketrans({
+    "أ": "ا",
+    "إ": "ا",
+    "آ": "ا",
+    "ٱ": "ا",
+})
+# Hamza carriers.
+#   ؤ U+0624 -> و U+0648,  ئ U+0626 -> ي U+064A
+_HAMZA_MAP = str.maketrans({
+    "ؤ": "و",
+    "ئ": "ي",
+})
+# Ta marbuta ة U+0629 -> ه U+0647
+_TA_MARBUTA_MAP = str.maketrans({"ة": "ه"})
+# Alef maqsura ى U+0649 -> ي U+064A
+_ALEF_MAQSURA_MAP = str.maketrans({"ى": "ي"})
+# Arabic-Indic (U+0660..U+0669) and Eastern Arabic-Indic (U+06F0..U+06F9)
+# digits -> ASCII 0..9.
+_DIGIT_MAP = str.maketrans(
+    {chr(0x0660 + i): str(i) for i in range(10)}
+    | {chr(0x06F0 + i): str(i) for i in range(10)}
+)
+# Zero-width and bidirectional control characters:
+#   U+200B..U+200F (ZWSP, ZWNJ, ZWJ, LRM, RLM),
+#   U+202A..U+202E (LRE, RLE, PDF, LRO, RLO),
+#   U+FEFF (BOM / zero-width no-break space).
+# U+FEFF is written as an escape on purpose: as an invisible literal it cannot
+# be reviewed and is easily dropped by editors or BOM-stripping tools, which
+# would silently stop it from being removed.
+_CONTROL_CHARS = (
+    "".join(chr(c) for c in range(0x200B, 0x2010))
+    + "".join(chr(c) for c in range(0x202A, 0x202F))
+    + "\ufeff"
+)
+# Character class matching any single one of the characters above.
+_CONTROL_RE = re.compile("[" + re.escape(_CONTROL_CHARS) + "]")
+_WHITESPACE_RE = re.compile(r"\s+")
+# --------------------------------------------------------------------------- #
+# Composable helpers — each does one thing and returns a new string.
+# --------------------------------------------------------------------------- #
+def remove_diacritics(text: str) -> str:
+    """Return ``text`` with Arabic tashkeel/harakat deleted.
+    The marks removed are U+064B–U+0652 and the superscript alef U+0670.
+    """
+    return re.sub(_DIACRITICS_RE, "", text)
+def remove_tatweel(text: str) -> str:
+    """Return ``text`` without tatweel/kashida elongation marks (U+0640)."""
+    pieces = text.split(_TATWEEL)
+    return "".join(pieces)
+def normalize_alef(text: str) -> str:
+    """Return ``text`` with the alef variants أ إ آ ٱ replaced by plain ا."""
+    folded = text.translate(_ALEF_MAP)
+    return folded
+def normalize_hamza(text: str) -> str:
+    """Return ``text`` with hamza carriers folded: ؤ becomes و, ئ becomes ي."""
+    folded = text.translate(_HAMZA_MAP)
+    return folded
+def normalize_ta_marbuta(text: str) -> str:
+    """Return ``text`` with every ta marbuta (ة) replaced by ha (ه)."""
+    folded = text.translate(_TA_MARBUTA_MAP)
+    return folded
+def normalize_alef_maqsura(text: str) -> str:
+    """Return ``text`` with every alef maqsura (ى) replaced by ya (ي)."""
+    folded = text.translate(_ALEF_MAQSURA_MAP)
+    return folded
+def convert_digits(text: str) -> str:
+    """Return ``text`` with Arabic-Indic and Eastern Arabic-Indic digits as ASCII.
+    Both U+0660–U+0669 and U+06F0–U+06F9 are mapped to ``0``–``9``.
+    """
+    converted = text.translate(_DIGIT_MAP)
+    return converted
+def strip_control_chars(text: str) -> str:
+    """Return ``text`` with zero-width and bidi control characters deleted."""
+    return re.sub(_CONTROL_RE, "", text)
+def collapse_whitespace(text: str) -> str:
+    """Return ``text`` with each whitespace run squeezed to one space.
+    Leading and trailing whitespace is removed as well.
+    """
+    squeezed = _WHITESPACE_RE.sub(" ", text)
+    return squeezed.strip()
+# --------------------------------------------------------------------------- #
+# Config + reusable normalizer.
+# --------------------------------------------------------------------------- #
+@dataclass
+class NormalizerConfig:
+    """Toggles for every normalization step.
+    Defaults are tuned for RAG/search recall on Modern Standard Arabic: the
+    "aggressive" folds that change meaning (hamza, ta marbuta, alef maqsura)
+    are **off** by default, while safe normalizations (diacritics, tatweel,
+    alef, digits, control chars, whitespace) are **on**.
+    Each flag enables the module-level helper of the same name.
+    """
+    # Delete tashkeel/harakat (U+064B–U+0652, U+0670).
+    remove_diacritics: bool = True
+    # Delete tatweel/kashida (U+0640).
+    remove_tatweel: bool = True
+    # Fold alef variants (أ إ آ ٱ) to plain alef (ا).
+    normalize_alef: bool = True
+    # Fold hamza carriers (ؤ → و, ئ → ي).
+    normalize_hamza: bool = False
+    # Fold ta marbuta (ة → ه).
+    normalize_ta_marbuta: bool = False
+    # Fold alef maqsura (ى → ي).
+    normalize_alef_maqsura: bool = False
+    # Convert Arabic-Indic and Eastern Arabic-Indic digits to ASCII 0–9.
+    convert_digits: bool = True
+    # Delete zero-width and bidi control characters.
+    strip_control_chars: bool = True
+    # Collapse whitespace runs to a single space and strip the ends.
+    collapse_whitespace: bool = True
+class Normalizer:
+    """Callable that applies the steps enabled in a :class:`NormalizerConfig`.
+    Build it once and reuse it for many texts::
+        norm = Normalizer(NormalizerConfig(normalize_hamza=True))
+        cleaned = norm(raw_text)
+    Normalizing a text does not modify the instance; the only state it holds
+    is its configuration.
+    """
+    def __init__(self, config: NormalizerConfig | None = None) -> None:
+        # Fall back to the default configuration when none is supplied.
+        if config:
+            self.config = config
+        else:
+            self.config = NormalizerConfig()
+    def normalize(self, text: str) -> str:
+        """Run every enabled step over ``text`` and return the result.
+        Falsy input (``""`` or ``None``) yields ``""``.
+        """
+        if not text:
+            return ""
+        settings = self.config
+        # Order matters: invisible characters go first, then the character
+        # folds, and whitespace is collapsed last so that no earlier step can
+        # leave stray runs behind.
+        pipeline = (
+            (settings.strip_control_chars, strip_control_chars),
+            (settings.remove_diacritics, remove_diacritics),
+            (settings.remove_tatweel, remove_tatweel),
+            (settings.normalize_alef, normalize_alef),
+            (settings.normalize_hamza, normalize_hamza),
+            (settings.normalize_ta_marbuta, normalize_ta_marbuta),
+            (settings.normalize_alef_maqsura, normalize_alef_maqsura),
+            (settings.convert_digits, convert_digits),
+            (settings.collapse_whitespace, collapse_whitespace),
+        )
+        for enabled, step in pipeline:
+            if enabled:
+                text = step(text)
+        return text
+    # ``normalizer(text)`` is shorthand for ``normalizer.normalize(text)``.
+    __call__ = normalize
+def normalize(
+    text: str,
+    *,
+    remove_diacritics: bool = True,
+    remove_tatweel: bool = True,
+    normalize_alef: bool = True,
+    normalize_hamza: bool = False,
+    normalize_ta_marbuta: bool = False,
+    normalize_alef_maqsura: bool = False,
+    convert_digits: bool = True,
+    strip_control_chars: bool = True,
+    collapse_whitespace: bool = True,
+) -> str:
+    """One-call normalization of Arabic (or mixed Arabic/English) text.
+    Each keyword switches one step on or off; the names and defaults mirror
+    the fields of :class:`NormalizerConfig`, which documents every step.
+    Example::
+        >>> normalize("الْأَرْقَام: ١٢٣ and English")
+        'الارقام: 123 and English'
+    """
+    settings = NormalizerConfig(
+        remove_diacritics=remove_diacritics,
+        remove_tatweel=remove_tatweel,
+        normalize_alef=normalize_alef,
+        normalize_hamza=normalize_hamza,
+        normalize_ta_marbuta=normalize_ta_marbuta,
+        normalize_alef_maqsura=normalize_alef_maqsura,
+        convert_digits=convert_digits,
+        strip_control_chars=strip_control_chars,
+        collapse_whitespace=collapse_whitespace,
+    )
+    normalizer = Normalizer(settings)
+    return normalizer(text)

arabic_rag_kit/search.py ADDED Viewed

@@ -0,0 +1,197 @@
+"""A tiny, provider-agnostic vector index for semantic search.
+This module has an **optional** dependency on ``numpy``. The heavy lifting of
+turning text into vectors is delegated to a caller-supplied ``embed_fn`` — the
+index never hardcodes an embedding provider and never needs an API key.
+Install the extra with::
+    pip install "arabic-rag-kit[search]"
+Example::
+    from arabic_rag_kit import VectorIndex
+    def embed(text):            # your embedding of choice
+        ...
+    index = VectorIndex(embed)
+    index.add(["القاهرة عاصمة مصر", "باريس عاصمة فرنسا"])
+    for hit in index.search("ما هي عاصمة مصر؟", k=1):
+        print(hit.text, hit.score)
+"""
+from __future__ import annotations
+from collections.abc import Callable, Iterable, Sequence
+from dataclasses import dataclass, field
+from typing import Any
+__all__ = ["VectorIndex", "SearchResult", "sentence_transformers_embedder"]
+EmbedFn = Callable[[str], Sequence[float]]
+# Message of the ImportError raised when numpy cannot be imported.
+_NUMPY_HINT = (
+    "VectorIndex requires numpy. Install it with:\n"
+    '    pip install "arabic-rag-kit[search]"'
+)
+def _require_numpy():
+    """Return the ``numpy`` module, importing it only when first needed.
+    Raises:
+        ImportError: With installation instructions, if numpy is unavailable.
+    """
+    try:
+        import numpy
+    except ImportError as exc:  # pragma: no cover - exercised via monkeypatch
+        raise ImportError(_NUMPY_HINT) from exc
+    else:
+        return numpy
+@dataclass
+class SearchResult:
+    """A single search hit.
+    Attributes:
+        text: The stored text that matched.
+        score: Similarity score assigned to the hit by the search.
+        metadata: Metadata stored alongside the text; empty dict by default.
+        index: Position of the text inside the index; ``-1`` by default.
+    """
+    # The stored text that matched.
+    text: str
+    # Similarity score of the hit.
+    score: float
+    # Metadata stored with the text; a fresh dict per instance by default.
+    metadata: dict[str, Any] = field(default_factory=dict)
+    # Row position of the hit in the index; -1 is the unset default.
+    index: int = -1
+class VectorIndex:
+    """An in-memory cosine-similarity index over embedded texts.
+    Args:
+        embed_fn: Callable mapping a string to a vector (``list[float]`` or a
+            numpy array). Called once per text on :meth:`add` and once per
+            query on :meth:`search`. Its output is flattened to 1-D.
+        normalize: If ``True`` (default), vectors are L2-normalized when they
+            are embedded, so cosine similarity reduces to a dot product. If
+            ``False``, vectors are stored as returned and :meth:`search`
+            divides by the norms at query time instead.
+    Raises:
+        TypeError: If ``embed_fn`` is not callable.
+        ImportError: If numpy is not installed.
+    """
+    def __init__(self, embed_fn: EmbedFn, *, normalize: bool = True) -> None:
+        if not callable(embed_fn):
+            raise TypeError("embed_fn must be callable")
+        self._np = _require_numpy()
+        self.embed_fn = embed_fn
+        self.normalize = normalize
+        self._matrix = None  # numpy array, shape (n, dim)
+        self.texts: list[str] = []
+        self.metadatas: list[dict[str, Any]] = []
+    def __len__(self) -> int:
+        """Number of texts currently stored."""
+        return len(self.texts)
+    @property
+    def dim(self) -> int | None:
+        """Embedding dimension, or ``None`` if the index is empty."""
+        if self._matrix is None:
+            return None
+        return int(self._matrix.shape[1])
+    def _vectorize(self, text: str):
+        """Embed ``text`` as a flat float32 vector, unit-length if configured."""
+        np = self._np
+        # ravel() always produces a 1-D array, so emptiness is the only shape
+        # problem left to detect.
+        vec = np.asarray(self.embed_fn(text), dtype=np.float32).ravel()
+        if vec.size == 0:
+            raise ValueError("embed_fn must return a non-empty 1-D vector")
+        if self.normalize:
+            norm = float(np.linalg.norm(vec))
+            # An all-zero vector cannot be scaled; it is kept as is.
+            if norm > 0.0:
+                vec = vec / norm
+        return vec
+    def add(
+        self,
+        texts: Iterable[str],
+        metadatas: Sequence[dict[str, Any]] | None = None,
+    ) -> None:
+        """Embed and add ``texts`` (with optional parallel ``metadatas``).
+        The index is only modified once every text has been embedded and every
+        metadata entry has been copied, so a failure leaves it unchanged.
+        Raises:
+            ValueError: If ``metadatas`` and ``texts`` differ in length, or if
+                the new embeddings do not match the dimension already stored.
+        """
+        texts = list(texts)
+        if metadatas is not None and len(metadatas) != len(texts):
+            raise ValueError("metadatas must be the same length as texts")
+        if not texts:
+            return
+        # Copy metadata before touching any state: a bad entry then fails
+        # here rather than after texts and vectors were already appended.
+        if metadatas is None:
+            new_metadatas: list[dict[str, Any]] = [{} for _ in texts]
+        else:
+            new_metadatas = [dict(m) for m in metadatas]
+        np = self._np
+        new_vecs = np.vstack([self._vectorize(t) for t in texts])
+        if self._matrix is None:
+            matrix = new_vecs
+        else:
+            if new_vecs.shape[1] != self._matrix.shape[1]:
+                raise ValueError(
+                    f"embedding dim {new_vecs.shape[1]} does not match "
+                    f"existing dim {self._matrix.shape[1]}"
+                )
+            matrix = np.vstack([self._matrix, new_vecs])
+        self._matrix = matrix
+        self.texts.extend(texts)
+        self.metadatas.extend(new_metadatas)
+    def search(self, query: str, k: int = 5) -> list[SearchResult]:
+        """Return the top-``k`` matches for ``query`` by cosine similarity.
+        Results are ordered by descending score. At most ``len(self)`` hits
+        are returned, and an empty index yields an empty list. Each hit gets
+        its own copy of the stored metadata, so editing a result does not
+        alter the index.
+        Raises:
+            ValueError: If ``k`` is not positive.
+        """
+        if k <= 0:
+            raise ValueError("k must be a positive integer")
+        if self._matrix is None or len(self.texts) == 0:
+            return []
+        np = self._np
+        q = self._vectorize(query)
+        # Cosine similarity. Stored rows are unit vectors when normalize=True;
+        # otherwise divide by their norms here so the score stays in [-1, 1].
+        scores = self._matrix @ q
+        if not self.normalize:
+            row_norms = np.linalg.norm(self._matrix, axis=1)
+            qn = float(np.linalg.norm(q))
+            denom = row_norms * (qn or 1.0)
+            # Zero-norm rows would divide by zero; their dot product is 0.
+            denom[denom == 0.0] = 1.0
+            scores = scores / denom
+        k = min(k, len(self.texts))
+        # Partial top-k, then sort just those k descending.
+        top = np.argpartition(-scores, k - 1)[:k]
+        top = top[np.argsort(-scores[top])]
+        return [
+            SearchResult(
+                text=self.texts[i],
+                score=float(scores[i]),
+                metadata=dict(self.metadatas[i]),
+                index=int(i),
+            )
+            for i in top
+        ]
+def sentence_transformers_embedder(
+    model_name: str = "paraphrase-multilingual-MiniLM-L12-v2",
+) -> EmbedFn:
+    """Build an ``embed_fn`` on top of a ``sentence-transformers`` model.
+    Both the library import and the model load happen inside this call, never
+    at module import time. The dependency comes with the extra::
+        pip install "arabic-rag-kit[embeddings]"
+    Args:
+        model_name: Name of the model to load; the default is a multilingual
+            model.
+    Returns:
+        A callable mapping ``str -> list[float]`` that can be passed to
+        :class:`VectorIndex`.
+    Raises:
+        ImportError: If ``sentence-transformers`` is not installed.
+    """
+    try:
+        from sentence_transformers import SentenceTransformer
+    except ImportError as exc:
+        raise ImportError(
+            "sentence_transformers_embedder requires sentence-transformers. "
+            'Install it with:\n    pip install "arabic-rag-kit[embeddings]"'
+        ) from exc
+    encoder = SentenceTransformer(model_name)
+    def embed(text: str) -> list[float]:
+        vector = encoder.encode(text, convert_to_numpy=True)
+        return vector.tolist()
+    return embed

arabic_rag_kit-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,232 @@
+Metadata-Version: 2.4
+Name: arabic-rag-kit
+Version: 0.1.0
+Summary: Prepare Arabic (and mixed Arabic/English) documents for RAG and search: normalization, sentence-aware chunking, and a provider-agnostic vector index.
+Project-URL: Homepage, https://github.com/GBMUAE/arabic-rag-kit
+Project-URL: Repository, https://github.com/GBMUAE/arabic-rag-kit
+Project-URL: Issues, https://github.com/GBMUAE/arabic-rag-kit/issues
+Project-URL: Changelog, https://github.com/GBMUAE/arabic-rag-kit/blob/main/CHANGELOG.md
+Author-email: Hasan Odeh <hodeh84@gmail.com>
+Maintainer-email: Hasan Odeh <hodeh84@gmail.com>
+License: MIT
+License-File: LICENSE
+Keywords: arabic,chunking,embeddings,information-retrieval,nlp,rag,text-normalization,vector-search
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Natural Language :: Arabic
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Text Processing :: Linguistic
+Classifier: Typing :: Typed
+Requires-Python: >=3.11
+Provides-Extra: all
+Requires-Dist: numpy>=1.23; extra == 'all'
+Requires-Dist: pypdf>=4.0; extra == 'all'
+Requires-Dist: python-docx>=1.1; extra == 'all'
+Requires-Dist: sentence-transformers>=2.2; extra == 'all'
+Provides-Extra: dev
+Requires-Dist: build>=1.2; extra == 'dev'
+Requires-Dist: numpy>=1.23; extra == 'dev'
+Requires-Dist: pytest>=8.0; extra == 'dev'
+Requires-Dist: ruff>=0.5; extra == 'dev'
+Provides-Extra: docs
+Requires-Dist: pypdf>=4.0; extra == 'docs'
+Requires-Dist: python-docx>=1.1; extra == 'docs'
+Provides-Extra: embeddings
+Requires-Dist: sentence-transformers>=2.2; extra == 'embeddings'
+Provides-Extra: search
+Requires-Dist: numpy>=1.23; extra == 'search'
+Description-Content-Type: text/markdown
+# arabic-rag-kit
+**The missing first mile for Arabic RAG:** normalize, chunk, and index Arabic
+(and mixed Arabic/English) documents — with a dependency-free core.
+[![PyPI version](https://img.shields.io/pypi/v/arabic-rag-kit.svg)](https://pypi.org/project/arabic-rag-kit/)
+[![Python versions](https://img.shields.io/pypi/pyversions/arabic-rag-kit.svg)](https://pypi.org/project/arabic-rag-kit/)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+[![CI](https://github.com/GBMUAE/arabic-rag-kit/actions/workflows/ci.yml/badge.svg)](https://github.com/GBMUAE/arabic-rag-kit/actions/workflows/ci.yml)
+---
+## Why this exists
+Most RAG and search tooling is built and tested against English. Arabic brings
+problems those tools quietly get wrong:
+- **Diacritics (tashkeel), tatweel, and letter variants** (`أ`/`إ`/`آ` vs `ا`)
+  fragment what should be the same token, tanking retrieval recall.
+- **Invisible characters** — zero-width joiners and bidirectional control marks —
+  sneak into copied text and corrupt indexes and embeddings.
+- **Arabic-Indic digits** (`٠١٢٣`) and **Arabic punctuation** (`؟ ؛ ،`) are
+  invisible to English-centric normalizers and sentence splitters, so chunks
+  break in the wrong places.
+`arabic-rag-kit` handles these correctly, with a **zero-dependency core** so you
+can drop it into any pipeline. Embeddings and file loaders are opt-in extras —
+the library never forces a vendor or an API key on you.
+## Install
+```bash
+# Core: normalization + chunking. Zero third-party dependencies.
+pip install arabic-rag-kit
+# Add the numpy-backed vector index:
+pip install "arabic-rag-kit[search]"
+# Add the sentence-transformers embedder helper:
+pip install "arabic-rag-kit[embeddings]"
+# Add PDF/DOCX loaders:
+pip install "arabic-rag-kit[docs]"
+# Everything:
+pip install "arabic-rag-kit[all]"
+```
+Requires Python **3.11+**.
+## Quickstart
+### 1. Normalize
+```python
+from arabic_rag_kit import normalize
+raw = "الْعَرَبِيَّةُ لُغَةٌ جَمِيلَة… كتـــاب رقم ١٢٣"
+print(normalize(raw))
+# -> "العربية لغة جميلة… كتاب رقم 123"
+```
+Every step is toggleable. Meaning-changing folds (hamza, ta-marbuta, alef
+maqsura) are **off by default** so you don't distort the text unless you ask:
+```python
+normalize("مؤسسة على مدرسة", normalize_hamza=True,
+          normalize_ta_marbuta=True, normalize_alef_maqsura=True)
+# -> "موسسه علي مدرسه"
+```
+Reuse a configured instance:
+```python
+from arabic_rag_kit import Normalizer, NormalizerConfig
+norm = Normalizer(NormalizerConfig(normalize_hamza=True))
+norm("شيء مؤكد")   # -> "شيء موكد"
+```
+### 2. Chunk (sentence-aware)
+```python
+from arabic_rag_kit import chunk_text
+text = (
+    "الذكاء الاصطناعي يغير طريقة عملنا. "
+    "أنظمة استرجاع المعلومات تعتمد على تقطيع جيد للنص. "
+    "كيف نضمن جودة التقطيع؟ عبر احترام حدود الجمل العربية."
+)
+chunks = chunk_text(text, chunk_size=80, chunk_overlap=20)
+for c in chunks:
+    print(f"[{c.index}] ({c.start_char}:{c.end_char}) {c.text}")
+```
+Chunks never exceed `chunk_size`, prefer to break on Arabic/Latin sentence
+boundaries, and carry exact character offsets back into the source. `؟ ؛ ،` and
+the Arabic full stop are all recognized; decimals (`3.14`) and abbreviations
+(`Dr.`, `e.g.`) don't cause false breaks. Pass `normalize=True` to normalize
+before chunking in one step.
+### 3. Index & search (optional `[search]` extra)
+`VectorIndex` never hardcodes an embedding provider — you hand it any
+`embed_fn` (text → vector). Bring your own model, or use the built-in
+sentence-transformers helper (which additionally needs the `[embeddings]` extra):
+```python
+from arabic_rag_kit import VectorIndex, chunk_text
+from arabic_rag_kit.search import sentence_transformers_embedder
+embed = sentence_transformers_embedder()   # multilingual, handles Arabic
+index = VectorIndex(embed)
+docs = [c.text for c in chunks]
+index.add(docs, metadatas=[{"chunk": c.index} for c in chunks])
+for hit in index.search("ما أهمية تقطيع النص؟", k=3):
+    print(round(hit.score, 3), hit.metadata, hit.text)
+```
+Any callable that maps text to a vector works — wrap a hosted API or a local model, or use a stub in tests so no model download is required:
+```python
+def my_embed(text: str) -> list[float]:
+    ...  # call OpenAI, Cohere, a local model, whatever
+index = VectorIndex(my_embed)
+```
+### 4. Load documents (optional `[docs]` extra)
+```python
+from arabic_rag_kit.loaders import load_txt, load_pdf, load_docx
+text = load_pdf("report_ar.pdf")     # needs [docs]
+text = load_docx("memo_ar.docx")     # needs [docs]
+text = load_txt("notes_ar.txt")      # stdlib, always available
+```
+## API overview
+| Symbol | Import | Extra | What it does |
+| --- | --- | --- | --- |
+| `normalize(text, **opts)` | `arabic_rag_kit` | — | One-shot Arabic normalization |
+| `Normalizer` / `NormalizerConfig` | `arabic_rag_kit` | — | Reusable, configured normalizer |
+| `split_sentences(text)` | `arabic_rag_kit` | — | Arabic/Latin sentence splitting |
+| `chunk_text(text, chunk_size, chunk_overlap, normalize)` | `arabic_rag_kit` | — | Sentence-aware chunking |
+| `Chunk` | `arabic_rag_kit` | — | `text, index, start_char, end_char` |
+| `VectorIndex` | `arabic_rag_kit` | `[search]` | Cosine-similarity vector index |
+| `sentence_transformers_embedder(model_name)` | `arabic_rag_kit.search` | `[embeddings]` | Ready-made `embed_fn` |
+| `load_txt` / `load_pdf` / `load_docx` | `arabic_rag_kit.loaders` | `[docs]`\* | File loaders (\*txt is stdlib) |
+### Normalization options (defaults)
+| Option | Default | Effect |
+| --- | --- | --- |
+| `remove_diacritics` | `True` | Strip tashkeel/harakat (U+064B–U+0652, U+0670) |
+| `remove_tatweel` | `True` | Remove kashida elongation (U+0640) |
+| `normalize_alef` | `True` | `أ إ آ ٱ` → `ا` |
+| `normalize_hamza` | `False` | `ؤ` → `و`, `ئ` → `ي` |
+| `normalize_ta_marbuta` | `False` | `ة` → `ه` |
+| `normalize_alef_maqsura` | `False` | `ى` → `ي` |
+| `convert_digits` | `True` | `٠–٩` and `۰–۹` → `0–9` |
+| `strip_control_chars` | `True` | Remove zero-width & bidi controls |
+| `collapse_whitespace` | `True` | Collapse runs of whitespace and trim |
+## Development
+```bash
+pip install -e ".[dev]"
+ruff check .
+pytest
+```
+See [CONTRIBUTING.md](CONTRIBUTING.md).
+## Built by GBM
+Created and maintained by **Hasan Odeh** at **Gulf Business Machines (GBM)**.
+Born out of real Arabic RAG work, and open-sourced because Arabic NLP deserves
+better tooling. Contributions welcome.
+## License
+[MIT](LICENSE) © Gulf Business Machines (GBM)

arabic_rag_kit-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+arabic_rag_kit/__init__.py,sha256=VeESqL7BUzxG07KJqKs4Zge_Xp3wor3lNKuLwChFzUc,1056
+arabic_rag_kit/chunk.py,sha256=lkMG8M3R6Fg-ioZZsdTTiIicY4Sfup4gbgLmv8Jc4H8,9070
+arabic_rag_kit/loaders.py,sha256=ll7pj5OwBRlBupeJ_OR2IOMc0YPgMJtBk0T2cFK8i38,1895
+arabic_rag_kit/normalize.py,sha256=ZhTf95-NanMInwWMQs8wM40WwvHzpfS_QQ__GiVvr4o,8040
+arabic_rag_kit/search.py,sha256=NOzDu4NYx-1cqykgIkc_1Ww06jzdP1ZF3ZC9zXip9cg,6648
+arabic_rag_kit-0.1.0.dist-info/METADATA,sha256=0ihN2JVMOgIK7dulqh2IMdnJiiy269kB13hg2o7w1QM,8796
+arabic_rag_kit-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+arabic_rag_kit-0.1.0.dist-info/licenses/LICENSE,sha256=Lib4WVWsPoK3nOejd5d0cXvLXOj0Mc8rvwBc0UTtMLw,1085
+arabic_rag_kit-0.1.0.dist-info/RECORD,,

arabic_rag_kit-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any

arabic_rag_kit-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Gulf Business Machines (GBM)
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.