arabic-rag-kit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
1
+ """arabic-rag-kit — prepare Arabic (and mixed Arabic/English) documents for RAG.
2
+
3
+ A small, dependency-light toolkit for the unglamorous-but-critical first mile
4
+ of an Arabic RAG or search pipeline: normalization, sentence-aware chunking,
5
+ and a provider-agnostic vector index.
6
+
7
+ Built by Hasan Odeh at Gulf Business Machines (GBM). MIT licensed.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from .chunk import Chunk, chunk_text, split_sentences
13
+ from .normalize import Normalizer, NormalizerConfig, normalize
14
+
15
+ __version__ = "0.1.0"
16
+
17
+ __all__ = [
18
+ "__version__",
19
+ # normalize
20
+ "normalize",
21
+ "Normalizer",
22
+ "NormalizerConfig",
23
+ # chunk
24
+ "chunk_text",
25
+ "split_sentences",
26
+ "Chunk",
27
+ # search (imported lazily; see __getattr__)
28
+ "VectorIndex",
29
+ ]
30
+
31
+
32
def __getattr__(name: str):
    """Resolve ``VectorIndex`` on first access (module-level ``__getattr__``).

    The import of ``.search`` is deferred to the first attribute access so
    that importing the package does not import that module. Any other
    missing attribute raises the usual ``AttributeError``.
    """
    if name != "VectorIndex":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    from .search import VectorIndex

    return VectorIndex
@@ -0,0 +1,269 @@
1
+ """RAG-aware text chunking that respects Arabic sentence boundaries.
2
+
3
+ Pure standard library — the only import is this package's own
4
+ :mod:`arabic_rag_kit.normalize` (which is itself dependency-free).
5
+
6
+ Two public entry points:
7
+
8
+ * :func:`split_sentences` — split text into sentences on Arabic *and* Latin
9
+ punctuation, without breaking on decimals or common abbreviations.
10
+ * :func:`chunk_text` — recursive character chunking that prefers to break on
11
+ sentence boundaries, then on whitespace, and finally mid-token only when a
12
+ single token is larger than ``chunk_size``. Returns :class:`Chunk` objects
13
+ carrying exact character offsets into the chunked text.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from dataclasses import dataclass
19
+
20
+ from .normalize import normalize as _normalize
21
+
22
+ __all__ = ["Chunk", "split_sentences", "chunk_text"]
23
+
24
+
25
# Characters that close a segment when splitting. Besides the sentence enders
# (Latin . ! ?, Arabic question mark ؟, Arabic full stop ۔) this set also
# contains the Arabic semicolon (؛) and the Arabic comma (،), so Arabic text
# is split at those marks as well.
_TERMINATORS = frozenset(".!?؟؛،۔")

# Abbreviations whose trailing period should not end a sentence. Matched
# case-insensitively against the word immediately preceding the period.
# Dotted entries ("e.g", "i.e") are stored without their final period because
# the preceding-word lookup strips leading/trailing dots before matching.
_ABBREVIATIONS = frozenset({
    "dr", "mr", "mrs", "ms", "prof", "sr", "jr", "vs", "etc", "no", "st",
    "mt", "fig", "al", "ph", "inc", "ltd", "co", "eg", "ie", "e.g", "i.e",
})
35
+
36
+
37
@dataclass(frozen=True)
class Chunk:
    """A single chunk of text with its position in the source.

    Instances are immutable (``frozen=True``). For chunks produced by
    :func:`chunk_text`, ``text`` is exactly the slice
    ``[start_char:end_char]`` of the text that was chunked.

    Attributes:
        text: The chunk's text.
        index: Zero-based position of this chunk in the returned list.
        start_char: Inclusive start offset into the chunked text.
        end_char: Exclusive end offset into the chunked text.
    """

    # The chunk's text.
    text: str
    # Zero-based position of the chunk in the list it was returned in.
    index: int
    # Inclusive start offset into the chunked text.
    start_char: int
    # Exclusive end offset into the chunked text.
    end_char: int
52
+
53
+
54
+ # --------------------------------------------------------------------------- #
55
+ # Sentence splitting.
56
+ # --------------------------------------------------------------------------- #
57
+
58
def _preceding_word(text: str, i: int) -> str:
    """Return the word that ends immediately before ``text[i]``.

    The word is the longest run of alphanumeric characters and dots ending at
    index ``i - 1``; dots are kept while scanning so dotted abbreviations such
    as ``e.g`` stay in one piece. Leading and trailing dots are stripped from
    the result, which is empty when no such run exists.
    """
    cursor = i - 1
    # Walk left until a character that is neither a dot nor alphanumeric.
    while cursor >= 0 and (text[cursor] == "." or text[cursor].isalnum()):
        cursor -= 1
    return text[cursor + 1 : i].strip(".")
67
+
68
+
69
def _is_period_boundary(text: str, i: int) -> bool:
    """Tell whether the ``.`` at ``text[i]`` terminates a sentence.

    The period is *not* a boundary when it is:

    * between two digits (a decimal such as ``3.14``);
    * directly followed by a lowercase ASCII letter (inline initialism such
      as the first dot of ``e.g.``);
    * preceded by a word listed in ``_ABBREVIATIONS`` (case-insensitive).
    """
    before = text[i - 1] if i > 0 else ""
    after = text[i + 1] if i + 1 < len(text) else ""

    if before.isdigit() and after.isdigit():
        return False
    if "a" <= after <= "z":
        return False

    candidate = _preceding_word(text, i)
    return not (candidate and candidate.lower() in _ABBREVIATIONS)
87
+
88
+
89
def _trim_span(text: str, start: int, end: int) -> tuple[int, int] | None:
    """Shrink ``[start, end)`` so it neither starts nor ends on whitespace.

    Returns the tightened ``(start, end)`` pair, or ``None`` when the span is
    empty or contains only whitespace.
    """
    lo, hi = start, end
    while lo < hi and text[lo].isspace():
        lo += 1
    while hi > lo and text[hi - 1].isspace():
        hi -= 1
    if hi <= lo:
        return None
    return (lo, hi)
96
+
97
+
98
def _iter_sentence_spans(text: str):
    """Yield ``(start, end)`` spans of trimmed sentences within ``text``.

    A single left-to-right scan. A segment is closed by a newline or by a
    character in ``_TERMINATORS`` (a ``.`` only when ``_is_period_boundary``
    accepts it). Each closed segment is whitespace-trimmed with
    ``_trim_span``; segments that trim to nothing are skipped. Terminators are
    kept inside the span they close; newlines are not.
    """
    n = len(text)
    seg_start = 0  # start of the segment currently being accumulated
    i = 0
    while i < n:
        ch = text[i]
        if ch == "\n":
            # A newline always closes the segment and is itself excluded.
            span = _trim_span(text, seg_start, i)
            if span:
                yield span
            i += 1
            seg_start = i
            continue
        if ch in _TERMINATORS:
            if ch == "." and not _is_period_boundary(text, i):
                # Decimal point or abbreviation: keep scanning the same segment.
                i += 1
                continue
            # Absorb any run of trailing terminators, e.g. "؟!" or "...".
            j = i + 1
            while j < n and text[j] != "\n" and text[j] in _TERMINATORS:
                j += 1
            # The span includes the terminator run it ends with.
            span = _trim_span(text, seg_start, j)
            if span:
                yield span
            i = j
            seg_start = j
            continue
        i += 1
    # Whatever remains after the last boundary forms the final segment.
    span = _trim_span(text, seg_start, n)
    if span:
        yield span
130
+
131
+
132
def split_sentences(text: str) -> list[str]:
    """Return the sentences found in ``text`` as trimmed strings.

    Boundaries are Arabic punctuation (؟ ؛ ، ۔), Latin ``.`` ``!`` ``?`` and
    newlines. A period inside a decimal number (``3.14``) or after a common
    abbreviation (``e.g.``, ``Dr.``) does not start a new sentence.

    Empty input yields an empty list.
    """
    sentences: list[str] = []
    if text:
        for begin, finish in _iter_sentence_spans(text):
            sentences.append(text[begin:finish])
    return sentences
144
+
145
+
146
+ # --------------------------------------------------------------------------- #
147
+ # Chunking.
148
+ # --------------------------------------------------------------------------- #
149
+
150
def _iter_word_spans(text: str, start: int, end: int):
    """Yield ``(start, end)`` for each whitespace-delimited token in ``[start, end)``.

    Tokens are maximal runs of non-whitespace characters; a token touching
    the window edge is clipped to the window.
    """
    pos = start
    while pos < end:
        if not text[pos].isspace():
            # Found the first character of a token; scan to its end.
            stop = pos + 1
            while stop < end and not text[stop].isspace():
                stop += 1
            yield (pos, stop)
            pos = stop
        else:
            pos += 1
162
+
163
+
164
def _unit_spans(text: str, chunk_size: int) -> list[tuple[int, int]]:
    """Split ``text`` into the coarsest units that each fit in ``chunk_size``.

    A sentence that fits is one unit. An oversized sentence is replaced by its
    whitespace-delimited words, and an oversized word by consecutive slices of
    ``chunk_size`` characters. Consequently every returned span satisfies
    ``end - start <= chunk_size``.
    """
    spans: list[tuple[int, int]] = []
    for sent_start, sent_end in _iter_sentence_spans(text):
        if sent_end - sent_start > chunk_size:
            # Sentence too long: fall back to word granularity.
            for tok_start, tok_end in _iter_word_spans(text, sent_start, sent_end):
                if tok_end - tok_start > chunk_size:
                    # Word too long: cut it into fixed-size slices.
                    cursor = tok_start
                    while cursor < tok_end:
                        spans.append((cursor, min(cursor + chunk_size, tok_end)))
                        cursor += chunk_size
                else:
                    spans.append((tok_start, tok_end))
        else:
            spans.append((sent_start, sent_end))
    return spans
185
+
186
+
187
def _merge_units(
    units: list[tuple[int, int]], chunk_size: int, chunk_overlap: int
) -> list[tuple[int, int]]:
    """Greedily pack units into chunk spans, honoring size and overlap.

    Args:
        units: Ordered ``(start, end)`` spans, each no longer than
            ``chunk_size``.
        chunk_size: Maximum length of a packed span.
        chunk_overlap: Desired number of characters shared by consecutive
            spans; ``0`` disables overlap.

    Returns:
        The packed ``(start, end)`` spans in order. Every span ends strictly
        after the previous one, so no span is a mere suffix of its
        predecessor.

    The next chunk starts at the first unit boundary whose overlap with the
    current chunk is at most ``chunk_overlap``. If no boundary qualifies, the
    last unit of the current chunk is reused, so the overlap is one whole unit.
    The start is then advanced as far as needed for the next chunk to also
    hold the first not-yet-covered unit. Without that step the next chunk
    could contain only text the current chunk already covers.
    """
    m = len(units)
    spans: list[tuple[int, int]] = []
    a = 0
    while a < m:
        # Extend the window while the contiguous span stays within budget.
        b = a
        while b + 1 < m and (units[b + 1][1] - units[a][0]) <= chunk_size:
            b += 1
        start, end = units[a][0], units[b][1]
        spans.append((start, end))

        if b == m - 1:
            break

        # Default: continue right after this chunk (no overlap). This also
        # guarantees forward progress.
        next_a = b + 1
        if chunk_overlap > 0 and b > a:
            target = end - chunk_overlap
            # Preferred start: first unit whose overlap fits the budget;
            # otherwise reuse the last unit so some context is still shared.
            first = next(
                (k for k in range(a + 1, b + 1) if units[k][0] >= target), b
            )
            # Only accept a start from which unit b + 1 still fits, so the
            # next chunk contributes new text.
            upcoming_end = units[b + 1][1]
            for k in range(first, b + 1):
                if upcoming_end - units[k][0] <= chunk_size:
                    next_a = k
                    break
        a = next_a
    return spans
222
+
223
+
224
def chunk_text(
    text: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    normalize: bool = False,
) -> list[Chunk]:
    """Cut ``text`` into overlapping chunks that follow sentence boundaries.

    Args:
        text: The text to chunk.
        chunk_size: Upper bound, in characters, on the length of any chunk.
        chunk_overlap: Approximate number of characters that consecutive
            chunks share for context continuity. Must be smaller than
            ``chunk_size``.
        normalize: When ``True`` the text is first passed through
            :func:`arabic_rag_kit.normalize`, and all offsets refer to the
            normalized text.

    Returns:
        A list of :class:`Chunk` objects whose offsets index into the
        (possibly normalized) text. Empty or whitespace-only input gives an
        empty list.

    Raises:
        ValueError: If ``chunk_size <= 0``, ``chunk_overlap < 0``, or
            ``chunk_overlap >= chunk_size``.
    """
    # Validate arguments before touching the text.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if chunk_overlap < 0:
        raise ValueError("chunk_overlap must be non-negative")
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    source = _normalize(text) if normalize else text
    if not source or not source.strip():
        return []

    units = _unit_spans(source, chunk_size)
    if not units:
        return []

    chunks: list[Chunk] = []
    merged = _merge_units(units, chunk_size, chunk_overlap)
    for position, (begin, finish) in enumerate(merged):
        chunks.append(
            Chunk(
                text=source[begin:finish],
                index=position,
                start_char=begin,
                end_char=finish,
            )
        )
    return chunks
@@ -0,0 +1,65 @@
1
+ """Document loaders for common file types.
2
+
3
+ ``load_txt`` is pure standard library. ``load_pdf`` and ``load_docx`` rely on
4
+ optional third-party packages that are imported only when the function is
5
+ called, so importing this module never fails on a bare install.
6
+
7
+ Install the extras with::
8
+
9
+ pip install "arabic-rag-kit[docs]"
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from pathlib import Path
15
+
16
+ __all__ = ["load_txt", "load_pdf", "load_docx"]
17
+
18
+
19
def load_txt(path: str | Path, encoding: str = "utf-8") -> str:
    """Return the full contents of a text file as a string.

    Args:
        path: Location of the file to read.
        encoding: Encoding used to decode the file; ``utf-8`` by default.
    """
    with Path(path).open("r", encoding=encoding) as handle:
        return handle.read()
27
+
28
+
29
def load_pdf(path: str | Path) -> str:
    """Return the text of every page of a PDF, read with ``pypdf``.

    Page texts are separated by a blank line; a page for which ``pypdf``
    returns no text contributes an empty string. ``pypdf`` is imported on
    call and comes with the ``docs`` extra::

        pip install "arabic-rag-kit[docs]"

    Raises:
        ImportError: If ``pypdf`` is not installed.
    """
    try:
        from pypdf import PdfReader
    except ImportError as exc:
        raise ImportError(
            "load_pdf requires pypdf. Install it with:\n"
            ' pip install "arabic-rag-kit[docs]"'
        ) from exc

    page_texts: list[str] = []
    for page in PdfReader(str(path)).pages:
        page_texts.append(page.extract_text() or "")
    return "\n\n".join(page_texts)
47
+
48
+
49
def load_docx(path: str | Path) -> str:
    """Return the paragraph text of a ``.docx`` file, read with ``python-docx``.

    The text of each entry in the document's ``paragraphs`` is joined with a
    single newline. ``python-docx`` is imported on call and comes with the
    ``docs`` extra::

        pip install "arabic-rag-kit[docs]"

    Raises:
        ImportError: If ``python-docx`` is not installed.
    """
    try:
        import docx  # python-docx
    except ImportError as exc:
        raise ImportError(
            "load_docx requires python-docx. Install it with:\n"
            ' pip install "arabic-rag-kit[docs]"'
        ) from exc

    paragraphs = docx.Document(str(path)).paragraphs
    return "\n".join([paragraph.text for paragraph in paragraphs])
@@ -0,0 +1,249 @@
1
+ """Arabic text normalization for RAG and search pipelines.
2
+
3
+ This module is **pure standard library** — it has no third-party dependencies.
4
+
5
+ The public surface is:
6
+
7
+ * :func:`normalize` — a one-shot convenience function.
8
+ * :class:`NormalizerConfig` — a dataclass describing which operations to run.
9
+ * :class:`Normalizer` — a reusable, pre-configured normalizer.
10
+
11
+ Plus a set of small, composable helpers (``remove_diacritics``,
12
+ ``normalize_alef``, ``convert_digits`` …) that each do exactly one thing so you
13
+ can build your own pipeline if the defaults do not fit.
14
+
15
+ All operations are Unicode-aware and safe to run on mixed Arabic/English text:
16
+ characters that are not targeted by a given operation are passed through
17
+ untouched.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import re
23
+ from dataclasses import dataclass
24
+
25
+ __all__ = [
26
+ "NormalizerConfig",
27
+ "Normalizer",
28
+ "normalize",
29
+ "remove_diacritics",
30
+ "remove_tatweel",
31
+ "normalize_alef",
32
+ "normalize_hamza",
33
+ "normalize_ta_marbuta",
34
+ "normalize_alef_maqsura",
35
+ "convert_digits",
36
+ "strip_control_chars",
37
+ "collapse_whitespace",
38
+ ]
39
+
40
+ # --------------------------------------------------------------------------- #
41
+ # Character sets (documented with their Unicode code points for reviewers).
42
+ # --------------------------------------------------------------------------- #
43
+
44
# NOTE: every table below spells its characters as ``\uXXXX`` escapes. Several
# of them are combining or invisible (U+0670, U+FEFF, the bidi controls) and
# are easily dropped or altered by editors and diff viewers when written raw.

# Tashkeel / harakat: U+064B..U+0652 (fathatan..sukun) + U+0670 superscript alef.
_DIACRITICS = "".join(chr(c) for c in range(0x064B, 0x0653)) + "\u0670"
_DIACRITICS_RE = re.compile("[" + re.escape(_DIACRITICS) + "]")

# Tatweel / kashida (U+0640).
_TATWEEL = "\u0640"

# Alef variants -> plain alef (U+0627).
_ALEF_MAP = str.maketrans({
    "\u0623": "\u0627",  # أ alef with hamza above
    "\u0625": "\u0627",  # إ alef with hamza below
    "\u0622": "\u0627",  # آ alef with madda
    "\u0671": "\u0627",  # ٱ alef wasla
})

# Hamza carriers.
_HAMZA_MAP = str.maketrans({
    "\u0624": "\u0648",  # ؤ -> و
    "\u0626": "\u064A",  # ئ -> ي
})

# Ta marbuta ة U+0629 -> ه U+0647
_TA_MARBUTA_MAP = str.maketrans({"\u0629": "\u0647"})

# Alef maqsura ى U+0649 -> ي U+064A
_ALEF_MAQSURA_MAP = str.maketrans({"\u0649": "\u064A"})

# Arabic-Indic (U+0660..U+0669) and Eastern Arabic-Indic (U+06F0..U+06F9)
# digits -> ASCII 0..9.
_DIGIT_MAP = str.maketrans(
    {chr(0x0660 + i): str(i) for i in range(10)}
    | {chr(0x06F0 + i): str(i) for i in range(10)}
)

# Zero-width and bidirectional control characters:
#   U+200B..U+200F (ZWSP, ZWNJ, ZWJ, LRM, RLM),
#   U+202A..U+202E (LRE, RLE, PDF, LRO, RLO),
#   U+FEFF         (BOM / zero-width no-break space).
# NOTE(review): the bidi isolates U+2066..U+2069 and the Arabic letter mark
# U+061C are not covered; confirm whether they should be stripped too.
_CONTROL_CHARS = (
    "".join(chr(c) for c in range(0x200B, 0x2010))
    + "".join(chr(c) for c in range(0x202A, 0x202F))
    + "\ufeff"
)
_CONTROL_RE = re.compile("[" + re.escape(_CONTROL_CHARS) + "]")

_WHITESPACE_RE = re.compile(r"\s+")


# --------------------------------------------------------------------------- #
# Composable helpers — each does one thing and returns a new string.
# --------------------------------------------------------------------------- #

def remove_diacritics(text: str) -> str:
    """Remove Arabic tashkeel/harakat (U+064B–U+0652 and U+0670)."""
    return _DIACRITICS_RE.sub("", text)


def remove_tatweel(text: str) -> str:
    """Remove tatweel/kashida elongation characters (U+0640)."""
    return text.replace(_TATWEEL, "")


def normalize_alef(text: str) -> str:
    """Fold alef variants (U+0623, U+0625, U+0622, U+0671) to plain alef (U+0627)."""
    return text.translate(_ALEF_MAP)


def normalize_hamza(text: str) -> str:
    """Fold hamza carriers: U+0624 becomes waw (U+0648), U+0626 becomes ya (U+064A)."""
    return text.translate(_HAMZA_MAP)


def normalize_ta_marbuta(text: str) -> str:
    """Fold ta marbuta (U+0629) to ha (U+0647)."""
    return text.translate(_TA_MARBUTA_MAP)


def normalize_alef_maqsura(text: str) -> str:
    """Fold alef maqsura (U+0649) to ya (U+064A)."""
    return text.translate(_ALEF_MAQSURA_MAP)


def convert_digits(text: str) -> str:
    """Convert Arabic-Indic and Eastern Arabic-Indic digits to ASCII 0–9."""
    return text.translate(_DIGIT_MAP)


def strip_control_chars(text: str) -> str:
    """Remove zero-width and bidi control characters, including the BOM (U+FEFF)."""
    return _CONTROL_RE.sub("", text)


def collapse_whitespace(text: str) -> str:
    """Collapse any run of whitespace to a single space and strip the ends."""
    return _WHITESPACE_RE.sub(" ", text).strip()
141
+
142
+
143
+ # --------------------------------------------------------------------------- #
144
+ # Config + reusable normalizer.
145
+ # --------------------------------------------------------------------------- #
146
+
147
@dataclass
class NormalizerConfig:
    """Toggles for every normalization step.

    Defaults are tuned for RAG/search recall on Modern Standard Arabic: the
    "aggressive" folds that change meaning (hamza, ta marbuta, alef maqsura)
    are **off** by default, while safe normalizations (diacritics, tatweel,
    alef, digits, control chars, whitespace) are **on**.

    Each field enables the module-level helper of the same name.
    """

    # Strip tashkeel/harakat (U+064B–U+0652 and U+0670).
    remove_diacritics: bool = True
    # Strip tatweel/kashida elongation characters (U+0640).
    remove_tatweel: bool = True
    # Fold alef variants (أ إ آ ٱ) to plain alef (ا).
    normalize_alef: bool = True
    # Fold hamza carriers (ؤ → و, ئ → ي). Off by default.
    normalize_hamza: bool = False
    # Fold ta marbuta (ة → ه). Off by default.
    normalize_ta_marbuta: bool = False
    # Fold alef maqsura (ى → ي). Off by default.
    normalize_alef_maqsura: bool = False
    # Convert Arabic-Indic and Eastern Arabic-Indic digits to ASCII 0–9.
    convert_digits: bool = True
    # Remove zero-width and bidi control characters.
    strip_control_chars: bool = True
    # Collapse whitespace runs to one space and strip both ends.
    collapse_whitespace: bool = True
166
+
167
+
168
class Normalizer:
    """Callable text normalizer driven by a :class:`NormalizerConfig`.

    Build it once and reuse it::

        norm = Normalizer(NormalizerConfig(normalize_hamza=True))
        cleaned = norm(raw_text)

    ``normalize`` only reads ``self.config`` and its argument; it keeps no
    per-call state on the instance.
    """

    def __init__(self, config: NormalizerConfig | None = None) -> None:
        self.config = config or NormalizerConfig()

    def normalize(self, text: str) -> str:
        """Run every enabled step over ``text`` and return the result.

        Falsy input (``""`` or ``None``) returns ``""``.
        """
        if not text:
            return ""

        # Order matters: invisible characters go first, then the character
        # folds, and whitespace collapsing runs last so that no earlier step
        # can leave stray runs of spaces behind.
        pipeline = (
            ("strip_control_chars", strip_control_chars),
            ("remove_diacritics", remove_diacritics),
            ("remove_tatweel", remove_tatweel),
            ("normalize_alef", normalize_alef),
            ("normalize_hamza", normalize_hamza),
            ("normalize_ta_marbuta", normalize_ta_marbuta),
            ("normalize_alef_maqsura", normalize_alef_maqsura),
            ("convert_digits", convert_digits),
            ("collapse_whitespace", collapse_whitespace),
        )
        settings = self.config
        for flag, step in pipeline:
            if getattr(settings, flag):
                text = step(text)
        return text

    # Allow ``normalizer(text)`` as a shorthand for ``normalizer.normalize``.
    __call__ = normalize
213
+
214
+
215
def normalize(
    text: str,
    *,
    remove_diacritics: bool = True,
    remove_tatweel: bool = True,
    normalize_alef: bool = True,
    normalize_hamza: bool = False,
    normalize_ta_marbuta: bool = False,
    normalize_alef_maqsura: bool = False,
    convert_digits: bool = True,
    strip_control_chars: bool = True,
    collapse_whitespace: bool = True,
) -> str:
    """One-call normalization of Arabic (or mixed Arabic/English) text.

    Each keyword switches a single step on or off; the keywords and their
    defaults mirror the fields of :class:`NormalizerConfig`. The call builds
    that config and runs a throwaway :class:`Normalizer` over ``text``.

    Example::

        >>> normalize("الْأَرْقَام: ١٢٣ and English")
        'الارقام: 123 and English'
    """
    # The parameters shadow the module-level helpers of the same name; here
    # they are plain booleans forwarded to the config.
    flags = {
        "remove_diacritics": remove_diacritics,
        "remove_tatweel": remove_tatweel,
        "normalize_alef": normalize_alef,
        "normalize_hamza": normalize_hamza,
        "normalize_ta_marbuta": normalize_ta_marbuta,
        "normalize_alef_maqsura": normalize_alef_maqsura,
        "convert_digits": convert_digits,
        "strip_control_chars": strip_control_chars,
        "collapse_whitespace": collapse_whitespace,
    }
    normalizer = Normalizer(NormalizerConfig(**flags))
    return normalizer.normalize(text)
@@ -0,0 +1,197 @@
1
+ """A tiny, provider-agnostic vector index for semantic search.
2
+
3
+ This module has an **optional** dependency on ``numpy``. The heavy lifting of
4
+ turning text into vectors is delegated to a caller-supplied ``embed_fn`` — the
5
+ index never hardcodes an embedding provider and never needs an API key.
6
+
7
+ Install the extra with::
8
+
9
+ pip install "arabic-rag-kit[search]"
10
+
11
+ Example::
12
+
13
+ from arabic_rag_kit import VectorIndex
14
+
15
+ def embed(text): # your embedding of choice
16
+ ...
17
+
18
+ index = VectorIndex(embed)
19
+ index.add(["القاهرة عاصمة مصر", "باريس عاصمة فرنسا"])
20
+ for hit in index.search("ما هي عاصمة مصر؟", k=1):
21
+ print(hit.text, hit.score)
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ from collections.abc import Callable, Iterable, Sequence
27
+ from dataclasses import dataclass, field
28
+ from typing import Any
29
+
30
+ __all__ = ["VectorIndex", "SearchResult", "sentence_transformers_embedder"]
31
+
32
EmbedFn = Callable[[str], Sequence[float]]

_NUMPY_HINT = (
    "VectorIndex requires numpy. Install it with:\n"
    ' pip install "arabic-rag-kit[search]"'
)


def _require_numpy():
    """Import numpy lazily, raising a helpful error if it is missing."""
    try:
        import numpy as np
    except ImportError as exc:  # pragma: no cover - exercised via monkeypatch
        raise ImportError(_NUMPY_HINT) from exc
    return np


@dataclass
class SearchResult:
    """A single search hit.

    Attributes:
        text: The stored text that matched.
        score: Cosine similarity between the query and ``text``.
        metadata: A copy of the metadata stored alongside ``text``.
        index: Position of ``text`` in the index, or ``-1`` if unset.
    """

    text: str
    score: float
    metadata: dict[str, Any] = field(default_factory=dict)
    index: int = -1


class VectorIndex:
    """An in-memory cosine-similarity index over embedded texts.

    Args:
        embed_fn: Callable mapping a string to a vector (``list[float]`` or a
            numpy array). Called once per text on :meth:`add` and once per
            query on :meth:`search`.
        normalize: If ``True`` (default), stored vectors are L2-normalized so
            that cosine similarity reduces to a dot product. If ``False``,
            vectors are stored as returned and :meth:`search` divides by the
            norms at query time.

    Raises:
        TypeError: If ``embed_fn`` is not callable.
        ImportError: If numpy is not installed.
    """

    def __init__(self, embed_fn: EmbedFn, *, normalize: bool = True) -> None:
        if not callable(embed_fn):
            raise TypeError("embed_fn must be callable")
        self._np = _require_numpy()
        self.embed_fn = embed_fn
        self.normalize = normalize
        self._matrix = None  # numpy array, shape (n, dim)
        self.texts: list[str] = []
        self.metadatas: list[dict[str, Any]] = []

    def __len__(self) -> int:
        return len(self.texts)

    @property
    def dim(self) -> int | None:
        """Embedding dimension, or ``None`` if the index is empty."""
        if self._matrix is None:
            return None
        return int(self._matrix.shape[1])

    def _vectorize(self, text: str):
        """Embed ``text`` and return a float32 1-D vector.

        Outputs with at most one axis longer than 1 (e.g. shape ``(1, dim)``)
        are flattened. Anything with several rows *and* columns is rejected
        rather than being silently flattened into one long vector.

        Raises:
            ValueError: If the embedding is empty or not vector-shaped.
        """
        np = self._np
        arr = np.asarray(self.embed_fn(text), dtype=np.float32)
        if arr.size == 0 or (arr.ndim > 1 and arr.size != max(arr.shape)):
            raise ValueError("embed_fn must return a non-empty 1-D vector")
        vec = arr.ravel()
        if self.normalize:
            norm = float(np.linalg.norm(vec))
            # A zero vector cannot be normalized; keep it as is.
            if norm > 0.0:
                vec = vec / norm
        return vec

    def add(
        self,
        texts: Iterable[str],
        metadatas: Sequence[dict[str, Any]] | None = None,
    ) -> None:
        """Embed and add ``texts`` (with optional parallel ``metadatas``).

        The index is only modified once every text has been embedded and
        every metadata entry copied, so a failure leaves it unchanged.

        Raises:
            ValueError: If ``metadatas`` and ``texts`` differ in length, or
                the new embeddings do not match the existing dimension.
        """
        texts = list(texts)
        if metadatas is not None and len(metadatas) != len(texts):
            raise ValueError("metadatas must be the same length as texts")
        if not texts:
            return

        # Prepare everything that can fail before touching instance state.
        if metadatas is None:
            new_metas: list[dict[str, Any]] = [{} for _ in texts]
        else:
            new_metas = [dict(m) for m in metadatas]

        np = self._np
        new_vecs = np.vstack([self._vectorize(t) for t in texts])
        if self._matrix is None:
            matrix = new_vecs
        else:
            if new_vecs.shape[1] != self._matrix.shape[1]:
                raise ValueError(
                    f"embedding dim {new_vecs.shape[1]} does not match "
                    f"existing dim {self._matrix.shape[1]}"
                )
            matrix = np.vstack([self._matrix, new_vecs])

        self._matrix = matrix
        self.texts.extend(texts)
        self.metadatas.extend(new_metas)

    def search(self, query: str, k: int = 5) -> list[SearchResult]:
        """Return the top-``k`` matches for ``query`` by cosine similarity.

        Results are ordered by descending score. Each result carries a copy
        of the stored metadata, so mutating it does not affect the index.

        Raises:
            ValueError: If ``k`` is not positive, or the query embedding's
                dimension differs from the index dimension.
        """
        if k <= 0:
            raise ValueError("k must be a positive integer")
        if self._matrix is None or len(self.texts) == 0:
            return []

        np = self._np
        q = self._vectorize(query)
        if q.shape[0] != self._matrix.shape[1]:
            raise ValueError(
                f"query embedding dim {q.shape[0]} does not match "
                f"index dim {self._matrix.shape[1]}"
            )
        # Cosine similarity. Stored rows are unit vectors when normalize=True;
        # otherwise divide by their norms here so the score stays in [-1, 1].
        scores = self._matrix @ q
        if not self.normalize:
            row_norms = np.linalg.norm(self._matrix, axis=1)
            qn = float(np.linalg.norm(q))
            denom = row_norms * (qn or 1.0)
            denom[denom == 0.0] = 1.0  # avoid dividing by zero for null vectors
            scores = scores / denom

        k = min(k, len(self.texts))
        # Partial top-k, then sort just those k descending.
        top = np.argpartition(-scores, k - 1)[:k]
        top = top[np.argsort(-scores[top])]
        return [
            SearchResult(
                text=self.texts[i],
                score=float(scores[i]),
                metadata=dict(self.metadatas[i]),
                index=int(i),
            )
            for i in top
        ]
164
+
165
+
166
def sentence_transformers_embedder(
    model_name: str = "paraphrase-multilingual-MiniLM-L12-v2",
) -> EmbedFn:
    """Build an ``embed_fn`` on top of a ``sentence-transformers`` model.

    Both the ``sentence-transformers`` import and the model load happen when
    this factory is called, never at import time. The dependency comes with
    the ``embeddings`` extra::

        pip install "arabic-rag-kit[embeddings]"

    Args:
        model_name: Name passed to ``SentenceTransformer``. The default is a
            multilingual model.

    Returns:
        A callable mapping ``str -> list[float]`` that can be handed to
        :class:`VectorIndex`.

    Raises:
        ImportError: If ``sentence-transformers`` is not installed.
    """
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError as exc:
        raise ImportError(
            "sentence_transformers_embedder requires sentence-transformers. "
            'Install it with:\n pip install "arabic-rag-kit[embeddings]"'
        ) from exc

    # Loaded once and captured by the closure below.
    encoder = SentenceTransformer(model_name)

    def embed(text: str) -> list[float]:
        vector = encoder.encode(text, convert_to_numpy=True)
        return vector.tolist()

    return embed
@@ -0,0 +1,232 @@
1
+ Metadata-Version: 2.4
2
+ Name: arabic-rag-kit
3
+ Version: 0.1.0
4
+ Summary: Prepare Arabic (and mixed Arabic/English) documents for RAG and search: normalization, sentence-aware chunking, and a provider-agnostic vector index.
5
+ Project-URL: Homepage, https://github.com/GBMUAE/arabic-rag-kit
6
+ Project-URL: Repository, https://github.com/GBMUAE/arabic-rag-kit
7
+ Project-URL: Issues, https://github.com/GBMUAE/arabic-rag-kit/issues
8
+ Project-URL: Changelog, https://github.com/GBMUAE/arabic-rag-kit/blob/main/CHANGELOG.md
9
+ Author-email: Hasan Odeh <hodeh84@gmail.com>
10
+ Maintainer-email: Hasan Odeh <hodeh84@gmail.com>
11
+ License: MIT
12
+ License-File: LICENSE
13
+ Keywords: arabic,chunking,embeddings,information-retrieval,nlp,rag,text-normalization,vector-search
14
+ Classifier: Development Status :: 4 - Beta
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Natural Language :: Arabic
18
+ Classifier: Natural Language :: English
19
+ Classifier: Operating System :: OS Independent
20
+ Classifier: Programming Language :: Python :: 3
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Programming Language :: Python :: 3.13
24
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
25
+ Classifier: Topic :: Text Processing :: Linguistic
26
+ Classifier: Typing :: Typed
27
+ Requires-Python: >=3.11
28
+ Provides-Extra: all
29
+ Requires-Dist: numpy>=1.23; extra == 'all'
30
+ Requires-Dist: pypdf>=4.0; extra == 'all'
31
+ Requires-Dist: python-docx>=1.1; extra == 'all'
32
+ Requires-Dist: sentence-transformers>=2.2; extra == 'all'
33
+ Provides-Extra: dev
34
+ Requires-Dist: build>=1.2; extra == 'dev'
35
+ Requires-Dist: numpy>=1.23; extra == 'dev'
36
+ Requires-Dist: pytest>=8.0; extra == 'dev'
37
+ Requires-Dist: ruff>=0.5; extra == 'dev'
38
+ Provides-Extra: docs
39
+ Requires-Dist: pypdf>=4.0; extra == 'docs'
40
+ Requires-Dist: python-docx>=1.1; extra == 'docs'
41
+ Provides-Extra: embeddings
42
+ Requires-Dist: sentence-transformers>=2.2; extra == 'embeddings'
43
+ Provides-Extra: search
44
+ Requires-Dist: numpy>=1.23; extra == 'search'
45
+ Description-Content-Type: text/markdown
46
+
47
+ # arabic-rag-kit
48
+
49
+ **The missing first mile for Arabic RAG:** normalize, chunk, and index Arabic
50
+ (and mixed Arabic/English) documents — with a dependency-free core.
51
+
52
+ [![PyPI version](https://img.shields.io/pypi/v/arabic-rag-kit.svg)](https://pypi.org/project/arabic-rag-kit/)
53
+ [![Python versions](https://img.shields.io/pypi/pyversions/arabic-rag-kit.svg)](https://pypi.org/project/arabic-rag-kit/)
54
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
55
+ [![CI](https://github.com/GBMUAE/arabic-rag-kit/actions/workflows/ci.yml/badge.svg)](https://github.com/GBMUAE/arabic-rag-kit/actions/workflows/ci.yml)
56
+
57
+ ---
58
+
59
+ ## Why this exists
60
+
61
+ Most RAG and search tooling is built and tested against English. Arabic brings
62
+ problems those tools quietly get wrong:
63
+
64
+ - **Diacritics (tashkeel), tatweel, and letter variants** (`أ`/`إ`/`آ` vs `ا`)
65
+ fragment what should be the same token, tanking retrieval recall.
66
+ - **Invisible characters** — zero-width joiners and bidirectional control marks —
67
+ sneak into copied text and corrupt indexes and embeddings.
68
+ - **Arabic-Indic digits** (`٠١٢٣`) and **Arabic punctuation** (`؟ ؛ ،`) are
69
+ invisible to English-centric normalizers and sentence splitters, so chunks
70
+ break in the wrong places.
71
+
72
+ `arabic-rag-kit` handles these correctly, with a **zero-dependency core** so you
73
+ can drop it into any pipeline. Embeddings and file loaders are opt-in extras —
74
+ the library never forces a vendor or an API key on you.
75
+
76
+ ## Install
77
+
78
+ ```bash
79
+ # Core: normalization + chunking. Zero third-party dependencies.
80
+ pip install arabic-rag-kit
81
+
82
+ # Add the numpy-backed vector index:
83
+ pip install "arabic-rag-kit[search]"
84
+
85
+ # Add the sentence-transformers embedder helper:
86
+ pip install "arabic-rag-kit[embeddings]"
87
+
88
+ # Add PDF/DOCX loaders:
89
+ pip install "arabic-rag-kit[docs]"
90
+
91
+ # Everything:
92
+ pip install "arabic-rag-kit[all]"
93
+ ```
94
+
95
+ Requires Python **3.11+**.
96
+
97
+ ## Quickstart
98
+
99
+ ### 1. Normalize
100
+
101
+ ```python
102
+ from arabic_rag_kit import normalize
103
+
104
+ raw = "الْعَرَبِيَّةُ لُغَةٌ جَمِيلَة… كتـــاب رقم ١٢٣"
105
+ print(normalize(raw))
106
+ # -> "العربية لغة جميلة… كتاب رقم 123"
107
+ ```
108
+
109
+ Every step is toggleable. Meaning-changing folds (hamza, ta-marbuta, alef
110
+ maqsura) are **off by default** so you don't distort the text unless you ask:
111
+
112
+ ```python
113
+ normalize("مؤسسة على مدرسة", normalize_hamza=True,
114
+ normalize_ta_marbuta=True, normalize_alef_maqsura=True)
115
+ # -> "موسسه علي مدرسه"
116
+ ```
117
+
118
+ Reuse a configured instance:
119
+
120
+ ```python
121
+ from arabic_rag_kit import Normalizer, NormalizerConfig
122
+
123
+ norm = Normalizer(NormalizerConfig(normalize_hamza=True))
124
+ norm("شيء مؤكد") # -> "شيء موكد"
125
+ ```
126
+
127
+ ### 2. Chunk (sentence-aware)
128
+
129
+ ```python
130
+ from arabic_rag_kit import chunk_text
131
+
132
+ text = (
133
+ "الذكاء الاصطناعي يغير طريقة عملنا. "
134
+ "أنظمة استرجاع المعلومات تعتمد على تقطيع جيد للنص. "
135
+ "كيف نضمن جودة التقطيع؟ عبر احترام حدود الجمل العربية."
136
+ )
137
+
138
+ chunks = chunk_text(text, chunk_size=80, chunk_overlap=20)
139
+ for c in chunks:
140
+ print(f"[{c.index}] ({c.start_char}:{c.end_char}) {c.text}")
141
+ ```
142
+
143
+ Chunks never exceed `chunk_size`, prefer to break on Arabic/Latin sentence
144
+ boundaries, and carry exact character offsets back into the source. `؟ ؛ ،` and
145
+ the Arabic full stop are all recognized; decimals (`3.14`) and abbreviations
146
+ (`Dr.`, `e.g.`) don't cause false breaks. Pass `normalize=True` to normalize
147
+ before chunking in one step.
148
+
149
+ ### 3. Index & search (optional `[search]` extra)
150
+
151
+ `VectorIndex` never hardcodes an embedding provider — you hand it any
152
+ `embed_fn` (text → vector). Bring your own model, or use the built-in
153
+ sentence-transformers helper (which additionally needs the `[embeddings]` extra):
154
+
155
+ ```python
156
+ from arabic_rag_kit import VectorIndex, chunk_text
157
+ from arabic_rag_kit.search import sentence_transformers_embedder
158
+
159
+ embed = sentence_transformers_embedder() # multilingual, handles Arabic
160
+ index = VectorIndex(embed)
161
+
162
+ docs = [c.text for c in chunks]  # `chunks` comes from the chunking example in step 2
163
+ index.add(docs, metadatas=[{"chunk": c.index} for c in chunks])
164
+
165
+ for hit in index.search("ما أهمية تقطيع النص؟", k=3):
166
+ print(round(hit.score, 3), hit.metadata, hit.text)
167
+ ```
168
+
169
+ Any callable that maps `str -> list[float]` works, so tests can pass a simple stub and skip the model download:
170
+
171
+ ```python
172
+ def my_embed(text: str) -> list[float]:
173
+ ... # call OpenAI, Cohere, a local model, whatever
174
+ index = VectorIndex(my_embed)
175
+ ```
176
+
177
+ ### 4. Load documents (optional `[docs]` extra)
178
+
179
+ ```python
180
+ from arabic_rag_kit.loaders import load_txt, load_pdf, load_docx
181
+
182
+ text = load_pdf("report_ar.pdf") # needs [docs]
183
+ text = load_docx("memo_ar.docx") # needs [docs]
184
+ text = load_txt("notes_ar.txt") # stdlib, always available
185
+ ```
186
+
187
+ ## API overview
188
+
189
+ | Symbol | Import | Extra | What it does |
190
+ | --- | --- | --- | --- |
191
+ | `normalize(text, **opts)` | `arabic_rag_kit` | — | One-shot Arabic normalization |
192
+ | `Normalizer` / `NormalizerConfig` | `arabic_rag_kit` | — | Reusable, configured normalizer |
193
+ | `split_sentences(text)` | `arabic_rag_kit` | — | Arabic/Latin sentence splitting |
194
+ | `chunk_text(text, chunk_size, chunk_overlap, normalize)` | `arabic_rag_kit` | — | Sentence-aware chunking |
195
+ | `Chunk` | `arabic_rag_kit` | — | `text, index, start_char, end_char` |
196
+ | `VectorIndex` | `arabic_rag_kit` | `[search]` | Cosine-similarity vector index |
197
+ | `sentence_transformers_embedder(model_name)` | `arabic_rag_kit.search` | `[embeddings]` | Ready-made `embed_fn` |
198
+ | `load_txt` / `load_pdf` / `load_docx` | `arabic_rag_kit.loaders` | `[docs]`\* | File loaders (\*txt is stdlib) |
199
+
200
+ ### Normalization options (defaults)
201
+
202
+ | Option | Default | Effect |
203
+ | --- | --- | --- |
204
+ | `remove_diacritics` | `True` | Strip tashkeel/harakat (U+064B–U+0652, U+0670) |
205
+ | `remove_tatweel` | `True` | Remove kashida elongation (U+0640) |
206
+ | `normalize_alef` | `True` | `أ إ آ ٱ` → `ا` |
207
+ | `normalize_hamza` | `False` | `ؤ` → `و`, `ئ` → `ي` |
208
+ | `normalize_ta_marbuta` | `False` | `ة` → `ه` |
209
+ | `normalize_alef_maqsura` | `False` | `ى` → `ي` |
210
+ | `convert_digits` | `True` | `٠–٩` and `۰–۹` → `0–9` |
211
+ | `strip_control_chars` | `True` | Remove zero-width & bidi controls |
212
+ | `collapse_whitespace` | `True` | Collapse runs of whitespace and trim |
213
+
214
+ ## Development
215
+
216
+ ```bash
217
+ pip install -e ".[dev]"
218
+ ruff check .
219
+ pytest
220
+ ```
221
+
222
+ See [CONTRIBUTING.md](CONTRIBUTING.md).
223
+
224
+ ## Built by GBM
225
+
226
+ Created and maintained by **Hasan Odeh** at **Gulf Business Machines (GBM)**.
227
+ Born out of real Arabic RAG work, and open-sourced because Arabic NLP deserves
228
+ better tooling. Contributions welcome.
229
+
230
+ ## License
231
+
232
+ [MIT](LICENSE) © Gulf Business Machines (GBM)
@@ -0,0 +1,9 @@
1
+ arabic_rag_kit/__init__.py,sha256=VeESqL7BUzxG07KJqKs4Zge_Xp3wor3lNKuLwChFzUc,1056
2
+ arabic_rag_kit/chunk.py,sha256=lkMG8M3R6Fg-ioZZsdTTiIicY4Sfup4gbgLmv8Jc4H8,9070
3
+ arabic_rag_kit/loaders.py,sha256=ll7pj5OwBRlBupeJ_OR2IOMc0YPgMJtBk0T2cFK8i38,1895
4
+ arabic_rag_kit/normalize.py,sha256=ZhTf95-NanMInwWMQs8wM40WwvHzpfS_QQ__GiVvr4o,8040
5
+ arabic_rag_kit/search.py,sha256=NOzDu4NYx-1cqykgIkc_1Ww06jzdP1ZF3ZC9zXip9cg,6648
6
+ arabic_rag_kit-0.1.0.dist-info/METADATA,sha256=0ihN2JVMOgIK7dulqh2IMdnJiiy269kB13hg2o7w1QM,8796
7
+ arabic_rag_kit-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
8
+ arabic_rag_kit-0.1.0.dist-info/licenses/LICENSE,sha256=Lib4WVWsPoK3nOejd5d0cXvLXOj0Mc8rvwBc0UTtMLw,1085
9
+ arabic_rag_kit-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Gulf Business Machines (GBM)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.