arabic-rag-kit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arabic_rag_kit-0.1.0/.gitignore +56 -0
- arabic_rag_kit-0.1.0/CHANGELOG.md +34 -0
- arabic_rag_kit-0.1.0/CONTRIBUTING.md +54 -0
- arabic_rag_kit-0.1.0/LICENSE +21 -0
- arabic_rag_kit-0.1.0/PKG-INFO +232 -0
- arabic_rag_kit-0.1.0/README.md +186 -0
- arabic_rag_kit-0.1.0/arabic_rag_kit/__init__.py +38 -0
- arabic_rag_kit-0.1.0/arabic_rag_kit/chunk.py +269 -0
- arabic_rag_kit-0.1.0/arabic_rag_kit/loaders.py +65 -0
- arabic_rag_kit-0.1.0/arabic_rag_kit/normalize.py +249 -0
- arabic_rag_kit-0.1.0/arabic_rag_kit/search.py +197 -0
- arabic_rag_kit-0.1.0/pyproject.toml +87 -0
- arabic_rag_kit-0.1.0/tests/test_chunk.py +157 -0
- arabic_rag_kit-0.1.0/tests/test_normalize.py +134 -0
- arabic_rag_kit-0.1.0/tests/test_search.py +138 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# Distribution / packaging
|
|
7
|
+
.Python
|
|
8
|
+
build/
|
|
9
|
+
develop-eggs/
|
|
10
|
+
dist/
|
|
11
|
+
downloads/
|
|
12
|
+
eggs/
|
|
13
|
+
.eggs/
|
|
14
|
+
lib/
|
|
15
|
+
lib64/
|
|
16
|
+
parts/
|
|
17
|
+
sdist/
|
|
18
|
+
var/
|
|
19
|
+
wheels/
|
|
20
|
+
share/python-wheels/
|
|
21
|
+
*.egg-info/
|
|
22
|
+
.installed.cfg
|
|
23
|
+
*.egg
|
|
24
|
+
MANIFEST
|
|
25
|
+
|
|
26
|
+
# Unit test / coverage reports
|
|
27
|
+
htmlcov/
|
|
28
|
+
.tox/
|
|
29
|
+
.nox/
|
|
30
|
+
.coverage
|
|
31
|
+
.coverage.*
|
|
32
|
+
.cache
|
|
33
|
+
nosetests.xml
|
|
34
|
+
coverage.xml
|
|
35
|
+
*.cover
|
|
36
|
+
.pytest_cache/
|
|
37
|
+
.ruff_cache/
|
|
38
|
+
|
|
39
|
+
# Virtual environments
|
|
40
|
+
.venv/
|
|
41
|
+
venv/
|
|
42
|
+
env/
|
|
43
|
+
ENV/
|
|
44
|
+
|
|
45
|
+
# Type checkers
|
|
46
|
+
.mypy_cache/
|
|
47
|
+
.pyre/
|
|
48
|
+
|
|
49
|
+
# Editors / OS
|
|
50
|
+
.idea/
|
|
51
|
+
.vscode/
|
|
52
|
+
*.swp
|
|
53
|
+
.DS_Store
|
|
54
|
+
|
|
55
|
+
# Jupyter
|
|
56
|
+
.ipynb_checkpoints
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.1.0] - 2026-07-02
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- `normalize()`, `Normalizer`, and `NormalizerConfig` for Arabic text
|
|
14
|
+
normalization: diacritic/tashkeel removal, tatweel stripping, alef/hamza/
|
|
15
|
+
ta-marbuta/alef-maqsura folding, Arabic-Indic and Eastern Arabic-Indic digit
|
|
16
|
+
conversion, zero-width/bidi control-character stripping, and whitespace
|
|
17
|
+
collapsing. Pure standard library.
|
|
18
|
+
- Composable single-purpose helpers (`remove_diacritics`, `normalize_alef`,
|
|
19
|
+
`convert_digits`, `strip_control_chars`, …).
|
|
20
|
+
- `split_sentences()` — Arabic- and Latin-aware sentence splitting that does
|
|
21
|
+
not break on decimals or common abbreviations.
|
|
22
|
+
- `chunk_text()` and the `Chunk` dataclass — recursive, sentence-aware
|
|
23
|
+
character chunking with configurable overlap and exact character offsets.
|
|
24
|
+
- `VectorIndex` — a provider-agnostic, numpy-backed cosine-similarity index
|
|
25
|
+
that takes a caller-supplied `embed_fn` (optional `[search]` extra).
|
|
26
|
+
- `sentence_transformers_embedder()` — optional helper returning an `embed_fn`
|
|
27
|
+
backed by sentence-transformers (optional `[embeddings]` extra).
|
|
28
|
+
- `load_txt`, `load_pdf`, `load_docx` document loaders (optional `[docs]`
|
|
29
|
+
extra for PDF/DOCX).
|
|
30
|
+
- Full pytest suite, ruff configuration, and GitHub Actions CI + PyPI Trusted
|
|
31
|
+
Publishing workflows.
|
|
32
|
+
|
|
33
|
+
[Unreleased]: https://github.com/GBMUAE/arabic-rag-kit/compare/v0.1.0...HEAD
|
|
34
|
+
[0.1.0]: https://github.com/GBMUAE/arabic-rag-kit/releases/tag/v0.1.0
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# Contributing to arabic-rag-kit
|
|
2
|
+
|
|
3
|
+
Thanks for your interest in improving **arabic-rag-kit**! Contributions of all
|
|
4
|
+
kinds are welcome — bug reports, documentation, tests, and code.
|
|
5
|
+
|
|
6
|
+
## Development setup
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
git clone https://github.com/GBMUAE/arabic-rag-kit.git
|
|
10
|
+
cd arabic-rag-kit
|
|
11
|
+
|
|
12
|
+
python -m venv .venv
|
|
13
|
+
source .venv/bin/activate # Windows: .venv\Scripts\activate
|
|
14
|
+
|
|
15
|
+
pip install -e ".[dev]" # editable install with dev tools
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Running the checks
|
|
19
|
+
|
|
20
|
+
Before opening a pull request, make sure both the linter and the test suite
|
|
21
|
+
pass:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
ruff check .
|
|
25
|
+
pytest
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Please add tests for any new behavior. The suite must stay green on Python
|
|
29
|
+
3.11 and 3.12 (this is what CI runs).
|
|
30
|
+
|
|
31
|
+
## Design principles
|
|
32
|
+
|
|
33
|
+
- **Core stays dependency-free.** The base install must have zero required
|
|
34
|
+
third-party dependencies. Anything heavier (numpy, sentence-transformers,
|
|
35
|
+
pypdf, python-docx) belongs behind an optional extra and must be imported
|
|
36
|
+
lazily with a clear `ImportError` pointing at the right extra.
|
|
37
|
+
- **Correctness first for Arabic.** Normalization and sentence splitting
|
|
38
|
+
should be justified against real Arabic text. Include examples in the PR.
|
|
39
|
+
- **Small, composable functions.** Prefer pure helpers that do one thing.
|
|
40
|
+
|
|
41
|
+
## Commit messages
|
|
42
|
+
|
|
43
|
+
Use clear, imperative commit messages (e.g. "Add hamza folding option"). Group
|
|
44
|
+
related changes into logical commits.
|
|
45
|
+
|
|
46
|
+
## Reporting bugs
|
|
47
|
+
|
|
48
|
+
Open an issue with a minimal reproducible example, the input text (copy the
|
|
49
|
+
exact Unicode), the expected output, and the actual output.
|
|
50
|
+
|
|
51
|
+
## License
|
|
52
|
+
|
|
53
|
+
By contributing, you agree that your contributions will be licensed under the
|
|
54
|
+
MIT License that covers the project.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Gulf Business Machines (GBM)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: arabic-rag-kit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Prepare Arabic (and mixed Arabic/English) documents for RAG and search: normalization, sentence-aware chunking, and a provider-agnostic vector index.
|
|
5
|
+
Project-URL: Homepage, https://github.com/GBMUAE/arabic-rag-kit
|
|
6
|
+
Project-URL: Repository, https://github.com/GBMUAE/arabic-rag-kit
|
|
7
|
+
Project-URL: Issues, https://github.com/GBMUAE/arabic-rag-kit/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/GBMUAE/arabic-rag-kit/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: Hasan Odeh <hodeh84@gmail.com>
|
|
10
|
+
Maintainer-email: Hasan Odeh <hodeh84@gmail.com>
|
|
11
|
+
License: MIT
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Keywords: arabic,chunking,embeddings,information-retrieval,nlp,rag,text-normalization,vector-search
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Natural Language :: Arabic
|
|
18
|
+
Classifier: Natural Language :: English
|
|
19
|
+
Classifier: Operating System :: OS Independent
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
25
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
26
|
+
Classifier: Typing :: Typed
|
|
27
|
+
Requires-Python: >=3.11
|
|
28
|
+
Provides-Extra: all
|
|
29
|
+
Requires-Dist: numpy>=1.23; extra == 'all'
|
|
30
|
+
Requires-Dist: pypdf>=4.0; extra == 'all'
|
|
31
|
+
Requires-Dist: python-docx>=1.1; extra == 'all'
|
|
32
|
+
Requires-Dist: sentence-transformers>=2.2; extra == 'all'
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: build>=1.2; extra == 'dev'
|
|
35
|
+
Requires-Dist: numpy>=1.23; extra == 'dev'
|
|
36
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
37
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
38
|
+
Provides-Extra: docs
|
|
39
|
+
Requires-Dist: pypdf>=4.0; extra == 'docs'
|
|
40
|
+
Requires-Dist: python-docx>=1.1; extra == 'docs'
|
|
41
|
+
Provides-Extra: embeddings
|
|
42
|
+
Requires-Dist: sentence-transformers>=2.2; extra == 'embeddings'
|
|
43
|
+
Provides-Extra: search
|
|
44
|
+
Requires-Dist: numpy>=1.23; extra == 'search'
|
|
45
|
+
Description-Content-Type: text/markdown
|
|
46
|
+
|
|
47
|
+
# arabic-rag-kit
|
|
48
|
+
|
|
49
|
+
**The missing first mile for Arabic RAG:** normalize, chunk, and index Arabic
|
|
50
|
+
(and mixed Arabic/English) documents — with a dependency-free core.
|
|
51
|
+
|
|
52
|
+
[PyPI version](https://pypi.org/project/arabic-rag-kit/)
|
|
53
|
+
[Python versions](https://pypi.org/project/arabic-rag-kit/)
|
|
54
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
55
|
+
[CI](https://github.com/GBMUAE/arabic-rag-kit/actions/workflows/ci.yml)
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
## Why this exists
|
|
60
|
+
|
|
61
|
+
Most RAG and search tooling is built and tested against English. Arabic brings
|
|
62
|
+
problems those tools quietly get wrong:
|
|
63
|
+
|
|
64
|
+
- **Diacritics (tashkeel), tatweel, and letter variants** (`أ`/`إ`/`آ` vs `ا`)
|
|
65
|
+
fragment what should be the same token, tanking retrieval recall.
|
|
66
|
+
- **Invisible characters** — zero-width joiners and bidirectional control marks —
|
|
67
|
+
sneak into copied text and corrupt indexes and embeddings.
|
|
68
|
+
- **Arabic-Indic digits** (`٠١٢٣`) and **Arabic punctuation** (`؟ ؛ ،`) are
|
|
69
|
+
invisible to English-centric normalizers and sentence splitters, so chunks
|
|
70
|
+
break in the wrong places.
|
|
71
|
+
|
|
72
|
+
`arabic-rag-kit` handles these correctly, with a **zero-dependency core** so you
|
|
73
|
+
can drop it into any pipeline. Embeddings and file loaders are opt-in extras —
|
|
74
|
+
the library never forces a vendor or an API key on you.
|
|
75
|
+
|
|
76
|
+
## Install
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
# Core: normalization + chunking. Zero third-party dependencies.
|
|
80
|
+
pip install arabic-rag-kit
|
|
81
|
+
|
|
82
|
+
# Add the numpy-backed vector index:
|
|
83
|
+
pip install "arabic-rag-kit[search]"
|
|
84
|
+
|
|
85
|
+
# Add the sentence-transformers embedder helper:
|
|
86
|
+
pip install "arabic-rag-kit[embeddings]"
|
|
87
|
+
|
|
88
|
+
# Add PDF/DOCX loaders:
|
|
89
|
+
pip install "arabic-rag-kit[docs]"
|
|
90
|
+
|
|
91
|
+
# Everything:
|
|
92
|
+
pip install "arabic-rag-kit[all]"
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Requires Python **3.11+**.
|
|
96
|
+
|
|
97
|
+
## Quickstart
|
|
98
|
+
|
|
99
|
+
### 1. Normalize
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from arabic_rag_kit import normalize
|
|
103
|
+
|
|
104
|
+
raw = "الْعَرَبِيَّةُ لُغَةٌ جَمِيلَة… كتـــاب رقم ١٢٣"
|
|
105
|
+
print(normalize(raw))
|
|
106
|
+
# -> "العربية لغة جميلة… كتاب رقم 123"
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Every step is toggleable. Meaning-changing folds (hamza, ta-marbuta, alef
|
|
110
|
+
maqsura) are **off by default** so you don't distort the text unless you ask:
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
normalize("مؤسسة على مدرسة", normalize_hamza=True,
|
|
114
|
+
normalize_ta_marbuta=True, normalize_alef_maqsura=True)
|
|
115
|
+
# -> "موسسه علي مدرسه"
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Reuse a configured instance:
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
from arabic_rag_kit import Normalizer, NormalizerConfig
|
|
122
|
+
|
|
123
|
+
norm = Normalizer(NormalizerConfig(normalize_hamza=True))
|
|
124
|
+
norm("شيء مؤكد") # -> "شيء موكد"
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### 2. Chunk (sentence-aware)
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from arabic_rag_kit import chunk_text
|
|
131
|
+
|
|
132
|
+
text = (
|
|
133
|
+
"الذكاء الاصطناعي يغير طريقة عملنا. "
|
|
134
|
+
"أنظمة استرجاع المعلومات تعتمد على تقطيع جيد للنص. "
|
|
135
|
+
"كيف نضمن جودة التقطيع؟ عبر احترام حدود الجمل العربية."
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
chunks = chunk_text(text, chunk_size=80, chunk_overlap=20)
|
|
139
|
+
for c in chunks:
|
|
140
|
+
print(f"[{c.index}] ({c.start_char}:{c.end_char}) {c.text}")
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Chunks never exceed `chunk_size`, prefer to break on Arabic/Latin sentence
|
|
144
|
+
boundaries, and carry exact character offsets back into the source. `؟ ؛ ،` and
|
|
145
|
+
the Arabic full stop are all recognized; decimals (`3.14`) and abbreviations
|
|
146
|
+
(`Dr.`, `e.g.`) don't cause false breaks. Pass `normalize=True` to normalize
|
|
147
|
+
before chunking in one step.
|
|
148
|
+
|
|
149
|
+
### 3. Index & search (optional `[search]` extra)
|
|
150
|
+
|
|
151
|
+
`VectorIndex` never hardcodes an embedding provider — you hand it any
|
|
152
|
+
`embed_fn` (text → vector). Bring your own model, or use the built-in
|
|
153
|
+
sentence-transformers helper:
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
from arabic_rag_kit import VectorIndex, chunk_text
|
|
157
|
+
from arabic_rag_kit.search import sentence_transformers_embedder
|
|
158
|
+
|
|
159
|
+
embed = sentence_transformers_embedder() # multilingual, handles Arabic
|
|
160
|
+
index = VectorIndex(embed)
|
|
161
|
+
|
|
162
|
+
docs = [c.text for c in chunks]
|
|
163
|
+
index.add(docs, metadatas=[{"chunk": c.index} for c in chunks])
|
|
164
|
+
|
|
165
|
+
for hit in index.search("ما أهمية تقطيع النص؟", k=3):
|
|
166
|
+
print(round(hit.score, 3), hit.metadata, hit.text)
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
Any callable works — no model download required for testing:
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
def my_embed(text: str) -> list[float]:
|
|
173
|
+
... # call OpenAI, Cohere, a local model, whatever
|
|
174
|
+
index = VectorIndex(my_embed)
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### 4. Load documents (optional `[docs]` extra)
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
from arabic_rag_kit.loaders import load_txt, load_pdf, load_docx
|
|
181
|
+
|
|
182
|
+
text = load_pdf("report_ar.pdf") # needs [docs]
|
|
183
|
+
text = load_docx("memo_ar.docx") # needs [docs]
|
|
184
|
+
text = load_txt("notes_ar.txt") # stdlib, always available
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
## API overview
|
|
188
|
+
|
|
189
|
+
| Symbol | Import | Extra | What it does |
|
|
190
|
+
| --- | --- | --- | --- |
|
|
191
|
+
| `normalize(text, **opts)` | `arabic_rag_kit` | — | One-shot Arabic normalization |
|
|
192
|
+
| `Normalizer` / `NormalizerConfig` | `arabic_rag_kit` | — | Reusable, configured normalizer |
|
|
193
|
+
| `split_sentences(text)` | `arabic_rag_kit` | — | Arabic/Latin sentence splitting |
|
|
194
|
+
| `chunk_text(text, chunk_size, chunk_overlap, normalize)` | `arabic_rag_kit` | — | Sentence-aware chunking |
|
|
195
|
+
| `Chunk` | `arabic_rag_kit` | — | `text, index, start_char, end_char` |
|
|
196
|
+
| `VectorIndex` | `arabic_rag_kit` | `[search]` | Cosine-similarity vector index |
|
|
197
|
+
| `sentence_transformers_embedder(model_name)` | `arabic_rag_kit.search` | `[embeddings]` | Ready-made `embed_fn` |
|
|
198
|
+
| `load_txt` / `load_pdf` / `load_docx` | `arabic_rag_kit.loaders` | `[docs]`\* | File loaders (\*txt is stdlib) |
|
|
199
|
+
|
|
200
|
+
### Normalization options (defaults)
|
|
201
|
+
|
|
202
|
+
| Option | Default | Effect |
|
|
203
|
+
| --- | --- | --- |
|
|
204
|
+
| `remove_diacritics` | `True` | Strip tashkeel/harakat (U+064B–U+0652, U+0670) |
|
|
205
|
+
| `remove_tatweel` | `True` | Remove kashida elongation (U+0640) |
|
|
206
|
+
| `normalize_alef` | `True` | `أ إ آ ٱ` → `ا` |
|
|
207
|
+
| `normalize_hamza` | `False` | `ؤ` → `و`, `ئ` → `ي` |
|
|
208
|
+
| `normalize_ta_marbuta` | `False` | `ة` → `ه` |
|
|
209
|
+
| `normalize_alef_maqsura` | `False` | `ى` → `ي` |
|
|
210
|
+
| `convert_digits` | `True` | `٠–٩` and `۰–۹` → `0–9` |
|
|
211
|
+
| `strip_control_chars` | `True` | Remove zero-width & bidi controls |
|
|
212
|
+
| `collapse_whitespace` | `True` | Collapse runs of whitespace and trim |
|
|
213
|
+
|
|
214
|
+
## Development
|
|
215
|
+
|
|
216
|
+
```bash
|
|
217
|
+
pip install -e ".[dev]"
|
|
218
|
+
ruff check .
|
|
219
|
+
pytest
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
223
|
+
|
|
224
|
+
## Built by GBM
|
|
225
|
+
|
|
226
|
+
Created and maintained by **Hasan Odeh** at **Gulf Business Machines (GBM)**.
|
|
227
|
+
Born out of real Arabic RAG work, and open-sourced because Arabic NLP deserves
|
|
228
|
+
better tooling. Contributions welcome.
|
|
229
|
+
|
|
230
|
+
## License
|
|
231
|
+
|
|
232
|
+
[MIT](LICENSE) © Gulf Business Machines (GBM)
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# arabic-rag-kit
|
|
2
|
+
|
|
3
|
+
**The missing first mile for Arabic RAG:** normalize, chunk, and index Arabic
|
|
4
|
+
(and mixed Arabic/English) documents — with a dependency-free core.
|
|
5
|
+
|
|
6
|
+
[PyPI version](https://pypi.org/project/arabic-rag-kit/)
|
|
7
|
+
[Python versions](https://pypi.org/project/arabic-rag-kit/)
|
|
8
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
9
|
+
[CI](https://github.com/GBMUAE/arabic-rag-kit/actions/workflows/ci.yml)
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Why this exists
|
|
14
|
+
|
|
15
|
+
Most RAG and search tooling is built and tested against English. Arabic brings
|
|
16
|
+
problems those tools quietly get wrong:
|
|
17
|
+
|
|
18
|
+
- **Diacritics (tashkeel), tatweel, and letter variants** (`أ`/`إ`/`آ` vs `ا`)
|
|
19
|
+
fragment what should be the same token, tanking retrieval recall.
|
|
20
|
+
- **Invisible characters** — zero-width joiners and bidirectional control marks —
|
|
21
|
+
sneak into copied text and corrupt indexes and embeddings.
|
|
22
|
+
- **Arabic-Indic digits** (`٠١٢٣`) and **Arabic punctuation** (`؟ ؛ ،`) are
|
|
23
|
+
invisible to English-centric normalizers and sentence splitters, so chunks
|
|
24
|
+
break in the wrong places.
|
|
25
|
+
|
|
26
|
+
`arabic-rag-kit` handles these correctly, with a **zero-dependency core** so you
|
|
27
|
+
can drop it into any pipeline. Embeddings and file loaders are opt-in extras —
|
|
28
|
+
the library never forces a vendor or an API key on you.
|
|
29
|
+
|
|
30
|
+
## Install
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
# Core: normalization + chunking. Zero third-party dependencies.
|
|
34
|
+
pip install arabic-rag-kit
|
|
35
|
+
|
|
36
|
+
# Add the numpy-backed vector index:
|
|
37
|
+
pip install "arabic-rag-kit[search]"
|
|
38
|
+
|
|
39
|
+
# Add the sentence-transformers embedder helper:
|
|
40
|
+
pip install "arabic-rag-kit[embeddings]"
|
|
41
|
+
|
|
42
|
+
# Add PDF/DOCX loaders:
|
|
43
|
+
pip install "arabic-rag-kit[docs]"
|
|
44
|
+
|
|
45
|
+
# Everything:
|
|
46
|
+
pip install "arabic-rag-kit[all]"
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Requires Python **3.11+**.
|
|
50
|
+
|
|
51
|
+
## Quickstart
|
|
52
|
+
|
|
53
|
+
### 1. Normalize
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from arabic_rag_kit import normalize
|
|
57
|
+
|
|
58
|
+
raw = "الْعَرَبِيَّةُ لُغَةٌ جَمِيلَة… كتـــاب رقم ١٢٣"
|
|
59
|
+
print(normalize(raw))
|
|
60
|
+
# -> "العربية لغة جميلة… كتاب رقم 123"
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Every step is toggleable. Meaning-changing folds (hamza, ta-marbuta, alef
|
|
64
|
+
maqsura) are **off by default** so you don't distort the text unless you ask:
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
normalize("مؤسسة على مدرسة", normalize_hamza=True,
|
|
68
|
+
normalize_ta_marbuta=True, normalize_alef_maqsura=True)
|
|
69
|
+
# -> "موسسه علي مدرسه"
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Reuse a configured instance:
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from arabic_rag_kit import Normalizer, NormalizerConfig
|
|
76
|
+
|
|
77
|
+
norm = Normalizer(NormalizerConfig(normalize_hamza=True))
|
|
78
|
+
norm("شيء مؤكد") # -> "شيء موكد"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### 2. Chunk (sentence-aware)
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from arabic_rag_kit import chunk_text
|
|
85
|
+
|
|
86
|
+
text = (
|
|
87
|
+
"الذكاء الاصطناعي يغير طريقة عملنا. "
|
|
88
|
+
"أنظمة استرجاع المعلومات تعتمد على تقطيع جيد للنص. "
|
|
89
|
+
"كيف نضمن جودة التقطيع؟ عبر احترام حدود الجمل العربية."
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
chunks = chunk_text(text, chunk_size=80, chunk_overlap=20)
|
|
93
|
+
for c in chunks:
|
|
94
|
+
print(f"[{c.index}] ({c.start_char}:{c.end_char}) {c.text}")
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Chunks never exceed `chunk_size`, prefer to break on Arabic/Latin sentence
|
|
98
|
+
boundaries, and carry exact character offsets back into the source. `؟ ؛ ،` and
|
|
99
|
+
the Arabic full stop are all recognized; decimals (`3.14`) and abbreviations
|
|
100
|
+
(`Dr.`, `e.g.`) don't cause false breaks. Pass `normalize=True` to normalize
|
|
101
|
+
before chunking in one step.
|
|
102
|
+
|
|
103
|
+
### 3. Index & search (optional `[search]` extra)
|
|
104
|
+
|
|
105
|
+
`VectorIndex` never hardcodes an embedding provider — you hand it any
|
|
106
|
+
`embed_fn` (text → vector). Bring your own model, or use the built-in
|
|
107
|
+
sentence-transformers helper:
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
from arabic_rag_kit import VectorIndex, chunk_text
|
|
111
|
+
from arabic_rag_kit.search import sentence_transformers_embedder
|
|
112
|
+
|
|
113
|
+
embed = sentence_transformers_embedder() # multilingual, handles Arabic
|
|
114
|
+
index = VectorIndex(embed)
|
|
115
|
+
|
|
116
|
+
docs = [c.text for c in chunks]
|
|
117
|
+
index.add(docs, metadatas=[{"chunk": c.index} for c in chunks])
|
|
118
|
+
|
|
119
|
+
for hit in index.search("ما أهمية تقطيع النص؟", k=3):
|
|
120
|
+
print(round(hit.score, 3), hit.metadata, hit.text)
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Any callable works — no model download required for testing:
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
def my_embed(text: str) -> list[float]:
|
|
127
|
+
... # call OpenAI, Cohere, a local model, whatever
|
|
128
|
+
index = VectorIndex(my_embed)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### 4. Load documents (optional `[docs]` extra)
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from arabic_rag_kit.loaders import load_txt, load_pdf, load_docx
|
|
135
|
+
|
|
136
|
+
text = load_pdf("report_ar.pdf") # needs [docs]
|
|
137
|
+
text = load_docx("memo_ar.docx") # needs [docs]
|
|
138
|
+
text = load_txt("notes_ar.txt") # stdlib, always available
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## API overview
|
|
142
|
+
|
|
143
|
+
| Symbol | Import | Extra | What it does |
|
|
144
|
+
| --- | --- | --- | --- |
|
|
145
|
+
| `normalize(text, **opts)` | `arabic_rag_kit` | — | One-shot Arabic normalization |
|
|
146
|
+
| `Normalizer` / `NormalizerConfig` | `arabic_rag_kit` | — | Reusable, configured normalizer |
|
|
147
|
+
| `split_sentences(text)` | `arabic_rag_kit` | — | Arabic/Latin sentence splitting |
|
|
148
|
+
| `chunk_text(text, chunk_size, chunk_overlap, normalize)` | `arabic_rag_kit` | — | Sentence-aware chunking |
|
|
149
|
+
| `Chunk` | `arabic_rag_kit` | — | `text, index, start_char, end_char` |
|
|
150
|
+
| `VectorIndex` | `arabic_rag_kit` | `[search]` | Cosine-similarity vector index |
|
|
151
|
+
| `sentence_transformers_embedder(model_name)` | `arabic_rag_kit.search` | `[embeddings]` | Ready-made `embed_fn` |
|
|
152
|
+
| `load_txt` / `load_pdf` / `load_docx` | `arabic_rag_kit.loaders` | `[docs]`\* | File loaders (\*txt is stdlib) |
|
|
153
|
+
|
|
154
|
+
### Normalization options (defaults)
|
|
155
|
+
|
|
156
|
+
| Option | Default | Effect |
|
|
157
|
+
| --- | --- | --- |
|
|
158
|
+
| `remove_diacritics` | `True` | Strip tashkeel/harakat (U+064B–U+0652, U+0670) |
|
|
159
|
+
| `remove_tatweel` | `True` | Remove kashida elongation (U+0640) |
|
|
160
|
+
| `normalize_alef` | `True` | `أ إ آ ٱ` → `ا` |
|
|
161
|
+
| `normalize_hamza` | `False` | `ؤ` → `و`, `ئ` → `ي` |
|
|
162
|
+
| `normalize_ta_marbuta` | `False` | `ة` → `ه` |
|
|
163
|
+
| `normalize_alef_maqsura` | `False` | `ى` → `ي` |
|
|
164
|
+
| `convert_digits` | `True` | `٠–٩` and `۰–۹` → `0–9` |
|
|
165
|
+
| `strip_control_chars` | `True` | Remove zero-width & bidi controls |
|
|
166
|
+
| `collapse_whitespace` | `True` | Collapse runs of whitespace and trim |
|
|
167
|
+
|
|
168
|
+
## Development
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
pip install -e ".[dev]"
|
|
172
|
+
ruff check .
|
|
173
|
+
pytest
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
177
|
+
|
|
178
|
+
## Built by GBM
|
|
179
|
+
|
|
180
|
+
Created and maintained by **Hasan Odeh** at **Gulf Business Machines (GBM)**.
|
|
181
|
+
Born out of real Arabic RAG work, and open-sourced because Arabic NLP deserves
|
|
182
|
+
better tooling. Contributions welcome.
|
|
183
|
+
|
|
184
|
+
## License
|
|
185
|
+
|
|
186
|
+
[MIT](LICENSE) © Gulf Business Machines (GBM)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""arabic-rag-kit — prepare Arabic (and mixed Arabic/English) documents for RAG.
|
|
2
|
+
|
|
3
|
+
A small, dependency-light toolkit for the unglamorous-but-critical first mile
|
|
4
|
+
of an Arabic RAG or search pipeline: normalization, sentence-aware chunking,
|
|
5
|
+
and a provider-agnostic vector index.
|
|
6
|
+
|
|
7
|
+
Built by Hasan Odeh at Gulf Business Machines (GBM). MIT licensed.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from .chunk import Chunk, chunk_text, split_sentences
|
|
13
|
+
from .normalize import Normalizer, NormalizerConfig, normalize
|
|
14
|
+
|
|
15
|
+
__version__ = "0.1.0"
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"__version__",
|
|
19
|
+
# normalize
|
|
20
|
+
"normalize",
|
|
21
|
+
"Normalizer",
|
|
22
|
+
"NormalizerConfig",
|
|
23
|
+
# chunk
|
|
24
|
+
"chunk_text",
|
|
25
|
+
"split_sentences",
|
|
26
|
+
"Chunk",
|
|
27
|
+
# search (imported lazily; see __getattr__)
|
|
28
|
+
"VectorIndex",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def __getattr__(name: str):
|
|
33
|
+
"""Lazily expose :class:`VectorIndex`: import it from ``.search`` only when first requested, so importing the package does not pull in the numpy-backed index; raise ``AttributeError`` for any other name."""
|
|
34
|
+
if name == "VectorIndex":
|
|
35
|
+
from .search import VectorIndex
|
|
36
|
+
|
|
37
|
+
return VectorIndex
|
|
38
|
+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|