arabic-sentencizer 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,333 @@
1
+ Metadata-Version: 2.4
2
+ Name: arabic-sentencizer
3
+ Version: 0.1.1
4
+ Summary: Arabic sentence splitter and text statistics tool for NLP preprocessing.
5
+ Author: Faisal Alshargi
6
+ License: MIT
7
+ Project-URL: Homepage, https://www.sanaa.ai
8
+ Project-URL: Repository, https://github.com/alshargi/arabic-sentencizer
9
+ Keywords: arabic,nlp,sentence-splitting,tokenization,sentencizer
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3 :: Only
12
+ Classifier: Natural Language :: Arabic
13
+ Classifier: Topic :: Text Processing :: Linguistic
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Classifier: Operating System :: OS Independent
16
+ Requires-Python: >=3.8
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: nltk>=3.8
20
+ Dynamic: license-file
21
+
22
+ # Arabic Sentencizer
23
+
24
+ A lightweight Arabic sentence segmentation and text analytics toolkit designed for Arabic NLP pipelines, Information Retrieval systems, Large Language Models (LLMs), corpus processing, data annotation workflows, and Retrieval-Augmented Generation (RAG) applications.
25
+
26
+ Arabic Sentencizer addresses common challenges in Arabic sentence boundary detection while preserving important linguistic and technical patterns such as abbreviations, URLs, emails, decimal numbers, and version identifiers.
27
+
28
+ ---
29
+
30
+ ## Features
31
+
32
+ * Arabic sentence segmentation
33
+ * URL-aware sentence splitting
34
+ * Email-aware sentence splitting
35
+ * Decimal number preservation (`1.5`, `95.5`, `١.٩`)
36
+ * Version number preservation (`v2.1.5`)
37
+ * Arabic abbreviation handling (`د.`, `أ.د.`, `م.`)
38
+ * Arabic punctuation support (`؟`, `؛`, `!`)
39
+ * Text statistics and analytics
40
+ * NLP preprocessing support
41
+ * Corpus preparation and annotation workflows
42
+ * Information Retrieval preprocessing
43
+ * RAG chunking preparation
44
+ * Human-readable text analysis reports
45
+
46
+ ---
47
+
48
+ ## Installation
49
+
50
+ Install from PyPI:
51
+
52
+ ```bash
53
+ pip install arabic-sentencizer
54
+ ```
55
+
56
+ For local development:
57
+
58
+ ```bash
59
+ pip install -e .
60
+ ```
61
+
62
+ ---
63
+
64
+ ## Quick Start
65
+
66
+ ### Sentence Segmentation
67
+
68
+ ```python
69
+ from arabic_sentencizer import split_sentences
70
+
71
+ text = """
72
+ د. أحمد وصل إلى الاجتماع الساعة 10.30 صباحاً.
73
+ ثم قال إن نسبة النجاح بلغت 95.5%.
74
+ زار الموقع https://www.sanaa.ai ثم انتقل إلى bbc.com.
75
+ """
76
+
77
+ sentences = split_sentences(text)
78
+
79
+ for i, sentence in enumerate(sentences, start=1):
80
+ print(i, sentence)
81
+ ```
82
+
83
+ Output:
84
+
85
+ ```text
86
+ 1 د. أحمد وصل إلى الاجتماع الساعة 10.30 صباحاً.
87
+ 2 ثم قال إن نسبة النجاح بلغت 95.5%.
88
+ 3 زار الموقع https://www.sanaa.ai ثم انتقل إلى bbc.com.
89
+ ```
90
+
91
+ ---
92
+
93
+ ## Real Example
94
+
95
+ ```python
96
+ from arabic_sentencizer import split_sentences, analyze_text, print_report
97
+
98
+ text = """
99
+ د. أحمد محمد، أستاذ الذكاء الاصطناعي في جامعة القاهرة، أعلن اليوم عن إصدار جديد من منصة التحليل اللغوي. الإصدار الحالي هو v2.1.5 بينما كان الإصدار السابق v2.0.9 فقط.
100
+
101
+ قال أ.د. محمد علي: "لقد حققنا نتائج ممتازة في معالجة اللغة العربية." وأضاف أن دقة النظام وصلت إلى 95.7% مقارنة بـ 88.4% العام الماضي.
102
+
103
+ يمكن للباحثين زيارة الموقع الرسمي:
104
+ https://www.sanaa.ai
105
+
106
+ كما يمكنهم مراجعة الوثائق عبر:
107
+ https://docs.sanaa.ai/api/v1.5/index.html
108
+
109
+ للتواصل مع فريق التطوير يرجى إرسال رسالة إلى:
110
+ research@sanaa.ai
111
+
112
+ بلغ عدد الوثائق المعالجة أكثر من 1.5 مليون وثيقة. كما تم تحليل 125.456.789 كلمة عربية خلال مرحلة الاختبار.
113
+
114
+ هل يمكن استخدام النظام في تطبيقات البحث الدلالي؟
115
+ نعم! يدعم النظام البحث الدلالي والبحث التقليدي معاً.
116
+
117
+ ممتاز؛ لننتقل الآن إلى تقييم النتائج.
118
+
119
+ شارك في المشروع كل من:
120
+ د. فيصل الشرقي
121
+ أ.د. محمود الأحمد
122
+ م. خالد السالم
123
+
124
+ وقد تم نشر النتائج في عدة مؤتمرات منها AAAI 2025 و ACL 2024 و EMNLP 2023.
125
+
126
+ زار الفريق مواقع مثل bbc.com و wikipedia.org و github.com أثناء جمع البيانات.
127
+
128
+ وأشار التقرير النهائي إلى أن متوسط زمن الاستجابة بلغ 0.35 ثانية فقط. بينما انخفض زمن الفهرسة من 12.5 ساعة إلى 4.2 ساعة بعد تحسين الخوارزمية.
129
+
130
+ قال أحد المراجعين:
131
+ "هذه من أفضل الأدوات مفتوحة المصدر لمعالجة النصوص العربية."
132
+
133
+ هل ستتم إضافة دعم للهجات العربية مستقبلاً؟
134
+ بالتأكيد. تشمل الخطة القادمة دعم اللهجات الخليجية والشامية والمصرية واليمنية.
135
+
136
+ شكراً لجميع المساهمين!
137
+ """
138
+
139
+ print_report(text)
140
+ ```
141
+
142
+ Output:
143
+
144
+ ```text
145
+ Arabic Text Analysis Report
146
+ ===================================
147
+ Sentences: 18
148
+ Words: 242
149
+ Arabic Words: 193
150
+ Unique Words: 190
151
+ Characters: 1342
152
+ Characters without spaces: 1115
153
+ Lines: 24
154
+ URLs: 6
155
+ Emails: 1
156
+ Decimal / Version Numbers: 10
157
+ Questions: 2
158
+ Exclamations: 2
159
+ Average Words per Sentence: 13.44
160
+ Max Words in a Sentence: 44
161
+ Min Words in a Sentence: 1
162
+ ```
163
+
164
+ This example demonstrates support for:
165
+
166
+ * Arabic abbreviations (`د.`, `أ.د.`, `م.`)
167
+ * URLs and domains
168
+ * Email addresses
169
+ * Decimal numbers
170
+ * Version numbers
171
+ * Questions and exclamations
172
+ * Arabic punctuation
173
+ * Text analytics
174
+
175
+ ---
176
+
177
+ ## Text Analytics
178
+
179
+ ```python
180
+ from arabic_sentencizer import analyze_text
181
+
182
+ stats = analyze_text(text)
183
+
184
+ print(stats["sentences"])
185
+ print(stats["words"])
186
+ print(stats["top_words"])
187
+ ```
188
+
189
+ ---
190
+
191
+ ## Human-Readable Report
192
+
193
+ ```python
194
+ from arabic_sentencizer import print_report
195
+
196
+ print_report(text)
197
+ ```
198
+
199
+ ---
200
+
201
+ ## API
202
+
203
+ ### Split Sentences
204
+
205
+ ```python
206
+ split_sentences(text: str) -> list[str]
207
+ ```
208
+
209
+ Returns a list of segmented Arabic sentences.
210
+
211
+ ### Analyze Text
212
+
213
+ ```python
214
+ analyze_text(text: str) -> dict
215
+ ```
216
+
217
+ Returns a dictionary containing text statistics and extracted information.
218
+
219
+ ### Print Report
220
+
221
+ ```python
222
+ print_report(text: str) -> None
223
+ ```
224
+
225
+ Prints a human-readable summary report.
226
+
227
+ ---
228
+
229
+ ## Supported Cases
230
+
231
+ ### Arabic Abbreviations
232
+
233
+ ```text
234
+ د. أحمد محمد
235
+ أ.د. محمد علي
236
+ م. خالد السالم
237
+ ```
238
+
239
+ ### URLs
240
+
241
+ ```text
242
+ https://www.sanaa.ai
243
+ https://docs.sanaa.ai/api/v1.5/index.html
244
+ bbc.com
245
+ github.com
246
+ wikipedia.org
247
+ ```
248
+
249
+ ### Emails
250
+
251
+ ```text
252
+ research@sanaa.ai
253
+ ```
254
+
255
+ ### Decimal Numbers
256
+
257
+ ```text
258
+ 1.5
259
+ 95.7
260
+ 88.4
261
+ 0.35
262
+ 12.5
263
+ 4.2
264
+ ١.٩
265
+ ```
266
+
267
+ ### Version Numbers
268
+
269
+ ```text
270
+ v2.1.5
271
+ v2.0.9
272
+ ```
273
+
274
+ ### Arabic Punctuation
275
+
276
+ ```text
277
+ ؟
278
+ !
279
+ ؛
280
+ ```
281
+
282
+ ---
283
+
284
+ ## Text Statistics
285
+
286
+ The `analyze_text()` function returns:
287
+
288
+ | Statistic | Description |
289
+ | -------------------------- | ---------------------------- |
290
+ | characters | Total characters |
291
+ | characters_no_spaces | Characters excluding all whitespace |
292
+ | lines | Number of non-empty lines |
293
+ | sentences | Number of detected sentences |
294
+ | words | Total words |
295
+ | arabic_words | Arabic words only |
296
+ | unique_words | Unique word count |
297
+ | urls | Number of URLs |
298
+ | emails | Number of emails |
299
+ | decimal_or_version_numbers | Decimal and version numbers |
300
+ | questions | Number of questions |
301
+ | exclamations | Number of exclamations |
302
+ | avg_words_per_sentence | Average sentence length |
303
+ | max_words_sentence | Longest sentence length |
304
+ | min_words_sentence | Shortest sentence length |
305
+ | long_sentences | Sentences with 30 or more words |
305
+ | short_sentences | Sentences with 3 or fewer words |
306
+ | top_words | 20 most frequent words (lowercased), with counts |
308
+ | sentences_list | All detected sentences |
309
+ | urls_list | Extracted URLs |
310
+ | emails_list | Extracted emails |
311
+ | numbers_list | Extracted numbers |
312
+
313
+ ---
314
+
315
+ ## Use Cases
316
+
317
+ * Arabic NLP preprocessing
318
+ * Corpus preparation
319
+ * Dataset annotation
320
+ * Information Retrieval
321
+ * Search engines
322
+ * Retrieval-Augmented Generation (RAG)
323
+ * LLM pipelines
324
+ * Arabic content analytics
325
+ * Text mining
326
+ * Digital humanities research
327
+ * Academic research projects
328
+
329
+ ---
330
+
331
+ ## Author
332
+
333
+ ### Dr. Faisal Alshargi
@@ -0,0 +1,312 @@
1
+ # Arabic Sentencizer
2
+
3
+ A lightweight Arabic sentence segmentation and text analytics toolkit designed for Arabic NLP pipelines, Information Retrieval systems, Large Language Models (LLMs), corpus processing, data annotation workflows, and Retrieval-Augmented Generation (RAG) applications.
4
+
5
+ Arabic Sentencizer addresses common challenges in Arabic sentence boundary detection while preserving important linguistic and technical patterns such as abbreviations, URLs, emails, decimal numbers, and version identifiers.
6
+
7
+ ---
8
+
9
+ ## Features
10
+
11
+ * Arabic sentence segmentation
12
+ * URL-aware sentence splitting
13
+ * Email-aware sentence splitting
14
+ * Decimal number preservation (`1.5`, `95.5`, `١.٩`)
15
+ * Version number preservation (`v2.1.5`)
16
+ * Arabic abbreviation handling (`د.`, `أ.د.`, `م.`)
17
+ * Arabic punctuation support (`؟`, `؛`, `!`)
18
+ * Text statistics and analytics
19
+ * NLP preprocessing support
20
+ * Corpus preparation and annotation workflows
21
+ * Information Retrieval preprocessing
22
+ * RAG chunking preparation
23
+ * Human-readable text analysis reports
24
+
25
+ ---
26
+
27
+ ## Installation
28
+
29
+ Install from PyPI:
30
+
31
+ ```bash
32
+ pip install arabic-sentencizer
33
+ ```
34
+
35
+ For local development:
36
+
37
+ ```bash
38
+ pip install -e .
39
+ ```
40
+
41
+ ---
42
+
43
+ ## Quick Start
44
+
45
+ ### Sentence Segmentation
46
+
47
+ ```python
48
+ from arabic_sentencizer import split_sentences
49
+
50
+ text = """
51
+ د. أحمد وصل إلى الاجتماع الساعة 10.30 صباحاً.
52
+ ثم قال إن نسبة النجاح بلغت 95.5%.
53
+ زار الموقع https://www.sanaa.ai ثم انتقل إلى bbc.com.
54
+ """
55
+
56
+ sentences = split_sentences(text)
57
+
58
+ for i, sentence in enumerate(sentences, start=1):
59
+ print(i, sentence)
60
+ ```
61
+
62
+ Output:
63
+
64
+ ```text
65
+ 1 د. أحمد وصل إلى الاجتماع الساعة 10.30 صباحاً.
66
+ 2 ثم قال إن نسبة النجاح بلغت 95.5%.
67
+ 3 زار الموقع https://www.sanaa.ai ثم انتقل إلى bbc.com.
68
+ ```
69
+
70
+ ---
71
+
72
+ ## Real Example
73
+
74
+ ```python
75
+ from arabic_sentencizer import split_sentences, analyze_text, print_report
76
+
77
+ text = """
78
+ د. أحمد محمد، أستاذ الذكاء الاصطناعي في جامعة القاهرة، أعلن اليوم عن إصدار جديد من منصة التحليل اللغوي. الإصدار الحالي هو v2.1.5 بينما كان الإصدار السابق v2.0.9 فقط.
79
+
80
+ قال أ.د. محمد علي: "لقد حققنا نتائج ممتازة في معالجة اللغة العربية." وأضاف أن دقة النظام وصلت إلى 95.7% مقارنة بـ 88.4% العام الماضي.
81
+
82
+ يمكن للباحثين زيارة الموقع الرسمي:
83
+ https://www.sanaa.ai
84
+
85
+ كما يمكنهم مراجعة الوثائق عبر:
86
+ https://docs.sanaa.ai/api/v1.5/index.html
87
+
88
+ للتواصل مع فريق التطوير يرجى إرسال رسالة إلى:
89
+ research@sanaa.ai
90
+
91
+ بلغ عدد الوثائق المعالجة أكثر من 1.5 مليون وثيقة. كما تم تحليل 125.456.789 كلمة عربية خلال مرحلة الاختبار.
92
+
93
+ هل يمكن استخدام النظام في تطبيقات البحث الدلالي؟
94
+ نعم! يدعم النظام البحث الدلالي والبحث التقليدي معاً.
95
+
96
+ ممتاز؛ لننتقل الآن إلى تقييم النتائج.
97
+
98
+ شارك في المشروع كل من:
99
+ د. فيصل الشرقي
100
+ أ.د. محمود الأحمد
101
+ م. خالد السالم
102
+
103
+ وقد تم نشر النتائج في عدة مؤتمرات منها AAAI 2025 و ACL 2024 و EMNLP 2023.
104
+
105
+ زار الفريق مواقع مثل bbc.com و wikipedia.org و github.com أثناء جمع البيانات.
106
+
107
+ وأشار التقرير النهائي إلى أن متوسط زمن الاستجابة بلغ 0.35 ثانية فقط. بينما انخفض زمن الفهرسة من 12.5 ساعة إلى 4.2 ساعة بعد تحسين الخوارزمية.
108
+
109
+ قال أحد المراجعين:
110
+ "هذه من أفضل الأدوات مفتوحة المصدر لمعالجة النصوص العربية."
111
+
112
+ هل ستتم إضافة دعم للهجات العربية مستقبلاً؟
113
+ بالتأكيد. تشمل الخطة القادمة دعم اللهجات الخليجية والشامية والمصرية واليمنية.
114
+
115
+ شكراً لجميع المساهمين!
116
+ """
117
+
118
+ print_report(text)
119
+ ```
120
+
121
+ Output:
122
+
123
+ ```text
124
+ Arabic Text Analysis Report
125
+ ===================================
126
+ Sentences: 18
127
+ Words: 242
128
+ Arabic Words: 193
129
+ Unique Words: 190
130
+ Characters: 1342
131
+ Characters without spaces: 1115
132
+ Lines: 24
133
+ URLs: 6
134
+ Emails: 1
135
+ Decimal / Version Numbers: 10
136
+ Questions: 2
137
+ Exclamations: 2
138
+ Average Words per Sentence: 13.44
139
+ Max Words in a Sentence: 44
140
+ Min Words in a Sentence: 1
141
+ ```
142
+
143
+ This example demonstrates support for:
144
+
145
+ * Arabic abbreviations (`د.`, `أ.د.`, `م.`)
146
+ * URLs and domains
147
+ * Email addresses
148
+ * Decimal numbers
149
+ * Version numbers
150
+ * Questions and exclamations
151
+ * Arabic punctuation
152
+ * Text analytics
153
+
154
+ ---
155
+
156
+ ## Text Analytics
157
+
158
+ ```python
159
+ from arabic_sentencizer import analyze_text
160
+
161
+ stats = analyze_text(text)
162
+
163
+ print(stats["sentences"])
164
+ print(stats["words"])
165
+ print(stats["top_words"])
166
+ ```
167
+
168
+ ---
169
+
170
+ ## Human-Readable Report
171
+
172
+ ```python
173
+ from arabic_sentencizer import print_report
174
+
175
+ print_report(text)
176
+ ```
177
+
178
+ ---
179
+
180
+ ## API
181
+
182
+ ### Split Sentences
183
+
184
+ ```python
185
+ split_sentences(text: str) -> list[str]
186
+ ```
187
+
188
+ Returns a list of segmented Arabic sentences.
189
+
190
+ ### Analyze Text
191
+
192
+ ```python
193
+ analyze_text(text: str) -> dict
194
+ ```
195
+
196
+ Returns a dictionary containing text statistics and extracted information.
197
+
198
+ ### Print Report
199
+
200
+ ```python
201
+ print_report(text: str) -> None
202
+ ```
203
+
204
+ Prints a human-readable summary report.
205
+
206
+ ---
207
+
208
+ ## Supported Cases
209
+
210
+ ### Arabic Abbreviations
211
+
212
+ ```text
213
+ د. أحمد محمد
214
+ أ.د. محمد علي
215
+ م. خالد السالم
216
+ ```
217
+
218
+ ### URLs
219
+
220
+ ```text
221
+ https://www.sanaa.ai
222
+ https://docs.sanaa.ai/api/v1.5/index.html
223
+ bbc.com
224
+ github.com
225
+ wikipedia.org
226
+ ```
227
+
228
+ ### Emails
229
+
230
+ ```text
231
+ research@sanaa.ai
232
+ ```
233
+
234
+ ### Decimal Numbers
235
+
236
+ ```text
237
+ 1.5
238
+ 95.7
239
+ 88.4
240
+ 0.35
241
+ 12.5
242
+ 4.2
243
+ ١.٩
244
+ ```
245
+
246
+ ### Version Numbers
247
+
248
+ ```text
249
+ v2.1.5
250
+ v2.0.9
251
+ ```
252
+
253
+ ### Arabic Punctuation
254
+
255
+ ```text
256
+ ؟
257
+ !
258
+ ؛
259
+ ```
260
+
261
+ ---
262
+
263
+ ## Text Statistics
264
+
265
+ The `analyze_text()` function returns:
266
+
267
+ | Statistic | Description |
268
+ | -------------------------- | ---------------------------- |
269
+ | characters | Total characters |
270
+ | characters_no_spaces | Characters excluding all whitespace |
271
+ | lines | Number of non-empty lines |
272
+ | sentences | Number of detected sentences |
273
+ | words | Total words |
274
+ | arabic_words | Arabic words only |
275
+ | unique_words | Unique word count |
276
+ | urls | Number of URLs |
277
+ | emails | Number of emails |
278
+ | decimal_or_version_numbers | Decimal and version numbers |
279
+ | questions | Number of questions |
280
+ | exclamations | Number of exclamations |
281
+ | avg_words_per_sentence | Average sentence length |
282
+ | max_words_sentence | Longest sentence length |
283
+ | min_words_sentence | Shortest sentence length |
284
+ | long_sentences | Sentences with 30 or more words |
284
+ | short_sentences | Sentences with 3 or fewer words |
285
+ | top_words | 20 most frequent words (lowercased), with counts |
287
+ | sentences_list | All detected sentences |
288
+ | urls_list | Extracted URLs |
289
+ | emails_list | Extracted emails |
290
+ | numbers_list | Extracted numbers |
291
+
292
+ ---
293
+
294
+ ## Use Cases
295
+
296
+ * Arabic NLP preprocessing
297
+ * Corpus preparation
298
+ * Dataset annotation
299
+ * Information Retrieval
300
+ * Search engines
301
+ * Retrieval-Augmented Generation (RAG)
302
+ * LLM pipelines
303
+ * Arabic content analytics
304
+ * Text mining
305
+ * Digital humanities research
306
+ * Academic research projects
307
+
308
+ ---
309
+
310
+ ## Author
311
+
312
+ ### Dr. Faisal Alshargi
@@ -0,0 +1,5 @@
1
+ from .splitter import split_sentences, analyze_text, print_report, ArabicSentencizer
2
+
3
+ __all__ = ["split_sentences", "analyze_text", "print_report", "ArabicSentencizer"]
4
+
5
+ __version__ = "0.1.1"
@@ -0,0 +1,244 @@
1
+ """
2
+ Arabic Sentencizer
3
+ ------------------
4
+ A lightweight Arabic sentence splitter and text statistics tool.
5
+
6
+ It handles:
7
+ - Arabic punctuation
8
+ - URLs
9
+ - emails
10
+ - decimal numbers
11
+ - version numbers
12
+ - Arabic abbreviations like د. and أ.د.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import re
17
+ import statistics
18
+ from collections import Counter
19
+ from typing import Dict, List, Tuple, Any
20
+
21
+ import nltk
22
+ from nltk.tokenize.punkt import PunktLanguageVars, PunktSentenceTokenizer, PunktTrainer
23
+
24
+
25
# Extra sentence-ending marks appended to Punkt's default terminators.
CUSTOM_ARABIC_PUNCTUATION = ["!", "؛", "؟"]

# Tail of a URL: any run of non-space characters whose LAST character is not
# sentence punctuation, so "https://example.com." leaves the final period
# outside the match and the sentence boundary stays visible.
_URL_TAIL = r"\S*[^\s.,;:!?،؛؟]"

# Single-letter abbreviations (د. / أ. / م.) must not be preceded by a word
# character or an Arabic diacritic; otherwise the tail of an ordinary word
# that ends a sentence (e.g. "أحمد." or "الكلام.") would be treated as an
# abbreviation and the sentence boundary would be hidden.
_NOT_AFTER_WORD = r"(?<![\w\u064B-\u065F\u0670])"

# Spans whose internal dots must not be seen as sentence boundaries.
# Order matters: patterns are applied one after another, so longer/more
# specific forms (URLs, "أ.د.") come before shorter ones.
PROTECTED_PATTERNS = [
    r"https?://" + _URL_TAIL,
    r"www\." + _URL_TAIL,
    r"\b[\w.-]+@[\w.-]+\.\w+\b",
    r"\b[\w.-]+\.(?:com|org|net|edu|gov|io|ai|co|uk)\b",
    r"\b[vV]?\d+(?:\.\d+)+\b",
    r"[٠-٩]+(?:\.[٠-٩]+)+",
    r"أ\.د\.",
    _NOT_AFTER_WORD + r"د\.",
    _NOT_AFTER_WORD + r"أ\.",
    _NOT_AFTER_WORD + r"م\.",
]

# Any character in the Arabic Unicode block.
ARABIC_LETTERS_RE = re.compile(r"[\u0600-\u06FF]")
# A "word": a run of word characters and/or Arabic-block characters.
WORD_RE = re.compile(r"[\w\u0600-\u06FF]+", re.UNICODE)

# URLs with a scheme, "www." hosts, and bare domains with a known suffix.
URL_RE = re.compile(
    r"https?://" + _URL_TAIL
    + r"|www\." + _URL_TAIL
    + r"|\b[\w.-]+\.(?:com|org|net|edu|gov|io|ai|co|uk)\b",
    re.IGNORECASE,
)

EMAIL_RE = re.compile(r"\b[\w.-]+@[\w.-]+\.\w+\b")
# Dotted numbers with an optional leading "v", or dotted Arabic-Indic digits.
DECIMAL_RE = re.compile(r"\b[vV]?\d+(?:\.\d+)+\b|[٠-٩]+(?:\.[٠-٩]+)+")
50
+
51
+
52
class CustomArabicLanguageVars(PunktLanguageVars):
    """Punkt language variables with additional sentence-ending characters."""

    # Base terminators first, then the marks from CUSTOM_ARABIC_PUNCTUATION.
    sent_end_chars = (
        *PunktLanguageVars.sent_end_chars,
        *CUSTOM_ARABIC_PUNCTUATION,
    )
54
+
55
+
56
class LinkAwareTrainer(PunktTrainer):
    """Punkt trainer that classifies tokens with an inner dot as ``ABBREV``."""

    # NOTE(review): it is not visible from this file whether NLTK's
    # PunktTrainer ever calls ``get_type`` or defines it on the base class —
    # confirm this hook is actually invoked during training.
    def get_type(self, tok):
        """Return ``self.ABBREV`` for tokens like ``a.b``; otherwise defer to the base class."""
        has_inner_dot = "." in tok and not tok.endswith(".")
        if not has_inner_dot:
            return super().get_type(tok)
        return self.ABBREV
61
+
62
+
63
class ArabicSentencizer:
    """Split Arabic text into sentences and compute simple text statistics.

    Spans matching ``PROTECTED_PATTERNS`` (URLs, emails, dotted numbers,
    abbreviations) are replaced by placeholders before tokenization so their
    dots are not seen as sentence boundaries, then restored afterwards.
    """

    def __init__(self, train_on_reuters: bool = True):
        """Build the underlying Punkt tokenizer.

        Args:
            train_on_reuters: When true, try to train on the NLTK Reuters
                corpus (downloading it if it is not found locally).
        """
        self.tokenizer = self._build_tokenizer(train_on_reuters)

    def _build_tokenizer(self, train_on_reuters: bool = True):
        """Train a Punkt model and return a tokenizer that uses it.

        Falls back to a two-sentence built-in sample when the Reuters corpus
        is disabled, unavailable, or fails to load.
        """
        trainer = LinkAwareTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True

        training_text = ""

        if train_on_reuters:
            # Best effort: any failure (no network, missing corpus, ...) must
            # not prevent the sentencizer from being constructed.
            try:
                try:
                    nltk.data.find("corpora/reuters")
                except LookupError:
                    nltk.download("reuters", quiet=True)

                training_text = " ".join(
                    " ".join(sent) for sent in nltk.corpus.reuters.sents()
                )
            except Exception:
                training_text = ""

        if not training_text:
            training_text = "This is a sentence. This is another sentence."

        trainer.train(training_text)

        return PunktSentenceTokenizer(
            trainer.get_params(),
            lang_vars=CustomArabicLanguageVars(),
        )

    def _protect_text(self, text: str) -> Tuple[str, Dict[str, str]]:
        """Replace protected spans in *text* with placeholder keys.

        Returns:
            The rewritten text and a mapping from placeholder key to the
            original matched span, in the order the spans were replaced.
        """
        protected: Dict[str, str] = {}

        def repl(match):
            key = f"__PROTECTED_{len(protected)}__"
            protected[key] = match.group(0)
            return key

        for pattern in PROTECTED_PATTERNS:
            text = re.sub(pattern, repl, text, flags=re.IGNORECASE)

        return text, protected

    @staticmethod
    def _restore_text(text: str, protected: Dict[str, str]) -> str:
        """Substitute every placeholder in *text* with its original span.

        Placeholders are restored newest-first: a span captured by a later
        pattern may itself contain an earlier placeholder, which only becomes
        visible (and restorable) once the later one has been expanded.
        """
        for key, value in reversed(list(protected.items())):
            text = text.replace(key, value)
        return text

    @staticmethod
    def _post_process(sentences: List[str]) -> List[str]:
        """Strip sentences, re-join unwanted splits, and drop empty entries.

        A sentence ending in the Arabic semicolon ``؛`` is joined with the
        sentence(s) that follow it, repeatedly, so a chain of semicolon
        clauses ends up as a single sentence.
        """
        merged: List[str] = []
        total = len(sentences)
        i = 0

        while i < total:
            current = sentences[i].strip()

            while i + 1 < total:
                following = sentences[i + 1].strip()
                after_semicolon = current.endswith("؛")
                # NOTE(review): hard-coded phrase pair kept from the original
                # implementation; its motivating input is not visible here.
                special_pair = current.endswith("الجديد.") and following.startswith(
                    "دقيق"
                )
                if not (after_semicolon or special_pair):
                    break
                current = current + " " + following
                i += 1

            merged.append(current)
            i += 1

        return [s for s in merged if s]

    def split_sentences(self, text: str) -> List[str]:
        """Return the sentences of *text*; empty or blank input gives ``[]``."""
        if not text or not text.strip():
            return []

        protected_text, protected = self._protect_text(text)
        sentences = self.tokenizer.tokenize(protected_text)
        sentences = [self._restore_text(s, protected).strip() for s in sentences]

        return self._post_process(sentences)

    def analyze_text(self, text: str) -> Dict[str, Any]:
        """Compute counts and extracted items for *text*.

        Returns:
            A dictionary with character/line/sentence/word counts, URL, email
            and dotted-number matches, question/exclamation counts,
            per-sentence word statistics, long (>= 30 words) and short
            (<= 3 words) sentences, and the 20 most common lowercased words.
        """
        sentences = self.split_sentences(text)

        words = WORD_RE.findall(text)
        arabic_words = [w for w in words if ARABIC_LETTERS_RE.search(w)]
        # Computed once and reused for the average/min/max and the
        # long/short sentence lists below.
        sentence_word_counts = [len(WORD_RE.findall(s)) for s in sentences]

        # finditer + group(0) always yields the full match as a string,
        # regardless of whether the pattern contains capture groups.
        urls = [m.group(0) for m in URL_RE.finditer(text)]
        emails = EMAIL_RE.findall(text)
        decimals = DECIMAL_RE.findall(text)

        questions = [s for s in sentences if s.strip().endswith("؟")]
        exclamations = [s for s in sentences if s.strip().endswith("!")]

        if sentence_word_counts:
            avg_words = round(statistics.mean(sentence_word_counts), 2)
            max_words = max(sentence_word_counts)
            min_words = min(sentence_word_counts)
        else:
            avg_words = max_words = min_words = 0

        counted = list(zip(sentences, sentence_word_counts))

        return {
            "characters": len(text),
            "characters_no_spaces": len(re.sub(r"\s+", "", text)),
            "lines": len([line for line in text.splitlines() if line.strip()]),
            "sentences": len(sentences),
            "words": len(words),
            "arabic_words": len(arabic_words),
            "unique_words": len(set(words)),
            "urls": len(urls),
            "emails": len(emails),
            "decimal_or_version_numbers": len(decimals),
            "questions": len(questions),
            "exclamations": len(exclamations),
            "avg_words_per_sentence": avg_words,
            "max_words_sentence": max_words,
            "min_words_sentence": min_words,
            "long_sentences": [s for s, n in counted if n >= 30],
            "short_sentences": [s for s, n in counted if n <= 3],
            "top_words": Counter([w.lower() for w in words]).most_common(20),
            "sentences_list": sentences,
            "urls_list": urls,
            "emails_list": emails,
            "numbers_list": decimals,
        }

    def print_report(self, text: str) -> None:
        """Print a fixed-format summary of ``analyze_text(text)`` to stdout."""
        stats = self.analyze_text(text)

        # (label, stats key) pairs in the order they are printed.
        rows = (
            ("Sentences", "sentences"),
            ("Words", "words"),
            ("Arabic Words", "arabic_words"),
            ("Unique Words", "unique_words"),
            ("Characters", "characters"),
            ("Characters without spaces", "characters_no_spaces"),
            ("Lines", "lines"),
            ("URLs", "urls"),
            ("Emails", "emails"),
            ("Decimal / Version Numbers", "decimal_or_version_numbers"),
            ("Questions", "questions"),
            ("Exclamations", "exclamations"),
            ("Average Words per Sentence", "avg_words_per_sentence"),
            ("Max Words in a Sentence", "max_words_sentence"),
            ("Min Words in a Sentence", "min_words_sentence"),
        )

        print("Arabic Text Analysis Report")
        print("=" * 35)
        for label, key in rows:
            print(f"{label}: {stats[key]}")
222
+
223
# Shared instance used by the module-level helper functions; built on demand.
_DEFAULT_SENTENCIZER = None


def _get_default_sentencizer() -> ArabicSentencizer:
    """Return the shared ``ArabicSentencizer``, constructing it on first call."""
    global _DEFAULT_SENTENCIZER

    if _DEFAULT_SENTENCIZER is not None:
        return _DEFAULT_SENTENCIZER

    _DEFAULT_SENTENCIZER = ArabicSentencizer()
    return _DEFAULT_SENTENCIZER
233
+
234
+
235
def split_sentences(text: str) -> List[str]:
    """Split *text* into sentences using the shared default sentencizer."""
    sentencizer = _get_default_sentencizer()
    return sentencizer.split_sentences(text)
237
+
238
+
239
def analyze_text(text: str) -> Dict[str, Any]:
    """Return the statistics dictionary for *text* from the shared default sentencizer."""
    sentencizer = _get_default_sentencizer()
    return sentencizer.analyze_text(text)
241
+
242
+
243
def print_report(text: str) -> None:
    """Print the analysis report for *text* using the shared default sentencizer."""
    sentencizer = _get_default_sentencizer()
    return sentencizer.print_report(text)
@@ -0,0 +1,333 @@
1
+ Metadata-Version: 2.4
2
+ Name: arabic-sentencizer
3
+ Version: 0.1.1
4
+ Summary: Arabic sentence splitter and text statistics tool for NLP preprocessing.
5
+ Author: Faisal Alshargi
6
+ License: MIT
7
+ Project-URL: Homepage, https://www.sanaa.ai
8
+ Project-URL: Repository, https://github.com/alshargi/arabic-sentencizer
9
+ Keywords: arabic,nlp,sentence-splitting,tokenization,sentencizer
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3 :: Only
12
+ Classifier: Natural Language :: Arabic
13
+ Classifier: Topic :: Text Processing :: Linguistic
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Classifier: Operating System :: OS Independent
16
+ Requires-Python: >=3.8
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: nltk>=3.8
20
+ Dynamic: license-file
21
+
22
+ # Arabic Sentencizer
23
+
24
+ A lightweight Arabic sentence segmentation and text analytics toolkit designed for Arabic NLP pipelines, Information Retrieval systems, Large Language Models (LLMs), corpus processing, data annotation workflows, and Retrieval-Augmented Generation (RAG) applications.
25
+
26
+ Arabic Sentencizer addresses common challenges in Arabic sentence boundary detection while preserving important linguistic and technical patterns such as abbreviations, URLs, emails, decimal numbers, and version identifiers.
27
+
28
+ ---
29
+
30
+ ## Features
31
+
32
+ * Arabic sentence segmentation
33
+ * URL-aware sentence splitting
34
+ * Email-aware sentence splitting
35
+ * Decimal number preservation (`1.5`, `95.5`, `١.٩`)
36
+ * Version number preservation (`v2.1.5`)
37
+ * Arabic abbreviation handling (`د.`, `أ.د.`, `م.`)
38
+ * Arabic punctuation support (`؟`, `؛`, `!`)
39
+ * Text statistics and analytics
40
+ * NLP preprocessing support
41
+ * Corpus preparation and annotation workflows
42
+ * Information Retrieval preprocessing
43
+ * RAG chunking preparation
44
+ * Human-readable text analysis reports
45
+
46
+ ---
47
+
48
+ ## Installation
49
+
50
+ Install from PyPI:
51
+
52
+ ```bash
53
+ pip install arabic-sentencizer
54
+ ```
55
+
56
+ For local development:
57
+
58
+ ```bash
59
+ pip install -e .
60
+ ```
61
+
62
+ ---
63
+
64
+ ## Quick Start
65
+
66
+ ### Sentence Segmentation
67
+
68
+ ```python
69
+ from arabic_sentencizer import split_sentences
70
+
71
+ text = """
72
+ د. أحمد وصل إلى الاجتماع الساعة 10.30 صباحاً.
73
+ ثم قال إن نسبة النجاح بلغت 95.5%.
74
+ زار الموقع https://www.sanaa.ai ثم انتقل إلى bbc.com.
75
+ """
76
+
77
+ sentences = split_sentences(text)
78
+
79
+ for i, sentence in enumerate(sentences, start=1):
80
+ print(i, sentence)
81
+ ```
82
+
83
+ Output:
84
+
85
+ ```text
86
+ 1 د. أحمد وصل إلى الاجتماع الساعة 10.30 صباحاً.
87
+ 2 ثم قال إن نسبة النجاح بلغت 95.5%.
88
+ 3 زار الموقع https://www.sanaa.ai ثم انتقل إلى bbc.com.
89
+ ```
90
+
91
+ ---
92
+
93
+ ## Real Example
94
+
95
+ ```python
96
+ from arabic_sentencizer import split_sentences, analyze_text, print_report
97
+
98
+ text = """
99
+ د. أحمد محمد، أستاذ الذكاء الاصطناعي في جامعة القاهرة، أعلن اليوم عن إصدار جديد من منصة التحليل اللغوي. الإصدار الحالي هو v2.1.5 بينما كان الإصدار السابق v2.0.9 فقط.
100
+
101
+ قال أ.د. محمد علي: "لقد حققنا نتائج ممتازة في معالجة اللغة العربية." وأضاف أن دقة النظام وصلت إلى 95.7% مقارنة بـ 88.4% العام الماضي.
102
+
103
+ يمكن للباحثين زيارة الموقع الرسمي:
104
+ https://www.sanaa.ai
105
+
106
+ كما يمكنهم مراجعة الوثائق عبر:
107
+ https://docs.sanaa.ai/api/v1.5/index.html
108
+
109
+ للتواصل مع فريق التطوير يرجى إرسال رسالة إلى:
110
+ research@sanaa.ai
111
+
112
+ بلغ عدد الوثائق المعالجة أكثر من 1.5 مليون وثيقة. كما تم تحليل 125.456.789 كلمة عربية خلال مرحلة الاختبار.
113
+
114
+ هل يمكن استخدام النظام في تطبيقات البحث الدلالي؟
115
+ نعم! يدعم النظام البحث الدلالي والبحث التقليدي معاً.
116
+
117
+ ممتاز؛ لننتقل الآن إلى تقييم النتائج.
118
+
119
+ شارك في المشروع كل من:
120
+ د. فيصل الشرقي
121
+ أ.د. محمود الأحمد
122
+ م. خالد السالم
123
+
124
+ وقد تم نشر النتائج في عدة مؤتمرات منها AAAI 2025 و ACL 2024 و EMNLP 2023.
125
+
126
+ زار الفريق مواقع مثل bbc.com و wikipedia.org و github.com أثناء جمع البيانات.
127
+
128
+ وأشار التقرير النهائي إلى أن متوسط زمن الاستجابة بلغ 0.35 ثانية فقط. بينما انخفض زمن الفهرسة من 12.5 ساعة إلى 4.2 ساعة بعد تحسين الخوارزمية.
129
+
130
+ قال أحد المراجعين:
131
+ "هذه من أفضل الأدوات مفتوحة المصدر لمعالجة النصوص العربية."
132
+
133
+ هل ستتم إضافة دعم للهجات العربية مستقبلاً؟
134
+ بالتأكيد. تشمل الخطة القادمة دعم اللهجات الخليجية والشامية والمصرية واليمنية.
135
+
136
+ شكراً لجميع المساهمين!
137
+ """
138
+
139
+ print_report(text)
140
+ ```
141
+
142
+ Output:
143
+
144
+ ```text
145
+ Arabic Text Analysis Report
146
+ ===================================
147
+ Sentences: 18
148
+ Words: 242
149
+ Arabic Words: 193
150
+ Unique Words: 190
151
+ Characters: 1342
152
+ Characters without spaces: 1115
153
+ Lines: 24
154
+ URLs: 6
155
+ Emails: 1
156
+ Decimal / Version Numbers: 10
157
+ Questions: 2
158
+ Exclamations: 2
159
+ Average Words per Sentence: 13.44
160
+ Max Words in a Sentence: 44
161
+ Min Words in a Sentence: 1
162
+ ```
163
+
164
+ This example demonstrates support for:
165
+
166
+ * Arabic abbreviations (`د.`, `أ.د.`, `م.`)
167
+ * URLs and domains
168
+ * Email addresses
169
+ * Decimal numbers
170
+ * Version numbers
171
+ * Questions and exclamations
172
+ * Arabic punctuation
173
+ * Text analytics
174
+
175
+ ---
176
+
177
+ ## Text Analytics
178
+
179
+ ```python
180
+ from arabic_sentencizer import analyze_text
181
+
182
+ stats = analyze_text(text)
183
+
184
+ print(stats["sentences"])
185
+ print(stats["words"])
186
+ print(stats["top_words"])
187
+ ```
188
+
189
+ ---
190
+
191
+ ## Human-Readable Report
192
+
193
+ ```python
194
+ from arabic_sentencizer import print_report
195
+
196
+ print_report(text)
197
+ ```
198
+
199
+ ---
200
+
201
+ ## API
202
+
203
+ ### Split Sentences
204
+
205
+ ```python
206
+ split_sentences(text: str) -> list[str]
207
+ ```
208
+
209
+ Returns a list of segmented Arabic sentences.
210
+
211
+ ### Analyze Text
212
+
213
+ ```python
214
+ analyze_text(text: str) -> dict
215
+ ```
216
+
217
+ Returns a dictionary containing text statistics and extracted information.
218
+
219
+ ### Print Report
220
+
221
+ ```python
222
+ print_report(text: str) -> None
223
+ ```
224
+
225
+ Prints a human-readable summary report.
226
+
227
+ ---
228
+
229
+ ## Supported Cases
230
+
231
+ ### Arabic Abbreviations
232
+
233
+ ```text
234
+ د. أحمد محمد
235
+ أ.د. محمد علي
236
+ م. خالد السالم
237
+ ```
238
+
239
+ ### URLs
240
+
241
+ ```text
242
+ https://www.sanaa.ai
243
+ https://docs.sanaa.ai/api/v1.5/index.html
244
+ bbc.com
245
+ github.com
246
+ wikipedia.org
247
+ ```
248
+
249
+ ### Emails
250
+
251
+ ```text
252
+ research@sanaa.ai
253
+ ```
254
+
255
+ ### Decimal Numbers
256
+
257
+ ```text
258
+ 1.5
259
+ 95.7
260
+ 88.4
261
+ 0.35
262
+ 12.5
263
+ 4.2
264
+ ١.٩
265
+ ```
266
+
267
+ ### Version Numbers
268
+
269
+ ```text
270
+ v2.1.5
271
+ v2.0.9
272
+ ```
273
+
274
+ ### Arabic Punctuation
275
+
276
+ ```text
277
+ ؟
278
+ !
279
+ ؛
280
+ ```
281
+
282
+ ---
283
+
284
+ ## Text Statistics
285
+
286
+ The `analyze_text()` function returns:
287
+
288
+ | Statistic | Description |
289
+ | -------------------------- | ---------------------------- |
290
+ | characters | Total characters |
291
+ | characters_no_spaces | Characters excluding all whitespace |
292
+ | lines | Number of non-empty lines |
293
+ | sentences | Number of detected sentences |
294
+ | words | Total words |
295
+ | arabic_words | Arabic words only |
296
+ | unique_words | Unique word count |
297
+ | urls | Number of URLs |
298
+ | emails | Number of emails |
299
+ | decimal_or_version_numbers | Decimal and version numbers |
300
+ | questions | Number of questions |
301
+ | exclamations | Number of exclamations |
302
+ | avg_words_per_sentence | Average sentence length |
303
+ | max_words_sentence | Longest sentence length |
304
+ | min_words_sentence | Shortest sentence length |
305
+ | long_sentences | Sentences with 30 or more words |
305
+ | short_sentences | Sentences with 3 or fewer words |
306
+ | top_words | 20 most frequent words (lowercased), with counts |
308
+ | sentences_list | All detected sentences |
309
+ | urls_list | Extracted URLs |
310
+ | emails_list | Extracted emails |
311
+ | numbers_list | Extracted numbers |
312
+
313
+ ---
314
+
315
+ ## Use Cases
316
+
317
+ * Arabic NLP preprocessing
318
+ * Corpus preparation
319
+ * Dataset annotation
320
+ * Information Retrieval
321
+ * Search engines
322
+ * Retrieval-Augmented Generation (RAG)
323
+ * LLM pipelines
324
+ * Arabic content analytics
325
+ * Text mining
326
+ * Digital humanities research
327
+ * Academic research projects
328
+
329
+ ---
330
+
331
+ ## Author
332
+
333
+ ### Dr. Faisal Alshargi
@@ -0,0 +1,11 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ arabic_sentencizer/__init__.py
5
+ arabic_sentencizer/splitter.py
6
+ arabic_sentencizer.egg-info/PKG-INFO
7
+ arabic_sentencizer.egg-info/SOURCES.txt
8
+ arabic_sentencizer.egg-info/dependency_links.txt
9
+ arabic_sentencizer.egg-info/requires.txt
10
+ arabic_sentencizer.egg-info/top_level.txt
11
+ tests/test_splitter.py
@@ -0,0 +1 @@
1
+ arabic_sentencizer
@@ -0,0 +1,30 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "arabic-sentencizer"
7
+ version = "0.1.1"
8
+ description = "Arabic sentence splitter and text statistics tool for NLP preprocessing."
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ authors = [
12
+ { name = "Faisal Alshargi" }
13
+ ]
14
+ license = { text = "MIT" }
15
+ keywords = ["arabic", "nlp", "sentence-splitting", "tokenization", "sentencizer"]
16
+ dependencies = [
17
+ "nltk>=3.8"
18
+ ]
19
+ classifiers = [
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3 :: Only",
22
+ "Natural Language :: Arabic",
23
+ "Topic :: Text Processing :: Linguistic",
24
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
25
+ "Operating System :: OS Independent"
26
+ ]
27
+
28
+ [project.urls]
29
+ Homepage = "https://www.sanaa.ai"
30
+ Repository = "https://github.com/alshargi/arabic-sentencizer"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,25 @@
1
+ from arabic_sentencizer import split_sentences, analyze_text
2
+
3
+
4
def test_abbreviations_and_numbers():
    """Check that abbreviated titles stay attached to the names after them.

    The sample contains the titles "د." and "أ.د." plus the dotted number
    10.30; the first sentence must still begin with the title and its name,
    and the second title must appear together with its name in one sentence.
    """
    sample = "د. أحمد وصل الساعة 10.30 صباحاً. الاسم المختصر هو أ.د. محمد علي."
    result = split_sentences(sample)

    opening = result[0]
    assert opening.startswith("د. أحمد")

    # At least one sentence has to carry the full "title + name" phrase.
    carriers = [piece for piece in result if "أ.د. محمد علي" in piece]
    assert carriers
9
+
10
+
11
def test_urls_and_emails():
    """Check that dots inside URLs, bare domains and emails do not split text.

    The sample must yield exactly two sentences: the first holds both web
    addresses, the second holds the email address.
    """
    sample = "زار الموقع https://www.sanaa.ai ثم انتقل إلى bbc.com. البريد هو test@example.com."
    result = split_sentences(sample)

    assert len(result) == 2
    first, second = result

    # Both web addresses belong to the first sentence.
    for address in ("https://www.sanaa.ai", "bbc.com"):
        assert address in first

    assert "test@example.com" in second
18
+
19
+
20
def test_analysis():
    """Check the sentence, question and exclamation counts of a short sample.

    The sample has one Arabic question mark and one exclamation mark, so both
    counters must equal 1; at least two sentences must be detected.
    """
    report = analyze_text("هل انتهى الاختبار؟ نعم! ممتاز؛ لننتقل للمرحلة التالية.")

    assert 2 <= report["sentences"]

    # Each of these counters is expected to be exactly one for this sample.
    for key in ("questions", "exclamations"):
        assert report[key] == 1