persian-readability 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- persian_readability-0.1.2/LICENSE +21 -0
- persian_readability-0.1.2/PKG-INFO +313 -0
- persian_readability-0.1.2/README.md +279 -0
- persian_readability-0.1.2/persian_readability/__init__.py +63 -0
- persian_readability-0.1.2/persian_readability/core.py +829 -0
- persian_readability-0.1.2/persian_readability.egg-info/PKG-INFO +313 -0
- persian_readability-0.1.2/persian_readability.egg-info/SOURCES.txt +12 -0
- persian_readability-0.1.2/persian_readability.egg-info/dependency_links.txt +1 -0
- persian_readability-0.1.2/persian_readability.egg-info/entry_points.txt +2 -0
- persian_readability-0.1.2/persian_readability.egg-info/requires.txt +9 -0
- persian_readability-0.1.2/persian_readability.egg-info/top_level.txt +1 -0
- persian_readability-0.1.2/pyproject.toml +66 -0
- persian_readability-0.1.2/setup.cfg +4 -0
- persian_readability-0.1.2/tests/test_core.py +501 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Dr. Mohammad Pirouzan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: persian-readability
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: A lightweight Python tool for Persian/Farsi readability analysis using the Flesch-Dayani formula.
|
|
5
|
+
Author-email: Mohammad Pirouzan <mohammadpirouzan@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Drpirouzan/Persian-Readability
|
|
8
|
+
Project-URL: Repository, https://github.com/Drpirouzan/Persian-Readability
|
|
9
|
+
Project-URL: Issues, https://github.com/Drpirouzan/Persian-Readability/issues
|
|
10
|
+
Keywords: persian,farsi,readability,nlp,flesch,flesch-dayani,text-analysis,persian-language,education,accessibility
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Education
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Natural Language :: Persian
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Text Processing
|
|
22
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
23
|
+
Requires-Python: >=3.10
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: hazm
|
|
27
|
+
Provides-Extra: pos
|
|
28
|
+
Requires-Dist: parsivar; extra == "pos"
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest; extra == "dev"
|
|
31
|
+
Requires-Dist: build; extra == "dev"
|
|
32
|
+
Requires-Dist: twine; extra == "dev"
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
# Persian Readability (Flesch–Dayani)
|
|
36
|
+
|
|
37
|
+
A lightweight Python package and command-line tool to calculate the **Flesch–Dayani readability score** for Persian (Farsi) text — with an optional POS-enhanced syllable counter for higher accuracy.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Features
|
|
42
|
+
|
|
43
|
+
- Persian text normalization and tokenization via `hazm`
|
|
44
|
+
- **Punctuation-aware tokenization** — punctuation marks are excluded from the word and syllable counts
|
|
45
|
+
- **Two-tier syllable counting:**
|
|
46
|
+
- **POS-enhanced** (Better Accuracy) — if `parsivar` is installed, uses part-of-speech tags to correctly count syllables in verbs with attached prefixes (`میرود`، `نمیدانم`) and comparative adjectives (`بهتر`، `بزرگترین`)
|
|
47
|
+
- **Morphological heuristic** (Good Accuracy) — used automatically if `parsivar` is not installed
|
|
48
|
+
- **Context-aware خواه classifier** — three-layer disambiguation prevents confusing `خواهش`, `خواهر`, `آزادیخواه`, and `خواه ... خواه ...` with the future auxiliary (`خواهم رفت`)
|
|
49
|
+
- Computes:
|
|
50
|
+
- Number of sentences, words, letters, and syllables
|
|
51
|
+
- **ASL** — Average Sentence Length (words per sentence)
|
|
52
|
+
- **WL** — Average Word Length (letters per word)
|
|
53
|
+
- **ASYL** — Average Syllables per Word *(used in the original Dayani formula)*
|
|
54
|
+
- Flesch–Dayani readability score
|
|
55
|
+
- **Human-readable level** (e.g. *متوسط — مناسب دانشآموزان دبیرستان*)
|
|
56
|
+
- Accepts input from a file, a command-line argument, or **stdin** (pipe-friendly)
|
|
57
|
+
- `--plain` flag for scripting and pipeline use
|
|
58
|
+
- `--verbose` flag for debug logging
|
|
59
|
+
- Warns when text is too short for a reliable score (< 50 words)
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Readability Levels
|
|
64
|
+
|
|
65
|
+
| Score | Level |
|
|
66
|
+
|-------|-------|
|
|
67
|
+
| ≥ 90 | بسیار آسان — مناسب کودکان دبستانی |
|
|
68
|
+
| ≥ 80 | آسان — مناسب نوجوانان |
|
|
69
|
+
| ≥ 70 | نسبتاً آسان — مناسب عموم مردم |
|
|
70
|
+
| ≥ 60 | متوسط — مناسب دانشآموزان دبیرستان |
|
|
71
|
+
| ≥ 50 | نسبتاً دشوار — مناسب دانشجویان |
|
|
72
|
+
| ≥ 30 | دشوار — مناسب متخصصان |
|
|
73
|
+
| < 30 | بسیار دشوار — متون علمی/تخصصی |
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Installation
|
|
78
|
+
|
|
79
|
+
Install from PyPI after release:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
pip install persian-readability
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
For local development:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
pip install -e ".[dev]"
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
For optional POS-enhanced syllable counting:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
pip install "persian-readability[pos]"
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Requirements
|
|
100
|
+
|
|
101
|
+
### Required
|
|
102
|
+
|
|
103
|
+
- Python **3.10** or newer
|
|
104
|
+
- [`hazm`](https://github.com/roshan-research/hazm) — Persian NLP library
|
|
105
|
+
|
|
106
|
+
```
|
|
107
|
+
pip install hazm
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Optional (for higher syllable accuracy)
|
|
111
|
+
|
|
112
|
+
- [`parsivar`](https://github.com/ICTRC/Parsivar) — Persian preprocessing toolkit with POS tagger
|
|
113
|
+
|
|
114
|
+
```
|
|
115
|
+
pip install parsivar
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
> If `parsivar` is not installed, the script falls back to the morphological heuristic automatically — no configuration needed.
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## Usage
|
|
123
|
+
|
|
124
|
+
**Direct text:**
|
|
125
|
+
|
|
126
|
+
```
|
|
127
|
+
persian-readability -t "متن فارسی شما"
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
**From a file:**
|
|
131
|
+
|
|
132
|
+
```
|
|
133
|
+
persian-readability -f sample.txt
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
**From stdin (pipe):**
|
|
137
|
+
|
|
138
|
+
```
|
|
139
|
+
echo "متن فارسی شما" | persian-readability
|
|
140
|
+
cat article.txt | persian-readability
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
**Raw score only (for scripting):**
|
|
144
|
+
|
|
145
|
+
```
|
|
146
|
+
persian-readability -f sample.txt --plain
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
**With debug logging:**
|
|
150
|
+
|
|
151
|
+
```
|
|
152
|
+
persian-readability -f sample.txt --verbose
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
---
|
|
156
|
+
|
|
157
|
+
## Python API Usage
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from persian_readability import calculate_readability
|
|
161
|
+
|
|
162
|
+
result = calculate_readability("برای پیشگیری از پوسیدگی دندان، روزی دو بار مسواک بزنید.")
|
|
163
|
+
print(result)
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Real-World Examples
|
|
169
|
+
|
|
170
|
+
### Example 1 — Public health text
|
|
171
|
+
|
|
172
|
+
**Input:**
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
persian-readability -t "برای پیشگیری از پوسیدگی دندان، بهتر است روزی دو بار مسواک بزنید و مصرف مواد قندی را کاهش دهید."
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
**Possible use case:**
|
|
179
|
+
|
|
180
|
+
This can help public health educators check whether patient-facing Persian health messages are simple enough for the general public.
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
### Example 2 — Academic text
|
|
185
|
+
|
|
186
|
+
**Input:**
|
|
187
|
+
|
|
188
|
+
```bash
|
|
189
|
+
persian-readability -t "شاخصهای زیستی بزاقی میتوانند در تشخیص زودهنگام برخی بیماریهای دهان و فک و صورت نقش مهمی داشته باشند."
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
**Possible use case:**
|
|
193
|
+
|
|
194
|
+
Researchers can compare the readability of Persian academic summaries, abstracts, or educational materials.
|
|
195
|
+
|
|
196
|
+
---
|
|
197
|
+
|
|
198
|
+
### Example 3 — Pipeline use
|
|
199
|
+
|
|
200
|
+
**Input:**
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
cat article.txt | persian-readability --plain
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
**Possible use case:**
|
|
207
|
+
|
|
208
|
+
Developers can integrate the readability score into larger Persian NLP or content-quality workflows.
|
|
209
|
+
|
|
210
|
+
---
|
|
211
|
+
|
|
212
|
+
## Sample Output
|
|
213
|
+
|
|
214
|
+
```text
|
|
215
|
+
══════════════════════════════════════════════════════
|
|
216
|
+
Persian Readability — Flesch–Dayani
|
|
217
|
+
══════════════════════════════════════════════════════
|
|
218
|
+
جملات : 5
|
|
219
|
+
کلمات : 87
|
|
220
|
+
حروف : 412
|
|
221
|
+
هجاها : 201
|
|
222
|
+
روش : POS-enhanced — Parsivar
|
|
223
|
+
────────────────────────────────────────────────────
|
|
224
|
+
ASL (کلمه/جمله) : 17.40
|
|
225
|
+
WL (حرف/کلمه) : 4.74
|
|
226
|
+
ASYL (هجا/کلمه) : 2.31
|
|
227
|
+
────────────────────────────────────────────────────
|
|
228
|
+
امتیاز Flesch–Dayani : 58.34
|
|
229
|
+
سطح خوانایی : متوسط — مناسب دانشآموزان دبیرستان
|
|
230
|
+
══════════════════════════════════════════════════════
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
## Formula
|
|
236
|
+
|
|
237
|
+
```
|
|
238
|
+
FDR = 262.835 − 0.846 × ASYL − 1.015 × ASL
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
Where **ASYL** = average syllables per word and **ASL** = average words per sentence.
|
|
242
|
+
Higher scores indicate easier text.
|
|
243
|
+
|
|
244
|
+
---
|
|
245
|
+
|
|
246
|
+
## How Syllable Accuracy Tiers Work
|
|
247
|
+
|
|
248
|
+
| Mode | Accuracy | How |
|
|
249
|
+
|------|----------|-----|
|
|
250
|
+
| POS-enhanced | ~85% | Parsivar POSTagger (wapiti CRF, Bijankhan corpus) detects verb/adjective tags; prefix/suffix rules applied per POS |
|
|
251
|
+
| Morphological heuristic | ~75% | Counts written long vowels (ا و ی), diacritics, and word-final ه; no POS context |
|
|
252
|
+
|
|
253
|
+
Main cases where POS tagging improves accuracy:
|
|
254
|
+
|
|
255
|
+
- Verbs with attached `می`/`نمی` prefix (no half-space): `میرود` → +1 syllable
|
|
256
|
+
- Comparative/superlative adjectives: `بهترین` → suffix `ترین` = 2 syllables
|
|
257
|
+
|
|
258
|
+
### خواه Classifier
|
|
259
|
+
|
|
260
|
+
The word `خواه` has multiple roles in Persian. A three-layer classifier resolves ambiguity **before** syllable counting:
|
|
261
|
+
|
|
262
|
+
| Label | Examples | Treatment |
|
|
263
|
+
|-------|---------|-----------|
|
|
264
|
+
| `FUTURE_AUX` | خواهم رفت، نخواهند پذیرفت | syllable count unchanged (the base syllable count is already correct) |
|
|
265
|
+
| `LEXICAL_KHASTAN` | خواهد که برود، این را خواهد | original tag is preserved |
|
|
266
|
+
| `PARTICLE_KHAH` | خواه بیاید خواه نیاید | treated as non-verb |
|
|
267
|
+
| `NOMINAL_DERIVATIVE` | خواهش، خواهان، خواهنده | treated as non-verb |
|
|
268
|
+
| `INDEPENDENT_WORD` | خواهر، خواهران | treated as non-verb |
|
|
269
|
+
| `SUFFIX_COMPOUND` | آزادیخواه، خیرخواه، دادخواه | treated as non-verb |
|
|
270
|
+
|
|
271
|
+
The classifier uses exact lexical sets (layer 1), suffix-compound detection (layer 2), and a 2-token context window (layer 3) — never a simple prefix regex.
|
|
272
|
+
|
|
273
|
+
---
|
|
274
|
+
|
|
275
|
+
## Notes
|
|
276
|
+
|
|
277
|
+
- **Minimum text length:** The Flesch–Dayani formula is designed for running prose. Texts shorter than ~50 words produce unstable scores. A warning is emitted in this case (visible with `--verbose`).
|
|
278
|
+
- **Punctuation filtering:** Persian and Latin punctuation marks (quotation marks, periods, commas, ...) are stripped from the edges of each token, and tokens that consist entirely of punctuation are excluded from the counts.
|
|
279
|
+
- **stdin:** When running interactively without `-t` or `-f`, the script waits for input and prints a prompt. Press `Ctrl+D` to signal end of input.
|
|
280
|
+
- **Log messages:** All warnings go to stderr and do not affect `--plain` output.
|
|
281
|
+
|
|
282
|
+
---
|
|
283
|
+
|
|
284
|
+
## Running Tests
|
|
285
|
+
|
|
286
|
+
```
|
|
287
|
+
pip install pytest hazm
|
|
288
|
+
python -m pytest tests/test_core.py -v
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
76 tests covering: خواه classifier (all 9 document cases), punctuation filtering,
|
|
292
|
+
syllable counting, heuristic limitations, formula verification, and edge cases.
|
|
293
|
+
|
|
294
|
+
---
|
|
295
|
+
|
|
296
|
+
## References
|
|
297
|
+
|
|
298
|
+
- Dayani, M. (1374/1995). *سنجش خوانایی متون فارسی*. Persian adaptation of the Flesch Reading Ease formula.
|
|
299
|
+
- Mohtaj et al. (2018). [Parsivar: A Language Processing Toolkit for Persian](https://github.com/ICTRC/Parsivar). LREC 2018.
|
|
300
|
+
- Mohammadi & Khasteh (2020). [A Machine Learning Approach to Persian Text Readability](https://arxiv.org/abs/1810.06639).
|
|
301
|
+
- Sobhe. [hazm — Persian NLP library](https://github.com/roshan-research/hazm).
|
|
302
|
+
|
|
303
|
+
---
|
|
304
|
+
|
|
305
|
+
## Author
|
|
306
|
+
|
|
307
|
+
**Dr. Mohammad Pirouzan** — [@Drpirouzan](https://github.com/Drpirouzan)
|
|
308
|
+
|
|
309
|
+
---
|
|
310
|
+
|
|
311
|
+
## License
|
|
312
|
+
|
|
313
|
+
MIT License — see [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
# Persian Readability (Flesch–Dayani)
|
|
2
|
+
|
|
3
|
+
A lightweight Python package and command-line tool to calculate the **Flesch–Dayani readability score** for Persian (Farsi) text — with an optional POS-enhanced syllable counter for higher accuracy.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- Persian text normalization and tokenization via `hazm`
|
|
10
|
+
- **Punctuation-aware tokenization** — punctuation marks are excluded from the word and syllable counts
|
|
11
|
+
- **Two-tier syllable counting:**
|
|
12
|
+
- **POS-enhanced** (Better Accuracy) — if `parsivar` is installed, uses part-of-speech tags to correctly count syllables in verbs with attached prefixes (`میرود`، `نمیدانم`) and comparative adjectives (`بهتر`، `بزرگترین`)
|
|
13
|
+
- **Morphological heuristic** (Good Accuracy) — used automatically if `parsivar` is not installed
|
|
14
|
+
- **Context-aware خواه classifier** — three-layer disambiguation prevents confusing `خواهش`, `خواهر`, `آزادیخواه`, and `خواه ... خواه ...` with the future auxiliary (`خواهم رفت`)
|
|
15
|
+
- Computes:
|
|
16
|
+
- Number of sentences, words, letters, and syllables
|
|
17
|
+
- **ASL** — Average Sentence Length (words per sentence)
|
|
18
|
+
- **WL** — Average Word Length (letters per word)
|
|
19
|
+
- **ASYL** — Average Syllables per Word *(used in the original Dayani formula)*
|
|
20
|
+
- Flesch–Dayani readability score
|
|
21
|
+
- **Human-readable level** (e.g. *متوسط — مناسب دانشآموزان دبیرستان*)
|
|
22
|
+
- Accepts input from a file, a command-line argument, or **stdin** (pipe-friendly)
|
|
23
|
+
- `--plain` flag for scripting and pipeline use
|
|
24
|
+
- `--verbose` flag for debug logging
|
|
25
|
+
- Warns when text is too short for a reliable score (< 50 words)
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Readability Levels
|
|
30
|
+
|
|
31
|
+
| Score | Level |
|
|
32
|
+
|-------|-------|
|
|
33
|
+
| ≥ 90 | بسیار آسان — مناسب کودکان دبستانی |
|
|
34
|
+
| ≥ 80 | آسان — مناسب نوجوانان |
|
|
35
|
+
| ≥ 70 | نسبتاً آسان — مناسب عموم مردم |
|
|
36
|
+
| ≥ 60 | متوسط — مناسب دانشآموزان دبیرستان |
|
|
37
|
+
| ≥ 50 | نسبتاً دشوار — مناسب دانشجویان |
|
|
38
|
+
| ≥ 30 | دشوار — مناسب متخصصان |
|
|
39
|
+
| < 30 | بسیار دشوار — متون علمی/تخصصی |
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
|
|
45
|
+
Install from PyPI after release:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install persian-readability
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
For local development:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install -e ".[dev]"
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
For optional POS-enhanced syllable counting:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install "persian-readability[pos]"
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## Requirements
|
|
66
|
+
|
|
67
|
+
### Required
|
|
68
|
+
|
|
69
|
+
- Python **3.10** or newer
|
|
70
|
+
- [`hazm`](https://github.com/roshan-research/hazm) — Persian NLP library
|
|
71
|
+
|
|
72
|
+
```
|
|
73
|
+
pip install hazm
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Optional (for higher syllable accuracy)
|
|
77
|
+
|
|
78
|
+
- [`parsivar`](https://github.com/ICTRC/Parsivar) — Persian preprocessing toolkit with POS tagger
|
|
79
|
+
|
|
80
|
+
```
|
|
81
|
+
pip install parsivar
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
> If `parsivar` is not installed, the script falls back to the morphological heuristic automatically — no configuration needed.
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## Usage
|
|
89
|
+
|
|
90
|
+
**Direct text:**
|
|
91
|
+
|
|
92
|
+
```
|
|
93
|
+
persian-readability -t "متن فارسی شما"
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
**From a file:**
|
|
97
|
+
|
|
98
|
+
```
|
|
99
|
+
persian-readability -f sample.txt
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
**From stdin (pipe):**
|
|
103
|
+
|
|
104
|
+
```
|
|
105
|
+
echo "متن فارسی شما" | persian-readability
|
|
106
|
+
cat article.txt | persian-readability
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
**Raw score only (for scripting):**
|
|
110
|
+
|
|
111
|
+
```
|
|
112
|
+
persian-readability -f sample.txt --plain
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
**With debug logging:**
|
|
116
|
+
|
|
117
|
+
```
|
|
118
|
+
persian-readability -f sample.txt --verbose
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Python API Usage
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
from persian_readability import calculate_readability
|
|
127
|
+
|
|
128
|
+
result = calculate_readability("برای پیشگیری از پوسیدگی دندان، روزی دو بار مسواک بزنید.")
|
|
129
|
+
print(result)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## Real-World Examples
|
|
135
|
+
|
|
136
|
+
### Example 1 — Public health text
|
|
137
|
+
|
|
138
|
+
**Input:**
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
persian-readability -t "برای پیشگیری از پوسیدگی دندان، بهتر است روزی دو بار مسواک بزنید و مصرف مواد قندی را کاهش دهید."
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
**Possible use case:**
|
|
145
|
+
|
|
146
|
+
This can help public health educators check whether patient-facing Persian health messages are simple enough for the general public.
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
### Example 2 — Academic text
|
|
151
|
+
|
|
152
|
+
**Input:**
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
persian-readability -t "شاخصهای زیستی بزاقی میتوانند در تشخیص زودهنگام برخی بیماریهای دهان و فک و صورت نقش مهمی داشته باشند."
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
**Possible use case:**
|
|
159
|
+
|
|
160
|
+
Researchers can compare the readability of Persian academic summaries, abstracts, or educational materials.
|
|
161
|
+
|
|
162
|
+
---
|
|
163
|
+
|
|
164
|
+
### Example 3 — Pipeline use
|
|
165
|
+
|
|
166
|
+
**Input:**
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
cat article.txt | persian-readability --plain
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
**Possible use case:**
|
|
173
|
+
|
|
174
|
+
Developers can integrate the readability score into larger Persian NLP or content-quality workflows.
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## Sample Output
|
|
179
|
+
|
|
180
|
+
```text
|
|
181
|
+
══════════════════════════════════════════════════════
|
|
182
|
+
Persian Readability — Flesch–Dayani
|
|
183
|
+
══════════════════════════════════════════════════════
|
|
184
|
+
جملات : 5
|
|
185
|
+
کلمات : 87
|
|
186
|
+
حروف : 412
|
|
187
|
+
هجاها : 201
|
|
188
|
+
روش : POS-enhanced — Parsivar
|
|
189
|
+
────────────────────────────────────────────────────
|
|
190
|
+
ASL (کلمه/جمله) : 17.40
|
|
191
|
+
WL (حرف/کلمه) : 4.74
|
|
192
|
+
ASYL (هجا/کلمه) : 2.31
|
|
193
|
+
────────────────────────────────────────────────────
|
|
194
|
+
امتیاز Flesch–Dayani : 58.34
|
|
195
|
+
سطح خوانایی : متوسط — مناسب دانشآموزان دبیرستان
|
|
196
|
+
══════════════════════════════════════════════════════
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## Formula
|
|
202
|
+
|
|
203
|
+
```
|
|
204
|
+
FDR = 262.835 − 0.846 × ASYL − 1.015 × ASL
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
Where **ASYL** = average syllables per word and **ASL** = average words per sentence.
|
|
208
|
+
Higher scores indicate easier text.
|
|
209
|
+
|
|
210
|
+
---
|
|
211
|
+
|
|
212
|
+
## How Syllable Accuracy Tiers Work
|
|
213
|
+
|
|
214
|
+
| Mode | Accuracy | How |
|
|
215
|
+
|------|----------|-----|
|
|
216
|
+
| POS-enhanced | ~85% | Parsivar POSTagger (wapiti CRF, Bijankhan corpus) detects verb/adjective tags; prefix/suffix rules applied per POS |
|
|
217
|
+
| Morphological heuristic | ~75% | Counts written long vowels (ا و ی), diacritics, and word-final ه; no POS context |
|
|
218
|
+
|
|
219
|
+
Main cases where POS tagging improves accuracy:
|
|
220
|
+
|
|
221
|
+
- Verbs with attached `می`/`نمی` prefix (no half-space): `میرود` → +1 syllable
|
|
222
|
+
- Comparative/superlative adjectives: `بهترین` → suffix `ترین` = 2 syllables
|
|
223
|
+
|
|
224
|
+
### خواه Classifier
|
|
225
|
+
|
|
226
|
+
The word `خواه` has multiple roles in Persian. A three-layer classifier resolves ambiguity **before** syllable counting:
|
|
227
|
+
|
|
228
|
+
| Label | Examples | Treatment |
|
|
229
|
+
|-------|---------|-----------|
|
|
230
|
+
| `FUTURE_AUX` | خواهم رفت، نخواهند پذیرفت | syllable count unchanged (the base syllable count is already correct) |
|
|
231
|
+
| `LEXICAL_KHASTAN` | خواهد که برود، این را خواهد | original tag is preserved |
|
|
232
|
+
| `PARTICLE_KHAH` | خواه بیاید خواه نیاید | treated as non-verb |
|
|
233
|
+
| `NOMINAL_DERIVATIVE` | خواهش، خواهان، خواهنده | treated as non-verb |
|
|
234
|
+
| `INDEPENDENT_WORD` | خواهر، خواهران | treated as non-verb |
|
|
235
|
+
| `SUFFIX_COMPOUND` | آزادیخواه، خیرخواه، دادخواه | treated as non-verb |
|
|
236
|
+
|
|
237
|
+
The classifier uses exact lexical sets (layer 1), suffix-compound detection (layer 2), and a 2-token context window (layer 3) — never a simple prefix regex.
|
|
238
|
+
|
|
239
|
+
---
|
|
240
|
+
|
|
241
|
+
## Notes
|
|
242
|
+
|
|
243
|
+
- **Minimum text length:** The Flesch–Dayani formula is designed for running prose. Texts shorter than ~50 words produce unstable scores. A warning is emitted in this case (visible with `--verbose`).
|
|
244
|
+
- **Punctuation filtering:** Persian and Latin punctuation marks (quotation marks, periods, commas, ...) are stripped from the edges of each token, and tokens that consist entirely of punctuation are excluded from the counts.
|
|
245
|
+
- **stdin:** When running interactively without `-t` or `-f`, the script waits for input and prints a prompt. Press `Ctrl+D` to signal end of input.
|
|
246
|
+
- **Log messages:** All warnings go to stderr and do not affect `--plain` output.
|
|
247
|
+
|
|
248
|
+
---
|
|
249
|
+
|
|
250
|
+
## Running Tests
|
|
251
|
+
|
|
252
|
+
```
|
|
253
|
+
pip install pytest hazm
|
|
254
|
+
python -m pytest tests/test_core.py -v
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
76 tests covering: خواه classifier (all 9 document cases), punctuation filtering,
|
|
258
|
+
syllable counting, heuristic limitations, formula verification, and edge cases.
|
|
259
|
+
|
|
260
|
+
---
|
|
261
|
+
|
|
262
|
+
## References
|
|
263
|
+
|
|
264
|
+
- Dayani, M. (1374/1995). *سنجش خوانایی متون فارسی*. Persian adaptation of the Flesch Reading Ease formula.
|
|
265
|
+
- Mohtaj et al. (2018). [Parsivar: A Language Processing Toolkit for Persian](https://github.com/ICTRC/Parsivar). LREC 2018.
|
|
266
|
+
- Mohammadi & Khasteh (2020). [A Machine Learning Approach to Persian Text Readability](https://arxiv.org/abs/1810.06639).
|
|
267
|
+
- Sobhe. [hazm — Persian NLP library](https://github.com/roshan-research/hazm).
|
|
268
|
+
|
|
269
|
+
---
|
|
270
|
+
|
|
271
|
+
## Author
|
|
272
|
+
|
|
273
|
+
**Dr. Mohammad Pirouzan** — [@Drpirouzan](https://github.com/Drpirouzan)
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
## License
|
|
278
|
+
|
|
279
|
+
MIT License — see [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Persian Readability — Flesch–Dayani readability score for Persian/Farsi text.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from . import core as _core
|
|
6
|
+
|
|
7
|
+
# Mirror every non-dunder attribute of ``core`` at package level -- internal
# helpers included -- so imports and tests written before the project became
# a package keep resolving against ``persian_readability`` directly.
for _name in dir(_core):
    if _name.startswith("__"):
        # Dunder attributes are not copied, so this module keeps its own.
        continue
    globals()[_name] = getattr(_core, _name)
|
|
12
|
+
|
|
13
|
+
__version__ = "0.1.2"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class PersianReadability:
    """Stateless, object-style facade over the module-level calculators.

    Both methods forward their arguments unchanged; the class stores nothing
    on the instance.
    """

    def analyze(self, text: str, mode="auto"):
        """Return the result of ``compute_flesch_dayani`` for *text*.

        Args:
            text: The text to analyse.
            mode: Forwarded as the ``mode`` keyword argument.
        """
        analysis = compute_flesch_dayani(text, mode=mode)
        return analysis

    def calculate(self, text: str, mode="auto") -> dict:
        """Return the dictionary built by ``calculate_readability`` for *text*.

        Args:
            text: The text to analyse.
            mode: Forwarded as the ``mode`` keyword argument.
        """
        summary = calculate_readability(text, mode=mode)
        return summary
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def calculate_readability(text: str, mode="auto") -> dict:
    """
    Analyse Persian/Farsi text and return the metrics as a plain dictionary.

    Intended for callers who want JSON-like output rather than the
    ReadabilityResult dataclass.

    Args:
        text: The text to analyse.
        mode: Forwarded unchanged to ``compute_flesch_dayani``.

    Returns:
        A dict with the keys ``score``, ``level``, ``sentences``, ``words``,
        ``letters``, ``syllables``, ``asl``, ``wl``, ``asyl``, ``pos_mode``,
        ``pos_enhanced``, ``is_likely_poetry``, ``diacritics_mode`` and
        ``diacritic_ratio``, in that order.
    """
    analysis = compute_flesch_dayani(text, mode=mode)

    # "score" is the only key whose name differs from the attribute it reads.
    summary = {"score": analysis.flesch_dayani}

    # All remaining keys reuse the attribute name verbatim; the tuple order
    # fixes the key order of the returned dictionary.
    same_named_fields = (
        "level",
        "sentences",
        "words",
        "letters",
        "syllables",
        "asl",
        "wl",
        "asyl",
        "pos_mode",
        "pos_enhanced",
        "is_likely_poetry",
        "diacritics_mode",
        "diacritic_ratio",
    )
    for field_name in same_named_fields:
        summary[field_name] = getattr(analysis, field_name)

    return summary
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Names bound by ``from persian_readability import *``.
__all__ = [
    # Wrappers defined in this module.
    "PersianReadability", "calculate_readability",
    # Names not defined in this module; they reach it through the
    # package-level re-export of ``core``.
    "compute_flesch_dayani", "ReadabilityResult", "InputMode",
    "count_syllables", "count_letters", "interpret_score",
    "analyze_diacritics",
]
|