persian-readability 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- persian_readability/__init__.py +63 -0
- persian_readability/core.py +829 -0
- persian_readability-0.1.2.dist-info/METADATA +313 -0
- persian_readability-0.1.2.dist-info/RECORD +8 -0
- persian_readability-0.1.2.dist-info/WHEEL +5 -0
- persian_readability-0.1.2.dist-info/entry_points.txt +2 -0
- persian_readability-0.1.2.dist-info/licenses/LICENSE +21 -0
- persian_readability-0.1.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Persian Readability — Flesch–Dayani readability score for Persian/Farsi text.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from . import core as _core
|
|
6
|
+
|
|
7
|
+
# Re-export every non-dunder name from the core module (private helpers
# included) so imports written against the old single-module layout keep
# working after the project became a package.  A comprehension is used so
# that no loop variable is left behind in the package namespace.
globals().update(
    {
        _name: getattr(_core, _name)
        for _name in dir(_core)
        if not _name.startswith("__")
    }
)
|
|
12
|
+
|
|
13
|
+
__version__ = "0.1.2"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class PersianReadability:
    """Stateless, object-style facade over the module-level readability functions."""

    def analyze(self, text: str, mode="auto"):
        """Forward *text* and *mode* to ``compute_flesch_dayani`` and return its result."""
        result = compute_flesch_dayani(text, mode=mode)
        return result

    def calculate(self, text: str, mode="auto") -> dict:
        """Forward *text* and *mode* to ``calculate_readability`` and return its dict."""
        summary = calculate_readability(text, mode=mode)
        return summary
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def calculate_readability(text: str, mode="auto") -> dict:
    """
    Compute the readability of *text* and return the result as a plain dict.

    The values are read from the object returned by ``compute_flesch_dayani``;
    the ``flesch_dayani`` attribute is exposed under the key ``"score"`` and
    every other key carries the name of the attribute it was read from.
    """
    result = compute_flesch_dayani(text, mode=mode)

    # Keys are inserted in this order, after "score".
    same_named_fields = (
        "level",
        "sentences",
        "words",
        "letters",
        "syllables",
        "asl",
        "wl",
        "asyl",
        "pos_mode",
        "pos_enhanced",
        "is_likely_poetry",
        "diacritics_mode",
        "diacritic_ratio",
    )

    summary = {"score": result.flesch_dayani}
    for field_name in same_named_fields:
        summary[field_name] = getattr(result, field_name)
    return summary
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
__all__ = [
|
|
54
|
+
"PersianReadability",
|
|
55
|
+
"calculate_readability",
|
|
56
|
+
"compute_flesch_dayani",
|
|
57
|
+
"ReadabilityResult",
|
|
58
|
+
"InputMode",
|
|
59
|
+
"count_syllables",
|
|
60
|
+
"count_letters",
|
|
61
|
+
"interpret_score",
|
|
62
|
+
"analyze_diacritics",
|
|
63
|
+
]
|
|
@@ -0,0 +1,829 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import logging
|
|
5
|
+
import re
|
|
6
|
+
import sys
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from functools import lru_cache
|
|
9
|
+
|
|
10
|
+
from hazm import Normalizer, sent_tokenize, word_tokenize
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
# ── بارگذاری اختیاری Parsivar ────────────────────────────────────────────────
|
|
15
|
+
try:
|
|
16
|
+
from parsivar import POSTagger as ParsivarPOSTagger
|
|
17
|
+
from parsivar import Tokenizer as ParsivarTokenizer
|
|
18
|
+
_PARSIVAR_AVAILABLE = True
|
|
19
|
+
except ImportError:
|
|
20
|
+
_PARSIVAR_AVAILABLE = False
|
|
21
|
+
|
|
22
|
+
# ── Tokenizer سریع مبتنی بر regex ───────────────────────────────────────────
|
|
23
|
+
# hazm.word_tokenize روی متنهای بزرگ O(n²) رفتار دارد (~23s برای 500 کلمه).
|
|
24
|
+
# این regex کلمات فارسی و لاتین (به علاوه نیمفاصله) را استخراج میکند
|
|
25
|
+
# و ۱۰۰۰+ برابر سریعتر از hazm است. برای POS-tagging هنوز از Parsivar استفاده میشود.
|
|
26
|
+
_WORD_REGEX = __import__('re').compile(r'[\u0600-\u06ff\u200c\w]+')
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _fast_word_tokenize(text: str) -> list[str]:
    """Regex-based replacement for hazm.word_tokenize: return every ``_WORD_REGEX`` match in *text*."""
    # The pattern has no capture groups, so each match's full text is the token.
    return [match.group() for match in _WORD_REGEX.finditer(text)]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# ── برچسبهای POS (Parsivar / Bijankhan) ────────────────────────────────────
|
|
35
|
+
_VERB_TAGS_PARSIVAR: frozenset[str] = frozenset(
|
|
36
|
+
{"V_PRS", "V_PA", "V_IMP", "V_SUB", "V_FUT", "V_PRF"}
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# ── الگوی پیشوندهای فعلی (می/نمی پیوسته) ────────────────────────────────────
|
|
40
|
+
_VERB_PREFIX_ATTACHED = re.compile(r"^(می|نمی)(?!\u200c)")
|
|
41
|
+
|
|
42
|
+
# ── اعراب و واکههای بلند ────────────────────────────────────────────────────
|
|
43
|
+
_FA_DIACRITICS: frozenset[str] = frozenset("\u064e\u064f\u0650\u064b\u064c\u064d")
|
|
44
|
+
_FA_LONG_VOWELS: frozenset[str] = frozenset("اوی")
|
|
45
|
+
|
|
46
|
+
# ── حداقل کلمات برای نتیجه قابل اعتماد ─────────────────────────────────────
|
|
47
|
+
_MIN_WORDS_RELIABLE = 50
|
|
48
|
+
|
|
49
|
+
# ── ضرایب تصحیح ASYL ─────────────────────────────────────────────────────────
|
|
50
|
+
# فارسی بدون اعراب نوشته میشود. واکههای کوتاه (فتحه/ضمه/کسره) در نوشتار
|
|
51
|
+
# روزمره دیده نمیشوند → ASYL اندازهگیریشده کمتر از مقدار واقعی است.
|
|
52
|
+
#
|
|
53
|
+
# دو factor متفاوت بر اساس نوع متن:
|
|
54
|
+
#
|
|
55
|
+
# نثر معاصر:
|
|
56
|
+
# کلمات معاصر اغلب واکههای بلند نوشته دارند (آموزش، تصمیم، افزایش).
|
|
57
|
+
# کمبود واکه نوشته ~33٪ → factor = 1.5
|
|
58
|
+
# کالیبره شده بر: خبر روزنامه→متوسط، مقاله علمی→بسیار دشوار، داستان کودک→آسان
|
|
59
|
+
#
|
|
60
|
+
# شعر کلاسیک:
|
|
61
|
+
# شعر فارسی کلاسیک اکثراً از هجاهای کوتاه CVCV بدون واکه نوشتاری است.
|
|
62
|
+
# کلماتی مثل «چنین»، «نهان»، «کاندر»، «براند» صرفاً یک واکه نوشته دارند.
|
|
63
|
+
# کمبود واکه نوشته ~50٪ → factor = 2.0
|
|
64
|
+
# کالیبره شده بر: شاهنامه→دشوار، حافظ→دشوار/نسبتاً دشوار
|
|
65
|
+
#
|
|
66
|
+
# منابع: Megerdoomian (2000), Anvari & Givi (1382), تحلیل grid search
|
|
67
|
+
_ASYL_CALIBRATION_PROSE: float = 1.5 # نثر معاصر
|
|
68
|
+
_ASYL_CALIBRATION_POETRY: float = 2.0 # شعر کلاسیک
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# ── حالتهای ورودی ────────────────────────────────────────────────────────────
|
|
72
|
+
from enum import Enum
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class InputMode(str, Enum):
    """Analysis mode for the input text (string-valued enum)."""
    AUTO = "auto"  # automatic detection (the default)
    DIACRITICS = "diacritics"  # vocalised text: short vowels are written out
    PLAIN = "plain"  # text without diacritics (the previous default behaviour)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# آستانه: اگر این نسبت از حروف دارای اعراب بودند → حالت diacritics
|
|
83
|
+
_DIACRITICS_THRESHOLD: float = 0.10 # ۱۰٪ از کاراکترهای فارسی
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def analyze_diacritics(text: str) -> dict:
    """
    Measure how heavily *text* is marked with short-vowel diacritics.

    Returns a dict with:
        diacritic_count — number of characters found in ``_FA_DIACRITICS``
        fa_char_count   — number of characters in the Arabic Unicode block
                          (U+0600–U+06FF) that are not in ``_FA_DIACRITICS``
        diacritic_ratio — ``diacritic_count / max(fa_char_count, 1)``
        has_diacritics  — whether the ratio reaches ``_DIACRITICS_THRESHOLD``
        suggested_mode  — ``InputMode.DIACRITICS`` or ``InputMode.PLAIN``
    """
    diac = 0
    fa_chars = 0
    for ch in text:
        if ch in _FA_DIACRITICS:
            diac += 1
        # Explicit escapes: the block boundaries are otherwise invisible or
        # easily lost characters, and an empty lower bound would make every
        # ASCII character count as a Persian letter.
        elif "\u0600" <= ch <= "\u06ff":
            fa_chars += 1

    # max(..., 1) keeps the ratio defined for text with no Arabic-block characters.
    ratio = diac / max(fa_chars, 1)
    has_diac = ratio >= _DIACRITICS_THRESHOLD
    return {
        "diacritic_count": diac,
        "fa_char_count": fa_chars,
        "diacritic_ratio": ratio,
        "has_diacritics": has_diac,
        "suggested_mode": InputMode.DIACRITICS if has_diac else InputMode.PLAIN,
    }
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
111
|
+
# Classifier سهلایه برای توکنهای «خواه»
|
|
112
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
113
|
+
|
|
114
|
+
_FUTURE_AUX_FORMS: frozenset[str] = frozenset({
|
|
115
|
+
"خواهم", "خواهی", "خواهد", "خواهیم", "خواهید", "خواهند",
|
|
116
|
+
"نخواهم", "نخواهی", "نخواهد", "نخواهیم", "نخواهید", "نخواهند",
|
|
117
|
+
})
|
|
118
|
+
|
|
119
|
+
_PARTICLE_KHAH_FORMS: frozenset[str] = frozenset({
|
|
120
|
+
"خواه", "خواه\u200cناخواه", "خواهناخواه",
|
|
121
|
+
})
|
|
122
|
+
|
|
123
|
+
_NOMINAL_KHAH_DERIVATIVES: frozenset[str] = frozenset({
|
|
124
|
+
"خواهش", "خواهشمند", "خواهشمندانه", "خواهان", "خواهنده",
|
|
125
|
+
})
|
|
126
|
+
|
|
127
|
+
_INDEPENDENT_KHAH_WORDS: frozenset[str] = frozenset({
|
|
128
|
+
"خواهر", "خواهران",
|
|
129
|
+
})
|
|
130
|
+
|
|
131
|
+
_FUTURE_MAIN_VERB_STEMS: frozenset[str] = frozenset({
|
|
132
|
+
"رفت", "کرد", "شد", "داد", "گفت", "آمد", "خواند", "نوشت",
|
|
133
|
+
"دید", "گرفت", "پذیرفت", "ساخت", "برد", "خورد", "زد",
|
|
134
|
+
"افتاد", "ماند", "بست", "ریخت", "فروخت", "خرید", "شکست",
|
|
135
|
+
"بود", "توانست", "خواست", "دانست", "پرسید", "فهمید",
|
|
136
|
+
"کشت", "سوخت", "آموخت", "یافت", "باخت", "انداخت",
|
|
137
|
+
"نشست", "برخاست", "پرداخت", "شناخت", "فرستاد", "برگشت",
|
|
138
|
+
})
|
|
139
|
+
|
|
140
|
+
_TAG_FUTURE_AUX = "_FUTURE_AUX"
|
|
141
|
+
_TAG_NON_VERB_KHAH = "_NON_VERB_KHAH"
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _is_suffix_compound_khah(token: str) -> bool:
    """Return True when *token* is a longer word that ends in «خواه» and is not a listed particle form.

    The length and suffix tests are made on the token with zero-width
    non-joiners removed; the particle lookup uses the token as given.
    """
    bare = token.replace("\u200c", "")
    if len(bare) <= 4:
        return False
    if not bare.endswith("خواه"):
        return False
    return token not in _PARTICLE_KHAH_FORMS
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _classify_khah(tokens: list[str], i: int) -> str:
    """Classify ``tokens[i]`` into one of the «خواه» categories.

    Returns one of "PARTICLE_KHAH", "NOMINAL_DERIVATIVE", "INDEPENDENT_WORD",
    "SUFFIX_COMPOUND", "FUTURE_AUX", "LEXICAL_KHASTAN" or "OTHER".  The checks
    run in that priority order; the future/lexical decision looks at up to two
    following tokens.
    """
    current = tokens[i]

    if current in _PARTICLE_KHAH_FORMS:
        return "PARTICLE_KHAH"
    if current in _NOMINAL_KHAH_DERIVATIVES:
        return "NOMINAL_DERIVATIVE"
    if current in _INDEPENDENT_KHAH_WORDS or current.startswith("خواهر"):
        return "INDEPENDENT_WORD"
    if _is_suffix_compound_khah(current):
        return "SUFFIX_COMPOUND"
    if current not in _FUTURE_AUX_FORMS:
        return "OTHER"

    # From here on the token is one of the conjugated forms in _FUTURE_AUX_FORMS.
    size = len(tokens)
    following = tokens[i + 1] if i + 1 < size else None
    after_following = tokens[i + 2] if i + 2 < size else None

    # A directly following «که» is classified as the lexical verb.
    if following == "که":
        return "LEXICAL_KHASTAN"
    # A known verb stem in one of the next two positions marks the auxiliary.
    if following in _FUTURE_MAIN_VERB_STEMS or after_following in _FUTURE_MAIN_VERB_STEMS:
        return "FUTURE_AUX"
    return "LEXICAL_KHASTAN"
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _annotate_khah_tokens(
    tagged_words: list[tuple[str, str | None]],
) -> list[tuple[str, str | None]]:
    """Return a copy of *tagged_words* with the tags of «خواه» tokens overridden.

    Tokens classified as "FUTURE_AUX" receive ``_TAG_FUTURE_AUX``; particle,
    nominal, independent and suffix-compound classes receive
    ``_TAG_NON_VERB_KHAH``.  All other entries are copied unchanged and the
    input list is not modified.
    """
    non_verb_classes = (
        "PARTICLE_KHAH",
        "NOMINAL_DERIVATIVE",
        "INDEPENDENT_WORD",
        "SUFFIX_COMPOUND",
    )
    surface_forms = [word for word, _unused in tagged_words]
    annotated = list(tagged_words)

    for position, (token, _unused) in enumerate(tagged_words):
        if "خواه" in token:
            category = _classify_khah(surface_forms, position)
            if category == "FUTURE_AUX":
                annotated[position] = (token, _TAG_FUTURE_AUX)
            elif category in non_verb_classes:
                annotated[position] = (token, _TAG_NON_VERB_KHAH)

    return annotated
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
195
|
+
# هجاشماری
|
|
196
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
197
|
+
|
|
198
|
+
def _count_en_syllables(word: str) -> int:
|
|
199
|
+
w = word.lower().strip(".,!?;:\"'()[]{}")
|
|
200
|
+
if not w:
|
|
201
|
+
return 1
|
|
202
|
+
count = len(re.findall(r"[aeiou]+", w))
|
|
203
|
+
if w.endswith("e") and count > 1:
|
|
204
|
+
count -= 1
|
|
205
|
+
return max(count, 1)
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _count_fa_syllables_base(word: str) -> int:
    """
    Estimate the syllable count of a Persian word from its written vowels.

    Short vowels are normally not written in Persian script, so this heuristic
    counts only what is visible:
      - «آ» (alef with madda) — one syllable
      - «ا», «و», «ی» — one syllable each, in any position (including word-initial)
      - every character in ``_FA_DIACRITICS`` — one syllable each
      - a word-final «ه» — one syllable, unless the preceding character is in
        ``_FA_LONG_VOWELS``

    Zero-width non-joiners are removed first.  Returns 0 for an empty word and
    at least 1 otherwise.  Because unwritten vowels are missed, the raw count
    under-estimates the real one; the caller compensates with the
    ``_ASYL_CALIBRATION_PROSE`` / ``_ASYL_CALIBRATION_POETRY`` factors.
    """
    word = word.replace("\u200c", "")
    if not word:
        return 0

    last = len(word) - 1
    syllables = 0

    for i, ch in enumerate(word):
        if ch in ("آ", "ا", "و", "ی") or ch in _FA_DIACRITICS:
            syllables += 1
        elif ch == "ه" and i == last and i > 0:
            # A final «ه» after a written long vowel adds no syllable.
            if word[i - 1] not in _FA_LONG_VOWELS:
                syllables += 1

    return max(syllables, 1)
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _count_fa_syllables_diacritic(word: str) -> int:
|
|
262
|
+
"""
|
|
263
|
+
هجاشماری دقیق برای متن اعرابگذاریشده (معرَّب) — دقت ~۹۵٪.
|
|
264
|
+
|
|
265
|
+
قواعد با lookahead:
|
|
266
|
+
- اِعراب کوتاه (فتحه/ضمه/کسره/تنوین) = ۱ هجا
|
|
267
|
+
- واکه بلند (ا/و/ی) + اِعراب بعدی = صامت (onset) → نه هجا
|
|
268
|
+
(مثل یَ، وَ = ی/و به عنوان صامت با فتحه)
|
|
269
|
+
- واکه بلند (ا/و/ی) بعد از اِعراب = تمدید هجای قبلی → نه هجا
|
|
270
|
+
- واکه بلند (ا/و/ی) بدون اِعراب قبل/بعد = ۱ هجای مستقل
|
|
271
|
+
- سکون (ْ) و تشدید (ّ): هجا اضافه نمیکنند
|
|
272
|
+
|
|
273
|
+
مثالها:
|
|
274
|
+
کِتَابْ → ِ(1) َ(2) ا(extend) بْ → ۲ هجا ✓
|
|
275
|
+
دَانِشْگَاهْ → َ(1) ا(extend) ِ(2) شْ َ(3) ا(extend) هْ → ۳ هجا ✓
|
|
276
|
+
یَکیْ → ی(consonant,next=فتحه) َ(1) کِی(2) ْ → ۲ هجا ✓
|
|
277
|
+
سَخَنْ → َ(1) َ(2) نْ → ۲ هجا ✓
|
|
278
|
+
گُفْتْ → ُ(1) فْ تْ → ۱ هجا ✓
|
|
279
|
+
"""
|
|
280
|
+
word = word.replace("\u200c", "")
|
|
281
|
+
if not word:
|
|
282
|
+
return 0
|
|
283
|
+
|
|
284
|
+
_SHORT_VOWELS = "\u064e\u064f\u0650\u064b\u064c\u064d" # فتحه ضمه کسره تنوین
|
|
285
|
+
_LONG_VOWELS = "اوی"
|
|
286
|
+
|
|
287
|
+
syllables = 0
|
|
288
|
+
prev_was_short_vowel = False
|
|
289
|
+
|
|
290
|
+
for i, ch in enumerate(word):
|
|
291
|
+
next_ch = word[i + 1] if i + 1 < len(word) else None
|
|
292
|
+
|
|
293
|
+
if ch in _SHORT_VOWELS:
|
|
294
|
+
syllables += 1
|
|
295
|
+
prev_was_short_vowel = True
|
|
296
|
+
|
|
297
|
+
elif ch in _LONG_VOWELS:
|
|
298
|
+
if prev_was_short_vowel:
|
|
299
|
+
# تمدید هجای قبلی: فَا، کُو، بِی — هجای جدید نیست
|
|
300
|
+
pass
|
|
301
|
+
elif next_ch is not None and next_ch in _SHORT_VOWELS:
|
|
302
|
+
# صامت onset: یَ، وَ، یِ — واکه بلند به عنوان صامت
|
|
303
|
+
pass
|
|
304
|
+
else:
|
|
305
|
+
# واکه بلند مستقل: مثل «او»، «آب»، «نوش»
|
|
306
|
+
syllables += 1
|
|
307
|
+
prev_was_short_vowel = False
|
|
308
|
+
|
|
309
|
+
else:
|
|
310
|
+
prev_was_short_vowel = False
|
|
311
|
+
|
|
312
|
+
return max(syllables, 1)
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def _count_fa_syllables_pos(word: str, pos_tag: str) -> int:
    """Refine the written-vowel syllable estimate of *word* using its POS tag.

    - ``_TAG_NON_VERB_KHAH``: the base estimate is returned as is.
    - a tag in ``_VERB_TAGS_PARSIVAR`` on a word matching
      ``_VERB_PREFIX_ATTACHED``: one syllable is added.
    - words ending in «ترین» / «تر»: the estimate is raised to at least the
      stem's estimate plus 2 / plus 1.
    The result is never below 1.
    """
    estimate = _count_fa_syllables_base(word)

    if pos_tag == _TAG_NON_VERB_KHAH:
        return max(estimate, 1)

    # _TAG_FUTURE_AUX gets no adjustment of its own; it falls through.
    is_tagged_verb = pos_tag in _VERB_TAGS_PARSIVAR
    if is_tagged_verb and _VERB_PREFIX_ATTACHED.match(word):
        estimate += 1

    if len(word) > 4 and word.endswith("ترین"):
        from_stem = _count_fa_syllables_base(word[:-4]) + 2
        estimate = max(estimate, from_stem)
    elif len(word) > 2 and word.endswith("تر"):
        from_stem = _count_fa_syllables_base(word[:-2]) + 1
        estimate = max(estimate, from_stem)

    return max(estimate, 1)
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def count_syllables(word: str, pos_tag: str | None = None) -> int:
    """Count the syllables of *word*, dispatching on script and tag.

    Words with no character in U+0600–U+06FF use the Latin vowel-group counter.
    Other words use the POS-aware counter when *pos_tag* is given and the plain
    written-vowel counter otherwise.
    """
    for ch in word:
        if "\u0600" <= ch <= "\u06ff":
            break
    else:
        # No Arabic-block character anywhere in the word.
        return _count_en_syllables(word)

    if pos_tag is None:
        return _count_fa_syllables_base(word)
    return _count_fa_syllables_pos(word, pos_tag)
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
344
|
+
# توکنها و حروف
|
|
345
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
346
|
+
|
|
347
|
+
def _strip_punctuation(token: str) -> str:
|
|
348
|
+
i, j = 0, len(token)
|
|
349
|
+
while i < j and not token[i].isalpha():
|
|
350
|
+
i += 1
|
|
351
|
+
while j > i and not token[j - 1].isalpha():
|
|
352
|
+
j -= 1
|
|
353
|
+
return token[i:j]
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def _is_word_token(token: str) -> bool:
|
|
357
|
+
return any(ch.isalpha() for ch in token)
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def count_letters(words: list[str]) -> int:
    """Return the total number of alphabetic characters across all *words*."""
    total = 0
    for word in words:
        total += sum(1 for ch in word if ch.isalpha())
    return total
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
365
|
+
# Singletonها
|
|
366
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
367
|
+
|
|
368
|
+
@lru_cache(maxsize=1)
def _get_normalizer() -> Normalizer:
    """Create the hazm ``Normalizer``; ``lru_cache`` returns the cached instance on later calls."""
    normalizer = Normalizer()
    return normalizer
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
_parsivar_tagger = None
|
|
374
|
+
_parsivar_tagger_ready = False
|
|
375
|
+
_parsivar_tokenizer = None
|
|
376
|
+
_parsivar_tokenizer_ready = False
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def _get_parsivar_tagger() -> "ParsivarPOSTagger | None":
    """Return the module-wide Parsivar POS tagger, creating it on first use.

    Only one load attempt is ever made: the ready flag is set before the
    attempt, so later calls return the stored value (``None`` after a failure).
    Returns ``None`` when Parsivar is not importable.
    """
    global _parsivar_tagger, _parsivar_tagger_ready

    if _parsivar_tagger_ready:
        return _parsivar_tagger

    # Mark as attempted up front so a failing load is not retried on every call.
    _parsivar_tagger_ready = True

    if _PARSIVAR_AVAILABLE:
        try:
            _parsivar_tagger = ParsivarPOSTagger(tagging_model="wapiti")
            logger.info("Parsivar POSTagger loaded successfully.")
        except ImportError:
            logger.info("wapiti not installed — falling back to heuristic POS.")
        except Exception as exc:
            # Best-effort: any other load failure leaves the tagger unset.
            logger.info("Parsivar POSTagger could not be loaded (%s) — heuristic used.", type(exc).__name__)
        return _parsivar_tagger

    return None
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def _get_parsivar_tokenizer() -> "ParsivarTokenizer | None":
    """Return the module-wide Parsivar tokenizer, creating it on first use.

    Only one load attempt is ever made: the ready flag is set before the
    attempt, so later calls return the stored value (``None`` after a failure).
    Returns ``None`` when Parsivar is not importable.
    """
    global _parsivar_tokenizer, _parsivar_tokenizer_ready

    if _parsivar_tokenizer_ready:
        return _parsivar_tokenizer

    # Mark as attempted up front so a failing load is not retried on every call.
    _parsivar_tokenizer_ready = True

    if _PARSIVAR_AVAILABLE:
        try:
            _parsivar_tokenizer = ParsivarTokenizer()
            logger.info("Parsivar Tokenizer loaded successfully.")
        except Exception as exc:
            # Best-effort: a load failure is logged and the tokenizer stays unset.
            logger.warning("Parsivar Tokenizer could not be loaded: %s", exc)
        return _parsivar_tokenizer

    return None
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
412
|
+
# سطحبندی خوانایی
|
|
413
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
414
|
+
|
|
415
|
+
_READABILITY_LEVELS: list[tuple[int, str]] = [
|
|
416
|
+
(90, "بسیار آسان — مناسب کودکان دبستانی"),
|
|
417
|
+
(80, "آسان — مناسب نوجوانان"),
|
|
418
|
+
(70, "نسبتاً آسان — مناسب عموم مردم"),
|
|
419
|
+
(60, "متوسط — مناسب دانشآموزان دبیرستان"),
|
|
420
|
+
(50, "نسبتاً دشوار — مناسب دانشجویان"),
|
|
421
|
+
(30, "دشوار — مناسب متخصصان"),
|
|
422
|
+
(0, "بسیار دشوار — متون علمی/تخصصی"),
|
|
423
|
+
]
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def interpret_score(score: float) -> str:
    """Map *score* to the label of the first level in ``_READABILITY_LEVELS`` whose threshold it reaches.

    Scores that reach no threshold get the label of the last level.
    """
    matching_labels = (
        label for threshold, label in _READABILITY_LEVELS if score >= threshold
    )
    return next(matching_labels, _READABILITY_LEVELS[-1][1])
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
POS_MODE_PARSIVAR = "POS-enhanced — Parsivar"
|
|
434
|
+
POS_MODE_HEURISTIC = "morphological heuristic (بدون POS)"
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
@dataclass
class ReadabilityResult:
    """Counts, averages and final score produced by ``compute_flesch_dayani``."""
    sentences: int
    words: int
    letters: int
    syllables: int
    asl: float
    wl: float
    asyl: float  # raw (uncalibrated) value — reported for transparency
    flesch_dayani: float
    level: str
    pos_mode: str
    is_likely_poetry: bool = False
    diacritics_mode: bool = False  # True when the text was handled as diacritic-marked
    diacritic_ratio: float = 0.0  # ratio of diacritic marks to Persian characters

    @property
    def pos_enhanced(self) -> bool:
        """True when ``pos_mode`` starts with "POS-enhanced"."""
        return self.pos_mode.startswith("POS-enhanced")
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
459
|
+
# Tagging pipeline
|
|
460
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
461
|
+
|
|
462
|
+
def _tag_sentence_parsivar(sent, pv_tok, pv_tagger):
    """Tokenise *sent* with *pv_tok* and tag the cleaned word tokens with *pv_tagger*.

    Tokens without any alphabetic character are dropped and the rest are
    trimmed of surrounding punctuation.  Returns ``[]`` when nothing is left.
    If the tagger raises, the failure is logged and every word is returned
    with a ``None`` tag.
    """
    words = []
    for raw_token in pv_tok.tokenize_words(sent):
        if not _is_word_token(raw_token):
            continue
        trimmed = _strip_punctuation(raw_token)
        if trimmed:
            words.append(trimmed)

    if not words:
        return []

    try:
        tagged = pv_tagger.parse(words)
    except Exception as exc:
        # Deliberate best-effort: a tagger failure degrades to untagged words.
        logger.warning("Parsivar tagger failed, falling back to heuristic: %s", exc)
        return [(word, None) for word in words]
    return tagged
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
def _tag_sentence_heuristic(sent: str) -> list[tuple[str, str | None]]:
    """Split *sent* with the regex tokenizer and pair each cleaned word with a ``None`` tag.

    Tokens without any alphabetic character, and tokens that are empty after
    trimming surrounding punctuation, are left out.
    """
    trimmed_tokens = (
        _strip_punctuation(token)
        for token in _fast_word_tokenize(sent)
        if _is_word_token(token)
    )
    return [(word, None) for word in trimmed_tokens if word]
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
def _extract_tagged_words(sentences, pv_tok, pv_tagger):
    """Tag every sentence and return ``(tagged_words, pos_mode_label)``.

    The Parsivar path is taken only when both *pv_tok* and *pv_tagger* are
    available; otherwise the regex-based heuristic path is used.
    """
    collected = []
    parsivar_ready = pv_tagger is not None and pv_tok is not None

    if parsivar_ready:
        for sentence in sentences:
            collected += _tag_sentence_parsivar(sentence, pv_tok, pv_tagger)
        return collected, POS_MODE_PARSIVAR

    for sentence in sentences:
        collected += _tag_sentence_heuristic(sentence)
    return collected, POS_MODE_HEURISTIC
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
504
|
+
# تشخیص شعر و تقسیم جملات
|
|
505
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
506
|
+
|
|
507
|
+
_CAESURA_PATTERN = re.compile(r"\s{2,}")
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
def _split_hemistich_line(line: str) -> list[str]:
|
|
511
|
+
"""
|
|
512
|
+
خطوطی که دو مصراع در یک سطر دارند را تقسیم میکند.
|
|
513
|
+
|
|
514
|
+
در تایپ شعر فارسی مصراع اول و دوم اغلب با ۲+ فاصله جدا میشوند:
|
|
515
|
+
سواری و تیر و کمان و کمند عنان و رکیب و چه و چون و چند
|
|
516
|
+
|
|
517
|
+
شرط تقسیم: دقیقاً دو بخش، هر بخش ≥ ۳ کلمه و محتوای الفبایی.
|
|
518
|
+
در غیر این صورت خط دستنخورده برگردانده میشود.
|
|
519
|
+
"""
|
|
520
|
+
parts = _CAESURA_PATTERN.split(line)
|
|
521
|
+
if len(parts) == 2:
|
|
522
|
+
p1, p2 = parts[0].strip(), parts[1].strip()
|
|
523
|
+
if (any(ch.isalpha() for ch in p1) and len(p1.split()) >= 3
|
|
524
|
+
and any(ch.isalpha() for ch in p2) and len(p2.split()) >= 3):
|
|
525
|
+
return [p1, p2]
|
|
526
|
+
return [line]
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
def _split_into_sentences(normalized: str) -> list[str]:
    """
    Break *normalized* text into sentence strings.

    Each non-blank line is stripped, offered to ``_split_hemistich_line``, and
    every resulting segment is passed to ``sent_tokenize``.  Only sentences
    containing at least one alphabetic character are kept.
    """
    collected: list[str] = []
    stripped_lines = (raw.strip() for raw in normalized.splitlines())

    for line in stripped_lines:
        if not line:
            continue
        for segment in _split_hemistich_line(line):
            collected.extend(
                sentence
                for sentence in sent_tokenize(segment)
                if any(ch.isalpha() for ch in sentence)
            )
    return collected
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
def _detect_likely_poetry(raw_text: str, n_sentences: int, asl: float) -> bool:
|
|
549
|
+
"""
|
|
550
|
+
تشخیص احتمالی متن شعری — سه لایه تشخیص:
|
|
551
|
+
|
|
552
|
+
لایه ۱ — چندخطی با نسبت خط/جمله بالا:
|
|
553
|
+
خطوط کوتاه (< ۷۰ کاراکتر) + ASL ≤ ۱۲ + line_sent_ratio ≥ ۰.۷
|
|
554
|
+
اصلاح باگ: شرط strict «asl < 9» که شعر با ASL=9 تا ۱۲ را رد میکرد.
|
|
555
|
+
|
|
556
|
+
لایه ۲ — یکنواختی طول خطوط (coefficient of variation):
|
|
557
|
+
شعر: خطوط همطول → CV < ۰.۲۵ / نثر: طول خطوط متغیر
|
|
558
|
+
|
|
559
|
+
لایه ۳ — متن تکخطی کوتاه (شعر کپیشده روی یک سطر):
|
|
560
|
+
یک خط + ASL ≤ ۱۰ + طول < ۱۰۰ کاراکتر
|
|
561
|
+
"""
|
|
562
|
+
alpha_lines = [
|
|
563
|
+
ln.strip()
|
|
564
|
+
for ln in raw_text.splitlines()
|
|
565
|
+
if ln.strip() and any(ch.isalpha() for ch in ln)
|
|
566
|
+
]
|
|
567
|
+
if not alpha_lines or n_sentences == 0:
|
|
568
|
+
return False
|
|
569
|
+
|
|
570
|
+
n_lines = len(alpha_lines)
|
|
571
|
+
avg_line_len = sum(len(ln) for ln in alpha_lines) / n_lines
|
|
572
|
+
line_sent_ratio = n_lines / n_sentences
|
|
573
|
+
|
|
574
|
+
# ── لایه ۱: چندخطی با ASL معقول ─────────────────────────────────────────
|
|
575
|
+
if asl <= 12 and avg_line_len < 70 and line_sent_ratio >= 0.7:
|
|
576
|
+
return True
|
|
577
|
+
|
|
578
|
+
# ── لایه ۲: یکنواختی طول خطوط (ویژهترین نشانه شعر) ────────────────────
|
|
579
|
+
if n_lines >= 2:
|
|
580
|
+
lengths = [len(ln) for ln in alpha_lines]
|
|
581
|
+
mean_len = sum(lengths) / n_lines
|
|
582
|
+
if mean_len > 0:
|
|
583
|
+
variance = sum((x - mean_len) ** 2 for x in lengths) / n_lines
|
|
584
|
+
cv = variance ** 0.5 / mean_len
|
|
585
|
+
if cv < 0.25 and avg_line_len < 80 and asl <= 14:
|
|
586
|
+
return True
|
|
587
|
+
|
|
588
|
+
# ── لایه ۳: متن تکخطی کوتاه (شعر بر یک سطر) ──────────────────────────
|
|
589
|
+
if n_lines == 1 and asl <= 10 and avg_line_len < 100:
|
|
590
|
+
return True
|
|
591
|
+
|
|
592
|
+
return False
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
596
|
+
# محاسبه اصلی
|
|
597
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
598
|
+
|
|
599
|
+
def compute_flesch_dayani(text: str, mode: InputMode | str = InputMode.AUTO) -> ReadabilityResult:
    """
    Compute the Flesch–Dayani readability index for Persian text.

    Formula (Dayani, 1374 SH):
        FDR = 262.835 − 84.6 × ASYL_calibrated − 1.015 × ASL

    Args:
        text: the raw input text.
        mode: ``InputMode`` member or its string value.
            AUTO       — use the diacritic counter when ``analyze_diacritics``
                         reports meaningful diacritics, the plain one otherwise.
            DIACRITICS — always use the diacritic-aware syllable counter.
            PLAIN      — always use the written-vowel syllable counter.

    Calibration factor applied to ASYL:
        prose, plain text      → ``_ASYL_CALIBRATION_PROSE``
        poetry, plain text     → ``_ASYL_CALIBRATION_POETRY``
        prose, diacritic text  → 1.0
        poetry, diacritic text → 1.5

    Returns:
        A ``ReadabilityResult``; its ``asyl`` field is the raw, uncalibrated
        average.

    Raises:
        ValueError: if *mode* is a string that is not a valid ``InputMode``
            value, or if the text yields no sentences, no words or no letters.

    Pipeline:
        split hemistichs → normalize → split sentences → tag → annotate «خواه»
        → count syllables → detect poetry → calibrate ASYL → score
    """
    # Validate the mode first so a bad value fails before any tokenisation or
    # model loading takes place.
    if isinstance(mode, str):
        mode = InputMode(mode)

    normalizer = _get_normalizer()

    # Two-hemistich lines must be split before normalisation, because (per the
    # original note) the hazm normalizer collapses the multi-space caesura.
    pre_split_lines: list[str] = []
    for raw_line in text.splitlines():
        raw_line = raw_line.strip()
        if not raw_line:
            pre_split_lines.append("")
            continue
        pre_split_lines.extend(_split_hemistich_line(raw_line))
    pre_split_text = "\n".join(pre_split_lines)

    normalized = normalizer.normalize(pre_split_text)
    raw_sentences = _split_into_sentences(normalized)
    if not raw_sentences:
        raise ValueError("متن پس از نرمالسازی هیچ جملهای ندارد.")

    tagged_words, pos_mode = _extract_tagged_words(
        raw_sentences,
        pv_tok=_get_parsivar_tokenizer(),
        pv_tagger=_get_parsivar_tagger(),
    )

    n_sentences = len(raw_sentences)
    n_words = len(tagged_words)

    if n_words == 0:
        raise ValueError(
            "پس از پاکسازی علائم نشانهگذاری، هیچ کلمهای در متن یافت نشد."
        )

    all_words = [w for w, _ in tagged_words]
    n_letters = count_letters(all_words)
    if n_letters == 0:
        raise ValueError("هیچ حرف الفبایی در متن یافت نشد.")

    if n_words < _MIN_WORDS_RELIABLE:
        logger.warning(
            "Text has only %d words; score may be unreliable (recommend >= %d).",
            n_words, _MIN_WORDS_RELIABLE,
        )

    tagged_words = _annotate_khah_tokens(tagged_words)

    # ── Diacritic mode selection ──────────────────────────────────────────────
    # The ratio is measured on the original, un-normalised text.
    diac_info = analyze_diacritics(text)
    if mode == InputMode.AUTO:
        use_diacritics = diac_info["has_diacritics"]
    elif mode == InputMode.DIACRITICS:
        use_diacritics = True
    else:
        use_diacritics = False

    # ── Syllable counting ─────────────────────────────────────────────────────
    if use_diacritics:
        # POS tags are not used on this path; Latin-script words still go
        # through the English counter.
        n_syllables = sum(
            _count_fa_syllables_diacritic(w)
            if any("\u0600" <= c <= "\u06ff" for c in w)
            else _count_en_syllables(w)
            for w, _ in tagged_words
        )
        logger.info(
            "Diacritic mode active (ratio=%.2f). Using exact syllable count, calibration=1.0.",
            diac_info["diacritic_ratio"],
        )
    else:
        n_syllables = sum(count_syllables(w, tag) for w, tag in tagged_words)

    asl = n_words / n_sentences
    wl = n_letters / n_words
    asyl = n_syllables / n_words

    # ── ASYL calibration ──────────────────────────────────────────────────────
    is_poetry = _detect_likely_poetry(pre_split_text, n_sentences, asl)

    if is_poetry and use_diacritics:
        # Diacritic-marked verse: a middle factor is used to avoid inflated
        # scores (> 100) that would otherwise occur.
        calibration_factor = 1.5
        logger.info(
            "Poetry detected in diacritic mode (ASL=%.1f). "
            "Using calibration=1.5 to prevent inflated scores.",
            asl,
        )
    elif is_poetry:
        # Unmarked verse: compensate for unwritten syllables.
        calibration_factor = _ASYL_CALIBRATION_POETRY
    elif use_diacritics:
        # Diacritic-marked prose: no compensation applied.
        calibration_factor = 1.0
    else:
        # Unmarked prose: compensate for unwritten syllables.
        calibration_factor = _ASYL_CALIBRATION_PROSE

    asyl_calibrated = asyl * calibration_factor
    score = 262.835 - 84.6 * asyl_calibrated - 1.015 * asl

    if is_poetry:
        logger.warning(
            "Text appears to be verse/poetry (ASL=%.1f, factor=%.1f, diacritics=%s). "
            "Score calibrated for classical Persian poetry.",
            asl, calibration_factor, use_diacritics,
        )

    return ReadabilityResult(
        sentences=n_sentences,
        words=n_words,
        letters=n_letters,
        syllables=n_syllables,
        asl=asl,
        wl=wl,
        asyl=asyl,
        flesch_dayani=score,
        level=interpret_score(score),
        pos_mode=pos_mode,
        is_likely_poetry=is_poetry,
        diacritics_mode=use_diacritics,
        diacritic_ratio=round(diac_info["diacritic_ratio"], 3),
    )
|
|
742
|
+
|
|
743
|
+
|
|
744
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
745
|
+
# CLI
|
|
746
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
747
|
+
|
|
748
|
+
def parse_args(argv=None):
    """Parse the command-line options of the readability CLI.

    Args:
        argv: Argument strings to parse; ``None`` makes argparse fall back
            to ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` with ``file``, ``text``, ``plain``,
        ``verbose`` and ``mode`` attributes.
    """
    cli = argparse.ArgumentParser(
        description="Persian Flesch–Dayani readability index calculator"
    )

    # --file and --text are alternative input sources, so only one may be given.
    source = cli.add_mutually_exclusive_group()
    source_specs = (
        (("-f", "--file"), "Path to a UTF-8 Persian text file"),
        (("-t", "--text"), "Persian text to analyze"),
    )
    for flags, description in source_specs:
        source.add_argument(*flags, type=str, help=description)

    # Plain boolean switches.
    switch_specs = (
        ("--plain", "Print raw score only"),
        ("--verbose", "Enable debug logging"),
    )
    for flag, description in switch_specs:
        cli.add_argument(flag, action="store_true", help=description)

    mode_help = (
        "Syllable counting mode: auto (default), diacritics (معرب), plain (بدون اعراب)"
    )
    cli.add_argument(
        "--mode",
        default="auto",
        choices=["auto", "diacritics", "plain"],
        help=mode_help,
    )

    return cli.parse_args(argv)
|
|
764
|
+
|
|
765
|
+
|
|
766
|
+
def main(argv=None):
    """Run the command-line interface.

    Takes the text from ``--file`` or ``--text`` or, when neither is given,
    from stdin; scores it with ``compute_flesch_dayani``; then prints either
    the bare score (``--plain``) or a formatted report to stdout.

    Args:
        argv: Argument list forwarded to ``parse_args``; ``None`` lets
            argparse read the process arguments.

    Exits with status 1, after printing a message to stderr, when the file
    cannot be read or decoded, when the text is empty or whitespace-only, or
    when scoring raises ``ValueError``.
    """
    args = parse_args(argv)

    if args.verbose:
        logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

    if args.file:
        try:
            with open(args.file, "r", encoding="utf-8") as f:
                text = f.read()
        except (OSError, UnicodeDecodeError) as e:
            # UnicodeDecodeError is a ValueError, not an OSError: without it a
            # non-UTF-8 file would end in a traceback instead of this message.
            print(f"خطا در خواندن فایل: {e}", file=sys.stderr)
            sys.exit(1)
    elif args.text is not None:
        # An explicit but empty -t "" must reach the empty-text check below
        # rather than silently falling through to a blocking stdin read.
        text = args.text
    else:
        if sys.stdin.isatty():
            print("در انتظار متن از stdin (Ctrl+D برای پایان)...", file=sys.stderr)
        text = sys.stdin.read()

    if not text or not text.strip():
        print("خطا: متن خالی است.", file=sys.stderr)
        sys.exit(1)

    try:
        result = compute_flesch_dayani(text, mode=args.mode)
    except ValueError as e:
        print(f"خطا: {e}", file=sys.stderr)
        sys.exit(1)

    if args.plain:
        # Script-friendly output: the score only.
        print(f"{result.flesch_dayani:.2f}")
        return

    W = 54  # width of the horizontal rules in the report
    print("═" * W)
    print(" Persian Readability — Flesch–Dayani")
    print("═" * W)
    print(f" جملات : {result.sentences}")
    print(f" کلمات : {result.words}")
    print(f" حروف : {result.letters}")
    print(f" هجاها : {result.syllables}")
    print(f" روش : {result.pos_mode}")
    print("─" * W)
    print(f" ASL (کلمه/جمله) : {result.asl:.2f}")
    print(f" WL (حرف/کلمه) : {result.wl:.2f}")
    print(f" ASYL (هجا/کلمه) : {result.asyl:.2f}")
    print("─" * W)
    print(f" امتیاز Flesch–Dayani : {result.flesch_dayani:.2f}")
    print(f" سطح خوانایی : {result.level}")
    if result.diacritics_mode:
        # The scorer applies factor 1.5 to verse in diacritic mode and 1.0 to
        # prose in diacritic mode; report the factor that was actually used.
        calibration = 1.5 if result.is_likely_poetry else 1.0
        print("─" * W)
        print(f" حالت : اعرابدار (diacritic mode) — calibration={calibration:.1f}")
        print(f" نسبت اعراب: {result.diacritic_ratio:.1%}")
    if result.is_likely_poetry:
        print("─" * W)
        print(" ⚠ متن شعری تشخیص داده شد")
        print(" فرمول دیانی برای نثر کالیبره شده.")
        print(" ASL کوتاه مصراعها باعث تورم امتیاز میشود.")
    print("═" * W)
|
|
826
|
+
|
|
827
|
+
|
|
828
|
+
# Script entry point: run the CLI only when this module is executed directly.
if __name__ == "__main__":
|
|
829
|
+
main()
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: persian-readability
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: A lightweight Python tool for Persian/Farsi readability analysis using the Flesch-Dayani formula.
|
|
5
|
+
Author-email: Mohammad Pirouzan <mohammadpirouzan@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Drpirouzan/Persian-Readability
|
|
8
|
+
Project-URL: Repository, https://github.com/Drpirouzan/Persian-Readability
|
|
9
|
+
Project-URL: Issues, https://github.com/Drpirouzan/Persian-Readability/issues
|
|
10
|
+
Keywords: persian,farsi,readability,nlp,flesch,flesch-dayani,text-analysis,persian-language,education,accessibility
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Education
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Natural Language :: Persian
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Text Processing
|
|
22
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
23
|
+
Requires-Python: >=3.10
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: hazm
|
|
27
|
+
Provides-Extra: pos
|
|
28
|
+
Requires-Dist: parsivar; extra == "pos"
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest; extra == "dev"
|
|
31
|
+
Requires-Dist: build; extra == "dev"
|
|
32
|
+
Requires-Dist: twine; extra == "dev"
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
# Persian Readability (Flesch–Dayani)
|
|
36
|
+
|
|
37
|
+
A lightweight Python package and command-line tool to calculate the **Flesch–Dayani readability score** for Persian (Farsi) text — with an optional POS-enhanced syllable counter for higher accuracy.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Features
|
|
42
|
+
|
|
43
|
+
- Persian text normalization and tokenization via `hazm`
|
|
44
|
+
- **Punctuation-aware tokenization** — علائم نشانهگذاری از شمارش کلمات و هجاها حذف میشوند
|
|
45
|
+
- **Two-tier syllable counting:**
|
|
46
|
+
- **POS-enhanced** (Better Accuracy) — if `parsivar` is installed, uses part-of-speech tags to correctly count syllables in verbs with attached prefixes (`میرود`، `نمیدانم`) and comparative adjectives (`بهتر`، `بزرگترین`)
|
|
47
|
+
- **Morphological heuristic** (Good Accuracy) — used automatically if `parsivar` is not installed
|
|
48
|
+
- **Context-aware خواه classifier** — three-layer disambiguation prevents confusing `خواهش`, `خواهر`, `آزادیخواه`, and `خواه ... خواه ...` with the future auxiliary (`خواهم رفت`)
|
|
49
|
+
- Computes:
|
|
50
|
+
- Number of sentences, words, letters, and syllables
|
|
51
|
+
- **ASL** — Average Sentence Length (words per sentence)
|
|
52
|
+
- **WL** — Average Word Length (letters per word)
|
|
53
|
+
- **ASYL** — Average Syllables per Word *(used in the original Dayani formula)*
|
|
54
|
+
- Flesch–Dayani readability score
|
|
55
|
+
- **Human-readable level** (e.g. *متوسط — مناسب دانشآموزان دبیرستان*)
|
|
56
|
+
- Accepts input from a file, a command-line argument, or **stdin** (pipe-friendly)
|
|
57
|
+
- `--plain` flag for scripting and pipeline use
|
|
58
|
+
- `--verbose` flag for debug logging
|
|
59
|
+
- Warns when text is too short for a reliable score (< 50 words)
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Readability Levels
|
|
64
|
+
|
|
65
|
+
| Score | Level |
|
|
66
|
+
|-------|-------|
|
|
67
|
+
| ≥ 90 | بسیار آسان — مناسب کودکان دبستانی |
|
|
68
|
+
| ≥ 80 | آسان — مناسب نوجوانان |
|
|
69
|
+
| ≥ 70 | نسبتاً آسان — مناسب عموم مردم |
|
|
70
|
+
| ≥ 60 | متوسط — مناسب دانشآموزان دبیرستان |
|
|
71
|
+
| ≥ 50 | نسبتاً دشوار — مناسب دانشجویان |
|
|
72
|
+
| ≥ 30 | دشوار — مناسب متخصصان |
|
|
73
|
+
| < 30 | بسیار دشوار — متون علمی/تخصصی |
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Installation
|
|
78
|
+
|
|
79
|
+
Install from PyPI:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
pip install persian-readability
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
For local development:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
pip install -e ".[dev]"
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
For optional POS-enhanced syllable counting:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
pip install "persian-readability[pos]"
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Requirements
|
|
100
|
+
|
|
101
|
+
### Required
|
|
102
|
+
|
|
103
|
+
- Python **3.10** or newer
|
|
104
|
+
- [`hazm`](https://github.com/roshan-research/hazm) — Persian NLP library
|
|
105
|
+
|
|
106
|
+
```
|
|
107
|
+
pip install hazm
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Optional (for higher syllable accuracy)
|
|
111
|
+
|
|
112
|
+
- [`parsivar`](https://github.com/ICTRC/Parsivar) — Persian preprocessing toolkit with POS tagger
|
|
113
|
+
|
|
114
|
+
```
|
|
115
|
+
pip install parsivar
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
> If `parsivar` is not installed, the script falls back to the morphological heuristic automatically — no configuration needed.
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## Usage
|
|
123
|
+
|
|
124
|
+
**Direct text:**
|
|
125
|
+
|
|
126
|
+
```
|
|
127
|
+
persian-readability -t "متن فارسی شما"
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
**From a file:**
|
|
131
|
+
|
|
132
|
+
```
|
|
133
|
+
persian-readability -f sample.txt
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
**From stdin (pipe):**
|
|
137
|
+
|
|
138
|
+
```
|
|
139
|
+
echo "متن فارسی شما" | persian-readability
|
|
140
|
+
cat article.txt | persian-readability
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
**Raw score only (for scripting):**
|
|
144
|
+
|
|
145
|
+
```
|
|
146
|
+
persian-readability -f sample.txt --plain
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
**With debug logging:**
|
|
150
|
+
|
|
151
|
+
```
|
|
152
|
+
persian-readability -f sample.txt --verbose
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
---
|
|
156
|
+
|
|
157
|
+
## Python API Usage
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from persian_readability import calculate_readability
|
|
161
|
+
|
|
162
|
+
result = calculate_readability("برای پیشگیری از پوسیدگی دندان، روزی دو بار مسواک بزنید.")
|
|
163
|
+
print(result)
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Real-World Examples
|
|
169
|
+
|
|
170
|
+
### Example 1 — Public health text
|
|
171
|
+
|
|
172
|
+
**Input:**
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
persian-readability -t "برای پیشگیری از پوسیدگی دندان، بهتر است روزی دو بار مسواک بزنید و مصرف مواد قندی را کاهش دهید."
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
**Possible use case:**
|
|
179
|
+
|
|
180
|
+
This can help public health educators check whether patient-facing Persian health messages are simple enough for the general public.
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
### Example 2 — Academic text
|
|
185
|
+
|
|
186
|
+
**Input:**
|
|
187
|
+
|
|
188
|
+
```bash
|
|
189
|
+
persian-readability -t "شاخصهای زیستی بزاقی میتوانند در تشخیص زودهنگام برخی بیماریهای دهان و فک و صورت نقش مهمی داشته باشند."
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
**Possible use case:**
|
|
193
|
+
|
|
194
|
+
Researchers can compare the readability of Persian academic summaries, abstracts, or educational materials.
|
|
195
|
+
|
|
196
|
+
---
|
|
197
|
+
|
|
198
|
+
### Example 3 — Pipeline use
|
|
199
|
+
|
|
200
|
+
**Input:**
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
cat article.txt | persian-readability --plain
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
**Possible use case:**
|
|
207
|
+
|
|
208
|
+
Developers can integrate the readability score into larger Persian NLP or content-quality workflows.
|
|
209
|
+
|
|
210
|
+
---
|
|
211
|
+
|
|
212
|
+
## Sample Output
|
|
213
|
+
|
|
214
|
+
```text
|
|
215
|
+
══════════════════════════════════════════════════════
|
|
216
|
+
Persian Readability — Flesch–Dayani
|
|
217
|
+
══════════════════════════════════════════════════════
|
|
218
|
+
جملات : 5
|
|
219
|
+
کلمات : 87
|
|
220
|
+
حروف : 412
|
|
221
|
+
هجاها : 201
|
|
222
|
+
روش : POS-enhanced — Parsivar
|
|
223
|
+
────────────────────────────────────────────────────
|
|
224
|
+
ASL (کلمه/جمله) : 17.40
|
|
225
|
+
WL (حرف/کلمه) : 4.74
|
|
226
|
+
ASYL (هجا/کلمه) : 2.31
|
|
227
|
+
────────────────────────────────────────────────────
|
|
228
|
+
امتیاز Flesch–Dayani : 58.34
|
|
229
|
+
سطح خوانایی : نسبتاً دشوار — مناسب دانشجویان
|
|
230
|
+
══════════════════════════════════════════════════════
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
## Formula
|
|
236
|
+
|
|
237
|
+
```
|
|
238
|
+
FDR = 262.835 − 84.6 × ASYL − 1.015 × ASL
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
Where **ASYL** = average syllables per word and **ASL** = average words per sentence.
|
|
242
|
+
Higher scores indicate easier text.
|
|
243
|
+
|
|
244
|
+
---
|
|
245
|
+
|
|
246
|
+
## How Syllable Accuracy Tiers Work
|
|
247
|
+
|
|
248
|
+
| Mode | Accuracy | How |
|
|
249
|
+
|------|----------|-----|
|
|
250
|
+
| POS-enhanced | ~85% | Parsivar POSTagger (wapiti CRF, Bijankhan corpus) detects verb/adjective tags; prefix/suffix rules applied per POS |
|
|
251
|
+
| Morphological heuristic | ~75% | Counts written long vowels (ا و ی), diacritics, and word-final ه; no POS context |
|
|
252
|
+
|
|
253
|
+
Main cases where POS tagging improves accuracy:
|
|
254
|
+
|
|
255
|
+
- Verbs with attached `می`/`نمی` prefix (no half-space): `میرود` → +1 syllable
|
|
256
|
+
- Comparative/superlative adjectives: `بهترین` → suffix `ترین` = 2 syllables
|
|
257
|
+
|
|
258
|
+
### خواه Classifier
|
|
259
|
+
|
|
260
|
+
The word `خواه` has multiple roles in Persian. A three-layer classifier resolves ambiguity **before** syllable counting:
|
|
261
|
+
|
|
262
|
+
| Label | Examples | Treatment |
|
|
263
|
+
|-------|---------|-----------|
|
|
264
|
+
| `FUTURE_AUX` | خواهم رفت، نخواهند پذیرفت | syllable count unchanged (هجاشماری base درست است) |
|
|
265
|
+
| `LEXICAL_KHASTAN` | خواهد که برود، این را خواهد | tag اصلی حفظ میشود |
|
|
266
|
+
| `PARTICLE_KHAH` | خواه بیاید خواه نیاید | treated as non-verb |
|
|
267
|
+
| `NOMINAL_DERIVATIVE` | خواهش، خواهان، خواهنده | treated as non-verb |
|
|
268
|
+
| `INDEPENDENT_WORD` | خواهر، خواهران | treated as non-verb |
|
|
269
|
+
| `SUFFIX_COMPOUND` | آزادیخواه، خیرخواه، دادخواه | treated as non-verb |
|
|
270
|
+
|
|
271
|
+
The classifier uses exact lexical sets (layer 1), suffix-compound detection (layer 2), and a 2-token context window (layer 3) — never a simple prefix regex.
|
|
272
|
+
|
|
273
|
+
---
|
|
274
|
+
|
|
275
|
+
## Notes
|
|
276
|
+
|
|
277
|
+
- **Minimum text length:** The Flesch–Dayani formula is designed for running prose. Texts shorter than ~50 words produce unstable scores. A warning is emitted in this case (visible with `--verbose`).
|
|
278
|
+
- **Punctuation filtering:** علائم نشانهگذاری فارسی و لاتین (گیومه، نقطه، ویرگول، ...) از لبههای هر توکن پاک میشوند و توکنهای تمامعلامت از شمارش حذف میشوند.
|
|
279
|
+
- **stdin:** When running interactively without `-t` or `-f`, the script waits for input and prints a prompt. Press `Ctrl+D` to signal end of input.
|
|
280
|
+
- **Log messages:** All warnings go to stderr and do not affect `--plain` output.
|
|
281
|
+
|
|
282
|
+
---
|
|
283
|
+
|
|
284
|
+
## Running Tests
|
|
285
|
+
|
|
286
|
+
```
|
|
287
|
+
pip install pytest hazm
|
|
288
|
+
python -m pytest tests/test_core.py -v
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
76 tests covering: خواه classifier (all 9 document cases), punctuation filtering,
|
|
292
|
+
syllable counting, heuristic limitations, formula verification, and edge cases.
|
|
293
|
+
|
|
294
|
+
---
|
|
295
|
+
|
|
296
|
+
## References
|
|
297
|
+
|
|
298
|
+
- Dayani, M. (1374/1995). *سنجش خوانایی متون فارسی*. Persian adaptation of the Flesch Reading Ease formula.
|
|
299
|
+
- Mohtaj et al. (2018). [Parsivar: A Language Processing Toolkit for Persian](https://github.com/ICTRC/Parsivar). LREC 2018.
|
|
300
|
+
- Mohammadi & Khasteh (2020). [A Machine Learning Approach to Persian Text Readability](https://arxiv.org/abs/1810.06639).
|
|
301
|
+
- Sobhe. [hazm — Persian NLP library](https://github.com/roshan-research/hazm).
|
|
302
|
+
|
|
303
|
+
---
|
|
304
|
+
|
|
305
|
+
## Author
|
|
306
|
+
|
|
307
|
+
**Dr. Mohammad Pirouzan** — [@Drpirouzan](https://github.com/Drpirouzan)
|
|
308
|
+
|
|
309
|
+
---
|
|
310
|
+
|
|
311
|
+
## License
|
|
312
|
+
|
|
313
|
+
MIT License — see [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
persian_readability/__init__.py,sha256=i3dnAbmmnvXJVmHu6y57zQ3eG6xDLA7lTJM-QxfADKM,1828
|
|
2
|
+
persian_readability/core.py,sha256=-LiY_hbY4vKc4a9Bps7TYq3jZYXbqJnW9Cc1HH57tvM,36611
|
|
3
|
+
persian_readability-0.1.2.dist-info/licenses/LICENSE,sha256=mMvLcUywQEQfAD0nMKHahrWbC0CtVobu-HD9yEckEsM,1078
|
|
4
|
+
persian_readability-0.1.2.dist-info/METADATA,sha256=m_rWotquBiMTWVxSuYkYczium6jDjy2hS8hEdiKOPdI,10882
|
|
5
|
+
persian_readability-0.1.2.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
6
|
+
persian_readability-0.1.2.dist-info/entry_points.txt,sha256=Kw1bYo_Lt-2nfNRFLEF9vCzmm5fzu_q5fQtvOHljV78,70
|
|
7
|
+
persian_readability-0.1.2.dist-info/top_level.txt,sha256=lyKW6_tx5c2Zy_2VuCzgfEttxY8uZ2KJCkCmaaGuDzE,20
|
|
8
|
+
persian_readability-0.1.2.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Dr. Mohammad Pirouzan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
persian_readability
|