hindi-readability 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindi_readability-0.1.0/LICENSE +21 -0
- hindi_readability-0.1.0/PKG-INFO +202 -0
- hindi_readability-0.1.0/README.md +172 -0
- hindi_readability-0.1.0/hindi_readability/__init__.py +32 -0
- hindi_readability-0.1.0/hindi_readability/formulas.py +184 -0
- hindi_readability-0.1.0/hindi_readability/scorer.py +161 -0
- hindi_readability-0.1.0/hindi_readability/script.py +168 -0
- hindi_readability-0.1.0/hindi_readability.egg-info/PKG-INFO +202 -0
- hindi_readability-0.1.0/hindi_readability.egg-info/SOURCES.txt +13 -0
- hindi_readability-0.1.0/hindi_readability.egg-info/dependency_links.txt +1 -0
- hindi_readability-0.1.0/hindi_readability.egg-info/requires.txt +5 -0
- hindi_readability-0.1.0/hindi_readability.egg-info/top_level.txt +1 -0
- hindi_readability-0.1.0/pyproject.toml +43 -0
- hindi_readability-0.1.0/setup.cfg +4 -0
- hindi_readability-0.1.0/tests/test_all.py +138 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Prabhat Chaudhary
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: hindi-readability
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: The first Python package for measuring readability of Hindi text using Devanagari-aware formulas
|
|
5
|
+
Author-email: Prabhat Chaudhary <raja1999chaudhary@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Erprabhat8423/hindi-readability
|
|
8
|
+
Project-URL: Repository, https://github.com/Erprabhat8423/hindi-readability
|
|
9
|
+
Keywords: hindi,readability,nlp,devanagari,indic,flesch,grade-level,text-analysis,education
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Education
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
22
|
+
Classifier: Natural Language :: Hindi
|
|
23
|
+
Requires-Python: >=3.8
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
28
|
+
Requires-Dist: build; extra == "dev"
|
|
29
|
+
Requires-Dist: twine; extra == "dev"
|
|
30
|
+
|
|
31
|
+
# hindi-readability šš®š³
|
|
32
|
+
|
|
33
|
+
**The first Python package for measuring the readability of Hindi text.**
|
|
34
|
+
|
|
35
|
+
Zero external dependencies. Pure Python 3.9+.
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## The Problem
|
|
40
|
+
|
|
41
|
+
English has Flesch-Kincaid, Gunning Fog, and ARI ā readability formulas used in MS Word since 1992. **Hindi has nothing.**
|
|
42
|
+
|
|
43
|
+
India has 24.8 crore school students, 886 million internet users consuming Hindi content, and 14.7 lakh schools ā all producing and consuming Hindi text with no way to automatically measure whether it is easy or hard to read.
|
|
44
|
+
|
|
45
|
+
This package fills that gap with three **original formulas** designed specifically for Devanagari script.
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Installation
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install hindi-readability
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Quick Start
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from hindi_readability import ReadabilityScorer
|
|
61
|
+
|
|
62
|
+
rs = ReadabilityScorer()
|
|
63
|
+
|
|
64
|
+
# Simple sentence
|
|
65
|
+
result = rs.score("यह ą¤ą¤ सरल ą¤µą¤¾ą¤ą„य ą¤¹ą„ą„¤")
|
|
66
|
+
print(result["hrs"]) # Hindi Readability Score (0-100)
|
|
67
|
+
print(result["label"]) # "Easy"
|
|
68
|
+
print(result["grade_label"]) # "Class 3ā5"
|
|
69
|
+
print(result["cbse_level"]) # "Prathmik Uttara (Upper Primary)"
|
|
70
|
+
|
|
71
|
+
# Constitutional text ā hard
|
|
72
|
+
result = rs.score("ą¤øą¤ą¤µą¤æą¤§ą¤¾ą¤Ø ą¤ą„ ą¤Ŗą„ą¤°ą¤øą„तावना ą¤®ą„ą¤ ą¤ą¤¾ą¤°ą¤¤ ą¤ą„ ą¤ą¤ ą¤øą¤ą¤Ŗą„ą¤°ą¤ą„, ą¤øą¤®ą¤¾ą¤ą¤µą¤¾ą¤¦ą„, ą¤§ą¤°ą„ą¤®ą¤Øą¤æą¤°ą¤Ŗą„ą¤ą„ष, ą¤²ą„ą¤ą¤¤ą¤¾ą¤ą¤¤ą„ą¤°ą¤æą¤ ą¤ą¤£ą¤°ą¤¾ą¤ą„य ą¤ą„षित ą¤ą¤æą¤Æą¤¾ ą¤ą¤Æą¤¾ ą¤¹ą„ą„¤")
|
|
73
|
+
print(result["hrs"]) # 0.0
|
|
74
|
+
print(result["label"]) # "Expert"
|
|
75
|
+
print(result["grade_label"])# "College+"
|
|
76
|
+
|
|
77
|
+
# Compare multiple texts ā sorted easiest first
|
|
78
|
+
texts = [
|
|
79
|
+
"ą¤¬ą¤ą„ą¤ą„ ą¤ą„ą¤²ą¤¤ą„ ą¤¹ą„ą¤ą„¤",
|
|
80
|
+
"ą¤ą¤¾ą¤°ą¤¤ ą¤ą„ ą¤¶ą¤æą¤ą„षा ą¤Øą„ą¤¤ą¤æ बदल ą¤°ą¤¹ą„ ą¤¹ą„ą„¤",
|
|
81
|
+
"ą¤øą¤ą¤µą„ą¤§ą¤¾ą¤Øą¤æą¤ ą¤Ŗą„ą¤°ą¤¾ą¤µą¤§ą¤¾ą¤Øą„ą¤ ą¤ą„ ą¤
ą¤Øą„ą¤øą¤¾ą¤° ą¤Øą¤¾ą¤ą¤°ą¤æą¤ą„ą¤ ą¤ą„ ą¤®ą„ą¤² ą¤
ą¤§ą¤æą¤ą¤¾ą¤° ą¤øą„ą¤°ą¤ą„षित ą¤¹ą„ą¤ą„¤",
|
|
82
|
+
]
|
|
83
|
+
ranked = rs.compare(texts)
|
|
84
|
+
for r in ranked:
|
|
85
|
+
print(f"{r['hrs']:5.1f} {r['label']:12} {r['text'][:40]}")
|
|
86
|
+
|
|
87
|
+
# Get simplification suggestions
|
|
88
|
+
suggestions = rs.simplify_suggestions("ą¤øą¤ą¤µą„ą¤§ą¤¾ą¤Øą¤æą¤ ą¤Ŗą„ą¤°ą¤¾ą¤µą¤§ą¤¾ą¤Øą„ą¤ ą¤ą„ ą¤
ą¤Øą„ą¤øą¤¾ą¤°...")
|
|
89
|
+
for s in suggestions:
|
|
90
|
+
print(s)
|
|
91
|
+
|
|
92
|
+
# Check if appropriate for a school grade
|
|
93
|
+
rs.is_appropriate_for_grade("यह सरल ą¤Ŗą¤¾ą¤ ą¤¹ą„ą„¤", grade=5) # True/False
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## The Three Formulas
|
|
99
|
+
|
|
100
|
+
### 1. Hindi Readability Score (HRS)
|
|
101
|
+
An ease score from **0 to 100** ā higher means easier. Inspired by Flesch Reading Ease but redesigned for Devanagari.
|
|
102
|
+
|
|
103
|
+
| Score | Label | Suitable for |
|
|
104
|
+
|-------|-------|-------------|
|
|
105
|
+
| 90ā100 | Very easy | Class 1ā2 |
|
|
106
|
+
| 70ā89 | Easy | Class 3ā5 |
|
|
107
|
+
| 50ā69 | Standard | Class 6ā8 |
|
|
108
|
+
| 30ā49 | Difficult | Class 9ā10 |
|
|
109
|
+
| 10ā29 | Very hard | Class 11ā12 |
|
|
110
|
+
| 0ā9 | Expert | College+ |
|
|
111
|
+
|
|
112
|
+
**Formula:**
|
|
113
|
+
```
|
|
114
|
+
HRS = 206.0
|
|
115
|
+
- (60.0 × avg_syllables_per_word)
|
|
116
|
+
- (1.8 × avg_words_per_sentence)
|
|
117
|
+
- (70.0 × conjunct_density)
|
|
118
|
+
- (8.0 × matra_complexity)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### 2. Hindi Grade Level (HGL)
|
|
122
|
+
Maps HRS to Indian school grades (CBSE Class 1 to College+).
|
|
123
|
+
|
|
124
|
+
### 3. Hindi Complexity Index (HCI)
|
|
125
|
+
A normalized 0ā1 score. Lower = easier. Useful for ML pipelines.
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## Why These Formulas Are Different
|
|
130
|
+
|
|
131
|
+
| Feature | English (Flesch-Kincaid) | Hindi (this package) |
|
|
132
|
+
|---------|--------------------------|---------------------|
|
|
133
|
+
| Syllable counting | English phoneme rules | Devanagari matra-based |
|
|
134
|
+
| Conjunct detection | Not applicable | ā Virama-based detection |
|
|
135
|
+
| Script-aware | No | ā Full Unicode U+0900āU+097F |
|
|
136
|
+
| Long vowel complexity | No | ā Guru/laghu distinction |
|
|
137
|
+
| CBSE grade mapping | No | ā Class 1ā12 + College |
|
|
138
|
+
|
|
139
|
+
**Conjunct consonants** (ą¤øą¤ą¤Æą„ą¤ą„त ą¤
ą¤ą„षर) ā formed when a virama (ą„) joins two consonants ā are the primary marker of Sanskrit-origin vocabulary. They appear in tatsam words (ą¤¤ą¤¤ą„ą¤øą¤®) which are significantly harder for younger readers. This package detects them automatically using Unicode analysis.
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## What Is Solved vs. What This Package Solves
|
|
144
|
+
|
|
145
|
+
### Already solved (for English)
|
|
146
|
+
- Flesch Reading Ease (1948)
|
|
147
|
+
- Flesch-Kincaid Grade Level (1975)
|
|
148
|
+
- Gunning Fog Index (1952)
|
|
149
|
+
|
|
150
|
+
### What this package solves (first ever for Hindi)
|
|
151
|
+
- Matra-aware syllable counting
|
|
152
|
+
- Conjunct consonant density as a difficulty signal
|
|
153
|
+
- CBSE-aligned grade level output
|
|
154
|
+
- Actionable simplification suggestions in Hindi
|
|
155
|
+
|
|
156
|
+
### Still open (future research / dissertation topics)
|
|
157
|
+
- Validation against human-graded Hindi texts (labeled corpus needed)
|
|
158
|
+
- Domain-specific calibration (news vs. textbooks vs. legal)
|
|
159
|
+
- Extension to Marathi (also written in Devanagari) and to Bengali and Gujarati (related Brahmic scripts with their own Unicode blocks)
|
|
160
|
+
- Hinglish (code-mixed Hindi-English) readability
|
|
161
|
+
|
|
162
|
+
---
|
|
163
|
+
|
|
164
|
+
## API Reference
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
ReadabilityScorer.score(text) # Full report dict
|
|
168
|
+
ReadabilityScorer.compare(texts) # Rank list easiestāhardest
|
|
169
|
+
ReadabilityScorer.batch_score(texts) # Score list in order
|
|
170
|
+
ReadabilityScorer.is_appropriate_for_grade(text, grade) # bool
|
|
171
|
+
ReadabilityScorer.simplify_suggestions(text) # list of Hindi suggestions
|
|
172
|
+
|
|
173
|
+
# Low-level functions
|
|
174
|
+
hindi_readability_score(text) # float 0-100
|
|
175
|
+
hindi_grade_level(text) # dict {grade, grade_label, cbse_level}
|
|
176
|
+
hindi_complexity_index(text) # float 0-1
|
|
177
|
+
analyse(text) # dict of raw script counts
|
|
178
|
+
syllables_per_word(text) # float
|
|
179
|
+
conjunct_density(text) # conjuncts per 100 words
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## Citation
|
|
185
|
+
|
|
186
|
+
If you use this package in academic work:
|
|
187
|
+
|
|
188
|
+
```
|
|
189
|
+
@software{hindi_readability,
|
|
190
|
+
author = {Prabhat Chaudhary},
|
|
191
|
+
title = {hindi-readability: The First Python Package for Hindi Text Readability},
|
|
192
|
+
year = {2025},
|
|
193
|
+
publisher = {PyPI},
|
|
194
|
+
url = {https://pypi.org/project/hindi-readability/}
|
|
195
|
+
}
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
---
|
|
199
|
+
|
|
200
|
+
## License
|
|
201
|
+
|
|
202
|
+
MIT ā free for academic and commercial use.
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# hindi-readability šš®š³
|
|
2
|
+
|
|
3
|
+
**The first Python package for measuring the readability of Hindi text.**
|
|
4
|
+
|
|
5
|
+
Zero external dependencies. Pure Python 3.9+.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## The Problem
|
|
10
|
+
|
|
11
|
+
English has Flesch-Kincaid, Gunning Fog, and ARI ā readability formulas used in MS Word since 1992. **Hindi has nothing.**
|
|
12
|
+
|
|
13
|
+
India has 24.8 crore school students, 886 million internet users consuming Hindi content, and 14.7 lakh schools ā all producing and consuming Hindi text with no way to automatically measure whether it is easy or hard to read.
|
|
14
|
+
|
|
15
|
+
This package fills that gap with three **original formulas** designed specifically for Devanagari script.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Installation
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install hindi-readability
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## Quick Start
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from hindi_readability import ReadabilityScorer
|
|
31
|
+
|
|
32
|
+
rs = ReadabilityScorer()
|
|
33
|
+
|
|
34
|
+
# Simple sentence
|
|
35
|
+
result = rs.score("यह ą¤ą¤ सरल ą¤µą¤¾ą¤ą„य ą¤¹ą„ą„¤")
|
|
36
|
+
print(result["hrs"]) # Hindi Readability Score (0-100)
|
|
37
|
+
print(result["label"]) # "Easy"
|
|
38
|
+
print(result["grade_label"]) # "Class 3ā5"
|
|
39
|
+
print(result["cbse_level"]) # "Prathmik Uttara (Upper Primary)"
|
|
40
|
+
|
|
41
|
+
# Constitutional text ā hard
|
|
42
|
+
result = rs.score("ą¤øą¤ą¤µą¤æą¤§ą¤¾ą¤Ø ą¤ą„ ą¤Ŗą„ą¤°ą¤øą„तावना ą¤®ą„ą¤ ą¤ą¤¾ą¤°ą¤¤ ą¤ą„ ą¤ą¤ ą¤øą¤ą¤Ŗą„ą¤°ą¤ą„, ą¤øą¤®ą¤¾ą¤ą¤µą¤¾ą¤¦ą„, ą¤§ą¤°ą„ą¤®ą¤Øą¤æą¤°ą¤Ŗą„ą¤ą„ष, ą¤²ą„ą¤ą¤¤ą¤¾ą¤ą¤¤ą„ą¤°ą¤æą¤ ą¤ą¤£ą¤°ą¤¾ą¤ą„य ą¤ą„षित ą¤ą¤æą¤Æą¤¾ ą¤ą¤Æą¤¾ ą¤¹ą„ą„¤")
|
|
43
|
+
print(result["hrs"]) # 0.0
|
|
44
|
+
print(result["label"]) # "Expert"
|
|
45
|
+
print(result["grade_label"])# "College+"
|
|
46
|
+
|
|
47
|
+
# Compare multiple texts ā sorted easiest first
|
|
48
|
+
texts = [
|
|
49
|
+
"ą¤¬ą¤ą„ą¤ą„ ą¤ą„ą¤²ą¤¤ą„ ą¤¹ą„ą¤ą„¤",
|
|
50
|
+
"ą¤ą¤¾ą¤°ą¤¤ ą¤ą„ ą¤¶ą¤æą¤ą„षा ą¤Øą„ą¤¤ą¤æ बदल ą¤°ą¤¹ą„ ą¤¹ą„ą„¤",
|
|
51
|
+
"ą¤øą¤ą¤µą„ą¤§ą¤¾ą¤Øą¤æą¤ ą¤Ŗą„ą¤°ą¤¾ą¤µą¤§ą¤¾ą¤Øą„ą¤ ą¤ą„ ą¤
ą¤Øą„ą¤øą¤¾ą¤° ą¤Øą¤¾ą¤ą¤°ą¤æą¤ą„ą¤ ą¤ą„ ą¤®ą„ą¤² ą¤
ą¤§ą¤æą¤ą¤¾ą¤° ą¤øą„ą¤°ą¤ą„षित ą¤¹ą„ą¤ą„¤",
|
|
52
|
+
]
|
|
53
|
+
ranked = rs.compare(texts)
|
|
54
|
+
for r in ranked:
|
|
55
|
+
print(f"{r['hrs']:5.1f} {r['label']:12} {r['text'][:40]}")
|
|
56
|
+
|
|
57
|
+
# Get simplification suggestions
|
|
58
|
+
suggestions = rs.simplify_suggestions("ą¤øą¤ą¤µą„ą¤§ą¤¾ą¤Øą¤æą¤ ą¤Ŗą„ą¤°ą¤¾ą¤µą¤§ą¤¾ą¤Øą„ą¤ ą¤ą„ ą¤
ą¤Øą„ą¤øą¤¾ą¤°...")
|
|
59
|
+
for s in suggestions:
|
|
60
|
+
print(s)
|
|
61
|
+
|
|
62
|
+
# Check if appropriate for a school grade
|
|
63
|
+
rs.is_appropriate_for_grade("यह सरल ą¤Ŗą¤¾ą¤ ą¤¹ą„ą„¤", grade=5) # True/False
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
## The Three Formulas
|
|
69
|
+
|
|
70
|
+
### 1. Hindi Readability Score (HRS)
|
|
71
|
+
An ease score from **0 to 100** ā higher means easier. Inspired by Flesch Reading Ease but redesigned for Devanagari.
|
|
72
|
+
|
|
73
|
+
| Score | Label | Suitable for |
|
|
74
|
+
|-------|-------|-------------|
|
|
75
|
+
| 90ā100 | Very easy | Class 1ā2 |
|
|
76
|
+
| 70ā89 | Easy | Class 3ā5 |
|
|
77
|
+
| 50ā69 | Standard | Class 6ā8 |
|
|
78
|
+
| 30ā49 | Difficult | Class 9ā10 |
|
|
79
|
+
| 10ā29 | Very hard | Class 11ā12 |
|
|
80
|
+
| 0ā9 | Expert | College+ |
|
|
81
|
+
|
|
82
|
+
**Formula:**
|
|
83
|
+
```
|
|
84
|
+
HRS = 206.0
|
|
85
|
+
- (60.0 × avg_syllables_per_word)
|
|
86
|
+
- (1.8 × avg_words_per_sentence)
|
|
87
|
+
- (70.0 × conjunct_density)
|
|
88
|
+
- (8.0 × matra_complexity)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### 2. Hindi Grade Level (HGL)
|
|
92
|
+
Maps HRS to Indian school grades (CBSE Class 1 to College+).
|
|
93
|
+
|
|
94
|
+
### 3. Hindi Complexity Index (HCI)
|
|
95
|
+
A normalized 0ā1 score. Lower = easier. Useful for ML pipelines.
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Why These Formulas Are Different
|
|
100
|
+
|
|
101
|
+
| Feature | English (Flesch-Kincaid) | Hindi (this package) |
|
|
102
|
+
|---------|--------------------------|---------------------|
|
|
103
|
+
| Syllable counting | English phoneme rules | Devanagari matra-based |
|
|
104
|
+
| Conjunct detection | Not applicable | ā Virama-based detection |
|
|
105
|
+
| Script-aware | No | ā Full Unicode U+0900āU+097F |
|
|
106
|
+
| Long vowel complexity | No | ā Guru/laghu distinction |
|
|
107
|
+
| CBSE grade mapping | No | ā Class 1ā12 + College |
|
|
108
|
+
|
|
109
|
+
**Conjunct consonants** (ą¤øą¤ą¤Æą„ą¤ą„त ą¤
ą¤ą„षर) ā formed when a virama (ą„) joins two consonants ā are the primary marker of Sanskrit-origin vocabulary. They appear in tatsam words (ą¤¤ą¤¤ą„ą¤øą¤®) which are significantly harder for younger readers. This package detects them automatically using Unicode analysis.
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## What Is Solved vs. What This Package Solves
|
|
114
|
+
|
|
115
|
+
### Already solved (for English)
|
|
116
|
+
- Flesch Reading Ease (1948)
|
|
117
|
+
- Flesch-Kincaid Grade Level (1975)
|
|
118
|
+
- Gunning Fog Index (1952)
|
|
119
|
+
|
|
120
|
+
### What this package solves (first ever for Hindi)
|
|
121
|
+
- Matra-aware syllable counting
|
|
122
|
+
- Conjunct consonant density as a difficulty signal
|
|
123
|
+
- CBSE-aligned grade level output
|
|
124
|
+
- Actionable simplification suggestions in Hindi
|
|
125
|
+
|
|
126
|
+
### Still open (future research / dissertation topics)
|
|
127
|
+
- Validation against human-graded Hindi texts (labeled corpus needed)
|
|
128
|
+
- Domain-specific calibration (news vs. textbooks vs. legal)
|
|
129
|
+
- Extension to Marathi (also written in Devanagari) and to Bengali and Gujarati (related Brahmic scripts with their own Unicode blocks)
|
|
130
|
+
- Hinglish (code-mixed Hindi-English) readability
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## API Reference
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
ReadabilityScorer.score(text) # Full report dict
|
|
138
|
+
ReadabilityScorer.compare(texts) # Rank list easiestāhardest
|
|
139
|
+
ReadabilityScorer.batch_score(texts) # Score list in order
|
|
140
|
+
ReadabilityScorer.is_appropriate_for_grade(text, grade) # bool
|
|
141
|
+
ReadabilityScorer.simplify_suggestions(text) # list of Hindi suggestions
|
|
142
|
+
|
|
143
|
+
# Low-level functions
|
|
144
|
+
hindi_readability_score(text) # float 0-100
|
|
145
|
+
hindi_grade_level(text) # dict {grade, grade_label, cbse_level}
|
|
146
|
+
hindi_complexity_index(text) # float 0-1
|
|
147
|
+
analyse(text) # dict of raw script counts
|
|
148
|
+
syllables_per_word(text) # float
|
|
149
|
+
conjunct_density(text) # conjuncts per 100 words
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## Citation
|
|
155
|
+
|
|
156
|
+
If you use this package in academic work:
|
|
157
|
+
|
|
158
|
+
```
|
|
159
|
+
@software{hindi_readability,
|
|
160
|
+
author = {Prabhat Chaudhary},
|
|
161
|
+
title = {hindi-readability: The First Python Package for Hindi Text Readability},
|
|
162
|
+
year = {2025},
|
|
163
|
+
publisher = {PyPI},
|
|
164
|
+
url = {https://pypi.org/project/hindi-readability/}
|
|
165
|
+
}
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## License
|
|
171
|
+
|
|
172
|
+
MIT ā free for academic and commercial use.
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""
|
|
2
|
+
hindi-readability
|
|
3
|
+
=================
|
|
4
|
+
The first Python package for measuring readability of Hindi text.
|
|
5
|
+
|
|
6
|
+
Provides three original formulas designed for Devanagari script:
|
|
7
|
+
- Hindi Readability Score (HRS) ā 0-100, higher = easier
|
|
8
|
+
- Hindi Grade Level (HGL) ā CBSE Class 1 to College+
|
|
9
|
+
- Hindi Complexity Index (HCI) ā 0-1, lower = easier
|
|
10
|
+
|
|
11
|
+
Install: pip install hindi-readability
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from .scorer import ReadabilityScorer
|
|
15
|
+
from .script import analyse, syllables_per_word, conjunct_density
|
|
16
|
+
from .formulas import (
|
|
17
|
+
hindi_readability_score,
|
|
18
|
+
hindi_grade_level,
|
|
19
|
+
hindi_complexity_index,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
# Package metadata.
__version__ = "0.1.0"
__author__ = "Prabhat Chaudhary"

# Public names of the package, grouped by the submodule they are imported
# from (scorer, script, formulas).
__all__ = (
    ["ReadabilityScorer"]
    + ["analyse", "syllables_per_word", "conjunct_density"]
    + ["hindi_readability_score", "hindi_grade_level", "hindi_complexity_index"]
)
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""
|
|
2
|
+
formulas.py ā Hindi Readability Formulas
|
|
3
|
+
=========================================
|
|
4
|
+
This module implements THREE original readability formulas for Hindi text,
|
|
5
|
+
all designed from scratch for Devanagari script characteristics.
|
|
6
|
+
|
|
7
|
+
WHY NOT JUST TRANSLATE FLESCH-KINCAID?
|
|
8
|
+
---------------------------------------
|
|
9
|
+
Flesch-Kincaid counts English syllables and words per sentence.
|
|
10
|
+
Hindi is fundamentally different:
|
|
11
|
+
1. Syllable weight ā Hindi has HEAVY (guru) and LIGHT (laghu) syllables
|
|
12
|
+
based on matras. A long-matra word is harder than a short one of the
|
|
13
|
+
same syllable count.
|
|
14
|
+
2. Conjuncts (sankyukt akshar) ā These are the single biggest marker of
|
|
15
|
+
reading difficulty in Hindi. They appear in Sanskrit-origin (tatsama)
|
|
16
|
+
words which educated adults use but children struggle with.
|
|
17
|
+
3. Sentence structure ā Hindi is SOV (Subject-Object-Verb). Long sentences
|
|
18
|
+
with postpositions and embedded clauses are harder than simple SOV.
|
|
19
|
+
|
|
20
|
+
THE THREE FORMULAS
|
|
21
|
+
-------------------
|
|
22
|
+
1. Hindi Readability Score (HRS) ā ease score 0ā100 (higher = easier)
|
|
23
|
+
2. Hindi Grade Level (HGL) ā school grade 1ā12+
|
|
24
|
+
3. Hindi Complexity Index (HCI) ā raw difficulty 0ā1 (lower = easier)
|
|
25
|
+
|
|
26
|
+
Each formula is independently usable. HRS is the headline metric.
|
|
27
|
+
|
|
28
|
+
FORMULA DERIVATION
|
|
29
|
+
-------------------
|
|
30
|
+
HRS is adapted from Flesch Reading Ease with Hindi-specific weights:
|
|
31
|
+
|
|
32
|
+
HRS = 206.0
|
|
33
|
+
- (60.0 × avg_syllables_per_word)
|
|
34
|
+
- (1.8 × avg_words_per_sentence)
|
|
35
|
+
- (70.0 × conjunct_density_normalized)
|
|
36
|
+
- (8.0 × matra_complexity)
|
|
37
|
+
|
|
38
|
+
Weights chosen by linguistic reasoning:
|
|
39
|
+
⢠avg_syllables_per_word : primary difficulty driver (same as English)
|
|
40
|
+
⢠avg_words_per_sentence : secondary (same as English, lower weight)
|
|
41
|
+
⢠conjunct_density : NEW ā unique to Hindi/Devanagari
|
|
42
|
+
⢠matra_complexity : NEW ā ratio of heavy matras (long vowels)
|
|
43
|
+
to total matras; long matras = harder words
|
|
44
|
+
|
|
45
|
+
HGL maps HRS to Indian school grades (Class 1ā12) using the same
|
|
46
|
+
inverse relationship as Kincaid but re-calibrated for Hindi:
|
|
47
|
+
|
|
48
|
+
HGL = 17.2 - (HRS Ć 0.14)
|
|
49
|
+
|
|
50
|
+
HCI is a 0ā1 normalized composite:
|
|
51
|
+
HCI = weighted average of 4 sub-scores (syllable, sentence, conjunct, matra)
|
|
52
|
+
|
|
53
|
+
GRADE LABELS
|
|
54
|
+
------------
|
|
55
|
+
These are mapped to CBSE/NCERT grade groupings:
|
|
56
|
+
Class 1ā2 : Prathmik (Primary) ā very simple
|
|
57
|
+
Class 3ā5 : Prathmik Uttara ā simple
|
|
58
|
+
Class 6ā8 : Madhyamik ā standard
|
|
59
|
+
Class 9ā10 : Uccha Madhyamik ā difficult
|
|
60
|
+
Class 11ā12 : Uccha Vidyalay ā very difficult
|
|
61
|
+
College+ : Snatak ā expert
|
|
62
|
+
"""
|
|
63
|
+
|
|
64
|
+
from typing import Dict
|
|
65
|
+
from .script import analyse, MATRAS
|
|
66
|
+
|
|
67
|
+
# Dependent vowel signs (matras) that this module counts as "long" when it
# computes the long-matra ratio.  Each entry is one Devanagari code point;
# the adjacent string literals concatenate and set() splits them apart again.
LONG_MATRAS = set(
    "\u093E"  # ा  DEVANAGARI VOWEL SIGN AA
    "\u0940"  # ी  DEVANAGARI VOWEL SIGN II
    "\u0942"  # ू  DEVANAGARI VOWEL SIGN UU
    "\u0948"  # ै  DEVANAGARI VOWEL SIGN AI
    "\u094C"  # ौ  DEVANAGARI VOWEL SIGN AU
    "\u0947"  # े  DEVANAGARI VOWEL SIGN E
    "\u094B"  # ो  DEVANAGARI VOWEL SIGN O
)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _matra_complexity(text: str) -> float:
    """
    Return the share of matras in *text* that are long matras.

    The numerator counts characters of *text* found in ``LONG_MATRAS``; the
    denominator counts characters found in ``MATRAS``.  Returns 0.0 when
    *text* contains no character from ``MATRAS``.

    NOTE(review): the result stays within 0-1 only if every member of
    LONG_MATRAS is also a member of MATRAS (defined in .script) - confirm.
    """
    n_long = 0
    n_all = 0
    for ch in text:
        if ch in LONG_MATRAS:
            n_long += 1
        if ch in MATRAS:
            n_all += 1
    return n_long / n_all if n_all else 0.0
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def hindi_readability_score(text: str) -> float:
    """
    Hindi Readability Score (HRS) of *text*; higher means easier.

    Computed from the counts returned by ``analyse(text)`` as

        206.0 - 60.0 * (syllables / words)
              - 1.8  * (words / sentences)
              - 70.0 * (conjuncts / words)
              - 8.0  * long-matra ratio

    then clamped to 0-100 and rounded to two decimals.

    Interpretation:
        90-100 : Very easy  (Class 1-2)
        70-89  : Easy       (Class 3-5)
        50-69  : Standard   (Class 6-8)
        30-49  : Difficult  (Class 9-10)
        10-29  : Very hard  (Class 11-12)
        0-9    : Expert     (College+)
    """
    stats = analyse(text)

    # Floor each count at 1 so none of the ratios below divides by zero.
    n_words = max(stats["words"], 1)
    n_sentences = max(stats["sentences"], 1)
    n_syllables = max(stats["syllables"], 1)

    # (weight, feature value) pairs, subtracted from the base in this order.
    penalties = (
        (60.0, n_syllables / n_words),
        (1.8, n_words / n_sentences),
        (70.0, stats["conjuncts"] / n_words),
        (8.0, _matra_complexity(text)),
    )

    raw = 206.0
    for weight, value in penalties:
        raw = raw - (weight * value)

    capped = min(100.0, raw)
    return round(max(0.0, capped), 2)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def hindi_grade_level(text: str) -> Dict[str, object]:
    """
    Hindi Grade Level (HGL): map the HRS of *text* to an Indian school grade.

    The grade is ``round(17.2 - 0.14 * HRS)`` clamped to the range 1..13.

    Returns a dict with:
        grade       : int, where 13 stands for college and above
        grade_label : str, e.g. "Class 6–8"
        cbse_level  : str, e.g. "Madhyamik (Middle School)"

    NOTE(review): hindi_readability_score() clamps HRS to 0..100, so the
    unrounded grade is never below 17.2 - 14.0 = 3.2.  The lower clamp and
    the "Class 1–2" band therefore cannot be reached with the current
    coefficients, and an HRS in roughly 70-83 is labelled "Class 6–8" here.
    Recalibrating the formula would change published scores, so it is only
    flagged, not altered.
    """
    hrs = hindi_readability_score(text)
    grade = max(1, min(13, round(17.2 - (hrs * 0.14))))

    # (highest grade in the band, label, CBSE stage), ascending.  The last
    # band tops out at 13, the maximum grade, so a match is always found.
    bands = (
        (2, "Class 1–2", "Prathmik (Primary)"),
        (5, "Class 3–5", "Prathmik Uttara (Upper Primary)"),
        (8, "Class 6–8", "Madhyamik (Middle School)"),
        (10, "Class 9–10", "Uccha Madhyamik (Secondary)"),
        (12, "Class 11–12", "Uccha Vidyalay (Senior Secondary)"),
        (13, "College+", "Snatak (Graduate)"),
    )
    label, cbse = next(
        (name, stage) for top, name, stage in bands if grade <= top
    )

    return {"grade": grade, "grade_label": label, "cbse_level": cbse}
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def hindi_complexity_index(text: str) -> float:
    """
    Hindi Complexity Index (HCI) of *text*; lower means easier.

    A weighted sum of four sub-scores, rounded to four decimals:

        0.40 * syllables per word, capped at 5 and divided by 5
        0.20 * words per sentence, capped at 30 and divided by 30
        0.25 * conjuncts per word, capped at 1
        0.15 * long-matra ratio from ``_matra_complexity``

    Word and sentence counts come from ``analyse(text)`` and are floored at
    1 before dividing.
    """
    stats = analyse(text)
    n_words = max(stats["words"], 1)
    n_sentences = max(stats["sentences"], 1)

    # (weight, sub-score) pairs, accumulated in this order.
    components = (
        (0.40, min(stats["syllables"] / n_words, 5) / 5),
        (0.20, min(n_words / n_sentences, 30) / 30),
        (0.25, min(stats["conjuncts"] / n_words, 1.0)),
        (0.15, _matra_complexity(text)),
    )

    total = 0.0
    for weight, sub_score in components:
        total = total + weight * sub_score
    return round(total, 4)
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""
|
|
2
|
+
scorer.py ā Main Public API
|
|
3
|
+
============================
|
|
4
|
+
The ReadabilityScorer class is the primary interface for hindi-readability.
|
|
5
|
+
It combines all three formulas plus the raw script analysis into one call.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Dict, List
|
|
9
|
+
from .script import analyse, syllables_per_word, conjunct_density
|
|
10
|
+
from .formulas import (
|
|
11
|
+
hindi_readability_score,
|
|
12
|
+
hindi_grade_level,
|
|
13
|
+
hindi_complexity_index,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
_LABEL_MAP = [
|
|
18
|
+
(90, "Very easy", "Suitable for Class 1ā2 students"),
|
|
19
|
+
(70, "Easy", "Suitable for Class 3ā5 students"),
|
|
20
|
+
(50, "Standard", "Suitable for Class 6ā8 students"),
|
|
21
|
+
(30, "Difficult", "Suitable for Class 9ā10 students"),
|
|
22
|
+
(10, "Very hard", "Suitable for Class 11ā12 students"),
|
|
23
|
+
(0, "Expert", "College-level or specialist text"),
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _hrs_label(hrs: float) -> tuple:
|
|
28
|
+
for threshold, label, desc in _LABEL_MAP:
|
|
29
|
+
if hrs >= threshold:
|
|
30
|
+
return label, desc
|
|
31
|
+
return "Expert", "College-level or specialist text"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ReadabilityScorer:
|
|
35
|
+
"""
|
|
36
|
+
All-in-one Hindi readability analyser.
|
|
37
|
+
|
|
38
|
+
Example
|
|
39
|
+
-------
|
|
40
|
+
>>> from hindi_readability import ReadabilityScorer
|
|
41
|
+
>>> rs = ReadabilityScorer()
|
|
42
|
+
|
|
43
|
+
>>> rs.score("यह ą¤ą¤ सरल ą¤µą¤¾ą¤ą„य ą¤¹ą„ą„¤")
|
|
44
|
+
{
|
|
45
|
+
'hrs': 88.4,
|
|
46
|
+
'label': 'Easy',
|
|
47
|
+
'grade': 5,
|
|
48
|
+
'grade_label': 'Class 3ā5',
|
|
49
|
+
'hci': 0.18,
|
|
50
|
+
'syllables_per_word': 1.6,
|
|
51
|
+
'conjunct_density': 0.0,
|
|
52
|
+
...
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
>>> rs.compare(["ą¤¬ą¤ą„ą¤ą„ą¤ ą¤ą„ ą¤ą¤¹ą¤¾ą¤Øą„ą„¤", "ą¤øą¤ą¤µą¤æą¤§ą¤¾ą¤Ø ą¤ą„ ą¤Ŗą„ą¤°ą¤øą„ą¤¤ą¤¾ą¤µą¤Øą¤¾ą„¤"])
|
|
56
|
+
[{'text': '...', 'hrs': 91.2, 'label': 'Very easy'}, ...]
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
def score(self, text: str) -> Dict[str, object]:
    """
    Build the full readability report for one text.

    Returns
    -------
    dict with keys, in this order:
        hrs                : Hindi Readability Score (0-100, higher = easier)
        label              : ease label from ``_hrs_label``
        description        : audience description from ``_hrs_label``
        grade              : school grade number from ``hindi_grade_level``
        grade_label        : e.g. "Class 6-8"
        cbse_level         : e.g. "Madhyamik (Middle School)"
        hci                : Hindi Complexity Index (0-1, lower = easier)
        syllables_per_word : value of ``syllables_per_word(text)``
        conjunct_density   : value of ``conjunct_density(text)``
        raw                : dict returned by ``analyse(text)``

    Raises
    ------
    ValueError
        If *text* is empty or contains only whitespace.
    """
    if not (text and text.strip()):
        raise ValueError("Input text cannot be empty.")

    hrs_value = hindi_readability_score(text)
    grade_info = hindi_grade_level(text)
    hci_value = hindi_complexity_index(text)
    raw_counts = analyse(text)
    ease_label, ease_desc = _hrs_label(hrs_value)

    report: Dict[str, object] = {
        "hrs": hrs_value,
        "label": ease_label,
        "description": ease_desc,
    }
    # Copy the three grade fields across under the same key names.
    for key in ("grade", "grade_label", "cbse_level"):
        report[key] = grade_info[key]
    report["hci"] = hci_value
    report["syllables_per_word"] = syllables_per_word(text)
    report["conjunct_density"] = conjunct_density(text)
    report["raw"] = raw_counts
    return report
|
|
98
|
+
|
|
99
|
+
def compare(self, texts: List[str]) -> List[Dict[str, object]]:
    """
    Score several texts and rank them from easiest to hardest.

    Each returned item is the ``score()`` report for one text plus a
    ``'text'`` key holding its first 60 characters, followed by "…" when
    the text is longer than that.  Items are sorted by ``'hrs'`` in
    descending order; texts with equal scores keep their input order.

    Texts that ``score()`` rejects with ``ValueError`` (empty or
    whitespace-only input) are left out of the result.
    """
    ranked = []
    for t in texts:
        # Keep the try body to the one call that raises for empty input.
        try:
            report = self.score(t)
        except ValueError:
            continue
        report["text"] = t[:60] + ("…" if len(t) > 60 else "")
        ranked.append(report)

    ranked.sort(key=lambda item: item["hrs"], reverse=True)
    return ranked
|
|
115
|
+
|
|
116
|
+
def batch_score(self, texts: List[str]) -> List[Dict[str, object]]:
    """
    Score every text, keeping the input order (no sorting).

    A text whose scoring raises ``ValueError`` is represented by the
    placeholder ``{"error": "empty text"}``, so the returned list always has
    the same length and ordering as ``texts``.
    """
    def _score_or_placeholder(text):
        # Only ValueError is converted; any other exception propagates.
        try:
            return self.score(text)
        except ValueError:
            return {"error": "empty text"}

    return [_score_or_placeholder(text) for text in texts]
|
|
125
|
+
|
|
126
|
+
def is_appropriate_for_grade(self, text: str, grade: int) -> bool:
    """
    Tell whether a text suits a target school grade (1-12).

    The text is scored with ``score`` and the ``"grade"`` field of the
    report is compared with ``grade``. A difference of at most one grade in
    either direction counts as appropriate.
    """
    estimated_grade = self.score(text)["grade"]
    gap = estimated_grade - grade
    return -1 <= gap <= 1
|
|
136
|
+
|
|
137
|
+
def simplify_suggestions(self, text: str) -> List[str]:
    """
    Return actionable suggestions (in Hindi) for simplifying a text.

    The text is scored with ``score`` and three metrics are checked
    independently; each one that exceeds its threshold adds one suggestion:

    * ``syllables_per_word`` greater than 3.0 -> use shorter words
    * ``conjunct_density`` greater than 15    -> use fewer conjunct-heavy words
    * words per sentence greater than 15      -> use shorter sentences

    When no threshold is exceeded, a single message stating that the text
    is already readable is returned, so the list is never empty.
    """
    # NOTE(review): the Hindi literals below were restored from mis-encoded
    # text; confirm the wording and the dash characters against upstream.
    report = self.score(text)
    raw = report["raw"]
    suggestions = []

    if report["syllables_per_word"] > 3.0:
        suggestions.append(
            "शब्दों की लंबाई कम करें — छोटे शब्द (1–2 अक्षर) अधिक आसान होते हैं।"
        )
    if report["conjunct_density"] > 15:
        suggestions.append(
            "संयुक्त अक्षरों वाले शब्द कम करें — तत्सम शब्दों की जगह तद्भव शब्द लिखें।"
        )
    # Guard against a zero sentence count before dividing.
    if raw["sentences"] > 0 and raw["words"] / raw["sentences"] > 15:
        suggestions.append(
            "वाक्य छोटे करें — एक वाक्य में 10–12 से अधिक शब्द न रखें।"
        )
    if not suggestions:
        suggestions.append("यह पाठ पहले से पठनीय है। कोई बड़ा सुधार आवश्यक नहीं।")
    return suggestions
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""
|
|
2
|
+
script.py — Devanagari Script Analyser
|
|
3
|
+
=======================================
|
|
4
|
+
Counts the building blocks of Hindi text that determine reading difficulty:
|
|
5
|
+
|
|
6
|
+
- Matras : vowel diacritics attached to consonants (ि ी ु ू े ै ो ौ etc.)
|
|
7
|
+
- Virama : halant ् (U+094D) — joins two consonants into a conjunct
|
|
8
|
+
- Conjuncts : two or more consonants merged (e.g. क्ष त्र ज्ञ)
|
|
9
|
+
- Syllables : every independent vowel OR consonant+vowel unit
|
|
10
|
+
- Anusvara/
|
|
11
|
+
Visarga : nasal/aspiration marks ā add phonetic weight
|
|
12
|
+
|
|
13
|
+
Research basis
|
|
14
|
+
--------------
|
|
15
|
+
In Devanagari every consonant carries an implicit /a/ vowel (schwa).
|
|
16
|
+
A matra overrides that default vowel. A virama (U+094D) suppresses
|
|
17
|
+
the vowel completely and glues the consonant to the next one ā forming
|
|
18
|
+
a conjunct. Conjuncts are the primary marker of textual complexity
|
|
19
|
+
in Hindi: they appear mainly in Sanskrit-origin (tatsama) words which
|
|
20
|
+
are harder to read than native Prakrit-derived (tadbhava) words.
|
|
21
|
+
|
|
22
|
+
Unicode ranges used
|
|
23
|
+
-------------------
|
|
24
|
+
Devanagari block: U+0900 ā U+097F
|
|
25
|
+
Vowels (independent): U+0904 ā U+0914
|
|
26
|
+
Consonants: U+0915 ā U+0939, U+0958 ā U+095F (nukta variants)
|
|
27
|
+
Matras (dependent): U+093E ā U+094C, U+094E ā U+094F (also U+0955-U+0957)
|
|
28
|
+
Virama (halant): U+094D
|
|
29
|
+
Anusvara: U+0902
|
|
30
|
+
Visarga: U+0903
|
|
31
|
+
Chandrabindu: U+0901
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
import re
|
|
35
|
+
import unicodedata
|
|
36
|
+
from typing import Dict
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# āā Unicode code-point sets āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
40
|
+
VIRAMA = "\u094D"        # halant: suppresses the inherent vowel, forms conjuncts
ANUSVARA = "\u0902"      # anusvara (nasalisation dot)
CHANDRABINDU = "\u0901"  # chandrabindu (nasalisation sign)
VISARGA = "\u0903"       # visarga
AVAGRAHA = "\u093D"      # avagraha


def _chars_between(first: int, last: int) -> set:
    """Return the set of characters with code points first..last, inclusive."""
    return {chr(code_point) for code_point in range(first, last + 1)}


# Independent vowel letters: U+0904 - U+0914
INDEPENDENT_VOWELS = _chars_between(0x0904, 0x0914)

# Consonant letters: U+0915 - U+0939, plus the nukta variants U+0958 - U+095F
CONSONANTS = _chars_between(0x0915, 0x0939) | _chars_between(0x0958, 0x095F)

# Dependent vowel signs (matras): U+093E - U+094C, U+094E - U+094F,
# and U+0955 - U+0957. U+094D (virama) is deliberately not a matra.
MATRAS = (
    _chars_between(0x093E, 0x094C)
    | {chr(0x094E), chr(0x094F)}
    | _chars_between(0x0955, 0x0957)
)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def analyse(text: str) -> Dict[str, int]:
    """
    Analyse a Hindi text string and return raw script-level counts.

    The text is NFC-normalised first; every count below refers to the
    normalised string.

    Returns
    -------
    dict with keys:
        total_chars        : non-whitespace characters
        consonants         : characters found in CONSONANTS
        independent_vowels : characters found in INDEPENDENT_VOWELS
        matras             : characters found in MATRAS
        viramas            : occurrences of VIRAMA
        conjuncts          : viramas immediately followed by a consonant
                             (one per consonant-consonant bond, so a chain
                             of three consonants counts as two)
        anusvara           : anusvara AND chandrabindu occurrences combined
        visarga            : visarga occurrences
        syllables          : estimate from ``_count_syllables`` (at least 1)
        words              : whitespace-delimited tokens
        sentences          : non-blank segments after splitting on danda
                             (U+0964), double danda (U+0965), '.', '!' and
                             '?'; never less than 1
    """
    text = unicodedata.normalize("NFC", text)

    counts: Dict[str, int] = {
        "total_chars": 0,
        "consonants": 0,
        "independent_vowels": 0,
        "matras": 0,
        "viramas": 0,
        "conjuncts": 0,
        "anusvara": 0,
        "visarga": 0,
        "syllables": 0,
        "words": 0,
        "sentences": 0,
    }

    # Classify every non-whitespace character into at most one bucket.
    for ch in text:
        if ch.isspace():
            continue
        counts["total_chars"] += 1
        if ch in CONSONANTS:
            counts["consonants"] += 1
        elif ch in INDEPENDENT_VOWELS:
            counts["independent_vowels"] += 1
        elif ch in MATRAS:
            counts["matras"] += 1
        elif ch == VIRAMA:
            counts["viramas"] += 1
        elif ch == ANUSVARA or ch == CHANDRABINDU:
            counts["anusvara"] += 1
        elif ch == VISARGA:
            counts["visarga"] += 1

    # A conjunct bond is a virama directly followed by a consonant; a virama
    # at the end of a word or of the text does not join anything.
    counts["conjuncts"] = sum(
        1
        for current, following in zip(text, text[1:])
        if current == VIRAMA and following in CONSONANTS
    )

    counts["syllables"] = _count_syllables(text)
    # str.split() without arguments never yields empty tokens.
    counts["words"] = len(text.split())
    # Danda and double danda are spelled as escapes so the delimiter class
    # cannot be corrupted by a wrong source-file encoding.
    segments = re.split(r"[\u0964\u0965.!?]+", text)
    counts["sentences"] = max(1, sum(1 for segment in segments if segment.strip()))

    return counts
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _count_syllables(text: str) -> int:
    """
    Estimate the syllable count of a Devanagari text.

    Counting rule: each syllable has exactly one vowel nucleus, which is
      (a) an independent vowel letter, or
      (b) a consonant that is not silenced by a virama (it carries either
          its implicit /a/ or an explicit matra).
    A consonant followed by a virama contributes no syllable. Matras,
    anusvara and visarga never add a syllable of their own.

    The text is NFC-normalised first. NFC decomposes the precomposed nukta
    letters (U+0958 - U+095F) into base consonant + nukta (U+093C), so a
    nukta may sit between a consonant and its virama; it is skipped when
    looking for the virama.

    Returns at least 1, even for text without any Devanagari letters.
    """
    nukta = "\u093C"
    chars = unicodedata.normalize("NFC", text)
    total = len(chars)
    syllables = 0

    for index, ch in enumerate(chars):
        if ch in INDEPENDENT_VOWELS:
            syllables += 1
        elif ch in CONSONANTS:
            follower = index + 1
            # Look past a nukta so "consonant + nukta + virama" is still
            # recognised as a vowel-less consonant.
            if follower < total and chars[follower] == nukta:
                follower += 1
            if follower < total and chars[follower] == VIRAMA:
                continue  # virama kills the inherent vowel: no nucleus here
            syllables += 1  # implicit /a/ or explicit matra: one syllable

    return max(syllables, 1)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def syllables_per_word(text: str) -> float:
    """Return the mean number of syllables per word, rounded to 4 decimals.

    A word count of zero is treated as one so the division is always defined.
    """
    counts = analyse(text)
    word_total = counts["words"] if counts["words"] > 1 else 1
    return round(counts["syllables"] / word_total, 4)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def conjunct_density(text: str) -> float:
    """Return the number of conjuncts per 100 words, rounded to 4 decimals.

    A word count of zero is treated as one so the division is always defined.
    """
    counts = analyse(text)
    word_total = counts["words"] if counts["words"] > 1 else 1
    conjuncts_per_word = counts["conjuncts"] / word_total
    return round(conjuncts_per_word * 100, 4)
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: hindi-readability
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: The first Python package for measuring readability of Hindi text using Devanagari-aware formulas
|
|
5
|
+
Author-email: Prabhat Chaudhary <raja1999chaudhary@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Erprabhat8423/hindi-readability
|
|
8
|
+
Project-URL: Repository, https://github.com/Erprabhat8423/hindi-readability
|
|
9
|
+
Keywords: hindi,readability,nlp,devanagari,indic,flesch,grade-level,text-analysis,education
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Education
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
22
|
+
Classifier: Natural Language :: Hindi
|
|
23
|
+
Requires-Python: >=3.8
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
28
|
+
Requires-Dist: build; extra == "dev"
|
|
29
|
+
Requires-Dist: twine; extra == "dev"
|
|
30
|
+
|
|
31
|
+
# hindi-readability šš®š³
|
|
32
|
+
|
|
33
|
+
**The first Python package for measuring the readability of Hindi text.**
|
|
34
|
+
|
|
35
|
+
Zero external dependencies. Pure Python 3.9+.
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## The Problem
|
|
40
|
+
|
|
41
|
+
English has Flesch-Kincaid, Gunning Fog, and ARI — readability formulas used in MS Word since 1992. **Hindi has nothing.**
|
|
42
|
+
|
|
43
|
+
India has 24.8 crore school students, 886 million internet users consuming Hindi content, and 14.7 lakh schools — all producing and consuming Hindi text with no way to automatically measure whether it is easy or hard to read.
|
|
44
|
+
|
|
45
|
+
This package fills that gap with three **original formulas** designed specifically for Devanagari script.
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Installation
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install hindi-readability
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Quick Start
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from hindi_readability import ReadabilityScorer
|
|
61
|
+
|
|
62
|
+
rs = ReadabilityScorer()
|
|
63
|
+
|
|
64
|
+
# Simple sentence
|
|
65
|
+
result = rs.score("यह ą¤ą¤ सरल ą¤µą¤¾ą¤ą„य ą¤¹ą„ą„¤")
|
|
66
|
+
print(result["hrs"]) # Hindi Readability Score (0-100)
|
|
67
|
+
print(result["label"]) # "Easy"
|
|
68
|
+
print(result["grade_label"]) # "Class 3ā5"
|
|
69
|
+
print(result["cbse_level"]) # "Prathmik Uttara"
|
|
70
|
+
|
|
71
|
+
# Constitutional text ā hard
|
|
72
|
+
result = rs.score("ą¤øą¤ą¤µą¤æą¤§ą¤¾ą¤Ø ą¤ą„ ą¤Ŗą„ą¤°ą¤øą„तावना ą¤®ą„ą¤ ą¤ą¤¾ą¤°ą¤¤ ą¤ą„ ą¤ą¤ ą¤øą¤ą¤Ŗą„ą¤°ą¤ą„, ą¤øą¤®ą¤¾ą¤ą¤µą¤¾ą¤¦ą„, ą¤§ą¤°ą„ą¤®ą¤Øą¤æą¤°ą¤Ŗą„ą¤ą„ष, ą¤²ą„ą¤ą¤¤ą¤¾ą¤ą¤¤ą„ą¤°ą¤æą¤ ą¤ą¤£ą¤°ą¤¾ą¤ą„य ą¤ą„षित ą¤ą¤æą¤Æą¤¾ ą¤ą¤Æą¤¾ ą¤¹ą„ą„¤")
|
|
73
|
+
print(result["hrs"]) # 0.0
|
|
74
|
+
print(result["label"]) # "Expert"
|
|
75
|
+
print(result["grade_label"])# "College+"
|
|
76
|
+
|
|
77
|
+
# Compare multiple texts ā sorted easiest first
|
|
78
|
+
texts = [
|
|
79
|
+
"ą¤¬ą¤ą„ą¤ą„ ą¤ą„ą¤²ą¤¤ą„ ą¤¹ą„ą¤ą„¤",
|
|
80
|
+
"ą¤ą¤¾ą¤°ą¤¤ ą¤ą„ ą¤¶ą¤æą¤ą„षा ą¤Øą„ą¤¤ą¤æ बदल ą¤°ą¤¹ą„ ą¤¹ą„ą„¤",
|
|
81
|
+
"ą¤øą¤ą¤µą„ą¤§ą¤¾ą¤Øą¤æą¤ ą¤Ŗą„ą¤°ą¤¾ą¤µą¤§ą¤¾ą¤Øą„ą¤ ą¤ą„ ą¤
ą¤Øą„ą¤øą¤¾ą¤° ą¤Øą¤¾ą¤ą¤°ą¤æą¤ą„ą¤ ą¤ą„ ą¤®ą„ą¤² ą¤
ą¤§ą¤æą¤ą¤¾ą¤° ą¤øą„ą¤°ą¤ą„षित ą¤¹ą„ą¤ą„¤",
|
|
82
|
+
]
|
|
83
|
+
ranked = rs.compare(texts)
|
|
84
|
+
for r in ranked:
|
|
85
|
+
print(f"{r['hrs']:5.1f} {r['label']:12} {r['text'][:40]}")
|
|
86
|
+
|
|
87
|
+
# Get simplification suggestions
|
|
88
|
+
suggestions = rs.simplify_suggestions("ą¤øą¤ą¤µą„ą¤§ą¤¾ą¤Øą¤æą¤ ą¤Ŗą„ą¤°ą¤¾ą¤µą¤§ą¤¾ą¤Øą„ą¤ ą¤ą„ ą¤
ą¤Øą„ą¤øą¤¾ą¤°...")
|
|
89
|
+
for s in suggestions:
|
|
90
|
+
print(s)
|
|
91
|
+
|
|
92
|
+
# Check if appropriate for a school grade
|
|
93
|
+
rs.is_appropriate_for_grade("यह सरल ą¤Ŗą¤¾ą¤ ą¤¹ą„ą„¤", grade=5) # True/False
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## The Three Formulas
|
|
99
|
+
|
|
100
|
+
### 1. Hindi Readability Score (HRS)
|
|
101
|
+
An ease score from **0 to 100** — higher means easier. Inspired by Flesch Reading Ease but redesigned for Devanagari.
|
|
102
|
+
|
|
103
|
+
| Score | Label | Suitable for |
|
|
104
|
+
|-------|-------|-------------|
|
|
105
|
+
| 90ā100 | Very easy | Class 1ā2 |
|
|
106
|
+
| 70ā89 | Easy | Class 3ā5 |
|
|
107
|
+
| 50ā69 | Standard | Class 6ā8 |
|
|
108
|
+
| 30ā49 | Difficult | Class 9ā10 |
|
|
109
|
+
| 10ā29 | Very hard | Class 11ā12 |
|
|
110
|
+
| 0ā9 | Expert | College+ |
|
|
111
|
+
|
|
112
|
+
**Formula:**
|
|
113
|
+
```
|
|
114
|
+
HRS = 206.0
|
|
115
|
+
- (60.0 × avg_syllables_per_word)
|
|
116
|
+
- (1.8 × avg_words_per_sentence)
|
|
117
|
+
- (70.0 × conjunct_density)
|
|
118
|
+
- (8.0 × matra_complexity)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### 2. Hindi Grade Level (HGL)
|
|
122
|
+
Maps HRS to Indian school grades (CBSE Class 1 to College+).
|
|
123
|
+
|
|
124
|
+
### 3. Hindi Complexity Index (HCI)
|
|
125
|
+
A normalized 0–1 score. Lower = easier. Useful for ML pipelines.
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## Why These Formulas Are Different
|
|
130
|
+
|
|
131
|
+
| Feature | English (Flesch-Kincaid) | Hindi (this package) |
|
|
132
|
+
|---------|--------------------------|---------------------|
|
|
133
|
+
| Syllable counting | English phoneme rules | Devanagari matra-based |
|
|
134
|
+
| Conjunct detection | Not applicable | ā Virama-based detection |
|
|
135
|
+
| Script-aware | No | ā Full Unicode U+0900āU+097F |
|
|
136
|
+
| Long vowel complexity | No | ā Guru/laghu distinction |
|
|
137
|
+
| CBSE grade mapping | No | ā Class 1ā12 + College |
|
|
138
|
+
|
|
139
|
+
**Conjunct consonants** (संयुक्त अक्षर) — formed when a virama (्) joins two consonants — are the primary marker of Sanskrit-origin vocabulary. They appear in tatsam words (तत्सम), which are significantly harder for younger readers. This package detects them automatically using Unicode analysis.
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## What Is Solved vs. What This Package Solves
|
|
144
|
+
|
|
145
|
+
### Already solved (for English)
|
|
146
|
+
- Flesch Reading Ease (1948)
|
|
147
|
+
- Flesch-Kincaid Grade Level (1975)
|
|
148
|
+
- Gunning Fog Index (1952)
|
|
149
|
+
|
|
150
|
+
### What this package solves (first ever for Hindi)
|
|
151
|
+
- Matra-aware syllable counting
|
|
152
|
+
- Conjunct consonant density as a difficulty signal
|
|
153
|
+
- CBSE-aligned grade level output
|
|
154
|
+
- Actionable simplification suggestions in Hindi
|
|
155
|
+
|
|
156
|
+
### Still open (future research / dissertation topics)
|
|
157
|
+
- Validation against human-graded Hindi texts (labeled corpus needed)
|
|
158
|
+
- Domain-specific calibration (news vs. textbooks vs. legal)
|
|
159
|
+
- Extension to Marathi (also written in Devanagari) and to Bengali and Gujarati (related Brahmic scripts with their own Unicode blocks)
|
|
160
|
+
- Hinglish (code-mixed Hindi-English) readability
|
|
161
|
+
|
|
162
|
+
---
|
|
163
|
+
|
|
164
|
+
## API Reference
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
ReadabilityScorer.score(text) # Full report dict
|
|
168
|
+
ReadabilityScorer.compare(texts) # Rank list easiestāhardest
|
|
169
|
+
ReadabilityScorer.batch_score(texts) # Score list in order
|
|
170
|
+
ReadabilityScorer.is_appropriate_for_grade(text, grade) # bool
|
|
171
|
+
ReadabilityScorer.simplify_suggestions(text) # list of Hindi suggestions
|
|
172
|
+
|
|
173
|
+
# Low-level functions
|
|
174
|
+
hindi_readability_score(text) # float 0-100
|
|
175
|
+
hindi_grade_level(text) # dict {grade, grade_label, cbse_level}
|
|
176
|
+
hindi_complexity_index(text) # float 0-1
|
|
177
|
+
analyse(text) # dict of raw script counts
|
|
178
|
+
syllables_per_word(text) # float
|
|
179
|
+
conjunct_density(text) # conjuncts per 100 words
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## Citation
|
|
185
|
+
|
|
186
|
+
If you use this package in academic work:
|
|
187
|
+
|
|
188
|
+
```
|
|
189
|
+
@software{hindi_readability,
|
|
190
|
+
author = {Prabhat Chaudhary},
|
|
191
|
+
title = {hindi-readability: The First Python Package for Hindi Text Readability},
|
|
192
|
+
year = {2025},
|
|
193
|
+
publisher = {PyPI},
|
|
194
|
+
url = {https://pypi.org/project/hindi-readability/}
|
|
195
|
+
}
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
---
|
|
199
|
+
|
|
200
|
+
## License
|
|
201
|
+
|
|
202
|
+
MIT — free for academic and commercial use.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
hindi_readability/__init__.py
|
|
5
|
+
hindi_readability/formulas.py
|
|
6
|
+
hindi_readability/scorer.py
|
|
7
|
+
hindi_readability/script.py
|
|
8
|
+
hindi_readability.egg-info/PKG-INFO
|
|
9
|
+
hindi_readability.egg-info/SOURCES.txt
|
|
10
|
+
hindi_readability.egg-info/dependency_links.txt
|
|
11
|
+
hindi_readability.egg-info/requires.txt
|
|
12
|
+
hindi_readability.egg-info/top_level.txt
|
|
13
|
+
tests/test_all.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
hindi_readability
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Build backend: the package is built with setuptools (PEP 517).
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"

# Core package metadata (PEP 621).
[project]
name = "hindi-readability"
version = "0.1.0"
description = "The first Python package for measuring readability of Hindi text using Devanagari-aware formulas"
readme = "README.md"
license = { text = "MIT" }
authors = [{ name = "Prabhat Chaudhary", email = "raja1999chaudhary@gmail.com" }]
keywords = [
    "hindi", "readability", "nlp", "devanagari", "indic",
    "flesch", "grade-level", "text-analysis", "education"
]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "Intended Audience :: Education",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Text Processing :: Linguistic",
    "Natural Language :: Hindi",
]
# NOTE(review): the classifiers above start at Python 3.9 and the README says
# "Pure Python 3.9+", but this allows installation on 3.8 - confirm which
# minimum version is actually intended.
requires-python = ">=3.8"
# No runtime dependencies: the package uses the standard library only.
dependencies = []

# Extras for development: test runner plus build/upload tooling.
[project.optional-dependencies]
dev = ["pytest>=7", "build", "twine"]

[project.urls]
Homepage = "https://github.com/Erprabhat8423/hindi-readability"
Repository = "https://github.com/Erprabhat8423/hindi-readability"

# Package discovery: only the hindi_readability package (and subpackages)
# under the project root is included, so tests/ is not packaged.
[tool.setuptools.packages.find]
where = ["."]
include = ["hindi_readability*"]
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tests for hindi-readability
|
|
3
|
+
Run: python tests/test_all.py
|
|
4
|
+
"""
|
|
5
|
+
import sys, os
|
|
6
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
|
7
|
+
|
|
8
|
+
from hindi_readability import (
|
|
9
|
+
ReadabilityScorer,
|
|
10
|
+
analyse,
|
|
11
|
+
syllables_per_word,
|
|
12
|
+
conjunct_density,
|
|
13
|
+
hindi_readability_score,
|
|
14
|
+
hindi_grade_level,
|
|
15
|
+
hindi_complexity_index,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
rs = ReadabilityScorer()
|
|
19
|
+
|
|
20
|
+
# āā Real Hindi test sentences at different difficulty levels āāāāāāāāāāāāāāāāāā
|
|
21
|
+
|
|
22
|
+
# Very EASY ā short common words, no conjuncts (Class 1-2 level)
|
|
23
|
+
EASY = "यह ą¤ą¤ ą¤¬ą¤ą„ą¤ą¤¾ ą¤¹ą„ą„¤ वह ą¤ą„लता ą¤¹ą„ą„¤ ą¤ą¤° ą¤
ą¤ą„ą¤ą¤¾ ą¤¹ą„ą„¤ ą¤®ą¤¾ą¤ ą¤Ŗą¤¾ą¤Øą„ ą¤²ą¤¾ą¤ą„¤"
|
|
24
|
+
|
|
25
|
+
# MEDIUM ā standard newspaper Hindi (Class 6-8 level)
|
|
26
|
+
MEDIUM = "ą¤ą¤¾ą¤°ą¤¤ ą¤®ą„ą¤ ą¤¶ą¤æą¤ą„षा ą¤ą¤¾ ą¤øą„ą¤¤ą¤° ą¤¤ą„ą¤ą„ ą¤øą„ ą¤¬ą¤¦ą¤² रहा ą¤¹ą„ą„¤ ą¤øą¤°ą¤ą¤¾ą¤° ą¤Øą¤ ą¤Øą„ą¤¤ą¤æą¤Æą¤¾ą¤ बना ą¤°ą¤¹ą„ ą¤¹ą„ą„¤"
|
|
27
|
+
|
|
28
|
+
# HARD ā Sanskrit-heavy formal Hindi (Class 11-12 / college level)
|
|
29
|
+
HARD = "ą¤øą¤ą¤µą¤æą¤§ą¤¾ą¤Ø ą¤ą„ ą¤Ŗą„ą¤°ą¤øą„तावना ą¤®ą„ą¤ ą¤ą¤¾ą¤°ą¤¤ ą¤ą„ ą¤ą¤ ą¤øą¤ą¤Ŗą„ą¤°ą¤ą„, ą¤øą¤®ą¤¾ą¤ą¤µą¤¾ą¤¦ą„, ą¤§ą¤°ą„ą¤®ą¤Øą¤æą¤°ą¤Ŗą„ą¤ą„ष, ą¤²ą„ą¤ą¤¤ą¤¾ą¤ą¤¤ą„ą¤°ą¤æą¤ ą¤ą¤£ą¤°ą¤¾ą¤ą„य ą¤ą„षित ą¤ą¤æą¤Æą¤¾ ą¤ą¤Æą¤¾ ą¤¹ą„ą„¤"
|
|
30
|
+
|
|
31
|
+
# Accumulates (check name, outcome) pairs in the order the checks run.
results = []


def t(name, cond):
    """Record the outcome ``cond`` of the check called ``name``."""
    outcome = (name, cond)
    results.append(outcome)
|
|
36
|
+
|
|
37
|
+
# Raw script-level counts for the easiest and hardest fixtures.
data_easy = analyse(EASY)
data_hard = analyse(HARD)

t("analyse returns dict", isinstance(data_easy, dict))
t("words counted", data_easy["words"] > 0)
t("sentences counted", data_easy["sentences"] > 0)
t("syllables counted", data_easy["syllables"] > 0)
t("hard text has more conjuncts", data_hard["conjuncts"] > data_easy.get("conjuncts", 0))
t("hard text has more matras", data_hard["matras"] >= data_easy["matras"])

# syllables_per_word
syl_easy = syllables_per_word(EASY)
syl_hard = syllables_per_word(HARD)
t("syllables_per_word > 0", syl_easy > 0)
t("hard has more syl/word", syl_hard >= syl_easy)

# conjunct_density
cd_easy = conjunct_density(EASY)
cd_hard = conjunct_density(HARD)
t("conjunct_density >= 0", cd_easy >= 0)
t("hard has higher density", cd_hard > cd_easy)

# -- Formula tests --
hrs_easy = hindi_readability_score(EASY)
hrs_medium = hindi_readability_score(MEDIUM)
hrs_hard = hindi_readability_score(HARD)

t("HRS in 0-100 range (easy)", 0 <= hrs_easy <= 100)
t("HRS in 0-100 range (hard)", 0 <= hrs_hard <= 100)
# Higher HRS means easier, so the scores must fall strictly with difficulty.
t("easy > medium HRS", hrs_easy > hrs_medium)
t("medium > hard HRS", hrs_medium > hrs_hard)

grade_easy = hindi_grade_level(EASY)
grade_hard = hindi_grade_level(HARD)
t("grade dict has keys", "grade" in grade_easy and "grade_label" in grade_easy)
# NOTE(review): the check name says "<" but the comparison is "<=", so equal
# grades also pass - confirm which is intended.
t("easy grade < hard grade", grade_easy["grade"] <= grade_hard["grade"])
t("grade 1-13 range", 1 <= grade_easy["grade"] <= 13)

hci_easy = hindi_complexity_index(EASY)
hci_hard = hindi_complexity_index(HARD)
t("HCI in 0-1 range", 0 <= hci_easy <= 1)
t("easy HCI < hard HCI", hci_easy < hci_hard)

# -- Scorer API tests --
# The report returned by score() must expose every documented key.
result = rs.score(EASY)
t("score() returns dict", isinstance(result, dict))
t("hrs key present", "hrs" in result)
t("label key present", "label" in result)
t("grade key present", "grade" in result)
t("cbse_level key present", "cbse_level" in result)
t("hci key present", "hci" in result)
t("raw key present", "raw" in result)
t("syllables_per_word key", "syllables_per_word" in result)
t("conjunct_density key", "conjunct_density" in result)

# compare() sorts easiest first
compared = rs.compare([HARD, EASY, MEDIUM])
t("compare() returns list", isinstance(compared, list))
# Only the first and last entries are compared, not the full ordering.
t("compare() sorts easy first", compared[0]["hrs"] >= compared[-1]["hrs"])
t("compare length correct", len(compared) == 3)

# batch_score
batch = rs.batch_score([EASY, MEDIUM, HARD])
t("batch_score returns list", len(batch) == 3)
t("batch first is easy", batch[0]["hrs"] > batch[2]["hrs"])

# is_appropriate_for_grade
# NOTE(review): the EASY fixture is described as Class 1-2 level, while
# is_appropriate_for_grade only accepts a difference of one grade, so this
# check expects EASY to be graded 6-8 - confirm the intended target grade.
t("easy text ok for grade 7", rs.is_appropriate_for_grade(EASY, 7))
t("hard text not ok for grade 5", not rs.is_appropriate_for_grade(HARD, 5))

# simplify_suggestions
sugg = rs.simplify_suggestions(HARD)
t("suggestions is list", isinstance(sugg, list))
t("suggestions not empty", len(sugg) > 0)

# empty text raises ValueError
try:
    rs.score("")
    # Reached only when score("") did not raise: record a failure.
    t("empty text raises error", False)
except ValueError:
    t("empty text raises error", True)
|
|
118
|
+
|
|
119
|
+
# āā Print results āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
120
|
+
# Summarise the recorded checks.
passed = sum(1 for _, ok in results if ok)
failed = [(name, ok) for name, ok in results if not ok]
print(f"\nTests: {passed}/{len(results)} passed")
if failed:
    print("FAILED:", [name for name, _ in failed])
else:
    # U+2713 (check mark) written as an escape to survive encoding problems.
    print("All tests passed! \u2713")

# -- Print sample output --
# The rule characters (U+2500) are written as escapes; both headings are
# padded to the same total width.
print("\n\u2500\u2500 Sample output (easy text) " + "\u2500" * 30)
r = rs.score(EASY)
for k, v in r.items():
    # The raw count dict is verbose, so it is left out of the sample.
    if k != "raw":
        print(f" {k:25}: {v}")

print("\n\u2500\u2500 HRS comparison across difficulty levels " + "\u2500" * 16)
for label, text in [("Easy", EASY), ("Medium", MEDIUM), ("Hard", HARD)]:
    r = rs.score(text)
    print(f" {label:8}: HRS={r['hrs']:5.1f} Grade={r['grade_label']:12} Label={r['label']}")

# Signal failure to the caller (shell, CI) instead of always exiting with 0.
if failed:
    sys.exit(1)
|