pakgender 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pakgender-0.1.0/LICENSE +21 -0
- pakgender-0.1.0/PKG-INFO +278 -0
- pakgender-0.1.0/README.md +253 -0
- pakgender-0.1.0/pakgender/__init__.py +47 -0
- pakgender-0.1.0/pakgender/_version.py +41 -0
- pakgender-0.1.0/pakgender/batch.py +49 -0
- pakgender-0.1.0/pakgender/data/model.pkl +0 -0
- pakgender-0.1.0/pakgender/data/names.json +28225 -0
- pakgender-0.1.0/pakgender/dictionary.py +145 -0
- pakgender-0.1.0/pakgender/ml_model.py +55 -0
- pakgender-0.1.0/pakgender/predictor.py +363 -0
- pakgender-0.1.0/pakgender/preprocessor.py +208 -0
- pakgender-0.1.0/pakgender/rules.py +228 -0
- pakgender-0.1.0/pakgender.egg-info/PKG-INFO +278 -0
- pakgender-0.1.0/pakgender.egg-info/SOURCES.txt +19 -0
- pakgender-0.1.0/pakgender.egg-info/dependency_links.txt +1 -0
- pakgender-0.1.0/pakgender.egg-info/entry_points.txt +2 -0
- pakgender-0.1.0/pakgender.egg-info/requires.txt +8 -0
- pakgender-0.1.0/pakgender.egg-info/top_level.txt +1 -0
- pakgender-0.1.0/pyproject.toml +41 -0
- pakgender-0.1.0/setup.cfg +4 -0
pakgender-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Sahib Dino
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
pakgender-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pakgender
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Gender inference for Pakistani names using a 3-layer pipeline: dictionary, rules, and ML.
|
|
5
|
+
Author-email: Sahib Dino <dino28575@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/dino28575/pakgender
|
|
8
|
+
Project-URL: Repository, https://github.com/dino28575/pakgender
|
|
9
|
+
Keywords: nlp,pakistan,urdu,names,gender,arabic
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Requires-Python: >=3.9
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: scikit-learn>=1.3
|
|
19
|
+
Provides-Extra: cli
|
|
20
|
+
Requires-Dist: click>=8.0; extra == "cli"
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
23
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
26
|
+
# pakgender
|
|
27
|
+
|
|
28
|
+
**Gender inference for Pakistani and Arabic names — built for South Asian data pipelines.**
|
|
29
|
+
|
|
30
|
+
[PyPI version](https://pypi.org/project/pakgender/)
|
|
31
|
+
[Python versions](https://pypi.org/project/pakgender/)
|
|
32
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
Most gender inference libraries (like `genderize.io`) perform poorly on Pakistani names — they mis-classify names like `Shehnaz`, `Maryam`, and `Saifullah` because they are trained almost exclusively on Western data.
|
|
37
|
+
|
|
38
|
+
`pakgender` was built specifically for this gap. It uses a three-layer pipeline — dictionary lookup, rule-based suffix/prefix analysis, and a character n-gram ML model — all tuned for Urdu, Arabic, and Persian-origin names written in Roman script.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## Features
|
|
43
|
+
|
|
44
|
+
- **5,100+ name dictionary** covering Pakistani CBS records and Arabic names
|
|
45
|
+
- **Spelling variant normalisation** — `Aisha`, `Ayesha`, `Aesha` all resolve to the same entry
|
|
46
|
+
- **Honorific stripping** — handles `Muhammad`, `Mst.`, `Ch.`, `Syed`, `Begum`, and 30+ other prefixes
|
|
47
|
+
- **Rule engine** — 25+ suffix patterns (`-ullah`, `-bano`, `-een`, `-naz`, `-uddin`) with confidence scores
|
|
48
|
+
- **ML fallback** — character n-gram Logistic Regression (82% accuracy, ±0.6% CV variance) for names outside the dictionary
|
|
49
|
+
- **Batch processing** — native `pandas` Series support for large datasets
|
|
50
|
+
- **Fully offline** — no API calls, no internet required; model ships with the package (~185 KB)
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## Installation
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install pakgender
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
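The only declared runtime dependency is `scikit-learn`. `pandas` must also be installed: the package imports it at import time for batch processing, but it is not listed in the package requirements.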
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Quick start
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
from pakgender import predict
|
|
68
|
+
|
|
69
|
+
result = predict("Fatima Noor")
|
|
70
|
+
print(result)
|
|
71
|
+
# GenderResult(gender='F', confidence=0.95, source='dict', matched_token='fatima')
|
|
72
|
+
|
|
73
|
+
print(result.gender) # 'F'
|
|
74
|
+
print(result.confidence) # 0.95
|
|
75
|
+
print(result.source) # 'dict' — which layer answered: dict / rule / ml
|
|
76
|
+
print(result.is_known()) # True
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Handles real-world messy names
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from pakgender import predict
|
|
83
|
+
|
|
84
|
+
predict("MUHAMMAD AHSAN RAZA") # strips Muhammad prefix → GenderResult(gender='M', ...)
|
|
85
|
+
predict("Mst. Zara Bibi") # Mst. signals female → GenderResult(gender='F', ...)
|
|
86
|
+
predict("Ch. Imran Khan") # strips Ch. and Khan → GenderResult(gender='M', ...)
|
|
87
|
+
predict("Aisha") # normalises Aisha → Ayesha → dict lookup
|
|
88
|
+
predict("Saifullah") # rule: -ullah suffix → GenderResult(gender='M', confidence=0.95, source='rule', ...)
|
|
89
|
+
predict("xyz123") # GenderResult(gender='U', confidence=0.0, source='none', ...)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Batch processing with pandas
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
import pandas as pd
|
|
96
|
+
from pakgender import predict_series
|
|
97
|
+
|
|
98
|
+
df = pd.read_excel("cbs_data.xlsx")
|
|
99
|
+
|
|
100
|
+
result = predict_series(df["Account_Title"])
|
|
101
|
+
df["gender"] = result["gender"] # 'M', 'F', or 'U'
|
|
102
|
+
df["confidence"] = result["confidence"] # 0.0 – 1.0
|
|
103
|
+
df["gender_src"] = result["source"] # 'dict', 'rule', or 'ml'
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Understanding the output
|
|
107
|
+
|
|
108
|
+
| Field | Values | Meaning |
|
|
109
|
+
|---|---|---|
|
|
110
|
+
| `gender` | `'M'`, `'F'`, `'U'` | Male, Female, Unknown/ambiguous |
|
|
111
|
+
| `confidence` | `0.0` – `1.0` | How certain the prediction is |
|
|
112
|
+
| `source` | `'dict'`, `'rule'`, `'ml'`, `'none'` | Which layer answered (`'none'` when no layer produced a result) |
|
|
113
|
+
| `matched_token` | e.g. `'fatima'` | The specific name token that triggered the result |
|
|
114
|
+
|
|
115
|
+
A confidence of `0.0` with `source='none'` means all three layers failed — treat this as `U`.
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## How it works
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
Input: "Mst. Fatima Noor"
|
|
123
|
+
│
|
|
124
|
+
▼
|
|
125
|
+
┌─────────────┐
|
|
126
|
+
│ Preprocessor│ strips Mst. (→ F signal), normalises spelling,
|
|
127
|
+
└──────┬──────┘ tokenises → candidates: ['fatima', 'noor']
|
|
128
|
+
│
|
|
129
|
+
▼
|
|
130
|
+
┌─────────────┐
|
|
131
|
+
│ Layer 1 │ looks up 'fatima' in 5,100+ name dictionary
|
|
132
|
+
│ Dictionary │ → hit: gender=F, confidence=0.95
|
|
133
|
+
└──────┬──────┘
|
|
134
|
+
│ (if miss or ambiguous)
|
|
135
|
+
▼
|
|
136
|
+
┌─────────────┐
|
|
137
|
+
│ Layer 2 │ checks suffix patterns: -bano, -naz, -een, -ullah,
|
|
138
|
+
│ Rules │ -uddin, -ara, -ul, and 20+ more
|
|
139
|
+
└──────┬──────┘
|
|
140
|
+
│ (if still ambiguous)
|
|
141
|
+
▼
|
|
142
|
+
┌─────────────┐
|
|
143
|
+
│ Layer 3 │ character n-gram (2–4) Logistic Regression
|
|
144
|
+
│ ML model │ trained on 5,100+ Pakistani + Arabic names
|
|
145
|
+
└─────────────┘
|
|
146
|
+
│
|
|
147
|
+
▼
|
|
148
|
+
GenderResult(gender='F', confidence=0.95, source='dict', matched_token='fatima')
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
## Supported name formats
|
|
154
|
+
|
|
155
|
+
| Format | Example | Handled |
|
|
156
|
+
|---|---|---|
|
|
157
|
+
| First name only | `Fatima` | ✓ |
|
|
158
|
+
| Full name | `Muhammad Ahsan Raza` | ✓ |
|
|
159
|
+
| With honorific prefix | `Mst. Zara`, `Ch. Imran`, `Syed Ali` | ✓ |
|
|
160
|
+
| With female honorific | `Begum Nusrat`, `Bibi Zulaikha` | ✓ |
|
|
161
|
+
| Spelling variants | `Aisha` / `Ayesha` / `Aesha` / `Aysha` | ✓ |
|
|
162
|
+
| Uppercase | `FATIMA MALIK` | ✓ |
|
|
163
|
+
| Abbreviated prefix | `M. Usman`, `Md. Tariq` | ✓ |
|
|
164
|
+
| Compound Islamic names | `Saifullah`, `Salahuddin`, `Nooruddin` | ✓ |
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Accuracy
|
|
169
|
+
|
|
170
|
+
Evaluated on a held-out 20% test split from the training dictionary:
|
|
171
|
+
|
|
172
|
+
| Metric | Value |
|
|
173
|
+
|---|---|
|
|
174
|
+
| Test accuracy | 82.0% |
|
|
175
|
+
| 5-fold CV accuracy | 81.4% ± 0.6% |
|
|
176
|
+
| Female precision / recall | 0.84 / 0.83 |
|
|
177
|
+
| Male precision / recall | 0.80 / 0.81 |
|
|
178
|
+
|
|
179
|
+
The low CV variance (±0.6%) means the model generalises consistently — it is not sensitive to which names end up in the test split.
|
|
180
|
+
|
|
181
|
+
The dictionary layer (Layer 1) handles the majority of common Pakistani names with confidence ≥ 0.95. The 82% figure reflects the ML fallback layer only, which activates for names outside the dictionary.
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## Expanding the dictionary
|
|
186
|
+
|
|
187
|
+
The dictionary is a plain JSON file bundled with the package. You can add your own names without retraining:
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
# add_names.py — run once, then reinstall or point to custom path
|
|
191
|
+
import json
|
|
192
|
+
from pathlib import Path
|
|
193
|
+
import importlib.resources
|
|
194
|
+
|
|
195
|
+
ref = importlib.resources.files("pakgender.data").joinpath("names.json")
|
|
196
|
+
with importlib.resources.as_file(ref) as path:
|
|
197
|
+
with open(path, encoding="utf-8") as f:
|
|
198
|
+
db = json.load(f)
|
|
199
|
+
|
|
200
|
+
# Add new entries
|
|
201
|
+
db["first_names"]["bakhtawar"] = {"gender": "F", "frequency": "medium"}
|
|
202
|
+
db["first_names"]["zulaikha"] = {"gender": "F", "frequency": "low"}
|
|
203
|
+
|
|
204
|
+
with open(path, "w", encoding="utf-8") as f:
|
|
205
|
+
json.dump(db, f, ensure_ascii=False, indent=2)
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
To retrain the ML model after a large dictionary update:
|
|
209
|
+
|
|
210
|
+
```bash
|
|
211
|
+
python train_model.py
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
---
|
|
215
|
+
|
|
216
|
+
## Honorifics and prefixes recognised
|
|
217
|
+
|
|
218
|
+
The preprocessor strips these automatically before lookup:
|
|
219
|
+
|
|
220
|
+
**Neutral** (stripped silently): `Muhammad`, `Mohammed`, `Md`, `M.`, `Syed`, `Sayyid`, `Sheikh`, `Hafiz`, `Haji`, `Ch`, `Chaudhry`, `Raja`, `Rana`, `Malik`, `Khan`, `Mirza`, `Baig`, `Mian`, `Dr`, `Mr`
|
|
221
|
+
|
|
222
|
+
**Female signal** (stripped + gender hint set to F): `Begum`, `Bibi`, `Bano`, `Mst`, `Mst.`
|
|
223
|
+
|
|
224
|
+
**Male signal** (stripped + gender hint set to M): `Muhammad`, `Hafiz`, `Haji`, `Maulana`
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
228
|
+
## Spelling variants normalised
|
|
229
|
+
|
|
230
|
+
A sample of the variant map built into the preprocessor:
|
|
231
|
+
|
|
232
|
+
| Input variant | Normalised to |
|
|
233
|
+
|---|---|
|
|
234
|
+
| Aisha / Aesha / Aysha | Ayesha |
|
|
235
|
+
| Fatimah / Fatema | Fatima |
|
|
236
|
+
| Khadijah / Khadeeja | Khadija |
|
|
237
|
+
| Mariam / Marium / Maryum | Maryam |
|
|
238
|
+
| Hussain / Hussein | Husain |
|
|
239
|
+
| Hassan | Hasan |
|
|
240
|
+
| Nouman / Numaan | Noman |
|
|
241
|
+
| Nur / Nour | Noor |
|
|
242
|
+
| Zehra | Zahra |
|
|
243
|
+
| Ehsan / Ihsan | Ahsan |
|
|
244
|
+
|
|
245
|
+
---
|
|
246
|
+
|
|
247
|
+
## Limitations
|
|
248
|
+
|
|
249
|
+
- **Roman script only** — Urdu/Arabic script (`فاطمہ`) is not supported. Transliterate first if needed.
|
|
250
|
+
- **Ambiguous names** — names like `Noor`, `Akhtar`, `Gul` are used for both genders in Pakistan. These return `gender='U'` with a note in `source`. Add your own override rules using the dictionary.
|
|
251
|
+
- **Gulf vs South Asian conventions** — some names differ in gender between Pakistani and Gulf Arab usage (e.g. `Aman`, `Hani`). The library is tuned for Pakistani convention.
|
|
252
|
+
- **Post-processing recommended** — for high-stakes applications, filter on `confidence >= 0.75` and manually review rows where `source='ml'` and `confidence < 0.80`.
|
|
253
|
+
|
|
254
|
+
---
|
|
255
|
+
|
|
256
|
+
## Contributing
|
|
257
|
+
|
|
258
|
+
Contributions welcome — especially:
|
|
259
|
+
- Additional Pakistani name entries with correct gender labels
|
|
260
|
+
- Urdu/Persian-origin names missing from the dictionary
|
|
261
|
+
- Corrections to mislabeled entries
|
|
262
|
+
|
|
263
|
+
Please open an issue or pull request on [GitHub](https://github.com/dino28575/pakgender).
|
|
264
|
+
|
|
265
|
+
---
|
|
266
|
+
|
|
267
|
+
## License
|
|
268
|
+
|
|
269
|
+
MIT License. See `LICENSE` for details.
|
|
270
|
+
|
|
271
|
+
---
|
|
272
|
+
|
|
273
|
+
## Author
|
|
274
|
+
|
|
275
|
+
**Sahib Dino** — Data Analyst, Monitoring & Internal Control
|
|
276
|
+
[dino28575.github.io](https://dino28575.github.io) · [GitHub](https://github.com/dino28575)
|
|
277
|
+
|
|
278
|
+
Built to solve a real problem: automated gender verification in Pakistani banking records for CBS data quality audits.
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
# pakgender
|
|
2
|
+
|
|
3
|
+
**Gender inference for Pakistani and Arabic names — built for South Asian data pipelines.**
|
|
4
|
+
|
|
5
|
+
[PyPI version](https://pypi.org/project/pakgender/)
|
|
6
|
+
[Python versions](https://pypi.org/project/pakgender/)
|
|
7
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
Most gender inference libraries (like `genderize.io`) perform poorly on Pakistani names — they mis-classify names like `Shehnaz`, `Maryam`, and `Saifullah` because they are trained almost exclusively on Western data.
|
|
12
|
+
|
|
13
|
+
`pakgender` was built specifically for this gap. It uses a three-layer pipeline — dictionary lookup, rule-based suffix/prefix analysis, and a character n-gram ML model — all tuned for Urdu, Arabic, and Persian-origin names written in Roman script.
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Features
|
|
18
|
+
|
|
19
|
+
- **5,100+ name dictionary** covering Pakistani CBS records and Arabic names
|
|
20
|
+
- **Spelling variant normalisation** — `Aisha`, `Ayesha`, `Aesha` all resolve to the same entry
|
|
21
|
+
- **Honorific stripping** — handles `Muhammad`, `Mst.`, `Ch.`, `Syed`, `Begum`, and 30+ other prefixes
|
|
22
|
+
- **Rule engine** — 25+ suffix patterns (`-ullah`, `-bano`, `-een`, `-naz`, `-uddin`) with confidence scores
|
|
23
|
+
- **ML fallback** — character n-gram Logistic Regression (82% accuracy, ±0.6% CV variance) for names outside the dictionary
|
|
24
|
+
- **Batch processing** — native `pandas` Series support for large datasets
|
|
25
|
+
- **Fully offline** — no API calls, no internet required; model ships with the package (~185 KB)
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install pakgender
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
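The only declared runtime dependency is `scikit-learn`. `pandas` must also be installed: the package imports it at import time for batch processing, but it is not listed in the package requirements.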
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## Quick start
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from pakgender import predict
|
|
43
|
+
|
|
44
|
+
result = predict("Fatima Noor")
|
|
45
|
+
print(result)
|
|
46
|
+
# GenderResult(gender='F', confidence=0.95, source='dict', matched_token='fatima')
|
|
47
|
+
|
|
48
|
+
print(result.gender) # 'F'
|
|
49
|
+
print(result.confidence) # 0.95
|
|
50
|
+
print(result.source) # 'dict' — which layer answered: dict / rule / ml
|
|
51
|
+
print(result.is_known()) # True
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Handles real-world messy names
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from pakgender import predict
|
|
58
|
+
|
|
59
|
+
predict("MUHAMMAD AHSAN RAZA") # strips Muhammad prefix → GenderResult(gender='M', ...)
|
|
60
|
+
predict("Mst. Zara Bibi") # Mst. signals female → GenderResult(gender='F', ...)
|
|
61
|
+
predict("Ch. Imran Khan") # strips Ch. and Khan → GenderResult(gender='M', ...)
|
|
62
|
+
predict("Aisha") # normalises Aisha → Ayesha → dict lookup
|
|
63
|
+
predict("Saifullah") # rule: -ullah suffix → GenderResult(gender='M', confidence=0.95, source='rule', ...)
|
|
64
|
+
predict("xyz123") # GenderResult(gender='U', confidence=0.0, source='none', ...)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Batch processing with pandas
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
import pandas as pd
|
|
71
|
+
from pakgender import predict_series
|
|
72
|
+
|
|
73
|
+
df = pd.read_excel("cbs_data.xlsx")
|
|
74
|
+
|
|
75
|
+
result = predict_series(df["Account_Title"])
|
|
76
|
+
df["gender"] = result["gender"] # 'M', 'F', or 'U'
|
|
77
|
+
df["confidence"] = result["confidence"] # 0.0 – 1.0
|
|
78
|
+
df["gender_src"] = result["source"] # 'dict', 'rule', or 'ml'
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Understanding the output
|
|
82
|
+
|
|
83
|
+
| Field | Values | Meaning |
|
|
84
|
+
|---|---|---|
|
|
85
|
+
| `gender` | `'M'`, `'F'`, `'U'` | Male, Female, Unknown/ambiguous |
|
|
86
|
+
| `confidence` | `0.0` – `1.0` | How certain the prediction is |
|
|
87
|
+
| `source` | `'dict'`, `'rule'`, `'ml'`, `'none'` | Which layer answered (`'none'` when no layer produced a result) |
|
|
88
|
+
| `matched_token` | e.g. `'fatima'` | The specific name token that triggered the result |
|
|
89
|
+
|
|
90
|
+
A confidence of `0.0` with `source='none'` means all three layers failed — treat this as `U`.
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## How it works
|
|
95
|
+
|
|
96
|
+
```
|
|
97
|
+
Input: "Mst. Fatima Noor"
|
|
98
|
+
│
|
|
99
|
+
▼
|
|
100
|
+
┌─────────────┐
|
|
101
|
+
│ Preprocessor│ strips Mst. (→ F signal), normalises spelling,
|
|
102
|
+
└──────┬──────┘ tokenises → candidates: ['fatima', 'noor']
|
|
103
|
+
│
|
|
104
|
+
▼
|
|
105
|
+
┌─────────────┐
|
|
106
|
+
│ Layer 1 │ looks up 'fatima' in 5,100+ name dictionary
|
|
107
|
+
│ Dictionary │ → hit: gender=F, confidence=0.95
|
|
108
|
+
└──────┬──────┘
|
|
109
|
+
│ (if miss or ambiguous)
|
|
110
|
+
▼
|
|
111
|
+
┌─────────────┐
|
|
112
|
+
│ Layer 2 │ checks suffix patterns: -bano, -naz, -een, -ullah,
|
|
113
|
+
│ Rules │ -uddin, -ara, -ul, and 20+ more
|
|
114
|
+
└──────┬──────┘
|
|
115
|
+
│ (if still ambiguous)
|
|
116
|
+
▼
|
|
117
|
+
┌─────────────┐
|
|
118
|
+
│ Layer 3 │ character n-gram (2–4) Logistic Regression
|
|
119
|
+
│ ML model │ trained on 5,100+ Pakistani + Arabic names
|
|
120
|
+
└─────────────┘
|
|
121
|
+
│
|
|
122
|
+
▼
|
|
123
|
+
GenderResult(gender='F', confidence=0.95, source='dict', matched_token='fatima')
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
## Supported name formats
|
|
129
|
+
|
|
130
|
+
| Format | Example | Handled |
|
|
131
|
+
|---|---|---|
|
|
132
|
+
| First name only | `Fatima` | ✓ |
|
|
133
|
+
| Full name | `Muhammad Ahsan Raza` | ✓ |
|
|
134
|
+
| With honorific prefix | `Mst. Zara`, `Ch. Imran`, `Syed Ali` | ✓ |
|
|
135
|
+
| With female honorific | `Begum Nusrat`, `Bibi Zulaikha` | ✓ |
|
|
136
|
+
| Spelling variants | `Aisha` / `Ayesha` / `Aesha` / `Aysha` | ✓ |
|
|
137
|
+
| Uppercase | `FATIMA MALIK` | ✓ |
|
|
138
|
+
| Abbreviated prefix | `M. Usman`, `Md. Tariq` | ✓ |
|
|
139
|
+
| Compound Islamic names | `Saifullah`, `Salahuddin`, `Nooruddin` | ✓ |
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## Accuracy
|
|
144
|
+
|
|
145
|
+
Evaluated on a held-out 20% test split from the training dictionary:
|
|
146
|
+
|
|
147
|
+
| Metric | Value |
|
|
148
|
+
|---|---|
|
|
149
|
+
| Test accuracy | 82.0% |
|
|
150
|
+
| 5-fold CV accuracy | 81.4% ± 0.6% |
|
|
151
|
+
| Female precision / recall | 0.84 / 0.83 |
|
|
152
|
+
| Male precision / recall | 0.80 / 0.81 |
|
|
153
|
+
|
|
154
|
+
The low CV variance (±0.6%) means the model generalises consistently — it is not sensitive to which names end up in the test split.
|
|
155
|
+
|
|
156
|
+
The dictionary layer (Layer 1) handles the majority of common Pakistani names with confidence ≥ 0.95. The 82% figure reflects the ML fallback layer only, which activates for names outside the dictionary.
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## Expanding the dictionary
|
|
161
|
+
|
|
162
|
+
The dictionary is a plain JSON file bundled with the package. You can add your own names without retraining:
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
# add_names.py — run once, then reinstall or point to custom path
|
|
166
|
+
import json
|
|
167
|
+
from pathlib import Path
|
|
168
|
+
import importlib.resources
|
|
169
|
+
|
|
170
|
+
ref = importlib.resources.files("pakgender.data").joinpath("names.json")
|
|
171
|
+
with importlib.resources.as_file(ref) as path:
|
|
172
|
+
with open(path, encoding="utf-8") as f:
|
|
173
|
+
db = json.load(f)
|
|
174
|
+
|
|
175
|
+
# Add new entries
|
|
176
|
+
db["first_names"]["bakhtawar"] = {"gender": "F", "frequency": "medium"}
|
|
177
|
+
db["first_names"]["zulaikha"] = {"gender": "F", "frequency": "low"}
|
|
178
|
+
|
|
179
|
+
with open(path, "w", encoding="utf-8") as f:
|
|
180
|
+
json.dump(db, f, ensure_ascii=False, indent=2)
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
To retrain the ML model after a large dictionary update:
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
python train_model.py
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
## Honorifics and prefixes recognised
|
|
192
|
+
|
|
193
|
+
The preprocessor strips these automatically before lookup:
|
|
194
|
+
|
|
195
|
+
**Neutral** (stripped silently): `Muhammad`, `Mohammed`, `Md`, `M.`, `Syed`, `Sayyid`, `Sheikh`, `Hafiz`, `Haji`, `Ch`, `Chaudhry`, `Raja`, `Rana`, `Malik`, `Khan`, `Mirza`, `Baig`, `Mian`, `Dr`, `Mr`
|
|
196
|
+
|
|
197
|
+
**Female signal** (stripped + gender hint set to F): `Begum`, `Bibi`, `Bano`, `Mst`, `Mst.`
|
|
198
|
+
|
|
199
|
+
**Male signal** (stripped + gender hint set to M): `Muhammad`, `Hafiz`, `Haji`, `Maulana`
|
|
200
|
+
|
|
201
|
+
---
|
|
202
|
+
|
|
203
|
+
## Spelling variants normalised
|
|
204
|
+
|
|
205
|
+
A sample of the variant map built into the preprocessor:
|
|
206
|
+
|
|
207
|
+
| Input variant | Normalised to |
|
|
208
|
+
|---|---|
|
|
209
|
+
| Aisha / Aesha / Aysha | Ayesha |
|
|
210
|
+
| Fatimah / Fatema | Fatima |
|
|
211
|
+
| Khadijah / Khadeeja | Khadija |
|
|
212
|
+
| Mariam / Marium / Maryum | Maryam |
|
|
213
|
+
| Hussain / Hussein | Husain |
|
|
214
|
+
| Hassan | Hasan |
|
|
215
|
+
| Nouman / Numaan | Noman |
|
|
216
|
+
| Nur / Nour | Noor |
|
|
217
|
+
| Zehra | Zahra |
|
|
218
|
+
| Ehsan / Ihsan | Ahsan |
|
|
219
|
+
|
|
220
|
+
---
|
|
221
|
+
|
|
222
|
+
## Limitations
|
|
223
|
+
|
|
224
|
+
- **Roman script only** — Urdu/Arabic script (`فاطمہ`) is not supported. Transliterate first if needed.
|
|
225
|
+
- **Ambiguous names** — names like `Noor`, `Akhtar`, `Gul` are used for both genders in Pakistan. These return `gender='U'` with a note in `source`. Add your own override rules using the dictionary.
|
|
226
|
+
- **Gulf vs South Asian conventions** — some names differ in gender between Pakistani and Gulf Arab usage (e.g. `Aman`, `Hani`). The library is tuned for Pakistani convention.
|
|
227
|
+
- **Post-processing recommended** — for high-stakes applications, filter on `confidence >= 0.75` and manually review rows where `source='ml'` and `confidence < 0.80`.
|
|
228
|
+
|
|
229
|
+
---
|
|
230
|
+
|
|
231
|
+
## Contributing
|
|
232
|
+
|
|
233
|
+
Contributions welcome — especially:
|
|
234
|
+
- Additional Pakistani name entries with correct gender labels
|
|
235
|
+
- Urdu/Persian-origin names missing from the dictionary
|
|
236
|
+
- Corrections to mislabeled entries
|
|
237
|
+
|
|
238
|
+
Please open an issue or pull request on [GitHub](https://github.com/dino28575/pakgender).
|
|
239
|
+
|
|
240
|
+
---
|
|
241
|
+
|
|
242
|
+
## License
|
|
243
|
+
|
|
244
|
+
MIT License. See `LICENSE` for details.
|
|
245
|
+
|
|
246
|
+
---
|
|
247
|
+
|
|
248
|
+
## Author
|
|
249
|
+
|
|
250
|
+
**Sahib Dino** — Data Analyst, Monitoring & Internal Control
|
|
251
|
+
[dino28575.github.io](https://dino28575.github.io) · [GitHub](https://github.com/dino28575)
|
|
252
|
+
|
|
253
|
+
Built to solve a real problem: automated gender verification in Pakistani banking records for CBS data quality audits.
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""
|
|
2
|
+
pakgender
|
|
3
|
+
~~~~~~~~~
|
|
4
|
+
Gender inference for Pakistani names.
|
|
5
|
+
|
|
6
|
+
Quick start::
|
|
7
|
+
|
|
8
|
+
from pakgender import predict, predict_series
|
|
9
|
+
|
|
10
|
+
result = predict("Fatima Noor")
|
|
11
|
+
print(result)
|
|
12
|
+
# GenderResult(gender='F', confidence=0.97, source='dict', matched_token='Fatima')
|
|
13
|
+
|
|
14
|
+
import pandas as pd
|
|
15
|
+
df = pd.DataFrame({"name": ["Muhammad Ahsan", "Zara Malik", "Noor Ali"]})
|
|
16
|
+
df[["gender", "confidence"]] = predict_series(df["name"])[["gender", "confidence"]]
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from .predictor import Predictor
|
|
20
|
+
from .batch import predict_series
|
|
21
|
+
from ._version import __version__
|
|
22
|
+
|
|
23
|
+
__all__ = ["predict", "predict_series", "Predictor", "__version__"]
|
|
24
|
+
|
|
25
|
+
# Process-wide predictor, created on the first predict() call and reused
# afterwards. The annotation is quoted on purpose: module-level annotations
# are evaluated at import time, and ``Predictor | None`` raises TypeError on
# Python 3.9, which the package metadata (Requires-Python: >=3.9) declares as
# supported.
_default_predictor: "Predictor | None" = None


def predict(name: str) -> "GenderResult":
    """
    Predict gender for a single name string.

    The first call builds a module-level ``Predictor`` with its default
    arguments; every later call reuses that same instance.

    Parameters
    ----------
    name : str
        Any Pakistani name in Roman script, e.g. "Muhammad Ahsan Raza",
        "Fatima", "Begum Zara", "Ayesha".

    Returns
    -------
    GenderResult
        A named result with fields: gender, confidence, source, matched_token.
        gender is 'M', 'F', or 'U' (unknown).
    """
    global _default_predictor
    # Build the shared predictor lazily so importing the package stays cheap.
    if _default_predictor is None:
        _default_predictor = Predictor()
    return _default_predictor.predict(name)
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass(frozen=True)
class GenderResult:
    """
    Frozen value object describing the outcome of one gender prediction.

    Attributes
    ----------
    gender : str
        One of 'M' (male), 'F' (female) or 'U' (unknown/ambiguous).
    confidence : float
        Certainty score from 0.0 to 1.0; larger means more certain.
        Documented bands:
            >= 0.75   dict hit
            >= 0.65   rule hit
            >= 0.50   ML prediction
            <  0.50   unknown
    source : str
        Name of the layer that produced the answer: 'dict', 'rule', or 'ml'.
    matched_token : str
        The part of the name that triggered the result.
    """

    gender: str
    confidence: float
    source: str
    matched_token: str

    def __repr__(self) -> str:
        # Confidence is rendered with two decimals; the other fields use repr().
        rendered_fields = ", ".join(
            [
                f"gender={self.gender!r}",
                f"confidence={self.confidence:.2f}",
                f"source={self.source!r}",
                f"matched_token={self.matched_token!r}",
            ]
        )
        return "GenderResult(" + rendered_fields + ")"

    def is_known(self) -> bool:
        """Tell whether a definite gender (anything other than 'U') was found."""
        if self.gender == "U":
            return False
        return True
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""
|
|
2
|
+
batch.py
|
|
3
|
+
--------
|
|
4
|
+
predict_series(series) — run predict() over a pandas Series of names.
|
|
5
|
+
Returns a DataFrame with columns [gender, confidence, source, matched_token].
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from .predictor import Predictor
|
|
10
|
+
|
|
11
|
+
# Predictor used by the most recent predict_series() call. The annotation is
# quoted on purpose: module-level annotations are evaluated at import time,
# and ``Predictor | None`` raises TypeError on Python 3.9, which the package
# metadata (Requires-Python: >=3.9) declares as supported.
_predictor: "Predictor | None" = None

# One cached predictor per ``use_ml`` setting, so that a call with
# use_ml=False never reuses a predictor that was built with use_ml=True.
_predictors_by_mode: "dict[bool, Predictor]" = {}


def predict_series(series: pd.Series, use_ml: bool = True) -> pd.DataFrame:
    """
    Predict gender for every name in a pandas Series.

    A ``Predictor`` is built on first use for each ``use_ml`` setting and
    then reused by later calls with the same setting.

    Parameters
    ----------
    series : pd.Series
        A Series of name strings.
    use_ml : bool
        Whether to use the ML layer (default True).

    Returns
    -------
    pd.DataFrame
        Columns: gender, confidence, source, matched_token.
        Index matches the input Series index.

    Example
    -------
    >>> df[["gender","confidence"]] = predict_series(df["CNIC_Name"])[["gender","confidence"]]
    """
    global _predictor

    mode = bool(use_ml)
    predictor = _predictors_by_mode.get(mode)
    if predictor is None:
        predictor = Predictor(use_ml=use_ml)
        _predictors_by_mode[mode] = predictor
    # Keep the legacy module-level name pointing at the predictor in use.
    _predictor = predictor

    results = series.apply(predictor.predict)

    return pd.DataFrame(
        {
            "gender": [r.gender for r in results],
            "confidence": [r.confidence for r in results],
            "source": [r.source for r in results],
            "matched_token": [r.matched_token for r in results],
        },
        index=series.index,
    )
|
|
Binary file
|