pakgender 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Sahib Dino
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,278 @@
1
+ Metadata-Version: 2.4
2
+ Name: pakgender
3
+ Version: 0.1.0
4
+ Summary: Gender inference for Pakistani names using a 3-layer pipeline: dictionary, rules, and ML.
5
+ Author-email: Sahib Dino <dino28575@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/dino28575/pakgender
8
+ Project-URL: Repository, https://github.com/dino28575/pakgender
9
+ Keywords: nlp,pakistan,urdu,names,gender,arabic
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Topic :: Text Processing :: Linguistic
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Requires-Python: >=3.9
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: scikit-learn>=1.3
19
+ Provides-Extra: cli
20
+ Requires-Dist: click>=8.0; extra == "cli"
21
+ Provides-Extra: dev
22
+ Requires-Dist: pytest>=7.0; extra == "dev"
23
+ Requires-Dist: pytest-cov; extra == "dev"
24
+ Dynamic: license-file
25
+
26
+ # pakgender
27
+
28
+ **Gender inference for Pakistani and Arabic names — built for South Asian data pipelines.**
29
+
30
+ [![PyPI version](https://img.shields.io/pypi/v/pakgender.svg)](https://pypi.org/project/pakgender/)
31
+ [![Python](https://img.shields.io/pypi/pyversions/pakgender.svg)](https://pypi.org/project/pakgender/)
32
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
33
+
34
+ ---
35
+
36
+ Most gender inference libraries (like `genderize.io`) perform poorly on Pakistani names — they mis-classify names like `Shehnaz`, `Maryam`, and `Saifullah` because they are trained almost exclusively on Western data.
37
+
38
+ `pakgender` was built specifically for this gap. It uses a three-layer pipeline — dictionary lookup, rule-based suffix/prefix analysis, and a character n-gram ML model — all tuned for Urdu, Arabic, and Persian-origin names written in Roman script.
39
+
40
+ ---
41
+
42
+ ## Features
43
+
44
+ - **5,100+ name dictionary** covering Pakistani CBS records and Arabic names
45
+ - **Spelling variant normalisation** — `Aisha`, `Ayesha`, `Aesha` all resolve to the same entry
46
+ - **Honorific stripping** — handles `Muhammad`, `Mst.`, `Ch.`, `Syed`, `Begum`, and 30+ other prefixes
47
+ - **Rule engine** — 25+ suffix patterns (`-ullah`, `-bano`, `-een`, `-naz`, `-uddin`) with confidence scores
48
+ - **ML fallback** — character n-gram Logistic Regression (82% accuracy, ±0.6% CV variance) for names outside the dictionary
49
+ - **Batch processing** — native `pandas` Series support for large datasets
50
+ - **Fully offline** — no API calls, no internet required; model ships with the package (~185 KB)
51
+
52
+ ---
53
+
54
+ ## Installation
55
+
56
+ ```bash
57
+ pip install pakgender
58
+ ```
59
+
60
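+ The only declared dependency is `scikit-learn`. `pandas` is also imported when the package loads (it backs `predict_series`) but is not installed automatically, so install it as well: `pip install pandas`.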
61
+
62
+ ---
63
+
64
+ ## Quick start
65
+
66
+ ```python
67
+ from pakgender import predict
68
+
69
+ result = predict("Fatima Noor")
70
+ print(result)
71
+ # GenderResult(gender='F', confidence=0.95, source='dict', matched_token='fatima')
72
+
73
+ print(result.gender) # 'F'
74
+ print(result.confidence) # 0.95
75
+ print(result.source) # 'dict' — which layer answered: dict / rule / ml
76
+ print(result.is_known()) # True
77
+ ```
78
+
79
+ ### Handles real-world messy names
80
+
81
+ ```python
82
+ from pakgender import predict
83
+
84
+ predict("MUHAMMAD AHSAN RAZA") # strips Muhammad prefix → GenderResult(gender='M', ...)
85
+ predict("Mst. Zara Bibi") # Mst. signals female → GenderResult(gender='F', ...)
86
+ predict("Ch. Imran Khan") # strips Ch. and Khan → GenderResult(gender='M', ...)
87
+ predict("Aisha") # normalises Aisha → Ayesha → dict lookup
88
+ predict("Saifullah") # rule: -ullah suffix → GenderResult(gender='M', confidence=0.95, source='rule', ...)
89
+ predict("xyz123") # GenderResult(gender='U', confidence=0.0, source='none', ...)
90
+ ```
91
+
92
+ ### Batch processing with pandas
93
+
94
+ ```python
95
+ import pandas as pd
96
+ from pakgender import predict_series
97
+
98
+ df = pd.read_excel("cbs_data.xlsx")
99
+
100
+ result = predict_series(df["Account_Title"])
101
+ df["gender"] = result["gender"] # 'M', 'F', or 'U'
102
+ df["confidence"] = result["confidence"] # 0.0 – 1.0
103
+ df["gender_src"] = result["source"] # 'dict', 'rule', or 'ml'
104
+ ```
105
+
106
+ ### Understanding the output
107
+
108
+ | Field | Values | Meaning |
109
+ |---|---|---|
110
+ | `gender` | `'M'`, `'F'`, `'U'` | Male, Female, Unknown/ambiguous |
111
+ | `confidence` | `0.0` – `1.0` | How certain the prediction is |
112
+ | `source` | `'dict'`, `'rule'`, `'ml'`, `'none'` | Which layer answered (`'none'` when no layer could decide) |
113
+ | `matched_token` | e.g. `'fatima'` | The specific name token that triggered the result |
114
+
115
+ A confidence of `0.0` with `source='none'` means all three layers failed — treat this as `U`.
116
+
117
+ ---
118
+
119
+ ## How it works
120
+
121
+ ```
122
+ Input: "Mst. Fatima Noor"
123
+
124
+
125
+ ┌─────────────┐
126
+ │ Preprocessor│ strips Mst. (→ F signal), normalises spelling,
127
+ └──────┬──────┘ tokenises → candidates: ['fatima', 'noor']
128
+
129
+
130
+ ┌─────────────┐
131
+ │ Layer 1 │ looks up 'fatima' in 5,100+ name dictionary
132
+ │ Dictionary │ → hit: gender=F, confidence=0.95
133
+ └──────┬──────┘
134
+ │ (if miss or ambiguous)
135
+
136
+ ┌─────────────┐
137
+ │ Layer 2 │ checks suffix patterns: -bano, -naz, -een, -ullah,
138
+ │ Rules │ -uddin, -ara, -ul, and 20+ more
139
+ └──────┬──────┘
140
+ │ (if still ambiguous)
141
+
142
+ ┌─────────────┐
143
+ │ Layer 3 │ character n-gram (2–4) Logistic Regression
144
+ │ ML model │ trained on 5,100+ Pakistani + Arabic names
145
+ └─────────────┘
146
+
147
+
148
+ GenderResult(gender='F', confidence=0.95, source='dict', matched_token='fatima')
149
+ ```
150
+
151
+ ---
152
+
153
+ ## Supported name formats
154
+
155
+ | Format | Example | Handled |
156
+ |---|---|---|
157
+ | First name only | `Fatima` | ✓ |
158
+ | Full name | `Muhammad Ahsan Raza` | ✓ |
159
+ | With honorific prefix | `Mst. Zara`, `Ch. Imran`, `Syed Ali` | ✓ |
160
+ | With female honorific | `Begum Nusrat`, `Bibi Zulaikha` | ✓ |
161
+ | Spelling variants | `Aisha` / `Ayesha` / `Aesha` / `Aysha` | ✓ |
162
+ | Uppercase | `FATIMA MALIK` | ✓ |
163
+ | Abbreviated prefix | `M. Usman`, `Md. Tariq` | ✓ |
164
+ | Compound Islamic names | `Saifullah`, `Salahuddin`, `Nooruddin` | ✓ |
165
+
166
+ ---
167
+
168
+ ## Accuracy
169
+
170
+ Evaluated on a held-out 20% test split from the training dictionary:
171
+
172
+ | Metric | Value |
173
+ |---|---|
174
+ | Test accuracy | 82.0% |
175
+ | 5-fold CV accuracy | 81.4% ± 0.6% |
176
+ | Female precision / recall | 0.84 / 0.83 |
177
+ | Male precision / recall | 0.80 / 0.81 |
178
+
179
+ The low CV variance (±0.6%) means the model generalises consistently — it is not sensitive to which names end up in the test split.
180
+
181
+ The dictionary layer (Layer 1) handles the majority of common Pakistani names with confidence ≥ 0.95. The 82% figure reflects the ML fallback layer only, which activates for names outside the dictionary.
182
+
183
+ ---
184
+
185
+ ## Expanding the dictionary
186
+
187
+ The dictionary is a plain JSON file bundled with the package. You can add your own names without retraining:
188
+
189
+ ```python
190
+ # add_names.py — run once, then reinstall or point to custom path
191
+ import json
192
+ from pathlib import Path
193
+ import importlib.resources
194
+
195
+ ref = importlib.resources.files("pakgender.data").joinpath("names.json")
196
+ with importlib.resources.as_file(ref) as path:
197
+ with open(path) as f:
198
+ db = json.load(f)
199
+
200
+ # Add new entries
201
+ db["first_names"]["bakhtawar"] = {"gender": "F", "frequency": "medium"}
202
+ db["first_names"]["zulaikha"] = {"gender": "F", "frequency": "low"}
203
+
204
+ with open(path, "w", encoding="utf-8") as f:
205
+ json.dump(db, f, ensure_ascii=False, indent=2)
206
+ ```
207
+
208
+ To retrain the ML model after a large dictionary update:
209
+
210
+ ```bash
211
+ python train_model.py
212
+ ```
213
+
214
+ ---
215
+
216
+ ## Honorifics and prefixes recognised
217
+
218
+ The preprocessor strips these automatically before lookup:
219
+
220
+ **Neutral** (stripped silently): `Muhammad`, `Mohammed`, `Md`, `M.`, `Syed`, `Sayyid`, `Sheikh`, `Hafiz`, `Haji`, `Ch`, `Chaudhry`, `Raja`, `Rana`, `Malik`, `Khan`, `Mirza`, `Baig`, `Mian`, `Dr`, `Mr`
221
+
222
+ **Female signal** (stripped + gender hint set to F): `Begum`, `Bibi`, `Bano`, `Mst`, `Mst.`
223
+
224
+ **Male signal** (stripped + gender hint set to M): `Muhammad`, `Hafiz`, `Haji`, `Maulana`
225
+
226
+ ---
227
+
228
+ ## Spelling variants normalised
229
+
230
+ A sample of the variant map built into the preprocessor:
231
+
232
+ | Input variant | Normalised to |
233
+ |---|---|
234
+ | Aisha / Aesha / Aysha | Ayesha |
235
+ | Fatimah / Fatema | Fatima |
236
+ | Khadijah / Khadeeja | Khadija |
237
+ | Mariam / Marium / Maryum | Maryam |
238
+ | Hussain / Hussein | Husain |
239
+ | Hassan | Hasan |
240
+ | Nouman / Numaan | Noman |
241
+ | Nur / Nour | Noor |
242
+ | Zehra | Zahra |
243
+ | Ehsan / Ihsan | Ahsan |
244
+
245
+ ---
246
+
247
+ ## Limitations
248
+
249
+ - **Roman script only** — Urdu/Arabic script (`فاطمہ`) is not supported. Transliterate first if needed.
250
+ - **Ambiguous names** — names like `Noor`, `Akhtar`, `Gul` are used for both genders in Pakistan. These return `gender='U'` with a note in `source`. Add your own override rules using the dictionary.
251
+ - **Gulf vs South Asian conventions** — some names differ in gender between Pakistani and Gulf Arab usage (e.g. `Aman`, `Hani`). The library is tuned for Pakistani convention.
252
+ - **Post-processing recommended** — for high-stakes applications, filter on `confidence >= 0.75` and manually review rows where `source='ml'` and `confidence < 0.80`.
253
+
254
+ ---
255
+
256
+ ## Contributing
257
+
258
+ Contributions welcome — especially:
259
+ - Additional Pakistani name entries with correct gender labels
260
+ - Urdu/Persian-origin names missing from the dictionary
261
+ - Corrections to mislabeled entries
262
+
263
+ Please open an issue or pull request on [GitHub](https://github.com/dino28575/pakgender).
264
+
265
+ ---
266
+
267
+ ## License
268
+
269
+ MIT License. See `LICENSE` for details.
270
+
271
+ ---
272
+
273
+ ## Author
274
+
275
+ **Sahib Dino** — Data Analyst, Monitoring & Internal Control
276
+ [dino28575.github.io](https://dino28575.github.io) · [GitHub](https://github.com/dino28575)
277
+
278
+ Built to solve a real problem: automated gender verification in Pakistani banking records for CBS data quality audits.
@@ -0,0 +1,253 @@
1
+ # pakgender
2
+
3
+ **Gender inference for Pakistani and Arabic names — built for South Asian data pipelines.**
4
+
5
+ [![PyPI version](https://img.shields.io/pypi/v/pakgender.svg)](https://pypi.org/project/pakgender/)
6
+ [![Python](https://img.shields.io/pypi/pyversions/pakgender.svg)](https://pypi.org/project/pakgender/)
7
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
+
9
+ ---
10
+
11
+ Most gender inference libraries (like `genderize.io`) perform poorly on Pakistani names — they mis-classify names like `Shehnaz`, `Maryam`, and `Saifullah` because they are trained almost exclusively on Western data.
12
+
13
+ `pakgender` was built specifically for this gap. It uses a three-layer pipeline — dictionary lookup, rule-based suffix/prefix analysis, and a character n-gram ML model — all tuned for Urdu, Arabic, and Persian-origin names written in Roman script.
14
+
15
+ ---
16
+
17
+ ## Features
18
+
19
+ - **5,100+ name dictionary** covering Pakistani CBS records and Arabic names
20
+ - **Spelling variant normalisation** — `Aisha`, `Ayesha`, `Aesha` all resolve to the same entry
21
+ - **Honorific stripping** — handles `Muhammad`, `Mst.`, `Ch.`, `Syed`, `Begum`, and 30+ other prefixes
22
+ - **Rule engine** — 25+ suffix patterns (`-ullah`, `-bano`, `-een`, `-naz`, `-uddin`) with confidence scores
23
+ - **ML fallback** — character n-gram Logistic Regression (82% accuracy, ±0.6% CV variance) for names outside the dictionary
24
+ - **Batch processing** — native `pandas` Series support for large datasets
25
+ - **Fully offline** — no API calls, no internet required; model ships with the package (~185 KB)
26
+
27
+ ---
28
+
29
+ ## Installation
30
+
31
+ ```bash
32
+ pip install pakgender
33
+ ```
34
+
35
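+ The only declared dependency is `scikit-learn`. `pandas` is also imported when the package loads (it backs `predict_series`) but is not installed automatically, so install it as well: `pip install pandas`.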
36
+
37
+ ---
38
+
39
+ ## Quick start
40
+
41
+ ```python
42
+ from pakgender import predict
43
+
44
+ result = predict("Fatima Noor")
45
+ print(result)
46
+ # GenderResult(gender='F', confidence=0.95, source='dict', matched_token='fatima')
47
+
48
+ print(result.gender) # 'F'
49
+ print(result.confidence) # 0.95
50
+ print(result.source) # 'dict' — which layer answered: dict / rule / ml
51
+ print(result.is_known()) # True
52
+ ```
53
+
54
+ ### Handles real-world messy names
55
+
56
+ ```python
57
+ from pakgender import predict
58
+
59
+ predict("MUHAMMAD AHSAN RAZA") # strips Muhammad prefix → GenderResult(gender='M', ...)
60
+ predict("Mst. Zara Bibi") # Mst. signals female → GenderResult(gender='F', ...)
61
+ predict("Ch. Imran Khan") # strips Ch. and Khan → GenderResult(gender='M', ...)
62
+ predict("Aisha") # normalises Aisha → Ayesha → dict lookup
63
+ predict("Saifullah") # rule: -ullah suffix → GenderResult(gender='M', confidence=0.95, source='rule', ...)
64
+ predict("xyz123") # GenderResult(gender='U', confidence=0.0, source='none', ...)
65
+ ```
66
+
67
+ ### Batch processing with pandas
68
+
69
+ ```python
70
+ import pandas as pd
71
+ from pakgender import predict_series
72
+
73
+ df = pd.read_excel("cbs_data.xlsx")
74
+
75
+ result = predict_series(df["Account_Title"])
76
+ df["gender"] = result["gender"] # 'M', 'F', or 'U'
77
+ df["confidence"] = result["confidence"] # 0.0 – 1.0
78
+ df["gender_src"] = result["source"] # 'dict', 'rule', or 'ml'
79
+ ```
80
+
81
+ ### Understanding the output
82
+
83
+ | Field | Values | Meaning |
84
+ |---|---|---|
85
+ | `gender` | `'M'`, `'F'`, `'U'` | Male, Female, Unknown/ambiguous |
86
+ | `confidence` | `0.0` – `1.0` | How certain the prediction is |
87
+ | `source` | `'dict'`, `'rule'`, `'ml'`, `'none'` | Which layer answered (`'none'` when no layer could decide) |
88
+ | `matched_token` | e.g. `'fatima'` | The specific name token that triggered the result |
89
+
90
+ A confidence of `0.0` with `source='none'` means all three layers failed — treat this as `U`.
91
+
92
+ ---
93
+
94
+ ## How it works
95
+
96
+ ```
97
+ Input: "Mst. Fatima Noor"
98
+
99
+
100
+ ┌─────────────┐
101
+ │ Preprocessor│ strips Mst. (→ F signal), normalises spelling,
102
+ └──────┬──────┘ tokenises → candidates: ['fatima', 'noor']
103
+
104
+
105
+ ┌─────────────┐
106
+ │ Layer 1 │ looks up 'fatima' in 5,100+ name dictionary
107
+ │ Dictionary │ → hit: gender=F, confidence=0.95
108
+ └──────┬──────┘
109
+ │ (if miss or ambiguous)
110
+
111
+ ┌─────────────┐
112
+ │ Layer 2 │ checks suffix patterns: -bano, -naz, -een, -ullah,
113
+ │ Rules │ -uddin, -ara, -ul, and 20+ more
114
+ └──────┬──────┘
115
+ │ (if still ambiguous)
116
+
117
+ ┌─────────────┐
118
+ │ Layer 3 │ character n-gram (2–4) Logistic Regression
119
+ │ ML model │ trained on 5,100+ Pakistani + Arabic names
120
+ └─────────────┘
121
+
122
+
123
+ GenderResult(gender='F', confidence=0.95, source='dict', matched_token='fatima')
124
+ ```
125
+
126
+ ---
127
+
128
+ ## Supported name formats
129
+
130
+ | Format | Example | Handled |
131
+ |---|---|---|
132
+ | First name only | `Fatima` | ✓ |
133
+ | Full name | `Muhammad Ahsan Raza` | ✓ |
134
+ | With honorific prefix | `Mst. Zara`, `Ch. Imran`, `Syed Ali` | ✓ |
135
+ | With female honorific | `Begum Nusrat`, `Bibi Zulaikha` | ✓ |
136
+ | Spelling variants | `Aisha` / `Ayesha` / `Aesha` / `Aysha` | ✓ |
137
+ | Uppercase | `FATIMA MALIK` | ✓ |
138
+ | Abbreviated prefix | `M. Usman`, `Md. Tariq` | ✓ |
139
+ | Compound Islamic names | `Saifullah`, `Salahuddin`, `Nooruddin` | ✓ |
140
+
141
+ ---
142
+
143
+ ## Accuracy
144
+
145
+ Evaluated on a held-out 20% test split from the training dictionary:
146
+
147
+ | Metric | Value |
148
+ |---|---|
149
+ | Test accuracy | 82.0% |
150
+ | 5-fold CV accuracy | 81.4% ± 0.6% |
151
+ | Female precision / recall | 0.84 / 0.83 |
152
+ | Male precision / recall | 0.80 / 0.81 |
153
+
154
+ The low CV variance (±0.6%) means the model generalises consistently — it is not sensitive to which names end up in the test split.
155
+
156
+ The dictionary layer (Layer 1) handles the majority of common Pakistani names with confidence ≥ 0.95. The 82% figure reflects the ML fallback layer only, which activates for names outside the dictionary.
157
+
158
+ ---
159
+
160
+ ## Expanding the dictionary
161
+
162
+ The dictionary is a plain JSON file bundled with the package. You can add your own names without retraining:
163
+
164
+ ```python
165
+ # add_names.py — run once, then reinstall or point to custom path
166
+ import json
167
+ from pathlib import Path
168
+ import importlib.resources
169
+
170
+ ref = importlib.resources.files("pakgender.data").joinpath("names.json")
171
+ with importlib.resources.as_file(ref) as path:
172
+ with open(path) as f:
173
+ db = json.load(f)
174
+
175
+ # Add new entries
176
+ db["first_names"]["bakhtawar"] = {"gender": "F", "frequency": "medium"}
177
+ db["first_names"]["zulaikha"] = {"gender": "F", "frequency": "low"}
178
+
179
+ with open(path, "w", encoding="utf-8") as f:
180
+ json.dump(db, f, ensure_ascii=False, indent=2)
181
+ ```
182
+
183
+ To retrain the ML model after a large dictionary update:
184
+
185
+ ```bash
186
+ python train_model.py
187
+ ```
188
+
189
+ ---
190
+
191
+ ## Honorifics and prefixes recognised
192
+
193
+ The preprocessor strips these automatically before lookup:
194
+
195
+ **Neutral** (stripped silently): `Muhammad`, `Mohammed`, `Md`, `M.`, `Syed`, `Sayyid`, `Sheikh`, `Hafiz`, `Haji`, `Ch`, `Chaudhry`, `Raja`, `Rana`, `Malik`, `Khan`, `Mirza`, `Baig`, `Mian`, `Dr`, `Mr`
196
+
197
+ **Female signal** (stripped + gender hint set to F): `Begum`, `Bibi`, `Bano`, `Mst`, `Mst.`
198
+
199
+ **Male signal** (stripped + gender hint set to M): `Muhammad`, `Hafiz`, `Haji`, `Maulana`
200
+
201
+ ---
202
+
203
+ ## Spelling variants normalised
204
+
205
+ A sample of the variant map built into the preprocessor:
206
+
207
+ | Input variant | Normalised to |
208
+ |---|---|
209
+ | Aisha / Aesha / Aysha | Ayesha |
210
+ | Fatimah / Fatema | Fatima |
211
+ | Khadijah / Khadeeja | Khadija |
212
+ | Mariam / Marium / Maryum | Maryam |
213
+ | Hussain / Hussein | Husain |
214
+ | Hassan | Hasan |
215
+ | Nouman / Numaan | Noman |
216
+ | Nur / Nour | Noor |
217
+ | Zehra | Zahra |
218
+ | Ehsan / Ihsan | Ahsan |
219
+
220
+ ---
221
+
222
+ ## Limitations
223
+
224
+ - **Roman script only** — Urdu/Arabic script (`فاطمہ`) is not supported. Transliterate first if needed.
225
+ - **Ambiguous names** — names like `Noor`, `Akhtar`, `Gul` are used for both genders in Pakistan. These return `gender='U'` with a note in `source`. Add your own override rules using the dictionary.
226
+ - **Gulf vs South Asian conventions** — some names differ in gender between Pakistani and Gulf Arab usage (e.g. `Aman`, `Hani`). The library is tuned for Pakistani convention.
227
+ - **Post-processing recommended** — for high-stakes applications, filter on `confidence >= 0.75` and manually review rows where `source='ml'` and `confidence < 0.80`.
228
+
229
+ ---
230
+
231
+ ## Contributing
232
+
233
+ Contributions welcome — especially:
234
+ - Additional Pakistani name entries with correct gender labels
235
+ - Urdu/Persian-origin names missing from the dictionary
236
+ - Corrections to mislabeled entries
237
+
238
+ Please open an issue or pull request on [GitHub](https://github.com/dino28575/pakgender).
239
+
240
+ ---
241
+
242
+ ## License
243
+
244
+ MIT License. See `LICENSE` for details.
245
+
246
+ ---
247
+
248
+ ## Author
249
+
250
+ **Sahib Dino** — Data Analyst, Monitoring & Internal Control
251
+ [dino28575.github.io](https://dino28575.github.io) · [GitHub](https://github.com/dino28575)
252
+
253
+ Built to solve a real problem: automated gender verification in Pakistani banking records for CBS data quality audits.
@@ -0,0 +1,47 @@
1
+ """
2
+ pakgender
3
+ ~~~~~~~~~
4
+ Gender inference for Pakistani names.
5
+
6
+ Quick start::
7
+
8
+ from pakgender import predict, predict_series
9
+
10
+ result = predict("Fatima Noor")
11
+ print(result)
12
+ # GenderResult(gender='F', confidence=0.97, source='dict', matched_token='Fatima')
13
+
14
+ import pandas as pd
15
+ df = pd.DataFrame({"name": ["Muhammad Ahsan", "Zara Malik", "Noor Ali"]})
16
+ df[["gender", "confidence"]] = predict_series(df["name"])[["gender", "confidence"]]
17
+ """
18
+
19
+ from .predictor import Predictor
20
+ from .batch import predict_series
21
+ from ._version import __version__
22
+
23
+ __all__ = ["predict", "predict_series", "Predictor", "__version__"]
24
+
25
# Shared Predictor used by predict(); created lazily on the first call.
# The annotation is quoted on purpose: module-level annotations are evaluated
# at import time, and the PEP 604 form ``Predictor | None`` raises TypeError on
# Python 3.9, which the package metadata (Requires-Python >=3.9) still allows.
_default_predictor: "Predictor | None" = None


def predict(name: str) -> "GenderResult":
    """
    Predict gender for a single name string.

    Parameters
    ----------
    name : str
        Any Pakistani name in Roman script, e.g. "Muhammad Ahsan Raza",
        "Fatima", "Begum Zara", "Ayesha".

    Returns
    -------
    GenderResult
        A named result with fields: gender, confidence, source, matched_token.
        gender is 'M', 'F', or 'U' (unknown).
    """
    global _default_predictor
    # Build the shared predictor once and reuse it for every later call.
    if _default_predictor is None:
        _default_predictor = Predictor()
    return _default_predictor.predict(name)
@@ -0,0 +1,41 @@
1
+ __version__ = "0.1.0"
2
+
3
+ from dataclasses import dataclass
4
+
5
+
6
@dataclass(frozen=True)
class GenderResult:
    """
    Immutable result returned by predict().

    Attributes
    ----------
    gender : str
        'M' (male), 'F' (female), or 'U' (unknown/ambiguous).
    confidence : float
        Score between 0.0 and 1.0. Higher = more certain.
            >= 0.75  dict hit
            >= 0.65  rule hit
            >= 0.50  ML prediction
            <  0.50  unknown
    source : str
        Which layer produced the answer: 'dict', 'rule', or 'ml'.
        The project README additionally shows 'none' (with confidence 0.0)
        for names that no layer could classify.
    matched_token : str
        The specific token (part of the name) that triggered the result.
    """
    gender: str
    confidence: float
    source: str
    matched_token: str

    def __repr__(self) -> str:
        """Return a constructor-style string with confidence shown to two decimals."""
        return (
            f"GenderResult(gender={self.gender!r}, "
            f"confidence={self.confidence:.2f}, "
            f"source={self.source!r}, "
            f"matched_token={self.matched_token!r})"
        )

    def is_known(self) -> bool:
        """Return True if gender is not unknown."""
        return self.gender != "U"
@@ -0,0 +1,49 @@
1
+ """
2
+ batch.py
3
+ --------
4
+ predict_series(series) — run predict() over a pandas Series of names.
5
+ Returns a DataFrame with columns [gender, confidence, source, matched_token].
6
+ """
7
+
8
+ import pandas as pd
9
+ from .predictor import Predictor
10
+
11
# One cached Predictor per ``use_ml`` setting, so a call with use_ml=False is
# never served by an ML-enabled predictor cached by an earlier call (and vice
# versa). The annotation is quoted so it is not evaluated at import time;
# subscripted builtins combined with project types in unquoted module-level
# annotations are a common source of import errors on older Pythons.
_predictors: "dict[bool, Predictor]" = {}


def predict_series(series: pd.Series, use_ml: bool = True) -> pd.DataFrame:
    """
    Predict gender for every name in a pandas Series.

    Parameters
    ----------
    series : pd.Series
        A Series of name strings.
    use_ml : bool
        Whether to use the ML layer (default True). The flag is honoured on
        every call; predictors for each setting are built once and reused.

    Returns
    -------
    pd.DataFrame
        Columns: gender, confidence, source, matched_token.
        Index matches the input Series index.

    Example
    -------
    >>> df[["gender","confidence"]] = predict_series(df["CNIC_Name"])[["gender","confidence"]]
    """
    cache_key = bool(use_ml)
    predictor = _predictors.get(cache_key)
    if predictor is None:
        predictor = Predictor(use_ml=use_ml)
        _predictors[cache_key] = predictor

    # Run the predictor once per element, keeping input order so the rows line
    # up with ``series.index`` below.
    results = [predictor.predict(name) for name in series]

    return pd.DataFrame(
        {
            "gender": [r.gender for r in results],
            "confidence": [r.confidence for r in results],
            "source": [r.source for r in results],
            "matched_token": [r.matched_token for r in results],
        },
        index=series.index,
    )
Binary file