pointblank 0.18.0__py3-none-any.whl → 0.20.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +44 -1
- pointblank/_constants.py +258 -166
- pointblank/_constants_translations.py +378 -0
- pointblank/_interrogation.py +204 -0
- pointblank/_utils_llms_txt.py +20 -0
- pointblank/data/api-docs.txt +793 -1
- pointblank/field.py +1507 -0
- pointblank/generate/__init__.py +17 -0
- pointblank/generate/base.py +49 -0
- pointblank/generate/generators.py +573 -0
- pointblank/generate/regex.py +217 -0
- pointblank/locales/__init__.py +1476 -0
- pointblank/locales/data/AR/address.json +73 -0
- pointblank/locales/data/AR/company.json +60 -0
- pointblank/locales/data/AR/internet.json +19 -0
- pointblank/locales/data/AR/misc.json +7 -0
- pointblank/locales/data/AR/person.json +39 -0
- pointblank/locales/data/AR/text.json +38 -0
- pointblank/locales/data/AT/address.json +84 -0
- pointblank/locales/data/AT/company.json +65 -0
- pointblank/locales/data/AT/internet.json +20 -0
- pointblank/locales/data/AT/misc.json +8 -0
- pointblank/locales/data/AT/person.json +17 -0
- pointblank/locales/data/AT/text.json +35 -0
- pointblank/locales/data/AU/address.json +83 -0
- pointblank/locales/data/AU/company.json +65 -0
- pointblank/locales/data/AU/internet.json +20 -0
- pointblank/locales/data/AU/misc.json +8 -0
- pointblank/locales/data/AU/person.json +17 -0
- pointblank/locales/data/AU/text.json +35 -0
- pointblank/locales/data/BE/address.json +225 -0
- pointblank/locales/data/BE/company.json +129 -0
- pointblank/locales/data/BE/internet.json +36 -0
- pointblank/locales/data/BE/misc.json +6 -0
- pointblank/locales/data/BE/person.json +62 -0
- pointblank/locales/data/BE/text.json +38 -0
- pointblank/locales/data/BG/address.json +75 -0
- pointblank/locales/data/BG/company.json +60 -0
- pointblank/locales/data/BG/internet.json +19 -0
- pointblank/locales/data/BG/misc.json +7 -0
- pointblank/locales/data/BG/person.json +40 -0
- pointblank/locales/data/BG/text.json +38 -0
- pointblank/locales/data/BR/address.json +98 -0
- pointblank/locales/data/BR/company.json +65 -0
- pointblank/locales/data/BR/internet.json +20 -0
- pointblank/locales/data/BR/misc.json +8 -0
- pointblank/locales/data/BR/person.json +17 -0
- pointblank/locales/data/BR/text.json +35 -0
- pointblank/locales/data/CA/address.json +747 -0
- pointblank/locales/data/CA/company.json +120 -0
- pointblank/locales/data/CA/internet.json +24 -0
- pointblank/locales/data/CA/misc.json +11 -0
- pointblank/locales/data/CA/person.json +1033 -0
- pointblank/locales/data/CA/text.json +58 -0
- pointblank/locales/data/CH/address.json +184 -0
- pointblank/locales/data/CH/company.json +112 -0
- pointblank/locales/data/CH/internet.json +20 -0
- pointblank/locales/data/CH/misc.json +10 -0
- pointblank/locales/data/CH/person.json +64 -0
- pointblank/locales/data/CH/text.json +45 -0
- pointblank/locales/data/CL/address.json +71 -0
- pointblank/locales/data/CL/company.json +60 -0
- pointblank/locales/data/CL/internet.json +19 -0
- pointblank/locales/data/CL/misc.json +7 -0
- pointblank/locales/data/CL/person.json +38 -0
- pointblank/locales/data/CL/text.json +38 -0
- pointblank/locales/data/CN/address.json +124 -0
- pointblank/locales/data/CN/company.json +76 -0
- pointblank/locales/data/CN/internet.json +20 -0
- pointblank/locales/data/CN/misc.json +8 -0
- pointblank/locales/data/CN/person.json +50 -0
- pointblank/locales/data/CN/text.json +38 -0
- pointblank/locales/data/CO/address.json +76 -0
- pointblank/locales/data/CO/company.json +60 -0
- pointblank/locales/data/CO/internet.json +19 -0
- pointblank/locales/data/CO/misc.json +7 -0
- pointblank/locales/data/CO/person.json +38 -0
- pointblank/locales/data/CO/text.json +38 -0
- pointblank/locales/data/CY/address.json +62 -0
- pointblank/locales/data/CY/company.json +60 -0
- pointblank/locales/data/CY/internet.json +19 -0
- pointblank/locales/data/CY/misc.json +7 -0
- pointblank/locales/data/CY/person.json +38 -0
- pointblank/locales/data/CY/text.json +38 -0
- pointblank/locales/data/CZ/address.json +70 -0
- pointblank/locales/data/CZ/company.json +61 -0
- pointblank/locales/data/CZ/internet.json +19 -0
- pointblank/locales/data/CZ/misc.json +7 -0
- pointblank/locales/data/CZ/person.json +40 -0
- pointblank/locales/data/CZ/text.json +38 -0
- pointblank/locales/data/DE/address.json +756 -0
- pointblank/locales/data/DE/company.json +101 -0
- pointblank/locales/data/DE/internet.json +22 -0
- pointblank/locales/data/DE/misc.json +11 -0
- pointblank/locales/data/DE/person.json +1026 -0
- pointblank/locales/data/DE/text.json +50 -0
- pointblank/locales/data/DK/address.json +231 -0
- pointblank/locales/data/DK/company.json +65 -0
- pointblank/locales/data/DK/internet.json +20 -0
- pointblank/locales/data/DK/misc.json +7 -0
- pointblank/locales/data/DK/person.json +45 -0
- pointblank/locales/data/DK/text.json +43 -0
- pointblank/locales/data/EE/address.json +69 -0
- pointblank/locales/data/EE/company.json +60 -0
- pointblank/locales/data/EE/internet.json +19 -0
- pointblank/locales/data/EE/misc.json +7 -0
- pointblank/locales/data/EE/person.json +39 -0
- pointblank/locales/data/EE/text.json +38 -0
- pointblank/locales/data/ES/address.json +3086 -0
- pointblank/locales/data/ES/company.json +644 -0
- pointblank/locales/data/ES/internet.json +25 -0
- pointblank/locales/data/ES/misc.json +11 -0
- pointblank/locales/data/ES/person.json +488 -0
- pointblank/locales/data/ES/text.json +49 -0
- pointblank/locales/data/FI/address.json +93 -0
- pointblank/locales/data/FI/company.json +65 -0
- pointblank/locales/data/FI/internet.json +20 -0
- pointblank/locales/data/FI/misc.json +8 -0
- pointblank/locales/data/FI/person.json +17 -0
- pointblank/locales/data/FI/text.json +35 -0
- pointblank/locales/data/FR/address.json +619 -0
- pointblank/locales/data/FR/company.json +111 -0
- pointblank/locales/data/FR/internet.json +22 -0
- pointblank/locales/data/FR/misc.json +11 -0
- pointblank/locales/data/FR/person.json +1066 -0
- pointblank/locales/data/FR/text.json +50 -0
- pointblank/locales/data/GB/address.json +5759 -0
- pointblank/locales/data/GB/company.json +131 -0
- pointblank/locales/data/GB/internet.json +24 -0
- pointblank/locales/data/GB/misc.json +45 -0
- pointblank/locales/data/GB/person.json +578 -0
- pointblank/locales/data/GB/text.json +61 -0
- pointblank/locales/data/GR/address.json +68 -0
- pointblank/locales/data/GR/company.json +61 -0
- pointblank/locales/data/GR/internet.json +19 -0
- pointblank/locales/data/GR/misc.json +7 -0
- pointblank/locales/data/GR/person.json +39 -0
- pointblank/locales/data/GR/text.json +38 -0
- pointblank/locales/data/HK/address.json +79 -0
- pointblank/locales/data/HK/company.json +69 -0
- pointblank/locales/data/HK/internet.json +19 -0
- pointblank/locales/data/HK/misc.json +7 -0
- pointblank/locales/data/HK/person.json +42 -0
- pointblank/locales/data/HK/text.json +38 -0
- pointblank/locales/data/HR/address.json +73 -0
- pointblank/locales/data/HR/company.json +60 -0
- pointblank/locales/data/HR/internet.json +19 -0
- pointblank/locales/data/HR/misc.json +7 -0
- pointblank/locales/data/HR/person.json +38 -0
- pointblank/locales/data/HR/text.json +38 -0
- pointblank/locales/data/HU/address.json +70 -0
- pointblank/locales/data/HU/company.json +61 -0
- pointblank/locales/data/HU/internet.json +19 -0
- pointblank/locales/data/HU/misc.json +7 -0
- pointblank/locales/data/HU/person.json +40 -0
- pointblank/locales/data/HU/text.json +38 -0
- pointblank/locales/data/ID/address.json +68 -0
- pointblank/locales/data/ID/company.json +61 -0
- pointblank/locales/data/ID/internet.json +19 -0
- pointblank/locales/data/ID/misc.json +7 -0
- pointblank/locales/data/ID/person.json +40 -0
- pointblank/locales/data/ID/text.json +38 -0
- pointblank/locales/data/IE/address.json +643 -0
- pointblank/locales/data/IE/company.json +140 -0
- pointblank/locales/data/IE/internet.json +24 -0
- pointblank/locales/data/IE/misc.json +44 -0
- pointblank/locales/data/IE/person.json +55 -0
- pointblank/locales/data/IE/text.json +60 -0
- pointblank/locales/data/IN/address.json +92 -0
- pointblank/locales/data/IN/company.json +65 -0
- pointblank/locales/data/IN/internet.json +20 -0
- pointblank/locales/data/IN/misc.json +8 -0
- pointblank/locales/data/IN/person.json +52 -0
- pointblank/locales/data/IN/text.json +39 -0
- pointblank/locales/data/IS/address.json +63 -0
- pointblank/locales/data/IS/company.json +61 -0
- pointblank/locales/data/IS/internet.json +19 -0
- pointblank/locales/data/IS/misc.json +7 -0
- pointblank/locales/data/IS/person.json +44 -0
- pointblank/locales/data/IS/text.json +38 -0
- pointblank/locales/data/IT/address.json +192 -0
- pointblank/locales/data/IT/company.json +137 -0
- pointblank/locales/data/IT/internet.json +20 -0
- pointblank/locales/data/IT/misc.json +10 -0
- pointblank/locales/data/IT/person.json +70 -0
- pointblank/locales/data/IT/text.json +44 -0
- pointblank/locales/data/JP/address.json +713 -0
- pointblank/locales/data/JP/company.json +113 -0
- pointblank/locales/data/JP/internet.json +22 -0
- pointblank/locales/data/JP/misc.json +10 -0
- pointblank/locales/data/JP/person.json +1057 -0
- pointblank/locales/data/JP/text.json +51 -0
- pointblank/locales/data/KR/address.json +77 -0
- pointblank/locales/data/KR/company.json +68 -0
- pointblank/locales/data/KR/internet.json +19 -0
- pointblank/locales/data/KR/misc.json +7 -0
- pointblank/locales/data/KR/person.json +40 -0
- pointblank/locales/data/KR/text.json +38 -0
- pointblank/locales/data/LT/address.json +66 -0
- pointblank/locales/data/LT/company.json +60 -0
- pointblank/locales/data/LT/internet.json +19 -0
- pointblank/locales/data/LT/misc.json +7 -0
- pointblank/locales/data/LT/person.json +42 -0
- pointblank/locales/data/LT/text.json +38 -0
- pointblank/locales/data/LU/address.json +66 -0
- pointblank/locales/data/LU/company.json +60 -0
- pointblank/locales/data/LU/internet.json +19 -0
- pointblank/locales/data/LU/misc.json +7 -0
- pointblank/locales/data/LU/person.json +38 -0
- pointblank/locales/data/LU/text.json +38 -0
- pointblank/locales/data/LV/address.json +62 -0
- pointblank/locales/data/LV/company.json +60 -0
- pointblank/locales/data/LV/internet.json +19 -0
- pointblank/locales/data/LV/misc.json +7 -0
- pointblank/locales/data/LV/person.json +40 -0
- pointblank/locales/data/LV/text.json +38 -0
- pointblank/locales/data/MT/address.json +61 -0
- pointblank/locales/data/MT/company.json +60 -0
- pointblank/locales/data/MT/internet.json +19 -0
- pointblank/locales/data/MT/misc.json +7 -0
- pointblank/locales/data/MT/person.json +38 -0
- pointblank/locales/data/MT/text.json +38 -0
- pointblank/locales/data/MX/address.json +100 -0
- pointblank/locales/data/MX/company.json +65 -0
- pointblank/locales/data/MX/internet.json +20 -0
- pointblank/locales/data/MX/misc.json +8 -0
- pointblank/locales/data/MX/person.json +18 -0
- pointblank/locales/data/MX/text.json +39 -0
- pointblank/locales/data/NL/address.json +1517 -0
- pointblank/locales/data/NL/company.json +133 -0
- pointblank/locales/data/NL/internet.json +44 -0
- pointblank/locales/data/NL/misc.json +55 -0
- pointblank/locales/data/NL/person.json +365 -0
- pointblank/locales/data/NL/text.json +210 -0
- pointblank/locales/data/NO/address.json +86 -0
- pointblank/locales/data/NO/company.json +66 -0
- pointblank/locales/data/NO/internet.json +20 -0
- pointblank/locales/data/NO/misc.json +8 -0
- pointblank/locales/data/NO/person.json +17 -0
- pointblank/locales/data/NO/text.json +35 -0
- pointblank/locales/data/NZ/address.json +90 -0
- pointblank/locales/data/NZ/company.json +65 -0
- pointblank/locales/data/NZ/internet.json +20 -0
- pointblank/locales/data/NZ/misc.json +8 -0
- pointblank/locales/data/NZ/person.json +17 -0
- pointblank/locales/data/NZ/text.json +39 -0
- pointblank/locales/data/PH/address.json +67 -0
- pointblank/locales/data/PH/company.json +61 -0
- pointblank/locales/data/PH/internet.json +19 -0
- pointblank/locales/data/PH/misc.json +7 -0
- pointblank/locales/data/PH/person.json +40 -0
- pointblank/locales/data/PH/text.json +38 -0
- pointblank/locales/data/PL/address.json +91 -0
- pointblank/locales/data/PL/company.json +65 -0
- pointblank/locales/data/PL/internet.json +20 -0
- pointblank/locales/data/PL/misc.json +8 -0
- pointblank/locales/data/PL/person.json +17 -0
- pointblank/locales/data/PL/text.json +35 -0
- pointblank/locales/data/PT/address.json +90 -0
- pointblank/locales/data/PT/company.json +65 -0
- pointblank/locales/data/PT/internet.json +20 -0
- pointblank/locales/data/PT/misc.json +8 -0
- pointblank/locales/data/PT/person.json +17 -0
- pointblank/locales/data/PT/text.json +35 -0
- pointblank/locales/data/RO/address.json +73 -0
- pointblank/locales/data/RO/company.json +61 -0
- pointblank/locales/data/RO/internet.json +19 -0
- pointblank/locales/data/RO/misc.json +7 -0
- pointblank/locales/data/RO/person.json +40 -0
- pointblank/locales/data/RO/text.json +38 -0
- pointblank/locales/data/RU/address.json +74 -0
- pointblank/locales/data/RU/company.json +60 -0
- pointblank/locales/data/RU/internet.json +19 -0
- pointblank/locales/data/RU/misc.json +7 -0
- pointblank/locales/data/RU/person.json +38 -0
- pointblank/locales/data/RU/text.json +38 -0
- pointblank/locales/data/SE/address.json +247 -0
- pointblank/locales/data/SE/company.json +65 -0
- pointblank/locales/data/SE/internet.json +20 -0
- pointblank/locales/data/SE/misc.json +7 -0
- pointblank/locales/data/SE/person.json +45 -0
- pointblank/locales/data/SE/text.json +43 -0
- pointblank/locales/data/SI/address.json +67 -0
- pointblank/locales/data/SI/company.json +60 -0
- pointblank/locales/data/SI/internet.json +19 -0
- pointblank/locales/data/SI/misc.json +7 -0
- pointblank/locales/data/SI/person.json +38 -0
- pointblank/locales/data/SI/text.json +38 -0
- pointblank/locales/data/SK/address.json +64 -0
- pointblank/locales/data/SK/company.json +60 -0
- pointblank/locales/data/SK/internet.json +19 -0
- pointblank/locales/data/SK/misc.json +7 -0
- pointblank/locales/data/SK/person.json +38 -0
- pointblank/locales/data/SK/text.json +38 -0
- pointblank/locales/data/TR/address.json +105 -0
- pointblank/locales/data/TR/company.json +65 -0
- pointblank/locales/data/TR/internet.json +20 -0
- pointblank/locales/data/TR/misc.json +8 -0
- pointblank/locales/data/TR/person.json +17 -0
- pointblank/locales/data/TR/text.json +35 -0
- pointblank/locales/data/TW/address.json +86 -0
- pointblank/locales/data/TW/company.json +69 -0
- pointblank/locales/data/TW/internet.json +19 -0
- pointblank/locales/data/TW/misc.json +7 -0
- pointblank/locales/data/TW/person.json +42 -0
- pointblank/locales/data/TW/text.json +38 -0
- pointblank/locales/data/US/address.json +996 -0
- pointblank/locales/data/US/company.json +131 -0
- pointblank/locales/data/US/internet.json +22 -0
- pointblank/locales/data/US/misc.json +11 -0
- pointblank/locales/data/US/person.json +1092 -0
- pointblank/locales/data/US/text.json +56 -0
- pointblank/locales/data/_shared/misc.json +42 -0
- pointblank/schema.py +339 -2
- pointblank/validate.py +1263 -11
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/METADATA +45 -1
- pointblank-0.20.0.dist-info/RECORD +366 -0
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/WHEEL +1 -1
- pointblank-0.18.0.dist-info/RECORD +0 -59
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,1476 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Country-based data generation for synthetic test data.
|
|
3
|
+
|
|
4
|
+
This module provides country-specific data generation without external dependencies.
|
|
5
|
+
It supports generating realistic names, addresses, emails, and other data types
|
|
6
|
+
with proper localization based on ISO 3166-1 country codes.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import random
|
|
13
|
+
import unicodedata
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from importlib.resources import files
|
|
16
|
+
from typing import TYPE_CHECKING, Any
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
pass
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"LocaleRegistry",
|
|
23
|
+
"LocaleGenerator",
|
|
24
|
+
"get_generator",
|
|
25
|
+
"COUNTRY_CODE_MAP",
|
|
26
|
+
"COUNTRIES_WITH_FULL_DATA",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ISO 3166-1 country code mappings
|
|
31
|
+
# Maps alpha-2 (2-letter) and alpha-3 (3-letter) codes to internal data directory names
|
|
32
|
+
COUNTRY_CODE_MAP: dict[str, str] = {
|
|
33
|
+
# United States
|
|
34
|
+
"US": "US",
|
|
35
|
+
"USA": "US",
|
|
36
|
+
# United Kingdom
|
|
37
|
+
"GB": "GB",
|
|
38
|
+
"GBR": "GB",
|
|
39
|
+
"UK": "GB", # Common alias
|
|
40
|
+
# Ireland
|
|
41
|
+
"IE": "IE",
|
|
42
|
+
"IRL": "IE",
|
|
43
|
+
# Iceland
|
|
44
|
+
"IS": "IS",
|
|
45
|
+
"ISL": "IS",
|
|
46
|
+
# Australia
|
|
47
|
+
"AU": "AU",
|
|
48
|
+
"AUS": "AU",
|
|
49
|
+
# Argentina
|
|
50
|
+
"AR": "AR",
|
|
51
|
+
"ARG": "AR",
|
|
52
|
+
# Canada
|
|
53
|
+
"CA": "CA",
|
|
54
|
+
"CAN": "CA",
|
|
55
|
+
# Germany
|
|
56
|
+
"DE": "DE",
|
|
57
|
+
"DEU": "DE",
|
|
58
|
+
# Greece
|
|
59
|
+
"GR": "GR",
|
|
60
|
+
"GRC": "GR",
|
|
61
|
+
# Austria
|
|
62
|
+
"AT": "AT",
|
|
63
|
+
"AUT": "AT",
|
|
64
|
+
# Switzerland
|
|
65
|
+
"CH": "CH",
|
|
66
|
+
"CHE": "CH",
|
|
67
|
+
# Chile
|
|
68
|
+
"CL": "CL",
|
|
69
|
+
"CHL": "CL",
|
|
70
|
+
# France
|
|
71
|
+
"FR": "FR",
|
|
72
|
+
"FRA": "FR",
|
|
73
|
+
# Spain
|
|
74
|
+
"ES": "ES",
|
|
75
|
+
"ESP": "ES",
|
|
76
|
+
# Mexico
|
|
77
|
+
"MX": "MX",
|
|
78
|
+
"MEX": "MX",
|
|
79
|
+
# Malta
|
|
80
|
+
"MT": "MT",
|
|
81
|
+
"MLT": "MT",
|
|
82
|
+
# Portugal
|
|
83
|
+
"PT": "PT",
|
|
84
|
+
"PRT": "PT",
|
|
85
|
+
# Brazil
|
|
86
|
+
"BR": "BR",
|
|
87
|
+
"BRA": "BR",
|
|
88
|
+
# India
|
|
89
|
+
"IN": "IN",
|
|
90
|
+
"IND": "IN",
|
|
91
|
+
# Italy
|
|
92
|
+
"IT": "IT",
|
|
93
|
+
"ITA": "IT",
|
|
94
|
+
# Netherlands
|
|
95
|
+
"NL": "NL",
|
|
96
|
+
"NLD": "NL",
|
|
97
|
+
# Belgium
|
|
98
|
+
"BE": "BE",
|
|
99
|
+
"BEL": "BE",
|
|
100
|
+
# Bulgaria
|
|
101
|
+
"BG": "BG",
|
|
102
|
+
"BGR": "BG",
|
|
103
|
+
# Poland
|
|
104
|
+
"PL": "PL",
|
|
105
|
+
"POL": "PL",
|
|
106
|
+
# Romania
|
|
107
|
+
"RO": "RO",
|
|
108
|
+
"ROU": "RO",
|
|
109
|
+
# Russia
|
|
110
|
+
"RU": "RU",
|
|
111
|
+
"RUS": "RU",
|
|
112
|
+
# Slovakia
|
|
113
|
+
"SK": "SK",
|
|
114
|
+
"SVK": "SK",
|
|
115
|
+
# Slovenia
|
|
116
|
+
"SI": "SI",
|
|
117
|
+
"SVN": "SI",
|
|
118
|
+
# Japan
|
|
119
|
+
"JP": "JP",
|
|
120
|
+
"JPN": "JP",
|
|
121
|
+
# South Korea
|
|
122
|
+
"KR": "KR",
|
|
123
|
+
"KOR": "KR",
|
|
124
|
+
# Latvia
|
|
125
|
+
"LV": "LV",
|
|
126
|
+
"LVA": "LV",
|
|
127
|
+
# Lithuania
|
|
128
|
+
"LT": "LT",
|
|
129
|
+
"LTU": "LT",
|
|
130
|
+
# Luxembourg
|
|
131
|
+
"LU": "LU",
|
|
132
|
+
"LUX": "LU",
|
|
133
|
+
# China
|
|
134
|
+
"CN": "CN",
|
|
135
|
+
"CHN": "CN",
|
|
136
|
+
# Colombia
|
|
137
|
+
"CO": "CO",
|
|
138
|
+
"COL": "CO",
|
|
139
|
+
# Cyprus
|
|
140
|
+
"CY": "CY",
|
|
141
|
+
"CYP": "CY",
|
|
142
|
+
# Czech Republic
|
|
143
|
+
"CZ": "CZ",
|
|
144
|
+
"CZE": "CZ",
|
|
145
|
+
# Estonia
|
|
146
|
+
"EE": "EE",
|
|
147
|
+
"EST": "EE",
|
|
148
|
+
# Hong Kong
|
|
149
|
+
"HK": "HK",
|
|
150
|
+
"HKG": "HK",
|
|
151
|
+
# Croatia
|
|
152
|
+
"HR": "HR",
|
|
153
|
+
"HRV": "HR",
|
|
154
|
+
# Hungary
|
|
155
|
+
"HU": "HU",
|
|
156
|
+
"HUN": "HU",
|
|
157
|
+
# Indonesia
|
|
158
|
+
"ID": "ID",
|
|
159
|
+
"IDN": "ID",
|
|
160
|
+
# Taiwan
|
|
161
|
+
"TW": "TW",
|
|
162
|
+
"TWN": "TW",
|
|
163
|
+
# Turkey
|
|
164
|
+
"TR": "TR",
|
|
165
|
+
"TUR": "TR",
|
|
166
|
+
# New Zealand
|
|
167
|
+
"NZ": "NZ",
|
|
168
|
+
"NZL": "NZ",
|
|
169
|
+
# Philippines
|
|
170
|
+
"PH": "PH",
|
|
171
|
+
"PHL": "PH",
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
# Countries that have complete locale data files
|
|
175
|
+
# These are the ISO alpha-2 codes for countries with full address, company,
|
|
176
|
+
# internet, misc, person, and text JSON files in the data directory
|
|
177
|
+
COUNTRIES_WITH_FULL_DATA: list[str] = [
|
|
178
|
+
"US", # United States
|
|
179
|
+
"AR", # Argentina
|
|
180
|
+
"AT", # Austria
|
|
181
|
+
"AU", # Australia
|
|
182
|
+
"BE", # Belgium
|
|
183
|
+
"BG", # Bulgaria
|
|
184
|
+
"BR", # Brazil
|
|
185
|
+
"CA", # Canada
|
|
186
|
+
"CH", # Switzerland
|
|
187
|
+
"CL", # Chile
|
|
188
|
+
"CN", # China
|
|
189
|
+
"CO", # Colombia
|
|
190
|
+
"CY", # Cyprus
|
|
191
|
+
"CZ", # Czech Republic
|
|
192
|
+
"DE", # Germany
|
|
193
|
+
"DK", # Denmark
|
|
194
|
+
"EE", # Estonia
|
|
195
|
+
"ES", # Spain
|
|
196
|
+
"FI", # Finland
|
|
197
|
+
"FR", # France
|
|
198
|
+
"GB", # United Kingdom
|
|
199
|
+
"GR", # Greece
|
|
200
|
+
"HK", # Hong Kong
|
|
201
|
+
"HR", # Croatia
|
|
202
|
+
"HU", # Hungary
|
|
203
|
+
"ID", # Indonesia
|
|
204
|
+
"IE", # Ireland
|
|
205
|
+
"IN", # India
|
|
206
|
+
"IS", # Iceland
|
|
207
|
+
"IT", # Italy
|
|
208
|
+
"JP", # Japan
|
|
209
|
+
"KR", # South Korea
|
|
210
|
+
"LV", # Latvia
|
|
211
|
+
"LT", # Lithuania
|
|
212
|
+
"LU", # Luxembourg
|
|
213
|
+
"MT", # Malta
|
|
214
|
+
"MX", # Mexico
|
|
215
|
+
"NL", # Netherlands
|
|
216
|
+
"NO", # Norway
|
|
217
|
+
"NZ", # New Zealand
|
|
218
|
+
"PL", # Poland
|
|
219
|
+
"PH", # Philippines
|
|
220
|
+
"PT", # Portugal
|
|
221
|
+
"RO", # Romania
|
|
222
|
+
"RU", # Russia
|
|
223
|
+
"SE", # Sweden
|
|
224
|
+
"SK", # Slovakia
|
|
225
|
+
"SI", # Slovenia
|
|
226
|
+
"TR", # Turkey
|
|
227
|
+
"TW", # Taiwan
|
|
228
|
+
]
|
|
229
|
+
|
|
230
|
+
# Fallback chains for countries (when a country's data is incomplete)
|
|
231
|
+
COUNTRY_FALLBACKS: dict[str, list[str]] = {
|
|
232
|
+
# English-speaking countries fall back to US
|
|
233
|
+
"GB": ["GB", "US"],
|
|
234
|
+
"IE": ["IE", "GB", "US"],
|
|
235
|
+
"AU": ["AU", "GB", "US"],
|
|
236
|
+
"CA": ["CA", "US"],
|
|
237
|
+
# German-speaking countries
|
|
238
|
+
"DE": ["DE", "US"],
|
|
239
|
+
"AT": ["AT", "DE", "US"],
|
|
240
|
+
"CH": ["CH", "DE", "US"],
|
|
241
|
+
# French-speaking
|
|
242
|
+
"FR": ["FR", "US"],
|
|
243
|
+
# Belgian (Dutch/French bilingual)
|
|
244
|
+
"BE": ["BE", "NL", "FR", "US"],
|
|
245
|
+
# Scandinavian
|
|
246
|
+
"DK": ["DK", "DE", "US"],
|
|
247
|
+
"NO": ["NO", "DK", "DE", "US"],
|
|
248
|
+
"SE": ["SE", "DK", "DE", "US"],
|
|
249
|
+
"FI": ["FI", "SE", "US"],
|
|
250
|
+
# Spanish-speaking
|
|
251
|
+
"ES": ["ES", "US"],
|
|
252
|
+
"MX": ["MX", "ES", "US"],
|
|
253
|
+
# Portuguese-speaking
|
|
254
|
+
"PT": ["PT", "US"],
|
|
255
|
+
"BR": ["BR", "PT", "US"],
|
|
256
|
+
# Other European
|
|
257
|
+
"IT": ["IT", "US"],
|
|
258
|
+
"NL": ["NL", "US"],
|
|
259
|
+
"PL": ["PL", "US"],
|
|
260
|
+
"RU": ["RU", "US"],
|
|
261
|
+
# Asian countries
|
|
262
|
+
"JP": ["JP", "US"],
|
|
263
|
+
"KR": ["KR", "US"],
|
|
264
|
+
"CN": ["CN", "US"],
|
|
265
|
+
"TW": ["TW", "CN", "US"],
|
|
266
|
+
# Turkey
|
|
267
|
+
"TR": ["TR", "US"],
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
@dataclass
|
|
272
|
+
class LocaleData:
|
|
273
|
+
"""Container for all locale-specific data."""
|
|
274
|
+
|
|
275
|
+
locale: str
|
|
276
|
+
person: dict[str, Any] = field(default_factory=dict)
|
|
277
|
+
address: dict[str, Any] = field(default_factory=dict)
|
|
278
|
+
company: dict[str, Any] = field(default_factory=dict)
|
|
279
|
+
internet: dict[str, Any] = field(default_factory=dict)
|
|
280
|
+
text: dict[str, Any] = field(default_factory=dict)
|
|
281
|
+
misc: dict[str, Any] = field(default_factory=dict)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
# Transliteration map for special characters (umlauts add 'e', others simplified)
|
|
285
|
+
_TRANSLITERATION_MAP: dict[str, str] = {
|
|
286
|
+
# German umlauts -> add 'e'
|
|
287
|
+
"ä": "ae",
|
|
288
|
+
"ö": "oe",
|
|
289
|
+
"ü": "ue",
|
|
290
|
+
"Ä": "Ae",
|
|
291
|
+
"Ö": "Oe",
|
|
292
|
+
"Ü": "Ue",
|
|
293
|
+
"ß": "ss",
|
|
294
|
+
# Scandinavian
|
|
295
|
+
"å": "aa",
|
|
296
|
+
"Å": "Aa",
|
|
297
|
+
"ø": "oe",
|
|
298
|
+
"Ø": "Oe",
|
|
299
|
+
"æ": "ae",
|
|
300
|
+
"Æ": "Ae",
|
|
301
|
+
# French/Spanish/Portuguese/Italian accents
|
|
302
|
+
"à": "a",
|
|
303
|
+
"á": "a",
|
|
304
|
+
"â": "a",
|
|
305
|
+
"ã": "a",
|
|
306
|
+
"À": "A",
|
|
307
|
+
"Á": "A",
|
|
308
|
+
"Â": "A",
|
|
309
|
+
"Ã": "A",
|
|
310
|
+
"è": "e",
|
|
311
|
+
"é": "e",
|
|
312
|
+
"ê": "e",
|
|
313
|
+
"ë": "e",
|
|
314
|
+
"È": "E",
|
|
315
|
+
"É": "E",
|
|
316
|
+
"Ê": "E",
|
|
317
|
+
"Ë": "E",
|
|
318
|
+
"ì": "i",
|
|
319
|
+
"í": "i",
|
|
320
|
+
"î": "i",
|
|
321
|
+
"ï": "i",
|
|
322
|
+
"Ì": "I",
|
|
323
|
+
"Í": "I",
|
|
324
|
+
"Î": "I",
|
|
325
|
+
"Ï": "I",
|
|
326
|
+
"ò": "o",
|
|
327
|
+
"ó": "o",
|
|
328
|
+
"ô": "o",
|
|
329
|
+
"õ": "o",
|
|
330
|
+
"Ò": "O",
|
|
331
|
+
"Ó": "O",
|
|
332
|
+
"Ô": "O",
|
|
333
|
+
"Õ": "O",
|
|
334
|
+
"ù": "u",
|
|
335
|
+
"ú": "u",
|
|
336
|
+
"û": "u",
|
|
337
|
+
"Ù": "U",
|
|
338
|
+
"Ú": "U",
|
|
339
|
+
"Û": "U",
|
|
340
|
+
"ñ": "n",
|
|
341
|
+
"Ñ": "N",
|
|
342
|
+
"ç": "c",
|
|
343
|
+
"Ç": "C",
|
|
344
|
+
"ý": "y",
|
|
345
|
+
"ÿ": "y",
|
|
346
|
+
"Ý": "Y",
|
|
347
|
+
# Eastern European
|
|
348
|
+
"ł": "l",
|
|
349
|
+
"Ł": "L",
|
|
350
|
+
"ń": "n",
|
|
351
|
+
"Ń": "N",
|
|
352
|
+
"ś": "s",
|
|
353
|
+
"Ś": "S",
|
|
354
|
+
"ź": "z",
|
|
355
|
+
"Ź": "Z",
|
|
356
|
+
"ż": "z",
|
|
357
|
+
"Ż": "Z",
|
|
358
|
+
"ć": "c",
|
|
359
|
+
"Ć": "C",
|
|
360
|
+
"ě": "e",
|
|
361
|
+
"Ě": "E",
|
|
362
|
+
"š": "s",
|
|
363
|
+
"Š": "S",
|
|
364
|
+
"č": "c",
|
|
365
|
+
"Č": "C",
|
|
366
|
+
"ř": "r",
|
|
367
|
+
"Ř": "R",
|
|
368
|
+
"ž": "z",
|
|
369
|
+
"Ž": "Z",
|
|
370
|
+
"ů": "u",
|
|
371
|
+
"Ů": "U",
|
|
372
|
+
"ď": "d",
|
|
373
|
+
"Ď": "D",
|
|
374
|
+
"ť": "t",
|
|
375
|
+
"Ť": "T",
|
|
376
|
+
"ň": "n",
|
|
377
|
+
"Ň": "N",
|
|
378
|
+
# Other
|
|
379
|
+
"đ": "d",
|
|
380
|
+
"Đ": "D",
|
|
381
|
+
"ğ": "g",
|
|
382
|
+
"Ğ": "G",
|
|
383
|
+
"ı": "i",
|
|
384
|
+
"İ": "I",
|
|
385
|
+
"ş": "s",
|
|
386
|
+
"Ş": "S",
|
|
387
|
+
"ț": "t",
|
|
388
|
+
"Ț": "T",
|
|
389
|
+
"ă": "a",
|
|
390
|
+
"Ă": "A",
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def _transliterate_to_ascii(text: str) -> str:
|
|
395
|
+
"""
|
|
396
|
+
Transliterate text to ASCII-safe characters for email addresses and usernames.
|
|
397
|
+
|
|
398
|
+
Handles German umlauts specially (ü -> ue, ö -> oe, ä -> ae) and converts
|
|
399
|
+
other accented characters to their base ASCII equivalents.
|
|
400
|
+
|
|
401
|
+
Parameters
|
|
402
|
+
----------
|
|
403
|
+
text
|
|
404
|
+
The text to transliterate.
|
|
405
|
+
|
|
406
|
+
Returns
|
|
407
|
+
-------
|
|
408
|
+
str
|
|
409
|
+
ASCII-safe version of the text.
|
|
410
|
+
"""
|
|
411
|
+
# First apply our custom transliteration map
|
|
412
|
+
result = []
|
|
413
|
+
for char in text:
|
|
414
|
+
if char in _TRANSLITERATION_MAP:
|
|
415
|
+
result.append(_TRANSLITERATION_MAP[char])
|
|
416
|
+
else:
|
|
417
|
+
result.append(char)
|
|
418
|
+
text = "".join(result)
|
|
419
|
+
|
|
420
|
+
# Then use unicodedata to handle any remaining non-ASCII characters
|
|
421
|
+
# NFD decomposes characters (e.g., é -> e + combining accent)
|
|
422
|
+
# We then filter to keep only ASCII characters
|
|
423
|
+
normalized = unicodedata.normalize("NFD", text)
|
|
424
|
+
ascii_text = "".join(c for c in normalized if unicodedata.category(c) != "Mn")
|
|
425
|
+
|
|
426
|
+
return ascii_text
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
def _normalize_country(country: str) -> str:
|
|
430
|
+
"""
|
|
431
|
+
Normalize a country code to the standard 2-letter ISO 3166-1 alpha-2 format.
|
|
432
|
+
|
|
433
|
+
Parameters
|
|
434
|
+
----------
|
|
435
|
+
country
|
|
436
|
+
Country code in alpha-2 (US), alpha-3 (USA), or legacy locale format (en_US).
|
|
437
|
+
|
|
438
|
+
Returns
|
|
439
|
+
-------
|
|
440
|
+
str
|
|
441
|
+
The normalized 2-letter country code.
|
|
442
|
+
|
|
443
|
+
Raises
|
|
444
|
+
------
|
|
445
|
+
ValueError
|
|
446
|
+
If the country code is not recognized.
|
|
447
|
+
"""
|
|
448
|
+
# Uppercase and strip whitespace
|
|
449
|
+
code = country.strip().upper()
|
|
450
|
+
|
|
451
|
+
# Handle legacy locale format (en_US, de-DE, etc.)
|
|
452
|
+
if "_" in code or "-" in code:
|
|
453
|
+
# Extract country part from locale code
|
|
454
|
+
parts = code.replace("-", "_").split("_")
|
|
455
|
+
if len(parts) == 2:
|
|
456
|
+
code = parts[1] # Take the country part (US from en_US)
|
|
457
|
+
|
|
458
|
+
# Look up in the country code map
|
|
459
|
+
if code in COUNTRY_CODE_MAP:
|
|
460
|
+
return COUNTRY_CODE_MAP[code]
|
|
461
|
+
|
|
462
|
+
# If already a valid 2-letter code in fallbacks, use it
|
|
463
|
+
if code in COUNTRY_FALLBACKS:
|
|
464
|
+
return code
|
|
465
|
+
|
|
466
|
+
# Default to US with a warning (or raise an error)
|
|
467
|
+
raise ValueError(
|
|
468
|
+
f"Unknown country code: {country!r}. "
|
|
469
|
+
f"Supported codes: {', '.join(sorted(set(COUNTRY_CODE_MAP.keys())))}"
|
|
470
|
+
)
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
class LocaleRegistry:
|
|
474
|
+
"""Registry for country data with fallback support."""
|
|
475
|
+
|
|
476
|
+
_instance: LocaleRegistry | None = None
|
|
477
|
+
_cache: dict[str, LocaleData]
|
|
478
|
+
|
|
479
|
+
def __new__(cls) -> LocaleRegistry:
|
|
480
|
+
if cls._instance is None:
|
|
481
|
+
cls._instance = super().__new__(cls)
|
|
482
|
+
cls._instance._cache = {}
|
|
483
|
+
return cls._instance
|
|
484
|
+
|
|
485
|
+
def get(self, country: str) -> LocaleData:
|
|
486
|
+
"""
|
|
487
|
+
Get country data with fallback chain.
|
|
488
|
+
|
|
489
|
+
Parameters
|
|
490
|
+
----------
|
|
491
|
+
country
|
|
492
|
+
Country code (e.g., "US", "DE", "USA", "DEU").
|
|
493
|
+
Also accepts legacy locale codes like "en_US" for backwards compatibility.
|
|
494
|
+
|
|
495
|
+
Returns
|
|
496
|
+
-------
|
|
497
|
+
LocaleData
|
|
498
|
+
The country data, falling back to parent countries if needed.
|
|
499
|
+
"""
|
|
500
|
+
# Normalize to 2-letter country code
|
|
501
|
+
country_code = _normalize_country(country)
|
|
502
|
+
|
|
503
|
+
if country_code in self._cache:
|
|
504
|
+
return self._cache[country_code]
|
|
505
|
+
|
|
506
|
+
# Get fallback chain
|
|
507
|
+
fallback_chain = COUNTRY_FALLBACKS.get(country_code, [country_code, "US"])
|
|
508
|
+
if country_code not in fallback_chain:
|
|
509
|
+
fallback_chain = [country_code] + fallback_chain
|
|
510
|
+
|
|
511
|
+
# Load data with fallback
|
|
512
|
+
locale_data = self._load_with_fallback(fallback_chain)
|
|
513
|
+
self._cache[country_code] = locale_data
|
|
514
|
+
return locale_data
|
|
515
|
+
|
|
516
|
+
def _load_with_fallback(self, fallback_chain: list[str]) -> LocaleData:
|
|
517
|
+
"""Load country data, falling back through the chain."""
|
|
518
|
+
merged_data = LocaleData(locale=fallback_chain[0])
|
|
519
|
+
|
|
520
|
+
# Load shared/universal data first (e.g., file extensions, MIME types)
|
|
521
|
+
shared_data = self._load_country_files("_shared")
|
|
522
|
+
if shared_data:
|
|
523
|
+
self._merge_data(merged_data, shared_data)
|
|
524
|
+
|
|
525
|
+
# Load in reverse order so more specific countries override
|
|
526
|
+
for country in reversed(fallback_chain):
|
|
527
|
+
data = self._load_country_files(country)
|
|
528
|
+
if data:
|
|
529
|
+
self._merge_data(merged_data, data)
|
|
530
|
+
|
|
531
|
+
return merged_data
|
|
532
|
+
|
|
533
|
+
def _load_country_files(self, country: str) -> dict[str, Any] | None:
|
|
534
|
+
"""Load all data files for a country."""
|
|
535
|
+
try:
|
|
536
|
+
data_path = files("pointblank.locales.data") / country
|
|
537
|
+
if not data_path.is_dir():
|
|
538
|
+
return None
|
|
539
|
+
|
|
540
|
+
result: dict[str, Any] = {}
|
|
541
|
+
for category in ["person", "address", "company", "internet", "text", "misc"]:
|
|
542
|
+
file_path = data_path / f"{category}.json"
|
|
543
|
+
try:
|
|
544
|
+
content = file_path.read_text(encoding="utf-8")
|
|
545
|
+
result[category] = json.loads(content)
|
|
546
|
+
except (FileNotFoundError, json.JSONDecodeError):
|
|
547
|
+
pass
|
|
548
|
+
|
|
549
|
+
return result if result else None
|
|
550
|
+
except (TypeError, FileNotFoundError):
|
|
551
|
+
return None
|
|
552
|
+
|
|
553
|
+
def _merge_data(self, target: LocaleData, source: dict[str, Any]) -> None:
|
|
554
|
+
"""Merge source data into target LocaleData."""
|
|
555
|
+
for category, data in source.items():
|
|
556
|
+
if hasattr(target, category):
|
|
557
|
+
existing = getattr(target, category)
|
|
558
|
+
if isinstance(existing, dict) and isinstance(data, dict):
|
|
559
|
+
existing.update(data)
|
|
560
|
+
else:
|
|
561
|
+
setattr(target, category, data)
|
|
562
|
+
|
|
563
|
+
def clear_cache(self) -> None:
|
|
564
|
+
"""Clear the country data cache."""
|
|
565
|
+
self._cache.clear()
|
|
566
|
+
|
|
567
|
+
|
|
568
|
+
class LocaleGenerator:
|
|
569
|
+
"""
|
|
570
|
+
Generator for country-specific test data.
|
|
571
|
+
|
|
572
|
+
This class provides methods to generate realistic data like names, emails,
|
|
573
|
+
addresses, etc. based on country-specific patterns and data.
|
|
574
|
+
"""
|
|
575
|
+
|
|
576
|
+
def __init__(self, country: str = "US", seed: int | None = None):
|
|
577
|
+
"""
|
|
578
|
+
Initialize the country data generator.
|
|
579
|
+
|
|
580
|
+
Parameters
|
|
581
|
+
----------
|
|
582
|
+
country
|
|
583
|
+
Country code (e.g., "US", "DE", "USA", "DEU").
|
|
584
|
+
Also accepts legacy locale codes like "en_US" for backwards compatibility.
|
|
585
|
+
seed
|
|
586
|
+
Random seed for reproducibility.
|
|
587
|
+
"""
|
|
588
|
+
self.country_code = _normalize_country(country)
|
|
589
|
+
self.rng = random.Random(seed)
|
|
590
|
+
self._registry = LocaleRegistry()
|
|
591
|
+
self._data = self._registry.get(self.country_code)
|
|
592
|
+
|
|
593
|
+
def seed(self, seed: int) -> None:
|
|
594
|
+
"""Set the random seed."""
|
|
595
|
+
self.rng.seed(seed)
|
|
596
|
+
|
|
597
|
+
# =========================================================================
|
|
598
|
+
# Person
|
|
599
|
+
# =========================================================================
|
|
600
|
+
|
|
601
|
+
_current_person: dict[str, str] | None = None
|
|
602
|
+
_row_persons: list[dict[str, str]] | None = None
|
|
603
|
+
|
|
604
|
+
def _get_person(self, gender: str | None = None) -> dict[str, str]:
|
|
605
|
+
"""Get a coherent person (first_name, last_name, gender) from the data."""
|
|
606
|
+
# If no gender specified, randomly select one (weighted toward male/female)
|
|
607
|
+
if gender is None:
|
|
608
|
+
gender = self.rng.choice(["male", "female"])
|
|
609
|
+
|
|
610
|
+
return {
|
|
611
|
+
"first_name": self._generate_first_name(gender),
|
|
612
|
+
"last_name": self._generate_last_name(),
|
|
613
|
+
"gender": gender,
|
|
614
|
+
}
|
|
615
|
+
|
|
616
|
+
def _generate_first_name(self, gender: str | None = None) -> str:
|
|
617
|
+
"""Generate a random first name (internal, no caching)."""
|
|
618
|
+
names = self._data.person.get("first_names", {})
|
|
619
|
+
|
|
620
|
+
if gender and gender in names:
|
|
621
|
+
name_list = names[gender]
|
|
622
|
+
elif "neutral" in names:
|
|
623
|
+
# Combine all available names
|
|
624
|
+
all_names = []
|
|
625
|
+
for category in ["male", "female", "neutral"]:
|
|
626
|
+
all_names.extend(names.get(category, []))
|
|
627
|
+
name_list = all_names if all_names else ["Alex"]
|
|
628
|
+
else:
|
|
629
|
+
# Flatten all categories
|
|
630
|
+
all_names = []
|
|
631
|
+
for category_names in names.values():
|
|
632
|
+
if isinstance(category_names, list):
|
|
633
|
+
all_names.extend(category_names)
|
|
634
|
+
name_list = all_names if all_names else ["Alex"]
|
|
635
|
+
|
|
636
|
+
return self.rng.choice(name_list)
|
|
637
|
+
|
|
638
|
+
def _generate_last_name(self) -> str:
|
|
639
|
+
"""Generate a random last name (internal, no caching)."""
|
|
640
|
+
names = self._data.person.get("last_names", ["Smith"])
|
|
641
|
+
return self.rng.choice(names)
|
|
642
|
+
|
|
643
|
+
def init_row_persons(self, n_rows: int) -> None:
|
|
644
|
+
"""
|
|
645
|
+
Pre-generate person data for multiple rows to ensure coherence across columns.
|
|
646
|
+
|
|
647
|
+
This should be called before generating a dataset with person-related columns.
|
|
648
|
+
When active, first_name(), last_name(), name(), email() will use the person
|
|
649
|
+
for the current row (set via set_row()).
|
|
650
|
+
|
|
651
|
+
Parameters
|
|
652
|
+
----------
|
|
653
|
+
n_rows
|
|
654
|
+
Number of rows to pre-generate persons for.
|
|
655
|
+
"""
|
|
656
|
+
self._row_persons = [self._get_person() for _ in range(n_rows)]
|
|
657
|
+
|
|
658
|
+
def clear_row_persons(self) -> None:
|
|
659
|
+
"""Clear all pre-generated row persons."""
|
|
660
|
+
self._row_persons = None
|
|
661
|
+
|
|
662
|
+
def new_person(self, gender: str | None = None) -> dict[str, str]:
|
|
663
|
+
"""
|
|
664
|
+
Select a new random person and cache it for coherent generation.
|
|
665
|
+
|
|
666
|
+
Call this before generating related person components (first_name, last_name, email)
|
|
667
|
+
to ensure they all refer to the same person.
|
|
668
|
+
|
|
669
|
+
Returns
|
|
670
|
+
-------
|
|
671
|
+
dict
|
|
672
|
+
The selected person with first_name and last_name.
|
|
673
|
+
"""
|
|
674
|
+
self._current_person = self._get_person(gender)
|
|
675
|
+
return self._current_person
|
|
676
|
+
|
|
677
|
+
def _get_current_person(self) -> dict[str, str]:
|
|
678
|
+
"""Get the current cached person, or select a new one."""
|
|
679
|
+
# If row persons are active, use those
|
|
680
|
+
if self._row_persons is not None and self._current_row is not None:
|
|
681
|
+
return self._row_persons[self._current_row]
|
|
682
|
+
# Otherwise use single cached person
|
|
683
|
+
if self._current_person is None:
|
|
684
|
+
self._current_person = self._get_person()
|
|
685
|
+
return self._current_person
|
|
686
|
+
|
|
687
|
+
def clear_person(self) -> None:
|
|
688
|
+
"""Clear the cached person so the next call will select a new one."""
|
|
689
|
+
self._current_person = None
|
|
690
|
+
|
|
691
|
+
def first_name(self, gender: str | None = None) -> str:
|
|
692
|
+
"""Generate a random first name (coherent with current person context)."""
|
|
693
|
+
person = self._get_current_person()
|
|
694
|
+
return person.get("first_name", "Alex")
|
|
695
|
+
|
|
696
|
+
def last_name(self) -> str:
|
|
697
|
+
"""Generate a random last name (coherent with current person context)."""
|
|
698
|
+
person = self._get_current_person()
|
|
699
|
+
return person.get("last_name", "Smith")
|
|
700
|
+
|
|
701
|
+
def name(self, gender: str | None = None) -> str:
|
|
702
|
+
"""Generate a simple full name (first + last, coherent with current person context).
|
|
703
|
+
|
|
704
|
+
For names with prefixes (Mr., Ms., Dr., etc.) and occasional suffixes (Jr., III),
|
|
705
|
+
use name_full() instead.
|
|
706
|
+
"""
|
|
707
|
+
person = self._get_current_person()
|
|
708
|
+
first = person.get("first_name", "Alex")
|
|
709
|
+
last = person.get("last_name", "Smith")
|
|
710
|
+
|
|
711
|
+
# Check if locale uses "last first" order (e.g., Japanese)
|
|
712
|
+
formats = self._data.person.get("name_formats", ["{first_name} {last_name}"])
|
|
713
|
+
# Use the simplest format (usually first one, which is typically "first last" or "last first")
|
|
714
|
+
if formats and "{last_name} {first_name}" in formats[0]:
|
|
715
|
+
return f"{last} {first}"
|
|
716
|
+
return f"{first} {last}"
|
|
717
|
+
|
|
718
|
+
def name_full(self, gender: str | None = None) -> str:
|
|
719
|
+
"""Generate a full name with optional prefix and rare suffix.
|
|
720
|
+
|
|
721
|
+
Includes honorific prefixes with realistic frequencies:
|
|
722
|
+
- Common honorifics (Mr., Ms., Mrs., etc.): ~95% of names
|
|
723
|
+
- Professional titles (Dr., Prof., Rev., etc.): ~5% of names
|
|
724
|
+
|
|
725
|
+
Suffixes (Jr., II, III) appear very rarely (~1 in 2000).
|
|
726
|
+
"""
|
|
727
|
+
person = self._get_current_person()
|
|
728
|
+
first = person.get("first_name", "Alex")
|
|
729
|
+
last = person.get("last_name", "Smith")
|
|
730
|
+
|
|
731
|
+
# Get gender for prefix selection (from person context or parameter)
|
|
732
|
+
person_gender = person.get("gender", "neutral")
|
|
733
|
+
if gender:
|
|
734
|
+
person_gender = gender
|
|
735
|
+
|
|
736
|
+
# Professional titles are rare (~2-3% of population for Dr., ~0.5% for Prof.)
|
|
737
|
+
# These should appear infrequently
|
|
738
|
+
professional_titles = {
|
|
739
|
+
"Dr.",
|
|
740
|
+
"Prof.",
|
|
741
|
+
"Professor",
|
|
742
|
+
"Rev.",
|
|
743
|
+
"Pr.",
|
|
744
|
+
"Prof. Dr.",
|
|
745
|
+
"Rabbi",
|
|
746
|
+
"Father",
|
|
747
|
+
"Sister",
|
|
748
|
+
"Pastor",
|
|
749
|
+
"Elder",
|
|
750
|
+
}
|
|
751
|
+
|
|
752
|
+
# Get prefix based on gender from locale data
|
|
753
|
+
prefixes = self._data.person.get("prefixes", {})
|
|
754
|
+
prefix_list = prefixes.get(person_gender, prefixes.get("neutral", []))
|
|
755
|
+
|
|
756
|
+
# Separate common honorifics and professional titles from locale data
|
|
757
|
+
locale_common = [p for p in prefix_list if p not in professional_titles]
|
|
758
|
+
locale_professional = [p for p in prefix_list if p in professional_titles]
|
|
759
|
+
|
|
760
|
+
# Select prefix with realistic probabilities
|
|
761
|
+
# ~95% common honorific, ~5% professional title
|
|
762
|
+
if locale_professional and self.rng.random() < 0.05:
|
|
763
|
+
prefix = self.rng.choice(locale_professional)
|
|
764
|
+
elif locale_common:
|
|
765
|
+
prefix = self.rng.choice(locale_common)
|
|
766
|
+
else:
|
|
767
|
+
# Fallback defaults if no common prefixes in locale
|
|
768
|
+
fallback = {"male": "Mr.", "female": "Ms.", "neutral": "Mr."}
|
|
769
|
+
prefix = fallback.get(person_gender, "")
|
|
770
|
+
|
|
771
|
+
# Get suffix - very rare (approximately 1/2000 chance)
|
|
772
|
+
suffix = ""
|
|
773
|
+
if self.rng.random() < 0.0005: # 1 in 2000
|
|
774
|
+
suffixes = self._data.person.get("suffixes", [])
|
|
775
|
+
# Filter out empty strings
|
|
776
|
+
suffixes = [s for s in suffixes if s]
|
|
777
|
+
if suffixes:
|
|
778
|
+
suffix = self.rng.choice(suffixes)
|
|
779
|
+
|
|
780
|
+
# Check if locale uses "last first" order (e.g., Japanese)
|
|
781
|
+
formats = self._data.person.get("name_formats", ["{first_name} {last_name}"])
|
|
782
|
+
if formats and "{last_name} {first_name}" in formats[0]:
|
|
783
|
+
# For "last first" cultures, prefix typically comes before everything
|
|
784
|
+
parts = [prefix, last, first] if prefix else [last, first]
|
|
785
|
+
else:
|
|
786
|
+
parts = [prefix, first, last] if prefix else [first, last]
|
|
787
|
+
|
|
788
|
+
if suffix:
|
|
789
|
+
parts.append(suffix)
|
|
790
|
+
|
|
791
|
+
return " ".join(parts)
|
|
792
|
+
|
|
793
|
+
# =========================================================================
|
|
794
|
+
# Address
|
|
795
|
+
# =========================================================================
|
|
796
|
+
|
|
797
|
+
_current_location: dict[str, str] | None = None
|
|
798
|
+
_row_locations: list[dict[str, str]] | None = None
|
|
799
|
+
_current_row: int | None = None
|
|
800
|
+
|
|
801
|
+
def _get_location(self) -> dict[str, str]:
|
|
802
|
+
"""Get a coherent location (city, state, postcode_prefix) from the data."""
|
|
803
|
+
locations = self._data.address.get("locations", [])
|
|
804
|
+
if locations:
|
|
805
|
+
return self.rng.choice(locations)
|
|
806
|
+
# Fallback for old-style data
|
|
807
|
+
return {
|
|
808
|
+
"city": "Springfield",
|
|
809
|
+
"state": "State",
|
|
810
|
+
"state_abbr": "ST",
|
|
811
|
+
"postcode_prefix": "000",
|
|
812
|
+
}
|
|
813
|
+
|
|
814
|
+
def init_row_locations(self, n_rows: int) -> None:
|
|
815
|
+
"""
|
|
816
|
+
Pre-generate locations for multiple rows to ensure coherence across columns.
|
|
817
|
+
|
|
818
|
+
This should be called before generating a dataset with address-related columns.
|
|
819
|
+
When active, city(), state(), postcode() etc. will use the location for the
|
|
820
|
+
current row (set via set_row()).
|
|
821
|
+
|
|
822
|
+
Parameters
|
|
823
|
+
----------
|
|
824
|
+
n_rows
|
|
825
|
+
Number of rows to pre-generate locations for.
|
|
826
|
+
"""
|
|
827
|
+
self._row_locations = [self._get_location() for _ in range(n_rows)]
|
|
828
|
+
self._current_row = None
|
|
829
|
+
|
|
830
|
+
def set_row(self, row_index: int) -> None:
|
|
831
|
+
"""
|
|
832
|
+
Set the current row index for location-based generation.
|
|
833
|
+
|
|
834
|
+
When row locations are initialized, this sets which row's location to use.
|
|
835
|
+
|
|
836
|
+
Parameters
|
|
837
|
+
----------
|
|
838
|
+
row_index
|
|
839
|
+
The row index (0-based).
|
|
840
|
+
"""
|
|
841
|
+
self._current_row = row_index
|
|
842
|
+
|
|
843
|
+
def clear_row_locations(self) -> None:
|
|
844
|
+
"""Clear all pre-generated row locations."""
|
|
845
|
+
self._row_locations = None
|
|
846
|
+
self._current_row = None
|
|
847
|
+
|
|
848
|
+
def new_location(self) -> dict[str, str]:
|
|
849
|
+
"""
|
|
850
|
+
Select a new random location and cache it for coherent address generation.
|
|
851
|
+
|
|
852
|
+
Call this before generating related address components (city, state, postcode)
|
|
853
|
+
to ensure they all refer to the same location.
|
|
854
|
+
|
|
855
|
+
Returns
|
|
856
|
+
-------
|
|
857
|
+
dict
|
|
858
|
+
The selected location with city, state, state_abbr, and postcode_prefix.
|
|
859
|
+
"""
|
|
860
|
+
self._current_location = self._get_location()
|
|
861
|
+
return self._current_location
|
|
862
|
+
|
|
863
|
+
def _get_current_location(self) -> dict[str, str]:
|
|
864
|
+
"""Get the current cached location, or select a new one."""
|
|
865
|
+
# If row locations are active, use those
|
|
866
|
+
if self._row_locations is not None and self._current_row is not None:
|
|
867
|
+
return self._row_locations[self._current_row]
|
|
868
|
+
# Otherwise use single cached location
|
|
869
|
+
if self._current_location is None:
|
|
870
|
+
self._current_location = self._get_location()
|
|
871
|
+
return self._current_location
|
|
872
|
+
|
|
873
|
+
def clear_location(self) -> None:
|
|
874
|
+
"""Clear the cached location so the next call will select a new one."""
|
|
875
|
+
self._current_location = None
|
|
876
|
+
|
|
877
|
+
def city(self) -> str:
|
|
878
|
+
"""Generate a random city name (coherent with current location context).
|
|
879
|
+
|
|
880
|
+
Returns the exonym (English name) if available, otherwise the native city name.
|
|
881
|
+
This allows addresses to use native names while city presets use international names.
|
|
882
|
+
"""
|
|
883
|
+
location = self._get_current_location()
|
|
884
|
+
# Prefer exonym (English name) for standalone city preset
|
|
885
|
+
return location.get("exonym", location.get("city", "Springfield"))
|
|
886
|
+
|
|
887
|
+
def _city_native(self) -> str:
|
|
888
|
+
"""Get the native city name (used internally for addresses).
|
|
889
|
+
|
|
890
|
+
Always returns the native name, ignoring any exonym.
|
|
891
|
+
"""
|
|
892
|
+
location = self._get_current_location()
|
|
893
|
+
return location.get("city", "Springfield")
|
|
894
|
+
|
|
895
|
+
def state(self, abbr: bool = False) -> str:
|
|
896
|
+
"""Generate a random state/province name (coherent with current location context)."""
|
|
897
|
+
location = self._get_current_location()
|
|
898
|
+
if abbr:
|
|
899
|
+
return location.get("state_abbr", "ST")
|
|
900
|
+
return location.get("state", "State")
|
|
901
|
+
|
|
902
|
+
def country(self) -> str:
|
|
903
|
+
"""Generate the country name for this locale."""
|
|
904
|
+
return self._data.address.get("country", "United States")
|
|
905
|
+
|
|
906
|
+
def postcode(self) -> str:
|
|
907
|
+
"""Generate a random postal code (coherent with current location context)."""
|
|
908
|
+
location = self._get_current_location()
|
|
909
|
+
prefix = location.get("postcode_prefix", "")
|
|
910
|
+
postcode_format = self._data.address.get("postcode_format", "")
|
|
911
|
+
|
|
912
|
+
# If format uses pattern characters (? for letter, # for digit), generate accordingly
|
|
913
|
+
if "?" in postcode_format or "#" in postcode_format:
|
|
914
|
+
# Generate the full postcode from the format pattern
|
|
915
|
+
# Replace ? with random uppercase letter, # with random digit
|
|
916
|
+
result = []
|
|
917
|
+
prefix_idx = 0
|
|
918
|
+
for char in postcode_format:
|
|
919
|
+
if char == "?":
|
|
920
|
+
# Use prefix character if available, otherwise random letter
|
|
921
|
+
if prefix_idx < len(prefix) and prefix[prefix_idx].isalpha():
|
|
922
|
+
result.append(prefix[prefix_idx])
|
|
923
|
+
prefix_idx += 1
|
|
924
|
+
else:
|
|
925
|
+
result.append(self.rng.choice("ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
|
|
926
|
+
elif char == "#":
|
|
927
|
+
# Use prefix character if available, otherwise random digit
|
|
928
|
+
if prefix_idx < len(prefix) and prefix[prefix_idx].isdigit():
|
|
929
|
+
result.append(prefix[prefix_idx])
|
|
930
|
+
prefix_idx += 1
|
|
931
|
+
else:
|
|
932
|
+
result.append(str(self.rng.randint(0, 9)))
|
|
933
|
+
else:
|
|
934
|
+
# Keep literal characters (spaces, dashes, etc.)
|
|
935
|
+
result.append(char)
|
|
936
|
+
return "".join(result)
|
|
937
|
+
|
|
938
|
+
# Default: append digits to complete the postal code
|
|
939
|
+
remaining = 5 - len(prefix)
|
|
940
|
+
suffix = "".join(str(self.rng.randint(0, 9)) for _ in range(remaining))
|
|
941
|
+
return prefix + suffix
|
|
942
|
+
|
|
943
|
+
def street_name(self) -> str:
|
|
944
|
+
"""Generate a random street name.
|
|
945
|
+
|
|
946
|
+
If the locale has `streets_by_city`, use city-specific streets.
|
|
947
|
+
Otherwise, fall back to combining `street_names` and `street_suffixes`.
|
|
948
|
+
"""
|
|
949
|
+
# Check if locale uses city-specific streets
|
|
950
|
+
streets_by_city = self._data.address.get("streets_by_city")
|
|
951
|
+
if streets_by_city:
|
|
952
|
+
# Get current city from location
|
|
953
|
+
location = self._get_current_location()
|
|
954
|
+
city = location.get("city", "")
|
|
955
|
+
city_streets = streets_by_city.get(city)
|
|
956
|
+
if city_streets:
|
|
957
|
+
return self.rng.choice(city_streets)
|
|
958
|
+
|
|
959
|
+
# Fall back to old street_names + street_suffixes approach
|
|
960
|
+
names = self._data.address.get("street_names", ["Main"])
|
|
961
|
+
suffixes = self._data.address.get("street_suffixes", ["St"])
|
|
962
|
+
return f"{self.rng.choice(names)} {self.rng.choice(suffixes)}"
|
|
963
|
+
|
|
964
|
+
def building_number(self) -> str:
|
|
965
|
+
"""Generate a random building number."""
|
|
966
|
+
return str(self.rng.randint(1, 9999))
|
|
967
|
+
|
|
968
|
+
def address(self) -> str:
|
|
969
|
+
"""Generate a full coherent address (city, state, postcode are consistent)."""
|
|
970
|
+
# Only select a new location if row locations are not active
|
|
971
|
+
# This ensures coherence with other address-related columns (city, state, etc.)
|
|
972
|
+
using_row_context = self._row_locations is not None and self._current_row is not None
|
|
973
|
+
if not using_row_context:
|
|
974
|
+
self.new_location()
|
|
975
|
+
|
|
976
|
+
formats = self._data.address.get(
|
|
977
|
+
"address_formats",
|
|
978
|
+
["{building_number} {street}, {city}, {state} {postcode}"],
|
|
979
|
+
)
|
|
980
|
+
fmt = self.rng.choice(formats)
|
|
981
|
+
|
|
982
|
+
result = fmt.format(
|
|
983
|
+
building_number=self.building_number(),
|
|
984
|
+
street=self.street_name(),
|
|
985
|
+
city=self._city_native(), # Use native name in addresses
|
|
986
|
+
state=self.state(abbr=False),
|
|
987
|
+
state_abbr=self.state(abbr=True),
|
|
988
|
+
postcode=self.postcode(),
|
|
989
|
+
country=self.country(),
|
|
990
|
+
unit=str(self.rng.randint(1, 999)),
|
|
991
|
+
)
|
|
992
|
+
|
|
993
|
+
# Clear location after generating full address (only if we set it)
|
|
994
|
+
if not using_row_context:
|
|
995
|
+
self.clear_location()
|
|
996
|
+
return result
|
|
997
|
+
|
|
998
|
+
def phone_number(self) -> str:
|
|
999
|
+
"""Generate a phone number with area code matching the current location's state."""
|
|
1000
|
+
location = self._get_current_location()
|
|
1001
|
+
state = location.get("state", "California")
|
|
1002
|
+
|
|
1003
|
+
# Get area codes for this state
|
|
1004
|
+
area_codes = self._data.address.get("phone_area_codes", {})
|
|
1005
|
+
state_codes = area_codes.get(state, ["555"]) # 555 is fictional fallback
|
|
1006
|
+
area_code = self.rng.choice(state_codes)
|
|
1007
|
+
|
|
1008
|
+
# Generate the rest of the number
|
|
1009
|
+
exchange = str(self.rng.randint(200, 999)) # Exchange can't start with 0 or 1
|
|
1010
|
+
subscriber = str(self.rng.randint(0, 9999)).zfill(4)
|
|
1011
|
+
|
|
1012
|
+
return f"({area_code}) {exchange}-{subscriber}"
|
|
1013
|
+
|
|
1014
|
+
def latitude(self) -> str:
|
|
1015
|
+
"""Generate a random latitude (bounded by current location if available)."""
|
|
1016
|
+
location = self._get_current_location()
|
|
1017
|
+
lat_min = location.get("lat_min", -90)
|
|
1018
|
+
lat_max = location.get("lat_max", 90)
|
|
1019
|
+
return f"{self.rng.uniform(lat_min, lat_max):.6f}"
|
|
1020
|
+
|
|
1021
|
+
def longitude(self) -> str:
|
|
1022
|
+
"""Generate a random longitude (bounded by current location if available)."""
|
|
1023
|
+
location = self._get_current_location()
|
|
1024
|
+
lon_min = location.get("lon_min", -180)
|
|
1025
|
+
lon_max = location.get("lon_max", 180)
|
|
1026
|
+
return f"{self.rng.uniform(lon_min, lon_max):.6f}"
|
|
1027
|
+
|
|
1028
|
+
# =========================================================================
|
|
1029
|
+
# Company
|
|
1030
|
+
# =========================================================================
|
|
1031
|
+
|
|
1032
|
+
def company(self) -> str:
|
|
1033
|
+
"""Generate a random company name.
|
|
1034
|
+
|
|
1035
|
+
Has a ~15% chance to return a well-known company name, with preference
|
|
1036
|
+
for companies that have offices in the current city (if location context is active).
|
|
1037
|
+
Otherwise generates a fictional company name.
|
|
1038
|
+
"""
|
|
1039
|
+
# 15% chance to use a well-known company
|
|
1040
|
+
if self.rng.random() < 0.15:
|
|
1041
|
+
well_known = self._data.company.get("well_known_companies", [])
|
|
1042
|
+
if well_known:
|
|
1043
|
+
# Get current city if location context is active
|
|
1044
|
+
current_city = None
|
|
1045
|
+
if self._row_locations is not None and self._current_row is not None:
|
|
1046
|
+
current_city = self._row_locations[self._current_row].get("city")
|
|
1047
|
+
|
|
1048
|
+
# Collect all companies, preferring those in the current city
|
|
1049
|
+
city_companies = []
|
|
1050
|
+
all_companies = []
|
|
1051
|
+
|
|
1052
|
+
for company in well_known:
|
|
1053
|
+
name = company.get("name") if isinstance(company, dict) else company
|
|
1054
|
+
cities = company.get("cities", []) if isinstance(company, dict) else []
|
|
1055
|
+
all_companies.append(name)
|
|
1056
|
+
if current_city and current_city in cities:
|
|
1057
|
+
city_companies.append(name)
|
|
1058
|
+
|
|
1059
|
+
# 70% chance to use city-relevant company if available
|
|
1060
|
+
if city_companies and self.rng.random() < 0.7:
|
|
1061
|
+
return self.rng.choice(city_companies)
|
|
1062
|
+
elif all_companies:
|
|
1063
|
+
return self.rng.choice(all_companies)
|
|
1064
|
+
|
|
1065
|
+
# Generate a fictional company name
|
|
1066
|
+
formats = self._data.company.get("formats", ["{last_name} {suffix}"])
|
|
1067
|
+
fmt = self.rng.choice(formats)
|
|
1068
|
+
|
|
1069
|
+
suffixes = self._data.company.get("suffixes", ["Inc", "LLC", "Corp"])
|
|
1070
|
+
adjectives = self._data.company.get("adjectives", ["Global", "Advanced"])
|
|
1071
|
+
nouns = self._data.company.get("nouns", ["Solutions", "Systems"])
|
|
1072
|
+
|
|
1073
|
+
# Count how many {last_name} placeholders are in the format
|
|
1074
|
+
# and generate distinct last names for each
|
|
1075
|
+
last_name_count = fmt.count("{last_name}")
|
|
1076
|
+
if last_name_count <= 1:
|
|
1077
|
+
company_last_name = self._generate_last_name()
|
|
1078
|
+
return fmt.format(
|
|
1079
|
+
last_name=company_last_name,
|
|
1080
|
+
suffix=self.rng.choice(suffixes),
|
|
1081
|
+
adjective=self.rng.choice(adjectives),
|
|
1082
|
+
noun=self.rng.choice(nouns),
|
|
1083
|
+
)
|
|
1084
|
+
else:
|
|
1085
|
+
# Generate distinct last names for formats like "{last_name} and {last_name}"
|
|
1086
|
+
last_names = []
|
|
1087
|
+
for _ in range(last_name_count):
|
|
1088
|
+
new_name = self._generate_last_name()
|
|
1089
|
+
# Ensure we don't repeat the same name
|
|
1090
|
+
attempts = 0
|
|
1091
|
+
while new_name in last_names and attempts < 10:
|
|
1092
|
+
new_name = self._generate_last_name()
|
|
1093
|
+
attempts += 1
|
|
1094
|
+
last_names.append(new_name)
|
|
1095
|
+
|
|
1096
|
+
# Replace placeholders one at a time
|
|
1097
|
+
result = fmt
|
|
1098
|
+
for name in last_names:
|
|
1099
|
+
result = result.replace("{last_name}", name, 1)
|
|
1100
|
+
|
|
1101
|
+
return result.format(
|
|
1102
|
+
suffix=self.rng.choice(suffixes),
|
|
1103
|
+
adjective=self.rng.choice(adjectives),
|
|
1104
|
+
noun=self.rng.choice(nouns),
|
|
1105
|
+
)
|
|
1106
|
+
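The multi-surname branch above fills each `{last_name}` placeholder one at a time with `str.replace(..., 1)`, so a format such as `"{last_name} and {last_name} {suffix}"` receives two distinct surnames before the remaining placeholders are handled by `str.format`. A minimal standalone sketch of that technique (the `fill_last_names` helper and the name pool are hypothetical, not part of the package):

```python
import random


def fill_last_names(fmt: str, rng: random.Random, pool: list[str]) -> str:
    """Replace each "{last_name}" occurrence with a distinct surname, left to right."""
    picked: list[str] = []
    for _ in range(fmt.count("{last_name}")):
        name = rng.choice(pool)
        attempts = 0
        while name in picked and attempts < 10:  # same retry cap as the method above
            name = rng.choice(pool)
            attempts += 1
        picked.append(name)
        fmt = fmt.replace("{last_name}", name, 1)  # only the first remaining placeholder
    return fmt


rng = random.Random(0)
print(fill_last_names("{last_name} and {last_name} {suffix}", rng, ["Ng", "Smith", "Rossi"]))
# e.g. "Smith and Ng {suffix}"; the {suffix} placeholder is filled later via str.format
```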
+    def job(self) -> str:
+        """Generate a random job title."""
+        jobs = self._data.company.get("jobs", ["Manager"])
+        return self.rng.choice(jobs)
+
+    def catch_phrase(self) -> str:
+        """Generate a random business catch phrase."""
+        adjectives = self._data.company.get("catch_phrase_adjectives", ["Innovative", "Dynamic"])
+        nouns = self._data.company.get("catch_phrase_nouns", ["solutions", "paradigms"])
+        verbs = self._data.company.get("catch_phrase_verbs", ["deliver", "leverage"])
+        return (
+            f"{self.rng.choice(adjectives)} {self.rng.choice(nouns)} that {self.rng.choice(verbs)}"
+        )
+
+    # =========================================================================
+    # Internet
+    # =========================================================================
+
+    def email(self) -> str:
+        """Generate a random email address (coherent with current person context)."""
+        # Get person data - uses cached person if available
+        person = self._get_current_person()
+        first = person.get("first_name", "user").lower()
+        last = person.get("last_name", "name").lower()
+        domains = self._data.internet.get("free_email_domains", ["gmail.com", "outlook.com"])
+
+        # Transliterate to ASCII for valid email addresses
+        first = _transliterate_to_ascii(first)
+        last = _transliterate_to_ascii(last)
+
+        # Clean names for email (remove non-alphanumeric)
+        first = "".join(c for c in first if c.isalnum())
+        last = "".join(c for c in last if c.isalnum())
+
+        # Various realistic email patterns
+        patterns = [
+            f"{first}.{last}",  # john.smith
+            f"{first}{last}",  # johnsmith
+            f"{first}_{last}",  # john_smith
+            f"{first[0]}{last}",  # jsmith
+            f"{first}{self.rng.randint(1, 999)}",  # john123
+            f"{first[0]}{last}{self.rng.randint(1, 99)}",  # jsmith42
+            f"{first}.{last}{self.rng.randint(1, 99)}",  # john.smith99
+            f"{first[0]}_{last}",  # j_smith
+        ]
+
+        return f"{self.rng.choice(patterns)}@{self.rng.choice(domains)}"
+
+    def user_name(self) -> str:
+        """Generate a random username (coherent with current person context)."""
+        # Get person data - uses cached person if available
+        person = self._get_current_person()
+        first = person.get("first_name", "user").lower()
+        last = person.get("last_name", "name").lower()
+
+        # Transliterate to ASCII for valid usernames
+        first = _transliterate_to_ascii(first)
+        last = _transliterate_to_ascii(last)
+
+        # Clean names
+        first = "".join(c for c in first if c.isalnum())
+        last = "".join(c for c in last if c.isalnum())
+
+        patterns = [
+            f"{first}{last}",
+            f"{first}_{last}",
+            f"{first}{self.rng.randint(1, 999)}",
+            f"{first[0]}{last}{self.rng.randint(1, 99)}",
+        ]
+
+        return self.rng.choice(patterns)
+
+    def password(self, length: int = 12) -> str:
+        """Generate a random password."""
+        import string
+
+        chars = string.ascii_letters + string.digits + "!@#$%^&*"
+        return "".join(self.rng.choice(chars) for _ in range(length))
+
+    def url(self) -> str:
+        """Generate a random URL."""
+        protocols = ["https://"]
+        tlds = self._data.internet.get("tlds", ["com", "org", "net"])
+        words = self._data.text.get("words", ["example", "test", "sample"])
+
+        domain = self.rng.choice(words).lower()
+        domain = "".join(c for c in domain if c.isalnum())
+
+        return f"{self.rng.choice(protocols)}www.{domain}.{self.rng.choice(tlds)}"
+
+    def domain_name(self) -> str:
+        """Generate a random domain name."""
+        tlds = self._data.internet.get("tlds", ["com", "org", "net"])
+        words = self._data.text.get("words", ["example", "test", "sample"])
+
+        domain = self.rng.choice(words).lower()
+        domain = "".join(c for c in domain if c.isalnum())
+
+        return f"{domain}.{self.rng.choice(tlds)}"
+
+    def ipv4(self) -> str:
+        """Generate a random IPv4 address."""
+        return ".".join(str(self.rng.randint(0, 255)) for _ in range(4))
+
+    def ipv6(self) -> str:
+        """Generate a random IPv6 address."""
+        return ":".join(f"{self.rng.randint(0, 65535):04x}" for _ in range(8))
+
+    # =========================================================================
+    # Text
+    # =========================================================================
+
+    def word(self) -> str:
+        """Generate a random word."""
+        words = self._data.text.get("words", ["lorem", "ipsum", "dolor"])
+        return self.rng.choice(words)
+
+    def sentence(self, num_words: int | None = None) -> str:
+        """Generate a random sentence."""
+        if num_words is None:
+            num_words = self.rng.randint(5, 15)
+
+        words = [self.word() for _ in range(num_words)]
+        words[0] = words[0].capitalize()
+        return " ".join(words) + "."
+
+    def paragraph(self, num_sentences: int | None = None) -> str:
+        """Generate a random paragraph."""
+        if num_sentences is None:
+            num_sentences = self.rng.randint(3, 7)
+
+        return " ".join(self.sentence() for _ in range(num_sentences))
+
+    def text(self, max_chars: int = 200) -> str:
+        """Generate random text up to max_chars."""
+        result = []
+        current_length = 0
+
+        while current_length < max_chars:
+            sentence = self.sentence()
+            if current_length + len(sentence) + 1 > max_chars:
+                break
+            result.append(sentence)
+            current_length += len(sentence) + 1
+
+        return " ".join(result) if result else self.sentence()[:max_chars]
+
+    # =========================================================================
+    # Financial
+    # =========================================================================
+
+    def credit_card_number(self) -> str:
+        """Generate a random credit card number (not valid for transactions)."""
+        # Generate a 16-digit number with valid Luhn checksum
+        prefix = self.rng.choice(["4", "5", "37", "6011"])  # Visa, MC, Amex, Discover
+        length = 15 if prefix == "37" else 16
+
+        # Generate digits (minus check digit)
+        digits = list(prefix)
+        while len(digits) < length - 1:
+            digits.append(str(self.rng.randint(0, 9)))
+
+        # Calculate Luhn check digit
+        check_digit = self._luhn_checksum(digits)
+        digits.append(str(check_digit))
+
+        return "".join(digits)
+
+    def _luhn_checksum(self, digits: list[str]) -> int:
+        """Calculate Luhn check digit for a partial card number.
+
+        The check digit is appended to make the full number pass the Luhn algorithm.
+        We process from right to left, doubling every second digit starting from
+        the rightmost digit of the partial number (since the check digit will be
+        at position 0 and won't be doubled).
+        """
+        nums = [int(d) for d in digits]
+        total = 0
+        for i, d in enumerate(reversed(nums)):
+            if i % 2 == 0:  # These positions get doubled (check digit at pos 0 won't be)
+                d = d * 2
+                if d > 9:
+                    d -= 9
+            total += d
+        return (10 - (total % 10)) % 10
+
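To see why the doubling starts at `i % 2 == 0` here: once the check digit is appended it occupies position 0 from the right, shifting every digit of the partial number up by one position, so the partial number's rightmost digit is the first one to be doubled. A small standalone validator (hypothetical `luhn_valid`, not part of the package) confirms that numbers built this way pass the standard Luhn test:

```python
def luhn_valid(number: str) -> bool:
    """Standard Luhn check over a complete number; the rightmost digit is position 0."""
    total = 0
    for i, ch in enumerate(reversed(number)):
        d = int(ch)
        if i % 2 == 1:  # double every second digit from the right, skipping the check digit
            d *= 2
            if d > 9:
                d -= 9
        total += d
    return total % 10 == 0


# "79927398713" is the commonly cited Luhn example number.
assert luhn_valid("79927398713")

# Assuming this module is importable as pointblank.locales, generated card numbers
# should also validate, since the check digit above is appended last:
# from pointblank.locales import get_generator
# assert luhn_valid(get_generator("US", seed=1).credit_card_number())
```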
+    def iban(self) -> str:
+        """Generate a random IBAN."""
+        # Simplified - generates a plausible-looking IBAN
+        country = self._data.address.get("country_code", "US")
+        if country == "US":
+            # US doesn't use IBAN, use DE as example
+            country = "DE"
+
+        check_digits = f"{self.rng.randint(10, 99)}"
+        bank_code = "".join(str(self.rng.randint(0, 9)) for _ in range(8))
+        account = "".join(str(self.rng.randint(0, 9)) for _ in range(10))
+
+        return f"{country}{check_digits}{bank_code}{account}"
+
+    def currency_code(self) -> str:
+        """Generate a random currency code."""
+        codes = self._data.misc.get("currency_codes", ["USD", "EUR", "GBP", "JPY", "CNY"])
+        return self.rng.choice(codes)
+
+    # =========================================================================
+    # Identifiers
+    # =========================================================================
+
+    def uuid4(self) -> str:
+        """Generate a random UUID4."""
+        # Use our RNG to generate deterministic UUIDs
+        hex_chars = "0123456789abcdef"
+        parts = [
+            "".join(self.rng.choice(hex_chars) for _ in range(8)),
+            "".join(self.rng.choice(hex_chars) for _ in range(4)),
+            "4" + "".join(self.rng.choice(hex_chars) for _ in range(3)),  # Version 4
+            self.rng.choice("89ab")
+            + "".join(self.rng.choice(hex_chars) for _ in range(3)),  # Variant
+            "".join(self.rng.choice(hex_chars) for _ in range(12)),
+        ]
+        return "-".join(parts)
+
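The hard-coded leading `4` in the third group and the `8/9/a/b` nibble in the fourth are what make the result parse as a version-4, RFC 4122 UUID. A quick check with the standard library `uuid` module (the `pointblank.locales` import path and the call to `get_generator`, defined later in this file, are assumptions):

```python
import uuid

from pointblank.locales import get_generator  # assumed public import path

u = get_generator("US", seed=5).uuid4()
parsed = uuid.UUID(u)
print(parsed.version)  # 4, from the hard-coded version nibble
print(parsed.variant)  # uuid.RFC_4122, from the "89ab" variant nibble
```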
+    def ssn(self) -> str:
+        """Generate a random SSN-like identifier."""
+        # US format: XXX-XX-XXXX
+        fmt = self._data.misc.get("ssn_format", "###-##-####")
+        return self._generate_from_format(fmt)
+
+    def license_plate(self) -> str:
+        """Generate a random license plate."""
+        fmt = self._data.misc.get("license_plate_format", "???-####")
+        return self._generate_from_format(fmt)
+
+    # =========================================================================
+    # Date/Time (string representations)
+    # =========================================================================
+
+    def date_this_year(self) -> str:
+        """Generate a random date from this year as ISO string."""
+        from datetime import date, timedelta
+
+        today = date.today()
+        start = date(today.year, 1, 1)
+        days = (today - start).days
+        random_date = start + timedelta(days=self.rng.randint(0, max(days, 1)))
+        return random_date.isoformat()
+
+    def date_this_decade(self) -> str:
+        """Generate a random date from this decade as ISO string."""
+        from datetime import date, timedelta
+
+        today = date.today()
+        decade_start = (today.year // 10) * 10
+        start = date(decade_start, 1, 1)
+        days = (today - start).days
+        random_date = start + timedelta(days=self.rng.randint(0, max(days, 1)))
+        return random_date.isoformat()
+
+    def time(self) -> str:
+        """Generate a random time as string."""
+        hour = self.rng.randint(0, 23)
+        minute = self.rng.randint(0, 59)
+        second = self.rng.randint(0, 59)
+        return f"{hour:02d}:{minute:02d}:{second:02d}"
+
+    # =========================================================================
+    # Misc
+    # =========================================================================
+
+    def color_name(self) -> str:
+        """Generate a random color name."""
+        colors = self._data.misc.get(
+            "colors",
+            [
+                "Red",
+                "Blue",
+                "Green",
+                "Yellow",
+                "Purple",
+                "Orange",
+                "Pink",
+                "Brown",
+                "Black",
+                "White",
+                "Gray",
+                "Cyan",
+                "Magenta",
+            ],
+        )
+        return self.rng.choice(colors)
+
+    def file_name(self) -> str:
+        """Generate a random file name."""
+        words = self._data.text.get("words", ["document", "file", "report"])
+        extensions = self._data.misc.get("file_extensions", ["txt", "pdf", "doc", "xlsx"])
+        word = self.rng.choice(words).lower()
+        word = "".join(c for c in word if c.isalnum())
+        return f"{word}.{self.rng.choice(extensions)}"
+
+    def file_extension(self) -> str:
+        """Generate a random file extension."""
+        extensions = self._data.misc.get(
+            "file_extensions", ["txt", "pdf", "doc", "xlsx", "png", "jpg"]
+        )
+        return self.rng.choice(extensions)
+
+    def mime_type(self) -> str:
+        """Generate a random MIME type."""
+        mime_types = self._data.misc.get(
+            "mime_types",
+            [
+                "text/plain",
+                "text/html",
+                "application/json",
+                "application/pdf",
+                "image/png",
+                "image/jpeg",
+            ],
+        )
+        return self.rng.choice(mime_types)
+
+    # =========================================================================
+    # Utilities
+    # =========================================================================
+
+    def _generate_from_format(self, fmt: str) -> str:
+        """
+        Generate a string from a format pattern.
+
+        Patterns:
+        - # = digit (0-9)
+        - ? = uppercase letter (A-Z)
+        - * = alphanumeric
+        """
+        result = []
+        for char in fmt:
+            if char == "#":
+                result.append(str(self.rng.randint(0, 9)))
+            elif char == "?":
+                result.append(self.rng.choice("ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
+            elif char == "*":
+                result.append(self.rng.choice("ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"))
+            else:
+                result.append(char)
+        return "".join(result)
+
+
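The `#`/`?`/`*` pattern language above is what `ssn()` and `license_plate()` feed their locale formats into. A short usage sketch (it assumes the `pointblank.locales` import path, the `get_generator` helper defined just below, and locale data shaped like the fallback formats shown earlier):

```python
from pointblank.locales import get_generator  # assumed public import path

gen = get_generator("US", seed=7)
print(gen.ssn())            # "###-##-####" -> three digits, two digits, four digits
print(gen.license_plate())  # "???-####"    -> three uppercase letters, dash, four digits

# Private helper, shown only to illustrate the pattern characters directly:
print(gen._generate_from_format("**-###"))  # two alphanumerics, dash, three digits
```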
+# Module-level convenience function
+_default_registry = LocaleRegistry()
+
+
+def get_generator(country: str = "US", seed: int | None = None) -> LocaleGenerator:
+    """
+    Get a country data generator instance.
+
+    Parameters
+    ----------
+    country
+        Country code (e.g., "US", "DE", "USA", "DEU").
+        Also accepts legacy locale codes like "en_US" for backwards compatibility.
+    seed
+        Random seed for reproducibility.
+
+    Returns
+    -------
+    LocaleGenerator
+        A generator configured for the specified country.
+    """
+    return LocaleGenerator(country=country, seed=seed)
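A brief usage sketch for the convenience function; the import path assumes this module is exposed as `pointblank.locales`, and the reproducibility claim follows from the seeded RNG described in the docstring above:

```python
from pointblank.locales import get_generator  # assumed public import path

gen_a = get_generator("DE", seed=123)
gen_b = get_generator("DE", seed=123)

# Same country and seed should yield the same sequence of values,
# assuming generation is driven entirely by the seeded RNG.
assert gen_a.company() == gen_b.company()
assert gen_a.currency_code() == gen_b.currency_code()

print(get_generator("US", seed=1).company())  # e.g. a fictional "<Surname> LLC"-style name
```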