pointblank 0.18.0__py3-none-any.whl → 0.20.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +44 -1
- pointblank/_constants.py +258 -166
- pointblank/_constants_translations.py +378 -0
- pointblank/_interrogation.py +204 -0
- pointblank/_utils_llms_txt.py +20 -0
- pointblank/data/api-docs.txt +793 -1
- pointblank/field.py +1507 -0
- pointblank/generate/__init__.py +17 -0
- pointblank/generate/base.py +49 -0
- pointblank/generate/generators.py +573 -0
- pointblank/generate/regex.py +217 -0
- pointblank/locales/__init__.py +1476 -0
- pointblank/locales/data/AR/address.json +73 -0
- pointblank/locales/data/AR/company.json +60 -0
- pointblank/locales/data/AR/internet.json +19 -0
- pointblank/locales/data/AR/misc.json +7 -0
- pointblank/locales/data/AR/person.json +39 -0
- pointblank/locales/data/AR/text.json +38 -0
- pointblank/locales/data/AT/address.json +84 -0
- pointblank/locales/data/AT/company.json +65 -0
- pointblank/locales/data/AT/internet.json +20 -0
- pointblank/locales/data/AT/misc.json +8 -0
- pointblank/locales/data/AT/person.json +17 -0
- pointblank/locales/data/AT/text.json +35 -0
- pointblank/locales/data/AU/address.json +83 -0
- pointblank/locales/data/AU/company.json +65 -0
- pointblank/locales/data/AU/internet.json +20 -0
- pointblank/locales/data/AU/misc.json +8 -0
- pointblank/locales/data/AU/person.json +17 -0
- pointblank/locales/data/AU/text.json +35 -0
- pointblank/locales/data/BE/address.json +225 -0
- pointblank/locales/data/BE/company.json +129 -0
- pointblank/locales/data/BE/internet.json +36 -0
- pointblank/locales/data/BE/misc.json +6 -0
- pointblank/locales/data/BE/person.json +62 -0
- pointblank/locales/data/BE/text.json +38 -0
- pointblank/locales/data/BG/address.json +75 -0
- pointblank/locales/data/BG/company.json +60 -0
- pointblank/locales/data/BG/internet.json +19 -0
- pointblank/locales/data/BG/misc.json +7 -0
- pointblank/locales/data/BG/person.json +40 -0
- pointblank/locales/data/BG/text.json +38 -0
- pointblank/locales/data/BR/address.json +98 -0
- pointblank/locales/data/BR/company.json +65 -0
- pointblank/locales/data/BR/internet.json +20 -0
- pointblank/locales/data/BR/misc.json +8 -0
- pointblank/locales/data/BR/person.json +17 -0
- pointblank/locales/data/BR/text.json +35 -0
- pointblank/locales/data/CA/address.json +747 -0
- pointblank/locales/data/CA/company.json +120 -0
- pointblank/locales/data/CA/internet.json +24 -0
- pointblank/locales/data/CA/misc.json +11 -0
- pointblank/locales/data/CA/person.json +1033 -0
- pointblank/locales/data/CA/text.json +58 -0
- pointblank/locales/data/CH/address.json +184 -0
- pointblank/locales/data/CH/company.json +112 -0
- pointblank/locales/data/CH/internet.json +20 -0
- pointblank/locales/data/CH/misc.json +10 -0
- pointblank/locales/data/CH/person.json +64 -0
- pointblank/locales/data/CH/text.json +45 -0
- pointblank/locales/data/CL/address.json +71 -0
- pointblank/locales/data/CL/company.json +60 -0
- pointblank/locales/data/CL/internet.json +19 -0
- pointblank/locales/data/CL/misc.json +7 -0
- pointblank/locales/data/CL/person.json +38 -0
- pointblank/locales/data/CL/text.json +38 -0
- pointblank/locales/data/CN/address.json +124 -0
- pointblank/locales/data/CN/company.json +76 -0
- pointblank/locales/data/CN/internet.json +20 -0
- pointblank/locales/data/CN/misc.json +8 -0
- pointblank/locales/data/CN/person.json +50 -0
- pointblank/locales/data/CN/text.json +38 -0
- pointblank/locales/data/CO/address.json +76 -0
- pointblank/locales/data/CO/company.json +60 -0
- pointblank/locales/data/CO/internet.json +19 -0
- pointblank/locales/data/CO/misc.json +7 -0
- pointblank/locales/data/CO/person.json +38 -0
- pointblank/locales/data/CO/text.json +38 -0
- pointblank/locales/data/CY/address.json +62 -0
- pointblank/locales/data/CY/company.json +60 -0
- pointblank/locales/data/CY/internet.json +19 -0
- pointblank/locales/data/CY/misc.json +7 -0
- pointblank/locales/data/CY/person.json +38 -0
- pointblank/locales/data/CY/text.json +38 -0
- pointblank/locales/data/CZ/address.json +70 -0
- pointblank/locales/data/CZ/company.json +61 -0
- pointblank/locales/data/CZ/internet.json +19 -0
- pointblank/locales/data/CZ/misc.json +7 -0
- pointblank/locales/data/CZ/person.json +40 -0
- pointblank/locales/data/CZ/text.json +38 -0
- pointblank/locales/data/DE/address.json +756 -0
- pointblank/locales/data/DE/company.json +101 -0
- pointblank/locales/data/DE/internet.json +22 -0
- pointblank/locales/data/DE/misc.json +11 -0
- pointblank/locales/data/DE/person.json +1026 -0
- pointblank/locales/data/DE/text.json +50 -0
- pointblank/locales/data/DK/address.json +231 -0
- pointblank/locales/data/DK/company.json +65 -0
- pointblank/locales/data/DK/internet.json +20 -0
- pointblank/locales/data/DK/misc.json +7 -0
- pointblank/locales/data/DK/person.json +45 -0
- pointblank/locales/data/DK/text.json +43 -0
- pointblank/locales/data/EE/address.json +69 -0
- pointblank/locales/data/EE/company.json +60 -0
- pointblank/locales/data/EE/internet.json +19 -0
- pointblank/locales/data/EE/misc.json +7 -0
- pointblank/locales/data/EE/person.json +39 -0
- pointblank/locales/data/EE/text.json +38 -0
- pointblank/locales/data/ES/address.json +3086 -0
- pointblank/locales/data/ES/company.json +644 -0
- pointblank/locales/data/ES/internet.json +25 -0
- pointblank/locales/data/ES/misc.json +11 -0
- pointblank/locales/data/ES/person.json +488 -0
- pointblank/locales/data/ES/text.json +49 -0
- pointblank/locales/data/FI/address.json +93 -0
- pointblank/locales/data/FI/company.json +65 -0
- pointblank/locales/data/FI/internet.json +20 -0
- pointblank/locales/data/FI/misc.json +8 -0
- pointblank/locales/data/FI/person.json +17 -0
- pointblank/locales/data/FI/text.json +35 -0
- pointblank/locales/data/FR/address.json +619 -0
- pointblank/locales/data/FR/company.json +111 -0
- pointblank/locales/data/FR/internet.json +22 -0
- pointblank/locales/data/FR/misc.json +11 -0
- pointblank/locales/data/FR/person.json +1066 -0
- pointblank/locales/data/FR/text.json +50 -0
- pointblank/locales/data/GB/address.json +5759 -0
- pointblank/locales/data/GB/company.json +131 -0
- pointblank/locales/data/GB/internet.json +24 -0
- pointblank/locales/data/GB/misc.json +45 -0
- pointblank/locales/data/GB/person.json +578 -0
- pointblank/locales/data/GB/text.json +61 -0
- pointblank/locales/data/GR/address.json +68 -0
- pointblank/locales/data/GR/company.json +61 -0
- pointblank/locales/data/GR/internet.json +19 -0
- pointblank/locales/data/GR/misc.json +7 -0
- pointblank/locales/data/GR/person.json +39 -0
- pointblank/locales/data/GR/text.json +38 -0
- pointblank/locales/data/HK/address.json +79 -0
- pointblank/locales/data/HK/company.json +69 -0
- pointblank/locales/data/HK/internet.json +19 -0
- pointblank/locales/data/HK/misc.json +7 -0
- pointblank/locales/data/HK/person.json +42 -0
- pointblank/locales/data/HK/text.json +38 -0
- pointblank/locales/data/HR/address.json +73 -0
- pointblank/locales/data/HR/company.json +60 -0
- pointblank/locales/data/HR/internet.json +19 -0
- pointblank/locales/data/HR/misc.json +7 -0
- pointblank/locales/data/HR/person.json +38 -0
- pointblank/locales/data/HR/text.json +38 -0
- pointblank/locales/data/HU/address.json +70 -0
- pointblank/locales/data/HU/company.json +61 -0
- pointblank/locales/data/HU/internet.json +19 -0
- pointblank/locales/data/HU/misc.json +7 -0
- pointblank/locales/data/HU/person.json +40 -0
- pointblank/locales/data/HU/text.json +38 -0
- pointblank/locales/data/ID/address.json +68 -0
- pointblank/locales/data/ID/company.json +61 -0
- pointblank/locales/data/ID/internet.json +19 -0
- pointblank/locales/data/ID/misc.json +7 -0
- pointblank/locales/data/ID/person.json +40 -0
- pointblank/locales/data/ID/text.json +38 -0
- pointblank/locales/data/IE/address.json +643 -0
- pointblank/locales/data/IE/company.json +140 -0
- pointblank/locales/data/IE/internet.json +24 -0
- pointblank/locales/data/IE/misc.json +44 -0
- pointblank/locales/data/IE/person.json +55 -0
- pointblank/locales/data/IE/text.json +60 -0
- pointblank/locales/data/IN/address.json +92 -0
- pointblank/locales/data/IN/company.json +65 -0
- pointblank/locales/data/IN/internet.json +20 -0
- pointblank/locales/data/IN/misc.json +8 -0
- pointblank/locales/data/IN/person.json +52 -0
- pointblank/locales/data/IN/text.json +39 -0
- pointblank/locales/data/IS/address.json +63 -0
- pointblank/locales/data/IS/company.json +61 -0
- pointblank/locales/data/IS/internet.json +19 -0
- pointblank/locales/data/IS/misc.json +7 -0
- pointblank/locales/data/IS/person.json +44 -0
- pointblank/locales/data/IS/text.json +38 -0
- pointblank/locales/data/IT/address.json +192 -0
- pointblank/locales/data/IT/company.json +137 -0
- pointblank/locales/data/IT/internet.json +20 -0
- pointblank/locales/data/IT/misc.json +10 -0
- pointblank/locales/data/IT/person.json +70 -0
- pointblank/locales/data/IT/text.json +44 -0
- pointblank/locales/data/JP/address.json +713 -0
- pointblank/locales/data/JP/company.json +113 -0
- pointblank/locales/data/JP/internet.json +22 -0
- pointblank/locales/data/JP/misc.json +10 -0
- pointblank/locales/data/JP/person.json +1057 -0
- pointblank/locales/data/JP/text.json +51 -0
- pointblank/locales/data/KR/address.json +77 -0
- pointblank/locales/data/KR/company.json +68 -0
- pointblank/locales/data/KR/internet.json +19 -0
- pointblank/locales/data/KR/misc.json +7 -0
- pointblank/locales/data/KR/person.json +40 -0
- pointblank/locales/data/KR/text.json +38 -0
- pointblank/locales/data/LT/address.json +66 -0
- pointblank/locales/data/LT/company.json +60 -0
- pointblank/locales/data/LT/internet.json +19 -0
- pointblank/locales/data/LT/misc.json +7 -0
- pointblank/locales/data/LT/person.json +42 -0
- pointblank/locales/data/LT/text.json +38 -0
- pointblank/locales/data/LU/address.json +66 -0
- pointblank/locales/data/LU/company.json +60 -0
- pointblank/locales/data/LU/internet.json +19 -0
- pointblank/locales/data/LU/misc.json +7 -0
- pointblank/locales/data/LU/person.json +38 -0
- pointblank/locales/data/LU/text.json +38 -0
- pointblank/locales/data/LV/address.json +62 -0
- pointblank/locales/data/LV/company.json +60 -0
- pointblank/locales/data/LV/internet.json +19 -0
- pointblank/locales/data/LV/misc.json +7 -0
- pointblank/locales/data/LV/person.json +40 -0
- pointblank/locales/data/LV/text.json +38 -0
- pointblank/locales/data/MT/address.json +61 -0
- pointblank/locales/data/MT/company.json +60 -0
- pointblank/locales/data/MT/internet.json +19 -0
- pointblank/locales/data/MT/misc.json +7 -0
- pointblank/locales/data/MT/person.json +38 -0
- pointblank/locales/data/MT/text.json +38 -0
- pointblank/locales/data/MX/address.json +100 -0
- pointblank/locales/data/MX/company.json +65 -0
- pointblank/locales/data/MX/internet.json +20 -0
- pointblank/locales/data/MX/misc.json +8 -0
- pointblank/locales/data/MX/person.json +18 -0
- pointblank/locales/data/MX/text.json +39 -0
- pointblank/locales/data/NL/address.json +1517 -0
- pointblank/locales/data/NL/company.json +133 -0
- pointblank/locales/data/NL/internet.json +44 -0
- pointblank/locales/data/NL/misc.json +55 -0
- pointblank/locales/data/NL/person.json +365 -0
- pointblank/locales/data/NL/text.json +210 -0
- pointblank/locales/data/NO/address.json +86 -0
- pointblank/locales/data/NO/company.json +66 -0
- pointblank/locales/data/NO/internet.json +20 -0
- pointblank/locales/data/NO/misc.json +8 -0
- pointblank/locales/data/NO/person.json +17 -0
- pointblank/locales/data/NO/text.json +35 -0
- pointblank/locales/data/NZ/address.json +90 -0
- pointblank/locales/data/NZ/company.json +65 -0
- pointblank/locales/data/NZ/internet.json +20 -0
- pointblank/locales/data/NZ/misc.json +8 -0
- pointblank/locales/data/NZ/person.json +17 -0
- pointblank/locales/data/NZ/text.json +39 -0
- pointblank/locales/data/PH/address.json +67 -0
- pointblank/locales/data/PH/company.json +61 -0
- pointblank/locales/data/PH/internet.json +19 -0
- pointblank/locales/data/PH/misc.json +7 -0
- pointblank/locales/data/PH/person.json +40 -0
- pointblank/locales/data/PH/text.json +38 -0
- pointblank/locales/data/PL/address.json +91 -0
- pointblank/locales/data/PL/company.json +65 -0
- pointblank/locales/data/PL/internet.json +20 -0
- pointblank/locales/data/PL/misc.json +8 -0
- pointblank/locales/data/PL/person.json +17 -0
- pointblank/locales/data/PL/text.json +35 -0
- pointblank/locales/data/PT/address.json +90 -0
- pointblank/locales/data/PT/company.json +65 -0
- pointblank/locales/data/PT/internet.json +20 -0
- pointblank/locales/data/PT/misc.json +8 -0
- pointblank/locales/data/PT/person.json +17 -0
- pointblank/locales/data/PT/text.json +35 -0
- pointblank/locales/data/RO/address.json +73 -0
- pointblank/locales/data/RO/company.json +61 -0
- pointblank/locales/data/RO/internet.json +19 -0
- pointblank/locales/data/RO/misc.json +7 -0
- pointblank/locales/data/RO/person.json +40 -0
- pointblank/locales/data/RO/text.json +38 -0
- pointblank/locales/data/RU/address.json +74 -0
- pointblank/locales/data/RU/company.json +60 -0
- pointblank/locales/data/RU/internet.json +19 -0
- pointblank/locales/data/RU/misc.json +7 -0
- pointblank/locales/data/RU/person.json +38 -0
- pointblank/locales/data/RU/text.json +38 -0
- pointblank/locales/data/SE/address.json +247 -0
- pointblank/locales/data/SE/company.json +65 -0
- pointblank/locales/data/SE/internet.json +20 -0
- pointblank/locales/data/SE/misc.json +7 -0
- pointblank/locales/data/SE/person.json +45 -0
- pointblank/locales/data/SE/text.json +43 -0
- pointblank/locales/data/SI/address.json +67 -0
- pointblank/locales/data/SI/company.json +60 -0
- pointblank/locales/data/SI/internet.json +19 -0
- pointblank/locales/data/SI/misc.json +7 -0
- pointblank/locales/data/SI/person.json +38 -0
- pointblank/locales/data/SI/text.json +38 -0
- pointblank/locales/data/SK/address.json +64 -0
- pointblank/locales/data/SK/company.json +60 -0
- pointblank/locales/data/SK/internet.json +19 -0
- pointblank/locales/data/SK/misc.json +7 -0
- pointblank/locales/data/SK/person.json +38 -0
- pointblank/locales/data/SK/text.json +38 -0
- pointblank/locales/data/TR/address.json +105 -0
- pointblank/locales/data/TR/company.json +65 -0
- pointblank/locales/data/TR/internet.json +20 -0
- pointblank/locales/data/TR/misc.json +8 -0
- pointblank/locales/data/TR/person.json +17 -0
- pointblank/locales/data/TR/text.json +35 -0
- pointblank/locales/data/TW/address.json +86 -0
- pointblank/locales/data/TW/company.json +69 -0
- pointblank/locales/data/TW/internet.json +19 -0
- pointblank/locales/data/TW/misc.json +7 -0
- pointblank/locales/data/TW/person.json +42 -0
- pointblank/locales/data/TW/text.json +38 -0
- pointblank/locales/data/US/address.json +996 -0
- pointblank/locales/data/US/company.json +131 -0
- pointblank/locales/data/US/internet.json +22 -0
- pointblank/locales/data/US/misc.json +11 -0
- pointblank/locales/data/US/person.json +1092 -0
- pointblank/locales/data/US/text.json +56 -0
- pointblank/locales/data/_shared/misc.json +42 -0
- pointblank/schema.py +339 -2
- pointblank/validate.py +1263 -11
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/METADATA +45 -1
- pointblank-0.20.0.dist-info/RECORD +366 -0
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/WHEEL +1 -1
- pointblank-0.18.0.dist-info/RECORD +0 -59
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data generation module for Pointblank.
|
|
3
|
+
|
|
4
|
+
This module provides synthetic test data generation from Schema definitions.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from pointblank.generate.base import GeneratorConfig
|
|
8
|
+
from pointblank.generate.generators import (
|
|
9
|
+
generate_column,
|
|
10
|
+
generate_dataframe,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"GeneratorConfig",
|
|
15
|
+
"generate_column",
|
|
16
|
+
"generate_dataframe",
|
|
17
|
+
]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base infrastructure for data generation.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import TYPE_CHECKING, Literal
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
pass
|
|
12
|
+
|
|
13
|
+
__all__ = ["GeneratorConfig"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class GeneratorConfig:
    """
    Settings that control a synthetic data generation run.

    Parameters
    ----------
    n
        How many rows to produce. Must not be negative.
    seed
        Random seed that makes a run reproducible.
    output
        Format of the result: `"polars"`, `"pandas"`, or `"dict"`.
    country
        Country code used for realistic data generation. Both ISO 3166-1 alpha-2 codes
        (e.g., `"US"`, `"DE"`, `"FR"`) and alpha-3 codes (e.g., `"USA"`, `"DEU"`) are
        accepted. Defaults to `"US"`.
    max_unique_retries
        Upper limit on retries while unique values are being generated. Must be at least 1.

    Raises
    ------
    ValueError
        If `n` is negative or `max_unique_retries` is smaller than 1.
    """

    n: int = 100
    seed: int | None = None
    output: Literal["polars", "pandas", "dict"] = "polars"
    country: str = "US"
    max_unique_retries: int = 1000

    def __post_init__(self):
        # Checks run in field order, so `n` is reported first when both values are invalid.
        row_count = self.n
        if row_count < 0:
            raise ValueError(f"n must be non-negative, got {row_count}")

        retry_cap = self.max_unique_retries
        if retry_cap < 1:
            message = f"max_unique_retries must be at least 1, got {retry_cap}"
            raise ValueError(message)
|
|
@@ -0,0 +1,573 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Per-dtype value generators for synthetic data generation.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import random
|
|
8
|
+
import string
|
|
9
|
+
from datetime import date, datetime, time, timedelta
|
|
10
|
+
from typing import TYPE_CHECKING, Any, Callable
|
|
11
|
+
|
|
12
|
+
from pointblank._utils import _is_lib_present
|
|
13
|
+
from pointblank.field import Field
|
|
14
|
+
from pointblank.generate.base import GeneratorConfig
|
|
15
|
+
from pointblank.generate.regex import generate_from_regex
|
|
16
|
+
from pointblank.locales import LocaleGenerator
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
pass
|
|
20
|
+
|
|
21
|
+
__all__ = ["generate_column", "generate_dataframe"]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Integer dtype bounds
|
|
25
|
+
INTEGER_BOUNDS = {
    # Signed dtypes spend one bit on the sign: [-2**(bits - 1), 2**(bits - 1) - 1].
    **{f"Int{bits}": (-(1 << (bits - 1)), (1 << (bits - 1)) - 1) for bits in (8, 16, 32, 64)},
    # Unsigned dtypes use every bit for magnitude: [0, 2**bits - 1].
    **{f"UInt{bits}": (0, (1 << bits) - 1) for bits in (8, 16, 32, 64)},
}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _get_locale_generator(country: str = "US", seed: int | None = None) -> LocaleGenerator:
    """Build a `LocaleGenerator` for `country`, handing `seed` through unchanged."""
    locale_generator = LocaleGenerator(country=country, seed=seed)
    return locale_generator
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _generate_integer(field: Field, rng: random.Random, generator: Any | None = None) -> int:
    """
    Generate a random integer that honors the field's dtype and value constraints.

    Parameters
    ----------
    field
        Field description. `field.dtype` selects the representable range from
        `INTEGER_BOUNDS` (a dtype not listed there falls back to the `Int64` range). Optional
        `min_val` / `max_val` attributes narrow that range; missing or `None` means "no
        constraint on that side".
    rng
        Random number generator the value is drawn from.
    generator
        Not used by this function.

    Returns
    -------
    int
        A value `v` inside the dtype range with `min_val <= v <= max_val`.

    Raises
    ------
    ValueError
        If no integer satisfies the combined constraints (for example `min_val > max_val`,
        or a fractional range such as `[0.2, 0.8]` that contains no integer).
    """
    dtype_min, dtype_max = INTEGER_BOUNDS.get(field.dtype, (-(2**63), 2**63 - 1))

    min_val = getattr(field, "min_val", None)
    max_val = getattr(field, "max_val", None)

    if min_val is None:
        low = dtype_min
    else:
        low = int(min_val)
        # `int()` truncates toward zero, so a positive fractional bound such as 1.5 would
        # become 1 and allow values below the requested minimum. Round up instead.
        if low < min_val:
            low += 1
        low = max(low, dtype_min)

    if max_val is None:
        high = dtype_max
    else:
        high = int(max_val)
        # Mirror image of the above: -1.5 truncates to -1, which exceeds the maximum.
        if high > max_val:
            high -= 1
        high = min(high, dtype_max)

    if low > high:
        # Same exception type `randint` would raise, but naming the offending constraints.
        raise ValueError(
            f"No integer satisfies the constraints for dtype {field.dtype!r}: "
            f"min_val={min_val!r}, max_val={max_val!r} (effective range [{low}, {high}])"
        )

    return rng.randint(low, high)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _generate_float(field: Field, rng: random.Random, generator: Any | None = None) -> float:
    """
    Draw a uniformly distributed float between the field's bounds.

    `min_val` / `max_val` are read from `field` when present; a missing or `None` bound is
    replaced by `-1e10` / `1e10` respectively. `generator` is not used.
    """
    lower = getattr(field, "min_val", None)
    upper = getattr(field, "max_val", None)

    if lower is None:
        lower = -1e10
    if upper is None:
        upper = 1e10

    return rng.uniform(float(lower), float(upper))
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _generate_string(
    field: Field, rng: random.Random, generator: LocaleGenerator | None = None
) -> str:
    """
    Generate a random string that honors the field's constraints.

    The first matching strategy wins:

    1. `field.preset` set: delegate to `_generate_from_preset()` (needs `generator`).
    2. `field.pattern` set: delegate to `_generate_from_pattern()`.
    3. otherwise: a random ASCII alphanumeric string whose length lies between
       `min_length` (default 1) and `max_length` (default 20, or `min_length` when that is
       larger than 20).

    Parameters
    ----------
    field
        Field description; `preset`, `pattern`, `min_length` and `max_length` are read when
        present.
    rng
        Random number generator used for the pattern and plain-string strategies.
    generator
        Locale generator used for presets.

    Returns
    -------
    str
        The generated string.

    Raises
    ------
    ValueError
        If a preset is requested without a `generator`, or if `min_length` exceeds an
        explicitly given `max_length`.
    """
    # If using a preset, delegate to locale generator
    preset = getattr(field, "preset", None)
    if preset is not None:
        if generator is None:
            raise ValueError("LocaleGenerator instance required for preset generation")
        return _generate_from_preset(preset, generator)

    # If using a pattern, generate from regex
    pattern = getattr(field, "pattern", None)
    if pattern is not None:
        return _generate_from_pattern(pattern, rng)

    # Otherwise generate random alphanumeric string
    min_length = getattr(field, "min_length", None)
    max_length = getattr(field, "max_length", None)
    min_len = min_length if min_length is not None else 1
    # The default upper bound must never undercut the requested minimum; otherwise a field
    # with only `min_length=25` would ask `randint` for the empty range [25, 20].
    max_len = max_length if max_length is not None else max(20, min_len)

    if min_len > max_len:
        raise ValueError(f"min_length ({min_len}) cannot exceed max_length ({max_len})")

    length = rng.randint(min_len, max_len)
    chars = string.ascii_letters + string.digits
    return "".join(rng.choice(chars) for _ in range(length))
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# Preset names accepted by `_generate_from_preset()`. Each name is also the name of the
# generator method that produces the value.
_PRESET_NAMES = frozenset(
    {
        # Personal
        "name",
        "name_full",
        "first_name",
        "last_name",
        "email",
        "phone_number",
        "address",
        "city",
        "state",
        "country",
        "postcode",
        "latitude",
        "longitude",
        # Business
        "company",
        "job",
        "catch_phrase",
        # Internet
        "url",
        "domain_name",
        "ipv4",
        "ipv6",
        "user_name",
        "password",
        # Text
        "text",
        "sentence",
        "paragraph",
        "word",
        # Financial
        "credit_card_number",
        "iban",
        "currency_code",
        # Identifiers
        "uuid4",
        "ssn",
        "license_plate",
        # Date/Time
        "date_this_year",
        "date_this_decade",
        "time",
        # Misc
        "color_name",
        "file_name",
        "file_extension",
        "mime_type",
    }
)


def _generate_from_preset(preset: str, generator: LocaleGenerator) -> str:
    """
    Generate a value using a LocaleGenerator preset.

    Parameters
    ----------
    preset
        Name of the preset. It must be one of the names in `_PRESET_NAMES`.
    generator
        Object providing a zero-argument method named after the preset.

    Returns
    -------
    str
        The method's result converted with `str()`.

    Raises
    ------
    ValueError
        If `preset` is not a known preset name.
    """
    # The allow-list (rather than a bare `getattr`) keeps arbitrary attributes of the
    # generator from being reachable through a field's `preset` value.
    if preset not in _PRESET_NAMES:
        raise ValueError(f"Unknown preset: {preset}")

    # Resolve only the requested method: nothing is rebuilt per generated value, and the
    # `generator` parameter is no longer shadowed by the looked-up method.
    produce = getattr(generator, preset)
    return str(produce())
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _generate_from_pattern(pattern: str, rng: random.Random) -> str:
    """Produce a string for the regex `pattern` by delegating to `generate_from_regex`."""
    generated = generate_from_regex(pattern, rng)
    return generated
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _generate_boolean(field: Field, rng: random.Random, generator: Any | None = None) -> bool:
    """
    Generate a random boolean value.

    Parameters
    ----------
    field
        Field description. Its optional `p_true` attribute is the probability of returning
        `True`; a missing or `None` value means 0.5.
    rng
        Random number generator the value is drawn from.
    generator
        Not used by this function.

    Returns
    -------
    bool
        `True` when the drawn number in `[0, 1)` is below `p_true`.
    """
    p_true = getattr(field, "p_true", None)
    # An attribute that exists but holds `None` means "unset", matching how the other
    # generators treat their optional constraints; comparing a float with `None` would
    # raise `TypeError`.
    if p_true is None:
        p_true = 0.5
    return rng.random() < p_true
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _coerce_to_date(value: Any, default: date) -> Any:
    """Turn a date bound (`None`, ISO string, `datetime`, or `date`) into a `date`."""
    if value is None:
        return default
    if isinstance(value, str):
        return date.fromisoformat(value)
    if isinstance(value, datetime):
        # `datetime` subclasses `date`; drop the time-of-day so arithmetic stays in days.
        return value.date()
    return value


def _generate_date(field: Field, rng: random.Random, generator: Any | None = None) -> date:
    """
    Generate a random date value respecting field constraints.

    Parameters
    ----------
    field
        Field description. Optional `min_date` / `max_date` attributes may be `None`, an
        ISO-format date string, a `datetime` (its date part is used) or a `date`. The
        defaults are 2000-01-01 and 2030-12-31.
    rng
        Random number generator the value is drawn from.
    generator
        Not used by this function.

    Returns
    -------
    date
        A date between `min_date` and `max_date`, both inclusive. When `max_date` precedes
        `min_date`, `min_date` itself is returned.

    Raises
    ------
    ValueError
        If a string bound is rejected by `date.fromisoformat`.
    """
    min_date = _coerce_to_date(getattr(field, "min_date", None), date(2000, 1, 1))
    max_date = _coerce_to_date(getattr(field, "max_date", None), date(2030, 12, 31))

    days_between = (max_date - min_date).days
    # Clamp to zero so an inverted range degrades to `min_date` instead of raising.
    random_days = rng.randint(0, max(0, days_between))
    return min_date + timedelta(days=random_days)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _coerce_to_datetime(value: Any, default: datetime, day_time: time) -> Any:
    """Turn a bound (`None`, ISO string, plain `date`, or `datetime`) into a `datetime`.

    A plain `date` is combined with `day_time`, so the caller decides whether the bound
    sits at the start or the end of that day.
    """
    if value is None:
        return default
    if isinstance(value, str):
        return datetime.fromisoformat(value)
    # `datetime` is a subclass of `date`, hence the explicit exclusion.
    if isinstance(value, date) and not isinstance(value, datetime):
        return datetime.combine(value, day_time)
    return value


def _generate_datetime(field: Field, rng: random.Random, generator: Any | None = None) -> datetime:
    """
    Generate a random datetime value respecting field constraints.

    Parameters
    ----------
    field
        Field description. Optional `min_date` / `max_date` attributes may be `None`, an
        ISO-format string, a plain `date` or a `datetime`. A plain `date` used as the lower
        bound starts at midnight; used as the upper bound it extends to the last
        representable time of that day. Defaults are 2000-01-01 00:00:00 and
        2030-12-31 23:59:59.
    rng
        Random number generator the value is drawn from.
    generator
        Not used by this function.

    Returns
    -------
    datetime
        The lower bound plus a whole number of seconds, not exceeding the upper bound. When
        the upper bound precedes the lower bound, the lower bound itself is returned.

    Raises
    ------
    ValueError
        If a string bound is rejected by `datetime.fromisoformat`.
    """
    min_dt = _coerce_to_datetime(
        getattr(field, "min_date", None), datetime(2000, 1, 1, 0, 0, 0), datetime.min.time()
    )
    max_dt = _coerce_to_datetime(
        getattr(field, "max_date", None), datetime(2030, 12, 31, 23, 59, 59), datetime.max.time()
    )

    # Whole seconds only; any fractional remainder of the span is discarded.
    seconds_between = int((max_dt - min_dt).total_seconds())
    # Clamp to zero so an inverted range degrades to `min_dt` instead of raising.
    random_seconds = rng.randint(0, max(0, seconds_between))
    return min_dt + timedelta(seconds=random_seconds)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _parse_duration(value: Any, default: timedelta) -> Any:
    """Turn a duration bound (`None`, `"HH:MM:SS"` / `"MM:SS"` string, or other) into a value.

    Strings with any other number of `:`-separated parts fall back to `default`.
    Non-string, non-`None` values are returned unchanged.
    """
    if value is None:
        return default
    if not isinstance(value, str):
        return value

    parts = value.split(":")
    if len(parts) == 3:
        hours, minutes, seconds = map(float, parts)
        return timedelta(hours=hours, minutes=minutes, seconds=seconds)
    if len(parts) == 2:
        minutes, seconds = map(float, parts)
        return timedelta(minutes=minutes, seconds=seconds)
    return default


def _generate_duration(field: Field, rng: random.Random, generator: Any | None = None) -> timedelta:
    """
    Generate a random duration value, respecting field constraints.

    Parameters
    ----------
    field
        Field description. Optional `min_duration` / `max_duration` attributes may be
        `None`, a `"HH:MM:SS"` or `"MM:SS"` string, or an object offering `total_seconds()`
        such as a `timedelta`. Defaults are 0 seconds and 30 days; a string in any other
        shape also falls back to the respective default.
    rng
        Random number generator the value is drawn from.
    generator
        Not used by this function.

    Returns
    -------
    timedelta
        A whole number of seconds between the two bounds, both truncated to whole seconds.
        When the maximum is below the minimum, the minimum is returned.

    Raises
    ------
    ValueError
        If a part of a duration string cannot be converted to `float`.
    """
    min_d = _parse_duration(getattr(field, "min_duration", None), timedelta(seconds=0))
    max_d = _parse_duration(getattr(field, "max_duration", None), timedelta(days=30))

    # Generate random duration within range
    min_seconds = int(min_d.total_seconds())
    max_seconds = int(max_d.total_seconds())
    # `max(...)` keeps the range non-empty when the bounds are inverted.
    random_seconds = rng.randint(min_seconds, max(min_seconds, max_seconds))

    return timedelta(seconds=random_seconds)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def _generate_time(field: Field, rng: random.Random, generator: Any | None = None) -> str:
    """
    Produce a random time of day, formatted as an `"HH:MM:SS"` string.

    `min_time` / `max_time` are read from `field` when present. Each may be `None`
    (defaults: 00:00:00 and 23:59:59), a string parsed with `time.fromisoformat`, or an
    object exposing `hour`, `minute` and `second`. When the maximum lies before the
    minimum, the minimum is returned. `generator` is not used.
    """
    raw_min = getattr(field, "min_time", None)
    raw_max = getattr(field, "max_time", None)

    def _coerce(raw, fallback):
        # None -> default bound, str -> parsed ISO time, anything else is used as given.
        if raw is None:
            return fallback
        if isinstance(raw, str):
            return time.fromisoformat(raw)
        return raw

    def _seconds_since_midnight(moment):
        return moment.hour * 3600 + moment.minute * 60 + moment.second

    earliest = _coerce(raw_min, time(0, 0, 0))
    latest = _coerce(raw_max, time(23, 59, 59))

    floor_seconds = _seconds_since_midnight(earliest)
    ceiling_seconds = _seconds_since_midnight(latest)

    # Draw one offset in whole seconds; the upper limit never drops below the lower one.
    drawn = rng.randint(floor_seconds, max(floor_seconds, ceiling_seconds))

    hour, remainder = divmod(drawn, 3600)
    minute, second = divmod(remainder, 60)

    return f"{hour:02d}:{minute:02d}:{second:02d}"
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
# Mapping from dtype to generator function
|
|
306
|
+
DTYPE_GENERATORS: dict[str, Callable[[Field, random.Random, Any | None], Any]] = {
    # All integer widths share one generator.
    **dict.fromkeys(
        ("Int8", "Int16", "Int32", "Int64", "UInt8", "UInt16", "UInt32", "UInt64"),
        _generate_integer,
    ),
    # Both float widths share one generator as well.
    **dict.fromkeys(("Float32", "Float64"), _generate_float),
    "String": _generate_string,
    "Boolean": _generate_boolean,
    "Date": _generate_date,
    "Datetime": _generate_datetime,
    "Duration": _generate_duration,
    "Time": _generate_time,
}
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def _generate_value(field: Field, rng: random.Random, locale_gen: Any | None = None) -> Any:
    """
    Produce one value for `field`.

    Sources are tried in priority order: the field's custom `generator` callable, then a random
    pick from the field's `allowed` values, then the generator registered for the field's dtype
    in `DTYPE_GENERATORS`.

    Parameters
    ----------
    field
        The Field specification to generate a value for.
    rng
        Random number generator used for the `allowed` pick and passed to the dtype generator.
    locale_gen
        Optional locale generator, passed through to the dtype generator.

    Returns
    -------
    Any
        The generated value.

    Raises
    ------
    ValueError
        If neither a custom generator nor `allowed` applies and the dtype has no entry in
        `DTYPE_GENERATORS`.
    """
    # 1. A user-supplied generator wins over everything else; it is called with no arguments
    custom_generator = field.generator
    if custom_generator is not None:
        return custom_generator()

    # 2. Categorical field: pick one of the allowed values
    choices = getattr(field, "allowed", None)
    if choices is not None:
        return rng.choice(choices)

    # 3. Fall back to the generator registered for this dtype
    dtype = field.dtype
    dtype_generator = DTYPE_GENERATORS.get(dtype)
    if dtype_generator is None:
        raise ValueError(f"No generator available for dtype: {dtype}")

    return dtype_generator(field, rng, locale_gen)
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def _generate_unique_values(
    field: Field,
    n: int,
    rng: random.Random,
    locale_gen: Any | None = None,
    max_retries: int = 1000,
) -> list[Any]:
    """
    Generate n unique values for a field.

    Values are drawn one at a time with `_generate_value()` and duplicates are discarded.
    Generation gives up once more than `max_retries` duplicates are drawn in a row.

    Parameters
    ----------
    field
        The Field specification to generate values for.
    n
        Number of unique values required.
    rng
        Random number generator passed to `_generate_value()`.
    locale_gen
        Optional locale generator passed to `_generate_value()`.
    max_retries
        Maximum number of consecutive duplicate draws tolerated before giving up.

    Returns
    -------
    list
        The generated values, in the order they were drawn. Values that are unhashable (other
        than lists and dicts, which are compared by their string form) are accepted without a
        uniqueness check.

    Raises
    ------
    ValueError
        If the field has fewer `allowed` values than `n`, or if more than `max_retries`
        consecutive draws were duplicates.
    """
    # Fail fast when the pool of allowed values is too small to ever satisfy the request
    allowed = getattr(field, "allowed", None)
    if allowed is not None and len(allowed) < n:
        raise ValueError(
            f"Cannot generate {n} unique values from {len(allowed)} allowed values "
            f"for field with allowed={allowed}"
        )

    seen: set[Any] = set()
    values: list[Any] = []
    consecutive_retries = 0

    while len(values) < n:
        value = _generate_value(field, rng, locale_gen)

        # Lists and dicts are never hashable, so track them by their string form
        value_key = str(value) if isinstance(value, (list, dict)) else value

        # Only the hash lookup is guarded: a TypeError raised anywhere else (for example from
        # comparing against a non-integer `max_retries`) must surface instead of being mistaken
        # for an unhashable value and silently letting a duplicate through.
        try:
            is_duplicate = value_key in seen
        except TypeError:
            # Unhashable type, just append (can't check uniqueness easily)
            values.append(value)
            continue

        if not is_duplicate:
            seen.add(value_key)
            values.append(value)
            consecutive_retries = 0
            continue

        consecutive_retries += 1
        if consecutive_retries > max_retries:
            raise ValueError(
                f"Unable to generate {n} unique values after {max_retries} "
                f"consecutive retries. Generated {len(values)} unique values. "
                "Consider relaxing constraints or reducing n."
            )

    return values
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def generate_column(
    field: Field,
    config: GeneratorConfig,
) -> list[Any]:
    """
    Generate a list of values for a single column.

    Parameters
    ----------
    field
        The Field specification for the column.
    config
        Generation configuration. This function reads `n`, `seed`, `country` and
        `max_unique_retries` from it.

    Returns
    -------
    list
        List of `config.n` generated values. When the field is nullable and has a positive
        `null_probability`, some entries are replaced with `None`.
    """
    # Set up random number generator
    rng = random.Random(config.seed)

    # A locale generator is only needed when the field uses a preset
    locale_gen = None
    preset = getattr(field, "preset", None)
    if preset is not None:
        # Use config country
        locale_gen = _get_locale_generator(config.country, config.seed)

    # Generate values
    if field.unique:
        values = _generate_unique_values(
            field, config.n, rng, locale_gen, config.max_unique_retries
        )
    else:
        values = [_generate_value(field, rng, locale_gen) for _ in range(config.n)]

    # Apply null probability
    if field.nullable and field.null_probability > 0:
        # Null placement uses its own generator, seeded at `seed + 1`. The seed is compared
        # against None explicitly: 0 is a valid seed, and a truthiness test would fall back to
        # an unseeded generator and make the null positions non-reproducible.
        null_seed = config.seed + 1 if config.seed is not None else None
        null_rng = random.Random(null_seed)
        values = [None if null_rng.random() < field.null_probability else v for v in values]

    return values
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
# Preset names whose generated values must stay coherent with each other within a row.
# Location group: columns using these presets share one location context per row.
ADDRESS_RELATED_PRESETS = {
    "address", "city", "state", "postcode",
    "phone_number",
    "latitude", "longitude",
}
# Person group: columns using these presets share one person context per row.
PERSON_RELATED_PRESETS = {
    "name",
    "name_full",
    "first_name",
    "last_name",
    "email",
    "user_name",
}
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
def _get_coherence_needs(fields: dict[str, Field]) -> tuple[bool, bool]:
    """
    Report which kinds of cross-column coherence the given fields require.

    Parameters
    ----------
    fields
        Dictionary mapping column names to Field specifications.

    Returns
    -------
    tuple[bool, bool]
        `(needs_address, needs_person)`: whether any field's preset is in
        `ADDRESS_RELATED_PRESETS` and whether any is in `PERSON_RELATED_PRESETS`.
    """
    # Collect every preset in use; fields without a `preset` attribute contribute None
    presets_in_use = {getattr(spec, "preset", None) for spec in fields.values()}

    needs_address = not presets_in_use.isdisjoint(ADDRESS_RELATED_PRESETS)
    needs_person = not presets_in_use.isdisjoint(PERSON_RELATED_PRESETS)

    return needs_address, needs_person
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
def _generate_column_with_row_context(
    field: Field,
    config: GeneratorConfig,
    locale_gen: LocaleGenerator | None,
) -> list[Any]:
    """
    Generate column values with per-row context (location and/or person).

    This is used when columns need to share coherent data per row. Before each value is
    generated, the locale generator (when given) is pointed at the current row index with
    `set_row()`.

    Parameters
    ----------
    field
        The Field specification for the column.
    config
        Generation configuration. This function reads `n` and `seed` from it.
    locale_gen
        Shared locale generator holding the per-row context, or `None`.

    Returns
    -------
    list
        List of `config.n` generated values. When the field is nullable and has a positive
        `null_probability`, some entries are replaced with `None`.
    """
    rng = random.Random(config.seed)

    values = []
    for i in range(config.n):
        if locale_gen is not None:
            # Select row i's context before generating the value for that row
            locale_gen.set_row(i)
        values.append(_generate_value(field, rng, locale_gen))

    # Apply null probability
    if field.nullable and field.null_probability > 0:
        # Null placement uses its own generator, seeded at `seed + 1`. The seed is compared
        # against None explicitly: 0 is a valid seed, and a truthiness test would fall back to
        # an unseeded generator and make the null positions non-reproducible.
        null_seed = config.seed + 1 if config.seed is not None else None
        null_rng = random.Random(null_seed)
        values = [None if null_rng.random() < field.null_probability else v for v in values]

    return values
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
def generate_dataframe(
    fields: dict[str, Field],
    config: GeneratorConfig,
) -> Any:
    """
    Generate a DataFrame with the specified fields.

    Parameters
    ----------
    fields
        Dictionary mapping column names to Field specifications.
    config
        Generation configuration. This function reads `n`, `seed`, `country` and `output`
        from it.

    Returns
    -------
    DataFrame
        Generated data in the format specified by `config.output`: a plain dictionary of
        column lists for `"dict"`, a Polars DataFrame for `"polars"`, or a Pandas DataFrame for
        `"pandas"`.

    Raises
    ------
    ImportError
        If `config.output` is `"polars"` or `"pandas"` and that library is not installed.
    ValueError
        If `config.output` is not one of the supported formats.
    """
    # Check what coherence is needed
    needs_address, needs_person = _get_coherence_needs(fields)
    needs_coherence = needs_address or needs_person

    # Set up shared locale generator if any coherence is needed
    shared_locale_gen = None
    if needs_coherence:
        shared_locale_gen = _get_locale_generator(config.country, config.seed)
        if needs_address:
            shared_locale_gen.init_row_locations(config.n)
        if needs_person:
            shared_locale_gen.init_row_persons(config.n)

    # Determine which presets need row context
    coherent_presets = set()
    if needs_address:
        coherent_presets.update(ADDRESS_RELATED_PRESETS)
    if needs_person:
        coherent_presets.update(PERSON_RELATED_PRESETS)

    # Generate data for each column
    data: dict[str, list[Any]] = {}
    try:
        for col_name, field in fields.items():
            preset = getattr(field, "preset", None)

            # Use shared locale generator for coherent presets
            if needs_coherence and preset in coherent_presets:
                data[col_name] = _generate_column_with_row_context(
                    field, config, shared_locale_gen
                )
            else:
                data[col_name] = generate_column(field, config)
    finally:
        # Clean up in `finally` so the per-row context set up above is released even when a
        # column fails to generate (e.g., a uniqueness constraint that cannot be met).
        if shared_locale_gen is not None:
            if needs_address:
                shared_locale_gen.clear_row_locations()
            if needs_person:
                shared_locale_gen.clear_row_persons()

    # Convert to requested output format
    if config.output == "dict":
        return data

    if config.output == "polars":
        if not _is_lib_present("polars"):
            raise ImportError(
                "The Polars library is not installed but is required when specifying "
                '`output="polars"`.'
            )
        import polars as pl

        return pl.DataFrame(data)

    if config.output == "pandas":
        if not _is_lib_present("pandas"):
            raise ImportError(
                "The Pandas library is not installed but is required when specifying "
                '`output="pandas"`.'
            )
        import pandas as pd

        return pd.DataFrame(data)

    raise ValueError(f"Unknown output format: {config.output}")
|