pointblank 0.19.0__py3-none-any.whl → 0.20.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +44 -1
- pointblank/_utils_llms_txt.py +20 -0
- pointblank/data/api-docs.txt +793 -1
- pointblank/field.py +1507 -0
- pointblank/generate/__init__.py +17 -0
- pointblank/generate/base.py +49 -0
- pointblank/generate/generators.py +573 -0
- pointblank/generate/regex.py +217 -0
- pointblank/locales/__init__.py +1476 -0
- pointblank/locales/data/AR/address.json +73 -0
- pointblank/locales/data/AR/company.json +60 -0
- pointblank/locales/data/AR/internet.json +19 -0
- pointblank/locales/data/AR/misc.json +7 -0
- pointblank/locales/data/AR/person.json +39 -0
- pointblank/locales/data/AR/text.json +38 -0
- pointblank/locales/data/AT/address.json +84 -0
- pointblank/locales/data/AT/company.json +65 -0
- pointblank/locales/data/AT/internet.json +20 -0
- pointblank/locales/data/AT/misc.json +8 -0
- pointblank/locales/data/AT/person.json +17 -0
- pointblank/locales/data/AT/text.json +35 -0
- pointblank/locales/data/AU/address.json +83 -0
- pointblank/locales/data/AU/company.json +65 -0
- pointblank/locales/data/AU/internet.json +20 -0
- pointblank/locales/data/AU/misc.json +8 -0
- pointblank/locales/data/AU/person.json +17 -0
- pointblank/locales/data/AU/text.json +35 -0
- pointblank/locales/data/BE/address.json +225 -0
- pointblank/locales/data/BE/company.json +129 -0
- pointblank/locales/data/BE/internet.json +36 -0
- pointblank/locales/data/BE/misc.json +6 -0
- pointblank/locales/data/BE/person.json +62 -0
- pointblank/locales/data/BE/text.json +38 -0
- pointblank/locales/data/BG/address.json +75 -0
- pointblank/locales/data/BG/company.json +60 -0
- pointblank/locales/data/BG/internet.json +19 -0
- pointblank/locales/data/BG/misc.json +7 -0
- pointblank/locales/data/BG/person.json +40 -0
- pointblank/locales/data/BG/text.json +38 -0
- pointblank/locales/data/BR/address.json +98 -0
- pointblank/locales/data/BR/company.json +65 -0
- pointblank/locales/data/BR/internet.json +20 -0
- pointblank/locales/data/BR/misc.json +8 -0
- pointblank/locales/data/BR/person.json +17 -0
- pointblank/locales/data/BR/text.json +35 -0
- pointblank/locales/data/CA/address.json +747 -0
- pointblank/locales/data/CA/company.json +120 -0
- pointblank/locales/data/CA/internet.json +24 -0
- pointblank/locales/data/CA/misc.json +11 -0
- pointblank/locales/data/CA/person.json +1033 -0
- pointblank/locales/data/CA/text.json +58 -0
- pointblank/locales/data/CH/address.json +184 -0
- pointblank/locales/data/CH/company.json +112 -0
- pointblank/locales/data/CH/internet.json +20 -0
- pointblank/locales/data/CH/misc.json +10 -0
- pointblank/locales/data/CH/person.json +64 -0
- pointblank/locales/data/CH/text.json +45 -0
- pointblank/locales/data/CL/address.json +71 -0
- pointblank/locales/data/CL/company.json +60 -0
- pointblank/locales/data/CL/internet.json +19 -0
- pointblank/locales/data/CL/misc.json +7 -0
- pointblank/locales/data/CL/person.json +38 -0
- pointblank/locales/data/CL/text.json +38 -0
- pointblank/locales/data/CN/address.json +124 -0
- pointblank/locales/data/CN/company.json +76 -0
- pointblank/locales/data/CN/internet.json +20 -0
- pointblank/locales/data/CN/misc.json +8 -0
- pointblank/locales/data/CN/person.json +50 -0
- pointblank/locales/data/CN/text.json +38 -0
- pointblank/locales/data/CO/address.json +76 -0
- pointblank/locales/data/CO/company.json +60 -0
- pointblank/locales/data/CO/internet.json +19 -0
- pointblank/locales/data/CO/misc.json +7 -0
- pointblank/locales/data/CO/person.json +38 -0
- pointblank/locales/data/CO/text.json +38 -0
- pointblank/locales/data/CY/address.json +62 -0
- pointblank/locales/data/CY/company.json +60 -0
- pointblank/locales/data/CY/internet.json +19 -0
- pointblank/locales/data/CY/misc.json +7 -0
- pointblank/locales/data/CY/person.json +38 -0
- pointblank/locales/data/CY/text.json +38 -0
- pointblank/locales/data/CZ/address.json +70 -0
- pointblank/locales/data/CZ/company.json +61 -0
- pointblank/locales/data/CZ/internet.json +19 -0
- pointblank/locales/data/CZ/misc.json +7 -0
- pointblank/locales/data/CZ/person.json +40 -0
- pointblank/locales/data/CZ/text.json +38 -0
- pointblank/locales/data/DE/address.json +756 -0
- pointblank/locales/data/DE/company.json +101 -0
- pointblank/locales/data/DE/internet.json +22 -0
- pointblank/locales/data/DE/misc.json +11 -0
- pointblank/locales/data/DE/person.json +1026 -0
- pointblank/locales/data/DE/text.json +50 -0
- pointblank/locales/data/DK/address.json +231 -0
- pointblank/locales/data/DK/company.json +65 -0
- pointblank/locales/data/DK/internet.json +20 -0
- pointblank/locales/data/DK/misc.json +7 -0
- pointblank/locales/data/DK/person.json +45 -0
- pointblank/locales/data/DK/text.json +43 -0
- pointblank/locales/data/EE/address.json +69 -0
- pointblank/locales/data/EE/company.json +60 -0
- pointblank/locales/data/EE/internet.json +19 -0
- pointblank/locales/data/EE/misc.json +7 -0
- pointblank/locales/data/EE/person.json +39 -0
- pointblank/locales/data/EE/text.json +38 -0
- pointblank/locales/data/ES/address.json +3086 -0
- pointblank/locales/data/ES/company.json +644 -0
- pointblank/locales/data/ES/internet.json +25 -0
- pointblank/locales/data/ES/misc.json +11 -0
- pointblank/locales/data/ES/person.json +488 -0
- pointblank/locales/data/ES/text.json +49 -0
- pointblank/locales/data/FI/address.json +93 -0
- pointblank/locales/data/FI/company.json +65 -0
- pointblank/locales/data/FI/internet.json +20 -0
- pointblank/locales/data/FI/misc.json +8 -0
- pointblank/locales/data/FI/person.json +17 -0
- pointblank/locales/data/FI/text.json +35 -0
- pointblank/locales/data/FR/address.json +619 -0
- pointblank/locales/data/FR/company.json +111 -0
- pointblank/locales/data/FR/internet.json +22 -0
- pointblank/locales/data/FR/misc.json +11 -0
- pointblank/locales/data/FR/person.json +1066 -0
- pointblank/locales/data/FR/text.json +50 -0
- pointblank/locales/data/GB/address.json +5759 -0
- pointblank/locales/data/GB/company.json +131 -0
- pointblank/locales/data/GB/internet.json +24 -0
- pointblank/locales/data/GB/misc.json +45 -0
- pointblank/locales/data/GB/person.json +578 -0
- pointblank/locales/data/GB/text.json +61 -0
- pointblank/locales/data/GR/address.json +68 -0
- pointblank/locales/data/GR/company.json +61 -0
- pointblank/locales/data/GR/internet.json +19 -0
- pointblank/locales/data/GR/misc.json +7 -0
- pointblank/locales/data/GR/person.json +39 -0
- pointblank/locales/data/GR/text.json +38 -0
- pointblank/locales/data/HK/address.json +79 -0
- pointblank/locales/data/HK/company.json +69 -0
- pointblank/locales/data/HK/internet.json +19 -0
- pointblank/locales/data/HK/misc.json +7 -0
- pointblank/locales/data/HK/person.json +42 -0
- pointblank/locales/data/HK/text.json +38 -0
- pointblank/locales/data/HR/address.json +73 -0
- pointblank/locales/data/HR/company.json +60 -0
- pointblank/locales/data/HR/internet.json +19 -0
- pointblank/locales/data/HR/misc.json +7 -0
- pointblank/locales/data/HR/person.json +38 -0
- pointblank/locales/data/HR/text.json +38 -0
- pointblank/locales/data/HU/address.json +70 -0
- pointblank/locales/data/HU/company.json +61 -0
- pointblank/locales/data/HU/internet.json +19 -0
- pointblank/locales/data/HU/misc.json +7 -0
- pointblank/locales/data/HU/person.json +40 -0
- pointblank/locales/data/HU/text.json +38 -0
- pointblank/locales/data/ID/address.json +68 -0
- pointblank/locales/data/ID/company.json +61 -0
- pointblank/locales/data/ID/internet.json +19 -0
- pointblank/locales/data/ID/misc.json +7 -0
- pointblank/locales/data/ID/person.json +40 -0
- pointblank/locales/data/ID/text.json +38 -0
- pointblank/locales/data/IE/address.json +643 -0
- pointblank/locales/data/IE/company.json +140 -0
- pointblank/locales/data/IE/internet.json +24 -0
- pointblank/locales/data/IE/misc.json +44 -0
- pointblank/locales/data/IE/person.json +55 -0
- pointblank/locales/data/IE/text.json +60 -0
- pointblank/locales/data/IN/address.json +92 -0
- pointblank/locales/data/IN/company.json +65 -0
- pointblank/locales/data/IN/internet.json +20 -0
- pointblank/locales/data/IN/misc.json +8 -0
- pointblank/locales/data/IN/person.json +52 -0
- pointblank/locales/data/IN/text.json +39 -0
- pointblank/locales/data/IS/address.json +63 -0
- pointblank/locales/data/IS/company.json +61 -0
- pointblank/locales/data/IS/internet.json +19 -0
- pointblank/locales/data/IS/misc.json +7 -0
- pointblank/locales/data/IS/person.json +44 -0
- pointblank/locales/data/IS/text.json +38 -0
- pointblank/locales/data/IT/address.json +192 -0
- pointblank/locales/data/IT/company.json +137 -0
- pointblank/locales/data/IT/internet.json +20 -0
- pointblank/locales/data/IT/misc.json +10 -0
- pointblank/locales/data/IT/person.json +70 -0
- pointblank/locales/data/IT/text.json +44 -0
- pointblank/locales/data/JP/address.json +713 -0
- pointblank/locales/data/JP/company.json +113 -0
- pointblank/locales/data/JP/internet.json +22 -0
- pointblank/locales/data/JP/misc.json +10 -0
- pointblank/locales/data/JP/person.json +1057 -0
- pointblank/locales/data/JP/text.json +51 -0
- pointblank/locales/data/KR/address.json +77 -0
- pointblank/locales/data/KR/company.json +68 -0
- pointblank/locales/data/KR/internet.json +19 -0
- pointblank/locales/data/KR/misc.json +7 -0
- pointblank/locales/data/KR/person.json +40 -0
- pointblank/locales/data/KR/text.json +38 -0
- pointblank/locales/data/LT/address.json +66 -0
- pointblank/locales/data/LT/company.json +60 -0
- pointblank/locales/data/LT/internet.json +19 -0
- pointblank/locales/data/LT/misc.json +7 -0
- pointblank/locales/data/LT/person.json +42 -0
- pointblank/locales/data/LT/text.json +38 -0
- pointblank/locales/data/LU/address.json +66 -0
- pointblank/locales/data/LU/company.json +60 -0
- pointblank/locales/data/LU/internet.json +19 -0
- pointblank/locales/data/LU/misc.json +7 -0
- pointblank/locales/data/LU/person.json +38 -0
- pointblank/locales/data/LU/text.json +38 -0
- pointblank/locales/data/LV/address.json +62 -0
- pointblank/locales/data/LV/company.json +60 -0
- pointblank/locales/data/LV/internet.json +19 -0
- pointblank/locales/data/LV/misc.json +7 -0
- pointblank/locales/data/LV/person.json +40 -0
- pointblank/locales/data/LV/text.json +38 -0
- pointblank/locales/data/MT/address.json +61 -0
- pointblank/locales/data/MT/company.json +60 -0
- pointblank/locales/data/MT/internet.json +19 -0
- pointblank/locales/data/MT/misc.json +7 -0
- pointblank/locales/data/MT/person.json +38 -0
- pointblank/locales/data/MT/text.json +38 -0
- pointblank/locales/data/MX/address.json +100 -0
- pointblank/locales/data/MX/company.json +65 -0
- pointblank/locales/data/MX/internet.json +20 -0
- pointblank/locales/data/MX/misc.json +8 -0
- pointblank/locales/data/MX/person.json +18 -0
- pointblank/locales/data/MX/text.json +39 -0
- pointblank/locales/data/NL/address.json +1517 -0
- pointblank/locales/data/NL/company.json +133 -0
- pointblank/locales/data/NL/internet.json +44 -0
- pointblank/locales/data/NL/misc.json +55 -0
- pointblank/locales/data/NL/person.json +365 -0
- pointblank/locales/data/NL/text.json +210 -0
- pointblank/locales/data/NO/address.json +86 -0
- pointblank/locales/data/NO/company.json +66 -0
- pointblank/locales/data/NO/internet.json +20 -0
- pointblank/locales/data/NO/misc.json +8 -0
- pointblank/locales/data/NO/person.json +17 -0
- pointblank/locales/data/NO/text.json +35 -0
- pointblank/locales/data/NZ/address.json +90 -0
- pointblank/locales/data/NZ/company.json +65 -0
- pointblank/locales/data/NZ/internet.json +20 -0
- pointblank/locales/data/NZ/misc.json +8 -0
- pointblank/locales/data/NZ/person.json +17 -0
- pointblank/locales/data/NZ/text.json +39 -0
- pointblank/locales/data/PH/address.json +67 -0
- pointblank/locales/data/PH/company.json +61 -0
- pointblank/locales/data/PH/internet.json +19 -0
- pointblank/locales/data/PH/misc.json +7 -0
- pointblank/locales/data/PH/person.json +40 -0
- pointblank/locales/data/PH/text.json +38 -0
- pointblank/locales/data/PL/address.json +91 -0
- pointblank/locales/data/PL/company.json +65 -0
- pointblank/locales/data/PL/internet.json +20 -0
- pointblank/locales/data/PL/misc.json +8 -0
- pointblank/locales/data/PL/person.json +17 -0
- pointblank/locales/data/PL/text.json +35 -0
- pointblank/locales/data/PT/address.json +90 -0
- pointblank/locales/data/PT/company.json +65 -0
- pointblank/locales/data/PT/internet.json +20 -0
- pointblank/locales/data/PT/misc.json +8 -0
- pointblank/locales/data/PT/person.json +17 -0
- pointblank/locales/data/PT/text.json +35 -0
- pointblank/locales/data/RO/address.json +73 -0
- pointblank/locales/data/RO/company.json +61 -0
- pointblank/locales/data/RO/internet.json +19 -0
- pointblank/locales/data/RO/misc.json +7 -0
- pointblank/locales/data/RO/person.json +40 -0
- pointblank/locales/data/RO/text.json +38 -0
- pointblank/locales/data/RU/address.json +74 -0
- pointblank/locales/data/RU/company.json +60 -0
- pointblank/locales/data/RU/internet.json +19 -0
- pointblank/locales/data/RU/misc.json +7 -0
- pointblank/locales/data/RU/person.json +38 -0
- pointblank/locales/data/RU/text.json +38 -0
- pointblank/locales/data/SE/address.json +247 -0
- pointblank/locales/data/SE/company.json +65 -0
- pointblank/locales/data/SE/internet.json +20 -0
- pointblank/locales/data/SE/misc.json +7 -0
- pointblank/locales/data/SE/person.json +45 -0
- pointblank/locales/data/SE/text.json +43 -0
- pointblank/locales/data/SI/address.json +67 -0
- pointblank/locales/data/SI/company.json +60 -0
- pointblank/locales/data/SI/internet.json +19 -0
- pointblank/locales/data/SI/misc.json +7 -0
- pointblank/locales/data/SI/person.json +38 -0
- pointblank/locales/data/SI/text.json +38 -0
- pointblank/locales/data/SK/address.json +64 -0
- pointblank/locales/data/SK/company.json +60 -0
- pointblank/locales/data/SK/internet.json +19 -0
- pointblank/locales/data/SK/misc.json +7 -0
- pointblank/locales/data/SK/person.json +38 -0
- pointblank/locales/data/SK/text.json +38 -0
- pointblank/locales/data/TR/address.json +105 -0
- pointblank/locales/data/TR/company.json +65 -0
- pointblank/locales/data/TR/internet.json +20 -0
- pointblank/locales/data/TR/misc.json +8 -0
- pointblank/locales/data/TR/person.json +17 -0
- pointblank/locales/data/TR/text.json +35 -0
- pointblank/locales/data/TW/address.json +86 -0
- pointblank/locales/data/TW/company.json +69 -0
- pointblank/locales/data/TW/internet.json +19 -0
- pointblank/locales/data/TW/misc.json +7 -0
- pointblank/locales/data/TW/person.json +42 -0
- pointblank/locales/data/TW/text.json +38 -0
- pointblank/locales/data/US/address.json +996 -0
- pointblank/locales/data/US/company.json +131 -0
- pointblank/locales/data/US/internet.json +22 -0
- pointblank/locales/data/US/misc.json +11 -0
- pointblank/locales/data/US/person.json +1092 -0
- pointblank/locales/data/US/text.json +56 -0
- pointblank/locales/data/_shared/misc.json +42 -0
- pointblank/schema.py +339 -2
- {pointblank-0.19.0.dist-info → pointblank-0.20.0.dist-info}/METADATA +45 -1
- pointblank-0.20.0.dist-info/RECORD +366 -0
- {pointblank-0.19.0.dist-info → pointblank-0.20.0.dist-info}/WHEEL +1 -1
- pointblank-0.19.0.dist-info/RECORD +0 -59
- {pointblank-0.19.0.dist-info → pointblank-0.20.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.19.0.dist-info → pointblank-0.20.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.19.0.dist-info → pointblank-0.20.0.dist-info}/top_level.txt +0 -0
pointblank/field.py
ADDED
|
@@ -0,0 +1,1507 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from datetime import date, datetime, time, timedelta
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Callable
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
pass
|
|
9
|
+
|
|
10
|
+
# Public API of this module. Kept as a plain literal list so that static
# tooling (linters, documentation generators) can read it without executing code.
__all__ = [
    # Factory helpers: the primary way to build field specifications
    "int_field", "float_field", "string_field", "bool_field",
    "date_field", "datetime_field", "time_field", "duration_field",
    # Field classes: exposed for type hints and advanced usage
    "Field", "IntField", "FloatField", "StringField", "BoolField",
    "DateField", "DatetimeField", "TimeField", "DurationField",
]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Available presets for realistic data generation
|
|
34
|
+
# Immutable set of preset names, grouped below by category for readability.
AVAILABLE_PRESETS = frozenset(
    (
        # Personal
        "name", "name_full", "first_name", "last_name",
        "email", "phone_number",
        "address", "city", "state", "country", "postcode",
        "latitude", "longitude",
        # Business
        "company", "job", "catch_phrase",
        # Internet
        "url", "domain_name", "ipv4", "ipv6", "user_name", "password",
        # Text
        "text", "sentence", "paragraph", "word",
        # Financial
        "credit_card_number", "iban", "currency_code",
        # Identifiers
        "uuid4", "ssn", "license_plate",
        # Date/Time (for string representations)
        "date_this_year", "date_this_decade", "time",
        # Misc
        "color_name", "file_name", "file_extension", "mime_type",
    )
)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# =============================================================================
|
|
88
|
+
# Base Field Class
|
|
89
|
+
# =============================================================================
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@dataclass
class Field:
    """
    Shared base for all column specifications used in schema definitions.

    This class carries the options common to every field type (dtype,
    nullability, uniqueness, and an optional custom generator) and validates
    them on construction. To create fields, prefer the purpose-built classes or
    their helper functions:

    - `int_field()` / `IntField` for integer columns
    - `float_field()` / `FloatField` for floating-point columns
    - `string_field()` / `StringField` for string columns
    - `bool_field()` / `BoolField` for boolean columns
    - `date_field()` / `DateField` for date columns
    - `datetime_field()` / `DatetimeField` for datetime columns
    - `time_field()` / `TimeField` for time columns
    - `duration_field()` / `DurationField` for duration columns

    Raises
    ------
    ValueError
        If `null_probability` is outside `[0.0, 1.0]`, or if it is positive
        while `nullable=False`.
    """

    # Name of the column dtype (e.g., "Int64", "String", "Date")
    dtype: str

    # Nullability: a non-zero `null_probability` is only valid with `nullable=True`
    nullable: bool = False
    null_probability: float = 0.0

    # Uniqueness
    unique: bool = False

    # Custom generator; excluded from the dataclass repr
    generator: Callable[[], Any] | None = field(default=None, repr=False)

    def __post_init__(self):
        """Run constraint validation as soon as the instance is built."""
        self._validate()

    def _validate(self) -> None:
        """Check that the base constraints are valid and mutually consistent."""
        probability = self.null_probability

        # A chained comparison is used on purpose: it is also False for NaN,
        # so NaN is rejected along with out-of-range values.
        within_unit_interval = 0.0 <= probability <= 1.0
        if not within_unit_interval:
            raise ValueError(
                f"null_probability must be between 0.0 and 1.0, got {self.null_probability}"
            )

        if not self.nullable and probability > 0.0:
            raise ValueError("null_probability > 0 requires nullable=True")

    def is_numeric(self) -> bool:
        """Return `True` when the dtype is any integer or float dtype."""
        signed = {"Int8", "Int16", "Int32", "Int64"}
        unsigned = {"UInt8", "UInt16", "UInt32", "UInt64"}
        floating = {"Float32", "Float64"}
        return self.dtype in signed | unsigned | floating

    def is_integer(self) -> bool:
        """Return `True` when the dtype is a signed or unsigned integer dtype."""
        signed = {"Int8", "Int16", "Int32", "Int64"}
        unsigned = {"UInt8", "UInt16", "UInt32", "UInt64"}
        return self.dtype in signed | unsigned

    def is_float(self) -> bool:
        """Return `True` when the dtype is `"Float32"` or `"Float64"`."""
        floating = {"Float32", "Float64"}
        return self.dtype in floating

    def is_string(self) -> bool:
        """Return `True` when the dtype is `"String"`."""
        return "String" == self.dtype

    def is_boolean(self) -> bool:
        """Return `True` when the dtype is `"Boolean"`."""
        return "Boolean" == self.dtype

    def is_temporal(self) -> bool:
        """Return `True` when the dtype is a date, datetime, time, or duration dtype."""
        temporal = {"Date", "Datetime", "Time", "Duration"}
        return self.dtype in temporal

    def has_custom_generator(self) -> bool:
        """Return `True` when a custom generator callable has been supplied."""
        no_generator = self.generator is None
        return not no_generator
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# =============================================================================
|
|
187
|
+
# Integer Field
|
|
188
|
+
# =============================================================================
|
|
189
|
+
|
|
190
|
+
# Valid integer dtypes
|
|
191
|
+
INT_DTYPES = frozenset(
    (
        "Int8", "Int16", "Int32", "Int64",
        "UInt8", "UInt16", "UInt32", "UInt64",
    )
)


@dataclass
class IntField(Field):
    """
    Specification of an integer column in a schema definition.

    Parameters
    ----------
    min_val
        Inclusive lower bound. Default is `None` (no minimum).
    max_val
        Inclusive upper bound. Default is `None` (no maximum).
    allowed
        List of allowed values (categorical constraint). When provided, values are sampled
        from this list.
    nullable
        Whether the column can contain null values. Default is `False`.
    null_probability
        Probability of generating null when `nullable=True`. Default is `0.0`.
    unique
        Whether all values must be unique. Default is `False`.
    generator
        Custom callable that generates values. Overrides other settings.
    dtype
        Integer dtype. Default is `"Int64"`. Options: `"Int8"`, `"Int16"`, `"Int32"`,
        `"Int64"`, `"UInt8"`, `"UInt16"`, `"UInt32"`, `"UInt64"`.

    Raises
    ------
    ValueError
        If constraints are invalid (e.g., `min_val > max_val`, an unsupported `dtype`, or an
        empty `allowed` list).

    Examples
    --------
    Define a schema with integer fields and generate test data:

    ```python
    import pointblank as pb

    # Define a schema with integer field specifications
    schema = pb.Schema(
        user_id=pb.int_field(min_val=1, unique=True),
        age=pb.int_field(min_val=0, max_val=120),
        rating=pb.int_field(allowed=[1, 2, 3, 4, 5]),
    )

    # Generate 100 rows of test data
    pb.preview(pb.generate_dataset(schema, n=100, seed=23))
    ```

    The generated data will have unique user IDs starting from `1`, ages between `0`-`120`,
    and ratings sampled from the allowed values.
    """

    # Integer-specific constraints
    min_val: int | None = None
    max_val: int | None = None
    allowed: list[int] | None = field(default=None)

    # Re-declared so that the inherited `dtype` field gains a default
    dtype: str = "Int64"

    def _validate(self) -> None:
        """Run the base checks, then the integer-specific constraint checks."""
        super()._validate()

        # The dtype must be one of the supported integer dtypes
        if self.dtype not in INT_DTYPES:
            raise ValueError(
                f"Invalid dtype '{self.dtype}' for IntField. Valid options: {sorted(INT_DTYPES)}"
            )

        # The bounds are only compared when both are supplied
        both_bounds_given = self.min_val is not None and self.max_val is not None
        if both_bounds_given and self.min_val > self.max_val:
            raise ValueError(
                f"min_val ({self.min_val}) cannot be greater than max_val ({self.max_val})"
            )

        # An explicit `allowed` list must offer at least one value
        if self.allowed is not None and len(self.allowed) == 0:
            raise ValueError("allowed list cannot be empty")

    def has_allowed_values(self) -> bool:
        """Return `True` when an `allowed` list has been supplied."""
        unrestricted = self.allowed is None
        return not unrestricted
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def int_field(
    min_val: int | None = None,
    max_val: int | None = None,
    allowed: list[int] | None = None,
    nullable: bool = False,
    null_probability: float = 0.0,
    unique: bool = False,
    generator: Callable[[], Any] | None = None,
    dtype: str = "Int64",
) -> IntField:
    """
    Build an integer column specification.

    Parameters
    ----------
    min_val
        Inclusive lower bound. Default is `None` (no minimum).
    max_val
        Inclusive upper bound. Default is `None` (no maximum).
    allowed
        List of allowed values (categorical constraint). When provided, values are sampled
        from this list.
    nullable
        Whether the column can contain null values. Default is `False`.
    null_probability
        Probability of generating null when `nullable=True`. Default is `0.0`.
    unique
        Whether all values must be unique. Default is `False`.
    generator
        Custom callable that generates values. Overrides other settings.
    dtype
        Integer dtype. Default is `"Int64"`. Options: `"Int8"`, `"Int16"`, `"Int32"`,
        `"Int64"`, `"UInt8"`, `"UInt16"`, `"UInt32"`, `"UInt64"`.

    Returns
    -------
    IntField
        An integer field specification.

    Examples
    --------
    Define a schema with integer fields and generate test data:

    ```{python}
    import pointblank as pb

    # Define a schema with integer field specifications
    schema = pb.Schema(
        user_id=pb.int_field(min_val=1, unique=True),
        age=pb.int_field(min_val=0, max_val=120),
        rating=pb.int_field(allowed=[1, 2, 3, 4, 5]),
    )

    # Generate 100 rows of test data
    pb.preview(pb.generate_dataset(schema, n=100, seed=23))
    ```

    The generated data will have unique user IDs starting from `1`, ages between `0`-`120`,
    and ratings sampled from the allowed values.
    """
    # Every argument is forwarded by keyword, so the dataclass field order is irrelevant
    options = {
        "min_val": min_val,
        "max_val": max_val,
        "allowed": allowed,
        "nullable": nullable,
        "null_probability": null_probability,
        "unique": unique,
        "generator": generator,
        "dtype": dtype,
    }
    return IntField(**options)
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
# =============================================================================
|
|
355
|
+
# Float Field
|
|
356
|
+
# =============================================================================
|
|
357
|
+
|
|
358
|
+
FLOAT_DTYPES = frozenset(("Float32", "Float64"))


@dataclass
class FloatField(Field):
    """
    Specification of a floating-point column in a schema definition.

    Parameters
    ----------
    min_val
        Inclusive lower bound. Default is `None` (no minimum).
    max_val
        Inclusive upper bound. Default is `None` (no maximum).
    allowed
        List of allowed values (categorical constraint). When provided, values are sampled
        from this list.
    nullable
        Whether the column can contain null values. Default is `False`.
    null_probability
        Probability of generating null when `nullable=True`. Default is `0.0`.
    unique
        Whether all values must be unique. Default is `False`.
    generator
        Custom callable that generates values. Overrides other settings.
    dtype
        Float dtype. Default is `"Float64"`. Options: `"Float32"`, `"Float64"`.

    Raises
    ------
    ValueError
        If constraints are invalid (e.g., `min_val > max_val`, an unsupported `dtype`, or an
        empty `allowed` list).

    Examples
    --------
    Define a schema with float fields and generate test data:

    ```python
    import pointblank as pb

    # Define a schema with float field specifications
    schema = pb.Schema(
        price=pb.float_field(min_val=0.01, max_val=9999.99),
        probability=pb.float_field(min_val=0.0, max_val=1.0),
        temperature=pb.float_field(min_val=-40.0, max_val=50.0),
    )

    # Generate 100 rows of test data
    pb.preview(pb.generate_dataset(schema, n=100, seed=23))
    ```

    Values are uniformly distributed across the specified ranges.
    """

    # Float-specific constraints
    min_val: float | None = None
    max_val: float | None = None
    allowed: list[float] | None = field(default=None)

    # Re-declared so that the inherited `dtype` field gains a default
    dtype: str = "Float64"

    def _validate(self) -> None:
        """Run the base checks, then the float-specific constraint checks."""
        super()._validate()

        # The dtype must be one of the supported float dtypes
        if self.dtype not in FLOAT_DTYPES:
            message = (
                f"Invalid dtype '{self.dtype}' for FloatField. "
                f"Valid options: {sorted(FLOAT_DTYPES)}"
            )
            raise ValueError(message)

        # The bounds are only compared when both are supplied
        both_bounds_given = self.min_val is not None and self.max_val is not None
        if both_bounds_given and self.min_val > self.max_val:
            raise ValueError(
                f"min_val ({self.min_val}) cannot be greater than max_val ({self.max_val})"
            )

        # An explicit `allowed` list must offer at least one value
        if self.allowed is not None and len(self.allowed) == 0:
            raise ValueError("allowed list cannot be empty")

    def has_allowed_values(self) -> bool:
        """Return `True` when an `allowed` list has been supplied."""
        unrestricted = self.allowed is None
        return not unrestricted
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def float_field(
    min_val: float | None = None,
    max_val: float | None = None,
    allowed: list[float] | None = None,
    nullable: bool = False,
    null_probability: float = 0.0,
    unique: bool = False,
    generator: Callable[[], Any] | None = None,
    dtype: str = "Float64",
) -> FloatField:
    """
    Build a `FloatField` describing a floating-point column.

    Every argument is forwarded unchanged to the `FloatField` constructor.

    Parameters
    ----------
    min_val
        Inclusive lower bound. The default of `None` means no lower bound.
    max_val
        Inclusive upper bound. The default of `None` means no upper bound.
    allowed
        Explicit list of permitted values (a categorical constraint). When given, values are
        sampled from this list.
    nullable
        Whether null values may appear in the column. Default is `False`.
    null_probability
        Probability of producing a null when `nullable=True`. Default is `0.0`.
    unique
        Whether every value must be distinct. Default is `False`.
    generator
        Custom zero-argument callable producing values. Takes precedence over other settings.
    dtype
        Float dtype, either `"Float32"` or `"Float64"`. Default is `"Float64"`.

    Returns
    -------
    FloatField
        The float field specification.

    Examples
    --------
    Define a schema with float fields and generate test data:

    ```{python}
    import pointblank as pb

    # Define a schema with float field specifications
    schema = pb.Schema(
        price=pb.float_field(min_val=0.01, max_val=9999.99),
        probability=pb.float_field(min_val=0.0, max_val=1.0),
        temperature=pb.float_field(min_val=-40.0, max_val=50.0),
    )

    # Generate 100 rows of test data
    pb.preview(pb.generate_dataset(schema, n=100, seed=23))
    ```

    Values are uniformly distributed across the specified ranges.
    """
    spec = FloatField(
        dtype=dtype,
        min_val=min_val,
        max_val=max_val,
        allowed=allowed,
        unique=unique,
        nullable=nullable,
        null_probability=null_probability,
        generator=generator,
    )
    return spec
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
# =============================================================================
|
|
519
|
+
# String Field
|
|
520
|
+
# =============================================================================
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
@dataclass
class StringField(Field):
    """
    Specification of a string column within a schema.

    Parameters
    ----------
    min_length
        Smallest permitted string length. The default of `None` means no minimum.
    max_length
        Largest permitted string length. The default of `None` means no maximum.
    pattern
        Regular expression that generated strings follow.
    preset
        Name of a realistic-data preset (e.g., `"email"`, `"name"`, `"phone_number"`).
    allowed
        Explicit list of permitted values (a categorical constraint).
    nullable
        Whether null values may appear in the column. Default is `False`.
    null_probability
        Probability of producing a null when `nullable=True`. Default is `0.0`.
    unique
        Whether every value must be distinct. Default is `False`.
    generator
        Custom zero-argument callable producing values. Takes precedence over other settings.
    dtype
        Always `"String"` for StringField.

    Raises
    ------
    ValueError
        If the constraints are invalid or cannot be combined.

    Examples
    --------
    Define a schema with string fields and generate test data:

    ```python
    import pointblank as pb

    # Define a schema with string field specifications
    schema = pb.Schema(
        name=pb.string_field(preset="name"),
        email=pb.string_field(preset="email", unique=True),
        status=pb.string_field(allowed=["active", "pending", "inactive"]),
        code=pb.string_field(pattern=r"[A-Z]{3}-[0-9]{4}"),
    )

    # Generate 100 rows of test data
    pb.preview(pb.generate_dataset(schema, n=100, seed=23))
    ```

    The generated data will have coherent names and emails (derived from the name),
    statuses sampled from the allowed values, and codes matching the regex pattern.
    """

    # Constraints that only apply to strings
    min_length: int | None = None
    max_length: int | None = None
    pattern: str | None = None
    preset: str | None = None
    allowed: list[str] | None = field(default=None)

    # The dtype is pinned; `_validate()` rejects anything else
    dtype: str = "String"

    def _validate(self) -> None:
        """
        Check the string-specific settings after the base-class checks have run.

        Raises
        ------
        ValueError
            If `dtype` is not `"String"`, a length bound is negative, `min_length` exceeds
            `max_length`, `preset` is not in `AVAILABLE_PRESETS`, `allowed` is empty, or more
            than one of `preset`, `pattern`, and `allowed` is set.
        """
        super()._validate()

        if self.dtype != "String":
            raise ValueError(f"StringField dtype must be 'String', got '{self.dtype}'")

        # Length bounds: each must be non-negative, and the pair must be ordered.
        shortest, longest = self.min_length, self.max_length
        if shortest is not None and shortest < 0:
            raise ValueError(f"min_length must be non-negative, got {shortest}")
        if longest is not None and longest < 0:
            raise ValueError(f"max_length must be non-negative, got {longest}")
        if shortest is not None and longest is not None and shortest > longest:
            raise ValueError(
                f"min_length ({shortest}) cannot be greater than max_length ({longest})"
            )

        # A preset, when given, has to be a known one.
        if self.preset is not None and self.preset not in AVAILABLE_PRESETS:
            raise ValueError(
                f"Unknown preset '{self.preset}'. Available presets: {sorted(AVAILABLE_PRESETS)}"
            )

        # An explicit `allowed` list must contain at least one value.
        if self.allowed is not None and len(self.allowed) == 0:
            raise ValueError("allowed list cannot be empty")

        # `preset`, `pattern`, and `allowed` are mutually exclusive ways to produce values.
        specified = [
            option
            for option in ("preset", "pattern", "allowed")
            if getattr(self, option) is not None
        ]
        if len(specified) > 1:
            raise ValueError(
                "Only one of preset, pattern, or allowed can be specified. "
                f"Got: {', '.join(specified)}"
            )

    def has_preset(self) -> bool:
        """Return `True` when a `preset` was supplied."""
        return self.preset is not None

    def has_allowed_values(self) -> bool:
        """Return `True` when an `allowed` list was supplied."""
        return self.allowed is not None

    def has_pattern(self) -> bool:
        """Return `True` when a regex `pattern` was supplied."""
        return self.pattern is not None
|
|
648
|
+
|
|
649
|
+
|
|
650
|
+
def string_field(
    min_length: int | None = None,
    max_length: int | None = None,
    pattern: str | None = None,
    preset: str | None = None,
    allowed: list[str] | None = None,
    nullable: bool = False,
    null_probability: float = 0.0,
    unique: bool = False,
    generator: Callable[[], Any] | None = None,
) -> StringField:
    """
    Build a `StringField` describing a string column.

    Every argument is forwarded unchanged to the `StringField` constructor.

    Parameters
    ----------
    min_length
        Smallest permitted string length. The default of `None` means no minimum.
    max_length
        Largest permitted string length. The default of `None` means no maximum.
    pattern
        Regular expression that generated strings follow.
    preset
        Name of a realistic-data preset (e.g., `"email"`, `"name"`, `"phone_number"`).
    allowed
        Explicit list of permitted values (a categorical constraint).
    nullable
        Whether null values may appear in the column. Default is `False`.
    null_probability
        Probability of producing a null when `nullable=True`. Default is `0.0`.
    unique
        Whether every value must be distinct. Default is `False`.
    generator
        Custom zero-argument callable producing values. Takes precedence over other settings.

    Returns
    -------
    StringField
        The string field specification.

    Examples
    --------
    Define a schema with string fields and generate test data:

    ```{python}
    import pointblank as pb

    # Define a schema with string field specifications
    schema = pb.Schema(
        name=pb.string_field(preset="name"),
        email=pb.string_field(preset="email", unique=True),
        status=pb.string_field(allowed=["active", "pending", "inactive"]),
        code=pb.string_field(pattern=r"[A-Z]{3}-[0-9]{4}"),
    )

    # Generate 100 rows of test data
    pb.preview(pb.generate_dataset(schema, n=100, seed=23))
    ```

    The generated data will have coherent names and emails (derived from the name),
    statuses sampled from the allowed values, and codes matching the regex pattern.
    """
    spec = StringField(
        preset=preset,
        pattern=pattern,
        allowed=allowed,
        min_length=min_length,
        max_length=max_length,
        unique=unique,
        nullable=nullable,
        null_probability=null_probability,
        generator=generator,
    )
    return spec
|
|
723
|
+
|
|
724
|
+
|
|
725
|
+
# =============================================================================
|
|
726
|
+
# Boolean Field
|
|
727
|
+
# =============================================================================
|
|
728
|
+
|
|
729
|
+
|
|
730
|
+
@dataclass
class BoolField(Field):
    """
    Specification of a boolean column within a schema.

    Parameters
    ----------
    p_true
        Probability of producing `True`. Default is `0.5` (equal probability).
        Must lie between 0.0 and 1.0.
    nullable
        Whether null values may appear in the column. Default is `False`.
    null_probability
        Probability of producing a null when `nullable=True`. Default is `0.0`.
    unique
        Whether every value must be distinct. Default is `False`.
        Note: Boolean can only have 2 unique non-null values.
    generator
        Custom zero-argument callable producing values. Takes precedence over other settings.
    dtype
        Always `"Boolean"` for BoolField.

    Examples
    --------
    Define a schema with boolean fields and generate test data:

    ```python
    import pointblank as pb

    # Define a schema with boolean field specifications
    schema = pb.Schema(
        is_active=pb.bool_field(p_true=0.8),    # 80% True
        is_premium=pb.bool_field(p_true=0.2),   # 20% True
        is_verified=pb.bool_field(),            # 50% True (default)
    )

    # Generate 100 rows of test data
    pb.preview(pb.generate_dataset(schema, n=100, seed=23))
    ```

    The `p_true` parameter controls the probability of generating `True` values,
    which is helpful for simulating real-world distributions.
    """

    # The only boolean-specific setting
    p_true: float = 0.5

    # The dtype is pinned; `_validate()` rejects anything else
    dtype: str = "Boolean"

    def _validate(self) -> None:
        """
        Check the boolean-specific settings after the base-class checks have run.

        Raises
        ------
        ValueError
            If `dtype` is not `"Boolean"` or `p_true` is not within `[0.0, 1.0]`.
        """
        super()._validate()

        if self.dtype != "Boolean":
            raise ValueError(f"BoolField dtype must be 'Boolean', got '{self.dtype}'")

        # Written as a negated chained comparison so that NaN is rejected as well.
        probability = self.p_true
        in_unit_interval = 0.0 <= probability <= 1.0
        if not in_unit_interval:
            raise ValueError(f"p_true must be between 0.0 and 1.0, got {probability}")
|
|
791
|
+
|
|
792
|
+
|
|
793
|
+
def bool_field(
    p_true: float = 0.5,
    nullable: bool = False,
    null_probability: float = 0.0,
    unique: bool = False,
    generator: Callable[[], Any] | None = None,
) -> BoolField:
    """
    Build a `BoolField` describing a boolean column.

    Every argument is forwarded unchanged to the `BoolField` constructor.

    Parameters
    ----------
    p_true
        Probability of producing `True`. Default is `0.5` (equal probability).
        Must lie between 0.0 and 1.0.
    nullable
        Whether null values may appear in the column. Default is `False`.
    null_probability
        Probability of producing a null when `nullable=True`. Default is `0.0`.
    unique
        Whether every value must be distinct. Default is `False`.
        Note: Boolean can only have 2 unique non-null values.
    generator
        Custom zero-argument callable producing values. Takes precedence over other settings.

    Returns
    -------
    BoolField
        The boolean field specification.

    Examples
    --------
    Define a schema with boolean fields and generate test data:

    ```{python}
    import pointblank as pb

    # Define a schema with boolean field specifications
    schema = pb.Schema(
        is_active=pb.bool_field(p_true=0.8),    # 80% True
        is_premium=pb.bool_field(p_true=0.2),   # 20% True
        is_verified=pb.bool_field(),            # 50% True (default)
    )

    # Generate 100 rows of test data
    pb.preview(pb.generate_dataset(schema, n=100, seed=23))
    ```

    The `p_true=` parameter controls the probability of generating `True` values,
    which is helpful for simulating real-world distributions.
    """
    spec = BoolField(
        p_true=p_true,
        unique=unique,
        nullable=nullable,
        null_probability=null_probability,
        generator=generator,
    )
    return spec
|
|
851
|
+
|
|
852
|
+
|
|
853
|
+
# =============================================================================
|
|
854
|
+
# Date Field
|
|
855
|
+
# =============================================================================
|
|
856
|
+
|
|
857
|
+
|
|
858
|
+
@dataclass
class DateField(Field):
    """
    Date column specification for schema definition.

    Parameters
    ----------
    min_date
        Minimum date (inclusive). Can be ISO string or `date` object.
    max_date
        Maximum date (inclusive). Can be ISO string or `date` object.
    nullable
        Whether the column can contain null values. Default is `False`.
    null_probability
        Probability of generating null when `nullable=True`. Default is `0.0`.
    unique
        Whether all values must be unique. Default is `False`.
    generator
        Custom callable that generates values. Overrides other settings.
    dtype
        Always `"Date"` for DateField.

    Raises
    ------
    ValueError
        If `dtype` is not `"Date"`, if a bound cannot be parsed while the range is being
        checked, or if `min_date` is later than `max_date`.

    Examples
    --------
    Define a schema with date fields and generate test data:

    ```python
    import pointblank as pb
    from datetime import date

    # Define a schema with date field specifications
    schema = pb.Schema(
        birth_date=pb.date_field(
            min_date=date(1960, 1, 1),
            max_date=date(2005, 12, 31)
        ),
        hire_date=pb.date_field(
            min_date=date(2020, 1, 1),
            max_date=date(2024, 12, 31)
        ),
    )

    # Generate 100 rows of test data
    pb.preview(pb.generate_dataset(schema, n=100, seed=23))
    ```

    Date values are uniformly distributed within the specified range.
    """

    # Date-specific constraints
    min_date: str | date | None = None
    max_date: str | date | None = None

    # Override dtype with fixed value
    dtype: str = "Date"

    def _validate(self) -> None:
        """
        Validate date field constraints after the base-class checks have run.

        The bounds are parsed and compared only when both `min_date` and `max_date` are set; a
        lone bound is not parsed here.

        Raises
        ------
        ValueError
            If `dtype` is not `"Date"`, if either bound cannot be parsed, or if `min_date` is
            later than `max_date`.
        """
        super()._validate()

        # Validate dtype (must be Date)
        if self.dtype != "Date":
            raise ValueError(f"DateField dtype must be 'Date', got '{self.dtype}'")

        # Validate date range
        if self.min_date is not None and self.max_date is not None:
            min_dt = self._parse_date(self.min_date)
            max_dt = self._parse_date(self.max_date)
            if min_dt > max_dt:
                raise ValueError(
                    f"min_date ({self.min_date}) cannot be greater than max_date ({self.max_date})"
                )

    @staticmethod
    def _parse_date(value: str | date | datetime) -> datetime:
        """
        Parse a date value to `datetime` so that mixed inputs can be compared.

        Parameters
        ----------
        value
            A `datetime` (returned unchanged), a `date` (combined with midnight), or a string
            accepted by `datetime.fromisoformat()`.

        Returns
        -------
        datetime
            The value as a `datetime`.

        Raises
        ------
        ValueError
            If a string cannot be parsed (the parser's own error is attached as the cause), or
            if `value` is of any other type.
        """
        # `datetime` is a subclass of `date`, so it has to be tested first.
        if isinstance(value, datetime):
            return value
        if isinstance(value, date):
            return datetime.combine(value, datetime.min.time())
        if isinstance(value, str):
            try:
                return datetime.fromisoformat(value)
            except ValueError as exc:
                # Chain explicitly so the traceback shows the parse failure as the cause.
                raise ValueError(
                    f"Unable to parse date string '{value}'. Use ISO format (YYYY-MM-DD)."
                ) from exc
        raise ValueError(f"Invalid date type: {type(value)}")
|
|
946
|
+
|
|
947
|
+
|
|
948
|
+
def date_field(
    min_date: str | date | None = None,
    max_date: str | date | None = None,
    nullable: bool = False,
    null_probability: float = 0.0,
    unique: bool = False,
    generator: Callable[[], Any] | None = None,
) -> DateField:
    """
    Build a `DateField` describing a date column.

    Every argument is forwarded unchanged to the `DateField` constructor.

    Parameters
    ----------
    min_date
        Earliest date (inclusive), given as an ISO string or a `date` object.
    max_date
        Latest date (inclusive), given as an ISO string or a `date` object.
    nullable
        Whether null values may appear in the column. Default is `False`.
    null_probability
        Probability of producing a null when `nullable=True`. Default is `0.0`.
    unique
        Whether every value must be distinct. Default is `False`.
    generator
        Custom zero-argument callable producing values. Takes precedence over other settings.

    Returns
    -------
    DateField
        The date field specification.

    Examples
    --------
    Define a schema with date fields and generate test data:

    ```{python}
    import pointblank as pb
    from datetime import date

    # Define a schema with date field specifications
    schema = pb.Schema(
        birth_date=pb.date_field(
            min_date=date(1960, 1, 1),
            max_date=date(2005, 12, 31)
        ),
        hire_date=pb.date_field(
            min_date=date(2020, 1, 1),
            max_date=date(2024, 12, 31)
        ),
    )

    # Generate 100 rows of test data
    pb.preview(pb.generate_dataset(schema, n=100, seed=23))
    ```

    Date values are uniformly distributed within the specified range.
    """
    spec = DateField(
        min_date=min_date,
        max_date=max_date,
        unique=unique,
        nullable=nullable,
        null_probability=null_probability,
        generator=generator,
    )
    return spec
|
|
1013
|
+
|
|
1014
|
+
|
|
1015
|
+
# =============================================================================
|
|
1016
|
+
# Datetime Field
|
|
1017
|
+
# =============================================================================
|
|
1018
|
+
|
|
1019
|
+
|
|
1020
|
+
@dataclass
class DatetimeField(Field):
    """
    Datetime column specification for schema definition.

    Parameters
    ----------
    min_date
        Minimum datetime (inclusive). Can be ISO string or `datetime` object.
    max_date
        Maximum datetime (inclusive). Can be ISO string or `datetime` object.
    nullable
        Whether the column can contain null values. Default is `False`.
    null_probability
        Probability of generating null when `nullable=True`. Default is `0.0`.
    unique
        Whether all values must be unique. Default is `False`.
    generator
        Custom callable that generates values. Overrides other settings.
    dtype
        Always `"Datetime"` for DatetimeField.

    Raises
    ------
    ValueError
        If `dtype` is not `"Datetime"`, if a bound cannot be parsed while the range is being
        checked, or if `min_date` is later than `max_date`.

    Examples
    --------
    Define a schema with datetime fields and generate test data:

    ```python
    import pointblank as pb
    from datetime import datetime

    # Define a schema with datetime field specifications
    schema = pb.Schema(
        created_at=pb.datetime_field(
            min_date=datetime(2024, 1, 1),
            max_date=datetime(2024, 12, 31)
        ),
        updated_at=pb.datetime_field(
            min_date=datetime(2024, 6, 1),
            max_date=datetime(2024, 12, 31)
        ),
    )

    # Generate 100 rows of test data
    pb.preview(pb.generate_dataset(schema, n=100, seed=23))
    ```

    Datetime values are uniformly distributed within the specified range.
    """

    # Datetime-specific constraints
    min_date: str | datetime | None = None
    max_date: str | datetime | None = None

    # Override dtype with fixed value
    dtype: str = "Datetime"

    def _validate(self) -> None:
        """
        Validate datetime field constraints after the base-class checks have run.

        The bounds are parsed and compared only when both `min_date` and `max_date` are set; a
        lone bound is not parsed here.

        Raises
        ------
        ValueError
            If `dtype` is not `"Datetime"`, if either bound cannot be parsed, or if `min_date`
            is later than `max_date`.
        """
        super()._validate()

        # Validate dtype (must be Datetime)
        if self.dtype != "Datetime":
            raise ValueError(f"DatetimeField dtype must be 'Datetime', got '{self.dtype}'")

        # Validate date range
        if self.min_date is not None and self.max_date is not None:
            min_dt = self._parse_datetime(self.min_date)
            max_dt = self._parse_datetime(self.max_date)
            if min_dt > max_dt:
                raise ValueError(
                    f"min_date ({self.min_date}) cannot be greater than max_date ({self.max_date})"
                )

    @staticmethod
    def _parse_datetime(value: str | date | datetime) -> datetime:
        """
        Parse a datetime value for comparison.

        Parameters
        ----------
        value
            A `datetime` (returned unchanged), a plain `date` (combined with midnight), or a
            string accepted by `datetime.fromisoformat()`.

        Returns
        -------
        datetime
            The value as a `datetime`.

        Raises
        ------
        ValueError
            If a string cannot be parsed (the parser's own error is attached as the cause), or
            if `value` is of any other type.
        """
        # `datetime` is a subclass of `date`, so it has to be tested first.
        if isinstance(value, datetime):
            return value
        if isinstance(value, date):
            return datetime.combine(value, datetime.min.time())
        if isinstance(value, str):
            try:
                return datetime.fromisoformat(value)
            except ValueError as exc:
                # Chain explicitly so the traceback shows the parse failure as the cause.
                raise ValueError(
                    f"Unable to parse datetime string '{value}'. "
                    "Use ISO format (YYYY-MM-DDTHH:MM:SS)."
                ) from exc
        raise ValueError(f"Invalid datetime type: {type(value)}")
|
|
1109
|
+
|
|
1110
|
+
|
|
1111
|
+
def datetime_field(
    min_date: str | datetime | None = None,
    max_date: str | datetime | None = None,
    nullable: bool = False,
    null_probability: float = 0.0,
    unique: bool = False,
    generator: Callable[[], Any] | None = None,
) -> DatetimeField:
    """
    Build a `DatetimeField` describing a datetime column.

    Every argument is forwarded unchanged to the `DatetimeField` constructor.

    Parameters
    ----------
    min_date
        Earliest datetime (inclusive), given as an ISO string or a `datetime` object.
    max_date
        Latest datetime (inclusive), given as an ISO string or a `datetime` object.
    nullable
        Whether null values may appear in the column. Default is `False`.
    null_probability
        Probability of producing a null when `nullable=True`. Default is `0.0`.
    unique
        Whether every value must be distinct. Default is `False`.
    generator
        Custom zero-argument callable producing values. Takes precedence over other settings.

    Returns
    -------
    DatetimeField
        The datetime field specification.

    Examples
    --------
    Define a schema with datetime fields and generate test data:

    ```{python}
    import pointblank as pb
    from datetime import datetime

    # Define a schema with datetime field specifications
    schema = pb.Schema(
        created_at=pb.datetime_field(
            min_date=datetime(2024, 1, 1),
            max_date=datetime(2024, 12, 31)
        ),
        updated_at=pb.datetime_field(
            min_date=datetime(2024, 6, 1),
            max_date=datetime(2024, 12, 31)
        ),
    )

    # Generate 100 rows of test data
    pb.preview(pb.generate_dataset(schema, n=100, seed=23))
    ```

    Datetime values are uniformly distributed within the specified range.
    """
    spec = DatetimeField(
        min_date=min_date,
        max_date=max_date,
        unique=unique,
        nullable=nullable,
        null_probability=null_probability,
        generator=generator,
    )
    return spec
|
|
1176
|
+
|
|
1177
|
+
|
|
1178
|
+
# =============================================================================
|
|
1179
|
+
# Time Field
|
|
1180
|
+
# =============================================================================
|
|
1181
|
+
|
|
1182
|
+
|
|
1183
|
+
@dataclass
class TimeField(Field):
    """
    Time column specification for schema definition.

    Parameters
    ----------
    min_time
        Minimum time (inclusive). Can be ISO string or `time` object.
    max_time
        Maximum time (inclusive). Can be ISO string or `time` object.
    nullable
        Whether the column can contain null values. Default is `False`.
    null_probability
        Probability of generating null when `nullable=True`. Default is `0.0`.
    unique
        Whether all values must be unique. Default is `False`.
    generator
        Custom callable that generates values. Overrides other settings.
    dtype
        Always `"Time"` for TimeField.

    Raises
    ------
    ValueError
        If `dtype` is not `"Time"`, if a bound cannot be parsed while the range is being
        checked, or if `min_time` is later than `max_time`.

    Examples
    --------
    Define a schema with time fields and generate test data:

    ```python
    import pointblank as pb
    from datetime import time

    # Define a schema with time field specifications
    schema = pb.Schema(
        start_time=pb.time_field(
            min_time=time(9, 0, 0),
            max_time=time(12, 0, 0)
        ),
        end_time=pb.time_field(
            min_time=time(13, 0, 0),
            max_time=time(17, 0, 0)
        ),
    )

    # Generate 100 rows of test data
    pb.preview(pb.generate_dataset(schema, n=100, seed=23))
    ```

    Time values are uniformly distributed within the specified range.
    """

    # Time-specific constraints
    min_time: str | time | None = None
    max_time: str | time | None = None

    # Override dtype with fixed value
    dtype: str = "Time"

    def _validate(self) -> None:
        """
        Validate time field constraints after the base-class checks have run.

        The bounds are parsed and compared only when both `min_time` and `max_time` are set; a
        lone bound is not parsed here.

        Raises
        ------
        ValueError
            If `dtype` is not `"Time"`, if either bound cannot be parsed, or if `min_time` is
            later than `max_time`.
        """
        super()._validate()

        # Validate dtype (must be Time)
        if self.dtype != "Time":
            raise ValueError(f"TimeField dtype must be 'Time', got '{self.dtype}'")

        # Validate time range
        if self.min_time is not None and self.max_time is not None:
            min_t = self._parse_time(self.min_time)
            max_t = self._parse_time(self.max_time)
            if min_t > max_t:
                raise ValueError(
                    f"min_time ({self.min_time}) cannot be greater than max_time ({self.max_time})"
                )

    @staticmethod
    def _parse_time(value: str | time) -> time:
        """
        Parse a time value for comparison.

        Parameters
        ----------
        value
            A `time` (returned unchanged) or a string accepted by `time.fromisoformat()`.

        Returns
        -------
        time
            The value as a `time`.

        Raises
        ------
        ValueError
            If a string cannot be parsed (the parser's own error is attached as the cause), or
            if `value` is of any other type.
        """
        if isinstance(value, time):
            return value
        if isinstance(value, str):
            try:
                return time.fromisoformat(value)
            except ValueError as exc:
                # Chain explicitly so the traceback shows the parse failure as the cause.
                raise ValueError(
                    f"Unable to parse time string '{value}'. Use ISO format (HH:MM:SS)."
                ) from exc
        raise ValueError(f"Invalid time type: {type(value)}")
|
|
1269
|
+
|
|
1270
|
+
|
|
1271
|
+
def time_field(
    min_time: str | time | None = None,
    max_time: str | time | None = None,
    nullable: bool = False,
    null_probability: float = 0.0,
    unique: bool = False,
    generator: Callable[[], Any] | None = None,
) -> TimeField:
    """
    Build a `TimeField` describing a time-of-day column.

    Every argument is forwarded unchanged to the `TimeField` constructor.

    Parameters
    ----------
    min_time
        Earliest time (inclusive), given as an ISO string or a `time` object.
    max_time
        Latest time (inclusive), given as an ISO string or a `time` object.
    nullable
        Whether null values may appear in the column. Default is `False`.
    null_probability
        Probability of producing a null when `nullable=True`. Default is `0.0`.
    unique
        Whether every value must be distinct. Default is `False`.
    generator
        Custom zero-argument callable producing values. Takes precedence over other settings.

    Returns
    -------
    TimeField
        The time field specification.

    Examples
    --------
    Define a schema with time fields and generate test data:

    ```{python}
    import pointblank as pb
    from datetime import time

    # Define a schema with time field specifications
    schema = pb.Schema(
        start_time=pb.time_field(
            min_time=time(9, 0, 0),
            max_time=time(12, 0, 0)
        ),
        end_time=pb.time_field(
            min_time=time(13, 0, 0),
            max_time=time(17, 0, 0)
        ),
    )

    # Generate 100 rows of test data
    pb.preview(pb.generate_dataset(schema, n=100, seed=23))
    ```

    Time values are uniformly distributed within the specified range.
    """
    spec = TimeField(
        min_time=min_time,
        max_time=max_time,
        unique=unique,
        nullable=nullable,
        null_probability=null_probability,
        generator=generator,
    )
    return spec
|
|
1336
|
+
|
|
1337
|
+
|
|
1338
|
+
# =============================================================================
|
|
1339
|
+
# Duration Field
|
|
1340
|
+
# =============================================================================
|
|
1341
|
+
|
|
1342
|
+
|
|
1343
|
+
@dataclass
class DurationField(Field):
    """
    Duration column specification for schema definition.

    Parameters
    ----------
    min_duration
        Minimum duration (inclusive). Can be a `timedelta` object or a colon-separated string
        in `"HH:MM:SS"` or `"MM:SS"` form (e.g., `"1:30:00"`).
    max_duration
        Maximum duration (inclusive). Can be a `timedelta` object or a colon-separated string
        in `"HH:MM:SS"` or `"MM:SS"` form (e.g., `"1:30:00"`).
    nullable
        Whether the column can contain null values. Default is `False`.
    null_probability
        Probability of generating null when `nullable=True`. Default is `0.0`.
    unique
        Whether all values must be unique. Default is `False`.
    generator
        Custom callable that generates values. Overrides other settings.
    dtype
        Always `"Duration"` for DurationField.

    Examples
    --------
    Define a schema with duration fields and generate test data:

    ```python
    import pointblank as pb
    from datetime import timedelta

    # Define a schema with duration field specifications
    schema = pb.Schema(
        session_length=pb.duration_field(
            min_duration=timedelta(minutes=5),
            max_duration=timedelta(hours=2)
        ),
        wait_time=pb.duration_field(
            min_duration=timedelta(seconds=30),
            max_duration=timedelta(minutes=15)
        ),
    )

    # Generate 100 rows of test data
    pb.preview(pb.generate_dataset(schema, n=100, seed=23))
    ```

    Duration values are uniformly distributed within the specified range.
    """

    # Duration-specific constraints
    min_duration: str | timedelta | None = None
    max_duration: str | timedelta | None = None

    # Override dtype with fixed value
    dtype: str = "Duration"

    def _validate(self) -> None:
        """
        Validate duration field constraints.

        Raises
        ------
        ValueError
            If `dtype` is not `"Duration"`, if both bounds are given and one of them cannot be
            parsed, or if `min_duration` is greater than `max_duration`.
        """
        super()._validate()

        # Validate dtype (must be Duration)
        if self.dtype != "Duration":
            raise ValueError(f"DurationField dtype must be 'Duration', got '{self.dtype}'")

        # Validate duration range (the bounds are only parsed when both are supplied)
        if self.min_duration is not None and self.max_duration is not None:
            min_d = self._parse_duration(self.min_duration)
            max_d = self._parse_duration(self.max_duration)
            if min_d > max_d:
                raise ValueError(
                    f"min_duration ({self.min_duration}) cannot be greater than "
                    f"max_duration ({self.max_duration})"
                )

    @staticmethod
    def _parse_duration(value: str | timedelta) -> timedelta:
        """
        Parse a duration value for comparison.

        Parameters
        ----------
        value
            A `timedelta` object (returned unchanged) or a colon-separated string with three
            parts (hours, minutes, seconds) or two parts (minutes, seconds). Each part is read
            as a float, so fractional values such as `"0:01:30.5"` are accepted.

        Returns
        -------
        timedelta
            The parsed duration.

        Raises
        ------
        ValueError
            If `value` is a string that cannot be parsed, or if it is neither a string nor a
            `timedelta` object.
        """
        if isinstance(value, timedelta):
            return value
        if isinstance(value, str):
            # Accepted string forms are "H:M:S" and "M:S"; anything else (including ISO 8601
            # durations such as "PT1H30M") falls through to the error below.
            try:
                parts = value.split(":")
                if len(parts) == 3:
                    hours, minutes, seconds = map(float, parts)
                    return timedelta(hours=hours, minutes=minutes, seconds=seconds)
                elif len(parts) == 2:
                    minutes, seconds = map(float, parts)
                    return timedelta(minutes=minutes, seconds=seconds)
            except (ValueError, OverflowError):
                # `timedelta` raises OverflowError (not ValueError) for infinite or
                # out-of-range components such as "inf:0:0"; report those through the same
                # ValueError as any other unparseable string.
                pass
            raise ValueError(
                f"Unable to parse duration string '{value}'. "
                "Use format 'HH:MM:SS' or timedelta object."
            )
        raise ValueError(f"Invalid duration type: {type(value)}")
|
|
1441
|
+
|
|
1442
|
+
|
|
1443
|
+
def duration_field(
    min_duration: str | timedelta | None = None,
    max_duration: str | timedelta | None = None,
    nullable: bool = False,
    null_probability: float = 0.0,
    unique: bool = False,
    generator: Callable[[], Any] | None = None,
) -> DurationField:
    """
    Build a `DurationField` describing a duration column.

    Parameters
    ----------
    min_duration
        Shortest allowed duration (inclusive), given as a string or a `timedelta` object.
    max_duration
        Longest allowed duration (inclusive), given as a string or a `timedelta` object.
    nullable
        Whether null values are permitted in the column. Defaults to `False`.
    null_probability
        Chance of producing a null when `nullable=True`. Defaults to `0.0`.
    unique
        Whether every value has to be distinct. Defaults to `False`.
    generator
        Optional zero-argument callable that produces the values. When supplied it overrides
        the other settings.

    Returns
    -------
    DurationField
        The duration field specification built from the arguments.

    Examples
    --------
    Define a schema with duration fields and generate test data:

    ```{python}
    import pointblank as pb
    from datetime import timedelta

    # Define a schema with duration field specifications
    schema = pb.Schema(
        session_length=pb.duration_field(
            min_duration=timedelta(minutes=5),
            max_duration=timedelta(hours=2)
        ),
        wait_time=pb.duration_field(
            min_duration=timedelta(seconds=30),
            max_duration=timedelta(minutes=15)
        ),
    )

    # Generate 100 rows of test data
    pb.generate_dataset(schema, n=100, seed=23)
    ```

    Duration values are uniformly distributed within the specified range.
    """
    # Collect the settings once and forward them unchanged as keyword arguments.
    options = {
        "min_duration": min_duration,
        "max_duration": max_duration,
        "nullable": nullable,
        "null_probability": null_probability,
        "unique": unique,
        "generator": generator,
    }
    return DurationField(**options)
|