pointblank 0.19.0__py3-none-any.whl → 0.20.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (318)
  1. pointblank/__init__.py +44 -1
  2. pointblank/_utils_llms_txt.py +20 -0
  3. pointblank/data/api-docs.txt +793 -1
  4. pointblank/field.py +1507 -0
  5. pointblank/generate/__init__.py +17 -0
  6. pointblank/generate/base.py +49 -0
  7. pointblank/generate/generators.py +573 -0
  8. pointblank/generate/regex.py +217 -0
  9. pointblank/locales/__init__.py +1476 -0
  10. pointblank/locales/data/AR/address.json +73 -0
  11. pointblank/locales/data/AR/company.json +60 -0
  12. pointblank/locales/data/AR/internet.json +19 -0
  13. pointblank/locales/data/AR/misc.json +7 -0
  14. pointblank/locales/data/AR/person.json +39 -0
  15. pointblank/locales/data/AR/text.json +38 -0
  16. pointblank/locales/data/AT/address.json +84 -0
  17. pointblank/locales/data/AT/company.json +65 -0
  18. pointblank/locales/data/AT/internet.json +20 -0
  19. pointblank/locales/data/AT/misc.json +8 -0
  20. pointblank/locales/data/AT/person.json +17 -0
  21. pointblank/locales/data/AT/text.json +35 -0
  22. pointblank/locales/data/AU/address.json +83 -0
  23. pointblank/locales/data/AU/company.json +65 -0
  24. pointblank/locales/data/AU/internet.json +20 -0
  25. pointblank/locales/data/AU/misc.json +8 -0
  26. pointblank/locales/data/AU/person.json +17 -0
  27. pointblank/locales/data/AU/text.json +35 -0
  28. pointblank/locales/data/BE/address.json +225 -0
  29. pointblank/locales/data/BE/company.json +129 -0
  30. pointblank/locales/data/BE/internet.json +36 -0
  31. pointblank/locales/data/BE/misc.json +6 -0
  32. pointblank/locales/data/BE/person.json +62 -0
  33. pointblank/locales/data/BE/text.json +38 -0
  34. pointblank/locales/data/BG/address.json +75 -0
  35. pointblank/locales/data/BG/company.json +60 -0
  36. pointblank/locales/data/BG/internet.json +19 -0
  37. pointblank/locales/data/BG/misc.json +7 -0
  38. pointblank/locales/data/BG/person.json +40 -0
  39. pointblank/locales/data/BG/text.json +38 -0
  40. pointblank/locales/data/BR/address.json +98 -0
  41. pointblank/locales/data/BR/company.json +65 -0
  42. pointblank/locales/data/BR/internet.json +20 -0
  43. pointblank/locales/data/BR/misc.json +8 -0
  44. pointblank/locales/data/BR/person.json +17 -0
  45. pointblank/locales/data/BR/text.json +35 -0
  46. pointblank/locales/data/CA/address.json +747 -0
  47. pointblank/locales/data/CA/company.json +120 -0
  48. pointblank/locales/data/CA/internet.json +24 -0
  49. pointblank/locales/data/CA/misc.json +11 -0
  50. pointblank/locales/data/CA/person.json +1033 -0
  51. pointblank/locales/data/CA/text.json +58 -0
  52. pointblank/locales/data/CH/address.json +184 -0
  53. pointblank/locales/data/CH/company.json +112 -0
  54. pointblank/locales/data/CH/internet.json +20 -0
  55. pointblank/locales/data/CH/misc.json +10 -0
  56. pointblank/locales/data/CH/person.json +64 -0
  57. pointblank/locales/data/CH/text.json +45 -0
  58. pointblank/locales/data/CL/address.json +71 -0
  59. pointblank/locales/data/CL/company.json +60 -0
  60. pointblank/locales/data/CL/internet.json +19 -0
  61. pointblank/locales/data/CL/misc.json +7 -0
  62. pointblank/locales/data/CL/person.json +38 -0
  63. pointblank/locales/data/CL/text.json +38 -0
  64. pointblank/locales/data/CN/address.json +124 -0
  65. pointblank/locales/data/CN/company.json +76 -0
  66. pointblank/locales/data/CN/internet.json +20 -0
  67. pointblank/locales/data/CN/misc.json +8 -0
  68. pointblank/locales/data/CN/person.json +50 -0
  69. pointblank/locales/data/CN/text.json +38 -0
  70. pointblank/locales/data/CO/address.json +76 -0
  71. pointblank/locales/data/CO/company.json +60 -0
  72. pointblank/locales/data/CO/internet.json +19 -0
  73. pointblank/locales/data/CO/misc.json +7 -0
  74. pointblank/locales/data/CO/person.json +38 -0
  75. pointblank/locales/data/CO/text.json +38 -0
  76. pointblank/locales/data/CY/address.json +62 -0
  77. pointblank/locales/data/CY/company.json +60 -0
  78. pointblank/locales/data/CY/internet.json +19 -0
  79. pointblank/locales/data/CY/misc.json +7 -0
  80. pointblank/locales/data/CY/person.json +38 -0
  81. pointblank/locales/data/CY/text.json +38 -0
  82. pointblank/locales/data/CZ/address.json +70 -0
  83. pointblank/locales/data/CZ/company.json +61 -0
  84. pointblank/locales/data/CZ/internet.json +19 -0
  85. pointblank/locales/data/CZ/misc.json +7 -0
  86. pointblank/locales/data/CZ/person.json +40 -0
  87. pointblank/locales/data/CZ/text.json +38 -0
  88. pointblank/locales/data/DE/address.json +756 -0
  89. pointblank/locales/data/DE/company.json +101 -0
  90. pointblank/locales/data/DE/internet.json +22 -0
  91. pointblank/locales/data/DE/misc.json +11 -0
  92. pointblank/locales/data/DE/person.json +1026 -0
  93. pointblank/locales/data/DE/text.json +50 -0
  94. pointblank/locales/data/DK/address.json +231 -0
  95. pointblank/locales/data/DK/company.json +65 -0
  96. pointblank/locales/data/DK/internet.json +20 -0
  97. pointblank/locales/data/DK/misc.json +7 -0
  98. pointblank/locales/data/DK/person.json +45 -0
  99. pointblank/locales/data/DK/text.json +43 -0
  100. pointblank/locales/data/EE/address.json +69 -0
  101. pointblank/locales/data/EE/company.json +60 -0
  102. pointblank/locales/data/EE/internet.json +19 -0
  103. pointblank/locales/data/EE/misc.json +7 -0
  104. pointblank/locales/data/EE/person.json +39 -0
  105. pointblank/locales/data/EE/text.json +38 -0
  106. pointblank/locales/data/ES/address.json +3086 -0
  107. pointblank/locales/data/ES/company.json +644 -0
  108. pointblank/locales/data/ES/internet.json +25 -0
  109. pointblank/locales/data/ES/misc.json +11 -0
  110. pointblank/locales/data/ES/person.json +488 -0
  111. pointblank/locales/data/ES/text.json +49 -0
  112. pointblank/locales/data/FI/address.json +93 -0
  113. pointblank/locales/data/FI/company.json +65 -0
  114. pointblank/locales/data/FI/internet.json +20 -0
  115. pointblank/locales/data/FI/misc.json +8 -0
  116. pointblank/locales/data/FI/person.json +17 -0
  117. pointblank/locales/data/FI/text.json +35 -0
  118. pointblank/locales/data/FR/address.json +619 -0
  119. pointblank/locales/data/FR/company.json +111 -0
  120. pointblank/locales/data/FR/internet.json +22 -0
  121. pointblank/locales/data/FR/misc.json +11 -0
  122. pointblank/locales/data/FR/person.json +1066 -0
  123. pointblank/locales/data/FR/text.json +50 -0
  124. pointblank/locales/data/GB/address.json +5759 -0
  125. pointblank/locales/data/GB/company.json +131 -0
  126. pointblank/locales/data/GB/internet.json +24 -0
  127. pointblank/locales/data/GB/misc.json +45 -0
  128. pointblank/locales/data/GB/person.json +578 -0
  129. pointblank/locales/data/GB/text.json +61 -0
  130. pointblank/locales/data/GR/address.json +68 -0
  131. pointblank/locales/data/GR/company.json +61 -0
  132. pointblank/locales/data/GR/internet.json +19 -0
  133. pointblank/locales/data/GR/misc.json +7 -0
  134. pointblank/locales/data/GR/person.json +39 -0
  135. pointblank/locales/data/GR/text.json +38 -0
  136. pointblank/locales/data/HK/address.json +79 -0
  137. pointblank/locales/data/HK/company.json +69 -0
  138. pointblank/locales/data/HK/internet.json +19 -0
  139. pointblank/locales/data/HK/misc.json +7 -0
  140. pointblank/locales/data/HK/person.json +42 -0
  141. pointblank/locales/data/HK/text.json +38 -0
  142. pointblank/locales/data/HR/address.json +73 -0
  143. pointblank/locales/data/HR/company.json +60 -0
  144. pointblank/locales/data/HR/internet.json +19 -0
  145. pointblank/locales/data/HR/misc.json +7 -0
  146. pointblank/locales/data/HR/person.json +38 -0
  147. pointblank/locales/data/HR/text.json +38 -0
  148. pointblank/locales/data/HU/address.json +70 -0
  149. pointblank/locales/data/HU/company.json +61 -0
  150. pointblank/locales/data/HU/internet.json +19 -0
  151. pointblank/locales/data/HU/misc.json +7 -0
  152. pointblank/locales/data/HU/person.json +40 -0
  153. pointblank/locales/data/HU/text.json +38 -0
  154. pointblank/locales/data/ID/address.json +68 -0
  155. pointblank/locales/data/ID/company.json +61 -0
  156. pointblank/locales/data/ID/internet.json +19 -0
  157. pointblank/locales/data/ID/misc.json +7 -0
  158. pointblank/locales/data/ID/person.json +40 -0
  159. pointblank/locales/data/ID/text.json +38 -0
  160. pointblank/locales/data/IE/address.json +643 -0
  161. pointblank/locales/data/IE/company.json +140 -0
  162. pointblank/locales/data/IE/internet.json +24 -0
  163. pointblank/locales/data/IE/misc.json +44 -0
  164. pointblank/locales/data/IE/person.json +55 -0
  165. pointblank/locales/data/IE/text.json +60 -0
  166. pointblank/locales/data/IN/address.json +92 -0
  167. pointblank/locales/data/IN/company.json +65 -0
  168. pointblank/locales/data/IN/internet.json +20 -0
  169. pointblank/locales/data/IN/misc.json +8 -0
  170. pointblank/locales/data/IN/person.json +52 -0
  171. pointblank/locales/data/IN/text.json +39 -0
  172. pointblank/locales/data/IS/address.json +63 -0
  173. pointblank/locales/data/IS/company.json +61 -0
  174. pointblank/locales/data/IS/internet.json +19 -0
  175. pointblank/locales/data/IS/misc.json +7 -0
  176. pointblank/locales/data/IS/person.json +44 -0
  177. pointblank/locales/data/IS/text.json +38 -0
  178. pointblank/locales/data/IT/address.json +192 -0
  179. pointblank/locales/data/IT/company.json +137 -0
  180. pointblank/locales/data/IT/internet.json +20 -0
  181. pointblank/locales/data/IT/misc.json +10 -0
  182. pointblank/locales/data/IT/person.json +70 -0
  183. pointblank/locales/data/IT/text.json +44 -0
  184. pointblank/locales/data/JP/address.json +713 -0
  185. pointblank/locales/data/JP/company.json +113 -0
  186. pointblank/locales/data/JP/internet.json +22 -0
  187. pointblank/locales/data/JP/misc.json +10 -0
  188. pointblank/locales/data/JP/person.json +1057 -0
  189. pointblank/locales/data/JP/text.json +51 -0
  190. pointblank/locales/data/KR/address.json +77 -0
  191. pointblank/locales/data/KR/company.json +68 -0
  192. pointblank/locales/data/KR/internet.json +19 -0
  193. pointblank/locales/data/KR/misc.json +7 -0
  194. pointblank/locales/data/KR/person.json +40 -0
  195. pointblank/locales/data/KR/text.json +38 -0
  196. pointblank/locales/data/LT/address.json +66 -0
  197. pointblank/locales/data/LT/company.json +60 -0
  198. pointblank/locales/data/LT/internet.json +19 -0
  199. pointblank/locales/data/LT/misc.json +7 -0
  200. pointblank/locales/data/LT/person.json +42 -0
  201. pointblank/locales/data/LT/text.json +38 -0
  202. pointblank/locales/data/LU/address.json +66 -0
  203. pointblank/locales/data/LU/company.json +60 -0
  204. pointblank/locales/data/LU/internet.json +19 -0
  205. pointblank/locales/data/LU/misc.json +7 -0
  206. pointblank/locales/data/LU/person.json +38 -0
  207. pointblank/locales/data/LU/text.json +38 -0
  208. pointblank/locales/data/LV/address.json +62 -0
  209. pointblank/locales/data/LV/company.json +60 -0
  210. pointblank/locales/data/LV/internet.json +19 -0
  211. pointblank/locales/data/LV/misc.json +7 -0
  212. pointblank/locales/data/LV/person.json +40 -0
  213. pointblank/locales/data/LV/text.json +38 -0
  214. pointblank/locales/data/MT/address.json +61 -0
  215. pointblank/locales/data/MT/company.json +60 -0
  216. pointblank/locales/data/MT/internet.json +19 -0
  217. pointblank/locales/data/MT/misc.json +7 -0
  218. pointblank/locales/data/MT/person.json +38 -0
  219. pointblank/locales/data/MT/text.json +38 -0
  220. pointblank/locales/data/MX/address.json +100 -0
  221. pointblank/locales/data/MX/company.json +65 -0
  222. pointblank/locales/data/MX/internet.json +20 -0
  223. pointblank/locales/data/MX/misc.json +8 -0
  224. pointblank/locales/data/MX/person.json +18 -0
  225. pointblank/locales/data/MX/text.json +39 -0
  226. pointblank/locales/data/NL/address.json +1517 -0
  227. pointblank/locales/data/NL/company.json +133 -0
  228. pointblank/locales/data/NL/internet.json +44 -0
  229. pointblank/locales/data/NL/misc.json +55 -0
  230. pointblank/locales/data/NL/person.json +365 -0
  231. pointblank/locales/data/NL/text.json +210 -0
  232. pointblank/locales/data/NO/address.json +86 -0
  233. pointblank/locales/data/NO/company.json +66 -0
  234. pointblank/locales/data/NO/internet.json +20 -0
  235. pointblank/locales/data/NO/misc.json +8 -0
  236. pointblank/locales/data/NO/person.json +17 -0
  237. pointblank/locales/data/NO/text.json +35 -0
  238. pointblank/locales/data/NZ/address.json +90 -0
  239. pointblank/locales/data/NZ/company.json +65 -0
  240. pointblank/locales/data/NZ/internet.json +20 -0
  241. pointblank/locales/data/NZ/misc.json +8 -0
  242. pointblank/locales/data/NZ/person.json +17 -0
  243. pointblank/locales/data/NZ/text.json +39 -0
  244. pointblank/locales/data/PH/address.json +67 -0
  245. pointblank/locales/data/PH/company.json +61 -0
  246. pointblank/locales/data/PH/internet.json +19 -0
  247. pointblank/locales/data/PH/misc.json +7 -0
  248. pointblank/locales/data/PH/person.json +40 -0
  249. pointblank/locales/data/PH/text.json +38 -0
  250. pointblank/locales/data/PL/address.json +91 -0
  251. pointblank/locales/data/PL/company.json +65 -0
  252. pointblank/locales/data/PL/internet.json +20 -0
  253. pointblank/locales/data/PL/misc.json +8 -0
  254. pointblank/locales/data/PL/person.json +17 -0
  255. pointblank/locales/data/PL/text.json +35 -0
  256. pointblank/locales/data/PT/address.json +90 -0
  257. pointblank/locales/data/PT/company.json +65 -0
  258. pointblank/locales/data/PT/internet.json +20 -0
  259. pointblank/locales/data/PT/misc.json +8 -0
  260. pointblank/locales/data/PT/person.json +17 -0
  261. pointblank/locales/data/PT/text.json +35 -0
  262. pointblank/locales/data/RO/address.json +73 -0
  263. pointblank/locales/data/RO/company.json +61 -0
  264. pointblank/locales/data/RO/internet.json +19 -0
  265. pointblank/locales/data/RO/misc.json +7 -0
  266. pointblank/locales/data/RO/person.json +40 -0
  267. pointblank/locales/data/RO/text.json +38 -0
  268. pointblank/locales/data/RU/address.json +74 -0
  269. pointblank/locales/data/RU/company.json +60 -0
  270. pointblank/locales/data/RU/internet.json +19 -0
  271. pointblank/locales/data/RU/misc.json +7 -0
  272. pointblank/locales/data/RU/person.json +38 -0
  273. pointblank/locales/data/RU/text.json +38 -0
  274. pointblank/locales/data/SE/address.json +247 -0
  275. pointblank/locales/data/SE/company.json +65 -0
  276. pointblank/locales/data/SE/internet.json +20 -0
  277. pointblank/locales/data/SE/misc.json +7 -0
  278. pointblank/locales/data/SE/person.json +45 -0
  279. pointblank/locales/data/SE/text.json +43 -0
  280. pointblank/locales/data/SI/address.json +67 -0
  281. pointblank/locales/data/SI/company.json +60 -0
  282. pointblank/locales/data/SI/internet.json +19 -0
  283. pointblank/locales/data/SI/misc.json +7 -0
  284. pointblank/locales/data/SI/person.json +38 -0
  285. pointblank/locales/data/SI/text.json +38 -0
  286. pointblank/locales/data/SK/address.json +64 -0
  287. pointblank/locales/data/SK/company.json +60 -0
  288. pointblank/locales/data/SK/internet.json +19 -0
  289. pointblank/locales/data/SK/misc.json +7 -0
  290. pointblank/locales/data/SK/person.json +38 -0
  291. pointblank/locales/data/SK/text.json +38 -0
  292. pointblank/locales/data/TR/address.json +105 -0
  293. pointblank/locales/data/TR/company.json +65 -0
  294. pointblank/locales/data/TR/internet.json +20 -0
  295. pointblank/locales/data/TR/misc.json +8 -0
  296. pointblank/locales/data/TR/person.json +17 -0
  297. pointblank/locales/data/TR/text.json +35 -0
  298. pointblank/locales/data/TW/address.json +86 -0
  299. pointblank/locales/data/TW/company.json +69 -0
  300. pointblank/locales/data/TW/internet.json +19 -0
  301. pointblank/locales/data/TW/misc.json +7 -0
  302. pointblank/locales/data/TW/person.json +42 -0
  303. pointblank/locales/data/TW/text.json +38 -0
  304. pointblank/locales/data/US/address.json +996 -0
  305. pointblank/locales/data/US/company.json +131 -0
  306. pointblank/locales/data/US/internet.json +22 -0
  307. pointblank/locales/data/US/misc.json +11 -0
  308. pointblank/locales/data/US/person.json +1092 -0
  309. pointblank/locales/data/US/text.json +56 -0
  310. pointblank/locales/data/_shared/misc.json +42 -0
  311. pointblank/schema.py +339 -2
  312. {pointblank-0.19.0.dist-info → pointblank-0.20.0.dist-info}/METADATA +45 -1
  313. pointblank-0.20.0.dist-info/RECORD +366 -0
  314. {pointblank-0.19.0.dist-info → pointblank-0.20.0.dist-info}/WHEEL +1 -1
  315. pointblank-0.19.0.dist-info/RECORD +0 -59
  316. {pointblank-0.19.0.dist-info → pointblank-0.20.0.dist-info}/entry_points.txt +0 -0
  317. {pointblank-0.19.0.dist-info → pointblank-0.20.0.dist-info}/licenses/LICENSE +0 -0
  318. {pointblank-0.19.0.dist-info → pointblank-0.20.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,17 @@
1
+ """
2
+ Data generation module for Pointblank.
3
+
4
+ This module provides synthetic test data generation from Schema definitions.
5
+ """
6
+
7
+ from pointblank.generate.base import GeneratorConfig
8
+ from pointblank.generate.generators import (
9
+ generate_column,
10
+ generate_dataframe,
11
+ )
12
+
13
+ __all__ = [
14
+ "GeneratorConfig",
15
+ "generate_column",
16
+ "generate_dataframe",
17
+ ]
@@ -0,0 +1,49 @@
1
+ """
2
+ Base infrastructure for data generation.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+ from typing import TYPE_CHECKING, Literal
9
+
10
+ if TYPE_CHECKING:
11
+ pass
12
+
13
+ __all__ = ["GeneratorConfig"]
14
+
15
+
16
@dataclass
class GeneratorConfig:
    """
    Settings that control a synthetic data generation run.

    Parameters
    ----------
    n
        Number of rows to generate. Must be zero or greater.
    seed
        Random seed for reproducibility. `None` leaves the run unseeded.
    output
        Output format: "polars", "pandas", or "dict".
    country
        Country code for realistic data generation. Accepts ISO 3166-1 alpha-2 codes
        (e.g., `"US"`, `"DE"`, `"FR"`) or alpha-3 codes (e.g., `"USA"`, `"DEU"`).
        Default is `"US"`.
    max_unique_retries
        Maximum retries when generating unique values. Must be at least 1.

    Raises
    ------
    ValueError
        If `n` is negative or `max_unique_retries` is smaller than 1.
    """

    n: int = 100
    seed: int | None = None
    output: Literal["polars", "pandas", "dict"] = "polars"
    country: str = "US"
    max_unique_retries: int = 1000

    def __post_init__(self):
        # Checks run in field order, so `n` is reported first when both values are invalid.
        row_count = self.n
        if row_count < 0:
            raise ValueError(f"n must be non-negative, got {row_count}")

        retry_cap = self.max_unique_retries
        if retry_cap < 1:
            raise ValueError(f"max_unique_retries must be at least 1, got {retry_cap}")
@@ -0,0 +1,573 @@
1
+ """
2
+ Per-dtype value generators for synthetic data generation.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import random
8
+ import string
9
+ from datetime import date, datetime, time, timedelta
10
+ from typing import TYPE_CHECKING, Any, Callable
11
+
12
+ from pointblank._utils import _is_lib_present
13
+ from pointblank.field import Field
14
+ from pointblank.generate.base import GeneratorConfig
15
+ from pointblank.generate.regex import generate_from_regex
16
+ from pointblank.locales import LocaleGenerator
17
+
18
+ if TYPE_CHECKING:
19
+ pass
20
+
21
+ __all__ = ["generate_column", "generate_dataframe"]
22
+
23
+
24
# Inclusive (min, max) range of every supported integer dtype, keyed by dtype name.
# A signed N-bit type spans [-2**(N-1), 2**(N-1) - 1]; an unsigned one spans [0, 2**N - 1].
INTEGER_BOUNDS = {
    **{f"Int{bits}": (-(2 ** (bits - 1)), 2 ** (bits - 1) - 1) for bits in (8, 16, 32, 64)},
    **{f"UInt{bits}": (0, 2**bits - 1) for bits in (8, 16, 32, 64)},
}
35
+
36
+
37
def _get_locale_generator(country: str = "US", seed: int | None = None) -> LocaleGenerator:
    """Build a `LocaleGenerator` for the given country code and optional seed."""
    locale_generator = LocaleGenerator(country=country, seed=seed)
    return locale_generator
40
+
41
+
42
def _generate_integer(field: Field, rng: random.Random, generator: Any | None = None) -> int:
    """
    Generate a random integer that satisfies the field constraints and fits its dtype.

    Parameters
    ----------
    field
        Field specification. `field.dtype` selects the representable range from
        `INTEGER_BOUNDS` (unknown dtypes fall back to the signed 64-bit range). Optional
        `min_val` / `max_val` attributes narrow that range.
    rng
        Random number generator to draw from.
    generator
        Not used by this function.

    Returns
    -------
    int
        A value within both the requested bounds and the dtype bounds (inclusive).

    Raises
    ------
    ValueError
        If no integer lies within both the requested bounds and the dtype bounds.
    """
    import math

    dtype_min, dtype_max = INTEGER_BOUNDS.get(field.dtype, (-(2**63), 2**63 - 1))

    min_val = getattr(field, "min_val", None)
    max_val = getattr(field, "max_val", None)

    # Round fractional bounds inward (ceil for the lower bound, floor for the upper one).
    # Truncating with int() rounds toward zero, which can admit values outside the
    # requested range (e.g. min_val=0.5 would allow 0).
    low = dtype_min if min_val is None else max(math.ceil(min_val), dtype_min)
    high = dtype_max if max_val is None else min(math.floor(max_val), dtype_max)

    if low > high:
        raise ValueError(
            f"Empty integer range for dtype {field.dtype}: "
            f"min_val={min_val}, max_val={max_val} leaves no value in [{low}, {high}]"
        )

    return rng.randint(low, high)
57
+
58
+
59
def _generate_float(field: Field, rng: random.Random, generator: Any | None = None) -> float:
    """
    Draw a random float between the field's `min_val` and `max_val`.

    A bound that is missing or `None` defaults to -1e10 (lower) or 1e10 (upper). The
    `generator` argument is not used by this function.
    """
    lower = getattr(field, "min_val", None)
    upper = getattr(field, "max_val", None)

    if lower is None:
        lower = -1e10
    if upper is None:
        upper = 1e10

    return rng.uniform(float(lower), float(upper))
68
+
69
+
70
def _generate_string(
    field: Field, rng: random.Random, generator: LocaleGenerator | None = None
) -> str:
    """
    Generate a random string value respecting field constraints.

    The first applicable strategy wins:

    1. `field.preset` is set: delegate to `_generate_from_preset()`.
    2. `field.pattern` is set: delegate to `_generate_from_pattern()`.
    3. otherwise: a random ASCII alphanumeric string whose length lies between
       `min_length` and `max_length` (defaults 1 and 20).

    Parameters
    ----------
    field
        Field specification; `preset`, `pattern`, `min_length` and `max_length` are read
        when present.
    rng
        Random number generator used for pattern and alphanumeric generation.
    generator
        Locale generator used for preset generation; required when a preset is set.

    Returns
    -------
    str
        The generated string.

    Raises
    ------
    ValueError
        If a preset is set but no `generator` was supplied, or if the length bounds
        describe an empty range.
    """
    # If using a preset, delegate to locale generator
    preset = getattr(field, "preset", None)
    if preset is not None:
        if generator is None:
            raise ValueError("LocaleGenerator instance required for preset generation")
        return _generate_from_preset(preset, generator)

    # If using a pattern, generate from regex
    pattern = getattr(field, "pattern", None)
    if pattern is not None:
        return _generate_from_pattern(pattern, rng)

    # Otherwise generate random alphanumeric string
    min_length = getattr(field, "min_length", None)
    max_length = getattr(field, "max_length", None)

    if min_length is None and max_length is None:
        min_len, max_len = 1, 20
    elif max_length is None:
        # Only a minimum was given: the default maximum must never undercut it, otherwise
        # e.g. min_length=25 would produce the empty range [25, 20].
        min_len, max_len = min_length, max(min_length, 20)
    elif min_length is None:
        # Only a maximum was given: the default minimum of 1 shrinks to 0 for max_length=0.
        min_len, max_len = min(1, max(max_length, 0)), max_length
    else:
        min_len, max_len = min_length, max_length

    if min_len > max_len:
        raise ValueError(
            f"Invalid string length range: min_length={min_len}, max_length={max_len}"
        )

    length = rng.randint(min_len, max_len)
    chars = string.ascii_letters + string.digits
    return "".join(rng.choice(chars) for _ in range(length))
95
+
96
+
97
# Preset names accepted by `_generate_from_preset()`. Every preset is produced by the
# generator method of the same name, so this set doubles as an allowlist of the
# attributes that may be looked up on the generator.
_PRESET_NAMES = frozenset(
    {
        # Personal
        "name",
        "name_full",
        "first_name",
        "last_name",
        "email",
        "phone_number",
        "address",
        "city",
        "state",
        "country",
        "postcode",
        "latitude",
        "longitude",
        # Business
        "company",
        "job",
        "catch_phrase",
        # Internet
        "url",
        "domain_name",
        "ipv4",
        "ipv6",
        "user_name",
        "password",
        # Text
        "text",
        "sentence",
        "paragraph",
        "word",
        # Financial
        "credit_card_number",
        "iban",
        "currency_code",
        # Identifiers
        "uuid4",
        "ssn",
        "license_plate",
        # Date/Time
        "date_this_year",
        "date_this_decade",
        "time",
        # Misc
        "color_name",
        "file_name",
        "file_extension",
        "mime_type",
    }
)


def _generate_from_preset(preset: str, generator: LocaleGenerator) -> str:
    """
    Generate a value using a LocaleGenerator preset.

    Parameters
    ----------
    preset
        Name of the preset; must be one of `_PRESET_NAMES`.
    generator
        Object providing a zero-argument method named after the preset.

    Returns
    -------
    str
        The preset method's result converted with `str()`.

    Raises
    ------
    ValueError
        If `preset` is not a known preset name.
    """
    # Validate against the allowlist before touching the generator, so arbitrary
    # attributes can never be reached through the preset name. Only the requested method
    # is looked up, instead of building a full name -> method mapping for every value.
    if preset not in _PRESET_NAMES:
        raise ValueError(f"Unknown preset: {preset}")

    produce = getattr(generator, preset)
    return str(produce())
155
+
156
+
157
def _generate_from_pattern(pattern: str, rng: random.Random) -> str:
    """Produce a string for the regex `pattern` by delegating to `generate_from_regex()`."""
    matching_text = generate_from_regex(pattern, rng)
    return matching_text
160
+
161
+
162
def _generate_boolean(field: Field, rng: random.Random, generator: Any | None = None) -> bool:
    """
    Generate a random boolean value.

    Parameters
    ----------
    field
        Field specification. Its optional `p_true` attribute is the probability of
        returning `True`; a missing or `None` value means 0.5.
    rng
        Random number generator to draw from.
    generator
        Not used by this function.

    Returns
    -------
    bool
        `True` when the drawn number is below `p_true`, otherwise `False`.
    """
    p_true = getattr(field, "p_true", None)
    # Treat an explicit None like a missing attribute, as the other dtype generators do
    # for their optional constraints; comparing a float with None would raise TypeError.
    if p_true is None:
        p_true = 0.5
    return rng.random() < p_true
166
+
167
+
168
def _generate_date(field: Field, rng: random.Random, generator: Any | None = None) -> date:
    """
    Pick a random date between the field's `min_date` and `max_date` (inclusive).

    Each bound may be `None` (defaults: 2000-01-01 and 2030-12-31), an ISO-format string,
    a `datetime` (its date part is used), or any other value, which is used unchanged.
    When the upper bound precedes the lower bound, the lower bound is returned. The
    `generator` argument is not used by this function.
    """

    def _as_date(bound, fallback):
        # Normalise one bound; values that are neither None, str nor datetime pass through.
        if bound is None:
            return fallback
        if isinstance(bound, str):
            return date.fromisoformat(bound)
        if isinstance(bound, datetime):
            return bound.date()
        return bound

    raw_start = getattr(field, "min_date", None)
    raw_end = getattr(field, "max_date", None)

    start = _as_date(raw_start, date(2000, 1, 1))
    end = _as_date(raw_end, date(2030, 12, 31))

    span_days = (end - start).days
    # A negative span collapses to zero so randint never sees an empty range.
    offset = rng.randint(0, span_days if span_days > 0 else 0)
    return start + timedelta(days=offset)
191
+
192
+
193
def _generate_datetime(field: Field, rng: random.Random, generator: Any | None = None) -> datetime:
    """
    Pick a random datetime between the field's `min_date` and `max_date`.

    Each bound may be `None` (defaults: 2000-01-01 00:00:00 and 2030-12-31 23:59:59), an
    ISO-format string, a plain `date` (expanded to the start of the day for the lower
    bound and the end of the day for the upper bound), or any other value, which is used
    unchanged. The result is the lower bound plus a whole number of seconds; when the
    upper bound precedes the lower bound, the lower bound is returned. The `generator`
    argument is not used by this function.
    """

    def _as_datetime(bound, fallback, fill_time):
        # Normalise one bound; `fill_time` supplies the time of day for plain dates.
        if bound is None:
            return fallback
        if isinstance(bound, str):
            return datetime.fromisoformat(bound)
        if isinstance(bound, date) and not isinstance(bound, datetime):
            return datetime.combine(bound, fill_time)
        return bound

    raw_start = getattr(field, "min_date", None)
    raw_end = getattr(field, "max_date", None)

    start = _as_datetime(raw_start, datetime(2000, 1, 1, 0, 0, 0), datetime.min.time())
    end = _as_datetime(raw_end, datetime(2030, 12, 31, 23, 59, 59), datetime.max.time())

    span_seconds = int((end - start).total_seconds())
    # A negative span collapses to zero so randint never sees an empty range.
    offset = rng.randint(0, span_seconds if span_seconds > 0 else 0)
    return start + timedelta(seconds=offset)
220
+
221
+
222
def _generate_duration(field: Field, rng: random.Random, generator: Any | None = None) -> timedelta:
    """
    Pick a random whole-second duration between `min_duration` and `max_duration`.

    Each bound may be `None`, a string in "HH:MM:SS" or "MM:SS" form, or any other value,
    which is used unchanged. A missing bound, or a string with a different number of
    colon-separated parts, falls back to the default (zero for the minimum, 30 days for
    the maximum). When the maximum is below the minimum, the minimum (truncated to whole
    seconds) is returned. The `generator` argument is not used by this function.
    """

    def _as_timedelta(bound, fallback):
        # Normalise one bound; strings with an unexpected shape use the fallback.
        if bound is None:
            return fallback
        if not isinstance(bound, str):
            return bound
        pieces = bound.split(":")
        if len(pieces) == 3:
            hours, minutes, seconds = map(float, pieces)
            return timedelta(hours=hours, minutes=minutes, seconds=seconds)
        if len(pieces) == 2:
            minutes, seconds = map(float, pieces)
            return timedelta(minutes=minutes, seconds=seconds)
        return fallback

    raw_min = getattr(field, "min_duration", None)
    raw_max = getattr(field, "max_duration", None)

    shortest = _as_timedelta(raw_min, timedelta(seconds=0))
    longest = _as_timedelta(raw_max, timedelta(days=30))

    # Work in whole seconds; fractional parts of the bounds are truncated.
    low = int(shortest.total_seconds())
    high = int(longest.total_seconds())
    drawn = rng.randint(low, high if high > low else low)

    return timedelta(seconds=drawn)
267
+
268
+
269
def _generate_time(field: Field, rng: random.Random, generator: Any | None = None) -> str:
    """
    Pick a random time of day between `min_time` and `max_time`, formatted as "HH:MM:SS".

    Each bound may be `None` (defaults: 00:00:00 and 23:59:59), an ISO-format string, or
    any other value, which is used unchanged and must expose `hour`, `minute` and
    `second`. Sub-second parts of the bounds are ignored. When the maximum is earlier than
    the minimum, the minimum is returned. The `generator` argument is not used by this
    function.
    """

    def _as_time(bound, fallback):
        # Normalise one bound; non-string, non-None values pass through unchanged.
        if bound is None:
            return fallback
        if isinstance(bound, str):
            return time.fromisoformat(bound)
        return bound

    def _second_of_day(moment):
        return moment.hour * 3600 + moment.minute * 60 + moment.second

    raw_min = getattr(field, "min_time", None)
    raw_max = getattr(field, "max_time", None)

    earliest = _as_time(raw_min, time(0, 0, 0))
    latest = _as_time(raw_max, time(23, 59, 59))

    low = _second_of_day(earliest)
    high = _second_of_day(latest)
    drawn = rng.randint(low, high if high > low else low)

    # Split seconds-since-midnight back into clock components.
    hour, remainder = divmod(drawn, 3600)
    minute, second = divmod(remainder, 60)

    return f"{hour:02d}:{minute:02d}:{second:02d}"
303
+
304
+
305
# Dispatch table: dtype name -> function producing a single value of that dtype.
# Every function is called as `fn(field, rng, locale_generator_or_None)`.
DTYPE_GENERATORS: dict[str, Callable[[Field, random.Random, Any | None], Any]] = {
    **dict.fromkeys(
        ("Int8", "Int16", "Int32", "Int64", "UInt8", "UInt16", "UInt32", "UInt64"),
        _generate_integer,
    ),
    **dict.fromkeys(("Float32", "Float64"), _generate_float),
    "String": _generate_string,
    "Boolean": _generate_boolean,
    "Date": _generate_date,
    "Datetime": _generate_datetime,
    "Duration": _generate_duration,
    "Time": _generate_time,
}
324
+
325
+
326
def _generate_value(field: Field, rng: random.Random, locale_gen: Any | None = None) -> Any:
    """
    Produce one value for `field`.

    The first applicable source wins: the field's own `generator` callable, then a random
    pick from its `allowed` values, then the dtype-specific function registered in
    `DTYPE_GENERATORS`.

    Raises
    ------
    ValueError
        If none of the above applies because the dtype has no registered generator.
    """
    # A user-supplied callable takes precedence over everything else.
    custom = field.generator
    if custom is not None:
        return custom()

    # Categorical field: choose one of the allowed values.
    choices = getattr(field, "allowed", None)
    if choices is not None:
        return rng.choice(choices)

    # Fall back to the generator registered for the field's dtype.
    produce = DTYPE_GENERATORS.get(field.dtype)
    if produce is not None:
        return produce(field, rng, locale_gen)

    raise ValueError(f"No generator available for dtype: {field.dtype}")
343
+
344
+
345
def _generate_unique_values(
    field: Field,
    n: int,
    rng: random.Random,
    locale_gen: Any | None = None,
    max_retries: int = 1000,
) -> list[Any]:
    """
    Collect `n` distinct values for `field`.

    Values are drawn one at a time with `_generate_value()` and kept when not seen before.
    Lists and dicts are compared by their `str()` form. A value whose uniqueness check
    raises `TypeError` (it cannot be hashed) is kept without being checked.

    Raises
    ------
    ValueError
        If the field's `allowed` values are fewer than `n`, or if more than `max_retries`
        duplicates are drawn in a row.
    """
    # Fail fast when the categorical pool is too small to ever yield n distinct values.
    allowed = getattr(field, "allowed", None)
    if allowed is not None and len(allowed) < n:
        raise ValueError(
            f"Cannot generate {n} unique values from {len(allowed)} allowed values "
            f"for field with allowed={allowed}"
        )

    seen: set[Any] = set()
    values: list[Any] = []
    duplicate_streak = 0

    while len(values) < n:
        candidate = _generate_value(field, rng, locale_gen)

        try:
            key = str(candidate) if isinstance(candidate, (list, dict)) else candidate
            is_new = key not in seen
            if is_new:
                seen.add(key)
        except TypeError:
            # Unhashable type, just append (can't check uniqueness easily)
            values.append(candidate)
            continue

        if is_new:
            values.append(candidate)
            duplicate_streak = 0
            continue

        duplicate_streak += 1
        if duplicate_streak > max_retries:
            raise ValueError(
                f"Unable to generate {n} unique values after {max_retries} "
                f"consecutive retries. Generated {len(values)} unique values. "
                "Consider relaxing constraints or reducing n."
            )

    return values
391
+
392
+
393
def generate_column(
    field: Field,
    config: GeneratorConfig,
) -> list[Any]:
    """
    Generate a list of values for a single column.

    Parameters
    ----------
    field
        The Field specification for the column.
    config
        Generation configuration. This function reads `n`, `seed`, `country` and
        `max_unique_retries` from it.

    Returns
    -------
    list
        List of `config.n` generated values. Entries may be `None` when the field is nullable
        and has a positive `null_probability`.
    """
    # Set up random number generator
    rng = random.Random(config.seed)

    # A locale generator is only created when the field declares a preset
    locale_gen = None
    if getattr(field, "preset", None) is not None:
        locale_gen = _get_locale_generator(config.country, config.seed)

    # Generate values
    if field.unique:
        values = _generate_unique_values(
            field, config.n, rng, locale_gen, config.max_unique_retries
        )
    else:
        values = [_generate_value(field, rng, locale_gen) for _ in range(config.n)]

    # Apply null probability
    if field.nullable and field.null_probability > 0:
        # Nulls are drawn from a second RNG seeded with `seed + 1`. The seed is compared against
        # `None` explicitly because `0` is a valid seed and must stay reproducible
        null_seed = None if config.seed is None else config.seed + 1
        null_rng = random.Random(null_seed)
        values = [None if null_rng.random() < field.null_probability else v for v in values]

    return values
436
+
437
+
438
# Presets whose values must be coherent within a row. Columns that use one of these are
# generated through a locale generator shared across columns instead of independently.

# Location-flavoured presets
ADDRESS_RELATED_PRESETS = {
    "latitude",
    "longitude",
    "address",
    "city",
    "state",
    "postcode",
    "phone_number",
}

# Person-flavoured presets
PERSON_RELATED_PRESETS = {
    "first_name",
    "last_name",
    "name",
    "name_full",
    "user_name",
    "email",
}
449
+
450
+
451
def _get_coherence_needs(fields: dict[str, Field]) -> tuple[bool, bool]:
    """
    Report which kinds of row coherence the given fields call for.

    Parameters
    ----------
    fields
        Dictionary mapping column names to Field specifications.

    Returns
    -------
    tuple[bool, bool]
        `(needs_address, needs_person)`: whether any field's preset is in
        `ADDRESS_RELATED_PRESETS` and whether any is in `PERSON_RELATED_PRESETS`.
    """
    # Fields without a `preset` attribute count as having no preset
    presets = [getattr(spec, "preset", None) for spec in fields.values()]

    address_hits = [preset in ADDRESS_RELATED_PRESETS for preset in presets]
    person_hits = [preset in PERSON_RELATED_PRESETS for preset in presets]

    return any(address_hits), any(person_hits)
464
+
465
+
466
def _generate_column_with_row_context(
    field: Field,
    config: GeneratorConfig,
    locale_gen: LocaleGenerator | None,
) -> list[Any]:
    """
    Generate column values with per-row context (location and/or person).

    This is used when columns need to share coherent data per row. When a locale generator is
    supplied, `locale_gen.set_row(i)` is called with the row index before each value is
    generated.

    Parameters
    ----------
    field
        The Field specification for the column.
    config
        Generation configuration. This function reads `n` and `seed` from it.
    locale_gen
        Locale generator shared across columns, or `None`.

    Returns
    -------
    list
        List of `config.n` generated values. Entries may be `None` when the field is nullable
        and has a positive `null_probability`.
    """
    # NOTE(review): `field.unique` is not consulted on this path -- confirm that is intended
    rng = random.Random(config.seed)

    values = []
    for i in range(config.n):
        if locale_gen is not None:
            locale_gen.set_row(i)
        values.append(_generate_value(field, rng, locale_gen))

    # Apply null probability
    if field.nullable and field.null_probability > 0:
        # Nulls are drawn from a second RNG seeded with `seed + 1`. The seed is compared against
        # `None` explicitly because `0` is a valid seed and must stay reproducible
        null_seed = None if config.seed is None else config.seed + 1
        null_rng = random.Random(null_seed)
        values = [None if null_rng.random() < field.null_probability else v for v in values]

    return values
490
+
491
+
492
def generate_dataframe(
    fields: dict[str, Field],
    config: GeneratorConfig,
) -> Any:
    """
    Generate a DataFrame with the specified fields.

    Columns whose preset is address- or person-related are generated through one locale
    generator shared by all such columns. Every other column is generated independently with
    `generate_column()`.

    Parameters
    ----------
    fields
        Dictionary mapping column names to Field specifications.
    config
        Generation configuration.

    Returns
    -------
    DataFrame
        Generated data in the format specified by `config.output`: a plain `dict` of column
        lists for `"dict"`, a Polars DataFrame for `"polars"`, or a Pandas DataFrame for
        `"pandas"`.

    Raises
    ------
    ImportError
        If `config.output` names a library that is not installed.
    ValueError
        If `config.output` is not one of the supported formats.
    """
    # Check what coherence is needed
    needs_address, needs_person = _get_coherence_needs(fields)
    needs_coherence = needs_address or needs_person

    # Set up shared locale generator if any coherence is needed
    shared_locale_gen = None
    if needs_coherence:
        shared_locale_gen = _get_locale_generator(config.country, config.seed)
        if needs_address:
            shared_locale_gen.init_row_locations(config.n)
        if needs_person:
            shared_locale_gen.init_row_persons(config.n)

    # Determine which presets need row context
    coherent_presets = set()
    if needs_address:
        coherent_presets.update(ADDRESS_RELATED_PRESETS)
    if needs_person:
        coherent_presets.update(PERSON_RELATED_PRESETS)

    # Generate data for each column
    data: dict[str, list[Any]] = {}
    try:
        for col_name, field in fields.items():
            preset = getattr(field, "preset", None)

            # Use shared locale generator for coherent presets
            if needs_coherence and preset in coherent_presets:
                data[col_name] = _generate_column_with_row_context(
                    field, config, shared_locale_gen
                )
            else:
                data[col_name] = generate_column(field, config)
    finally:
        # Clear the per-row state even when a column fails to generate, so it is never left
        # behind on the shared locale generator
        if shared_locale_gen is not None:
            if needs_address:
                shared_locale_gen.clear_row_locations()
            if needs_person:
                shared_locale_gen.clear_row_persons()

    # Convert to requested output format
    if config.output == "dict":
        return data

    if config.output == "polars":
        if not _is_lib_present("polars"):
            raise ImportError(
                "The Polars library is not installed but is required when specifying "
                '`output="polars"`.'
            )
        import polars as pl

        return pl.DataFrame(data)

    if config.output == "pandas":
        if not _is_lib_present("pandas"):
            raise ImportError(
                "The Pandas library is not installed but is required when specifying "
                '`output="pandas"`.'
            )
        import pandas as pd

        return pd.DataFrame(data)

    raise ValueError(f"Unknown output format: {config.output}")