dataforge-py 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge/__init__.py +20 -0
- dataforge/backend.py +147 -0
- dataforge/cli.py +166 -0
- dataforge/core.py +1169 -0
- dataforge/locales/__init__.py +1 -0
- dataforge/locales/ar_SA/__init__.py +1 -0
- dataforge/locales/ar_SA/address.py +128 -0
- dataforge/locales/ar_SA/company.py +183 -0
- dataforge/locales/ar_SA/internet.py +25 -0
- dataforge/locales/ar_SA/person.py +217 -0
- dataforge/locales/ar_SA/phone.py +15 -0
- dataforge/locales/de_DE/__init__.py +1 -0
- dataforge/locales/de_DE/address.py +148 -0
- dataforge/locales/de_DE/company.py +125 -0
- dataforge/locales/de_DE/internet.py +32 -0
- dataforge/locales/de_DE/person.py +212 -0
- dataforge/locales/de_DE/phone.py +17 -0
- dataforge/locales/en_AU/__init__.py +1 -0
- dataforge/locales/en_AU/address.py +231 -0
- dataforge/locales/en_AU/company.py +193 -0
- dataforge/locales/en_AU/internet.py +34 -0
- dataforge/locales/en_AU/person.py +370 -0
- dataforge/locales/en_AU/phone.py +16 -0
- dataforge/locales/en_CA/__init__.py +1 -0
- dataforge/locales/en_CA/address.py +276 -0
- dataforge/locales/en_CA/company.py +193 -0
- dataforge/locales/en_CA/internet.py +34 -0
- dataforge/locales/en_CA/person.py +377 -0
- dataforge/locales/en_CA/phone.py +15 -0
- dataforge/locales/en_GB/__init__.py +1 -0
- dataforge/locales/en_GB/address.py +312 -0
- dataforge/locales/en_GB/company.py +196 -0
- dataforge/locales/en_GB/internet.py +34 -0
- dataforge/locales/en_GB/person.py +372 -0
- dataforge/locales/en_GB/phone.py +15 -0
- dataforge/locales/en_US/__init__.py +1 -0
- dataforge/locales/en_US/address.py +268 -0
- dataforge/locales/en_US/company.py +191 -0
- dataforge/locales/en_US/internet.py +34 -0
- dataforge/locales/en_US/person.py +370 -0
- dataforge/locales/en_US/phone.py +15 -0
- dataforge/locales/es_ES/__init__.py +1 -0
- dataforge/locales/es_ES/address.py +151 -0
- dataforge/locales/es_ES/company.py +125 -0
- dataforge/locales/es_ES/internet.py +30 -0
- dataforge/locales/es_ES/person.py +207 -0
- dataforge/locales/es_ES/phone.py +15 -0
- dataforge/locales/fr_FR/__init__.py +1 -0
- dataforge/locales/fr_FR/address.py +145 -0
- dataforge/locales/fr_FR/company.py +125 -0
- dataforge/locales/fr_FR/internet.py +30 -0
- dataforge/locales/fr_FR/person.py +212 -0
- dataforge/locales/fr_FR/phone.py +15 -0
- dataforge/locales/hi_IN/__init__.py +1 -0
- dataforge/locales/hi_IN/address.py +177 -0
- dataforge/locales/hi_IN/company.py +191 -0
- dataforge/locales/hi_IN/internet.py +26 -0
- dataforge/locales/hi_IN/person.py +218 -0
- dataforge/locales/hi_IN/phone.py +21 -0
- dataforge/locales/it_IT/__init__.py +1 -0
- dataforge/locales/it_IT/address.py +218 -0
- dataforge/locales/it_IT/company.py +151 -0
- dataforge/locales/it_IT/internet.py +31 -0
- dataforge/locales/it_IT/person.py +187 -0
- dataforge/locales/it_IT/phone.py +15 -0
- dataforge/locales/ja_JP/__init__.py +1 -0
- dataforge/locales/ja_JP/address.py +174 -0
- dataforge/locales/ja_JP/company.py +121 -0
- dataforge/locales/ja_JP/internet.py +30 -0
- dataforge/locales/ja_JP/person.py +207 -0
- dataforge/locales/ja_JP/phone.py +18 -0
- dataforge/locales/ko_KR/__init__.py +1 -0
- dataforge/locales/ko_KR/address.py +121 -0
- dataforge/locales/ko_KR/company.py +151 -0
- dataforge/locales/ko_KR/internet.py +30 -0
- dataforge/locales/ko_KR/person.py +157 -0
- dataforge/locales/ko_KR/phone.py +26 -0
- dataforge/locales/nl_NL/__init__.py +1 -0
- dataforge/locales/nl_NL/address.py +152 -0
- dataforge/locales/nl_NL/company.py +182 -0
- dataforge/locales/nl_NL/internet.py +41 -0
- dataforge/locales/nl_NL/person.py +218 -0
- dataforge/locales/nl_NL/phone.py +19 -0
- dataforge/locales/pl_PL/__init__.py +1 -0
- dataforge/locales/pl_PL/address.py +140 -0
- dataforge/locales/pl_PL/company.py +183 -0
- dataforge/locales/pl_PL/internet.py +36 -0
- dataforge/locales/pl_PL/person.py +217 -0
- dataforge/locales/pl_PL/phone.py +15 -0
- dataforge/locales/pt_BR/__init__.py +1 -0
- dataforge/locales/pt_BR/address.py +127 -0
- dataforge/locales/pt_BR/company.py +151 -0
- dataforge/locales/pt_BR/internet.py +31 -0
- dataforge/locales/pt_BR/person.py +187 -0
- dataforge/locales/pt_BR/phone.py +15 -0
- dataforge/locales/ru_RU/__init__.py +1 -0
- dataforge/locales/ru_RU/address.py +156 -0
- dataforge/locales/ru_RU/company.py +168 -0
- dataforge/locales/ru_RU/internet.py +26 -0
- dataforge/locales/ru_RU/person.py +218 -0
- dataforge/locales/ru_RU/phone.py +16 -0
- dataforge/locales/zh_CN/__init__.py +1 -0
- dataforge/locales/zh_CN/address.py +141 -0
- dataforge/locales/zh_CN/company.py +151 -0
- dataforge/locales/zh_CN/internet.py +30 -0
- dataforge/locales/zh_CN/person.py +157 -0
- dataforge/locales/zh_CN/phone.py +25 -0
- dataforge/providers/__init__.py +1 -0
- dataforge/providers/address.py +460 -0
- dataforge/providers/ai_chat.py +170 -0
- dataforge/providers/ai_prompt.py +447 -0
- dataforge/providers/automotive.py +416 -0
- dataforge/providers/barcode.py +149 -0
- dataforge/providers/base.py +34 -0
- dataforge/providers/color.py +247 -0
- dataforge/providers/company.py +144 -0
- dataforge/providers/crypto.py +105 -0
- dataforge/providers/datetime.py +397 -0
- dataforge/providers/ecommerce.py +316 -0
- dataforge/providers/education.py +234 -0
- dataforge/providers/file.py +271 -0
- dataforge/providers/finance.py +545 -0
- dataforge/providers/geo.py +332 -0
- dataforge/providers/government.py +114 -0
- dataforge/providers/internet.py +351 -0
- dataforge/providers/llm.py +726 -0
- dataforge/providers/lorem.py +241 -0
- dataforge/providers/medical.py +364 -0
- dataforge/providers/misc.py +196 -0
- dataforge/providers/network.py +283 -0
- dataforge/providers/payment.py +300 -0
- dataforge/providers/person.py +195 -0
- dataforge/providers/phone.py +87 -0
- dataforge/providers/profile.py +265 -0
- dataforge/providers/science.py +365 -0
- dataforge/providers/text.py +365 -0
- dataforge/py.typed +0 -0
- dataforge/pytest_plugin.py +80 -0
- dataforge/registry.py +164 -0
- dataforge/schema.py +772 -0
- dataforge/unique.py +171 -0
- dataforge_py-0.2.0.dist-info/METADATA +964 -0
- dataforge_py-0.2.0.dist-info/RECORD +145 -0
- dataforge_py-0.2.0.dist-info/WHEEL +4 -0
- dataforge_py-0.2.0.dist-info/entry_points.txt +35 -0
dataforge/core.py
ADDED
|
@@ -0,0 +1,1169 @@
|
|
|
1
|
+
"""DataForge — the main entry point for fake data generation.
|
|
2
|
+
|
|
3
|
+
Usage::
|
|
4
|
+
|
|
5
|
+
from dataforge import DataForge
|
|
6
|
+
|
|
7
|
+
forge = DataForge(locale="en_US", seed=42)
|
|
8
|
+
|
|
9
|
+
forge.person.first_name() # "James"
|
|
10
|
+
forge.person.full_name(count=1000) # list of 1000 full names
|
|
11
|
+
forge.address.full_address() # "4821 Oak Ave, Chicago, IL 60614"
|
|
12
|
+
forge.internet.email() # "james.smith@gmail.com"
|
|
13
|
+
forge.company.company_name() # "Acme Inc"
|
|
14
|
+
forge.phone.phone_number() # "555-123-4567"
|
|
15
|
+
forge.lorem.sentence() # "Lorem ipsum dolor sit amet."
|
|
16
|
+
forge.dt.date() # "2024-03-15"
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import importlib
|
|
20
|
+
from typing import TYPE_CHECKING, Any
|
|
21
|
+
from types import ModuleType
|
|
22
|
+
|
|
23
|
+
from dataforge.backend import RandomEngine
|
|
24
|
+
from dataforge.providers.base import BaseProvider
|
|
25
|
+
|
|
26
|
+
# ------------------------------------------------------------------
|
|
27
|
+
# Heuristic field-name mappings for ORM / model introspection
|
|
28
|
+
# ------------------------------------------------------------------
|
|
29
|
+
|
|
30
|
+
# Maps common model field names to DataForge field shorthand names.
|
|
31
|
+
# Used by schema_from_pydantic() and schema_from_sqlalchemy().
|
|
32
|
+
_FIELD_ALIASES: dict[str, str] = {
|
|
33
|
+
# Person
|
|
34
|
+
"name": "full_name",
|
|
35
|
+
"full_name": "full_name",
|
|
36
|
+
"fname": "first_name",
|
|
37
|
+
"lname": "last_name",
|
|
38
|
+
"surname": "last_name",
|
|
39
|
+
"last": "last_name",
|
|
40
|
+
"first": "first_name",
|
|
41
|
+
"given_name": "first_name",
|
|
42
|
+
"family_name": "last_name",
|
|
43
|
+
"username": "username",
|
|
44
|
+
"user_name": "username",
|
|
45
|
+
# Contact
|
|
46
|
+
"email_address": "email",
|
|
47
|
+
"mail": "email",
|
|
48
|
+
"phone": "phone_number",
|
|
49
|
+
"phone_num": "phone_number",
|
|
50
|
+
"telephone": "phone_number",
|
|
51
|
+
"cell": "cell_number",
|
|
52
|
+
"mobile": "cell_number",
|
|
53
|
+
"cell_phone": "cell_number",
|
|
54
|
+
"mobile_phone": "cell_number",
|
|
55
|
+
# Address
|
|
56
|
+
"street": "street_address",
|
|
57
|
+
"street_addr": "street_address",
|
|
58
|
+
"addr": "full_address",
|
|
59
|
+
"address": "full_address",
|
|
60
|
+
"zip": "zipcode",
|
|
61
|
+
"zip_code": "zipcode",
|
|
62
|
+
"postal_code": "zipcode",
|
|
63
|
+
"postcode": "zipcode",
|
|
64
|
+
"state_abbr": "state_abbreviation",
|
|
65
|
+
"country_name": "country",
|
|
66
|
+
# Internet
|
|
67
|
+
"url": "url",
|
|
68
|
+
"website": "url",
|
|
69
|
+
"domain": "domain_name",
|
|
70
|
+
"ip": "ipv4",
|
|
71
|
+
"ip_address": "ipv4",
|
|
72
|
+
"ipv4_address": "ipv4",
|
|
73
|
+
"ipv6_address": "ipv6",
|
|
74
|
+
"mac": "mac_address",
|
|
75
|
+
"user_agent_string": "user_agent",
|
|
76
|
+
# Company
|
|
77
|
+
"company": "company_name",
|
|
78
|
+
"company_nm": "company_name",
|
|
79
|
+
"job": "job_title",
|
|
80
|
+
"job_name": "job_title",
|
|
81
|
+
"occupation": "job_title",
|
|
82
|
+
"title": "job_title",
|
|
83
|
+
# Finance
|
|
84
|
+
"credit_card": "credit_card_number",
|
|
85
|
+
"cc_number": "credit_card_number",
|
|
86
|
+
"card_number": "credit_card_number",
|
|
87
|
+
"iban_code": "iban",
|
|
88
|
+
"currency": "currency_code",
|
|
89
|
+
# Datetime
|
|
90
|
+
"date": "date",
|
|
91
|
+
"dob": "date_of_birth",
|
|
92
|
+
"birth_date": "date_of_birth",
|
|
93
|
+
"birthday": "date_of_birth",
|
|
94
|
+
"time": "time",
|
|
95
|
+
"datetime": "datetime",
|
|
96
|
+
"created_at": "datetime",
|
|
97
|
+
"updated_at": "datetime",
|
|
98
|
+
"timestamp": "datetime",
|
|
99
|
+
# Misc
|
|
100
|
+
"uuid": "uuid4",
|
|
101
|
+
"guid": "uuid4",
|
|
102
|
+
"description": "sentence",
|
|
103
|
+
"bio": "paragraph",
|
|
104
|
+
"summary": "sentence",
|
|
105
|
+
"note": "sentence",
|
|
106
|
+
"notes": "paragraph",
|
|
107
|
+
"comment": "sentence",
|
|
108
|
+
"body": "paragraph",
|
|
109
|
+
"text": "paragraph",
|
|
110
|
+
"content": "paragraph",
|
|
111
|
+
# Color
|
|
112
|
+
"color": "color_name",
|
|
113
|
+
"colour": "color_name",
|
|
114
|
+
"hex_color": "hex_color",
|
|
115
|
+
# File
|
|
116
|
+
"filename": "file_name",
|
|
117
|
+
"file": "file_name",
|
|
118
|
+
"extension": "file_extension",
|
|
119
|
+
"mime": "mime_type",
|
|
120
|
+
"mime_type": "mime_type",
|
|
121
|
+
# Network
|
|
122
|
+
"port": "port",
|
|
123
|
+
"hostname": "hostname",
|
|
124
|
+
# Geo
|
|
125
|
+
"latitude": "latitude",
|
|
126
|
+
"lat": "latitude",
|
|
127
|
+
"longitude": "longitude",
|
|
128
|
+
"lng": "longitude",
|
|
129
|
+
"lon": "longitude",
|
|
130
|
+
# Government
|
|
131
|
+
"ssn": "ssn",
|
|
132
|
+
"tax_id": "tax_id",
|
|
133
|
+
"passport": "passport_number",
|
|
134
|
+
"passport_no": "passport_number",
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _pydantic_heuristic(field_name: str) -> str | None:
    """Translate a Pydantic field name into a DataForge field name.

    The lookup is an exact-match query against ``_FIELD_ALIASES``;
    ``None`` is returned when the name has no alias.
    """
    if field_name in _FIELD_ALIASES:
        return _FIELD_ALIASES[field_name]
    return None
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _sqlalchemy_heuristic(col_name: str, column: "Any") -> str | None:
    """Translate a SQLAlchemy column name into a DataForge field name.

    Only the column name is consulted (exact match against
    ``_FIELD_ALIASES``). ``column`` is accepted but not inspected yet;
    it is the hook where type-based heuristics could be added.

    Returns ``None`` when no alias is known.
    """
    # A missing key and an empty alias are both reported as "no match".
    return _FIELD_ALIASES.get(col_name) or None
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
if TYPE_CHECKING:
|
|
158
|
+
from dataforge.providers.address import AddressProvider
|
|
159
|
+
from dataforge.providers.automotive import AutomotiveProvider
|
|
160
|
+
from dataforge.providers.barcode import BarcodeProvider
|
|
161
|
+
from dataforge.providers.color import ColorProvider
|
|
162
|
+
from dataforge.providers.company import CompanyProvider
|
|
163
|
+
from dataforge.providers.crypto import CryptoProvider
|
|
164
|
+
from dataforge.providers.datetime import DateTimeProvider
|
|
165
|
+
from dataforge.providers.ecommerce import EcommerceProvider
|
|
166
|
+
from dataforge.providers.education import EducationProvider
|
|
167
|
+
from dataforge.providers.file import FileProvider
|
|
168
|
+
from dataforge.providers.finance import FinanceProvider
|
|
169
|
+
from dataforge.providers.geo import GeoProvider
|
|
170
|
+
from dataforge.providers.government import GovernmentProvider
|
|
171
|
+
from dataforge.providers.internet import InternetProvider
|
|
172
|
+
from dataforge.providers.lorem import LoremProvider
|
|
173
|
+
from dataforge.providers.medical import MedicalProvider
|
|
174
|
+
from dataforge.providers.misc import MiscProvider
|
|
175
|
+
from dataforge.providers.network import NetworkProvider
|
|
176
|
+
from dataforge.providers.payment import PaymentProvider
|
|
177
|
+
from dataforge.providers.person import PersonProvider
|
|
178
|
+
from dataforge.providers.phone import PhoneProvider
|
|
179
|
+
from dataforge.providers.profile import ProfileProvider
|
|
180
|
+
from dataforge.providers.science import ScienceProvider
|
|
181
|
+
from dataforge.providers.text import TextProvider
|
|
182
|
+
from dataforge.providers.ai_prompt import AiPromptProvider
|
|
183
|
+
from dataforge.providers.llm import LlmProvider
|
|
184
|
+
from dataforge.providers.ai_chat import AiChatProvider
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
class DataForge:
|
|
188
|
+
"""High-performance fake data generator.
|
|
189
|
+
|
|
190
|
+
Providers are loaded **lazily** — nothing is imported until a
|
|
191
|
+
provider property is first accessed. The provider registry
|
|
192
|
+
(:mod:`dataforge.registry`) resolves field names and provider
|
|
193
|
+
classes automatically, so new providers can be added without
|
|
194
|
+
editing this file.
|
|
195
|
+
|
|
196
|
+
Parameters
|
|
197
|
+
----------
|
|
198
|
+
locale : str
|
|
199
|
+
The locale to use for data generation (e.g. ``"en_US"``).
|
|
200
|
+
Locale data is loaded **lazily** — nothing is imported until
|
|
201
|
+
a provider property is first accessed.
|
|
202
|
+
seed : int | None
|
|
203
|
+
Optional seed for reproducible output. When set, the stdlib
|
|
204
|
+
``random`` backend is seeded for deterministic generation.
|
|
205
|
+
|
|
206
|
+
Examples
|
|
207
|
+
--------
|
|
208
|
+
>>> forge = DataForge(seed=42)
|
|
209
|
+
>>> forge.person.first_name()
|
|
210
|
+
'...'
|
|
211
|
+
>>> forge.address.city()
|
|
212
|
+
'...'
|
|
213
|
+
>>> forge.internet.email()
|
|
214
|
+
'...'
|
|
215
|
+
>>> forge.company.company_name()
|
|
216
|
+
'...'
|
|
217
|
+
>>> forge.phone.phone_number()
|
|
218
|
+
'...'
|
|
219
|
+
>>> forge.lorem.sentence()
|
|
220
|
+
'...'
|
|
221
|
+
>>> forge.dt.date()
|
|
222
|
+
'...'
|
|
223
|
+
>>> forge.finance.credit_card_number()
|
|
224
|
+
'...'
|
|
225
|
+
>>> forge.color.hex_color()
|
|
226
|
+
'...'
|
|
227
|
+
>>> forge.file.file_name()
|
|
228
|
+
'...'
|
|
229
|
+
>>> forge.network.ipv6()
|
|
230
|
+
'...'
|
|
231
|
+
>>> forge.misc.uuid4()
|
|
232
|
+
'...'
|
|
233
|
+
>>> forge.barcode.ean13()
|
|
234
|
+
'...'
|
|
235
|
+
"""
|
|
236
|
+
|
|
237
|
+
__slots__ = (
|
|
238
|
+
"_engine",
|
|
239
|
+
"_locale",
|
|
240
|
+
"_providers",
|
|
241
|
+
"_locale_cache",
|
|
242
|
+
"_unique_proxy",
|
|
243
|
+
)
|
|
244
|
+
|
|
245
|
+
def __init__(self, locale: str = "en_US", seed: int | None = None) -> None:
    """Set up an instance without importing any provider or locale data.

    ``locale`` is stored as given; ``seed`` is forwarded to
    ``RandomEngine``.
    """
    # Caches start empty and are filled on first use.
    self._providers: dict[str, BaseProvider] = {}
    self._locale_cache: dict[str, ModuleType] = {}
    self._unique_proxy: Any = None
    self._locale = locale
    self._engine = RandomEngine(seed=seed)
|
|
251
|
+
|
|
252
|
+
# ------------------------------------------------------------------
|
|
253
|
+
# Dynamic provider access via registry
|
|
254
|
+
# ------------------------------------------------------------------
|
|
255
|
+
|
|
256
|
+
def _get_provider(self, name: str) -> BaseProvider:
    """Return the provider registered as *name*, creating it on first use.

    A provider that is already in ``_providers`` is returned directly.
    Otherwise the registry entry ``(class, locale_modules)`` decides how
    the class is constructed:

    * ``_needs_forge`` truthy: ``cls(engine, self)``
    * non-empty ``locale_modules``: ``cls(engine, *loaded_locale_modules)``
    * anything else: ``cls(engine)``

    Raises
    ------
    AttributeError
        If the registry has no entry for *name*.
    """
    cached = self._providers.get(name)
    if cached is not None:
        return cached

    # Imported here so the registry is only loaded when a provider is built.
    from dataforge.registry import get_provider_info

    registry = get_provider_info()
    if name not in registry:
        available = ", ".join(sorted(registry))
        raise AttributeError(
            f"DataForge has no provider '{name}'. " f"Available: {available}"
        )

    provider_cls, locale_modules = registry[name]
    if getattr(provider_cls, "_needs_forge", False):
        # Compound providers receive the DataForge instance itself.
        extra_args = (self,)
    elif locale_modules:
        extra_args = tuple(
            self._load_locale_module(module) for module in locale_modules
        )
    else:
        extra_args = ()

    instance = provider_cls(self._engine, *extra_args)
    self._providers[name] = instance
    return instance
|
|
289
|
+
|
|
290
|
+
# ------------------------------------------------------------------
|
|
291
|
+
# Explicit provider properties (for IDE autocomplete + type safety)
|
|
292
|
+
# These delegate to _get_provider() which uses the registry.
|
|
293
|
+
# ------------------------------------------------------------------
|
|
294
|
+
|
|
295
|
+
@property
|
|
296
|
+
def person(self) -> "PersonProvider":
|
|
297
|
+
"""Access the person data provider (names, prefixes, suffixes)."""
|
|
298
|
+
return self._get_provider("person") # type: ignore[return-value]
|
|
299
|
+
|
|
300
|
+
@property
|
|
301
|
+
def address(self) -> "AddressProvider":
|
|
302
|
+
"""Access the address data provider (streets, cities, zip codes)."""
|
|
303
|
+
return self._get_provider("address") # type: ignore[return-value]
|
|
304
|
+
|
|
305
|
+
@property
|
|
306
|
+
def internet(self) -> "InternetProvider":
|
|
307
|
+
"""Access the internet data provider (emails, usernames, domains, IPs)."""
|
|
308
|
+
return self._get_provider("internet") # type: ignore[return-value]
|
|
309
|
+
|
|
310
|
+
@property
|
|
311
|
+
def company(self) -> "CompanyProvider":
|
|
312
|
+
"""Access the company data provider (names, catch phrases, job titles)."""
|
|
313
|
+
return self._get_provider("company") # type: ignore[return-value]
|
|
314
|
+
|
|
315
|
+
@property
|
|
316
|
+
def phone(self) -> "PhoneProvider":
|
|
317
|
+
"""Access the phone data provider (phone numbers, cell numbers)."""
|
|
318
|
+
return self._get_provider("phone") # type: ignore[return-value]
|
|
319
|
+
|
|
320
|
+
@property
|
|
321
|
+
def lorem(self) -> "LoremProvider":
|
|
322
|
+
"""Access the Lorem Ipsum text provider (words, sentences, paragraphs)."""
|
|
323
|
+
return self._get_provider("lorem") # type: ignore[return-value]
|
|
324
|
+
|
|
325
|
+
@property
|
|
326
|
+
def dt(self) -> "DateTimeProvider":
|
|
327
|
+
"""Access the datetime provider (dates, times, datetimes)."""
|
|
328
|
+
return self._get_provider("dt") # type: ignore[return-value]
|
|
329
|
+
|
|
330
|
+
@property
|
|
331
|
+
def finance(self) -> "FinanceProvider":
|
|
332
|
+
"""Access the finance provider (credit cards, IBANs, currencies)."""
|
|
333
|
+
return self._get_provider("finance") # type: ignore[return-value]
|
|
334
|
+
|
|
335
|
+
@property
|
|
336
|
+
def color(self) -> "ColorProvider":
|
|
337
|
+
"""Access the color provider (hex, RGB, HSL, color names)."""
|
|
338
|
+
return self._get_provider("color") # type: ignore[return-value]
|
|
339
|
+
|
|
340
|
+
@property
|
|
341
|
+
def file(self) -> "FileProvider":
|
|
342
|
+
"""Access the file provider (file names, extensions, MIME types, paths)."""
|
|
343
|
+
return self._get_provider("file") # type: ignore[return-value]
|
|
344
|
+
|
|
345
|
+
@property
|
|
346
|
+
def network(self) -> "NetworkProvider":
|
|
347
|
+
"""Access the network provider (IPv6, MAC, port, hostname, user agent)."""
|
|
348
|
+
return self._get_provider("network") # type: ignore[return-value]
|
|
349
|
+
|
|
350
|
+
@property
|
|
351
|
+
def misc(self) -> "MiscProvider":
|
|
352
|
+
"""Access the misc provider (UUID4, boolean, random_element, null_or)."""
|
|
353
|
+
return self._get_provider("misc") # type: ignore[return-value]
|
|
354
|
+
|
|
355
|
+
@property
|
|
356
|
+
def barcode(self) -> "BarcodeProvider":
|
|
357
|
+
"""Access the barcode provider (EAN-13, EAN-8, ISBN-13, ISBN-10)."""
|
|
358
|
+
return self._get_provider("barcode") # type: ignore[return-value]
|
|
359
|
+
|
|
360
|
+
@property
|
|
361
|
+
def crypto(self) -> "CryptoProvider":
|
|
362
|
+
"""Access the crypto provider (MD5, SHA-1, SHA-256 hex strings)."""
|
|
363
|
+
return self._get_provider("crypto") # type: ignore[return-value]
|
|
364
|
+
|
|
365
|
+
@property
|
|
366
|
+
def automotive(self) -> "AutomotiveProvider":
|
|
367
|
+
"""Access the automotive provider (plates, VINs, makes, models)."""
|
|
368
|
+
return self._get_provider("automotive") # type: ignore[return-value]
|
|
369
|
+
|
|
370
|
+
@property
|
|
371
|
+
def education(self) -> "EducationProvider":
|
|
372
|
+
"""Access the education provider (universities, degrees, fields)."""
|
|
373
|
+
return self._get_provider("education") # type: ignore[return-value]
|
|
374
|
+
|
|
375
|
+
@property
|
|
376
|
+
def profile(self) -> "ProfileProvider":
|
|
377
|
+
"""Access the profile provider (coherent user profiles)."""
|
|
378
|
+
return self._get_provider("profile") # type: ignore[return-value]
|
|
379
|
+
|
|
380
|
+
@property
|
|
381
|
+
def government(self) -> "GovernmentProvider":
|
|
382
|
+
"""Access the government provider (SSN, tax ID, passports)."""
|
|
383
|
+
return self._get_provider("government") # type: ignore[return-value]
|
|
384
|
+
|
|
385
|
+
@property
|
|
386
|
+
def ecommerce(self) -> "EcommerceProvider":
|
|
387
|
+
"""Access the e-commerce provider (products, SKUs, orders)."""
|
|
388
|
+
return self._get_provider("ecommerce") # type: ignore[return-value]
|
|
389
|
+
|
|
390
|
+
@property
|
|
391
|
+
def medical(self) -> "MedicalProvider":
|
|
392
|
+
"""Access the medical provider (ICD-10, drugs, blood types)."""
|
|
393
|
+
return self._get_provider("medical") # type: ignore[return-value]
|
|
394
|
+
|
|
395
|
+
@property
|
|
396
|
+
def payment(self) -> "PaymentProvider":
|
|
397
|
+
"""Access the payment provider (card types, processors, transactions)."""
|
|
398
|
+
return self._get_provider("payment") # type: ignore[return-value]
|
|
399
|
+
|
|
400
|
+
@property
|
|
401
|
+
def text(self) -> "TextProvider":
|
|
402
|
+
"""Access the text provider (quotes, headlines, paragraphs)."""
|
|
403
|
+
return self._get_provider("text") # type: ignore[return-value]
|
|
404
|
+
|
|
405
|
+
@property
|
|
406
|
+
def geo(self) -> "GeoProvider":
|
|
407
|
+
"""Access the geo provider (continents, oceans, rivers, coordinates)."""
|
|
408
|
+
return self._get_provider("geo") # type: ignore[return-value]
|
|
409
|
+
|
|
410
|
+
@property
|
|
411
|
+
def science(self) -> "ScienceProvider":
|
|
412
|
+
"""Access the science provider (elements, planets, units)."""
|
|
413
|
+
return self._get_provider("science") # type: ignore[return-value]
|
|
414
|
+
|
|
415
|
+
@property
|
|
416
|
+
def ai_prompt(self) -> "AiPromptProvider":
|
|
417
|
+
"""Access the AI prompt provider (user/system/creative prompts)."""
|
|
418
|
+
return self._get_provider("ai_prompt") # type: ignore[return-value]
|
|
419
|
+
|
|
420
|
+
@property
|
|
421
|
+
def llm(self) -> "LlmProvider":
|
|
422
|
+
"""Access the LLM provider (models, agents, RAG, moderation, billing)."""
|
|
423
|
+
return self._get_provider("llm") # type: ignore[return-value]
|
|
424
|
+
|
|
425
|
+
@property
|
|
426
|
+
def ai_chat(self) -> "AiChatProvider":
|
|
427
|
+
"""Access the AI chat provider (conversation turns, messages)."""
|
|
428
|
+
return self._get_provider("ai_chat") # type: ignore[return-value]
|
|
429
|
+
|
|
430
|
+
# ------------------------------------------------------------------
|
|
431
|
+
# Unique value generation
|
|
432
|
+
# ------------------------------------------------------------------
|
|
433
|
+
|
|
434
|
+
@property
|
|
435
|
+
def unique(self) -> "Any":
|
|
436
|
+
"""Access the unique-value proxy.
|
|
437
|
+
|
|
438
|
+
Returns a proxy that ensures every value returned by a
|
|
439
|
+
provider method is unique within the proxy's lifetime.
|
|
440
|
+
Call ``forge.unique.clear()`` to reset tracking.
|
|
441
|
+
|
|
442
|
+
Examples
|
|
443
|
+
--------
|
|
444
|
+
>>> forge = DataForge(seed=42)
|
|
445
|
+
>>> a = forge.unique.person.first_name()
|
|
446
|
+
>>> b = forge.unique.person.first_name()
|
|
447
|
+
>>> a != b
|
|
448
|
+
True
|
|
449
|
+
"""
|
|
450
|
+
if self._unique_proxy is None:
|
|
451
|
+
from dataforge.unique import UniqueProxy
|
|
452
|
+
|
|
453
|
+
self._unique_proxy = UniqueProxy(self)
|
|
454
|
+
return self._unique_proxy
|
|
455
|
+
|
|
456
|
+
# ------------------------------------------------------------------
|
|
457
|
+
# Provider registration
|
|
458
|
+
# ------------------------------------------------------------------
|
|
459
|
+
|
|
460
|
+
def register_provider(
    self,
    provider_cls: type[BaseProvider],
    name: str | None = None,
) -> None:
    """Register a custom provider class at runtime.

    The class is instantiated immediately, stored in this instance's
    provider cache (so it is reachable via ``getattr``), and then
    handed to ``dataforge.registry.register_runtime_provider`` so that
    its field mappings can be found by Schema/to_dict.

    Parameters
    ----------
    provider_cls : type[BaseProvider]
        The provider class to register. Must be a
        ``BaseProvider`` subclass with ``_provider_name``.
    name : str | None
        Override the provider name. Defaults to the class's
        ``_provider_name`` attribute.

    Raises
    ------
    ValueError
        If neither *name* nor ``provider_cls._provider_name`` yields a
        non-empty name.

    Examples
    --------
    >>> from dataforge.providers.base import BaseProvider
    >>> from dataforge.backend import RandomEngine
    >>> class MyProvider(BaseProvider):
    ...     _provider_name = "my"
    ...     _field_map = {"greeting": "greeting"}
    ...     def greeting(self, count=1):
    ...         return "hello" if count == 1 else ["hello"] * count
    >>> forge = DataForge()
    >>> forge.register_provider(MyProvider)
    >>> forge.my.greeting()
    'hello'
    """
    resolved_name = name if name else getattr(provider_cls, "_provider_name", "")
    if not resolved_name:
        raise ValueError(
            f"{provider_cls.__name__} does not define '_provider_name'."
        )

    needed_modules = getattr(provider_cls, "_locale_modules", ())

    # Work out the arguments that follow the engine in the constructor call.
    if getattr(provider_cls, "_needs_forge", False):
        extra_args = (self,)
    elif needed_modules:
        extra_args = tuple(
            self._load_locale_module(module) for module in needed_modules
        )
    else:
        extra_args = ()

    self._providers[resolved_name] = provider_cls(  # type: ignore[call-arg]
        self._engine, *extra_args
    )

    # Register field mappings so Schema/to_dict can find them
    from dataforge.registry import register_runtime_provider

    register_runtime_provider(resolved_name, provider_cls, needed_modules)
|
|
512
|
+
|
|
513
|
+
def __getattr__(self, name: str) -> Any:
|
|
514
|
+
"""Dynamic attribute access for registered providers.
|
|
515
|
+
|
|
516
|
+
Allows ``forge.my_provider`` to work for providers
|
|
517
|
+
registered via :meth:`register_provider` at runtime,
|
|
518
|
+
without requiring a ``@property`` on the class.
|
|
519
|
+
"""
|
|
520
|
+
# Check if it's a cached provider
|
|
521
|
+
providers = object.__getattribute__(self, "_providers")
|
|
522
|
+
if name in providers:
|
|
523
|
+
return providers[name]
|
|
524
|
+
# Try registry lookup
|
|
525
|
+
try:
|
|
526
|
+
return self._get_provider(name)
|
|
527
|
+
except AttributeError:
|
|
528
|
+
raise AttributeError(
|
|
529
|
+
f"'{type(self).__name__}' object has no attribute '{name}'"
|
|
530
|
+
) from None
|
|
531
|
+
|
|
532
|
+
# ------------------------------------------------------------------
|
|
533
|
+
# Seed control
|
|
534
|
+
# ------------------------------------------------------------------
|
|
535
|
+
|
|
536
|
+
def seed(self, value: int) -> None:
|
|
537
|
+
"""Re-seed the random engine for reproducible output.
|
|
538
|
+
|
|
539
|
+
This resets the internal state of the stdlib ``random`` backend.
|
|
540
|
+
"""
|
|
541
|
+
self._engine.seed(value)
|
|
542
|
+
|
|
543
|
+
def copy(self, seed: int | None = None) -> "DataForge":
    """Build a fresh ``DataForge`` that uses this instance's locale.

    Only the locale is carried over; the new instance is constructed
    through ``DataForge(...)`` and therefore starts with its own
    engine and empty caches.

    Parameters
    ----------
    seed : int | None
        Optional seed for the new instance. If ``None``, the new
        instance is unseeded (non-deterministic).

    Returns
    -------
    DataForge
    """
    clone = DataForge(seed=seed, locale=self._locale)
    return clone
|
|
557
|
+
|
|
558
|
+
# ------------------------------------------------------------------
|
|
559
|
+
# Schema API
|
|
560
|
+
# ------------------------------------------------------------------
|
|
561
|
+
|
|
562
|
+
def schema(self, fields: "list[str] | dict[str, Any]") -> "Any":
    """Build a pre-resolved :class:`Schema` bound to this instance.

    Parameters
    ----------
    fields : list[str] | dict[str, str | Callable]
        Fields to generate. String values are resolved to provider
        methods. Callable values receive the current row dict and
        can reference previously generated columns.

    Returns
    -------
    Schema

    Examples
    --------
    >>> forge = DataForge(seed=42)
    >>> s = forge.schema(["first_name", "email"])
    >>> rows = s.generate(count=1000)
    """
    # Deferred import: the schema module is only loaded when requested.
    from dataforge.schema import Schema as schema_cls

    built = schema_cls(self, fields)
    return built
|
|
585
|
+
|
|
586
|
+
# ------------------------------------------------------------------
|
|
587
|
+
# Locale management
|
|
588
|
+
# ------------------------------------------------------------------
|
|
589
|
+
|
|
590
|
+
@property
|
|
591
|
+
def locale(self) -> str:
|
|
592
|
+
"""The currently active locale string (e.g. ``"en_US"``)."""
|
|
593
|
+
return self._locale
|
|
594
|
+
|
|
595
|
+
# ------------------------------------------------------------------
|
|
596
|
+
# Internal helpers
|
|
597
|
+
# ------------------------------------------------------------------
|
|
598
|
+
|
|
599
|
+
def _load_locale_module(self, module_name: str) -> ModuleType:
    """Import a data module for the active locale, caching the result.

    The module ``dataforge.locales.<locale>.<module_name>`` is imported
    on first use and stored in ``self._locale_cache`` under the key
    ``"<locale>.<module_name>"``; later calls return the cached module.

    If the active locale does not ship the requested module, a
    ``UserWarning`` is emitted and the ``en_US`` module is used (and
    cached under the active locale's key) instead.

    Parameters
    ----------
    module_name : str
        The name of the submodule inside the locale package
        (e.g. ``"person"``, ``"address"``).

    Returns
    -------
    types.ModuleType
        The locale module, or its ``en_US`` fallback.

    Raises
    ------
    ValueError
        If the active locale is ``en_US`` and it has no such module.
    ModuleNotFoundError
        If the ``en_US`` fallback cannot be imported either, or if the
        locale module exists but one of *its own* imports is missing.
    """
    key = f"{self._locale}.{module_name}"
    if key in self._locale_cache:
        return self._locale_cache[key]

    target = f"dataforge.locales.{self._locale}.{module_name}"
    try:
        mod = importlib.import_module(target)
    except ModuleNotFoundError as exc:
        # ``exc.name`` is the module that could not be found.  Only fall
        # back when that is the requested module or one of its parent
        # packages; a missing dependency *inside* an existing locale
        # module is a genuine error and must not be masked by a
        # misleading "locale does not have module" warning.
        missing = exc.name
        if (
            missing is not None
            and target != missing
            and not target.startswith(missing + ".")
        ):
            raise

        if self._locale == "en_US":
            raise ValueError(
                f"Locale 'en_US' does not have a '{module_name}' data module."
            )
        import warnings

        warnings.warn(
            f"Locale '{self._locale}' does not have a '{module_name}' "
            f"data module — falling back to 'en_US'.",
            UserWarning,
            stacklevel=3,
        )
        mod = importlib.import_module(f"dataforge.locales.en_US.{module_name}")

    self._locale_cache[key] = mod
    return mod
|
|
636
|
+
|
|
637
|
+
def _resolve_field(self, field: str) -> tuple[str, str]:
    """Translate a field spec into a ``(provider_attr, method_name)`` pair.

    A dotted spec such as ``"person.first_name"`` is split at its first
    dot. Anything else is looked up as a shorthand (e.g.
    ``"first_name"``) in the registry's field map.

    Raises
    ------
    ValueError
        If *field* has no dot and is not a known shorthand.
    """
    if "." in field:
        # Only the first dot separates provider from method.
        provider_attr, _, method_name = field.partition(".")
        return provider_attr, method_name

    from dataforge.registry import get_field_map

    shorthands = get_field_map()
    if field not in shorthands:
        raise ValueError(
            f"Unknown field '{field}'. Use dotted notation "
            f"(e.g. 'person.first_name') or a known shorthand."
        )
    return shorthands[field]
|
|
657
|
+
|
|
658
|
+
# ------------------------------------------------------------------
|
|
659
|
+
# Bulk data generation
|
|
660
|
+
# ------------------------------------------------------------------
|
|
661
|
+
|
|
662
|
+
def to_dict(
    self,
    fields: list[str] | dict[str, str],
    count: int = 10,
) -> list[dict[str, str]]:
    """Produce *count* rows of fake data, each row a ``dict``.

    Generation is column-first: every field is resolved to a provider
    method up front, each method is then asked for a whole column
    (``method(count=N)``, or a plain ``method()`` when ``count == 1``),
    and finally the columns are zipped into row dicts. Values that are
    not already strings are converted with ``str()``.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Either a list of field names (e.g. ``["first_name", "email"]``),
        in which case the field name doubles as the column name, or a
        dict mapping output column names to field names
        (e.g. ``{"Name": "full_name"}``).
    count : int
        Number of rows to generate. ``0`` returns an empty list without
        resolving any field.

    Returns
    -------
    list[dict[str, str]]
        One dict per row, mapping column name → generated value.

    Examples
    --------
    >>> forge = DataForge(seed=42)
    >>> rows = forge.to_dict(["first_name", "email"], count=3)
    >>> len(rows)
    3
    """
    if count == 0:
        return []

    # (output column, field spec) pairs, in caller order.
    if isinstance(fields, list):
        pairs = [(name, name) for name in fields]
    else:
        pairs = list(fields.items())

    # Resolve every field before generating anything, so an unknown
    # field fails before any provider method is called.
    headers: list[str] = []
    generators: list[object] = []
    for header, spec in pairs:
        provider_attr, method_name = self._resolve_field(spec)
        generators.append(getattr(getattr(self, provider_attr), method_name))
        headers.append(header)

    single_row = count == 1
    table: list[list[str]] = []
    for produce in generators:
        if single_row:
            # Scalar path: call without ``count`` and wrap the value.
            value = produce()  # type: ignore[operator]
            table.append([value if isinstance(value, str) else str(value)])
            continue

        batch = produce(count=count)  # type: ignore[operator]
        # A batch whose first item is a str is used as-is; anything else
        # (including an empty batch) is stringified item by item.
        if batch and isinstance(batch[0], str):
            table.append(batch)  # type: ignore[arg-type]
        else:
            table.append([str(item) for item in batch])

    keys = tuple(headers)
    return [dict(zip(keys, cells)) for cells in zip(*table)]
|
|
730
|
+
|
|
731
|
+
def to_csv(
    self,
    fields: list[str] | dict[str, str],
    count: int = 10,
    path: str | None = None,
) -> str:
    """Render generated rows as CSV text, optionally writing a file.

    Thin convenience wrapper: builds a schema from *fields* and
    forwards to :meth:`Schema.to_csv`.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate (same format as :meth:`to_dict`).
    count : int
        Number of rows.
    path : str | None
        File path to write the CSV to; when ``None`` the CSV is only
        returned.

    Returns
    -------
    str
        The CSV content as a string.
    """
    compiled = self.schema(fields)
    return compiled.to_csv(count=count, path=path)
|
|
757
|
+
|
|
758
|
+
def to_jsonl(
    self,
    fields: list[str] | dict[str, str],
    count: int = 10,
    path: str | None = None,
) -> str:
    """Render generated rows as JSON Lines text, optionally writing a file.

    Thin convenience wrapper: builds a schema from *fields* and
    forwards to :meth:`Schema.to_jsonl`.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate (same format as :meth:`to_dict`).
    count : int
        Number of rows.
    path : str | None
        File path to write the JSONL to, if given.

    Returns
    -------
    str
        The JSONL content as a string.
    """
    compiled = self.schema(fields)
    return compiled.to_jsonl(count=count, path=path)
|
|
783
|
+
|
|
784
|
+
def to_sql(
    self,
    fields: list[str] | dict[str, str],
    table: str,
    count: int = 10,
    dialect: str = "sqlite",
    path: str | None = None,
) -> str:
    """Render generated rows as SQL ``INSERT`` statements.

    Thin convenience wrapper: builds a schema from *fields* and
    forwards to :meth:`Schema.to_sql`.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate (same format as :meth:`to_dict`).
    table : str
        Target table name.
    count : int
        Number of rows.
    dialect : str
        SQL dialect: ``"sqlite"``, ``"mysql"``, or ``"postgresql"``.
    path : str | None
        File path to write the SQL to, if given.

    Returns
    -------
    str
        SQL INSERT statements as a string.
    """
    compiled = self.schema(fields)
    return compiled.to_sql(table=table, count=count, dialect=dialect, path=path)
|
|
817
|
+
|
|
818
|
+
def to_dataframe(
    self,
    fields: list[str] | dict[str, str],
    count: int = 10,
) -> "Any":
    """Return generated rows as a pandas DataFrame.

    Thin convenience wrapper: builds a schema from *fields* and
    forwards to :meth:`Schema.to_dataframe`. Requires ``pandas`` to be
    installed.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate (same format as :meth:`to_dict`).
    count : int
        Number of rows.

    Returns
    -------
    pandas.DataFrame
        A DataFrame with one column per field.
    """
    compiled = self.schema(fields)
    return compiled.to_dataframe(count=count)
|
|
841
|
+
|
|
842
|
+
def stream_to_csv(
    self,
    fields: list[str] | dict[str, str],
    path: str,
    count: int = 10,
    batch_size: int | None = None,
) -> int:
    """Write generated rows to a CSV file in batches.

    Builds a schema from *fields* and forwards to
    ``Schema.stream_to_csv``, which writes batch by batch instead of
    holding every row in memory at once.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate.
    path : str
        File path to write.
    count : int
        Number of rows.
    batch_size : int | None
        Rows per batch. Auto-tuned when ``None``.

    Returns
    -------
    int
        Number of rows written.
    """
    compiled = self.schema(fields)
    return compiled.stream_to_csv(path=path, count=count, batch_size=batch_size)
|
|
873
|
+
|
|
874
|
+
def stream_to_jsonl(
    self,
    fields: list[str] | dict[str, str],
    path: str,
    count: int = 10,
    batch_size: int | None = None,
) -> int:
    """Write generated rows to a JSON Lines file in batches.

    Builds a schema from *fields* and forwards to
    ``Schema.stream_to_jsonl``, which writes batch by batch instead of
    holding every row in memory at once.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate.
    path : str
        File path to write.
    count : int
        Number of rows.
    batch_size : int | None
        Rows per batch. Auto-tuned when ``None``.

    Returns
    -------
    int
        Number of rows written.
    """
    compiled = self.schema(fields)
    return compiled.stream_to_jsonl(path=path, count=count, batch_size=batch_size)
|
|
905
|
+
|
|
906
|
+
def to_arrow(
    self,
    fields: list[str] | dict[str, str],
    count: int = 10,
    batch_size: int | None = None,
) -> "Any":
    """Return generated rows as a PyArrow Table.

    Thin convenience wrapper: builds a schema from *fields* and
    forwards to :meth:`Schema.to_arrow`. Requires ``pyarrow`` to be
    installed.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate (same format as :meth:`to_dict`).
    count : int
        Number of rows.
    batch_size : int | None
        Rows per internal batch. Auto-tuned when ``None``.

    Returns
    -------
    pyarrow.Table
    """
    compiled = self.schema(fields)
    return compiled.to_arrow(count=count, batch_size=batch_size)
|
|
931
|
+
|
|
932
|
+
def to_polars(
    self,
    fields: list[str] | dict[str, str],
    count: int = 10,
    batch_size: int | None = None,
) -> "Any":
    """Return generated rows as a Polars DataFrame.

    Thin convenience wrapper: builds a schema from *fields* and
    forwards to :meth:`Schema.to_polars`. Requires ``polars`` to be
    installed.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate (same format as :meth:`to_dict`).
    count : int
        Number of rows.
    batch_size : int | None
        Rows per internal batch. Auto-tuned when ``None``.

    Returns
    -------
    polars.DataFrame
    """
    compiled = self.schema(fields)
    return compiled.to_polars(count=count, batch_size=batch_size)
|
|
957
|
+
|
|
958
|
+
def to_parquet(
    self,
    fields: list[str] | dict[str, str],
    path: str,
    count: int = 10,
    batch_size: int | None = None,
) -> int:
    """Write generated rows to a Parquet file.

    Builds a schema from *fields* and forwards to
    ``Schema.to_parquet``. Requires ``pyarrow`` to be installed; the
    data is written in batched row-groups to keep memory bounded.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate.
    path : str
        File path to write.
    count : int
        Number of rows.
    batch_size : int | None
        Rows per row-group. Auto-tuned when ``None``.

    Returns
    -------
    int
        Number of rows written.
    """
    compiled = self.schema(fields)
    return compiled.to_parquet(path=path, count=count, batch_size=batch_size)
|
|
989
|
+
|
|
990
|
+
def __repr__(self) -> str:
    """Return a constructor-style summary showing the active locale."""
    locale_repr = repr(self._locale)
    return f"DataForge(locale={locale_repr})"
|
|
992
|
+
|
|
993
|
+
# ------------------------------------------------------------------
|
|
994
|
+
# Schema factories from ORM / model introspection
|
|
995
|
+
# ------------------------------------------------------------------
|
|
996
|
+
|
|
997
|
+
def schema_from_pydantic(self, model: type) -> "Any":
    """Derive a :class:`Schema` from the fields of a Pydantic model.

    Each model field is matched against the DataForge field registry:

    * a field whose name is itself a registered DataForge field
      (e.g. ``first_name``, ``email``, ``city``) maps to that field;
    * otherwise ``_pydantic_heuristic`` is asked for an alias, which is
      used if it names a registered field;
    * anything still unmatched is left out of the schema and reported
      with a ``UserWarning``.

    Requires ``pydantic`` to be installed.

    Parameters
    ----------
    model : type
        A Pydantic ``BaseModel`` subclass.

    Returns
    -------
    Schema

    Raises
    ------
    ModuleNotFoundError
        If ``pydantic`` is not installed.
    TypeError
        If *model* is not a ``BaseModel`` subclass, or exposes neither
        ``model_fields`` nor ``__fields__``.
    ValueError
        If not a single field could be mapped.

    Examples
    --------
    >>> from pydantic import BaseModel
    >>> class User(BaseModel):
    ...     first_name: str
    ...     email: str
    ...     city: str
    >>> forge = DataForge(seed=42)
    >>> s = forge.schema_from_pydantic(User)
    >>> rows = s.generate(count=5)
    """
    from dataforge.schema import Schema

    try:
        from pydantic import BaseModel  # noqa: F811
    except ModuleNotFoundError as exc:
        raise ModuleNotFoundError(
            "pydantic is required for schema_from_pydantic(). "
            "Install it with: pip install pydantic"
        ) from exc

    is_model_class = isinstance(model, type) and issubclass(model, BaseModel)
    if not is_model_class:
        raise TypeError(f"Expected a Pydantic BaseModel subclass, got {model!r}")

    from dataforge.registry import get_field_map

    registry = get_field_map()
    resolved: dict[str, str] = {}

    # Pydantic v2 exposes ``model_fields``; v1 exposed ``__fields__``.
    if hasattr(model, "model_fields"):
        declared = model.model_fields
    elif hasattr(model, "__fields__"):
        declared = model.__fields__
    else:
        raise TypeError(
            f"Cannot introspect fields from {model.__name__}. "
            "Ensure it is a valid Pydantic BaseModel."
        )

    import warnings

    for name in declared:
        # Exact registry hit: the model field is a DataForge field.
        if name in registry:
            resolved[name] = name
            continue

        # Otherwise try the alias heuristic.
        guess = _pydantic_heuristic(name)
        if guess and guess in registry:
            resolved[name] = guess
            continue

        warnings.warn(
            f"Pydantic field '{name}' on {model.__name__} "
            f"could not be mapped to a DataForge field — skipping.",
            UserWarning,
            stacklevel=2,
        )

    if not resolved:
        raise ValueError(
            f"No fields on {model.__name__} could be mapped to "
            f"DataForge fields. Ensure the model uses recognisable "
            f"field names (e.g. 'first_name', 'email', 'city')."
        )

    return Schema(self, resolved)
|
|
1085
|
+
|
|
1086
|
+
def schema_from_sqlalchemy(self, model: type) -> "Any":
    """Derive a :class:`Schema` from the columns of a SQLAlchemy model.

    Each column of ``model.__table__`` is matched against the DataForge
    field registry:

    * a primary-key column named ``id`` is always left out;
    * a column whose name is itself a registered DataForge field maps
      to that field;
    * otherwise ``_sqlalchemy_heuristic`` is asked for an alias, which
      is used if it names a registered field;
    * anything still unmatched is left out of the schema and reported
      with a ``UserWarning``.

    Requires ``sqlalchemy`` to be installed.

    Parameters
    ----------
    model : type
        A SQLAlchemy declarative model class (must have a
        ``__table__`` attribute).

    Returns
    -------
    Schema

    Raises
    ------
    ModuleNotFoundError
        If ``sqlalchemy`` is not installed.
    TypeError
        If *model* has no ``__table__`` attribute.
    ValueError
        If not a single column could be mapped.

    Examples
    --------
    >>> from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
    >>> class Base(DeclarativeBase): pass
    >>> class User(Base):
    ...     __tablename__ = "users"
    ...     id: Mapped[int] = mapped_column(primary_key=True)
    ...     first_name: Mapped[str]
    ...     email: Mapped[str]
    >>> forge = DataForge(seed=42)
    >>> s = forge.schema_from_sqlalchemy(User)
    >>> rows = s.generate(count=5)
    """
    from dataforge.schema import Schema

    # Imported only to fail early with an actionable message.
    try:
        import sqlalchemy  # noqa: F401
    except ModuleNotFoundError as exc:
        raise ModuleNotFoundError(
            "sqlalchemy is required for schema_from_sqlalchemy(). "
            "Install it with: pip install sqlalchemy"
        ) from exc

    if not hasattr(model, "__table__"):
        raise TypeError(
            f"Expected a SQLAlchemy declarative model with __table__, got {model!r}"
        )

    from dataforge.registry import get_field_map

    registry = get_field_map()
    resolved: dict[str, str] = {}

    import warnings

    for column in model.__table__.columns:
        name = column.name

        # Skip primary key 'id' columns — not fake-able.
        if name == "id" and column.primary_key:
            continue

        # Exact registry hit: the column name is a DataForge field.
        if name in registry:
            resolved[name] = name
            continue

        # Otherwise try the alias heuristic.
        guess = _sqlalchemy_heuristic(name, column)
        if guess and guess in registry:
            resolved[name] = guess
            continue

        warnings.warn(
            f"SQLAlchemy column '{name}' on "
            f"{model.__name__} could not be mapped to a "
            f"DataForge field — skipping.",
            UserWarning,
            stacklevel=2,
        )

    if not resolved:
        raise ValueError(
            f"No columns on {model.__name__} could be mapped to "
            f"DataForge fields. Ensure the model uses recognisable "
            f"column names (e.g. 'first_name', 'email', 'city')."
        )

    return Schema(self, resolved)
|