dataforge-py 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145) hide show
  1. dataforge/__init__.py +20 -0
  2. dataforge/backend.py +147 -0
  3. dataforge/cli.py +166 -0
  4. dataforge/core.py +1169 -0
  5. dataforge/locales/__init__.py +1 -0
  6. dataforge/locales/ar_SA/__init__.py +1 -0
  7. dataforge/locales/ar_SA/address.py +128 -0
  8. dataforge/locales/ar_SA/company.py +183 -0
  9. dataforge/locales/ar_SA/internet.py +25 -0
  10. dataforge/locales/ar_SA/person.py +217 -0
  11. dataforge/locales/ar_SA/phone.py +15 -0
  12. dataforge/locales/de_DE/__init__.py +1 -0
  13. dataforge/locales/de_DE/address.py +148 -0
  14. dataforge/locales/de_DE/company.py +125 -0
  15. dataforge/locales/de_DE/internet.py +32 -0
  16. dataforge/locales/de_DE/person.py +212 -0
  17. dataforge/locales/de_DE/phone.py +17 -0
  18. dataforge/locales/en_AU/__init__.py +1 -0
  19. dataforge/locales/en_AU/address.py +231 -0
  20. dataforge/locales/en_AU/company.py +193 -0
  21. dataforge/locales/en_AU/internet.py +34 -0
  22. dataforge/locales/en_AU/person.py +370 -0
  23. dataforge/locales/en_AU/phone.py +16 -0
  24. dataforge/locales/en_CA/__init__.py +1 -0
  25. dataforge/locales/en_CA/address.py +276 -0
  26. dataforge/locales/en_CA/company.py +193 -0
  27. dataforge/locales/en_CA/internet.py +34 -0
  28. dataforge/locales/en_CA/person.py +377 -0
  29. dataforge/locales/en_CA/phone.py +15 -0
  30. dataforge/locales/en_GB/__init__.py +1 -0
  31. dataforge/locales/en_GB/address.py +312 -0
  32. dataforge/locales/en_GB/company.py +196 -0
  33. dataforge/locales/en_GB/internet.py +34 -0
  34. dataforge/locales/en_GB/person.py +372 -0
  35. dataforge/locales/en_GB/phone.py +15 -0
  36. dataforge/locales/en_US/__init__.py +1 -0
  37. dataforge/locales/en_US/address.py +268 -0
  38. dataforge/locales/en_US/company.py +191 -0
  39. dataforge/locales/en_US/internet.py +34 -0
  40. dataforge/locales/en_US/person.py +370 -0
  41. dataforge/locales/en_US/phone.py +15 -0
  42. dataforge/locales/es_ES/__init__.py +1 -0
  43. dataforge/locales/es_ES/address.py +151 -0
  44. dataforge/locales/es_ES/company.py +125 -0
  45. dataforge/locales/es_ES/internet.py +30 -0
  46. dataforge/locales/es_ES/person.py +207 -0
  47. dataforge/locales/es_ES/phone.py +15 -0
  48. dataforge/locales/fr_FR/__init__.py +1 -0
  49. dataforge/locales/fr_FR/address.py +145 -0
  50. dataforge/locales/fr_FR/company.py +125 -0
  51. dataforge/locales/fr_FR/internet.py +30 -0
  52. dataforge/locales/fr_FR/person.py +212 -0
  53. dataforge/locales/fr_FR/phone.py +15 -0
  54. dataforge/locales/hi_IN/__init__.py +1 -0
  55. dataforge/locales/hi_IN/address.py +177 -0
  56. dataforge/locales/hi_IN/company.py +191 -0
  57. dataforge/locales/hi_IN/internet.py +26 -0
  58. dataforge/locales/hi_IN/person.py +218 -0
  59. dataforge/locales/hi_IN/phone.py +21 -0
  60. dataforge/locales/it_IT/__init__.py +1 -0
  61. dataforge/locales/it_IT/address.py +218 -0
  62. dataforge/locales/it_IT/company.py +151 -0
  63. dataforge/locales/it_IT/internet.py +31 -0
  64. dataforge/locales/it_IT/person.py +187 -0
  65. dataforge/locales/it_IT/phone.py +15 -0
  66. dataforge/locales/ja_JP/__init__.py +1 -0
  67. dataforge/locales/ja_JP/address.py +174 -0
  68. dataforge/locales/ja_JP/company.py +121 -0
  69. dataforge/locales/ja_JP/internet.py +30 -0
  70. dataforge/locales/ja_JP/person.py +207 -0
  71. dataforge/locales/ja_JP/phone.py +18 -0
  72. dataforge/locales/ko_KR/__init__.py +1 -0
  73. dataforge/locales/ko_KR/address.py +121 -0
  74. dataforge/locales/ko_KR/company.py +151 -0
  75. dataforge/locales/ko_KR/internet.py +30 -0
  76. dataforge/locales/ko_KR/person.py +157 -0
  77. dataforge/locales/ko_KR/phone.py +26 -0
  78. dataforge/locales/nl_NL/__init__.py +1 -0
  79. dataforge/locales/nl_NL/address.py +152 -0
  80. dataforge/locales/nl_NL/company.py +182 -0
  81. dataforge/locales/nl_NL/internet.py +41 -0
  82. dataforge/locales/nl_NL/person.py +218 -0
  83. dataforge/locales/nl_NL/phone.py +19 -0
  84. dataforge/locales/pl_PL/__init__.py +1 -0
  85. dataforge/locales/pl_PL/address.py +140 -0
  86. dataforge/locales/pl_PL/company.py +183 -0
  87. dataforge/locales/pl_PL/internet.py +36 -0
  88. dataforge/locales/pl_PL/person.py +217 -0
  89. dataforge/locales/pl_PL/phone.py +15 -0
  90. dataforge/locales/pt_BR/__init__.py +1 -0
  91. dataforge/locales/pt_BR/address.py +127 -0
  92. dataforge/locales/pt_BR/company.py +151 -0
  93. dataforge/locales/pt_BR/internet.py +31 -0
  94. dataforge/locales/pt_BR/person.py +187 -0
  95. dataforge/locales/pt_BR/phone.py +15 -0
  96. dataforge/locales/ru_RU/__init__.py +1 -0
  97. dataforge/locales/ru_RU/address.py +156 -0
  98. dataforge/locales/ru_RU/company.py +168 -0
  99. dataforge/locales/ru_RU/internet.py +26 -0
  100. dataforge/locales/ru_RU/person.py +218 -0
  101. dataforge/locales/ru_RU/phone.py +16 -0
  102. dataforge/locales/zh_CN/__init__.py +1 -0
  103. dataforge/locales/zh_CN/address.py +141 -0
  104. dataforge/locales/zh_CN/company.py +151 -0
  105. dataforge/locales/zh_CN/internet.py +30 -0
  106. dataforge/locales/zh_CN/person.py +157 -0
  107. dataforge/locales/zh_CN/phone.py +25 -0
  108. dataforge/providers/__init__.py +1 -0
  109. dataforge/providers/address.py +460 -0
  110. dataforge/providers/ai_chat.py +170 -0
  111. dataforge/providers/ai_prompt.py +447 -0
  112. dataforge/providers/automotive.py +416 -0
  113. dataforge/providers/barcode.py +149 -0
  114. dataforge/providers/base.py +34 -0
  115. dataforge/providers/color.py +247 -0
  116. dataforge/providers/company.py +144 -0
  117. dataforge/providers/crypto.py +105 -0
  118. dataforge/providers/datetime.py +397 -0
  119. dataforge/providers/ecommerce.py +316 -0
  120. dataforge/providers/education.py +234 -0
  121. dataforge/providers/file.py +271 -0
  122. dataforge/providers/finance.py +545 -0
  123. dataforge/providers/geo.py +332 -0
  124. dataforge/providers/government.py +114 -0
  125. dataforge/providers/internet.py +351 -0
  126. dataforge/providers/llm.py +726 -0
  127. dataforge/providers/lorem.py +241 -0
  128. dataforge/providers/medical.py +364 -0
  129. dataforge/providers/misc.py +196 -0
  130. dataforge/providers/network.py +283 -0
  131. dataforge/providers/payment.py +300 -0
  132. dataforge/providers/person.py +195 -0
  133. dataforge/providers/phone.py +87 -0
  134. dataforge/providers/profile.py +265 -0
  135. dataforge/providers/science.py +365 -0
  136. dataforge/providers/text.py +365 -0
  137. dataforge/py.typed +0 -0
  138. dataforge/pytest_plugin.py +80 -0
  139. dataforge/registry.py +164 -0
  140. dataforge/schema.py +772 -0
  141. dataforge/unique.py +171 -0
  142. dataforge_py-0.2.0.dist-info/METADATA +964 -0
  143. dataforge_py-0.2.0.dist-info/RECORD +145 -0
  144. dataforge_py-0.2.0.dist-info/WHEEL +4 -0
  145. dataforge_py-0.2.0.dist-info/entry_points.txt +35 -0
dataforge/core.py ADDED
@@ -0,0 +1,1169 @@
1
+ """DataForge — the main entry point for fake data generation.
2
+
3
+ Usage::
4
+
5
+ from dataforge import DataForge
6
+
7
+ forge = DataForge(locale="en_US", seed=42)
8
+
9
+ forge.person.first_name() # "James"
10
+ forge.person.full_name(count=1000) # list of 1000 full names
11
+ forge.address.full_address() # "4821 Oak Ave, Chicago, IL 60614"
12
+ forge.internet.email() # "james.smith@gmail.com"
13
+ forge.company.company_name() # "Acme Inc"
14
+ forge.phone.phone_number() # "555-123-4567"
15
+ forge.lorem.sentence() # "Lorem ipsum dolor sit amet."
16
+ forge.dt.date() # "2024-03-15"
17
+ """
18
+
19
+ import importlib
20
+ from typing import TYPE_CHECKING, Any
21
+ from types import ModuleType
22
+
23
+ from dataforge.backend import RandomEngine
24
+ from dataforge.providers.base import BaseProvider
25
+
26
# ------------------------------------------------------------------
# Heuristic field-name mappings for ORM / model introspection
# ------------------------------------------------------------------

# Maps common model field names to DataForge field shorthand names.
# Used by schema_from_pydantic() and schema_from_sqlalchemy().
#
# Keys are looked up verbatim by the heuristic helpers in this module
# (plain ``dict.get``), so matching is exact and case-sensitive. Values are
# DataForge field shorthands; how a shorthand maps to a provider method is
# decided by the registry, not by this table.
_FIELD_ALIASES: dict[str, str] = {
    # Person
    "name": "full_name",
    "full_name": "full_name",
    "fname": "first_name",
    "lname": "last_name",
    "surname": "last_name",
    "last": "last_name",
    "first": "first_name",
    "given_name": "first_name",
    "family_name": "last_name",
    "username": "username",
    "user_name": "username",
    # Contact
    "email_address": "email",
    "mail": "email",
    "phone": "phone_number",
    "phone_num": "phone_number",
    "telephone": "phone_number",
    "cell": "cell_number",
    "mobile": "cell_number",
    "cell_phone": "cell_number",
    "mobile_phone": "cell_number",
    # Address
    "street": "street_address",
    "street_addr": "street_address",
    "addr": "full_address",
    "address": "full_address",
    "zip": "zipcode",
    "zip_code": "zipcode",
    "postal_code": "zipcode",
    "postcode": "zipcode",
    "state_abbr": "state_abbreviation",
    "country_name": "country",
    # Internet
    "url": "url",
    "website": "url",
    "domain": "domain_name",
    "ip": "ipv4",
    "ip_address": "ipv4",
    "ipv4_address": "ipv4",
    "ipv6_address": "ipv6",
    "mac": "mac_address",
    "user_agent_string": "user_agent",
    # Company
    "company": "company_name",
    "company_nm": "company_name",
    "job": "job_title",
    "job_name": "job_title",
    "occupation": "job_title",
    # NOTE(review): "title" is mapped to a job title here; a model using
    # "title" for a headline or honorific will get job titles — confirm intent.
    "title": "job_title",
    # Finance
    "credit_card": "credit_card_number",
    "cc_number": "credit_card_number",
    "card_number": "credit_card_number",
    "iban_code": "iban",
    "currency": "currency_code",
    # Datetime
    "date": "date",
    "dob": "date_of_birth",
    "birth_date": "date_of_birth",
    "birthday": "date_of_birth",
    "time": "time",
    "datetime": "datetime",
    "created_at": "datetime",
    "updated_at": "datetime",
    "timestamp": "datetime",
    # Misc
    "uuid": "uuid4",
    "guid": "uuid4",
    "description": "sentence",
    "bio": "paragraph",
    "summary": "sentence",
    "note": "sentence",
    "notes": "paragraph",
    "comment": "sentence",
    "body": "paragraph",
    "text": "paragraph",
    "content": "paragraph",
    # Color
    "color": "color_name",
    "colour": "color_name",
    "hex_color": "hex_color",
    # File
    "filename": "file_name",
    "file": "file_name",
    "extension": "file_extension",
    "mime": "mime_type",
    "mime_type": "mime_type",
    # Network
    "port": "port",
    "hostname": "hostname",
    # Geo
    "latitude": "latitude",
    "lat": "latitude",
    "longitude": "longitude",
    "lng": "longitude",
    "lon": "longitude",
    # Government
    "ssn": "ssn",
    "tax_id": "tax_id",
    "passport": "passport_number",
    "passport_no": "passport_number",
}
136
+
137
+
138
def _pydantic_heuristic(field_name: str) -> str | None:
    """Look up the DataForge shorthand for a Pydantic field name.

    Returns the entry from ``_FIELD_ALIASES`` for *field_name*, or ``None``
    when the name has no alias. Matching is exact (no case folding).
    """
    try:
        return _FIELD_ALIASES[field_name]
    except KeyError:
        return None
141
+
142
+
143
def _sqlalchemy_heuristic(col_name: str, column: "Any") -> str | None:
    """Look up the DataForge shorthand for a SQLAlchemy column.

    Only the column *name* is consulted (via ``_FIELD_ALIASES``). The
    *column* object is accepted so type-based heuristics can be added
    later, but it is not inspected at present.

    Returns the shorthand, or ``None`` when the name has no (non-empty)
    alias.
    """
    mapped = _FIELD_ALIASES.get(col_name)
    # No type-based fallback exists yet: an unmapped name simply yields None.
    return mapped if mapped else None
155
+
156
+
157
+ if TYPE_CHECKING:
158
+ from dataforge.providers.address import AddressProvider
159
+ from dataforge.providers.automotive import AutomotiveProvider
160
+ from dataforge.providers.barcode import BarcodeProvider
161
+ from dataforge.providers.color import ColorProvider
162
+ from dataforge.providers.company import CompanyProvider
163
+ from dataforge.providers.crypto import CryptoProvider
164
+ from dataforge.providers.datetime import DateTimeProvider
165
+ from dataforge.providers.ecommerce import EcommerceProvider
166
+ from dataforge.providers.education import EducationProvider
167
+ from dataforge.providers.file import FileProvider
168
+ from dataforge.providers.finance import FinanceProvider
169
+ from dataforge.providers.geo import GeoProvider
170
+ from dataforge.providers.government import GovernmentProvider
171
+ from dataforge.providers.internet import InternetProvider
172
+ from dataforge.providers.lorem import LoremProvider
173
+ from dataforge.providers.medical import MedicalProvider
174
+ from dataforge.providers.misc import MiscProvider
175
+ from dataforge.providers.network import NetworkProvider
176
+ from dataforge.providers.payment import PaymentProvider
177
+ from dataforge.providers.person import PersonProvider
178
+ from dataforge.providers.phone import PhoneProvider
179
+ from dataforge.providers.profile import ProfileProvider
180
+ from dataforge.providers.science import ScienceProvider
181
+ from dataforge.providers.text import TextProvider
182
+ from dataforge.providers.ai_prompt import AiPromptProvider
183
+ from dataforge.providers.llm import LlmProvider
184
+ from dataforge.providers.ai_chat import AiChatProvider
185
+
186
+
187
+ class DataForge:
188
+ """High-performance fake data generator.
189
+
190
+ Providers are loaded **lazily** — nothing is imported until a
191
+ provider property is first accessed. The provider registry
192
+ (:mod:`dataforge.registry`) resolves field names and provider
193
+ classes automatically, so new providers can be added without
194
+ editing this file.
195
+
196
+ Parameters
197
+ ----------
198
+ locale : str
199
+ The locale to use for data generation (e.g. ``"en_US"``).
200
+ Locale data is loaded **lazily** — nothing is imported until
201
+ a provider property is first accessed.
202
+ seed : int | None
203
+ Optional seed for reproducible output. When set, the stdlib
204
+ ``random`` backend is seeded for deterministic generation.
205
+
206
+ Examples
207
+ --------
208
+ >>> forge = DataForge(seed=42)
209
+ >>> forge.person.first_name()
210
+ '...'
211
+ >>> forge.address.city()
212
+ '...'
213
+ >>> forge.internet.email()
214
+ '...'
215
+ >>> forge.company.company_name()
216
+ '...'
217
+ >>> forge.phone.phone_number()
218
+ '...'
219
+ >>> forge.lorem.sentence()
220
+ '...'
221
+ >>> forge.dt.date()
222
+ '...'
223
+ >>> forge.finance.credit_card_number()
224
+ '...'
225
+ >>> forge.color.hex_color()
226
+ '...'
227
+ >>> forge.file.file_name()
228
+ '...'
229
+ >>> forge.network.ipv6()
230
+ '...'
231
+ >>> forge.misc.uuid4()
232
+ '...'
233
+ >>> forge.barcode.ean13()
234
+ '...'
235
+ """
236
+
237
+ __slots__ = (
238
+ "_engine",
239
+ "_locale",
240
+ "_providers",
241
+ "_locale_cache",
242
+ "_unique_proxy",
243
+ )
244
+
245
def __init__(self, locale: str = "en_US", seed: int | None = None) -> None:
    """Create the random engine and the (initially empty) lazy caches.

    Parameters
    ----------
    locale : str
        Locale identifier remembered for later locale-module loading.
    seed : int | None
        Forwarded to ``RandomEngine``.
    """
    self._engine = RandomEngine(seed=seed)
    self._locale = locale
    # Nothing is loaded eagerly: all three caches start out empty.
    self._unique_proxy: Any = None
    self._locale_cache: dict[str, ModuleType] = {}
    self._providers: dict[str, BaseProvider] = {}
251
+
252
+ # ------------------------------------------------------------------
253
+ # Dynamic provider access via registry
254
+ # ------------------------------------------------------------------
255
+
256
+ def _get_provider(self, name: str) -> BaseProvider:
257
+ """Lazily instantiate and cache a provider by registry name.
258
+
259
+ Uses the provider registry to resolve the class and its
260
+ locale module requirements. Providers are instantiated once
261
+ and cached in ``_providers``.
262
+ """
263
+ prov = self._providers.get(name)
264
+ if prov is not None:
265
+ return prov
266
+
267
+ from dataforge.registry import get_provider_info
268
+
269
+ info = get_provider_info()
270
+ if name not in info:
271
+ raise AttributeError(
272
+ f"DataForge has no provider '{name}'. "
273
+ f"Available: {', '.join(sorted(info))}"
274
+ )
275
+
276
+ cls, locale_modules = info[name]
277
+ if getattr(cls, "_needs_forge", False):
278
+ # Compound provider that needs access to the DataForge instance
279
+ prov = cls(self._engine, self)
280
+ elif locale_modules:
281
+ # Provider needs locale data modules
282
+ locale_args = [self._load_locale_module(mod) for mod in locale_modules]
283
+ prov = cls(self._engine, *locale_args)
284
+ else:
285
+ prov = cls(self._engine)
286
+
287
+ self._providers[name] = prov
288
+ return prov
289
+
290
+ # ------------------------------------------------------------------
291
+ # Explicit provider properties (for IDE autocomplete + type safety)
292
+ # These delegate to _get_provider() which uses the registry.
293
+ # ------------------------------------------------------------------
294
+
295
@property
def person(self) -> "PersonProvider":
    """Person provider: names, prefixes and suffixes."""
    provider = self._get_provider("person")
    return provider  # type: ignore[return-value]
299
+
300
@property
def address(self) -> "AddressProvider":
    """Address provider: streets, cities and zip codes."""
    provider = self._get_provider("address")
    return provider  # type: ignore[return-value]
304
+
305
@property
def internet(self) -> "InternetProvider":
    """Internet provider: emails, usernames, domains and IPs."""
    provider = self._get_provider("internet")
    return provider  # type: ignore[return-value]
309
+
310
@property
def company(self) -> "CompanyProvider":
    """Company provider: names, catch phrases and job titles."""
    provider = self._get_provider("company")
    return provider  # type: ignore[return-value]
314
+
315
@property
def phone(self) -> "PhoneProvider":
    """Phone provider: phone numbers and cell numbers."""
    provider = self._get_provider("phone")
    return provider  # type: ignore[return-value]
319
+
320
@property
def lorem(self) -> "LoremProvider":
    """Lorem Ipsum text provider: words, sentences and paragraphs."""
    provider = self._get_provider("lorem")
    return provider  # type: ignore[return-value]
324
+
325
@property
def dt(self) -> "DateTimeProvider":
    """Datetime provider: dates, times and datetimes."""
    provider = self._get_provider("dt")
    return provider  # type: ignore[return-value]
329
+
330
@property
def finance(self) -> "FinanceProvider":
    """Finance provider: credit cards, IBANs and currencies."""
    provider = self._get_provider("finance")
    return provider  # type: ignore[return-value]
334
+
335
@property
def color(self) -> "ColorProvider":
    """Color provider: hex, RGB, HSL and color names."""
    provider = self._get_provider("color")
    return provider  # type: ignore[return-value]
339
+
340
@property
def file(self) -> "FileProvider":
    """File provider: file names, extensions, MIME types and paths."""
    provider = self._get_provider("file")
    return provider  # type: ignore[return-value]
344
+
345
@property
def network(self) -> "NetworkProvider":
    """Network provider: IPv6, MAC, port, hostname and user agent."""
    provider = self._get_provider("network")
    return provider  # type: ignore[return-value]
349
+
350
@property
def misc(self) -> "MiscProvider":
    """Misc provider: UUID4, boolean, random_element and null_or."""
    provider = self._get_provider("misc")
    return provider  # type: ignore[return-value]
354
+
355
@property
def barcode(self) -> "BarcodeProvider":
    """Barcode provider: EAN-13, EAN-8, ISBN-13 and ISBN-10."""
    provider = self._get_provider("barcode")
    return provider  # type: ignore[return-value]
359
+
360
@property
def crypto(self) -> "CryptoProvider":
    """Crypto provider: MD5, SHA-1 and SHA-256 hex strings."""
    provider = self._get_provider("crypto")
    return provider  # type: ignore[return-value]
364
+
365
@property
def automotive(self) -> "AutomotiveProvider":
    """Automotive provider: plates, VINs, makes and models."""
    provider = self._get_provider("automotive")
    return provider  # type: ignore[return-value]
369
+
370
@property
def education(self) -> "EducationProvider":
    """Education provider: universities, degrees and fields."""
    provider = self._get_provider("education")
    return provider  # type: ignore[return-value]
374
+
375
@property
def profile(self) -> "ProfileProvider":
    """Profile provider: coherent user profiles."""
    provider = self._get_provider("profile")
    return provider  # type: ignore[return-value]
379
+
380
@property
def government(self) -> "GovernmentProvider":
    """Government provider: SSN, tax ID and passports."""
    provider = self._get_provider("government")
    return provider  # type: ignore[return-value]
384
+
385
@property
def ecommerce(self) -> "EcommerceProvider":
    """E-commerce provider: products, SKUs and orders."""
    provider = self._get_provider("ecommerce")
    return provider  # type: ignore[return-value]
389
+
390
@property
def medical(self) -> "MedicalProvider":
    """Medical provider: ICD-10, drugs and blood types."""
    provider = self._get_provider("medical")
    return provider  # type: ignore[return-value]
394
+
395
@property
def payment(self) -> "PaymentProvider":
    """Payment provider: card types, processors and transactions."""
    provider = self._get_provider("payment")
    return provider  # type: ignore[return-value]
399
+
400
@property
def text(self) -> "TextProvider":
    """Text provider: quotes, headlines and paragraphs."""
    provider = self._get_provider("text")
    return provider  # type: ignore[return-value]
404
+
405
@property
def geo(self) -> "GeoProvider":
    """Geo provider: continents, oceans, rivers and coordinates."""
    provider = self._get_provider("geo")
    return provider  # type: ignore[return-value]
409
+
410
@property
def science(self) -> "ScienceProvider":
    """Science provider: elements, planets and units."""
    provider = self._get_provider("science")
    return provider  # type: ignore[return-value]
414
+
415
@property
def ai_prompt(self) -> "AiPromptProvider":
    """AI prompt provider: user, system and creative prompts."""
    provider = self._get_provider("ai_prompt")
    return provider  # type: ignore[return-value]
419
+
420
@property
def llm(self) -> "LlmProvider":
    """LLM provider: models, agents, RAG, moderation and billing."""
    provider = self._get_provider("llm")
    return provider  # type: ignore[return-value]
424
+
425
@property
def ai_chat(self) -> "AiChatProvider":
    """AI chat provider: conversation turns and messages."""
    provider = self._get_provider("ai_chat")
    return provider  # type: ignore[return-value]
429
+
430
+ # ------------------------------------------------------------------
431
+ # Unique value generation
432
+ # ------------------------------------------------------------------
433
+
434
@property
def unique(self) -> "Any":
    """Return the unique-value proxy for this instance.

    The proxy is built on first access (``UniqueProxy(self)``) and the
    same object is handed back on every later access. Per the proxy's
    contract, values obtained through it do not repeat within its
    lifetime; ``forge.unique.clear()`` resets that tracking.

    Examples
    --------
    >>> forge = DataForge(seed=42)
    >>> a = forge.unique.person.first_name()
    >>> b = forge.unique.person.first_name()
    >>> a != b
    True
    """
    proxy = self._unique_proxy
    if proxy is not None:
        return proxy

    # Imported lazily so the module is only loaded when uniqueness is used.
    from dataforge.unique import UniqueProxy

    proxy = UniqueProxy(self)
    self._unique_proxy = proxy
    return proxy
455
+
456
+ # ------------------------------------------------------------------
457
+ # Provider registration
458
+ # ------------------------------------------------------------------
459
+
460
def register_provider(
    self,
    provider_cls: type[BaseProvider],
    name: str | None = None,
) -> None:
    """Instantiate and register a custom provider class at runtime.

    The new provider instance is stored in this instance's provider
    cache, and the class is also handed to
    ``dataforge.registry.register_runtime_provider`` so that field
    lookups can find it.

    Parameters
    ----------
    provider_cls : type[BaseProvider]
        Provider class to register. It must define ``_provider_name``
        unless *name* is given.
    name : str | None
        Name to register under; falls back to the class's
        ``_provider_name`` when empty or ``None``.

    Raises
    ------
    ValueError
        If neither *name* nor ``_provider_name`` yields a non-empty name.

    Examples
    --------
    >>> from dataforge.providers.base import BaseProvider
    >>> class MyProvider(BaseProvider):
    ...     _provider_name = "my"
    ...     _field_map = {"greeting": "greeting"}
    ...     def greeting(self, count=1):
    ...         return "hello" if count == 1 else ["hello"] * count
    >>> forge = DataForge()
    >>> forge.register_provider(MyProvider)
    >>> forge.my.greeting()
    'hello'
    """
    resolved_name = name if name else getattr(provider_cls, "_provider_name", "")
    if not resolved_name:
        raise ValueError(
            f"{provider_cls.__name__} does not define '_provider_name'."
        )

    needed_modules = getattr(provider_cls, "_locale_modules", ())
    wants_forge = getattr(provider_cls, "_needs_forge", False)
    if wants_forge:
        # Compound providers receive the DataForge instance itself.
        instance = provider_cls(self._engine, self)  # type: ignore[call-arg]
    elif needed_modules:
        locale_data = [self._load_locale_module(mod) for mod in needed_modules]
        instance = provider_cls(self._engine, *locale_data)  # type: ignore[call-arg]
    else:
        instance = provider_cls(self._engine)
    self._providers[resolved_name] = instance

    # Publish the class to the registry so Schema/to_dict can resolve its fields.
    from dataforge.registry import register_runtime_provider

    register_runtime_provider(resolved_name, provider_cls, needed_modules)
512
+
513
+ def __getattr__(self, name: str) -> Any:
514
+ """Dynamic attribute access for registered providers.
515
+
516
+ Allows ``forge.my_provider`` to work for providers
517
+ registered via :meth:`register_provider` at runtime,
518
+ without requiring a ``@property`` on the class.
519
+ """
520
+ # Check if it's a cached provider
521
+ providers = object.__getattribute__(self, "_providers")
522
+ if name in providers:
523
+ return providers[name]
524
+ # Try registry lookup
525
+ try:
526
+ return self._get_provider(name)
527
+ except AttributeError:
528
+ raise AttributeError(
529
+ f"'{type(self).__name__}' object has no attribute '{name}'"
530
+ ) from None
531
+
532
+ # ------------------------------------------------------------------
533
+ # Seed control
534
+ # ------------------------------------------------------------------
535
+
536
def seed(self, value: int) -> None:
    """Re-seed the underlying random engine with *value*.

    Delegates to the engine's own ``seed`` method so that subsequent
    output is reproducible.
    """
    engine = self._engine
    engine.seed(value)
542
+
543
def copy(self, seed: int | None = None) -> "DataForge":
    """Build a fresh ``DataForge`` that shares only this instance's locale.

    Cached providers and locale modules are not carried over; the new
    object starts with empty caches.

    Parameters
    ----------
    seed : int | None
        Seed passed to the new instance's constructor. ``None`` leaves
        the new instance unseeded.

    Returns
    -------
    DataForge
    """
    clone = DataForge(seed=seed, locale=self._locale)
    return clone
557
+
558
+ # ------------------------------------------------------------------
559
+ # Schema API
560
+ # ------------------------------------------------------------------
561
+
562
def schema(self, fields: "list[str] | dict[str, Any]") -> "Any":
    """Build a :class:`Schema` bound to this instance.

    Parameters
    ----------
    fields : list[str] | dict[str, str | Callable]
        Field specification handed to ``Schema`` unchanged. String
        values name provider fields; callable values receive the current
        row dict and may refer to columns generated earlier.

    Returns
    -------
    Schema

    Examples
    --------
    >>> forge = DataForge(seed=42)
    >>> s = forge.schema(["first_name", "email"])
    >>> rows = s.generate(count=1000)
    """
    # Lazy import: the schema module is only loaded when a schema is requested.
    from dataforge.schema import Schema as _Schema

    compiled = _Schema(self, fields)
    return compiled
585
+
586
+ # ------------------------------------------------------------------
587
+ # Locale management
588
+ # ------------------------------------------------------------------
589
+
590
@property
def locale(self) -> str:
    """Locale string this instance was created with (e.g. ``"en_US"``)."""
    active = self._locale
    return active
594
+
595
+ # ------------------------------------------------------------------
596
+ # Internal helpers
597
+ # ------------------------------------------------------------------
598
+
599
+ def _load_locale_module(self, module_name: str) -> ModuleType:
600
+ """Dynamically import a locale data module.
601
+
602
+ Results are cached so that repeated access to the same provider
603
+ does not re-import the module.
604
+
605
+ If the requested locale does not provide the specified module,
606
+ falls back to ``en_US`` and emits a warning.
607
+
608
+ Parameters
609
+ ----------
610
+ module_name : str
611
+ The name of the submodule inside the locale package
612
+ (e.g. ``"person"``, ``"address"``).
613
+ """
614
+ key = f"{self._locale}.{module_name}"
615
+ if key not in self._locale_cache:
616
+ try:
617
+ mod = importlib.import_module(
618
+ f"dataforge.locales.{self._locale}.{module_name}"
619
+ )
620
+ except ModuleNotFoundError:
621
+ if self._locale == "en_US":
622
+ raise ValueError(
623
+ f"Locale 'en_US' does not have a '{module_name}' data module."
624
+ )
625
+ import warnings
626
+
627
+ warnings.warn(
628
+ f"Locale '{self._locale}' does not have a '{module_name}' "
629
+ f"data module — falling back to 'en_US'.",
630
+ UserWarning,
631
+ stacklevel=3,
632
+ )
633
+ mod = importlib.import_module(f"dataforge.locales.en_US.{module_name}")
634
+ self._locale_cache[key] = mod
635
+ return self._locale_cache[key]
636
+
637
+ def _resolve_field(self, field: str) -> tuple[str, str]:
638
+ """Resolve a field name to (provider_attr, method_name).
639
+
640
+ Supports both direct names (e.g. ``"first_name"``) and
641
+ dotted paths (e.g. ``"person.first_name"``).
642
+ """
643
+ # Dotted path: "person.first_name" → ("person", "first_name")
644
+ if "." in field:
645
+ provider_attr, method_name = field.split(".", 1)
646
+ return provider_attr, method_name
647
+
648
+ from dataforge.registry import get_field_map
649
+
650
+ fm = get_field_map()
651
+ if field in fm:
652
+ return fm[field]
653
+ raise ValueError(
654
+ f"Unknown field '{field}'. Use dotted notation "
655
+ f"(e.g. 'person.first_name') or a known shorthand."
656
+ )
657
+
658
+ # ------------------------------------------------------------------
659
+ # Bulk data generation
660
+ # ------------------------------------------------------------------
661
+
662
def to_dict(
    self,
    fields: list[str] | dict[str, str],
    count: int = 10,
) -> list[dict[str, str]]:
    """Generate *count* rows of fake data as a list of dicts.

    Generation is **column-first**: every field is produced in a single
    call (``method(count=count)``, or ``method()`` when ``count == 1``)
    and the resulting columns are then zipped into row dicts.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate.  Either an iterable of field names (e.g.
        ``["first_name", "email"]``; tuples and other non-mapping
        iterables are accepted too) or a mapping of output column
        names to field names (e.g. ``{"Name": "full_name"}``).
    count : int
        Number of rows to generate.  ``0`` returns an empty list
        without resolving any field.

    Returns
    -------
    list[dict[str, str]]
        One dict per row, mapping column name → generated value.
        Non-string values are converted with ``str()``.

    Raises
    ------
    ValueError
        Propagated from ``_resolve_field`` for an unknown field name.

    Examples
    --------
    >>> forge = DataForge(seed=42)
    >>> rows = forge.to_dict(["first_name", "email"], count=3)
    >>> len(rows)
    3
    """
    from collections.abc import Mapping

    if count == 0:
        return []

    # Normalize to (column_name, field_name) pairs.  Branching on
    # Mapping (rather than on ``list``) lets tuples and other plain
    # iterables of names work instead of failing on ``.items()``.
    if isinstance(fields, Mapping):
        field_defs = list(fields.items())
    else:
        field_defs = [(name, name) for name in fields]

    # Resolve every provider method up front so that an unknown field
    # raises before any value has been generated.
    columns: list[str] = []
    callables: list[object] = []
    for col_name, field_name in field_defs:
        provider_attr, method_name = self._resolve_field(field_name)
        provider = getattr(self, provider_attr)
        columns.append(col_name)
        callables.append(getattr(provider, method_name))

    # Column-first: generate all values for each column in one call.
    col_data: list[list[str]] = []
    for fn in callables:
        if count == 1:
            # Single-row request: call without ``count`` and wrap the
            # scalar result into a one-element column.
            val = fn()  # type: ignore[operator]
            col_data.append([val if isinstance(val, str) else str(val)])
        else:
            values = fn(count=count)  # type: ignore[operator]
            # Only the first element is inspected: a batch that starts
            # with a str is taken as-is, skipping a redundant str() pass.
            if values and isinstance(values[0], str):
                col_data.append(values)  # type: ignore[arg-type]
            else:
                col_data.append([str(v) for v in values])

    # Zip columns into row dicts.
    col_tuple = tuple(columns)
    return [dict(zip(col_tuple, row)) for row in zip(*col_data)]
730
+
731
def to_csv(
    self,
    fields: list[str] | dict[str, str],
    count: int = 10,
    path: str | None = None,
) -> str:
    """Produce fake data rendered as CSV.

    Thin convenience wrapper: builds a schema with
    ``self.schema(fields)`` and forwards to its ``to_csv`` method, so
    all CSV logic lives in one place.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate (same format as :meth:`to_dict`).
    count : int
        Number of rows.
    path : str | None
        Forwarded unchanged to the schema.  When given, the CSV is
        written to this file path; otherwise it is only returned.

    Returns
    -------
    str
        The CSV content as a string.
    """
    schema = self.schema(fields)
    return schema.to_csv(count=count, path=path)
757
+
758
def to_jsonl(
    self,
    fields: list[str] | dict[str, str],
    count: int = 10,
    path: str | None = None,
) -> str:
    """Produce fake data rendered as JSON Lines.

    Thin convenience wrapper: builds a schema with
    ``self.schema(fields)`` and forwards to its ``to_jsonl`` method.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate (same format as :meth:`to_dict`).
    count : int
        Number of rows.
    path : str | None
        Forwarded unchanged to the schema.  When given, the JSONL is
        written to this file path.

    Returns
    -------
    str
        The JSONL content as a string.
    """
    schema = self.schema(fields)
    return schema.to_jsonl(count=count, path=path)
783
+
784
def to_sql(
    self,
    fields: list[str] | dict[str, str],
    table: str,
    count: int = 10,
    dialect: str = "sqlite",
    path: str | None = None,
) -> str:
    """Produce fake data rendered as SQL INSERT statements.

    Thin convenience wrapper: builds a schema with
    ``self.schema(fields)`` and forwards every remaining argument, by
    keyword, to its ``to_sql`` method.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate (same format as :meth:`to_dict`).
    table : str
        Target table name.
    count : int
        Number of rows.
    dialect : str
        SQL dialect: ``"sqlite"``, ``"mysql"``, or ``"postgresql"``.
    path : str | None
        Forwarded unchanged to the schema.  When given, the SQL is
        written to this file path.

    Returns
    -------
    str
        SQL INSERT statements as a string.
    """
    schema = self.schema(fields)
    forwarded = {
        "table": table,
        "count": count,
        "dialect": dialect,
        "path": path,
    }
    return schema.to_sql(**forwarded)
817
+
818
def to_dataframe(
    self,
    fields: list[str] | dict[str, str],
    count: int = 10,
) -> "Any":
    """Produce fake data as a pandas DataFrame.

    Thin convenience wrapper: builds a schema with
    ``self.schema(fields)`` and forwards to its ``to_dataframe``
    method.  Requires ``pandas`` to be installed; this method itself
    performs no pandas import.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate (same format as :meth:`to_dict`).
    count : int
        Number of rows.

    Returns
    -------
    pandas.DataFrame
        A DataFrame with one column per field.
    """
    schema = self.schema(fields)
    return schema.to_dataframe(count=count)
841
+
842
def stream_to_csv(
    self,
    fields: list[str] | dict[str, str],
    path: str,
    count: int = 10,
    batch_size: int | None = None,
) -> int:
    """Stream fake data straight into a CSV file.

    Thin convenience wrapper: builds a schema with
    ``self.schema(fields)`` and forwards to its ``stream_to_csv``
    method, which is documented as writing in batches rather than
    materializing all rows in memory.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate.
    path : str
        File path to write.
    count : int
        Number of rows.
    batch_size : int | None
        Rows per batch.  Auto-tuned when ``None``.

    Returns
    -------
    int
        Number of rows written.
    """
    schema = self.schema(fields)
    return schema.stream_to_csv(path=path, count=count, batch_size=batch_size)
873
+
874
def stream_to_jsonl(
    self,
    fields: list[str] | dict[str, str],
    path: str,
    count: int = 10,
    batch_size: int | None = None,
) -> int:
    """Stream fake data straight into a JSON Lines file.

    Thin convenience wrapper: builds a schema with
    ``self.schema(fields)`` and forwards to its ``stream_to_jsonl``
    method, which is documented as writing in batches rather than
    materializing all rows in memory.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate.
    path : str
        File path to write.
    count : int
        Number of rows.
    batch_size : int | None
        Rows per batch.  Auto-tuned when ``None``.

    Returns
    -------
    int
        Number of rows written.
    """
    schema = self.schema(fields)
    return schema.stream_to_jsonl(path=path, count=count, batch_size=batch_size)
905
+
906
def to_arrow(
    self,
    fields: list[str] | dict[str, str],
    count: int = 10,
    batch_size: int | None = None,
) -> "Any":
    """Produce fake data as a PyArrow Table.

    Thin convenience wrapper: builds a schema with
    ``self.schema(fields)`` and forwards to its ``to_arrow`` method.
    Requires ``pyarrow`` to be installed; this method itself performs
    no pyarrow import.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate (same format as :meth:`to_dict`).
    count : int
        Number of rows.
    batch_size : int | None
        Rows per internal batch.  Auto-tuned when ``None``.

    Returns
    -------
    pyarrow.Table
    """
    schema = self.schema(fields)
    return schema.to_arrow(count=count, batch_size=batch_size)
931
+
932
def to_polars(
    self,
    fields: list[str] | dict[str, str],
    count: int = 10,
    batch_size: int | None = None,
) -> "Any":
    """Produce fake data as a Polars DataFrame.

    Thin convenience wrapper: builds a schema with
    ``self.schema(fields)`` and forwards to its ``to_polars`` method.
    Requires ``polars`` to be installed; this method itself performs
    no polars import.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate (same format as :meth:`to_dict`).
    count : int
        Number of rows.
    batch_size : int | None
        Rows per internal batch.  Auto-tuned when ``None``.

    Returns
    -------
    polars.DataFrame
    """
    schema = self.schema(fields)
    return schema.to_polars(count=count, batch_size=batch_size)
957
+
958
def to_parquet(
    self,
    fields: list[str] | dict[str, str],
    path: str,
    count: int = 10,
    batch_size: int | None = None,
) -> int:
    """Produce fake data and write it out as a Parquet file.

    Thin convenience wrapper: builds a schema with
    ``self.schema(fields)`` and forwards to its ``to_parquet`` method.
    Requires ``pyarrow`` to be installed.  The schema layer is
    documented as writing batched row-groups for bounded memory usage.

    Parameters
    ----------
    fields : list[str] | dict[str, str]
        Fields to generate.
    path : str
        File path to write.
    count : int
        Number of rows.
    batch_size : int | None
        Rows per row-group.  Auto-tuned when ``None``.

    Returns
    -------
    int
        Number of rows written.
    """
    schema = self.schema(fields)
    return schema.to_parquet(path=path, count=count, batch_size=batch_size)
989
+
990
def __repr__(self) -> str:
    """Return a constructor-like summary that shows only the locale."""
    locale_text = repr(self._locale)
    return "DataForge(locale=" + locale_text + ")"
992
+
993
+ # ------------------------------------------------------------------
994
+ # Schema factories from ORM / model introspection
995
+ # ------------------------------------------------------------------
996
+
997
def schema_from_pydantic(self, model: type) -> "Any":
    """Build a :class:`Schema` from the fields of a Pydantic model.

    Each model field name is matched against the DataForge field
    registry.  A name that is itself a registered field (e.g.
    ``first_name``, ``email``, ``city``) maps to that field; otherwise
    ``_pydantic_heuristic`` is asked for an alias, which is used only
    if it is a registered field.  Fields that still cannot be mapped
    are left out of the schema and reported with a ``UserWarning``.

    Requires ``pydantic`` to be installed.

    Parameters
    ----------
    model : type
        A Pydantic ``BaseModel`` subclass.

    Returns
    -------
    Schema

    Raises
    ------
    ModuleNotFoundError
        If ``pydantic`` cannot be imported.
    TypeError
        If *model* is not a ``BaseModel`` subclass, or exposes neither
        ``model_fields`` nor ``__fields__``.
    ValueError
        If not a single field could be mapped.

    Examples
    --------
    >>> from pydantic import BaseModel
    >>> class User(BaseModel):
    ...     first_name: str
    ...     email: str
    ...     city: str
    >>> forge = DataForge(seed=42)
    >>> s = forge.schema_from_pydantic(User)
    >>> rows = s.generate(count=5)
    """
    from dataforge.schema import Schema

    try:
        from pydantic import BaseModel  # noqa: F811
    except ModuleNotFoundError as exc:
        raise ModuleNotFoundError(
            "pydantic is required for schema_from_pydantic(). "
            "Install it with: pip install pydantic"
        ) from exc

    if not isinstance(model, type) or not issubclass(model, BaseModel):
        raise TypeError(f"Expected a Pydantic BaseModel subclass, got {model!r}")

    from dataforge.registry import get_field_map

    known_fields = get_field_map()
    mapped: dict[str, str] = {}

    # Pydantic v2 exposes model_fields; v1 exposed __fields__.  Probe in
    # that order and take the first attribute that exists.
    for attr_name in ("model_fields", "__fields__"):
        if hasattr(model, attr_name):
            declared_fields = getattr(model, attr_name)
            break
    else:
        raise TypeError(
            f"Cannot introspect fields from {model.__name__}. "
            "Ensure it is a valid Pydantic BaseModel."
        )

    import warnings

    for field_name in declared_fields:
        # Exact registry match wins.
        if field_name in known_fields:
            mapped[field_name] = field_name
            continue

        # Otherwise fall back to the alias heuristic.
        alias = _pydantic_heuristic(field_name)
        if alias and alias in known_fields:
            mapped[field_name] = alias
            continue

        warnings.warn(
            f"Pydantic field '{field_name}' on {model.__name__} "
            f"could not be mapped to a DataForge field — skipping.",
            UserWarning,
            stacklevel=2,
        )

    if not mapped:
        raise ValueError(
            f"No fields on {model.__name__} could be mapped to "
            f"DataForge fields. Ensure the model uses recognisable "
            f"field names (e.g. 'first_name', 'email', 'city')."
        )

    return Schema(self, mapped)
1085
+
1086
def schema_from_sqlalchemy(self, model: type) -> "Any":
    """Build a :class:`Schema` from the columns of a SQLAlchemy model.

    Each column name of ``model.__table__`` is matched against the
    DataForge field registry.  A name that is itself a registered
    field maps to that field; otherwise ``_sqlalchemy_heuristic`` is
    asked for an alias, which is used only if it is a registered
    field.  Columns that still cannot be mapped are left out of the
    schema and reported with a ``UserWarning``.  A primary-key column
    named ``id`` is skipped without a warning.

    Requires ``sqlalchemy`` to be installed.

    Parameters
    ----------
    model : type
        A SQLAlchemy declarative model class (must have a
        ``__table__`` attribute).

    Returns
    -------
    Schema

    Raises
    ------
    ModuleNotFoundError
        If ``sqlalchemy`` cannot be imported.
    TypeError
        If *model* has no ``__table__`` attribute.
    ValueError
        If not a single column could be mapped.

    Examples
    --------
    >>> from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
    >>> class Base(DeclarativeBase): pass
    >>> class User(Base):
    ...     __tablename__ = "users"
    ...     id: Mapped[int] = mapped_column(primary_key=True)
    ...     first_name: Mapped[str]
    ...     email: Mapped[str]
    >>> forge = DataForge(seed=42)
    >>> s = forge.schema_from_sqlalchemy(User)
    >>> rows = s.generate(count=5)
    """
    from dataforge.schema import Schema

    try:
        import sqlalchemy  # noqa: F401
    except ModuleNotFoundError as exc:
        raise ModuleNotFoundError(
            "sqlalchemy is required for schema_from_sqlalchemy(). "
            "Install it with: pip install sqlalchemy"
        ) from exc

    if not hasattr(model, "__table__"):
        raise TypeError(
            f"Expected a SQLAlchemy declarative model with __table__, got {model!r}"
        )

    from dataforge.registry import get_field_map

    known_fields = get_field_map()
    mapped: dict[str, str] = {}

    import warnings

    for column in model.__table__.columns:
        col_name = column.name

        # Skip primary key 'id' columns — not fake-able.
        if col_name == "id" and column.primary_key:
            continue

        # Exact registry match wins.
        if col_name in known_fields:
            mapped[col_name] = col_name
            continue

        # Otherwise fall back to the alias heuristic.
        alias = _sqlalchemy_heuristic(col_name, column)
        if alias and alias in known_fields:
            mapped[col_name] = alias
            continue

        warnings.warn(
            f"SQLAlchemy column '{col_name}' on "
            f"{model.__name__} could not be mapped to a "
            f"DataForge field — skipping.",
            UserWarning,
            stacklevel=2,
        )

    if not mapped:
        raise ValueError(
            f"No columns on {model.__name__} could be mapped to "
            f"DataForge fields. Ensure the model uses recognisable "
            f"column names (e.g. 'first_name', 'email', 'city')."
        )

    return Schema(self, mapped)