dataforge-py 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144) hide show
  1. dataforge_py-0.2.0/PKG-INFO +964 -0
  2. dataforge_py-0.2.0/README.md +955 -0
  3. dataforge_py-0.2.0/pyproject.toml +55 -0
  4. dataforge_py-0.2.0/src/dataforge/__init__.py +20 -0
  5. dataforge_py-0.2.0/src/dataforge/backend.py +147 -0
  6. dataforge_py-0.2.0/src/dataforge/cli.py +166 -0
  7. dataforge_py-0.2.0/src/dataforge/core.py +1169 -0
  8. dataforge_py-0.2.0/src/dataforge/locales/__init__.py +1 -0
  9. dataforge_py-0.2.0/src/dataforge/locales/ar_SA/__init__.py +1 -0
  10. dataforge_py-0.2.0/src/dataforge/locales/ar_SA/address.py +128 -0
  11. dataforge_py-0.2.0/src/dataforge/locales/ar_SA/company.py +183 -0
  12. dataforge_py-0.2.0/src/dataforge/locales/ar_SA/internet.py +25 -0
  13. dataforge_py-0.2.0/src/dataforge/locales/ar_SA/person.py +217 -0
  14. dataforge_py-0.2.0/src/dataforge/locales/ar_SA/phone.py +15 -0
  15. dataforge_py-0.2.0/src/dataforge/locales/de_DE/__init__.py +1 -0
  16. dataforge_py-0.2.0/src/dataforge/locales/de_DE/address.py +148 -0
  17. dataforge_py-0.2.0/src/dataforge/locales/de_DE/company.py +125 -0
  18. dataforge_py-0.2.0/src/dataforge/locales/de_DE/internet.py +32 -0
  19. dataforge_py-0.2.0/src/dataforge/locales/de_DE/person.py +212 -0
  20. dataforge_py-0.2.0/src/dataforge/locales/de_DE/phone.py +17 -0
  21. dataforge_py-0.2.0/src/dataforge/locales/en_AU/__init__.py +1 -0
  22. dataforge_py-0.2.0/src/dataforge/locales/en_AU/address.py +231 -0
  23. dataforge_py-0.2.0/src/dataforge/locales/en_AU/company.py +193 -0
  24. dataforge_py-0.2.0/src/dataforge/locales/en_AU/internet.py +34 -0
  25. dataforge_py-0.2.0/src/dataforge/locales/en_AU/person.py +370 -0
  26. dataforge_py-0.2.0/src/dataforge/locales/en_AU/phone.py +16 -0
  27. dataforge_py-0.2.0/src/dataforge/locales/en_CA/__init__.py +1 -0
  28. dataforge_py-0.2.0/src/dataforge/locales/en_CA/address.py +276 -0
  29. dataforge_py-0.2.0/src/dataforge/locales/en_CA/company.py +193 -0
  30. dataforge_py-0.2.0/src/dataforge/locales/en_CA/internet.py +34 -0
  31. dataforge_py-0.2.0/src/dataforge/locales/en_CA/person.py +377 -0
  32. dataforge_py-0.2.0/src/dataforge/locales/en_CA/phone.py +15 -0
  33. dataforge_py-0.2.0/src/dataforge/locales/en_GB/__init__.py +1 -0
  34. dataforge_py-0.2.0/src/dataforge/locales/en_GB/address.py +312 -0
  35. dataforge_py-0.2.0/src/dataforge/locales/en_GB/company.py +196 -0
  36. dataforge_py-0.2.0/src/dataforge/locales/en_GB/internet.py +34 -0
  37. dataforge_py-0.2.0/src/dataforge/locales/en_GB/person.py +372 -0
  38. dataforge_py-0.2.0/src/dataforge/locales/en_GB/phone.py +15 -0
  39. dataforge_py-0.2.0/src/dataforge/locales/en_US/__init__.py +1 -0
  40. dataforge_py-0.2.0/src/dataforge/locales/en_US/address.py +268 -0
  41. dataforge_py-0.2.0/src/dataforge/locales/en_US/company.py +191 -0
  42. dataforge_py-0.2.0/src/dataforge/locales/en_US/internet.py +34 -0
  43. dataforge_py-0.2.0/src/dataforge/locales/en_US/person.py +370 -0
  44. dataforge_py-0.2.0/src/dataforge/locales/en_US/phone.py +15 -0
  45. dataforge_py-0.2.0/src/dataforge/locales/es_ES/__init__.py +1 -0
  46. dataforge_py-0.2.0/src/dataforge/locales/es_ES/address.py +151 -0
  47. dataforge_py-0.2.0/src/dataforge/locales/es_ES/company.py +125 -0
  48. dataforge_py-0.2.0/src/dataforge/locales/es_ES/internet.py +30 -0
  49. dataforge_py-0.2.0/src/dataforge/locales/es_ES/person.py +207 -0
  50. dataforge_py-0.2.0/src/dataforge/locales/es_ES/phone.py +15 -0
  51. dataforge_py-0.2.0/src/dataforge/locales/fr_FR/__init__.py +1 -0
  52. dataforge_py-0.2.0/src/dataforge/locales/fr_FR/address.py +145 -0
  53. dataforge_py-0.2.0/src/dataforge/locales/fr_FR/company.py +125 -0
  54. dataforge_py-0.2.0/src/dataforge/locales/fr_FR/internet.py +30 -0
  55. dataforge_py-0.2.0/src/dataforge/locales/fr_FR/person.py +212 -0
  56. dataforge_py-0.2.0/src/dataforge/locales/fr_FR/phone.py +15 -0
  57. dataforge_py-0.2.0/src/dataforge/locales/hi_IN/__init__.py +1 -0
  58. dataforge_py-0.2.0/src/dataforge/locales/hi_IN/address.py +177 -0
  59. dataforge_py-0.2.0/src/dataforge/locales/hi_IN/company.py +191 -0
  60. dataforge_py-0.2.0/src/dataforge/locales/hi_IN/internet.py +26 -0
  61. dataforge_py-0.2.0/src/dataforge/locales/hi_IN/person.py +218 -0
  62. dataforge_py-0.2.0/src/dataforge/locales/hi_IN/phone.py +21 -0
  63. dataforge_py-0.2.0/src/dataforge/locales/it_IT/__init__.py +1 -0
  64. dataforge_py-0.2.0/src/dataforge/locales/it_IT/address.py +218 -0
  65. dataforge_py-0.2.0/src/dataforge/locales/it_IT/company.py +151 -0
  66. dataforge_py-0.2.0/src/dataforge/locales/it_IT/internet.py +31 -0
  67. dataforge_py-0.2.0/src/dataforge/locales/it_IT/person.py +187 -0
  68. dataforge_py-0.2.0/src/dataforge/locales/it_IT/phone.py +15 -0
  69. dataforge_py-0.2.0/src/dataforge/locales/ja_JP/__init__.py +1 -0
  70. dataforge_py-0.2.0/src/dataforge/locales/ja_JP/address.py +174 -0
  71. dataforge_py-0.2.0/src/dataforge/locales/ja_JP/company.py +121 -0
  72. dataforge_py-0.2.0/src/dataforge/locales/ja_JP/internet.py +30 -0
  73. dataforge_py-0.2.0/src/dataforge/locales/ja_JP/person.py +207 -0
  74. dataforge_py-0.2.0/src/dataforge/locales/ja_JP/phone.py +18 -0
  75. dataforge_py-0.2.0/src/dataforge/locales/ko_KR/__init__.py +1 -0
  76. dataforge_py-0.2.0/src/dataforge/locales/ko_KR/address.py +121 -0
  77. dataforge_py-0.2.0/src/dataforge/locales/ko_KR/company.py +151 -0
  78. dataforge_py-0.2.0/src/dataforge/locales/ko_KR/internet.py +30 -0
  79. dataforge_py-0.2.0/src/dataforge/locales/ko_KR/person.py +157 -0
  80. dataforge_py-0.2.0/src/dataforge/locales/ko_KR/phone.py +26 -0
  81. dataforge_py-0.2.0/src/dataforge/locales/nl_NL/__init__.py +1 -0
  82. dataforge_py-0.2.0/src/dataforge/locales/nl_NL/address.py +152 -0
  83. dataforge_py-0.2.0/src/dataforge/locales/nl_NL/company.py +182 -0
  84. dataforge_py-0.2.0/src/dataforge/locales/nl_NL/internet.py +41 -0
  85. dataforge_py-0.2.0/src/dataforge/locales/nl_NL/person.py +218 -0
  86. dataforge_py-0.2.0/src/dataforge/locales/nl_NL/phone.py +19 -0
  87. dataforge_py-0.2.0/src/dataforge/locales/pl_PL/__init__.py +1 -0
  88. dataforge_py-0.2.0/src/dataforge/locales/pl_PL/address.py +140 -0
  89. dataforge_py-0.2.0/src/dataforge/locales/pl_PL/company.py +183 -0
  90. dataforge_py-0.2.0/src/dataforge/locales/pl_PL/internet.py +36 -0
  91. dataforge_py-0.2.0/src/dataforge/locales/pl_PL/person.py +217 -0
  92. dataforge_py-0.2.0/src/dataforge/locales/pl_PL/phone.py +15 -0
  93. dataforge_py-0.2.0/src/dataforge/locales/pt_BR/__init__.py +1 -0
  94. dataforge_py-0.2.0/src/dataforge/locales/pt_BR/address.py +127 -0
  95. dataforge_py-0.2.0/src/dataforge/locales/pt_BR/company.py +151 -0
  96. dataforge_py-0.2.0/src/dataforge/locales/pt_BR/internet.py +31 -0
  97. dataforge_py-0.2.0/src/dataforge/locales/pt_BR/person.py +187 -0
  98. dataforge_py-0.2.0/src/dataforge/locales/pt_BR/phone.py +15 -0
  99. dataforge_py-0.2.0/src/dataforge/locales/ru_RU/__init__.py +1 -0
  100. dataforge_py-0.2.0/src/dataforge/locales/ru_RU/address.py +156 -0
  101. dataforge_py-0.2.0/src/dataforge/locales/ru_RU/company.py +168 -0
  102. dataforge_py-0.2.0/src/dataforge/locales/ru_RU/internet.py +26 -0
  103. dataforge_py-0.2.0/src/dataforge/locales/ru_RU/person.py +218 -0
  104. dataforge_py-0.2.0/src/dataforge/locales/ru_RU/phone.py +16 -0
  105. dataforge_py-0.2.0/src/dataforge/locales/zh_CN/__init__.py +1 -0
  106. dataforge_py-0.2.0/src/dataforge/locales/zh_CN/address.py +141 -0
  107. dataforge_py-0.2.0/src/dataforge/locales/zh_CN/company.py +151 -0
  108. dataforge_py-0.2.0/src/dataforge/locales/zh_CN/internet.py +30 -0
  109. dataforge_py-0.2.0/src/dataforge/locales/zh_CN/person.py +157 -0
  110. dataforge_py-0.2.0/src/dataforge/locales/zh_CN/phone.py +25 -0
  111. dataforge_py-0.2.0/src/dataforge/providers/__init__.py +1 -0
  112. dataforge_py-0.2.0/src/dataforge/providers/address.py +460 -0
  113. dataforge_py-0.2.0/src/dataforge/providers/ai_chat.py +170 -0
  114. dataforge_py-0.2.0/src/dataforge/providers/ai_prompt.py +447 -0
  115. dataforge_py-0.2.0/src/dataforge/providers/automotive.py +416 -0
  116. dataforge_py-0.2.0/src/dataforge/providers/barcode.py +149 -0
  117. dataforge_py-0.2.0/src/dataforge/providers/base.py +34 -0
  118. dataforge_py-0.2.0/src/dataforge/providers/color.py +247 -0
  119. dataforge_py-0.2.0/src/dataforge/providers/company.py +144 -0
  120. dataforge_py-0.2.0/src/dataforge/providers/crypto.py +105 -0
  121. dataforge_py-0.2.0/src/dataforge/providers/datetime.py +397 -0
  122. dataforge_py-0.2.0/src/dataforge/providers/ecommerce.py +316 -0
  123. dataforge_py-0.2.0/src/dataforge/providers/education.py +234 -0
  124. dataforge_py-0.2.0/src/dataforge/providers/file.py +271 -0
  125. dataforge_py-0.2.0/src/dataforge/providers/finance.py +545 -0
  126. dataforge_py-0.2.0/src/dataforge/providers/geo.py +332 -0
  127. dataforge_py-0.2.0/src/dataforge/providers/government.py +114 -0
  128. dataforge_py-0.2.0/src/dataforge/providers/internet.py +351 -0
  129. dataforge_py-0.2.0/src/dataforge/providers/llm.py +726 -0
  130. dataforge_py-0.2.0/src/dataforge/providers/lorem.py +241 -0
  131. dataforge_py-0.2.0/src/dataforge/providers/medical.py +364 -0
  132. dataforge_py-0.2.0/src/dataforge/providers/misc.py +196 -0
  133. dataforge_py-0.2.0/src/dataforge/providers/network.py +283 -0
  134. dataforge_py-0.2.0/src/dataforge/providers/payment.py +300 -0
  135. dataforge_py-0.2.0/src/dataforge/providers/person.py +195 -0
  136. dataforge_py-0.2.0/src/dataforge/providers/phone.py +87 -0
  137. dataforge_py-0.2.0/src/dataforge/providers/profile.py +265 -0
  138. dataforge_py-0.2.0/src/dataforge/providers/science.py +365 -0
  139. dataforge_py-0.2.0/src/dataforge/providers/text.py +365 -0
  140. dataforge_py-0.2.0/src/dataforge/py.typed +0 -0
  141. dataforge_py-0.2.0/src/dataforge/pytest_plugin.py +80 -0
  142. dataforge_py-0.2.0/src/dataforge/registry.py +164 -0
  143. dataforge_py-0.2.0/src/dataforge/schema.py +772 -0
  144. dataforge_py-0.2.0/src/dataforge/unique.py +171 -0
@@ -0,0 +1,964 @@
1
+ Metadata-Version: 2.3
2
+ Name: dataforge-py
3
+ Version: 0.2.0
4
+ Summary: High-performance fake data generator for testing
5
+ Author: Ivan Rener
6
+ Author-email: Ivan Rener <ivan.rener@multitude.com>
7
+ Requires-Python: >=3.12
8
+ Description-Content-Type: text/markdown
9
+
10
+ # DataForge
11
+
12
+ **High-performance, zero-dependency fake data generator for Python.**
13
+
14
+ DataForge generates realistic fake data at millions of items per second. It uses vectorized batch generation, lazy-loaded locale data, and pre-resolved field lookups to deliver throughput that is orders of magnitude faster than existing alternatives — with zero runtime dependencies.
15
+
16
+ ```python
17
+ from dataforge import DataForge
18
+
19
+ forge = DataForge(seed=42)
20
+
21
+ forge.person.first_name() # "James"
22
+ forge.internet.email() # "james.smith@gmail.com"
23
+ forge.person.first_name(count=1_000_000) # 1M names in ~55ms
24
+ ```
25
+
26
+ ---
27
+
28
+ ## Table of Contents
29
+
30
+ - [Features](#features)
31
+ - [Installation](#installation)
32
+ - [Quick Start](#quick-start)
33
+ - [Providers](#providers) (27 providers, 198 methods)
34
+ - [Schema API](#schema-api)
35
+ - [Bulk Export](#bulk-export)
36
+ - [Streaming Export](#streaming-export)
37
+ - [Integrations](#integrations) (PyArrow, Polars, Pydantic, SQLAlchemy)
38
+ - [Command Line Interface](#command-line-interface)
39
+ - [Pytest Plugin](#pytest-plugin)
40
+ - [Unique Values](#unique-values)
41
+ - [Locales](#locales) (17 locales)
42
+ - [Benchmarks](#benchmarks)
43
+ - [CI/CD](#cicd)
44
+ - [Contributing](#contributing)
45
+ - [License](#license)
46
+
47
+ ---
48
+
49
+ ## Features
50
+
51
+ - **High Performance** — scalar generation at millions of items/s, batch generation at ~18M items/s
52
+ - **Vectorized Batches** — every method accepts `count=N` and returns a list, using optimized batch paths
53
+ - **Zero Dependencies** — core library has no external dependencies
54
+ - **Type Safe** — fully typed with PEP 484 type hints and `@overload` signatures
55
+ - **Reproducible** — global seeding for deterministic output
56
+ - **Lazy Loading** — locales and providers are loaded only when first accessed
57
+ - **Schema API** — define reusable data blueprints with pre-resolved field lookups
58
+ - **Rich CLI** — generate CSV, JSON, or JSONL directly from the terminal
59
+ - **Bulk Export** — export to dict, CSV, JSONL, SQL, DataFrame, Arrow, Polars, or Parquet
60
+ - **Streaming Export** — memory-efficient streaming writes for arbitrarily large datasets
61
+ - **Pytest Plugin** — `forge`, `fake`, and `forge_unseeded` fixtures with seed markers
62
+ - **Unique Values** — three-layer proxy with set-based dedup and over-sampling for batches
63
+ - **27 Providers** — person, address, internet, company, phone, finance, datetime (accessed as `dt`), color, file, network, lorem, barcode, misc, automotive, crypto, ecommerce, education, geo, government, medical, payment, profile, science, text, ai\_prompt, llm, ai\_chat
64
+ - **17 Locales** — en\_US, en\_GB, en\_AU, en\_CA, de\_DE, fr\_FR, es\_ES, it\_IT, pt\_BR, nl\_NL, pl\_PL, ru\_RU, ar\_SA, hi\_IN, ja\_JP, ko\_KR, zh\_CN
65
+
66
+ ## Installation
67
+
68
+ ```bash
69
+ # Standard installation (zero dependencies)
70
+ pip install dataforge-py
71
+
72
+ # With uv
73
+ uv add dataforge-py
74
+ ```
75
+
76
+ **Optional integrations** (install separately as needed):
77
+
78
+ ```bash
79
+ pip install pyarrow # to_arrow(), to_parquet()
80
+ pip install polars # to_polars()
81
+ pip install pandas # to_dataframe()
82
+ pip install pydantic # schema_from_pydantic()
83
+ pip install sqlalchemy # schema_from_sqlalchemy()
84
+ ```
85
+
86
+ **Requires Python >= 3.12.**
87
+
88
+ ## Quick Start
89
+
90
+ ```python
91
+ from dataforge import DataForge
92
+
93
+ # Initialize with optional locale and seed
94
+ forge = DataForge(locale="en_US", seed=42)
95
+
96
+ # Generate single items
97
+ forge.person.first_name() # "James"
98
+ forge.internet.email() # "james.smith@gmail.com"
99
+ forge.address.city() # "Chicago"
100
+ forge.finance.price() # "49.99"
101
+ forge.llm.model_name() # "gpt-4o"
102
+
103
+ # Generate batches (returns lists)
104
+ names = forge.person.first_name(count=1000)
105
+ emails = forge.internet.email(count=1000)
106
+ cities = forge.address.city(count=1000)
107
+
108
+ # Reproducible output
109
+ forge_a = DataForge(seed=42)
110
+ forge_b = DataForge(seed=42)
111
+ assert forge_a.person.first_name() == forge_b.person.first_name()
112
+ ```
113
+
114
+ ## Providers
115
+
116
+ DataForge ships with 27 providers organized by domain. Every method accepts `count=N` for batch generation.
117
+
118
+ ### `person` — Names and identity
119
+
120
+ | Method | Return | Example |
121
+ |--------|--------|---------|
122
+ | `first_name()` | `str` | `"James"` |
123
+ | `last_name()` | `str` | `"Smith"` |
124
+ | `full_name()` | `str` | `"James Smith"` |
125
+ | `male_first_name()` | `str` | `"Robert"` |
126
+ | `female_first_name()` | `str` | `"Jennifer"` |
127
+ | `prefix()` | `str` | `"Mr."` |
128
+ | `suffix()` | `str` | `"Jr."` |
129
+
130
+ ### `address` — Locations and geography
131
+
132
+ | Method | Return | Example |
133
+ |--------|--------|---------|
134
+ | `street_name()` | `str` | `"Elm Street"` |
135
+ | `street_address()` | `str` | `"742 Elm Street"` |
136
+ | `building_number()` | `str` | `"742"` |
137
+ | `city()` | `str` | `"Chicago"` |
138
+ | `state()` | `str` | `"California"` |
139
+ | `zip_code()` | `str` | `"90210"` |
140
+ | `full_address()` | `str` | `"742 Elm St, Chicago, IL 90210"` |
141
+ | `country()` | `str` | `"United States"` |
142
+ | `country_code()` | `str` | `"US"` |
143
+ | `latitude()` | `str` | `"41.8781"` |
144
+ | `longitude()` | `str` | `"-87.6298"` |
145
+ | `coordinate()` | `tuple[str, str]` | `("41.8781", "-87.6298")` |
146
+
147
+ ### `internet` — Web and network identifiers
148
+
149
+ | Method | Return | Example |
150
+ |--------|--------|---------|
151
+ | `email()` | `str` | `"james.smith@gmail.com"` |
152
+ | `safe_email()` | `str` | `"james@example.com"` |
153
+ | `username()` | `str` | `"jsmith42"` |
154
+ | `domain()` | `str` | `"example.com"` |
155
+ | `url()` | `str` | `"https://example.com"` |
156
+ | `ipv4()` | `str` | `"192.168.1.1"` |
157
+ | `slug()` | `str` | `"lorem-ipsum-dolor"` |
158
+ | `tld()` | `str` | `"com"` |
159
+
160
+ ### `company` — Business data
161
+
162
+ | Method | Return | Example |
163
+ |--------|--------|---------|
164
+ | `company_name()` | `str` | `"Acme Corp"` |
165
+ | `company_suffix()` | `str` | `"LLC"` |
166
+ | `job_title()` | `str` | `"Software Engineer"` |
167
+ | `catch_phrase()` | `str` | `"Innovative solutions"` |
168
+
169
+ ### `phone` — Phone numbers
170
+
171
+ | Method | Return | Example |
172
+ |--------|--------|---------|
173
+ | `phone_number()` | `str` | `"(555) 123-4567"` |
174
+ | `cell_phone()` | `str` | `"555-987-6543"` |
175
+
176
+ ### `finance` — Financial data
177
+
178
+ | Method | Return | Example |
179
+ |--------|--------|---------|
180
+ | `credit_card_number()` | `str` | `"4532015112830366"` |
181
+ | `credit_card()` | `dict` | `{"type": "Visa", ...}` |
182
+ | `card_type()` | `str` | `"Visa"` |
183
+ | `iban()` | `str` | `"DE89370400440532013000"` |
184
+ | `bic()` | `str` | `"DEUTDEFFXXX"` |
185
+ | `routing_number()` | `str` | `"021000021"` |
186
+ | `bitcoin_address()` | `str` | `"1A1zP1eP5QGefi2DMPTfTL..."` |
187
+ | `currency_code()` | `str` | `"USD"` |
188
+ | `currency_name()` | `str` | `"US Dollar"` |
189
+ | `currency_symbol()` | `str` | `"$"` |
190
+ | `price(min_val, max_val)` | `str` | `"49.99"` |
191
+
192
+ ### `dt` — Dates and times
193
+
194
+ | Method | Return | Example |
195
+ |--------|--------|---------|
196
+ | `date(start, end, fmt)` | `str` | `"2024-03-15"` |
197
+ | `time(fmt)` | `str` | `"14:30:00"` |
198
+ | `datetime(start, end, fmt)` | `str` | `"2024-03-15 14:30:00"` |
199
+ | `date_of_birth(min_age, max_age)` | `str` | `"1990-05-12"` |
200
+ | `date_object()` | `date` | `date(2024, 3, 15)` |
201
+ | `datetime_object()` | `datetime` | `datetime(2024, 3, 15, ...)` |
202
+ | `timezone()` | `str` | `"US/Eastern"` |
203
+ | `unix_timestamp(start, end)` | `int` | `1710504600` |
204
+
205
+ ```python
206
+ import datetime
207
+ forge.dt.date(start=datetime.date(2020, 1, 1), end=datetime.date(2024, 12, 31))
208
+ ```
209
+
210
+ ### `color` — Color values
211
+
212
+ | Method | Return | Example |
213
+ |--------|--------|---------|
214
+ | `color_name()` | `str` | `"Red"` |
215
+ | `hex_color()` | `str` | `"#ff5733"` |
216
+ | `rgb()` | `tuple[int, int, int]` | `(255, 87, 51)` |
217
+ | `rgba()` | `tuple[int, int, int, float]` | `(255, 87, 51, 0.8)` |
218
+ | `rgb_string()` | `str` | `"rgb(255, 87, 51)"` |
219
+ | `hsl()` | `tuple[int, int, int]` | `(11, 100, 60)` |
220
+ | `hsl_string()` | `str` | `"hsl(11, 100%, 60%)"` |
221
+
222
+ ### `file` — File system data
223
+
224
+ | Method | Return | Example |
225
+ |--------|--------|---------|
226
+ | `file_name()` | `str` | `"report.pdf"` |
227
+ | `file_extension()` | `str` | `"pdf"` |
228
+ | `file_path()` | `str` | `"/home/user/report.pdf"` |
229
+ | `file_category()` | `str` | `"document"` |
230
+ | `mime_type()` | `str` | `"application/pdf"` |
231
+
232
+ ### `network` — Networking and protocols
233
+
234
+ | Method | Return | Example |
235
+ |--------|--------|---------|
236
+ | `ipv6()` | `str` | `"2001:0db8:85a3:0000:..."` |
237
+ | `mac_address()` | `str` | `"a1:b2:c3:d4:e5:f6"` |
238
+ | `port()` | `int` | `8080` |
239
+ | `hostname()` | `str` | `"server-01.example.com"` |
240
+ | `user_agent()` | `str` | `"Mozilla/5.0 ..."` |
241
+ | `http_method()` | `str` | `"GET"` |
242
+ | `http_status_code()` | `str` | `"200 OK"` |
243
+
244
+ ### `lorem` — Placeholder text
245
+
246
+ | Method | Return | Example |
247
+ |--------|--------|---------|
248
+ | `word()` | `str` | `"lorem"` |
249
+ | `sentence(word_count)` | `str` | `"Lorem ipsum dolor sit."` |
250
+ | `paragraph(sentence_count)` | `str` | `"Lorem ipsum dolor ..."` |
251
+ | `text(max_chars)` | `str` | `"Lorem ipsum ..."` |
252
+
253
+ ### `barcode` — Barcodes and ISBNs
254
+
255
+ | Method | Return | Example |
256
+ |--------|--------|---------|
257
+ | `ean13()` | `str` | `"5901234123457"` |
258
+ | `ean8()` | `str` | `"96385074"` |
259
+ | `isbn13()` | `str` | `"9780306406157"` |
260
+ | `isbn10()` | `str` | `"0306406152"` |
261
+
262
+ All barcodes include valid check digits.
263
+
264
+ ### `misc` — Utilities
265
+
266
+ | Method | Return | Example |
267
+ |--------|--------|---------|
268
+ | `uuid4()` | `str` | `"550e8400-e29b-41d4-..."` |
269
+ | `uuid7()` | `str` | `"01912b4c-..."` |
270
+ | `boolean(probability)` | `bool` | `True` |
271
+ | `random_element(elements)` | `Any` | `"a"` |
272
+ | `null_or(value, probability)` | `Any` | `None` or value |
273
+
274
+ ### `automotive` — Vehicle data
275
+
276
+ | Method | Return | Example |
277
+ |--------|--------|---------|
278
+ | `license_plate()` | `str` | `"ABC-1234"` |
279
+ | `vin()` | `str` | `"1HGCM82633A004352"` |
280
+ | `vehicle_make()` | `str` | `"Toyota"` |
281
+ | `vehicle_model()` | `str` | `"Camry"` |
282
+ | `vehicle_year()` | `int` | `2023` |
283
+ | `vehicle_year_str()` | `str` | `"2023"` |
284
+ | `vehicle_color()` | `str` | `"Silver"` |
285
+
286
+ ### `crypto` — Hash digests
287
+
288
+ | Method | Return | Example |
289
+ |--------|--------|---------|
290
+ | `md5()` | `str` | `"d41d8cd98f00b204e98..."` |
291
+ | `sha1()` | `str` | `"da39a3ee5e6b4b0d325..."` |
292
+ | `sha256()` | `str` | `"e3b0c44298fc1c149af..."` |
293
+
294
+ ### `ecommerce` — E-commerce data
295
+
296
+ | Method | Return | Example |
297
+ |--------|--------|---------|
298
+ | `product_name()` | `str` | `"Wireless Mouse"` |
299
+ | `product_category()` | `str` | `"Electronics"` |
300
+ | `sku()` | `str` | `"SKU-A1B2C3"` |
301
+ | `price_with_currency()` | `str` | `"$49.99 USD"` |
302
+ | `review_rating()` | `int` | `4` |
303
+ | `review_title()` | `str` | `"Great product!"` |
304
+ | `tracking_number()` | `str` | `"1Z999AA10123456784"` |
305
+ | `order_id()` | `str` | `"ORD-20240315-A1B2"` |
306
+
307
+ ### `education` — Academic data
308
+
309
+ | Method | Return | Example |
310
+ |--------|--------|---------|
311
+ | `university()` | `str` | `"MIT"` |
312
+ | `degree()` | `str` | `"Bachelor of Science"` |
313
+ | `field_of_study()` | `str` | `"Computer Science"` |
314
+
315
+ ### `geo` — Geographic features
316
+
317
+ | Method | Return | Example |
318
+ |--------|--------|---------|
319
+ | `continent()` | `str` | `"North America"` |
320
+ | `ocean()` | `str` | `"Pacific Ocean"` |
321
+ | `sea()` | `str` | `"Mediterranean Sea"` |
322
+ | `mountain_range()` | `str` | `"Rocky Mountains"` |
323
+ | `river()` | `str` | `"Amazon"` |
324
+ | `compass_direction()` | `str` | `"Northeast"` |
325
+ | `geo_coordinate()` | `str` | `"41.8781, -87.6298"` |
326
+ | `dms_latitude()` | `str` | `"41°52'41.2\"N"` |
327
+ | `dms_longitude()` | `str` | `"87°37'47.3\"W"` |
328
+ | `geo_hash()` | `str` | `"dp3wjztvh"` |
329
+
330
+ ### `government` — Government IDs
331
+
332
+ | Method | Return | Example |
333
+ |--------|--------|---------|
334
+ | `ssn()` | `str` | `"123-45-6789"` |
335
+ | `tax_id()` | `str` | `"12-3456789"` |
336
+ | `passport_number()` | `str` | `"A12345678"` |
337
+ | `drivers_license()` | `str` | `"D123-4567-8901"` |
338
+ | `national_id()` | `str` | `"123456789012"` |
339
+
340
+ ### `medical` — Healthcare data
341
+
342
+ | Method | Return | Example |
343
+ |--------|--------|---------|
344
+ | `blood_type()` | `str` | `"O+"` |
345
+ | `realistic_blood_type()` | `str` | `"O+"` (weighted) |
346
+ | `icd10_code()` | `str` | `"J06.9"` |
347
+ | `drug_name()` | `str` | `"Amoxicillin"` |
348
+ | `drug_form()` | `str` | `"Tablet"` |
349
+ | `dosage()` | `str` | `"500mg"` |
350
+ | `diagnosis()` | `str` | `"Acute bronchitis"` |
351
+ | `procedure()` | `str` | `"Appendectomy"` |
352
+ | `medical_record_number()` | `str` | `"MRN-12345678"` |
353
+
354
+ `realistic_blood_type()` uses American Red Cross population distribution weights.
355
+
356
+ ### `payment` — Payment and transaction data
357
+
358
+ | Method | Return | Example |
359
+ |--------|--------|---------|
360
+ | `card_type()` | `str` | `"Visa"` |
361
+ | `payment_method()` | `str` | `"Credit Card"` |
362
+ | `payment_processor()` | `str` | `"Stripe"` |
363
+ | `transaction_status()` | `str` | `"completed"` |
364
+ | `transaction_id()` | `str` | `"txn_1a2b3c4d5e"` |
365
+ | `currency_code()` | `str` | `"USD"` |
366
+ | `currency_symbol()` | `str` | `"$"` |
367
+ | `payment_amount()` | `str` | `"149.99"` |
368
+ | `cvv()` | `str` | `"123"` |
369
+ | `expiry_date()` | `str` | `"12/28"` |
370
+
371
+ ### `profile` — Coherent user profiles
372
+
373
+ | Method | Return | Example |
374
+ |--------|--------|---------|
375
+ | `profile()` | `dict` | `{"first_name": "James", ...}` |
376
+ | `profile_first_name()` | `str` | `"James"` |
377
+ | `profile_last_name()` | `str` | `"Smith"` |
378
+ | `profile_email()` | `str` | `"james.smith@gmail.com"` |
379
+ | `profile_phone()` | `str` | `"(555) 123-4567"` |
380
+ | `profile_city()` | `str` | `"Chicago"` |
381
+ | `profile_state()` | `str` | `"Illinois"` |
382
+ | `profile_zip_code()` | `str` | `"60601"` |
383
+ | `profile_job_title()` | `str` | `"Software Engineer"` |
384
+
385
+ `profile()` returns a coherent dict combining person, internet, address, phone, and company data.
386
+
387
+ ### `science` — Scientific data
388
+
389
+ | Method | Return | Example |
390
+ |--------|--------|---------|
391
+ | `chemical_element()` | `str` | `"Hydrogen"` |
392
+ | `element_symbol()` | `str` | `"H"` |
393
+ | `si_unit()` | `str` | `"meter"` |
394
+ | `planet()` | `str` | `"Mars"` |
395
+ | `galaxy()` | `str` | `"Milky Way"` |
396
+ | `constellation()` | `str` | `"Orion"` |
397
+ | `scientific_discipline()` | `str` | `"Physics"` |
398
+ | `metric_prefix()` | `str` | `"kilo"` |
399
+
400
+ ### `text` — Rich text content
401
+
402
+ | Method | Return | Example |
403
+ |--------|--------|---------|
404
+ | `quote()` | `str` | `"The only way to do great work..."` |
405
+ | `headline()` | `str` | `"Breaking: New Study Reveals..."` |
406
+ | `buzzword()` | `str` | `"synergy"` |
407
+ | `paragraph()` | `str` | Multi-sentence paragraph |
408
+ | `text_block()` | `str` | Multi-paragraph text block |
409
+
410
+ ### `ai_prompt` — AI prompts
411
+
412
+ | Method | Return | Example |
413
+ |--------|--------|---------|
414
+ | `user_prompt()` | `str` | `"Explain quantum computing..."` |
415
+ | `coding_prompt()` | `str` | `"Write a Python function..."` |
416
+ | `creative_prompt()` | `str` | `"Write a short story about..."` |
417
+ | `analysis_prompt()` | `str` | `"Analyze the following data..."` |
418
+ | `system_prompt()` | `str` | `"You are a helpful assistant..."` |
419
+ | `persona_prompt()` | `str` | `"Act as a senior engineer..."` |
420
+ | `prompt_template()` | `str` | `"Given {context}, answer {question}"` |
421
+ | `few_shot_prompt()` | `str` | Multi-example prompt |
422
+
423
+ ### `llm` — LLM ecosystem data
424
+
425
+ **Models and metadata:**
426
+
427
+ | Method | Return | Example |
428
+ |--------|--------|---------|
429
+ | `model_name()` | `str` | `"gpt-4o"` |
430
+ | `provider_name()` | `str` | `"OpenAI"` |
431
+ | `api_key()` | `str` | `"sk-proj-a1b2c3d4..."` |
432
+ | `finish_reason()` | `str` | `"stop"` |
433
+ | `stop_sequence()` | `str` | `"<\|endoftext\|>"` |
434
+
435
+ **Agents and tool use:**
436
+
437
+ | Method | Return | Example |
438
+ |--------|--------|---------|
439
+ | `tool_name()` | `str` | `"web_search"` |
440
+ | `tool_call_id()` | `str` | `"call_abc123"` |
441
+ | `mcp_server_name()` | `str` | `"filesystem"` |
442
+ | `agent_name()` | `str` | `"research-agent"` |
443
+ | `capability()` | `str` | `"code_generation"` |
444
+
445
+ **RAG and embeddings:**
446
+
447
+ | Method | Return | Example |
448
+ |--------|--------|---------|
449
+ | `embedding_model()` | `str` | `"text-embedding-3-small"` |
450
+ | `vector_db_name()` | `str` | `"Pinecone"` |
451
+ | `chunk_id()` | `str` | `"chunk_a1b2c3d4"` |
452
+ | `similarity_score()` | `str` | `"0.9234"` |
453
+ | `namespace()` | `str` | `"production"` |
454
+
455
+ **Content moderation:**
456
+
457
+ | Method | Return | Example |
458
+ |--------|--------|---------|
459
+ | `moderation_category()` | `str` | `"hate"` |
460
+ | `moderation_score()` | `str` | `"0.0012"` |
461
+ | `harm_label()` | `str` | `"safe"` |
462
+
463
+ **Usage and billing:**
464
+
465
+ | Method | Return | Example |
466
+ |--------|--------|---------|
467
+ | `token_count()` | `str` | `"1234"` |
468
+ | `prompt_tokens()` | `str` | `"256"` |
469
+ | `completion_tokens()` | `str` | `"512"` |
470
+ | `cost_estimate()` | `str` | `"$0.0042"` |
471
+ | `rate_limit_header()` | `str` | `"x-ratelimit-remaining: 42"` |
472
+
473
+ ### `ai_chat` — AI conversation data
474
+
475
+ | Method | Return | Example |
476
+ |--------|--------|---------|
477
+ | `chat_message()` | `dict` | `{"role": "user", "content": "...", ...}` |
478
+ | `chat_role()` | `str` | `"user"` |
479
+ | `chat_model()` | `str` | `"gpt-4o"` |
480
+ | `chat_content()` | `str` | `"Explain quantum computing..."` |
481
+ | `chat_tokens()` | `str` | `"256"` |
482
+ | `chat_finish_reason()` | `str` | `"stop"` |
483
+
484
+ `chat_message()` returns a coherent dict combining role, model, content, tokens, and finish\_reason.
485
+
486
+ ---
487
+
488
+ ## Schema API
489
+
490
+ The `Schema` API provides reusable blueprints for structured data generation. Field lookups are pre-resolved at creation time for maximum throughput.
491
+
492
+ ```python
493
+ from dataforge import DataForge
494
+
495
+ forge = DataForge(seed=42)
496
+
497
+ # List of field names (auto-resolved to provider methods)
498
+ schema = forge.schema(["first_name", "last_name", "email", "city"])
499
+
500
+ # Dict with custom column names
501
+ schema = forge.schema({
502
+ "Name": "person.full_name",
503
+ "Email": "internet.email",
504
+ "City": "address.city",
505
+ "Price": "finance.price",
506
+ })
507
+
508
+ # Generate rows
509
+ rows = schema.generate(1000) # list[dict[str, str]]
510
+
511
+ # Stream rows lazily (memory-efficient)
512
+ for row in schema.stream(1_000_000):
513
+     process(row)
514
+
515
+ # Async streaming
516
+ async for row in schema.async_stream(1_000_000):
517
+     await process(row)
518
+
519
+ # Export directly
520
+ csv_str = schema.to_csv(count=5000)
521
+ jsonl_str = schema.to_jsonl(count=5000)
522
+ sql_str = schema.to_sql(count=5000, table="users")
523
+ df = schema.to_dataframe(count=5000) # requires pandas
524
+ ```
525
+
526
+ ### Row-dependent fields (correlated data)
527
+
528
+ Schema supports callable values for fields that depend on other columns:
529
+
530
+ ```python
531
+ schema = forge.schema({
532
+ "City": "address.city",
533
+ "Greeting": lambda row: f"Hello from {row['City']}!",
534
+ })
535
+ ```
536
+
537
+ Callables receive the current row dict and execute after batch columns are generated.
538
+
539
+ ## Bulk Export
540
+
541
+ Generate datasets directly from the `DataForge` instance:
542
+
543
+ ```python
544
+ # List of dictionaries
545
+ rows = forge.to_dict(
546
+ fields=["first_name", "last_name", "email", "company.job_title"],
547
+ count=100
548
+ )
549
+
550
+ # CSV (returns string, optionally writes to file)
551
+ csv_data = forge.to_csv(
552
+ fields={"Name": "person.full_name", "Email": "internet.email"},
553
+ count=5000,
554
+ path="users.csv"
555
+ )
556
+
557
+ # JSONL (returns string, optionally writes to file)
558
+ jsonl_data = forge.to_jsonl(
559
+ fields=["first_name", "email", "city"],
560
+ count=1000,
561
+ path="users.jsonl"
562
+ )
563
+
564
+ # SQL INSERT statements
565
+ sql_data = forge.to_sql(
566
+ fields=["first_name", "last_name", "email"],
567
+ count=500,
568
+ table="users",
569
+ dialect="postgresql" # "sqlite" (default), "mysql", or "postgresql"
570
+ )
571
+
572
+ # Pandas DataFrame (requires pandas)
573
+ df = forge.to_dataframe(
574
+ fields=["date", "finance.price", "address.state"],
575
+ count=10_000
576
+ )
577
+ ```
578
+
579
+ ## Streaming Export
580
+
581
+ For arbitrarily large datasets that don't fit in memory:
582
+
583
+ ```python
584
+ # Stream to CSV file in batches
585
+ rows_written = forge.stream_to_csv(
586
+ fields=["first_name", "email", "city"],
587
+ path="users.csv",
588
+ count=10_000_000,
589
+ batch_size=100_000
590
+ )
591
+
592
+ # Stream to JSONL file in batches
593
+ rows_written = forge.stream_to_jsonl(
594
+ fields=["first_name", "email", "city"],
595
+ path="users.jsonl",
596
+ count=10_000_000,
597
+ batch_size=100_000
598
+ )
599
+ ```
600
+
601
+ `batch_size` is optional; when omitted, the batch size is auto-tuned.
602
+
603
+ ## Integrations
604
+
605
+ ### PyArrow
606
+
607
+ ```python
608
+ # PyArrow Table
609
+ table = forge.to_arrow(
610
+ fields=["first_name", "email", "city"],
611
+ count=1_000_000
612
+ )
613
+
614
+ # Write Parquet file
615
+ rows_written = forge.to_parquet(
616
+ fields=["first_name", "email", "city"],
617
+ path="users.parquet",
618
+ count=1_000_000
619
+ )
620
+ ```
621
+
622
+ All columns are typed as `pa.string()`. Large datasets are generated in batches with bounded memory.
623
+
624
+ ### Polars
625
+
626
+ ```python
627
+ # Polars DataFrame
628
+ df = forge.to_polars(
629
+ fields=["first_name", "email", "city"],
630
+ count=1_000_000
631
+ )
632
+ ```
633
+
634
+ All columns are typed as `pl.Utf8`. Large datasets use `pl.concat()` for efficient multi-batch assembly.
635
+
636
+ ### Pydantic
637
+
638
+ Auto-generate schemas from Pydantic models:
639
+
640
+ ```python
641
+ from pydantic import BaseModel
642
+
643
+ class User(BaseModel):
644
+     first_name: str
644
+     last_name: str
645
+     email: str
646
+     city: str
648
+
649
+ schema = forge.schema_from_pydantic(User)
650
+ rows = schema.generate(1000) # list[dict] with keys matching model fields
651
+ ```
652
+
653
+ Field names are matched to DataForge providers via direct name lookup and heuristic alias mapping (~70 common field names). Supports both Pydantic v1 and v2.
654
+
655
+ ### SQLAlchemy
656
+
657
+ Auto-generate schemas from SQLAlchemy models:
658
+
659
+ ```python
660
+ from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
661
+
662
+ class Base(DeclarativeBase):
663
+     pass
664
+
665
+ class User(Base):
666
+     __tablename__ = "users"
666
+     id: Mapped[int] = mapped_column(primary_key=True)
667
+     first_name: Mapped[str]
668
+     last_name: Mapped[str]
669
+     email: Mapped[str]
671
+
672
+ schema = forge.schema_from_sqlalchemy(User)
673
+ rows = schema.generate(1000) # primary key 'id' is auto-skipped
674
+ ```
675
+
676
+ ## Command Line Interface
677
+
678
+ DataForge includes a CLI for generating data directly from the terminal.
679
+
680
+ ```bash
681
+ # Generate 10 rows of CSV
682
+ dataforge --count 10 --format csv first_name last_name email
683
+
684
+ # Generate JSON output
685
+ dataforge -n 5 -f json company_name url city
686
+
687
+ # Generate JSONL
688
+ dataforge -n 100 -f jsonl first_name email city
689
+
690
+ # Write to file
691
+ dataforge -n 1000 -f csv -o users.csv first_name last_name email
692
+
693
+ # Use a specific locale and seed
694
+ dataforge --locale fr_FR --seed 42 -n 5 first_name city
695
+
696
+ # Omit headers
697
+ dataforge -n 10 -f csv --no-header first_name email
698
+
699
+ # List all available fields
700
+ dataforge --list-fields
701
+ ```
702
+
703
+ ### CLI Options
704
+
705
+ | Flag | Short | Description |
706
+ |------|-------|-------------|
707
+ | `--count N` | `-n` | Number of rows (default: 10) |
708
+ | `--format FMT` | `-f` | Output format: `text`, `csv`, `json`, `jsonl` |
709
+ | `--locale LOC` | `-l` | Locale code (default: `en_US`) |
710
+ | `--seed S` | `-s` | Random seed for reproducibility |
711
+ | `--output PATH` | `-o` | Write to file instead of stdout |
712
+ | `--no-header` | | Omit header row in text/csv output |
713
+ | `--list-fields` | | List all available field names |
714
+
715
+ Default fields (when none are specified): `first_name`, `last_name`, `email`.
716
+
717
+ ## Pytest Plugin
718
+
719
+ DataForge auto-registers as a pytest plugin via the `pytest11` entry point.
720
+
721
+ ### Fixtures
722
+
723
+ | Fixture | Description |
724
+ |---------|-------------|
725
+ | `forge` | Seeded `DataForge` instance (deterministic) |
726
+ | `fake` | Alias for `forge` |
727
+ | `forge_unseeded` | Unseeded `DataForge` instance (non-deterministic) |
728
+
729
+ ### Seed priority
730
+
731
+ 1. `@pytest.mark.forge_seed(N)` marker (per-test)
732
+ 2. `--forge-seed N` CLI option (session-wide)
733
+ 3. Default: `0`
734
+
735
+ ### Usage
736
+
737
+ ```python
738
+ def test_name(forge):
739
+     name = forge.person.first_name()
739
+     assert isinstance(name, str)
741
+
742
+ def test_email(fake):
743
+     assert "@" in fake.internet.email()
744
+
745
+ @pytest.mark.forge_seed(42)
746
+ def test_specific(forge):
747
+     assert forge.person.first_name() == "James"
748
+
749
+ def test_random(forge_unseeded):
750
+     name = forge_unseeded.person.first_name()
750
+     assert len(name) > 0
752
+ ```
753
+
754
+ ```bash
755
+ # Run tests with a specific seed
756
+ pytest --forge-seed 42
757
+ ```
758
+
759
+ ## Unique Values
760
+
761
+ Generate guaranteed-unique values using the `forge.unique` proxy:
762
+
763
+ ```python
764
+ # Single unique values
765
+ name1 = forge.unique.person.first_name()
766
+ name2 = forge.unique.person.first_name()
767
+ assert name1 != name2
768
+
769
+ # Unique batches
770
+ names = forge.unique.person.first_name(count=100)
771
+ assert len(names) == len(set(names))
772
+
773
+ # Clear tracking to reuse values
774
+ forge.unique.clear() # clear all providers
775
+ forge.unique.clear("person") # clear specific provider
776
+ forge.unique.person.first_name.clear() # clear specific method
777
+ ```
778
+
779
+ The unique system uses a three-layer proxy architecture:
780
+
781
+ 1. `forge.unique` — `UniqueProxy` wrapping the forge instance
782
+ 2. `forge.unique.person` — `_UniqueProviderProxy` wrapping the provider
783
+ 3. `forge.unique.person.first_name` — `_UniqueMethodWrapper` with set-based dedup
784
+
785
+ Batch generation uses **over-sampling** (requests 20% extra per round) to minimize retry passes. Raises `RuntimeError` if uniqueness cannot be satisfied after extensive retries.
786
+
787
+ ## Locales
788
+
789
+ DataForge supports 17 locales with locale-specific person names, addresses, companies, phone numbers, and internet domains:
790
+
791
+ | Locale | Language | Region |
792
+ |--------|----------|--------|
793
+ | `en_US` | English | United States |
794
+ | `en_GB` | English | United Kingdom |
795
+ | `en_AU` | English | Australia |
796
+ | `en_CA` | English | Canada |
797
+ | `de_DE` | German | Germany |
798
+ | `fr_FR` | French | France |
799
+ | `es_ES` | Spanish | Spain |
800
+ | `it_IT` | Italian | Italy |
801
+ | `pt_BR` | Portuguese | Brazil |
802
+ | `nl_NL` | Dutch | Netherlands |
803
+ | `pl_PL` | Polish | Poland |
804
+ | `ru_RU` | Russian (romanized) | Russia |
805
+ | `ar_SA` | Arabic (romanized) | Saudi Arabia |
806
+ | `hi_IN` | Hindi (romanized) | India |
807
+ | `ja_JP` | Japanese | Japan |
808
+ | `ko_KR` | Korean | South Korea |
809
+ | `zh_CN` | Chinese | China |
810
+
811
+ ```python
812
+ forge = DataForge(locale="fr_FR")
813
+ forge.address.city() # "Paris"
814
+ forge.person.full_name() # "Jean Dupont"
815
+
816
+ forge = DataForge(locale="ja_JP")
817
+ forge.person.full_name() # "田中太郎"
818
+ ```
819
+
820
+ ## Benchmarks
821
+
822
+ DataForge is built for speed. The results below were measured on a standard developer machine; absolute numbers will vary with hardware.
823
+
824
+ ### Single Item Generation (10K iterations)
825
+
826
+ | Operation | Speed |
827
+ |-----------|-------|
828
+ | `misc.boolean()` | **9.2M items/s** |
829
+ | `person.first_name()` | **3.2M items/s** |
830
+ | `address.city()` | **3.1M items/s** |
831
+ | `dt.timezone()` | **3.2M items/s** |
832
+ | `network.port()` | **2.4M items/s** |
833
+ | `network.user_agent()` | **3.0M items/s** |
834
+ | `file.file_name()` | **1.5M items/s** |
835
+ | `dt.unix_timestamp()` | **1.3M items/s** |
836
+ | `finance.bic()` | **930K items/s** |
837
+
838
+ ### Batch Generation (1M items)
839
+
840
+ | Operation | Speed |
841
+ |-----------|-------|
842
+ | `person.first_name(count=1M)` | **18M items/s** |
843
+ | `address.city(count=1M)` | **17M items/s** |
844
+ | `dt.timezone(count=1M)` | **18M items/s** |
845
+ | `network.user_agent(count=1M)` | **19M items/s** |
846
+ | `person.full_name(count=1M)` | **4.7M items/s** |
847
+ | `address.country(count=1M)` | **3.6M items/s** |
848
+ | `file.file_name(count=1M)` | **1.6M items/s** |
849
+ | `finance.bic(count=1M)` | **1.0M items/s** |
850
+
851
+ ### Schema API (5 columns)
852
+
853
+ | Operation | Speed |
854
+ |-----------|-------|
855
+ | `generate(100K)` | **108K rows/s** |
856
+ | `to_csv(100K)` | **92K rows/s** |
857
+ | `stream(100K)` | **110K rows/s** |
858
+
859
+ Run benchmarks locally:
860
+
861
+ ```bash
862
+ uv run python benchmark.py
863
+ uv run python benchmark.py --compare # compare against saved baseline
864
+ ```
865
+
866
+ ## CI/CD
867
+
868
+ DataForge uses GitHub Actions for continuous integration and delivery:
869
+
870
+ | Workflow | Trigger | Description |
871
+ |----------|---------|-------------|
872
+ | **CI** | Push/PR to main | Commitlint + Ruff lint/format + pytest matrix (Python 3.12, 3.13) |
873
+ | **Integrations** | Push/PR to main | Tests with optional deps (PyArrow, Polars, Pydantic, SQLAlchemy) |
874
+ | **Benchmarks** | Push to main | Runs `benchmark.py --compare`, uploads results as artifact |
875
+ | **Release** | Push to main | release-please creates/updates Release PR, auto-bumps version |
876
+ | **Publish** | `v*.*.*` tag | Builds with `uv build`, publishes to PyPI via OIDC trusted publishing |
877
+
878
+ ### Release process
879
+
880
+ 1. All commits to `main` use [Conventional Commits](https://www.conventionalcommits.org/) format
881
+ 2. `release-please` automatically maintains a living Release PR that bundles changes
882
+ 3. Merging the Release PR creates a version tag (`v0.2.0`, etc.)
883
+ 4. The tag triggers the publish workflow, which publishes to PyPI via OIDC trusted publishing (no PyPI API token needed)
884
+
885
+ ### Setup requirements
886
+
887
+ - **`RELEASE_TOKEN`** — GitHub PAT with `contents: write` permission (for release-please to push tags that trigger downstream workflows)
888
+ - **`pypi` environment** — GitHub Environment with manual approval gate for PyPI publishing
889
+
890
+ ## Contributing
891
+
892
+ Contributions are welcome. Please follow these guidelines:
893
+
894
+ ### Development setup
895
+
896
+ ```bash
897
+ git clone https://github.com/yourusername/dataforge.git
898
+ cd dataforge
899
+ uv sync # install all dependencies
900
+ uv run pytest # run tests (1061 tests)
901
+ uv run ruff check src/ tests/ # lint
902
+ uv run ruff format --check src/ tests/ # format check
903
+ uv run python benchmark.py # run benchmarks
904
+ ```
905
+
906
+ ### Commit messages
907
+
908
+ This project enforces [Conventional Commits](https://www.conventionalcommits.org/). All commit messages must follow this format:
909
+
910
+ ```
911
+ <type>: <description>
912
+
913
+ [optional body]
914
+ ```
915
+
916
+ | Type | Use when |
917
+ |------|----------|
918
+ | `feat` | Adding a new feature |
919
+ | `fix` | Fixing a bug |
920
+ | `perf` | Performance improvement |
921
+ | `refactor` | Code restructuring without behavior change |
922
+ | `test` | Adding or updating tests |
923
+ | `docs` | Documentation changes |
924
+ | `chore` | Maintenance tasks (deps, CI config) |
925
+
926
+ Examples:
927
+
928
+ ```
929
+ feat: add automotive provider with VIN generation
930
+ fix: correct Luhn checksum in credit card numbers
931
+ perf: use getrandbits() bulk approach for UUID generation
932
+ ```
933
+
934
+ ### Performance guidelines
935
+
936
+ Performance is the primary selling point of DataForge. All contributions must:
937
+
938
+ 1. **Never regress benchmarks** — run `uv run python benchmark.py --compare` before submitting
939
+ 2. **Use `__slots__`** on all classes
940
+ 3. **Use immutable tuples** for static data (never lists)
941
+ 4. **Implement batch paths** — every public method must accept `count=N` with an optimized batch code path
942
+ 5. **Use `@overload` triplets** for type narrowing (no args, `Literal[1]`, `int`)
943
+ 6. **Inline hot paths** — avoid unnecessary function calls in batch loops
944
+
945
+ ### Pull request process
946
+
947
+ 1. Fork the repository
948
+ 2. Create a feature branch
949
+ 3. Make your changes with conventional commit messages
950
+ 4. Ensure all tests pass and benchmarks don't regress
951
+ 5. Submit a PR using the provided template
952
+
953
+ ## Copy
954
+
955
+ Create an independent copy of a `DataForge` instance:
956
+
957
+ ```python
958
+ forge2 = forge.copy(seed=99) # new instance, same locale, different seed
959
+ forge3 = forge.copy() # new instance, same locale, no seed
960
+ ```
961
+
962
+ ## License
963
+
964
+ MIT