TestDataX 0.1.3__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {testdatax-0.1.3 → testdatax-0.2.0}/PKG-INFO +12 -7
  2. {testdatax-0.1.3 → testdatax-0.2.0}/README.md +8 -4
  3. {testdatax-0.1.3 → testdatax-0.2.0}/pyproject.toml +12 -6
  4. {testdatax-0.1.3 → testdatax-0.2.0}/src/__init__.py +1 -1
  5. {testdatax-0.1.3 → testdatax-0.2.0}/src/cli.py +19 -4
  6. {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/csv_exporter.py +0 -2
  7. {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/json_exporter.py +14 -8
  8. {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/mssql_exporter.py +10 -3
  9. {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/mysql_exporter.py +10 -3
  10. {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/oracle_exporter.py +10 -3
  11. {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/utils/constants.py +6 -0
  12. testdatax-0.2.0/src/exporters/utils/sql.py +18 -0
  13. testdatax-0.2.0/src/generator.py +175 -0
  14. testdatax-0.2.0/src/providers/base.py +153 -0
  15. testdatax-0.2.0/src/providers/faker_provider.py +114 -0
  16. testdatax-0.2.0/src/providers/mimesis_provider.py +153 -0
  17. testdatax-0.2.0/src/schemas.py +145 -0
  18. testdatax-0.1.3/src/generator.py +0 -117
  19. testdatax-0.1.3/src/providers/base.py +0 -58
  20. testdatax-0.1.3/src/providers/faker_provider.py +0 -65
  21. testdatax-0.1.3/src/providers/mimesis_provider.py +0 -87
  22. testdatax-0.1.3/src/schemas.py +0 -81
  23. {testdatax-0.1.3 → testdatax-0.2.0}/LICENSE +0 -0
  24. {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/__init__.py +0 -0
  25. {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/base_exporter.py +0 -0
  26. {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/orc_exporter.py +0 -0
  27. {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/parquet_exporter.py +0 -0
  28. {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/utils/__init__.py +0 -0
  29. {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/utils/chunker.py +0 -0
  30. {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/utils/exporter_config.py +0 -0
  31. {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/utils/formatters.py +0 -0
  32. {testdatax-0.1.3 → testdatax-0.2.0}/src/providers/__init__.py +0 -0
@@ -1,8 +1,9 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: TestDataX
3
- Version: 0.1.3
3
+ Version: 0.2.0
4
4
  Summary: A flexible test data generation toolkit
5
5
  License: MIT
6
+ License-File: LICENSE
6
7
  Author: JamesPBrett
7
8
  Requires-Python: >=3.11,<4.0
8
9
  Classifier: License :: OSI Approved :: MIT License
@@ -10,10 +11,10 @@ Classifier: Programming Language :: Python :: 3
10
11
  Classifier: Programming Language :: Python :: 3.11
11
12
  Classifier: Programming Language :: Python :: 3.12
12
13
  Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Programming Language :: Python :: 3.14
13
15
  Requires-Dist: faker (>=33.1.0,<34.0.0)
14
16
  Requires-Dist: mimesis (>=18.0.0,<19.0.0)
15
17
  Requires-Dist: mysql-connector-python (>=9.1.0,<10.0.0)
16
- Requires-Dist: orjson (>=3.10.12,<4.0.0)
17
18
  Requires-Dist: pandas (>=2.2.3,<3.0.0)
18
19
  Requires-Dist: pyarrow (>=18.1.0,<19.0.0)
19
20
  Requires-Dist: pydantic (>=2.10.4,<3.0.0)
@@ -22,8 +23,6 @@ Description-Content-Type: text/markdown
22
23
 
23
24
  # TestDataX
24
25
 
25
- # TestDataX
26
-
27
26
  ![Build Status](https://github.com/JamesPBrett/testdatax/actions/workflows/publish.yml/badge.svg)
28
27
  [![codecov](https://codecov.io/gh/JamesPBrett/testdatax/branch/main/graph/badge.svg?token=6VX62CI6U9)](https://codecov.io/gh/JamesPBrett/testdatax)
29
28
  ![Python Version](https://img.shields.io/badge/python-3.11%2B-blue)
@@ -42,7 +41,7 @@ pip install testdatax
42
41
 
43
42
  # Generate sample data
44
43
  testdatax --rows 1000 --format json --output data.json
45
-
44
+ ```
46
45
 
47
46
  ## Features
48
47
 
@@ -118,7 +117,7 @@ testdatax -o mstest.sql -f mssql -r 1000
118
117
 
119
118
  Generate an Oracle SQL file with the default row count (1000); the table name ('oracle') is taken from the output file name:
120
119
  ```bash
121
- datagen -o oracle.sql -f oracle -r 1000
120
+ testdatax -o oracle.sql -f oracle -r 1000
122
121
  ```
123
122
 
124
123
  Each command consists of:
@@ -224,6 +223,12 @@ The schema file defines the structure and constraints of your generated data. Ea
224
223
  }
225
224
  ```
226
225
 
226
+ > **Note:** `start_date`/`end_date` bound the generated range (inclusive). When
227
+ > `format` is set, date/datetime values are rendered to a string with
228
+ > `strftime`; for the SQL exporters this means the column receives a formatted
229
+ > string literal rather than a native date, so `format` is best suited to the
230
+ > CSV/JSON formats.
231
+
227
232
  #### Enum Fields
228
233
  ```json
229
234
  {
@@ -1,7 +1,5 @@
1
1
  # TestDataX
2
2
 
3
- # TestDataX
4
-
5
3
  ![Build Status](https://github.com/JamesPBrett/testdatax/actions/workflows/publish.yml/badge.svg)
6
4
  [![codecov](https://codecov.io/gh/JamesPBrett/testdatax/branch/main/graph/badge.svg?token=6VX62CI6U9)](https://codecov.io/gh/JamesPBrett/testdatax)
7
5
  ![Python Version](https://img.shields.io/badge/python-3.11%2B-blue)
@@ -20,7 +18,7 @@ pip install testdatax
20
18
 
21
19
  # Generate sample data
22
20
  testdatax --rows 1000 --format json --output data.json
23
-
21
+ ```
24
22
 
25
23
  ## Features
26
24
 
@@ -96,7 +94,7 @@ testdatax -o mstest.sql -f mssql -r 1000
96
94
 
97
95
  Generate an Oracle SQL file with the default row count (1000); the table name ('oracle') is taken from the output file name:
98
96
  ```bash
99
- datagen -o oracle.sql -f oracle -r 1000
97
+ testdatax -o oracle.sql -f oracle -r 1000
100
98
  ```
101
99
 
102
100
  Each command consists of:
@@ -202,6 +200,12 @@ The schema file defines the structure and constraints of your generated data. Ea
202
200
  }
203
201
  ```
204
202
 
203
+ > **Note:** `start_date`/`end_date` bound the generated range (inclusive). When
204
+ > `format` is set, date/datetime values are rendered to a string with
205
+ > `strftime`; for the SQL exporters this means the column receives a formatted
206
+ > string literal rather than a native date, so `format` is best suited to the
207
+ > CSV/JSON formats.
208
+
205
209
  #### Enum Fields
206
210
  ```json
207
211
  {
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "TestDataX"
3
- version = "0.1.3"
3
+ version = "0.2.0"
4
4
  description = "A flexible test data generation toolkit"
5
5
  authors = ["JamesPBrett"]
6
6
  license = "MIT"
@@ -12,7 +12,6 @@ python = "^3.11"
12
12
  typer = "^0.15.1"
13
13
  faker = "^33.1.0"
14
14
  pydantic = "^2.10.4"
15
- orjson = "^3.10.12"
16
15
  pyarrow = "^18.1.0"
17
16
  pandas = "^2.2.3"
18
17
  mysql-connector-python = "^9.1.0"
@@ -39,13 +38,11 @@ types-psutil = "^6.1.0.20241221"
39
38
  commitizen = "^3.13.0"
40
39
  python-semantic-release = "^9.17.0"
41
40
 
42
- [build-system]
43
- requires = ["poetry-core"]
44
- build-backend = "poetry.core.masonry.api"
45
41
 
46
42
  [tool.poetry.scripts]
47
43
  testdatax = "src.cli:app"
48
44
 
45
+
49
46
  [tool.ruff]
50
47
  # Same as Black
51
48
  line-length = 88
@@ -85,6 +82,7 @@ exclude = [
85
82
  [tool.ruff.lint.isort]
86
83
  known-first-party = ["src"]
87
84
 
85
+
88
86
  [tool.black]
89
87
  line-length = 88
90
88
  target-version = ['py311']
@@ -128,6 +126,7 @@ ignore_missing_imports = true
128
126
  module = "src.providers.mimesis_provider"
129
127
  warn_return_any = false
130
128
 
129
+
131
130
  [tool.coverage.run]
132
131
  source = ["src"]
133
132
  branch = true
@@ -141,15 +140,17 @@ exclude_lines = [
141
140
  "pass",
142
141
  ]
143
142
 
143
+
144
144
  [tool.pytest.ini_options]
145
145
  testpaths = ["tests"]
146
146
  python_files = ["test_*.py"]
147
147
  python_classes = ["Test*"]
148
148
  python_functions = ["test_*"]
149
149
 
150
+
150
151
  [tool.commitizen]
151
152
  name = "cz_conventional_commits"
152
- version = "0.1.0"
153
+ version = "0.1.3"
153
154
  tag_format = "v$version"
154
155
  version_files = [
155
156
  "src/__init__.py:__version__",
@@ -223,3 +224,8 @@ allowed_tags = [
223
224
  "chore", # Maintenance tasks
224
225
  "refactor", # Code changes without fixing bugs or adding features
225
226
  ]
227
+
228
+ [build-system]
229
+ requires = ["poetry-core"]
230
+ build-backend = "poetry.core.masonry.api"
231
+
@@ -1,6 +1,6 @@
1
1
  """TestDataX package initialization."""
2
2
 
3
- __version__ = "0.1.3"
3
+ __version__ = "0.2.0"
4
4
 
5
5
  from src.cli import app # noqa
6
6
 
@@ -102,16 +102,30 @@ def generate(
102
102
  f"{min_value}, {max_value}"
103
103
  )
104
104
 
105
+ # Accept "precision" as an alias for "right_digits"; use an
106
+ # explicit None check so an intentional 0 is not dropped.
107
+ right_digits = field_def.get("right_digits")
108
+ if right_digits is None:
109
+ right_digits = field_def.get("precision")
110
+
105
111
  field_schema = FieldSchema(
106
112
  name=name,
107
113
  type=field_type,
108
114
  enum_values=field_def.get("values"),
109
115
  min_value=min_value,
110
116
  max_value=max_value,
111
- right_digits=field_def.get("right_digits"),
117
+ right_digits=right_digits,
112
118
  value_provider=field_def.get("provider_field")
113
119
  or field_def.get("faker"),
114
120
  pattern=field_def.get("pattern"),
121
+ nullable=field_def.get("nullable", False),
122
+ unique=field_def.get("unique", False),
123
+ weights=field_def.get("weights"),
124
+ min_length=field_def.get("min_length"),
125
+ max_length=field_def.get("max_length"),
126
+ start_date=field_def.get("start_date"),
127
+ end_date=field_def.get("end_date"),
128
+ format=field_def.get("format"),
115
129
  )
116
130
  fields.append(field_schema.model_dump())
117
131
  else:
@@ -163,9 +177,10 @@ def generate(
163
177
  raise typer.Exit(code=1) from e
164
178
  except Exception as e:
165
179
  typer.echo(f"Error: {str(e)}", err=True)
166
- typer.echo(f"Exception type: {type(e).__name__}", err=True)
167
- typer.echo(f"Exception args: {e.args}", err=True)
168
- typer.echo(f"Traceback: {traceback.format_exc()}", err=True)
180
+ if debug:
181
+ typer.echo(f"Exception type: {type(e).__name__}", err=True)
182
+ typer.echo(f"Exception args: {e.args}", err=True)
183
+ typer.echo(f"Traceback: {traceback.format_exc()}", err=True)
169
184
  raise typer.Exit(code=1) from e
170
185
 
171
186
 
@@ -77,10 +77,8 @@ class CsvExporter(BaseExporter):
77
77
  fieldnames = list(data[0].keys())
78
78
 
79
79
  first_chunk = True
80
- formatted_rows = []
81
80
  for chunk in self.chunker.chunk_data(data):
82
81
  formatted_chunk = [self.formatter.format_row(row) for row in chunk]
83
- formatted_rows.extend(formatted_chunk)
84
82
  df = pd.DataFrame(formatted_chunk, columns=fieldnames)
85
83
 
86
84
  # Write the data to CSV in chunks
@@ -62,16 +62,22 @@ class JsonExporter(BaseExporter):
62
62
  raise ValueError(
63
63
  f"Field '{field}' in schema is not present in data."
64
64
  )
65
- # Format the data and write it in chunks to the output file
66
- all_formatted_rows = []
67
- for chunk in self.chunker.chunk_data(data):
68
- formatted_chunk = [self.formatter.format_row(row) for row in chunk]
69
- all_formatted_rows.extend(formatted_chunk)
70
65
 
71
- # Write the complete file with proper formatting using json.dumps
66
+ # Stream a valid JSON array to disk one chunk at a time so the whole
67
+ # dataset is never held in memory at once.
72
68
  with open(output_path, "w", encoding="utf-8") as f:
73
- json_str = json.dumps(all_formatted_rows, indent=4)
74
- f.write(json_str)
69
+ f.write("[")
70
+ first = True
71
+ for chunk in self.chunker.chunk_data(data):
72
+ for row in chunk:
73
+ formatted = self.formatter.format_row(row)
74
+ block = json.dumps(formatted, indent=4)
75
+ indented = "\n".join(
76
+ " " + line for line in block.splitlines()
77
+ )
78
+ f.write(("\n" if first else ",\n") + indented)
79
+ first = False
80
+ f.write("\n]" if not first else "]")
75
81
 
76
82
  logger.info(f"Successfully exported {len(data)} rows to {output_path}.")
77
83
 
@@ -1,10 +1,12 @@
1
1
  import decimal
2
2
  from datetime import date, datetime
3
+ from pathlib import Path
3
4
  from typing import Any
4
5
  from uuid import UUID
5
6
 
6
7
  from .base_exporter import BaseExporter
7
8
  from .utils.constants import DEFAULT_SCHEMA
9
+ from .utils.sql import escape_ansi_quotes
8
10
 
9
11
  MSSQL_TYPE_MAPPING = {
10
12
  "string": "NVARCHAR(255)", # Unicode string support
@@ -25,6 +27,11 @@ MSSQL_TYPE_MAPPING = {
25
27
  class MssqlExporter(BaseExporter):
26
28
  """Exports data to MSSQL compatible SQL file."""
27
29
 
30
+ @staticmethod
31
+ def _escape(value: str) -> str:
32
+ """Escape a string for a T-SQL single-quoted literal (quote doubling)."""
33
+ return escape_ansi_quotes(value)
34
+
28
35
  def _format_value(
29
36
  self,
30
37
  value: (
@@ -69,7 +76,7 @@ class MssqlExporter(BaseExporter):
69
76
  if value is None:
70
77
  return "NULL"
71
78
  elif isinstance(value, (str | UUID)):
72
- return "'" + str(value).replace("'", "\\'").replace("\n", "\\n") + "'"
79
+ return "'" + self._escape(str(value)) + "'"
73
80
  elif isinstance(value, (datetime | date)):
74
81
  return f"'{value.isoformat()}'"
75
82
  elif isinstance(value, bool):
@@ -141,7 +148,7 @@ class MssqlExporter(BaseExporter):
141
148
  and field_def.get("type") == "enum"
142
149
  and "values" in field_def
143
150
  ):
144
- values = "','".join(field_def["values"])
151
+ values = "','".join(self._escape(v) for v in field_def["values"])
145
152
  check_constraints.append(f"CHECK ({field_name} IN ('{values}'))")
146
153
 
147
154
  columns.append(f" {field_name} {sql_type} NULL")
@@ -188,7 +195,7 @@ class MssqlExporter(BaseExporter):
188
195
  if not data:
189
196
  return
190
197
 
191
- table_name = output_path.split("/")[-1].split(".")[0]
198
+ table_name = Path(output_path).name.split(".")[0]
192
199
 
193
200
  with open(output_path, "w") as f:
194
201
  # Always write CREATE TABLE using DEFAULT_SCHEMA if no schema provided
@@ -1,10 +1,12 @@
1
1
  import decimal
2
2
  from datetime import date, datetime
3
+ from pathlib import Path
3
4
  from typing import Any
4
5
  from uuid import UUID
5
6
 
6
7
  from .base_exporter import BaseExporter
7
8
  from .utils.constants import DEFAULT_SCHEMA
9
+ from .utils.sql import escape_mysql_literal
8
10
 
9
11
  MYSQL_TYPE_MAPPING = {
10
12
  "string": "VARCHAR(255)",
@@ -25,6 +27,11 @@ MYSQL_TYPE_MAPPING = {
25
27
  class MysqlExporter(BaseExporter):
26
28
  """Exports data to MySQL compatible SQL file."""
27
29
 
30
+ @staticmethod
31
+ def _escape(value: str) -> str:
32
+ """Escape a string for a MySQL single-quoted literal."""
33
+ return escape_mysql_literal(value)
34
+
28
35
  def _format_value(
29
36
  self,
30
37
  value: (
@@ -69,7 +76,7 @@ class MysqlExporter(BaseExporter):
69
76
  if value is None:
70
77
  return "NULL"
71
78
  elif isinstance(value, (str | UUID)):
72
- return "'" + str(value).replace("'", "\\'").replace("\n", "\\n") + "'"
79
+ return "'" + self._escape(str(value)) + "'"
73
80
  elif isinstance(value, (datetime | date)):
74
81
  return f"'{value.isoformat()}'"
75
82
  elif isinstance(value, bool):
@@ -138,7 +145,7 @@ class MysqlExporter(BaseExporter):
138
145
  and field_def.get("type") == "enum"
139
146
  and "values" in field_def
140
147
  ):
141
- values = "','".join(field_def["values"])
148
+ values = "','".join(self._escape(v) for v in field_def["values"])
142
149
  sql_type = f"ENUM('{values}')"
143
150
  columns.append(f" {field_name} {sql_type} NULL")
144
151
 
@@ -174,7 +181,7 @@ class MysqlExporter(BaseExporter):
174
181
  if not data:
175
182
  return
176
183
 
177
- table_name = output_path.split("/")[-1].split(".")[0]
184
+ table_name = Path(output_path).name.split(".")[0]
178
185
 
179
186
  with open(output_path, "w") as f:
180
187
  # Always write CREATE TABLE using DEFAULT_SCHEMA if no schema provided
@@ -1,10 +1,12 @@
1
1
  import decimal
2
2
  from datetime import date, datetime
3
+ from pathlib import Path
3
4
  from typing import Any
4
5
  from uuid import UUID
5
6
 
6
7
  from .base_exporter import BaseExporter
7
8
  from .utils.constants import DEFAULT_SCHEMA
9
+ from .utils.sql import escape_ansi_quotes
8
10
 
9
11
  ORACLE_TYPE_MAPPING = {
10
12
  "string": "VARCHAR2(255)", # Oracle's Unicode string type
@@ -25,6 +27,11 @@ ORACLE_TYPE_MAPPING = {
25
27
  class OracleExporter(BaseExporter):
26
28
  """Exports data to ORACLE compatible SQL file."""
27
29
 
30
+ @staticmethod
31
+ def _escape(value: str) -> str:
32
+ """Escape a string for an Oracle single-quoted literal (quote doubling)."""
33
+ return escape_ansi_quotes(value)
34
+
28
35
  def _format_value(
29
36
  self,
30
37
  value: (
@@ -69,7 +76,7 @@ class OracleExporter(BaseExporter):
69
76
  if value is None:
70
77
  return "NULL"
71
78
  elif isinstance(value, (str)):
72
- return "'" + str(value).replace("'", "''") + "'" # uses '' for escaping
79
+ return "'" + self._escape(str(value)) + "'" # uses '' for escaping
73
80
  elif isinstance(value, UUID):
74
81
  return f"'{str(value)}'"
75
82
  elif isinstance(value, datetime):
@@ -148,7 +155,7 @@ class OracleExporter(BaseExporter):
148
155
  and field_def.get("type") == "enum"
149
156
  and "values" in field_def
150
157
  ):
151
- values = "','".join(field_def["values"])
158
+ values = "','".join(self._escape(v) for v in field_def["values"])
152
159
  check_constraints.append(f"CHECK ({field_name} IN ('{values}'))")
153
160
 
154
161
  columns.append(f" {field_name} {sql_type} NULL")
@@ -195,7 +202,7 @@ class OracleExporter(BaseExporter):
195
202
  if not data:
196
203
  return
197
204
 
198
- table_name = output_path.split("/")[-1].split(".")[0]
205
+ table_name = Path(output_path).name.split(".")[0]
199
206
 
200
207
  with open(output_path, "w") as f:
201
208
  # Always write CREATE TABLE using DEFAULT_SCHEMA if no schema provided
@@ -11,6 +11,12 @@ CHUNK_SIZE_CSV = CHUNK_SIZE
11
11
  CHUNK_SIZE_PARQUET = CHUNK_SIZE
12
12
  CHUNK_SIZE_ORC = CHUNK_SIZE
13
13
 
14
+ # Probability that a nullable field emits NULL instead of a value
15
+ NULL_PROBABILITY = 0.1
16
+
17
+ # Maximum attempts to generate a unique value before giving up
18
+ MAX_UNIQUE_RETRIES = 1000
19
+
14
20
  # ORC compression types
15
21
  OrcCompression = Literal["UNCOMPRESSED", "SNAPPY", "ZLIB", "LZ4", "ZSTD"]
16
22
  OrcStrategy = Literal["SPEED", "COMPRESSION"]
@@ -0,0 +1,18 @@
1
+ """Shared SQL string-escaping helpers for the SQL exporters."""
2
+
3
+
4
+ def escape_ansi_quotes(value: str) -> str:
5
+ """Escape an ANSI single-quoted SQL literal by doubling embedded quotes.
6
+
7
+ Used by dialects without backslash escaping (MSSQL, Oracle).
8
+ """
9
+ return value.replace("'", "''")
10
+
11
+
12
+ def escape_mysql_literal(value: str) -> str:
13
+ """Escape a MySQL single-quoted literal.
14
+
15
+ The backslash is escaped first so a value containing a backslash cannot
16
+ terminate the literal early, then single quotes and newlines are escaped.
17
+ """
18
+ return value.replace("\\", "\\\\").replace("'", "\\'").replace("\n", "\\n")
@@ -0,0 +1,175 @@
1
+ import random
2
+ from datetime import date, datetime
3
+ from decimal import Decimal
4
+ from typing import TYPE_CHECKING, Any
5
+ from uuid import UUID
6
+
7
+ from .exporters.utils.constants import MAX_UNIQUE_RETRIES, NULL_PROBABILITY
8
+ from .providers import DataProvider, FakerProvider
9
+ from .schemas import DataType, FieldSchema
10
+
11
+ if TYPE_CHECKING:
12
+ from collections.abc import Callable
13
+
14
+ # Union of every value type the generator can produce for a single field.
15
+ GeneratedValue = (
16
+ str | int | float | Decimal | bool | date | datetime | bytes | UUID | None
17
+ )
18
+
19
+
20
+ class DataGenerator:
21
+ """The DataGenerator class generates synthetic data based on the provided field schemas.
22
+
23
+ Supported data types:
24
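+ - STRING: Generates names by default, or values from the field's value provider
25
+ - TEXT: Generates paragraphs of text
26
+ - INTEGER: Generates integers between 0 and 100 unless min_value/max_value are set
27
+ - BIGINT: Generates large integers between 0 and 9999999999 unless min_value/max_value are set
28
+ - FLOAT: Generates floating point numbers with 2 decimal places unless right_digits is set
29
+ - DECIMAL: Generates Decimal numbers with 2 decimal places unless right_digits is set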
30
+ - BOOLEAN: Generates True/False values
31
+ - DATE: Generates date objects
32
+ - DATETIME: Generates datetime objects
33
+ - BLOB: Generates binary data
34
+ - UUID: Generates UUID objects
35
+ - ENUM: Generates values from provided enum_values list
36
+
37
+ Cross-cutting field options (``nullable``, ``unique`` and ``format``) are applied
38
+ uniformly across every type during :meth:`generate_data`.
39
+
40
+ """ # noqa: E501
41
+
42
+ def __init__(self, provider: DataProvider | None = None) -> None:
43
+ """Initialize the DataGenerator with a provider and type generator map.
44
+
45
+ Args:
46
+ provider: The data provider used to generate values. Defaults to
47
+ :class:`FakerProvider` when not supplied.
48
+
49
+ """
50
+ self.provider = provider or FakerProvider()
51
+ self.type_generators: dict[
52
+ DataType, Callable[[FieldSchema], GeneratedValue]
53
+ ] = {
54
+ DataType.STRING: self._generate_string,
55
+ DataType.TEXT: self._generate_text,
56
+ DataType.INTEGER: self._generate_integer,
57
+ DataType.BIGINT: self._generate_bigint,
58
+ DataType.FLOAT: self._generate_float,
59
+ DataType.DECIMAL: self._generate_decimal,
60
+ DataType.BOOLEAN: self._generate_boolean,
61
+ DataType.DATE: self._generate_date,
62
+ DataType.DATETIME: self._generate_datetime,
63
+ DataType.BLOB: self._generate_blob,
64
+ DataType.UUID: self._generate_uuid,
65
+ DataType.ENUM: self._generate_enum,
66
+ }
67
+
68
+ def generate_data(
69
+ self, fields: list[FieldSchema], count: int
70
+ ) -> list[dict[str, Any]]:
71
+ """Generate data based on the provided schema and count."""
72
+ unique_seen: dict[str, set[GeneratedValue]] = {
73
+ field.name: set() for field in fields if field.unique
74
+ }
75
+ data = []
76
+ for _ in range(count):
77
+ row = {}
78
+ for field in fields:
79
+ row[field.name] = self._generate_field_value(field, unique_seen)
80
+ data.append(row)
81
+ return data
82
+
83
+ def _generate_field_value(
84
+ self, field: FieldSchema, unique_seen: dict[str, set[GeneratedValue]]
85
+ ) -> GeneratedValue:
86
+ """Produce a single value, honoring nullable/unique/format options."""
87
+ if field.unique:
88
+ seen = unique_seen[field.name]
89
+ for _ in range(MAX_UNIQUE_RETRIES):
90
+ value = self._produce_value(field)
91
+ if value is None or value not in seen:
92
+ if value is not None:
93
+ seen.add(value)
94
+ return self._apply_format(field, value)
95
+ raise ValueError(
96
+ f"Could not generate a unique value for field '{field.name}' "
97
+ f"after {MAX_UNIQUE_RETRIES} attempts"
98
+ )
99
+ return self._apply_format(field, self._produce_value(field))
100
+
101
+ def _produce_value(self, field: FieldSchema) -> GeneratedValue:
102
+ """Generate a raw value for a field, applying only the nullable option."""
103
+ if field.nullable and random.random() < NULL_PROBABILITY:
104
+ return None
105
+ return self.type_generators[field.type](field)
106
+
107
+ @staticmethod
108
+ def _apply_format(field: FieldSchema, value: GeneratedValue) -> GeneratedValue:
109
+ """Render date/datetime values via strftime when a format is set.
110
+
111
+ Applied after the uniqueness check so uniqueness is tracked on the raw
112
+ (high-cardinality) value rather than the formatted string.
113
+ """
114
+ if field.format and isinstance(value, (date | datetime)):
115
+ return value.strftime(field.format)
116
+ return value
117
+
118
+ def _generate_string(self, field: FieldSchema) -> str:
119
+ provider_value = str(field.value_provider) if field.value_provider else "name"
120
+ return self.provider.generate_string(
121
+ value_provider=provider_value,
122
+ min_length=field.min_length,
123
+ max_length=field.max_length,
124
+ )
125
+
126
+ def _generate_text(self, field: FieldSchema) -> str:
127
+ return self.provider.generate_text(
128
+ min_length=field.min_length, max_length=field.max_length
129
+ )
130
+
131
+ def _generate_integer(self, field: FieldSchema) -> int:
132
+ min_val = int(field.min_value) if field.min_value is not None else 0
133
+ max_val = int(field.max_value) if field.max_value is not None else 100
134
+ return self.provider.generate_integer(min_value=min_val, max_value=max_val)
135
+
136
+ def _generate_bigint(self, field: FieldSchema) -> int:
137
+ min_val = int(field.min_value) if field.min_value is not None else 0
138
+ max_val = int(field.max_value) if field.max_value is not None else 9999999999
139
+ return self.provider.generate_integer(min_value=min_val, max_value=max_val)
140
+
141
+ def _generate_float(self, field: FieldSchema) -> float:
142
+ return self.provider.generate_float(
143
+ min_value=field.min_value,
144
+ max_value=field.max_value,
145
+ right_digits=field.right_digits if field.right_digits is not None else 2,
146
+ )
147
+
148
+ def _generate_decimal(self, field: FieldSchema) -> Decimal:
149
+ return self.provider.generate_decimal(
150
+ min_value=field.min_value,
151
+ max_value=field.max_value,
152
+ right_digits=field.right_digits if field.right_digits is not None else 2,
153
+ )
154
+
155
+ def _generate_boolean(self, field: FieldSchema) -> bool:
156
+ return self.provider.generate_boolean()
157
+
158
+ def _generate_date(self, field: FieldSchema) -> date:
159
+ return self.provider.generate_date(start=field.start_date, end=field.end_date)
160
+
161
+ def _generate_datetime(self, field: FieldSchema) -> datetime:
162
+ return self.provider.generate_datetime(
163
+ start=field.start_date, end=field.end_date
164
+ )
165
+
166
+ def _generate_blob(self, field: FieldSchema) -> bytes:
167
+ return self.provider.generate_binary()
168
+
169
+ def _generate_uuid(self, field: FieldSchema) -> UUID:
170
+ return self.provider.generate_uuid()
171
+
172
+ def _generate_enum(self, field: FieldSchema) -> str:
173
+ if not field.enum_values:
174
+ raise ValueError(f"Enum field {field.name} must have values defined")
175
+ return self.provider.generate_enum(field.enum_values, weights=field.weights)