TestDataX 0.1.3__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {testdatax-0.1.3 → testdatax-0.2.0}/PKG-INFO +12 -7
- {testdatax-0.1.3 → testdatax-0.2.0}/README.md +8 -4
- {testdatax-0.1.3 → testdatax-0.2.0}/pyproject.toml +12 -6
- {testdatax-0.1.3 → testdatax-0.2.0}/src/__init__.py +1 -1
- {testdatax-0.1.3 → testdatax-0.2.0}/src/cli.py +19 -4
- {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/csv_exporter.py +0 -2
- {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/json_exporter.py +14 -8
- {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/mssql_exporter.py +10 -3
- {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/mysql_exporter.py +10 -3
- {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/oracle_exporter.py +10 -3
- {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/utils/constants.py +6 -0
- testdatax-0.2.0/src/exporters/utils/sql.py +18 -0
- testdatax-0.2.0/src/generator.py +175 -0
- testdatax-0.2.0/src/providers/base.py +153 -0
- testdatax-0.2.0/src/providers/faker_provider.py +114 -0
- testdatax-0.2.0/src/providers/mimesis_provider.py +153 -0
- testdatax-0.2.0/src/schemas.py +145 -0
- testdatax-0.1.3/src/generator.py +0 -117
- testdatax-0.1.3/src/providers/base.py +0 -58
- testdatax-0.1.3/src/providers/faker_provider.py +0 -65
- testdatax-0.1.3/src/providers/mimesis_provider.py +0 -87
- testdatax-0.1.3/src/schemas.py +0 -81
- {testdatax-0.1.3 → testdatax-0.2.0}/LICENSE +0 -0
- {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/__init__.py +0 -0
- {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/base_exporter.py +0 -0
- {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/orc_exporter.py +0 -0
- {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/parquet_exporter.py +0 -0
- {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/utils/__init__.py +0 -0
- {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/utils/chunker.py +0 -0
- {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/utils/exporter_config.py +0 -0
- {testdatax-0.1.3 → testdatax-0.2.0}/src/exporters/utils/formatters.py +0 -0
- {testdatax-0.1.3 → testdatax-0.2.0}/src/providers/__init__.py +0 -0
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: TestDataX
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: A flexible test data generation toolkit
|
|
5
5
|
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
6
7
|
Author: JamesPBrett
|
|
7
8
|
Requires-Python: >=3.11,<4.0
|
|
8
9
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -10,10 +11,10 @@ Classifier: Programming Language :: Python :: 3
|
|
|
10
11
|
Classifier: Programming Language :: Python :: 3.11
|
|
11
12
|
Classifier: Programming Language :: Python :: 3.12
|
|
12
13
|
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
13
15
|
Requires-Dist: faker (>=33.1.0,<34.0.0)
|
|
14
16
|
Requires-Dist: mimesis (>=18.0.0,<19.0.0)
|
|
15
17
|
Requires-Dist: mysql-connector-python (>=9.1.0,<10.0.0)
|
|
16
|
-
Requires-Dist: orjson (>=3.10.12,<4.0.0)
|
|
17
18
|
Requires-Dist: pandas (>=2.2.3,<3.0.0)
|
|
18
19
|
Requires-Dist: pyarrow (>=18.1.0,<19.0.0)
|
|
19
20
|
Requires-Dist: pydantic (>=2.10.4,<3.0.0)
|
|
@@ -22,8 +23,6 @@ Description-Content-Type: text/markdown
|
|
|
22
23
|
|
|
23
24
|
# TestDataX
|
|
24
25
|
|
|
25
|
-
# TestDataX
|
|
26
|
-
|
|
27
26
|

|
|
28
27
|
[](https://codecov.io/gh/JamesPBrett/testdatax)
|
|
29
28
|

|
|
@@ -42,7 +41,7 @@ pip install testdatax
|
|
|
42
41
|
|
|
43
42
|
# Generate sample data
|
|
44
43
|
testdatax --rows 1000 --format json --output data.json
|
|
45
|
-
|
|
44
|
+
```
|
|
46
45
|
|
|
47
46
|
## Features
|
|
48
47
|
|
|
@@ -118,7 +117,7 @@ testdatax -o mstest.sql -f mssql -r 1000
|
|
|
118
117
|
|
|
119
118
|
Generate Oracle with default row count (1000), table_name as 'oracle':
|
|
120
119
|
```bash
|
|
121
|
-
|
|
120
|
+
testdatax -o oracle.sql -f oracle -r 1000
|
|
122
121
|
```
|
|
123
122
|
|
|
124
123
|
Each command consists of:
|
|
@@ -224,6 +223,12 @@ The schema file defines the structure and constraints of your generated data. Ea
|
|
|
224
223
|
}
|
|
225
224
|
```
|
|
226
225
|
|
|
226
|
+
> **Note:** `start_date`/`end_date` bound the generated range (inclusive). When
|
|
227
|
+
> `format` is set, date/datetime values are rendered to a string with
|
|
228
|
+
> `strftime`; for the SQL exporters this means the column receives a formatted
|
|
229
|
+
> string literal rather than a native date, so `format` is best suited to the
|
|
230
|
+
> CSV/JSON formats.
|
|
231
|
+
|
|
227
232
|
#### Enum Fields
|
|
228
233
|
```json
|
|
229
234
|
{
|
|
@@ -1,7 +1,5 @@
|
|
|
1
1
|
# TestDataX
|
|
2
2
|
|
|
3
|
-
# TestDataX
|
|
4
|
-
|
|
5
3
|

|
|
6
4
|
[](https://codecov.io/gh/JamesPBrett/testdatax)
|
|
7
5
|

|
|
@@ -20,7 +18,7 @@ pip install testdatax
|
|
|
20
18
|
|
|
21
19
|
# Generate sample data
|
|
22
20
|
testdatax --rows 1000 --format json --output data.json
|
|
23
|
-
|
|
21
|
+
```
|
|
24
22
|
|
|
25
23
|
## Features
|
|
26
24
|
|
|
@@ -96,7 +94,7 @@ testdatax -o mstest.sql -f mssql -r 1000
|
|
|
96
94
|
|
|
97
95
|
Generate Oracle with default row count (1000), table_name as 'oracle':
|
|
98
96
|
```bash
|
|
99
|
-
|
|
97
|
+
testdatax -o oracle.sql -f oracle -r 1000
|
|
100
98
|
```
|
|
101
99
|
|
|
102
100
|
Each command consists of:
|
|
@@ -202,6 +200,12 @@ The schema file defines the structure and constraints of your generated data. Ea
|
|
|
202
200
|
}
|
|
203
201
|
```
|
|
204
202
|
|
|
203
|
+
> **Note:** `start_date`/`end_date` bound the generated range (inclusive). When
|
|
204
|
+
> `format` is set, date/datetime values are rendered to a string with
|
|
205
|
+
> `strftime`; for the SQL exporters this means the column receives a formatted
|
|
206
|
+
> string literal rather than a native date, so `format` is best suited to the
|
|
207
|
+
> CSV/JSON formats.
|
|
208
|
+
|
|
205
209
|
#### Enum Fields
|
|
206
210
|
```json
|
|
207
211
|
{
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "TestDataX"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.2.0"
|
|
4
4
|
description = "A flexible test data generation toolkit"
|
|
5
5
|
authors = ["JamesPBrett"]
|
|
6
6
|
license = "MIT"
|
|
@@ -12,7 +12,6 @@ python = "^3.11"
|
|
|
12
12
|
typer = "^0.15.1"
|
|
13
13
|
faker = "^33.1.0"
|
|
14
14
|
pydantic = "^2.10.4"
|
|
15
|
-
orjson = "^3.10.12"
|
|
16
15
|
pyarrow = "^18.1.0"
|
|
17
16
|
pandas = "^2.2.3"
|
|
18
17
|
mysql-connector-python = "^9.1.0"
|
|
@@ -39,13 +38,11 @@ types-psutil = "^6.1.0.20241221"
|
|
|
39
38
|
commitizen = "^3.13.0"
|
|
40
39
|
python-semantic-release = "^9.17.0"
|
|
41
40
|
|
|
42
|
-
[build-system]
|
|
43
|
-
requires = ["poetry-core"]
|
|
44
|
-
build-backend = "poetry.core.masonry.api"
|
|
45
41
|
|
|
46
42
|
[tool.poetry.scripts]
|
|
47
43
|
testdatax = "src.cli:app"
|
|
48
44
|
|
|
45
|
+
|
|
49
46
|
[tool.ruff]
|
|
50
47
|
# Same as Black
|
|
51
48
|
line-length = 88
|
|
@@ -85,6 +82,7 @@ exclude = [
|
|
|
85
82
|
[tool.ruff.lint.isort]
|
|
86
83
|
known-first-party = ["src"]
|
|
87
84
|
|
|
85
|
+
|
|
88
86
|
[tool.black]
|
|
89
87
|
line-length = 88
|
|
90
88
|
target-version = ['py311']
|
|
@@ -128,6 +126,7 @@ ignore_missing_imports = true
|
|
|
128
126
|
module = "src.providers.mimesis_provider"
|
|
129
127
|
warn_return_any = false
|
|
130
128
|
|
|
129
|
+
|
|
131
130
|
[tool.coverage.run]
|
|
132
131
|
source = ["src"]
|
|
133
132
|
branch = true
|
|
@@ -141,15 +140,17 @@ exclude_lines = [
|
|
|
141
140
|
"pass",
|
|
142
141
|
]
|
|
143
142
|
|
|
143
|
+
|
|
144
144
|
[tool.pytest.ini_options]
|
|
145
145
|
testpaths = ["tests"]
|
|
146
146
|
python_files = ["test_*.py"]
|
|
147
147
|
python_classes = ["Test*"]
|
|
148
148
|
python_functions = ["test_*"]
|
|
149
149
|
|
|
150
|
+
|
|
150
151
|
[tool.commitizen]
|
|
151
152
|
name = "cz_conventional_commits"
|
|
152
|
-
version = "0.1.
|
|
153
|
+
version = "0.1.3"
|
|
153
154
|
tag_format = "v$version"
|
|
154
155
|
version_files = [
|
|
155
156
|
"src/__init__.py:__version__",
|
|
@@ -223,3 +224,8 @@ allowed_tags = [
|
|
|
223
224
|
"chore", # Maintenance tasks
|
|
224
225
|
"refactor", # Code changes without fixing bugs or adding features
|
|
225
226
|
]
|
|
227
|
+
|
|
228
|
+
[build-system]
|
|
229
|
+
requires = ["poetry-core"]
|
|
230
|
+
build-backend = "poetry.core.masonry.api"
|
|
231
|
+
|
|
@@ -102,16 +102,30 @@ def generate(
|
|
|
102
102
|
f"{min_value}, {max_value}"
|
|
103
103
|
)
|
|
104
104
|
|
|
105
|
+
# Accept "precision" as an alias for "right_digits"; use an
|
|
106
|
+
# explicit None check so an intentional 0 is not dropped.
|
|
107
|
+
right_digits = field_def.get("right_digits")
|
|
108
|
+
if right_digits is None:
|
|
109
|
+
right_digits = field_def.get("precision")
|
|
110
|
+
|
|
105
111
|
field_schema = FieldSchema(
|
|
106
112
|
name=name,
|
|
107
113
|
type=field_type,
|
|
108
114
|
enum_values=field_def.get("values"),
|
|
109
115
|
min_value=min_value,
|
|
110
116
|
max_value=max_value,
|
|
111
|
-
right_digits=
|
|
117
|
+
right_digits=right_digits,
|
|
112
118
|
value_provider=field_def.get("provider_field")
|
|
113
119
|
or field_def.get("faker"),
|
|
114
120
|
pattern=field_def.get("pattern"),
|
|
121
|
+
nullable=field_def.get("nullable", False),
|
|
122
|
+
unique=field_def.get("unique", False),
|
|
123
|
+
weights=field_def.get("weights"),
|
|
124
|
+
min_length=field_def.get("min_length"),
|
|
125
|
+
max_length=field_def.get("max_length"),
|
|
126
|
+
start_date=field_def.get("start_date"),
|
|
127
|
+
end_date=field_def.get("end_date"),
|
|
128
|
+
format=field_def.get("format"),
|
|
115
129
|
)
|
|
116
130
|
fields.append(field_schema.model_dump())
|
|
117
131
|
else:
|
|
@@ -163,9 +177,10 @@ def generate(
|
|
|
163
177
|
raise typer.Exit(code=1) from e
|
|
164
178
|
except Exception as e:
|
|
165
179
|
typer.echo(f"Error: {str(e)}", err=True)
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
180
|
+
if debug:
|
|
181
|
+
typer.echo(f"Exception type: {type(e).__name__}", err=True)
|
|
182
|
+
typer.echo(f"Exception args: {e.args}", err=True)
|
|
183
|
+
typer.echo(f"Traceback: {traceback.format_exc()}", err=True)
|
|
169
184
|
raise typer.Exit(code=1) from e
|
|
170
185
|
|
|
171
186
|
|
|
@@ -77,10 +77,8 @@ class CsvExporter(BaseExporter):
|
|
|
77
77
|
fieldnames = list(data[0].keys())
|
|
78
78
|
|
|
79
79
|
first_chunk = True
|
|
80
|
-
formatted_rows = []
|
|
81
80
|
for chunk in self.chunker.chunk_data(data):
|
|
82
81
|
formatted_chunk = [self.formatter.format_row(row) for row in chunk]
|
|
83
|
-
formatted_rows.extend(formatted_chunk)
|
|
84
82
|
df = pd.DataFrame(formatted_chunk, columns=fieldnames)
|
|
85
83
|
|
|
86
84
|
# Write the data to CSV in chunks
|
|
@@ -62,16 +62,22 @@ class JsonExporter(BaseExporter):
|
|
|
62
62
|
raise ValueError(
|
|
63
63
|
f"Field '{field}' in schema is not present in data."
|
|
64
64
|
)
|
|
65
|
-
# Format the data and write it in chunks to the output file
|
|
66
|
-
all_formatted_rows = []
|
|
67
|
-
for chunk in self.chunker.chunk_data(data):
|
|
68
|
-
formatted_chunk = [self.formatter.format_row(row) for row in chunk]
|
|
69
|
-
all_formatted_rows.extend(formatted_chunk)
|
|
70
65
|
|
|
71
|
-
#
|
|
66
|
+
# Stream a valid JSON array to disk one chunk at a time so the whole
|
|
67
|
+
# dataset is never held in memory at once.
|
|
72
68
|
with open(output_path, "w", encoding="utf-8") as f:
|
|
73
|
-
|
|
74
|
-
|
|
69
|
+
f.write("[")
|
|
70
|
+
first = True
|
|
71
|
+
for chunk in self.chunker.chunk_data(data):
|
|
72
|
+
for row in chunk:
|
|
73
|
+
formatted = self.formatter.format_row(row)
|
|
74
|
+
block = json.dumps(formatted, indent=4)
|
|
75
|
+
indented = "\n".join(
|
|
76
|
+
" " + line for line in block.splitlines()
|
|
77
|
+
)
|
|
78
|
+
f.write(("\n" if first else ",\n") + indented)
|
|
79
|
+
first = False
|
|
80
|
+
f.write("\n]" if not first else "]")
|
|
75
81
|
|
|
76
82
|
logger.info(f"Successfully exported {len(data)} rows to {output_path}.")
|
|
77
83
|
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
import decimal
|
|
2
2
|
from datetime import date, datetime
|
|
3
|
+
from pathlib import Path
|
|
3
4
|
from typing import Any
|
|
4
5
|
from uuid import UUID
|
|
5
6
|
|
|
6
7
|
from .base_exporter import BaseExporter
|
|
7
8
|
from .utils.constants import DEFAULT_SCHEMA
|
|
9
|
+
from .utils.sql import escape_ansi_quotes
|
|
8
10
|
|
|
9
11
|
MSSQL_TYPE_MAPPING = {
|
|
10
12
|
"string": "NVARCHAR(255)", # Unicode string support
|
|
@@ -25,6 +27,11 @@ MSSQL_TYPE_MAPPING = {
|
|
|
25
27
|
class MssqlExporter(BaseExporter):
|
|
26
28
|
"""Exports data to MSSQL compatible SQL file."""
|
|
27
29
|
|
|
30
|
+
@staticmethod
|
|
31
|
+
def _escape(value: str) -> str:
|
|
32
|
+
"""Escape a string for a T-SQL single-quoted literal (quote doubling)."""
|
|
33
|
+
return escape_ansi_quotes(value)
|
|
34
|
+
|
|
28
35
|
def _format_value(
|
|
29
36
|
self,
|
|
30
37
|
value: (
|
|
@@ -69,7 +76,7 @@ class MssqlExporter(BaseExporter):
|
|
|
69
76
|
if value is None:
|
|
70
77
|
return "NULL"
|
|
71
78
|
elif isinstance(value, (str | UUID)):
|
|
72
|
-
return "'" + str(value)
|
|
79
|
+
return "'" + self._escape(str(value)) + "'"
|
|
73
80
|
elif isinstance(value, (datetime | date)):
|
|
74
81
|
return f"'{value.isoformat()}'"
|
|
75
82
|
elif isinstance(value, bool):
|
|
@@ -141,7 +148,7 @@ class MssqlExporter(BaseExporter):
|
|
|
141
148
|
and field_def.get("type") == "enum"
|
|
142
149
|
and "values" in field_def
|
|
143
150
|
):
|
|
144
|
-
values = "','".join(field_def["values"])
|
|
151
|
+
values = "','".join(self._escape(v) for v in field_def["values"])
|
|
145
152
|
check_constraints.append(f"CHECK ({field_name} IN ('{values}'))")
|
|
146
153
|
|
|
147
154
|
columns.append(f" {field_name} {sql_type} NULL")
|
|
@@ -188,7 +195,7 @@ class MssqlExporter(BaseExporter):
|
|
|
188
195
|
if not data:
|
|
189
196
|
return
|
|
190
197
|
|
|
191
|
-
table_name = output_path
|
|
198
|
+
table_name = Path(output_path).name.split(".")[0]
|
|
192
199
|
|
|
193
200
|
with open(output_path, "w") as f:
|
|
194
201
|
# Always write CREATE TABLE using DEFAULT_SCHEMA if no schema provided
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
import decimal
|
|
2
2
|
from datetime import date, datetime
|
|
3
|
+
from pathlib import Path
|
|
3
4
|
from typing import Any
|
|
4
5
|
from uuid import UUID
|
|
5
6
|
|
|
6
7
|
from .base_exporter import BaseExporter
|
|
7
8
|
from .utils.constants import DEFAULT_SCHEMA
|
|
9
|
+
from .utils.sql import escape_mysql_literal
|
|
8
10
|
|
|
9
11
|
MYSQL_TYPE_MAPPING = {
|
|
10
12
|
"string": "VARCHAR(255)",
|
|
@@ -25,6 +27,11 @@ MYSQL_TYPE_MAPPING = {
|
|
|
25
27
|
class MysqlExporter(BaseExporter):
|
|
26
28
|
"""Exports data to MySQL compatible SQL file."""
|
|
27
29
|
|
|
30
|
+
@staticmethod
|
|
31
|
+
def _escape(value: str) -> str:
|
|
32
|
+
"""Escape a string for a MySQL single-quoted literal."""
|
|
33
|
+
return escape_mysql_literal(value)
|
|
34
|
+
|
|
28
35
|
def _format_value(
|
|
29
36
|
self,
|
|
30
37
|
value: (
|
|
@@ -69,7 +76,7 @@ class MysqlExporter(BaseExporter):
|
|
|
69
76
|
if value is None:
|
|
70
77
|
return "NULL"
|
|
71
78
|
elif isinstance(value, (str | UUID)):
|
|
72
|
-
return "'" + str(value)
|
|
79
|
+
return "'" + self._escape(str(value)) + "'"
|
|
73
80
|
elif isinstance(value, (datetime | date)):
|
|
74
81
|
return f"'{value.isoformat()}'"
|
|
75
82
|
elif isinstance(value, bool):
|
|
@@ -138,7 +145,7 @@ class MysqlExporter(BaseExporter):
|
|
|
138
145
|
and field_def.get("type") == "enum"
|
|
139
146
|
and "values" in field_def
|
|
140
147
|
):
|
|
141
|
-
values = "','".join(field_def["values"])
|
|
148
|
+
values = "','".join(self._escape(v) for v in field_def["values"])
|
|
142
149
|
sql_type = f"ENUM('{values}')"
|
|
143
150
|
columns.append(f" {field_name} {sql_type} NULL")
|
|
144
151
|
|
|
@@ -174,7 +181,7 @@ class MysqlExporter(BaseExporter):
|
|
|
174
181
|
if not data:
|
|
175
182
|
return
|
|
176
183
|
|
|
177
|
-
table_name = output_path
|
|
184
|
+
table_name = Path(output_path).name.split(".")[0]
|
|
178
185
|
|
|
179
186
|
with open(output_path, "w") as f:
|
|
180
187
|
# Always write CREATE TABLE using DEFAULT_SCHEMA if no schema provided
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
import decimal
|
|
2
2
|
from datetime import date, datetime
|
|
3
|
+
from pathlib import Path
|
|
3
4
|
from typing import Any
|
|
4
5
|
from uuid import UUID
|
|
5
6
|
|
|
6
7
|
from .base_exporter import BaseExporter
|
|
7
8
|
from .utils.constants import DEFAULT_SCHEMA
|
|
9
|
+
from .utils.sql import escape_ansi_quotes
|
|
8
10
|
|
|
9
11
|
ORACLE_TYPE_MAPPING = {
|
|
10
12
|
"string": "VARCHAR2(255)", # Oracle's Unicode string type
|
|
@@ -25,6 +27,11 @@ ORACLE_TYPE_MAPPING = {
|
|
|
25
27
|
class OracleExporter(BaseExporter):
|
|
26
28
|
"""Exports data to ORACLE compatible SQL file."""
|
|
27
29
|
|
|
30
|
+
@staticmethod
|
|
31
|
+
def _escape(value: str) -> str:
|
|
32
|
+
"""Escape a string for an Oracle single-quoted literal (quote doubling)."""
|
|
33
|
+
return escape_ansi_quotes(value)
|
|
34
|
+
|
|
28
35
|
def _format_value(
|
|
29
36
|
self,
|
|
30
37
|
value: (
|
|
@@ -69,7 +76,7 @@ class OracleExporter(BaseExporter):
|
|
|
69
76
|
if value is None:
|
|
70
77
|
return "NULL"
|
|
71
78
|
elif isinstance(value, (str)):
|
|
72
|
-
return "'" + str(value)
|
|
79
|
+
return "'" + self._escape(str(value)) + "'" # uses '' for escaping
|
|
73
80
|
elif isinstance(value, UUID):
|
|
74
81
|
return f"'{str(value)}'"
|
|
75
82
|
elif isinstance(value, datetime):
|
|
@@ -148,7 +155,7 @@ class OracleExporter(BaseExporter):
|
|
|
148
155
|
and field_def.get("type") == "enum"
|
|
149
156
|
and "values" in field_def
|
|
150
157
|
):
|
|
151
|
-
values = "','".join(field_def["values"])
|
|
158
|
+
values = "','".join(self._escape(v) for v in field_def["values"])
|
|
152
159
|
check_constraints.append(f"CHECK ({field_name} IN ('{values}'))")
|
|
153
160
|
|
|
154
161
|
columns.append(f" {field_name} {sql_type} NULL")
|
|
@@ -195,7 +202,7 @@ class OracleExporter(BaseExporter):
|
|
|
195
202
|
if not data:
|
|
196
203
|
return
|
|
197
204
|
|
|
198
|
-
table_name = output_path
|
|
205
|
+
table_name = Path(output_path).name.split(".")[0]
|
|
199
206
|
|
|
200
207
|
with open(output_path, "w") as f:
|
|
201
208
|
# Always write CREATE TABLE using DEFAULT_SCHEMA if no schema provided
|
|
@@ -11,6 +11,12 @@ CHUNK_SIZE_CSV = CHUNK_SIZE
|
|
|
11
11
|
CHUNK_SIZE_PARQUET = CHUNK_SIZE
|
|
12
12
|
CHUNK_SIZE_ORC = CHUNK_SIZE
|
|
13
13
|
|
|
14
|
+
# Probability that a nullable field emits NULL instead of a value
|
|
15
|
+
NULL_PROBABILITY = 0.1
|
|
16
|
+
|
|
17
|
+
# Maximum attempts to generate a unique value before giving up
|
|
18
|
+
MAX_UNIQUE_RETRIES = 1000
|
|
19
|
+
|
|
14
20
|
# ORC compression types
|
|
15
21
|
OrcCompression = Literal["UNCOMPRESSED", "SNAPPY", "ZLIB", "LZ4", "ZSTD"]
|
|
16
22
|
OrcStrategy = Literal["SPEED", "COMPRESSION"]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Shared SQL string-escaping helpers for the SQL exporters."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def escape_ansi_quotes(value: str) -> str:
|
|
5
|
+
"""Escape an ANSI single-quoted SQL literal by doubling embedded quotes.
|
|
6
|
+
|
|
7
|
+
Used by dialects without backslash escaping (MSSQL, Oracle).
|
|
8
|
+
"""
|
|
9
|
+
return value.replace("'", "''")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def escape_mysql_literal(value: str) -> str:
|
|
13
|
+
"""Escape a MySQL single-quoted literal.
|
|
14
|
+
|
|
15
|
+
The backslash is escaped first so a value containing a backslash cannot
|
|
16
|
+
terminate the literal early, then single quotes and newlines are escaped.
|
|
17
|
+
"""
|
|
18
|
+
return value.replace("\\", "\\\\").replace("'", "\\'").replace("\n", "\\n")
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
import random
|
|
2
|
+
from datetime import date, datetime
|
|
3
|
+
from decimal import Decimal
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
5
|
+
from uuid import UUID
|
|
6
|
+
|
|
7
|
+
from .exporters.utils.constants import MAX_UNIQUE_RETRIES, NULL_PROBABILITY
|
|
8
|
+
from .providers import DataProvider, FakerProvider
|
|
9
|
+
from .schemas import DataType, FieldSchema
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from collections.abc import Callable
|
|
13
|
+
|
|
14
|
+
# Union of every value type the generator can produce for a single field.
|
|
15
|
+
GeneratedValue = (
|
|
16
|
+
str | int | float | Decimal | bool | date | datetime | bytes | UUID | None
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class DataGenerator:
|
|
21
|
+
"""The DataGenerator class generates synthetic data based on the provided field schemas.
|
|
22
|
+
|
|
23
|
+
Supported data types:
|
|
24
|
+
- STRING: Generates usernames
|
|
25
|
+
- TEXT: Generates paragraphs of text
|
|
26
|
+
- INTEGER: Generates integers between 0 and 100
|
|
27
|
+
- BIGINT: Generates large integers between 0 and 9999999999
|
|
28
|
+
- FLOAT: Generates floating point numbers with 2 decimal places
|
|
29
|
+
- DECIMAL: Generates Decimal numbers with 2 decimal places
|
|
30
|
+
- BOOLEAN: Generates True/False values
|
|
31
|
+
- DATE: Generates date objects
|
|
32
|
+
- DATETIME: Generates datetime objects
|
|
33
|
+
- BLOB: Generates binary data
|
|
34
|
+
- UUID: Generates UUID objects
|
|
35
|
+
- ENUM: Generates values from provided enum_values list
|
|
36
|
+
|
|
37
|
+
Cross-cutting field options (``nullable``, ``unique`` and ``format``) are applied
|
|
38
|
+
uniformly across every type during :meth:`generate_data`.
|
|
39
|
+
|
|
40
|
+
""" # noqa: E501
|
|
41
|
+
|
|
42
|
+
def __init__(self, provider: DataProvider | None = None) -> None:
|
|
43
|
+
"""Initialize the DataGenerator with a provider and type generator map.
|
|
44
|
+
|
|
45
|
+
Args:
|
|
46
|
+
provider: The data provider used to generate values. Defaults to
|
|
47
|
+
:class:`FakerProvider` when not supplied.
|
|
48
|
+
|
|
49
|
+
"""
|
|
50
|
+
self.provider = provider or FakerProvider()
|
|
51
|
+
self.type_generators: dict[
|
|
52
|
+
DataType, Callable[[FieldSchema], GeneratedValue]
|
|
53
|
+
] = {
|
|
54
|
+
DataType.STRING: self._generate_string,
|
|
55
|
+
DataType.TEXT: self._generate_text,
|
|
56
|
+
DataType.INTEGER: self._generate_integer,
|
|
57
|
+
DataType.BIGINT: self._generate_bigint,
|
|
58
|
+
DataType.FLOAT: self._generate_float,
|
|
59
|
+
DataType.DECIMAL: self._generate_decimal,
|
|
60
|
+
DataType.BOOLEAN: self._generate_boolean,
|
|
61
|
+
DataType.DATE: self._generate_date,
|
|
62
|
+
DataType.DATETIME: self._generate_datetime,
|
|
63
|
+
DataType.BLOB: self._generate_blob,
|
|
64
|
+
DataType.UUID: self._generate_uuid,
|
|
65
|
+
DataType.ENUM: self._generate_enum,
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
def generate_data(
|
|
69
|
+
self, fields: list[FieldSchema], count: int
|
|
70
|
+
) -> list[dict[str, Any]]:
|
|
71
|
+
"""Generate data based on the provided schema and count."""
|
|
72
|
+
unique_seen: dict[str, set[GeneratedValue]] = {
|
|
73
|
+
field.name: set() for field in fields if field.unique
|
|
74
|
+
}
|
|
75
|
+
data = []
|
|
76
|
+
for _ in range(count):
|
|
77
|
+
row = {}
|
|
78
|
+
for field in fields:
|
|
79
|
+
row[field.name] = self._generate_field_value(field, unique_seen)
|
|
80
|
+
data.append(row)
|
|
81
|
+
return data
|
|
82
|
+
|
|
83
|
+
def _generate_field_value(
|
|
84
|
+
self, field: FieldSchema, unique_seen: dict[str, set[GeneratedValue]]
|
|
85
|
+
) -> GeneratedValue:
|
|
86
|
+
"""Produce a single value, honoring nullable/unique/format options."""
|
|
87
|
+
if field.unique:
|
|
88
|
+
seen = unique_seen[field.name]
|
|
89
|
+
for _ in range(MAX_UNIQUE_RETRIES):
|
|
90
|
+
value = self._produce_value(field)
|
|
91
|
+
if value is None or value not in seen:
|
|
92
|
+
if value is not None:
|
|
93
|
+
seen.add(value)
|
|
94
|
+
return self._apply_format(field, value)
|
|
95
|
+
raise ValueError(
|
|
96
|
+
f"Could not generate a unique value for field '{field.name}' "
|
|
97
|
+
f"after {MAX_UNIQUE_RETRIES} attempts"
|
|
98
|
+
)
|
|
99
|
+
return self._apply_format(field, self._produce_value(field))
|
|
100
|
+
|
|
101
|
+
def _produce_value(self, field: FieldSchema) -> GeneratedValue:
|
|
102
|
+
"""Generate a raw value for a field, applying only the nullable option."""
|
|
103
|
+
if field.nullable and random.random() < NULL_PROBABILITY:
|
|
104
|
+
return None
|
|
105
|
+
return self.type_generators[field.type](field)
|
|
106
|
+
|
|
107
|
+
@staticmethod
|
|
108
|
+
def _apply_format(field: FieldSchema, value: GeneratedValue) -> GeneratedValue:
|
|
109
|
+
"""Render date/datetime values via strftime when a format is set.
|
|
110
|
+
|
|
111
|
+
Applied after the uniqueness check so uniqueness is tracked on the raw
|
|
112
|
+
(high-cardinality) value rather than the formatted string.
|
|
113
|
+
"""
|
|
114
|
+
if field.format and isinstance(value, (date | datetime)):
|
|
115
|
+
return value.strftime(field.format)
|
|
116
|
+
return value
|
|
117
|
+
|
|
118
|
+
def _generate_string(self, field: FieldSchema) -> str:
|
|
119
|
+
provider_value = str(field.value_provider) if field.value_provider else "name"
|
|
120
|
+
return self.provider.generate_string(
|
|
121
|
+
value_provider=provider_value,
|
|
122
|
+
min_length=field.min_length,
|
|
123
|
+
max_length=field.max_length,
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
def _generate_text(self, field: FieldSchema) -> str:
|
|
127
|
+
return self.provider.generate_text(
|
|
128
|
+
min_length=field.min_length, max_length=field.max_length
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
def _generate_integer(self, field: FieldSchema) -> int:
|
|
132
|
+
min_val = int(field.min_value) if field.min_value is not None else 0
|
|
133
|
+
max_val = int(field.max_value) if field.max_value is not None else 100
|
|
134
|
+
return self.provider.generate_integer(min_value=min_val, max_value=max_val)
|
|
135
|
+
|
|
136
|
+
def _generate_bigint(self, field: FieldSchema) -> int:
|
|
137
|
+
min_val = int(field.min_value) if field.min_value is not None else 0
|
|
138
|
+
max_val = int(field.max_value) if field.max_value is not None else 9999999999
|
|
139
|
+
return self.provider.generate_integer(min_value=min_val, max_value=max_val)
|
|
140
|
+
|
|
141
|
+
def _generate_float(self, field: FieldSchema) -> float:
|
|
142
|
+
return self.provider.generate_float(
|
|
143
|
+
min_value=field.min_value,
|
|
144
|
+
max_value=field.max_value,
|
|
145
|
+
right_digits=field.right_digits if field.right_digits is not None else 2,
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
def _generate_decimal(self, field: FieldSchema) -> Decimal:
|
|
149
|
+
return self.provider.generate_decimal(
|
|
150
|
+
min_value=field.min_value,
|
|
151
|
+
max_value=field.max_value,
|
|
152
|
+
right_digits=field.right_digits if field.right_digits is not None else 2,
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
def _generate_boolean(self, field: FieldSchema) -> bool:
|
|
156
|
+
return self.provider.generate_boolean()
|
|
157
|
+
|
|
158
|
+
def _generate_date(self, field: FieldSchema) -> date:
|
|
159
|
+
return self.provider.generate_date(start=field.start_date, end=field.end_date)
|
|
160
|
+
|
|
161
|
+
def _generate_datetime(self, field: FieldSchema) -> datetime:
|
|
162
|
+
return self.provider.generate_datetime(
|
|
163
|
+
start=field.start_date, end=field.end_date
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
def _generate_blob(self, field: FieldSchema) -> bytes:
|
|
167
|
+
return self.provider.generate_binary()
|
|
168
|
+
|
|
169
|
+
def _generate_uuid(self, field: FieldSchema) -> UUID:
|
|
170
|
+
return self.provider.generate_uuid()
|
|
171
|
+
|
|
172
|
+
def _generate_enum(self, field: FieldSchema) -> str:
|
|
173
|
+
if not field.enum_values:
|
|
174
|
+
raise ValueError(f"Enum field {field.name} must have values defined")
|
|
175
|
+
return self.provider.generate_enum(field.enum_values, weights=field.weights)
|