TestDataX 0.1.3__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {testdatax-0.1.3 → testdatax-0.2.1}/PKG-INFO +11 -7
- {testdatax-0.1.3 → testdatax-0.2.1}/README.md +7 -4
- {testdatax-0.1.3 → testdatax-0.2.1}/pyproject.toml +13 -6
- {testdatax-0.1.3 → testdatax-0.2.1}/src/__init__.py +1 -1
- {testdatax-0.1.3 → testdatax-0.2.1}/src/cli.py +19 -4
- {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/csv_exporter.py +5 -4
- {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/json_exporter.py +16 -9
- {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/mssql_exporter.py +23 -8
- {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/mysql_exporter.py +24 -7
- {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/oracle_exporter.py +23 -8
- {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/utils/constants.py +6 -0
- {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/utils/formatters.py +24 -4
- testdatax-0.2.1/src/exporters/utils/sql.py +38 -0
- testdatax-0.2.1/src/generator.py +168 -0
- testdatax-0.2.1/src/providers/base.py +153 -0
- testdatax-0.2.1/src/providers/faker_provider.py +114 -0
- testdatax-0.2.1/src/providers/mimesis_provider.py +153 -0
- testdatax-0.2.1/src/schemas.py +145 -0
- testdatax-0.1.3/src/generator.py +0 -117
- testdatax-0.1.3/src/providers/base.py +0 -58
- testdatax-0.1.3/src/providers/faker_provider.py +0 -65
- testdatax-0.1.3/src/providers/mimesis_provider.py +0 -87
- testdatax-0.1.3/src/schemas.py +0 -81
- {testdatax-0.1.3 → testdatax-0.2.1}/LICENSE +0 -0
- {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/__init__.py +0 -0
- {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/base_exporter.py +0 -0
- {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/orc_exporter.py +0 -0
- {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/parquet_exporter.py +0 -0
- {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/utils/__init__.py +0 -0
- {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/utils/chunker.py +0 -0
- {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/utils/exporter_config.py +0 -0
- {testdatax-0.1.3 → testdatax-0.2.1}/src/providers/__init__.py +0 -0
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: TestDataX
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: A flexible test data generation toolkit
|
|
5
5
|
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
6
7
|
Author: JamesPBrett
|
|
7
8
|
Requires-Python: >=3.11,<4.0
|
|
8
9
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -10,10 +11,10 @@ Classifier: Programming Language :: Python :: 3
|
|
|
10
11
|
Classifier: Programming Language :: Python :: 3.11
|
|
11
12
|
Classifier: Programming Language :: Python :: 3.12
|
|
12
13
|
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
13
15
|
Requires-Dist: faker (>=33.1.0,<34.0.0)
|
|
14
16
|
Requires-Dist: mimesis (>=18.0.0,<19.0.0)
|
|
15
17
|
Requires-Dist: mysql-connector-python (>=9.1.0,<10.0.0)
|
|
16
|
-
Requires-Dist: orjson (>=3.10.12,<4.0.0)
|
|
17
18
|
Requires-Dist: pandas (>=2.2.3,<3.0.0)
|
|
18
19
|
Requires-Dist: pyarrow (>=18.1.0,<19.0.0)
|
|
19
20
|
Requires-Dist: pydantic (>=2.10.4,<3.0.0)
|
|
@@ -22,8 +23,6 @@ Description-Content-Type: text/markdown
|
|
|
22
23
|
|
|
23
24
|
# TestDataX
|
|
24
25
|
|
|
25
|
-
# TestDataX
|
|
26
|
-
|
|
27
26
|

|
|
28
27
|
[](https://codecov.io/gh/JamesPBrett/testdatax)
|
|
29
28
|

|
|
@@ -42,7 +41,7 @@ pip install testdatax
|
|
|
42
41
|
|
|
43
42
|
# Generate sample data
|
|
44
43
|
testdatax --rows 1000 --format json --output data.json
|
|
45
|
-
|
|
44
|
+
```
|
|
46
45
|
|
|
47
46
|
## Features
|
|
48
47
|
|
|
@@ -118,7 +117,7 @@ testdatax -o mstest.sql -f mssql -r 1000
|
|
|
118
117
|
|
|
119
118
|
Generate Oracle with default row count (1000), table_name as 'oracle':
|
|
120
119
|
```bash
|
|
121
|
-
|
|
120
|
+
testdatax -o oracle.sql -f oracle -r 1000
|
|
122
121
|
```
|
|
123
122
|
|
|
124
123
|
Each command consists of:
|
|
@@ -224,6 +223,11 @@ The schema file defines the structure and constraints of your generated data. Ea
|
|
|
224
223
|
}
|
|
225
224
|
```
|
|
226
225
|
|
|
226
|
+
> **Note:** `start_date`/`end_date` bound the generated range (inclusive).
|
|
227
|
+
> `format` applies a `strftime` pattern to date/datetime values in the **CSV and
|
|
228
|
+
> JSON** outputs only; the SQL, Parquet and ORC exporters keep native date types
|
|
229
|
+
> and ignore `format`.
|
|
230
|
+
|
|
227
231
|
#### Enum Fields
|
|
228
232
|
```json
|
|
229
233
|
{
|
|
@@ -1,7 +1,5 @@
|
|
|
1
1
|
# TestDataX
|
|
2
2
|
|
|
3
|
-
# TestDataX
|
|
4
|
-
|
|
5
3
|

|
|
6
4
|
[](https://codecov.io/gh/JamesPBrett/testdatax)
|
|
7
5
|

|
|
@@ -20,7 +18,7 @@ pip install testdatax
|
|
|
20
18
|
|
|
21
19
|
# Generate sample data
|
|
22
20
|
testdatax --rows 1000 --format json --output data.json
|
|
23
|
-
|
|
21
|
+
```
|
|
24
22
|
|
|
25
23
|
## Features
|
|
26
24
|
|
|
@@ -96,7 +94,7 @@ testdatax -o mstest.sql -f mssql -r 1000
|
|
|
96
94
|
|
|
97
95
|
Generate Oracle with default row count (1000), table_name as 'oracle':
|
|
98
96
|
```bash
|
|
99
|
-
|
|
97
|
+
testdatax -o oracle.sql -f oracle -r 1000
|
|
100
98
|
```
|
|
101
99
|
|
|
102
100
|
Each command consists of:
|
|
@@ -202,6 +200,11 @@ The schema file defines the structure and constraints of your generated data. Ea
|
|
|
202
200
|
}
|
|
203
201
|
```
|
|
204
202
|
|
|
203
|
+
> **Note:** `start_date`/`end_date` bound the generated range (inclusive).
|
|
204
|
+
> `format` applies a `strftime` pattern to date/datetime values in the **CSV and
|
|
205
|
+
> JSON** outputs only; the SQL, Parquet and ORC exporters keep native date types
|
|
206
|
+
> and ignore `format`.
|
|
207
|
+
|
|
205
208
|
#### Enum Fields
|
|
206
209
|
```json
|
|
207
210
|
{
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "TestDataX"
|
|
3
|
-
version = "0.1.3"
|
|
3
|
+
version = "0.2.1"
|
|
4
4
|
description = "A flexible test data generation toolkit"
|
|
5
5
|
authors = ["JamesPBrett"]
|
|
6
6
|
license = "MIT"
|
|
@@ -12,7 +12,6 @@ python = "^3.11"
|
|
|
12
12
|
typer = "^0.15.1"
|
|
13
13
|
faker = "^33.1.0"
|
|
14
14
|
pydantic = "^2.10.4"
|
|
15
|
-
orjson = "^3.10.12"
|
|
16
15
|
pyarrow = "^18.1.0"
|
|
17
16
|
pandas = "^2.2.3"
|
|
18
17
|
mysql-connector-python = "^9.1.0"
|
|
@@ -39,13 +38,11 @@ types-psutil = "^6.1.0.20241221"
|
|
|
39
38
|
commitizen = "^3.13.0"
|
|
40
39
|
python-semantic-release = "^9.17.0"
|
|
41
40
|
|
|
42
|
-
[build-system]
|
|
43
|
-
requires = ["poetry-core"]
|
|
44
|
-
build-backend = "poetry.core.masonry.api"
|
|
45
41
|
|
|
46
42
|
[tool.poetry.scripts]
|
|
47
43
|
testdatax = "src.cli:app"
|
|
48
44
|
|
|
45
|
+
|
|
49
46
|
[tool.ruff]
|
|
50
47
|
# Same as Black
|
|
51
48
|
line-length = 88
|
|
@@ -85,6 +82,7 @@ exclude = [
|
|
|
85
82
|
[tool.ruff.lint.isort]
|
|
86
83
|
known-first-party = ["src"]
|
|
87
84
|
|
|
85
|
+
|
|
88
86
|
[tool.black]
|
|
89
87
|
line-length = 88
|
|
90
88
|
target-version = ['py311']
|
|
@@ -128,6 +126,7 @@ ignore_missing_imports = true
|
|
|
128
126
|
module = "src.providers.mimesis_provider"
|
|
129
127
|
warn_return_any = false
|
|
130
128
|
|
|
129
|
+
|
|
131
130
|
[tool.coverage.run]
|
|
132
131
|
source = ["src"]
|
|
133
132
|
branch = true
|
|
@@ -141,15 +140,17 @@ exclude_lines = [
|
|
|
141
140
|
"pass",
|
|
142
141
|
]
|
|
143
142
|
|
|
143
|
+
|
|
144
144
|
[tool.pytest.ini_options]
|
|
145
145
|
testpaths = ["tests"]
|
|
146
146
|
python_files = ["test_*.py"]
|
|
147
147
|
python_classes = ["Test*"]
|
|
148
148
|
python_functions = ["test_*"]
|
|
149
149
|
|
|
150
|
+
|
|
150
151
|
[tool.commitizen]
|
|
151
152
|
name = "cz_conventional_commits"
|
|
152
|
-
version = "0.1.
|
|
153
|
+
version = "0.1.3"
|
|
153
154
|
tag_format = "v$version"
|
|
154
155
|
version_files = [
|
|
155
156
|
"src/__init__.py:__version__",
|
|
@@ -223,3 +224,9 @@ allowed_tags = [
|
|
|
223
224
|
"chore", # Maintenance tasks
|
|
224
225
|
"refactor", # Code changes without fixing bugs or adding features
|
|
225
226
|
]
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
[build-system]
|
|
230
|
+
requires = ["poetry-core"]
|
|
231
|
+
build-backend = "poetry.core.masonry.api"
|
|
232
|
+
|
|
@@ -102,16 +102,30 @@ def generate(
|
|
|
102
102
|
f"{min_value}, {max_value}"
|
|
103
103
|
)
|
|
104
104
|
|
|
105
|
+
# Accept "precision" as an alias for "right_digits"; use an
|
|
106
|
+
# explicit None check so an intentional 0 is not dropped.
|
|
107
|
+
right_digits = field_def.get("right_digits")
|
|
108
|
+
if right_digits is None:
|
|
109
|
+
right_digits = field_def.get("precision")
|
|
110
|
+
|
|
105
111
|
field_schema = FieldSchema(
|
|
106
112
|
name=name,
|
|
107
113
|
type=field_type,
|
|
108
114
|
enum_values=field_def.get("values"),
|
|
109
115
|
min_value=min_value,
|
|
110
116
|
max_value=max_value,
|
|
111
|
-
right_digits=
|
|
117
|
+
right_digits=right_digits,
|
|
112
118
|
value_provider=field_def.get("provider_field")
|
|
113
119
|
or field_def.get("faker"),
|
|
114
120
|
pattern=field_def.get("pattern"),
|
|
121
|
+
nullable=field_def.get("nullable", False),
|
|
122
|
+
unique=field_def.get("unique", False),
|
|
123
|
+
weights=field_def.get("weights"),
|
|
124
|
+
min_length=field_def.get("min_length"),
|
|
125
|
+
max_length=field_def.get("max_length"),
|
|
126
|
+
start_date=field_def.get("start_date"),
|
|
127
|
+
end_date=field_def.get("end_date"),
|
|
128
|
+
format=field_def.get("format"),
|
|
115
129
|
)
|
|
116
130
|
fields.append(field_schema.model_dump())
|
|
117
131
|
else:
|
|
@@ -163,9 +177,10 @@ def generate(
|
|
|
163
177
|
raise typer.Exit(code=1) from e
|
|
164
178
|
except Exception as e:
|
|
165
179
|
typer.echo(f"Error: {str(e)}", err=True)
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
180
|
+
if debug:
|
|
181
|
+
typer.echo(f"Exception type: {type(e).__name__}", err=True)
|
|
182
|
+
typer.echo(f"Exception args: {e.args}", err=True)
|
|
183
|
+
typer.echo(f"Traceback: {traceback.format_exc()}", err=True)
|
|
169
184
|
raise typer.Exit(code=1) from e
|
|
170
185
|
|
|
171
186
|
|
|
@@ -7,7 +7,7 @@ import pandas as pd
|
|
|
7
7
|
from .base_exporter import BaseExporter
|
|
8
8
|
from .utils.chunker import DataChunker
|
|
9
9
|
from .utils.constants import CHUNK_SIZE_CSV
|
|
10
|
-
from .utils.formatters import CSVFormatter
|
|
10
|
+
from .utils.formatters import CSVFormatter, extract_formats
|
|
11
11
|
|
|
12
12
|
logger = logging.getLogger(__name__)
|
|
13
13
|
|
|
@@ -76,11 +76,12 @@ class CsvExporter(BaseExporter):
|
|
|
76
76
|
else:
|
|
77
77
|
fieldnames = list(data[0].keys())
|
|
78
78
|
|
|
79
|
+
formats = extract_formats(schema)
|
|
79
80
|
first_chunk = True
|
|
80
|
-
formatted_rows = []
|
|
81
81
|
for chunk in self.chunker.chunk_data(data):
|
|
82
|
-
formatted_chunk = [
|
|
83
|
-
|
|
82
|
+
formatted_chunk = [
|
|
83
|
+
self.formatter.format_row(row, formats) for row in chunk
|
|
84
|
+
]
|
|
84
85
|
df = pd.DataFrame(formatted_chunk, columns=fieldnames)
|
|
85
86
|
|
|
86
87
|
# Write the data to CSV in chunks
|
|
@@ -5,7 +5,7 @@ from typing import Any
|
|
|
5
5
|
from .base_exporter import BaseExporter
|
|
6
6
|
from .utils.chunker import DataChunker
|
|
7
7
|
from .utils.constants import CHUNK_SIZE_JSON
|
|
8
|
-
from .utils.formatters import JSONFormatter
|
|
8
|
+
from .utils.formatters import JSONFormatter, extract_formats
|
|
9
9
|
|
|
10
10
|
logger = logging.getLogger(__name__)
|
|
11
11
|
|
|
@@ -62,16 +62,23 @@ class JsonExporter(BaseExporter):
|
|
|
62
62
|
raise ValueError(
|
|
63
63
|
f"Field '{field}' in schema is not present in data."
|
|
64
64
|
)
|
|
65
|
-
# Format the data and write it in chunks to the output file
|
|
66
|
-
all_formatted_rows = []
|
|
67
|
-
for chunk in self.chunker.chunk_data(data):
|
|
68
|
-
formatted_chunk = [self.formatter.format_row(row) for row in chunk]
|
|
69
|
-
all_formatted_rows.extend(formatted_chunk)
|
|
70
65
|
|
|
71
|
-
#
|
|
66
|
+
# Stream a valid JSON array to disk one chunk at a time so the whole
|
|
67
|
+
# dataset is never held in memory at once.
|
|
68
|
+
formats = extract_formats(schema)
|
|
72
69
|
with open(output_path, "w", encoding="utf-8") as f:
|
|
73
|
-
|
|
74
|
-
|
|
70
|
+
f.write("[")
|
|
71
|
+
first = True
|
|
72
|
+
for chunk in self.chunker.chunk_data(data):
|
|
73
|
+
for row in chunk:
|
|
74
|
+
formatted = self.formatter.format_row(row, formats)
|
|
75
|
+
block = json.dumps(formatted, indent=4)
|
|
76
|
+
indented = "\n".join(
|
|
77
|
+
" " + line for line in block.splitlines()
|
|
78
|
+
)
|
|
79
|
+
f.write(("\n" if first else ",\n") + indented)
|
|
80
|
+
first = False
|
|
81
|
+
f.write("\n]" if not first else "]")
|
|
75
82
|
|
|
76
83
|
logger.info(f"Successfully exported {len(data)} rows to {output_path}.")
|
|
77
84
|
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
import decimal
|
|
2
2
|
from datetime import date, datetime
|
|
3
|
+
from pathlib import Path
|
|
3
4
|
from typing import Any
|
|
4
5
|
from uuid import UUID
|
|
5
6
|
|
|
6
7
|
from .base_exporter import BaseExporter
|
|
7
8
|
from .utils.constants import DEFAULT_SCHEMA
|
|
9
|
+
from .utils.sql import escape_ansi_quotes, quote_mssql_ident
|
|
8
10
|
|
|
9
11
|
MSSQL_TYPE_MAPPING = {
|
|
10
12
|
"string": "NVARCHAR(255)", # Unicode string support
|
|
@@ -25,6 +27,16 @@ MSSQL_TYPE_MAPPING = {
|
|
|
25
27
|
class MssqlExporter(BaseExporter):
|
|
26
28
|
"""Exports data to MSSQL compatible SQL file."""
|
|
27
29
|
|
|
30
|
+
@staticmethod
|
|
31
|
+
def _escape(value: str) -> str:
|
|
32
|
+
"""Escape a string for a T-SQL single-quoted literal (quote doubling)."""
|
|
33
|
+
return escape_ansi_quotes(value)
|
|
34
|
+
|
|
35
|
+
@staticmethod
|
|
36
|
+
def _quote_ident(name: str) -> str:
|
|
37
|
+
"""Quote a T-SQL identifier (table or column name) with brackets."""
|
|
38
|
+
return quote_mssql_ident(name)
|
|
39
|
+
|
|
28
40
|
def _format_value(
|
|
29
41
|
self,
|
|
30
42
|
value: (
|
|
@@ -69,7 +81,7 @@ class MssqlExporter(BaseExporter):
|
|
|
69
81
|
if value is None:
|
|
70
82
|
return "NULL"
|
|
71
83
|
elif isinstance(value, (str | UUID)):
|
|
72
|
-
return "'" + str(value)
|
|
84
|
+
return "'" + self._escape(str(value)) + "'"
|
|
73
85
|
elif isinstance(value, (datetime | date)):
|
|
74
86
|
return f"'{value.isoformat()}'"
|
|
75
87
|
elif isinstance(value, bool):
|
|
@@ -141,14 +153,16 @@ class MssqlExporter(BaseExporter):
|
|
|
141
153
|
and field_def.get("type") == "enum"
|
|
142
154
|
and "values" in field_def
|
|
143
155
|
):
|
|
144
|
-
values = "','".join(field_def["values"])
|
|
145
|
-
check_constraints.append(
|
|
156
|
+
values = "','".join(self._escape(v) for v in field_def["values"])
|
|
157
|
+
check_constraints.append(
|
|
158
|
+
f"CHECK ({self._quote_ident(field_name)} IN ('{values}'))"
|
|
159
|
+
)
|
|
146
160
|
|
|
147
|
-
columns.append(f" {field_name} {sql_type} NULL")
|
|
161
|
+
columns.append(f" {self._quote_ident(field_name)} {sql_type} NULL")
|
|
148
162
|
|
|
149
163
|
# Combine columns and check constraints
|
|
150
164
|
return (
|
|
151
|
-
f"CREATE TABLE {table_name} (\n"
|
|
165
|
+
f"CREATE TABLE {self._quote_ident(table_name)} (\n"
|
|
152
166
|
+ ",\n".join(columns)
|
|
153
167
|
+ (
|
|
154
168
|
(",\n " + ",\n ".join(check_constraints))
|
|
@@ -177,9 +191,10 @@ class MssqlExporter(BaseExporter):
|
|
|
177
191
|
'INSERT INTO users (id, name) VALUES (1, "test");'
|
|
178
192
|
|
|
179
193
|
"""
|
|
180
|
-
columns = ", ".join(row.keys())
|
|
194
|
+
columns = ", ".join(self._quote_ident(c) for c in row.keys())
|
|
181
195
|
values = ", ".join(self._format_value(v) for v in row.values())
|
|
182
|
-
|
|
196
|
+
table = self._quote_ident(table_name)
|
|
197
|
+
return f"INSERT INTO {table} ({columns}) VALUES ({values});"
|
|
183
198
|
|
|
184
199
|
def export(
|
|
185
200
|
self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
|
|
@@ -188,7 +203,7 @@ class MssqlExporter(BaseExporter):
|
|
|
188
203
|
if not data:
|
|
189
204
|
return
|
|
190
205
|
|
|
191
|
-
table_name = output_path
|
|
206
|
+
table_name = Path(output_path).name.split(".")[0]
|
|
192
207
|
|
|
193
208
|
with open(output_path, "w") as f:
|
|
194
209
|
# Always write CREATE TABLE using DEFAULT_SCHEMA if no schema provided
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
import decimal
|
|
2
2
|
from datetime import date, datetime
|
|
3
|
+
from pathlib import Path
|
|
3
4
|
from typing import Any
|
|
4
5
|
from uuid import UUID
|
|
5
6
|
|
|
6
7
|
from .base_exporter import BaseExporter
|
|
7
8
|
from .utils.constants import DEFAULT_SCHEMA
|
|
9
|
+
from .utils.sql import escape_mysql_literal, quote_mysql_ident
|
|
8
10
|
|
|
9
11
|
MYSQL_TYPE_MAPPING = {
|
|
10
12
|
"string": "VARCHAR(255)",
|
|
@@ -25,6 +27,16 @@ MYSQL_TYPE_MAPPING = {
|
|
|
25
27
|
class MysqlExporter(BaseExporter):
|
|
26
28
|
"""Exports data to MySQL compatible SQL file."""
|
|
27
29
|
|
|
30
|
+
@staticmethod
|
|
31
|
+
def _escape(value: str) -> str:
|
|
32
|
+
"""Escape a string for a MySQL single-quoted literal."""
|
|
33
|
+
return escape_mysql_literal(value)
|
|
34
|
+
|
|
35
|
+
@staticmethod
|
|
36
|
+
def _quote_ident(name: str) -> str:
|
|
37
|
+
"""Quote a MySQL identifier (table or column name)."""
|
|
38
|
+
return quote_mysql_ident(name)
|
|
39
|
+
|
|
28
40
|
def _format_value(
|
|
29
41
|
self,
|
|
30
42
|
value: (
|
|
@@ -69,7 +81,7 @@ class MysqlExporter(BaseExporter):
|
|
|
69
81
|
if value is None:
|
|
70
82
|
return "NULL"
|
|
71
83
|
elif isinstance(value, (str | UUID)):
|
|
72
|
-
return "'" + str(value)
|
|
84
|
+
return "'" + self._escape(str(value)) + "'"
|
|
73
85
|
elif isinstance(value, (datetime | date)):
|
|
74
86
|
return f"'{value.isoformat()}'"
|
|
75
87
|
elif isinstance(value, bool):
|
|
@@ -138,11 +150,15 @@ class MysqlExporter(BaseExporter):
|
|
|
138
150
|
and field_def.get("type") == "enum"
|
|
139
151
|
and "values" in field_def
|
|
140
152
|
):
|
|
141
|
-
values = "','".join(field_def["values"])
|
|
153
|
+
values = "','".join(self._escape(v) for v in field_def["values"])
|
|
142
154
|
sql_type = f"ENUM('{values}')"
|
|
143
|
-
columns.append(f" {field_name} {sql_type} NULL")
|
|
155
|
+
columns.append(f" {self._quote_ident(field_name)} {sql_type} NULL")
|
|
144
156
|
|
|
145
|
-
return
|
|
157
|
+
return (
|
|
158
|
+
f"CREATE TABLE {self._quote_ident(table_name)} (\n"
|
|
159
|
+
+ ",\n".join(columns)
|
|
160
|
+
+ "\n);\n\n"
|
|
161
|
+
)
|
|
146
162
|
|
|
147
163
|
def _create_insert_stmt(
|
|
148
164
|
self, row: dict[str, Any], table_name: str = "output"
|
|
@@ -163,9 +179,10 @@ class MysqlExporter(BaseExporter):
|
|
|
163
179
|
'INSERT INTO users (id, name) VALUES (1, "test");'
|
|
164
180
|
|
|
165
181
|
"""
|
|
166
|
-
columns = ", ".join(row.keys())
|
|
182
|
+
columns = ", ".join(self._quote_ident(c) for c in row.keys())
|
|
167
183
|
values = ", ".join(self._format_value(v) for v in row.values())
|
|
168
|
-
|
|
184
|
+
table = self._quote_ident(table_name)
|
|
185
|
+
return f"INSERT INTO {table} ({columns}) VALUES ({values});"
|
|
169
186
|
|
|
170
187
|
def export(
|
|
171
188
|
self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
|
|
@@ -174,7 +191,7 @@ class MysqlExporter(BaseExporter):
|
|
|
174
191
|
if not data:
|
|
175
192
|
return
|
|
176
193
|
|
|
177
|
-
table_name = output_path
|
|
194
|
+
table_name = Path(output_path).name.split(".")[0]
|
|
178
195
|
|
|
179
196
|
with open(output_path, "w") as f:
|
|
180
197
|
# Always write CREATE TABLE using DEFAULT_SCHEMA if no schema provided
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
import decimal
|
|
2
2
|
from datetime import date, datetime
|
|
3
|
+
from pathlib import Path
|
|
3
4
|
from typing import Any
|
|
4
5
|
from uuid import UUID
|
|
5
6
|
|
|
6
7
|
from .base_exporter import BaseExporter
|
|
7
8
|
from .utils.constants import DEFAULT_SCHEMA
|
|
9
|
+
from .utils.sql import escape_ansi_quotes, quote_ansi_ident
|
|
8
10
|
|
|
9
11
|
ORACLE_TYPE_MAPPING = {
|
|
10
12
|
"string": "VARCHAR2(255)", # Oracle's Unicode string type
|
|
@@ -25,6 +27,16 @@ ORACLE_TYPE_MAPPING = {
|
|
|
25
27
|
class OracleExporter(BaseExporter):
|
|
26
28
|
"""Exports data to ORACLE compatible SQL file."""
|
|
27
29
|
|
|
30
|
+
@staticmethod
|
|
31
|
+
def _escape(value: str) -> str:
|
|
32
|
+
"""Escape a string for an Oracle single-quoted literal (quote doubling)."""
|
|
33
|
+
return escape_ansi_quotes(value)
|
|
34
|
+
|
|
35
|
+
@staticmethod
|
|
36
|
+
def _quote_ident(name: str) -> str:
|
|
37
|
+
"""Quote an Oracle identifier (table or column name) with double quotes."""
|
|
38
|
+
return quote_ansi_ident(name)
|
|
39
|
+
|
|
28
40
|
def _format_value(
|
|
29
41
|
self,
|
|
30
42
|
value: (
|
|
@@ -69,7 +81,7 @@ class OracleExporter(BaseExporter):
|
|
|
69
81
|
if value is None:
|
|
70
82
|
return "NULL"
|
|
71
83
|
elif isinstance(value, (str)):
|
|
72
|
-
return "'" + str(value)
|
|
84
|
+
return "'" + self._escape(str(value)) + "'" # uses '' for escaping
|
|
73
85
|
elif isinstance(value, UUID):
|
|
74
86
|
return f"'{str(value)}'"
|
|
75
87
|
elif isinstance(value, datetime):
|
|
@@ -148,14 +160,16 @@ class OracleExporter(BaseExporter):
|
|
|
148
160
|
and field_def.get("type") == "enum"
|
|
149
161
|
and "values" in field_def
|
|
150
162
|
):
|
|
151
|
-
values = "','".join(field_def["values"])
|
|
152
|
-
check_constraints.append(
|
|
163
|
+
values = "','".join(self._escape(v) for v in field_def["values"])
|
|
164
|
+
check_constraints.append(
|
|
165
|
+
f"CHECK ({self._quote_ident(field_name)} IN ('{values}'))"
|
|
166
|
+
)
|
|
153
167
|
|
|
154
|
-
columns.append(f" {field_name} {sql_type} NULL")
|
|
168
|
+
columns.append(f" {self._quote_ident(field_name)} {sql_type} NULL")
|
|
155
169
|
|
|
156
170
|
# Combine columns and check constraints
|
|
157
171
|
return (
|
|
158
|
-
f"CREATE TABLE {table_name} (\n"
|
|
172
|
+
f"CREATE TABLE {self._quote_ident(table_name)} (\n"
|
|
159
173
|
+ ",\n".join(columns)
|
|
160
174
|
+ (
|
|
161
175
|
(",\n " + ",\n ".join(check_constraints))
|
|
@@ -184,9 +198,10 @@ class OracleExporter(BaseExporter):
|
|
|
184
198
|
'INSERT INTO users (id, name) VALUES (1, "test");'
|
|
185
199
|
|
|
186
200
|
"""
|
|
187
|
-
columns = ", ".join(row.keys())
|
|
201
|
+
columns = ", ".join(self._quote_ident(c) for c in row.keys())
|
|
188
202
|
values = ", ".join(self._format_value(v) for v in row.values())
|
|
189
|
-
|
|
203
|
+
table = self._quote_ident(table_name)
|
|
204
|
+
return f"INSERT INTO {table} ({columns}) VALUES ({values});"
|
|
190
205
|
|
|
191
206
|
def export(
|
|
192
207
|
self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
|
|
@@ -195,7 +210,7 @@ class OracleExporter(BaseExporter):
|
|
|
195
210
|
if not data:
|
|
196
211
|
return
|
|
197
212
|
|
|
198
|
-
table_name = output_path
|
|
213
|
+
table_name = Path(output_path).name.split(".")[0]
|
|
199
214
|
|
|
200
215
|
with open(output_path, "w") as f:
|
|
201
216
|
# Always write CREATE TABLE using DEFAULT_SCHEMA if no schema provided
|
|
@@ -11,6 +11,12 @@ CHUNK_SIZE_CSV = CHUNK_SIZE
|
|
|
11
11
|
CHUNK_SIZE_PARQUET = CHUNK_SIZE
|
|
12
12
|
CHUNK_SIZE_ORC = CHUNK_SIZE
|
|
13
13
|
|
|
14
|
+
# Probability that a nullable field emits NULL instead of a value
|
|
15
|
+
NULL_PROBABILITY = 0.1
|
|
16
|
+
|
|
17
|
+
# Maximum attempts to generate a unique value before giving up
|
|
18
|
+
MAX_UNIQUE_RETRIES = 1000
|
|
19
|
+
|
|
14
20
|
# ORC compression types
|
|
15
21
|
OrcCompression = Literal["UNCOMPRESSED", "SNAPPY", "ZLIB", "LZ4", "ZSTD"]
|
|
16
22
|
OrcStrategy = Literal["SPEED", "COMPRESSION"]
|
|
@@ -6,6 +6,20 @@ from decimal import Decimal
|
|
|
6
6
|
from typing import Any
|
|
7
7
|
|
|
8
8
|
|
|
9
|
+
def extract_formats(schema: dict | None) -> dict[str, str]:
|
|
10
|
+
"""Extract per-column ``strftime`` formats from a schema definition.
|
|
11
|
+
|
|
12
|
+
Only complex (dict) field definitions carrying a ``format`` key are included.
|
|
13
|
+
"""
|
|
14
|
+
if not schema:
|
|
15
|
+
return {}
|
|
16
|
+
return {
|
|
17
|
+
name: field_def["format"]
|
|
18
|
+
for name, field_def in schema.items()
|
|
19
|
+
if isinstance(field_def, dict) and field_def.get("format")
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
|
|
9
23
|
class BaseFormatter:
|
|
10
24
|
"""Base class for handling data type formatting across exporters."""
|
|
11
25
|
|
|
@@ -68,22 +82,28 @@ class BaseFormatter:
|
|
|
68
82
|
pass
|
|
69
83
|
|
|
70
84
|
def format_row(
|
|
71
|
-
self, row: dict[str, Any],
|
|
85
|
+
self, row: dict[str, Any], formats: dict[str, str] | None = None
|
|
72
86
|
) -> dict[str, Any]:
|
|
73
|
-
"""Format the provided
|
|
87
|
+
"""Format the provided row with the correct format_value.
|
|
74
88
|
|
|
75
89
|
Args:
|
|
76
90
|
row: Dictionary containing row data
|
|
77
|
-
|
|
91
|
+
formats: Optional per-column ``strftime`` formats applied to
|
|
92
|
+
date/datetime values before normal value formatting.
|
|
78
93
|
|
|
79
94
|
Returns:
|
|
80
95
|
Formatted row dictionary
|
|
81
96
|
|
|
82
97
|
"""
|
|
98
|
+
formats = formats or {}
|
|
83
99
|
formatted_row: dict[str, Any] = {}
|
|
84
100
|
for key, value in row.items():
|
|
85
101
|
try:
|
|
86
|
-
|
|
102
|
+
fmt = formats.get(key)
|
|
103
|
+
if fmt and isinstance(value, (date | datetime)):
|
|
104
|
+
formatted_row[key] = value.strftime(fmt)
|
|
105
|
+
else:
|
|
106
|
+
formatted_row[key] = self.format_value(value)
|
|
87
107
|
except Exception as e:
|
|
88
108
|
formatted_row[key] = f"ERROR: {str(e)}"
|
|
89
109
|
return formatted_row
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Shared SQL string-escaping helpers for the SQL exporters."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def escape_ansi_quotes(value: str) -> str:
|
|
5
|
+
"""Escape an ANSI single-quoted SQL literal by doubling embedded quotes.
|
|
6
|
+
|
|
7
|
+
Used by dialects without backslash escaping (MSSQL, Oracle).
|
|
8
|
+
"""
|
|
9
|
+
return value.replace("'", "''")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def escape_mysql_literal(value: str) -> str:
|
|
13
|
+
"""Escape a MySQL single-quoted literal.
|
|
14
|
+
|
|
15
|
+
The backslash is escaped first so a value containing a backslash cannot
|
|
16
|
+
terminate the literal early, then single quotes and newlines are escaped.
|
|
17
|
+
"""
|
|
18
|
+
return value.replace("\\", "\\\\").replace("'", "\\'").replace("\n", "\\n")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def quote_mysql_ident(name: str) -> str:
|
|
22
|
+
"""Quote a MySQL identifier with backticks, doubling embedded backticks."""
|
|
23
|
+
return "`" + name.replace("`", "``") + "`"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def quote_mssql_ident(name: str) -> str:
|
|
27
|
+
"""Quote a T-SQL identifier with brackets, doubling embedded ``]``."""
|
|
28
|
+
return "[" + name.replace("]", "]]") + "]"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def quote_ansi_ident(name: str) -> str:
|
|
32
|
+
"""Quote an ANSI/Oracle identifier with double quotes, doubling embedded ``"``.
|
|
33
|
+
|
|
34
|
+
Note: Oracle treats a double-quoted identifier as case-sensitive, so the
|
|
35
|
+
generated DDL and DML deliberately reference every identifier quoted to stay
|
|
36
|
+
self-consistent.
|
|
37
|
+
"""
|
|
38
|
+
return '"' + name.replace('"', '""') + '"'
|