TestDataX 0.1.3__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {testdatax-0.1.3 → testdatax-0.2.1}/PKG-INFO +11 -7
  2. {testdatax-0.1.3 → testdatax-0.2.1}/README.md +7 -4
  3. {testdatax-0.1.3 → testdatax-0.2.1}/pyproject.toml +13 -6
  4. {testdatax-0.1.3 → testdatax-0.2.1}/src/__init__.py +1 -1
  5. {testdatax-0.1.3 → testdatax-0.2.1}/src/cli.py +19 -4
  6. {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/csv_exporter.py +5 -4
  7. {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/json_exporter.py +16 -9
  8. {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/mssql_exporter.py +23 -8
  9. {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/mysql_exporter.py +24 -7
  10. {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/oracle_exporter.py +23 -8
  11. {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/utils/constants.py +6 -0
  12. {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/utils/formatters.py +24 -4
  13. testdatax-0.2.1/src/exporters/utils/sql.py +38 -0
  14. testdatax-0.2.1/src/generator.py +168 -0
  15. testdatax-0.2.1/src/providers/base.py +153 -0
  16. testdatax-0.2.1/src/providers/faker_provider.py +114 -0
  17. testdatax-0.2.1/src/providers/mimesis_provider.py +153 -0
  18. testdatax-0.2.1/src/schemas.py +145 -0
  19. testdatax-0.1.3/src/generator.py +0 -117
  20. testdatax-0.1.3/src/providers/base.py +0 -58
  21. testdatax-0.1.3/src/providers/faker_provider.py +0 -65
  22. testdatax-0.1.3/src/providers/mimesis_provider.py +0 -87
  23. testdatax-0.1.3/src/schemas.py +0 -81
  24. {testdatax-0.1.3 → testdatax-0.2.1}/LICENSE +0 -0
  25. {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/__init__.py +0 -0
  26. {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/base_exporter.py +0 -0
  27. {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/orc_exporter.py +0 -0
  28. {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/parquet_exporter.py +0 -0
  29. {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/utils/__init__.py +0 -0
  30. {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/utils/chunker.py +0 -0
  31. {testdatax-0.1.3 → testdatax-0.2.1}/src/exporters/utils/exporter_config.py +0 -0
  32. {testdatax-0.1.3 → testdatax-0.2.1}/src/providers/__init__.py +0 -0
@@ -1,8 +1,9 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: TestDataX
3
- Version: 0.1.3
3
+ Version: 0.2.1
4
4
  Summary: A flexible test data generation toolkit
5
5
  License: MIT
6
+ License-File: LICENSE
6
7
  Author: JamesPBrett
7
8
  Requires-Python: >=3.11,<4.0
8
9
  Classifier: License :: OSI Approved :: MIT License
@@ -10,10 +11,10 @@ Classifier: Programming Language :: Python :: 3
10
11
  Classifier: Programming Language :: Python :: 3.11
11
12
  Classifier: Programming Language :: Python :: 3.12
12
13
  Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Programming Language :: Python :: 3.14
13
15
  Requires-Dist: faker (>=33.1.0,<34.0.0)
14
16
  Requires-Dist: mimesis (>=18.0.0,<19.0.0)
15
17
  Requires-Dist: mysql-connector-python (>=9.1.0,<10.0.0)
16
- Requires-Dist: orjson (>=3.10.12,<4.0.0)
17
18
  Requires-Dist: pandas (>=2.2.3,<3.0.0)
18
19
  Requires-Dist: pyarrow (>=18.1.0,<19.0.0)
19
20
  Requires-Dist: pydantic (>=2.10.4,<3.0.0)
@@ -22,8 +23,6 @@ Description-Content-Type: text/markdown
22
23
 
23
24
  # TestDataX
24
25
 
25
- # TestDataX
26
-
27
26
  ![Build Status](https://github.com/JamesPBrett/testdatax/actions/workflows/publish.yml/badge.svg)
28
27
  [![codecov](https://codecov.io/gh/JamesPBrett/testdatax/branch/main/graph/badge.svg?token=6VX62CI6U9)](https://codecov.io/gh/JamesPBrett/testdatax)
29
28
  ![Python Version](https://img.shields.io/badge/python-3.11%2B-blue)
@@ -42,7 +41,7 @@ pip install testdatax
42
41
 
43
42
  # Generate sample data
44
43
  testdatax --rows 1000 --format json --output data.json
45
-
44
+ ```
46
45
 
47
46
  ## Features
48
47
 
@@ -118,7 +117,7 @@ testdatax -o mstest.sql -f mssql -r 1000
118
117
 
119
118
  Generate Oracle with default row count (1000), table_name as 'oracle':
120
119
  ```bash
121
- datagen -o oracle.sql -f oracle -r 1000
120
+ testdatax -o oracle.sql -f oracle -r 1000
122
121
  ```
123
122
 
124
123
  Each command consists of:
@@ -224,6 +223,11 @@ The schema file defines the structure and constraints of your generated data. Ea
224
223
  }
225
224
  ```
226
225
 
226
+ > **Note:** `start_date`/`end_date` bound the generated range (inclusive).
227
+ > `format` applies a `strftime` pattern to date/datetime values in the **CSV and
228
+ > JSON** outputs only; the SQL, Parquet and ORC exporters keep native date types
229
+ > and ignore `format`.
230
+
227
231
  #### Enum Fields
228
232
  ```json
229
233
  {
@@ -1,7 +1,5 @@
1
1
  # TestDataX
2
2
 
3
- # TestDataX
4
-
5
3
  ![Build Status](https://github.com/JamesPBrett/testdatax/actions/workflows/publish.yml/badge.svg)
6
4
  [![codecov](https://codecov.io/gh/JamesPBrett/testdatax/branch/main/graph/badge.svg?token=6VX62CI6U9)](https://codecov.io/gh/JamesPBrett/testdatax)
7
5
  ![Python Version](https://img.shields.io/badge/python-3.11%2B-blue)
@@ -20,7 +18,7 @@ pip install testdatax
20
18
 
21
19
  # Generate sample data
22
20
  testdatax --rows 1000 --format json --output data.json
23
-
21
+ ```
24
22
 
25
23
  ## Features
26
24
 
@@ -96,7 +94,7 @@ testdatax -o mstest.sql -f mssql -r 1000
96
94
 
97
95
  Generate Oracle with default row count (1000), table_name as 'oracle':
98
96
  ```bash
99
- datagen -o oracle.sql -f oracle -r 1000
97
+ testdatax -o oracle.sql -f oracle -r 1000
100
98
  ```
101
99
 
102
100
  Each command consists of:
@@ -202,6 +200,11 @@ The schema file defines the structure and constraints of your generated data. Ea
202
200
  }
203
201
  ```
204
202
 
203
+ > **Note:** `start_date`/`end_date` bound the generated range (inclusive).
204
+ > `format` applies a `strftime` pattern to date/datetime values in the **CSV and
205
+ > JSON** outputs only; the SQL, Parquet and ORC exporters keep native date types
206
+ > and ignore `format`.
207
+
205
208
  #### Enum Fields
206
209
  ```json
207
210
  {
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "TestDataX"
3
- version = "0.1.3"
3
+ version = "0.2.1"
4
4
  description = "A flexible test data generation toolkit"
5
5
  authors = ["JamesPBrett"]
6
6
  license = "MIT"
@@ -12,7 +12,6 @@ python = "^3.11"
12
12
  typer = "^0.15.1"
13
13
  faker = "^33.1.0"
14
14
  pydantic = "^2.10.4"
15
- orjson = "^3.10.12"
16
15
  pyarrow = "^18.1.0"
17
16
  pandas = "^2.2.3"
18
17
  mysql-connector-python = "^9.1.0"
@@ -39,13 +38,11 @@ types-psutil = "^6.1.0.20241221"
39
38
  commitizen = "^3.13.0"
40
39
  python-semantic-release = "^9.17.0"
41
40
 
42
- [build-system]
43
- requires = ["poetry-core"]
44
- build-backend = "poetry.core.masonry.api"
45
41
 
46
42
  [tool.poetry.scripts]
47
43
  testdatax = "src.cli:app"
48
44
 
45
+
49
46
  [tool.ruff]
50
47
  # Same as Black
51
48
  line-length = 88
@@ -85,6 +82,7 @@ exclude = [
85
82
  [tool.ruff.lint.isort]
86
83
  known-first-party = ["src"]
87
84
 
85
+
88
86
  [tool.black]
89
87
  line-length = 88
90
88
  target-version = ['py311']
@@ -128,6 +126,7 @@ ignore_missing_imports = true
128
126
  module = "src.providers.mimesis_provider"
129
127
  warn_return_any = false
130
128
 
129
+
131
130
  [tool.coverage.run]
132
131
  source = ["src"]
133
132
  branch = true
@@ -141,15 +140,17 @@ exclude_lines = [
141
140
  "pass",
142
141
  ]
143
142
 
143
+
144
144
  [tool.pytest.ini_options]
145
145
  testpaths = ["tests"]
146
146
  python_files = ["test_*.py"]
147
147
  python_classes = ["Test*"]
148
148
  python_functions = ["test_*"]
149
149
 
150
+
150
151
  [tool.commitizen]
151
152
  name = "cz_conventional_commits"
152
- version = "0.1.0"
153
+ version = "0.1.3"
153
154
  tag_format = "v$version"
154
155
  version_files = [
155
156
  "src/__init__.py:__version__",
@@ -223,3 +224,9 @@ allowed_tags = [
223
224
  "chore", # Maintenance tasks
224
225
  "refactor", # Code changes without fixing bugs or adding features
225
226
  ]
227
+
228
+
229
+ [build-system]
230
+ requires = ["poetry-core"]
231
+ build-backend = "poetry.core.masonry.api"
232
+
@@ -1,6 +1,6 @@
1
1
  """TestDataX package initialization."""
2
2
 
3
- __version__ = "0.1.3"
3
+ __version__ = "0.2.1"
4
4
 
5
5
  from src.cli import app # noqa
6
6
 
@@ -102,16 +102,30 @@ def generate(
102
102
  f"{min_value}, {max_value}"
103
103
  )
104
104
 
105
+ # Accept "precision" as an alias for "right_digits"; use an
106
+ # explicit None check so an intentional 0 is not dropped.
107
+ right_digits = field_def.get("right_digits")
108
+ if right_digits is None:
109
+ right_digits = field_def.get("precision")
110
+
105
111
  field_schema = FieldSchema(
106
112
  name=name,
107
113
  type=field_type,
108
114
  enum_values=field_def.get("values"),
109
115
  min_value=min_value,
110
116
  max_value=max_value,
111
- right_digits=field_def.get("right_digits"),
117
+ right_digits=right_digits,
112
118
  value_provider=field_def.get("provider_field")
113
119
  or field_def.get("faker"),
114
120
  pattern=field_def.get("pattern"),
121
+ nullable=field_def.get("nullable", False),
122
+ unique=field_def.get("unique", False),
123
+ weights=field_def.get("weights"),
124
+ min_length=field_def.get("min_length"),
125
+ max_length=field_def.get("max_length"),
126
+ start_date=field_def.get("start_date"),
127
+ end_date=field_def.get("end_date"),
128
+ format=field_def.get("format"),
115
129
  )
116
130
  fields.append(field_schema.model_dump())
117
131
  else:
@@ -163,9 +177,10 @@ def generate(
163
177
  raise typer.Exit(code=1) from e
164
178
  except Exception as e:
165
179
  typer.echo(f"Error: {str(e)}", err=True)
166
- typer.echo(f"Exception type: {type(e).__name__}", err=True)
167
- typer.echo(f"Exception args: {e.args}", err=True)
168
- typer.echo(f"Traceback: {traceback.format_exc()}", err=True)
180
+ if debug:
181
+ typer.echo(f"Exception type: {type(e).__name__}", err=True)
182
+ typer.echo(f"Exception args: {e.args}", err=True)
183
+ typer.echo(f"Traceback: {traceback.format_exc()}", err=True)
169
184
  raise typer.Exit(code=1) from e
170
185
 
171
186
 
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  from .base_exporter import BaseExporter
8
8
  from .utils.chunker import DataChunker
9
9
  from .utils.constants import CHUNK_SIZE_CSV
10
- from .utils.formatters import CSVFormatter
10
+ from .utils.formatters import CSVFormatter, extract_formats
11
11
 
12
12
  logger = logging.getLogger(__name__)
13
13
 
@@ -76,11 +76,12 @@ class CsvExporter(BaseExporter):
76
76
  else:
77
77
  fieldnames = list(data[0].keys())
78
78
 
79
+ formats = extract_formats(schema)
79
80
  first_chunk = True
80
- formatted_rows = []
81
81
  for chunk in self.chunker.chunk_data(data):
82
- formatted_chunk = [self.formatter.format_row(row) for row in chunk]
83
- formatted_rows.extend(formatted_chunk)
82
+ formatted_chunk = [
83
+ self.formatter.format_row(row, formats) for row in chunk
84
+ ]
84
85
  df = pd.DataFrame(formatted_chunk, columns=fieldnames)
85
86
 
86
87
  # Write the data to CSV in chunks
@@ -5,7 +5,7 @@ from typing import Any
5
5
  from .base_exporter import BaseExporter
6
6
  from .utils.chunker import DataChunker
7
7
  from .utils.constants import CHUNK_SIZE_JSON
8
- from .utils.formatters import JSONFormatter
8
+ from .utils.formatters import JSONFormatter, extract_formats
9
9
 
10
10
  logger = logging.getLogger(__name__)
11
11
 
@@ -62,16 +62,23 @@ class JsonExporter(BaseExporter):
62
62
  raise ValueError(
63
63
  f"Field '{field}' in schema is not present in data."
64
64
  )
65
- # Format the data and write it in chunks to the output file
66
- all_formatted_rows = []
67
- for chunk in self.chunker.chunk_data(data):
68
- formatted_chunk = [self.formatter.format_row(row) for row in chunk]
69
- all_formatted_rows.extend(formatted_chunk)
70
65
 
71
- # Write the complete file with proper formatting using json.dumps
66
+ # Stream a valid JSON array to disk one chunk at a time so the whole
67
+ # dataset is never held in memory at once.
68
+ formats = extract_formats(schema)
72
69
  with open(output_path, "w", encoding="utf-8") as f:
73
- json_str = json.dumps(all_formatted_rows, indent=4)
74
- f.write(json_str)
70
+ f.write("[")
71
+ first = True
72
+ for chunk in self.chunker.chunk_data(data):
73
+ for row in chunk:
74
+ formatted = self.formatter.format_row(row, formats)
75
+ block = json.dumps(formatted, indent=4)
76
+ indented = "\n".join(
77
+ " " + line for line in block.splitlines()
78
+ )
79
+ f.write(("\n" if first else ",\n") + indented)
80
+ first = False
81
+ f.write("\n]" if not first else "]")
75
82
 
76
83
  logger.info(f"Successfully exported {len(data)} rows to {output_path}.")
77
84
 
@@ -1,10 +1,12 @@
1
1
  import decimal
2
2
  from datetime import date, datetime
3
+ from pathlib import Path
3
4
  from typing import Any
4
5
  from uuid import UUID
5
6
 
6
7
  from .base_exporter import BaseExporter
7
8
  from .utils.constants import DEFAULT_SCHEMA
9
+ from .utils.sql import escape_ansi_quotes, quote_mssql_ident
8
10
 
9
11
  MSSQL_TYPE_MAPPING = {
10
12
  "string": "NVARCHAR(255)", # Unicode string support
@@ -25,6 +27,16 @@ MSSQL_TYPE_MAPPING = {
25
27
  class MssqlExporter(BaseExporter):
26
28
  """Exports data to MSSQL compatible SQL file."""
27
29
 
30
+ @staticmethod
31
+ def _escape(value: str) -> str:
32
+ """Escape a string for a T-SQL single-quoted literal (quote doubling)."""
33
+ return escape_ansi_quotes(value)
34
+
35
+ @staticmethod
36
+ def _quote_ident(name: str) -> str:
37
+ """Quote a T-SQL identifier (table or column name) with brackets."""
38
+ return quote_mssql_ident(name)
39
+
28
40
  def _format_value(
29
41
  self,
30
42
  value: (
@@ -69,7 +81,7 @@ class MssqlExporter(BaseExporter):
69
81
  if value is None:
70
82
  return "NULL"
71
83
  elif isinstance(value, (str | UUID)):
72
- return "'" + str(value).replace("'", "\\'").replace("\n", "\\n") + "'"
84
+ return "'" + self._escape(str(value)) + "'"
73
85
  elif isinstance(value, (datetime | date)):
74
86
  return f"'{value.isoformat()}'"
75
87
  elif isinstance(value, bool):
@@ -141,14 +153,16 @@ class MssqlExporter(BaseExporter):
141
153
  and field_def.get("type") == "enum"
142
154
  and "values" in field_def
143
155
  ):
144
- values = "','".join(field_def["values"])
145
- check_constraints.append(f"CHECK ({field_name} IN ('{values}'))")
156
+ values = "','".join(self._escape(v) for v in field_def["values"])
157
+ check_constraints.append(
158
+ f"CHECK ({self._quote_ident(field_name)} IN ('{values}'))"
159
+ )
146
160
 
147
- columns.append(f" {field_name} {sql_type} NULL")
161
+ columns.append(f" {self._quote_ident(field_name)} {sql_type} NULL")
148
162
 
149
163
  # Combine columns and check constraints
150
164
  return (
151
- f"CREATE TABLE {table_name} (\n"
165
+ f"CREATE TABLE {self._quote_ident(table_name)} (\n"
152
166
  + ",\n".join(columns)
153
167
  + (
154
168
  (",\n " + ",\n ".join(check_constraints))
@@ -177,9 +191,10 @@ class MssqlExporter(BaseExporter):
177
191
  'INSERT INTO users (id, name) VALUES (1, "test");'
178
192
 
179
193
  """
180
- columns = ", ".join(row.keys())
194
+ columns = ", ".join(self._quote_ident(c) for c in row.keys())
181
195
  values = ", ".join(self._format_value(v) for v in row.values())
182
- return f"INSERT INTO {table_name} ({columns}) VALUES ({values});"
196
+ table = self._quote_ident(table_name)
197
+ return f"INSERT INTO {table} ({columns}) VALUES ({values});"
183
198
 
184
199
  def export(
185
200
  self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
@@ -188,7 +203,7 @@ class MssqlExporter(BaseExporter):
188
203
  if not data:
189
204
  return
190
205
 
191
- table_name = output_path.split("/")[-1].split(".")[0]
206
+ table_name = Path(output_path).name.split(".")[0]
192
207
 
193
208
  with open(output_path, "w") as f:
194
209
  # Always write CREATE TABLE using DEFAULT_SCHEMA if no schema provided
@@ -1,10 +1,12 @@
1
1
  import decimal
2
2
  from datetime import date, datetime
3
+ from pathlib import Path
3
4
  from typing import Any
4
5
  from uuid import UUID
5
6
 
6
7
  from .base_exporter import BaseExporter
7
8
  from .utils.constants import DEFAULT_SCHEMA
9
+ from .utils.sql import escape_mysql_literal, quote_mysql_ident
8
10
 
9
11
  MYSQL_TYPE_MAPPING = {
10
12
  "string": "VARCHAR(255)",
@@ -25,6 +27,16 @@ MYSQL_TYPE_MAPPING = {
25
27
  class MysqlExporter(BaseExporter):
26
28
  """Exports data to MySQL compatible SQL file."""
27
29
 
30
+ @staticmethod
31
+ def _escape(value: str) -> str:
32
+ """Escape a string for a MySQL single-quoted literal."""
33
+ return escape_mysql_literal(value)
34
+
35
+ @staticmethod
36
+ def _quote_ident(name: str) -> str:
37
+ """Quote a MySQL identifier (table or column name)."""
38
+ return quote_mysql_ident(name)
39
+
28
40
  def _format_value(
29
41
  self,
30
42
  value: (
@@ -69,7 +81,7 @@ class MysqlExporter(BaseExporter):
69
81
  if value is None:
70
82
  return "NULL"
71
83
  elif isinstance(value, (str | UUID)):
72
- return "'" + str(value).replace("'", "\\'").replace("\n", "\\n") + "'"
84
+ return "'" + self._escape(str(value)) + "'"
73
85
  elif isinstance(value, (datetime | date)):
74
86
  return f"'{value.isoformat()}'"
75
87
  elif isinstance(value, bool):
@@ -138,11 +150,15 @@ class MysqlExporter(BaseExporter):
138
150
  and field_def.get("type") == "enum"
139
151
  and "values" in field_def
140
152
  ):
141
- values = "','".join(field_def["values"])
153
+ values = "','".join(self._escape(v) for v in field_def["values"])
142
154
  sql_type = f"ENUM('{values}')"
143
- columns.append(f" {field_name} {sql_type} NULL")
155
+ columns.append(f" {self._quote_ident(field_name)} {sql_type} NULL")
144
156
 
145
- return f"CREATE TABLE {table_name} (\n" + ",\n".join(columns) + "\n);\n\n"
157
+ return (
158
+ f"CREATE TABLE {self._quote_ident(table_name)} (\n"
159
+ + ",\n".join(columns)
160
+ + "\n);\n\n"
161
+ )
146
162
 
147
163
  def _create_insert_stmt(
148
164
  self, row: dict[str, Any], table_name: str = "output"
@@ -163,9 +179,10 @@ class MysqlExporter(BaseExporter):
163
179
  'INSERT INTO users (id, name) VALUES (1, "test");'
164
180
 
165
181
  """
166
- columns = ", ".join(row.keys())
182
+ columns = ", ".join(self._quote_ident(c) for c in row.keys())
167
183
  values = ", ".join(self._format_value(v) for v in row.values())
168
- return f"INSERT INTO {table_name} ({columns}) VALUES ({values});"
184
+ table = self._quote_ident(table_name)
185
+ return f"INSERT INTO {table} ({columns}) VALUES ({values});"
169
186
 
170
187
  def export(
171
188
  self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
@@ -174,7 +191,7 @@ class MysqlExporter(BaseExporter):
174
191
  if not data:
175
192
  return
176
193
 
177
- table_name = output_path.split("/")[-1].split(".")[0]
194
+ table_name = Path(output_path).name.split(".")[0]
178
195
 
179
196
  with open(output_path, "w") as f:
180
197
  # Always write CREATE TABLE using DEFAULT_SCHEMA if no schema provided
@@ -1,10 +1,12 @@
1
1
  import decimal
2
2
  from datetime import date, datetime
3
+ from pathlib import Path
3
4
  from typing import Any
4
5
  from uuid import UUID
5
6
 
6
7
  from .base_exporter import BaseExporter
7
8
  from .utils.constants import DEFAULT_SCHEMA
9
+ from .utils.sql import escape_ansi_quotes, quote_ansi_ident
8
10
 
9
11
  ORACLE_TYPE_MAPPING = {
10
12
  "string": "VARCHAR2(255)", # Oracle's Unicode string type
@@ -25,6 +27,16 @@ ORACLE_TYPE_MAPPING = {
25
27
  class OracleExporter(BaseExporter):
26
28
  """Exports data to ORACLE compatible SQL file."""
27
29
 
30
+ @staticmethod
31
+ def _escape(value: str) -> str:
32
+ """Escape a string for an Oracle single-quoted literal (quote doubling)."""
33
+ return escape_ansi_quotes(value)
34
+
35
+ @staticmethod
36
+ def _quote_ident(name: str) -> str:
37
+ """Quote an Oracle identifier (table or column name) with double quotes."""
38
+ return quote_ansi_ident(name)
39
+
28
40
  def _format_value(
29
41
  self,
30
42
  value: (
@@ -69,7 +81,7 @@ class OracleExporter(BaseExporter):
69
81
  if value is None:
70
82
  return "NULL"
71
83
  elif isinstance(value, (str)):
72
- return "'" + str(value).replace("'", "''") + "'" # uses '' for escaping
84
+ return "'" + self._escape(str(value)) + "'" # uses '' for escaping
73
85
  elif isinstance(value, UUID):
74
86
  return f"'{str(value)}'"
75
87
  elif isinstance(value, datetime):
@@ -148,14 +160,16 @@ class OracleExporter(BaseExporter):
148
160
  and field_def.get("type") == "enum"
149
161
  and "values" in field_def
150
162
  ):
151
- values = "','".join(field_def["values"])
152
- check_constraints.append(f"CHECK ({field_name} IN ('{values}'))")
163
+ values = "','".join(self._escape(v) for v in field_def["values"])
164
+ check_constraints.append(
165
+ f"CHECK ({self._quote_ident(field_name)} IN ('{values}'))"
166
+ )
153
167
 
154
- columns.append(f" {field_name} {sql_type} NULL")
168
+ columns.append(f" {self._quote_ident(field_name)} {sql_type} NULL")
155
169
 
156
170
  # Combine columns and check constraints
157
171
  return (
158
- f"CREATE TABLE {table_name} (\n"
172
+ f"CREATE TABLE {self._quote_ident(table_name)} (\n"
159
173
  + ",\n".join(columns)
160
174
  + (
161
175
  (",\n " + ",\n ".join(check_constraints))
@@ -184,9 +198,10 @@ class OracleExporter(BaseExporter):
184
198
  'INSERT INTO users (id, name) VALUES (1, "test");'
185
199
 
186
200
  """
187
- columns = ", ".join(row.keys())
201
+ columns = ", ".join(self._quote_ident(c) for c in row.keys())
188
202
  values = ", ".join(self._format_value(v) for v in row.values())
189
- return f"INSERT INTO {table_name} ({columns}) VALUES ({values});"
203
+ table = self._quote_ident(table_name)
204
+ return f"INSERT INTO {table} ({columns}) VALUES ({values});"
190
205
 
191
206
  def export(
192
207
  self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
@@ -195,7 +210,7 @@ class OracleExporter(BaseExporter):
195
210
  if not data:
196
211
  return
197
212
 
198
- table_name = output_path.split("/")[-1].split(".")[0]
213
+ table_name = Path(output_path).name.split(".")[0]
199
214
 
200
215
  with open(output_path, "w") as f:
201
216
  # Always write CREATE TABLE using DEFAULT_SCHEMA if no schema provided
@@ -11,6 +11,12 @@ CHUNK_SIZE_CSV = CHUNK_SIZE
11
11
  CHUNK_SIZE_PARQUET = CHUNK_SIZE
12
12
  CHUNK_SIZE_ORC = CHUNK_SIZE
13
13
 
14
+ # Probability that a nullable field emits NULL instead of a value
15
+ NULL_PROBABILITY = 0.1
16
+
17
+ # Maximum attempts to generate a unique value before giving up
18
+ MAX_UNIQUE_RETRIES = 1000
19
+
14
20
  # ORC compression types
15
21
  OrcCompression = Literal["UNCOMPRESSED", "SNAPPY", "ZLIB", "LZ4", "ZSTD"]
16
22
  OrcStrategy = Literal["SPEED", "COMPRESSION"]
@@ -6,6 +6,20 @@ from decimal import Decimal
6
6
  from typing import Any
7
7
 
8
8
 
9
+ def extract_formats(schema: dict | None) -> dict[str, str]:
10
+ """Extract per-column ``strftime`` formats from a schema definition.
11
+
12
+ Only complex (dict) field definitions carrying a ``format`` key are included.
13
+ """
14
+ if not schema:
15
+ return {}
16
+ return {
17
+ name: field_def["format"]
18
+ for name, field_def in schema.items()
19
+ if isinstance(field_def, dict) and field_def.get("format")
20
+ }
21
+
22
+
9
23
  class BaseFormatter:
10
24
  """Base class for handling data type formatting across exporters."""
11
25
 
@@ -68,22 +82,28 @@ class BaseFormatter:
68
82
  pass
69
83
 
70
84
  def format_row(
71
- self, row: dict[str, Any], **kwargs: dict[str, str | int | float]
85
+ self, row: dict[str, Any], formats: dict[str, str] | None = None
72
86
  ) -> dict[str, Any]:
73
- """Format the provided rows with the correct format_value.
87
+ """Format the provided row with the correct format_value.
74
88
 
75
89
  Args:
76
90
  row: Dictionary containing row data
77
- **kwargs: Additional format-specific parameters
91
+ formats: Optional per-column ``strftime`` formats applied to
92
+ date/datetime values before normal value formatting.
78
93
 
79
94
  Returns:
80
95
  Formatted row dictionary
81
96
 
82
97
  """
98
+ formats = formats or {}
83
99
  formatted_row: dict[str, Any] = {}
84
100
  for key, value in row.items():
85
101
  try:
86
- formatted_row[key] = self.format_value(value)
102
+ fmt = formats.get(key)
103
+ if fmt and isinstance(value, (date | datetime)):
104
+ formatted_row[key] = value.strftime(fmt)
105
+ else:
106
+ formatted_row[key] = self.format_value(value)
87
107
  except Exception as e:
88
108
  formatted_row[key] = f"ERROR: {str(e)}"
89
109
  return formatted_row
@@ -0,0 +1,38 @@
1
+ """Shared SQL string-escaping helpers for the SQL exporters."""
2
+
3
+
4
+ def escape_ansi_quotes(value: str) -> str:
5
+ """Escape an ANSI single-quoted SQL literal by doubling embedded quotes.
6
+
7
+ Used by dialects without backslash escaping (MSSQL, Oracle).
8
+ """
9
+ return value.replace("'", "''")
10
+
11
+
12
+ def escape_mysql_literal(value: str) -> str:
13
+ """Escape a MySQL single-quoted literal.
14
+
15
+ The backslash is escaped first so a value containing a backslash cannot
16
+ terminate the literal early, then single quotes and newlines are escaped.
17
+ """
18
+ return value.replace("\\", "\\\\").replace("'", "\\'").replace("\n", "\\n")
19
+
20
+
21
+ def quote_mysql_ident(name: str) -> str:
22
+ """Quote a MySQL identifier with backticks, doubling embedded backticks."""
23
+ return "`" + name.replace("`", "``") + "`"
24
+
25
+
26
+ def quote_mssql_ident(name: str) -> str:
27
+ """Quote a T-SQL identifier with brackets, doubling embedded ``]``."""
28
+ return "[" + name.replace("]", "]]") + "]"
29
+
30
+
31
+ def quote_ansi_ident(name: str) -> str:
32
+ """Quote an ANSI/Oracle identifier with double quotes, doubling embedded ``"``.
33
+
34
+ Note: Oracle treats a double-quoted identifier as case-sensitive, so the
35
+ generated DDL and DML deliberately reference every identifier quoted to stay
36
+ self-consistent.
37
+ """
38
+ return '"' + name.replace('"', '""') + '"'