TestDataX 0.2.0__tar.gz → 0.2.1__tar.gz

This diff compares the contents of two publicly released versions of a package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the public registry.
Files changed (28)
  1. {testdatax-0.2.0 → testdatax-0.2.1}/PKG-INFO +5 -6
  2. {testdatax-0.2.0 → testdatax-0.2.1}/README.md +4 -5
  3. {testdatax-0.2.0 → testdatax-0.2.1}/pyproject.toml +2 -1
  4. {testdatax-0.2.0 → testdatax-0.2.1}/src/__init__.py +1 -1
  5. {testdatax-0.2.0 → testdatax-0.2.1}/src/exporters/csv_exporter.py +5 -2
  6. {testdatax-0.2.0 → testdatax-0.2.1}/src/exporters/json_exporter.py +3 -2
  7. {testdatax-0.2.0 → testdatax-0.2.1}/src/exporters/mssql_exporter.py +14 -6
  8. {testdatax-0.2.0 → testdatax-0.2.1}/src/exporters/mysql_exporter.py +15 -5
  9. {testdatax-0.2.0 → testdatax-0.2.1}/src/exporters/oracle_exporter.py +14 -6
  10. {testdatax-0.2.0 → testdatax-0.2.1}/src/exporters/utils/formatters.py +24 -4
  11. testdatax-0.2.1/src/exporters/utils/sql.py +38 -0
  12. {testdatax-0.2.0 → testdatax-0.2.1}/src/generator.py +7 -14
  13. testdatax-0.2.0/src/exporters/utils/sql.py +0 -18
  14. {testdatax-0.2.0 → testdatax-0.2.1}/LICENSE +0 -0
  15. {testdatax-0.2.0 → testdatax-0.2.1}/src/cli.py +0 -0
  16. {testdatax-0.2.0 → testdatax-0.2.1}/src/exporters/__init__.py +0 -0
  17. {testdatax-0.2.0 → testdatax-0.2.1}/src/exporters/base_exporter.py +0 -0
  18. {testdatax-0.2.0 → testdatax-0.2.1}/src/exporters/orc_exporter.py +0 -0
  19. {testdatax-0.2.0 → testdatax-0.2.1}/src/exporters/parquet_exporter.py +0 -0
  20. {testdatax-0.2.0 → testdatax-0.2.1}/src/exporters/utils/__init__.py +0 -0
  21. {testdatax-0.2.0 → testdatax-0.2.1}/src/exporters/utils/chunker.py +0 -0
  22. {testdatax-0.2.0 → testdatax-0.2.1}/src/exporters/utils/constants.py +0 -0
  23. {testdatax-0.2.0 → testdatax-0.2.1}/src/exporters/utils/exporter_config.py +0 -0
  24. {testdatax-0.2.0 → testdatax-0.2.1}/src/providers/__init__.py +0 -0
  25. {testdatax-0.2.0 → testdatax-0.2.1}/src/providers/base.py +0 -0
  26. {testdatax-0.2.0 → testdatax-0.2.1}/src/providers/faker_provider.py +0 -0
  27. {testdatax-0.2.0 → testdatax-0.2.1}/src/providers/mimesis_provider.py +0 -0
  28. {testdatax-0.2.0 → testdatax-0.2.1}/src/schemas.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: TestDataX
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: A flexible test data generation toolkit
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -223,11 +223,10 @@ The schema file defines the structure and constraints of your generated data. Ea
223
223
  }
224
224
  ```
225
225
 
226
- > **Note:** `start_date`/`end_date` bound the generated range (inclusive). When
227
- > `format` is set, date/datetime values are rendered to a string with
228
- > `strftime`; for the SQL exporters this means the column receives a formatted
229
- > string literal rather than a native date, so `format` is best suited to the
230
- > CSV/JSON formats.
226
+ > **Note:** `start_date`/`end_date` bound the generated range (inclusive).
227
+ > `format` applies a `strftime` pattern to date/datetime values in the **CSV and
228
+ > JSON** outputs only; the SQL, Parquet and ORC exporters keep native date types
229
+ > and ignore `format`.
231
230
 
232
231
  #### Enum Fields
233
232
  ```json
@@ -200,11 +200,10 @@ The schema file defines the structure and constraints of your generated data. Ea
200
200
  }
201
201
  ```
202
202
 
203
- > **Note:** `start_date`/`end_date` bound the generated range (inclusive). When
204
- > `format` is set, date/datetime values are rendered to a string with
205
- > `strftime`; for the SQL exporters this means the column receives a formatted
206
- > string literal rather than a native date, so `format` is best suited to the
207
- > CSV/JSON formats.
203
+ > **Note:** `start_date`/`end_date` bound the generated range (inclusive).
204
+ > `format` applies a `strftime` pattern to date/datetime values in the **CSV and
205
+ > JSON** outputs only; the SQL, Parquet and ORC exporters keep native date types
206
+ > and ignore `format`.
208
207
 
209
208
  #### Enum Fields
210
209
  ```json
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "TestDataX"
3
- version = "0.2.0"
3
+ version = "0.2.1"
4
4
  description = "A flexible test data generation toolkit"
5
5
  authors = ["JamesPBrett"]
6
6
  license = "MIT"
@@ -225,6 +225,7 @@ allowed_tags = [
225
225
  "refactor", # Code changes without fixing bugs or adding features
226
226
  ]
227
227
 
228
+
228
229
  [build-system]
229
230
  requires = ["poetry-core"]
230
231
  build-backend = "poetry.core.masonry.api"
@@ -1,6 +1,6 @@
1
1
  """TestDataX package initialization."""
2
2
 
3
- __version__ = "0.2.0"
3
+ __version__ = "0.2.1"
4
4
 
5
5
  from src.cli import app # noqa
6
6
 
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  from .base_exporter import BaseExporter
8
8
  from .utils.chunker import DataChunker
9
9
  from .utils.constants import CHUNK_SIZE_CSV
10
- from .utils.formatters import CSVFormatter
10
+ from .utils.formatters import CSVFormatter, extract_formats
11
11
 
12
12
  logger = logging.getLogger(__name__)
13
13
 
@@ -76,9 +76,12 @@ class CsvExporter(BaseExporter):
76
76
  else:
77
77
  fieldnames = list(data[0].keys())
78
78
 
79
+ formats = extract_formats(schema)
79
80
  first_chunk = True
80
81
  for chunk in self.chunker.chunk_data(data):
81
- formatted_chunk = [self.formatter.format_row(row) for row in chunk]
82
+ formatted_chunk = [
83
+ self.formatter.format_row(row, formats) for row in chunk
84
+ ]
82
85
  df = pd.DataFrame(formatted_chunk, columns=fieldnames)
83
86
 
84
87
  # Write the data to CSV in chunks
@@ -5,7 +5,7 @@ from typing import Any
5
5
  from .base_exporter import BaseExporter
6
6
  from .utils.chunker import DataChunker
7
7
  from .utils.constants import CHUNK_SIZE_JSON
8
- from .utils.formatters import JSONFormatter
8
+ from .utils.formatters import JSONFormatter, extract_formats
9
9
 
10
10
  logger = logging.getLogger(__name__)
11
11
 
@@ -65,12 +65,13 @@ class JsonExporter(BaseExporter):
65
65
 
66
66
  # Stream a valid JSON array to disk one chunk at a time so the whole
67
67
  # dataset is never held in memory at once.
68
+ formats = extract_formats(schema)
68
69
  with open(output_path, "w", encoding="utf-8") as f:
69
70
  f.write("[")
70
71
  first = True
71
72
  for chunk in self.chunker.chunk_data(data):
72
73
  for row in chunk:
73
- formatted = self.formatter.format_row(row)
74
+ formatted = self.formatter.format_row(row, formats)
74
75
  block = json.dumps(formatted, indent=4)
75
76
  indented = "\n".join(
76
77
  " " + line for line in block.splitlines()
@@ -6,7 +6,7 @@ from uuid import UUID
6
6
 
7
7
  from .base_exporter import BaseExporter
8
8
  from .utils.constants import DEFAULT_SCHEMA
9
- from .utils.sql import escape_ansi_quotes
9
+ from .utils.sql import escape_ansi_quotes, quote_mssql_ident
10
10
 
11
11
  MSSQL_TYPE_MAPPING = {
12
12
  "string": "NVARCHAR(255)", # Unicode string support
@@ -32,6 +32,11 @@ class MssqlExporter(BaseExporter):
32
32
  """Escape a string for a T-SQL single-quoted literal (quote doubling)."""
33
33
  return escape_ansi_quotes(value)
34
34
 
35
+ @staticmethod
36
+ def _quote_ident(name: str) -> str:
37
+ """Quote a T-SQL identifier (table or column name) with brackets."""
38
+ return quote_mssql_ident(name)
39
+
35
40
  def _format_value(
36
41
  self,
37
42
  value: (
@@ -149,13 +154,15 @@ class MssqlExporter(BaseExporter):
149
154
  and "values" in field_def
150
155
  ):
151
156
  values = "','".join(self._escape(v) for v in field_def["values"])
152
- check_constraints.append(f"CHECK ({field_name} IN ('{values}'))")
157
+ check_constraints.append(
158
+ f"CHECK ({self._quote_ident(field_name)} IN ('{values}'))"
159
+ )
153
160
 
154
- columns.append(f" {field_name} {sql_type} NULL")
161
+ columns.append(f" {self._quote_ident(field_name)} {sql_type} NULL")
155
162
 
156
163
  # Combine columns and check constraints
157
164
  return (
158
- f"CREATE TABLE {table_name} (\n"
165
+ f"CREATE TABLE {self._quote_ident(table_name)} (\n"
159
166
  + ",\n".join(columns)
160
167
  + (
161
168
  (",\n " + ",\n ".join(check_constraints))
@@ -184,9 +191,10 @@ class MssqlExporter(BaseExporter):
184
191
  'INSERT INTO users (id, name) VALUES (1, "test");'
185
192
 
186
193
  """
187
- columns = ", ".join(row.keys())
194
+ columns = ", ".join(self._quote_ident(c) for c in row.keys())
188
195
  values = ", ".join(self._format_value(v) for v in row.values())
189
- return f"INSERT INTO {table_name} ({columns}) VALUES ({values});"
196
+ table = self._quote_ident(table_name)
197
+ return f"INSERT INTO {table} ({columns}) VALUES ({values});"
190
198
 
191
199
  def export(
192
200
  self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
@@ -6,7 +6,7 @@ from uuid import UUID
6
6
 
7
7
  from .base_exporter import BaseExporter
8
8
  from .utils.constants import DEFAULT_SCHEMA
9
- from .utils.sql import escape_mysql_literal
9
+ from .utils.sql import escape_mysql_literal, quote_mysql_ident
10
10
 
11
11
  MYSQL_TYPE_MAPPING = {
12
12
  "string": "VARCHAR(255)",
@@ -32,6 +32,11 @@ class MysqlExporter(BaseExporter):
32
32
  """Escape a string for a MySQL single-quoted literal."""
33
33
  return escape_mysql_literal(value)
34
34
 
35
+ @staticmethod
36
+ def _quote_ident(name: str) -> str:
37
+ """Quote a MySQL identifier (table or column name)."""
38
+ return quote_mysql_ident(name)
39
+
35
40
  def _format_value(
36
41
  self,
37
42
  value: (
@@ -147,9 +152,13 @@ class MysqlExporter(BaseExporter):
147
152
  ):
148
153
  values = "','".join(self._escape(v) for v in field_def["values"])
149
154
  sql_type = f"ENUM('{values}')"
150
- columns.append(f" {field_name} {sql_type} NULL")
155
+ columns.append(f" {self._quote_ident(field_name)} {sql_type} NULL")
151
156
 
152
- return f"CREATE TABLE {table_name} (\n" + ",\n".join(columns) + "\n);\n\n"
157
+ return (
158
+ f"CREATE TABLE {self._quote_ident(table_name)} (\n"
159
+ + ",\n".join(columns)
160
+ + "\n);\n\n"
161
+ )
153
162
 
154
163
  def _create_insert_stmt(
155
164
  self, row: dict[str, Any], table_name: str = "output"
@@ -170,9 +179,10 @@ class MysqlExporter(BaseExporter):
170
179
  'INSERT INTO users (id, name) VALUES (1, "test");'
171
180
 
172
181
  """
173
- columns = ", ".join(row.keys())
182
+ columns = ", ".join(self._quote_ident(c) for c in row.keys())
174
183
  values = ", ".join(self._format_value(v) for v in row.values())
175
- return f"INSERT INTO {table_name} ({columns}) VALUES ({values});"
184
+ table = self._quote_ident(table_name)
185
+ return f"INSERT INTO {table} ({columns}) VALUES ({values});"
176
186
 
177
187
  def export(
178
188
  self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
@@ -6,7 +6,7 @@ from uuid import UUID
6
6
 
7
7
  from .base_exporter import BaseExporter
8
8
  from .utils.constants import DEFAULT_SCHEMA
9
- from .utils.sql import escape_ansi_quotes
9
+ from .utils.sql import escape_ansi_quotes, quote_ansi_ident
10
10
 
11
11
  ORACLE_TYPE_MAPPING = {
12
12
  "string": "VARCHAR2(255)", # Oracle's Unicode string type
@@ -32,6 +32,11 @@ class OracleExporter(BaseExporter):
32
32
  """Escape a string for an Oracle single-quoted literal (quote doubling)."""
33
33
  return escape_ansi_quotes(value)
34
34
 
35
+ @staticmethod
36
+ def _quote_ident(name: str) -> str:
37
+ """Quote an Oracle identifier (table or column name) with double quotes."""
38
+ return quote_ansi_ident(name)
39
+
35
40
  def _format_value(
36
41
  self,
37
42
  value: (
@@ -156,13 +161,15 @@ class OracleExporter(BaseExporter):
156
161
  and "values" in field_def
157
162
  ):
158
163
  values = "','".join(self._escape(v) for v in field_def["values"])
159
- check_constraints.append(f"CHECK ({field_name} IN ('{values}'))")
164
+ check_constraints.append(
165
+ f"CHECK ({self._quote_ident(field_name)} IN ('{values}'))"
166
+ )
160
167
 
161
- columns.append(f" {field_name} {sql_type} NULL")
168
+ columns.append(f" {self._quote_ident(field_name)} {sql_type} NULL")
162
169
 
163
170
  # Combine columns and check constraints
164
171
  return (
165
- f"CREATE TABLE {table_name} (\n"
172
+ f"CREATE TABLE {self._quote_ident(table_name)} (\n"
166
173
  + ",\n".join(columns)
167
174
  + (
168
175
  (",\n " + ",\n ".join(check_constraints))
@@ -191,9 +198,10 @@ class OracleExporter(BaseExporter):
191
198
  'INSERT INTO users (id, name) VALUES (1, "test");'
192
199
 
193
200
  """
194
- columns = ", ".join(row.keys())
201
+ columns = ", ".join(self._quote_ident(c) for c in row.keys())
195
202
  values = ", ".join(self._format_value(v) for v in row.values())
196
- return f"INSERT INTO {table_name} ({columns}) VALUES ({values});"
203
+ table = self._quote_ident(table_name)
204
+ return f"INSERT INTO {table} ({columns}) VALUES ({values});"
197
205
 
198
206
  def export(
199
207
  self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
@@ -6,6 +6,20 @@ from decimal import Decimal
6
6
  from typing import Any
7
7
 
8
8
 
9
+ def extract_formats(schema: dict | None) -> dict[str, str]:
10
+ """Extract per-column ``strftime`` formats from a schema definition.
11
+
12
+ Only complex (dict) field definitions carrying a ``format`` key are included.
13
+ """
14
+ if not schema:
15
+ return {}
16
+ return {
17
+ name: field_def["format"]
18
+ for name, field_def in schema.items()
19
+ if isinstance(field_def, dict) and field_def.get("format")
20
+ }
21
+
22
+
9
23
  class BaseFormatter:
10
24
  """Base class for handling data type formatting across exporters."""
11
25
 
@@ -68,22 +82,28 @@ class BaseFormatter:
68
82
  pass
69
83
 
70
84
  def format_row(
71
- self, row: dict[str, Any], **kwargs: dict[str, str | int | float]
85
+ self, row: dict[str, Any], formats: dict[str, str] | None = None
72
86
  ) -> dict[str, Any]:
73
- """Format the provided rows with the correct format_value.
87
+ """Format the provided row with the correct format_value.
74
88
 
75
89
  Args:
76
90
  row: Dictionary containing row data
77
- **kwargs: Additional format-specific parameters
91
+ formats: Optional per-column ``strftime`` formats applied to
92
+ date/datetime values before normal value formatting.
78
93
 
79
94
  Returns:
80
95
  Formatted row dictionary
81
96
 
82
97
  """
98
+ formats = formats or {}
83
99
  formatted_row: dict[str, Any] = {}
84
100
  for key, value in row.items():
85
101
  try:
86
- formatted_row[key] = self.format_value(value)
102
+ fmt = formats.get(key)
103
+ if fmt and isinstance(value, (date | datetime)):
104
+ formatted_row[key] = value.strftime(fmt)
105
+ else:
106
+ formatted_row[key] = self.format_value(value)
87
107
  except Exception as e:
88
108
  formatted_row[key] = f"ERROR: {str(e)}"
89
109
  return formatted_row
@@ -0,0 +1,38 @@
1
+ """Shared SQL string-escaping helpers for the SQL exporters."""
2
+
3
+
4
+ def escape_ansi_quotes(value: str) -> str:
5
+ """Escape an ANSI single-quoted SQL literal by doubling embedded quotes.
6
+
7
+ Used by dialects without backslash escaping (MSSQL, Oracle).
8
+ """
9
+ return value.replace("'", "''")
10
+
11
+
12
+ def escape_mysql_literal(value: str) -> str:
13
+ """Escape a MySQL single-quoted literal.
14
+
15
+ The backslash is escaped first so a value containing a backslash cannot
16
+ terminate the literal early, then single quotes and newlines are escaped.
17
+ """
18
+ return value.replace("\\", "\\\\").replace("'", "\\'").replace("\n", "\\n")
19
+
20
+
21
+ def quote_mysql_ident(name: str) -> str:
22
+ """Quote a MySQL identifier with backticks, doubling embedded backticks."""
23
+ return "`" + name.replace("`", "``") + "`"
24
+
25
+
26
+ def quote_mssql_ident(name: str) -> str:
27
+ """Quote a T-SQL identifier with brackets, doubling embedded ``]``."""
28
+ return "[" + name.replace("]", "]]") + "]"
29
+
30
+
31
+ def quote_ansi_ident(name: str) -> str:
32
+ """Quote an ANSI/Oracle identifier with double quotes, doubling embedded ``"``.
33
+
34
+ Note: Oracle treats a double-quoted identifier as case-sensitive, so the
35
+ generated DDL and DML deliberately reference every identifier quoted to stay
36
+ self-consistent.
37
+ """
38
+ return '"' + name.replace('"', '""') + '"'
@@ -83,7 +83,11 @@ class DataGenerator:
83
83
  def _generate_field_value(
84
84
  self, field: FieldSchema, unique_seen: dict[str, set[GeneratedValue]]
85
85
  ) -> GeneratedValue:
86
- """Produce a single value, honoring nullable/unique/format options."""
86
+ """Produce a single value, honoring the nullable and unique options.
87
+
88
+ Native (typed) values are returned; date ``format`` rendering is handled
89
+ by the file-format exporters so SQL/Parquet/ORC keep native date types.
90
+ """
87
91
  if field.unique:
88
92
  seen = unique_seen[field.name]
89
93
  for _ in range(MAX_UNIQUE_RETRIES):
@@ -91,12 +95,12 @@ class DataGenerator:
91
95
  if value is None or value not in seen:
92
96
  if value is not None:
93
97
  seen.add(value)
94
- return self._apply_format(field, value)
98
+ return value
95
99
  raise ValueError(
96
100
  f"Could not generate a unique value for field '{field.name}' "
97
101
  f"after {MAX_UNIQUE_RETRIES} attempts"
98
102
  )
99
- return self._apply_format(field, self._produce_value(field))
103
+ return self._produce_value(field)
100
104
 
101
105
  def _produce_value(self, field: FieldSchema) -> GeneratedValue:
102
106
  """Generate a raw value for a field, applying only the nullable option."""
@@ -104,17 +108,6 @@ class DataGenerator:
104
108
  return None
105
109
  return self.type_generators[field.type](field)
106
110
 
107
- @staticmethod
108
- def _apply_format(field: FieldSchema, value: GeneratedValue) -> GeneratedValue:
109
- """Render date/datetime values via strftime when a format is set.
110
-
111
- Applied after the uniqueness check so uniqueness is tracked on the raw
112
- (high-cardinality) value rather than the formatted string.
113
- """
114
- if field.format and isinstance(value, (date | datetime)):
115
- return value.strftime(field.format)
116
- return value
117
-
118
111
  def _generate_string(self, field: FieldSchema) -> str:
119
112
  provider_value = str(field.value_provider) if field.value_provider else "name"
120
113
  return self.provider.generate_string(
@@ -1,18 +0,0 @@
1
- """Shared SQL string-escaping helpers for the SQL exporters."""
2
-
3
-
4
- def escape_ansi_quotes(value: str) -> str:
5
- """Escape an ANSI single-quoted SQL literal by doubling embedded quotes.
6
-
7
- Used by dialects without backslash escaping (MSSQL, Oracle).
8
- """
9
- return value.replace("'", "''")
10
-
11
-
12
- def escape_mysql_literal(value: str) -> str:
13
- """Escape a MySQL single-quoted literal.
14
-
15
- The backslash is escaped first so a value containing a backslash cannot
16
- terminate the literal early, then single quotes and newlines are escaped.
17
- """
18
- return value.replace("\\", "\\\\").replace("'", "\\'").replace("\n", "\\n")
File without changes
File without changes
File without changes