TestDataX 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
src/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """TestDataX package initialization."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from src.cli import app # noqa
6
+
7
+ __all__ = ["app"]
src/cli.py ADDED
@@ -0,0 +1,166 @@
1
+ import json
2
+ import traceback
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ import typer
7
+
8
+ from .exporters.base_exporter import BaseExporter
9
+ from .exporters.utils.constants import DEFAULT_SCHEMA, EXPORT_FORMATS
10
+ from .exporters.utils.exporter_config import EXPORTER_CLASSES
11
+ from .generator import DataGenerator
12
+ from .schemas import DataType, FieldSchema, GeneratorConfig
13
+
14
+
15
def load_schema(schema_path: Path) -> dict[str, Any]:
    """Read a schema definition from a JSON file on disk.

    Args:
        schema_path: Location of the JSON schema file.

    Returns:
        dict[str, Any]: The parsed top-level JSON object.

    Raises:
        FileNotFoundError: If no file exists at ``schema_path``.
        json.JSONDecodeError: If the file does not contain valid JSON.
        ValueError: If the top-level JSON value is not an object.

    """
    with open(schema_path, encoding="utf-8") as schema_file:
        parsed = json.load(schema_file)

    # Only a JSON object (field name -> definition) is usable as a schema.
    if isinstance(parsed, dict):
        return parsed
    raise ValueError("Schema file does not contain a JSON object.")
34
+
35
+
36
# Typer application object; the ``generate`` command is registered on it.
app = typer.Typer()

# Option declarations used as parameter defaults by the ``generate`` command.
# Required: destination file for the exported data.
OUTPUT_PATH_OPTION = typer.Option(
    ...,
    "--output",
    "-o",
    help="Output file path",
)
# Export format name; defaults to CSV.
FORMAT_OPTION = typer.Option(
    "csv",
    "--format",
    "-f",
    help=f"Output format ({EXPORT_FORMATS})",
)
# Number of rows to generate; defaults to 10.
ROWS_OPTION = typer.Option(
    10,
    "--rows",
    "-r",
    help="Number of rows to generate",
)
# Optional schema file; when omitted the command uses DEFAULT_SCHEMA.
SCHEMA_PATH_OPTION = typer.Option(
    None,
    "--schema",
    "-s",
    help="Path to schema file",
)
# Flag that turns on the extra progress output.
DEBUG_OPTION = typer.Option(
    False,
    "--debug",
    "-d",
    help="Enable debug output",
)
45
+
46
+
47
@app.command()
def generate(
    output: Path = OUTPUT_PATH_OPTION,
    format: str = FORMAT_OPTION,
    rows: int = ROWS_OPTION,
    schema_path: Path | None = SCHEMA_PATH_OPTION,
    debug: bool = DEBUG_OPTION,
) -> None:
    """Generate synthetic data based on the provided schema."""
    # The docstring is kept as a single line because Typer shows command
    # docstrings as CLI help text; details are documented in comments instead.
    #
    # Flow: load schema -> validate row count -> convert schema entries to
    # FieldSchema dicts -> validate format -> generate rows -> export.
    # Every failure is reported on stderr and ends with exit code 1.
    try:
        if debug:
            typer.echo(
                f"Starting data generation with schema: {schema_path}", err=False
            )

        # Use the schema file when one was given, otherwise the built-in default.
        schema = load_schema(schema_path) if schema_path else DEFAULT_SCHEMA

        # Reject zero or negative row counts.
        if rows <= 0:
            raise ValueError("Number of rows must be greater than zero")

        if debug:
            typer.echo(f"Loaded schema: {schema}", err=False)

        # Convert each schema entry into a serialised FieldSchema.
        fields = []
        for name, field_def in schema.items():
            if not isinstance(field_def, dict):
                # Shorthand entry: the value is passed straight to DataType.
                fields.append(
                    FieldSchema(name=name, type=DataType(field_def)).model_dump()
                )
                continue

            if "type" not in field_def:
                raise ValueError(f"Field '{name}' missing required 'type' key")

            # Validate min and max values based on the field type.
            min_value = field_def.get("min")
            max_value = field_def.get("max")
            field_type = DataType(field_def["type"])

            if field_type in {DataType.INTEGER, DataType.BIGINT}:
                # Integer-like fields only accept integer bounds.
                if not (min_value is None or isinstance(min_value, int)):
                    raise ValueError(
                        f"Invalid min value for field '{name}': {min_value}"
                    )
                if not (max_value is None or isinstance(max_value, int)):
                    raise ValueError(
                        f"Invalid max value for field '{name}': {max_value}"
                    )
            elif field_type == DataType.STRING and not (
                min_value is None and max_value is None
            ):
                # String fields must not declare any bounds at all.
                raise ValueError(
                    f"Invalid min or max value for field '{name}': "
                    f"{min_value}, {max_value}"
                )

            entry = FieldSchema(
                name=name,
                type=field_type,
                enum_values=field_def.get("values"),
                min_value=min_value,
                max_value=max_value,
                right_digits=field_def.get("right_digits"),
                value_provider=field_def.get("faker"),
                pattern=field_def.get("pattern"),
            )
            fields.append(entry.model_dump())

        # Validate export format
        if format not in EXPORT_FORMATS:
            raise ValueError(f"Unsupported format: {format}")

        if debug:
            typer.echo(f"Converted fields: {fields}", err=False)

        # Create generator config
        config = GeneratorConfig(
            fields=fields, row_count=rows, export_format=format, output_path=str(output)
        )
        if debug:
            typer.echo(f"Generator config: {config}", err=False)

        # Generate data
        data = DataGenerator().generate_data(config.fields, config.row_count)

        # Resolve the exporter before the (potentially large) debug dump so a
        # bad format lookup fails first, exactly as before.
        target_exporter = get_exporter(config.export_format)
        if debug:
            typer.echo(f"Generated data: {data}", err=False)

        # Export data
        target_exporter.export(data, config.output_path, schema=schema)

        typer.echo(f"Successfully generated {rows} rows of data to {output}")

    except FileNotFoundError as e:
        typer.echo(f"Schema file not found: {e}", err=True)
        raise typer.Exit(code=1) from e
    except ValueError as e:
        typer.echo(f"Value Error: {e}", err=True)
        raise typer.Exit(code=1) from e
    except Exception as e:
        # Last-resort handler: dump everything known about the failure.
        typer.echo(f"Error: {str(e)}", err=True)
        typer.echo(f"Exception type: {type(e).__name__}", err=True)
        typer.echo(f"Exception args: {e.args}", err=True)
        typer.echo(f"Traceback: {traceback.format_exc()}", err=True)
        raise typer.Exit(code=1) from e
155
+
156
+
157
def get_exporter(format: str) -> BaseExporter:
    """Look up the exporter registered for ``format``.

    Args:
        format: Export format key to look up in ``EXPORTER_CLASSES``.

    Returns:
        BaseExporter: The entry stored in ``EXPORTER_CLASSES`` under that key.

    Raises:
        ValueError: If ``format`` has no entry in ``EXPORTER_CLASSES``.

    """
    if format in EXPORTER_CLASSES:
        return EXPORTER_CLASSES[format]
    raise ValueError(f"Unsupported format: {format}")
163
+
164
+
165
+ if __name__ == "__main__":
166
+ app()
File without changes
@@ -0,0 +1,23 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any
3
+
4
+
5
+ class BaseExporter(ABC):
6
+ """Base class for all data exporters.
7
+
8
+ A common interface for data export operations.
9
+ """
10
+
11
+ @abstractmethod
12
+ def export(
13
+ self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
14
+ ) -> None:
15
+ """Export data to a specified output path.
16
+
17
+ Args:
18
+ data: List of dictionaries containing the data to export
19
+ output_path: Path where the exported data should be saved
20
+ schema: Dictionary containing schema definition
21
+
22
+ """
23
+ pass
@@ -0,0 +1,115 @@
1
+ import csv
2
+ import logging
3
+ from typing import Any
4
+
5
+ import pandas as pd
6
+
7
+ from .base_exporter import BaseExporter
8
+ from .utils.chunker import DataChunker
9
+ from .utils.constants import CHUNK_SIZE_CSV
10
+ from .utils.formatters import CSVFormatter
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class CsvExporter(BaseExporter):
    """Export data to CSV with support for chunked writing and data type formatting."""

    def __init__(self, chunk_size: int = CHUNK_SIZE_CSV) -> None:
        """Initialize CsvExporter with specified chunk size.

        Args:
            chunk_size (int, optional): Number of rows to write at once.
                Defaults to CHUNK_SIZE_CSV.

        Raises:
            ValueError: If chunk_size is less than or equal to zero.

        """
        if chunk_size <= 0:
            raise ValueError("Chunk size must be greater than zero.")
        self.chunk_size = chunk_size
        self.chunker = DataChunker(chunk_size)
        self.formatter = CSVFormatter()

    def export(
        self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
    ) -> None:
        """Export data to a CSV file with proper formatting and chunking.

        Rows are formatted with ``self.formatter`` and written one chunk at a
        time: the first chunk creates the file and writes the header, later
        chunks are appended. Only the current chunk is kept in memory.

        When ``data`` is empty, a header-only file is written if a schema is
        given; otherwise nothing is written and a warning is logged.

        Args:
            data (list[dict[str, Any]]): List of dictionaries for the data to export.
            output_path (str): Path to the output CSV file.
            schema (dict | None, optional): Schema definition for the data. Its
                keys define the column order; each key must be present in the
                first data row.

        Raises:
            ValueError: If data is invalid or file operations fail.

        """
        logger.info("Starting CSV export.")

        if not data:
            if schema:
                # Write an empty file with headers if schema is provided
                pd.DataFrame(columns=list(schema.keys())).to_csv(
                    output_path,
                    index=False,
                    quoting=csv.QUOTE_NONNUMERIC,
                    quotechar='"',
                    doublequote=True,
                    lineterminator="\n",
                    encoding="utf-8",
                )
                logger.info(f"Exported an empty file with headers to {output_path}.")
            else:
                logger.warning(
                    "No data provided and no schema to write headers. Exiting."
                )
            return

        try:
            # Determine fieldnames based on schema or data keys
            if schema:
                fieldnames = list(schema.keys())
                # Only the first row is checked against the schema.
                for field in fieldnames:
                    if field not in data[0]:
                        raise ValueError(
                            f"Field '{field}' in schema is not present in data."
                        )
            else:
                fieldnames = list(data[0].keys())

            first_chunk = True
            for chunk in self.chunker.chunk_data(data):
                # Format and write each chunk independently; nothing is
                # accumulated across chunks so memory stays bounded.
                formatted_chunk = [self.formatter.format_row(row) for row in chunk]
                df = pd.DataFrame(formatted_chunk, columns=fieldnames)

                df.to_csv(  # type: ignore
                    output_path,
                    index=False,  # Do not write the index column
                    quoting=csv.QUOTE_NONNUMERIC,  # Quote non-numeric fields
                    quotechar='"',  # Use double quotes for quoting
                    doublequote=True,  # Escape double quotes by doubling
                    lineterminator="\n",  # Use Unix-style line endings
                    encoding="utf-8",  # Use UTF-8 encoding
                    mode="w" if first_chunk else "a",  # Create, then append
                    header=first_chunk,  # Include header only in first chunk
                )
                first_chunk = False

            logger.info(f"Successfully exported {len(data)} rows to {output_path}.")

        # UnicodeEncodeError is a ValueError subclass, so it must be handled
        # before the generic ValueError clause below.
        except UnicodeEncodeError as e:
            logger.error(f"Encoding error: {e}")
            raise ValueError(f"Encoding error: {str(e)}") from e
        except OSError as e:
            logger.error(f"File operation error: {e}")
            raise ValueError(f"File operation error: {str(e)}") from e
        except ValueError as e:
            logger.error(f"Data validation error: {e}")
            raise
        except Exception as e:
            logger.error(f"Unexpected error: {e}")
            raise ValueError(f"Export failed: {str(e)}") from e
@@ -0,0 +1,89 @@
1
+ import json
2
+ import logging
3
+ from typing import Any
4
+
5
+ from .base_exporter import BaseExporter
6
+ from .utils.chunker import DataChunker
7
+ from .utils.constants import CHUNK_SIZE_JSON
8
+ from .utils.formatters import JSONFormatter
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class JsonExporter(BaseExporter):
    """Export data to JSON with chunked row formatting.

    Rows are formatted chunk by chunk, but the output file is written in a
    single pass as one JSON array.
    """

    def __init__(self, chunk_size: int = CHUNK_SIZE_JSON) -> None:
        """Initialize the JSON exporter with a specified chunk size.

        Args:
            chunk_size (int): Number of records to process at once.
                Defaults to CHUNK_SIZE_JSON.

        Raises:
            ValueError: If chunk_size is less than or equal to zero.

        """
        if chunk_size <= 0:
            raise ValueError("Chunk size must be greater than zero.")
        self.chunk_size = chunk_size
        self.chunker = DataChunker(chunk_size)
        self.formatter = JSONFormatter()

    def export(
        self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
    ) -> None:
        """Export data to a JSON file as an indented array of objects.

        When ``data`` is empty, an empty array is written if a schema is
        given; otherwise nothing is written and a warning is logged.

        Args:
            data (list[dict[str, Any]]): List of dictionaries for the data to export.
            output_path (str): Path to the output JSON file.
            schema (dict | None, optional): Schema definition for the data. Each
                key must be present in the first data row.

        Raises:
            ValueError: If data is invalid or file operations fail.

        """
        logger.info("Starting JSON export.")

        if not data:
            if schema:
                with open(output_path, "w", encoding="utf-8") as f:
                    json.dump([], f)
                logger.info("Exported empty array.")
            else:
                # Mirror the CSV exporter: make the no-op visible in the logs.
                logger.warning("No data provided and no schema supplied. Exiting.")
            return

        try:
            # Validate schema if provided (only the first row is checked)
            if schema:
                for field in schema:
                    if field not in data[0]:
                        raise ValueError(
                            f"Field '{field}' in schema is not present in data."
                        )

            # Format the rows chunk by chunk
            all_formatted_rows = [
                self.formatter.format_row(row)
                for chunk in self.chunker.chunk_data(data)
                for row in chunk
            ]

            # Serialise BEFORE opening the file: opening with "w" truncates it,
            # so a serialisation failure must not clobber an existing output.
            json_str = json.dumps(all_formatted_rows, indent=4)
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(json_str)

            logger.info(f"Successfully exported {len(data)} rows to {output_path}.")

        # UnicodeEncodeError is a ValueError subclass, so it must be handled
        # before the generic ValueError clause below.
        except UnicodeEncodeError as e:
            logger.error(f"Encoding error: {e}")
            raise ValueError(f"Encoding error: {str(e)}") from e
        except OSError as e:
            logger.error(f"File operation error: {e}")
            raise ValueError(f"File operation error: {str(e)}") from e
        except ValueError as e:
            logger.error(f"Data validation error: {e}")
            raise
        except Exception as e:
            # Anything else (e.g. a non-serialisable value) is logged and
            # propagated unchanged.
            logger.error(f"Unexpected error: {e}")
            raise
@@ -0,0 +1,198 @@
1
+ import decimal
2
+ from datetime import date, datetime
3
+ from typing import Any
4
+ from uuid import UUID
5
+
6
+ from .base_exporter import BaseExporter
7
+ from .utils.constants import DEFAULT_SCHEMA
8
+
9
# Schema type name -> SQL Server column type, looked up when building the
# CREATE TABLE statement.
MSSQL_TYPE_MAPPING = {
    # Character data: N-types so Unicode text is stored as-is.
    "string": "NVARCHAR(255)",
    "text": "NVARCHAR(MAX)",  # TEXT is deprecated in SQL Server
    # Numeric data.
    "integer": "INT",
    "bigint": "BIGINT",
    "float": "FLOAT",
    "decimal": "DECIMAL(18,2)",
    "boolean": "BIT",  # SQL Server has no BOOLEAN column type
    # Temporal data.
    "date": "DATE",
    "datetime": "DATETIME2",  # More precise than DATETIME
    # Binary and identifier data.
    "blob": "VARBINARY(MAX)",
    "uuid": "UNIQUEIDENTIFIER",  # Native GUID/UUID type
    # SQL Server has no ENUM type; store the value as text.
    "enum": "NVARCHAR(255)",
}
23
+
24
+
25
class MssqlExporter(BaseExporter):
    """Exports data to MSSQL compatible SQL file."""

    def _format_value(
        self,
        value: (
            None
            | str
            | UUID
            | datetime
            | date
            | bool
            | bytes
            | int
            | float
            | decimal.Decimal
        ),
    ) -> str:
        """Format a value as a T-SQL literal.

        Args:
            value: The value to format. Supported types:
                - None: rendered as ``NULL``
                - str: Unicode literal ``N'...'`` with embedded single quotes
                  doubled (T-SQL has no backslash escapes)
                - UUID: canonical string form wrapped in single quotes
                - datetime / date: ISO format string wrapped in single quotes
                - bool: ``1`` for True, ``0`` for False
                - bytes: hexadecimal binary constant with ``0x`` prefix
                - int / float / decimal.Decimal: ``str()`` representation

        Returns:
            str: The formatted literal ready for use in a T-SQL statement.

        Raises:
            ValueError: If the input value type is not supported.

        """
        if value is None:
            return "NULL"
        if isinstance(value, str):
            # T-SQL escapes a quote by doubling it; a backslash is an ordinary
            # character, so line breaks are kept verbatim inside the literal.
            # The N prefix keeps non-ASCII text intact for NVARCHAR columns.
            return "N'" + value.replace("'", "''") + "'"
        if isinstance(value, UUID):
            return f"'{value}'"
        if isinstance(value, (datetime, date)):
            return f"'{value.isoformat()}'"
        # bool is a subclass of int, so it must be tested before the numbers.
        if isinstance(value, bool):
            return "1" if value else "0"
        if isinstance(value, bytes):
            return f"0x{value.hex()}"
        if isinstance(value, (int, float, decimal.Decimal)):
            return str(value)
        raise ValueError(f"Unsupported type: {type(value)}")

    def _get_column_type(self, field: dict) -> str:
        """Get the MSSQL column type based on the field type.

        Args:
            field (dict): A dictionary containing field information, including its type.
                The type can be either a string or a dictionary with a 'type' key.
                A missing type is treated as "string".

        Returns:
            str: The corresponding MSSQL data type from MSSQL_TYPE_MAPPING.

        Raises:
            KeyError: If the field type is not found in MSSQL_TYPE_MAPPING.

        """
        field_type = field.get("type", "string")
        if isinstance(field_type, dict):
            field_type = field_type.get("type", "string")
        return MSSQL_TYPE_MAPPING[field_type]

    def _create_table_stmt(self, schema: dict, table_name: str = "output") -> str:
        """Generate a MSSQL CREATE TABLE statement based on provided schema.

        Every column is declared ``NULL``-able. For enum fields that list
        their values, a table-level CHECK constraint is appended after the
        column definitions.

        Args:
            schema (dict): A dictionary defining the table schema with field names
                as keys and type definitions as values.
            table_name (str, optional): Name for the table. Defaults to "output".

        Returns:
            str: A complete MSSQL CREATE TABLE statement as a string, followed
                by a blank line.

        Raises:
            KeyError: If a field type is not found in MSSQL_TYPE_MAPPING.

        Example:
            schema = {
                "id": "integer",
                "status": {"type": "enum", "values": ["active", "inactive"]}
            }
            result = _create_table_stmt(schema, "users")
            # Returns: CREATE TABLE users (
            #  id INT NULL,
            #  status NVARCHAR(255) NULL,
            #  CHECK (status IN (N'active',N'inactive'))
            # );

        """
        columns = []
        check_constraints = []

        for field_name, field_def in schema.items():
            is_detailed = isinstance(field_def, dict)
            field_type_dict = field_def if is_detailed else {"type": field_def}
            sql_type = self._get_column_type(field_type_dict)

            # An empty or missing value list would yield an invalid "IN ()".
            if is_detailed and field_def.get("type") == "enum" and field_def.get("values"):
                # Reuse the literal formatter so quotes inside enum values are
                # escaped and non-string values do not break the join.
                values = ",".join(
                    self._format_value(str(item)) for item in field_def["values"]
                )
                check_constraints.append(f"CHECK ({field_name} IN ({values}))")

            columns.append(f" {field_name} {sql_type} NULL")

        # Combine columns and check constraints
        constraint_sql = (
            (",\n " + ",\n ".join(check_constraints)) if check_constraints else ""
        )
        return (
            f"CREATE TABLE {table_name} (\n"
            + ",\n".join(columns)
            + constraint_sql
            + "\n);\n\n"
        )

    def _create_insert_stmt(
        self, row: dict[str, Any], table_name: str = "output"
    ) -> str:
        """Create a MSSQL INSERT statement from a dictionary of values.

        Args:
            row (dict[str, Any]): Dictionary containing column names as keys
                and values to insert
            table_name (str, optional): Name of the target table. Defaults to "output"

        Returns:
            str: Formatted MSSQL INSERT statement string

        Raises:
            ValueError: If a value has a type ``_format_value`` does not support.

        Example:
            row = {"id": 1, "name": "test"}
            _create_insert_stmt(row, "users")
            # Returns: INSERT INTO users (id, name) VALUES (1, N'test');

        """
        columns = ", ".join(row.keys())
        values = ", ".join(self._format_value(v) for v in row.values())
        return f"INSERT INTO {table_name} ({columns}) VALUES ({values});"

    def export(
        self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
    ) -> None:
        """Export data to MSSQL compatible SQL file.

        Writes a CREATE TABLE statement followed by one INSERT statement per
        row. The table is named after the output file (the part of the file
        name before its first dot). Nothing is written when ``data`` is empty.

        Args:
            data (list[dict[str, Any]]): Rows to export.
            output_path (str): Path to the output SQL file.
            schema (dict | None, optional): Schema used for the CREATE TABLE
                statement; DEFAULT_SCHEMA is used when omitted.

        """
        if not data:
            return

        # Accept both "/" and "\" as separators so Windows-style paths do not
        # leak directory names into the table name; fall back to the helpers'
        # default name when the file name starts with a dot.
        file_name = output_path.replace("\\", "/").rsplit("/", 1)[-1]
        table_name = file_name.split(".")[0] or "output"

        # Explicit UTF-8 keeps the file identical across platforms and matches
        # the other exporters.
        with open(output_path, "w", encoding="utf-8") as f:
            # Always write CREATE TABLE using DEFAULT_SCHEMA if no schema provided
            schema_to_use = schema or DEFAULT_SCHEMA
            f.write(self._create_table_stmt(schema_to_use, table_name))
            for row in data:
                f.write(self._create_insert_stmt(row, table_name) + "\n")