PyPI - TestDataX - Versions diffs - 0.1.0__py3-none-any.whl - Mend

TestDataX 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

src/__init__.py +7 -0
src/cli.py +166 -0
src/exporters/__init__.py +0 -0
src/exporters/base_exporter.py +23 -0
src/exporters/csv_exporter.py +115 -0
src/exporters/json_exporter.py +89 -0
src/exporters/mssql_exporter.py +198 -0
src/exporters/mysql_exporter.py +184 -0
src/exporters/oracle_exporter.py +205 -0
src/exporters/orc_exporter.py +100 -0
src/exporters/parquet_exporter.py +102 -0
src/exporters/utils/__init__.py +0 -0
src/exporters/utils/chunker.py +27 -0
src/exporters/utils/constants.py +55 -0
src/exporters/utils/exporter_config.py +17 -0
src/exporters/utils/formatters.py +165 -0
src/generator.py +117 -0
src/providers/__init__.py +4 -0
src/providers/base.py +58 -0
src/providers/faker_provider.py +65 -0
src/schemas.py +81 -0
testdatax-0.1.0.dist-info/LICENSE +21 -0
testdatax-0.1.0.dist-info/METADATA +345 -0
testdatax-0.1.0.dist-info/RECORD +26 -0
testdatax-0.1.0.dist-info/WHEEL +4 -0
testdatax-0.1.0.dist-info/entry_points.txt +3 -0

src/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+"""TestDataX package initialization."""
+__version__ = "0.1.0"
+from src.cli import app  # noqa
+__all__ = ["app"]

src/cli.py ADDED Viewed

@@ -0,0 +1,166 @@
+import json
+import traceback
+from pathlib import Path
+from typing import Any
+import typer
+from .exporters.base_exporter import BaseExporter
+from .exporters.utils.constants import DEFAULT_SCHEMA, EXPORT_FORMATS
+from .exporters.utils.exporter_config import EXPORTER_CLASSES
+from .generator import DataGenerator
+from .schemas import DataType, FieldSchema, GeneratorConfig
def load_schema(schema_path: Path) -> dict[str, Any]:
    """Load a schema definition from a JSON file.

    Args:
        schema_path: Path to the JSON schema file.

    Returns:
        The parsed schema. The top-level JSON value must be an object.

    Raises:
        FileNotFoundError: If the schema file doesn't exist.
        json.JSONDecodeError: If the schema file contains invalid JSON.
        ValueError: If the file parses but its top-level value is not a
            JSON object (for example a list or a bare string).
    """
    with open(schema_path, encoding="utf-8") as f:
        data = json.load(f)

    # Validate once the file handle has been released; nothing below needs it.
    if not isinstance(data, dict):
        raise ValueError("Schema file does not contain a JSON object.")
    return data
# Typer application object; the `generate` function below is registered on it
# through the `@app.command()` decorator.
app = typer.Typer()

# CLI option declarations, built once at module level and used as the default
# values of the `generate` parameters.
# `...` as the default marks --output as required.
OUTPUT_PATH_OPTION = typer.Option(..., "--output", "-o", help="Output file path")
# Defaults to "csv"; the help text embeds EXPORT_FORMATS so the supported
# formats show up in --help.
FORMAT_OPTION = typer.Option(
    "csv", "--format", "-f", help=f"Output format ({EXPORT_FORMATS})"
)
# Defaults to 10 rows; `generate` rejects zero or negative values itself.
ROWS_OPTION = typer.Option(10, "--rows", "-r", help="Number of rows to generate")
# Optional; when omitted `generate` falls back to DEFAULT_SCHEMA.
SCHEMA_PATH_OPTION = typer.Option(None, "--schema", "-s", help="Path to schema file")
# Off by default; when set, `generate` echoes intermediate state.
DEBUG_OPTION = typer.Option(False, "--debug", "-d", help="Enable debug output")
def _is_strict_int(value: Any) -> bool:
    """Return True for real integers, rejecting bool (a subclass of int)."""
    return isinstance(value, int) and not isinstance(value, bool)


def _validate_bounds(
    name: str, field_type: DataType, min_value: Any, max_value: Any
) -> None:
    """Raise ValueError when min/max are not valid for the field's type."""
    if field_type in {DataType.INTEGER, DataType.BIGINT}:
        if min_value is not None and not _is_strict_int(min_value):
            raise ValueError(f"Invalid min value for field '{name}': {min_value}")
        if max_value is not None and not _is_strict_int(max_value):
            raise ValueError(f"Invalid max value for field '{name}': {max_value}")
    elif field_type == DataType.STRING:
        # String fields do not take numeric bounds at all.
        if min_value is not None or max_value is not None:
            raise ValueError(
                f"Invalid min or max value for field '{name}': "
                f"{min_value}, {max_value}"
            )


def _build_fields(schema: dict[str, Any]) -> list[dict[str, Any]]:
    """Convert a raw schema mapping into a list of dumped FieldSchema dicts."""
    fields = []
    for name, field_def in schema.items():
        if not isinstance(field_def, dict):
            # Shorthand form: the value is just the type name.
            fields.append(
                FieldSchema(name=name, type=DataType(field_def)).model_dump()
            )
            continue

        if "type" not in field_def:
            raise ValueError(f"Field '{name}' missing required 'type' key")

        min_value = field_def.get("min")
        max_value = field_def.get("max")
        field_type = DataType(field_def["type"])
        _validate_bounds(name, field_type, min_value, max_value)

        field_schema = FieldSchema(
            name=name,
            type=field_type,
            enum_values=field_def.get("values"),
            min_value=min_value,
            max_value=max_value,
            right_digits=field_def.get("right_digits"),
            value_provider=field_def.get("faker"),
            pattern=field_def.get("pattern"),
        )
        fields.append(field_schema.model_dump())
    return fields


@app.command()
def generate(
    output: Path = OUTPUT_PATH_OPTION,
    format: str = FORMAT_OPTION,
    rows: int = ROWS_OPTION,
    schema_path: Path | None = SCHEMA_PATH_OPTION,
    debug: bool = DEBUG_OPTION,
) -> None:
    """Generate synthetic data based on the provided schema.

    Args:
        output: Destination file for the exported data.
        format: Export format; must be one of EXPORT_FORMATS.
        rows: Number of rows to generate; must be greater than zero.
        schema_path: Optional JSON schema file. DEFAULT_SCHEMA is used when
            it is omitted.
        debug: When True, echo intermediate state (schema, fields, config,
            generated data) to stdout.

    Raises:
        typer.Exit: With exit code 1 on any failure. The cause is echoed to
            stderr before exiting.
    """
    try:
        if debug:
            typer.echo(
                f"Starting data generation with schema: {schema_path}", err=False
            )
        schema = load_schema(schema_path) if schema_path else DEFAULT_SCHEMA

        if rows <= 0:
            raise ValueError("Number of rows must be greater than zero")

        if debug:
            typer.echo(f"Loaded schema: {schema}", err=False)
        fields = _build_fields(schema)

        if format not in EXPORT_FORMATS:
            raise ValueError(f"Unsupported format: {format}")

        if debug:
            typer.echo(f"Converted fields: {fields}", err=False)
        config = GeneratorConfig(
            fields=fields, row_count=rows, export_format=format, output_path=str(output)
        )

        if debug:
            typer.echo(f"Generator config: {config}", err=False)
        generator = DataGenerator()
        data = generator.generate_data(config.fields, config.row_count)

        exporter = get_exporter(config.export_format)
        if debug:
            typer.echo(f"Generated data: {data}", err=False)
        # The raw schema (not the converted fields) is what exporters receive.
        exporter.export(data, config.output_path, schema=schema)
        typer.echo(f"Successfully generated {rows} rows of data to {output}")
    except FileNotFoundError as e:
        typer.echo(f"Schema file not found: {e}", err=True)
        raise typer.Exit(code=1) from e
    except ValueError as e:
        typer.echo(f"Value Error: {e}", err=True)
        raise typer.Exit(code=1) from e
    except Exception as e:
        # Top-level CLI boundary: report everything we know, then exit non-zero.
        typer.echo(f"Error: {e}", err=True)
        typer.echo(f"Exception type: {type(e).__name__}", err=True)
        typer.echo(f"Exception args: {e.args}", err=True)
        typer.echo(f"Traceback: {traceback.format_exc()}", err=True)
        raise typer.Exit(code=1) from e
def get_exporter(format: str) -> BaseExporter:
    """Look up the exporter registered for ``format``.

    Raises:
        ValueError: If no exporter is registered under ``format``.
    """
    if format in EXPORTER_CLASSES:
        return EXPORTER_CLASSES[format]
    raise ValueError(f"Unsupported format: {format}")
+if __name__ == "__main__":
+    app()

src/exporters/__init__.py ADDED Viewed

File without changes

src/exporters/base_exporter.py ADDED Viewed

@@ -0,0 +1,23 @@
+from abc import ABC, abstractmethod
+from typing import Any
+class BaseExporter(ABC):
+    """Base class for all data exporters.
+    A common interface for data export operations.
+    """
+    @abstractmethod
+    def export(
+        self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
+    ) -> None:
+        """Export data to a specified output path.
+        Args:
+            data: List of dictionaries containing the data to export
+            output_path: Path where the exported data should be saved
+            schema: Dictionary containing schema definition
+        """
+        pass

src/exporters/csv_exporter.py ADDED Viewed

@@ -0,0 +1,115 @@
+import csv
+import logging
+from typing import Any
+import pandas as pd
+from .base_exporter import BaseExporter
+from .utils.chunker import DataChunker
+from .utils.constants import CHUNK_SIZE_CSV
+from .utils.formatters import CSVFormatter
+logger = logging.getLogger(__name__)
class CsvExporter(BaseExporter):
    """Export data to CSV with support for chunked writing and data type formatting."""

    # pandas ``to_csv`` settings shared by the header-only and chunked writes:
    # no index column, non-numeric fields quoted with doubled double quotes,
    # Unix line endings, UTF-8 output.
    _TO_CSV_OPTIONS: dict[str, Any] = {
        "index": False,
        "quoting": csv.QUOTE_NONNUMERIC,
        "quotechar": '"',
        "doublequote": True,
        "lineterminator": "\n",
        "encoding": "utf-8",
    }

    def __init__(self, chunk_size: int = CHUNK_SIZE_CSV) -> None:
        """Initialize CsvExporter with specified chunk size.

        Args:
            chunk_size: Number of rows to write at once.
                Defaults to CHUNK_SIZE_CSV.

        Raises:
            ValueError: If chunk_size is less than or equal to zero.
        """
        if chunk_size <= 0:
            raise ValueError("Chunk size must be greater than zero.")
        self.chunk_size = chunk_size
        self.chunker = DataChunker(chunk_size)
        self.formatter = CSVFormatter()

    def export(
        self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
    ) -> None:
        """Export data to a CSV file with proper formatting and chunking.

        With no data, a header-only file is written when a schema is given;
        otherwise nothing is written. With data, rows are formatted and
        appended to the file one chunk at a time.

        Args:
            data: List of dictionaries for the data to export.
            output_path: Path to the output CSV file.
            schema: Optional schema definition. Its keys fix the column
                order and must all be present in the first data row.

        Raises:
            ValueError: If data is invalid or file operations fail.
        """
        logger.info("Starting CSV export.")
        if not data:
            if schema:
                # Write an empty file with headers if schema is provided.
                pd.DataFrame(columns=list(schema)).to_csv(
                    output_path, **self._TO_CSV_OPTIONS
                )
                logger.info(f"Exported an empty file with headers to {output_path}.")
            else:
                logger.warning(
                    "No data provided and no schema to write headers. Exiting."
                )
            return

        try:
            # Column order comes from the schema when present, otherwise from
            # the first row.
            if schema:
                fieldnames = list(schema)
                for field in fieldnames:
                    if field not in data[0]:
                        raise ValueError(
                            f"Field '{field}' in schema is not present in data."
                        )
            else:
                fieldnames = list(data[0])

            for index, chunk in enumerate(self.chunker.chunk_data(data)):
                is_first = index == 0
                # Only the current chunk is held in formatted form; keeping
                # every formatted row around would defeat chunked writing.
                formatted_chunk = [self.formatter.format_row(row) for row in chunk]
                frame = pd.DataFrame(formatted_chunk, columns=fieldnames)
                frame.to_csv(  # type: ignore
                    output_path,
                    mode="w" if is_first else "a",  # truncate once, then append
                    header=is_first,  # header only in the first chunk
                    **self._TO_CSV_OPTIONS,
                )
            logger.info(f"Successfully exported {len(data)} rows to {output_path}.")
        except UnicodeEncodeError as e:
            # Must precede the ValueError handler: it is a ValueError subclass.
            logger.error(f"Encoding error: {e}")
            raise ValueError(f"Encoding error: {str(e)}") from e
        except OSError as e:
            logger.error(f"File operation error: {e}")
            raise ValueError(f"File operation error: {str(e)}") from e
        except ValueError as e:
            logger.error(f"Data validation error: {e}")
            raise
        except Exception as e:
            logger.error(f"Unexpected error: {e}")
            raise ValueError(f"Export failed: {str(e)}") from e

src/exporters/json_exporter.py ADDED Viewed

@@ -0,0 +1,89 @@
+import json
+import logging
+from typing import Any
+from .base_exporter import BaseExporter
+from .utils.chunker import DataChunker
+from .utils.constants import CHUNK_SIZE_JSON
+from .utils.formatters import JSONFormatter
+logger = logging.getLogger(__name__)
class JsonExporter(BaseExporter):
    """Export data to JSON with support for chunked formatting of rows."""

    def __init__(self, chunk_size: int = CHUNK_SIZE_JSON) -> None:
        """Initialize the JSON exporter with a specified chunk size.

        Args:
            chunk_size: Number of records to process at once.
                Defaults to CHUNK_SIZE_JSON.

        Raises:
            ValueError: If chunk_size is less than or equal to zero.
        """
        if chunk_size <= 0:
            raise ValueError("Chunk size must be greater than zero.")
        self.chunk_size = chunk_size
        self.chunker = DataChunker(chunk_size)
        self.formatter = JSONFormatter()

    def export(
        self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
    ) -> None:
        """Export data to a JSON file as a single indented array.

        With no data, an empty JSON array is written when a schema is given;
        otherwise nothing is written. With data, rows are formatted chunk by
        chunk and then written as one array.

        Args:
            data: List of dictionaries for the data to export.
            output_path: Path to the output JSON file.
            schema: Optional schema definition. Its keys must all be present
                in the first data row.

        Raises:
            ValueError: If data is invalid or file operations fail.
        """
        logger.info("Starting JSON export.")
        if not data:
            if schema:
                with open(output_path, "w", encoding="utf-8") as f:
                    json.dump([], f)
                logger.info("Exported empty array.")
            else:
                logger.warning("No data provided and no schema given. Exiting.")
            return

        try:
            # Validate schema if provided.
            if schema:
                for field in schema:
                    if field not in data[0]:
                        raise ValueError(
                            f"Field '{field}' in schema is not present in data."
                        )

            all_formatted_rows = [
                self.formatter.format_row(row)
                for chunk in self.chunker.chunk_data(data)
                for row in chunk
            ]

            # Serialize BEFORE opening the file: opening with "w" truncates,
            # so a serialization error must not leave an emptied output file.
            json_str = json.dumps(all_formatted_rows, indent=4)
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(json_str)
            logger.info(f"Successfully exported {len(data)} rows to {output_path}.")
        except UnicodeEncodeError as e:
            # Must precede the ValueError handler: it is a ValueError subclass.
            logger.error(f"Encoding error: {e}")
            raise ValueError(f"Encoding error: {str(e)}") from e
        except OSError as e:
            logger.error(f"File operation error: {e}")
            raise ValueError(f"File operation error: {str(e)}") from e
        except ValueError as e:
            logger.error(f"Data validation error: {e}")
            raise
        except Exception as e:
            logger.error(f"Unexpected error: {e}")
            raise

src/exporters/mssql_exporter.py ADDED Viewed

@@ -0,0 +1,198 @@
+import decimal
+from datetime import date, datetime
+from typing import Any
+from uuid import UUID
+from .base_exporter import BaseExporter
+from .utils.constants import DEFAULT_SCHEMA
# Maps schema type names to the SQL Server column type emitted in the
# generated CREATE TABLE statement (looked up by `_get_column_type`).
MSSQL_TYPE_MAPPING = {
    "string": "NVARCHAR(255)",  # Unicode string support
    "text": "NVARCHAR(MAX)",  # Unicode text, replaces TEXT which is deprecated
    "integer": "INT",  # Native integer type
    "bigint": "BIGINT",  # Native 64-bit integer type
    "float": "FLOAT",  # Native floating-point type
    "decimal": "DECIMAL(18,2)",  # Fixed precision 18, scale 2
    "boolean": "BIT",  # MSSQL uses BIT instead of TINYINT for boolean
    "date": "DATE",  # Native date type
    "datetime": "DATETIME2",  # More precise than DATETIME
    "blob": "VARBINARY(MAX)",  # MSSQL equivalent for BLOB
    "uuid": "UNIQUEIDENTIFIER",  # Native GUID/UUID type in MSSQL
    "enum": "NVARCHAR(255)",  # MSSQL doesn't have ENUM, use NVARCHAR instead
}
class MssqlExporter(BaseExporter):
    """Exports data to MSSQL compatible SQL file."""

    def _format_value(
        self,
        value: (
            None
            | str
            | UUID
            | datetime
            | date
            | bool
            | bytes
            | int
            | float
            | decimal.Decimal
        ),
    ) -> str:
        """Format a value as a T-SQL literal.

        Args:
            value: The value to format. Supported types:
                - None: rendered as ``NULL``
                - str / UUID: single-quoted, embedded ``'`` doubled, newlines
                  emitted as ``CHAR(10)`` concatenations
                - datetime / date: ISO format string in single quotes
                - bool: ``1`` for True, ``0`` for False
                - bytes: ``0x``-prefixed hexadecimal
                - int / float / decimal.Decimal: plain string representation

        Returns:
            The formatted value ready for use in a MSSQL statement.

        Raises:
            ValueError: If the input value type is not supported.
        """
        if value is None:
            return "NULL"
        if isinstance(value, (str, UUID)):
            # T-SQL has no backslash escapes: a quote is escaped by doubling
            # it. Double quotes first, then splice in the newline
            # concatenation (whose own quotes must not be doubled). CHAR(10)
            # keeps each statement on one line without altering the data.
            escaped = str(value).replace("'", "''")
            return "'" + escaped.replace("\n", "' + CHAR(10) + '") + "'"
        if isinstance(value, (datetime, date)):
            return f"'{value.isoformat()}'"
        # bool must be tested before int because bool is an int subclass.
        if isinstance(value, bool):
            return "1" if value else "0"
        if isinstance(value, bytes):
            return f"0x{value.hex()}"
        if isinstance(value, (int, float, decimal.Decimal)):
            return str(value)
        raise ValueError(f"Unsupported type: {type(value)}")

    def _get_column_type(self, field: dict) -> str:
        """Get the MSSQL column type based on the field type.

        Args:
            field: A dictionary containing field information. Its ``type``
                entry is either a type name or a nested dictionary with its
                own ``type`` key; a missing entry defaults to ``"string"``.

        Returns:
            The corresponding MSSQL data type from MSSQL_TYPE_MAPPING.

        Raises:
            KeyError: If the field type is not found in MSSQL_TYPE_MAPPING.
        """
        field_type = field.get("type", "string")
        if isinstance(field_type, dict):
            field_type = field_type.get("type", "string")
        return MSSQL_TYPE_MAPPING[field_type]

    def _create_table_stmt(self, schema: dict, table_name: str = "output") -> str:
        """Generate a MSSQL CREATE TABLE statement based on provided schema.

        Every column is declared ``NULL``. For enum fields that list their
        ``values``, a CHECK constraint restricting the column to those values
        is appended after the column definitions.

        Args:
            schema: Table schema with field names as keys and type
                definitions (a type name or a dict with a ``type`` key) as
                values.
            table_name: Name for the table. Defaults to "output".

        Returns:
            A complete MSSQL CREATE TABLE statement followed by a blank line.

        Raises:
            KeyError: If a field type is not found in MSSQL_TYPE_MAPPING.

        Example:
            schema = {
                "id": "integer",
                "status": {"type": "enum", "values": ["active", "inactive"]}
            }
            result = _create_table_stmt(schema, "users")
            # Returns: CREATE TABLE users (
            #     id INT NULL,
            #     status NVARCHAR(255) NULL,
            #     CHECK (status IN ('active','inactive'))
            # );
        """
        columns = []
        check_constraints = []
        for field_name, field_def in schema.items():
            is_dict = isinstance(field_def, dict)
            sql_type = self._get_column_type(
                field_def if is_dict else {"type": field_def}
            )
            if is_dict and field_def.get("type") == "enum" and "values" in field_def:
                # Route values through _format_value so embedded quotes are
                # escaped and non-string values do not break the join.
                values = ",".join(self._format_value(v) for v in field_def["values"])
                check_constraints.append(f"    CHECK ({field_name} IN ({values}))")
            columns.append(f"    {field_name} {sql_type} NULL")

        # Column definitions first, then table-level CHECK constraints.
        body = ",\n".join(columns + check_constraints)
        return f"CREATE TABLE {table_name} (\n{body}\n);\n\n"

    def _create_insert_stmt(
        self, row: dict[str, Any], table_name: str = "output"
    ) -> str:
        """Create a MSSQL INSERT statement from a dictionary of values.

        Args:
            row: Dictionary containing column names as keys and the values
                to insert.
            table_name: Name of the target table. Defaults to "output".

        Returns:
            Formatted MSSQL INSERT statement string.

        Raises:
            ValueError: If a value has a type _format_value does not support.

        Example:
            >>> row = {"id": 1, "name": "test"}
            >>> _create_insert_stmt(row, "users")
            "INSERT INTO users (id, name) VALUES (1, 'test');"
        """
        columns = ", ".join(row)
        values = ", ".join(self._format_value(v) for v in row.values())
        return f"INSERT INTO {table_name} ({columns}) VALUES ({values});"

    def export(
        self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
    ) -> None:
        """Export data to MSSQL compatible SQL file.

        Writes one CREATE TABLE statement followed by one INSERT statement
        per row. The table is named after the output file: its base name up
        to the first dot. Nothing is written when ``data`` is empty.

        Args:
            data: Rows to export, one dictionary per row.
            output_path: Path of the SQL file to write.
            schema: Schema used for CREATE TABLE. DEFAULT_SCHEMA is used
                when it is omitted or empty.
        """
        from pathlib import Path  # local import: not in this module's imports

        if not data:
            return

        # Path handles the platform's separators; splitting on "/" alone
        # would keep directory parts in the table name on Windows.
        table_name = Path(output_path).name.split(".")[0]
        # Explicit UTF-8, like the other exporters, so Unicode values do not
        # depend on the platform's default encoding.
        with open(output_path, "w", encoding="utf-8") as f:
            # Always write CREATE TABLE using DEFAULT_SCHEMA if no schema provided
            schema_to_use = schema or DEFAULT_SCHEMA
            f.write(self._create_table_stmt(schema_to_use, table_name))
            f.writelines(
                self._create_insert_stmt(row, table_name) + "\n" for row in data
            )