TestDataX 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ from ..csv_exporter import CsvExporter
2
+ from ..json_exporter import JsonExporter
3
+ from ..mssql_exporter import MssqlExporter
4
+ from ..mysql_exporter import MysqlExporter
5
+ from ..oracle_exporter import OracleExporter
6
+ from ..orc_exporter import OrcExporter
7
+ from ..parquet_exporter import ParquetExporter
8
+
9
# Registry mapping each export-format key to the exporter that handles it.
# NOTE(review): despite the "_CLASSES" suffix, each value is the result of
# calling the exporter (i.e. an object built once at import time), not the
# class itself - confirm callers expect ready-made instances.
EXPORTER_CLASSES = {
    "csv": CsvExporter(),
    "json": JsonExporter(),
    "parquet": ParquetExporter(),
    "orc": OrcExporter(),
    "mysql": MysqlExporter(),
    "mssql": MssqlExporter(),
    "oracle": OracleExporter(),
}
@@ -0,0 +1,165 @@
1
+ import json
2
+ import uuid
3
+ from abc import abstractmethod
4
+ from datetime import date, datetime
5
+ from decimal import Decimal
6
+ from typing import Any
7
+
8
+
9
+ class BaseFormatter:
10
+ """Base class for handling data type formatting across exporters."""
11
+
12
+ @staticmethod
13
+ def format_none() -> str | None:
14
+ """Format None values."""
15
+ return None
16
+
17
+ @staticmethod
18
+ def format_datetime(value: datetime | date) -> str:
19
+ """Format datetime and date values."""
20
+ return value.isoformat()
21
+
22
+ @staticmethod
23
+ def format_uuid(value: uuid.UUID) -> str:
24
+ """Format UUID values."""
25
+ return str(value)
26
+
27
+ @staticmethod
28
+ def format_bytes(value: bytes) -> str:
29
+ """Format bytes values."""
30
+ return value.hex()
31
+
32
+ @staticmethod
33
+ def format_decimal(value: Decimal) -> float:
34
+ """Format Decimal values."""
35
+ return float(value)
36
+
37
+ @staticmethod
38
+ def format_string(value: str) -> str:
39
+ """Format string values, removing null bytes."""
40
+ return str(value).replace("\x00", "")
41
+
42
+ @abstractmethod
43
+ def format_value(
44
+ self,
45
+ value: (
46
+ None
47
+ | datetime
48
+ | date
49
+ | uuid.UUID
50
+ | bytes
51
+ | Decimal
52
+ | dict
53
+ | list
54
+ | int
55
+ | float
56
+ | str
57
+ ),
58
+ ) -> None | str | float | dict | list | int:
59
+ """Format a single value according to export format requirements.
60
+
61
+ Args:
62
+ value: The value to format
63
+
64
+ Returns:
65
+ Formatted value suitable for the target format
66
+
67
+ """
68
+ pass
69
+
70
+ def format_row(
71
+ self, row: dict[str, Any], **kwargs: dict[str, str | int | float]
72
+ ) -> dict[str, Any]:
73
+ """Format the provided rows with the correct format_value.
74
+
75
+ Args:
76
+ row: Dictionary containing row data
77
+ **kwargs: Additional format-specific parameters
78
+
79
+ Returns:
80
+ Formatted row dictionary
81
+
82
+ """
83
+ formatted_row: dict[str, Any] = {}
84
+ for key, value in row.items():
85
+ try:
86
+ formatted_row[key] = self.format_value(value)
87
+ except Exception as e:
88
+ formatted_row[key] = f"ERROR: {str(e)}"
89
+ return formatted_row
90
+
91
+
92
class JSONFormatter(BaseFormatter):
    """Formatter that prepares values for JSON exports."""

    @classmethod
    def format_value(
        cls,
        value: (
            datetime
            | date
            | uuid.UUID
            | bytes
            | Decimal
            | dict
            | list
            | int
            | float
            | str
            | None
        ),
    ) -> str | float | dict | list | int | None:
        """Convert one value into its JSON-export representation.

        The checks run in a fixed order: ``None``, date/datetime, UUID, bytes
        and Decimal go through the matching base-class helper; dict, list,
        int and float values are returned unchanged (containers are not
        traversed); anything else is stringified and stripped of NUL
        characters.
        """
        if value is None:
            return cls.format_none()
        if isinstance(value, (datetime, date)):
            return cls.format_datetime(value)
        if isinstance(value, uuid.UUID):
            return cls.format_uuid(value)
        if isinstance(value, bytes):
            return cls.format_bytes(value)
        if isinstance(value, Decimal):
            return cls.format_decimal(value)
        if isinstance(value, (dict, list, int, float)):
            # Passed through untouched, including anything nested inside.
            return value
        return cls.format_string(str(value))
127
+
128
+
129
class CSVFormatter(BaseFormatter):
    """Formatter that prepares values for CSV exports."""

    @classmethod
    def format_value(
        cls,
        value: (
            datetime
            | date
            | uuid.UUID
            | bytes
            | Decimal
            | dict
            | list
            | int
            | float
            | str
            | None
        ),
    ) -> str | float | dict | list | int | None:
        """Convert one value into its CSV-cell representation.

        ``None``, date/datetime, UUID, bytes and Decimal values go through the
        matching base-class helper. dict and list values are serialised to a
        JSON string. int and float values are returned unchanged. Anything
        else is stringified and stripped of NUL characters.

        Returns:
            The formatted value.

        Raises:
            TypeError: If a dict value has keys the ``json`` module cannot
                encode (only values, not keys, are converted).
        """
        if value is None:
            return cls.format_none()
        if isinstance(value, (datetime, date)):
            return cls.format_datetime(value)
        if isinstance(value, uuid.UUID):
            return cls.format_uuid(value)
        if isinstance(value, bytes):
            return cls.format_bytes(value)
        if isinstance(value, Decimal):
            return cls.format_decimal(value)
        if isinstance(value, (dict, list)):
            # Nested values json cannot encode natively (datetime, UUID,
            # bytes, Decimal, ...) are routed back through format_value, so a
            # container is formatted with the same rules as a top-level cell
            # instead of raising "not JSON serializable".
            return json.dumps(value, default=cls.format_value)
        if isinstance(value, (int, float)):
            return value
        return cls.format_string(str(value))
src/generator.py ADDED
@@ -0,0 +1,117 @@
1
+ from datetime import date, datetime
2
+ from decimal import Decimal
3
+ from typing import Any
4
+ from uuid import UUID
5
+
6
+ from faker import Faker
7
+
8
+ from .providers import DataProvider, FakerProvider
9
+ from .schemas import DataType, FieldSchema
10
+
11
+
12
class DataGenerator:
    """Generate synthetic rows from a list of field schemas.

    Values are produced by the injected ``DataProvider`` (a ``FakerProvider``
    when none is supplied). FLOAT is the exception: the provider interface has
    no float method, so it is produced by this class's own ``Faker`` instance.

    Supported data types:
        - STRING: Output of the provider named by ``value_provider``
          (default ``"name"``)
        - TEXT: Free text from the provider
        - INTEGER: Integers between ``min_value`` and ``max_value``
          (defaults 0 and 100)
        - BIGINT: Integers between ``min_value`` and ``max_value``
          (defaults 0 and 9999999999)
        - FLOAT: Floating point numbers with 2 decimal places
        - DECIMAL: Decimal values from the provider
        - BOOLEAN: True/False values
        - DATE: date objects
        - DATETIME: datetime objects
        - BLOB: Binary data
        - UUID: UUID objects
        - ENUM: Values from the field's ``enum_values`` list

    """

    # Inclusive (min, max) ranges used when a field does not set its own bounds.
    _INTEGER_RANGE = (0, 100)
    _BIGINT_RANGE = (0, 9999999999)

    def __init__(self, provider: DataProvider | None = None) -> None:
        """Store the provider and build the DataType -> generator mapping.

        Args:
            provider: Source of generated values. A ``FakerProvider`` is
                created when this is None.
        """
        # Compare against None rather than truthiness so that a provider
        # object that happens to be falsy is still honoured.
        self.provider = provider if provider is not None else FakerProvider()
        # Used for FLOAT values, which the provider interface does not cover.
        self.faker = Faker()
        self.type_generators = {
            DataType.STRING: self._generate_string,
            DataType.TEXT: self._generate_text,
            DataType.INTEGER: self._generate_integer,
            DataType.BIGINT: self._generate_bigint,
            DataType.FLOAT: self._generate_float,
            DataType.DECIMAL: self._generate_decimal,
            DataType.BOOLEAN: self._generate_boolean,
            DataType.DATE: self._generate_date,
            DataType.DATETIME: self._generate_datetime,
            DataType.BLOB: self._generate_blob,
            DataType.UUID: self._generate_uuid,
            DataType.ENUM: self._generate_enum,
        }

    def generate_data(
        self, fields: list[FieldSchema], count: int
    ) -> list[dict[str, Any]]:
        """Generate ``count`` rows, each mapping field name to a fresh value.

        Args:
            fields: Schemas of the fields every row must contain.
            count: Number of rows to produce; values <= 0 yield an empty list.

        Returns:
            A list of row dictionaries keyed by field name.

        Raises:
            KeyError: If a field's type has no registered generator.
            ValueError: If an ENUM field has no ``enum_values``.
        """
        return [
            {field.name: self.type_generators[field.type](field) for field in fields}
            for _ in range(count)
        ]

    @staticmethod
    def _int_bounds(field: FieldSchema, defaults: tuple[int, int]) -> tuple[int, int]:
        """Return the field's (min, max) as ints, using defaults for unset bounds."""
        low = getattr(field, "min_value", None)
        high = getattr(field, "max_value", None)
        return (
            defaults[0] if low is None else int(low),
            defaults[1] if high is None else int(high),
        )

    def _generate_string(self, field: FieldSchema) -> str:
        """Return a string from the provider named by ``field.value_provider``."""
        provider_value = str(field.value_provider) if field.value_provider else "name"
        return self.provider.generate_string(value_provider=provider_value)

    def _generate_text(self, field: FieldSchema) -> str:
        """Return free text from the provider."""
        return self.provider.generate_text()

    def _generate_integer(self, field: FieldSchema) -> int:
        """Return an integer within the field's bounds (default 0..100)."""
        min_val, max_val = self._int_bounds(field, self._INTEGER_RANGE)
        return self.provider.generate_integer(min_value=min_val, max_value=max_val)

    def _generate_bigint(self, field: FieldSchema) -> int:
        """Return a large integer within the field's bounds (default 0..9999999999).

        Goes through the provider, like INTEGER, so an injected provider and
        the field's ``min_value``/``max_value`` are respected.
        """
        min_val, max_val = self._int_bounds(field, self._BIGINT_RANGE)
        return self.provider.generate_integer(min_value=min_val, max_value=max_val)

    def _generate_float(self, field: FieldSchema) -> float:
        """Return a float with 2 decimal places from the local Faker instance."""
        # NOTE(review): field.right_digits is not consulted here - confirm
        # whether it is meant to override the fixed 2 digits.
        return self.faker.pyfloat(right_digits=2)

    def _generate_decimal(self, field: FieldSchema) -> Decimal:
        """Return a Decimal from the provider."""
        return self.provider.generate_decimal()

    def _generate_boolean(self, field: FieldSchema) -> bool:
        """Return a boolean from the provider."""
        return self.provider.generate_boolean()

    def _generate_date(self, field: FieldSchema) -> date:
        """Return a date from the provider."""
        return self.provider.generate_date()

    def _generate_datetime(self, field: FieldSchema) -> datetime:
        """Return a datetime from the provider."""
        return self.provider.generate_datetime()

    def _generate_blob(self, field: FieldSchema) -> bytes:
        """Return binary data from the provider."""
        return self.provider.generate_binary()

    def _generate_uuid(self, field: FieldSchema) -> UUID:
        """Return a UUID from the provider."""
        return self.provider.generate_uuid()

    def _generate_enum(self, field: FieldSchema) -> str:
        """Return one of ``field.enum_values``.

        Raises:
            ValueError: If the field defines no enum values.
        """
        if not field.enum_values:
            raise ValueError(f"Enum field {field.name} must have values defined")
        return self.provider.generate_enum(field.enum_values)
@@ -0,0 +1,4 @@
1
+ from .base import DataProvider
2
+ from .faker_provider import FakerProvider
3
+
4
+ __all__ = ["DataProvider", "FakerProvider"]
src/providers/base.py ADDED
@@ -0,0 +1,58 @@
1
+ from abc import ABC, abstractmethod
2
+ from datetime import date, datetime
3
+ from decimal import Decimal
4
+ from uuid import UUID
5
+
6
+
7
class DataProvider(ABC):
    """Interface every test-data value source must implement.

    Each method is abstract; a concrete provider has to override all of them
    before it can be instantiated.
    """

    @abstractmethod
    def generate_string(self, **kwargs: str) -> str:
        """Produce a short string; keyword options are provider-specific."""

    @abstractmethod
    def generate_text(self, **kwargs: str) -> str:
        """Produce a longer piece of text; keyword options are provider-specific."""

    @abstractmethod
    def generate_integer(self, min_value: int = 0, max_value: int = 100) -> int:
        """Produce an integer for the given ``min_value``/``max_value`` bounds."""

    @abstractmethod
    def generate_decimal(self, **kwargs: Decimal) -> Decimal:
        """Produce a ``Decimal``; keyword options are provider-specific."""

    @abstractmethod
    def generate_boolean(self) -> bool:
        """Produce a boolean."""

    @abstractmethod
    def generate_date(self) -> date:
        """Produce a ``date``."""

    @abstractmethod
    def generate_datetime(self) -> datetime:
        """Produce a ``datetime``."""

    @abstractmethod
    def generate_binary(self, length: int = 64) -> bytes:
        """Produce binary data; ``length`` is the requested size."""

    @abstractmethod
    def generate_uuid(self) -> UUID:
        """Produce a ``UUID``."""

    @abstractmethod
    def generate_enum(self, values: list[str]) -> str:
        """Pick one entry from ``values``."""
@@ -0,0 +1,65 @@
1
+ from datetime import date, datetime
2
+ from decimal import Decimal
3
+ from uuid import UUID
4
+
5
+ from faker import Faker
6
+
7
+ from .base import DataProvider
8
+
9
+
10
class FakerProvider(DataProvider):
    """``DataProvider`` implementation backed by the Faker library."""

    def __init__(self) -> None:
        """Create the Faker instance all generators draw from."""
        self.faker = Faker()

    def generate_string(self, **kwargs: str) -> str:
        """Call the Faker method named by ``value_provider`` and stringify it.

        Falls back to the ``"name"`` method when the keyword is absent or
        empty.
        """
        method_name = kwargs.get("value_provider") or "name"
        factory = getattr(self.faker, method_name)
        return str(factory())

    def generate_text(self, **kwargs: str) -> str:
        """Return text from ``Faker.text``; keyword arguments are ignored."""
        return self.faker.text()

    def generate_integer(
        self, min_value: int | None = None, max_value: int | None = None
    ) -> int:
        """Return an integer from ``Faker.pyint``.

        A bound given as None falls back to 0 (minimum) or 100 (maximum).
        """
        lower = 0 if min_value is None else min_value
        upper = 100 if max_value is None else max_value
        return self.faker.pyint(min_value=lower, max_value=upper)

    def generate_decimal(self, **kwargs: Decimal) -> Decimal:
        """Return a Decimal built from a 2-decimal-place Faker float."""
        # Going through str() keeps the printed digits instead of the float's
        # binary expansion.
        as_text = str(self.faker.pyfloat(right_digits=2))
        return Decimal(as_text)

    def generate_boolean(self) -> bool:
        """Return a boolean from ``Faker.boolean``."""
        return self.faker.boolean()

    def generate_date(self) -> date:
        """Return a date from ``Faker.date_object``."""
        return self.faker.date_object()

    def generate_datetime(self) -> datetime:
        """Return a datetime from ``Faker.date_time``."""
        return self.faker.date_time()

    def generate_binary(self, length: int = 64) -> bytes:
        """Return binary data from ``Faker.binary`` with the given length."""
        return self.faker.binary(length=length)

    def generate_uuid(self) -> UUID:
        """Return a UUID, converting Faker's result when it is not one already."""
        candidate = self.faker.uuid4()
        return candidate if isinstance(candidate, UUID) else UUID(str(candidate))

    def generate_enum(self, values: list[str]) -> str:
        """Return one element of ``values`` chosen by Faker.

        Raises:
            ValueError: If ``values`` is empty.
        """
        if not values:
            raise ValueError("Enum values cannot be empty")
        return self.faker.random_element(values)
src/schemas.py ADDED
@@ -0,0 +1,81 @@
1
+ from enum import Enum
2
+ from typing import Any
3
+
4
+ from pydantic import BaseModel, Field
5
+
6
+ from .exporters.utils.constants import EXPORT_PATTERNS
7
+
8
+
9
class DataType(str, Enum):
    """Enumeration of the field data types a schema can request.

    The class mixes in ``str``, so each member is also a string equal to its
    value (for example ``DataType.STRING == "string"``).

    Attributes:
        STRING: Represents a string data type for short character sequences
        TEXT: Represents a text data type for longer character sequences
        INTEGER: Represents a standard integer data type
        BIGINT: Represents a large integer data type
        FLOAT: Represents a floating-point number data type
        DECIMAL: Represents a precise decimal number data type
        BOOLEAN: Represents a boolean (True/False) data type
        DATE: Represents a date data type
        DATETIME: Represents a date and time data type
        BLOB: Represents a binary large object data type
        UUID: Represents a universally unique identifier data type
        ENUM: Represents an enumerated data type

    """

    STRING = "string"
    TEXT = "text"
    INTEGER = "integer"
    BIGINT = "bigint"
    FLOAT = "float"
    DECIMAL = "decimal"
    BOOLEAN = "boolean"
    DATE = "date"
    DATETIME = "datetime"
    BLOB = "blob"
    UUID = "uuid"
    ENUM = "enum"
43
+
44
+
45
class FieldSchema(BaseModel):
    """Defines the schema for a single field in the data generation configuration.

    Only ``name`` and ``type`` are required; every other attribute is an
    optional constraint that defaults to None.
    """

    # Key under which the generated value is stored in each row.
    name: str
    # Which kind of value to generate for this field.
    type: DataType
    # Allowed choices; the generator requires these for ENUM fields.
    enum_values: list[str] | None = None
    # Numeric bounds; the generator converts them with int() for INTEGER fields.
    min_value: Any | None = None
    max_value: Any | None = None
    # NOTE(review): not read by the generator code visible here - presumably
    # the number of decimal places for FLOAT/DECIMAL; confirm.
    right_digits: int | None = None
    # Name of the provider method used for STRING fields ("name" when unset).
    value_provider: str | None = None
    # NOTE(review): not read by the generator code visible here - confirm use.
    pattern: str | None = None
59
+
60
+
61
class GeneratorConfig(BaseModel):
    """A configuration class for data generation.

    This class defines the structure and parameters needed for generating synthetic
    data.

    Attributes:
        fields (list[FieldSchema]): List of field schemas defining the structure of
            data to generate.
        row_count (int): Number of rows/records to generate.
        export_format (str): Format for exporting generated data. Required, and
            validated against the ``EXPORT_PATTERNS`` pattern.
        output_path (str): File path where the generated data will be saved.

    """

    fields: list[FieldSchema]
    row_count: int
    export_format: str = Field(..., pattern=EXPORT_PATTERNS)
    output_path: str
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 JamesPBrett
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.