TestDataX 0.1.1__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {testdatax-0.1.1 → testdatax-0.2.0}/PKG-INFO +69 -20
- {testdatax-0.1.1 → testdatax-0.2.0}/README.md +64 -17
- {testdatax-0.1.1 → testdatax-0.2.0}/pyproject.toml +21 -6
- {testdatax-0.1.1 → testdatax-0.2.0}/src/__init__.py +1 -1
- {testdatax-0.1.1 → testdatax-0.2.0}/src/cli.py +36 -6
- {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/csv_exporter.py +0 -2
- {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/json_exporter.py +14 -8
- {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/mssql_exporter.py +10 -3
- {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/mysql_exporter.py +10 -3
- {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/oracle_exporter.py +10 -3
- {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/utils/constants.py +6 -0
- testdatax-0.2.0/src/exporters/utils/sql.py +18 -0
- testdatax-0.2.0/src/generator.py +175 -0
- testdatax-0.2.0/src/providers/__init__.py +5 -0
- testdatax-0.2.0/src/providers/base.py +153 -0
- testdatax-0.2.0/src/providers/faker_provider.py +114 -0
- testdatax-0.2.0/src/providers/mimesis_provider.py +153 -0
- testdatax-0.2.0/src/schemas.py +145 -0
- testdatax-0.1.1/src/generator.py +0 -117
- testdatax-0.1.1/src/providers/__init__.py +0 -4
- testdatax-0.1.1/src/providers/base.py +0 -58
- testdatax-0.1.1/src/providers/faker_provider.py +0 -65
- testdatax-0.1.1/src/schemas.py +0 -81
- {testdatax-0.1.1 → testdatax-0.2.0}/LICENSE +0 -0
- {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/__init__.py +0 -0
- {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/base_exporter.py +0 -0
- {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/orc_exporter.py +0 -0
- {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/parquet_exporter.py +0 -0
- {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/utils/__init__.py +0 -0
- {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/utils/chunker.py +0 -0
- {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/utils/exporter_config.py +0 -0
- {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/utils/formatters.py +0 -0
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: TestDataX
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: A flexible test data generation toolkit
|
|
5
5
|
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
6
7
|
Author: JamesPBrett
|
|
7
8
|
Requires-Python: >=3.11,<4.0
|
|
8
9
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -10,9 +11,10 @@ Classifier: Programming Language :: Python :: 3
|
|
|
10
11
|
Classifier: Programming Language :: Python :: 3.11
|
|
11
12
|
Classifier: Programming Language :: Python :: 3.12
|
|
12
13
|
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
13
15
|
Requires-Dist: faker (>=33.1.0,<34.0.0)
|
|
16
|
+
Requires-Dist: mimesis (>=18.0.0,<19.0.0)
|
|
14
17
|
Requires-Dist: mysql-connector-python (>=9.1.0,<10.0.0)
|
|
15
|
-
Requires-Dist: orjson (>=3.10.12,<4.0.0)
|
|
16
18
|
Requires-Dist: pandas (>=2.2.3,<3.0.0)
|
|
17
19
|
Requires-Dist: pyarrow (>=18.1.0,<19.0.0)
|
|
18
20
|
Requires-Dist: pydantic (>=2.10.4,<3.0.0)
|
|
@@ -21,14 +23,12 @@ Description-Content-Type: text/markdown
|
|
|
21
23
|
|
|
22
24
|
# TestDataX
|
|
23
25
|
|
|
24
|
-
# TestDataX
|
|
25
|
-
|
|
26
26
|

|
|
27
27
|
[](https://codecov.io/gh/JamesPBrett/testdatax)
|
|
28
28
|

|
|
29
29
|

|
|
30
30
|
|
|
31
|
-
This command-line interface application enables quick and customizable test data generation across various formats. It
|
|
31
|
+
This command-line interface application enables quick and customizable test data generation across various formats. It supports multiple data providers (Mimesis and Faker) for realistic data generation, offers flexible schema configurations, and simplifies output to multiple database dialects or file types. Users can define precise parameters for data volume, types, and constraints for each target data set.
|
|
32
32
|
|
|
33
33
|
## Requirements
|
|
34
34
|
- Python 3.11+
|
|
@@ -41,11 +41,11 @@ pip install testdatax
|
|
|
41
41
|
|
|
42
42
|
# Generate sample data
|
|
43
43
|
testdatax --rows 1000 --format json --output data.json
|
|
44
|
-
|
|
44
|
+
```
|
|
45
45
|
|
|
46
46
|
## Features
|
|
47
47
|
|
|
48
|
-
- Generate realistic test data using
|
|
48
|
+
- Generate realistic test data using multiple data providers (Mimesis, Faker)
|
|
49
49
|
- Support for multiple output formats (CSV, JSON, SQL, etc.)
|
|
50
50
|
- Customizable schema definitions
|
|
51
51
|
- Configurable data generation parameters
|
|
@@ -63,7 +63,7 @@ testdatax --rows 1000 --format json --output data.json
|
|
|
63
63
|
|
|
64
64
|
## CLI Usage
|
|
65
65
|
```bash
|
|
66
|
-
testdatax -o <output_file> -f <format> -s <schema_file> -r <num_rows> [-d]
|
|
66
|
+
testdatax -o <output_file> -f <format> -s <schema_file> -r <num_rows> -p <provider> [-d]
|
|
67
67
|
```
|
|
68
68
|
|
|
69
69
|
Options:
|
|
@@ -71,6 +71,7 @@ Options:
|
|
|
71
71
|
- `-f, --format`: Output format (csv, json, orc, parquet, mysql, mssql, oracle)
|
|
72
72
|
- `-r, --rows`: Number of rows to generate (default: 10)
|
|
73
73
|
- `-s, --schema`: Path to schema file
|
|
74
|
+
- `-p, --provider`: Data provider (mimesis, faker) - default: mimesis
|
|
74
75
|
- `-d, --debug`: Enable debug output
|
|
75
76
|
|
|
76
77
|
## Usage Examples
|
|
@@ -80,10 +81,20 @@ Generate 10 rows of CSV data:
|
|
|
80
81
|
testdatax -o users.csv -f csv -s schema.json -r 10
|
|
81
82
|
```
|
|
82
83
|
|
|
84
|
+
Generate 10 rows of CSV data using Faker provider:
|
|
85
|
+
```bash
|
|
86
|
+
testdatax -o users.csv -f csv -s schema.json -r 10 -p faker
|
|
87
|
+
```
|
|
88
|
+
|
|
83
89
|
Generate 1000 rows of Parquet data with debug output:
|
|
84
90
|
```bash
|
|
85
91
|
testdatax -o large_dataset.parquet -f parquet -s users_schema.json -r 1000 -d
|
|
86
92
|
```
|
|
93
|
+
|
|
94
|
+
Generate 1000 rows of Parquet data using Mimesis provider:
|
|
95
|
+
```bash
|
|
96
|
+
testdatax -o large_dataset.parquet -f parquet -s users_schema.json -r 1000 -p mimesis
|
|
97
|
+
```
|
|
87
98
|
Generate JSON data with default row count (10):
|
|
88
99
|
```bash
|
|
89
100
|
testdatax -o data.json -f json -s schema.json
|
|
@@ -106,7 +117,7 @@ testdatax -o mstest.sql -f mssql -r 1000
|
|
|
106
117
|
|
|
107
118
|
Generate Oracle with default row count (1000), table_name as 'oracle':
|
|
108
119
|
```bash
|
|
109
|
-
|
|
120
|
+
testdatax -o oracle.sql -f oracle -r 1000
|
|
110
121
|
```
|
|
111
122
|
|
|
112
123
|
Each command consists of:
|
|
@@ -114,6 +125,7 @@ Each command consists of:
|
|
|
114
125
|
- `-f, --format`: Output format (csv, json, orc, parquet, mysql, mssql, oracle)
|
|
115
126
|
- `-s, --schema`: Path to your schema definition file
|
|
116
127
|
- `-r, --rows`: Number of rows to generate (optional, defaults to 10)
|
|
128
|
+
- `-p, --provider`: Data provider (mimesis, faker) - default: mimesis
|
|
117
129
|
- `-d, --debug`: Enable debug logging (optional)
|
|
118
130
|
|
|
119
131
|
## Schema Example
|
|
@@ -122,7 +134,7 @@ Each command consists of:
|
|
|
122
134
|
{
|
|
123
135
|
"username": {
|
|
124
136
|
"type": "string",
|
|
125
|
-
"
|
|
137
|
+
"provider_field": "name"
|
|
126
138
|
},
|
|
127
139
|
"date_joined": {
|
|
128
140
|
"type": "datetime"
|
|
@@ -169,7 +181,7 @@ The schema file defines the structure and constraints of your generated data. Ea
|
|
|
169
181
|
"type": "string",
|
|
170
182
|
"min_length": 5,
|
|
171
183
|
"max_length": 20,
|
|
172
|
-
"
|
|
184
|
+
"provider_field": "user_name" // Use provider-specific field to generate realistic data
|
|
173
185
|
},
|
|
174
186
|
"description": {
|
|
175
187
|
"type": "text",
|
|
@@ -211,6 +223,12 @@ The schema file defines the structure and constraints of your generated data. Ea
|
|
|
211
223
|
}
|
|
212
224
|
```
|
|
213
225
|
|
|
226
|
+
> **Note:** `start_date`/`end_date` bound the generated range (inclusive). When
|
|
227
|
+
> `format` is set, date/datetime values are rendered to a string with
|
|
228
|
+
> `strftime`; for the SQL exporters this means the column receives a formatted
|
|
229
|
+
> string literal rather than a native date, so `format` is best suited to the
|
|
230
|
+
> CSV/JSON formats.
|
|
231
|
+
|
|
214
232
|
#### Enum Fields
|
|
215
233
|
```json
|
|
216
234
|
{
|
|
@@ -222,25 +240,25 @@ The schema file defines the structure and constraints of your generated data. Ea
|
|
|
222
240
|
}
|
|
223
241
|
```
|
|
224
242
|
|
|
225
|
-
#### Using
|
|
226
|
-
|
|
243
|
+
#### Using Data Providers
|
|
244
|
+
Both Mimesis and Faker providers support the same schema format. You can specify provider-specific generators using the `provider_field` field (works with both providers):
|
|
227
245
|
```json
|
|
228
246
|
{
|
|
229
247
|
"name": {
|
|
230
248
|
"type": "string",
|
|
231
|
-
"
|
|
249
|
+
"provider_field": "name"
|
|
232
250
|
},
|
|
233
251
|
"email": {
|
|
234
252
|
"type": "string",
|
|
235
|
-
"
|
|
253
|
+
"provider_field": "email"
|
|
236
254
|
},
|
|
237
255
|
"address": {
|
|
238
256
|
"type": "string",
|
|
239
|
-
"
|
|
257
|
+
"provider_field": "address"
|
|
240
258
|
},
|
|
241
259
|
"company": {
|
|
242
260
|
"type": "string",
|
|
243
|
-
"
|
|
261
|
+
"provider_field": "company"
|
|
244
262
|
}
|
|
245
263
|
}
|
|
246
264
|
```
|
|
@@ -254,12 +272,12 @@ The generator supports Faker providers for generating realistic data:
|
|
|
254
272
|
},
|
|
255
273
|
"username": {
|
|
256
274
|
"type": "string",
|
|
257
|
-
"
|
|
275
|
+
"provider_field": "user_name",
|
|
258
276
|
"unique": true
|
|
259
277
|
},
|
|
260
278
|
"email": {
|
|
261
279
|
"type": "string",
|
|
262
|
-
"
|
|
280
|
+
"provider_field": "email",
|
|
263
281
|
"unique": true
|
|
264
282
|
},
|
|
265
283
|
"age": {
|
|
@@ -284,6 +302,37 @@ The generator supports Faker providers for generating realistic data:
|
|
|
284
302
|
}
|
|
285
303
|
```
|
|
286
304
|
|
|
305
|
+
## Data Providers
|
|
306
|
+
|
|
307
|
+
TestDataX supports two powerful data providers for generating realistic test data:
|
|
308
|
+
|
|
309
|
+
### Mimesis (Default)
|
|
310
|
+
Mimesis is a high-performance Python library for generating synthetic data. It provides:
|
|
311
|
+
- Fast data generation with excellent performance
|
|
312
|
+
- Support for multiple locales and languages
|
|
313
|
+
- Wide variety of data providers for different domains
|
|
314
|
+
- Lightweight and efficient implementation
|
|
315
|
+
|
|
316
|
+
### Faker
|
|
317
|
+
Faker is a popular Python library for generating fake data. It offers:
|
|
318
|
+
- Extensive provider ecosystem with community contributions
|
|
319
|
+
- Rich set of localized providers
|
|
320
|
+
- Well-established and widely used in the Python community
|
|
321
|
+
- Comprehensive documentation and examples
|
|
322
|
+
|
|
323
|
+
You can specify the provider using the `-p` or `--provider` option:
|
|
324
|
+
```bash
|
|
325
|
+
# Use Mimesis (default)
|
|
326
|
+
testdatax -o data.csv -f csv -p mimesis
|
|
327
|
+
|
|
328
|
+
# Use Faker
|
|
329
|
+
testdatax -o data.csv -f csv -p faker
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
Both providers support the same schema format and generate compatible data types.
|
|
333
|
+
|
|
334
|
+
**Note:** For backward compatibility, the legacy `faker` field name is still supported, but `provider_field` is recommended for new schemas.
|
|
335
|
+
|
|
287
336
|
## Supported Data Types
|
|
288
337
|
|
|
289
338
|
- string
|
|
@@ -1,13 +1,11 @@
|
|
|
1
1
|
# TestDataX
|
|
2
2
|
|
|
3
|
-
# TestDataX
|
|
4
|
-
|
|
5
3
|

|
|
6
4
|
[](https://codecov.io/gh/JamesPBrett/testdatax)
|
|
7
5
|

|
|
8
6
|

|
|
9
7
|
|
|
10
|
-
This command-line interface application enables quick and customizable test data generation across various formats. It
|
|
8
|
+
This command-line interface application enables quick and customizable test data generation across various formats. It supports multiple data providers (Mimesis and Faker) for realistic data generation, offers flexible schema configurations, and simplifies output to multiple database dialects or file types. Users can define precise parameters for data volume, types, and constraints for each target data set.
|
|
11
9
|
|
|
12
10
|
## Requirements
|
|
13
11
|
- Python 3.11+
|
|
@@ -20,11 +18,11 @@ pip install testdatax
|
|
|
20
18
|
|
|
21
19
|
# Generate sample data
|
|
22
20
|
testdatax --rows 1000 --format json --output data.json
|
|
23
|
-
|
|
21
|
+
```
|
|
24
22
|
|
|
25
23
|
## Features
|
|
26
24
|
|
|
27
|
-
- Generate realistic test data using
|
|
25
|
+
- Generate realistic test data using multiple data providers (Mimesis, Faker)
|
|
28
26
|
- Support for multiple output formats (CSV, JSON, SQL, etc.)
|
|
29
27
|
- Customizable schema definitions
|
|
30
28
|
- Configurable data generation parameters
|
|
@@ -42,7 +40,7 @@ testdatax --rows 1000 --format json --output data.json
|
|
|
42
40
|
|
|
43
41
|
## CLI Usage
|
|
44
42
|
```bash
|
|
45
|
-
testdatax -o <output_file> -f <format> -s <schema_file> -r <num_rows> [-d]
|
|
43
|
+
testdatax -o <output_file> -f <format> -s <schema_file> -r <num_rows> -p <provider> [-d]
|
|
46
44
|
```
|
|
47
45
|
|
|
48
46
|
Options:
|
|
@@ -50,6 +48,7 @@ Options:
|
|
|
50
48
|
- `-f, --format`: Output format (csv, json, orc, parquet, mysql, mssql, oracle)
|
|
51
49
|
- `-r, --rows`: Number of rows to generate (default: 10)
|
|
52
50
|
- `-s, --schema`: Path to schema file
|
|
51
|
+
- `-p, --provider`: Data provider (mimesis, faker) - default: mimesis
|
|
53
52
|
- `-d, --debug`: Enable debug output
|
|
54
53
|
|
|
55
54
|
## Usage Examples
|
|
@@ -59,10 +58,20 @@ Generate 10 rows of CSV data:
|
|
|
59
58
|
testdatax -o users.csv -f csv -s schema.json -r 10
|
|
60
59
|
```
|
|
61
60
|
|
|
61
|
+
Generate 10 rows of CSV data using Faker provider:
|
|
62
|
+
```bash
|
|
63
|
+
testdatax -o users.csv -f csv -s schema.json -r 10 -p faker
|
|
64
|
+
```
|
|
65
|
+
|
|
62
66
|
Generate 1000 rows of Parquet data with debug output:
|
|
63
67
|
```bash
|
|
64
68
|
testdatax -o large_dataset.parquet -f parquet -s users_schema.json -r 1000 -d
|
|
65
69
|
```
|
|
70
|
+
|
|
71
|
+
Generate 1000 rows of Parquet data using Mimesis provider:
|
|
72
|
+
```bash
|
|
73
|
+
testdatax -o large_dataset.parquet -f parquet -s users_schema.json -r 1000 -p mimesis
|
|
74
|
+
```
|
|
66
75
|
Generate JSON data with default row count (10):
|
|
67
76
|
```bash
|
|
68
77
|
testdatax -o data.json -f json -s schema.json
|
|
@@ -85,7 +94,7 @@ testdatax -o mstest.sql -f mssql -r 1000
|
|
|
85
94
|
|
|
86
95
|
Generate Oracle with default row count (1000), table_name as 'oracle':
|
|
87
96
|
```bash
|
|
88
|
-
|
|
97
|
+
testdatax -o oracle.sql -f oracle -r 1000
|
|
89
98
|
```
|
|
90
99
|
|
|
91
100
|
Each command consists of:
|
|
@@ -93,6 +102,7 @@ Each command consists of:
|
|
|
93
102
|
- `-f, --format`: Output format (csv, json, orc, parquet, mysql, mssql, oracle)
|
|
94
103
|
- `-s, --schema`: Path to your schema definition file
|
|
95
104
|
- `-r, --rows`: Number of rows to generate (optional, defaults to 10)
|
|
105
|
+
- `-p, --provider`: Data provider (mimesis, faker) - default: mimesis
|
|
96
106
|
- `-d, --debug`: Enable debug logging (optional)
|
|
97
107
|
|
|
98
108
|
## Schema Example
|
|
@@ -101,7 +111,7 @@ Each command consists of:
|
|
|
101
111
|
{
|
|
102
112
|
"username": {
|
|
103
113
|
"type": "string",
|
|
104
|
-
"
|
|
114
|
+
"provider_field": "name"
|
|
105
115
|
},
|
|
106
116
|
"date_joined": {
|
|
107
117
|
"type": "datetime"
|
|
@@ -148,7 +158,7 @@ The schema file defines the structure and constraints of your generated data. Ea
|
|
|
148
158
|
"type": "string",
|
|
149
159
|
"min_length": 5,
|
|
150
160
|
"max_length": 20,
|
|
151
|
-
"
|
|
161
|
+
"provider_field": "user_name" // Use provider-specific field to generate realistic data
|
|
152
162
|
},
|
|
153
163
|
"description": {
|
|
154
164
|
"type": "text",
|
|
@@ -190,6 +200,12 @@ The schema file defines the structure and constraints of your generated data. Ea
|
|
|
190
200
|
}
|
|
191
201
|
```
|
|
192
202
|
|
|
203
|
+
> **Note:** `start_date`/`end_date` bound the generated range (inclusive). When
|
|
204
|
+
> `format` is set, date/datetime values are rendered to a string with
|
|
205
|
+
> `strftime`; for the SQL exporters this means the column receives a formatted
|
|
206
|
+
> string literal rather than a native date, so `format` is best suited to the
|
|
207
|
+
> CSV/JSON formats.
|
|
208
|
+
|
|
193
209
|
#### Enum Fields
|
|
194
210
|
```json
|
|
195
211
|
{
|
|
@@ -201,25 +217,25 @@ The schema file defines the structure and constraints of your generated data. Ea
|
|
|
201
217
|
}
|
|
202
218
|
```
|
|
203
219
|
|
|
204
|
-
#### Using
|
|
205
|
-
|
|
220
|
+
#### Using Data Providers
|
|
221
|
+
Both Mimesis and Faker providers support the same schema format. You can specify provider-specific generators using the `provider_field` field (works with both providers):
|
|
206
222
|
```json
|
|
207
223
|
{
|
|
208
224
|
"name": {
|
|
209
225
|
"type": "string",
|
|
210
|
-
"
|
|
226
|
+
"provider_field": "name"
|
|
211
227
|
},
|
|
212
228
|
"email": {
|
|
213
229
|
"type": "string",
|
|
214
|
-
"
|
|
230
|
+
"provider_field": "email"
|
|
215
231
|
},
|
|
216
232
|
"address": {
|
|
217
233
|
"type": "string",
|
|
218
|
-
"
|
|
234
|
+
"provider_field": "address"
|
|
219
235
|
},
|
|
220
236
|
"company": {
|
|
221
237
|
"type": "string",
|
|
222
|
-
"
|
|
238
|
+
"provider_field": "company"
|
|
223
239
|
}
|
|
224
240
|
}
|
|
225
241
|
```
|
|
@@ -233,12 +249,12 @@ The generator supports Faker providers for generating realistic data:
|
|
|
233
249
|
},
|
|
234
250
|
"username": {
|
|
235
251
|
"type": "string",
|
|
236
|
-
"
|
|
252
|
+
"provider_field": "user_name",
|
|
237
253
|
"unique": true
|
|
238
254
|
},
|
|
239
255
|
"email": {
|
|
240
256
|
"type": "string",
|
|
241
|
-
"
|
|
257
|
+
"provider_field": "email",
|
|
242
258
|
"unique": true
|
|
243
259
|
},
|
|
244
260
|
"age": {
|
|
@@ -263,6 +279,37 @@ The generator supports Faker providers for generating realistic data:
|
|
|
263
279
|
}
|
|
264
280
|
```
|
|
265
281
|
|
|
282
|
+
## Data Providers
|
|
283
|
+
|
|
284
|
+
TestDataX supports two powerful data providers for generating realistic test data:
|
|
285
|
+
|
|
286
|
+
### Mimesis (Default)
|
|
287
|
+
Mimesis is a high-performance Python library for generating synthetic data. It provides:
|
|
288
|
+
- Fast data generation with excellent performance
|
|
289
|
+
- Support for multiple locales and languages
|
|
290
|
+
- Wide variety of data providers for different domains
|
|
291
|
+
- Lightweight and efficient implementation
|
|
292
|
+
|
|
293
|
+
### Faker
|
|
294
|
+
Faker is a popular Python library for generating fake data. It offers:
|
|
295
|
+
- Extensive provider ecosystem with community contributions
|
|
296
|
+
- Rich set of localized providers
|
|
297
|
+
- Well-established and widely used in the Python community
|
|
298
|
+
- Comprehensive documentation and examples
|
|
299
|
+
|
|
300
|
+
You can specify the provider using the `-p` or `--provider` option:
|
|
301
|
+
```bash
|
|
302
|
+
# Use Mimesis (default)
|
|
303
|
+
testdatax -o data.csv -f csv -p mimesis
|
|
304
|
+
|
|
305
|
+
# Use Faker
|
|
306
|
+
testdatax -o data.csv -f csv -p faker
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
Both providers support the same schema format and generate compatible data types.
|
|
310
|
+
|
|
311
|
+
**Note:** For backward compatibility, the legacy `faker` field name is still supported, but `provider_field` is recommended for new schemas.
|
|
312
|
+
|
|
266
313
|
## Supported Data Types
|
|
267
314
|
|
|
268
315
|
- string
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "TestDataX"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.2.0"
|
|
4
4
|
description = "A flexible test data generation toolkit"
|
|
5
5
|
authors = ["JamesPBrett"]
|
|
6
6
|
license = "MIT"
|
|
@@ -12,10 +12,10 @@ python = "^3.11"
|
|
|
12
12
|
typer = "^0.15.1"
|
|
13
13
|
faker = "^33.1.0"
|
|
14
14
|
pydantic = "^2.10.4"
|
|
15
|
-
orjson = "^3.10.12"
|
|
16
15
|
pyarrow = "^18.1.0"
|
|
17
16
|
pandas = "^2.2.3"
|
|
18
17
|
mysql-connector-python = "^9.1.0"
|
|
18
|
+
mimesis = "^18.0.0"
|
|
19
19
|
|
|
20
20
|
[tool.poetry.group.dev.dependencies]
|
|
21
21
|
pytest = "^8.3.4"
|
|
@@ -38,13 +38,11 @@ types-psutil = "^6.1.0.20241221"
|
|
|
38
38
|
commitizen = "^3.13.0"
|
|
39
39
|
python-semantic-release = "^9.17.0"
|
|
40
40
|
|
|
41
|
-
[build-system]
|
|
42
|
-
requires = ["poetry-core"]
|
|
43
|
-
build-backend = "poetry.core.masonry.api"
|
|
44
41
|
|
|
45
42
|
[tool.poetry.scripts]
|
|
46
43
|
testdatax = "src.cli:app"
|
|
47
44
|
|
|
45
|
+
|
|
48
46
|
[tool.ruff]
|
|
49
47
|
# Same as Black
|
|
50
48
|
line-length = 88
|
|
@@ -84,6 +82,7 @@ exclude = [
|
|
|
84
82
|
[tool.ruff.lint.isort]
|
|
85
83
|
known-first-party = ["src"]
|
|
86
84
|
|
|
85
|
+
|
|
87
86
|
[tool.black]
|
|
88
87
|
line-length = 88
|
|
89
88
|
target-version = ['py311']
|
|
@@ -119,6 +118,15 @@ warn_unreachable = true
|
|
|
119
118
|
strict_optional = true
|
|
120
119
|
plugins = ["pydantic.mypy"]
|
|
121
120
|
|
|
121
|
+
[[tool.mypy.overrides]]
|
|
122
|
+
module = "mimesis.*"
|
|
123
|
+
ignore_missing_imports = true
|
|
124
|
+
|
|
125
|
+
[[tool.mypy.overrides]]
|
|
126
|
+
module = "src.providers.mimesis_provider"
|
|
127
|
+
warn_return_any = false
|
|
128
|
+
|
|
129
|
+
|
|
122
130
|
[tool.coverage.run]
|
|
123
131
|
source = ["src"]
|
|
124
132
|
branch = true
|
|
@@ -132,15 +140,17 @@ exclude_lines = [
|
|
|
132
140
|
"pass",
|
|
133
141
|
]
|
|
134
142
|
|
|
143
|
+
|
|
135
144
|
[tool.pytest.ini_options]
|
|
136
145
|
testpaths = ["tests"]
|
|
137
146
|
python_files = ["test_*.py"]
|
|
138
147
|
python_classes = ["Test*"]
|
|
139
148
|
python_functions = ["test_*"]
|
|
140
149
|
|
|
150
|
+
|
|
141
151
|
[tool.commitizen]
|
|
142
152
|
name = "cz_conventional_commits"
|
|
143
|
-
version = "0.1.
|
|
153
|
+
version = "0.1.3"
|
|
144
154
|
tag_format = "v$version"
|
|
145
155
|
version_files = [
|
|
146
156
|
"src/__init__.py:__version__",
|
|
@@ -214,3 +224,8 @@ allowed_tags = [
|
|
|
214
224
|
"chore", # Maintenance tasks
|
|
215
225
|
"refactor", # Code changes without fixing bugs or adding features
|
|
216
226
|
]
|
|
227
|
+
|
|
228
|
+
[build-system]
|
|
229
|
+
requires = ["poetry-core"]
|
|
230
|
+
build-backend = "poetry.core.masonry.api"
|
|
231
|
+
|
|
@@ -9,6 +9,7 @@ from .exporters.base_exporter import BaseExporter
|
|
|
9
9
|
from .exporters.utils.constants import DEFAULT_SCHEMA, EXPORT_FORMATS
|
|
10
10
|
from .exporters.utils.exporter_config import EXPORTER_CLASSES
|
|
11
11
|
from .generator import DataGenerator
|
|
12
|
+
from .providers import FakerProvider, MimesisProvider
|
|
12
13
|
from .schemas import DataType, FieldSchema, GeneratorConfig
|
|
13
14
|
|
|
14
15
|
|
|
@@ -42,6 +43,9 @@ FORMAT_OPTION = typer.Option(
|
|
|
42
43
|
ROWS_OPTION = typer.Option(10, "--rows", "-r", help="Number of rows to generate")
|
|
43
44
|
SCHEMA_PATH_OPTION = typer.Option(None, "--schema", "-s", help="Path to schema file")
|
|
44
45
|
DEBUG_OPTION = typer.Option(False, "--debug", "-d", help="Enable debug output")
|
|
46
|
+
PROVIDER_OPTION = typer.Option(
|
|
47
|
+
"mimesis", "--provider", "-p", help="Data provider (faker or mimesis)"
|
|
48
|
+
)
|
|
45
49
|
|
|
46
50
|
|
|
47
51
|
@app.command()
|
|
@@ -51,6 +55,7 @@ def generate(
|
|
|
51
55
|
rows: int = ROWS_OPTION,
|
|
52
56
|
schema_path: Path | None = SCHEMA_PATH_OPTION,
|
|
53
57
|
debug: bool = DEBUG_OPTION,
|
|
58
|
+
provider: str = PROVIDER_OPTION,
|
|
54
59
|
) -> None:
|
|
55
60
|
"""Generate synthetic data based on the provided schema."""
|
|
56
61
|
try:
|
|
@@ -97,15 +102,30 @@ def generate(
|
|
|
97
102
|
f"{min_value}, {max_value}"
|
|
98
103
|
)
|
|
99
104
|
|
|
105
|
+
# Accept "precision" as an alias for "right_digits"; use an
|
|
106
|
+
# explicit None check so an intentional 0 is not dropped.
|
|
107
|
+
right_digits = field_def.get("right_digits")
|
|
108
|
+
if right_digits is None:
|
|
109
|
+
right_digits = field_def.get("precision")
|
|
110
|
+
|
|
100
111
|
field_schema = FieldSchema(
|
|
101
112
|
name=name,
|
|
102
113
|
type=field_type,
|
|
103
114
|
enum_values=field_def.get("values"),
|
|
104
115
|
min_value=min_value,
|
|
105
116
|
max_value=max_value,
|
|
106
|
-
right_digits=
|
|
107
|
-
value_provider=field_def.get("
|
|
117
|
+
right_digits=right_digits,
|
|
118
|
+
value_provider=field_def.get("provider_field")
|
|
119
|
+
or field_def.get("faker"),
|
|
108
120
|
pattern=field_def.get("pattern"),
|
|
121
|
+
nullable=field_def.get("nullable", False),
|
|
122
|
+
unique=field_def.get("unique", False),
|
|
123
|
+
weights=field_def.get("weights"),
|
|
124
|
+
min_length=field_def.get("min_length"),
|
|
125
|
+
max_length=field_def.get("max_length"),
|
|
126
|
+
start_date=field_def.get("start_date"),
|
|
127
|
+
end_date=field_def.get("end_date"),
|
|
128
|
+
format=field_def.get("format"),
|
|
109
129
|
)
|
|
110
130
|
fields.append(field_schema.model_dump())
|
|
111
131
|
else:
|
|
@@ -117,6 +137,10 @@ def generate(
|
|
|
117
137
|
if format not in EXPORT_FORMATS:
|
|
118
138
|
raise ValueError(f"Unsupported format: {format}")
|
|
119
139
|
|
|
140
|
+
# Validate provider
|
|
141
|
+
if provider.lower() not in ["faker", "mimesis"]:
|
|
142
|
+
raise ValueError(f"Unsupported provider: {provider}")
|
|
143
|
+
|
|
120
144
|
# Create generator config
|
|
121
145
|
if debug:
|
|
122
146
|
typer.echo(f"Converted fields: {fields}", err=False)
|
|
@@ -127,7 +151,12 @@ def generate(
|
|
|
127
151
|
# Generate data
|
|
128
152
|
if debug:
|
|
129
153
|
typer.echo(f"Generator config: {config}", err=False)
|
|
130
|
-
|
|
154
|
+
|
|
155
|
+
# Select provider
|
|
156
|
+
data_provider = (
|
|
157
|
+
MimesisProvider() if provider.lower() == "mimesis" else FakerProvider()
|
|
158
|
+
)
|
|
159
|
+
generator = DataGenerator(provider=data_provider)
|
|
131
160
|
data = generator.generate_data(config.fields, config.row_count)
|
|
132
161
|
|
|
133
162
|
# Export data
|
|
@@ -148,9 +177,10 @@ def generate(
|
|
|
148
177
|
raise typer.Exit(code=1) from e
|
|
149
178
|
except Exception as e:
|
|
150
179
|
typer.echo(f"Error: {str(e)}", err=True)
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
180
|
+
if debug:
|
|
181
|
+
typer.echo(f"Exception type: {type(e).__name__}", err=True)
|
|
182
|
+
typer.echo(f"Exception args: {e.args}", err=True)
|
|
183
|
+
typer.echo(f"Traceback: {traceback.format_exc()}", err=True)
|
|
154
184
|
raise typer.Exit(code=1) from e
|
|
155
185
|
|
|
156
186
|
|
|
@@ -77,10 +77,8 @@ class CsvExporter(BaseExporter):
|
|
|
77
77
|
fieldnames = list(data[0].keys())
|
|
78
78
|
|
|
79
79
|
first_chunk = True
|
|
80
|
-
formatted_rows = []
|
|
81
80
|
for chunk in self.chunker.chunk_data(data):
|
|
82
81
|
formatted_chunk = [self.formatter.format_row(row) for row in chunk]
|
|
83
|
-
formatted_rows.extend(formatted_chunk)
|
|
84
82
|
df = pd.DataFrame(formatted_chunk, columns=fieldnames)
|
|
85
83
|
|
|
86
84
|
# Write the data to CSV in chunks
|
|
@@ -62,16 +62,22 @@ class JsonExporter(BaseExporter):
|
|
|
62
62
|
raise ValueError(
|
|
63
63
|
f"Field '{field}' in schema is not present in data."
|
|
64
64
|
)
|
|
65
|
-
# Format the data and write it in chunks to the output file
|
|
66
|
-
all_formatted_rows = []
|
|
67
|
-
for chunk in self.chunker.chunk_data(data):
|
|
68
|
-
formatted_chunk = [self.formatter.format_row(row) for row in chunk]
|
|
69
|
-
all_formatted_rows.extend(formatted_chunk)
|
|
70
65
|
|
|
71
|
-
#
|
|
66
|
+
# Stream a valid JSON array to disk one chunk at a time so the whole
|
|
67
|
+
# dataset is never held in memory at once.
|
|
72
68
|
with open(output_path, "w", encoding="utf-8") as f:
|
|
73
|
-
|
|
74
|
-
|
|
69
|
+
f.write("[")
|
|
70
|
+
first = True
|
|
71
|
+
for chunk in self.chunker.chunk_data(data):
|
|
72
|
+
for row in chunk:
|
|
73
|
+
formatted = self.formatter.format_row(row)
|
|
74
|
+
block = json.dumps(formatted, indent=4)
|
|
75
|
+
indented = "\n".join(
|
|
76
|
+
" " + line for line in block.splitlines()
|
|
77
|
+
)
|
|
78
|
+
f.write(("\n" if first else ",\n") + indented)
|
|
79
|
+
first = False
|
|
80
|
+
f.write("\n]" if not first else "]")
|
|
75
81
|
|
|
76
82
|
logger.info(f"Successfully exported {len(data)} rows to {output_path}.")
|
|
77
83
|
|