TestDataX 0.1.1__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {testdatax-0.1.1 → testdatax-0.2.0}/PKG-INFO +69 -20
  2. {testdatax-0.1.1 → testdatax-0.2.0}/README.md +64 -17
  3. {testdatax-0.1.1 → testdatax-0.2.0}/pyproject.toml +21 -6
  4. {testdatax-0.1.1 → testdatax-0.2.0}/src/__init__.py +1 -1
  5. {testdatax-0.1.1 → testdatax-0.2.0}/src/cli.py +36 -6
  6. {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/csv_exporter.py +0 -2
  7. {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/json_exporter.py +14 -8
  8. {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/mssql_exporter.py +10 -3
  9. {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/mysql_exporter.py +10 -3
  10. {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/oracle_exporter.py +10 -3
  11. {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/utils/constants.py +6 -0
  12. testdatax-0.2.0/src/exporters/utils/sql.py +18 -0
  13. testdatax-0.2.0/src/generator.py +175 -0
  14. testdatax-0.2.0/src/providers/__init__.py +5 -0
  15. testdatax-0.2.0/src/providers/base.py +153 -0
  16. testdatax-0.2.0/src/providers/faker_provider.py +114 -0
  17. testdatax-0.2.0/src/providers/mimesis_provider.py +153 -0
  18. testdatax-0.2.0/src/schemas.py +145 -0
  19. testdatax-0.1.1/src/generator.py +0 -117
  20. testdatax-0.1.1/src/providers/__init__.py +0 -4
  21. testdatax-0.1.1/src/providers/base.py +0 -58
  22. testdatax-0.1.1/src/providers/faker_provider.py +0 -65
  23. testdatax-0.1.1/src/schemas.py +0 -81
  24. {testdatax-0.1.1 → testdatax-0.2.0}/LICENSE +0 -0
  25. {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/__init__.py +0 -0
  26. {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/base_exporter.py +0 -0
  27. {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/orc_exporter.py +0 -0
  28. {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/parquet_exporter.py +0 -0
  29. {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/utils/__init__.py +0 -0
  30. {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/utils/chunker.py +0 -0
  31. {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/utils/exporter_config.py +0 -0
  32. {testdatax-0.1.1 → testdatax-0.2.0}/src/exporters/utils/formatters.py +0 -0
@@ -1,8 +1,9 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: TestDataX
3
- Version: 0.1.1
3
+ Version: 0.2.0
4
4
  Summary: A flexible test data generation toolkit
5
5
  License: MIT
6
+ License-File: LICENSE
6
7
  Author: JamesPBrett
7
8
  Requires-Python: >=3.11,<4.0
8
9
  Classifier: License :: OSI Approved :: MIT License
@@ -10,9 +11,10 @@ Classifier: Programming Language :: Python :: 3
10
11
  Classifier: Programming Language :: Python :: 3.11
11
12
  Classifier: Programming Language :: Python :: 3.12
12
13
  Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Programming Language :: Python :: 3.14
13
15
  Requires-Dist: faker (>=33.1.0,<34.0.0)
16
+ Requires-Dist: mimesis (>=18.0.0,<19.0.0)
14
17
  Requires-Dist: mysql-connector-python (>=9.1.0,<10.0.0)
15
- Requires-Dist: orjson (>=3.10.12,<4.0.0)
16
18
  Requires-Dist: pandas (>=2.2.3,<3.0.0)
17
19
  Requires-Dist: pyarrow (>=18.1.0,<19.0.0)
18
20
  Requires-Dist: pydantic (>=2.10.4,<3.0.0)
@@ -21,14 +23,12 @@ Description-Content-Type: text/markdown
21
23
 
22
24
  # TestDataX
23
25
 
24
- # TestDataX
25
-
26
26
  ![Build Status](https://github.com/JamesPBrett/testdatax/actions/workflows/publish.yml/badge.svg)
27
27
  [![codecov](https://codecov.io/gh/JamesPBrett/testdatax/branch/main/graph/badge.svg?token=6VX62CI6U9)](https://codecov.io/gh/JamesPBrett/testdatax)
28
28
  ![Python Version](https://img.shields.io/badge/python-3.11%2B-blue)
29
29
  ![License](https://img.shields.io/badge/license-MIT-blue.svg)
30
30
 
31
- This command-line interface application enables quick and customizable test data generation across various formats. It leverages Faker for realistic data fields, offers flexible schema configurations, and simplifies output to multiple database dialects or file types. Users can define precise parameters for data volume, types, and constraints for each target data set.
31
+ This command-line interface application enables quick and customizable test data generation across various formats. It supports multiple data providers (Mimesis and Faker) for realistic data generation, offers flexible schema configurations, and simplifies output to multiple database dialects or file types. Users can define precise parameters for data volume, types, and constraints for each target data set.
32
32
 
33
33
  ## Requirements
34
34
  - Python 3.11+
@@ -41,11 +41,11 @@ pip install testdatax
41
41
 
42
42
  # Generate sample data
43
43
  testdatax --rows 1000 --format json --output data.json
44
-
44
+ ```
45
45
 
46
46
  ## Features
47
47
 
48
- - Generate realistic test data using Data providers
48
+ - Generate realistic test data using multiple data providers (Mimesis, Faker)
49
49
  - Support for multiple output formats (CSV, JSON, SQL, etc.)
50
50
  - Customizable schema definitions
51
51
  - Configurable data generation parameters
@@ -63,7 +63,7 @@ testdatax --rows 1000 --format json --output data.json
63
63
 
64
64
  ## CLI Usage
65
65
  ```bash
66
- testdatax -o <output_file> -f <format> -s <schema_file> -r <num_rows> [-d]
66
+ testdatax -o <output_file> -f <format> [-s <schema_file>] [-r <num_rows>] [-p <provider>] [-d]
67
67
  ```
68
68
 
69
69
  Options:
@@ -71,6 +71,7 @@ Options:
71
71
  - `-f, --format`: Output format (csv, json, orc, parquet, mysql, mssql, oracle)
72
72
  - `-r, --rows`: Number of rows to generate (default: 10)
73
73
  - `-s, --schema`: Path to schema file
74
+ - `-p, --provider`: Data provider (mimesis, faker) - default: mimesis
74
75
  - `-d, --debug`: Enable debug output
75
76
 
76
77
  ## Usage Examples
@@ -80,10 +81,20 @@ Generate 10 rows of CSV data:
80
81
  testdatax -o users.csv -f csv -s schema.json -r 10
81
82
  ```
82
83
 
84
+ Generate 10 rows of CSV data using the Faker provider:
85
+ ```bash
86
+ testdatax -o users.csv -f csv -s schema.json -r 10 -p faker
87
+ ```
88
+
83
89
  Generate 1000 rows of Parquet data with debug output:
84
90
  ```bash
85
91
  testdatax -o large_dataset.parquet -f parquet -s users_schema.json -r 1000 -d
86
92
  ```
93
+
94
+ Generate 1000 rows of Parquet data using the Mimesis provider:
95
+ ```bash
96
+ testdatax -o large_dataset.parquet -f parquet -s users_schema.json -r 1000 -p mimesis
97
+ ```
87
98
  Generate JSON data with default row count (10):
88
99
  ```bash
89
100
  testdatax -o data.json -f json -s schema.json
@@ -106,7 +117,7 @@ testdatax -o mstest.sql -f mssql -r 1000
106
117
 
107
118
  Generate Oracle with 1000 rows, table_name as 'oracle':
108
119
  ```bash
109
- datagen -o oracle.sql -f oracle -r 1000
120
+ testdatax -o oracle.sql -f oracle -r 1000
110
121
  ```
111
122
 
112
123
  Each command consists of:
@@ -114,6 +125,7 @@ Each command consists of:
114
125
  - `-f, --format`: Output format (csv, json, orc, parquet, mysql, mssql, oracle)
115
126
  - `-s, --schema`: Path to your schema definition file
116
127
  - `-r, --rows`: Number of rows to generate (optional, defaults to 10)
128
+ - `-p, --provider`: Data provider (mimesis, faker) - default: mimesis
117
129
  - `-d, --debug`: Enable debug logging (optional)
118
130
 
119
131
  ## Schema Example
@@ -122,7 +134,7 @@ Each command consists of:
122
134
  {
123
135
  "username": {
124
136
  "type": "string",
125
- "faker": "name"
137
+ "provider_field": "name"
126
138
  },
127
139
  "date_joined": {
128
140
  "type": "datetime"
@@ -169,7 +181,7 @@ The schema file defines the structure and constraints of your generated data. Ea
169
181
  "type": "string",
170
182
  "min_length": 5,
171
183
  "max_length": 20,
172
- "faker": "user_name" // Use faker to generate realistic data
184
+ "provider_field": "user_name" // Use provider-specific field to generate realistic data
173
185
  },
174
186
  "description": {
175
187
  "type": "text",
@@ -211,6 +223,12 @@ The schema file defines the structure and constraints of your generated data. Ea
211
223
  }
212
224
  ```
213
225
 
226
+ > **Note:** `start_date`/`end_date` bound the generated range (inclusive). When
227
+ > `format` is set, date/datetime values are rendered to a string with
228
+ > `strftime`; for the SQL exporters this means the column receives a formatted
229
+ > string literal rather than a native date, so `format` is best suited to the
230
+ > CSV/JSON formats.
231
+
214
232
  #### Enum Fields
215
233
  ```json
216
234
  {
@@ -222,25 +240,25 @@ The schema file defines the structure and constraints of your generated data. Ea
222
240
  }
223
241
  ```
224
242
 
225
- #### Using Faker
226
- The generator supports Faker providers for generating realistic data:
243
+ #### Using Data Providers
244
+ Both Mimesis and Faker providers support the same schema format. You can specify provider-specific generators using the `provider_field` field (works with both providers):
227
245
  ```json
228
246
  {
229
247
  "name": {
230
248
  "type": "string",
231
- "faker": "name"
249
+ "provider_field": "name"
232
250
  },
233
251
  "email": {
234
252
  "type": "string",
235
- "faker": "email"
253
+ "provider_field": "email"
236
254
  },
237
255
  "address": {
238
256
  "type": "string",
239
- "faker": "address"
257
+ "provider_field": "address"
240
258
  },
241
259
  "company": {
242
260
  "type": "string",
243
- "faker": "company"
261
+ "provider_field": "company"
244
262
  }
245
263
  }
246
264
  ```
@@ -254,12 +272,12 @@ The generator supports Faker providers for generating realistic data:
254
272
  },
255
273
  "username": {
256
274
  "type": "string",
257
- "faker": "user_name",
275
+ "provider_field": "user_name",
258
276
  "unique": true
259
277
  },
260
278
  "email": {
261
279
  "type": "string",
262
- "faker": "email",
280
+ "provider_field": "email",
263
281
  "unique": true
264
282
  },
265
283
  "age": {
@@ -284,6 +302,37 @@ The generator supports Faker providers for generating realistic data:
284
302
  }
285
303
  ```
286
304
 
305
+ ## Data Providers
306
+
307
+ TestDataX supports two powerful data providers for generating realistic test data:
308
+
309
+ ### Mimesis (Default)
310
+ Mimesis is a high-performance Python library for generating synthetic data. It provides:
311
+ - Fast data generation with excellent performance
312
+ - Support for multiple locales and languages
313
+ - Wide variety of data providers for different domains
314
+ - Lightweight and efficient implementation
315
+
316
+ ### Faker
317
+ Faker is a popular Python library for generating fake data. It offers:
318
+ - Extensive provider ecosystem with community contributions
319
+ - Rich set of localized providers
320
+ - Well-established and widely used in the Python community
321
+ - Comprehensive documentation and examples
322
+
323
+ You can specify the provider using the `-p` or `--provider` option:
324
+ ```bash
325
+ # Use Mimesis (default)
326
+ testdatax -o data.csv -f csv -p mimesis
327
+
328
+ # Use Faker
329
+ testdatax -o data.csv -f csv -p faker
330
+ ```
331
+
332
+ Both providers support the same schema format and generate compatible data types.
333
+
334
+ **Note:** For backward compatibility, the legacy `faker` field name is still supported, but `provider_field` is recommended for new schemas.
335
+
287
336
  ## Supported Data Types
288
337
 
289
338
  - string
@@ -1,13 +1,11 @@
1
1
  # TestDataX
2
2
 
3
- # TestDataX
4
-
5
3
  ![Build Status](https://github.com/JamesPBrett/testdatax/actions/workflows/publish.yml/badge.svg)
6
4
  [![codecov](https://codecov.io/gh/JamesPBrett/testdatax/branch/main/graph/badge.svg?token=6VX62CI6U9)](https://codecov.io/gh/JamesPBrett/testdatax)
7
5
  ![Python Version](https://img.shields.io/badge/python-3.11%2B-blue)
8
6
  ![License](https://img.shields.io/badge/license-MIT-blue.svg)
9
7
 
10
- This command-line interface application enables quick and customizable test data generation across various formats. It leverages Faker for realistic data fields, offers flexible schema configurations, and simplifies output to multiple database dialects or file types. Users can define precise parameters for data volume, types, and constraints for each target data set.
8
+ This command-line interface application enables quick and customizable test data generation across various formats. It supports multiple data providers (Mimesis and Faker) for realistic data generation, offers flexible schema configurations, and simplifies output to multiple database dialects or file types. Users can define precise parameters for data volume, types, and constraints for each target data set.
11
9
 
12
10
  ## Requirements
13
11
  - Python 3.11+
@@ -20,11 +18,11 @@ pip install testdatax
20
18
 
21
19
  # Generate sample data
22
20
  testdatax --rows 1000 --format json --output data.json
23
-
21
+ ```
24
22
 
25
23
  ## Features
26
24
 
27
- - Generate realistic test data using Data providers
25
+ - Generate realistic test data using multiple data providers (Mimesis, Faker)
28
26
  - Support for multiple output formats (CSV, JSON, SQL, etc.)
29
27
  - Customizable schema definitions
30
28
  - Configurable data generation parameters
@@ -42,7 +40,7 @@ testdatax --rows 1000 --format json --output data.json
42
40
 
43
41
  ## CLI Usage
44
42
  ```bash
45
- testdatax -o <output_file> -f <format> -s <schema_file> -r <num_rows> [-d]
43
+ testdatax -o <output_file> -f <format> [-s <schema_file>] [-r <num_rows>] [-p <provider>] [-d]
46
44
  ```
47
45
 
48
46
  Options:
@@ -50,6 +48,7 @@ Options:
50
48
  - `-f, --format`: Output format (csv, json, orc, parquet, mysql, mssql, oracle)
51
49
  - `-r, --rows`: Number of rows to generate (default: 10)
52
50
  - `-s, --schema`: Path to schema file
51
+ - `-p, --provider`: Data provider (mimesis, faker) - default: mimesis
53
52
  - `-d, --debug`: Enable debug output
54
53
 
55
54
  ## Usage Examples
@@ -59,10 +58,20 @@ Generate 10 rows of CSV data:
59
58
  testdatax -o users.csv -f csv -s schema.json -r 10
60
59
  ```
61
60
 
61
+ Generate 10 rows of CSV data using the Faker provider:
62
+ ```bash
63
+ testdatax -o users.csv -f csv -s schema.json -r 10 -p faker
64
+ ```
65
+
62
66
  Generate 1000 rows of Parquet data with debug output:
63
67
  ```bash
64
68
  testdatax -o large_dataset.parquet -f parquet -s users_schema.json -r 1000 -d
65
69
  ```
70
+
71
+ Generate 1000 rows of Parquet data using the Mimesis provider:
72
+ ```bash
73
+ testdatax -o large_dataset.parquet -f parquet -s users_schema.json -r 1000 -p mimesis
74
+ ```
66
75
  Generate JSON data with default row count (10):
67
76
  ```bash
68
77
  testdatax -o data.json -f json -s schema.json
@@ -85,7 +94,7 @@ testdatax -o mstest.sql -f mssql -r 1000
85
94
 
86
95
  Generate Oracle with 1000 rows, table_name as 'oracle':
87
96
  ```bash
88
- datagen -o oracle.sql -f oracle -r 1000
97
+ testdatax -o oracle.sql -f oracle -r 1000
89
98
  ```
90
99
 
91
100
  Each command consists of:
@@ -93,6 +102,7 @@ Each command consists of:
93
102
  - `-f, --format`: Output format (csv, json, orc, parquet, mysql, mssql, oracle)
94
103
  - `-s, --schema`: Path to your schema definition file
95
104
  - `-r, --rows`: Number of rows to generate (optional, defaults to 10)
105
+ - `-p, --provider`: Data provider (mimesis, faker) - default: mimesis
96
106
  - `-d, --debug`: Enable debug logging (optional)
97
107
 
98
108
  ## Schema Example
@@ -101,7 +111,7 @@ Each command consists of:
101
111
  {
102
112
  "username": {
103
113
  "type": "string",
104
- "faker": "name"
114
+ "provider_field": "name"
105
115
  },
106
116
  "date_joined": {
107
117
  "type": "datetime"
@@ -148,7 +158,7 @@ The schema file defines the structure and constraints of your generated data. Ea
148
158
  "type": "string",
149
159
  "min_length": 5,
150
160
  "max_length": 20,
151
- "faker": "user_name" // Use faker to generate realistic data
161
+ "provider_field": "user_name" // Use provider-specific field to generate realistic data
152
162
  },
153
163
  "description": {
154
164
  "type": "text",
@@ -190,6 +200,12 @@ The schema file defines the structure and constraints of your generated data. Ea
190
200
  }
191
201
  ```
192
202
 
203
+ > **Note:** `start_date`/`end_date` bound the generated range (inclusive). When
204
+ > `format` is set, date/datetime values are rendered to a string with
205
+ > `strftime`; for the SQL exporters this means the column receives a formatted
206
+ > string literal rather than a native date, so `format` is best suited to the
207
+ > CSV/JSON formats.
208
+
193
209
  #### Enum Fields
194
210
  ```json
195
211
  {
@@ -201,25 +217,25 @@ The schema file defines the structure and constraints of your generated data. Ea
201
217
  }
202
218
  ```
203
219
 
204
- #### Using Faker
205
- The generator supports Faker providers for generating realistic data:
220
+ #### Using Data Providers
221
+ Both Mimesis and Faker providers support the same schema format. You can specify provider-specific generators using the `provider_field` field (works with both providers):
206
222
  ```json
207
223
  {
208
224
  "name": {
209
225
  "type": "string",
210
- "faker": "name"
226
+ "provider_field": "name"
211
227
  },
212
228
  "email": {
213
229
  "type": "string",
214
- "faker": "email"
230
+ "provider_field": "email"
215
231
  },
216
232
  "address": {
217
233
  "type": "string",
218
- "faker": "address"
234
+ "provider_field": "address"
219
235
  },
220
236
  "company": {
221
237
  "type": "string",
222
- "faker": "company"
238
+ "provider_field": "company"
223
239
  }
224
240
  }
225
241
  ```
@@ -233,12 +249,12 @@ The generator supports Faker providers for generating realistic data:
233
249
  },
234
250
  "username": {
235
251
  "type": "string",
236
- "faker": "user_name",
252
+ "provider_field": "user_name",
237
253
  "unique": true
238
254
  },
239
255
  "email": {
240
256
  "type": "string",
241
- "faker": "email",
257
+ "provider_field": "email",
242
258
  "unique": true
243
259
  },
244
260
  "age": {
@@ -263,6 +279,37 @@ The generator supports Faker providers for generating realistic data:
263
279
  }
264
280
  ```
265
281
 
282
+ ## Data Providers
283
+
284
+ TestDataX supports two powerful data providers for generating realistic test data:
285
+
286
+ ### Mimesis (Default)
287
+ Mimesis is a high-performance Python library for generating synthetic data. It provides:
288
+ - Fast data generation with excellent performance
289
+ - Support for multiple locales and languages
290
+ - Wide variety of data providers for different domains
291
+ - Lightweight and efficient implementation
292
+
293
+ ### Faker
294
+ Faker is a popular Python library for generating fake data. It offers:
295
+ - Extensive provider ecosystem with community contributions
296
+ - Rich set of localized providers
297
+ - Well-established and widely used in the Python community
298
+ - Comprehensive documentation and examples
299
+
300
+ You can specify the provider using the `-p` or `--provider` option:
301
+ ```bash
302
+ # Use Mimesis (default)
303
+ testdatax -o data.csv -f csv -p mimesis
304
+
305
+ # Use Faker
306
+ testdatax -o data.csv -f csv -p faker
307
+ ```
308
+
309
+ Both providers support the same schema format and generate compatible data types.
310
+
311
+ **Note:** For backward compatibility, the legacy `faker` field name is still supported, but `provider_field` is recommended for new schemas.
312
+
266
313
  ## Supported Data Types
267
314
 
268
315
  - string
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "TestDataX"
3
- version = "0.1.1"
3
+ version = "0.2.0"
4
4
  description = "A flexible test data generation toolkit"
5
5
  authors = ["JamesPBrett"]
6
6
  license = "MIT"
@@ -12,10 +12,10 @@ python = "^3.11"
12
12
  typer = "^0.15.1"
13
13
  faker = "^33.1.0"
14
14
  pydantic = "^2.10.4"
15
- orjson = "^3.10.12"
16
15
  pyarrow = "^18.1.0"
17
16
  pandas = "^2.2.3"
18
17
  mysql-connector-python = "^9.1.0"
18
+ mimesis = "^18.0.0"
19
19
 
20
20
  [tool.poetry.group.dev.dependencies]
21
21
  pytest = "^8.3.4"
@@ -38,13 +38,11 @@ types-psutil = "^6.1.0.20241221"
38
38
  commitizen = "^3.13.0"
39
39
  python-semantic-release = "^9.17.0"
40
40
 
41
- [build-system]
42
- requires = ["poetry-core"]
43
- build-backend = "poetry.core.masonry.api"
44
41
 
45
42
  [tool.poetry.scripts]
46
43
  testdatax = "src.cli:app"
47
44
 
45
+
48
46
  [tool.ruff]
49
47
  # Same as Black
50
48
  line-length = 88
@@ -84,6 +82,7 @@ exclude = [
84
82
  [tool.ruff.lint.isort]
85
83
  known-first-party = ["src"]
86
84
 
85
+
87
86
  [tool.black]
88
87
  line-length = 88
89
88
  target-version = ['py311']
@@ -119,6 +118,15 @@ warn_unreachable = true
119
118
  strict_optional = true
120
119
  plugins = ["pydantic.mypy"]
121
120
 
121
+ [[tool.mypy.overrides]]
122
+ module = "mimesis.*"
123
+ ignore_missing_imports = true
124
+
125
+ [[tool.mypy.overrides]]
126
+ module = "src.providers.mimesis_provider"
127
+ warn_return_any = false
128
+
129
+
122
130
  [tool.coverage.run]
123
131
  source = ["src"]
124
132
  branch = true
@@ -132,15 +140,17 @@ exclude_lines = [
132
140
  "pass",
133
141
  ]
134
142
 
143
+
135
144
  [tool.pytest.ini_options]
136
145
  testpaths = ["tests"]
137
146
  python_files = ["test_*.py"]
138
147
  python_classes = ["Test*"]
139
148
  python_functions = ["test_*"]
140
149
 
150
+
141
151
  [tool.commitizen]
142
152
  name = "cz_conventional_commits"
143
- version = "0.1.0"
153
+ version = "0.1.3"
144
154
  tag_format = "v$version"
145
155
  version_files = [
146
156
  "src/__init__.py:__version__",
@@ -214,3 +224,8 @@ allowed_tags = [
214
224
  "chore", # Maintenance tasks
215
225
  "refactor", # Code changes without fixing bugs or adding features
216
226
  ]
227
+
228
+ [build-system]
229
+ requires = ["poetry-core"]
230
+ build-backend = "poetry.core.masonry.api"
231
+
@@ -1,6 +1,6 @@
1
1
  """TestDataX package initialization."""
2
2
 
3
- __version__ = "0.1.1"
3
+ __version__ = "0.2.0"
4
4
 
5
5
  from src.cli import app # noqa
6
6
 
@@ -9,6 +9,7 @@ from .exporters.base_exporter import BaseExporter
9
9
  from .exporters.utils.constants import DEFAULT_SCHEMA, EXPORT_FORMATS
10
10
  from .exporters.utils.exporter_config import EXPORTER_CLASSES
11
11
  from .generator import DataGenerator
12
+ from .providers import FakerProvider, MimesisProvider
12
13
  from .schemas import DataType, FieldSchema, GeneratorConfig
13
14
 
14
15
 
@@ -42,6 +43,9 @@ FORMAT_OPTION = typer.Option(
42
43
  ROWS_OPTION = typer.Option(10, "--rows", "-r", help="Number of rows to generate")
43
44
  SCHEMA_PATH_OPTION = typer.Option(None, "--schema", "-s", help="Path to schema file")
44
45
  DEBUG_OPTION = typer.Option(False, "--debug", "-d", help="Enable debug output")
46
+ PROVIDER_OPTION = typer.Option(
47
+ "mimesis", "--provider", "-p", help="Data provider (faker or mimesis)"
48
+ )
45
49
 
46
50
 
47
51
  @app.command()
@@ -51,6 +55,7 @@ def generate(
51
55
  rows: int = ROWS_OPTION,
52
56
  schema_path: Path | None = SCHEMA_PATH_OPTION,
53
57
  debug: bool = DEBUG_OPTION,
58
+ provider: str = PROVIDER_OPTION,
54
59
  ) -> None:
55
60
  """Generate synthetic data based on the provided schema."""
56
61
  try:
@@ -97,15 +102,30 @@ def generate(
97
102
  f"{min_value}, {max_value}"
98
103
  )
99
104
 
105
+ # Accept "precision" as an alias for "right_digits"; use an
106
+ # explicit None check so an intentional 0 is not dropped.
107
+ right_digits = field_def.get("right_digits")
108
+ if right_digits is None:
109
+ right_digits = field_def.get("precision")
110
+
100
111
  field_schema = FieldSchema(
101
112
  name=name,
102
113
  type=field_type,
103
114
  enum_values=field_def.get("values"),
104
115
  min_value=min_value,
105
116
  max_value=max_value,
106
- right_digits=field_def.get("right_digits"),
107
- value_provider=field_def.get("faker"),
117
+ right_digits=right_digits,
118
+ value_provider=field_def.get("provider_field")
119
+ or field_def.get("faker"),
108
120
  pattern=field_def.get("pattern"),
121
+ nullable=field_def.get("nullable", False),
122
+ unique=field_def.get("unique", False),
123
+ weights=field_def.get("weights"),
124
+ min_length=field_def.get("min_length"),
125
+ max_length=field_def.get("max_length"),
126
+ start_date=field_def.get("start_date"),
127
+ end_date=field_def.get("end_date"),
128
+ format=field_def.get("format"),
109
129
  )
110
130
  fields.append(field_schema.model_dump())
111
131
  else:
@@ -117,6 +137,10 @@ def generate(
117
137
  if format not in EXPORT_FORMATS:
118
138
  raise ValueError(f"Unsupported format: {format}")
119
139
 
140
+ # Validate provider
141
+ if provider.lower() not in ["faker", "mimesis"]:
142
+ raise ValueError(f"Unsupported provider: {provider}")
143
+
120
144
  # Create generator config
121
145
  if debug:
122
146
  typer.echo(f"Converted fields: {fields}", err=False)
@@ -127,7 +151,12 @@ def generate(
127
151
  # Generate data
128
152
  if debug:
129
153
  typer.echo(f"Generator config: {config}", err=False)
130
- generator = DataGenerator()
154
+
155
+ # Select provider
156
+ data_provider = (
157
+ MimesisProvider() if provider.lower() == "mimesis" else FakerProvider()
158
+ )
159
+ generator = DataGenerator(provider=data_provider)
131
160
  data = generator.generate_data(config.fields, config.row_count)
132
161
 
133
162
  # Export data
@@ -148,9 +177,10 @@ def generate(
148
177
  raise typer.Exit(code=1) from e
149
178
  except Exception as e:
150
179
  typer.echo(f"Error: {str(e)}", err=True)
151
- typer.echo(f"Exception type: {type(e).__name__}", err=True)
152
- typer.echo(f"Exception args: {e.args}", err=True)
153
- typer.echo(f"Traceback: {traceback.format_exc()}", err=True)
180
+ if debug:
181
+ typer.echo(f"Exception type: {type(e).__name__}", err=True)
182
+ typer.echo(f"Exception args: {e.args}", err=True)
183
+ typer.echo(f"Traceback: {traceback.format_exc()}", err=True)
154
184
  raise typer.Exit(code=1) from e
155
185
 
156
186
 
@@ -77,10 +77,8 @@ class CsvExporter(BaseExporter):
77
77
  fieldnames = list(data[0].keys())
78
78
 
79
79
  first_chunk = True
80
- formatted_rows = []
81
80
  for chunk in self.chunker.chunk_data(data):
82
81
  formatted_chunk = [self.formatter.format_row(row) for row in chunk]
83
- formatted_rows.extend(formatted_chunk)
84
82
  df = pd.DataFrame(formatted_chunk, columns=fieldnames)
85
83
 
86
84
  # Write the data to CSV in chunks
@@ -62,16 +62,22 @@ class JsonExporter(BaseExporter):
62
62
  raise ValueError(
63
63
  f"Field '{field}' in schema is not present in data."
64
64
  )
65
- # Format the data and write it in chunks to the output file
66
- all_formatted_rows = []
67
- for chunk in self.chunker.chunk_data(data):
68
- formatted_chunk = [self.formatter.format_row(row) for row in chunk]
69
- all_formatted_rows.extend(formatted_chunk)
70
65
 
71
- # Write the complete file with proper formatting using json.dumps
66
+ # Stream a valid JSON array to disk one chunk at a time so the whole
67
+ # dataset is never held in memory at once.
72
68
  with open(output_path, "w", encoding="utf-8") as f:
73
- json_str = json.dumps(all_formatted_rows, indent=4)
74
- f.write(json_str)
69
+ f.write("[")
70
+ first = True
71
+ for chunk in self.chunker.chunk_data(data):
72
+ for row in chunk:
73
+ formatted = self.formatter.format_row(row)
74
+ block = json.dumps(formatted, indent=4)
75
+ indented = "\n".join(
76
+ " " + line for line in block.splitlines()
77
+ )
78
+ f.write(("\n" if first else ",\n") + indented)
79
+ first = False
80
+ f.write("\n]" if not first else "]")
75
81
 
76
82
  logger.info(f"Successfully exported {len(data)} rows to {output_path}.")
77
83