great-generator 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- great_generator-0.1.0/.gitignore +14 -0
- great_generator-0.1.0/CHANGELOG.md +56 -0
- great_generator-0.1.0/CODE_OF_CONDUCT.md +26 -0
- great_generator-0.1.0/CONTRIBUTING.md +80 -0
- great_generator-0.1.0/LICENSE +21 -0
- great_generator-0.1.0/PKG-INFO +1181 -0
- great_generator-0.1.0/README.md +1127 -0
- great_generator-0.1.0/SECURITY.md +31 -0
- great_generator-0.1.0/benchmarks/benchmark_pandas_generation.py +28 -0
- great_generator-0.1.0/docs/ANOMALIES.md +31 -0
- great_generator-0.1.0/docs/API_REFERENCE.md +66 -0
- great_generator-0.1.0/docs/BENCHMARKS.md +20 -0
- great_generator-0.1.0/docs/CDC.md +29 -0
- great_generator-0.1.0/docs/CLOUD_DEPLOYMENT.md +128 -0
- great_generator-0.1.0/docs/CONTRIBUTING_GUIDE.md +24 -0
- great_generator-0.1.0/docs/DOMAIN_PACKS.md +30 -0
- great_generator-0.1.0/docs/OPEN_SOURCE_STRATEGY.md +50 -0
- great_generator-0.1.0/docs/PYPI_RELEASE.md +153 -0
- great_generator-0.1.0/docs/QUICKSTART.md +42 -0
- great_generator-0.1.0/docs/REALISTIC_VALUES.md +17 -0
- great_generator-0.1.0/docs/RELEASE_0_1_0.md +57 -0
- great_generator-0.1.0/docs/SPARK_AND_DELTA.md +47 -0
- great_generator-0.1.0/enterprise_synth/__init__.py +9 -0
- great_generator-0.1.0/examples/banking_cdc_demo.py +9 -0
- great_generator-0.1.0/examples/banking_realistic_values_demo.py +20 -0
- great_generator-0.1.0/examples/cdc_pipeline_demo.py +22 -0
- great_generator-0.1.0/examples/data_quality_anomaly_demo.py +26 -0
- great_generator-0.1.0/examples/databricks_delta_export_demo.py +20 -0
- great_generator-0.1.0/examples/ecommerce_demo.py +7 -0
- great_generator-0.1.0/examples/ecommerce_realistic_values_demo.py +20 -0
- great_generator-0.1.0/examples/healthcare_realistic_values_demo.py +20 -0
- great_generator-0.1.0/examples/pandas_quickstart.ipynb +38 -0
- great_generator-0.1.0/examples/spark_delta_demo.py +13 -0
- great_generator-0.1.0/great_generator/__init__.py +23 -0
- great_generator-0.1.0/great_generator/anomalies/__init__.py +1 -0
- great_generator-0.1.0/great_generator/anomalies/injector.py +230 -0
- great_generator-0.1.0/great_generator/api.py +341 -0
- great_generator-0.1.0/great_generator/cdc/__init__.py +1 -0
- great_generator-0.1.0/great_generator/cdc/generator.py +154 -0
- great_generator-0.1.0/great_generator/config.py +538 -0
- great_generator-0.1.0/great_generator/core/__init__.py +1 -0
- great_generator-0.1.0/great_generator/core/realism.py +290 -0
- great_generator-0.1.0/great_generator/core/reference_values.py +221 -0
- great_generator-0.1.0/great_generator/core/value_generator.py +121 -0
- great_generator-0.1.0/great_generator/distributions/__init__.py +1 -0
- great_generator-0.1.0/great_generator/distributions/time_patterns.py +87 -0
- great_generator-0.1.0/great_generator/distributions/weighted.py +28 -0
- great_generator-0.1.0/great_generator/domains/__init__.py +33 -0
- great_generator-0.1.0/great_generator/domains/_industry.py +165 -0
- great_generator-0.1.0/great_generator/domains/automotive.py +160 -0
- great_generator-0.1.0/great_generator/domains/banking.py +495 -0
- great_generator-0.1.0/great_generator/domains/common/__init__.py +1 -0
- great_generator-0.1.0/great_generator/domains/common/reference_values.py +3 -0
- great_generator-0.1.0/great_generator/domains/ecommerce.py +464 -0
- great_generator-0.1.0/great_generator/domains/energy.py +150 -0
- great_generator-0.1.0/great_generator/domains/healthcare.py +416 -0
- great_generator-0.1.0/great_generator/domains/hospitality.py +164 -0
- great_generator-0.1.0/great_generator/domains/insurance.py +162 -0
- great_generator-0.1.0/great_generator/domains/logistics.py +381 -0
- great_generator-0.1.0/great_generator/domains/manufacturing.py +156 -0
- great_generator-0.1.0/great_generator/domains/media.py +147 -0
- great_generator-0.1.0/great_generator/domains/public_sector.py +158 -0
- great_generator-0.1.0/great_generator/domains/saas.py +444 -0
- great_generator-0.1.0/great_generator/domains/telecom.py +410 -0
- great_generator-0.1.0/great_generator/engines/__init__.py +1 -0
- great_generator-0.1.0/great_generator/engines/pandas_engine.py +28 -0
- great_generator-0.1.0/great_generator/engines/spark_engine.py +810 -0
- great_generator-0.1.0/great_generator/exporters/__init__.py +1 -0
- great_generator-0.1.0/great_generator/exporters/csv_exporter.py +34 -0
- great_generator-0.1.0/great_generator/exporters/delta_exporter.py +32 -0
- great_generator-0.1.0/great_generator/exporters/json_exporter.py +36 -0
- great_generator-0.1.0/great_generator/exporters/parquet_exporter.py +39 -0
- great_generator-0.1.0/great_generator/exporters/pathing.py +42 -0
- great_generator-0.1.0/great_generator/exporters/spark_options.py +32 -0
- great_generator-0.1.0/great_generator/relationships/__init__.py +1 -0
- great_generator-0.1.0/great_generator/relationships/graph.py +28 -0
- great_generator-0.1.0/great_generator/relationships/keys.py +37 -0
- great_generator-0.1.0/great_generator/schemas/__init__.py +1 -0
- great_generator-0.1.0/great_generator/schemas/generation.py +450 -0
- great_generator-0.1.0/great_generator/schemas/models.py +87 -0
- great_generator-0.1.0/great_generator/schemas/relational.py +368 -0
- great_generator-0.1.0/great_generator/utils/__init__.py +1 -0
- great_generator-0.1.0/great_generator/utils/random.py +20 -0
- great_generator-0.1.0/great_generator/utils/validation.py +43 -0
- great_generator-0.1.0/pyproject.toml +134 -0
- great_generator-0.1.0/tests/test_additional_domains.py +194 -0
- great_generator-0.1.0/tests/test_anomaly_injection.py +35 -0
- great_generator-0.1.0/tests/test_api.py +47 -0
- great_generator-0.1.0/tests/test_banking_domain.py +44 -0
- great_generator-0.1.0/tests/test_cdc_generation.py +38 -0
- great_generator-0.1.0/tests/test_ecommerce_domain.py +35 -0
- great_generator-0.1.0/tests/test_export_paths.py +140 -0
- great_generator-0.1.0/tests/test_exports.py +24 -0
- great_generator-0.1.0/tests/test_import_compatibility.py +8 -0
- great_generator-0.1.0/tests/test_realistic_values.py +71 -0
- great_generator-0.1.0/tests/test_relational_generation.py +111 -0
- great_generator-0.1.0/tests/test_relationship_integrity.py +33 -0
- great_generator-0.1.0/tests/test_scale_profiles.py +34 -0
- great_generator-0.1.0/tests/test_schema_generation.py +86 -0
- great_generator-0.1.0/tests/test_seed_reproducibility.py +35 -0
- great_generator-0.1.0/tests/test_spark_optional.py +126 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented here.
|
|
4
|
+
|
|
5
|
+
This project follows semantic versioning, starting with the initial public release (0.1.0).
|
|
6
|
+
|
|
7
|
+
## Unreleased
|
|
8
|
+
|
|
9
|
+
No unreleased changes yet.
|
|
10
|
+
|
|
11
|
+
## 0.1.0 - 2026-06-19
|
|
12
|
+
|
|
13
|
+
Initial public PyPI release.
|
|
14
|
+
|
|
15
|
+
### Added
|
|
16
|
+
|
|
17
|
+
- Great Generator package identity: publish as `great-generator` and import as `great_generator`.
|
|
18
|
+
- Backward-compatible `enterprise_synth` import alias for pre-release users of the earlier repo name.
|
|
19
|
+
- Faker-backed realistic value generation for pandas outputs.
|
|
20
|
+
- Spark-native deterministic realistic value generation for Spark outputs.
|
|
21
|
+
- `realism` mode for domain, relational, and schema generation APIs.
|
|
22
|
+
- Realistic customer, patient, resident, user, merchant, product, provider, organization, company, phone, email, address, city, state, and postal-code fields where applicable.
|
|
23
|
+
- Curated domain reference values for banking, ecommerce, healthcare, insurance, telecom, manufacturing, logistics, energy, hospitality, SaaS, public sector, media, and automotive-style data.
|
|
24
|
+
- Ecommerce domain pack with customers, products, orders, order items, payments, shipments, and returns.
|
|
25
|
+
- Banking domain pack with customers, accounts, transactions, cards, merchants, fraud events, and CDC-style customer changes.
|
|
26
|
+
- Healthcare domain pack with patients, providers, facilities, encounters, claims, prescriptions, and lab results.
|
|
27
|
+
- Telecom domain pack with customers, plans, devices, subscriptions, usage events, invoices, and support tickets.
|
|
28
|
+
- Logistics domain pack with shippers, warehouses, carriers, products, shipments, shipment events, and inventory movements.
|
|
29
|
+
- SaaS domain pack with organizations, users, plans, subscriptions, features, usage events, invoices, and support tickets.
|
|
30
|
+
- Insurance domain pack with customers, agents, policies, claims, premium payments, risk assessments, and reinsurance contracts.
|
|
31
|
+
- Automotive domain pack with customers, dealers, vehicles, sales, service appointments, warranty claims, and telematics events.
|
|
32
|
+
- Energy domain pack with customers, sites, meters, rate plans, usage readings, outages, and bills.
|
|
33
|
+
- Manufacturing domain pack with suppliers, plants, products, work orders, production runs, quality inspections, and inventory movements.
|
|
34
|
+
- Media domain pack with users, content titles, subscriptions, viewing events, ad campaigns, ad impressions, and game sessions.
|
|
35
|
+
- Public sector domain pack with residents, agencies, programs, applications, cases, payments, and service requests.
|
|
36
|
+
- Hospitality domain pack with customers, properties, rooms, reservations, stays, payments, and reviews.
|
|
37
|
+
- Shared industry-domain generator for compact domain packs with valid relationships and domain-looking values.
|
|
38
|
+
- Schema-driven Spark fallback for newer domain packs, preserving primary-key and foreign-key consistency.
|
|
39
|
+
- Deterministic generation with seeds.
|
|
40
|
+
- Pandas generation engine.
|
|
41
|
+
- Optional Spark generation engine.
|
|
42
|
+
- CSV, JSON, Parquet, and Delta export helpers.
|
|
43
|
+
- Cloud-friendly Spark path handling for local paths, DBFS, S3, ADLS, and GCS-style URIs.
|
|
44
|
+
- Spark export controls for writer options, partitioning, repartitioning, and coalescing.
|
|
45
|
+
- CDC generation for banking customer changes.
|
|
46
|
+
- Opt-in anomaly injection for nulls, duplicates, orphan keys, late records, out-of-order records, outliers, negative amounts, invalid statuses, and skew.
|
|
47
|
+
- Schema-first generation from compact schema strings, pandas DataFrames, PySpark StructTypes, and PySpark DataFrames.
|
|
48
|
+
- Custom relational schema generation with user-provided tables, row counts, primary keys, foreign keys, pandas/Spark output, and optional exports.
|
|
49
|
+
- Realistic-value examples, documentation pages, a GitHub Wiki, and a lightweight pandas benchmark script.
|
|
50
|
+
- Tests for realistic value quality, placeholder compatibility, seed reproducibility, and relationship safety.
|
|
51
|
+
- Tests for domain generation, relationship integrity, exports, CDC, anomalies, seed reproducibility, schema generation, and optional Spark behavior.
|
|
52
|
+
|
|
53
|
+
### Notes
|
|
54
|
+
|
|
55
|
+
- Spark and Delta dependencies are optional extras.
|
|
56
|
+
- JSON-native nested payload generation is planned for a future release.
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Code of Conduct
|
|
2
|
+
|
|
3
|
+
This project is meant to be a practical, welcoming place for people building synthetic data tools for demos, tests, education, benchmarking, and research.
|
|
4
|
+
|
|
5
|
+
## Expected behavior
|
|
6
|
+
|
|
7
|
+
- Be respectful and constructive.
|
|
8
|
+
- Assume good intent, but accept correction gracefully.
|
|
9
|
+
- Keep discussions focused on the work.
|
|
10
|
+
- Make space for beginners, students, practitioners, and domain experts.
|
|
11
|
+
- Prefer specific technical feedback over personal criticism.
|
|
12
|
+
|
|
13
|
+
## Unacceptable behavior
|
|
14
|
+
|
|
15
|
+
- Harassment, threats, insults, or discriminatory language.
|
|
16
|
+
- Posting private information without permission.
|
|
17
|
+
- Repeatedly derailing technical discussions.
|
|
18
|
+
- Using issues, pull requests, or discussions to attack people rather than improve the project.
|
|
19
|
+
|
|
20
|
+
## Enforcement
|
|
21
|
+
|
|
22
|
+
Project maintainers may remove comments, close threads, decline contributions, or restrict participation when behavior harms the project or its contributors.
|
|
23
|
+
|
|
24
|
+
If you need to report a conduct issue, email:
|
|
25
|
+
|
|
26
|
+
**ravikiran.pagidi@gmail.com**
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# Contributing
|
|
2
|
+
|
|
3
|
+
Thanks for helping make `great-generator` more useful.
|
|
4
|
+
|
|
5
|
+
## Local setup
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
python -m venv .venv
|
|
9
|
+
. .venv/bin/activate
|
|
10
|
+
pip install -e ".[dev]"
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Development workflow
|
|
14
|
+
|
|
15
|
+
1. Create a focused branch.
|
|
16
|
+
2. Add or update tests with every behavior change.
|
|
17
|
+
3. Run:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
ruff check .
|
|
21
|
+
black --check .
|
|
22
|
+
pytest
|
|
23
|
+
python -m build
|
|
24
|
+
python -m twine check dist/*
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
4. Keep domain packs realistic, deterministic, and dependency-light.
|
|
28
|
+
5. Prefer small public APIs with excellent examples over clever internals.
|
|
29
|
+
|
|
30
|
+
## Good contributions
|
|
31
|
+
|
|
32
|
+
- New domain packs with documented relationships and behaviors
|
|
33
|
+
- Additional realistic reference values for existing domains
|
|
34
|
+
- Additional anomaly types that are opt-in and testable
|
|
35
|
+
- Spark generation improvements that preserve deterministic behavior
|
|
36
|
+
- Exporters, schema utilities, and documentation examples
|
|
37
|
+
- Bug fixes with regression tests
|
|
38
|
+
|
|
39
|
+
## Adding a domain pack
|
|
40
|
+
|
|
41
|
+
A good domain pack should include:
|
|
42
|
+
|
|
43
|
+
1. table schemas with primary keys and foreign keys
|
|
44
|
+
2. deterministic pandas generation
|
|
45
|
+
3. Spark support through either a domain-specific generator or schema-driven fallback
|
|
46
|
+
4. realistic distributions, skew, and time behavior
|
|
47
|
+
5. tests for tables, columns, relationships, and seed reproducibility
|
|
48
|
+
6. README or docs examples showing why the domain is useful
|
|
49
|
+
|
|
50
|
+
## Adding realistic values
|
|
51
|
+
|
|
52
|
+
Add reusable business values to `great_generator/core/reference_values.py` when they can help more than one domain or user-provided schema. Keep lists realistic, dependency-light, and safe for public demos. Add tests that prove realistic mode is not returning placeholder-only values.
|
|
53
|
+
|
|
54
|
+
## Suggested starter issues
|
|
55
|
+
|
|
56
|
+
- Add realistic values for telecom plans and device models
|
|
57
|
+
- Add ecommerce dashboard demo notebook
|
|
58
|
+
- Add Spark benchmark script for cluster runs
|
|
59
|
+
- Improve API reference docs with more examples
|
|
60
|
+
- Add Great Expectations integration example
|
|
61
|
+
- Add Microsoft Fabric demo using generated Parquet data
|
|
62
|
+
- Add more healthcare provider and facility reference values
|
|
63
|
+
- Add tests for realistic optional-null distribution
|
|
64
|
+
|
|
65
|
+
## Releases
|
|
66
|
+
|
|
67
|
+
Release work should follow [docs/PYPI_RELEASE.md](docs/PYPI_RELEASE.md).
|
|
68
|
+
|
|
69
|
+
## Community and security
|
|
70
|
+
|
|
71
|
+
- Code of conduct: [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md)
|
|
72
|
+
- Security policy: [SECURITY.md](SECURITY.md)
|
|
73
|
+
|
|
74
|
+
## Design principles
|
|
75
|
+
|
|
76
|
+
- Referential integrity by default
|
|
77
|
+
- Anomalies only when explicitly requested
|
|
78
|
+
- Seeds should make experiments reproducible
|
|
79
|
+
- Optional Spark/Delta dependencies must remain optional
|
|
80
|
+
- A newcomer should be able to get a first working result in under a minute
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ravi Kiran Pagidi
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|