great-generator 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. great_generator-0.1.0/.gitignore +14 -0
  2. great_generator-0.1.0/CHANGELOG.md +56 -0
  3. great_generator-0.1.0/CODE_OF_CONDUCT.md +26 -0
  4. great_generator-0.1.0/CONTRIBUTING.md +80 -0
  5. great_generator-0.1.0/LICENSE +21 -0
  6. great_generator-0.1.0/PKG-INFO +1181 -0
  7. great_generator-0.1.0/README.md +1127 -0
  8. great_generator-0.1.0/SECURITY.md +31 -0
  9. great_generator-0.1.0/benchmarks/benchmark_pandas_generation.py +28 -0
  10. great_generator-0.1.0/docs/ANOMALIES.md +31 -0
  11. great_generator-0.1.0/docs/API_REFERENCE.md +66 -0
  12. great_generator-0.1.0/docs/BENCHMARKS.md +20 -0
  13. great_generator-0.1.0/docs/CDC.md +29 -0
  14. great_generator-0.1.0/docs/CLOUD_DEPLOYMENT.md +128 -0
  15. great_generator-0.1.0/docs/CONTRIBUTING_GUIDE.md +24 -0
  16. great_generator-0.1.0/docs/DOMAIN_PACKS.md +30 -0
  17. great_generator-0.1.0/docs/OPEN_SOURCE_STRATEGY.md +50 -0
  18. great_generator-0.1.0/docs/PYPI_RELEASE.md +153 -0
  19. great_generator-0.1.0/docs/QUICKSTART.md +42 -0
  20. great_generator-0.1.0/docs/REALISTIC_VALUES.md +17 -0
  21. great_generator-0.1.0/docs/RELEASE_0_1_0.md +57 -0
  22. great_generator-0.1.0/docs/SPARK_AND_DELTA.md +47 -0
  23. great_generator-0.1.0/enterprise_synth/__init__.py +9 -0
  24. great_generator-0.1.0/examples/banking_cdc_demo.py +9 -0
  25. great_generator-0.1.0/examples/banking_realistic_values_demo.py +20 -0
  26. great_generator-0.1.0/examples/cdc_pipeline_demo.py +22 -0
  27. great_generator-0.1.0/examples/data_quality_anomaly_demo.py +26 -0
  28. great_generator-0.1.0/examples/databricks_delta_export_demo.py +20 -0
  29. great_generator-0.1.0/examples/ecommerce_demo.py +7 -0
  30. great_generator-0.1.0/examples/ecommerce_realistic_values_demo.py +20 -0
  31. great_generator-0.1.0/examples/healthcare_realistic_values_demo.py +20 -0
  32. great_generator-0.1.0/examples/pandas_quickstart.ipynb +38 -0
  33. great_generator-0.1.0/examples/spark_delta_demo.py +13 -0
  34. great_generator-0.1.0/great_generator/__init__.py +23 -0
  35. great_generator-0.1.0/great_generator/anomalies/__init__.py +1 -0
  36. great_generator-0.1.0/great_generator/anomalies/injector.py +230 -0
  37. great_generator-0.1.0/great_generator/api.py +341 -0
  38. great_generator-0.1.0/great_generator/cdc/__init__.py +1 -0
  39. great_generator-0.1.0/great_generator/cdc/generator.py +154 -0
  40. great_generator-0.1.0/great_generator/config.py +538 -0
  41. great_generator-0.1.0/great_generator/core/__init__.py +1 -0
  42. great_generator-0.1.0/great_generator/core/realism.py +290 -0
  43. great_generator-0.1.0/great_generator/core/reference_values.py +221 -0
  44. great_generator-0.1.0/great_generator/core/value_generator.py +121 -0
  45. great_generator-0.1.0/great_generator/distributions/__init__.py +1 -0
  46. great_generator-0.1.0/great_generator/distributions/time_patterns.py +87 -0
  47. great_generator-0.1.0/great_generator/distributions/weighted.py +28 -0
  48. great_generator-0.1.0/great_generator/domains/__init__.py +33 -0
  49. great_generator-0.1.0/great_generator/domains/_industry.py +165 -0
  50. great_generator-0.1.0/great_generator/domains/automotive.py +160 -0
  51. great_generator-0.1.0/great_generator/domains/banking.py +495 -0
  52. great_generator-0.1.0/great_generator/domains/common/__init__.py +1 -0
  53. great_generator-0.1.0/great_generator/domains/common/reference_values.py +3 -0
  54. great_generator-0.1.0/great_generator/domains/ecommerce.py +464 -0
  55. great_generator-0.1.0/great_generator/domains/energy.py +150 -0
  56. great_generator-0.1.0/great_generator/domains/healthcare.py +416 -0
  57. great_generator-0.1.0/great_generator/domains/hospitality.py +164 -0
  58. great_generator-0.1.0/great_generator/domains/insurance.py +162 -0
  59. great_generator-0.1.0/great_generator/domains/logistics.py +381 -0
  60. great_generator-0.1.0/great_generator/domains/manufacturing.py +156 -0
  61. great_generator-0.1.0/great_generator/domains/media.py +147 -0
  62. great_generator-0.1.0/great_generator/domains/public_sector.py +158 -0
  63. great_generator-0.1.0/great_generator/domains/saas.py +444 -0
  64. great_generator-0.1.0/great_generator/domains/telecom.py +410 -0
  65. great_generator-0.1.0/great_generator/engines/__init__.py +1 -0
  66. great_generator-0.1.0/great_generator/engines/pandas_engine.py +28 -0
  67. great_generator-0.1.0/great_generator/engines/spark_engine.py +810 -0
  68. great_generator-0.1.0/great_generator/exporters/__init__.py +1 -0
  69. great_generator-0.1.0/great_generator/exporters/csv_exporter.py +34 -0
  70. great_generator-0.1.0/great_generator/exporters/delta_exporter.py +32 -0
  71. great_generator-0.1.0/great_generator/exporters/json_exporter.py +36 -0
  72. great_generator-0.1.0/great_generator/exporters/parquet_exporter.py +39 -0
  73. great_generator-0.1.0/great_generator/exporters/pathing.py +42 -0
  74. great_generator-0.1.0/great_generator/exporters/spark_options.py +32 -0
  75. great_generator-0.1.0/great_generator/relationships/__init__.py +1 -0
  76. great_generator-0.1.0/great_generator/relationships/graph.py +28 -0
  77. great_generator-0.1.0/great_generator/relationships/keys.py +37 -0
  78. great_generator-0.1.0/great_generator/schemas/__init__.py +1 -0
  79. great_generator-0.1.0/great_generator/schemas/generation.py +450 -0
  80. great_generator-0.1.0/great_generator/schemas/models.py +87 -0
  81. great_generator-0.1.0/great_generator/schemas/relational.py +368 -0
  82. great_generator-0.1.0/great_generator/utils/__init__.py +1 -0
  83. great_generator-0.1.0/great_generator/utils/random.py +20 -0
  84. great_generator-0.1.0/great_generator/utils/validation.py +43 -0
  85. great_generator-0.1.0/pyproject.toml +134 -0
  86. great_generator-0.1.0/tests/test_additional_domains.py +194 -0
  87. great_generator-0.1.0/tests/test_anomaly_injection.py +35 -0
  88. great_generator-0.1.0/tests/test_api.py +47 -0
  89. great_generator-0.1.0/tests/test_banking_domain.py +44 -0
  90. great_generator-0.1.0/tests/test_cdc_generation.py +38 -0
  91. great_generator-0.1.0/tests/test_ecommerce_domain.py +35 -0
  92. great_generator-0.1.0/tests/test_export_paths.py +140 -0
  93. great_generator-0.1.0/tests/test_exports.py +24 -0
  94. great_generator-0.1.0/tests/test_import_compatibility.py +8 -0
  95. great_generator-0.1.0/tests/test_realistic_values.py +71 -0
  96. great_generator-0.1.0/tests/test_relational_generation.py +111 -0
  97. great_generator-0.1.0/tests/test_relationship_integrity.py +33 -0
  98. great_generator-0.1.0/tests/test_scale_profiles.py +34 -0
  99. great_generator-0.1.0/tests/test_schema_generation.py +86 -0
  100. great_generator-0.1.0/tests/test_seed_reproducibility.py +35 -0
  101. great_generator-0.1.0/tests/test_spark_optional.py +126 -0
@@ -0,0 +1,14 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ .pytest_cache/
4
+ .ruff_cache/
5
+ .mypy_cache/
6
+ .venv/
7
+ dist/
8
+ build/
9
+ *.egg-info/
10
+ _publish_clone/
11
+ .ipynb_checkpoints/
12
+ .coverage
13
+ coverage.xml
14
+ htmlcov/
@@ -0,0 +1,56 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented here.
4
+
5
+ This project follows semantic versioning as of its first public release.
6
+
7
+ ## Unreleased
8
+
9
+ No unreleased changes yet.
10
+
11
+ ## 0.1.0 - 2026-06-19
12
+
13
+ Initial public PyPI release.
14
+
15
+ ### Added
16
+
17
+ - Great Generator package identity: published as `great-generator` and imported as `great_generator`.
18
+ - Backward-compatible `enterprise_synth` import alias for pre-release users of the earlier repo name.
19
+ - Faker-backed realistic value generation for pandas outputs.
20
+ - Spark-native deterministic realistic value generation for Spark outputs.
21
+ - `realism` mode for domain, relational, and schema generation APIs.
22
+ - Realistic customer, patient, resident, user, merchant, product, provider, organization, company, phone, email, address, city, state, and postal-code fields where applicable.
23
+ - Curated domain reference values for banking, ecommerce, healthcare, insurance, telecom, manufacturing, logistics, energy, hospitality, SaaS, public sector, media, and automotive-style data.
24
+ - Ecommerce domain pack with customers, products, orders, order items, payments, shipments, and returns.
25
+ - Banking domain pack with customers, accounts, transactions, cards, merchants, fraud events, and CDC-style customer changes.
26
+ - Healthcare domain pack with patients, providers, facilities, encounters, claims, prescriptions, and lab results.
27
+ - Telecom domain pack with customers, plans, devices, subscriptions, usage events, invoices, and support tickets.
28
+ - Logistics domain pack with shippers, warehouses, carriers, products, shipments, shipment events, and inventory movements.
29
+ - SaaS domain pack with organizations, users, plans, subscriptions, features, usage events, invoices, and support tickets.
30
+ - Insurance domain pack with customers, agents, policies, claims, premium payments, risk assessments, and reinsurance contracts.
31
+ - Automotive domain pack with customers, dealers, vehicles, sales, service appointments, warranty claims, and telematics events.
32
+ - Energy domain pack with customers, sites, meters, rate plans, usage readings, outages, and bills.
33
+ - Manufacturing domain pack with suppliers, plants, products, work orders, production runs, quality inspections, and inventory movements.
34
+ - Media domain pack with users, content titles, subscriptions, viewing events, ad campaigns, ad impressions, and game sessions.
35
+ - Public sector domain pack with residents, agencies, programs, applications, cases, payments, and service requests.
36
+ - Hospitality domain pack with customers, properties, rooms, reservations, stays, payments, and reviews.
37
+ - Shared industry-domain generator for compact domain packs with valid relationships and domain-looking values.
38
+ - Schema-driven Spark fallback for newer domain packs, preserving primary-key and foreign-key consistency.
39
+ - Deterministic generation with seeds.
40
+ - Pandas generation engine.
41
+ - Optional Spark generation engine.
42
+ - CSV, JSON, Parquet, and Delta export helpers.
43
+ - Cloud-friendly Spark path handling for local paths, DBFS, S3, ADLS, and GCS-style URIs.
44
+ - Spark export controls for writer options, partitioning, repartitioning, and coalescing.
45
+ - CDC generation for banking customer changes.
46
+ - Opt-in anomaly injection for nulls, duplicates, orphan keys, late records, out-of-order records, outliers, negative amounts, invalid statuses, and skew.
47
+ - Schema-first generation from compact schema strings, pandas DataFrames, PySpark StructTypes, and PySpark DataFrames.
48
+ - Custom relational schema generation with user-provided tables, row counts, primary keys, foreign keys, pandas/Spark output, and optional exports.
49
+ - Realistic-value examples, documentation pages, a GitHub Wiki, and a lightweight pandas benchmark script.
50
+ - Tests for realistic value quality, placeholder compatibility, seed reproducibility, and relationship safety.
51
+ - Tests for domain generation, relationship integrity, exports, CDC, anomalies, seed reproducibility, schema generation, and optional Spark behavior.
52
+
53
+ ### Notes
54
+
55
+ - Spark and Delta dependencies are optional extras.
56
+ - JSON-native nested payload generation is planned for a future release.
@@ -0,0 +1,26 @@
1
+ # Code of Conduct
2
+
3
+ This project is meant to be a practical, welcoming place for people building synthetic data tools for demos, tests, education, benchmarking, and research.
4
+
5
+ ## Expected behavior
6
+
7
+ - Be respectful and constructive.
8
+ - Assume good intent, but accept correction gracefully.
9
+ - Keep discussions focused on the work.
10
+ - Make space for beginners, students, practitioners, and domain experts.
11
+ - Prefer specific technical feedback over personal criticism.
12
+
13
+ ## Unacceptable behavior
14
+
15
+ - Harassment, threats, insults, or discriminatory language.
16
+ - Posting private information without permission.
17
+ - Repeatedly derailing technical discussions.
18
+ - Using issues, pull requests, or discussions to attack people rather than improve the project.
19
+
20
+ ## Enforcement
21
+
22
+ Project maintainers may remove comments, close threads, decline contributions, or restrict participation when behavior harms the project or its contributors.
23
+
24
+ If you need to report a conduct issue, email:
25
+
26
+ **ravikiran.pagidi@gmail.com**
@@ -0,0 +1,80 @@
1
+ # Contributing
2
+
3
+ Thanks for helping make `great-generator` more useful.
4
+
5
+ ## Local setup
6
+
7
+ ```bash
8
+ python -m venv .venv
9
+ . .venv/bin/activate
10
+ pip install -e ".[dev]"
11
+ ```
12
+
13
+ ## Development workflow
14
+
15
+ 1. Create a focused branch.
16
+ 2. Add or update tests with every behavior change.
17
+ 3. Run:
18
+
19
+ ```bash
20
+ ruff check .
21
+ black --check .
22
+ pytest
23
+ python -m build
24
+ python -m twine check dist/*
25
+ ```
26
+
27
+ 4. Keep domain packs realistic, deterministic, and dependency-light.
28
+ 5. Prefer small public APIs with excellent examples over clever internals.
29
+
30
+ ## Good contributions
31
+
32
+ - New domain packs with documented relationships and behaviors
33
+ - Additional realistic reference values for existing domains
34
+ - Additional anomaly types that are opt-in and testable
35
+ - Spark generation improvements that preserve deterministic behavior
36
+ - Exporters, schema utilities, and documentation examples
37
+ - Bug fixes with regression tests
38
+
39
+ ## Adding a domain pack
40
+
41
+ A good domain pack should include:
42
+
43
+ 1. table schemas with primary keys and foreign keys
44
+ 2. deterministic pandas generation
45
+ 3. Spark support through either a domain-specific generator or schema-driven fallback
46
+ 4. realistic distributions, skew, and time behavior
47
+ 5. tests for tables, columns, relationships, and seed reproducibility
48
+ 6. README or docs examples showing why the domain is useful
49
+
50
+ ## Adding realistic values
51
+
52
+ Add reusable business values to `great_generator/core/reference_values.py` when they can help more than one domain or user-provided schema. Keep lists realistic, dependency-light, and safe for public demos. Add tests that prove realistic mode is not returning placeholder-only values.
53
+
54
+ ## Suggested starter issues
55
+
56
+ - Add realistic values for telecom plans and device models
57
+ - Add ecommerce dashboard demo notebook
58
+ - Add Spark benchmark script for cluster runs
59
+ - Improve API reference docs with more examples
60
+ - Add Great Expectations integration example
61
+ - Add Microsoft Fabric demo using generated Parquet data
62
+ - Add more healthcare provider and facility reference values
63
+ - Add tests for realistic optional-null distribution
64
+
65
+ ## Releases
66
+
67
+ Release work should follow [docs/PYPI_RELEASE.md](docs/PYPI_RELEASE.md).
68
+
69
+ ## Community and security
70
+
71
+ - Code of conduct: [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md)
72
+ - Security policy: [SECURITY.md](SECURITY.md)
73
+
74
+ ## Design principles
75
+
76
+ - Referential integrity by default
77
+ - Anomalies only when explicitly requested
78
+ - Seeds should make experiments reproducible
79
+ - Optional Spark/Delta dependencies must remain optional
80
+ - A newcomer should succeed in under a minute
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Ravi Kiran Pagidi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.