goldencheck 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- goldencheck-0.1.0/.github/FUNDING.yml +1 -0
- goldencheck-0.1.0/.github/ISSUE_TEMPLATE/bug_report.yml +41 -0
- goldencheck-0.1.0/.github/ISSUE_TEMPLATE/config.yml +5 -0
- goldencheck-0.1.0/.github/ISSUE_TEMPLATE/feature_request.yml +23 -0
- goldencheck-0.1.0/.github/PULL_REQUEST_TEMPLATE.md +17 -0
- goldencheck-0.1.0/.github/workflows/publish.yml +21 -0
- goldencheck-0.1.0/.github/workflows/test.yml +22 -0
- goldencheck-0.1.0/.gitignore +13 -0
- goldencheck-0.1.0/CODE_OF_CONDUCT.md +36 -0
- goldencheck-0.1.0/CONTRIBUTING.md +54 -0
- goldencheck-0.1.0/LICENSE +21 -0
- goldencheck-0.1.0/PKG-INFO +264 -0
- goldencheck-0.1.0/README.md +225 -0
- goldencheck-0.1.0/SECURITY.md +23 -0
- goldencheck-0.1.0/benchmarks/datasets/goldencheck_bench/ground_truth.json +539 -0
- goldencheck-0.1.0/benchmarks/datasets/raha_repo/.gitignore +160 -0
- goldencheck-0.1.0/benchmarks/datasets/raha_repo/LICENSE.md +201 -0
- goldencheck-0.1.0/benchmarks/datasets/raha_repo/MANIFEST +178 -0
- goldencheck-0.1.0/benchmarks/datasets/raha_repo/MANIFEST.in +3 -0
- goldencheck-0.1.0/benchmarks/datasets/raha_repo/README.md +69 -0
- goldencheck-0.1.0/benchmarks/datasets/raha_repo/requirements.txt +10 -0
- goldencheck-0.1.0/benchmarks/datasets/raha_repo/setup.cfg +2 -0
- goldencheck-0.1.0/benchmarks/datasets/raha_repo/setup.py +17 -0
- goldencheck-0.1.0/benchmarks/detection_benchmark.py +117 -0
- goldencheck-0.1.0/benchmarks/generate_datasets.py +430 -0
- goldencheck-0.1.0/benchmarks/goldencheck_benchmark.py +380 -0
- goldencheck-0.1.0/benchmarks/goldencheck_benchmark_llm.py +193 -0
- goldencheck-0.1.0/benchmarks/speed_benchmark.py +113 -0
- goldencheck-0.1.0/docs/superpowers/plans/2026-03-22-goldencheck-implementation.md +2003 -0
- goldencheck-0.1.0/docs/superpowers/plans/2026-03-23-llm-boost-implementation.md +996 -0
- goldencheck-0.1.0/docs/superpowers/specs/2026-03-22-goldencheck-design.md +356 -0
- goldencheck-0.1.0/docs/superpowers/specs/2026-03-23-llm-boost-design.md +342 -0
- goldencheck-0.1.0/docs/wiki/Architecture.md +217 -0
- goldencheck-0.1.0/docs/wiki/Benchmarks.md +143 -0
- goldencheck-0.1.0/docs/wiki/CLI.md +187 -0
- goldencheck-0.1.0/docs/wiki/Configuration.md +189 -0
- goldencheck-0.1.0/docs/wiki/Installation.md +105 -0
- goldencheck-0.1.0/docs/wiki/Interactive-TUI.md +144 -0
- goldencheck-0.1.0/docs/wiki/LLM-Boost.md +169 -0
- goldencheck-0.1.0/docs/wiki/Profilers.md +310 -0
- goldencheck-0.1.0/goldencheck/__init__.py +3 -0
- goldencheck-0.1.0/goldencheck/cli/__init__.py +0 -0
- goldencheck-0.1.0/goldencheck/cli/main.py +212 -0
- goldencheck-0.1.0/goldencheck/config/__init__.py +0 -0
- goldencheck-0.1.0/goldencheck/config/loader.py +21 -0
- goldencheck-0.1.0/goldencheck/config/schema.py +33 -0
- goldencheck-0.1.0/goldencheck/config/writer.py +16 -0
- goldencheck-0.1.0/goldencheck/engine/__init__.py +0 -0
- goldencheck-0.1.0/goldencheck/engine/reader.py +41 -0
- goldencheck-0.1.0/goldencheck/engine/sampler.py +8 -0
- goldencheck-0.1.0/goldencheck/engine/scanner.py +142 -0
- goldencheck-0.1.0/goldencheck/engine/validator.py +94 -0
- goldencheck-0.1.0/goldencheck/llm/__init__.py +0 -0
- goldencheck-0.1.0/goldencheck/llm/budget.py +70 -0
- goldencheck-0.1.0/goldencheck/llm/merger.py +87 -0
- goldencheck-0.1.0/goldencheck/llm/parser.py +27 -0
- goldencheck-0.1.0/goldencheck/llm/prompts.py +62 -0
- goldencheck-0.1.0/goldencheck/llm/providers.py +64 -0
- goldencheck-0.1.0/goldencheck/llm/sample_block.py +92 -0
- goldencheck-0.1.0/goldencheck/models/__init__.py +0 -0
- goldencheck-0.1.0/goldencheck/models/finding.py +21 -0
- goldencheck-0.1.0/goldencheck/models/profile.py +52 -0
- goldencheck-0.1.0/goldencheck/profilers/__init__.py +0 -0
- goldencheck-0.1.0/goldencheck/profilers/base.py +10 -0
- goldencheck-0.1.0/goldencheck/profilers/cardinality.py +41 -0
- goldencheck-0.1.0/goldencheck/profilers/format_detection.py +60 -0
- goldencheck-0.1.0/goldencheck/profilers/nullability.py +23 -0
- goldencheck-0.1.0/goldencheck/profilers/pattern_consistency.py +75 -0
- goldencheck-0.1.0/goldencheck/profilers/range_distribution.py +56 -0
- goldencheck-0.1.0/goldencheck/profilers/type_inference.py +31 -0
- goldencheck-0.1.0/goldencheck/profilers/uniqueness.py +24 -0
- goldencheck-0.1.0/goldencheck/relations/__init__.py +0 -0
- goldencheck-0.1.0/goldencheck/relations/null_correlation.py +64 -0
- goldencheck-0.1.0/goldencheck/relations/temporal.py +92 -0
- goldencheck-0.1.0/goldencheck/reporters/__init__.py +0 -0
- goldencheck-0.1.0/goldencheck/reporters/ci_reporter.py +13 -0
- goldencheck-0.1.0/goldencheck/reporters/json_reporter.py +36 -0
- goldencheck-0.1.0/goldencheck/reporters/rich_console.py +31 -0
- goldencheck-0.1.0/goldencheck/tui/__init__.py +0 -0
- goldencheck-0.1.0/goldencheck/tui/app.py +76 -0
- goldencheck-0.1.0/goldencheck/tui/column_detail.py +40 -0
- goldencheck-0.1.0/goldencheck/tui/findings.py +30 -0
- goldencheck-0.1.0/goldencheck/tui/overview.py +29 -0
- goldencheck-0.1.0/goldencheck/tui/rules.py +23 -0
- goldencheck-0.1.0/pyproject.toml +60 -0
- goldencheck-0.1.0/tests/__init__.py +0 -0
- goldencheck-0.1.0/tests/cli/__init__.py +0 -0
- goldencheck-0.1.0/tests/cli/test_cli.py +33 -0
- goldencheck-0.1.0/tests/config/__init__.py +0 -0
- goldencheck-0.1.0/tests/config/test_loader.py +30 -0
- goldencheck-0.1.0/tests/config/test_schema.py +34 -0
- goldencheck-0.1.0/tests/engine/__init__.py +0 -0
- goldencheck-0.1.0/tests/engine/test_reader.py +26 -0
- goldencheck-0.1.0/tests/engine/test_sampler.py +17 -0
- goldencheck-0.1.0/tests/engine/test_scanner.py +22 -0
- goldencheck-0.1.0/tests/engine/test_validator.py +34 -0
- goldencheck-0.1.0/tests/llm/__init__.py +0 -0
- goldencheck-0.1.0/tests/llm/test_budget.py +36 -0
- goldencheck-0.1.0/tests/llm/test_integration.py +27 -0
- goldencheck-0.1.0/tests/llm/test_merger.py +72 -0
- goldencheck-0.1.0/tests/llm/test_parser.py +26 -0
- goldencheck-0.1.0/tests/llm/test_prompts.py +30 -0
- goldencheck-0.1.0/tests/llm/test_sample_block.py +44 -0
- goldencheck-0.1.0/tests/models/__init__.py +0 -0
- goldencheck-0.1.0/tests/models/test_finding.py +34 -0
- goldencheck-0.1.0/tests/models/test_profile.py +45 -0
- goldencheck-0.1.0/tests/profilers/__init__.py +0 -0
- goldencheck-0.1.0/tests/profilers/test_cardinality.py +12 -0
- goldencheck-0.1.0/tests/profilers/test_format_detection.py +19 -0
- goldencheck-0.1.0/tests/profilers/test_nullability.py +18 -0
- goldencheck-0.1.0/tests/profilers/test_pattern_consistency.py +14 -0
- goldencheck-0.1.0/tests/profilers/test_range_distribution.py +20 -0
- goldencheck-0.1.0/tests/profilers/test_type_inference.py +21 -0
- goldencheck-0.1.0/tests/profilers/test_uniqueness.py +19 -0
- goldencheck-0.1.0/tests/relations/__init__.py +0 -0
- goldencheck-0.1.0/tests/relations/test_null_correlation.py +21 -0
- goldencheck-0.1.0/tests/relations/test_temporal.py +25 -0
- goldencheck-0.1.0/tests/reporters/__init__.py +0 -0
- goldencheck-0.1.0/tests/reporters/test_reporters.py +49 -0
- goldencheck-0.1.0/tests/test_integration.py +71 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
github: benzsevern
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
name: Bug Report
|
|
2
|
+
description: Report a bug in GoldenCheck
|
|
3
|
+
labels: ["bug"]
|
|
4
|
+
body:
|
|
5
|
+
- type: textarea
|
|
6
|
+
id: description
|
|
7
|
+
attributes:
|
|
8
|
+
label: Describe the bug
|
|
9
|
+
description: A clear description of what happened
|
|
10
|
+
validations:
|
|
11
|
+
required: true
|
|
12
|
+
- type: textarea
|
|
13
|
+
id: reproduce
|
|
14
|
+
attributes:
|
|
15
|
+
label: Steps to reproduce
|
|
16
|
+
description: |
|
|
17
|
+
1. Run `goldencheck ...`
|
|
18
|
+
2. See error
|
|
19
|
+
validations:
|
|
20
|
+
required: true
|
|
21
|
+
- type: textarea
|
|
22
|
+
id: expected
|
|
23
|
+
attributes:
|
|
24
|
+
label: Expected behavior
|
|
25
|
+
description: What should have happened
|
|
26
|
+
validations:
|
|
27
|
+
required: true
|
|
28
|
+
- type: input
|
|
29
|
+
id: version
|
|
30
|
+
attributes:
|
|
31
|
+
label: GoldenCheck version
|
|
32
|
+
placeholder: "0.1.0"
|
|
33
|
+
validations:
|
|
34
|
+
required: true
|
|
35
|
+
- type: input
|
|
36
|
+
id: python
|
|
37
|
+
attributes:
|
|
38
|
+
label: Python version
|
|
39
|
+
placeholder: "3.12"
|
|
40
|
+
validations:
|
|
41
|
+
required: true
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
name: Feature Request
|
|
2
|
+
description: Suggest a new feature
|
|
3
|
+
labels: ["enhancement"]
|
|
4
|
+
body:
|
|
5
|
+
- type: textarea
|
|
6
|
+
id: problem
|
|
7
|
+
attributes:
|
|
8
|
+
label: Problem
|
|
9
|
+
description: What problem are you trying to solve?
|
|
10
|
+
validations:
|
|
11
|
+
required: true
|
|
12
|
+
- type: textarea
|
|
13
|
+
id: solution
|
|
14
|
+
attributes:
|
|
15
|
+
label: Proposed solution
|
|
16
|
+
description: How would you like this to work?
|
|
17
|
+
validations:
|
|
18
|
+
required: true
|
|
19
|
+
- type: textarea
|
|
20
|
+
id: alternatives
|
|
21
|
+
attributes:
|
|
22
|
+
label: Alternatives considered
|
|
23
|
+
description: What else have you tried?
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
id-token: write
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
publish:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
environment: pypi
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
- uses: actions/setup-python@v5
|
|
17
|
+
with:
|
|
18
|
+
python-version: "3.12"
|
|
19
|
+
- run: pip install build
|
|
20
|
+
- run: python -m build
|
|
21
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
name: Tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: ${{ matrix.python-version }}
|
|
20
|
+
- run: pip install -e ".[dev]"
|
|
21
|
+
- run: pytest --tb=short -v
|
|
22
|
+
- run: ruff check .
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Contributor Covenant Code of Conduct
|
|
2
|
+
|
|
3
|
+
## Our Pledge
|
|
4
|
+
|
|
5
|
+
We as members, contributors, and leaders pledge to make participation in our
|
|
6
|
+
community a harassment-free experience for everyone, regardless of age, body
|
|
7
|
+
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
|
8
|
+
identity and expression, level of experience, education, socio-economic status,
|
|
9
|
+
nationality, personal appearance, race, caste, color, religion, or sexual
|
|
10
|
+
identity and orientation.
|
|
11
|
+
|
|
12
|
+
## Our Standards
|
|
13
|
+
|
|
14
|
+
Examples of behavior that contributes to a positive environment:
|
|
15
|
+
|
|
16
|
+
* Using welcoming and inclusive language
|
|
17
|
+
* Being respectful of differing viewpoints and experiences
|
|
18
|
+
* Gracefully accepting constructive criticism
|
|
19
|
+
* Focusing on what is best for the community
|
|
20
|
+
|
|
21
|
+
Examples of unacceptable behavior:
|
|
22
|
+
|
|
23
|
+
* Trolling, insulting or derogatory comments, and personal attacks
|
|
24
|
+
* Public or private harassment
|
|
25
|
+
* Publishing others' private information without explicit permission
|
|
26
|
+
* Other conduct which could reasonably be considered inappropriate
|
|
27
|
+
|
|
28
|
+
## Enforcement
|
|
29
|
+
|
|
30
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
|
31
|
+
reported to **benzsevern@gmail.com**. All complaints will be reviewed and
|
|
32
|
+
investigated promptly and fairly.
|
|
33
|
+
|
|
34
|
+
## Attribution
|
|
35
|
+
|
|
36
|
+
This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 2.1.
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# Contributing to GoldenCheck
|
|
2
|
+
|
|
3
|
+
Thanks for your interest in improving GoldenCheck!
|
|
4
|
+
|
|
5
|
+
## Getting Started
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
git clone https://github.com/benzsevern/goldencheck.git
|
|
9
|
+
cd goldencheck
|
|
10
|
+
pip install -e ".[dev]"
|
|
11
|
+
pytest
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Ways to Contribute
|
|
15
|
+
|
|
16
|
+
- **Bug reports** -- open an issue with reproduction steps
|
|
17
|
+
- **Feature requests** -- describe the problem you're solving
|
|
18
|
+
- **Code** -- fork, branch, PR. All PRs need tests.
|
|
19
|
+
- **Documentation** -- README, docstrings, examples
|
|
20
|
+
|
|
21
|
+
## Development Standards
|
|
22
|
+
|
|
23
|
+
- **Python 3.11+** with type hints
|
|
24
|
+
- **Polars** for all data operations (not pandas)
|
|
25
|
+
- **Ruff** for linting: `ruff check .` (100 char line length)
|
|
26
|
+
- **Pytest** for testing: `pytest --tb=short`
|
|
27
|
+
- **Conventional commits**: `feat:`, `fix:`, `docs:`, `test:`, `chore:`
|
|
28
|
+
|
|
29
|
+
## Architecture
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
goldencheck/
|
|
33
|
+
├── cli/ # Typer CLI entry points
|
|
34
|
+
├── profilers/ # Column-level profilers (one per type)
|
|
35
|
+
├── relations/ # Cross-column profilers
|
|
36
|
+
├── engine/ # Scanner, validator, reader, sampler
|
|
37
|
+
├── config/ # YAML config schema, loader, writer
|
|
38
|
+
├── tui/ # Textual TUI
|
|
39
|
+
├── reporters/ # Rich, JSON, CI output
|
|
40
|
+
└── models/ # Finding, Profile data models
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Each profiler is independent. To add a new one:
|
|
44
|
+
1. Create `goldencheck/profilers/your_profiler.py` extending `BaseProfiler`
|
|
45
|
+
2. Add tests in `tests/profilers/test_your_profiler.py`
|
|
46
|
+
3. Register it in `goldencheck/engine/scanner.py`
|
|
47
|
+
|
|
48
|
+
## Pull Requests
|
|
49
|
+
|
|
50
|
+
1. Fork and create a feature branch
|
|
51
|
+
2. Write tests first (TDD)
|
|
52
|
+
3. Run `pytest` and `ruff check .`
|
|
53
|
+
4. Open a PR with a clear description
|
|
54
|
+
5. One approval required to merge
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 benzsevern
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: goldencheck
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Data validation that discovers rules from your data so you don't have to write them
|
|
5
|
+
Project-URL: Homepage, https://github.com/benzsevern/goldencheck
|
|
6
|
+
Project-URL: Repository, https://github.com/benzsevern/goldencheck
|
|
7
|
+
Project-URL: Issues, https://github.com/benzsevern/goldencheck/issues
|
|
8
|
+
Author-email: Ben Severn <benzsevern@gmail.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: csv,data-checks,data-profiling,data-quality,data-validation,llm,parquet,profiling,tui
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Database
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
22
|
+
Classifier: Typing :: Typed
|
|
23
|
+
Requires-Python: >=3.11
|
|
24
|
+
Requires-Dist: openpyxl>=3.1
|
|
25
|
+
Requires-Dist: polars>=1.0
|
|
26
|
+
Requires-Dist: pydantic>=2.0
|
|
27
|
+
Requires-Dist: pyyaml>=6.0
|
|
28
|
+
Requires-Dist: rich>=13.0
|
|
29
|
+
Requires-Dist: textual>=1.0
|
|
30
|
+
Requires-Dist: typer>=0.12
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
33
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
34
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
35
|
+
Provides-Extra: llm
|
|
36
|
+
Requires-Dist: anthropic>=0.30; extra == 'llm'
|
|
37
|
+
Requires-Dist: openai>=1.30; extra == 'llm'
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
|
|
40
|
+
# GoldenCheck
|
|
41
|
+
|
|
42
|
+
Data validation that discovers rules from your data so you don't have to write them.
|
|
43
|
+
|
|
44
|
+
[PyPI](https://pypi.org/project/goldencheck/)
|
|
45
|
+
[Python 3.11+](https://python.org)
|
|
46
|
+
[GitHub repository](https://github.com/benzsevern/goldencheck)
|
|
47
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
48
|
+
|
|
49
|
+
> Every competitor makes you write rules first. GoldenCheck flips it: **validate first, keep the rules you care about.**
|
|
50
|
+
|
|
51
|
+
## Why GoldenCheck?
|
|
52
|
+
|
|
53
|
+
| | GoldenCheck | Great Expectations | Pandera | Pointblank |
|
|
54
|
+
|---|---|---|---|---|
|
|
55
|
+
| Rules | **Discovered from data** | Written by hand | Written by hand | Written by hand |
|
|
56
|
+
| Config | **Zero to start** | Heavy YAML/Python setup | Decorators/schemas | YAML/Python |
|
|
57
|
+
| Interface | **CLI + interactive TUI** | HTML reports | Exceptions | HTML/notebook |
|
|
58
|
+
| Learning curve | **One command** | Hours/days | Moderate | Moderate |
|
|
59
|
+
| LLM enhancement | **Yes ($0.01/scan)** | No | No | No |
|
|
60
|
+
|
|
61
|
+
## Install
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
pip install goldencheck
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
With LLM boost support:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install goldencheck[llm]
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Quick Start
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# Scan a file — discovers issues, launches interactive TUI
|
|
77
|
+
goldencheck data.csv
|
|
78
|
+
|
|
79
|
+
# CLI-only output (no TUI)
|
|
80
|
+
goldencheck data.csv --no-tui
|
|
81
|
+
|
|
82
|
+
# With LLM enhancement (requires API key)
|
|
83
|
+
goldencheck data.csv --llm-boost --no-tui
|
|
84
|
+
|
|
85
|
+
# Validate against saved rules (for CI/pipelines)
|
|
86
|
+
goldencheck validate data.csv
|
|
87
|
+
|
|
88
|
+
# JSON output for CI integration
|
|
89
|
+
goldencheck data.csv --no-tui --json
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## How It Works
|
|
93
|
+
|
|
94
|
+
```
|
|
95
|
+
1. SCAN → goldencheck data.csv
|
|
96
|
+
GoldenCheck profiles your data and discovers what "healthy" looks like
|
|
97
|
+
|
|
98
|
+
2. REVIEW → Interactive TUI shows findings sorted by severity
|
|
99
|
+
Each finding has: description, affected rows, sample values
|
|
100
|
+
|
|
101
|
+
3. PIN → Press Space to promote findings into permanent rules
|
|
102
|
+
Dismiss false positives — they won't come back
|
|
103
|
+
|
|
104
|
+
4. EXPORT → Press F2 to save rules to goldencheck.yml
|
|
105
|
+
Human-readable YAML with your pinned rules
|
|
106
|
+
|
|
107
|
+
5. VALIDATE → goldencheck validate data.csv
|
|
108
|
+
Enforce rules in CI with exit codes (0 = pass, 1 = fail)
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## What It Detects
|
|
112
|
+
|
|
113
|
+
### Column-Level Profilers
|
|
114
|
+
|
|
115
|
+
| Profiler | What It Catches | Example |
|
|
116
|
+
|----------|----------------|---------|
|
|
117
|
+
| **Type inference** | String columns that are actually numeric | "Column `age` is string but 98% are integer" |
|
|
118
|
+
| **Nullability** | Required vs. optional columns | "0 nulls across 50k rows — likely required" |
|
|
119
|
+
| **Uniqueness** | Primary key candidates, near-duplicates | "100% unique — likely primary key" |
|
|
120
|
+
| **Format detection** | Emails, phones, URLs, dates | "94% email format, 6% malformed" |
|
|
121
|
+
| **Range & distribution** | Outliers, min/max bounds | "3 rows have values >10,000" |
|
|
122
|
+
| **Cardinality** | Low-cardinality enum suggestions | "4 unique values — possible enum" |
|
|
123
|
+
| **Pattern consistency** | Mixed formats within a column | "3 phone formats detected" |
|
|
124
|
+
|
|
125
|
+
### Cross-Column Profilers
|
|
126
|
+
|
|
127
|
+
| Profiler | What It Catches |
|
|
128
|
+
|----------|----------------|
|
|
129
|
+
| **Temporal ordering** | start_date > end_date violations |
|
|
130
|
+
| **Null correlation** | Columns that are null together (e.g., address + city + zip) |
|
|
131
|
+
|
|
132
|
+
## LLM Boost
|
|
133
|
+
|
|
134
|
+
Add `--llm-boost` to enhance profiler findings with LLM intelligence. The LLM receives a representative sample of your data and:
|
|
135
|
+
|
|
136
|
+
1. **Finds issues profilers miss** — semantic understanding (e.g., "12345" in a name column)
|
|
137
|
+
2. **Upgrades severity** — knows "emails should be required" even if the profiler only says "INFO"
|
|
138
|
+
3. **Discovers relationships** — identifies temporal ordering between columns like `signup_date` and `last_login`
|
|
139
|
+
4. **Downgrades false positives** — "mixed phone formats are common, not an error"
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
# Using OpenAI
|
|
143
|
+
export OPENAI_API_KEY=sk-...
|
|
144
|
+
goldencheck data.csv --llm-boost --llm-provider openai --no-tui
|
|
145
|
+
|
|
146
|
+
# Using Anthropic
|
|
147
|
+
export ANTHROPIC_API_KEY=sk-ant-...
|
|
148
|
+
goldencheck data.csv --llm-boost --no-tui
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
**Cost:** ~$0.01 per scan (one API call with representative samples, not per-row).
|
|
152
|
+
|
|
153
|
+
**Budget control:**
|
|
154
|
+
```bash
|
|
155
|
+
export GOLDENCHECK_LLM_BUDGET=0.50 # max spend per scan in USD
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Configuration (goldencheck.yml)
|
|
159
|
+
|
|
160
|
+
```yaml
|
|
161
|
+
version: 1
|
|
162
|
+
|
|
163
|
+
settings:
|
|
164
|
+
sample_size: 100000
|
|
165
|
+
fail_on: error
|
|
166
|
+
|
|
167
|
+
columns:
|
|
168
|
+
email:
|
|
169
|
+
type: string
|
|
170
|
+
required: true
|
|
171
|
+
format: email
|
|
172
|
+
unique: true
|
|
173
|
+
|
|
174
|
+
age:
|
|
175
|
+
type: integer
|
|
176
|
+
range: [0, 120]
|
|
177
|
+
|
|
178
|
+
status:
|
|
179
|
+
type: string
|
|
180
|
+
enum: [active, inactive, pending, closed]
|
|
181
|
+
|
|
182
|
+
relations:
|
|
183
|
+
- type: temporal_order
|
|
184
|
+
columns: [start_date, end_date]
|
|
185
|
+
|
|
186
|
+
ignore:
|
|
187
|
+
- column: notes
|
|
188
|
+
check: nullability
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
Only pinned rules appear in this file — not every finding. The `ignore` list prevents dismissed findings from reappearing.
|
|
192
|
+
|
|
193
|
+
## CLI Reference
|
|
194
|
+
|
|
195
|
+
| Command | Description |
|
|
196
|
+
|---------|-------------|
|
|
197
|
+
| `goldencheck <file>` | Scan and launch TUI |
|
|
198
|
+
| `goldencheck scan <file>` | Explicit scan |
|
|
199
|
+
| `goldencheck validate <file>` | Validate against goldencheck.yml |
|
|
200
|
+
| `goldencheck review <file>` | Scan + validate, launch TUI |
|
|
201
|
+
|
|
202
|
+
### Flags
|
|
203
|
+
|
|
204
|
+
| Flag | Description |
|
|
205
|
+
|------|-------------|
|
|
206
|
+
| `--no-tui` | Print results to console |
|
|
207
|
+
| `--json` | JSON output |
|
|
208
|
+
| `--fail-on <level>` | Exit 1 on severity: `error` or `warning` |
|
|
209
|
+
| `--llm-boost` | Enable LLM enhancement |
|
|
210
|
+
| `--llm-provider <name>` | LLM provider: `anthropic` (default) or `openai` |
|
|
211
|
+
| `--verbose` | Show info-level logs |
|
|
212
|
+
| `--debug` | Show debug-level logs |
|
|
213
|
+
| `--version` | Show version |
|
|
214
|
+
|
|
215
|
+
## Benchmarks
|
|
216
|
+
|
|
217
|
+
### Speed
|
|
218
|
+
|
|
219
|
+
| Dataset | Time | Throughput |
|
|
220
|
+
|---------|------|------------|
|
|
221
|
+
| 1K rows | 0.05s | 19K rows/sec |
|
|
222
|
+
| 10K rows | 0.23s | 43K rows/sec |
|
|
223
|
+
| 100K rows | 2.29s | 44K rows/sec |
|
|
224
|
+
| **1M rows** | **2.07s** | **482K rows/sec** |
|
|
225
|
+
|
|
226
|
+
### Detection Accuracy
|
|
227
|
+
|
|
228
|
+
| Mode | Column Recall | Cost |
|
|
229
|
+
|------|--------------|------|
|
|
230
|
+
| Profiler-only | 87% | $0 |
|
|
231
|
+
| **With LLM Boost** | **100%** | ~$0.01 |
|
|
232
|
+
|
|
233
|
+
Tested on a custom benchmark with 341 planted data quality issues across 9 categories.
|
|
234
|
+
|
|
235
|
+
### Raha Benchmark Datasets
|
|
236
|
+
|
|
237
|
+
| Dataset | Column Recall |
|
|
238
|
+
|---------|--------------|
|
|
239
|
+
| Flights (2,376 rows) | **100%** (4/4 columns) |
|
|
240
|
+
| Beers (2,410 rows) | **80%** (4/5 columns) |
|
|
241
|
+
|
|
242
|
+
## Tech Stack
|
|
243
|
+
|
|
244
|
+
| Dependency | Purpose |
|
|
245
|
+
|-----------|---------|
|
|
246
|
+
| [Polars](https://pola.rs/) | All data operations |
|
|
247
|
+
| [Typer](https://typer.tiangolo.com/) | CLI framework |
|
|
248
|
+
| [Textual](https://textual.textualize.io/) | Interactive TUI |
|
|
249
|
+
| [Rich](https://rich.readthedocs.io/) | CLI output formatting |
|
|
250
|
+
| [Pydantic 2](https://docs.pydantic.dev/) | Config validation |
|
|
251
|
+
|
|
252
|
+
**Optional:** [Anthropic SDK](https://docs.anthropic.com/) / [OpenAI SDK](https://platform.openai.com/) for LLM Boost
|
|
253
|
+
|
|
254
|
+
## Contributing
|
|
255
|
+
|
|
256
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup and guidelines.
|
|
257
|
+
|
|
258
|
+
## License
|
|
259
|
+
|
|
260
|
+
MIT — see [LICENSE](LICENSE)
|
|
261
|
+
|
|
262
|
+
---
|
|
263
|
+
|
|
264
|
+
**From the maker of [GoldenMatch](https://github.com/benzsevern/goldenmatch)** — entity resolution toolkit.
|