goldencheck 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. goldencheck-0.1.0/.github/FUNDING.yml +1 -0
  2. goldencheck-0.1.0/.github/ISSUE_TEMPLATE/bug_report.yml +41 -0
  3. goldencheck-0.1.0/.github/ISSUE_TEMPLATE/config.yml +5 -0
  4. goldencheck-0.1.0/.github/ISSUE_TEMPLATE/feature_request.yml +23 -0
  5. goldencheck-0.1.0/.github/PULL_REQUEST_TEMPLATE.md +17 -0
  6. goldencheck-0.1.0/.github/workflows/publish.yml +21 -0
  7. goldencheck-0.1.0/.github/workflows/test.yml +22 -0
  8. goldencheck-0.1.0/.gitignore +13 -0
  9. goldencheck-0.1.0/CODE_OF_CONDUCT.md +36 -0
  10. goldencheck-0.1.0/CONTRIBUTING.md +54 -0
  11. goldencheck-0.1.0/LICENSE +21 -0
  12. goldencheck-0.1.0/PKG-INFO +264 -0
  13. goldencheck-0.1.0/README.md +225 -0
  14. goldencheck-0.1.0/SECURITY.md +23 -0
  15. goldencheck-0.1.0/benchmarks/datasets/goldencheck_bench/ground_truth.json +539 -0
  16. goldencheck-0.1.0/benchmarks/datasets/raha_repo/.gitignore +160 -0
  17. goldencheck-0.1.0/benchmarks/datasets/raha_repo/LICENSE.md +201 -0
  18. goldencheck-0.1.0/benchmarks/datasets/raha_repo/MANIFEST +178 -0
  19. goldencheck-0.1.0/benchmarks/datasets/raha_repo/MANIFEST.in +3 -0
  20. goldencheck-0.1.0/benchmarks/datasets/raha_repo/README.md +69 -0
  21. goldencheck-0.1.0/benchmarks/datasets/raha_repo/requirements.txt +10 -0
  22. goldencheck-0.1.0/benchmarks/datasets/raha_repo/setup.cfg +2 -0
  23. goldencheck-0.1.0/benchmarks/datasets/raha_repo/setup.py +17 -0
  24. goldencheck-0.1.0/benchmarks/detection_benchmark.py +117 -0
  25. goldencheck-0.1.0/benchmarks/generate_datasets.py +430 -0
  26. goldencheck-0.1.0/benchmarks/goldencheck_benchmark.py +380 -0
  27. goldencheck-0.1.0/benchmarks/goldencheck_benchmark_llm.py +193 -0
  28. goldencheck-0.1.0/benchmarks/speed_benchmark.py +113 -0
  29. goldencheck-0.1.0/docs/superpowers/plans/2026-03-22-goldencheck-implementation.md +2003 -0
  30. goldencheck-0.1.0/docs/superpowers/plans/2026-03-23-llm-boost-implementation.md +996 -0
  31. goldencheck-0.1.0/docs/superpowers/specs/2026-03-22-goldencheck-design.md +356 -0
  32. goldencheck-0.1.0/docs/superpowers/specs/2026-03-23-llm-boost-design.md +342 -0
  33. goldencheck-0.1.0/docs/wiki/Architecture.md +217 -0
  34. goldencheck-0.1.0/docs/wiki/Benchmarks.md +143 -0
  35. goldencheck-0.1.0/docs/wiki/CLI.md +187 -0
  36. goldencheck-0.1.0/docs/wiki/Configuration.md +189 -0
  37. goldencheck-0.1.0/docs/wiki/Installation.md +105 -0
  38. goldencheck-0.1.0/docs/wiki/Interactive-TUI.md +144 -0
  39. goldencheck-0.1.0/docs/wiki/LLM-Boost.md +169 -0
  40. goldencheck-0.1.0/docs/wiki/Profilers.md +310 -0
  41. goldencheck-0.1.0/goldencheck/__init__.py +3 -0
  42. goldencheck-0.1.0/goldencheck/cli/__init__.py +0 -0
  43. goldencheck-0.1.0/goldencheck/cli/main.py +212 -0
  44. goldencheck-0.1.0/goldencheck/config/__init__.py +0 -0
  45. goldencheck-0.1.0/goldencheck/config/loader.py +21 -0
  46. goldencheck-0.1.0/goldencheck/config/schema.py +33 -0
  47. goldencheck-0.1.0/goldencheck/config/writer.py +16 -0
  48. goldencheck-0.1.0/goldencheck/engine/__init__.py +0 -0
  49. goldencheck-0.1.0/goldencheck/engine/reader.py +41 -0
  50. goldencheck-0.1.0/goldencheck/engine/sampler.py +8 -0
  51. goldencheck-0.1.0/goldencheck/engine/scanner.py +142 -0
  52. goldencheck-0.1.0/goldencheck/engine/validator.py +94 -0
  53. goldencheck-0.1.0/goldencheck/llm/__init__.py +0 -0
  54. goldencheck-0.1.0/goldencheck/llm/budget.py +70 -0
  55. goldencheck-0.1.0/goldencheck/llm/merger.py +87 -0
  56. goldencheck-0.1.0/goldencheck/llm/parser.py +27 -0
  57. goldencheck-0.1.0/goldencheck/llm/prompts.py +62 -0
  58. goldencheck-0.1.0/goldencheck/llm/providers.py +64 -0
  59. goldencheck-0.1.0/goldencheck/llm/sample_block.py +92 -0
  60. goldencheck-0.1.0/goldencheck/models/__init__.py +0 -0
  61. goldencheck-0.1.0/goldencheck/models/finding.py +21 -0
  62. goldencheck-0.1.0/goldencheck/models/profile.py +52 -0
  63. goldencheck-0.1.0/goldencheck/profilers/__init__.py +0 -0
  64. goldencheck-0.1.0/goldencheck/profilers/base.py +10 -0
  65. goldencheck-0.1.0/goldencheck/profilers/cardinality.py +41 -0
  66. goldencheck-0.1.0/goldencheck/profilers/format_detection.py +60 -0
  67. goldencheck-0.1.0/goldencheck/profilers/nullability.py +23 -0
  68. goldencheck-0.1.0/goldencheck/profilers/pattern_consistency.py +75 -0
  69. goldencheck-0.1.0/goldencheck/profilers/range_distribution.py +56 -0
  70. goldencheck-0.1.0/goldencheck/profilers/type_inference.py +31 -0
  71. goldencheck-0.1.0/goldencheck/profilers/uniqueness.py +24 -0
  72. goldencheck-0.1.0/goldencheck/relations/__init__.py +0 -0
  73. goldencheck-0.1.0/goldencheck/relations/null_correlation.py +64 -0
  74. goldencheck-0.1.0/goldencheck/relations/temporal.py +92 -0
  75. goldencheck-0.1.0/goldencheck/reporters/__init__.py +0 -0
  76. goldencheck-0.1.0/goldencheck/reporters/ci_reporter.py +13 -0
  77. goldencheck-0.1.0/goldencheck/reporters/json_reporter.py +36 -0
  78. goldencheck-0.1.0/goldencheck/reporters/rich_console.py +31 -0
  79. goldencheck-0.1.0/goldencheck/tui/__init__.py +0 -0
  80. goldencheck-0.1.0/goldencheck/tui/app.py +76 -0
  81. goldencheck-0.1.0/goldencheck/tui/column_detail.py +40 -0
  82. goldencheck-0.1.0/goldencheck/tui/findings.py +30 -0
  83. goldencheck-0.1.0/goldencheck/tui/overview.py +29 -0
  84. goldencheck-0.1.0/goldencheck/tui/rules.py +23 -0
  85. goldencheck-0.1.0/pyproject.toml +60 -0
  86. goldencheck-0.1.0/tests/__init__.py +0 -0
  87. goldencheck-0.1.0/tests/cli/__init__.py +0 -0
  88. goldencheck-0.1.0/tests/cli/test_cli.py +33 -0
  89. goldencheck-0.1.0/tests/config/__init__.py +0 -0
  90. goldencheck-0.1.0/tests/config/test_loader.py +30 -0
  91. goldencheck-0.1.0/tests/config/test_schema.py +34 -0
  92. goldencheck-0.1.0/tests/engine/__init__.py +0 -0
  93. goldencheck-0.1.0/tests/engine/test_reader.py +26 -0
  94. goldencheck-0.1.0/tests/engine/test_sampler.py +17 -0
  95. goldencheck-0.1.0/tests/engine/test_scanner.py +22 -0
  96. goldencheck-0.1.0/tests/engine/test_validator.py +34 -0
  97. goldencheck-0.1.0/tests/llm/__init__.py +0 -0
  98. goldencheck-0.1.0/tests/llm/test_budget.py +36 -0
  99. goldencheck-0.1.0/tests/llm/test_integration.py +27 -0
  100. goldencheck-0.1.0/tests/llm/test_merger.py +72 -0
  101. goldencheck-0.1.0/tests/llm/test_parser.py +26 -0
  102. goldencheck-0.1.0/tests/llm/test_prompts.py +30 -0
  103. goldencheck-0.1.0/tests/llm/test_sample_block.py +44 -0
  104. goldencheck-0.1.0/tests/models/__init__.py +0 -0
  105. goldencheck-0.1.0/tests/models/test_finding.py +34 -0
  106. goldencheck-0.1.0/tests/models/test_profile.py +45 -0
  107. goldencheck-0.1.0/tests/profilers/__init__.py +0 -0
  108. goldencheck-0.1.0/tests/profilers/test_cardinality.py +12 -0
  109. goldencheck-0.1.0/tests/profilers/test_format_detection.py +19 -0
  110. goldencheck-0.1.0/tests/profilers/test_nullability.py +18 -0
  111. goldencheck-0.1.0/tests/profilers/test_pattern_consistency.py +14 -0
  112. goldencheck-0.1.0/tests/profilers/test_range_distribution.py +20 -0
  113. goldencheck-0.1.0/tests/profilers/test_type_inference.py +21 -0
  114. goldencheck-0.1.0/tests/profilers/test_uniqueness.py +19 -0
  115. goldencheck-0.1.0/tests/relations/__init__.py +0 -0
  116. goldencheck-0.1.0/tests/relations/test_null_correlation.py +21 -0
  117. goldencheck-0.1.0/tests/relations/test_temporal.py +25 -0
  118. goldencheck-0.1.0/tests/reporters/__init__.py +0 -0
  119. goldencheck-0.1.0/tests/reporters/test_reporters.py +49 -0
  120. goldencheck-0.1.0/tests/test_integration.py +71 -0
@@ -0,0 +1 @@
1
+ github: benzsevern
@@ -0,0 +1,41 @@
1
+ name: Bug Report
2
+ description: Report a bug in GoldenCheck
3
+ labels: ["bug"]
4
+ body:
5
+ - type: textarea
6
+ id: description
7
+ attributes:
8
+ label: Describe the bug
9
+ description: A clear description of what happened
10
+ validations:
11
+ required: true
12
+ - type: textarea
13
+ id: reproduce
14
+ attributes:
15
+ label: Steps to reproduce
16
+ description: |
17
+ 1. Run `goldencheck ...`
18
+ 2. See error
19
+ validations:
20
+ required: true
21
+ - type: textarea
22
+ id: expected
23
+ attributes:
24
+ label: Expected behavior
25
+ description: What should have happened
26
+ validations:
27
+ required: true
28
+ - type: input
29
+ id: version
30
+ attributes:
31
+ label: GoldenCheck version
32
+ placeholder: "0.1.0"
33
+ validations:
34
+ required: true
35
+ - type: input
36
+ id: python
37
+ attributes:
38
+ label: Python version
39
+ placeholder: "3.12"
40
+ validations:
41
+ required: true
@@ -0,0 +1,5 @@
1
+ blank_issues_enabled: true
2
+ contact_links:
3
+ - name: Discussions
4
+ url: https://github.com/benzsevern/goldencheck/discussions
5
+ about: Ask questions and share ideas
@@ -0,0 +1,23 @@
1
+ name: Feature Request
2
+ description: Suggest a new feature
3
+ labels: ["enhancement"]
4
+ body:
5
+ - type: textarea
6
+ id: problem
7
+ attributes:
8
+ label: Problem
9
+ description: What problem are you trying to solve?
10
+ validations:
11
+ required: true
12
+ - type: textarea
13
+ id: solution
14
+ attributes:
15
+ label: Proposed solution
16
+ description: How would you like this to work?
17
+ validations:
18
+ required: true
19
+ - type: textarea
20
+ id: alternatives
21
+ attributes:
22
+ label: Alternatives considered
23
+ description: What else have you tried?
@@ -0,0 +1,17 @@
1
+ ## What
2
+
3
+ Brief description of the change.
4
+
5
+ ## Why
6
+
7
+ What problem does this solve?
8
+
9
+ ## How
10
+
11
+ Key implementation details.
12
+
13
+ ## Testing
14
+
15
+ - [ ] Tests added/updated
16
+ - [ ] `pytest` passes
17
+ - [ ] `ruff check .` passes
@@ -0,0 +1,21 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ permissions:
8
+ id-token: write
9
+
10
+ jobs:
11
+ publish:
12
+ runs-on: ubuntu-latest
13
+ environment: pypi
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ - uses: actions/setup-python@v5
17
+ with:
18
+ python-version: "3.12"
19
+ - run: pip install build
20
+ - run: python -m build
21
+ - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,22 @@
1
+ name: Tests
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.11", "3.12", "3.13"]
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ - uses: actions/setup-python@v5
18
+ with:
19
+ python-version: ${{ matrix.python-version }}
20
+ - run: pip install -e ".[dev]"
21
+ - run: pytest --tb=short -v
22
+ - run: ruff check .
@@ -0,0 +1,13 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .venv/
7
+ .env
8
+ *.csv
9
+ *.parquet
10
+ *.xlsx
11
+ goldencheck.yml
12
+ .ruff_cache/
13
+ .testing/
@@ -0,0 +1,36 @@
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ We as members, contributors, and leaders pledge to make participation in our
6
+ community a harassment-free experience for everyone, regardless of age, body
7
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
8
+ identity and expression, level of experience, education, socio-economic status,
9
+ nationality, personal appearance, race, caste, color, religion, or sexual
10
+ identity and orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to a positive environment:
15
+
16
+ * Using welcoming and inclusive language
17
+ * Being respectful of differing viewpoints and experiences
18
+ * Gracefully accepting constructive criticism
19
+ * Focusing on what is best for the community
20
+
21
+ Examples of unacceptable behavior:
22
+
23
+ * Trolling, insulting or derogatory comments, and personal attacks
24
+ * Public or private harassment
25
+ * Publishing others' private information without explicit permission
26
+ * Other conduct which could reasonably be considered inappropriate
27
+
28
+ ## Enforcement
29
+
30
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
31
+ reported to **benzsevern@gmail.com**. All complaints will be reviewed and
32
+ investigated promptly and fairly.
33
+
34
+ ## Attribution
35
+
36
+ This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 2.1.
@@ -0,0 +1,54 @@
1
+ # Contributing to GoldenCheck
2
+
3
+ Thanks for your interest in improving GoldenCheck!
4
+
5
+ ## Getting Started
6
+
7
+ ```bash
8
+ git clone https://github.com/benzsevern/goldencheck.git
9
+ cd goldencheck
10
+ pip install -e ".[dev]"
11
+ pytest
12
+ ```
13
+
14
+ ## Ways to Contribute
15
+
16
+ - **Bug reports** -- open an issue with reproduction steps
17
+ - **Feature requests** -- describe the problem you're solving
18
+ - **Code** -- fork, branch, PR. All PRs need tests.
19
+ - **Documentation** -- README, docstrings, examples
20
+
21
+ ## Development Standards
22
+
23
+ - **Python 3.11+** with type hints
24
+ - **Polars** for all data operations (not pandas)
25
+ - **Ruff** for linting: `ruff check .` (100-character line length)
26
+ - **Pytest** for testing: `pytest --tb=short`
27
+ - **Conventional commits**: `feat:`, `fix:`, `docs:`, `test:`, `chore:`
28
+
29
+ ## Architecture
30
+
31
+ ```
32
+ goldencheck/
33
+ ├── cli/ # Typer CLI entry points
34
+ ├── profilers/ # Column-level profilers (one per type)
35
+ ├── relations/ # Cross-column profilers
36
+ ├── engine/ # Scanner, validator, reader, sampler
37
+ ├── config/ # YAML config schema, loader, writer
38
+ ├── tui/ # Textual TUI
39
+ ├── reporters/ # Rich, JSON, CI output
40
+ └── models/ # Finding, Profile data models
41
+ ```
42
+
43
+ Each profiler is independent. To add a new one:
44
+ 1. Create `goldencheck/profilers/your_profiler.py` extending `BaseProfiler`
45
+ 2. Add tests in `tests/profilers/test_your_profiler.py`
46
+ 3. Register it in `goldencheck/engine/scanner.py`
47
+
48
+ ## Pull Requests
49
+
50
+ 1. Fork and create a feature branch
51
+ 2. Write tests first (TDD)
52
+ 3. Run `pytest` and `ruff check .`
53
+ 4. Open a PR with a clear description
54
+ 5. One approval required to merge
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 benzsevern
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,264 @@
1
+ Metadata-Version: 2.4
2
+ Name: goldencheck
3
+ Version: 0.1.0
4
+ Summary: Data validation that discovers rules from your data so you don't have to write them
5
+ Project-URL: Homepage, https://github.com/benzsevern/goldencheck
6
+ Project-URL: Repository, https://github.com/benzsevern/goldencheck
7
+ Project-URL: Issues, https://github.com/benzsevern/goldencheck/issues
8
+ Author-email: Ben Severn <benzsevern@gmail.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: csv,data-checks,data-profiling,data-quality,data-validation,llm,parquet,profiling,tui
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Database
21
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
22
+ Classifier: Typing :: Typed
23
+ Requires-Python: >=3.11
24
+ Requires-Dist: openpyxl>=3.1
25
+ Requires-Dist: polars>=1.0
26
+ Requires-Dist: pydantic>=2.0
27
+ Requires-Dist: pyyaml>=6.0
28
+ Requires-Dist: rich>=13.0
29
+ Requires-Dist: textual>=1.0
30
+ Requires-Dist: typer>=0.12
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
33
+ Requires-Dist: pytest>=8.0; extra == 'dev'
34
+ Requires-Dist: ruff>=0.4; extra == 'dev'
35
+ Provides-Extra: llm
36
+ Requires-Dist: anthropic>=0.30; extra == 'llm'
37
+ Requires-Dist: openai>=1.30; extra == 'llm'
38
+ Description-Content-Type: text/markdown
39
+
40
+ # GoldenCheck
41
+
42
+ Data validation that discovers rules from your data so you don't have to write them.
43
+
44
+ [![PyPI](https://img.shields.io/pypi/v/goldencheck)](https://pypi.org/project/goldencheck/)
45
+ [![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://python.org)
46
+ [![Tests](https://img.shields.io/badge/tests-103%20passing-brightgreen)](https://github.com/benzsevern/goldencheck)
47
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
48
+
49
+ > Every competitor makes you write rules first. GoldenCheck flips it: **validate first, keep the rules you care about.**
50
+
51
+ ## Why GoldenCheck?
52
+
53
+ | | GoldenCheck | Great Expectations | Pandera | Pointblank |
54
+ |---|---|---|---|---|
55
+ | Rules | **Discovered from data** | Written by hand | Written by hand | Written by hand |
56
+ | Config | **Zero to start** | Heavy YAML/Python setup | Decorators/schemas | YAML/Python |
57
+ | Interface | **CLI + interactive TUI** | HTML reports | Exceptions | HTML/notebook |
58
+ | Learning curve | **One command** | Hours/days | Moderate | Moderate |
59
+ | LLM enhancement | **Yes ($0.01/scan)** | No | No | No |
60
+
61
+ ## Install
62
+
63
+ ```bash
64
+ pip install goldencheck
65
+ ```
66
+
67
+ With LLM boost support:
68
+
69
+ ```bash
70
+ pip install goldencheck[llm]
71
+ ```
72
+
73
+ ## Quick Start
74
+
75
+ ```bash
76
+ # Scan a file — discovers issues, launches interactive TUI
77
+ goldencheck data.csv
78
+
79
+ # CLI-only output (no TUI)
80
+ goldencheck data.csv --no-tui
81
+
82
+ # With LLM enhancement (requires API key)
83
+ goldencheck data.csv --llm-boost --no-tui
84
+
85
+ # Validate against saved rules (for CI/pipelines)
86
+ goldencheck validate data.csv
87
+
88
+ # JSON output for CI integration
89
+ goldencheck data.csv --no-tui --json
90
+ ```
91
+
92
+ ## How It Works
93
+
94
+ ```
95
+ 1. SCAN → goldencheck data.csv
96
+ GoldenCheck profiles your data and discovers what "healthy" looks like
97
+
98
+ 2. REVIEW → Interactive TUI shows findings sorted by severity
99
+ Each finding has: description, affected rows, sample values
100
+
101
+ 3. PIN → Press Space to promote findings into permanent rules
102
+ Dismiss false positives — they won't come back
103
+
104
+ 4. EXPORT → Press F2 to save rules to goldencheck.yml
105
+ Human-readable YAML with your pinned rules
106
+
107
+ 5. VALIDATE → goldencheck validate data.csv
108
+ Enforce rules in CI with exit codes (0 = pass, 1 = fail)
109
+ ```
110
+
111
+ ## What It Detects
112
+
113
+ ### Column-Level Profilers
114
+
115
+ | Profiler | What It Catches | Example |
116
+ |----------|----------------|---------|
117
+ | **Type inference** | String columns that are actually numeric | "Column `age` is string but 98% are integer" |
118
+ | **Nullability** | Required vs. optional columns | "0 nulls across 50k rows — likely required" |
119
+ | **Uniqueness** | Primary key candidates, near-duplicates | "100% unique — likely primary key" |
120
+ | **Format detection** | Emails, phones, URLs, dates | "94% email format, 6% malformed" |
121
+ | **Range & distribution** | Outliers, min/max bounds | "3 rows have values >10,000" |
122
+ | **Cardinality** | Low-cardinality enum suggestions | "4 unique values — possible enum" |
123
+ | **Pattern consistency** | Mixed formats within a column | "3 phone formats detected" |
124
+
125
+ ### Cross-Column Profilers
126
+
127
+ | Profiler | What It Catches |
128
+ |----------|----------------|
129
+ | **Temporal ordering** | start_date > end_date violations |
130
+ | **Null correlation** | Columns that are null together (e.g., address + city + zip) |
131
+
132
+ ## LLM Boost
133
+
134
+ Add `--llm-boost` to enhance profiler findings with LLM intelligence. The LLM receives a representative sample of your data and:
135
+
136
+ 1. **Finds issues profilers miss** — semantic understanding (e.g., "12345" in a name column)
137
+ 2. **Upgrades severity** — knows "emails should be required" even if the profiler only says "INFO"
138
+ 3. **Discovers relationships** — identifies temporal ordering between columns like `signup_date` and `last_login`
139
+ 4. **Downgrades false positives** — "mixed phone formats are common, not an error"
140
+
141
+ ```bash
142
+ # Using OpenAI
143
+ export OPENAI_API_KEY=sk-...
144
+ goldencheck data.csv --llm-boost --llm-provider openai --no-tui
145
+
146
+ # Using Anthropic
147
+ export ANTHROPIC_API_KEY=sk-ant-...
148
+ goldencheck data.csv --llm-boost --no-tui
149
+ ```
150
+
151
+ **Cost:** ~$0.01 per scan (one API call with representative samples, not per-row).
152
+
153
+ **Budget control:**
154
+ ```bash
155
+ export GOLDENCHECK_LLM_BUDGET=0.50 # max spend per scan in USD
156
+ ```
157
+
158
+ ## Configuration (goldencheck.yml)
159
+
160
+ ```yaml
161
+ version: 1
162
+
163
+ settings:
164
+ sample_size: 100000
165
+ fail_on: error
166
+
167
+ columns:
168
+ email:
169
+ type: string
170
+ required: true
171
+ format: email
172
+ unique: true
173
+
174
+ age:
175
+ type: integer
176
+ range: [0, 120]
177
+
178
+ status:
179
+ type: string
180
+ enum: [active, inactive, pending, closed]
181
+
182
+ relations:
183
+ - type: temporal_order
184
+ columns: [start_date, end_date]
185
+
186
+ ignore:
187
+ - column: notes
188
+ check: nullability
189
+ ```
190
+
191
+ Only pinned rules appear in this file — not every finding. The `ignore` list prevents dismissed findings from reappearing.
192
+
193
+ ## CLI Reference
194
+
195
+ | Command | Description |
196
+ |---------|-------------|
197
+ | `goldencheck <file>` | Scan and launch TUI |
198
+ | `goldencheck scan <file>` | Explicit scan |
199
+ | `goldencheck validate <file>` | Validate against goldencheck.yml |
200
+ | `goldencheck review <file>` | Scan + validate, launch TUI |
201
+
202
+ ### Flags
203
+
204
+ | Flag | Description |
205
+ |------|-------------|
206
+ | `--no-tui` | Print results to console |
207
+ | `--json` | JSON output |
208
+ | `--fail-on <level>` | Severity level that causes exit code 1: `error` or `warning` |
209
+ | `--llm-boost` | Enable LLM enhancement |
210
+ | `--llm-provider <name>` | LLM provider: `anthropic` (default) or `openai` |
211
+ | `--verbose` | Show info-level logs |
212
+ | `--debug` | Show debug-level logs |
213
+ | `--version` | Show version |
214
+
215
+ ## Benchmarks
216
+
217
+ ### Speed
218
+
219
+ | Dataset | Time | Throughput |
220
+ |---------|------|------------|
221
+ | 1K rows | 0.05s | 19K rows/sec |
222
+ | 10K rows | 0.23s | 43K rows/sec |
223
+ | 100K rows | 2.29s | 44K rows/sec |
224
+ | **1M rows** | **2.07s** | **482K rows/sec** |
225
+
226
+ ### Detection Accuracy
227
+
228
+ | Mode | Column Recall | Cost |
229
+ |------|--------------|------|
230
+ | Profiler-only | 87% | $0 |
231
+ | **With LLM Boost** | **100%** | ~$0.01 |
232
+
233
+ Tested on a custom benchmark with 341 planted data quality issues across 9 categories.
234
+
235
+ ### Raha Benchmark Datasets
236
+
237
+ | Dataset | Column Recall |
238
+ |---------|--------------|
239
+ | Flights (2,376 rows) | **100%** (4/4 columns) |
240
+ | Beers (2,410 rows) | **80%** (4/5 columns) |
241
+
242
+ ## Tech Stack
243
+
244
+ | Dependency | Purpose |
245
+ |-----------|---------|
246
+ | [Polars](https://pola.rs/) | All data operations |
247
+ | [Typer](https://typer.tiangolo.com/) | CLI framework |
248
+ | [Textual](https://textual.textualize.io/) | Interactive TUI |
249
+ | [Rich](https://rich.readthedocs.io/) | CLI output formatting |
250
+ | [Pydantic 2](https://docs.pydantic.dev/) | Config validation |
251
+
252
+ **Optional:** [Anthropic SDK](https://docs.anthropic.com/) / [OpenAI SDK](https://platform.openai.com/) for LLM Boost
253
+
254
+ ## Contributing
255
+
256
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup and guidelines.
257
+
258
+ ## License
259
+
260
+ MIT — see [LICENSE](LICENSE)
261
+
262
+ ---
263
+
264
+ **From the maker of [GoldenMatch](https://github.com/benzsevern/goldenmatch)** — entity resolution toolkit.