honest-eda 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- honest_eda-0.1.0/LICENSE +21 -0
- honest_eda-0.1.0/PKG-INFO +174 -0
- honest_eda-0.1.0/README.md +143 -0
- honest_eda-0.1.0/honest_eda/__init__.py +8 -0
- honest_eda-0.1.0/honest_eda/cli.py +60 -0
- honest_eda-0.1.0/honest_eda/leakage.py +22 -0
- honest_eda-0.1.0/honest_eda/profile.py +46 -0
- honest_eda-0.1.0/honest_eda/report/__init__.py +5 -0
- honest_eda-0.1.0/honest_eda/report/report.py +146 -0
- honest_eda-0.1.0/honest_eda/runner.py +124 -0
- honest_eda-0.1.0/honest_eda.egg-info/PKG-INFO +174 -0
- honest_eda-0.1.0/honest_eda.egg-info/SOURCES.txt +21 -0
- honest_eda-0.1.0/honest_eda.egg-info/dependency_links.txt +1 -0
- honest_eda-0.1.0/honest_eda.egg-info/entry_points.txt +2 -0
- honest_eda-0.1.0/honest_eda.egg-info/requires.txt +9 -0
- honest_eda-0.1.0/honest_eda.egg-info/top_level.txt +1 -0
- honest_eda-0.1.0/pyproject.toml +49 -0
- honest_eda-0.1.0/setup.cfg +4 -0
- honest_eda-0.1.0/tests/test_cli.py +56 -0
- honest_eda-0.1.0/tests/test_leakage.py +47 -0
- honest_eda-0.1.0/tests/test_profile.py +50 -0
- honest_eda-0.1.0/tests/test_report.py +99 -0
- honest_eda-0.1.0/tests/test_runner.py +48 -0
honest_eda-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 honest-eda contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: honest-eda
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: EDA that admits when there's no signal — wraps nullbic ΔBIC falsification
|
|
5
|
+
Author: honest-eda contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/glogwa68/honest-eda
|
|
8
|
+
Project-URL: Issues, https://github.com/glogwa68/honest-eda/issues
|
|
9
|
+
Project-URL: Related, https://github.com/glogwa68/nullbic
|
|
10
|
+
Keywords: eda,exploratory-data-analysis,symbolic-regression,nullbic,falsification,data-leakage
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: nullbic
|
|
23
|
+
Requires-Dist: pandas>=2.0
|
|
24
|
+
Requires-Dist: jinja2>=3.0
|
|
25
|
+
Requires-Dist: plotly>=5.0
|
|
26
|
+
Requires-Dist: typer>=0.12
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# honest-eda
|
|
33
|
+
|
|
34
|
+
**EDA that admits when there's no signal.**
|
|
35
|
+
|
|
36
|
+
[](https://pypi.org/project/honest-eda/)
|
|
37
|
+
[](https://pypi.org/project/honest-eda/)
|
|
38
|
+
[](LICENSE)
|
|
39
|
+
[](https://github.com/glogwa68/honest-eda/actions)
|
|
40
|
+
|
|
41
|
+
> ydata-profiling shows you 200 correlations. 195 are noise. honest-eda shows you the 5 that survive shuffling.
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## The Problem
|
|
46
|
+
|
|
47
|
+
pandas-profiling, ydata-profiling, and sweetviz report everything. Every correlation, every association, every distribution shift. The result: 200 "insights" in your report, of which 195 are pure chance — artifacts of finite sample size, collinearity, or subtle target leakage.
|
|
48
|
+
|
|
49
|
+
You end up fitting models on noise, wasting compute on AutoML pipelines that have nothing real to learn, and shipping features that degrade on new data.
|
|
50
|
+
|
|
51
|
+
## The Solution
|
|
52
|
+
|
|
53
|
+
`honest-eda` runs a falsification test on every feature-target pair using [nullbic](https://github.com/glogwa68/nullbic) — a symbolic regression library with built-in self-falsification via ΔBIC.
|
|
54
|
+
|
|
55
|
+
For each relation, the verdict is computed against three null hypotheses:
|
|
56
|
+
|
|
57
|
+
- vs. constant model
|
|
58
|
+
- vs. linear model
|
|
59
|
+
- vs. target-shuffled distribution
|
|
60
|
+
|
|
61
|
+
Relations that survive all three are reported as STRONG; relations that beat only the constant model are reported as WEAK. Everything else is counted as noise and hidden.
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## Install
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install honest-eda
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## Usage
|
|
74
|
+
|
|
75
|
+
### Python
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from honest_eda import profile, check
|
|
79
|
+
|
|
80
|
+
# Generate HTML report — only real signals shown
|
|
81
|
+
result = profile("data.csv", target="y", output="report.html")
|
|
82
|
+
print(f"Real signal patterns: {len(result.strong)}")
|
|
83
|
+
print(f"Fake correlations rejected: {result.noise_count}")
|
|
84
|
+
print(f"Leakage suspects: {len(result.leakage_suspects)}")
|
|
85
|
+
|
|
86
|
+
# CI mode — exit 1 if no real signal
|
|
87
|
+
if not check("data.csv", target="y", min_strong=1):
|
|
88
|
+
raise ValueError("No real signal in this dataset!")
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### CLI
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
honest-eda profile data.csv --target=y --output=report.html
|
|
95
|
+
honest-eda check data.csv --target=y --min-strong=2 # exit 1 if fails
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## What You Get
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
HONEST EDA REPORT
|
|
104
|
+
─────────────────
|
|
105
|
+
Columns scanned: 47
|
|
106
|
+
Relations tested: 1081
|
|
107
|
+
|
|
108
|
+
REAL signal (STRONG): 6
|
|
109
|
+
• age × tenure → churn
|
|
110
|
+
• monthly_charges → churn
|
|
111
|
+
|
|
112
|
+
WEAK signal: 12
|
|
113
|
+
NOISE rejected: 1063 (hidden)
|
|
114
|
+
|
|
115
|
+
Linear-baseline-only features: 23
|
|
116
|
+
Leakage suspects: 2 ⚠
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
No noise, no false confidence. Only findings that hold up under falsification.
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Killer Features
|
|
124
|
+
|
|
125
|
+
**Leakage detector** — When a feature's z-score vs. shuffled target drops below −15, honest-eda flags it as a probable data leak. Catches target-encoded columns, future-information leaks, and accidental label copies before they corrupt your model evaluation.
|
|
126
|
+
|
|
127
|
+
**Linear vs. symbolic dichotomy** — honest-eda tells you explicitly when a linear model would suffice. If symbolic regression finds no improvement over OLS, the feature is labeled "linear-baseline-only". No need to run a neural net to discover this.
|
|
128
|
+
|
|
129
|
+
**CI mode** — `honest-eda check` exits with code 1 if the minimum number of strong signals is not met. Drop it in your CI pipeline to block training runs on datasets with no real predictive content.
|
|
130
|
+
|
|
131
|
+
**Pre-modeling triage** — Know whether there is exploitable signal before you launch XGBoost or AutoML. Saves hours of compute and avoids the "model trained fine but generalizes to nothing" postmortem.
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## How It Works
|
|
136
|
+
|
|
137
|
+
For each numeric feature paired with the target:
|
|
138
|
+
|
|
139
|
+
1. `nullbic.discover` fits a symbolic expression and records the BIC improvement over the null model.
|
|
140
|
+
2. The verdict is assigned:
|
|
141
|
+
- **STRONG** — beats constant + linear baseline + all shuffled-target permutations.
|
|
142
|
+
- **WEAK** — beats constant baseline only.
|
|
143
|
+
- **NOISE** — fails to beat the constant. Excluded from the report.
|
|
144
|
+
3. Leakage is flagged when `z_vs_shuffled < −15`, indicating the feature carries near-perfect information about the target.
|
|
145
|
+
|
|
146
|
+
The HTML report contains only STRONG and WEAK relations. NOISE is counted and disclosed in the summary, but not displayed.
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## Comparison
|
|
151
|
+
|
|
152
|
+
| | honest-eda | ydata-profiling | sweetviz |
|
|
153
|
+
|-------------------------------|:----------:|:---------------:|:--------:|
|
|
154
|
+
| Tests vs shuffled target | ✅ | ❌ | ❌ |
|
|
155
|
+
| Reports only real signal | ✅ | ❌ | ❌ |
|
|
156
|
+
| Symbolic formula extraction | ✅ | ❌ | ❌ |
|
|
157
|
+
| Leakage detection | ✅ | ❌ | ❌ |
|
|
158
|
+
| CI mode (exit code) | ✅ | ❌ | ❌ |
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## License
|
|
163
|
+
|
|
164
|
+
MIT. See [LICENSE](LICENSE).
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Citation / Related
|
|
169
|
+
|
|
170
|
+
honest-eda is built on top of **nullbic**, a library for symbolic regression with automatic ΔBIC falsification:
|
|
171
|
+
|
|
172
|
+
- Repository: https://github.com/glogwa68/nullbic
|
|
173
|
+
|
|
174
|
+
If you use honest-eda in published work, please also cite nullbic.
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# honest-eda
|
|
2
|
+
|
|
3
|
+
**EDA that admits when there's no signal.**
|
|
4
|
+
|
|
5
|
+
[](https://pypi.org/project/honest-eda/)
|
|
6
|
+
[](https://pypi.org/project/honest-eda/)
|
|
7
|
+
[](LICENSE)
|
|
8
|
+
[](https://github.com/glogwa68/honest-eda/actions)
|
|
9
|
+
|
|
10
|
+
> ydata-profiling shows you 200 correlations. 195 are noise. honest-eda shows you the 5 that survive shuffling.
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## The Problem
|
|
15
|
+
|
|
16
|
+
pandas-profiling, ydata-profiling, and sweetviz report everything. Every correlation, every association, every distribution shift. The result: 200 "insights" in your report, of which 195 are pure chance — artifacts of finite sample size, collinearity, or subtle target leakage.
|
|
17
|
+
|
|
18
|
+
You end up fitting models on noise, wasting compute on AutoML pipelines that have nothing real to learn, and shipping features that degrade on new data.
|
|
19
|
+
|
|
20
|
+
## The Solution
|
|
21
|
+
|
|
22
|
+
`honest-eda` runs a falsification test on every feature-target pair using [nullbic](https://github.com/glogwa68/nullbic) — a symbolic regression library with built-in self-falsification via ΔBIC.
|
|
23
|
+
|
|
24
|
+
For each relation, the verdict is computed against three null hypotheses:
|
|
25
|
+
|
|
26
|
+
- vs. constant model
|
|
27
|
+
- vs. linear model
|
|
28
|
+
- vs. target-shuffled distribution
|
|
29
|
+
|
|
30
|
+
Relations that survive all three are reported as STRONG; relations that beat only the constant model are reported as WEAK. Everything else is counted as noise and hidden.
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## Install
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install honest-eda
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## Usage
|
|
43
|
+
|
|
44
|
+
### Python
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from honest_eda import profile, check
|
|
48
|
+
|
|
49
|
+
# Generate HTML report — only real signals shown
|
|
50
|
+
result = profile("data.csv", target="y", output="report.html")
|
|
51
|
+
print(f"Real signal patterns: {len(result.strong)}")
|
|
52
|
+
print(f"Fake correlations rejected: {result.noise_count}")
|
|
53
|
+
print(f"Leakage suspects: {len(result.leakage_suspects)}")
|
|
54
|
+
|
|
55
|
+
# CI mode — exit 1 if no real signal
|
|
56
|
+
if not check("data.csv", target="y", min_strong=1):
|
|
57
|
+
raise ValueError("No real signal in this dataset!")
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### CLI
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
honest-eda profile data.csv --target=y --output=report.html
|
|
64
|
+
honest-eda check data.csv --target=y --min-strong=2 # exit 1 if fails
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## What You Get
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
HONEST EDA REPORT
|
|
73
|
+
─────────────────
|
|
74
|
+
Columns scanned: 47
|
|
75
|
+
Relations tested: 1081
|
|
76
|
+
|
|
77
|
+
REAL signal (STRONG): 6
|
|
78
|
+
• age × tenure → churn
|
|
79
|
+
• monthly_charges → churn
|
|
80
|
+
|
|
81
|
+
WEAK signal: 12
|
|
82
|
+
NOISE rejected: 1063 (hidden)
|
|
83
|
+
|
|
84
|
+
Linear-baseline-only features: 23
|
|
85
|
+
Leakage suspects: 2 ⚠
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
No noise, no false confidence. Only findings that hold up under falsification.
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
## Killer Features
|
|
93
|
+
|
|
94
|
+
**Leakage detector** — When a feature's z-score vs. shuffled target drops below −15, honest-eda flags it as a probable data leak. Catches target-encoded columns, future-information leaks, and accidental label copies before they corrupt your model evaluation.
|
|
95
|
+
|
|
96
|
+
**Linear vs. symbolic dichotomy** — honest-eda tells you explicitly when a linear model would suffice. If symbolic regression finds no improvement over OLS, the feature is labeled "linear-baseline-only". No need to run a neural net to discover this.
|
|
97
|
+
|
|
98
|
+
**CI mode** — `honest-eda check` exits with code 1 if the minimum number of strong signals is not met. Drop it in your CI pipeline to block training runs on datasets with no real predictive content.
|
|
99
|
+
|
|
100
|
+
**Pre-modeling triage** — Know whether there is exploitable signal before you launch XGBoost or AutoML. Saves hours of compute and avoids the "model trained fine but generalizes to nothing" postmortem.
|
|
101
|
+
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
## How It Works
|
|
105
|
+
|
|
106
|
+
For each numeric feature paired with the target:
|
|
107
|
+
|
|
108
|
+
1. `nullbic.discover` fits a symbolic expression and records the BIC improvement over the null model.
|
|
109
|
+
2. The verdict is assigned:
|
|
110
|
+
- **STRONG** — beats constant + linear baseline + all shuffled-target permutations.
|
|
111
|
+
- **WEAK** — beats constant baseline only.
|
|
112
|
+
- **NOISE** — fails to beat the constant. Excluded from the report.
|
|
113
|
+
3. Leakage is flagged when `z_vs_shuffled < −15`, indicating the feature carries near-perfect information about the target.
|
|
114
|
+
|
|
115
|
+
The HTML report contains only STRONG and WEAK relations. NOISE is counted and disclosed in the summary, but not displayed.
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## Comparison
|
|
120
|
+
|
|
121
|
+
| | honest-eda | ydata-profiling | sweetviz |
|
|
122
|
+
|-------------------------------|:----------:|:---------------:|:--------:|
|
|
123
|
+
| Tests vs shuffled target | ✅ | ❌ | ❌ |
|
|
124
|
+
| Reports only real signal | ✅ | ❌ | ❌ |
|
|
125
|
+
| Symbolic formula extraction | ✅ | ❌ | ❌ |
|
|
126
|
+
| Leakage detection | ✅ | ❌ | ❌ |
|
|
127
|
+
| CI mode (exit code) | ✅ | ❌ | ❌ |
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## License
|
|
132
|
+
|
|
133
|
+
MIT. See [LICENSE](LICENSE).
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Citation / Related
|
|
138
|
+
|
|
139
|
+
honest-eda is built on top of **nullbic**, a library for symbolic regression with automatic ΔBIC falsification:
|
|
140
|
+
|
|
141
|
+
- Repository: https://github.com/glogwa68/nullbic
|
|
142
|
+
|
|
143
|
+
If you use honest-eda in published work, please also cite nullbic.
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
import typer
|
|
6
|
+
|
|
7
|
+
from honest_eda.profile import check as _check
|
|
8
|
+
from honest_eda.profile import profile as _profile
|
|
9
|
+
from honest_eda.report import render_text_summary
|
|
10
|
+
|
|
11
|
+
# Typer application object; the `profile` and `check` commands register on it via @app.command.
app = typer.Typer(no_args_is_help=True, help="EDA that admits when there's no signal.")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@app.command("profile")
def profile_cmd(
    data: str = typer.Argument(..., help="CSV path"),
    target: str = typer.Option(..., "--target", "-t", help="Target column name"),
    output: str = typer.Option("report.html", "--output", "-o", help="HTML output path"),
    gens: int = typer.Option(40, "--gens", help="nullbic n_generations"),
    pop: int = typer.Option(200, "--pop", help="nullbic pop_size"),
    depth: int = typer.Option(4, "--depth", help="nullbic max_depth"),
    seed: int = typer.Option(0, "--seed", help="Random seed"),
    show_noise: bool = typer.Option(False, "--show-noise", help="Include noise in HTML"),
    quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress stdout"),
) -> None:
    """Run EDA profile and write HTML report."""
    # NOTE: Typer surfaces the docstring above as the command's help text, so it is kept terse.
    search_settings = {
        "n_generations": gens,
        "pop_size": pop,
        "max_depth": depth,
        "seed": seed,
        # The CLI flag is opt-in ("show"), the library keyword is opt-out ("hide").
        "hide_noise": not show_noise,
    }
    result = _profile(data, target, output, **search_settings)

    # --quiet silences both the text summary and the "written to" confirmation.
    if quiet:
        return
    typer.echo(render_text_summary(result))
    typer.echo(f"Report written to: {output}")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@app.command("check")
def check_cmd(
    data: str = typer.Argument(..., help="CSV path"),
    target: str = typer.Option(..., "--target", "-t", help="Target column name"),
    min_strong: int = typer.Option(1, "--min-strong", help="Minimum STRONG relations required"),
    gens: int = typer.Option(40, "--gens"),
    pop: int = typer.Option(200, "--pop"),
    depth: int = typer.Option(4, "--depth"),
    seed: int = typer.Option(0, "--seed"),
    quiet: bool = typer.Option(False, "--quiet", "-q"),
) -> None:
    """Exit 0 if enough STRONG signals found, exit 1 otherwise."""
    passed = _check(
        data,
        target,
        min_strong=min_strong,
        n_generations=gens,
        pop_size=pop,
        max_depth=depth,
        seed=seed,
    )

    # Failure path: the message goes to stderr, and the exit code is 1 even when quiet.
    if not passed:
        if not quiet:
            typer.echo(f"FAIL: fewer than {min_strong} STRONG relation(s) for '{target}'.", err=True)
        sys.exit(1)

    # Success path: the message goes to stdout.
    if not quiet:
        typer.echo(f"PASS: found >= {min_strong} STRONG relation(s) for '{target}'.")
    sys.exit(0)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from honest_eda.runner import RelationResult
|
|
4
|
+
|
|
5
|
+
# Cut-off on z_vs_shuffled: a relation scoring strictly below this is treated as a leakage suspect.
LEAKAGE_Z_THRESHOLD = -15.0
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def is_leakage_suspect(rel: RelationResult) -> bool:
    """Return True when *rel*'s z_vs_shuffled is strictly below LEAKAGE_Z_THRESHOLD."""
    z_score = rel.z_vs_shuffled
    return z_score < LEAKAGE_Z_THRESHOLD
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def flag_leakage(results: list[RelationResult]) -> list[RelationResult]:
    """Return the relations in *results* that is_leakage_suspect accepts, in input order."""
    return list(filter(is_leakage_suspect, results))
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def explain_leakage(rel: RelationResult) -> str:
    """Build a one-line, human-readable warning for a leakage-suspect relation.

    The message names the feature(s), the observed z-score (two decimals),
    the module threshold, and the target the features should be checked against.
    """
    joined = ", ".join(rel.features)
    observed = f"Leakage suspect: feature(s) [{joined}] achieve z={rel.z_vs_shuffled:.2f} "
    reference = f"vs shuffled baseline (threshold: {LEAKAGE_Z_THRESHOLD}). "
    advice = f"Verify [{joined}] is not derived from '{rel.target}'."
    return observed + reference + advice
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from honest_eda import report as _report_mod
|
|
6
|
+
from honest_eda.runner import ProfileResult, run_against_target
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def profile(
    data: str | pd.DataFrame,
    target: str,
    output: str = "report.html",
    *,
    n_generations: int = 40,
    pop_size: int = 200,
    max_depth: int = 4,
    seed: int = 0,
    hide_noise: bool = True,
) -> ProfileResult:
    """Run nullbic-backed EDA and render an HTML report.

    Args:
        data: An in-memory DataFrame, or a CSV source. Anything that is not a
            DataFrame (a path string, a ``pathlib.Path``, an open file) is
            handed to ``pandas.read_csv``.
        target: Name of the target column.
        output: Path the HTML report is written to.
        n_generations: Forwarded to ``run_against_target``.
        pop_size: Forwarded to ``run_against_target``.
        max_depth: Forwarded to ``run_against_target``.
        seed: Forwarded to ``run_against_target``.
        hide_noise: Forwarded to ``render_html``.

    Returns:
        The ProfileResult produced by ``run_against_target``.
    """
    # Test for "not a DataFrame" rather than "is a str" so that path objects and
    # file handles are loaded too, instead of reaching the runner unparsed.
    if not isinstance(data, pd.DataFrame):
        data = pd.read_csv(data)
    result = run_against_target(
        data,
        target,
        n_generations=n_generations,
        pop_size=pop_size,
        max_depth=max_depth,
        seed=seed,
    )
    _report_mod.render_html(result, output, hide_noise=hide_noise)
    return result
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def check(
    data: str | pd.DataFrame,
    target: str,
    *,
    min_strong: int = 1,
    **kwargs,
) -> bool:
    """CI gate: True if the profile finds at least min_strong STRONG relations.

    Args:
        data: An in-memory DataFrame, or a CSV source. Anything that is not a
            DataFrame (a path string, a ``pathlib.Path``, an open file) is
            handed to ``pandas.read_csv``.
        target: Name of the target column.
        min_strong: Minimum number of STRONG relations required to pass.
        **kwargs: Forwarded unchanged to ``run_against_target``.

    Returns:
        Whether the number of STRONG relations is at least ``min_strong``.
        No report is written.
    """
    # Test for "not a DataFrame" rather than "is a str" so that path objects and
    # file handles are loaded too, instead of reaching the runner unparsed.
    if not isinstance(data, pd.DataFrame):
        data = pd.read_csv(data)
    result = run_against_target(data, target, **kwargs)
    return len(result.strong) >= min_strong
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from jinja2 import Environment, BaseLoader
|
|
7
|
+
|
|
8
|
+
from honest_eda.leakage import LEAKAGE_Z_THRESHOLD
|
|
9
|
+
from honest_eda.runner import ProfileResult
|
|
10
|
+
|
|
11
|
+
_TEMPLATE = """\
|
|
12
|
+
<!DOCTYPE html>
|
|
13
|
+
<html lang="en">
|
|
14
|
+
<head>
|
|
15
|
+
<meta charset="utf-8">
|
|
16
|
+
<title>honest-eda — {{ result.target }}</title>
|
|
17
|
+
<style>
|
|
18
|
+
body{font-family:system-ui,sans-serif;margin:2rem;background:#fafafa;color:#111}
|
|
19
|
+
h1,h2{color:#1a1a2e}
|
|
20
|
+
table{border-collapse:collapse;width:100%;margin-bottom:1.5rem}
|
|
21
|
+
th,td{border:1px solid #ccc;padding:.4rem .8rem;text-align:left;font-size:.9em}
|
|
22
|
+
th{background:#1a1a2e;color:#fff}
|
|
23
|
+
tr.strong{background:#d4edda}tr.weak{background:#fff3cd}
|
|
24
|
+
.badge{border-radius:4px;padding:2px 6px;font-size:.8em;font-weight:700}
|
|
25
|
+
.b-STRONG{background:#198754;color:#fff}.b-WEAK{background:#ffc107;color:#000}
|
|
26
|
+
.b-NOISE{background:#dc3545;color:#fff}
|
|
27
|
+
.warn{color:#856404;background:#fff3cd;border:1px solid #ffc107;padding:.5rem;border-radius:4px}
|
|
28
|
+
</style>
|
|
29
|
+
</head>
|
|
30
|
+
<body>
|
|
31
|
+
<h1>honest-eda — target: <code>{{ result.target }}</code></h1>
|
|
32
|
+
<p>
|
|
33
|
+
Generated: {{ generated_at }} |
|
|
34
|
+
Columns: {{ result.n_columns }} |
|
|
35
|
+
Relations tested: {{ result.n_relations }} |
|
|
36
|
+
<strong>Strong: {{ result.strong | length }}</strong> |
|
|
37
|
+
Weak: {{ result.weak | length }} |
|
|
38
|
+
Noise: {{ result.noise_count }}
|
|
39
|
+
</p>
|
|
40
|
+
|
|
41
|
+
{% if result.leakage_suspects %}
|
|
42
|
+
<div class="warn">
|
|
43
|
+
<strong>Leakage suspects</strong> (z < {{ leakage_threshold }}):<br>
|
|
44
|
+
{% for r in result.leakage_suspects %}
|
|
45
|
+
• {{ r.features | join(", ") }} — z={{ "%.2f" | format(r.z_vs_shuffled) }}<br>
|
|
46
|
+
{% endfor %}
|
|
47
|
+
</div>
|
|
48
|
+
{% endif %}
|
|
49
|
+
|
|
50
|
+
{% if result.linear_only %}
|
|
51
|
+
<h2>Linear-only features</h2>
|
|
52
|
+
<p>{{ result.linear_only | join(", ") }}</p>
|
|
53
|
+
{% endif %}
|
|
54
|
+
|
|
55
|
+
<h2>Strong relations</h2>
|
|
56
|
+
{% if result.strong %}
|
|
57
|
+
<table>
|
|
58
|
+
<tr><th>Features</th><th>Formula</th><th>Verdict</th><th>ΔBIC const</th><th>ΔBIC linear</th><th>z vs shuffled</th></tr>
|
|
59
|
+
{% for r in result.strong %}
|
|
60
|
+
<tr class="strong">
|
|
61
|
+
<td>{{ r.features | join(", ") }}</td>
|
|
62
|
+
<td><code>{{ r.formula }}</code></td>
|
|
63
|
+
<td><span class="badge b-{{ r.verdict }}">{{ r.verdict }}</span></td>
|
|
64
|
+
<td>{{ "%.3f" | format(r.delta_bic_const) }}</td>
|
|
65
|
+
<td>{{ "%.3f" | format(r.delta_bic_linear) }}</td>
|
|
66
|
+
<td>{{ "%.3f" | format(r.z_vs_shuffled) }}</td>
|
|
67
|
+
</tr>
|
|
68
|
+
{% endfor %}
|
|
69
|
+
</table>
|
|
70
|
+
{% else %}<p><em>None found.</em></p>{% endif %}
|
|
71
|
+
|
|
72
|
+
<h2>Weak relations</h2>
|
|
73
|
+
{% if result.weak %}
|
|
74
|
+
<table>
|
|
75
|
+
<tr><th>Features</th><th>Formula</th><th>Verdict</th><th>ΔBIC const</th><th>ΔBIC linear</th><th>z vs shuffled</th></tr>
|
|
76
|
+
{% for r in result.weak %}
|
|
77
|
+
<tr class="weak">
|
|
78
|
+
<td>{{ r.features | join(", ") }}</td>
|
|
79
|
+
<td><code>{{ r.formula }}</code></td>
|
|
80
|
+
<td><span class="badge b-{{ r.verdict }}">{{ r.verdict }}</span></td>
|
|
81
|
+
<td>{{ "%.3f" | format(r.delta_bic_const) }}</td>
|
|
82
|
+
<td>{{ "%.3f" | format(r.delta_bic_linear) }}</td>
|
|
83
|
+
<td>{{ "%.3f" | format(r.z_vs_shuffled) }}</td>
|
|
84
|
+
</tr>
|
|
85
|
+
{% endfor %}
|
|
86
|
+
</table>
|
|
87
|
+
{% else %}<p><em>None found.</em></p>{% endif %}
|
|
88
|
+
|
|
89
|
+
{% if not hide_noise %}
|
|
90
|
+
<h2>Noise ({{ result.noise_count }} relation(s) rejected)</h2>
|
|
91
|
+
{% endif %}
|
|
92
|
+
|
|
93
|
+
</body>
|
|
94
|
+
</html>
|
|
95
|
+
"""
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def render_html(
    result: ProfileResult,
    output_path: str | Path,
    *,
    hide_noise: bool = True,
) -> Path:
    """Render ProfileResult as a self-contained HTML file.

    Args:
        result: The profile to render.
        output_path: Destination file; missing parent directories are created.
        hide_noise: Passed to the template, which adds the noise heading only
            when this is false.

    Returns:
        The resolved absolute path that was written (UTF-8 encoded).
    """
    output_path = Path(output_path).resolve()
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Feature names are column labels of the analysed data and formulas come from
    # the regression backend; neither is trusted markup. Autoescaping keeps a
    # column named e.g. "<script>..." from being emitted as live HTML.
    env = Environment(loader=BaseLoader(), autoescape=True)
    tmpl = env.from_string(_TEMPLATE)
    # Timezone-aware UTC timestamp, truncated to whole seconds.
    generated_at = datetime.now(timezone.utc).isoformat(timespec="seconds")
    html = tmpl.render(
        result=result,
        hide_noise=hide_noise,
        generated_at=generated_at,
        leakage_threshold=LEAKAGE_Z_THRESHOLD,
    )
    output_path.write_text(html, encoding="utf-8")
    return output_path
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def render_text_summary(result: ProfileResult) -> str:
    """Return a plain-text summary of a ProfileResult.

    The summary lists the header counts, one bullet per STRONG and per WEAK
    relation, the noise count, the linear-only features ("none" when empty)
    and one line per leakage suspect, joined with newlines.
    """

    def bullet(rel) -> str:
        # Shared layout for STRONG and WEAK entries.
        return f" * {', '.join(rel.features)} -> {result.target} formula: {rel.formula}"

    out: list[str] = []
    out.append("HONEST EDA REPORT")
    out.append("─" * 17)
    out.append(f"Target: {result.target}")
    out.append(f"Columns scanned: {result.n_columns}")
    out.append(f"Relations tested: {result.n_relations}")
    out.append("")

    out.append(f"REAL signal (STRONG): {len(result.strong)}")
    out.extend(bullet(rel) for rel in result.strong)
    out.append("")

    out.append(f"WEAK signal: {len(result.weak)}")
    out.extend(bullet(rel) for rel in result.weak)
    out.append("")

    out.append(f"NOISE rejected: {result.noise_count}")
    out.append("")
    out.append(f"Linear-only features: {', '.join(result.linear_only) or 'none'}")
    out.append("")

    out.append(f"Leakage suspects: {len(result.leakage_suspects)}")
    out.extend(
        f" ! {', '.join(rel.features)} z={rel.z_vs_shuffled:.2f}"
        for rel in result.leakage_suspects
    )
    return "\n".join(out)
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import random
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from nullbic import Dataset, discover
|
|
9
|
+
|
|
10
|
+
# Module-level logger; _run_one uses it to report failed nullbic runs as warnings.
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class RelationResult:
    """Outcome of one nullbic run relating a set of feature columns to the target."""

    target: str  # name of the target column
    features: list[str]  # feature column(s) offered to nullbic in this run
    formula: str  # taken from the nullbic report's `formula`
    verdict: str  # label from _verdict_str; the runner buckets on "STRONG" / "WEAK" / "NOISE"
    delta_bic_const: float  # the report's delta_bic_const, coerced to float
    delta_bic_linear: float  # the report's delta_bic_linear, coerced to float
    z_vs_shuffled: float  # the report's z_vs_shuffled, coerced to float
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class ProfileResult:
    """Aggregate of every RelationResult produced for one target column."""

    n_columns: int  # total number of columns in the input frame (not only numeric ones)
    n_relations: int  # number of runs that produced a result; failed runs are not counted
    target: str  # name of the target column
    strong: list[RelationResult]  # results whose verdict is "STRONG"
    weak: list[RelationResult]  # results whose verdict is "WEAK"
    noise_count: int  # number of results whose verdict is "NOISE" (the results themselves are dropped)
    linear_only: list[str]  # single features flagged by the runner's linear-only rule
    leakage_suspects: list[RelationResult]  # results whose z_vs_shuffled is below the leakage cut-off
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _select_numeric(df: pd.DataFrame) -> list[str]:
|
|
37
|
+
return df.select_dtypes(include="number").columns.tolist()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _verdict_str(report) -> str:
|
|
41
|
+
try:
|
|
42
|
+
return report.verdict.name
|
|
43
|
+
except AttributeError:
|
|
44
|
+
if not report.is_real_signal():
|
|
45
|
+
return "NOISE"
|
|
46
|
+
# distinguish STRONG vs WEAK via delta_bic_linear threshold
|
|
47
|
+
return "STRONG" if report.delta_bic_linear < -10 else "WEAK"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _run_one(
    df: pd.DataFrame,
    features: list[str],
    target: str,
    n_generations: int,
    pop_size: int,
    max_depth: int,
    seed: int,
) -> RelationResult | None:
    """Run nullbic once for *features* against *target*.

    Seeds the global ``random`` module with *seed* before the run. Any
    exception raised while building the dataset, running ``discover`` or
    reading the report is logged as a warning and turned into ``None``.

    Returns:
        A RelationResult built from the nullbic report, or ``None`` on failure.
    """
    random.seed(seed)
    try:
        subset = df[features + [target]]
        dataset = Dataset.from_pandas(subset, target=target)
        report = discover(
            dataset,
            n_generations=n_generations,
            pop_size=pop_size,
            max_depth=max_depth,
        )
        # Read the report fields in a fixed order, coercing the scores to float.
        formula = report.formula
        verdict = _verdict_str(report)
        bic_vs_const = float(report.delta_bic_const)
        bic_vs_linear = float(report.delta_bic_linear)
        z_score = float(report.z_vs_shuffled)
        return RelationResult(
            target=target,
            features=features,
            formula=formula,
            verdict=verdict,
            delta_bic_const=bic_vs_const,
            delta_bic_linear=bic_vs_linear,
            z_vs_shuffled=z_score,
        )
    except Exception as exc:
        # Best-effort: one failing relation must not abort the whole profile.
        logger.warning("nullbic failed for features=%s target=%s: %s", features, target, exc)
        return None
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def run_against_target(
    df: pd.DataFrame,
    target: str,
    *,
    n_generations: int = 40,
    pop_size: int = 200,
    max_depth: int = 4,
    max_features_per_run: int | None = None,
    seed: int = 0,
) -> ProfileResult:
    """Test every numeric feature of *df* against *target* and aggregate the verdicts.

    Each numeric column other than the target is run on its own; when more than
    one such column exists, one extra run offers all of them together.

    Args:
        df: Input data.
        target: Name of the target column; must be a column of *df*.
        n_generations: Forwarded to each nullbic run.
        pop_size: Forwarded to each nullbic run.
        max_depth: Forwarded to each nullbic run.
        max_features_per_run: When set, only the first this-many numeric
            feature columns are considered.
        seed: Seed applied before every run.

    Returns:
        A ProfileResult summarising the runs that completed.

    Raises:
        ValueError: If *target* is not a column of *df*.
    """
    # Without this check a mistyped target makes every run fail inside
    # _run_one's catch-all, and the caller gets an empty "no signal" profile
    # instead of an error.
    if target not in df.columns:
        raise ValueError(f"target column {target!r} not found in data")

    numeric_cols = [c for c in _select_numeric(df) if c != target]

    if max_features_per_run is not None:
        numeric_cols = numeric_cols[:max_features_per_run]

    results: list[RelationResult] = []

    # One run per individual feature; failed runs (None) are skipped.
    for col in numeric_cols:
        res = _run_one(df, [col], target, n_generations, pop_size, max_depth, seed)
        if res is not None:
            results.append(res)

    # One joint run over all features; pointless when there is only one.
    if len(numeric_cols) > 1:
        all_res = _run_one(df, numeric_cols, target, n_generations, pop_size, max_depth, seed)
        if all_res is not None:
            results.append(all_res)

    strong = [r for r in results if r.verdict == "STRONG"]
    weak = [r for r in results if r.verdict == "WEAK"]
    noise_count = sum(1 for r in results if r.verdict == "NOISE")

    # Same cut-off as honest_eda.leakage.LEAKAGE_Z_THRESHOLD; kept literal here
    # because that module imports this one.
    leakage_suspects = [r for r in results if r.z_vs_shuffled < -15]

    # Single features where the symbolic fit is within 5 BIC units of the linear
    # baseline while delta_bic_const is below -10.
    linear_only: list[str] = [
        r.features[0]
        for r in results
        if len(r.features) == 1 and abs(r.delta_bic_linear) < 5 and r.delta_bic_const < -10
    ]

    return ProfileResult(
        n_columns=len(df.columns),
        n_relations=len(results),
        target=target,
        strong=strong,
        weak=weak,
        noise_count=noise_count,
        linear_only=linear_only,
        leakage_suspects=leakage_suspects,
    )
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: honest-eda
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: EDA that admits when there's no signal — wraps nullbic ΔBIC falsification
|
|
5
|
+
Author: honest-eda contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/glogwa68/honest-eda
|
|
8
|
+
Project-URL: Issues, https://github.com/glogwa68/honest-eda/issues
|
|
9
|
+
Project-URL: Related, https://github.com/glogwa68/nullbic
|
|
10
|
+
Keywords: eda,exploratory-data-analysis,symbolic-regression,nullbic,falsification,data-leakage
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: nullbic
|
|
23
|
+
Requires-Dist: pandas>=2.0
|
|
24
|
+
Requires-Dist: jinja2>=3.0
|
|
25
|
+
Requires-Dist: plotly>=5.0
|
|
26
|
+
Requires-Dist: typer>=0.12
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# honest-eda
|
|
33
|
+
|
|
34
|
+
**EDA that admits when there's no signal.**
|
|
35
|
+
|
|
36
|
+
[PyPI version](https://pypi.org/project/honest-eda/)
|
|
37
|
+
[Python versions](https://pypi.org/project/honest-eda/)
|
|
38
|
+
[License: MIT](LICENSE)
|
|
39
|
+
[CI](https://github.com/glogwa68/honest-eda/actions)
|
|
40
|
+
|
|
41
|
+
> ydata-profiling shows you 200 correlations. 195 are noise. honest-eda shows you the 5 that survive shuffling.
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## The Problem
|
|
46
|
+
|
|
47
|
+
pandas-profiling, ydata-profiling, and sweetviz report everything. Every correlation, every association, every distribution shift. The result: 200 "insights" in your report, of which 195 are pure chance — artifacts of finite sample size, collinearity, or subtle target leakage.
|
|
48
|
+
|
|
49
|
+
You end up fitting models on noise, wasting compute on AutoML pipelines that have nothing real to learn, and shipping features that degrade on new data.
|
|
50
|
+
|
|
51
|
+
## The Solution
|
|
52
|
+
|
|
53
|
+
`honest-eda` runs a falsification test on every feature-target pair using [nullbic](https://github.com/glogwa68/nullbic) — a symbolic regression library with built-in self-falsification via ΔBIC.
|
|
54
|
+
|
|
55
|
+
For each relation, the verdict is computed against three null hypotheses:
|
|
56
|
+
|
|
57
|
+
- vs. constant model
|
|
58
|
+
- vs. linear model
|
|
59
|
+
- vs. target-shuffled distribution
|
|
60
|
+
|
|
61
|
+
Relations that survive all three are reported as STRONG. Relations that beat only the constant baseline are reported as WEAK. Everything else is counted as NOISE and hidden.
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## Install
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install honest-eda
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## Usage
|
|
74
|
+
|
|
75
|
+
### Python
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from honest_eda import profile, check
|
|
79
|
+
|
|
80
|
+
# Generate HTML report — only real signals shown
|
|
81
|
+
result = profile("data.csv", target="y", output="report.html")
|
|
82
|
+
print(f"Real signal patterns: {len(result.strong)}")
|
|
83
|
+
print(f"Fake correlations rejected: {result.noise_count}")
|
|
84
|
+
print(f"Leakage suspects: {len(result.leakage_suspects)}")
|
|
85
|
+
|
|
86
|
+
# CI mode — exit 1 if no real signal
|
|
87
|
+
if not check("data.csv", target="y", min_strong=1):
|
|
88
|
+
raise ValueError("No real signal in this dataset!")
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### CLI
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
honest-eda profile data.csv --target=y --output=report.html
|
|
95
|
+
honest-eda check data.csv --target=y --min-strong=2 # exit 1 if fails
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## What You Get
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
HONEST EDA REPORT
|
|
104
|
+
─────────────────
|
|
105
|
+
Columns scanned: 47
|
|
106
|
+
Relations tested: 1081
|
|
107
|
+
|
|
108
|
+
REAL signal (STRONG): 6
|
|
109
|
+
• age × tenure → churn
|
|
110
|
+
• monthly_charges → churn
|
|
111
|
+
|
|
112
|
+
WEAK signal: 12
|
|
113
|
+
NOISE rejected: 1063 (hidden)
|
|
114
|
+
|
|
115
|
+
Linear-baseline-only features: 23
|
|
116
|
+
Leakage suspects: 2 ⚠
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
No noise, no false confidence. Only findings that hold up under falsification.
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Killer Features
|
|
124
|
+
|
|
125
|
+
**Leakage detector** — When a feature's z-score vs. shuffled target drops below −15, honest-eda flags it as a probable data leak. Catches target-encoded columns, future-information leaks, and accidental label copies before they corrupt your model evaluation.
|
|
126
|
+
|
|
127
|
+
**Linear vs. symbolic dichotomy** — honest-eda tells you explicitly when a linear model would suffice. If symbolic regression finds no improvement over OLS, the feature is labeled "linear-baseline-only". No need to run a neural net to discover this.
|
|
128
|
+
|
|
129
|
+
**CI mode** — `honest-eda check` exits with code 1 if the minimum number of strong signals is not met. Drop it in your CI pipeline to block training runs on datasets with no real predictive content.
|
|
130
|
+
|
|
131
|
+
**Pre-modeling triage** — Know whether there is exploitable signal before you launch XGBoost or AutoML. Saves hours of compute and avoids the "model trained fine but generalizes to nothing" postmortem.
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## How It Works
|
|
136
|
+
|
|
137
|
+
For each numeric feature paired with the target:
|
|
138
|
+
|
|
139
|
+
1. `nullbic.discover` fits a symbolic expression and records the BIC improvement over the null model.
|
|
140
|
+
2. The verdict is assigned:
|
|
141
|
+
- **STRONG** — beats constant + linear baseline + all shuffled-target permutations.
|
|
142
|
+
- **WEAK** — beats constant baseline only.
|
|
143
|
+
- **NOISE** — fails to beat the constant. Excluded from the report.
|
|
144
|
+
3. Leakage is flagged when `z_vs_shuffled < −15`, indicating the feature carries near-perfect information about the target.
|
|
145
|
+
|
|
146
|
+
The HTML report contains only STRONG and WEAK relations. NOISE is counted and disclosed in the summary, but not displayed.
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## Comparison
|
|
151
|
+
|
|
152
|
+
| | honest-eda | ydata-profiling | sweetviz |
|
|
153
|
+
|-------------------------------|:----------:|:---------------:|:--------:|
|
|
154
|
+
| Tests vs shuffled target | ✅ | ❌ | ❌ |
|
|
155
|
+
| Reports only real signal | ✅ | ❌ | ❌ |
|
|
156
|
+
| Symbolic formula extraction | ✅ | ❌ | ❌ |
|
|
157
|
+
| Leakage detection | ✅ | ❌ | ❌ |
|
|
158
|
+
| CI mode (exit code) | ✅ | ❌ | ❌ |
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## License
|
|
163
|
+
|
|
164
|
+
MIT. See [LICENSE](LICENSE).
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Citation / Related
|
|
169
|
+
|
|
170
|
+
honest-eda is built on top of **nullbic**, a library for symbolic regression with automatic ΔBIC falsification:
|
|
171
|
+
|
|
172
|
+
- Repository: https://github.com/glogwa68/nullbic
|
|
173
|
+
|
|
174
|
+
If you use honest-eda in published work, please also cite nullbic.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
honest_eda/__init__.py
|
|
5
|
+
honest_eda/cli.py
|
|
6
|
+
honest_eda/leakage.py
|
|
7
|
+
honest_eda/profile.py
|
|
8
|
+
honest_eda/runner.py
|
|
9
|
+
honest_eda.egg-info/PKG-INFO
|
|
10
|
+
honest_eda.egg-info/SOURCES.txt
|
|
11
|
+
honest_eda.egg-info/dependency_links.txt
|
|
12
|
+
honest_eda.egg-info/entry_points.txt
|
|
13
|
+
honest_eda.egg-info/requires.txt
|
|
14
|
+
honest_eda.egg-info/top_level.txt
|
|
15
|
+
honest_eda/report/__init__.py
|
|
16
|
+
honest_eda/report/report.py
|
|
17
|
+
tests/test_cli.py
|
|
18
|
+
tests/test_leakage.py
|
|
19
|
+
tests/test_profile.py
|
|
20
|
+
tests/test_report.py
|
|
21
|
+
tests/test_runner.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
honest_eda
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "honest-eda"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "EDA that admits when there's no signal — wraps nullbic ΔBIC falsification"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "honest-eda contributors" }]
|
|
13
|
+
keywords = ["eda", "exploratory-data-analysis", "symbolic-regression", "nullbic", "falsification", "data-leakage"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Science/Research",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.10",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
23
|
+
]
|
|
24
|
+
dependencies = [
|
|
25
|
+
"nullbic",
|
|
26
|
+
"pandas>=2.0",
|
|
27
|
+
"jinja2>=3.0",
|
|
28
|
+
"plotly>=5.0",
|
|
29
|
+
"typer>=0.12",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
dev = ["pytest>=7.0", "pytest-cov"]
|
|
34
|
+
|
|
35
|
+
[project.urls]
|
|
36
|
+
Homepage = "https://github.com/glogwa68/honest-eda"
|
|
37
|
+
Issues = "https://github.com/glogwa68/honest-eda/issues"
|
|
38
|
+
Related = "https://github.com/glogwa68/nullbic"
|
|
39
|
+
|
|
40
|
+
[project.scripts]
|
|
41
|
+
honest-eda = "honest_eda.cli:app"
|
|
42
|
+
|
|
43
|
+
[tool.setuptools.packages.find]
|
|
44
|
+
where = ["."]
|
|
45
|
+
include = ["honest_eda*"]
|
|
46
|
+
|
|
47
|
+
[tool.pytest.ini_options]
|
|
48
|
+
testpaths = ["tests"]
|
|
49
|
+
addopts = "-ra"
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
from typer.testing import CliRunner
|
|
5
|
+
|
|
6
|
+
from honest_eda.cli import app
|
|
7
|
+
|
|
8
|
+
# Shared Typer test runner; the CLI tests in this module invoke `app` through it.
runner = CliRunner()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_help():
    """``--help`` exits cleanly and mentions at least one sub-command."""
    outcome = runner.invoke(app, ["--help"])
    assert outcome.exit_code == 0

    help_text = outcome.output
    assert any(command in help_text for command in ("profile", "check"))
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_profile_command_exit_0(real_signal_df, tmp_path, mock_nullbic, monkeypatch):
    """``profile`` exits 0 and announces the report path when rendering succeeds."""
    csv_path = str(tmp_path / "data.csv")
    real_signal_df.to_csv(csv_path, index=False)
    out_path = str(tmp_path / "report.html")

    def fake_render_html(result, path, **kw):
        # Stub renderer: write a minimal document and close the handle
        # deterministically (the previous lambda left the file object open).
        with open(path, "w", encoding="utf-8") as handle:
            handle.write("<html/>")

    monkeypatch.setattr(
        "honest_eda.profile._report_mod.render_html",
        fake_render_html,
    )

    result = runner.invoke(
        app,
        ["profile", csv_path, "--target", "y", "--output", out_path, "--gens", "5"],
    )
    assert result.exit_code == 0
    assert "Report written to:" in result.output
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_check_exit_0_with_signal(real_signal_df, tmp_path, mock_nullbic):
    """``check`` exits 0 and prints PASS for the ``real_signal_df`` fixture."""
    data_file = str(tmp_path / "data.csv")
    real_signal_df.to_csv(data_file, index=False)

    cli_args = ["check", data_file, "--target", "y", "--gens", "5"]
    outcome = runner.invoke(app, cli_args)

    assert outcome.exit_code == 0
    assert "PASS" in outcome.output
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_check_exit_1_with_noise(noise_df, tmp_path, mock_nullbic):
    """``check`` exits 1 and reports FAIL for the ``noise_df`` fixture."""
    data_file = str(tmp_path / "data.csv")
    noise_df.to_csv(data_file, index=False)

    cli_args = ["check", data_file, "--target", "y", "--gens", "5"]
    outcome = runner.invoke(app, cli_args)

    assert outcome.exit_code == 1
    # The FAIL marker may be emitted on either stream; stderr is only
    # consulted when stdout does not already contain it.
    if "FAIL" not in outcome.output:
        assert "FAIL" in (outcome.stderr or "")
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from honest_eda.leakage import (
|
|
6
|
+
LEAKAGE_Z_THRESHOLD,
|
|
7
|
+
explain_leakage,
|
|
8
|
+
flag_leakage,
|
|
9
|
+
is_leakage_suspect,
|
|
10
|
+
)
|
|
11
|
+
from honest_eda.runner import RelationResult
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _make_rel(z: float, verdict: str = "STRONG") -> RelationResult:
    """Build a single-feature ``RelationResult`` (``x`` -> ``y``) with the given z-score."""
    fields = {
        "target": "y",
        "features": ["x"],
        "formula": "y ~ x",
        "verdict": verdict,
        "delta_bic_const": -10.0,
        "delta_bic_linear": -8.0,
        "z_vs_shuffled": z,
    }
    return RelationResult(**fields)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_is_leakage_suspect_extreme_z():
    """A z-score of -20 is flagged as a leakage suspect."""
    extreme = _make_rel(-20.0)
    verdict = is_leakage_suspect(extreme)
    assert verdict is True
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_is_leakage_suspect_mild_z():
    """A z-score of -5 is not flagged as a leakage suspect."""
    mild = _make_rel(-5.0)
    verdict = is_leakage_suspect(mild)
    assert verdict is False
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_flag_leakage_filters_correctly():
    """Only the two relations with z-scores of -20 and -16 are expected to be flagged."""
    z_scores = (-20.0, -5.0, -1.0, -16.0)
    flagged = flag_leakage([_make_rel(z) for z in z_scores])

    assert len(flagged) == 2
    for rel in flagged:
        assert rel.z_vs_shuffled < LEAKAGE_Z_THRESHOLD
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_explain_leakage_nonempty():
    """The explanation is a non-empty string naming both the feature and the target."""
    explanation = explain_leakage(_make_rel(-20.0))

    assert isinstance(explanation, str)
    assert explanation != ""
    for name in ("x", "y"):
        assert name in explanation
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
import honest_eda
|
|
7
|
+
from honest_eda.runner import ProfileResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_profile_accepts_dataframe(real_signal_df, tmp_path, mock_nullbic, monkeypatch):
    """``profile`` accepts an in-memory data frame and returns a ``ProfileResult``."""
    report_path = str(tmp_path / "report.html")

    def _noop_render(result, path, **kw):
        return None

    # Stub the renderer at both import locations so no real HTML is produced.
    for dotted_name in (
        "honest_eda.report.render_html",
        "honest_eda.profile._report_mod.render_html",
    ):
        monkeypatch.setattr(dotted_name, _noop_render)

    profile_result = honest_eda.profile(real_signal_df, "y", report_path)
    assert isinstance(profile_result, ProfileResult)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_profile_accepts_csv_path(real_signal_df, tmp_path, mock_nullbic, monkeypatch):
    """``profile`` accepts a CSV file path and returns a ``ProfileResult``."""
    data_file = str(tmp_path / "data.csv")
    report_path = str(tmp_path / "report.html")
    real_signal_df.to_csv(data_file, index=False)

    def _noop_render(result, path, **kw):
        return None

    monkeypatch.setattr("honest_eda.profile._report_mod.render_html", _noop_render)

    profile_result = honest_eda.profile(data_file, "y", report_path)
    assert isinstance(profile_result, ProfileResult)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_profile_writes_html(real_signal_df, tmp_path, mock_nullbic, monkeypatch):
    """``profile`` hands the requested output path to the renderer."""
    report_file = tmp_path / "report.html"
    out = str(report_file)
    seen_paths = []

    def recording_render(result, path, **kw):
        # Remember where we were asked to write, then create a stub file there.
        seen_paths.append(path)
        with open(path, "w") as handle:
            handle.write("<html>stub</html>")

    monkeypatch.setattr("honest_eda.profile._report_mod.render_html", recording_render)
    honest_eda.profile(real_signal_df, "y", out)

    assert seen_paths
    assert seen_paths[-1] == out
    assert report_file.exists()
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_check_true_when_signal(real_signal_df, mock_nullbic):
    """``check`` returns ``True`` for the ``real_signal_df`` fixture with ``min_strong=1``."""
    passed = honest_eda.check(real_signal_df, "y", min_strong=1)
    assert passed is True
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_check_false_when_noise(noise_df, mock_nullbic):
    """``check`` returns ``False`` for the ``noise_df`` fixture with ``min_strong=1``."""
    passed = honest_eda.check(noise_df, "y", min_strong=1)
    assert passed is False
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from honest_eda.report import render_html, render_text_summary
|
|
8
|
+
from honest_eda.runner import ProfileResult, RelationResult
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _make_profile(target: str = "y", *, with_strong=True, with_leakage=False) -> ProfileResult:
    """Build a hand-made ``ProfileResult`` for the report tests.

    The result always contains one WEAK relation (feature ``w``) and a noise
    count of 1. ``with_strong`` adds a STRONG relation on feature ``x``;
    ``with_leakage`` adds a STRONG relation on feature ``leak`` (z = -20) that
    is also listed under ``leakage_suspects``.
    """

    def _relation(feature, verdict, d_const, d_linear, z):
        # All fixtures share the same shape: one feature, formula "<target> ~ <feature>".
        return RelationResult(
            target=target,
            features=[feature],
            formula=f"{target} ~ {feature}",
            verdict=verdict,
            delta_bic_const=d_const,
            delta_bic_linear=d_linear,
            z_vs_shuffled=z,
        )

    strong_rel = _relation("x", "STRONG", -15.0, -12.0, -5.0)
    leakage_rel = _relation("leak", "STRONG", -50.0, -30.0, -20.0)
    weak_rel = _relation("w", "WEAK", -3.0, -2.0, -2.5)

    strong = []
    if with_strong:
        strong.append(strong_rel)
    leakage = []
    if with_leakage:
        leakage.append(leakage_rel)

    return ProfileResult(
        n_columns=4,
        n_relations=len(strong) + len(leakage) + 1,
        target=target,
        strong=strong + leakage,
        weak=[weak_rel],
        noise_count=1,
        linear_only=[],
        leakage_suspects=leakage,
    )
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def test_render_html_creates_file(tmp_path):
    """``render_html`` creates a non-empty file at the requested path."""
    report_file = tmp_path / "report.html"
    render_html(_make_profile(), str(report_file))

    assert report_file.exists()
    size_in_bytes = report_file.stat().st_size
    assert size_in_bytes > 0
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def test_render_html_contains_target_name(tmp_path):
    """The rendered HTML mentions the target column name."""
    report_file = tmp_path / "report.html"
    render_html(_make_profile(target="price"), str(report_file))

    html = report_file.read_text(encoding="utf-8")
    assert "price" in html
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_render_html_strong_section_present(tmp_path):
    """The rendered HTML mentions "strong" in some letter case."""
    report_file = tmp_path / "report.html"
    render_html(_make_profile(with_strong=True), str(report_file))

    html = report_file.read_text(encoding="utf-8")
    # A case-insensitive search also covers the exact upper-case "STRONG" spelling.
    assert "strong" in html.lower()
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def test_render_html_leakage_section_present(tmp_path):
    """With a leakage suspect present, the HTML mentions "leak" or the -20 z-score."""
    report_file = tmp_path / "report.html"
    render_html(_make_profile(with_leakage=True), str(report_file))

    html = report_file.read_text(encoding="utf-8")
    mentions_leak = "leak" in html.lower()
    assert mentions_leak or "-20" in html
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def test_render_text_summary_counters():
    """The text summary contains the STRONG label, a count of 1 and the target name."""
    summary = render_text_summary(_make_profile(target="y", with_strong=True))

    for expected_fragment in ("STRONG", "1", "y"):
        assert expected_fragment in summary
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def test_render_text_summary_noise_count():
    """The text summary mentions noise in some letter case."""
    summary = render_text_summary(_make_profile())
    # A case-insensitive search also covers the exact upper-case "NOISE" spelling.
    assert "noise" in summary.lower()
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from honest_eda.runner import ProfileResult, RelationResult, run_against_target
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_real_signal_has_strong(real_signal_df, mock_nullbic):
    """Profiling the ``real_signal_df`` fixture yields at least one STRONG relation."""
    profile_result = run_against_target(real_signal_df, "y")

    assert isinstance(profile_result, ProfileResult)
    strong_count = len(profile_result.strong)
    assert strong_count >= 1
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_noise_df_no_strong(noise_df, mock_nullbic):
    """Profiling the ``noise_df`` fixture yields no STRONG relations and some noise."""
    profile_result = run_against_target(noise_df, "y")

    assert profile_result.strong == []
    assert profile_result.noise_count > 0
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_leakage_df_suspects(leakage_df, mock_nullbic):
    """Profiling the ``leakage_df`` fixture flags at least one leakage suspect."""
    profile_result = run_against_target(leakage_df, "y")
    suspect_count = len(profile_result.leakage_suspects)
    assert suspect_count >= 1
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_nullbic_crash_skipped(real_signal_df, monkeypatch, mock_nullbic):
    """A run that yields ``None`` on its first call must not break profiling."""
    import honest_eda.runner as runner_mod

    real_run_one = runner_mod._run_one
    seen_calls = []

    def first_call_fails(df, features, target, *args, **kwargs):
        # Simulate a skipped run on the first invocation only; delegate afterwards.
        seen_calls.append(features)
        if len(seen_calls) == 1:
            return None
        return real_run_one(df, features, target, *args, **kwargs)

    monkeypatch.setattr(runner_mod, "_run_one", first_call_fails)

    profile_result = run_against_target(real_signal_df, "y")
    assert isinstance(profile_result, ProfileResult)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_n_columns_and_n_relations(real_signal_df, mock_nullbic):
    """Column count matches the frame and the verdict buckets add up to ``n_relations``."""
    profile_result = run_against_target(real_signal_df, "y")

    expected_columns = len(real_signal_df.columns)
    assert profile_result.n_columns == expected_columns
    assert profile_result.n_relations >= 0

    # Every relation must land in exactly one of STRONG / WEAK / NOISE.
    bucket_sizes = (
        len(profile_result.strong),
        len(profile_result.weak),
        profile_result.noise_count,
    )
    assert sum(bucket_sizes) == profile_result.n_relations
|