honest-eda 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 honest-eda contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,174 @@
1
+ Metadata-Version: 2.4
2
+ Name: honest-eda
3
+ Version: 0.1.0
4
+ Summary: EDA that admits when there's no signal — wraps nullbic ΔBIC falsification
5
+ Author: honest-eda contributors
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/glogwa68/honest-eda
8
+ Project-URL: Issues, https://github.com/glogwa68/honest-eda/issues
9
+ Project-URL: Related, https://github.com/glogwa68/nullbic
10
+ Keywords: eda,exploratory-data-analysis,symbolic-regression,nullbic,falsification,data-leakage
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: nullbic
23
+ Requires-Dist: pandas>=2.0
24
+ Requires-Dist: jinja2>=3.0
25
+ Requires-Dist: plotly>=5.0
26
+ Requires-Dist: typer>=0.12
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest>=7.0; extra == "dev"
29
+ Requires-Dist: pytest-cov; extra == "dev"
30
+ Dynamic: license-file
31
+
32
+ # honest-eda
33
+
34
+ **EDA that admits when there's no signal.**
35
+
36
+ [![PyPI](https://img.shields.io/pypi/v/honest-eda)](https://pypi.org/project/honest-eda/)
37
+ [![Python](https://img.shields.io/pypi/pyversions/honest-eda)](https://pypi.org/project/honest-eda/)
38
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
39
+ [![CI](https://github.com/glogwa68/honest-eda/actions/workflows/ci.yml/badge.svg)](https://github.com/glogwa68/honest-eda/actions)
40
+
41
+ > ydata-profiling shows you 200 correlations. 195 are noise. honest-eda shows you the 5 that survive shuffling.
42
+
43
+ ---
44
+
45
+ ## The Problem
46
+
47
+ pandas-profiling, ydata-profiling, and sweetviz report everything. Every correlation, every association, every distribution shift. The result: 200 "insights" in your report, of which 195 are pure chance — artifacts of finite sample size, collinearity, or subtle target leakage.
48
+
49
+ You end up fitting models on noise, wasting compute on AutoML pipelines that have nothing real to learn, and shipping features that degrade on new data.
50
+
51
+ ## The Solution
52
+
53
+ `honest-eda` runs a falsification test on every feature-target pair using [nullbic](https://github.com/glogwa68/nullbic) — a symbolic regression library with built-in self-falsification via ΔBIC.
54
+
55
+ For each relation, the verdict is computed against three null hypotheses:
56
+
57
+ - vs. constant model
58
+ - vs. linear model
59
+ - vs. target-shuffled distribution
60
+
61
+ Relations that survive all three are reported as STRONG; relations that beat only the constant model are reported as WEAK. Everything else (NOISE) is counted in the summary but hidden.
62
+
63
+ ---
64
+
65
+ ## Install
66
+
67
+ ```bash
68
+ pip install honest-eda
69
+ ```
70
+
71
+ ---
72
+
73
+ ## Usage
74
+
75
+ ### Python
76
+
77
+ ```python
78
+ from honest_eda import profile, check
79
+
80
+ # Generate HTML report — only real signals shown
81
+ result = profile("data.csv", target="y", output="report.html")
82
+ print(f"Real signal patterns: {len(result.strong)}")
83
+ print(f"Fake correlations rejected: {result.noise_count}")
84
+ print(f"Leakage suspects: {len(result.leakage_suspects)}")
85
+
86
+ # CI mode — exit 1 if no real signal
87
+ if not check("data.csv", target="y", min_strong=1):
88
+ raise ValueError("No real signal in this dataset!")
89
+ ```
90
+
91
+ ### CLI
92
+
93
+ ```bash
94
+ honest-eda profile data.csv --target=y --output=report.html
95
+ honest-eda check data.csv --target=y --min-strong=2 # exit 1 if fails
96
+ ```
97
+
98
+ ---
99
+
100
+ ## What You Get
101
+
102
+ ```
103
+ HONEST EDA REPORT
104
+ ─────────────────
105
+ Columns scanned: 47
106
+ Relations tested: 1081
107
+
108
+ REAL signal (STRONG): 6
109
+ • age × tenure → churn
110
+ • monthly_charges → churn
111
+
112
+ WEAK signal: 12
113
+ NOISE rejected: 1063 (hidden)
114
+
115
+ Linear-baseline-only features: 23
116
+ Leakage suspects: 2 ⚠
117
+ ```
118
+
119
+ No noise, no false confidence. Only findings that hold up under falsification.
120
+
121
+ ---
122
+
123
+ ## Killer Features
124
+
125
+ **Leakage detector** — When a feature's z-score vs. shuffled target drops below −15, honest-eda flags it as a probable data leak. Catches target-encoded columns, future-information leaks, and accidental label copies before they corrupt your model evaluation.
126
+
127
+ **Linear vs. symbolic dichotomy** — honest-eda tells you explicitly when a linear model would suffice. If symbolic regression finds no improvement over OLS, the feature is labeled "linear-baseline-only". No need to run a neural net to discover this.
128
+
129
+ **CI mode** — `honest-eda check` exits with code 1 if the minimum number of strong signals is not met. Drop it in your CI pipeline to block training runs on datasets with no real predictive content.
130
+
131
+ **Pre-modeling triage** — Know whether there is exploitable signal before you launch XGBoost or AutoML. Saves hours of compute and avoids the "model trained fine but generalizes to nothing" postmortem.
132
+
133
+ ---
134
+
135
+ ## How It Works
136
+
137
+ For each numeric feature paired with the target:
138
+
139
+ 1. `nullbic.discover` fits a symbolic expression and records the BIC improvement over the null model.
140
+ 2. The verdict is assigned:
141
+ - **STRONG** — beats constant + linear baseline + all shuffled-target permutations.
142
+ - **WEAK** — beats constant baseline only.
143
+ - **NOISE** — fails to beat the constant. Excluded from the report.
144
+ 3. Leakage is flagged when `z_vs_shuffled < -15`, indicating the feature carries near-perfect information about the target.
145
+
146
+ The HTML report contains only STRONG and WEAK relations. NOISE is counted and disclosed in the summary, but not displayed.
147
+
148
+ ---
149
+
150
+ ## Comparison
151
+
152
+ | | honest-eda | ydata-profiling | sweetviz |
153
+ |-------------------------------|:----------:|:---------------:|:--------:|
154
+ | Tests vs shuffled target | ✅ | ❌ | ❌ |
155
+ | Reports only real signal | ✅ | ❌ | ❌ |
156
+ | Symbolic formula extraction | ✅ | ❌ | ❌ |
157
+ | Leakage detection | ✅ | ❌ | ❌ |
158
+ | CI mode (exit code) | ✅ | ❌ | ❌ |
159
+
160
+ ---
161
+
162
+ ## License
163
+
164
+ MIT. See [LICENSE](LICENSE).
165
+
166
+ ---
167
+
168
+ ## Citation / Related
169
+
170
+ honest-eda is built on top of **nullbic**, a library for symbolic regression with automatic ΔBIC falsification:
171
+
172
+ - Repository: https://github.com/glogwa68/nullbic
173
+
174
+ If you use honest-eda in published work, please also cite nullbic.
@@ -0,0 +1,143 @@
1
+ # honest-eda
2
+
3
+ **EDA that admits when there's no signal.**
4
+
5
+ [![PyPI](https://img.shields.io/pypi/v/honest-eda)](https://pypi.org/project/honest-eda/)
6
+ [![Python](https://img.shields.io/pypi/pyversions/honest-eda)](https://pypi.org/project/honest-eda/)
7
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
8
+ [![CI](https://github.com/glogwa68/honest-eda/actions/workflows/ci.yml/badge.svg)](https://github.com/glogwa68/honest-eda/actions)
9
+
10
+ > ydata-profiling shows you 200 correlations. 195 are noise. honest-eda shows you the 5 that survive shuffling.
11
+
12
+ ---
13
+
14
+ ## The Problem
15
+
16
+ pandas-profiling, ydata-profiling, and sweetviz report everything. Every correlation, every association, every distribution shift. The result: 200 "insights" in your report, of which 195 are pure chance — artifacts of finite sample size, collinearity, or subtle target leakage.
17
+
18
+ You end up fitting models on noise, wasting compute on AutoML pipelines that have nothing real to learn, and shipping features that degrade on new data.
19
+
20
+ ## The Solution
21
+
22
+ `honest-eda` runs a falsification test on every feature-target pair using [nullbic](https://github.com/glogwa68/nullbic) — a symbolic regression library with built-in self-falsification via ΔBIC.
23
+
24
+ For each relation, the verdict is computed against three null hypotheses:
25
+
26
+ - vs. constant model
27
+ - vs. linear model
28
+ - vs. target-shuffled distribution
29
+
30
+ Relations that survive all three are reported as STRONG; relations that beat only the constant model are reported as WEAK. Everything else (NOISE) is counted in the summary but hidden.
31
+
32
+ ---
33
+
34
+ ## Install
35
+
36
+ ```bash
37
+ pip install honest-eda
38
+ ```
39
+
40
+ ---
41
+
42
+ ## Usage
43
+
44
+ ### Python
45
+
46
+ ```python
47
+ from honest_eda import profile, check
48
+
49
+ # Generate HTML report — only real signals shown
50
+ result = profile("data.csv", target="y", output="report.html")
51
+ print(f"Real signal patterns: {len(result.strong)}")
52
+ print(f"Fake correlations rejected: {result.noise_count}")
53
+ print(f"Leakage suspects: {len(result.leakage_suspects)}")
54
+
55
+ # CI mode — exit 1 if no real signal
56
+ if not check("data.csv", target="y", min_strong=1):
57
+ raise ValueError("No real signal in this dataset!")
58
+ ```
59
+
60
+ ### CLI
61
+
62
+ ```bash
63
+ honest-eda profile data.csv --target=y --output=report.html
64
+ honest-eda check data.csv --target=y --min-strong=2 # exit 1 if fails
65
+ ```
66
+
67
+ ---
68
+
69
+ ## What You Get
70
+
71
+ ```
72
+ HONEST EDA REPORT
73
+ ─────────────────
74
+ Columns scanned: 47
75
+ Relations tested: 1081
76
+
77
+ REAL signal (STRONG): 6
78
+ • age × tenure → churn
79
+ • monthly_charges → churn
80
+
81
+ WEAK signal: 12
82
+ NOISE rejected: 1063 (hidden)
83
+
84
+ Linear-baseline-only features: 23
85
+ Leakage suspects: 2 ⚠
86
+ ```
87
+
88
+ No noise, no false confidence. Only findings that hold up under falsification.
89
+
90
+ ---
91
+
92
+ ## Killer Features
93
+
94
+ **Leakage detector** — When a feature's z-score vs. shuffled target drops below −15, honest-eda flags it as a probable data leak. Catches target-encoded columns, future-information leaks, and accidental label copies before they corrupt your model evaluation.
95
+
96
+ **Linear vs. symbolic dichotomy** — honest-eda tells you explicitly when a linear model would suffice. If symbolic regression finds no improvement over OLS, the feature is labeled "linear-baseline-only". No need to run a neural net to discover this.
97
+
98
+ **CI mode** — `honest-eda check` exits with code 1 if the minimum number of strong signals is not met. Drop it in your CI pipeline to block training runs on datasets with no real predictive content.
99
+
100
+ **Pre-modeling triage** — Know whether there is exploitable signal before you launch XGBoost or AutoML. Saves hours of compute and avoids the "model trained fine but generalizes to nothing" postmortem.
101
+
102
+ ---
103
+
104
+ ## How It Works
105
+
106
+ For each numeric feature paired with the target:
107
+
108
+ 1. `nullbic.discover` fits a symbolic expression and records the BIC improvement over the null model.
109
+ 2. The verdict is assigned:
110
+ - **STRONG** — beats constant + linear baseline + all shuffled-target permutations.
111
+ - **WEAK** — beats constant baseline only.
112
+ - **NOISE** — fails to beat the constant. Excluded from the report.
113
+ 3. Leakage is flagged when `z_vs_shuffled < -15`, indicating the feature carries near-perfect information about the target.
114
+
115
+ The HTML report contains only STRONG and WEAK relations. NOISE is counted and disclosed in the summary, but not displayed.
116
+
117
+ ---
118
+
119
+ ## Comparison
120
+
121
+ | | honest-eda | ydata-profiling | sweetviz |
122
+ |-------------------------------|:----------:|:---------------:|:--------:|
123
+ | Tests vs shuffled target | ✅ | ❌ | ❌ |
124
+ | Reports only real signal | ✅ | ❌ | ❌ |
125
+ | Symbolic formula extraction | ✅ | ❌ | ❌ |
126
+ | Leakage detection | ✅ | ❌ | ❌ |
127
+ | CI mode (exit code) | ✅ | ❌ | ❌ |
128
+
129
+ ---
130
+
131
+ ## License
132
+
133
+ MIT. See [LICENSE](LICENSE).
134
+
135
+ ---
136
+
137
+ ## Citation / Related
138
+
139
+ honest-eda is built on top of **nullbic**, a library for symbolic regression with automatic ΔBIC falsification:
140
+
141
+ - Repository: https://github.com/glogwa68/nullbic
142
+
143
+ If you use honest-eda in published work, please also cite nullbic.
@@ -0,0 +1,8 @@
1
+ from __future__ import annotations
2
+
3
+ from honest_eda.profile import check, profile
4
+ from honest_eda.runner import ProfileResult, RelationResult
5
+
6
# Package version string.
__version__ = "0.1.0"

# Names re-exported as the package's public API.
__all__ = ["profile", "check", "ProfileResult", "RelationResult", "__version__"]
@@ -0,0 +1,60 @@
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+
5
+ import typer
6
+
7
+ from honest_eda.profile import check as _check
8
+ from honest_eda.profile import profile as _profile
9
+ from honest_eda.report import render_text_summary
10
+
11
# Typer application object that the `profile` and `check` commands below are
# registered on; it prints help when invoked without arguments.
app = typer.Typer(no_args_is_help=True, help="EDA that admits when there's no signal.")
12
+
13
+
14
@app.command("profile")
def profile_cmd(
    data: str = typer.Argument(..., help="CSV path"),
    target: str = typer.Option(..., "--target", "-t", help="Target column name"),
    output: str = typer.Option("report.html", "--output", "-o", help="HTML output path"),
    gens: int = typer.Option(40, "--gens", help="nullbic n_generations"),
    pop: int = typer.Option(200, "--pop", help="nullbic pop_size"),
    depth: int = typer.Option(4, "--depth", help="nullbic max_depth"),
    seed: int = typer.Option(0, "--seed", help="Random seed"),
    show_noise: bool = typer.Option(False, "--show-noise", help="Include noise in HTML"),
    quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress stdout"),
) -> None:
    """Run EDA profile and write HTML report."""
    # The docstring above is kept verbatim: Typer shows it as the command's
    # help text, so rewording it would change CLI output.
    search_options = {
        "n_generations": gens,
        "pop_size": pop,
        "max_depth": depth,
        "seed": seed,
        # The CLI flag is phrased positively, the library argument negatively.
        "hide_noise": not show_noise,
    }
    result = _profile(data, target, output, **search_options)

    if quiet:
        return

    # Print the plain-text summary followed by the report location.
    typer.echo(render_text_summary(result))
    typer.echo(f"Report written to: {output}")
40
+
41
+
42
@app.command("check")
def check_cmd(
    data: str = typer.Argument(..., help="CSV path"),
    target: str = typer.Option(..., "--target", "-t", help="Target column name"),
    min_strong: int = typer.Option(1, "--min-strong", help="Minimum STRONG relations required"),
    gens: int = typer.Option(40, "--gens"),
    pop: int = typer.Option(200, "--pop"),
    depth: int = typer.Option(4, "--depth"),
    seed: int = typer.Option(0, "--seed"),
    quiet: bool = typer.Option(False, "--quiet", "-q"),
) -> None:
    """Exit 0 if enough STRONG signals found, exit 1 otherwise."""
    # The docstring above is kept verbatim: Typer shows it as the command's
    # help text, so rewording it would change CLI output.
    ok = _check(
        data,
        target,
        min_strong=min_strong,
        n_generations=gens,
        pop_size=pop,
        max_depth=depth,
        seed=seed,
    )

    if ok:
        if not quiet:
            typer.echo(f"PASS: found >= {min_strong} STRONG relation(s) for '{target}'.")
        sys.exit(0)

    # Failure message goes to stderr; the exit status is what CI relies on.
    if not quiet:
        typer.echo(f"FAIL: fewer than {min_strong} STRONG relation(s) for '{target}'.", err=True)
    sys.exit(1)
@@ -0,0 +1,22 @@
1
+ from __future__ import annotations
2
+
3
+ from honest_eda.runner import RelationResult
4
+
5
# A relation whose z_vs_shuffled is strictly below this value is treated as a
# leakage suspect by the helpers in this module.
LEAKAGE_Z_THRESHOLD = -15.0
6
+
7
+
8
def is_leakage_suspect(rel: RelationResult) -> bool:
    """Return True when the relation's z-score is strictly below the leakage threshold."""
    z_score = rel.z_vs_shuffled
    return z_score < LEAKAGE_Z_THRESHOLD
10
+
11
+
12
def flag_leakage(results: list[RelationResult]) -> list[RelationResult]:
    """Return, in input order, the relations that `is_leakage_suspect` flags."""
    suspects: list[RelationResult] = []
    for candidate in results:
        if is_leakage_suspect(candidate):
            suspects.append(candidate)
    return suspects
14
+
15
+
16
def explain_leakage(rel: RelationResult) -> str:
    """Build a human-readable warning describing why a relation looks like leakage.

    The message names the feature(s), the observed z-score (two decimals),
    the threshold in use, and the target column to check against.
    """
    feature_list = ", ".join(rel.features)
    finding = (
        f"Leakage suspect: feature(s) [{feature_list}] achieve z={rel.z_vs_shuffled:.2f} "
        f"vs shuffled baseline (threshold: {LEAKAGE_Z_THRESHOLD})."
    )
    advice = f"Verify [{feature_list}] is not derived from '{rel.target}'."
    return " ".join((finding, advice))
@@ -0,0 +1,46 @@
1
+ from __future__ import annotations
2
+
3
+ import pandas as pd
4
+
5
+ from honest_eda import report as _report_mod
6
+ from honest_eda.runner import ProfileResult, run_against_target
7
+
8
+
9
def profile(
    data: str | pd.DataFrame,
    target: str,
    output: str = "report.html",
    *,
    n_generations: int = 40,
    pop_size: int = 200,
    max_depth: int = 4,
    seed: int = 0,
    hide_noise: bool = True,
) -> ProfileResult:
    """Run nullbic-backed EDA and render an HTML report.

    Args:
        data: A DataFrame, or a CSV location given as a string or any
            ``os.PathLike`` (e.g. ``pathlib.Path``), which is loaded with
            ``pandas.read_csv``.
        target: Name of the target column.
        output: Path the HTML report is written to.
        n_generations: Forwarded to ``run_against_target``.
        pop_size: Forwarded to ``run_against_target``.
        max_depth: Forwarded to ``run_against_target``.
        seed: Forwarded to ``run_against_target``.
        hide_noise: Forwarded to the HTML renderer.

    Returns:
        The ``ProfileResult`` produced by ``run_against_target``.
    """
    import os

    # Path objects used to fall through and be treated as a DataFrame,
    # failing later with an unrelated error; load them like string paths.
    if isinstance(data, (str, os.PathLike)):
        data = pd.read_csv(data)

    result = run_against_target(
        data,
        target,
        n_generations=n_generations,
        pop_size=pop_size,
        max_depth=max_depth,
        seed=seed,
    )
    _report_mod.render_html(result, output, hide_noise=hide_noise)
    return result
33
+
34
+
35
def check(
    data: str | pd.DataFrame,
    target: str,
    *,
    min_strong: int = 1,
    **kwargs,
) -> bool:
    """CI gate: True if the profile finds at least min_strong STRONG relations.

    Args:
        data: A DataFrame, or a CSV location given as a string or any
            ``os.PathLike`` (e.g. ``pathlib.Path``), which is loaded with
            ``pandas.read_csv``.
        target: Name of the target column.
        min_strong: Minimum number of STRONG relations required to pass.
        **kwargs: Forwarded unchanged to ``run_against_target``.

    Returns:
        True when ``len(result.strong) >= min_strong``, otherwise False.
        No HTML report is written.
    """
    import os

    # Path objects used to fall through and be treated as a DataFrame,
    # failing later with an unrelated error; load them like string paths.
    if isinstance(data, (str, os.PathLike)):
        data = pd.read_csv(data)

    result = run_against_target(data, target, **kwargs)
    return len(result.strong) >= min_strong
@@ -0,0 +1,5 @@
1
+ from __future__ import annotations
2
+
3
+ from .report import render_html, render_text_summary
4
+
5
# Public names of the report sub-package, re-exported from .report.
__all__ = ["render_html", "render_text_summary"]
@@ -0,0 +1,146 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime, timezone
4
+ from pathlib import Path
5
+
6
+ from jinja2 import Environment, BaseLoader
7
+
8
+ from honest_eda.leakage import LEAKAGE_Z_THRESHOLD
9
+ from honest_eda.runner import ProfileResult
10
+
11
# Jinja2 source for the HTML report. render_html renders it with the variables
# `result`, `hide_noise`, `generated_at` and `leakage_threshold`. When
# `hide_noise` is false only a heading carrying the noise count is added;
# individual noise relations are not listed.
_TEMPLATE = """\
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>honest-eda — {{ result.target }}</title>
<style>
body{font-family:system-ui,sans-serif;margin:2rem;background:#fafafa;color:#111}
h1,h2{color:#1a1a2e}
table{border-collapse:collapse;width:100%;margin-bottom:1.5rem}
th,td{border:1px solid #ccc;padding:.4rem .8rem;text-align:left;font-size:.9em}
th{background:#1a1a2e;color:#fff}
tr.strong{background:#d4edda}tr.weak{background:#fff3cd}
.badge{border-radius:4px;padding:2px 6px;font-size:.8em;font-weight:700}
.b-STRONG{background:#198754;color:#fff}.b-WEAK{background:#ffc107;color:#000}
.b-NOISE{background:#dc3545;color:#fff}
.warn{color:#856404;background:#fff3cd;border:1px solid #ffc107;padding:.5rem;border-radius:4px}
</style>
</head>
<body>
<h1>honest-eda &mdash; target: <code>{{ result.target }}</code></h1>
<p>
Generated: {{ generated_at }} &nbsp;|&nbsp;
Columns: {{ result.n_columns }} &nbsp;|&nbsp;
Relations tested: {{ result.n_relations }} &nbsp;|&nbsp;
<strong>Strong: {{ result.strong | length }}</strong> &nbsp;|&nbsp;
Weak: {{ result.weak | length }} &nbsp;|&nbsp;
Noise: {{ result.noise_count }}
</p>

{% if result.leakage_suspects %}
<div class="warn">
<strong>Leakage suspects</strong> (z &lt; {{ leakage_threshold }}):<br>
{% for r in result.leakage_suspects %}
&bull; {{ r.features | join(", ") }} &mdash; z={{ "%.2f" | format(r.z_vs_shuffled) }}<br>
{% endfor %}
</div>
{% endif %}

{% if result.linear_only %}
<h2>Linear-only features</h2>
<p>{{ result.linear_only | join(", ") }}</p>
{% endif %}

<h2>Strong relations</h2>
{% if result.strong %}
<table>
<tr><th>Features</th><th>Formula</th><th>Verdict</th><th>&#916;BIC const</th><th>&#916;BIC linear</th><th>z vs shuffled</th></tr>
{% for r in result.strong %}
<tr class="strong">
<td>{{ r.features | join(", ") }}</td>
<td><code>{{ r.formula }}</code></td>
<td><span class="badge b-{{ r.verdict }}">{{ r.verdict }}</span></td>
<td>{{ "%.3f" | format(r.delta_bic_const) }}</td>
<td>{{ "%.3f" | format(r.delta_bic_linear) }}</td>
<td>{{ "%.3f" | format(r.z_vs_shuffled) }}</td>
</tr>
{% endfor %}
</table>
{% else %}<p><em>None found.</em></p>{% endif %}

<h2>Weak relations</h2>
{% if result.weak %}
<table>
<tr><th>Features</th><th>Formula</th><th>Verdict</th><th>&#916;BIC const</th><th>&#916;BIC linear</th><th>z vs shuffled</th></tr>
{% for r in result.weak %}
<tr class="weak">
<td>{{ r.features | join(", ") }}</td>
<td><code>{{ r.formula }}</code></td>
<td><span class="badge b-{{ r.verdict }}">{{ r.verdict }}</span></td>
<td>{{ "%.3f" | format(r.delta_bic_const) }}</td>
<td>{{ "%.3f" | format(r.delta_bic_linear) }}</td>
<td>{{ "%.3f" | format(r.z_vs_shuffled) }}</td>
</tr>
{% endfor %}
</table>
{% else %}<p><em>None found.</em></p>{% endif %}

{% if not hide_noise %}
<h2>Noise ({{ result.noise_count }} relation(s) rejected)</h2>
{% endif %}

</body>
</html>
"""
96
+
97
+
98
def render_html(
    result: ProfileResult,
    output_path: str | Path,
    *,
    hide_noise: bool = True,
) -> Path:
    """Render ProfileResult as a self-contained HTML file.

    Args:
        result: Profile to render.
        output_path: Destination file; missing parent directories are created.
        hide_noise: When False, the template adds a heading with the noise count.

    Returns:
        The resolved absolute path the HTML was written to.
    """
    output_path = Path(output_path).resolve()
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Target/feature names and formulas come from user-supplied data (CSV
    # headers), so they must be HTML-escaped when interpolated. Literal
    # markup in the template itself is unaffected by autoescaping.
    env = Environment(loader=BaseLoader(), autoescape=True)
    tmpl = env.from_string(_TEMPLATE)

    # Timezone-aware UTC timestamp, second precision.
    generated_at = datetime.now(timezone.utc).isoformat(timespec="seconds")
    html = tmpl.render(
        result=result,
        hide_noise=hide_noise,
        generated_at=generated_at,
        leakage_threshold=LEAKAGE_Z_THRESHOLD,
    )
    output_path.write_text(html, encoding="utf-8")
    return output_path
118
+
119
+
120
def render_text_summary(result: ProfileResult) -> str:
    """Return a plain-text summary of a ProfileResult.

    The summary lists the header counts, one bullet per STRONG and WEAK
    relation, the noise count, the linear-only features (or ``none``) and
    one line per leakage suspect, joined with newlines.
    """

    def _bullet(rel) -> str:
        # One bullet line: features, target and the discovered formula.
        return f" * {', '.join(rel.features)} -> {result.target} formula: {rel.formula}"

    strong_bullets = [_bullet(rel) for rel in result.strong]
    weak_bullets = [_bullet(rel) for rel in result.weak]
    leak_bullets = [
        f" ! {', '.join(rel.features)} z={rel.z_vs_shuffled:.2f}"
        for rel in result.leakage_suspects
    ]
    linear_text = ", ".join(result.linear_only) or "none"

    parts: list[str] = [
        "HONEST EDA REPORT",
        "─" * 17,
        f"Target: {result.target}",
        f"Columns scanned: {result.n_columns}",
        f"Relations tested: {result.n_relations}",
        "",
        f"REAL signal (STRONG): {len(result.strong)}",
        *strong_bullets,
        "",
        f"WEAK signal: {len(result.weak)}",
        *weak_bullets,
        "",
        f"NOISE rejected: {result.noise_count}",
        "",
        f"Linear-only features: {linear_text}",
        "",
        f"Leakage suspects: {len(result.leakage_suspects)}",
        *leak_bullets,
    ]
    return "\n".join(parts)
@@ -0,0 +1,124 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import random
5
+ from dataclasses import dataclass, field
6
+
7
+ import pandas as pd
8
+ from nullbic import Dataset, discover
9
+
10
# Module-level logger; _run_one uses it to report failed nullbic runs.
logger = logging.getLogger(__name__)
11
+
12
+
13
@dataclass
class RelationResult:
    # Outcome of one nullbic run for a single feature set against the target.
    target: str  # name of the target column
    features: list[str]  # feature column name(s) used in this run
    formula: str  # formula reported by nullbic for this run
    verdict: str  # verdict label; this module compares it to "STRONG", "WEAK" and "NOISE"
    delta_bic_const: float  # the nullbic report's delta_bic_const, cast to float
    delta_bic_linear: float  # the nullbic report's delta_bic_linear, cast to float
    z_vs_shuffled: float  # the nullbic report's z_vs_shuffled; values below -15 mark leakage suspects
22
+
23
+
24
@dataclass
class ProfileResult:
    # Aggregate outcome of profiling every numeric feature against one target.
    n_columns: int  # total number of columns in the input frame
    n_relations: int  # number of runs that produced a result (failed runs are not counted)
    target: str  # name of the target column
    strong: list[RelationResult]  # relations whose verdict is "STRONG"
    weak: list[RelationResult]  # relations whose verdict is "WEAK"
    noise_count: int  # number of relations whose verdict is "NOISE"
    linear_only: list[str]  # single features flagged as needing only a linear baseline
    leakage_suspects: list[RelationResult]  # relations with z_vs_shuffled below -15
34
+
35
+
36
+ def _select_numeric(df: pd.DataFrame) -> list[str]:
37
+ return df.select_dtypes(include="number").columns.tolist()
38
+
39
+
40
+ def _verdict_str(report) -> str:
41
+ try:
42
+ return report.verdict.name
43
+ except AttributeError:
44
+ if not report.is_real_signal():
45
+ return "NOISE"
46
+ # distinguish STRONG vs WEAK via delta_bic_linear threshold
47
+ return "STRONG" if report.delta_bic_linear < -10 else "WEAK"
48
+
49
+
50
def _run_one(
    df: pd.DataFrame,
    features: list[str],
    target: str,
    n_generations: int,
    pop_size: int,
    max_depth: int,
    seed: int,
) -> RelationResult | None:
    """Run nullbic once for *features* against *target*.

    Returns a ``RelationResult`` built from the nullbic report, or ``None``
    when anything in the run raises; the failure is logged as a warning.
    """
    # Reseed Python's global RNG before every run.
    # NOTE(review): presumably nullbic draws from `random` — confirm.
    random.seed(seed)

    search_options = {
        "n_generations": n_generations,
        "pop_size": pop_size,
        "max_depth": max_depth,
    }
    try:
        subset = df[features + [target]]
        dataset = Dataset.from_pandas(subset, target=target)
        report = discover(dataset, **search_options)
        outcome = RelationResult(
            target=target,
            features=features,
            formula=report.formula,
            verdict=_verdict_str(report),
            delta_bic_const=float(report.delta_bic_const),
            delta_bic_linear=float(report.delta_bic_linear),
            z_vs_shuffled=float(report.z_vs_shuffled),
        )
    except Exception as exc:
        # Best effort: one failing feature set must not abort the whole profile.
        logger.warning("nullbic failed for features=%s target=%s: %s", features, target, exc)
        return None
    return outcome
75
+
76
+
77
def run_against_target(
    df: pd.DataFrame,
    target: str,
    *,
    n_generations: int = 40,
    pop_size: int = 200,
    max_depth: int = 4,
    max_features_per_run: int | None = None,
    seed: int = 0,
) -> ProfileResult:
    """Profile every numeric feature of *df* against *target*.

    Each numeric column other than the target is tested on its own; when
    more than one such column exists, one extra run uses all of them together.

    Args:
        df: Input data.
        target: Name of the target column; must exist in *df*.
        n_generations: Passed to each nullbic run.
        pop_size: Passed to each nullbic run.
        max_depth: Passed to each nullbic run.
        max_features_per_run: If given, only the first N numeric feature
            columns are considered.
        seed: Seed applied before each run.

    Returns:
        A ``ProfileResult`` summarising the successful runs.

    Raises:
        ValueError: If *target* is not a column of *df*.
    """
    # Without this check every per-feature run fails on the missing column,
    # the errors are swallowed as warnings, and the caller receives an empty
    # "no signal" profile instead of an error.
    if target not in df.columns:
        raise ValueError(f"target column {target!r} not found in data")

    numeric_cols = [c for c in _select_numeric(df) if c != target]
    if max_features_per_run is not None:
        numeric_cols = numeric_cols[:max_features_per_run]

    results: list[RelationResult] = []

    # One run per individual feature.
    for col in numeric_cols:
        res = _run_one(df, [col], target, n_generations, pop_size, max_depth, seed)
        if res is not None:
            results.append(res)

    # One combined run over all features, when there is more than one.
    if len(numeric_cols) > 1:
        all_res = _run_one(df, numeric_cols, target, n_generations, pop_size, max_depth, seed)
        if all_res is not None:
            results.append(all_res)

    strong = [r for r in results if r.verdict == "STRONG"]
    weak = [r for r in results if r.verdict == "WEAK"]
    noise_count = sum(1 for r in results if r.verdict == "NOISE")

    # Same value as honest_eda.leakage.LEAKAGE_Z_THRESHOLD; that module imports
    # this one, so the constant cannot be imported here without a cycle.
    leakage_suspects = [r for r in results if r.z_vs_shuffled < -15]

    # Single features that clearly beat the constant model (delta_bic_const
    # below -10) while sitting within 5 BIC units of the linear baseline.
    linear_only = [
        r.features[0]
        for r in results
        if len(r.features) == 1 and abs(r.delta_bic_linear) < 5 and r.delta_bic_const < -10
    ]

    return ProfileResult(
        n_columns=len(df.columns),
        n_relations=len(results),
        target=target,
        strong=strong,
        weak=weak,
        noise_count=noise_count,
        linear_only=linear_only,
        leakage_suspects=leakage_suspects,
    )
@@ -0,0 +1,174 @@
1
+ Metadata-Version: 2.4
2
+ Name: honest-eda
3
+ Version: 0.1.0
4
+ Summary: EDA that admits when there's no signal — wraps nullbic ΔBIC falsification
5
+ Author: honest-eda contributors
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/glogwa68/honest-eda
8
+ Project-URL: Issues, https://github.com/glogwa68/honest-eda/issues
9
+ Project-URL: Related, https://github.com/glogwa68/nullbic
10
+ Keywords: eda,exploratory-data-analysis,symbolic-regression,nullbic,falsification,data-leakage
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: nullbic
23
+ Requires-Dist: pandas>=2.0
24
+ Requires-Dist: jinja2>=3.0
25
+ Requires-Dist: plotly>=5.0
26
+ Requires-Dist: typer>=0.12
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest>=7.0; extra == "dev"
29
+ Requires-Dist: pytest-cov; extra == "dev"
30
+ Dynamic: license-file
31
+
32
+ # honest-eda
33
+
34
+ **EDA that admits when there's no signal.**
35
+
36
+ [![PyPI](https://img.shields.io/pypi/v/honest-eda)](https://pypi.org/project/honest-eda/)
37
+ [![Python](https://img.shields.io/pypi/pyversions/honest-eda)](https://pypi.org/project/honest-eda/)
38
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
39
+ [![CI](https://github.com/glogwa68/honest-eda/actions/workflows/ci.yml/badge.svg)](https://github.com/glogwa68/honest-eda/actions)
40
+
41
+ > ydata-profiling shows you 200 correlations. 195 are noise. honest-eda shows you the 5 that survive shuffling.
42
+
43
+ ---
44
+
45
+ ## The Problem
46
+
47
+ pandas-profiling, ydata-profiling, and sweetviz report everything. Every correlation, every association, every distribution shift. The result: 200 "insights" in your report, of which 195 are pure chance — artifacts of finite sample size, collinearity, or subtle target leakage.
48
+
49
+ You end up fitting models on noise, wasting compute on AutoML pipelines that have nothing real to learn, and shipping features that degrade on new data.
50
+
51
+ ## The Solution
52
+
53
+ `honest-eda` runs a falsification test on every feature-target pair using [nullbic](https://github.com/glogwa68/nullbic) — a symbolic regression library with built-in self-falsification via ΔBIC.
54
+
55
+ For each relation, the verdict is computed against three null hypotheses:
56
+
57
+ - vs. constant model
58
+ - vs. linear model
59
+ - vs. target-shuffled distribution
60
+
61
+ Relations that survive all three are reported as STRONG; relations that beat only the constant model are reported as WEAK. Everything else (NOISE) is counted in the summary but hidden.
62
+
63
+ ---
64
+
65
+ ## Install
66
+
67
+ ```bash
68
+ pip install honest-eda
69
+ ```
70
+
71
+ ---
72
+
73
+ ## Usage
74
+
75
+ ### Python
76
+
77
+ ```python
78
+ from honest_eda import profile, check
79
+
80
+ # Generate HTML report — only real signals shown
81
+ result = profile("data.csv", target="y", output="report.html")
82
+ print(f"Real signal patterns: {len(result.strong)}")
83
+ print(f"Fake correlations rejected: {result.noise_count}")
84
+ print(f"Leakage suspects: {len(result.leakage_suspects)}")
85
+
86
+ # CI mode — exit 1 if no real signal
87
+ if not check("data.csv", target="y", min_strong=1):
88
+ raise ValueError("No real signal in this dataset!")
89
+ ```
90
+
91
+ ### CLI
92
+
93
+ ```bash
94
+ honest-eda profile data.csv --target=y --output=report.html
95
+ honest-eda check data.csv --target=y --min-strong=2 # exit 1 if fails
96
+ ```
97
+
98
+ ---
99
+
100
+ ## What You Get
101
+
102
+ ```
103
+ HONEST EDA REPORT
104
+ ─────────────────
105
+ Columns scanned: 47
106
+ Relations tested: 1081
107
+
108
+ REAL signal (STRONG): 6
109
+ • age × tenure → churn
110
+ • monthly_charges → churn
111
+
112
+ WEAK signal: 12
113
+ NOISE rejected: 1063 (hidden)
114
+
115
+ Linear-baseline-only features: 23
116
+ Leakage suspects: 2 ⚠
117
+ ```
118
+
119
+ No noise, no false confidence. Only findings that hold up under falsification.
120
+
121
+ ---
122
+
123
+ ## Killer Features
124
+
125
+ **Leakage detector** — When a feature's z-score vs. shuffled target drops below −15, honest-eda flags it as a probable data leak. Catches target-encoded columns, future-information leaks, and accidental label copies before they corrupt your model evaluation.
126
+
127
+ **Linear vs. symbolic dichotomy** — honest-eda tells you explicitly when a linear model would suffice. If symbolic regression finds no improvement over OLS, the feature is labeled "linear-baseline-only". No need to run a neural net to discover this.
128
+
129
+ **CI mode** — `honest-eda check` exits with code 1 if the minimum number of strong signals is not met. Drop it in your CI pipeline to block training runs on datasets with no real predictive content.
130
+
131
+ **Pre-modeling triage** — Know whether there is exploitable signal before you launch XGBoost or AutoML. Saves hours of compute and avoids the "model trained fine but generalizes to nothing" postmortem.
132
+
133
+ ---
134
+
135
+ ## How It Works
136
+
137
+ For each numeric feature paired with the target:
138
+
139
+ 1. `nullbic.discover` fits a symbolic expression and records the BIC improvement over the null model.
140
+ 2. The verdict is assigned:
141
+ - **STRONG** — beats constant + linear baseline + all shuffled-target permutations.
142
+ - **WEAK** — beats constant baseline only.
143
+ - **NOISE** — fails to beat the constant. Excluded from the report.
144
+ 3. Leakage is flagged when `z_vs_shuffled < -15`, indicating the feature carries near-perfect information about the target.
145
+
146
+ The HTML report contains only STRONG and WEAK relations. NOISE is counted and disclosed in the summary, but not displayed.
147
+
148
+ ---
149
+
150
+ ## Comparison
151
+
152
+ | | honest-eda | ydata-profiling | sweetviz |
153
+ |-------------------------------|:----------:|:---------------:|:--------:|
154
+ | Tests vs shuffled target | ✅ | ❌ | ❌ |
155
+ | Reports only real signal | ✅ | ❌ | ❌ |
156
+ | Symbolic formula extraction | ✅ | ❌ | ❌ |
157
+ | Leakage detection | ✅ | ❌ | ❌ |
158
+ | CI mode (exit code) | ✅ | ❌ | ❌ |
159
+
160
+ ---
161
+
162
+ ## License
163
+
164
+ MIT. See [LICENSE](LICENSE).
165
+
166
+ ---
167
+
168
+ ## Citation / Related
169
+
170
+ honest-eda is built on top of **nullbic**, a library for symbolic regression with automatic ΔBIC falsification:
171
+
172
+ - Repository: https://github.com/glogwa68/nullbic
173
+
174
+ If you use honest-eda in published work, please also cite nullbic.
@@ -0,0 +1,21 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ honest_eda/__init__.py
5
+ honest_eda/cli.py
6
+ honest_eda/leakage.py
7
+ honest_eda/profile.py
8
+ honest_eda/runner.py
9
+ honest_eda.egg-info/PKG-INFO
10
+ honest_eda.egg-info/SOURCES.txt
11
+ honest_eda.egg-info/dependency_links.txt
12
+ honest_eda.egg-info/entry_points.txt
13
+ honest_eda.egg-info/requires.txt
14
+ honest_eda.egg-info/top_level.txt
15
+ honest_eda/report/__init__.py
16
+ honest_eda/report/report.py
17
+ tests/test_cli.py
18
+ tests/test_leakage.py
19
+ tests/test_profile.py
20
+ tests/test_report.py
21
+ tests/test_runner.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ honest-eda = honest_eda.cli:app
@@ -0,0 +1,9 @@
1
+ nullbic
2
+ pandas>=2.0
3
+ jinja2>=3.0
4
+ plotly>=5.0
5
+ typer>=0.12
6
+
7
+ [dev]
8
+ pytest>=7.0
9
+ pytest-cov
@@ -0,0 +1 @@
1
+ honest_eda
@@ -0,0 +1,49 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "honest-eda"
7
+ version = "0.1.0"
8
+ description = "EDA that admits when there's no signal — wraps nullbic ΔBIC falsification"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "honest-eda contributors" }]
13
+ keywords = ["eda", "exploratory-data-analysis", "symbolic-regression", "nullbic", "falsification", "data-leakage"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Science/Research",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Topic :: Scientific/Engineering :: Information Analysis",
23
+ ]
24
+ dependencies = [
25
+ "nullbic",
26
+ "pandas>=2.0",
27
+ "jinja2>=3.0",
28
+ "plotly>=5.0",
29
+ "typer>=0.12",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ dev = ["pytest>=7.0", "pytest-cov"]
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/glogwa68/honest-eda"
37
+ Issues = "https://github.com/glogwa68/honest-eda/issues"
38
+ Related = "https://github.com/glogwa68/nullbic"
39
+
40
+ [project.scripts]
41
+ honest-eda = "honest_eda.cli:app"
42
+
43
+ [tool.setuptools.packages.find]
44
+ where = ["."]
45
+ include = ["honest_eda*"]
46
+
47
+ [tool.pytest.ini_options]
48
+ testpaths = ["tests"]
49
+ addopts = "-ra"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,56 @@
1
+ from __future__ import annotations
2
+
3
+ import pytest
4
+ from typer.testing import CliRunner
5
+
6
+ from honest_eda.cli import app
7
+
8
+ runner = CliRunner()
9
+
10
+
11
def test_help():
    """Top-level ``--help`` exits with 0 and mentions at least one sub-command."""
    outcome = runner.invoke(app, ["--help"])
    assert outcome.exit_code == 0
    help_text = outcome.output
    assert any(command in help_text for command in ("profile", "check"))
15
+
16
+
17
def test_profile_command_exit_0(real_signal_df, tmp_path, mock_nullbic, monkeypatch):
    """``profile`` on a CSV exits with 0 and announces the report location.

    The frame is written to a temporary CSV and the HTML renderer is replaced
    by a stub that writes a minimal document to the requested path, so the
    test exercises the CLI wiring without real template rendering.
    """
    csv_path = str(tmp_path / "data.csv")
    real_signal_df.to_csv(csv_path, index=False)
    out_path = str(tmp_path / "report.html")

    def fake_render(result, path, **kw):
        # A context manager closes the handle deterministically; the earlier
        # ``open(path, "w").write(...)`` one-liner left it for the garbage
        # collector and could emit a ResourceWarning.
        with open(path, "w") as handle:
            handle.write("<html/>")

    monkeypatch.setattr("honest_eda.profile._report_mod.render_html", fake_render)

    result = runner.invoke(
        app,
        ["profile", csv_path, "--target", "y", "--output", out_path, "--gens", "5"],
    )
    assert result.exit_code == 0
    assert "Report written to:" in result.output
33
+
34
+
35
def test_check_exit_0_with_signal(real_signal_df, tmp_path, mock_nullbic):
    """``check`` on the ``real_signal_df`` fixture exits with 0 and prints PASS."""
    data_file = tmp_path / "data.csv"
    real_signal_df.to_csv(str(data_file), index=False)

    cli_args = ["check", str(data_file), "--target", "y", "--gens", "5"]
    outcome = runner.invoke(app, cli_args)

    assert outcome.exit_code == 0
    assert "PASS" in outcome.output
45
+
46
+
47
def test_check_exit_1_with_noise(noise_df, tmp_path, mock_nullbic):
    """``check`` on the ``noise_df`` fixture exits with 1 and prints FAIL.

    The FAIL marker is accepted on either stdout or stderr.
    """
    csv_path = str(tmp_path / "data.csv")
    noise_df.to_csv(csv_path, index=False)

    result = runner.invoke(
        app,
        ["check", csv_path, "--target", "y", "--gens", "5"],
    )
    assert result.exit_code == 1

    # On Click versions where the runner merges stderr into ``output``,
    # reading ``result.stderr`` raises ValueError. Treat that as "no separate
    # stderr" so a missing marker fails with an AssertionError instead.
    try:
        stderr_text = result.stderr or ""
    except ValueError:
        stderr_text = ""
    assert "FAIL" in result.output or "FAIL" in stderr_text
@@ -0,0 +1,47 @@
1
+ from __future__ import annotations
2
+
3
+ import pytest
4
+
5
+ from honest_eda.leakage import (
6
+ LEAKAGE_Z_THRESHOLD,
7
+ explain_leakage,
8
+ flag_leakage,
9
+ is_leakage_suspect,
10
+ )
11
+ from honest_eda.runner import RelationResult
12
+
13
+
14
def _make_rel(z: float, verdict: str = "STRONG") -> RelationResult:
    """Build a ``RelationResult`` for ``y ~ x`` with the given z-score and verdict.

    Every other field is a fixed placeholder; only ``z_vs_shuffled`` and
    ``verdict`` vary between test cases.
    """
    fields = {
        "target": "y",
        "features": ["x"],
        "formula": "y ~ x",
        "verdict": verdict,
        "delta_bic_const": -10.0,
        "delta_bic_linear": -8.0,
        "z_vs_shuffled": z,
    }
    return RelationResult(**fields)
24
+
25
+
26
def test_is_leakage_suspect_extreme_z():
    """A relation with z = -20 is reported as a leakage suspect."""
    extreme = _make_rel(-20.0)
    assert is_leakage_suspect(extreme) is True
28
+
29
+
30
def test_is_leakage_suspect_mild_z():
    """A relation with z = -5 is not reported as a leakage suspect."""
    mild = _make_rel(-5.0)
    assert is_leakage_suspect(mild) is False
32
+
33
+
34
def test_flag_leakage_filters_correctly():
    """``flag_leakage`` keeps exactly the two relations below the threshold."""
    z_scores = (-20.0, -5.0, -1.0, -16.0)
    flagged = flag_leakage([_make_rel(z) for z in z_scores])

    assert len(flagged) == 2
    for rel in flagged:
        assert rel.z_vs_shuffled < LEAKAGE_Z_THRESHOLD
39
+
40
+
41
def test_explain_leakage_nonempty():
    """``explain_leakage`` returns a non-empty string naming feature and target."""
    explanation = explain_leakage(_make_rel(-20.0))

    assert isinstance(explanation, str)
    assert explanation
    for name in ("x", "y"):
        assert name in explanation
@@ -0,0 +1,50 @@
1
+ from __future__ import annotations
2
+
3
+ import pandas as pd
4
+ import pytest
5
+
6
+ import honest_eda
7
+ from honest_eda.runner import ProfileResult
8
+
9
+
10
def test_profile_accepts_dataframe(real_signal_df, tmp_path, mock_nullbic, monkeypatch):
    """``honest_eda.profile`` accepts an in-memory DataFrame and returns a ProfileResult."""

    def _noop_render(result, path, **kw):
        return None

    # Rendering is stubbed at both import locations so no HTML is produced.
    for dotted_name in (
        "honest_eda.report.render_html",
        "honest_eda.profile._report_mod.render_html",
    ):
        monkeypatch.setattr(dotted_name, _noop_render)

    report_path = str(tmp_path / "report.html")
    outcome = honest_eda.profile(real_signal_df, "y", report_path)
    assert isinstance(outcome, ProfileResult)
16
+
17
+
18
def test_profile_accepts_csv_path(real_signal_df, tmp_path, mock_nullbic, monkeypatch):
    """``honest_eda.profile`` accepts a CSV file path and returns a ProfileResult."""
    data_file = str(tmp_path / "data.csv")
    report_path = str(tmp_path / "report.html")
    real_signal_df.to_csv(data_file, index=False)

    def _noop_render(result, path, **kw):
        return None

    monkeypatch.setattr("honest_eda.profile._report_mod.render_html", _noop_render)

    outcome = honest_eda.profile(data_file, "y", report_path)
    assert isinstance(outcome, ProfileResult)
25
+
26
+
27
def test_profile_writes_html(real_signal_df, tmp_path, mock_nullbic, monkeypatch):
    """``profile`` hands the requested output path to the renderer.

    The renderer is replaced by a stub that records the path it receives and
    writes a placeholder document there.
    """
    out = str(tmp_path / "report.html")
    seen_paths = []

    def fake_render(result, path, **kw):
        seen_paths.append(path)
        with open(path, "w") as f:
            f.write("<html>stub</html>")

    monkeypatch.setattr("honest_eda.profile._report_mod.render_html", fake_render)
    honest_eda.profile(real_signal_df, "y", out)

    assert seen_paths
    assert seen_paths[-1] == out
    assert (tmp_path / "report.html").exists()
41
+
42
+
43
def test_check_true_when_signal(real_signal_df, mock_nullbic):
    """``check`` returns ``True`` for ``real_signal_df`` with ``min_strong=1``."""
    assert honest_eda.check(real_signal_df, "y", min_strong=1) is True
46
+
47
+
48
def test_check_false_when_noise(noise_df, mock_nullbic):
    """``check`` returns ``False`` for ``noise_df`` with ``min_strong=1``."""
    assert honest_eda.check(noise_df, "y", min_strong=1) is False
@@ -0,0 +1,99 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ import pytest
6
+
7
+ from honest_eda.report import render_html, render_text_summary
8
+ from honest_eda.runner import ProfileResult, RelationResult
9
+
10
+
11
def _make_profile(
    target: str = "y",
    *,
    with_strong: bool = True,
    with_leakage: bool = False,
) -> ProfileResult:
    """Build a small ``ProfileResult`` for the report-rendering tests.

    Args:
        target: Name of the target column used in every relation.
        with_strong: Include one ordinary STRONG relation on feature ``x``.
        with_leakage: Include one STRONG relation on feature ``leak`` with
            z = -20, listed under both ``strong`` and ``leakage_suspects``.

    Returns:
        A profile that always has one WEAK relation and one noise relation.
        ``n_relations`` counts the noise relation too, so that
        ``len(strong) + len(weak) + noise_count == n_relations`` holds.
    """
    strong_rel = RelationResult(
        target=target,
        features=["x"],
        formula=f"{target} ~ x",
        verdict="STRONG",
        delta_bic_const=-15.0,
        delta_bic_linear=-12.0,
        z_vs_shuffled=-5.0,
    )
    leakage_rel = RelationResult(
        target=target,
        features=["leak"],
        formula=f"{target} ~ leak",
        verdict="STRONG",
        delta_bic_const=-50.0,
        delta_bic_linear=-30.0,
        z_vs_shuffled=-20.0,
    )
    weak_rel = RelationResult(
        target=target,
        features=["w"],
        formula=f"{target} ~ w",
        verdict="WEAK",
        delta_bic_const=-3.0,
        delta_bic_linear=-2.0,
        z_vs_shuffled=-2.5,
    )

    strong = [strong_rel] if with_strong else []
    leakage = [leakage_rel] if with_leakage else []
    reported = strong + leakage + [weak_rel]
    noise_count = 1

    return ProfileResult(
        n_columns=4,
        # Noise relations are not listed, but they still count as relations.
        n_relations=len(reported) + noise_count,
        target=target,
        strong=strong + leakage,
        weak=[weak_rel],
        noise_count=noise_count,
        linear_only=[],
        leakage_suspects=leakage,
    )
54
+
55
+
56
def test_render_html_creates_file(tmp_path):
    """``render_html`` writes a non-empty file at the requested path."""
    report_file = tmp_path / "report.html"
    render_html(_make_profile(), str(report_file))

    assert report_file.exists()
    assert report_file.stat().st_size > 0
62
+
63
+
64
def test_render_html_contains_target_name(tmp_path):
    """The rendered HTML mentions the target column name."""
    report_file = tmp_path / "report.html"
    render_html(_make_profile(target="price"), str(report_file))

    html = report_file.read_text(encoding="utf-8")
    assert "price" in html
70
+
71
+
72
def test_render_html_strong_section_present(tmp_path):
    """The rendered HTML mentions "strong" (any casing) when a STRONG relation exists."""
    report_file = tmp_path / "report.html"
    render_html(_make_profile(with_strong=True), str(report_file))

    html = report_file.read_text(encoding="utf-8")
    assert "STRONG" in html or "strong" in html.lower()
78
+
79
+
80
def test_render_html_leakage_section_present(tmp_path):
    """The rendered HTML mentions leakage (or the -20 z-score) for a leakage suspect."""
    report_file = tmp_path / "report.html"
    render_html(_make_profile(with_leakage=True), str(report_file))

    html = report_file.read_text(encoding="utf-8")
    assert "leak" in html.lower() or "-20" in html
86
+
87
+
88
def test_render_text_summary_counters():
    """The text summary contains the STRONG label, a count of 1 and the target name."""
    summary = render_text_summary(_make_profile(target="y", with_strong=True))

    for token in ("STRONG", "1", "y"):
        assert token in summary
94
+
95
+
96
def test_render_text_summary_noise_count():
    """The text summary mentions noise (in any casing)."""
    summary = render_text_summary(_make_profile())
    assert "NOISE" in summary or "noise" in summary.lower()
@@ -0,0 +1,48 @@
1
+ from __future__ import annotations
2
+
3
+ import pytest
4
+
5
+ from honest_eda.runner import ProfileResult, RelationResult, run_against_target
6
+
7
+
8
def test_real_signal_has_strong(real_signal_df, mock_nullbic):
    """Profiling ``real_signal_df`` yields a ProfileResult with at least one STRONG relation."""
    profile = run_against_target(real_signal_df, "y")

    assert isinstance(profile, ProfileResult)
    assert len(profile.strong) >= 1
12
+
13
+
14
def test_noise_df_no_strong(noise_df, mock_nullbic):
    """Profiling ``noise_df`` yields no STRONG relations and a positive noise count."""
    profile = run_against_target(noise_df, "y")

    assert profile.strong == []
    assert profile.noise_count > 0
18
+
19
+
20
def test_leakage_df_suspects(leakage_df, mock_nullbic):
    """Profiling ``leakage_df`` flags at least one leakage suspect."""
    suspects = run_against_target(leakage_df, "y").leakage_suspects
    assert len(suspects) >= 1
23
+
24
+
25
def test_nullbic_crash_skipped(real_signal_df, monkeypatch, mock_nullbic):
    """``run_against_target`` still returns a ProfileResult when ``_run_one`` yields ``None``.

    ``_run_one`` is wrapped so that its first invocation returns ``None`` and
    every later invocation is delegated to the real implementation.
    """
    import honest_eda.runner as runner_mod

    real_run_one = runner_mod._run_one
    calls = []

    def failing_run_one(df, features, target, *args, **kwargs):
        calls.append(features)
        if len(calls) == 1:
            return None
        return real_run_one(df, features, target, *args, **kwargs)

    monkeypatch.setattr(runner_mod, "_run_one", failing_run_one)
    # NOTE(review): nothing asserts that the wrapper was actually invoked, so
    # this passes vacuously if run_against_target stops calling _run_one.
    outcome = run_against_target(real_signal_df, "y")
    assert isinstance(outcome, ProfileResult)
41
+
42
+
43
def test_n_columns_and_n_relations(real_signal_df, mock_nullbic):
    """Column and relation counters agree with the input frame and the verdict buckets."""
    profile = run_against_target(real_signal_df, "y")

    assert profile.n_columns == len(real_signal_df.columns)
    assert profile.n_relations >= 0

    # STRONG, WEAK and noise together must account for every relation.
    bucket_total = sum((len(profile.strong), len(profile.weak), profile.noise_count))
    assert bucket_total == profile.n_relations