catstat 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- catstat-0.1.1/.claude/skills/benchmark-harness/SKILL.md +53 -0
- catstat-0.1.1/.claude/skills/leakage-audit/SKILL.md +54 -0
- catstat-0.1.1/.claude/skills/release-prep/SKILL.md +50 -0
- catstat-0.1.1/.claude/skills/sklearn-compat/SKILL.md +51 -0
- catstat-0.1.1/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
- catstat-0.1.1/.github/ISSUE_TEMPLATE/config.yml +5 -0
- catstat-0.1.1/.github/ISSUE_TEMPLATE/feature_request.md +22 -0
- catstat-0.1.1/.github/PULL_REQUEST_TEMPLATE.md +16 -0
- catstat-0.1.1/.github/workflows/ci.yml +34 -0
- catstat-0.1.1/.github/workflows/docs.yml +51 -0
- catstat-0.1.1/.github/workflows/release.yml +65 -0
- catstat-0.1.1/.gitignore +25 -0
- catstat-0.1.1/CHANGELOG.md +60 -0
- catstat-0.1.1/CLAUDE.md +146 -0
- catstat-0.1.1/CONTRIBUTING.md +48 -0
- catstat-0.1.1/LICENSE +21 -0
- catstat-0.1.1/PKG-INFO +138 -0
- catstat-0.1.1/README.md +102 -0
- catstat-0.1.1/SECURITY.md +27 -0
- catstat-0.1.1/benchmarks/README.md +42 -0
- catstat-0.1.1/benchmarks/__init__.py +1 -0
- catstat-0.1.1/benchmarks/compare_results.py +61 -0
- catstat-0.1.1/benchmarks/datasets.py +130 -0
- catstat-0.1.1/benchmarks/ledger.py +66 -0
- catstat-0.1.1/benchmarks/results/2026-06-26-T4-gpu-parity.jsonl +8 -0
- catstat-0.1.1/benchmarks/results/baseline-cpu.json +154 -0
- catstat-0.1.1/benchmarks/results/ledger.jsonl +24 -0
- catstat-0.1.1/benchmarks/run_benchmarks.py +108 -0
- catstat-0.1.1/docs/experiment_log.md +177 -0
- catstat-0.1.1/docs/known_issues.md +39 -0
- catstat-0.1.1/docs/next-session-prompt.md +87 -0
- catstat-0.1.1/docs/proposals/claude-md-proposal.md +51 -0
- catstat-0.1.1/docs/proposals/evaluation-harness-design.md +163 -0
- catstat-0.1.1/docs/proposals/self-improvement-loop-design.md +100 -0
- catstat-0.1.1/docs/proposals/skills-proposal.md +94 -0
- catstat-0.1.1/docs/proposals/target-encoder-library-design.md +448 -0
- catstat-0.1.1/docs/publishing_checklist.md +62 -0
- catstat-0.1.1/docs/roadmap.md +102 -0
- catstat-0.1.1/docs/verdicts/.gitkeep +0 -0
- catstat-0.1.1/docs/verdicts/2026-06-26-api-docs-verdict.md +36 -0
- catstat-0.1.1/docs/verdicts/2026-06-26-check-estimator-subset-verdict.md +46 -0
- catstat-0.1.1/docs/verdicts/2026-06-26-ci-pytest-pythonpath-verdict.md +32 -0
- catstat-0.1.1/docs/verdicts/2026-06-26-gpu-crossover-verdict.md +58 -0
- catstat-0.1.1/docs/verdicts/2026-06-26-gpu-parity-report.md +18 -0
- catstat-0.1.1/docs/verdicts/2026-06-26-gpu-parity-verdict.md +42 -0
- catstat-0.1.1/docs/verdicts/2026-06-26-m0-bootstrap-verdict.md +54 -0
- catstat-0.1.1/docs/verdicts/2026-06-26-pandas3-string-dtype-verdict.md +41 -0
- catstat-0.1.1/docs/verdicts/2026-06-26-phase2-stats-gpu-verdict.md +59 -0
- catstat-0.1.1/docs/verdicts/2026-06-26-phase3a-skew-custom-verdict.md +33 -0
- catstat-0.1.1/docs/verdicts/2026-06-26-phase3b-loo-ordered-verdict.md +42 -0
- catstat-0.1.1/docs/verdicts/2026-06-26-project-hygiene-verdict.md +34 -0
- catstat-0.1.1/docs/verdicts/2026-06-26-readme-polish-verdict.md +35 -0
- catstat-0.1.1/docs/verdicts/2026-06-26-release-0.1.0-verdict.md +34 -0
- catstat-0.1.1/docs/verdicts/2026-06-26-release-automation-verdict.md +47 -0
- catstat-0.1.1/docs/verdicts/2026-06-26-sklearn-tags-verdict.md +41 -0
- catstat-0.1.1/docs/verdicts/TEMPLATE-verdict.md +34 -0
- catstat-0.1.1/examples/binary_classification_basic.py +19 -0
- catstat-0.1.1/examples/count_frequency_basic.py +22 -0
- catstat-0.1.1/examples/multiclass_classification_basic.py +19 -0
- catstat-0.1.1/examples/regression_basic.py +22 -0
- catstat-0.1.1/pyproject.toml +72 -0
- catstat-0.1.1/scripts/build_docs.sh +19 -0
- catstat-0.1.1/scripts/check.sh +22 -0
- catstat-0.1.1/scripts/colab_gpu_parity.py +182 -0
- catstat-0.1.1/scripts/colab_gpu_parity.sh +76 -0
- catstat-0.1.1/scripts/summarize_benchmark_results.py +41 -0
- catstat-0.1.1/src/catstat/__init__.py +15 -0
- catstat-0.1.1/src/catstat/_aggregations.py +57 -0
- catstat-0.1.1/src/catstat/_base.py +387 -0
- catstat-0.1.1/src/catstat/_cross_fit.py +83 -0
- catstat-0.1.1/src/catstat/_feature_names.py +49 -0
- catstat-0.1.1/src/catstat/_smoothing.py +56 -0
- catstat-0.1.1/src/catstat/_stats.py +101 -0
- catstat-0.1.1/src/catstat/_validation.py +128 -0
- catstat-0.1.1/src/catstat/backends/__init__.py +5 -0
- catstat-0.1.1/src/catstat/backends/_cpu.py +66 -0
- catstat-0.1.1/src/catstat/backends/_dispatch.py +54 -0
- catstat-0.1.1/src/catstat/backends/_gpu.py +120 -0
- catstat-0.1.1/src/catstat/count_encoder.py +36 -0
- catstat-0.1.1/src/catstat/frequency_encoder.py +26 -0
- catstat-0.1.1/src/catstat/py.typed +0 -0
- catstat-0.1.1/src/catstat/target_encoder.py +59 -0
- catstat-0.1.1/tests/conftest.py +73 -0
- catstat-0.1.1/tests/test_backend.py +27 -0
- catstat-0.1.1/tests/test_check_estimator.py +61 -0
- catstat-0.1.1/tests/test_count_frequency.py +37 -0
- catstat-0.1.1/tests/test_cpu_gpu_parity.py +34 -0
- catstat-0.1.1/tests/test_cross_fit_no_leakage.py +58 -0
- catstat-0.1.1/tests/test_determinism.py +27 -0
- catstat-0.1.1/tests/test_feature_names.py +45 -0
- catstat-0.1.1/tests/test_io_types.py +68 -0
- catstat-0.1.1/tests/test_multi_feature.py +51 -0
- catstat-0.1.1/tests/test_phase3.py +77 -0
- catstat-0.1.1/tests/test_polars.py +32 -0
- catstat-0.1.1/tests/test_scheme.py +87 -0
- catstat-0.1.1/tests/test_sklearn_compat.py +106 -0
- catstat-0.1.1/tests/test_stats.py +69 -0
- catstat-0.1.1/tests/test_target_encoder_binary.py +33 -0
- catstat-0.1.1/tests/test_target_encoder_multiclass.py +34 -0
- catstat-0.1.1/tests/test_target_encoder_regression.py +57 -0
- catstat-0.1.1/tests/test_unknown_missing.py +68 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: benchmark-harness
|
|
3
|
+
description: >-
|
|
4
|
+
Run catstat's benchmark harness reproducibly, persist results to the ledger, compare against the
|
|
5
|
+
committed baseline, and draft a verdict. Invoke for any perf-relevant change, to establish or
|
|
6
|
+
refresh a baseline, or as the benchmark step of the self-improvement loop. Enforces >=5 reps,
|
|
7
|
+
median+spread, pinned seeds/versions/SHA, and never changes a default by itself. Outputs a
|
|
8
|
+
before/after table and a filled docs/verdicts/ entry.
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
You run the **measurement harness** and turn numbers into a decision. You never change a default
|
|
12
|
+
on your own — that requires a written verdict backed by repeated runs.
|
|
13
|
+
|
|
14
|
+
## When to use
|
|
15
|
+
- Any perf-relevant change (backend, group-by, dispatch, conversion path).
|
|
16
|
+
- Establishing or refreshing a committed baseline.
|
|
17
|
+
- The benchmark step of the self-improvement loop.
|
|
18
|
+
|
|
19
|
+
## When NOT to use
|
|
20
|
+
- Correctness-only changes (use `leakage-audit` / `sklearn-compat`).
|
|
21
|
+
- To justify a default change from a single run or a microbenchmark presented as end-to-end.
|
|
22
|
+
|
|
23
|
+
## Required inputs
|
|
24
|
+
- `--size {small,medium,large}`, `--backend {cpu,gpu}`, `--reps N` (≥5), and a committed baseline
|
|
25
|
+
JSON to compare against.
|
|
26
|
+
|
|
27
|
+
## Commands
|
|
28
|
+
```bash
|
|
29
|
+
python3 benchmarks/run_benchmarks.py --backend cpu --reps 5 --out benchmarks/results/<run>.jsonl
|
|
30
|
+
python3 benchmarks/compare_results.py benchmarks/results/<run>.jsonl benchmarks/results/baseline-cpu.json
|
|
31
|
+
python3 scripts/summarize_benchmark_results.py benchmarks/results/<run>.jsonl
|
|
32
|
+
```
|
|
33
|
+
GPU runs go through `scripts/colab_gpu_parity.sh` (Phase 2); they are not run locally.
|
|
34
|
+
|
|
35
|
+
## Files to inspect
|
|
36
|
+
`benchmarks/datasets.py`, `benchmarks/run_benchmarks.py`, `benchmarks/ledger.py`,
|
|
37
|
+
`benchmarks/compare_results.py`, `benchmarks/results/baseline-cpu.json`,
|
|
38
|
+
`docs/verdicts/TEMPLATE-verdict.md`, and the harness design `docs/proposals/evaluation-harness-design.md`.
|
|
39
|
+
|
|
40
|
+
## Failure modes to catch
|
|
41
|
+
- Fewer than 5 reps; reporting mean without spread.
|
|
42
|
+
- Comparing across different seeds, package versions, or git SHAs.
|
|
43
|
+
- Reporting a microbenchmark win as an end-to-end win.
|
|
44
|
+
- Updating the committed baseline without a verdict.
|
|
45
|
+
- Bundling a harness change with a behavior change in one diff (result becomes un-attributable —
|
|
46
|
+
keep harness changes in a separate commit).
|
|
47
|
+
- Timing only `fit_transform` wall time without separating `fit` / `transform` / conversion.
|
|
48
|
+
|
|
49
|
+
## Final report format
|
|
50
|
+
A before/after table (per case: `fit_s`, `transform_s`, `fit_transform_s` as median+spread, peak
|
|
51
|
+
memory), regressions/improvements vs the §8 thresholds in the harness design doc (`docs/proposals/evaluation-harness-design.md`), parity status if relevant,
|
|
52
|
+
and a filled `docs/verdicts/YYYY-MM-DD-<topic>-verdict.md` with a keep/change/revert decision.
|
|
53
|
+
Update the committed baseline **only** if the verdict says so.
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: leakage-audit
|
|
3
|
+
description: >-
|
|
4
|
+
Prove that catstat's fit_transform is out-of-fold and that no target information leaks into the
|
|
5
|
+
encoded features. Invoke for ANY change touching the cross-fit, smoothing, or transform path
|
|
6
|
+
(_cross_fit.py, _smoothing.py, _base.py transform, fold assignment) before keeping the change.
|
|
7
|
+
Runs tests/test_cross_fit_no_leakage.py, independently reconstructs each fold's encoding from its
|
|
8
|
+
complement, and checks the noise-trap. Reports PASS/FAIL with the exact offending path on failure.
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
You are the **leakage auditor**. The single question: *does any target information leak into a
|
|
12
|
+
`fit_transform` output, directly or via an implementation detail?* Leakage safety is `catstat`'s
|
|
13
|
+
#1 invariant — you sign off before any cross-fit/smoothing change is kept.
|
|
14
|
+
|
|
15
|
+
## When to use
|
|
16
|
+
- Any diff touching `_cross_fit.py`, `_smoothing.py`, the transform path in `_base.py`, or fold
|
|
17
|
+
assignment.
|
|
18
|
+
- Before keeping such a change, and whenever a new statistic's OOF behavior is added.
|
|
19
|
+
|
|
20
|
+
## When NOT to use
|
|
21
|
+
- Pure docs / benchmark / naming changes.
|
|
22
|
+
- Unsupervised `CountEncoder`/`FrequencyEncoder` logic — there is no target, so only run the
|
|
23
|
+
`fit_transform == fit().transform()` equivalence check (there is nothing to leak).
|
|
24
|
+
|
|
25
|
+
## Required inputs
|
|
26
|
+
- The diff/PR scope.
|
|
27
|
+
- A seeded dataset with known signal **and** a noise-trap (category independent of `y`); use
|
|
28
|
+
`benchmarks/datasets.py::make_leakage_trap` and a signal generator.
|
|
29
|
+
|
|
30
|
+
## Commands
|
|
31
|
+
```bash
|
|
32
|
+
PYTHONPATH=src python3 -m pytest tests/test_cross_fit_no_leakage.py -q
|
|
33
|
+
# ad hoc OOF reconstruction: for each fold, recompute the encoding from the fold's COMPLEMENT and
|
|
34
|
+
# assert it equals the value fit_transform produced for that fold's rows (must be exact on CPU).
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Files to inspect
|
|
38
|
+
`_cross_fit.py`, `_smoothing.py`, `_base.py` (transform), `tests/test_cross_fit_no_leakage.py`,
|
|
39
|
+
and `docs/proposals/target-encoder-library-design.md` §8.
|
|
40
|
+
|
|
41
|
+
## Failure modes to catch
|
|
42
|
+
- Per-fold statistics that secretly include the held-out fold.
|
|
43
|
+
- `smooth="auto"` variance computed on the full data instead of per fold.
|
|
44
|
+
- Row order scrambled on merge (the produced value lands on the wrong row).
|
|
45
|
+
- Unknown/global fallback drawn from the *transformed* set instead of training folds.
|
|
46
|
+
- An example or test that uses `fit().transform()` on the training set.
|
|
47
|
+
|
|
48
|
+
## Final report format
|
|
49
|
+
`PASS` / `FAIL`, plus:
|
|
50
|
+
- OOF-reconstruction result (exact match per fold? yes/no).
|
|
51
|
+
- Noise-trap: correlation of the OOF feature with `y` on held-out rows (should be ≈ 0).
|
|
52
|
+
- Which traps were checked and the asymmetry check (`fit_transform ≠ fit().transform()` with signal).
|
|
53
|
+
- On `FAIL`: the exact file/line and which invariant it violates. Escalate non-trivial fixes; do
|
|
54
|
+
not "make the test pass" by weakening it.
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: release-prep
|
|
3
|
+
description: >-
|
|
4
|
+
Prepare a catstat PyPI release: confirm the green gate, bump the version in both pyproject.toml
|
|
5
|
+
and __init__.py (kept in sync), update CHANGELOG.md, build sdist+wheel, and run twine check +
|
|
6
|
+
a clean-venv smoke install. Follows docs/publishing_checklist.md. Prepares and verifies artifacts
|
|
7
|
+
only — never uploads or tags (the maintainer holds PyPI/GitHub credentials).
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
You prepare a release and **verify it builds and installs**, stopping short of publishing.
|
|
11
|
+
|
|
12
|
+
## When to use
|
|
13
|
+
- Cutting a new `catstat` version (after features have landed and the gate is green).
|
|
14
|
+
|
|
15
|
+
## When NOT to use
|
|
16
|
+
- Mid-development, or to change library behavior (that's a normal PR, not a release).
|
|
17
|
+
|
|
18
|
+
## Required inputs
|
|
19
|
+
- The target version (SemVer). Confirm what changed since the last tag (read `git log` + the
|
|
20
|
+
unreleased CHANGELOG section).
|
|
21
|
+
|
|
22
|
+
## Steps / commands
|
|
23
|
+
1. `bash scripts/check.sh` must be green; coverage ≥ floor.
|
|
24
|
+
2. Bump version in **both** `pyproject.toml` `project.version` and `src/catstat/__init__.py`
|
|
25
|
+
`__version__` — they MUST match (a mismatch is a release bug).
|
|
26
|
+
3. Move the CHANGELOG `[Unreleased]` items under a dated `[X.Y.Z]` heading; keep it honest
|
|
27
|
+
(Added / Changed / Fixed / Known limitations).
|
|
28
|
+
4. Build + verify:
|
|
29
|
+
```bash
|
|
30
|
+
python3 -m build
|
|
31
|
+
python3 -m twine check dist/*
|
|
32
|
+
python3 -m pip install dist/catstat-*.whl # in a fresh venv
|
|
33
|
+
python3 -c "import catstat; print(catstat.__version__)"
|
|
34
|
+
```
|
|
35
|
+
5. Hand off: report that artifacts are ready; the maintainer runs `twine upload` + `git tag`.
|
|
36
|
+
|
|
37
|
+
## Files to inspect
|
|
38
|
+
`pyproject.toml`, `src/catstat/__init__.py`, `CHANGELOG.md`, `docs/publishing_checklist.md`,
|
|
39
|
+
`README.md`, `LICENSE`.
|
|
40
|
+
|
|
41
|
+
## Failure modes to catch
|
|
42
|
+
- Version mismatch between pyproject and `__init__`.
|
|
43
|
+
- `twine check` warnings (README won't render, missing metadata).
|
|
44
|
+
- Wheel missing `py.typed` or the package data.
|
|
45
|
+
- Building from a dirty tree (uncommitted changes) or a non-green gate.
|
|
46
|
+
- Uploading from this environment (don't — no credentials; it's the maintainer's step).
|
|
47
|
+
|
|
48
|
+
## Final report format
|
|
49
|
+
Version old→new, the CHANGELOG section, `twine check` result, the smoke-install import line, and an
|
|
50
|
+
explicit "ready to `twine upload` + tag vX.Y.Z" hand-off (or the blocking issue).
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: sklearn-compat
|
|
3
|
+
description: >-
|
|
4
|
+
Verify that catstat's encoders behave as well-mannered scikit-learn transformers. Invoke for any
|
|
5
|
+
change to the public classes, constructor params, fitted attributes, feature names, or output
|
|
6
|
+
handling, and before a release. Runs tests/test_sklearn_compat.py and spot-checks clone,
|
|
7
|
+
get/set_params, Pipeline, ColumnTransformer, set_output, and get_feature_names_out. Reports
|
|
8
|
+
PASS/FAIL per check plus the documented subset of check_estimator that applies.
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
You verify **scikit-learn protocol compliance** for `catstat`'s public encoders. Full
|
|
12
|
+
`check_estimator` compliance is unrealistic for supervised, multi-output transformers — target a
|
|
13
|
+
**documented subset** and be explicit about what does/doesn't apply and why.
|
|
14
|
+
|
|
15
|
+
## When to use
|
|
16
|
+
- Changes to `TargetEncoder`/`CountEncoder`/`FrequencyEncoder` public surface: constructor params,
|
|
17
|
+
fitted attributes, feature names, `set_output`/`output=` handling.
|
|
18
|
+
- Before a release.
|
|
19
|
+
|
|
20
|
+
## When NOT to use
|
|
21
|
+
- Internal backend/perf changes with no public-surface effect.
|
|
22
|
+
|
|
23
|
+
## Required inputs
|
|
24
|
+
- The class(es) under test; installed `scikit-learn` (record the version — meaningful
|
|
25
|
+
`check_estimator` coverage needs ≥1.4, which also ships scikit-learn's own `TargetEncoder` for parity checks).
|
|
26
|
+
|
|
27
|
+
## Commands
|
|
28
|
+
```bash
|
|
29
|
+
PYTHONPATH=src python3 -m pytest tests/test_sklearn_compat.py -q
|
|
30
|
+
```
|
|
31
|
+
Spot-checks (must all hold): `sklearn.base.clone(enc)`; `enc.get_params()` /
|
|
32
|
+
`enc.set_params(**p)` round-trip; use inside `Pipeline` and `ColumnTransformer`;
|
|
33
|
+
`enc.set_output(transform="pandas")` returns a DataFrame; `enc.get_feature_names_out()` length
|
|
34
|
+
equals the output width.
|
|
35
|
+
|
|
36
|
+
## Files to inspect
|
|
37
|
+
`target_encoder.py`, `count_encoder.py`, `frequency_encoder.py`, `_base.py`, `_feature_names.py`,
|
|
38
|
+
`tests/test_sklearn_compat.py`.
|
|
39
|
+
|
|
40
|
+
## Failure modes to catch
|
|
41
|
+
- Constructor mutates or fails to store a param verbatim (breaks `clone`/`get_params`).
|
|
42
|
+
- A fitted attribute missing its trailing underscore, or set before `fit`.
|
|
43
|
+
- `get_feature_names_out` length ≠ number of output columns (esp. multiclass class-expansion and
|
|
44
|
+
class-agnostic count/frequency columns, which must not be multiplied by the class count `K`).
|
|
45
|
+
- `set_output` / `output=` not honored, or names lost.
|
|
46
|
+
- Silent failure inside `ColumnTransformer` (cuML had this historically — assert it actually works).
|
|
47
|
+
|
|
48
|
+
## Final report format
|
|
49
|
+
PASS/FAIL per check; the `scikit-learn` version used; and the **documented** list of
|
|
50
|
+
`check_estimator` checks that are (in)applicable to a supervised multi-output transformer, with a
|
|
51
|
+
one-line reason each. On FAIL, the precise attribute/method and the protocol expectation it misses.
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Bug report
|
|
3
|
+
about: Report a problem with catstat
|
|
4
|
+
title: "[bug] "
|
|
5
|
+
labels: bug
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
**Describe the bug**
|
|
9
|
+
A clear description of what went wrong.
|
|
10
|
+
|
|
11
|
+
**To reproduce**
|
|
12
|
+
A minimal, self-contained snippet:
|
|
13
|
+
|
|
14
|
+
```python
|
|
15
|
+
import pandas as pd
|
|
16
|
+
from catstat import TargetEncoder
|
|
17
|
+
# ...
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
**Expected behavior**
|
|
21
|
+
What you expected to happen.
|
|
22
|
+
|
|
23
|
+
**Environment**
|
|
24
|
+
- catstat version:
|
|
25
|
+
- Python version:
|
|
26
|
+
- pandas / numpy / scikit-learn versions:
|
|
27
|
+
- backend (cpu/gpu) and OS:
|
|
28
|
+
|
|
29
|
+
**Additional context**
|
|
30
|
+
Full traceback, data shape, or anything else relevant.
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Feature request
|
|
3
|
+
about: Suggest an idea or new capability for catstat
|
|
4
|
+
title: "[feature] "
|
|
5
|
+
labels: enhancement
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
**What problem does this solve?**
|
|
9
|
+
The use case or limitation you're hitting.
|
|
10
|
+
|
|
11
|
+
**Proposed solution**
|
|
12
|
+
What you'd like catstat to do (an API sketch is welcome).
|
|
13
|
+
|
|
14
|
+
**Alternatives considered**
|
|
15
|
+
Other approaches, or how you work around it today.
|
|
16
|
+
|
|
17
|
+
**Scope check**
|
|
18
|
+
- Does it fit catstat's scope (leakage-safe statistical categorical encoding)?
|
|
19
|
+
- Is it a new statistic (`stats=`), a backend/IO option, or core behavior?
|
|
20
|
+
|
|
21
|
+
**Additional context**
|
|
22
|
+
Links, references, or examples.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
## Summary
|
|
2
|
+
|
|
3
|
+
<!-- What does this PR change, and why? -->
|
|
4
|
+
|
|
5
|
+
## Checklist
|
|
6
|
+
|
|
7
|
+
- [ ] `bash scripts/check.sh` is green (ruff + pytest + examples).
|
|
8
|
+
- [ ] Tests cover the new behavior (encode correctness, **OOF / no-leakage**, unknown/missing
|
|
9
|
+
fallback, feature names, determinism — as applicable).
|
|
10
|
+
- [ ] Core invariants intact (leakage safety, smoothing honesty, CPU/GPU parity, public API).
|
|
11
|
+
- [ ] `CHANGELOG.md` updated for user-visible changes; SemVer considered.
|
|
12
|
+
- [ ] Docs updated (`README.md` / `docs/`) where relevant.
|
|
13
|
+
|
|
14
|
+
## Notes
|
|
15
|
+
|
|
16
|
+
<!-- Anything reviewers should know: trade-offs, follow-ups, benchmark/verdict links. -->
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
strategy:
|
|
12
|
+
fail-fast: false
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: ${{ matrix.python-version }}
|
|
20
|
+
- name: Install (editable + dev extra)
|
|
21
|
+
run: |
|
|
22
|
+
python -m pip install --upgrade pip
|
|
23
|
+
pip install -e ".[dev]"
|
|
24
|
+
- name: Lint
|
|
25
|
+
run: ruff check src tests examples benchmarks scripts
|
|
26
|
+
- name: Test + coverage
|
|
27
|
+
# GPU/parity tests carry the `gpu` marker and auto-skip without RAPIDS (CPU runner).
|
|
28
|
+
run: pytest tests/ -q --cov=catstat --cov-report=term-missing
|
|
29
|
+
- name: Examples (smoke)
|
|
30
|
+
run: |
|
|
31
|
+
python examples/regression_basic.py
|
|
32
|
+
python examples/binary_classification_basic.py
|
|
33
|
+
python examples/multiclass_classification_basic.py
|
|
34
|
+
python examples/count_frequency_basic.py
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
name: Docs
|
|
2
|
+
|
|
3
|
+
# Build the API reference (pdoc) and publish it to GitHub Pages on each push to main.
|
|
4
|
+
# One-time setup: repo Settings -> Pages -> Source: "GitHub Actions".
|
|
5
|
+
|
|
6
|
+
on:
|
|
7
|
+
push:
|
|
8
|
+
branches: [main]
|
|
9
|
+
workflow_dispatch:
|
|
10
|
+
|
|
11
|
+
permissions:
|
|
12
|
+
contents: read
|
|
13
|
+
pages: write
|
|
14
|
+
id-token: write
|
|
15
|
+
|
|
16
|
+
# Allow one concurrent Pages deployment; let an in-progress run finish.
|
|
17
|
+
concurrency:
|
|
18
|
+
group: pages
|
|
19
|
+
cancel-in-progress: false
|
|
20
|
+
|
|
21
|
+
jobs:
|
|
22
|
+
build:
|
|
23
|
+
name: Build API docs
|
|
24
|
+
runs-on: ubuntu-latest
|
|
25
|
+
steps:
|
|
26
|
+
- uses: actions/checkout@v4
|
|
27
|
+
- uses: actions/setup-python@v5
|
|
28
|
+
with:
|
|
29
|
+
python-version: "3.12"
|
|
30
|
+
- name: Install (docs extra)
|
|
31
|
+
run: |
|
|
32
|
+
python -m pip install --upgrade pip
|
|
33
|
+
pip install -e ".[docs]"
|
|
34
|
+
- name: Build with pdoc
|
|
35
|
+
run: bash scripts/build_docs.sh site
|
|
36
|
+
- name: Upload Pages artifact
|
|
37
|
+
uses: actions/upload-pages-artifact@v3
|
|
38
|
+
with:
|
|
39
|
+
path: site
|
|
40
|
+
|
|
41
|
+
deploy:
|
|
42
|
+
name: Deploy to GitHub Pages
|
|
43
|
+
needs: build
|
|
44
|
+
runs-on: ubuntu-latest
|
|
45
|
+
environment:
|
|
46
|
+
name: github-pages
|
|
47
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
|
48
|
+
steps:
|
|
49
|
+
- name: Deploy
|
|
50
|
+
id: deployment
|
|
51
|
+
uses: actions/deploy-pages@v4
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
# Build and publish to PyPI when a version tag (e.g. v0.1.1) is pushed.
|
|
4
|
+
# Publishing uses PyPI Trusted Publishing (OIDC): no API token is stored in the repo.
|
|
5
|
+
# One-time setup and the manual fallback are documented in docs/publishing_checklist.md.
|
|
6
|
+
|
|
7
|
+
on:
|
|
8
|
+
push:
|
|
9
|
+
tags: ["v*"]
|
|
10
|
+
|
|
11
|
+
permissions:
|
|
12
|
+
contents: read
|
|
13
|
+
|
|
14
|
+
jobs:
|
|
15
|
+
build:
|
|
16
|
+
name: Build distributions
|
|
17
|
+
runs-on: ubuntu-latest
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@v4
|
|
20
|
+
- uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: "3.12"
|
|
23
|
+
- name: Verify tag matches package version
|
|
24
|
+
env:
|
|
25
|
+
TAG: ${{ github.ref_name }}
|
|
26
|
+
run: |
|
|
27
|
+
python - <<'PY'
|
|
28
|
+
import os, pathlib, re, sys
|
|
29
|
+
tag = os.environ["TAG"].lstrip("v")
|
|
30
|
+
pp = re.search(r'(?m)^version = "([^"]+)"',
|
|
31
|
+
pathlib.Path("pyproject.toml").read_text()).group(1)
|
|
32
|
+
init = re.search(r'(?m)^__version__ = "([^"]+)"',
|
|
33
|
+
pathlib.Path("src/catstat/__init__.py").read_text()).group(1)
|
|
34
|
+
if not (tag == pp == init):
|
|
35
|
+
sys.exit(f"Version mismatch: tag={tag!r} pyproject={pp!r} __init__={init!r}")
|
|
36
|
+
print(f"OK: version {tag} matches the tag, pyproject.toml, and __init__.py.")
|
|
37
|
+
PY
|
|
38
|
+
- name: Build sdist + wheel
|
|
39
|
+
run: |
|
|
40
|
+
python -m pip install --upgrade pip build twine
|
|
41
|
+
python -m build
|
|
42
|
+
python -m twine check dist/*
|
|
43
|
+
- name: Upload dist artifact
|
|
44
|
+
uses: actions/upload-artifact@v4
|
|
45
|
+
with:
|
|
46
|
+
name: dist
|
|
47
|
+
path: dist/
|
|
48
|
+
|
|
49
|
+
publish:
|
|
50
|
+
name: Publish to PyPI (Trusted Publishing)
|
|
51
|
+
needs: build
|
|
52
|
+
runs-on: ubuntu-latest
|
|
53
|
+
environment:
|
|
54
|
+
name: pypi
|
|
55
|
+
url: https://pypi.org/project/catstat/
|
|
56
|
+
permissions:
|
|
57
|
+
id-token: write # OIDC token for PyPI Trusted Publishing — no stored API token
|
|
58
|
+
steps:
|
|
59
|
+
- name: Download dist artifact
|
|
60
|
+
uses: actions/download-artifact@v4
|
|
61
|
+
with:
|
|
62
|
+
name: dist
|
|
63
|
+
path: dist/
|
|
64
|
+
- name: Publish to PyPI
|
|
65
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
catstat-0.1.1/.gitignore
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
build/
|
|
6
|
+
dist/
|
|
7
|
+
.eggs/
|
|
8
|
+
|
|
9
|
+
# Test / lint / coverage caches
|
|
10
|
+
.pytest_cache/
|
|
11
|
+
.ruff_cache/
|
|
12
|
+
.coverage
|
|
13
|
+
htmlcov/
|
|
14
|
+
|
|
15
|
+
# Benchmark scratch (committed baselines live in benchmarks/results/baseline-*.json)
|
|
16
|
+
benchmarks/results/run.json
|
|
17
|
+
benchmarks/results/_tmp_*.json
|
|
18
|
+
|
|
19
|
+
# API docs build output (published by .github/workflows/docs.yml)
|
|
20
|
+
site/
|
|
21
|
+
|
|
22
|
+
# Env / editor
|
|
23
|
+
.venv/
|
|
24
|
+
venv/
|
|
25
|
+
.DS_Store
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to `catstat` are documented here. Format follows
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/); versioning is [SemVer](https://semver.org/).
|
|
5
|
+
|
|
6
|
+
## [0.1.1] — 2026-06-26
|
|
7
|
+
|
|
8
|
+
### Changed
|
|
9
|
+
- Releases now build and publish to PyPI automatically on a `v*` tag, via GitHub Actions and PyPI
|
|
10
|
+
**Trusted Publishing** (OIDC — no stored API token). See `docs/publishing_checklist.md`.
|
|
11
|
+
- `TargetEncoder` / `CountEncoder` / `FrequencyEncoder` now advertise scikit-learn estimator tags
|
|
12
|
+
via both `__sklearn_tags__` (sklearn ≥ 1.6) and `_more_tags` (< 1.6): categorical/string input,
|
|
13
|
+
`allow_nan` (NaN is learned as a level under `handle_missing="value"`), and `requires_y` for the
|
|
14
|
+
supervised encoder — so `check_estimator` skips inapplicable checks.
|
|
15
|
+
|
|
16
|
+
### Documentation
|
|
17
|
+
- Rewrote the README: status badges, an honest CPU/GPU status, install + extras, a statistics/
|
|
18
|
+
feature table, a "leakage-safe by design" note, and a link to the API reference.
|
|
19
|
+
- Published an API reference built with `pdoc` (`scripts/build_docs.sh`), deployed to GitHub Pages
|
|
20
|
+
via `.github/workflows/docs.yml`.
|
|
21
|
+
|
|
22
|
+
### Fixed
|
|
23
|
+
- `cols="auto"` now selects pandas `StringDtype` columns, so auto-detection works on pandas ≥ 3.0
|
|
24
|
+
(where string columns default to `StringDtype` rather than `object`). (KI-022)
|
|
25
|
+
- Fitted estimators are now picklable; a cached backend module previously raised
|
|
26
|
+
`TypeError: cannot pickle 'module' object`. (KI-012)
|
|
27
|
+
|
|
28
|
+
## [0.1.0] — 2026-06-26
|
|
29
|
+
|
|
30
|
+
First public release. Leakage-safe, sklearn-compatible statistical categorical encoding with one
|
|
31
|
+
API across CPU (pandas/numpy) and, optionally, GPU (cuDF/CuPy).
|
|
32
|
+
|
|
33
|
+
### Added
|
|
34
|
+
- **`TargetEncoder`** — supervised, leakage-safe. `fit_transform` is cross-fitted; `transform`
|
|
35
|
+
uses full-data encodings for new data. Regression, binary, and multiclass (one-vs-rest) targets.
|
|
36
|
+
- **`CountEncoder` / `FrequencyEncoder`** — unsupervised category-prevalence encoders.
|
|
37
|
+
- **Statistics** (`stats=`): `mean`, `count`, `frequency`, `var`, `std`, `median`, `min`, `max`,
|
|
38
|
+
`skew`, and **custom `(name, callable)` aggregations** (quantiles, IQR, …). Only mean/probability
|
|
39
|
+
are smoothed (m-estimate fixed; empirical-Bayes `smooth="auto"`); other statistics fall back to
|
|
40
|
+
the global value for small/unseen categories and never blend.
|
|
41
|
+
- **Cross-fitting schemes** (`scheme=`): `kfold` (default, out-of-fold), `loo` (leave-one-out),
|
|
42
|
+
`ordered` (CatBoost-style ordered target statistics). `loo` and `ordered` apply to the `mean` statistic.
|
|
43
|
+
- **Multi-column**: `multi_feature_mode="independent"` (default) or `"combination"` (joint).
|
|
44
|
+
- **Missing / unseen handling**: `handle_missing` / `handle_unknown` ∈ {`value`, `return_nan`,
|
|
45
|
+
`error`}, with per-statistic fallbacks (count/frequency → 0; mean → global; etc.).
|
|
46
|
+
- **Backends**: `backend="cpu"` (default via `auto`), `backend="gpu"` (cuDF/CuPy, validated
|
|
47
|
+
CPU/GPU-allclose on a Colab T4, incl. missing values). `auto` resolves to CPU for now — the current GPU
|
|
48
|
+
path is not yet faster than CPU up to 1M rows (see `docs/known_issues.md` KI-020).
|
|
49
|
+
- **Output**: `output` ∈ {`auto`, `numpy`, `pandas`, `polars`}; sklearn `set_output`,
|
|
50
|
+
`get_feature_names_out`, `Pipeline` / `ColumnTransformer` compatibility.
|
|
51
|
+
- Test suite (88 tests), synthetic benchmark harness, Colab GPU-parity loop, and CI
|
|
52
|
+
(Python 3.10–3.12).
|
|
53
|
+
|
|
54
|
+
### Known limitations
|
|
55
|
+
- GPU `auto` disabled pending an on-device redesign (KI-020); `combination` and `skew`/custom
|
|
56
|
+
aggregations run on CPU. sklearn-parity tests require `scikit-learn>=1.4`. See
|
|
57
|
+
`docs/known_issues.md`.
|
|
58
|
+
|
|
59
|
+
[0.1.1]: https://github.com/Matapanino/catstat/releases/tag/v0.1.1
|
|
60
|
+
[0.1.0]: https://github.com/Matapanino/catstat/releases/tag/v0.1.0
|