catstat 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. catstat-0.1.1/.claude/skills/benchmark-harness/SKILL.md +53 -0
  2. catstat-0.1.1/.claude/skills/leakage-audit/SKILL.md +54 -0
  3. catstat-0.1.1/.claude/skills/release-prep/SKILL.md +50 -0
  4. catstat-0.1.1/.claude/skills/sklearn-compat/SKILL.md +51 -0
  5. catstat-0.1.1/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
  6. catstat-0.1.1/.github/ISSUE_TEMPLATE/config.yml +5 -0
  7. catstat-0.1.1/.github/ISSUE_TEMPLATE/feature_request.md +22 -0
  8. catstat-0.1.1/.github/PULL_REQUEST_TEMPLATE.md +16 -0
  9. catstat-0.1.1/.github/workflows/ci.yml +34 -0
  10. catstat-0.1.1/.github/workflows/docs.yml +51 -0
  11. catstat-0.1.1/.github/workflows/release.yml +65 -0
  12. catstat-0.1.1/.gitignore +25 -0
  13. catstat-0.1.1/CHANGELOG.md +60 -0
  14. catstat-0.1.1/CLAUDE.md +146 -0
  15. catstat-0.1.1/CONTRIBUTING.md +48 -0
  16. catstat-0.1.1/LICENSE +21 -0
  17. catstat-0.1.1/PKG-INFO +138 -0
  18. catstat-0.1.1/README.md +102 -0
  19. catstat-0.1.1/SECURITY.md +27 -0
  20. catstat-0.1.1/benchmarks/README.md +42 -0
  21. catstat-0.1.1/benchmarks/__init__.py +1 -0
  22. catstat-0.1.1/benchmarks/compare_results.py +61 -0
  23. catstat-0.1.1/benchmarks/datasets.py +130 -0
  24. catstat-0.1.1/benchmarks/ledger.py +66 -0
  25. catstat-0.1.1/benchmarks/results/2026-06-26-T4-gpu-parity.jsonl +8 -0
  26. catstat-0.1.1/benchmarks/results/baseline-cpu.json +154 -0
  27. catstat-0.1.1/benchmarks/results/ledger.jsonl +24 -0
  28. catstat-0.1.1/benchmarks/run_benchmarks.py +108 -0
  29. catstat-0.1.1/docs/experiment_log.md +177 -0
  30. catstat-0.1.1/docs/known_issues.md +39 -0
  31. catstat-0.1.1/docs/next-session-prompt.md +87 -0
  32. catstat-0.1.1/docs/proposals/claude-md-proposal.md +51 -0
  33. catstat-0.1.1/docs/proposals/evaluation-harness-design.md +163 -0
  34. catstat-0.1.1/docs/proposals/self-improvement-loop-design.md +100 -0
  35. catstat-0.1.1/docs/proposals/skills-proposal.md +94 -0
  36. catstat-0.1.1/docs/proposals/target-encoder-library-design.md +448 -0
  37. catstat-0.1.1/docs/publishing_checklist.md +62 -0
  38. catstat-0.1.1/docs/roadmap.md +102 -0
  39. catstat-0.1.1/docs/verdicts/.gitkeep +0 -0
  40. catstat-0.1.1/docs/verdicts/2026-06-26-api-docs-verdict.md +36 -0
  41. catstat-0.1.1/docs/verdicts/2026-06-26-check-estimator-subset-verdict.md +46 -0
  42. catstat-0.1.1/docs/verdicts/2026-06-26-ci-pytest-pythonpath-verdict.md +32 -0
  43. catstat-0.1.1/docs/verdicts/2026-06-26-gpu-crossover-verdict.md +58 -0
  44. catstat-0.1.1/docs/verdicts/2026-06-26-gpu-parity-report.md +18 -0
  45. catstat-0.1.1/docs/verdicts/2026-06-26-gpu-parity-verdict.md +42 -0
  46. catstat-0.1.1/docs/verdicts/2026-06-26-m0-bootstrap-verdict.md +54 -0
  47. catstat-0.1.1/docs/verdicts/2026-06-26-pandas3-string-dtype-verdict.md +41 -0
  48. catstat-0.1.1/docs/verdicts/2026-06-26-phase2-stats-gpu-verdict.md +59 -0
  49. catstat-0.1.1/docs/verdicts/2026-06-26-phase3a-skew-custom-verdict.md +33 -0
  50. catstat-0.1.1/docs/verdicts/2026-06-26-phase3b-loo-ordered-verdict.md +42 -0
  51. catstat-0.1.1/docs/verdicts/2026-06-26-project-hygiene-verdict.md +34 -0
  52. catstat-0.1.1/docs/verdicts/2026-06-26-readme-polish-verdict.md +35 -0
  53. catstat-0.1.1/docs/verdicts/2026-06-26-release-0.1.0-verdict.md +34 -0
  54. catstat-0.1.1/docs/verdicts/2026-06-26-release-automation-verdict.md +47 -0
  55. catstat-0.1.1/docs/verdicts/2026-06-26-sklearn-tags-verdict.md +41 -0
  56. catstat-0.1.1/docs/verdicts/TEMPLATE-verdict.md +34 -0
  57. catstat-0.1.1/examples/binary_classification_basic.py +19 -0
  58. catstat-0.1.1/examples/count_frequency_basic.py +22 -0
  59. catstat-0.1.1/examples/multiclass_classification_basic.py +19 -0
  60. catstat-0.1.1/examples/regression_basic.py +22 -0
  61. catstat-0.1.1/pyproject.toml +72 -0
  62. catstat-0.1.1/scripts/build_docs.sh +19 -0
  63. catstat-0.1.1/scripts/check.sh +22 -0
  64. catstat-0.1.1/scripts/colab_gpu_parity.py +182 -0
  65. catstat-0.1.1/scripts/colab_gpu_parity.sh +76 -0
  66. catstat-0.1.1/scripts/summarize_benchmark_results.py +41 -0
  67. catstat-0.1.1/src/catstat/__init__.py +15 -0
  68. catstat-0.1.1/src/catstat/_aggregations.py +57 -0
  69. catstat-0.1.1/src/catstat/_base.py +387 -0
  70. catstat-0.1.1/src/catstat/_cross_fit.py +83 -0
  71. catstat-0.1.1/src/catstat/_feature_names.py +49 -0
  72. catstat-0.1.1/src/catstat/_smoothing.py +56 -0
  73. catstat-0.1.1/src/catstat/_stats.py +101 -0
  74. catstat-0.1.1/src/catstat/_validation.py +128 -0
  75. catstat-0.1.1/src/catstat/backends/__init__.py +5 -0
  76. catstat-0.1.1/src/catstat/backends/_cpu.py +66 -0
  77. catstat-0.1.1/src/catstat/backends/_dispatch.py +54 -0
  78. catstat-0.1.1/src/catstat/backends/_gpu.py +120 -0
  79. catstat-0.1.1/src/catstat/count_encoder.py +36 -0
  80. catstat-0.1.1/src/catstat/frequency_encoder.py +26 -0
  81. catstat-0.1.1/src/catstat/py.typed +0 -0
  82. catstat-0.1.1/src/catstat/target_encoder.py +59 -0
  83. catstat-0.1.1/tests/conftest.py +73 -0
  84. catstat-0.1.1/tests/test_backend.py +27 -0
  85. catstat-0.1.1/tests/test_check_estimator.py +61 -0
  86. catstat-0.1.1/tests/test_count_frequency.py +37 -0
  87. catstat-0.1.1/tests/test_cpu_gpu_parity.py +34 -0
  88. catstat-0.1.1/tests/test_cross_fit_no_leakage.py +58 -0
  89. catstat-0.1.1/tests/test_determinism.py +27 -0
  90. catstat-0.1.1/tests/test_feature_names.py +45 -0
  91. catstat-0.1.1/tests/test_io_types.py +68 -0
  92. catstat-0.1.1/tests/test_multi_feature.py +51 -0
  93. catstat-0.1.1/tests/test_phase3.py +77 -0
  94. catstat-0.1.1/tests/test_polars.py +32 -0
  95. catstat-0.1.1/tests/test_scheme.py +87 -0
  96. catstat-0.1.1/tests/test_sklearn_compat.py +106 -0
  97. catstat-0.1.1/tests/test_stats.py +69 -0
  98. catstat-0.1.1/tests/test_target_encoder_binary.py +33 -0
  99. catstat-0.1.1/tests/test_target_encoder_multiclass.py +34 -0
  100. catstat-0.1.1/tests/test_target_encoder_regression.py +57 -0
  101. catstat-0.1.1/tests/test_unknown_missing.py +68 -0
@@ -0,0 +1,53 @@
1
+ ---
2
+ name: benchmark-harness
3
+ description: >-
4
+ Run catstat's benchmark harness reproducibly, persist results to the ledger, compare against the
5
+ committed baseline, and draft a verdict. Invoke for any perf-relevant change, to establish or
6
+ refresh a baseline, or as the benchmark step of the self-improvement loop. Enforces >=5 reps,
7
+ median+spread, pinned seeds/versions/SHA, and never changes a default by itself. Outputs a
8
+ before/after table and a filled docs/verdicts/ entry.
9
+ ---
10
+
11
+ You run the **measurement harness** and turn numbers into a decision. You never change a default
12
+ on your own — that requires a written verdict backed by repeated runs.
13
+
14
+ ## When to use
15
+ - Any perf-relevant change (backend, group-by, dispatch, conversion path).
16
+ - Establishing or refreshing a committed baseline.
17
+ - The benchmark step of the self-improvement loop.
18
+
19
+ ## When NOT to use
20
+ - Correctness-only changes (use `leakage-audit` / `sklearn-compat`).
21
+ - To justify a default change from a single run or a microbenchmark presented as end-to-end.
22
+
23
+ ## Required inputs
24
+ - `--size {small,medium,large}`, `--backend {cpu,gpu}`, `--reps N` (≥5), and a committed baseline
25
+ JSON to compare against.
26
+
27
+ ## Commands
28
+ ```bash
29
+ python3 benchmarks/run_benchmarks.py --backend cpu --reps 5 --out benchmarks/results/<run>.jsonl
30
+ python3 benchmarks/compare_results.py benchmarks/results/<run>.jsonl benchmarks/results/baseline-cpu.json
31
+ python3 scripts/summarize_benchmark_results.py benchmarks/results/<run>.jsonl
32
+ ```
33
+ GPU runs go through `scripts/colab_gpu_parity.sh` (Phase 2), not locally.
34
+
35
+ ## Files to inspect
36
+ `benchmarks/datasets.py`, `benchmarks/run_benchmarks.py`, `benchmarks/ledger.py`,
37
+ `benchmarks/compare_results.py`, `benchmarks/results/baseline-cpu.json`,
38
+ `docs/verdicts/TEMPLATE-verdict.md`, and the harness design `docs/proposals/evaluation-harness-design.md`.
39
+
40
+ ## Failure modes to catch
41
+ - Fewer than 5 reps; reporting mean without spread.
42
+ - Comparing across different seeds, package versions, or git SHAs.
43
+ - Reporting a microbenchmark win as an end-to-end win.
44
+ - Updating the committed baseline without a verdict.
45
+ - Bundling a harness change with a behavior change in one diff (result becomes un-attributable —
46
+ keep harness changes in a separate commit).
47
+ - Timing only `fit_transform` wall time without separating `fit` / `transform` / conversion.
48
+
49
+ ## Final report format
50
+ A before/after table (per case: `fit_s`, `transform_s`, `fit_transform_s` as median+spread, peak
51
+ memory), regressions/improvements vs the §8 thresholds (harness doc), parity status if relevant,
52
+ and a filled `docs/verdicts/YYYY-MM-DD-<topic>-verdict.md` with a keep/change/revert decision.
53
+ Update the committed baseline **only** if the verdict says so.
@@ -0,0 +1,54 @@
1
+ ---
2
+ name: leakage-audit
3
+ description: >-
4
+ Prove that catstat's fit_transform is out-of-fold and that no target information leaks into the
5
+ encoded features. Invoke for ANY change touching the cross-fit, smoothing, or transform path
6
+ (_cross_fit.py, _smoothing.py, _base.py transform, fold assignment) before keeping the change.
7
+ Runs tests/test_cross_fit_no_leakage.py, independently reconstructs each fold's encoding from its
8
+ complement, and checks the noise-trap. Reports PASS/FAIL with the exact offending path on failure.
9
+ ---
10
+
11
+ You are the **leakage auditor**. The single question: *does any target information leak into a
12
+ `fit_transform` output, directly or via an implementation detail?* Leakage safety is `catstat`'s
13
+ #1 invariant — you sign off before any cross-fit/smoothing change is kept.
14
+
15
+ ## When to use
16
+ - Any diff touching `_cross_fit.py`, `_smoothing.py`, the transform path in `_base.py`, or fold
17
+ assignment.
18
+ - Before keeping such a change, and whenever a new statistic's OOF behavior is added.
19
+
20
+ ## When NOT to use
21
+ - Pure docs / benchmark / naming changes.
22
+ - Unsupervised `CountEncoder`/`FrequencyEncoder` logic — there is no target, so only run the
23
+ `fit_transform == fit().transform()` equivalence check (there is nothing to leak).
24
+
25
+ ## Required inputs
26
+ - The diff/PR scope.
27
+ - A seeded dataset with known signal **and** a noise-trap (category independent of `y`); use
28
+ `benchmarks/datasets.py::make_leakage_trap` and a signal generator.
29
+
30
+ ## Commands
31
+ ```bash
32
+ PYTHONPATH=src python3 -m pytest tests/test_cross_fit_no_leakage.py -q
33
+ # ad hoc OOF reconstruction: for each fold, recompute the encoding from the fold's COMPLEMENT and
34
+ # assert it equals the value fit_transform produced for that fold's rows (must be exact on CPU).
35
+ ```
36
+
37
+ ## Files to inspect
38
+ `_cross_fit.py`, `_smoothing.py`, `_base.py` (transform), `tests/test_cross_fit_no_leakage.py`,
39
+ and `docs/proposals/target-encoder-library-design.md` §8.
40
+
41
+ ## Failure modes to catch
42
+ - Per-fold statistics that secretly include the held-out fold.
43
+ - `smooth="auto"` variance computed on the full data instead of per fold.
44
+ - Row order scrambled on merge (the produced value lands on the wrong row).
45
+ - Unknown/global fallback drawn from the *transformed* set instead of training folds.
46
+ - An example or test that uses `fit().transform()` on the training set.
47
+
48
+ ## Final report format
49
+ `PASS` / `FAIL`, plus:
50
+ - OOF-reconstruction result (exact match per fold? yes/no).
51
+ - Noise-trap: correlation of the OOF feature with `y` on held-out rows (should be ≈ 0).
52
+ - Which traps were checked and the asymmetry check (`fit_transform ≠ fit().transform()` with signal).
53
+ - On `FAIL`: the exact file/line and which invariant it violates. Escalate non-trivial fixes; do
54
+ not "make the test pass" by weakening it.
@@ -0,0 +1,50 @@
1
+ ---
2
+ name: release-prep
3
+ description: >-
4
+ Prepare a catstat PyPI release: confirm the green gate, bump the version in both pyproject.toml
5
+ and __init__.py (kept in sync), update CHANGELOG.md, build sdist+wheel, and run twine check +
6
+ a clean-venv smoke install. Follows docs/publishing_checklist.md. Prepares and verifies artifacts
7
+ only — never uploads or tags (the maintainer holds PyPI/GitHub credentials).
8
+ ---
9
+
10
+ You prepare a release and **verify it builds and installs**, stopping short of publishing.
11
+
12
+ ## When to use
13
+ - Cutting a new `catstat` version (after features have landed and the gate is green).
14
+
15
+ ## When NOT to use
16
+ - Mid-development, or to change library behavior (that's a normal PR, not a release).
17
+
18
+ ## Required inputs
19
+ - The target version (SemVer). Confirm what changed since the last tag (read `git log` + the
20
+ unreleased CHANGELOG section).
21
+
22
+ ## Steps / commands
23
+ 1. `bash scripts/check.sh` must be green; coverage ≥ floor.
24
+ 2. Bump version in **both** `pyproject.toml` `project.version` and `src/catstat/__init__.py`
25
+ `__version__` — they MUST match (a mismatch is a release bug).
26
+ 3. Move the CHANGELOG `[Unreleased]` items under a dated `[X.Y.Z]` heading; keep it honest
27
+ (Added / Changed / Fixed / Known limitations).
28
+ 4. Build + verify:
29
+ ```bash
30
+ python3 -m build
31
+ python3 -m twine check dist/*
32
+ python3 -m pip install dist/catstat-*.whl # in a fresh venv
33
+ python3 -c "import catstat; print(catstat.__version__)"
34
+ ```
35
+ 5. Hand off: report that artifacts are ready; the maintainer runs `twine upload` + `git tag`.
36
+
37
+ ## Files to inspect
38
+ `pyproject.toml`, `src/catstat/__init__.py`, `CHANGELOG.md`, `docs/publishing_checklist.md`,
39
+ `README.md`, `LICENSE`.
40
+
41
+ ## Failure modes to catch
42
+ - Version mismatch between pyproject and `__init__`.
43
+ - `twine check` warnings (README won't render, missing metadata).
44
+ - Wheel missing `py.typed` or the package data.
45
+ - Building from a dirty tree (uncommitted changes) or a non-green gate.
46
+ - Uploading from this environment (don't — no credentials; it's the maintainer's step).
47
+
48
+ ## Final report format
49
+ Version old→new, the CHANGELOG section, `twine check` result, the smoke-install import line, and an
50
+ explicit "ready to `twine upload` + tag vX.Y.Z" hand-off (or the blocking issue).
@@ -0,0 +1,51 @@
1
+ ---
2
+ name: sklearn-compat
3
+ description: >-
4
+ Verify that catstat's encoders behave as well-mannered scikit-learn transformers. Invoke for any
5
+ change to the public classes, constructor params, fitted attributes, feature names, or output
6
+ handling, and before a release. Runs tests/test_sklearn_compat.py and spot-checks clone,
7
+ get/set_params, Pipeline, ColumnTransformer, set_output, and get_feature_names_out. Reports
8
+ PASS/FAIL per check plus the documented subset of check_estimator that applies.
9
+ ---
10
+
11
+ You verify **scikit-learn protocol compliance** for `catstat`'s public encoders. Full
12
+ `check_estimator` compliance is unrealistic for supervised, multi-output transformers — target a
13
+ **documented subset** and be explicit about what does/doesn't apply and why.
14
+
15
+ ## When to use
16
+ - Changes to `TargetEncoder`/`CountEncoder`/`FrequencyEncoder` public surface: constructor params,
17
+ fitted attributes, feature names, `set_output`/`output=` handling.
18
+ - Before a release.
19
+
20
+ ## When NOT to use
21
+ - Internal backend/perf changes with no public-surface effect.
22
+
23
+ ## Required inputs
24
+ - The class(es) under test; installed `scikit-learn` (record the version — meaningful
25
+ `check_estimator` coverage needs ≥1.4, which also has `TargetEncoder` for parity).
26
+
27
+ ## Commands
28
+ ```bash
29
+ PYTHONPATH=src python3 -m pytest tests/test_sklearn_compat.py -q
30
+ ```
31
+ Spot-checks (must all hold): `sklearn.base.clone(enc)`; `enc.get_params()` /
32
+ `enc.set_params(**p)` round-trip; use inside `Pipeline` and `ColumnTransformer`;
33
+ `enc.set_output(transform="pandas")` returns a DataFrame; `enc.get_feature_names_out()` length
34
+ equals the output width.
35
+
36
+ ## Files to inspect
37
+ `target_encoder.py`, `count_encoder.py`, `frequency_encoder.py`, `_base.py`, `_feature_names.py`,
38
+ `tests/test_sklearn_compat.py`.
39
+
40
+ ## Failure modes to catch
41
+ - Constructor mutates or fails to store a param verbatim (breaks `clone`/`get_params`).
42
+ - A fitted attribute missing its trailing underscore, or set before `fit`.
43
+ - `get_feature_names_out` length ≠ number of output columns (esp. multiclass class-expansion and
44
+ class-agnostic count/frequency not multiplied by `K`).
45
+ - `set_output` / `output=` not honored, or names lost.
46
+ - Silent failure inside `ColumnTransformer` (cuML had this historically — assert it actually works).
47
+
48
+ ## Final report format
49
+ PASS/FAIL per check; the `scikit-learn` version used; and the **documented** list of
50
+ `check_estimator` checks that are (in)applicable to a supervised multi-output transformer, with a
51
+ one-line reason each. On FAIL, the precise attribute/method and the protocol expectation it misses.
@@ -0,0 +1,30 @@
1
+ ---
2
+ name: Bug report
3
+ about: Report a problem with catstat
4
+ title: "[bug] "
5
+ labels: bug
6
+ ---
7
+
8
+ **Describe the bug**
9
+ A clear description of what went wrong.
10
+
11
+ **To reproduce**
12
+ A minimal, self-contained snippet:
13
+
14
+ ```python
15
+ import pandas as pd
16
+ from catstat import TargetEncoder
17
+ # ...
18
+ ```
19
+
20
+ **Expected behavior**
21
+ What you expected to happen.
22
+
23
+ **Environment**
24
+ - catstat version:
25
+ - Python version:
26
+ - pandas / numpy / scikit-learn versions:
27
+ - backend (cpu/gpu) and OS:
28
+
29
+ **Additional context**
30
+ Full traceback, data shape, or anything else relevant.
@@ -0,0 +1,5 @@
1
+ blank_issues_enabled: true
2
+ contact_links:
3
+ - name: Report a security vulnerability
4
+ url: https://github.com/Matapanino/catstat/security/advisories/new
5
+ about: Please report vulnerabilities privately (see SECURITY.md), not as public issues.
@@ -0,0 +1,22 @@
1
+ ---
2
+ name: Feature request
3
+ about: Suggest an idea or new capability for catstat
4
+ title: "[feature] "
5
+ labels: enhancement
6
+ ---
7
+
8
+ **What problem does this solve?**
9
+ The use case or limitation you're hitting.
10
+
11
+ **Proposed solution**
12
+ What you'd like catstat to do (an API sketch is welcome).
13
+
14
+ **Alternatives considered**
15
+ Other approaches, or how you work around it today.
16
+
17
+ **Scope check**
18
+ - Does it fit catstat's scope (leakage-safe statistical categorical encoding)?
19
+ - Is it a new statistic (`stats=`), a backend/IO option, or core behavior?
20
+
21
+ **Additional context**
22
+ Links, references, or examples.
@@ -0,0 +1,16 @@
1
+ ## Summary
2
+
3
+ <!-- What does this PR change, and why? -->
4
+
5
+ ## Checklist
6
+
7
+ - [ ] `bash scripts/check.sh` is green (ruff + pytest + examples).
8
+ - [ ] Tests cover the new behavior (encode correctness, **OOF / no-leakage**, unknown/missing
9
+ fallback, feature names, determinism — as applicable).
10
+ - [ ] Core invariants intact (leakage safety, smoothing honesty, CPU/GPU parity, public API).
11
+ - [ ] `CHANGELOG.md` updated for user-visible changes; SemVer considered.
12
+ - [ ] Docs updated (`README.md` / `docs/`) where relevant.
13
+
14
+ ## Notes
15
+
16
+ <!-- Anything reviewers should know: trade-offs, follow-ups, benchmark/verdict links. -->
@@ -0,0 +1,34 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ strategy:
12
+ fail-fast: false
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12"]
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ - uses: actions/setup-python@v5
18
+ with:
19
+ python-version: ${{ matrix.python-version }}
20
+ - name: Install (editable + dev extra)
21
+ run: |
22
+ python -m pip install --upgrade pip
23
+ pip install -e ".[dev]"
24
+ - name: Lint
25
+ run: ruff check src tests examples benchmarks scripts
26
+ - name: Test + coverage
27
+ # GPU/parity tests carry the `gpu` marker and auto-skip without RAPIDS (CPU runner).
28
+ run: pytest tests/ -q --cov=catstat --cov-report=term-missing
29
+ - name: Examples (smoke)
30
+ run: |
31
+ python examples/regression_basic.py
32
+ python examples/binary_classification_basic.py
33
+ python examples/multiclass_classification_basic.py
34
+ python examples/count_frequency_basic.py
@@ -0,0 +1,51 @@
1
+ name: Docs
2
+
3
+ # Build the API reference (pdoc) and publish it to GitHub Pages on each push to main.
4
+ # One-time setup: repo Settings -> Pages -> Source: "GitHub Actions".
5
+
6
+ on:
7
+ push:
8
+ branches: [main]
9
+ workflow_dispatch:
10
+
11
+ permissions:
12
+ contents: read
13
+ pages: write
14
+ id-token: write
15
+
16
+ # Allow one concurrent Pages deployment; let an in-progress run finish.
17
+ concurrency:
18
+ group: pages
19
+ cancel-in-progress: false
20
+
21
+ jobs:
22
+ build:
23
+ name: Build API docs
24
+ runs-on: ubuntu-latest
25
+ steps:
26
+ - uses: actions/checkout@v4
27
+ - uses: actions/setup-python@v5
28
+ with:
29
+ python-version: "3.12"
30
+ - name: Install (docs extra)
31
+ run: |
32
+ python -m pip install --upgrade pip
33
+ pip install -e ".[docs]"
34
+ - name: Build with pdoc
35
+ run: bash scripts/build_docs.sh site
36
+ - name: Upload Pages artifact
37
+ uses: actions/upload-pages-artifact@v3
38
+ with:
39
+ path: site
40
+
41
+ deploy:
42
+ name: Deploy to GitHub Pages
43
+ needs: build
44
+ runs-on: ubuntu-latest
45
+ environment:
46
+ name: github-pages
47
+ url: ${{ steps.deployment.outputs.page_url }}
48
+ steps:
49
+ - name: Deploy
50
+ id: deployment
51
+ uses: actions/deploy-pages@v4
@@ -0,0 +1,65 @@
1
+ name: Release
2
+
3
+ # Build and publish to PyPI when a version tag (e.g. v0.1.1) is pushed.
4
+ # Publishing uses PyPI Trusted Publishing (OIDC): no API token is stored in the repo.
5
+ # One-time setup and the manual fallback are documented in docs/publishing_checklist.md.
6
+
7
+ on:
8
+ push:
9
+ tags: ["v*"]
10
+
11
+ permissions:
12
+ contents: read
13
+
14
+ jobs:
15
+ build:
16
+ name: Build distributions
17
+ runs-on: ubuntu-latest
18
+ steps:
19
+ - uses: actions/checkout@v4
20
+ - uses: actions/setup-python@v5
21
+ with:
22
+ python-version: "3.12"
23
+ - name: Verify tag matches package version
24
+ env:
25
+ TAG: ${{ github.ref_name }}
26
+ run: |
27
+ python - <<'PY'
28
+ import os, pathlib, re, sys
29
+ tag = os.environ["TAG"].lstrip("v")
30
+ pp = re.search(r'(?m)^version = "([^"]+)"',
31
+ pathlib.Path("pyproject.toml").read_text()).group(1)
32
+ init = re.search(r'(?m)^__version__ = "([^"]+)"',
33
+ pathlib.Path("src/catstat/__init__.py").read_text()).group(1)
34
+ if not (tag == pp == init):
35
+ sys.exit(f"Version mismatch: tag={tag!r} pyproject={pp!r} __init__={init!r}")
36
+ print(f"OK: version {tag} matches the tag, pyproject.toml, and __init__.py.")
37
+ PY
38
+ - name: Build sdist + wheel
39
+ run: |
40
+ python -m pip install --upgrade pip build twine
41
+ python -m build
42
+ python -m twine check dist/*
43
+ - name: Upload dist artifact
44
+ uses: actions/upload-artifact@v4
45
+ with:
46
+ name: dist
47
+ path: dist/
48
+
49
+ publish:
50
+ name: Publish to PyPI (Trusted Publishing)
51
+ needs: build
52
+ runs-on: ubuntu-latest
53
+ environment:
54
+ name: pypi
55
+ url: https://pypi.org/project/catstat/
56
+ permissions:
57
+ id-token: write # OIDC token for PyPI Trusted Publishing — no stored API token
58
+ steps:
59
+ - name: Download dist artifact
60
+ uses: actions/download-artifact@v4
61
+ with:
62
+ name: dist
63
+ path: dist/
64
+ - name: Publish to PyPI
65
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,25 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ build/
6
+ dist/
7
+ .eggs/
8
+
9
+ # Test / lint / coverage caches
10
+ .pytest_cache/
11
+ .ruff_cache/
12
+ .coverage
13
+ htmlcov/
14
+
15
+ # Benchmark scratch (committed baselines live in benchmarks/results/baseline-*.json)
16
+ benchmarks/results/run.json
17
+ benchmarks/results/_tmp_*.json
18
+
19
+ # API docs build output (published by .github/workflows/docs.yml)
20
+ site/
21
+
22
+ # Env / editor
23
+ .venv/
24
+ venv/
25
+ .DS_Store
@@ -0,0 +1,60 @@
1
+ # Changelog
2
+
3
+ All notable changes to `catstat` are documented here. Format follows
4
+ [Keep a Changelog](https://keepachangelog.com/); versioning is [SemVer](https://semver.org/).
5
+
6
+ ## [0.1.1] — 2026-06-26
7
+
8
+ ### Changed
9
+ - Releases now build and publish to PyPI automatically on a `v*` tag, via GitHub Actions and PyPI
10
+ **Trusted Publishing** (OIDC — no stored API token). See `docs/publishing_checklist.md`.
11
+ - `TargetEncoder` / `CountEncoder` / `FrequencyEncoder` now advertise scikit-learn estimator tags
12
+ via both `__sklearn_tags__` (sklearn ≥ 1.6) and `_more_tags` (< 1.6): categorical/string input,
13
+ `allow_nan` (NaN is learned as a level under `handle_missing="value"`), and `requires_y` for the
14
+ supervised encoder — so `check_estimator` skips inapplicable checks.
15
+
16
+ ### Documentation
17
+ - Rewrote the README: status badges, an honest CPU/GPU status, install + extras, a statistics/
18
+ feature table, a "leakage-safe by design" note, and a link to the API reference.
19
+ - Published an API reference built with `pdoc` (`scripts/build_docs.sh`), deployed to GitHub Pages
20
+ via `.github/workflows/docs.yml`.
21
+
22
+ ### Fixed
23
+ - `cols="auto"` now selects pandas `StringDtype` columns, so auto-detection works on pandas ≥ 3.0
24
+ (where string columns default to `StringDtype` rather than `object`). (KI-022)
25
+ - Fitted estimators are now picklable; a cached backend module previously raised
26
+ `TypeError: cannot pickle 'module' object`. (KI-012)
27
+
28
+ ## [0.1.0] — 2026-06-26
29
+
30
+ First public release. Leakage-safe, sklearn-compatible statistical categorical encoding with one
31
+ API across CPU (pandas/numpy) and, optionally, GPU (cuDF/CuPy).
32
+
33
+ ### Added
34
+ - **`TargetEncoder`** — supervised, leakage-safe. `fit_transform` is cross-fitted; `transform`
35
+ uses full-data encodings for new data. Regression, binary, and multiclass (one-vs-rest) targets.
36
+ - **`CountEncoder` / `FrequencyEncoder`** — unsupervised category-prevalence encoders.
37
+ - **Statistics** (`stats=`): `mean`, `count`, `frequency`, `var`, `std`, `median`, `min`, `max`,
38
+ `skew`, and **custom `(name, callable)` aggregations** (quantiles, IQR, …). Only mean/probability
39
+ are smoothed (m-estimate fixed; empirical-Bayes `smooth="auto"`); other statistics fall back to
40
+ the global value for small/unseen categories and never blend.
41
+ - **Cross-fitting schemes** (`scheme=`): `kfold` (default, out-of-fold), `loo` (leave-one-out),
42
+ `ordered` (CatBoost-style ordered target statistics). loo/ordered apply to the mean.
43
+ - **Multi-column**: `multi_feature_mode="independent"` (default) or `"combination"` (joint).
44
+ - **Missing / unseen handling**: `handle_missing` / `handle_unknown` ∈ {`value`, `return_nan`,
45
+ `error`}, with per-statistic fallbacks (count/frequency → 0; mean → global; etc.).
46
+ - **Backends**: `backend="cpu"` (default via `auto`), `backend="gpu"` (cuDF/CuPy, validated
47
+ CPU/GPU-allclose on a Colab T4, incl. missing values). `auto` resolves to CPU for now — the current GPU
48
+ path is not yet faster than CPU up to 1M rows (see `docs/known_issues.md` KI-020).
49
+ - **Output**: `output` ∈ {`auto`, `numpy`, `pandas`, `polars`}; sklearn `set_output`,
50
+ `get_feature_names_out`, `Pipeline` / `ColumnTransformer` compatibility.
51
+ - Test suite (88 tests), synthetic benchmark harness, Colab GPU-parity loop, and CI
52
+ (Python 3.10–3.12).
53
+
54
+ ### Known limitations
55
+ - `backend="auto"` never selects the GPU yet, pending an on-device redesign (KI-020); `combination` and `skew`/custom
56
+ aggregations run on CPU. sklearn-parity tests require `scikit-learn>=1.4`. See
57
+ `docs/known_issues.md`.
58
+
59
+ [0.1.1]: https://github.com/Matapanino/catstat/releases/tag/v0.1.1
60
+ [0.1.0]: https://github.com/Matapanino/catstat/releases/tag/v0.1.0