tpcp 2.1.2__tar.gz → 2.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {tpcp-2.1.2 → tpcp-2.2.0}/PKG-INFO +6 -2
  2. {tpcp-2.1.2 → tpcp-2.2.0}/README.md +5 -1
  3. {tpcp-2.1.2 → tpcp-2.2.0}/pyproject.toml +7 -1
  4. tpcp-2.2.0/skills/tpcp/tpcp-basics/SKILL.md +80 -0
  5. tpcp-2.2.0/skills/tpcp/tpcp-builder/SKILL.md +67 -0
  6. tpcp-2.2.0/skills/tpcp/tpcp-datasets/SKILL.md +74 -0
  7. tpcp-2.2.0/skills/tpcp/tpcp-multiprocessing/SKILL.md +45 -0
  8. tpcp-2.2.0/skills/tpcp/tpcp-optimization/SKILL.md +76 -0
  9. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/__init__.py +1 -1
  10. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_algorithm.py +9 -1
  11. tpcp-2.2.0/src/tpcp/_cli.py +95 -0
  12. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_dataset.py +1 -1
  13. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_pipeline.py +7 -1
  14. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_algorithm_utils.py +0 -0
  15. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_base.py +0 -0
  16. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_hash.py +0 -0
  17. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_optimize.py +0 -0
  18. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_parameters.py +0 -0
  19. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_utils/__init__.py +0 -0
  20. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_utils/_general.py +0 -0
  21. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_utils/_score.py +0 -0
  22. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/caching.py +0 -0
  23. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/exceptions.py +0 -0
  24. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/misc/__init__.py +0 -0
  25. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/misc/_class_utils.py +0 -0
  26. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/misc/_typed_iterator.py +0 -0
  27. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/optimize/__init__.py +0 -0
  28. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/optimize/_optimize.py +0 -0
  29. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/optimize/optuna.py +0 -0
  30. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/parallel.py +0 -0
  31. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/testing/__init__.py +0 -0
  32. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/testing/_algorithm_test_mixin.py +0 -0
  33. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/testing/_regression_utils.py +0 -0
  34. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/types.py +0 -0
  35. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/validate/__init__.py +0 -0
  36. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/validate/_cross_val_helper.py +0 -0
  37. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/validate/_scorer.py +0 -0
  38. {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/validate/_validate.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: tpcp
3
- Version: 2.1.2
3
+ Version: 2.2.0
4
4
  Summary: Pipeline and Dataset helpers for complex algorithm evaluation.
5
5
  Author: Arne Küderle, Robert Richer, Raul C. Sîmpetru, Björn Eskofier
6
6
  Author-email: Arne Küderle <arne.kuederle@fau.de>, Robert Richer <robert.richer@fau.de>, Raul C. Sîmpetru <raul.simpetru@fau.de>, Björn Eskofier <bjoern.eskofier@fau.de>
@@ -29,7 +29,6 @@ Description-Content-Type: text/markdown
29
29
  [![Documentation Status](https://readthedocs.org/projects/tpcp/badge/?version=latest)](https://tpcp.readthedocs.io/en/latest/?badge=latest)
30
30
  [![codecov](https://codecov.io/gh/mad-lab-fau/tpcp/branch/main/graph/badge.svg?token=ZNVT5LNYHO)](https://codecov.io/gh/mad-lab-fau/tpcp)
31
31
  [![Test and Lint](https://github.com/mad-lab-fau/tpcp/actions/workflows/test-and-lint.yml/badge.svg?branch=main)](https://github.com/mad-lab-fau/tpcp/actions/workflows/test-and-lint.yml)
32
- [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
33
32
  ![PyPI - Downloads](https://img.shields.io/pypi/dm/tpcp)
34
33
  [![DOI](https://joss.theoj.org/papers/10.21105/joss.04953/status.svg)](https://doi.org/10.21105/joss.04953)
35
34
 
@@ -47,6 +46,11 @@ Or add it to your project with [uv](https://docs.astral.sh/uv/):
47
46
  uv add tpcp
48
47
  ```
49
48
 
49
+ If you want to install the bundled tpcp agent skills into the current project's `.agent` folder, run:
50
+ ```bash
51
+ tpcp install-skills
52
+ ```
53
+
50
54
  ## Why?
51
55
 
52
56
  Evaluating Algorithms - in particular when they contain machine learning - is hard.
@@ -4,7 +4,6 @@
4
4
  [![Documentation Status](https://readthedocs.org/projects/tpcp/badge/?version=latest)](https://tpcp.readthedocs.io/en/latest/?badge=latest)
5
5
  [![codecov](https://codecov.io/gh/mad-lab-fau/tpcp/branch/main/graph/badge.svg?token=ZNVT5LNYHO)](https://codecov.io/gh/mad-lab-fau/tpcp)
6
6
  [![Test and Lint](https://github.com/mad-lab-fau/tpcp/actions/workflows/test-and-lint.yml/badge.svg?branch=main)](https://github.com/mad-lab-fau/tpcp/actions/workflows/test-and-lint.yml)
7
- [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
8
7
  ![PyPI - Downloads](https://img.shields.io/pypi/dm/tpcp)
9
8
  [![DOI](https://joss.theoj.org/papers/10.21105/joss.04953/status.svg)](https://doi.org/10.21105/joss.04953)
10
9
 
@@ -22,6 +21,11 @@ Or add it to your project with [uv](https://docs.astral.sh/uv/):
22
21
  uv add tpcp
23
22
  ```
24
23
 
24
+ If you want to install the bundled tpcp agent skills into the current project's `.agent` folder, run:
25
+ ```bash
26
+ tpcp install-skills
27
+ ```
28
+
25
29
  ## Why?
26
30
 
27
31
  Evaluating Algorithms - in particular when they contain machine learning - is hard.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "tpcp"
3
- version = "2.1.2"
3
+ version = "2.2.0"
4
4
  description = "Pipeline and Dataset helpers for complex algorithm evaluation."
5
5
  authors = [
6
6
  { name = "Arne Küderle", email = "arne.kuederle@fau.de" },
@@ -29,6 +29,9 @@ attrs = ["attrs>=22.1.0"]
29
29
  Homepage = "https://github.com/mad-lab-fau/tpcp"
30
30
  Repository = "https://github.com/mad-lab-fau/tpcp"
31
31
 
32
+ [project.scripts]
33
+ tpcp = "tpcp._cli:main"
34
+
32
35
  [project.entry-points.pytest11]
33
36
  tpcp_snapshots = "tpcp.testing._regression_utils"
34
37
 
@@ -53,6 +56,9 @@ dev = [
53
56
  [tool.uv]
54
57
  default-groups = "all"
55
58
 
59
+ [tool.uv.build-backend]
60
+ data = { data = "skills" }
61
+
56
62
  [tool.uv.sources]
57
63
  torch = { index = "torch_cpu" }
58
64
 
@@ -0,0 +1,80 @@
1
+ ---
2
+ name: tpcp-basics
3
+ description: Use when implementing or reviewing core tpcp classes, especially Algorithms and Pipelines, parameter definitions, action methods, result attributes, cloning, and nested parameter handling.
4
+ ---
5
+
6
+ # tpcp Basics
7
+
8
+ Read `../tpcp-builder/SKILL.md` first for the global guardrails.
9
+ Also load `../tpcp-datasets/SKILL.md` for custom datasets and `../tpcp-optimization/SKILL.md` for `self_optimize`.
10
+
11
+ ## Build Pattern
12
+
13
+ - Subclass `Algorithm`, `Pipeline`, or `OptimizablePipeline`.
14
+ - Declare any useful class-level parameter annotations.
15
+ - In `__init__`, assign each arg directly to `self`.
16
+ - Algorithms should accept simple/raw inputs, not whole dataset objects.
17
+ - Action methods compute results, store them on `*_` attrs, and return `self`.
18
+ - Pipelines consume one dataset datapoint/group, not an entire dataset split.
19
+
20
+ ## Parameters
21
+
22
+ - In tpcp, all init args are parameters.
23
+ - If a value should be tunable/trainable/searchable, expose it in `__init__`.
24
+ - Use `set_params(...)` for programmatic updates, including nested updates like `algo__threshold=...`.
25
+ - Nested parameter annotations belong on the current class, e.g. `algorithm__threshold: OptimizableParameter[float]`.
26
+
27
+ ## Action Methods
28
+
29
+ - Custom algorithms should set `_action_methods` to their action name(s), e.g. `"detect"`.
30
+ - Pipelines already use `run`/`safe_run`.
31
+ - Prefer `@make_action_safe`.
32
+ - `safe_run()` checks:
33
+ - returns `self`
34
+ - writes at least one `*_` result
35
+ - does not modify parameters
36
+
37
+ ## Cloning
38
+
39
+ - Clone before each per-datapoint execution of a nested algorithm.
40
+ - Clone before mutating a nested algorithm/object inside `run` or `self_optimize`.
41
+ - `clone()` copies parameters but drops results and other non-parameter attrs.
42
+ - tpcp clones nested tpcp objects recursively and deep-copies other objects.
43
+ - Unlike `sklearn.clone`, tpcp keeps fitted sklearn estimator state because trained models are treated as parameters.
44
+
45
+ ## Mutable Defaults
46
+
47
+ - Wrap defaults like `list`, `dict`, `np.ndarray`, `pd.DataFrame`, tpcp objects, sklearn estimators, or custom class instances in `cf(...)`.
48
+ - For dataclasses/attrs, use their own factories instead of `cf(...)`.
49
+
50
+ ## Common Mistakes
51
+
52
+ - Doing parameter validation or derived-parameter setup in `__init__`.
53
+ - Giving a parameter a trailing `_`; that suffix is reserved for results.
54
+ - Forgetting to clone a nested algorithm before calling it.
55
+ - Running one algorithm instance repeatedly and expecting older results to remain.
56
+ - Storing learned templates/models on ad-hoc attrs instead of init parameters.
57
+
58
+ ## Minimal Pattern
59
+
60
+ ```python
61
+ class MyPipe(Pipeline[MyDataset]):
62
+ algo: Parameter[MyAlgo]
63
+ output_: pd.DataFrame
64
+
65
+ def __init__(self, algo: MyAlgo = cf(MyAlgo())):
66
+ self.algo = algo
67
+
68
+ def run(self, datapoint: MyDataset):
69
+ algo = self.algo.clone()
70
+ algo = algo.detect(datapoint.signal, datapoint.sampling_rate_hz)
71
+ self.output_ = algo.events_
72
+ return self
73
+ ```
74
+
75
+ ## Source of Truth
76
+
77
+ - `https://tpcp.readthedocs.io/en/latest/guides/general_concepts.html`
78
+ - `https://tpcp.readthedocs.io/en/latest/guides/algorithms_pipelines_datasets.html`
79
+ - `https://tpcp.readthedocs.io/en/latest/auto_examples/algorithms/_01_algorithms_qrs_detection.html`
80
+ - API docs for `Algorithm`, `Pipeline`, `make_action_safe`, and `clone`
@@ -0,0 +1,67 @@
1
+ ---
2
+ name: tpcp-builder
3
+ description: Use when building, reviewing, or refactoring code that subclasses tpcp Dataset, Algorithm, Pipeline, or uses tpcp optimization/validation. Gives the global rules, main pitfalls, and points to focused tpcp skills for basics, datasets, and optimization.
4
+ ---
5
+
6
+ # tpcp Builder
7
+
8
+ Read this first, then load the focused sibling skill(s) you need:
9
+
10
+ - Basics: `../tpcp-basics/SKILL.md`
11
+ - Datasets: `../tpcp-datasets/SKILL.md`
12
+ - Optimization: `../tpcp-optimization/SKILL.md`
13
+
14
+ ## Mental Model
15
+
16
+ - `Dataset`: index + lazy access to actual data.
17
+ - `Algorithm`: reusable step with one or more action methods.
18
+ - `Pipeline`: glue code that runs on exactly one dataset datapoint/group.
19
+ - Optimization in tpcp means "data-driven changes to init parameters", including model training.
20
+
21
+ ## Non-Negotiable Rules
22
+
23
+ - Every `__init__` argument must be stored unchanged on `self` under the same name.
24
+ - Do not validate, coerce, derive, or mutate parameters in `__init__`; do that in `run`/action methods.
25
+ - Do not use `*args` in tpcp object `__init__`.
26
+ - Parameter names must not contain `__` or end with `_`.
27
+ - Any mutable default or nested object default must use `cf(...)` or a dataclass/attrs factory.
28
+ - Results live on attributes ending with `_`.
29
+ - Action methods and `self_optimize` must return `self` (or `(self, info)` for `self_optimize_with_info`).
30
+ - Algorithms should take the simplest raw inputs they need; pipelines are the place that consume dataset datapoints.
31
+ - `run`/action methods must not modify parameters.
32
+ - Clone nested algorithms/objects before running or mutating them.
33
+ - Never optimize on test data.
34
+ - Any value that training/search changes must be an exposed init parameter.
35
+
36
+ ## Highest-Risk Pitfalls
37
+
38
+ - Shared mutable defaults create silent cross-instance state and can cause train-test leakage.
39
+ - Reusing one nested algorithm instance across datapoints overwrites results and leaks fitted state.
40
+ - Building a non-deterministic dataset index breaks splits, caching, and reproducibility.
41
+ - Passing a multi-row/multi-group dataset into `Pipeline.run` violates the intended interface.
42
+ - Storing learned state outside init parameters makes `clone()` drop it and breaks optimization semantics.
43
+ - Marking `PureParameter` incorrectly can invalidate optimization shortcuts; default to plain `Parameter` unless sure.
44
+
45
+ ## Safe Defaults
46
+
47
+ - Prefer `pipeline.safe_run(datapoint)` over `pipeline.run(datapoint)`.
48
+ - Prefer `@make_action_safe` on custom action methods.
49
+ - Prefer `@make_optimize_safe` and `Optimize(...)` for `self_optimize` pipelines.
50
+ - Prefer `GridSearch`/`GridSearchCV`/`OptunaSearch` for brute-force or black-box search.
51
+
52
+ ## Quick Triage
53
+
54
+ If behavior is strange, check these first:
55
+
56
+ 1. Mutable default or shared nested object?
57
+ 2. Missing `clone()` before nested `run`/`detect`/`self_optimize`?
58
+ 3. Dataset `create_index()` deterministic and sorted?
59
+ 4. `run` touching parameters instead of only writing `*_` results?
60
+ 5. `self_optimize` changing non-optimizable params or non-parameter attrs?
61
+
62
+ ## Source of Truth
63
+
64
+ - Concepts: `https://tpcp.readthedocs.io/en/latest/guides/general_concepts.html`
65
+ - Dataset/algorithm/pipeline model: `https://tpcp.readthedocs.io/en/latest/guides/algorithms_pipelines_datasets.html`
66
+ - Evaluation and leakage rules: `https://tpcp.readthedocs.io/en/latest/guides/algorithm_evaluation.html`
67
+ - Optimization rules: `https://tpcp.readthedocs.io/en/latest/guides/optimization.html`
@@ -0,0 +1,74 @@
1
+ ---
2
+ name: tpcp-datasets
3
+ description: Use when implementing or reviewing tpcp Dataset classes, dataset indexes, grouping, subsetting, data accessors, and split/group label behavior for validation workflows.
4
+ ---
5
+
6
+ # tpcp Datasets
7
+
8
+ Read `../tpcp-builder/SKILL.md` first for global guardrails.
9
+ Also load `../tpcp-basics/SKILL.md` when the dataset feeds a custom pipeline.
10
+
11
+ ## Required Shape
12
+
13
+ - Subclass `Dataset[...]`.
14
+ - `__init__` must include `groupby_cols=None, subset_index=None` at the end of the signature and forward both to `super().__init__(...)`.
15
+ - Implement `create_index()` to return the full metadata index as a `pd.DataFrame`.
16
+ - Keep actual file/data loading out of `create_index()`; load lazily in properties/methods.
17
+
18
+ ## Index Rules
19
+
20
+ - `create_index()` must be deterministic. tpcp calls it twice and will error if outputs differ.
21
+ - Sort the final index explicitly; do not rely on filesystem order or `set` iteration.
22
+ - Index columns should be valid Python identifiers. Invalid names break ergonomics around `get_subset`, `group_label`, and `group_labels`.
23
+ - If you use a typed named-tuple group label generic, its field names and order must match index columns exactly.
24
+
25
+ ## Data Accessors
26
+
27
+ - Properties that expose actual data should usually require a single row via `assert_is_single(...)`.
28
+ - Properties that expose group-level data should require a single current group via `assert_is_single_group(...)`.
29
+ - Think carefully about what counts as a datapoint in your project before designing accessors.
30
+
31
+ ## Grouping and Iteration
32
+
33
+ - Ungrouped dataset length = row count.
34
+ - Grouped dataset length = unique group count.
35
+ - Grouping changes what "one datapoint" means for iteration and splitting.
36
+ - `group_labels` follow current grouping; `index_as_tuples()` always reflects raw rows.
37
+
38
+ ## Subsetting and Splits
39
+
40
+ - `get_subset(...)` accepts exactly one selector mode at a time.
41
+ - Use `groupby(...)` when train/test splitting should happen on a higher level than raw rows.
42
+ - Use `create_string_group_labels(...)` for `GroupKFold` and similar sklearn splitters.
43
+ - If the dataset is already grouped, `create_string_group_labels(...)` columns must be a subset of `groupby_cols`.
44
+
45
+ ## Common Mistakes
46
+
47
+ - Non-deterministic index creation.
48
+ - Loading full signals/dataframes in `create_index()`.
49
+ - Forgetting `groupby_cols`/`subset_index` in custom dataset init.
50
+ - Accessing per-recording data from a subset that still contains multiple rows/groups.
51
+ - Splitting raw rows when the real independence unit is participant/session/day.
52
+
53
+ ## Minimal Pattern
54
+
55
+ ```python
56
+ class MyDataset(Dataset[MyGroupLabel]):
57
+ def __init__(self, root: Path, groupby_cols=None, subset_index=None):
58
+ self.root = root
59
+ super().__init__(groupby_cols=groupby_cols, subset_index=subset_index)
60
+
61
+ def create_index(self) -> pd.DataFrame:
62
+ return build_index(self.root).sort_values(["participant", "recording"]).reset_index(drop=True)
63
+
64
+ @property
65
+ def signal(self) -> pd.DataFrame:
66
+ self.assert_is_single(None, "signal")
67
+ return load_signal(...)
68
+ ```
69
+
70
+ ## Source of Truth
71
+
72
+ - `https://tpcp.readthedocs.io/en/latest/auto_examples/datasets/_01_datasets_basics.html`
73
+ - `https://tpcp.readthedocs.io/en/latest/guides/algorithms_pipelines_datasets.html`
74
+ - API docs for `Dataset`
@@ -0,0 +1,45 @@
1
+ ---
2
+ name: tpcp-multiprocessing
3
+ description: Use when working on tpcp code that uses n_jobs, joblib parallelism, tpcp.parallel, caching in parallel workers, or when debugging multiprocessing, serialization, or global state issues in tpcp.
4
+ ---
5
+
6
+ # tpcp Multiprocessing
7
+
8
+ Read the official `Multiprocessing Caveats` guide first:
9
+ `https://tpcp.readthedocs.io/en/latest/guides/multiprocessing_caveats.html`
10
+
11
+ ## Use this skill when
12
+
13
+ - `validate`, `cross_validate`, `Scorer`, or an optimizer uses `n_jobs`
14
+ - global config seems missing in workers
15
+ - runtime monkey-patching/decorators/caches behave differently in parallel
16
+ - joblib raises pickle or `__main__`-related errors
17
+ - heavy imports make parallel runs unexpectedly slow
18
+
19
+ ## Main Caveats
20
+
21
+ - Worker processes do not automatically inherit runtime global state changes from the parent process.
22
+ - Joblib `loky` workers are reused, so worker-side global mutations can leak into later jobs.
23
+ - Serialization is often the real failure point, not the parallel API itself.
24
+ - Objects defined in `__main__`, lambdas, nested classes/functions, and runtime-replaced globals are high risk.
25
+ - Heavy optional imports can dominate worker startup cost.
26
+
27
+ ## tpcp-Specific Guidance
28
+
29
+ - For global state restoration, use `tpcp.parallel.delayed` together with `register_global_parallel_callback(...)`.
30
+ - Assume runtime-applied decorators or caches are not visible in workers unless explicitly restored there.
31
+ - If tests require a clean worker pool, shut down the reusable loky executor explicitly.
32
+ - If debugging cost outweighs the speedup, fall back to `n_jobs=1`.
33
+
34
+ ## Fast Triage
35
+
36
+ 1. Missing config only in workers: global-state problem, use `tpcp.parallel`.
37
+ 2. Error mentions `__main__`: move code to an importable module.
38
+ 3. Works once, fails later: suspect process-pool reuse and leaked worker state.
39
+ 4. Parallel is slower than serial: inspect import cost and serialization overhead.
40
+
41
+ ## Source of Truth
42
+
43
+ - `https://tpcp.readthedocs.io/en/latest/guides/multiprocessing_caveats.html`
44
+ - API docs for `tpcp.parallel`
45
+ - GitHub issue `#119`
@@ -0,0 +1,76 @@
1
+ ---
2
+ name: tpcp-optimization
3
+ description: Use when implementing or reviewing tpcp self_optimize logic, parameter annotations, Optimize/GridSearch/GridSearchCV usage, and validation workflows that must avoid train-test leakage.
4
+ ---
5
+
6
+ # tpcp Optimization
7
+
8
+ Read `../tpcp-builder/SKILL.md` first for the global guardrails.
9
+ Also load `../tpcp-basics/SKILL.md` for cloning/parameter rules and `../tpcp-datasets/SKILL.md` for split semantics.
10
+
11
+ ## Pick the Right Tool
12
+
13
+ - Use `self_optimize` only for algorithm-specific training logic.
14
+ - Do not re-implement brute-force search inside `self_optimize`.
15
+ - Use `Optimize(pipeline)` for pipelines that implement `self_optimize`.
16
+ - Use `GridSearch` for brute-force search without inner CV.
17
+ - Use `GridSearchCV` when hyperparameter search itself needs CV.
18
+ - Use `DummyOptimize` if you need a non-optimizable baseline on the same CV folds.
19
+
20
+ ## Parameter Semantics
21
+
22
+ - `OptimizableParameter`: changed by `self_optimize`.
23
+ - `HyperParameter`: changes how `self_optimize` behaves but is not changed by it.
24
+ - `PureParameter`: does not affect `self_optimize`; only use when you are certain.
25
+ - If unsure, prefer plain `Parameter` over `PureParameter`.
26
+
27
+ ## Hard Rules for `self_optimize`
28
+
29
+ - Return `self`, or `(self, info)` from `self_optimize_with_info`.
30
+ - Modify only parameters marked as optimizable on the current class.
31
+ - Do not store learned state on non-parameter attrs.
32
+ - Any learned model/template/weights must survive `clone()`, so they must be represented as parameters.
33
+ - Clone nested algorithms before training/mutating them.
34
+ - Prefer `@make_optimize_safe`; `Optimize(...)` also applies equivalent checks.
35
+
36
+ ## Evaluation Rules
37
+
38
+ - Never search/tune/train on the final test data.
39
+ - Outer evaluation measures the whole training procedure, not one already-trained instance.
40
+ - `cross_validate(...)` expects an optimizer object, not a bare pipeline.
41
+ - For grouped or stratified splits, create explicit group/label arrays or use `DatasetSplitter`.
42
+ - In custom scorers, call `pipeline.safe_run(datapoint)`.
43
+
44
+ ## Common Mistakes
45
+
46
+ - Putting black-box parameter search into `self_optimize` instead of `GridSearch`/`OptunaSearch`.
47
+ - Forgetting to annotate optimizable params, causing safety checks to fail.
48
+ - Changing non-optimizable params during `self_optimize`.
49
+ - Marking a parameter as `PureParameter` even though it affects training.
50
+ - Training a nested algorithm in place and then reusing it across folds/datapoints.
51
+ - Calling `self_optimize` directly in user-facing code instead of using `Optimize(...)`.
52
+
53
+ ## Minimal Pattern
54
+
55
+ ```python
56
+ class MyPipeline(OptimizablePipeline[MyDataset]):
57
+ model: Parameter[MyAlgo]
58
+ model__weights: OptimizableParameter[np.ndarray]
59
+
60
+ def __init__(self, model: MyAlgo = cf(MyAlgo())):
61
+ self.model = model
62
+
63
+ def self_optimize(self, dataset: MyDataset, **kwargs):
64
+ model = self.model.clone()
65
+ self.model = model.self_optimize(...)
66
+ return self
67
+ ```
68
+
69
+ ## Source of Truth
70
+
71
+ - `https://tpcp.readthedocs.io/en/latest/guides/optimization.html`
72
+ - `https://tpcp.readthedocs.io/en/latest/guides/algorithm_evaluation.html`
73
+ - `https://tpcp.readthedocs.io/en/latest/guides/algorithm_validation_tpcp.html`
74
+ - `https://tpcp.readthedocs.io/en/latest/auto_examples/parameter_optimization/_02_optimizable_pipelines.html`
75
+ - `https://tpcp.readthedocs.io/en/latest/auto_examples/parameter_optimization/_03_gridsearch_cv.html`
76
+ - API docs for `Optimize`, `GridSearch`, `GridSearchCV`, and `make_optimize_safe`
@@ -24,7 +24,7 @@ from tpcp._parameters import (
24
24
  )
25
25
  from tpcp._pipeline import OptimizablePipeline, Pipeline
26
26
 
27
- __version__ = "2.1.2"
27
+ __version__ = "2.2.0"
28
28
 
29
29
 
30
30
  __all__ = [
@@ -14,9 +14,17 @@ class Algorithm(BaseTpcpObject):
14
14
 
15
15
  All type-specific algorithm classes should inherit from this class and need to
16
16
 
17
- 1. overwrite `_action_method` with the name of the actual action method of this class type
17
+ 1. overwrite `_action_methods` with the name of the actual action method of this class type
18
18
  2. implement a stub for the action method
19
19
 
20
+ Examples
21
+ --------
22
+ >>> class MyAlgorithm(Algorithm):
23
+ ... _action_methods = "detect"
24
+ ...
25
+ ... def detect(self, data):
26
+ ... return self
27
+
20
28
  If you want to create an optimizable algorithm, add a `self_optimize` (or `self_optimize_with_info`) method to your
21
29
  class.
22
30
  We do not provide a separate base class for that, as we can make no assumptions about the call signature of your
@@ -0,0 +1,95 @@
1
+ """CLI helpers for tpcp."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import shutil
7
+ import sysconfig
8
+ from pathlib import Path
9
+
10
+
11
+ def _find_distributed_skills_dir() -> Path:
12
+ repo_skills = Path(__file__).resolve().parents[2] / "skills" / "tpcp"
13
+ if repo_skills.is_dir():
14
+ return repo_skills
15
+
16
+ installed_skills = Path(sysconfig.get_paths()["data"]) / "tpcp"
17
+ if installed_skills.is_dir():
18
+ return installed_skills
19
+
20
+ raise FileNotFoundError(
21
+ "Could not locate the distributed tpcp skills. "
22
+ "Expected either a repository checkout at `skills/tpcp` or installed package data at "
23
+ f"`{installed_skills}`."
24
+ )
25
+
26
+
27
+ def install_skills(project_dir: Path, *, force: bool = False) -> list[Path]:
28
+ """Install the distributed tpcp skills into the project's `.agent` folder."""
29
+ source_dir = _find_distributed_skills_dir()
30
+ destination_dir = project_dir / ".agent"
31
+ destination_dir.mkdir(parents=True, exist_ok=True)
32
+
33
+ skill_dirs = sorted(p for p in source_dir.iterdir() if p.is_dir())
34
+ if not force:
35
+ conflicts = [
36
+ destination_dir / skill_dir.name for skill_dir in skill_dirs if (destination_dir / skill_dir.name).exists()
37
+ ]
38
+ if conflicts:
39
+ conflict_names = ", ".join(sorted(p.name for p in conflicts))
40
+ raise FileExistsError(
41
+ "The following skills already exist in the destination: "
42
+ f"{conflict_names}. Re-run with `--force` to replace them."
43
+ )
44
+
45
+ installed = []
46
+ for skill_dir in skill_dirs:
47
+ target = destination_dir / skill_dir.name
48
+ if target.exists():
49
+ if target.is_dir():
50
+ shutil.rmtree(target)
51
+ else:
52
+ target.unlink()
53
+ shutil.copytree(skill_dir, target)
54
+ installed.append(target)
55
+
56
+ return installed
57
+
58
+
59
+ def main(argv: list[str] | None = None) -> int:
60
+ """Run the tpcp CLI."""
61
+ parser = argparse.ArgumentParser(prog="tpcp")
62
+ subparsers = parser.add_subparsers(dest="command")
63
+
64
+ install_parser = subparsers.add_parser(
65
+ "install-skills",
66
+ help="Copy the distributed tpcp skills into the current project's `.agent` folder.",
67
+ )
68
+ install_parser.add_argument(
69
+ "--project-dir",
70
+ type=Path,
71
+ default=Path.cwd(),
72
+ help="Project directory that should receive the `.agent` folder. Defaults to the current working directory.",
73
+ )
74
+ install_parser.add_argument(
75
+ "--force",
76
+ action="store_true",
77
+ help="Replace already installed tpcp skills in the destination.",
78
+ )
79
+
80
+ args = parser.parse_args(argv)
81
+
82
+ if args.command == "install-skills":
83
+ installed = install_skills(args.project_dir.resolve(), force=args.force)
84
+ if installed:
85
+ print(f"Installed {len(installed)} skill(s) into {args.project_dir.resolve() / '.agent'}.")
86
+ else:
87
+ print("No new skills installed.")
88
+ return 0
89
+
90
+ parser.print_help()
91
+ return 1
92
+
93
+
94
+ if __name__ == "__main__":
95
+ raise SystemExit(main())
@@ -474,7 +474,7 @@ class _Dataset(BaseTpcpObject, Generic[GroupLabelT]):
474
474
 
475
475
  def create_group_labels(self, label_cols: Union[str, list[str]]) -> list[str]:
476
476
  warnings.warn(
477
- "The method `create_string_group_labels` is deprecated and will be removed in a future version. "
477
+ "The method `create_group_labels` is deprecated and will be removed in a future version. "
478
478
  "Use `create_string_group_labels` instead.",
479
479
  DeprecationWarning,
480
480
  stacklevel=1,
@@ -17,6 +17,7 @@ class Pipeline(Algorithm, Generic[DatasetT]):
17
17
  """Baseclass for all custom pipelines.
18
18
 
19
19
  To create your own custom pipeline, subclass this class and implement `run`.
20
+ The `run` method is expected to operate on exactly one dataset datapoint/group.
20
21
  """
21
22
 
22
23
  _action_methods: ClassVar[tuple[str, str]] = ("safe_run", "run")
@@ -30,6 +31,10 @@ class Pipeline(Algorithm, Generic[DatasetT]):
30
31
  .. note::
31
32
  It is usually preferred to use `safe_run` on custom pipelines instead of `run`, as `safe_run` can
32
33
  catch certain implementation errors of the run method.
34
+ However, neither `run` nor `safe_run` verifies that `datapoint` actually represents only a single
35
+ datapoint/group.
36
+ Pipeline implementations should enforce this through dataset accessors and/or explicit
37
+ `assert_is_single(...)`/`assert_is_single_group(...)` checks.
33
38
 
34
39
  Parameters
35
40
  ----------
@@ -50,6 +55,7 @@ class Pipeline(Algorithm, Generic[DatasetT]):
50
55
 
51
56
  It is preferred to use this method over `run`, as it can catch some simple implementation errors of custom
52
57
  pipelines.
58
+ It does not validate that the provided dataset instance contains only a single datapoint/group.
53
59
 
54
60
  The following things are checked:
55
61
 
@@ -81,7 +87,7 @@ class OptimizablePipeline(Pipeline[DatasetT]):
81
87
 
82
88
  OptimizablePipelines are expected to implement a concrete way to train internal models or optimize parameters.
83
89
  This should not be a reimplementation of GridSearch or similar methods.
84
- For this :class:`tpcp.pipelines.GridSearch` should be used directly.
90
+ For this :class:`tpcp.optimize.GridSearch` should be used directly.
85
91
 
86
92
  It is important that `self_optimize` only modifies input parameters of the pipeline that are marked as
87
93
  `OptimizableParameter`.
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes