tpcp 2.1.2__tar.gz → 2.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tpcp-2.1.2 → tpcp-2.2.0}/PKG-INFO +6 -2
- {tpcp-2.1.2 → tpcp-2.2.0}/README.md +5 -1
- {tpcp-2.1.2 → tpcp-2.2.0}/pyproject.toml +7 -1
- tpcp-2.2.0/skills/tpcp/tpcp-basics/SKILL.md +80 -0
- tpcp-2.2.0/skills/tpcp/tpcp-builder/SKILL.md +67 -0
- tpcp-2.2.0/skills/tpcp/tpcp-datasets/SKILL.md +74 -0
- tpcp-2.2.0/skills/tpcp/tpcp-multiprocessing/SKILL.md +45 -0
- tpcp-2.2.0/skills/tpcp/tpcp-optimization/SKILL.md +76 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/__init__.py +1 -1
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_algorithm.py +9 -1
- tpcp-2.2.0/src/tpcp/_cli.py +95 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_dataset.py +1 -1
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_pipeline.py +7 -1
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_algorithm_utils.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_base.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_hash.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_optimize.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_parameters.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_utils/__init__.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_utils/_general.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/_utils/_score.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/caching.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/exceptions.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/misc/__init__.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/misc/_class_utils.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/misc/_typed_iterator.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/optimize/__init__.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/optimize/_optimize.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/optimize/optuna.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/parallel.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/testing/__init__.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/testing/_algorithm_test_mixin.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/testing/_regression_utils.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/types.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/validate/__init__.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/validate/_cross_val_helper.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/validate/_scorer.py +0 -0
- {tpcp-2.1.2 → tpcp-2.2.0}/src/tpcp/validate/_validate.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: tpcp
|
|
3
|
-
Version: 2.1.2
|
|
3
|
+
Version: 2.2.0
|
|
4
4
|
Summary: Pipeline and Dataset helpers for complex algorithm evaluation.
|
|
5
5
|
Author: Arne Küderle, Robert Richer, Raul C. Sîmpetru, Björn Eskofier
|
|
6
6
|
Author-email: Arne Küderle <arne.kuederle@fau.de>, Robert Richer <robert.richer@fau.de>, Raul C. Sîmpetru <raul.simpetru@fau.de>, Björn Eskofier <bjoern.eskofier@fau.de>
|
|
@@ -29,7 +29,6 @@ Description-Content-Type: text/markdown
|
|
|
29
29
|
[](https://tpcp.readthedocs.io/en/latest/?badge=latest)
|
|
30
30
|
[](https://codecov.io/gh/mad-lab-fau/tpcp)
|
|
31
31
|
[](https://github.com/mad-lab-fau/tpcp/actions/workflows/test-and-lint.yml)
|
|
32
|
-
[](https://github.com/psf/black)
|
|
33
32
|

|
|
34
33
|
[](https://doi.org/10.21105/joss.04953)
|
|
35
34
|
|
|
@@ -47,6 +46,11 @@ Or add it to your project with [uv](https://docs.astral.sh/uv/):
|
|
|
47
46
|
uv add tpcp
|
|
48
47
|
```
|
|
49
48
|
|
|
49
|
+
If you want to install the bundled tpcp agent skills into the current project's `.agent` folder, run:
|
|
50
|
+
```bash
|
|
51
|
+
tpcp install-skills
|
|
52
|
+
```
|
|
53
|
+
|
|
50
54
|
## Why?
|
|
51
55
|
|
|
52
56
|
Evaluating Algorithms - in particular when they contain machine learning - is hard.
|
|
@@ -4,7 +4,6 @@
|
|
|
4
4
|
[](https://tpcp.readthedocs.io/en/latest/?badge=latest)
|
|
5
5
|
[](https://codecov.io/gh/mad-lab-fau/tpcp)
|
|
6
6
|
[](https://github.com/mad-lab-fau/tpcp/actions/workflows/test-and-lint.yml)
|
|
7
|
-
[](https://github.com/psf/black)
|
|
8
7
|

|
|
9
8
|
[](https://doi.org/10.21105/joss.04953)
|
|
10
9
|
|
|
@@ -22,6 +21,11 @@ Or add it to your project with [uv](https://docs.astral.sh/uv/):
|
|
|
22
21
|
uv add tpcp
|
|
23
22
|
```
|
|
24
23
|
|
|
24
|
+
If you want to install the bundled tpcp agent skills into the current project's `.agent` folder, run:
|
|
25
|
+
```bash
|
|
26
|
+
tpcp install-skills
|
|
27
|
+
```
|
|
28
|
+
|
|
25
29
|
## Why?
|
|
26
30
|
|
|
27
31
|
Evaluating Algorithms - in particular when they contain machine learning - is hard.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "tpcp"
|
|
3
|
-
version = "2.1.2"
|
|
3
|
+
version = "2.2.0"
|
|
4
4
|
description = "Pipeline and Dataset helpers for complex algorithm evaluation."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Arne Küderle", email = "arne.kuederle@fau.de" },
|
|
@@ -29,6 +29,9 @@ attrs = ["attrs>=22.1.0"]
|
|
|
29
29
|
Homepage = "https://github.com/mad-lab-fau/tpcp"
|
|
30
30
|
Repository = "https://github.com/mad-lab-fau/tpcp"
|
|
31
31
|
|
|
32
|
+
[project.scripts]
|
|
33
|
+
tpcp = "tpcp._cli:main"
|
|
34
|
+
|
|
32
35
|
[project.entry-points.pytest11]
|
|
33
36
|
tpcp_snapshots = "tpcp.testing._regression_utils"
|
|
34
37
|
|
|
@@ -53,6 +56,9 @@ dev = [
|
|
|
53
56
|
[tool.uv]
|
|
54
57
|
default-groups = "all"
|
|
55
58
|
|
|
59
|
+
[tool.uv.build-backend]
|
|
60
|
+
data = { data = "skills" }
|
|
61
|
+
|
|
56
62
|
[tool.uv.sources]
|
|
57
63
|
torch = { index = "torch_cpu" }
|
|
58
64
|
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: tpcp-basics
|
|
3
|
+
description: Use when implementing or reviewing core tpcp classes, especially Algorithms and Pipelines, parameter definitions, action methods, result attributes, cloning, and nested parameter handling.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# tpcp Basics
|
|
7
|
+
|
|
8
|
+
Read `../tpcp-builder/SKILL.md` first for the global guardrails.
|
|
9
|
+
Also load `../tpcp-datasets/SKILL.md` for custom datasets and `../tpcp-optimization/SKILL.md` for `self_optimize`.
|
|
10
|
+
|
|
11
|
+
## Build Pattern
|
|
12
|
+
|
|
13
|
+
- Subclass `Algorithm`, `Pipeline`, or `OptimizablePipeline`.
|
|
14
|
+
- Declare any useful class-level parameter annotations.
|
|
15
|
+
- In `__init__`, assign each arg directly to `self`.
|
|
16
|
+
- Algorithms should accept simple/raw inputs, not whole dataset objects.
|
|
17
|
+
- Action methods compute results, store them on `*_` attrs, and return `self`.
|
|
18
|
+
- Pipelines consume one dataset datapoint/group, not an entire dataset split.
|
|
19
|
+
|
|
20
|
+
## Parameters
|
|
21
|
+
|
|
22
|
+
- In tpcp, all init args are parameters.
|
|
23
|
+
- If a value should be tunable/trainable/searchable, expose it in `__init__`.
|
|
24
|
+
- Use `set_params(...)` for programmatic updates, including nested updates like `algo__threshold=...`.
|
|
25
|
+
- Nested parameter annotations belong on the current class, e.g. `algorithm__threshold: OptimizableParameter[float]`.
|
|
26
|
+
|
|
27
|
+
## Action Methods
|
|
28
|
+
|
|
29
|
+
- Custom algorithms should set `_action_methods` to their action name(s), e.g. `"detect"`.
|
|
30
|
+
- Pipelines already use `run`/`safe_run`.
|
|
31
|
+
- Prefer `@make_action_safe`.
|
|
32
|
+
- `safe_run()` checks:
|
|
33
|
+
- returns `self`
|
|
34
|
+
- writes at least one `*_` result
|
|
35
|
+
- does not modify parameters
|
|
36
|
+
|
|
37
|
+
## Cloning
|
|
38
|
+
|
|
39
|
+
- Clone before each per-datapoint execution of a nested algorithm.
|
|
40
|
+
- Clone before mutating a nested algorithm/object inside `run` or `self_optimize`.
|
|
41
|
+
- `clone()` copies parameters but drops results and other non-parameter attrs.
|
|
42
|
+
- tpcp clones nested tpcp objects recursively and deep-copies other objects.
|
|
43
|
+
- Unlike `sklearn.clone`, tpcp keeps fitted sklearn estimator state because trained models are treated as parameters.
|
|
44
|
+
|
|
45
|
+
## Mutable Defaults
|
|
46
|
+
|
|
47
|
+
- Wrap defaults like `list`, `dict`, `np.ndarray`, `pd.DataFrame`, tpcp objects, sklearn estimators, or custom class instances in `cf(...)`.
|
|
48
|
+
- For dataclasses/attrs, use their own factories instead of `cf(...)`.
|
|
49
|
+
|
|
50
|
+
## Common Mistakes
|
|
51
|
+
|
|
52
|
+
- Doing parameter validation or derived-parameter setup in `__init__`.
|
|
53
|
+
- Giving a parameter a trailing `_`; that suffix is reserved for results.
|
|
54
|
+
- Forgetting to clone a nested algorithm before calling it.
|
|
55
|
+
- Running one algorithm instance repeatedly and expecting older results to remain.
|
|
56
|
+
- Storing learned templates/models on ad-hoc attrs instead of init parameters.
|
|
57
|
+
|
|
58
|
+
## Minimal Pattern
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
class MyPipe(Pipeline[MyDataset]):
|
|
62
|
+
algo: Parameter[MyAlgo]
|
|
63
|
+
output_: pd.DataFrame
|
|
64
|
+
|
|
65
|
+
def __init__(self, algo: MyAlgo = cf(MyAlgo())):
|
|
66
|
+
self.algo = algo
|
|
67
|
+
|
|
68
|
+
def run(self, datapoint: MyDataset):
|
|
69
|
+
algo = self.algo.clone()
|
|
70
|
+
algo = algo.detect(datapoint.signal, datapoint.sampling_rate_hz)
|
|
71
|
+
self.output_ = algo.events_
|
|
72
|
+
return self
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Source of Truth
|
|
76
|
+
|
|
77
|
+
- `https://tpcp.readthedocs.io/en/latest/guides/general_concepts.html`
|
|
78
|
+
- `https://tpcp.readthedocs.io/en/latest/guides/algorithms_pipelines_datasets.html`
|
|
79
|
+
- `https://tpcp.readthedocs.io/en/latest/auto_examples/algorithms/_01_algorithms_qrs_detection.html`
|
|
80
|
+
- API docs for `Algorithm`, `Pipeline`, `make_action_safe`, and `clone`
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: tpcp-builder
|
|
3
|
+
description: Use when building, reviewing, or refactoring code that subclasses tpcp Dataset, Algorithm, Pipeline, or uses tpcp optimization/validation. Gives the global rules, main pitfalls, and points to focused tpcp skills for basics, datasets, and optimization.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# tpcp Builder
|
|
7
|
+
|
|
8
|
+
Read this first, then load the focused sibling skill(s) you need:
|
|
9
|
+
|
|
10
|
+
- Basics: `../tpcp-basics/SKILL.md`
|
|
11
|
+
- Datasets: `../tpcp-datasets/SKILL.md`
|
|
12
|
+
- Optimization: `../tpcp-optimization/SKILL.md`
|
|
13
|
+
|
|
14
|
+
## Mental Model
|
|
15
|
+
|
|
16
|
+
- `Dataset`: index + lazy access to actual data.
|
|
17
|
+
- `Algorithm`: reusable step with one or more action methods.
|
|
18
|
+
- `Pipeline`: glue code that runs on exactly one dataset datapoint/group.
|
|
19
|
+
- Optimization in tpcp means "data-driven changes to init parameters", including model training.
|
|
20
|
+
|
|
21
|
+
## Non-Negotiable Rules
|
|
22
|
+
|
|
23
|
+
- Every `__init__` argument must be stored unchanged on `self` under the same name.
|
|
24
|
+
- Do not validate, coerce, derive, or mutate parameters in `__init__`; do that in `run`/action methods.
|
|
25
|
+
- Do not use `*args` in tpcp object `__init__`.
|
|
26
|
+
- Parameter names must not contain `__` or end with `_`.
|
|
27
|
+
- Any mutable default or nested object default must use `cf(...)` or a dataclass/attrs factory.
|
|
28
|
+
- Results live on attributes ending with `_`.
|
|
29
|
+
- Action methods and `self_optimize` must return `self` (or `(self, info)` for `self_optimize_with_info`).
|
|
30
|
+
- Algorithms should take the simplest raw inputs they need; pipelines are the place that consume dataset datapoints.
|
|
31
|
+
- `run`/action methods must not modify parameters.
|
|
32
|
+
- Clone nested algorithms/objects before running or mutating them.
|
|
33
|
+
- Never optimize on test data.
|
|
34
|
+
- Any value that training/search changes must be an exposed init parameter.
|
|
35
|
+
|
|
36
|
+
## Highest-Risk Pitfalls
|
|
37
|
+
|
|
38
|
+
- Shared mutable defaults create silent cross-instance state and can cause train-test leakage.
|
|
39
|
+
- Reusing one nested algorithm instance across datapoints overwrites results and leaks fitted state.
|
|
40
|
+
- Building a non-deterministic dataset index breaks splits, caching, and reproducibility.
|
|
41
|
+
- Passing a multi-row/multi-group dataset into `Pipeline.run` violates the intended interface.
|
|
42
|
+
- Storing learned state outside init parameters makes `clone()` drop it and breaks optimization semantics.
|
|
43
|
+
- Marking `PureParameter` incorrectly can invalidate optimization shortcuts; default to plain `Parameter` unless sure.
|
|
44
|
+
|
|
45
|
+
## Safe Defaults
|
|
46
|
+
|
|
47
|
+
- Prefer `pipeline.safe_run(datapoint)` over `pipeline.run(datapoint)`.
|
|
48
|
+
- Prefer `@make_action_safe` on custom action methods.
|
|
49
|
+
- Prefer `@make_optimize_safe` and `Optimize(...)` for `self_optimize` pipelines.
|
|
50
|
+
- Prefer `GridSearch`/`GridSearchCV`/`OptunaSearch` for brute-force or black-box search.
|
|
51
|
+
|
|
52
|
+
## Quick Triage
|
|
53
|
+
|
|
54
|
+
If behavior is strange, check these first:
|
|
55
|
+
|
|
56
|
+
1. Mutable default or shared nested object?
|
|
57
|
+
2. Missing `clone()` before nested `run`/`detect`/`self_optimize`?
|
|
58
|
+
3. Dataset `create_index()` deterministic and sorted?
|
|
59
|
+
4. `run` touching parameters instead of only writing `*_` results?
|
|
60
|
+
5. `self_optimize` changing non-optimizable params or non-parameter attrs?
|
|
61
|
+
|
|
62
|
+
## Source of Truth
|
|
63
|
+
|
|
64
|
+
- Concepts: `https://tpcp.readthedocs.io/en/latest/guides/general_concepts.html`
|
|
65
|
+
- Dataset/algorithm/pipeline model: `https://tpcp.readthedocs.io/en/latest/guides/algorithms_pipelines_datasets.html`
|
|
66
|
+
- Evaluation and leakage rules: `https://tpcp.readthedocs.io/en/latest/guides/algorithm_evaluation.html`
|
|
67
|
+
- Optimization rules: `https://tpcp.readthedocs.io/en/latest/guides/optimization.html`
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: tpcp-datasets
|
|
3
|
+
description: Use when implementing or reviewing tpcp Dataset classes, dataset indexes, grouping, subsetting, data accessors, and split/group label behavior for validation workflows.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# tpcp Datasets
|
|
7
|
+
|
|
8
|
+
Read `../tpcp-builder/SKILL.md` first for global guardrails.
|
|
9
|
+
Also load `../tpcp-basics/SKILL.md` when the dataset feeds a custom pipeline.
|
|
10
|
+
|
|
11
|
+
## Required Shape
|
|
12
|
+
|
|
13
|
+
- Subclass `Dataset[...]`.
|
|
14
|
+
- `__init__` must include `groupby_cols=None, subset_index=None` at the end of the signature and forward both to `super().__init__(...)`.
|
|
15
|
+
- Implement `create_index()` to return the full metadata index as a `pd.DataFrame`.
|
|
16
|
+
- Keep actual file/data loading out of `create_index()`; load lazily in properties/methods.
|
|
17
|
+
|
|
18
|
+
## Index Rules
|
|
19
|
+
|
|
20
|
+
- `create_index()` must be deterministic. tpcp calls it twice and will error if outputs differ.
|
|
21
|
+
- Sort the final index explicitly; do not rely on filesystem order or `set` iteration.
|
|
22
|
+
- Index columns should be valid Python identifiers. Invalid names break ergonomics around `get_subset`, `group_label`, and `group_labels`.
|
|
23
|
+
- If you use a typed named-tuple group label generic, its field names and order must match index columns exactly.
|
|
24
|
+
|
|
25
|
+
## Data Accessors
|
|
26
|
+
|
|
27
|
+
- Properties that expose actual data should usually require a single row via `assert_is_single(...)`.
|
|
28
|
+
- Properties that expose group-level data should require a single current group via `assert_is_single_group(...)`.
|
|
29
|
+
- Think carefully about what counts as a datapoint in your project before designing accessors.
|
|
30
|
+
|
|
31
|
+
## Grouping and Iteration
|
|
32
|
+
|
|
33
|
+
- Ungrouped dataset length = row count.
|
|
34
|
+
- Grouped dataset length = unique group count.
|
|
35
|
+
- Grouping changes what "one datapoint" means for iteration and splitting.
|
|
36
|
+
- `group_labels` follow current grouping; `index_as_tuples()` always reflects raw rows.
|
|
37
|
+
|
|
38
|
+
## Subsetting and Splits
|
|
39
|
+
|
|
40
|
+
- `get_subset(...)` accepts exactly one selector mode at a time.
|
|
41
|
+
- Use `groupby(...)` when train/test splitting should happen on a higher level than raw rows.
|
|
42
|
+
- Use `create_string_group_labels(...)` for `GroupKFold` and similar sklearn splitters.
|
|
43
|
+
- If the dataset is already grouped, `create_string_group_labels(...)` columns must be a subset of `groupby_cols`.
|
|
44
|
+
|
|
45
|
+
## Common Mistakes
|
|
46
|
+
|
|
47
|
+
- Non-deterministic index creation.
|
|
48
|
+
- Loading full signals/dataframes in `create_index()`.
|
|
49
|
+
- Forgetting `groupby_cols`/`subset_index` in custom dataset init.
|
|
50
|
+
- Accessing per-recording data from a subset that still contains multiple rows/groups.
|
|
51
|
+
- Splitting raw rows when the real independence unit is participant/session/day.
|
|
52
|
+
|
|
53
|
+
## Minimal Pattern
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
class MyDataset(Dataset[MyGroupLabel]):
|
|
57
|
+
def __init__(self, root: Path, groupby_cols=None, subset_index=None):
|
|
58
|
+
self.root = root
|
|
59
|
+
super().__init__(groupby_cols=groupby_cols, subset_index=subset_index)
|
|
60
|
+
|
|
61
|
+
def create_index(self) -> pd.DataFrame:
|
|
62
|
+
return build_index(self.root).sort_values(["participant", "recording"]).reset_index(drop=True)
|
|
63
|
+
|
|
64
|
+
@property
|
|
65
|
+
def signal(self) -> pd.DataFrame:
|
|
66
|
+
self.assert_is_single(None, "signal")
|
|
67
|
+
return load_signal(...)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Source of Truth
|
|
71
|
+
|
|
72
|
+
- `https://tpcp.readthedocs.io/en/latest/auto_examples/datasets/_01_datasets_basics.html`
|
|
73
|
+
- `https://tpcp.readthedocs.io/en/latest/guides/algorithms_pipelines_datasets.html`
|
|
74
|
+
- API docs for `Dataset`
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: tpcp-multiprocessing
|
|
3
|
+
description: Use when working on tpcp code that uses n_jobs, joblib parallelism, tpcp.parallel, caching in parallel workers, or when debugging multiprocessing, serialization, or global state issues in tpcp.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# tpcp Multiprocessing
|
|
7
|
+
|
|
8
|
+
Read the official `Multiprocessing Caveats` guide first:
|
|
9
|
+
`https://tpcp.readthedocs.io/en/latest/guides/multiprocessing_caveats.html`
|
|
10
|
+
|
|
11
|
+
## Use this skill when
|
|
12
|
+
|
|
13
|
+
- `validate`, `cross_validate`, `Scorer`, or an optimizer uses `n_jobs`
|
|
14
|
+
- global config seems missing in workers
|
|
15
|
+
- runtime monkey-patching/decorators/caches behave differently in parallel
|
|
16
|
+
- joblib raises pickle or `__main__`-related errors
|
|
17
|
+
- heavy imports make parallel runs unexpectedly slow
|
|
18
|
+
|
|
19
|
+
## Main Caveats
|
|
20
|
+
|
|
21
|
+
- Worker processes do not automatically inherit runtime global state changes from the parent process.
|
|
22
|
+
- Joblib `loky` workers are reused, so worker-side global mutations can leak into later jobs.
|
|
23
|
+
- Serialization is often the real failure point, not the parallel API itself.
|
|
24
|
+
- Objects defined in `__main__`, lambdas, nested classes/functions, and runtime-replaced globals are high risk.
|
|
25
|
+
- Heavy optional imports can dominate worker startup cost.
|
|
26
|
+
|
|
27
|
+
## tpcp-Specific Guidance
|
|
28
|
+
|
|
29
|
+
- For global state restoration, use `tpcp.parallel.delayed` together with `register_global_parallel_callback(...)`.
|
|
30
|
+
- Assume runtime-applied decorators or caches are not visible in workers unless explicitly restored there.
|
|
31
|
+
- If tests require a clean worker pool, shut down the reusable loky executor explicitly.
|
|
32
|
+
- If debugging cost outweighs the speedup, fall back to `n_jobs=1`.
|
|
33
|
+
|
|
34
|
+
## Fast Triage
|
|
35
|
+
|
|
36
|
+
1. Missing config only in workers: global-state problem, use `tpcp.parallel`.
|
|
37
|
+
2. Error mentions `__main__`: move code to an importable module.
|
|
38
|
+
3. Works once, fails later: suspect process-pool reuse and leaked worker state.
|
|
39
|
+
4. Parallel is slower than serial: inspect import cost and serialization overhead.
|
|
40
|
+
|
|
41
|
+
## Source of Truth
|
|
42
|
+
|
|
43
|
+
- `https://tpcp.readthedocs.io/en/latest/guides/multiprocessing_caveats.html`
|
|
44
|
+
- API docs for `tpcp.parallel`
|
|
45
|
+
- GitHub issue `#119`
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: tpcp-optimization
|
|
3
|
+
description: Use when implementing or reviewing tpcp self_optimize logic, parameter annotations, Optimize/GridSearch/GridSearchCV usage, and validation workflows that must avoid train-test leakage.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# tpcp Optimization
|
|
7
|
+
|
|
8
|
+
Read `../tpcp-builder/SKILL.md` first for the global guardrails.
|
|
9
|
+
Also load `../tpcp-basics/SKILL.md` for cloning/parameter rules and `../tpcp-datasets/SKILL.md` for split semantics.
|
|
10
|
+
|
|
11
|
+
## Pick the Right Tool
|
|
12
|
+
|
|
13
|
+
- Use `self_optimize` only for algorithm-specific training logic.
|
|
14
|
+
- Do not re-implement brute-force search inside `self_optimize`.
|
|
15
|
+
- Use `Optimize(pipeline)` for pipelines that implement `self_optimize`.
|
|
16
|
+
- Use `GridSearch` for brute-force search without inner CV.
|
|
17
|
+
- Use `GridSearchCV` when hyperparameter search itself needs CV.
|
|
18
|
+
- Use `DummyOptimize` if you need a non-optimizable baseline on the same CV folds.
|
|
19
|
+
|
|
20
|
+
## Parameter Semantics
|
|
21
|
+
|
|
22
|
+
- `OptimizableParameter`: changed by `self_optimize`.
|
|
23
|
+
- `HyperParameter`: changes how `self_optimize` behaves but is not changed by it.
|
|
24
|
+
- `PureParameter`: does not affect `self_optimize`; only use when you are certain.
|
|
25
|
+
- If unsure, prefer plain `Parameter` over `PureParameter`.
|
|
26
|
+
|
|
27
|
+
## Hard Rules for `self_optimize`
|
|
28
|
+
|
|
29
|
+
- Return `self`, or `(self, info)` from `self_optimize_with_info`.
|
|
30
|
+
- Modify only parameters marked as optimizable on the current class.
|
|
31
|
+
- Do not store learned state on non-parameter attrs.
|
|
32
|
+
- Any learned model/template/weights must survive `clone()`, so they must be represented as parameters.
|
|
33
|
+
- Clone nested algorithms before training/mutating them.
|
|
34
|
+
- Prefer `@make_optimize_safe`; `Optimize(...)` also applies equivalent checks.
|
|
35
|
+
|
|
36
|
+
## Evaluation Rules
|
|
37
|
+
|
|
38
|
+
- Never search/tune/train on the final test data.
|
|
39
|
+
- Outer evaluation measures the whole training procedure, not one already-trained instance.
|
|
40
|
+
- `cross_validate(...)` expects an optimizer object, not a bare pipeline.
|
|
41
|
+
- For grouped or stratified splits, create explicit group/label arrays or use `DatasetSplitter`.
|
|
42
|
+
- In custom scorers, call `pipeline.safe_run(datapoint)`.
|
|
43
|
+
|
|
44
|
+
## Common Mistakes
|
|
45
|
+
|
|
46
|
+
- Putting black-box parameter search into `self_optimize` instead of `GridSearch`/`OptunaSearch`.
|
|
47
|
+
- Forgetting to annotate optimizable params, causing safety checks to fail.
|
|
48
|
+
- Changing non-optimizable params during `self_optimize`.
|
|
49
|
+
- Marking a parameter as `PureParameter` even though it affects training.
|
|
50
|
+
- Training a nested algorithm in place and then reusing it across folds/datapoints.
|
|
51
|
+
- Calling `self_optimize` directly in user-facing code instead of using `Optimize(...)`.
|
|
52
|
+
|
|
53
|
+
## Minimal Pattern
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
class MyPipeline(OptimizablePipeline[MyDataset]):
|
|
57
|
+
model: Parameter[MyAlgo]
|
|
58
|
+
model__weights: OptimizableParameter[np.ndarray]
|
|
59
|
+
|
|
60
|
+
def __init__(self, model: MyAlgo = cf(MyAlgo())):
|
|
61
|
+
self.model = model
|
|
62
|
+
|
|
63
|
+
def self_optimize(self, dataset: MyDataset, **kwargs):
|
|
64
|
+
model = self.model.clone()
|
|
65
|
+
self.model = model.self_optimize(...)
|
|
66
|
+
return self
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Source of Truth
|
|
70
|
+
|
|
71
|
+
- `https://tpcp.readthedocs.io/en/latest/guides/optimization.html`
|
|
72
|
+
- `https://tpcp.readthedocs.io/en/latest/guides/algorithm_evaluation.html`
|
|
73
|
+
- `https://tpcp.readthedocs.io/en/latest/guides/algorithm_validation_tpcp.html`
|
|
74
|
+
- `https://tpcp.readthedocs.io/en/latest/auto_examples/parameter_optimization/_02_optimizable_pipelines.html`
|
|
75
|
+
- `https://tpcp.readthedocs.io/en/latest/auto_examples/parameter_optimization/_03_gridsearch_cv.html`
|
|
76
|
+
- API docs for `Optimize`, `GridSearch`, `GridSearchCV`, and `make_optimize_safe`
|
|
@@ -14,9 +14,17 @@ class Algorithm(BaseTpcpObject):
|
|
|
14
14
|
|
|
15
15
|
All type-specific algorithm classes should inherit from this class and need to
|
|
16
16
|
|
|
17
|
-
1. overwrite `
|
|
17
|
+
1. overwrite `_action_methods` with the name of the actual action method of this class type
|
|
18
18
|
2. implement a stub for the action method
|
|
19
19
|
|
|
20
|
+
Examples
|
|
21
|
+
--------
|
|
22
|
+
>>> class MyAlgorithm(Algorithm):
|
|
23
|
+
... _action_methods = "detect"
|
|
24
|
+
...
|
|
25
|
+
... def detect(self, data):
|
|
26
|
+
... return self
|
|
27
|
+
|
|
20
28
|
If you want to create an optimizable algorithm, add a `self_optimize` (or `self_optimize_with_info`) method to your
|
|
21
29
|
class.
|
|
22
30
|
We do not provide a separate base class for that, as we can make no assumptions about the call signature of your
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""CLI helpers for tpcp."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import shutil
|
|
7
|
+
import sysconfig
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _find_distributed_skills_dir() -> Path:
|
|
12
|
+
repo_skills = Path(__file__).resolve().parents[2] / "skills" / "tpcp"
|
|
13
|
+
if repo_skills.is_dir():
|
|
14
|
+
return repo_skills
|
|
15
|
+
|
|
16
|
+
installed_skills = Path(sysconfig.get_paths()["data"]) / "tpcp"
|
|
17
|
+
if installed_skills.is_dir():
|
|
18
|
+
return installed_skills
|
|
19
|
+
|
|
20
|
+
raise FileNotFoundError(
|
|
21
|
+
"Could not locate the distributed tpcp skills. "
|
|
22
|
+
"Expected either a repository checkout at `skills/tpcp` or installed package data at "
|
|
23
|
+
f"`{installed_skills}`."
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def install_skills(project_dir: Path, *, force: bool = False) -> list[Path]:
|
|
28
|
+
"""Install the distributed tpcp skills into the project's `.agent` folder."""
|
|
29
|
+
source_dir = _find_distributed_skills_dir()
|
|
30
|
+
destination_dir = project_dir / ".agent"
|
|
31
|
+
destination_dir.mkdir(parents=True, exist_ok=True)
|
|
32
|
+
|
|
33
|
+
skill_dirs = sorted(p for p in source_dir.iterdir() if p.is_dir())
|
|
34
|
+
if not force:
|
|
35
|
+
conflicts = [
|
|
36
|
+
destination_dir / skill_dir.name for skill_dir in skill_dirs if (destination_dir / skill_dir.name).exists()
|
|
37
|
+
]
|
|
38
|
+
if conflicts:
|
|
39
|
+
conflict_names = ", ".join(sorted(p.name for p in conflicts))
|
|
40
|
+
raise FileExistsError(
|
|
41
|
+
"The following skills already exist in the destination: "
|
|
42
|
+
f"{conflict_names}. Re-run with `--force` to replace them."
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
installed = []
|
|
46
|
+
for skill_dir in skill_dirs:
|
|
47
|
+
target = destination_dir / skill_dir.name
|
|
48
|
+
if target.exists():
|
|
49
|
+
if target.is_dir():
|
|
50
|
+
shutil.rmtree(target)
|
|
51
|
+
else:
|
|
52
|
+
target.unlink()
|
|
53
|
+
shutil.copytree(skill_dir, target)
|
|
54
|
+
installed.append(target)
|
|
55
|
+
|
|
56
|
+
return installed
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def main(argv: list[str] | None = None) -> int:
|
|
60
|
+
"""Run the tpcp CLI."""
|
|
61
|
+
parser = argparse.ArgumentParser(prog="tpcp")
|
|
62
|
+
subparsers = parser.add_subparsers(dest="command")
|
|
63
|
+
|
|
64
|
+
install_parser = subparsers.add_parser(
|
|
65
|
+
"install-skills",
|
|
66
|
+
help="Copy the distributed tpcp skills into the current project's `.agent` folder.",
|
|
67
|
+
)
|
|
68
|
+
install_parser.add_argument(
|
|
69
|
+
"--project-dir",
|
|
70
|
+
type=Path,
|
|
71
|
+
default=Path.cwd(),
|
|
72
|
+
help="Project directory that should receive the `.agent` folder. Defaults to the current working directory.",
|
|
73
|
+
)
|
|
74
|
+
install_parser.add_argument(
|
|
75
|
+
"--force",
|
|
76
|
+
action="store_true",
|
|
77
|
+
help="Replace already installed tpcp skills in the destination.",
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
args = parser.parse_args(argv)
|
|
81
|
+
|
|
82
|
+
if args.command == "install-skills":
|
|
83
|
+
installed = install_skills(args.project_dir.resolve(), force=args.force)
|
|
84
|
+
if installed:
|
|
85
|
+
print(f"Installed {len(installed)} skill(s) into {args.project_dir.resolve() / '.agent'}.")
|
|
86
|
+
else:
|
|
87
|
+
print("No new skills installed.")
|
|
88
|
+
return 0
|
|
89
|
+
|
|
90
|
+
parser.print_help()
|
|
91
|
+
return 1
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
if __name__ == "__main__":
|
|
95
|
+
raise SystemExit(main())
|
|
@@ -474,7 +474,7 @@ class _Dataset(BaseTpcpObject, Generic[GroupLabelT]):
|
|
|
474
474
|
|
|
475
475
|
def create_group_labels(self, label_cols: Union[str, list[str]]) -> list[str]:
|
|
476
476
|
warnings.warn(
|
|
477
|
-
"The method `
|
|
477
|
+
"The method `create_group_labels` is deprecated and will be removed in a future version. "
|
|
478
478
|
"Use `create_string_group_labels` instead.",
|
|
479
479
|
DeprecationWarning,
|
|
480
480
|
stacklevel=1,
|
|
@@ -17,6 +17,7 @@ class Pipeline(Algorithm, Generic[DatasetT]):
|
|
|
17
17
|
"""Baseclass for all custom pipelines.
|
|
18
18
|
|
|
19
19
|
To create your own custom pipeline, subclass this class and implement `run`.
|
|
20
|
+
The `run` method is expected to operate on exactly one dataset datapoint/group.
|
|
20
21
|
"""
|
|
21
22
|
|
|
22
23
|
_action_methods: ClassVar[tuple[str, str]] = ("safe_run", "run")
|
|
@@ -30,6 +31,10 @@ class Pipeline(Algorithm, Generic[DatasetT]):
|
|
|
30
31
|
.. note::
|
|
31
32
|
It is usually preferred to use `safe_run` on custom pipelines instead of `run`, as `safe_run` can
|
|
32
33
|
catch certain implementation errors of the run method.
|
|
34
|
+
However, neither `run` nor `safe_run` verifies that `datapoint` actually represents only a single
|
|
35
|
+
datapoint/group.
|
|
36
|
+
Pipeline implementations should enforce this through dataset accessors and/or explicit
|
|
37
|
+
`assert_is_single(...)`/`assert_is_single_group(...)` checks.
|
|
33
38
|
|
|
34
39
|
Parameters
|
|
35
40
|
----------
|
|
@@ -50,6 +55,7 @@ class Pipeline(Algorithm, Generic[DatasetT]):
|
|
|
50
55
|
|
|
51
56
|
It is preferred to use this method over `run`, as it can catch some simple implementation errors of custom
|
|
52
57
|
pipelines.
|
|
58
|
+
It does not validate that the provided dataset instance contains only a single datapoint/group.
|
|
53
59
|
|
|
54
60
|
The following things are checked:
|
|
55
61
|
|
|
@@ -81,7 +87,7 @@ class OptimizablePipeline(Pipeline[DatasetT]):
|
|
|
81
87
|
|
|
82
88
|
OptimizablePipelines are expected to implement a concrete way to train internal models or optimize parameters.
|
|
83
89
|
This should not be a reimplementation of GridSearch or similar methods.
|
|
84
|
-
For this :class:`tpcp.
|
|
90
|
+
For this :class:`tpcp.optimize.GridSearch` should be used directly.
|
|
85
91
|
|
|
86
92
|
It is important that `self_optimize` only modifies input parameters of the pipeline that are marked as
|
|
87
93
|
`OptimizableParameter`.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|