stet 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. stet-0.0.1/.github/workflows/ci.yml +52 -0
  2. stet-0.0.1/.gitignore +23 -0
  3. stet-0.0.1/.python-version +1 -0
  4. stet-0.0.1/LICENSE +21 -0
  5. stet-0.0.1/PKG-INFO +105 -0
  6. stet-0.0.1/PYPITOKEN +1 -0
  7. stet-0.0.1/README.md +92 -0
  8. stet-0.0.1/benchmarks/run_benchmarks.py +175 -0
  9. stet-0.0.1/docs/explanation/how-once-works.md +66 -0
  10. stet-0.0.1/docs/explanation/performance.md +63 -0
  11. stet-0.0.1/docs/explanation/related-libraries.md +33 -0
  12. stet-0.0.1/docs/how-to/choose-a-backend.md +66 -0
  13. stet-0.0.1/docs/how-to/choose-key-parameters.md +41 -0
  14. stet-0.0.1/docs/how-to/decorate-a-function.md +37 -0
  15. stet-0.0.1/docs/how-to/handle-reruns.md +9 -0
  16. stet-0.0.1/docs/how-to/import-existing-results.md +64 -0
  17. stet-0.0.1/docs/how-to/inspect-and-reset-a-store.md +32 -0
  18. stet-0.0.1/docs/how-to/install.md +55 -0
  19. stet-0.0.1/docs/how-to/recover-from-a-crash.md +27 -0
  20. stet-0.0.1/docs/how-to/rerun-everything.md +10 -0
  21. stet-0.0.1/docs/how-to/specify-a-store.md +53 -0
  22. stet-0.0.1/docs/how-to/use-with-multiprocessing.md +44 -0
  23. stet-0.0.1/docs/img/logo.svg +10 -0
  24. stet-0.0.1/docs/img/record_overhead.png +0 -0
  25. stet-0.0.1/docs/img/skip_overhead.png +0 -0
  26. stet-0.0.1/docs/index.md +60 -0
  27. stet-0.0.1/docs/reference/api.md +23 -0
  28. stet-0.0.1/docs/tutorials/getting-started.md +74 -0
  29. stet-0.0.1/mkdocs.yml +50 -0
  30. stet-0.0.1/pyproject.toml +45 -0
  31. stet-0.0.1/src/stet/__init__.py +18 -0
  32. stet-0.0.1/src/stet/_decorator.py +126 -0
  33. stet-0.0.1/src/stet/_utils.py +97 -0
  34. stet-0.0.1/src/stet/backends/__init__.py +54 -0
  35. stet-0.0.1/src/stet/backends/_base.py +62 -0
  36. stet-0.0.1/src/stet/backends/_csv.py +110 -0
  37. stet-0.0.1/src/stet/backends/_json.py +108 -0
  38. stet-0.0.1/src/stet/backends/_parquet.py +117 -0
  39. stet-0.0.1/src/stet/backends/_sqlite.py +133 -0
  40. stet-0.0.1/src/stet/py.typed +0 -0
  41. stet-0.0.1/tests/__init__.py +0 -0
  42. stet-0.0.1/tests/test_backends.py +241 -0
  43. stet-0.0.1/tests/test_decorator.py +129 -0
  44. stet-0.0.1/tests/test_utils.py +163 -0
stet-0.0.1/.github/workflows/ci.yml ADDED
@@ -0,0 +1,52 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ lint:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+ - uses: astral-sh/setup-uv@v5
14
+ - run: uv sync --extra parquet
15
+ - run: uv run ruff format --check .
16
+ - run: uv run ruff check .
17
+ - run: uv run ty check
18
+
19
+ test:
20
+ runs-on: ${{ matrix.os }}
21
+ strategy:
22
+ fail-fast: false
23
+ matrix:
24
+ os: [ubuntu-latest, windows-latest, macos-latest]
25
+ python: ["3.11", "3.12", "3.13"]
26
+ steps:
27
+ - uses: actions/checkout@v4
28
+ - uses: astral-sh/setup-uv@v5
29
+ with:
30
+ python-version: ${{ matrix.python }}
31
+ - run: uv sync --extra parquet
32
+ - run: uv run pytest
33
+
34
+ docs:
35
+ runs-on: ubuntu-latest
36
+ steps:
37
+ - uses: actions/checkout@v4
38
+ - uses: astral-sh/setup-uv@v5
39
+ - run: uv sync
40
+ - run: uv run mkdocs build --strict
41
+
42
+ docs-deploy:
43
+ runs-on: ubuntu-latest
44
+ needs: [lint, test, docs]
45
+ if: github.ref == 'refs/heads/main' && github.event_name == 'push'
46
+ permissions:
47
+ contents: write
48
+ steps:
49
+ - uses: actions/checkout@v4
50
+ - uses: astral-sh/setup-uv@v5
51
+ - run: uv sync
52
+ - run: uv run mkdocs gh-deploy --force
stet-0.0.1/.gitignore ADDED
@@ -0,0 +1,23 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+ # Test and coverage
13
+ .coverage
14
+ .pytest_cache/
15
+
16
+ # MkDocs build output
17
+ site/
18
+
19
+ # Runtime filelock files
20
+ *.lock
21
+
22
+ # Default once store files (generated at runtime)
23
+ _once_store.*
stet-0.0.1/.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.14
stet-0.0.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Vince Knight
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
stet-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,105 @@
1
+ Metadata-Version: 2.4
2
+ Name: stet
3
+ Version: 0.0.1
4
+ Summary: Persistent memoization by parameter identity for experiment scripts
5
+ Author-email: Vince Knight <vince@vknight.org>
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.11
8
+ Requires-Dist: filelock>=3.25.1
9
+ Requires-Dist: pandas>=3.0.1
10
+ Provides-Extra: parquet
11
+ Requires-Dist: pyarrow>=23.0.1; extra == 'parquet'
12
+ Description-Content-Type: text/markdown
13
+
14
+ # stet
15
+
16
+ A Python library for making parameter sweeps safely resumable.
17
+
18
+ When a long-running experiment script is re-run — whether after a crash, a time limit, or deliberately to extend a sweep — `stet` automatically skips any parameter combinations that have already been completed.
19
+
20
+ ```python
21
+ import stet
22
+
23
+ @stet.once(store='markov_runs.csv', key=['alpha', 'n_states', 'seed'])
24
+ def solve_markov(alpha, n_states, seed, n_iter=10_000):
25
+ # expensive computation
26
+ ...
27
+
28
+ for alpha in alphas:
29
+ for n_states in [10, 50, 100]:
30
+ for seed in range(20):
31
+ solve_markov(alpha=alpha, n_states=n_states, seed=seed)
32
+ ```
33
+
34
+ On restart, any already-completed `(alpha, n_states, seed)` combinations are skipped:
35
+
36
+ ```
37
+ [once] Skipping solve_markov(alpha=0.01, n_states=10, seed=0)
38
+ [once] Skipping solve_markov(alpha=0.01, n_states=10, seed=1)
39
+ ...
40
+ ```
41
+
42
+ ## Installation
43
+
44
+ ```bash
45
+ uv add stet
46
+ # or: python -m pip install stet
47
+ ```
48
+
49
+ With Parquet support:
50
+
51
+ ```bash
52
+ uv add stet[parquet]
53
+ # or: python -m pip install stet[parquet]
54
+ ```
55
+
56
+ ## Usage
57
+
58
+ **Zero config** — uses `_stet_store.csv` in the current directory, all parameters as the key:
59
+
60
+ ```python
61
+ @stet.once
62
+ def run_experiment(alpha, seed):
63
+ ...
64
+ ```
65
+
66
+ **Named store** — backend selected from file extension (`.csv`, `.json`, `.sqlite`, `.parquet`):
67
+
68
+ ```python
69
+ @stet.once(store='runs.sqlite')
70
+ def run_experiment(alpha, seed):
71
+ ...
72
+ ```
73
+
74
+ **Key subset** — only `alpha` and `seed` determine whether a run is skipped; `n_iter` is ignored:
75
+
76
+ ```python
77
+ @stet.once(store='runs.csv', key=['alpha', 'seed'])
78
+ def run_experiment(alpha, seed, n_iter=1000):
79
+ ...
80
+ ```
81
+
82
+ ## Utilities
83
+
84
+ ```python
85
+ stet.status() # print a summary of completed runs
86
+ stet.reset() # clear the store
87
+ stet.reset(key_dict={'alpha': '0.1', 'seed': '42'}) # remove one entry
88
+ ```
89
+
90
+ ## Storage backends
91
+
92
+ | Extension | Backend | Notes |
93
+ |-----------|---------|-------|
94
+ | `.csv` | pandas CSV | Default. Human-readable. |
95
+ | `.json` | stdlib json | No extra dependencies. |
96
+ | `.sqlite` | stdlib sqlite3 | Best for large stores and parallel workers. |
97
+ | `.parquet` | pandas + pyarrow | Requires `stet[parquet]`. |
98
+
99
+ ## Documentation
100
+
101
+ Full documentation including how-to guides, API reference, and explanation of design decisions:
102
+
103
+ ```bash
104
+ uv run mkdocs serve
105
+ ```
stet-0.0.1/PYPITOKEN ADDED
@@ -0,0 +1 @@
1
+ pypi-[REDACTED — a live PyPI API token was committed to this package and published; the token must be revoked immediately via pypi.org account settings]
stet-0.0.1/README.md ADDED
@@ -0,0 +1,92 @@
1
+ # stet
2
+
3
+ A Python library for making parameter sweeps safely resumable.
4
+
5
+ When a long-running experiment script is re-run — whether after a crash, a time limit, or deliberately to extend a sweep — `stet` automatically skips any parameter combinations that have already been completed.
6
+
7
+ ```python
8
+ import stet
9
+
10
+ @stet.once(store='markov_runs.csv', key=['alpha', 'n_states', 'seed'])
11
+ def solve_markov(alpha, n_states, seed, n_iter=10_000):
12
+ # expensive computation
13
+ ...
14
+
15
+ for alpha in alphas:
16
+ for n_states in [10, 50, 100]:
17
+ for seed in range(20):
18
+ solve_markov(alpha=alpha, n_states=n_states, seed=seed)
19
+ ```
20
+
21
+ On restart, any already-completed `(alpha, n_states, seed)` combinations are skipped:
22
+
23
+ ```
24
+ [once] Skipping solve_markov(alpha=0.01, n_states=10, seed=0)
25
+ [once] Skipping solve_markov(alpha=0.01, n_states=10, seed=1)
26
+ ...
27
+ ```
28
+
29
+ ## Installation
30
+
31
+ ```bash
32
+ uv add stet
33
+ # or: python -m pip install stet
34
+ ```
35
+
36
+ With Parquet support:
37
+
38
+ ```bash
39
+ uv add stet[parquet]
40
+ # or: python -m pip install stet[parquet]
41
+ ```
42
+
43
+ ## Usage
44
+
45
+ **Zero config** — uses `_stet_store.csv` in the current directory, all parameters as the key:
46
+
47
+ ```python
48
+ @stet.once
49
+ def run_experiment(alpha, seed):
50
+ ...
51
+ ```
52
+
53
+ **Named store** — backend selected from file extension (`.csv`, `.json`, `.sqlite`, `.parquet`):
54
+
55
+ ```python
56
+ @stet.once(store='runs.sqlite')
57
+ def run_experiment(alpha, seed):
58
+ ...
59
+ ```
60
+
61
+ **Key subset** — only `alpha` and `seed` determine whether a run is skipped; `n_iter` is ignored:
62
+
63
+ ```python
64
+ @stet.once(store='runs.csv', key=['alpha', 'seed'])
65
+ def run_experiment(alpha, seed, n_iter=1000):
66
+ ...
67
+ ```
68
+
69
+ ## Utilities
70
+
71
+ ```python
72
+ stet.status() # print a summary of completed runs
73
+ stet.reset() # clear the store
74
+ stet.reset(key_dict={'alpha': '0.1', 'seed': '42'}) # remove one entry
75
+ ```
76
+
77
+ ## Storage backends
78
+
79
+ | Extension | Backend | Notes |
80
+ |-----------|---------|-------|
81
+ | `.csv` | pandas CSV | Default. Human-readable. |
82
+ | `.json` | stdlib json | No extra dependencies. |
83
+ | `.sqlite` | stdlib sqlite3 | Best for large stores and parallel workers. |
84
+ | `.parquet` | pandas + pyarrow | Requires `stet[parquet]`. |
85
+
86
+ ## Documentation
87
+
88
+ Full documentation including how-to guides, API reference, and explanation of design decisions:
89
+
90
+ ```bash
91
+ uv run mkdocs serve
92
+ ```
stet-0.0.1/benchmarks/run_benchmarks.py ADDED
@@ -0,0 +1,175 @@
1
+ """Benchmark the overhead of stet across backends and store sizes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import statistics
6
+ import tempfile
7
+ import time
8
+ from pathlib import Path
9
+
10
+ import matplotlib.pyplot as plt
11
+ import matplotlib.ticker as ticker
12
+
13
+ from stet.backends._csv import CsvBackend
14
+ from stet.backends._json import JsonBackend
15
+ from stet.backends._sqlite import SqliteBackend
16
+
17
+ try:
18
+ from stet.backends._parquet import ParquetBackend
19
+
20
+ HAS_PARQUET = True
21
+ except ImportError:
22
+ HAS_PARQUET = False
23
+
24
+ BACKENDS = {
25
+ "CSV": CsvBackend,
26
+ "JSON": JsonBackend,
27
+ "SQLite": SqliteBackend,
28
+ }
29
+ if HAS_PARQUET:
30
+ BACKENDS["Parquet"] = ParquetBackend # type: ignore[assignment]
31
+
32
+ EXTENSIONS = {
33
+ "CSV": "csv",
34
+ "JSON": "json",
35
+ "SQLite": "sqlite",
36
+ "Parquet": "parquet",
37
+ }
38
+
39
+ N_REPEATS = 20 # repetitions per measurement
40
+ STORE_SIZES = [0, 10, 100, 500, 1000, 2000, 5000, 10000]
41
+
42
+
43
+ def measure(fn, n: int = N_REPEATS) -> tuple[float, float]:
44
+ """Return (mean_ms, stdev_ms) over n repetitions."""
45
+ times = []
46
+ for _ in range(n):
47
+ t0 = time.perf_counter()
48
+ fn()
49
+ times.append((time.perf_counter() - t0) * 1000)
50
+ return statistics.mean(times), statistics.stdev(times)
51
+
52
+
53
+ def bench_record(backend_cls, ext: str, n_existing: int) -> tuple[float, float]:
54
+ """Time a single record() call with n_existing records already present."""
55
+ with tempfile.TemporaryDirectory() as d:
56
+ path = Path(d) / f"store.{ext}"
57
+ b = backend_cls(path)
58
+ for i in range(n_existing):
59
+ b.record({"x": i, "y": i * 2})
60
+
61
+ counter = [n_existing]
62
+
63
+ def fn() -> None:
64
+ b.record({"x": counter[0], "y": counter[0] * 2})
65
+ counter[0] += 1
66
+
67
+ return measure(fn)
68
+
69
+
70
+ def bench_has_hit(backend_cls, ext: str, n_existing: int) -> tuple[float, float]:
71
+ """Time a has() call that returns True (skip path) with n_existing records."""
72
+ with tempfile.TemporaryDirectory() as d:
73
+ path = Path(d) / f"store.{ext}"
74
+ b = backend_cls(path)
75
+ for i in range(n_existing):
76
+ b.record({"x": i, "y": i * 2})
77
+ # target: last record inserted
78
+ target = {"x": n_existing - 1, "y": (n_existing - 1) * 2}
79
+ return measure(lambda: b.has(target))
80
+
81
+
82
+ def bench_has_miss(backend_cls, ext: str, n_existing: int) -> tuple[float, float]:
83
+ """Time a has() call that returns False (run path) with n_existing records."""
84
+ with tempfile.TemporaryDirectory() as d:
85
+ path = Path(d) / f"store.{ext}"
86
+ b = backend_cls(path)
87
+ for i in range(n_existing):
88
+ b.record({"x": i, "y": i * 2})
89
+ absent = {"x": 999_999, "y": 999_999}
90
+ return measure(lambda: b.has(absent))
91
+
92
+
93
+ def run_scaling_benchmark() -> dict[str, dict[str, list]]:
94
+ """
95
+ For each backend, measure has() (hit and miss) and record() time
96
+ as store size grows.
97
+ """
98
+ results: dict[str, dict[str, list]] = {
99
+ name: {"sizes": [], "has_hit": [], "has_miss": [], "record": []}
100
+ for name in BACKENDS
101
+ }
102
+
103
+ for name, cls in BACKENDS.items():
104
+ ext = EXTENSIONS[name]
105
+ print(f"\n{name}")
106
+ for n in STORE_SIZES:
107
+ if n == 0 and name != "SQLite":
108
+ # has() on empty store is trivial; use 1 as minimum for has_hit
109
+ hit_mean, _ = bench_has_hit(cls, ext, max(n, 1))
110
+ else:
111
+ hit_mean, _ = bench_has_hit(cls, ext, max(n, 1))
112
+ miss_mean, _ = bench_has_miss(cls, ext, n)
113
+ rec_mean, _ = bench_record(cls, ext, n)
114
+ results[name]["sizes"].append(n)
115
+ results[name]["has_hit"].append(hit_mean)
116
+ results[name]["has_miss"].append(miss_mean)
117
+ results[name]["record"].append(rec_mean)
118
+ print(
119
+ f" n={n:5d} has_hit={hit_mean:.2f}ms"
120
+ f" has_miss={miss_mean:.2f}ms record={rec_mean:.2f}ms"
121
+ )
122
+
123
+ return results
124
+
125
+
126
+ def plot_results(results: dict, out_dir: Path) -> None:
127
+ out_dir.mkdir(parents=True, exist_ok=True)
128
+ colors = {
129
+ "CSV": "#1f77b4",
130
+ "JSON": "#ff7f0e",
131
+ "SQLite": "#2ca02c",
132
+ "Parquet": "#d62728",
133
+ }
134
+
135
+ # --- Plot 1: has() scaling (skip path) ---
136
+ fig, ax = plt.subplots(figsize=(7, 4))
137
+ for name, data in results.items():
138
+ ax.plot(
139
+ data["sizes"], data["has_hit"], marker="o", label=name, color=colors[name]
140
+ )
141
+ ax.set_xlabel("Records in store")
142
+ ax.set_ylabel("Time (ms)")
143
+ ax.set_title("Skip check overhead by store size")
144
+ ax.legend()
145
+ ax.yaxis.set_minor_locator(ticker.AutoMinorLocator())
146
+ ax.grid(True, which="both", alpha=0.3)
147
+ fig.tight_layout()
148
+ fig.savefig(out_dir / "skip_overhead.png", dpi=150)
149
+ plt.close(fig)
150
+ print(f"\nSaved: {out_dir / 'skip_overhead.png'}")
151
+
152
+ # --- Plot 2: record() scaling ---
153
+ fig, ax = plt.subplots(figsize=(7, 4))
154
+ for name, data in results.items():
155
+ ax.plot(
156
+ data["sizes"], data["record"], marker="o", label=name, color=colors[name]
157
+ )
158
+ ax.set_xlabel("Records in store")
159
+ ax.set_ylabel("Time (ms)")
160
+ ax.set_title("Record overhead by store size")
161
+ ax.legend()
162
+ ax.yaxis.set_minor_locator(ticker.AutoMinorLocator())
163
+ ax.grid(True, which="both", alpha=0.3)
164
+ fig.tight_layout()
165
+ fig.savefig(out_dir / "record_overhead.png", dpi=150)
166
+ plt.close(fig)
167
+ print(f"Saved: {out_dir / 'record_overhead.png'}")
168
+
169
+
170
+ if __name__ == "__main__":
171
+ print("Running benchmarks...")
172
+ results = run_scaling_benchmark()
173
+ out_dir = Path(__file__).parent.parent / "docs" / "img"
174
+ plot_results(results, out_dir)
175
+ print("\nDone.")
stet-0.0.1/docs/explanation/how-once-works.md ADDED
@@ -0,0 +1,66 @@
1
+ # How stet Works
2
+
3
+ ## Parameter-keyed tracking, not memoization
4
+
5
+ Traditional memoization (e.g. `functools.lru_cache`) stores the *return value* of a function so that repeated calls with the same inputs return the cached result. `stet` does something different: it tracks *which parameter combinations have been executed* and skips them on future calls — without storing return values at all.
6
+
7
+ This distinction matters for experiment scripts. Researchers typically write their own outputs (CSV rows, model files, database records). What they need is not cached results, but a durable record of which experiments have already run.
8
+
9
+ ## How the decorator works
10
+
11
+ When you write:
12
+
13
+ ```python
14
+ @stet.once(store='_stet_store.csv', key=['alpha', 'beta'])
15
+ def run_experiment(alpha, beta, n_steps):
16
+ ...
17
+ ```
18
+
19
+ `stet` wraps `run_experiment` so that each call:
20
+
21
+ 1. Binds all arguments to their parameter names using `inspect.signature`.
22
+ 2. Extracts only the `key` parameters (`alpha`, `beta`).
23
+ 3. Checks the store: has this `(alpha, beta)` combination been recorded?
24
+ 4. If yes: prints a skip message and returns `None`.
25
+ 5. If no: calls the real function, then records the key parameters plus a timestamp.
26
+
27
+ ## Why the store is separate from your data
28
+
29
+ A natural question is: why not write results directly into the store file, so there is only one file to manage?
30
+
31
+ The short answer is that experiment outputs are too varied for `stet` to own them. A single experiment might produce a row in a CSV, a trained model checkpoint, a plot, entries in a database, or all of the above. There is no single file format or schema that fits every case, and any attempt to impose one would either be too restrictive or too complex to be useful.
32
+
33
+ Keeping the store separate means `stet` only needs to solve the narrow problem it was designed for — *did this parameter combination run?* — and leaves the richer question of *what did it produce?* entirely to you. This also means your output files stay in whatever format your analysis tools already expect, with no `stet`-specific structure mixed in.
34
+
35
+ The practical consequence is two files: a `_stet_store.csv` (or `.sqlite`, `.json`, etc.) that `stet` manages, and your own output file that your script manages. If you want to check whether a particular run completed, use `stet.status()`; if you want to inspect the results, open your output file directly.
36
+
37
+ There is one failure mode worth being aware of: if your function crashes *after* writing output but *before* returning (so `stet` never records the run), the store and your output file will be out of sync. On restart `stet` will re-run that experiment. Whether that is a problem depends on your output — appending a duplicate row to a CSV is usually harmless; re-training an expensive model is not. If atomicity matters, the safest pattern is to return results from the function and write output *after* the call returns:
38
+
39
+ ```python
40
+ @stet.once(store='_stet_store.csv', key=['alpha', 'seed'])
41
+ def run_experiment(alpha, seed):
42
+ return expensive_computation(alpha, seed)
43
+
44
+ for alpha in alphas:
45
+ for seed in seeds:
46
+ result = run_experiment(alpha=alpha, seed=seed)
47
+ if result is not None: # None means the run was skipped
48
+ write_output(result)
49
+ ```
50
+
51
+ This way `stet` records the run only after the function has completed successfully, and you write output only after `stet` has recorded it.
52
+
53
+ ## Backend selection
54
+
55
+ The backend is selected automatically from the file extension:
56
+
57
+ - `.csv` → `CsvBackend` (pandas)
58
+ - `.parquet` → `ParquetBackend` (pandas + pyarrow)
59
+ - `.sqlite` / `.db` → `SqliteBackend` (stdlib sqlite3)
60
+ - `.json` → `JsonBackend` (stdlib json)
61
+
62
+ All backends implement the same interface (`BaseBackend`), so they're interchangeable.
63
+
64
+ ## File locking
65
+
66
+ `stet` uses `filelock` to acquire a file-level lock before every read or write. This ensures that concurrent processes (e.g. `multiprocessing`, `joblib`) cannot corrupt the store. The lock is released immediately after the operation.
stet-0.0.1/docs/explanation/performance.md ADDED
@@ -0,0 +1,63 @@
1
+ # Performance
2
+
3
+ The dominant cost in `stet` is store I/O — reading the store to check whether a key exists, and writing to it when recording a new run.
4
+
5
+ ## What was measured
6
+
7
+ The benchmark calls `backend.has()` and `backend.record()` directly — it does not go through the decorator. It isolates the store I/O cost specifically, and does not include the decorator's own work (argument binding, key extraction, file lock acquisition).
8
+
9
+ For each backend and a range of store sizes, we measured:
10
+
11
+ - **Skip check** (`has()`) — the time to look up a key that is present in the store.
12
+ - **Record** (`record()`) — the time to write a new entry to the store.
13
+
14
+ Each measurement is the mean of 20 repetitions. Store entries each had two key columns (`x`, `y`).
15
+
16
+ ### Environment
17
+
18
+ | | |
19
+ |---|---|
20
+ | Machine | Apple M2, 16 GB RAM |
21
+ | OS | macOS (darwin arm64) |
22
+ | Python | 3.14.0 |
23
+ | pandas | 3.0.1 |
24
+ | pyarrow | 23.0.1 |
25
+ | filelock | 3.25.1 |
26
+ | Store sizes tested | 0, 10, 100, 500, 1000, 2000, 5000, 10000 records |
27
+
28
+ Results on different hardware will vary, but the relative shape — SQLite staying flat, file-based backends growing linearly — holds regardless of machine.
29
+
30
+ ## Results
31
+
32
+ ### Skip check overhead
33
+
34
+ ![Skip check overhead by store size](../img/skip_overhead.png)
35
+
36
+ ### Record overhead
37
+
38
+ ![Record overhead by store size](../img/record_overhead.png)
39
+
40
+ ## What the numbers mean
41
+
42
+ **SQLite is the only backend that stays flat.** Its skip check and record time are essentially constant (~0.3–0.6 ms) regardless of how many records are in the store. This is because SQLite uses a B-tree index for lookups and writes are transactional.
43
+
44
+ **CSV, JSON, and Parquet all read the entire file on every operation.** Their overhead grows with store size because checking whether a key exists requires loading all existing records first. At 10,000 records, CSV and JSON skip checks take ~6 ms each; JSON record writes reach ~23 ms because the whole file is rewritten on every call.
45
+
46
+ **Parquet has a higher fixed cost than CSV or JSON at small sizes** (~1 ms at 10 records vs ~0.6 ms) due to the cost of parsing the binary format, but it scales more gently than JSON at larger sizes (~3.8 ms record writes at 10,000 records vs ~23 ms for JSON).
47
+
48
+ ## Practical guidance
49
+
50
+ For most experiment sweeps the overhead is negligible — a skip check under 1 ms is undetectable against any function that does real work. But it becomes relevant in two situations:
51
+
52
+ - **Very fast functions** (sub-millisecond): the overhead can dominate. Consider batching or restructuring so that `stet` wraps a coarser unit of work.
53
+ - **Very large stores with file-based backends**: at 5,000+ records, CSV and JSON become noticeably slow. Switch to SQLite if your store will grow large, especially in combination with parallel workers (see [Use with Multiprocessing](../how-to/use-with-multiprocessing.md)).
54
+
55
+ ## Reproducing these results
56
+
57
+ The benchmarking script lives at `benchmarks/run_benchmarks.py`:
58
+
59
+ ```
60
+ $ uv run python benchmarks/run_benchmarks.py
61
+ ```
62
+
63
+ It re-generates the plots in `docs/img/`.
stet-0.0.1/docs/explanation/related-libraries.md ADDED
@@ -0,0 +1,33 @@
1
+ # How `stet` relates to similar libraries
2
+
3
+ Several Python libraries solve adjacent problems to `stet`. Understanding how they differ clarifies what `stet` is for and why it makes the design choices it does.
4
+
5
+ ## `joblib.Memory`
6
+
7
+ `joblib.Memory` is the most widely used tool in this space. It wraps a function so that the return value is persisted to disk on the first call and returned from the cache on subsequent identical calls — standard memoization, made durable across processes and restarts.
8
+
9
+ Because `joblib.Memory` stores return values, its storage is opaque: results live as pickle files in a nested directory structure not meant to be read by anything other than `joblib` itself. There is no way to open the cache in a spreadsheet, query it with SQL, or inspect it with standard tools. It also stores everything the function returns, which can mean large files for functions that return arrays or models.
10
+
11
+ It also has no concept of a key subset. The cache key is the full set of arguments; there is no way to say "these parameters define the experiment identity, but this one is just a computational setting". For research workflows where you want to vary convergence tolerances or iteration counts without invalidating cached results, this is a significant constraint.
12
+
13
+ `stet` is not trying to replace `joblib.Memory` — it is solving a different problem.
14
+
15
+ ## `checkpointing`
16
+
17
+ The `checkpointing` package (PyPI) is a decorator that caches return values as pickle files and skips re-execution for identical arguments, with configurable behaviour on errors. Its intent and API are close to `joblib.Memory`: persist outputs, skip re-computation.
18
+
19
+ The same differences apply. Storage is not human-readable or queryable, return values are always persisted, and there is no user-defined key subset. It adds some useful error-handling flexibility that `joblib.Memory` lacks, but the underlying model — cache the output, look it up on re-call — is the same.
20
+
21
+ ## `checkpointer`
22
+
23
+ `checkpointer` (PyPI) is a more sophisticated decorator focused on *cache correctness*. It can detect when the decorated function's source code or dependencies have changed and invalidate the cache accordingly. It also supports async functions and robust hashing of complex objects such as NumPy arrays and PyTorch tensors.
24
+
25
+ This solves a different problem. The question `checkpointer` answers is: *is this cached result still valid given that the code may have changed?* The question `stet` answers is: *has this parameter combination been run before?* For a researcher running a fixed parameter sweep across sessions, code-aware invalidation is not needed — and the overhead of hashing complex objects would add cost to every call.
26
+
27
+ ## `memento` (wickerlab)
28
+
29
+ `memento` is the closest in intent to `stet`. It is explicitly designed for researchers running expensive experiments over a parameter grid, with built-in parallelisation and result caching.
30
+
31
+ The key difference is that `memento` is workflow-prescriptive. You define your parameter grid upfront as a configuration object and pass it to `Memento.run()`, which handles iteration and parallelisation. This requires structuring your experiment around `memento`'s API.
32
+
33
+ `stet` takes the opposite position: it is workflow-neutral. You decorate your function and loop over parameters however you already do — a `for` loop, a list comprehension, a call from a notebook cell, a parallel map. The decorator fits into existing code without restructuring it, and composes with whatever parallelisation approach you already use.