pycache-skip 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cache_skip/__init__.py +4 -0
- cache_skip/decorator.py +186 -0
- cache_skip/deps.py +156 -0
- cache_skip/dirmaker.py +32 -0
- cache_skip/scanner.py +38 -0
- cache_skip/state.py +36 -0
- cache_skip/tests/__init__.py +0 -0
- cache_skip/tests/test_decorator.py +228 -0
- cache_skip/tests/test_deps.py +75 -0
- cache_skip/tests/test_dirmaker.py +34 -0
- cache_skip/tests/test_scanner.py +57 -0
- cache_skip/tests/test_state.py +39 -0
- pycache_skip-0.1.0.dist-info/METADATA +130 -0
- pycache_skip-0.1.0.dist-info/RECORD +16 -0
- pycache_skip-0.1.0.dist-info/WHEEL +4 -0
- pycache_skip-0.1.0.dist-info/licenses/LICENSE +21 -0
cache_skip/__init__.py
ADDED
cache_skip/decorator.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
import inspect
|
|
2
|
+
import shutil
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
from functools import wraps
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import xxhash
|
|
8
|
+
from loguru import logger
|
|
9
|
+
|
|
10
|
+
from . import deps as _deps
|
|
11
|
+
from .scanner import scan_inputs
|
|
12
|
+
from .state import FileRecord, InputState, read_state, write_state
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _collect_input_paths(bound: inspect.BoundArguments) -> list[Path]:
    """Return the bound argument values that are ``Path`` objects.

    Values are returned in binding order. The argument named ``_output`` is
    skipped, so the output location is never treated as an input. Only values
    that are themselves ``Path`` instances are collected; containers are not
    searched for nested paths.
    """
    return [
        candidate
        for arg_name, candidate in bound.arguments.items()
        if arg_name != "_output" and isinstance(candidate, Path)
    ]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _compute_args_hash(bound: inspect.BoundArguments) -> str:
    """Hash the call's non-path arguments into an xxh128 hex digest.

    ``_output`` and every argument whose value is a ``Path`` are left out.
    Each remaining argument is rendered as ``name=repr(value)``; the
    renderings are ordered by argument name and joined with ``|`` before
    hashing, so the digest does not depend on binding order.
    """
    rendered = sorted(
        (arg_name, repr(arg_value))
        for arg_name, arg_value in bound.arguments.items()
        if arg_name != "_output" and not isinstance(arg_value, Path)
    )
    payload = "|".join(f"{arg_name}={text}" for arg_name, text in rendered)
    return xxhash.xxh128(payload.encode()).hexdigest()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _compute_file_hash(path: Path) -> str:
    """Return the xxh128 hex digest of the file at *path*.

    The file is fed to the hasher in fixed-size chunks instead of being read
    into memory in one piece, so hashing a large input does not require
    holding the whole file in RAM. The digest is the same as hashing the
    complete content at once.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    hasher = xxhash.xxh128()
    with path.open("rb") as stream:
        # 1 MiB reads keep memory flat without paying per-call overhead.
        while chunk := stream.read(1 << 20):
            hasher.update(chunk)
    return hasher.hexdigest()
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _do_rerun(
    fn: Callable,
    args: tuple,
    kwargs: dict,
    input_paths: list[Path],
    args_hash: str,
    dep_hash: str,
    output: Path,
) -> object:
    """Wipe *output*, run *fn*, and record the state of its inputs.

    The output directory is removed (removal errors are ignored) and
    recreated before ``fn(*args, **kwargs)`` is called. Afterwards the inputs
    are scanned, the supplied hashes are attached to the scan result, and the
    state is written to ``.input_state.json`` inside *output*.

    Returns:
        Whatever *fn* returned.
    """
    # Start from an empty output directory.
    shutil.rmtree(output, ignore_errors=True)
    output.mkdir(parents=True, exist_ok=True)

    outcome = fn(*args, **kwargs)

    # The snapshot is taken after the run, so it describes the inputs as they
    # are once the step has finished.
    snapshot = scan_inputs(input_paths)
    snapshot.args_hash, snapshot.dep_hash = args_hash, dep_hash
    write_state(snapshot, output / ".input_state.json")
    return outcome
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def cache_skip(
    _fn: Callable | None = None,
    *,
    track_dependencies: bool = True,
) -> Callable:
    """Decorate a pipeline step so it is skipped when nothing it depends on changed.

    Works both bare (``@cache_skip``) and with options
    (``@cache_skip(track_dependencies=False)``).

    The wrapped function must be called with an ``_output`` keyword argument
    naming its output directory. On each call the wrapper compares, against
    the ``.input_state.json`` stored in that directory:

    * the hash of the non-``Path`` arguments,
    * the dependency hash of the function's source (when enabled),
    * the set of input files found under the ``Path`` arguments,
    * the metadata and, when metadata drifted, the content hash of each file.

    If anything differs, or there is no output or no state file, the output
    directory is recreated, the function runs, and fresh state is written.

    Args:
        _fn: The function being decorated when used as a bare decorator.
        track_dependencies: When true, ``deps.compute_dep_hash`` is called on
            every invocation and a change in its result forces a rerun. When
            false the stored dependency hash is the empty string.

    Returns:
        The wrapped function, or a decorator when called with options only.
        The wrapper returns the function's own result when it runs, and the
        output ``Path`` when the run is skipped.

    Raises:
        AssertionError: at call time, when ``_output`` is not passed as a
            keyword argument.
    """

    def decorator(fn: Callable) -> Callable:
        # The signature is fixed once the function exists; inspect it once
        # instead of on every call.
        sig = inspect.signature(fn)

        @wraps(fn)
        def wrapper(*args, **kwargs):
            # Step 0 — extract _output
            output = kwargs.get("_output")
            assert (
                output is not None
            ), f"{fn.__qualname__}() missing required keyword argument: '_output'"
            output = Path(output)

            # Steps 1-3 — input paths, argument hash, dependency hash
            bound = sig.bind(*args, **kwargs)
            bound.apply_defaults()
            input_paths = _collect_input_paths(bound)
            args_hash = _compute_args_hash(bound)
            dep_hash = _deps.compute_dep_hash(fn) if track_dependencies else ""

            def rerun() -> object:
                return _do_rerun(
                    fn, args, kwargs, input_paths, args_hash, dep_hash, output
                )

            # Step 4 — output absent → fresh run. Removing a directory that
            # does not exist is a no-op, so the rerun path covers this too.
            if not output.exists():
                result = rerun()
                logger.info(
                    "cache_skip: ran {} — output did not exist", fn.__qualname__
                )
                return result

            # Step 5 — output present → compare state
            state_path = output / ".input_state.json"
            if not state_path.exists():
                logger.info("cache_skip: rerunning {} — no state file", fn.__qualname__)
                return rerun()

            stored = read_state(state_path)

            if stored.args_hash != args_hash:
                logger.info("cache_skip: rerunning {} — args changed", fn.__qualname__)
                return rerun()

            if stored.dep_hash != dep_hash:
                logger.info(
                    "cache_skip: rerunning {} — dep_hash changed", fn.__qualname__
                )
                return rerun()

            current = scan_inputs(input_paths)

            if set(current.files) != set(stored.files):
                logger.info(
                    "cache_skip: rerunning {} — file set changed", fn.__qualname__
                )
                return rerun()

            refreshed: dict[str, FileRecord] = {}
            for path_str, current_rec in current.files.items():
                # Both mappings have the same keys at this point.
                stored_rec = stored.files[path_str]

                if (
                    current_rec.mtime == stored_rec.mtime
                    and current_rec.size == stored_rec.size
                    and current_rec.inode == stored_rec.inode
                ):
                    refreshed[path_str] = stored_rec
                    continue

                # Metadata drifted. The scan already hashed the file, so use
                # that digest rather than reading the file a second time.
                if current_rec.hash != stored_rec.hash:
                    logger.info(
                        "cache_skip: rerunning {} — file content changed",
                        fn.__qualname__,
                    )
                    return rerun()

                # Same content, new metadata: remember the new metadata so
                # the next call can take the cheap path.
                refreshed[path_str] = current_rec

            if refreshed != stored.files:
                stored.files = refreshed
                write_state(stored, state_path)

            logger.info("cache_skip: skipping {} — inputs unchanged", fn.__qualname__)
            return output

        return wrapper

    if _fn is not None:
        return decorator(_fn)
    return decorator
|
cache_skip/deps.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
import ast
|
|
2
|
+
import inspect
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import xxhash
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def compute_dep_hash(
    func: Callable,
    dep_roots: list[Path] | None = None,
    dep_files: list[Path] | None = None,
) -> str:
    """Return a hash over the source files *func* depends on.

    Args:
        func: The function whose defining module is the starting point.
        dep_roots: Directories that bound the import walk. When omitted a
            single root is derived from the module's file and dotted name.
        dep_files: Explicit list of files to hash. When given, import
            discovery is skipped entirely and *func* is not inspected.

    Returns:
        The combined xxh128 hex digest of the selected files.

    Raises:
        AssertionError: if the module of *func* cannot be determined or has
            no ``__file__``.
    """
    # An explicit file list short-circuits everything else.
    if dep_files is not None:
        return _hash_files(dep_files)

    module = inspect.getmodule(func)
    assert module is not None, f"Cannot determine module for {func}"
    assert (
        hasattr(module, "__file__") and module.__file__ is not None
    ), f"Module {module.__name__} has no __file__"

    roots = _auto_detect_roots(module) if dep_roots is None else dep_roots
    resolved_roots = [Path(root).resolve() for root in roots]

    # The defining module is always included, even if it lies outside the roots.
    seen: set[Path] = set()
    _walk_imports(
        Path(module.__file__).resolve(), resolved_roots, seen, is_entry=True
    )
    return _hash_files(list(seen))
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _auto_detect_roots(mod) -> list[Path]:
    """Derive the import root directory for *mod*.

    The root is the directory from which the module's dotted name can be
    resolved: the module file's directory, moved up once per package level in
    ``mod.__name__``.

    A package module lives in ``<package>/__init__.py``, which is one
    directory deeper than a plain module with the same dotted name, so it
    needs one extra step up. Without that step the result would be the
    package directory itself and absolute imports of the package's own
    submodules would not resolve.

    Args:
        mod: A module object with ``__file__`` and ``__name__`` set.

    Returns:
        A one-element list holding the detected root directory.
    """
    mod_file = Path(mod.__file__).resolve()
    levels_up = mod.__name__.count(".")
    if mod_file.name == "__init__.py":
        levels_up += 1

    pkg_dir = mod_file.parent
    for _ in range(levels_up):
        pkg_dir = pkg_dir.parent
    return [pkg_dir]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _hash_files(paths: list[Path]) -> str:
    """Return one xxh128 hex digest covering the given files.

    Every path is resolved, the resolved paths are ordered by their string
    form, and each contributes a ``<resolved path>:<content digest>`` line.
    The lines are joined with newlines and hashed, so the result depends on
    both file locations and file contents but not on the order of *paths*.
    """
    resolved_paths = sorted((p.resolve() for p in paths), key=str)
    lines = [
        f"{resolved}:{xxhash.xxh128(resolved.read_bytes()).hexdigest()}"
        for resolved in resolved_paths
    ]
    return xxhash.xxh128("\n".join(lines).encode()).hexdigest()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _walk_imports(
    source_file: Path,
    dep_roots: list[Path],
    visited: set[Path],
    *,
    is_entry: bool = False,
) -> None:
    """Recursively collect the local ``.py`` files imported by *source_file*.

    The file is parsed with ``ast`` (it is never executed) and every
    ``import`` / ``from ... import`` statement anywhere in it is followed,
    provided the target resolves to a file under one of *dep_roots*.

    For ``from`` imports both the named module and each imported name are
    tried as modules. This covers ``from . import sibling`` and
    ``from pkg import submodule``, where the dependency is the imported name
    rather than the module in the ``from`` clause. Names that are not modules
    fall back to the enclosing module, which is already visited by then.

    Args:
        source_file: File to scan.
        dep_roots: Resolved directories that bound the walk.
        visited: Accumulator of resolved files seen so far; updated in place.
        is_entry: When true the file is recorded even if it lies outside
            every root.
    """
    source_file = source_file.resolve()
    if source_file in visited:
        return
    if not source_file.is_file() or source_file.suffix != ".py":
        return
    if not is_entry and not any(_is_under(source_file, root) for root in dep_roots):
        return

    visited.add(source_file)

    try:
        tree = ast.parse(source_file.read_bytes(), filename=str(source_file))
    except SyntaxError:
        # The file still counts as a dependency; its imports just cannot be followed.
        return

    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            candidates = [alias.name for alias in node.names]
        elif isinstance(node, ast.ImportFrom):
            # node.module is None for "from . import x"; use "" as the base then.
            base = node.module or ""
            if node.level > 0:
                base = _resolve_relative(base, node.level, source_file, dep_roots)
                if base is None:
                    continue
            candidates = [base] if base else []
            candidates.extend(
                f"{base}.{alias.name}" if base else alias.name
                for alias in node.names
                if alias.name != "*"
            )
        else:
            continue

        for module_name in candidates:
            resolved = _resolve_module(module_name, source_file, dep_roots)
            if resolved:
                _walk_imports(resolved, dep_roots, visited)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _resolve_module(
|
|
98
|
+
module_name: str,
|
|
99
|
+
from_file: Path,
|
|
100
|
+
dep_roots: list[Path],
|
|
101
|
+
) -> Path | None:
|
|
102
|
+
parts = module_name.split(".")
|
|
103
|
+
for root in dep_roots:
|
|
104
|
+
pkg_path = root / "/".join(parts) / "__init__.py"
|
|
105
|
+
if pkg_path.is_file():
|
|
106
|
+
return pkg_path.resolve()
|
|
107
|
+
mod_path = (
|
|
108
|
+
root / "/".join(parts[:-1]) / (parts[-1] + ".py")
|
|
109
|
+
if len(parts) > 1
|
|
110
|
+
else root / (parts[0] + ".py")
|
|
111
|
+
)
|
|
112
|
+
if mod_path.is_file():
|
|
113
|
+
return mod_path.resolve()
|
|
114
|
+
for i in range(len(parts), 0, -1):
|
|
115
|
+
sub = parts[:i]
|
|
116
|
+
pkg_init = root / "/".join(sub) / "__init__.py"
|
|
117
|
+
if pkg_init.is_file():
|
|
118
|
+
return pkg_init.resolve()
|
|
119
|
+
mod_file = (
|
|
120
|
+
root / "/".join(sub[:-1]) / (sub[-1] + ".py")
|
|
121
|
+
if len(sub) > 1
|
|
122
|
+
else root / (sub[0] + ".py")
|
|
123
|
+
)
|
|
124
|
+
if mod_file.is_file():
|
|
125
|
+
return mod_file.resolve()
|
|
126
|
+
return None
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _resolve_relative(
|
|
130
|
+
module_name: str,
|
|
131
|
+
level: int,
|
|
132
|
+
from_file: Path,
|
|
133
|
+
dep_roots: list[Path],
|
|
134
|
+
) -> str | None:
|
|
135
|
+
pkg_dir = from_file.parent
|
|
136
|
+
for _ in range(level - 1):
|
|
137
|
+
pkg_dir = pkg_dir.parent
|
|
138
|
+
for root in dep_roots:
|
|
139
|
+
if _is_under(pkg_dir, root):
|
|
140
|
+
try:
|
|
141
|
+
rel = pkg_dir.relative_to(root)
|
|
142
|
+
prefix = ".".join(rel.parts)
|
|
143
|
+
if prefix and module_name:
|
|
144
|
+
return f"{prefix}.{module_name}"
|
|
145
|
+
return prefix or module_name
|
|
146
|
+
except ValueError:
|
|
147
|
+
continue
|
|
148
|
+
return None
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _is_under(path: Path, root: Path) -> bool:
    """Return whether *path* is *root* itself or located somewhere below it.

    The comparison is purely lexical: neither path is resolved or checked
    for existence.
    """
    return path.is_relative_to(root)
|
cache_skip/dirmaker.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Dirmaker — manages output paths under a pipeline staging root."""
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from loguru import logger
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Dirmaker:
    """Hands out named output directories beneath one staging root.

    A pipeline step asks for its output directory by name. ``new_output_dir``
    gives it a freshly created, empty directory; ``path_for`` only computes
    where that directory would be. Steps are expected to write their results
    there and never into their input directories.
    """

    def __init__(self, base: Path):
        """Remember *base* as the staging root and create it if needed."""
        self.root = Path(base)
        self.root.mkdir(parents=True, exist_ok=True)

    def new_output_dir(self, name: str) -> Path:
        """Return an empty directory called *name* under the root.

        Anything already present at that location is deleted first.
        """
        out = self.root / name
        if not out.exists():
            out.mkdir(parents=True, exist_ok=True)
            return out

        logger.info(f"Removing existing output dir: {out}")
        shutil.rmtree(out)
        out.mkdir(parents=True, exist_ok=True)
        return out

    def path_for(self, name: str) -> Path:
        """Return the path *name* would get under the root, touching nothing."""
        return self.root / name
|
cache_skip/scanner.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import xxhash
|
|
5
|
+
|
|
6
|
+
from .state import FileRecord, InputState
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _hash_file(path: Path) -> str:
    """Return the xxh128 hex digest of the file at *path*.

    The content is hashed chunk by chunk rather than loaded in one read, so
    scanning a large input file does not need memory proportional to its
    size. The digest equals the one obtained by hashing the full content in
    a single call.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    hasher = xxhash.xxh128()
    with path.open("rb") as stream:
        # 1 MiB reads keep memory flat without paying per-call overhead.
        while chunk := stream.read(1 << 20):
            hasher.update(chunk)
    return hasher.hexdigest()
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _make_record(p: Path) -> FileRecord:
    """Build the ``FileRecord`` describing the file at *p*.

    Metadata comes from a single ``os.stat`` call taken before the content is
    hashed. The modification time is stored in seconds, converted from the
    nanosecond timestamp.
    """
    info = os.stat(p)
    content_hash = _hash_file(p)
    return FileRecord(
        path=str(p),
        mtime=info.st_mtime_ns / 1e9,
        inode=info.st_ino,
        size=info.st_size,
        hash=content_hash,
    )
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def scan_inputs(input_paths: list[Path]) -> InputState:
    """Record every file reachable from *input_paths*.

    A path that is a file is recorded directly. A directory is walked
    recursively and every file in it is recorded, except files named
    ``.input_state.json``. Records are keyed by the resolved path string.

    Returns:
        An ``InputState`` with version 1, the collected records, and empty
        ``args_hash`` / ``dep_hash`` for the caller to fill in.

    Raises:
        AssertionError: if one of the paths does not exist.
    """
    files: dict[str, FileRecord] = {}
    for path in input_paths:
        assert path.exists(), f"Input path does not exist: {path}"

        if path.is_file():
            targets = [path]
        elif path.is_dir():
            targets = [
                Path(dirpath, filename)
                for dirpath, _, filenames in os.walk(path)
                for filename in filenames
                if filename != ".input_state.json"
            ]
        else:
            # Exists but is neither a regular file nor a directory: ignored.
            targets = []

        for target in targets:
            resolved = target.resolve()
            files[str(resolved)] = _make_record(resolved)

    return InputState(version=1, files=files, args_hash="", dep_hash="")
|
cache_skip/state.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from dataclasses import asdict, dataclass
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class FileRecord:
    """Snapshot of one input file as persisted in the state file."""

    # Path of the file as a string; the scanner stores the resolved path here.
    path: str
    # Modification time in seconds (the scanner derives it from st_mtime_ns).
    mtime: float
    # Inode number reported by os.stat.
    inode: int
    # File size in bytes.
    size: int
    # xxh128 hex digest of the file content.
    hash: str
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class InputState:
    """Everything recorded about one run's inputs; serialized to JSON."""

    # Format version of the state file; read_state accepts only 1.
    version: int
    # One record per input file, keyed by the file's path string.
    files: dict[str, FileRecord]
    # Hash of the call's non-path arguments ("" until the caller sets it).
    args_hash: str
    # Hash of the step's source dependencies; "" when tracking is disabled.
    dep_hash: str
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def write_state(state: InputState, path: Path) -> None:
    """Serialize *state* to JSON (two-space indent) and write it to *path*.

    The dataclass, including its nested file records, is converted to plain
    dictionaries first. An existing file at *path* is overwritten.
    """
    payload = asdict(state)
    serialized = json.dumps(payload, indent=2)
    path.write_text(serialized)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def read_state(path: Path) -> InputState:
    """Load an ``InputState`` from the JSON file at *path*.

    Each entry under ``"files"`` is rebuilt into a ``FileRecord``.

    Raises:
        AssertionError: if the stored ``version`` is not 1.
    """
    data = json.loads(path.read_text())
    assert data["version"] == 1, f"Unknown state version: {data['version']}"

    records: dict[str, FileRecord] = {}
    for key, fields in data["files"].items():
        records[key] = FileRecord(**fields)

    return InputState(
        data["version"],
        records,
        data["args_hash"],
        data["dep_hash"],
    )
|
|
File without changes
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
from datetime import date
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from unittest.mock import patch
|
|
6
|
+
|
|
7
|
+
import pytest
|
|
8
|
+
|
|
9
|
+
from cache_skip.decorator import cache_skip
|
|
10
|
+
from cache_skip.state import read_state
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@pytest.fixture
def inp(tmp_path):
    """Provide an input directory that holds one small data file."""
    input_dir = tmp_path / "input"
    input_dir.mkdir()
    input_dir.joinpath("data.txt").write_bytes(b"hello")
    return input_dir
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_first_call_creates_output_and_state(tmp_path, inp):
    """The first call runs the step and leaves a state file in the output."""
    out_dir = tmp_path / "output"
    runs = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        runs.append(1)

    step(inp, _output=out_dir)

    assert len(runs) == 1
    assert out_dir.exists()
    assert (out_dir / ".input_state.json").exists()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_second_call_no_changes_skips(tmp_path, inp):
    """A repeat call with untouched inputs is skipped and returns the output path."""
    out_dir = tmp_path / "output"
    runs = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        runs.append(1)

    step(inp, _output=out_dir)
    second = step(inp, _output=out_dir)

    assert len(runs) == 1
    assert second == out_dir
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_second_call_input_changed_reruns(tmp_path, inp):
    """Rewriting an input file with new content forces a second run."""
    out_dir = tmp_path / "output"
    runs = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        runs.append(1)

    step(inp, _output=out_dir)
    inp.joinpath("data.txt").write_bytes(b"changed content")
    step(inp, _output=out_dir)

    assert len(runs) == 2
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def test_metadata_drift_same_content_no_rerun(tmp_path, inp):
    """Changing only a file's timestamps must not trigger a rerun."""
    out_dir = tmp_path / "output"
    runs = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        runs.append(1)

    step(inp, _output=out_dir)

    # Shift access and modification times by two seconds; content is untouched.
    data_file = inp / "data.txt"
    before = os.stat(data_file)
    shifted_times = (before.st_atime + 2.0, before.st_mtime + 2.0)
    os.utime(data_file, shifted_times)

    second = step(inp, _output=out_dir)

    assert len(runs) == 1
    assert second == out_dir
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def test_args_changed_reruns(tmp_path, inp):
    """A different value for a non-path argument forces a rerun."""
    out_dir = tmp_path / "output"
    runs = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, scale: int, _output: Path):
        runs.append(1)

    for scale in (1, 2):
        step(inp, scale, _output=out_dir)

    assert len(runs) == 2
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def test_dep_hash_changed_reruns(tmp_path, inp):
    """A changed dependency hash between calls forces a rerun."""
    out_dir = tmp_path / "output"
    runs = []

    @cache_skip
    def step(i: Path, _output: Path):
        runs.append(1)

    first_hash = "hash_a" + "0" * 26
    second_hash = "hash_b" + "0" * 26

    with patch("cache_skip.deps.compute_dep_hash", return_value=first_hash):
        step(inp, _output=out_dir)
    with patch("cache_skip.deps.compute_dep_hash", return_value=second_hash):
        step(inp, _output=out_dir)

    assert len(runs) == 2
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def test_output_exists_no_state_file_reruns(tmp_path, inp):
    """An existing output directory without a state file still runs the step."""
    out_dir = tmp_path / "output"
    out_dir.mkdir()
    runs = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        runs.append(1)

    step(inp, _output=out_dir)

    assert len(runs) == 1
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def test_output_deleted_externally_runs_fresh(tmp_path, inp):
    """Deleting the output directory between calls makes the step run again."""
    out_dir = tmp_path / "output"
    runs = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        runs.append(1)

    step(inp, _output=out_dir)
    shutil.rmtree(out_dir)
    step(inp, _output=out_dir)

    assert len(runs) == 2
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def test_input_file_deleted_reruns(tmp_path, inp):
    """Removing a file from the input directory forces a rerun."""
    out_dir = tmp_path / "output"
    runs = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        runs.append(1)

    step(inp, _output=out_dir)
    inp.joinpath("data.txt").unlink()
    step(inp, _output=out_dir)

    assert len(runs) == 2
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def test_missing_output_kwarg_raises(tmp_path, inp):
    """Calling a decorated step without ``_output`` raises an AssertionError."""

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        pass

    expectation = pytest.raises(AssertionError, match="_output")
    with expectation:
        step(inp)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def test_track_dependencies_false_does_not_call_compute_dep_hash(tmp_path, inp):
    """With dependency tracking off, the dependency hasher is never invoked."""
    out_dir = tmp_path / "output"

    with patch("cache_skip.deps.compute_dep_hash") as dep_hasher:

        @cache_skip(track_dependencies=False)
        def step(i: Path, _output: Path):
            pass

        step(inp, _output=out_dir)

        dep_hasher.assert_not_called()
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def test_both_decorator_forms_work(tmp_path, inp):
    """Both ``@cache_skip`` and ``@cache_skip(...)`` produce working wrappers."""
    bare_out = tmp_path / "output1"
    configured_out = tmp_path / "output2"

    @cache_skip
    def step_a(i: Path, _output: Path):
        pass

    @cache_skip(track_dependencies=False)
    def step_b(i: Path, _output: Path):
        pass

    # Only the bare form tracks dependencies, so only it needs the stub.
    with patch("cache_skip.deps.compute_dep_hash", return_value="x" * 32):
        step_a(inp, _output=bare_out)
    step_b(inp, _output=configured_out)

    assert bare_out.exists()
    assert configured_out.exists()
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def test_non_path_args_change_triggers_rerun(tmp_path, inp):
    """A different ``date`` argument changes the args hash and forces a rerun."""
    out_dir = tmp_path / "output"
    runs = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, schedule_date: date, _output: Path):
        runs.append(1)

    for day in (date(2025, 1, 1), date(2025, 1, 2)):
        step(inp, day, _output=out_dir)

    assert len(runs) == 2
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def test_optional_path_none_not_in_input_paths(tmp_path, inp):
    """An optional path argument passed as ``None`` does not break skipping."""
    out_dir = tmp_path / "output"
    runs = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, extra: Path | None, _output: Path):
        runs.append(1)

    step(inp, None, _output=out_dir)
    second = step(inp, None, _output=out_dir)

    assert len(runs) == 1
    assert second == out_dir
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def test_state_dep_hash_empty_when_disabled(tmp_path, inp):
    """With dependency tracking off, the stored dependency hash is empty."""
    out_dir = tmp_path / "output"

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        pass

    step(inp, _output=out_dir)

    saved = read_state(out_dir / ".input_state.json")
    assert saved.dep_hash == ""
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from cache_skip import deps
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_hash_files_returns_hex_string(tmp_path):
    """``_hash_files`` yields a 32-character string."""
    source = tmp_path / "sample.py"
    source.write_text("x = 1\n")

    digest = deps._hash_files([source])

    assert isinstance(digest, str)
    assert len(digest) == 32
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_same_files_same_hash(tmp_path):
    """Hashing the same unchanged file twice gives the same digest."""
    source = tmp_path / "sample.py"
    source.write_text("x = 1\n")

    first = deps._hash_files([source])
    second = deps._hash_files([source])

    assert first == second
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def test_changed_file_different_hash(tmp_path):
    """Changing a file's content changes the digest."""
    source = tmp_path / "sample.py"

    source.write_text("x = 1\n")
    before = deps._hash_files([source])

    source.write_text("x = 99\n")
    after = deps._hash_files([source])

    assert before != after
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def test_compute_dep_hash_returns_hex_string(tmp_path):
    """With an explicit file list, ``compute_dep_hash`` yields a 32-character string."""
    source = tmp_path / "sample.py"
    source.write_text("x = 42\n")

    digest = deps.compute_dep_hash(lambda: None, dep_files=[source])

    assert isinstance(digest, str)
    assert len(digest) == 32
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_compute_dep_hash_deterministic(tmp_path):
    """Two calls over the same unchanged file list agree."""
    source = tmp_path / "sample.py"
    source.write_text("x = 42\n")

    first = deps.compute_dep_hash(lambda: None, dep_files=[source])
    second = deps.compute_dep_hash(lambda: None, dep_files=[source])

    assert first == second
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_compute_dep_hash_changes_with_file(tmp_path):
    """Editing a listed dependency file changes the dependency hash."""
    source = tmp_path / "sample.py"

    source.write_text("x = 42\n")
    before = deps.compute_dep_hash(lambda: None, dep_files=[source])

    source.write_text("x = 99\n")
    after = deps.compute_dep_hash(lambda: None, dep_files=[source])

    assert before != after
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_track_dependencies_false_dep_hash_is_empty(tmp_path):
    """A step decorated with tracking disabled stores an empty dependency hash."""
    from pathlib import Path

    from cache_skip.decorator import cache_skip
    from cache_skip.state import read_state

    input_dir = tmp_path / "inp"
    input_dir.mkdir()
    input_dir.joinpath("data.txt").write_bytes(b"x")
    out_dir = tmp_path / "output"

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        pass

    step(input_dir, _output=out_dir)

    saved = read_state(out_dir / ".input_state.json")
    assert saved.dep_hash == ""
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from cache_skip.dirmaker import Dirmaker
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_new_output_dir_creates_directory(tmp_path):
    """``new_output_dir`` creates the named directory under the root."""
    maker = Dirmaker(tmp_path / "root")

    created = maker.new_output_dir("step1")

    assert created.exists()
    assert created.is_dir()
    assert created == tmp_path / "root" / "step1"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_new_output_dir_deletes_and_recreates(tmp_path):
    """Requesting an existing name empties the directory and keeps its path."""
    maker = Dirmaker(tmp_path / "root")
    first = maker.new_output_dir("step1")
    leftover = first / "result.txt"
    leftover.write_text("old")

    second = maker.new_output_dir("step1")

    assert second == first
    assert not leftover.exists()
    assert first.exists()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def test_path_for_no_side_effects(tmp_path):
    """``path_for`` computes the location without creating anything."""
    maker = Dirmaker(tmp_path / "root")

    planned = maker.path_for("step1")

    assert planned == tmp_path / "root" / "step1"
    assert not planned.exists()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_dirmaker_creates_root_on_init(tmp_path):
    """Constructing a ``Dirmaker`` creates the root, including missing parents."""
    root = tmp_path / "deep" / "root"

    # Only the constructor's side effect is under test; the instance is not needed.
    Dirmaker(root)

    assert root.exists()
    assert root.is_dir()
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
import xxhash
|
|
6
|
+
|
|
7
|
+
from cache_skip.scanner import scan_inputs
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_single_file(tmp_path):
    """Scanning one file records its size, inode, mtime and content hash."""
    target = tmp_path / "data.txt"
    target.write_bytes(b"hello world")

    state = scan_inputs([target])

    assert len(state.files) == 1
    record = state.files[str(target.resolve())]
    info = os.stat(target)
    assert record.size == 11
    assert record.inode == info.st_ino
    assert record.mtime == pytest.approx(info.st_mtime_ns / 1e9)
    assert record.hash == xxhash.xxh128(b"hello world").hexdigest()
    # The scanner leaves both hashes empty for the caller to fill in.
    assert state.args_hash == ""
    assert state.dep_hash == ""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_directory(tmp_path):
    """Scanning a directory records each file under an absolute path key."""
    folder = tmp_path / "sub"
    folder.mkdir()
    folder.joinpath("a.txt").write_bytes(b"a")
    folder.joinpath("b.txt").write_bytes(b"b")

    state = scan_inputs([folder])

    assert len(state.files) == 2
    for recorded_path in state.files:
        assert Path(recorded_path).is_absolute()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_nonexistent(tmp_path):
    """Scanning a path that does not exist raises an AssertionError."""
    absent = tmp_path / "does_not_exist"

    expectation = pytest.raises(AssertionError, match="does not exist")
    with expectation:
        scan_inputs([absent])
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_empty_directory(tmp_path):
    """Scanning an empty directory records no files."""
    folder = tmp_path / "empty"
    folder.mkdir()

    state = scan_inputs([folder])

    assert state.files == {}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_skips_input_state_json(tmp_path):
    """A ``.input_state.json`` inside a scanned directory is not recorded."""
    folder = tmp_path / "dir"
    folder.mkdir()
    folder.joinpath("data.txt").write_bytes(b"x")
    folder.joinpath(".input_state.json").write_bytes(b"{}")

    state = scan_inputs([folder])

    assert len(state.files) == 1
    assert all(".input_state.json" not in recorded for recorded in state.files)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from cache_skip.state import FileRecord, InputState, read_state, write_state
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _make_state() -> InputState:
    """Build a fixed ``InputState`` with one file record for round-trip tests."""
    record = FileRecord(
        path="/tmp/foo.txt",
        mtime=1716000000.123,
        inode=98765,
        size=4096,
        hash="abcdef1234567890abcdef1234567890",
    )
    return InputState(
        version=1,
        files={"/tmp/foo.txt": record},
        args_hash="aabbccdd11223344aabbccdd11223344",
        dep_hash="eeff00112233445566778899aabbccdd",
    )
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_round_trip(tmp_path):
    """Writing a state and reading it back yields an equal object."""
    original = _make_state()
    state_file = tmp_path / "state.json"

    write_state(original, state_file)

    assert read_state(state_file) == original
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_version_mismatch(tmp_path):
    """Reading a state file with an unknown version raises an AssertionError."""
    state_file = tmp_path / "state.json"
    payload = {"version": 99, "files": {}, "args_hash": "", "dep_hash": ""}
    state_file.write_text(json.dumps(payload))

    with pytest.raises(AssertionError):
        read_state(state_file)
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pycache_skip
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Skip pipeline steps when inputs are unchanged — content-aware, with module dependency tracking
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
11
|
+
Requires-Python: >=3.12
|
|
12
|
+
Requires-Dist: loguru
|
|
13
|
+
Requires-Dist: xxhash
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# pycache_skip
|
|
17
|
+
|
|
18
|
+
Skip pipeline steps when their inputs have not changed.
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
uv add pycache_skip
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## What it does
|
|
25
|
+
|
|
26
|
+
`cache_skip` wraps a pipeline step function and skips re-execution when all
|
|
27
|
+
inputs are unchanged. It stores a compact state file (`.input_state.json`)
|
|
28
|
+
inside each output directory. On subsequent calls it compares the current
|
|
29
|
+
inputs against the stored state and only reruns the function when something
|
|
30
|
+
actually changed.
|
|
31
|
+
|
|
32
|
+
## Usage
|
|
33
|
+
|
|
34
|
+
### Basic example (single input directory)
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from pathlib import Path
|
|
38
|
+
from cache_skip import cache_skip, Dirmaker
|
|
39
|
+
|
|
40
|
+
dm = Dirmaker(Path("/data/pipeline/run-001"))
|
|
41
|
+
|
|
42
|
+
@cache_skip
|
|
43
|
+
def step_transform(raw: Path, *, _output: Path) -> Path:
|
|
44
|
+
# heavy transformation ...
|
|
45
|
+
return _output
|
|
46
|
+
|
|
47
|
+
# First call — runs the function and records input state.
|
|
48
|
+
step_transform(Path("/data/raw"), _output=dm.path_for("transform"))
|
|
49
|
+
|
|
50
|
+
# Second call — skips the function, returns the output path immediately.
|
|
51
|
+
step_transform(Path("/data/raw"), _output=dm.path_for("transform"))
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Example with non-Path args
|
|
55
|
+
|
|
56
|
+
Non-`Path` arguments (dates, strings, ints, etc.) are also part of the cache
|
|
57
|
+
key. Changing them triggers a rerun.
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
import datetime as dt
|
|
61
|
+
|
|
62
|
+
@cache_skip(track_dependencies=False)
|
|
63
|
+
def step_build_config(
|
|
64
|
+
schedule_date: dt.date,
|
|
65
|
+
template: Path,
|
|
66
|
+
*,
|
|
67
|
+
_output: Path,
|
|
68
|
+
) -> Path:
|
|
69
|
+
...
|
|
70
|
+
|
|
71
|
+
# Changing schedule_date from 2025-01-01 to 2025-01-02 invalidates the cache.
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Dirmaker companion
|
|
75
|
+
|
|
76
|
+
`Dirmaker` allocates named output directories under a staging root. Use
|
|
77
|
+
`path_for(name)` to resolve the path without side effects (for `@cache_skip`),
|
|
78
|
+
or `new_output_dir(name)` to delete and recreate explicitly.
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
dm = Dirmaker(Path("/data/pipeline/run-001"))
|
|
82
|
+
|
|
83
|
+
# Pass path to decorator — decorator manages deletion on rerun.
|
|
84
|
+
step_transform(raw, _output=dm.path_for("transform"))
|
|
85
|
+
|
|
86
|
+
# Or manage the directory yourself:
|
|
87
|
+
out = dm.new_output_dir("transform") # deletes existing, creates fresh
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## How invalidation works
|
|
91
|
+
|
|
92
|
+
Three-tier change detection on every call after the first:
|
|
93
|
+
|
|
94
|
+
1. **Args hash** — all non-`Path`, non-`_output` arguments are hashed via
|
|
95
|
+
`repr()`. A change in any scalar argument (date, string, int, …) triggers
|
|
96
|
+
a rerun immediately.
|
|
97
|
+
|
|
98
|
+
2. **Dependency hash** — the source files of the decorated function and all
|
|
99
|
+
modules it imports (static AST analysis) are hashed. Editing the function's
|
|
100
|
+
source code triggers a rerun. Disable with `track_dependencies=False`.
|
|
101
|
+
|
|
102
|
+
3. **File content hash** — every file under each input `Path` is compared.
|
|
103
|
+
Metadata (mtime, inode, size) is checked first as a fast path. If metadata
|
|
104
|
+
is identical the stored hash is trusted. If metadata drifted but content
|
|
105
|
+
hash matches, the state file is updated silently without a rerun (handles
|
|
106
|
+
`rsync` / `cp -p` copies, where mtime or inode changes but content does not).
|
|
107
|
+
|
|
108
|
+
## track_dependencies
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
@cache_skip(track_dependencies=False)
|
|
112
|
+
def step(...):
|
|
113
|
+
...
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Set `track_dependencies=False` to skip module source hashing. Useful when the
|
|
117
|
+
function imports large, rarely-changing libraries and startup cost matters, or
|
|
118
|
+
in tests.
|
|
119
|
+
|
|
120
|
+
## Comparison with auto_skip
|
|
121
|
+
|
|
122
|
+
`cache_skip` is a simpler, self-contained alternative to `auto_skip`:
|
|
123
|
+
|
|
124
|
+
| Feature | `cache_skip` | `auto_skip` |
|
|
125
|
+
| ------------------- | ---------------------------- | -------------------- |
|
|
126
|
+
| Input detection | explicit `Path` args | strace / audit hooks |
|
|
127
|
+
| Non-Path args | hashed | ignored |
|
|
128
|
+
| Module dep tracking | static AST | runtime import list |
|
|
129
|
+
| External deps | `xxhash`, `loguru` | heavier stack |
|
|
130
|
+
| Output format | dir with `.input_state.json` | opaque cache store |
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
cache_skip/__init__.py,sha256=ys0DCGkYl4ePzT-uM9RPM_iStHKuUj8npAHRNjKf2jQ,123
|
|
2
|
+
cache_skip/decorator.py,sha256=F6O2RfpRtngQCjB9hsM9ZXB59dlTYl--HMz3VVYtSpo,6207
|
|
3
|
+
cache_skip/deps.py,sha256=jAQpBKPMfjbLHPuWbWYaOd2TRBvsG03PhdJD-NYGXEw,4710
|
|
4
|
+
cache_skip/dirmaker.py,sha256=QnHhOqPs4yOkN3LL-IdpsKrJBAWzQg0gXGBTzrA5fpI,1043
|
|
5
|
+
cache_skip/scanner.py,sha256=9_8y6C5Cmme5_xLJEWS6cKE8NZM8bkZlnI6euPocEoA,1090
|
|
6
|
+
cache_skip/state.py,sha256=pM6T1kVDCrz9Jcojlq1QnFb5bRPyt8LhWA19qOa3uKI,800
|
|
7
|
+
cache_skip/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
cache_skip/tests/test_decorator.py,sha256=uNXBpGtapKMsP2Od2Y837svxPP8sdnbB_XG6t7yMEBQ,5674
|
|
9
|
+
cache_skip/tests/test_deps.py,sha256=M5Vf360reYb_bUGQ3NhSRWhXpHtdpMzPnIdJ58svbno,1983
|
|
10
|
+
cache_skip/tests/test_dirmaker.py,sha256=RXB2YXvWnGG54vhGK-P-bT3OJytEswbVe7CiqRtkQd4,918
|
|
11
|
+
cache_skip/tests/test_scanner.py,sha256=VZR8diOCKjJob2jvx-he_7qXM-XMoWaS9IXLqFURO78,1500
|
|
12
|
+
cache_skip/tests/test_state.py,sha256=8Q7lBorHWbxrHW6av1wls9updfqwrhHv9geWcEBih4w,1019
|
|
13
|
+
pycache_skip-0.1.0.dist-info/METADATA,sha256=gvIHtDPlUE2YKncTV6TAFflgdQHsnhMK2tzIMW6L2rc,4280
|
|
14
|
+
pycache_skip-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
15
|
+
pycache_skip-0.1.0.dist-info/licenses/LICENSE,sha256=yiKnYC3HrKFysIItc-FrkG5MPsK8qOP1tc2283dEG1k,1080
|
|
16
|
+
pycache_skip-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 cache_skip contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|