pycache-skip 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cache_skip/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from cache_skip.decorator import cache_skip
2
+ from cache_skip.dirmaker import Dirmaker
3
+
4
+ __all__ = ["cache_skip", "Dirmaker"]
@@ -0,0 +1,186 @@
1
+ import inspect
2
+ import shutil
3
+ from collections.abc import Callable
4
+ from functools import wraps
5
+ from pathlib import Path
6
+
7
+ import xxhash
8
+ from loguru import logger
9
+
10
+ from . import deps as _deps
11
+ from .scanner import scan_inputs
12
+ from .state import FileRecord, InputState, read_state, write_state
13
+
14
+
15
def _collect_input_paths(bound: inspect.BoundArguments) -> list[Path]:
    """Return the ``Path``-valued arguments of a bound call, minus ``_output``.

    Values are returned in the order they appear in ``bound.arguments``.
    Anything that is not a ``Path`` instance (``None`` included) is ignored.
    """
    return [
        candidate
        for arg_name, candidate in bound.arguments.items()
        if arg_name != "_output" and isinstance(candidate, Path)
    ]
23
+
24
+
25
def _compute_args_hash(bound: inspect.BoundArguments) -> str:
    """Return an xxh128 hex digest over the non-``Path`` arguments.

    ``_output`` and every ``Path`` value are left out. The remaining
    arguments are rendered as ``name=repr(value)``, ordered by name and
    joined with ``|`` before hashing.
    """
    rendered = sorted(
        (arg_name, repr(arg_value))
        for arg_name, arg_value in bound.arguments.items()
        if arg_name != "_output" and not isinstance(arg_value, Path)
    )
    payload = "|".join(f"{arg_name}={text}" for arg_name, text in rendered)
    return xxhash.xxh128(payload.encode()).hexdigest()
35
+
36
+
37
def _compute_file_hash(path: Path) -> str:
    """Return the xxh128 hex digest of the bytes stored at ``path``.

    The file is fed to the hasher in fixed-size chunks rather than loaded in
    one piece, so hashing a large pipeline input does not need memory
    proportional to its size. The digest is the same as hashing the whole
    content at once.
    """
    hasher = xxhash.xxh128()
    with path.open("rb") as stream:
        while chunk := stream.read(1 << 20):
            hasher.update(chunk)
    return hasher.hexdigest()
39
+
40
+
41
def _do_rerun(
    fn: Callable,
    args: tuple,
    kwargs: dict,
    input_paths: list[Path],
    args_hash: str,
    dep_hash: str,
    output: Path,
) -> object:
    """Reset ``output``, execute ``fn`` and persist a fresh input state.

    The output directory is removed (errors ignored) and recreated before
    ``fn(*args, **kwargs)`` is called. Afterwards the inputs are scanned,
    stamped with ``args_hash`` and ``dep_hash``, and written to
    ``output / ".input_state.json"``.

    Returns:
        Whatever ``fn`` returned.
    """
    state_file = output / ".input_state.json"

    shutil.rmtree(output, ignore_errors=True)
    output.mkdir(parents=True, exist_ok=True)

    outcome = fn(*args, **kwargs)

    snapshot = scan_inputs(input_paths)
    snapshot.args_hash, snapshot.dep_hash = args_hash, dep_hash
    write_state(snapshot, state_file)
    return outcome
58
+
59
+
60
+ def cache_skip(
61
+ _fn: Callable | None = None,
62
+ *,
63
+ track_dependencies: bool = True,
64
+ ) -> Callable:
65
+ def decorator(fn: Callable) -> Callable:
66
+ @wraps(fn)
67
+ def wrapper(*args, **kwargs):
68
+ # Step 0 — extract _output
69
+ output = kwargs.get("_output")
70
+ assert (
71
+ output is not None
72
+ ), f"{fn.__qualname__}() missing required keyword argument: '_output'"
73
+ output = Path(output)
74
+
75
+ # Step 1 — collect input Paths
76
+ sig = inspect.signature(fn)
77
+ bound = sig.bind(*args, **kwargs)
78
+ bound.apply_defaults()
79
+ input_paths = _collect_input_paths(bound)
80
+
81
+ # Step 2 — compute args_hash
82
+ args_hash = _compute_args_hash(bound)
83
+
84
+ # Step 3 — compute dep_hash
85
+ if track_dependencies:
86
+ dep_hash = _deps.compute_dep_hash(fn)
87
+ else:
88
+ dep_hash = ""
89
+
90
+ # Step 4 — output absent → fresh run
91
+ if not output.exists():
92
+ output.mkdir(parents=True, exist_ok=True)
93
+ result = fn(*args, **kwargs)
94
+ state = scan_inputs(input_paths)
95
+ state.args_hash = args_hash
96
+ state.dep_hash = dep_hash
97
+ write_state(state, output / ".input_state.json")
98
+ logger.info(
99
+ "cache_skip: ran {} — output did not exist", fn.__qualname__
100
+ )
101
+ return result
102
+
103
+ # Step 5 — output present → compare state
104
+ state_path = output / ".input_state.json"
105
+ if not state_path.exists():
106
+ logger.info("cache_skip: rerunning {} — no state file", fn.__qualname__)
107
+ return _do_rerun(
108
+ fn, args, kwargs, input_paths, args_hash, dep_hash, output
109
+ )
110
+
111
+ stored = read_state(state_path)
112
+
113
+ if stored.args_hash != args_hash:
114
+ logger.info("cache_skip: rerunning {} — args changed", fn.__qualname__)
115
+ return _do_rerun(
116
+ fn, args, kwargs, input_paths, args_hash, dep_hash, output
117
+ )
118
+
119
+ if stored.dep_hash != dep_hash:
120
+ logger.info(
121
+ "cache_skip: rerunning {} — dep_hash changed", fn.__qualname__
122
+ )
123
+ return _do_rerun(
124
+ fn, args, kwargs, input_paths, args_hash, dep_hash, output
125
+ )
126
+
127
+ current = scan_inputs(input_paths)
128
+
129
+ if set(current.files) != set(stored.files):
130
+ logger.info(
131
+ "cache_skip: rerunning {} — file set changed", fn.__qualname__
132
+ )
133
+ return _do_rerun(
134
+ fn, args, kwargs, input_paths, args_hash, dep_hash, output
135
+ )
136
+
137
+ changed = False
138
+ updated_files: dict[str, FileRecord] = {}
139
+
140
+ for path_str, current_rec in current.files.items():
141
+ stored_rec = stored.files.get(path_str)
142
+ if stored_rec is None:
143
+ changed = True
144
+ break
145
+
146
+ if (
147
+ current_rec.mtime == stored_rec.mtime
148
+ and current_rec.size == stored_rec.size
149
+ and current_rec.inode == stored_rec.inode
150
+ ):
151
+ updated_files[path_str] = stored_rec
152
+ continue
153
+
154
+ actual_hash = _compute_file_hash(Path(path_str))
155
+ if actual_hash == stored_rec.hash:
156
+ updated_files[path_str] = FileRecord(
157
+ path=path_str,
158
+ mtime=current_rec.mtime,
159
+ inode=current_rec.inode,
160
+ size=current_rec.size,
161
+ hash=actual_hash,
162
+ )
163
+ else:
164
+ changed = True
165
+ break
166
+
167
+ if changed:
168
+ logger.info(
169
+ "cache_skip: rerunning {} — file content changed", fn.__qualname__
170
+ )
171
+ return _do_rerun(
172
+ fn, args, kwargs, input_paths, args_hash, dep_hash, output
173
+ )
174
+
175
+ if updated_files != stored.files:
176
+ stored.files = updated_files
177
+ write_state(stored, state_path)
178
+
179
+ logger.info("cache_skip: skipping {} — inputs unchanged", fn.__qualname__)
180
+ return output
181
+
182
+ return wrapper
183
+
184
+ if _fn is not None:
185
+ return decorator(_fn)
186
+ return decorator
cache_skip/deps.py ADDED
@@ -0,0 +1,156 @@
1
+ import ast
2
+ import inspect
3
+ from collections.abc import Callable
4
+ from pathlib import Path
5
+
6
+ import xxhash
7
+
8
+
9
def compute_dep_hash(
    func: Callable,
    dep_roots: list[Path] | None = None,
    dep_files: list[Path] | None = None,
) -> str:
    """Return a combined hash of the source files ``func`` depends on.

    Args:
        func: Function whose defining module is the starting point.
        dep_roots: Directories within which imports are followed. When
            omitted they are derived from the module of ``func``.
        dep_files: Explicit file list. When given, exactly these files are
            hashed and no import analysis takes place.

    Returns:
        The hex digest produced by ``_hash_files``.
    """
    if dep_files is not None:
        return _hash_files(dep_files)

    module = inspect.getmodule(func)
    assert module is not None, f"Cannot determine module for {func}"
    assert (
        getattr(module, "__file__", None) is not None
    ), f"Module {module.__name__} has no __file__"

    roots = _auto_detect_roots(module) if dep_roots is None else dep_roots
    resolved_roots = [Path(root).resolve() for root in roots]
    entry_file = Path(module.__file__).resolve()

    seen: set[Path] = set()
    _walk_imports(entry_file, resolved_roots, seen, is_entry=True)
    return _hash_files(list(seen))
33
+
34
+
35
def _auto_detect_roots(mod) -> list[Path]:
    """Return the directory that contains the top-level package of ``mod``.

    The root is found by walking up from the module's file, one directory per
    dot in ``mod.__name__``. A package's ``__init__.py`` sits one directory
    deeper than a plain module with the same dotted name, so it needs one
    extra step; without it the detected root would be the top-level package
    itself and absolute imports could not be resolved against it.

    Returns:
        A single-element list holding the detected root.
    """
    mod_file = Path(mod.__file__).resolve()
    levels_up = mod.__name__.count(".")
    if mod_file.name == "__init__.py":
        levels_up += 1

    root = mod_file.parent
    for _ in range(levels_up):
        root = root.parent
    return [root]
42
+
43
+
44
def _hash_files(paths: list[Path]) -> str:
    """Return one xxh128 hex digest covering the content of all ``paths``.

    Each path is resolved, the resolved paths are ordered by their string
    form, and a ``<path>:<content digest>`` line is produced per file. The
    newline-joined lines are then hashed.
    """
    ordered = sorted((candidate.resolve() for candidate in paths), key=str)
    lines = [
        f"{resolved}:{xxhash.xxh128(resolved.read_bytes()).hexdigest()}"
        for resolved in ordered
    ]
    return xxhash.xxh128("\n".join(lines).encode()).hexdigest()
52
+
53
+
54
def _walk_imports(
    source_file: Path,
    dep_roots: list[Path],
    visited: set[Path],
    *,
    is_entry: bool = False,
) -> None:
    """Recursively collect the ``.py`` files reachable through imports.

    Args:
        source_file: File whose import statements are analysed.
        dep_roots: Only files below one of these directories are followed.
        visited: Set of resolved files collected so far; updated in place.
        is_entry: When true, ``source_file`` is recorded even if it lies
            outside every root.

    Files that fail to parse are recorded but their imports are not
    followed. For ``from X import a, b`` both ``X`` and the candidate
    submodules ``X.a`` / ``X.b`` are looked up, so that
    ``from . import sibling`` and ``from pkg import submodule`` are tracked.
    """
    source_file = source_file.resolve()
    if source_file in visited:
        return
    if not source_file.is_file() or source_file.suffix != ".py":
        return
    if not is_entry and not any(_is_under(source_file, root) for root in dep_roots):
        return

    visited.add(source_file)

    try:
        tree = ast.parse(source_file.read_bytes(), filename=str(source_file))
    except SyntaxError:
        return

    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            candidates = [alias.name for alias in node.names]
        elif isinstance(node, ast.ImportFrom):
            # ``node.module`` is None for ``from . import x``.
            base = node.module or ""
            if node.level > 0:
                base = _resolve_relative(base, node.level, source_file, dep_roots)
                if base is None:
                    continue
            candidates = [base] if base else []
            # An imported name may itself be a submodule. When it is a plain
            # attribute, the lookup falls back to the containing module,
            # which has already been visited.
            candidates.extend(
                f"{base}.{alias.name}" if base else alias.name
                for alias in node.names
                if alias.name != "*"
            )
        else:
            continue

        for module_name in candidates:
            resolved = _resolve_module(module_name, source_file, dep_roots)
            if resolved:
                _walk_imports(resolved, dep_roots, visited)
95
+
96
+
97
+ def _resolve_module(
98
+ module_name: str,
99
+ from_file: Path,
100
+ dep_roots: list[Path],
101
+ ) -> Path | None:
102
+ parts = module_name.split(".")
103
+ for root in dep_roots:
104
+ pkg_path = root / "/".join(parts) / "__init__.py"
105
+ if pkg_path.is_file():
106
+ return pkg_path.resolve()
107
+ mod_path = (
108
+ root / "/".join(parts[:-1]) / (parts[-1] + ".py")
109
+ if len(parts) > 1
110
+ else root / (parts[0] + ".py")
111
+ )
112
+ if mod_path.is_file():
113
+ return mod_path.resolve()
114
+ for i in range(len(parts), 0, -1):
115
+ sub = parts[:i]
116
+ pkg_init = root / "/".join(sub) / "__init__.py"
117
+ if pkg_init.is_file():
118
+ return pkg_init.resolve()
119
+ mod_file = (
120
+ root / "/".join(sub[:-1]) / (sub[-1] + ".py")
121
+ if len(sub) > 1
122
+ else root / (sub[0] + ".py")
123
+ )
124
+ if mod_file.is_file():
125
+ return mod_file.resolve()
126
+ return None
127
+
128
+
129
+ def _resolve_relative(
130
+ module_name: str,
131
+ level: int,
132
+ from_file: Path,
133
+ dep_roots: list[Path],
134
+ ) -> str | None:
135
+ pkg_dir = from_file.parent
136
+ for _ in range(level - 1):
137
+ pkg_dir = pkg_dir.parent
138
+ for root in dep_roots:
139
+ if _is_under(pkg_dir, root):
140
+ try:
141
+ rel = pkg_dir.relative_to(root)
142
+ prefix = ".".join(rel.parts)
143
+ if prefix and module_name:
144
+ return f"{prefix}.{module_name}"
145
+ return prefix or module_name
146
+ except ValueError:
147
+ continue
148
+ return None
149
+
150
+
151
def _is_under(path: Path, root: Path) -> bool:
    """Report whether ``path`` equals ``root`` or lies somewhere below it."""
    return path.is_relative_to(root)
cache_skip/dirmaker.py ADDED
@@ -0,0 +1,32 @@
1
+ """Dirmaker — manages output paths under a pipeline staging root."""
2
+
3
+ import shutil
4
+ from pathlib import Path
5
+
6
+ from loguru import logger
7
+
8
+
9
class Dirmaker:
    """Hand out named output directories beneath one staging root.

    A pipeline step asks for its directory by name: ``new_output_dir()``
    returns it freshly created, ``path_for()`` only computes the location.
    Steps are expected to write their results there and never into their
    input directories.
    """

    def __init__(self, base: Path):
        """Use ``base`` as the staging root, creating it if necessary."""
        staging_root = Path(base)
        staging_root.mkdir(parents=True, exist_ok=True)
        self.root = staging_root

    def new_output_dir(self, name: str) -> Path:
        """Return an empty directory for ``name``, replacing an existing one."""
        target = self.root.joinpath(name)
        if target.exists():
            logger.info(f"Removing existing output dir: {target}")
            shutil.rmtree(target)
        target.mkdir(parents=True, exist_ok=True)
        return target

    def path_for(self, name: str) -> Path:
        """Return where the output of ``name`` lives, without touching disk."""
        return self.root.joinpath(name)
cache_skip/scanner.py ADDED
@@ -0,0 +1,38 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ import xxhash
5
+
6
+ from .state import FileRecord, InputState
7
+
8
+
9
def _hash_file(path: Path) -> str:
    """Return the xxh128 hex digest of the bytes stored at ``path``.

    The file is read in fixed-size chunks so that scanning a large input
    does not need memory proportional to its size. The digest is the same
    as hashing the whole content at once.
    """
    hasher = xxhash.xxh128()
    with path.open("rb") as stream:
        while chunk := stream.read(1 << 20):
            hasher.update(chunk)
    return hasher.hexdigest()
11
+
12
+
13
def _make_record(p: Path) -> FileRecord:
    """Build the ``FileRecord`` for one file from ``os.stat`` and its hash.

    The modification time is stored in seconds, derived from the
    nanosecond timestamp.
    """
    info = os.stat(p)
    mtime_seconds = info.st_mtime_ns / 1e9
    return FileRecord(
        path=str(p),
        mtime=mtime_seconds,
        inode=info.st_ino,
        size=info.st_size,
        hash=_hash_file(p),
    )
22
+
23
+
24
def scan_inputs(input_paths: list[Path]) -> "InputState":
    """Record every file reachable from ``input_paths``.

    A file path contributes itself; a directory contributes all files found
    by walking it, except files named ``.input_state.json``. Records are
    keyed by the resolved path string.

    Returns:
        An ``InputState`` with ``version=1`` and empty ``args_hash`` /
        ``dep_hash``; the caller fills those in.

    Raises:
        AssertionError: If one of ``input_paths`` does not exist.
    """
    files = {}
    for path in input_paths:
        if not path.exists():
            # Raised explicitly: a plain ``assert`` disappears under
            # ``python -O`` and the missing input would be silently ignored.
            raise AssertionError(f"Input path does not exist: {path}")
        if path.is_file():
            resolved = path.resolve()
            files[str(resolved)] = _make_record(resolved)
        elif path.is_dir():
            for root, _, filenames in os.walk(path):
                for filename in filenames:
                    if filename == ".input_state.json":
                        continue
                    resolved = Path(root, filename).resolve()
                    files[str(resolved)] = _make_record(resolved)
    return InputState(version=1, files=files, args_hash="", dep_hash="")
cache_skip/state.py ADDED
@@ -0,0 +1,36 @@
1
+ import json
2
+ from dataclasses import asdict, dataclass
3
+ from pathlib import Path
4
+
5
+
6
@dataclass
class FileRecord:
    """Metadata and content hash captured for a single input file."""

    path: str  # path string the record was created for
    mtime: float  # modification time in seconds
    inode: int
    size: int  # size in bytes
    hash: str  # hex digest of the file content
13
+
14
+
15
@dataclass
class InputState:
    """Everything stored in a step's state file."""

    version: int  # schema version of the state file
    files: dict[str, FileRecord]  # one record per input file, keyed by path string
    args_hash: str  # hash of the non-Path call arguments
    dep_hash: str  # hash of the tracked source files; may be empty
21
+
22
+
23
def write_state(state: "InputState", path: Path) -> None:
    """Serialise ``state`` as indented JSON and install it at ``path``.

    The JSON is first written to a sibling ``<name>.tmp`` file, which is then
    moved over ``path``. An interrupted write therefore cannot leave a
    truncated state file at ``path``.
    """
    payload = json.dumps(asdict(state), indent=2)
    scratch = path.with_name(path.name + ".tmp")
    scratch.write_text(payload)
    scratch.replace(path)
25
+
26
+
27
def read_state(path: Path) -> "InputState":
    """Load an ``InputState`` from the JSON file at ``path``.

    Raises:
        AssertionError: If the stored ``version`` is not ``1``. Raised
            explicitly so the check is not stripped under ``python -O``.
    """
    data = json.loads(path.read_text())
    if data["version"] != 1:
        raise AssertionError(f"Unknown state version: {data['version']}")

    records = {key: FileRecord(**fields) for key, fields in data["files"].items()}
    return InputState(
        version=data["version"],
        files=records,
        args_hash=data["args_hash"],
        dep_hash=data["dep_hash"],
    )
File without changes
@@ -0,0 +1,228 @@
1
+ import os
2
+ import shutil
3
+ from datetime import date
4
+ from pathlib import Path
5
+ from unittest.mock import patch
6
+
7
+ import pytest
8
+
9
+ from cache_skip.decorator import cache_skip
10
+ from cache_skip.state import read_state
11
+
12
+
13
@pytest.fixture
def inp(tmp_path):
    """Input directory holding a single ``data.txt`` with the bytes ``hello``."""
    input_dir = tmp_path / "input"
    input_dir.mkdir()
    input_dir.joinpath("data.txt").write_bytes(b"hello")
    return input_dir
19
+
20
+
21
def test_first_call_creates_output_and_state(tmp_path, inp):
    """The first call runs the step and leaves a state file in the output."""
    out_dir = tmp_path / "output"
    invocations = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        invocations.append(1)

    step(inp, _output=out_dir)

    assert len(invocations) == 1
    assert out_dir.exists()
    assert out_dir.joinpath(".input_state.json").exists()
33
+
34
+
35
def test_second_call_no_changes_skips(tmp_path, inp):
    """A repeated call with unchanged inputs is skipped and yields the output path."""
    out_dir = tmp_path / "output"
    invocations = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        invocations.append(1)

    step(inp, _output=out_dir)
    second_result = step(inp, _output=out_dir)

    assert len(invocations) == 1
    assert second_result == out_dir
47
+
48
+
49
def test_second_call_input_changed_reruns(tmp_path, inp):
    """Changing an input file's content forces the step to run again."""
    out_dir = tmp_path / "output"
    invocations = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        invocations.append(1)

    step(inp, _output=out_dir)
    inp.joinpath("data.txt").write_bytes(b"changed content")
    step(inp, _output=out_dir)

    assert len(invocations) == 2
61
+
62
+
63
def test_metadata_drift_same_content_no_rerun(tmp_path, inp):
    """A shifted mtime with identical content must not trigger a rerun."""
    out_dir = tmp_path / "output"
    invocations = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        invocations.append(1)

    step(inp, _output=out_dir)

    # Push both timestamps two seconds into the future; content is untouched.
    target = inp / "data.txt"
    before = os.stat(target)
    os.utime(target, (before.st_atime + 2.0, before.st_mtime + 2.0))

    second_result = step(inp, _output=out_dir)

    assert len(invocations) == 1
    assert second_result == out_dir
80
+
81
+
82
def test_args_changed_reruns(tmp_path, inp):
    """A different scalar argument invalidates the cached run."""
    out_dir = tmp_path / "output"
    invocations = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, scale: int, _output: Path):
        invocations.append(1)

    for scale in (1, 2):
        step(inp, scale, _output=out_dir)

    assert len(invocations) == 2
93
+
94
+
95
def test_dep_hash_changed_reruns(tmp_path, inp):
    """A different dependency hash between calls forces a rerun."""
    out_dir = tmp_path / "output"
    invocations = []

    @cache_skip
    def step(i: Path, _output: Path):
        invocations.append(1)

    first_hash = "hash_a" + "0" * 26
    second_hash = "hash_b" + "0" * 26

    with patch("cache_skip.deps.compute_dep_hash", return_value=first_hash):
        step(inp, _output=out_dir)
    with patch("cache_skip.deps.compute_dep_hash", return_value=second_hash):
        step(inp, _output=out_dir)

    assert len(invocations) == 2
108
+
109
+
110
def test_output_exists_no_state_file_reruns(tmp_path, inp):
    """An existing output directory without a state file still runs the step."""
    out_dir = tmp_path / "output"
    out_dir.mkdir()
    invocations = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        invocations.append(1)

    step(inp, _output=out_dir)

    assert len(invocations) == 1
121
+
122
+
123
def test_output_deleted_externally_runs_fresh(tmp_path, inp):
    """Deleting the output directory between calls causes a fresh run."""
    out_dir = tmp_path / "output"
    invocations = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        invocations.append(1)

    step(inp, _output=out_dir)
    shutil.rmtree(out_dir)
    step(inp, _output=out_dir)

    assert len(invocations) == 2
135
+
136
+
137
def test_input_file_deleted_reruns(tmp_path, inp):
    """Removing a file from the input directory forces a rerun."""
    out_dir = tmp_path / "output"
    invocations = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        invocations.append(1)

    step(inp, _output=out_dir)
    inp.joinpath("data.txt").unlink()
    step(inp, _output=out_dir)

    assert len(invocations) == 2
149
+
150
+
151
def test_missing_output_kwarg_raises(tmp_path, inp):
    """Calling a decorated step without ``_output`` raises ``AssertionError``."""

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        return None

    with pytest.raises(AssertionError, match="_output"):
        step(inp)
158
+
159
+
160
def test_track_dependencies_false_does_not_call_compute_dep_hash(tmp_path, inp):
    """With dependency tracking off, ``compute_dep_hash`` is never invoked."""
    out_dir = tmp_path / "output"

    with patch("cache_skip.deps.compute_dep_hash") as dep_hash_mock:

        @cache_skip(track_dependencies=False)
        def step(i: Path, _output: Path):
            return None

        step(inp, _output=out_dir)
        dep_hash_mock.assert_not_called()
171
+
172
+
173
def test_both_decorator_forms_work(tmp_path, inp):
    """Both ``@cache_skip`` and ``@cache_skip(...)`` produce working steps."""
    bare_out = tmp_path / "output1"
    called_out = tmp_path / "output2"

    @cache_skip
    def step_a(i: Path, _output: Path):
        return None

    @cache_skip(track_dependencies=False)
    def step_b(i: Path, _output: Path):
        return None

    with patch("cache_skip.deps.compute_dep_hash", return_value="x" * 32):
        step_a(inp, _output=bare_out)
    step_b(inp, _output=called_out)

    assert bare_out.exists()
    assert called_out.exists()
190
+
191
+
192
def test_non_path_args_change_triggers_rerun(tmp_path, inp):
    """A changed ``date`` argument is part of the cache key and forces a rerun."""
    out_dir = tmp_path / "output"
    invocations = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, schedule_date: date, _output: Path):
        invocations.append(1)

    for day in (date(2025, 1, 1), date(2025, 1, 2)):
        step(inp, day, _output=out_dir)

    assert len(invocations) == 2
203
+
204
+
205
def test_optional_path_none_not_in_input_paths(tmp_path, inp):
    """An optional ``Path`` argument passed as ``None`` does not break skipping."""
    out_dir = tmp_path / "output"
    invocations = []

    @cache_skip(track_dependencies=False)
    def step(i: Path, extra: Path | None, _output: Path):
        invocations.append(1)

    step(inp, None, _output=out_dir)
    second_result = step(inp, None, _output=out_dir)

    assert len(invocations) == 1
    assert second_result == out_dir
217
+
218
+
219
def test_state_dep_hash_empty_when_disabled(tmp_path, inp):
    """With dependency tracking off, the stored ``dep_hash`` is the empty string."""
    out_dir = tmp_path / "output"

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        return None

    step(inp, _output=out_dir)

    persisted = read_state(out_dir / ".input_state.json")
    assert persisted.dep_hash == ""
@@ -0,0 +1,75 @@
1
+ from pathlib import Path
2
+
3
+ import pytest
4
+
5
+ from cache_skip import deps
6
+
7
+
8
def test_hash_files_returns_hex_string(tmp_path):
    """``_hash_files`` yields a 32-character string."""
    source = tmp_path / "sample.py"
    source.write_text("x = 1\n")

    digest = deps._hash_files([source])

    assert isinstance(digest, str)
    assert len(digest) == 32
14
+
15
+
16
def test_same_files_same_hash(tmp_path):
    """Hashing the same unchanged file twice gives the same digest."""
    source = tmp_path / "sample.py"
    source.write_text("x = 1\n")

    first = deps._hash_files([source])
    second = deps._hash_files([source])

    assert first == second
22
+
23
+
24
def test_changed_file_different_hash(tmp_path):
    """Editing the file changes the digest returned by ``_hash_files``."""
    source = tmp_path / "sample.py"

    source.write_text("x = 1\n")
    before = deps._hash_files([source])

    source.write_text("x = 99\n")
    after = deps._hash_files([source])

    assert before != after
31
+
32
+
33
def test_compute_dep_hash_returns_hex_string(tmp_path):
    """``compute_dep_hash`` with explicit ``dep_files`` yields a 32-character string."""
    source = tmp_path / "sample.py"
    source.write_text("x = 42\n")

    digest = deps.compute_dep_hash(lambda: None, dep_files=[source])

    assert isinstance(digest, str)
    assert len(digest) == 32
39
+
40
+
41
def test_compute_dep_hash_deterministic(tmp_path):
    """Two calls over the same unchanged ``dep_files`` agree."""
    source = tmp_path / "sample.py"
    source.write_text("x = 42\n")

    first = deps.compute_dep_hash(lambda: None, dep_files=[source])
    second = deps.compute_dep_hash(lambda: None, dep_files=[source])

    assert first == second
47
+
48
+
49
def test_compute_dep_hash_changes_with_file(tmp_path):
    """Editing a listed dependency file changes the dependency hash."""
    source = tmp_path / "sample.py"

    source.write_text("x = 42\n")
    before = deps.compute_dep_hash(lambda: None, dep_files=[source])

    source.write_text("x = 99\n")
    after = deps.compute_dep_hash(lambda: None, dep_files=[source])

    assert before != after
56
+
57
+
58
def test_track_dependencies_false_dep_hash_is_empty(tmp_path):
    """A step decorated with tracking disabled stores an empty ``dep_hash``."""
    from pathlib import Path

    from cache_skip.decorator import cache_skip
    from cache_skip.state import read_state

    input_dir = tmp_path / "inp"
    input_dir.mkdir()
    input_dir.joinpath("data.txt").write_bytes(b"x")
    out_dir = tmp_path / "output"

    @cache_skip(track_dependencies=False)
    def step(i: Path, _output: Path):
        return None

    step(input_dir, _output=out_dir)

    persisted = read_state(out_dir / ".input_state.json")
    assert persisted.dep_hash == ""
@@ -0,0 +1,34 @@
1
+ from pathlib import Path
2
+
3
+ from cache_skip.dirmaker import Dirmaker
4
+
5
+
6
def test_new_output_dir_creates_directory(tmp_path):
    """``new_output_dir`` creates ``<root>/<name>`` and returns it."""
    staging = tmp_path / "root"
    maker = Dirmaker(staging)

    created = maker.new_output_dir("step1")

    assert created.exists()
    assert created.is_dir()
    assert created == staging / "step1"
12
+
13
+
14
def test_new_output_dir_deletes_and_recreates(tmp_path):
    """Requesting the same name again wipes the previous contents."""
    maker = Dirmaker(tmp_path / "root")

    first = maker.new_output_dir("step1")
    leftover = first / "result.txt"
    leftover.write_text("old")

    second = maker.new_output_dir("step1")

    assert second == first
    assert not leftover.exists()
    assert first.exists()
22
+
23
+
24
def test_path_for_no_side_effects(tmp_path):
    """``path_for`` computes the location without creating it."""
    staging = tmp_path / "root"
    location = Dirmaker(staging).path_for("step1")

    assert location == staging / "step1"
    assert not location.exists()
29
+
30
+
31
def test_dirmaker_creates_root_on_init(tmp_path):
    """Constructing a ``Dirmaker`` creates a nested staging root."""
    staging = tmp_path / "deep" / "root"

    Dirmaker(staging)

    assert staging.exists()
@@ -0,0 +1,57 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ import pytest
5
+ import xxhash
6
+
7
+ from cache_skip.scanner import scan_inputs
8
+
9
+
10
def test_single_file(tmp_path):
    """Scanning one file records its size, inode, mtime and content hash."""
    content = b"hello world"
    target = tmp_path / "data.txt"
    target.write_bytes(content)

    state = scan_inputs([target])

    assert len(state.files) == 1
    record = state.files[str(target.resolve())]
    info = os.stat(target)
    assert record.size == 11
    assert record.inode == info.st_ino
    assert record.mtime == pytest.approx(info.st_mtime_ns / 1e9)
    assert record.hash == xxhash.xxh128(content).hexdigest()
    assert state.args_hash == ""
    assert state.dep_hash == ""
24
+
25
+
26
def test_directory(tmp_path):
    """Scanning a directory records each file under an absolute path key."""
    folder = tmp_path / "sub"
    folder.mkdir()
    folder.joinpath("a.txt").write_bytes(b"a")
    folder.joinpath("b.txt").write_bytes(b"b")

    state = scan_inputs([folder])

    assert len(state.files) == 2
    for recorded_path in state.files:
        assert Path(recorded_path).is_absolute()
35
+
36
+
37
def test_nonexistent(tmp_path):
    """A missing input path raises ``AssertionError``."""
    absent = tmp_path / "does_not_exist"

    with pytest.raises(AssertionError, match="does not exist"):
        scan_inputs([absent])
41
+
42
+
43
def test_empty_directory(tmp_path):
    """An empty directory yields a state without file records."""
    folder = tmp_path / "empty"
    folder.mkdir()

    assert scan_inputs([folder]).files == {}
48
+
49
+
50
def test_skips_input_state_json(tmp_path):
    """``.input_state.json`` inside an input directory is not recorded."""
    folder = tmp_path / "dir"
    folder.mkdir()
    folder.joinpath("data.txt").write_bytes(b"x")
    folder.joinpath(".input_state.json").write_bytes(b"{}")

    state = scan_inputs([folder])

    assert len(state.files) == 1
    assert all(".input_state.json" not in key for key in state.files)
@@ -0,0 +1,39 @@
1
+ import json
2
+ from pathlib import Path
3
+
4
+ import pytest
5
+
6
+ from cache_skip.state import FileRecord, InputState, read_state, write_state
7
+
8
+
9
def _make_state() -> InputState:
    """Build a fixed ``InputState`` with one file record for round-trip tests."""
    sample_path = "/tmp/foo.txt"
    record = FileRecord(
        path=sample_path,
        mtime=1716000000.123,
        inode=98765,
        size=4096,
        hash="abcdef1234567890abcdef1234567890",
    )
    return InputState(
        version=1,
        files={sample_path: record},
        args_hash="aabbccdd11223344aabbccdd11223344",
        dep_hash="eeff00112233445566778899aabbccdd",
    )
24
+
25
+
26
def test_round_trip(tmp_path):
    """A state written to disk reads back equal to the original."""
    original = _make_state()
    state_file = tmp_path / "state.json"

    write_state(original, state_file)

    assert read_state(state_file) == original
32
+
33
+
34
def test_version_mismatch(tmp_path):
    """Reading a state file with an unknown version raises ``AssertionError``."""
    state_file = tmp_path / "state.json"
    payload = {"version": 99, "files": {}, "args_hash": "", "dep_hash": ""}
    state_file.write_text(json.dumps(payload))

    with pytest.raises(AssertionError):
        read_state(state_file)
@@ -0,0 +1,130 @@
1
+ Metadata-Version: 2.4
2
+ Name: pycache_skip
3
+ Version: 0.1.0
4
+ Summary: Skip pipeline steps when inputs are unchanged — content-aware, with module dependency tracking
5
+ License-Expression: MIT
6
+ License-File: LICENSE
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Operating System :: POSIX :: Linux
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
11
+ Requires-Python: >=3.12
12
+ Requires-Dist: loguru
13
+ Requires-Dist: xxhash
14
+ Description-Content-Type: text/markdown
15
+
16
+ # pycache_skip
17
+
18
+ Skip pipeline steps when their inputs have not changed.
19
+
20
+ ```bash
21
+ uv add pycache_skip
22
+ ```
23
+
24
+ ## What it does
25
+
26
+ `cache_skip` wraps a pipeline step function and skips re-execution when all
27
+ inputs are unchanged. It stores a compact state file (`.input_state.json`)
28
+ alongside each output directory. On subsequent calls it compares the current
29
+ inputs against the stored state and only reruns the function when something
30
+ actually changed.
31
+
32
+ ## Usage
33
+
34
+ ### Basic example (single input directory)
35
+
36
+ ```python
37
+ from pathlib import Path
38
+ from cache_skip import cache_skip, Dirmaker
39
+
40
+ dm = Dirmaker(Path("/data/pipeline/run-001"))
41
+
42
+ @cache_skip
43
+ def step_transform(raw: Path, *, _output: Path) -> Path:
44
+ # heavy transformation ...
45
+ return _output
46
+
47
+ # First call — runs the function and records input state.
48
+ step_transform(Path("/data/raw"), _output=dm.path_for("transform"))
49
+
50
+ # Second call — skips the function, returns the output path immediately.
51
+ step_transform(Path("/data/raw"), _output=dm.path_for("transform"))
52
+ ```
53
+
54
+ ### Example with non-Path args
55
+
56
+ Non-`Path` arguments (dates, strings, ints, etc.) are also part of the cache
57
+ key. Changing them triggers a rerun.
58
+
59
+ ```python
60
+ import datetime as dt
61
+
62
+ @cache_skip(track_dependencies=False)
63
+ def step_build_config(
64
+ schedule_date: dt.date,
65
+ template: Path,
66
+ *,
67
+ _output: Path,
68
+ ) -> Path:
69
+ ...
70
+
71
+ # Changing schedule_date from 2025-01-01 to 2025-01-02 invalidates the cache.
72
+ ```
73
+
74
+ ### Dirmaker companion
75
+
76
+ `Dirmaker` allocates named output directories under a staging root. Use
77
+ `path_for(name)` to resolve the path without side effects (for `@cache_skip`),
78
+ or `new_output_dir(name)` to delete and recreate explicitly.
79
+
80
+ ```python
81
+ dm = Dirmaker(Path("/data/pipeline/run-001"))
82
+
83
+ # Pass path to decorator — decorator manages deletion on rerun.
84
+ step_transform(raw, _output=dm.path_for("transform"))
85
+
86
+ # Or manage the directory yourself:
87
+ out = dm.new_output_dir("transform") # deletes existing, creates fresh
88
+ ```
89
+
90
+ ## How invalidation works
91
+
92
+ Three-tier change detection on every call after the first:
93
+
94
+ 1. **Args hash** — all non-`Path`, non-`_output` arguments are hashed via
95
+ `repr()`. A change in any scalar argument (date, string, int, …) triggers
96
+ a rerun immediately.
97
+
98
+ 2. **Dependency hash** — the source files of the decorated function and all
99
+ modules it imports (static AST analysis) are hashed. Editing the function's
100
+ source code triggers a rerun. Disable with `track_dependencies=False`.
101
+
102
+ 3. **File content hash** — every file under each input `Path` is compared.
103
+ Metadata (mtime, inode, size) is checked first as a fast path. If metadata
104
+ is identical the stored hash is trusted. If metadata drifted but content
105
+ hash matches, the state file is updated silently without a rerun (handles
106
+ `rsync` / `cp -p` copies with timestamp noise).
107
+
108
+ ## track_dependencies
109
+
110
+ ```python
111
+ @cache_skip(track_dependencies=False)
112
+ def step(...):
113
+ ...
114
+ ```
115
+
116
+ Set `track_dependencies=False` to skip module source hashing. Useful when the
117
+ function imports large, rarely-changing libraries and startup cost matters, or
118
+ in tests.
119
+
120
+ ## Comparison with auto_skip
121
+
122
+ `cache_skip` is a simpler, self-contained alternative to `auto_skip`:
123
+
124
+ | Feature | `cache_skip` | `auto_skip` |
125
+ | ------------------- | ---------------------------- | -------------------- |
126
+ | Input detection | explicit `Path` args | strace / audit hooks |
127
+ | Non-Path args | hashed | ignored |
128
+ | Module dep tracking | static AST | runtime import list |
129
+ | External deps | `xxhash`, `loguru` | heavier stack |
130
+ | Output format | dir with `.input_state.json` | opaque cache store |
@@ -0,0 +1,16 @@
1
+ cache_skip/__init__.py,sha256=ys0DCGkYl4ePzT-uM9RPM_iStHKuUj8npAHRNjKf2jQ,123
2
+ cache_skip/decorator.py,sha256=F6O2RfpRtngQCjB9hsM9ZXB59dlTYl--HMz3VVYtSpo,6207
3
+ cache_skip/deps.py,sha256=jAQpBKPMfjbLHPuWbWYaOd2TRBvsG03PhdJD-NYGXEw,4710
4
+ cache_skip/dirmaker.py,sha256=QnHhOqPs4yOkN3LL-IdpsKrJBAWzQg0gXGBTzrA5fpI,1043
5
+ cache_skip/scanner.py,sha256=9_8y6C5Cmme5_xLJEWS6cKE8NZM8bkZlnI6euPocEoA,1090
6
+ cache_skip/state.py,sha256=pM6T1kVDCrz9Jcojlq1QnFb5bRPyt8LhWA19qOa3uKI,800
7
+ cache_skip/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ cache_skip/tests/test_decorator.py,sha256=uNXBpGtapKMsP2Od2Y837svxPP8sdnbB_XG6t7yMEBQ,5674
9
+ cache_skip/tests/test_deps.py,sha256=M5Vf360reYb_bUGQ3NhSRWhXpHtdpMzPnIdJ58svbno,1983
10
+ cache_skip/tests/test_dirmaker.py,sha256=RXB2YXvWnGG54vhGK-P-bT3OJytEswbVe7CiqRtkQd4,918
11
+ cache_skip/tests/test_scanner.py,sha256=VZR8diOCKjJob2jvx-he_7qXM-XMoWaS9IXLqFURO78,1500
12
+ cache_skip/tests/test_state.py,sha256=8Q7lBorHWbxrHW6av1wls9updfqwrhHv9geWcEBih4w,1019
13
+ pycache_skip-0.1.0.dist-info/METADATA,sha256=gvIHtDPlUE2YKncTV6TAFflgdQHsnhMK2tzIMW6L2rc,4280
14
+ pycache_skip-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
15
+ pycache_skip-0.1.0.dist-info/licenses/LICENSE,sha256=yiKnYC3HrKFysIItc-FrkG5MPsK8qOP1tc2283dEG1k,1080
16
+ pycache_skip-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 cache_skip contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.