PyPI - pycache-skip - Versions diffs - 0.1.0__tar.gz - Mend

pycache-skip 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

pycache_skip-0.1.0/.gitignore +2 -0
pycache_skip-0.1.0/LICENSE +21 -0
pycache_skip-0.1.0/PKG-INFO +130 -0
pycache_skip-0.1.0/README.md +115 -0
pycache_skip-0.1.0/idea.md +38 -0
pycache_skip-0.1.0/pyproject.toml +32 -0
pycache_skip-0.1.0/src/cache_skip/__init__.py +4 -0
pycache_skip-0.1.0/src/cache_skip/decorator.py +186 -0
pycache_skip-0.1.0/src/cache_skip/deps.py +156 -0
pycache_skip-0.1.0/src/cache_skip/dirmaker.py +32 -0
pycache_skip-0.1.0/src/cache_skip/scanner.py +38 -0
pycache_skip-0.1.0/src/cache_skip/state.py +36 -0
pycache_skip-0.1.0/src/cache_skip/tests/__init__.py +0 -0
pycache_skip-0.1.0/src/cache_skip/tests/test_decorator.py +228 -0
pycache_skip-0.1.0/src/cache_skip/tests/test_deps.py +75 -0
pycache_skip-0.1.0/src/cache_skip/tests/test_dirmaker.py +34 -0
pycache_skip-0.1.0/src/cache_skip/tests/test_scanner.py +57 -0
pycache_skip-0.1.0/src/cache_skip/tests/test_state.py +39 -0
pycache_skip-0.1.0/upload_pypi.sh +18 -0
pycache_skip-0.1.0/uv.lock +704 -0

pycache_skip-0.1.0/.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ .playwright-mcp
2	+

pycache_skip-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 cache_skip contributors
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

pycache_skip-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,130 @@
+Metadata-Version: 2.4
+Name: pycache_skip
+Version: 0.1.0
+Summary: Skip pipeline steps when inputs are unchanged — content-aware, with module dependency tracking
+License-Expression: MIT
+License-File: LICENSE
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.12
+Requires-Dist: loguru
+Requires-Dist: xxhash
+Description-Content-Type: text/markdown
+# pycache_skip
+Skip pipeline steps when their inputs have not changed.
+```bash
+uv add pycache_skip
+```
+## What it does
+`cache_skip` wraps a pipeline step function and skips re-execution when all
+inputs are unchanged. It stores a compact state file (`.input_state.json`)
+inside each output directory. On subsequent calls it compares the current
+inputs against the stored state and only reruns the function when something
+actually changed.
+## Usage
+### Basic example (single input directory)
+```python
+from pathlib import Path
+from cache_skip import cache_skip, Dirmaker
+dm = Dirmaker(Path("/data/pipeline/run-001"))
+@cache_skip
+def step_transform(raw: Path, *, _output: Path) -> Path:
+    # heavy transformation ...
+    return _output
+# First call — runs the function and records input state.
+step_transform(Path("/data/raw"), _output=dm.path_for("transform"))
+# Second call — skips the function, returns the output path immediately.
+step_transform(Path("/data/raw"), _output=dm.path_for("transform"))
+```
+### Example with non-Path args
+Non-`Path` arguments (dates, strings, ints, etc.) are also part of the cache
+key. Changing them triggers a rerun.
+```python
+import datetime as dt
+@cache_skip(track_dependencies=False)
+def step_build_config(
+    schedule_date: dt.date,
+    template: Path,
+    *,
+    _output: Path,
+) -> Path:
+    ...
+# Changing schedule_date from 2025-01-01 to 2025-01-02 invalidates the cache.
+```
+### Dirmaker companion
+`Dirmaker` allocates named output directories under a staging root. Use
+`path_for(name)` to resolve the path without side effects (for `@cache_skip`),
+or `new_output_dir(name)` to delete and recreate explicitly.
+```python
+dm = Dirmaker(Path("/data/pipeline/run-001"))
+# Pass path to decorator — decorator manages deletion on rerun.
+step_transform(raw, _output=dm.path_for("transform"))
+# Or manage the directory yourself:
+out = dm.new_output_dir("transform")   # deletes existing, creates fresh
+```
+## How invalidation works
+Three-tier change detection on every call after the first:
+1. **Args hash** — all non-`Path`, non-`_output` arguments are hashed via
+   `repr()`. A change in any scalar argument (date, string, int, …) triggers
+   a rerun immediately.
+2. **Dependency hash** — the source files of the decorated function and all
+   modules it imports (static AST analysis) are hashed. Editing the function's
+   source code triggers a rerun. Disable with `track_dependencies=False`.
+3. **File content hash** — every file under each input `Path` is compared.
+   Metadata (mtime, inode, size) is checked first as a fast path. If metadata
+   is identical the stored hash is trusted. If metadata drifted but content
+   hash matches, the state file is updated silently without a rerun (handles
+   `rsync` / `cp -p` copies with timestamp noise).
+## track_dependencies
+```python
+@cache_skip(track_dependencies=False)
+def step(...):
+    ...
+```
+Set `track_dependencies=False` to skip module source hashing. Useful when the
+function imports large, rarely-changing libraries and startup cost matters, or
+in tests.
+## Comparison with auto_skip
+`cache_skip` is a simpler, self-contained alternative to `auto_skip`:
+| Feature             | `cache_skip`                 | `auto_skip`          |
+| ------------------- | ---------------------------- | -------------------- |
+| Input detection     | explicit `Path` args         | strace / audit hooks |
+| Non-Path args       | hashed                       | ignored              |
+| Module dep tracking | static AST                   | runtime import list  |
+| External deps       | `xxhash`, `loguru`           | heavier stack        |
+| Output format       | dir with `.input_state.json` | opaque cache store   |

pycache_skip-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,115 @@
+# pycache_skip
+Skip pipeline steps when their inputs have not changed.
+```bash
+uv add pycache_skip
+```
+## What it does
+`cache_skip` wraps a pipeline step function and skips re-execution when all
+inputs are unchanged. It stores a compact state file (`.input_state.json`)
+inside each output directory. On subsequent calls it compares the current
+inputs against the stored state and only reruns the function when something
+actually changed.
+## Usage
+### Basic example (single input directory)
+```python
+from pathlib import Path
+from cache_skip import cache_skip, Dirmaker
+dm = Dirmaker(Path("/data/pipeline/run-001"))
+@cache_skip
+def step_transform(raw: Path, *, _output: Path) -> Path:
+    # heavy transformation ...
+    return _output
+# First call — runs the function and records input state.
+step_transform(Path("/data/raw"), _output=dm.path_for("transform"))
+# Second call — skips the function, returns the output path immediately.
+step_transform(Path("/data/raw"), _output=dm.path_for("transform"))
+```
+### Example with non-Path args
+Non-`Path` arguments (dates, strings, ints, etc.) are also part of the cache
+key. Changing them triggers a rerun.
+```python
+import datetime as dt
+@cache_skip(track_dependencies=False)
+def step_build_config(
+    schedule_date: dt.date,
+    template: Path,
+    *,
+    _output: Path,
+) -> Path:
+    ...
+# Changing schedule_date from 2025-01-01 to 2025-01-02 invalidates the cache.
+```
+### Dirmaker companion
+`Dirmaker` allocates named output directories under a staging root. Use
+`path_for(name)` to resolve the path without side effects (for `@cache_skip`),
+or `new_output_dir(name)` to delete and recreate explicitly.
+```python
+dm = Dirmaker(Path("/data/pipeline/run-001"))
+# Pass path to decorator — decorator manages deletion on rerun.
+step_transform(raw, _output=dm.path_for("transform"))
+# Or manage the directory yourself:
+out = dm.new_output_dir("transform")   # deletes existing, creates fresh
+```
+## How invalidation works
+Three-tier change detection on every call after the first:
+1. **Args hash** — all non-`Path`, non-`_output` arguments are hashed via
+   `repr()`. A change in any scalar argument (date, string, int, …) triggers
+   a rerun immediately.
+2. **Dependency hash** — the source files of the decorated function and all
+   modules it imports (static AST analysis) are hashed. Editing the function's
+   source code triggers a rerun. Disable with `track_dependencies=False`.
+3. **File content hash** — every file under each input `Path` is compared.
+   Metadata (mtime, inode, size) is checked first as a fast path. If metadata
+   is identical the stored hash is trusted. If metadata drifted but content
+   hash matches, the state file is updated silently without a rerun (handles
+   `rsync` / `cp -p` copies with timestamp noise).
+## track_dependencies
+```python
+@cache_skip(track_dependencies=False)
+def step(...):
+    ...
+```
+Set `track_dependencies=False` to skip module source hashing. Useful when the
+function imports large, rarely-changing libraries and startup cost matters, or
+in tests.
+## Comparison with auto_skip
+`cache_skip` is a simpler, self-contained alternative to `auto_skip`:
+| Feature             | `cache_skip`                 | `auto_skip`          |
+| ------------------- | ---------------------------- | -------------------- |
+| Input detection     | explicit `Path` args         | strace / audit hooks |
+| Non-Path args       | hashed                       | ignored              |
+| Module dep tracking | static AST                   | runtime import list  |
+| External deps       | `xxhash`, `loguru`           | heavier stack        |
+| Output format       | dir with `.input_state.json` | opaque cache store   |

pycache_skip-0.1.0/idea.md ADDED Viewed

@@ -0,0 +1,38 @@
+from casher use concept of:
+in-process module dependency tracking to compute implicit dependent files
+for all input dirs track each file and dir recursively:
+record in a file:
+- path
+- last-modified
+- node-id?
+- size
+- hash
+if last-modified size or node-id changed: compute hash and compare this:
+if identical last-modified and size. assume not changed
+if modified compute hash. if hash is the same, update entry, no changes
+the file/dirlist lives in output dir under .input.txt
+if output exists:
+read .input.txt and compare with current state of input dirs. if any changes, run function and create/update .input.txt with new state. if no changes, skip run and reuse output
+update .input.txt if last-modified or size changed but hash unchanged, to reflect current state; in this case no recomputation required as same input
+important the files and hashes include all module dependencies captured in dependency tracker! any change there, even if not in input dirs, should trigger recomputation as it may change the behavior of the function
+decorator: @cache_skip
+@cache_skip(track_dependencies=True) # default is true
+def pipeline_function(input_dir1: Path, input_dir2: Path, \_output: Path): # function body
+do lots of computation and write to \_output dir
+return \_output
+see file in /home/ralf/sync/synced_develop/lsy/hd-demo-designer/src/pipeline
+it will replace the auto_skip decorator there
+- module like casher will be uploaded to pypi, too same license etc.
+- proper README.md with usage instructions and examples.

pycache_skip-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,32 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[project]
+name = "pycache_skip"
+version = "0.1.0"
+description = "Skip pipeline steps when inputs are unchanged — content-aware, with module dependency tracking"
+readme = "README.md"
+requires-python = ">=3.12"
+license = "MIT"
+license-files = ["LICENSE"]
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: POSIX :: Linux",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+dependencies = ["loguru", "xxhash"]
+[tool.hatch.build.targets.wheel]
+packages = ["src/cache_skip"]
+[tool.hatch.build]
+exclude = ["test.sh", "coverage.sh"]
+[tool.pytest.ini_options]
+testpaths = ["src/cache_skip/tests"]
+timeout = 10
+[dependency-groups]
+dev = ["pytest>=9.0.3", "pytest-timeout>=2.4.0", "build>=1.5.0", "twine>=6.2.0"]

pycache_skip-0.1.0/src/cache_skip/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from cache_skip.decorator import cache_skip
+from cache_skip.dirmaker import Dirmaker
+__all__ = ["cache_skip", "Dirmaker"]

pycache_skip-0.1.0/src/cache_skip/decorator.py ADDED Viewed

@@ -0,0 +1,186 @@
+import inspect
+import shutil
+from collections.abc import Callable
+from functools import wraps
+from pathlib import Path
+import xxhash
+from loguru import logger
+from . import deps as _deps
+from .scanner import scan_inputs
+from .state import FileRecord, InputState, read_state, write_state
+def _collect_input_paths(bound: inspect.BoundArguments) -> list[Path]:
+    """Return every ``Path`` argument value of a bound call except ``_output``.
+
+    Values are returned in the order they appear in ``bound.arguments``.
+    Only top-level argument values are inspected.
+    """
+    return [
+        candidate
+        for arg_name, candidate in bound.arguments.items()
+        if arg_name != "_output" and isinstance(candidate, Path)
+    ]
+def _compute_args_hash(bound: inspect.BoundArguments) -> str:
+    """Return an xxh128 hex digest over the non-``Path`` arguments of a call.
+
+    ``_output`` and every ``Path`` value are left out. Each remaining
+    argument contributes ``name=repr(value)``; the pieces are sorted by name
+    and joined with ``|`` before hashing.
+    """
+    fragments = sorted(
+        (arg_name, repr(arg_value))
+        for arg_name, arg_value in bound.arguments.items()
+        if arg_name != "_output" and not isinstance(arg_value, Path)
+    )
+    payload = "|".join(f"{arg_name}={text}" for arg_name, text in fragments)
+    return xxhash.xxh128(payload.encode()).hexdigest()
+def _compute_file_hash(path: Path) -> str:
+    """Return the xxh128 hex digest of the content of the file at ``path``.
+
+    The file is fed to the hasher in fixed-size chunks so that large input
+    files are never held in memory in one piece. The digest is the same as
+    hashing the complete content in a single call.
+    """
+    hasher = xxhash.xxh128()
+    with path.open("rb") as stream:
+        # 1 MiB per read keeps memory flat regardless of file size.
+        while chunk := stream.read(1 << 20):
+            hasher.update(chunk)
+    return hasher.hexdigest()
+def _do_rerun(
+    fn: Callable,
+    args: tuple,
+    kwargs: dict,
+    input_paths: list[Path],
+    args_hash: str,
+    dep_hash: str,
+    output: Path,
+) -> object:
+    """Clear ``output``, call ``fn`` again and write a fresh state file.
+
+    Returns whatever ``fn(*args, **kwargs)`` returns. The state written to
+    ``output / ".input_state.json"`` is the scan of ``input_paths`` taken
+    after the call, stamped with ``args_hash`` and ``dep_hash``.
+    """
+    # Start from an empty directory; errors while deleting are ignored.
+    shutil.rmtree(output, ignore_errors=True)
+    output.mkdir(parents=True, exist_ok=True)
+    outcome = fn(*args, **kwargs)
+    snapshot = scan_inputs(input_paths)
+    snapshot.args_hash, snapshot.dep_hash = args_hash, dep_hash
+    write_state(snapshot, output / ".input_state.json")
+    return outcome
+def cache_skip(
+    _fn: Callable | None = None,
+    *,
+    track_dependencies: bool = True,
+) -> Callable:
+    """Decorate a pipeline step so it is skipped when its inputs are unchanged.
+
+    Usable bare (``@cache_skip``) or with options
+    (``@cache_skip(track_dependencies=False)``).
+
+    The decorated function must be called with ``_output`` passed as a
+    keyword argument. State is kept in ``<_output>/.input_state.json``.
+    The function is (re)run when the output directory or the state file is
+    missing, or when the args hash, the dependency hash, the set of scanned
+    input files, or the content hash of any input file differs from the
+    stored state.
+
+    Args:
+        _fn: The function when used as a bare decorator; ``None`` when the
+            decorator is called with keyword options.
+        track_dependencies: When true the dependency hash comes from
+            ``compute_dep_hash(fn)``; when false it is the empty string.
+
+    Returns:
+        The wrapped function, or a decorator when called with options only.
+        The wrapper returns the function's result when it runs, and the
+        ``_output`` path (as a ``Path``) when the call is skipped.
+    """
+    def decorator(fn: Callable) -> Callable:
+        @wraps(fn)
+        def wrapper(*args, **kwargs):
+            # Step 0 — extract _output
+            # Only the keyword form is looked at; a positional _output is not found here.
+            output = kwargs.get("_output")
+            assert (
+                output is not None
+            ), f"{fn.__qualname__}() missing required keyword argument: '_output'"
+            output = Path(output)
+            # Step 1 — collect input Paths
+            # Defaults are applied so that default-valued parameters take part in hashing too.
+            sig = inspect.signature(fn)
+            bound = sig.bind(*args, **kwargs)
+            bound.apply_defaults()
+            input_paths = _collect_input_paths(bound)
+            # Step 2 — compute args_hash
+            args_hash = _compute_args_hash(bound)
+            # Step 3 — compute dep_hash
+            if track_dependencies:
+                dep_hash = _deps.compute_dep_hash(fn)
+            else:
+                dep_hash = ""
+            # Step 4 — output absent → fresh run
+            if not output.exists():
+                output.mkdir(parents=True, exist_ok=True)
+                result = fn(*args, **kwargs)
+                # Inputs are scanned after the run and stored next to the results.
+                state = scan_inputs(input_paths)
+                state.args_hash = args_hash
+                state.dep_hash = dep_hash
+                write_state(state, output / ".input_state.json")
+                logger.info(
+                    "cache_skip: ran {} — output did not exist", fn.__qualname__
+                )
+                return result
+            # Step 5 — output present → compare state
+            state_path = output / ".input_state.json"
+            if not state_path.exists():
+                logger.info("cache_skip: rerunning {} — no state file", fn.__qualname__)
+                return _do_rerun(
+                    fn, args, kwargs, input_paths, args_hash, dep_hash, output
+                )
+            stored = read_state(state_path)
+            # Cheap checks first: the two precomputed hashes, then the set of file keys.
+            if stored.args_hash != args_hash:
+                logger.info("cache_skip: rerunning {} — args changed", fn.__qualname__)
+                return _do_rerun(
+                    fn, args, kwargs, input_paths, args_hash, dep_hash, output
+                )
+            if stored.dep_hash != dep_hash:
+                logger.info(
+                    "cache_skip: rerunning {} — dep_hash changed", fn.__qualname__
+                )
+                return _do_rerun(
+                    fn, args, kwargs, input_paths, args_hash, dep_hash, output
+                )
+            current = scan_inputs(input_paths)
+            if set(current.files) != set(stored.files):
+                logger.info(
+                    "cache_skip: rerunning {} — file set changed", fn.__qualname__
+                )
+                return _do_rerun(
+                    fn, args, kwargs, input_paths, args_hash, dep_hash, output
+                )
+            # Per-file comparison; updated_files collects the records to persist on a skip.
+            changed = False
+            updated_files: dict[str, FileRecord] = {}
+            for path_str, current_rec in current.files.items():
+                stored_rec = stored.files.get(path_str)
+                # Defensive only: the key sets were found equal just above.
+                if stored_rec is None:
+                    changed = True
+                    break
+                # Fast path: identical mtime, size and inode — the stored hash is trusted.
+                if (
+                    current_rec.mtime == stored_rec.mtime
+                    and current_rec.size == stored_rec.size
+                    and current_rec.inode == stored_rec.inode
+                ):
+                    updated_files[path_str] = stored_rec
+                    continue
+                # Metadata differs: hash the file content and compare with the stored hash.
+                actual_hash = _compute_file_hash(Path(path_str))
+                if actual_hash == stored_rec.hash:
+                    # Same content, new metadata: refresh the record without rerunning.
+                    updated_files[path_str] = FileRecord(
+                        path=path_str,
+                        mtime=current_rec.mtime,
+                        inode=current_rec.inode,
+                        size=current_rec.size,
+                        hash=actual_hash,
+                    )
+                else:
+                    changed = True
+                    break
+            if changed:
+                logger.info(
+                    "cache_skip: rerunning {} — file content changed", fn.__qualname__
+                )
+                return _do_rerun(
+                    fn, args, kwargs, input_paths, args_hash, dep_hash, output
+                )
+            # Persist refreshed metadata so the fast path applies on the next call.
+            if updated_files != stored.files:
+                stored.files = updated_files
+                write_state(stored, state_path)
+            logger.info("cache_skip: skipping {} — inputs unchanged", fn.__qualname__)
+            # On a skip the output path is returned, not a cached function result.
+            return output
+        return wrapper
+    # Bare use passes the function directly; otherwise hand back the decorator.
+    if _fn is not None:
+        return decorator(_fn)
+    return decorator

pycache_skip-0.1.0/src/cache_skip/deps.py ADDED Viewed

@@ -0,0 +1,156 @@
+import ast
+import inspect
+from collections.abc import Callable
+from pathlib import Path
+import xxhash
+def compute_dep_hash(
+    func: Callable,
+    dep_roots: list[Path] | None = None,
+    dep_files: list[Path] | None = None,
+) -> str:
+    """Return a combined hash of the source files ``func`` depends on.
+
+    Args:
+        func: Function whose defining module is the starting point.
+        dep_roots: Directories inside which imports are followed. When
+            ``None`` a single root is derived from the module of ``func``.
+        dep_files: Explicit file list. When given, exactly these files are
+            hashed and no import discovery takes place.
+
+    Returns:
+        The hex digest produced by ``_hash_files``.
+    """
+    # An explicit file list bypasses import discovery entirely.
+    if dep_files is not None:
+        return _hash_files(dep_files)
+    module = inspect.getmodule(func)
+    assert module is not None, f"Cannot determine module for {func}"
+    module_file = getattr(module, "__file__", None)
+    assert module_file is not None, f"Module {module.__name__} has no __file__"
+    roots = _auto_detect_roots(module) if dep_roots is None else dep_roots
+    resolved_roots = [Path(root).resolve() for root in roots]
+    seen: set[Path] = set()
+    # The entry file is always included, even when it lies outside every root.
+    _walk_imports(Path(module_file).resolve(), resolved_roots, seen, is_entry=True)
+    return _hash_files(list(seen))
+def _auto_detect_roots(mod) -> list[Path]:
+    """Return the directory that contains the top-level package of ``mod``.
+
+    The root is found by climbing from the module file, one directory per
+    dot in ``mod.__name__``. A package module (``<pkg>/__init__.py``) sits
+    one directory deeper than a plain module with the same dotted name, so
+    it needs one extra step; without it the root would be the package
+    directory itself and absolute imports of that package would not
+    resolve.
+
+    Args:
+        mod: Module object with ``__file__`` and ``__name__`` set.
+
+    Returns:
+        A single-element list holding the detected root directory.
+    """
+    mod_file = Path(mod.__file__).resolve()
+    levels_up = mod.__name__.count(".")
+    if mod_file.name == "__init__.py":
+        levels_up += 1
+    pkg_dir = mod_file.parent
+    for _ in range(levels_up):
+        pkg_dir = pkg_dir.parent
+    return [pkg_dir]
+def _hash_files(paths: list[Path]) -> str:
+    """Return one xxh128 hex digest covering the given files.
+
+    Each path is resolved, the resolved paths are ordered by their string
+    form, and one ``<path>:<content digest>`` line per file is joined with
+    newlines and hashed. The result therefore depends on both the resolved
+    locations and the contents.
+    """
+    resolved_paths = sorted((p.resolve() for p in paths), key=str)
+    lines = [
+        f"{target}:{xxhash.xxh128(target.read_bytes()).hexdigest()}"
+        for target in resolved_paths
+    ]
+    return xxhash.xxh128("\n".join(lines).encode()).hexdigest()
+def _walk_imports(
+    source_file: Path,
+    dep_roots: list[Path],
+    visited: set[Path],
+    *,
+    is_entry: bool = False,
+) -> None:
+    """Add ``source_file`` and the in-root files it imports to ``visited``.
+
+    Imports are discovered statically from the AST, so imports inside
+    functions or conditionals are followed as well. For ``from X import a``
+    both ``X`` and ``X.a`` are tried, because ``a`` may be a submodule; this
+    also covers ``from . import a``, which has no module part.
+
+    Args:
+        source_file: Python file to inspect.
+        dep_roots: Resolved directories inside which files are followed.
+        visited: Set of resolved files collected so far; updated in place.
+        is_entry: When true the file is accepted even if it lies outside
+            every root.
+    """
+    source_file = source_file.resolve()
+    if source_file in visited:
+        return
+    if not source_file.is_file() or source_file.suffix != ".py":
+        return
+    if not is_entry and not any(_is_under(source_file, root) for root in dep_roots):
+        return
+    visited.add(source_file)
+    try:
+        tree = ast.parse(source_file.read_bytes(), filename=str(source_file))
+    except SyntaxError:
+        # Best effort: the file itself stays in ``visited`` and is hashed;
+        # only its imports cannot be followed.
+        return
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Import):
+            candidates = [alias.name for alias in node.names]
+        elif isinstance(node, ast.ImportFrom):
+            base = node.module or ""
+            if node.level > 0:
+                base = _resolve_relative(base, node.level, source_file, dep_roots)
+                if base is None:
+                    continue
+            candidates = [base] if base else []
+            # An imported name may be a submodule rather than an attribute.
+            # When it is not, resolution falls back to the package file,
+            # which is already visited.
+            for alias in node.names:
+                if alias.name != "*":
+                    candidates.append(f"{base}.{alias.name}" if base else alias.name)
+        else:
+            continue
+        for module_name in candidates:
+            resolved = _resolve_module(module_name, source_file, dep_roots)
+            if resolved:
+                _walk_imports(resolved, dep_roots, visited)
+def _resolve_module(
+    module_name: str,
+    from_file: Path,
+    dep_roots: list[Path],
+) -> Path | None:
+    """Map a dotted module name to a source file under one of ``dep_roots``.
+
+    For each root, the longest dotted prefix is tried first and then
+    progressively shorter ones. For every prefix ``<prefix>/__init__.py``
+    is preferred over ``<prefix>.py``. Trailing components that do not
+    exist as files (for example attribute names) therefore resolve to the
+    closest enclosing module or package. ``from_file`` is accepted but not
+    used.
+
+    Returns:
+        The resolved path of the first match, or ``None`` when nothing
+        matches in any root.
+    """
+    segments = module_name.split(".")
+    for root in dep_roots:
+        for end in range(len(segments), 0, -1):
+            head, leaf = segments[: end - 1], segments[end - 1]
+            package_init = root / "/".join(segments[:end]) / "__init__.py"
+            if package_init.is_file():
+                return package_init.resolve()
+            if head:
+                module_file = root / "/".join(head) / (leaf + ".py")
+            else:
+                module_file = root / (leaf + ".py")
+            if module_file.is_file():
+                return module_file.resolve()
+    return None
+def _resolve_relative(
+    module_name: str,
+    level: int,
+    from_file: Path,
+    dep_roots: list[Path],
+) -> str | None:
+    """Turn a relative import into a dotted name relative to a dep root.
+
+    The anchor package directory is the directory of ``from_file``, moved
+    up ``level - 1`` times. For the first root that contains the anchor,
+    the anchor's path below that root becomes the dotted prefix of
+    ``module_name``.
+
+    Returns:
+        ``prefix.module_name``, or whichever of the two is non-empty (an
+        empty string when both are empty), or ``None`` when the anchor is
+        outside every root.
+    """
+    anchor = from_file.parent
+    for _ in range(level - 1):
+        anchor = anchor.parent
+    for root in dep_roots:
+        if not _is_under(anchor, root):
+            continue
+        dotted_prefix = ".".join(anchor.relative_to(root).parts)
+        if dotted_prefix and module_name:
+            return f"{dotted_prefix}.{module_name}"
+        return dotted_prefix or module_name
+    return None
+def _is_under(path: Path, root: Path) -> bool:
+    """Return True when ``path`` equals ``root`` or lies below it.
+
+    The comparison is purely lexical; the file system is not consulted.
+    """
+    return path.is_relative_to(root)