ntask 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ntask/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ """ntask: a Python-native task runner with content-hash caching and DAG execution."""
2
+
3
+ from ._depends import depends
4
+ from ._errors import CycleError, DiscoveryError, NtaskError, ShellError
5
+ from ._shell import ShellResult, shell
6
+ from ._task import cached, group, task
7
+
8
+ __version__ = "1.0.0"
9
+ __all__ = [
10
+ "CycleError",
11
+ "DiscoveryError",
12
+ "NtaskError",
13
+ "ShellError",
14
+ "ShellResult",
15
+ "__version__",
16
+ "cached",
17
+ "depends",
18
+ "group",
19
+ "shell",
20
+ "task",
21
+ ]
ntask/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ import sys
2
+
3
+ from ._cli import main
4
+
5
# Entry point for ``python -m ntask``: run the CLI and use its return value
# as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
@@ -0,0 +1,178 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import platform
5
+ import sys
6
+ import time
7
+ from pathlib import Path
8
+
9
+ from .._remote.base import RemoteBackend
10
+ from .._task import Task
11
+ from .body import hash_task_body
12
+ from .key import CacheBreakdown, CacheKeyInputs, InputRecord, compute_cache_key
13
+ from .manifest import compute_input_manifest
14
+ from .outputs import OutputStore
15
+ from .store import CacheEntry, CacheStore
16
+
17
+ # Module-level flag for warn-once pattern.
18
+ _remote_warn_fired = False
19
+
20
+
21
+ def _warn_once_remote_failed(e: BaseException) -> None:
22
+ """Print a single warning the first time a remote call fails in this process."""
23
+ global _remote_warn_fired
24
+ if not _remote_warn_fired:
25
+ print(
26
+ f"warning: remote cache unreachable: {type(e).__name__}: {e}. "
27
+ f"Falling back to local.",
28
+ file=sys.stderr,
29
+ )
30
+ _remote_warn_fired = True
31
+
32
+
33
+ def _python_version_str() -> str:
34
+ return ".".join(str(x) for x in sys.version_info[:3])
35
+
36
+
37
+ def _platform_tag_str() -> str:
38
+ return f"{platform.system()}-{platform.machine()}".lower()
39
+
40
+
41
class CacheEngine:
    """Ties together cache-key computation, the local stores and a remote.

    ``store`` holds cache entries and ``outputs`` holds captured output
    blobs, both rooted at ``root``. ``remote`` is optional; every remote
    operation is best-effort and degrades to local-only behaviour on error.
    """

    def __init__(self, root: Path, remote: RemoteBackend | None = None):
        self.root = root
        self.store = CacheStore(root=root)
        self.outputs = OutputStore(root=root)
        self._remote = remote

    def compute_key_and_breakdown(
        self,
        t: Task,
        *,
        workspace: Path,
        upstream_keys_by_dep: dict[str, str],
    ) -> tuple[str, CacheBreakdown]:
        """Compute the cache key for ``t`` plus a breakdown of what went into it.

        Args:
            t: Task to key; its ``cached_config`` must not be ``None``.
            workspace: Directory against which input patterns are resolved
                and to which recorded input paths are made relative.
            upstream_keys_by_dep: Cache key of each upstream dependency, by
                dependency name.

        Returns:
            ``(key, breakdown)`` where ``breakdown`` records the inputs, env
            values, body hash, interpreter version, platform tag and upstream
            keys observed for this computation.
        """
        cfg = t.cached_config
        assert cfg is not None
        manifest = compute_input_manifest(cfg.inputs, root=workspace)
        body = hash_task_body(t.func)
        # The breakdown records "<unset>" for variables missing from the
        # environment.
        env_values = {name: os.environ.get(name, "<unset>") for name in cfg.env}
        py_version = _python_version_str()
        plat_tag = _platform_tag_str()

        # Tuple ordering for cache key matches insertion order of upstream_keys_by_dep.
        upstream_tuple = tuple(upstream_keys_by_dep.values())
        key_inputs = CacheKeyInputs(
            task_fqn=t.fqn,
            task_body_hash=body,
            # For the key itself an unset variable is passed as the empty
            # string, so "unset" and "set to empty" yield the same key.
            env={name: ("" if v == "<unset>" else v) for name, v in env_values.items()},
            env_names=cfg.env,
            input_patterns=cfg.inputs,
            input_manifest_digest=manifest.digest,
            root=workspace,
            upstream_keys=upstream_tuple if cfg.propagate else (),
            strict=cfg.strict,
        )
        key = compute_cache_key(key_inputs)

        input_records = tuple(
            InputRecord(
                path=fh.path.relative_to(workspace).as_posix(),
                digest=fh.digest,
                mode=fh.mode,
            )
            for fh in manifest.files
        )
        breakdown = CacheBreakdown(
            input_patterns=cfg.inputs,
            inputs=input_records,
            env_values=env_values,
            task_body_hash=body,
            python_version=py_version,
            platform_tag=plat_tag,
            upstream_keys_by_dep=dict(upstream_keys_by_dep),
        )
        return key, breakdown

    def check(self, t: Task, key: str) -> CacheEntry | None:
        """Look up ``key`` for task ``t`` in the local entry store."""
        return self.store.get(t.fqn, key)

    def store_entry(
        self,
        t: Task,
        key: str,
        *,
        workspace: Path,
        duration: float,
        breakdown: CacheBreakdown,
        upstream_keys: tuple[str, ...],
    ) -> CacheEntry:
        """Capture the task's outputs (if it declares any) and record an entry.

        The entry is stamped with the current wall-clock time, written to the
        local store under the task's fully qualified name, and returned.
        """
        cfg = t.cached_config
        assert cfg is not None
        outputs_hash = (
            self.outputs.capture(cfg.outputs, root=workspace) if cfg.outputs else None
        )
        entry = CacheEntry(
            key=key,
            outputs_hash=outputs_hash,
            duration=duration,
            completed_at=time.time(),
            upstream_keys=upstream_keys,
            breakdown=breakdown,
        )
        self.store.put(t.fqn, entry)
        return entry

    def restore_outputs(self, entry: CacheEntry, *, workspace: Path) -> None:
        """Restore the entry's captured outputs into ``workspace``, if any."""
        if entry.outputs_hash:
            self.outputs.restore(entry.outputs_hash, root=workspace)

    def check_remote(self, t: Task, key: str) -> CacheEntry | None:
        """Try remote: if hit, download entry + outputs and populate local store.

        Returns ``None`` when no remote is configured, the remote has no
        entry, or any remote step fails (a warning is printed once per
        process). ``KeyboardInterrupt``/``SystemExit`` are not swallowed.
        """
        if self._remote is None:
            return None
        try:
            entry_dict = self._remote.get_entry(t.fqn, key)
        except Exception as e:
            _warn_once_remote_failed(e)
            return None
        if entry_dict is None:
            return None
        try:
            entry = CacheEntry.from_dict(entry_dict)
        except Exception as e:
            _warn_once_remote_failed(e)
            return None
        # Download outputs into the content-addressed store if needed.
        if entry.outputs_hash is not None:
            target_dir = self.outputs._hash_dir(entry.outputs_hash)
            if not target_dir.exists():
                try:
                    self._remote.get_output(entry.outputs_hash, target_dir)
                except BaseException as e:
                    import shutil

                    # The directory did not exist before this call, so
                    # anything there now is a partial download. Remove it so
                    # a later existence check does not accept it as complete.
                    shutil.rmtree(target_dir, ignore_errors=True)
                    if not isinstance(e, Exception):
                        # Interrupts and interpreter exit must propagate.
                        raise
                    _warn_once_remote_failed(e)
                    return None
        # Mirror the entry into local store so future runs short-circuit.
        self.store.put(t.fqn, entry)
        return entry

    def push_remote(self, t: Task, entry: CacheEntry) -> None:
        """Upload entry + outputs to remote. Non-fatal on error.

        Ordinary exceptions are reported once via a warning and otherwise
        ignored; ``KeyboardInterrupt``/``SystemExit`` propagate.
        """
        if self._remote is None:
            return
        try:
            self._remote.put_entry(t.fqn, entry.key, entry.to_dict())
        except Exception as e:
            _warn_once_remote_failed(e)
            return
        if entry.outputs_hash is not None:
            source = self.outputs._hash_dir(entry.outputs_hash)
            if source.is_dir():
                try:
                    self._remote.put_output(entry.outputs_hash, source)
                except Exception as e:
                    _warn_once_remote_failed(e)
178
+
ntask/_cache/body.py ADDED
@@ -0,0 +1,39 @@
1
+ from __future__ import annotations
2
+
3
+ import ast
4
+ import inspect
5
+ import textwrap
6
+ from collections.abc import Callable
7
+ from typing import Any
8
+
9
+ from .hash import hash_bytes
10
+
11
+
12
+ def _strip_docstring(tree: ast.AST) -> None:
13
+ """Remove docstring nodes from functions/classes/modules in-place."""
14
+ for node in ast.walk(tree):
15
+ if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef,
16
+ ast.ClassDef, ast.Module)):
17
+ continue
18
+ body = node.body
19
+ if (body and isinstance(body[0], ast.Expr)
20
+ and isinstance(body[0].value, ast.Constant)
21
+ and isinstance(body[0].value.value, str)):
22
+ node.body = body[1:]
23
+
24
+
25
def hash_task_body_source(source: str) -> str:
    """Hash source code by its AST rather than its text.

    The source is dedented and parsed, docstrings are stripped, and the AST
    is dumped without field names or position attributes before hashing.
    Comments and formatting never reach the AST, so they do not affect the
    result.
    """
    module = ast.parse(textwrap.dedent(source))
    _strip_docstring(module)
    structure = ast.dump(module, annotate_fields=False, include_attributes=False)
    encoded = structure.encode()
    return hash_bytes(encoded)
31
+
32
+
33
def hash_task_body(func: Callable[..., Any]) -> str:
    """Structural hash of a function's source. Stable under doc/whitespace/comment noise.

    Fallbacks, so that key computation never crashes on an unusual callable:

    * If the source cannot be retrieved, a placeholder naming the callable
      is hashed. Callables without ``__qualname__`` (for example
      ``functools.partial`` objects) are named after their type.
    * If the retrieved source does not parse on its own (for example a
      method whose body has a triple-quoted string starting at column 0,
      which defeats dedenting), the dedented text is hashed instead. That
      hash is sensitive to comments and whitespace.
    """
    try:
        source = inspect.getsource(func)
    except (OSError, TypeError):
        label = getattr(func, "__qualname__", type(func).__qualname__)
        return hash_bytes(f"<no-source:{label}>".encode())
    try:
        return hash_task_body_source(source)
    except SyntaxError:
        # IndentationError is a SyntaxError subclass, so it lands here too.
        return hash_bytes(b"<raw-source>" + textwrap.dedent(source).encode())
ntask/_cache/diff.py ADDED
@@ -0,0 +1,127 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Literal
5
+
6
+ from .key import CacheBreakdown, InputRecord
7
+
8
# Closed set of reasons a cache lookup can miss; the type of ``MissItem.kind``.
MissKind = Literal[
    "first-run",
    # Input file changes.
    "input-modified",
    "input-added",
    "input-removed",
    # Environment variable changes.
    "env-changed",
    "env-added",
    "env-removed",
    # Task body, upstream tasks, interpreter and platform.
    "body-changed",
    "upstream-invalidated",
    "python-changed",
    "platform-changed",
]
16
+
17
+
18
+ @dataclass(frozen=True, slots=True)
19
+ class MissItem:
20
+ kind: MissKind
21
+ detail: str
22
+
23
+
24
+ @dataclass(frozen=True, slots=True)
25
+ class MissReport:
26
+ items: tuple[MissItem, ...]
27
+
28
+ @property
29
+ def is_hit(self) -> bool:
30
+ return not self.items
31
+
32
+ def summary(self) -> str:
33
+ if not self.items:
34
+ return "no changes"
35
+ first = self.items[0]
36
+ rest = len(self.items) - 1
37
+ label = f"{first.kind}: {first.detail}" if first.detail else first.kind
38
+ return f"{label}" + (f" (+{rest} more)" if rest else "")
39
+
40
+
41
def _diff_inputs(
    current: tuple[InputRecord, ...],
    prior: tuple[InputRecord, ...],
) -> list[MissItem]:
    """List input files added, removed or modified between two runs.

    Records are matched by path and reported in sorted path order. A file
    counts as modified when its digest or its mode differs.
    """
    before = {record.path: record for record in prior}
    after = {record.path: record for record in current}
    changes: list[MissItem] = []

    for path in sorted(before.keys() | after.keys()):
        old = before.get(path)
        new = after.get(path)
        if old is None and new is not None:
            changes.append(MissItem(kind="input-added", detail=path))
            continue
        if new is None and old is not None:
            changes.append(MissItem(kind="input-removed", detail=path))
            continue
        if old is None or new is None:
            continue
        if (old.digest, old.mode) != (new.digest, new.mode):
            changes.append(MissItem(kind="input-modified", detail=path))
    return changes
61
+
62
+
63
def _diff_env(
    current: dict[str, str],
    prior: dict[str, str],
) -> list[MissItem]:
    """List environment variables added, removed or changed between two runs.

    Names are reported in sorted order. A changed variable's detail shows
    the old and new values.
    """
    changes: list[MissItem] = []
    for name in sorted(prior.keys() | current.keys()):
        was_tracked = name in prior
        is_tracked = name in current
        if is_tracked and not was_tracked:
            changes.append(MissItem(kind="env-added", detail=name))
        elif was_tracked and not is_tracked:
            changes.append(MissItem(kind="env-removed", detail=name))
        else:
            old, new = prior[name], current[name]
            if new != old:
                changes.append(MissItem(
                    kind="env-changed",
                    detail=f"{name}: {old!r} → {new!r}",
                ))
    return changes
80
+
81
+
82
def _diff_upstream(
    current: dict[str, str],
    prior: dict[str, str],
) -> list[MissItem]:
    """List dependencies whose cache key differs between two runs.

    A dependency present on only one side also counts as different.
    Results are in sorted dependency-name order.
    """
    dep_names = sorted(prior.keys() | current.keys())
    return [
        MissItem(kind="upstream-invalidated", detail=dep)
        for dep in dep_names
        if prior.get(dep) != current.get(dep)
    ]
92
+
93
+
94
def diff_cache_state(
    current: CacheBreakdown,
    prior: CacheBreakdown | None,
) -> MissReport:
    """Explain what differs between a prior breakdown and the current one.

    With no prior breakdown the report holds a single ``first-run`` item.
    Otherwise items appear in this order:

      1. input-* (sorted by path)
      2. env-* (sorted by name)
      3. body-changed
      4. upstream-invalidated (sorted by dependency name)
      5. python-changed, then platform-changed
    """
    if prior is None:
        return MissReport(items=(MissItem(kind="first-run", detail=""),))

    found: list[MissItem] = [
        *_diff_inputs(current.inputs, prior.inputs),
        *_diff_env(current.env_values, prior.env_values),
    ]
    if prior.task_body_hash != current.task_body_hash:
        found.append(MissItem(kind="body-changed", detail=""))
    found += _diff_upstream(current.upstream_keys_by_dep, prior.upstream_keys_by_dep)

    old_python, new_python = prior.python_version, current.python_version
    if old_python != new_python:
        found.append(MissItem(
            kind="python-changed",
            detail=f"{old_python!r} → {new_python!r}",
        ))

    old_platform, new_platform = prior.platform_tag, current.platform_tag
    if old_platform != new_platform:
        found.append(MissItem(
            kind="platform-changed",
            detail=f"{old_platform!r} → {new_platform!r}",
        ))

    return MissReport(items=tuple(found))
ntask/_cache/hash.py ADDED
@@ -0,0 +1,43 @@
1
+ from __future__ import annotations
2
+
3
+ from concurrent.futures import ThreadPoolExecutor
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+
7
+ import xxhash
8
+
9
+
10
def hash_bytes(data: bytes) -> str:
    """Return the hexadecimal XXH3-128 digest of ``data``."""
    hasher = xxhash.xxh3_128(data)
    return hasher.hexdigest()
12
+
13
+
14
def hash_many(parts: list[bytes]) -> str:
    """Return one XXH3-128 hex digest covering an ordered list of byte strings.

    Each part is preceded by its length as an 8-byte little-endian integer,
    so the boundaries between parts influence the digest.
    """
    hasher = xxhash.xxh3_128()
    for chunk in parts:
        length_prefix = len(chunk).to_bytes(8, "little")
        hasher.update(length_prefix)
        hasher.update(chunk)
    return hasher.hexdigest()
20
+
21
+
22
+ @dataclass(frozen=True, slots=True)
23
+ class FileHash:
24
+ path: Path
25
+ digest: str
26
+ mode: int
27
+
28
+
29
def hash_file(path: Path) -> FileHash:
    """Hash a file's contents and record its permission bits.

    The file is read in 1 MiB chunks. The mode comes from a ``stat`` call
    made after reading and is masked to the low nine permission bits.
    """
    chunk_size = 1 << 20  # 1 MiB
    hasher = xxhash.xxh3_128()
    with path.open("rb") as stream:
        for chunk in iter(lambda: stream.read(chunk_size), b""):
            hasher.update(chunk)
    permissions = path.stat().st_mode & 0o777
    return FileHash(path=path, digest=hasher.hexdigest(), mode=permissions)
36
+
37
+
38
def hash_many_files(paths: list[Path]) -> list[FileHash]:
    """Hash several files on a thread pool.

    Results are returned in sorted-path order, whatever order the paths
    were given in. An empty input returns an empty list without creating a
    pool.
    """
    if not paths:
        return []
    ordered = sorted(paths)
    with ThreadPoolExecutor() as pool:
        results = pool.map(hash_file, ordered)
        return list(results)
ntask/_cache/key.py ADDED
@@ -0,0 +1,68 @@
1
+ from __future__ import annotations
2
+
3
+ import platform
4
+ import sys
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+
8
+ from .hash import hash_many
9
+
10
+ FORMAT_VERSION = b"ntask/v1"
11
+
12
+
13
+ @dataclass(frozen=True, slots=True)
14
+ class CacheKeyInputs:
15
+ task_fqn: str
16
+ task_body_hash: str
17
+ env: dict[str, str]
18
+ env_names: tuple[str, ...]
19
+ input_patterns: tuple[str, ...]
20
+ input_manifest_digest: str
21
+ root: Path
22
+ upstream_keys: tuple[str, ...]
23
+ strict: bool = True
24
+
25
+
26
+ def _python_version_tuple() -> bytes:
27
+ return ".".join(str(x) for x in sys.version_info[:3]).encode()
28
+
29
+
30
+ def _platform_tag() -> bytes:
31
+ return f"{platform.system()}-{platform.machine()}".lower().encode()
32
+
33
+
34
def compute_cache_key(inp: CacheKeyInputs) -> str:
    """Fold every key ingredient into a single digest.

    Parts are hashed in this order: format version, task name, body hash,
    Python version, platform tag, environment block, sorted input patterns,
    input manifest digest, upstream keys. When ``inp.strict`` is false the
    body hash, Python version and platform tag are replaced by fixed
    placeholders.
    """
    if inp.strict:
        body_part = inp.task_body_hash.encode()
        python_part = _python_version_tuple()
        platform_part = _platform_tag()
    else:
        body_part = b"<body-unstrict>"
        python_part = b"<py-unstrict>"
        platform_part = b"<plat-unstrict>"

    # One "NAME=value" line per tracked variable, sorted by name; names
    # missing from the mapping are written as "<unset>".
    env_lines: list[str] = []
    for name in sorted(inp.env_names):
        value = inp.env.get(name)
        shown = "<unset>" if value is None else value
        env_lines.append(f"{name}={shown}")

    parts: list[bytes] = [
        FORMAT_VERSION,
        inp.task_fqn.encode(),
        body_part,
        python_part,
        platform_part,
        "\n".join(env_lines).encode(),
    ]
    parts += [pattern.encode() for pattern in sorted(inp.input_patterns)]
    parts.append(inp.input_manifest_digest.encode())
    parts += [upstream.encode() for upstream in inp.upstream_keys]
    return hash_many(parts)
51
+
52
+
53
+ @dataclass(frozen=True, slots=True)
54
+ class InputRecord:
55
+ path: str
56
+ digest: str
57
+ mode: int
58
+
59
+
60
+ @dataclass(frozen=True, slots=True)
61
+ class CacheBreakdown:
62
+ input_patterns: tuple[str, ...]
63
+ inputs: tuple[InputRecord, ...]
64
+ env_values: dict[str, str]
65
+ task_body_hash: str
66
+ python_version: str
67
+ platform_tag: str
68
+ upstream_keys_by_dep: dict[str, str]
@@ -0,0 +1,58 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+
6
+ import pathspec
7
+ from pathspec.pattern import Pattern
8
+
9
+ from .hash import FileHash, hash_many, hash_many_files
10
+
11
+
12
+ @dataclass(frozen=True, slots=True)
13
+ class InputManifest:
14
+ files: list[FileHash]
15
+ digest: str
16
+
17
+
18
def _load_gitignore(root: Path) -> pathspec.PathSpec[Pattern] | None:
    """Load ``<root>/.gitignore`` as a gitwildmatch spec.

    Returns ``None`` when the file does not exist. Only the ``.gitignore``
    directly under ``root`` is read.

    The file is decoded as UTF-8 rather than with the platform default, so
    the same workspace yields the same ignore rules on every host.
    Undecodable bytes are replaced instead of raising.
    """
    gi = root / ".gitignore"
    if not gi.is_file():
        return None
    text = gi.read_text(encoding="utf-8", errors="replace")
    return pathspec.PathSpec.from_lines("gitwildmatch", text.splitlines())
23
+
24
+
25
def compute_input_manifest(
    patterns: list[str] | tuple[str, ...],
    *,
    root: Path,
    respect_gitignore: bool = True,
) -> InputManifest:
    """Resolve input patterns under ``root`` and hash the matching files.

    Every regular file below ``root`` whose root-relative POSIX path matches
    ``patterns`` (gitwildmatch syntax) is selected. With
    ``respect_gitignore`` set, files matched by ``<root>/.gitignore`` are
    then dropped. The digest covers each file's relative path, content
    digest and mode. With no patterns or no surviving files, the digest is
    that of a fixed empty-manifest marker.
    """
    if not patterns:
        return InputManifest(files=[], digest=hash_many([b"<empty-manifest>"]))

    def relative(path: Path) -> str:
        return path.relative_to(root).as_posix()

    include = pathspec.PathSpec.from_lines("gitwildmatch", patterns)
    selected: set[Path] = {
        path
        for path in root.rglob("*")
        if path.is_file() and include.match_file(relative(path))
    }

    ignore = _load_gitignore(root) if respect_gitignore else None
    if ignore is not None:
        selected = {path for path in selected if not ignore.match_file(relative(path))}

    hashed = hash_many_files(sorted(selected))
    parts: list[bytes] = []
    for fh in hashed:
        rel_str: str = relative(fh.path)
        parts.extend((rel_str.encode(), fh.digest.encode(), str(fh.mode).encode()))
    digest = hash_many(parts or [b"<empty-manifest>"])
    return InputManifest(files=hashed, digest=digest)
@@ -0,0 +1,86 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import shutil
5
+ from pathlib import Path
6
+
7
+ import pathspec
8
+
9
+ from .hash import hash_many, hash_many_files
10
+
11
+
12
class OutputStore:
    """Content-addressed output blob store.

    Layout: ``<root>/outputs/<outputs-hash>/...`` - preserving relative paths.
    Capture and restore both hard-link files where possible and fall back
    to copying when linking fails (for example across filesystems).

    NOTE(review): a hard-linked file shares its data with the workspace
    file, so a tool that rewrites an output in place also changes the
    stored blob. Confirm that producers replace outputs rather than
    truncate them.
    """

    def __init__(self, root: Path):
        self.root = root
        self._outputs_dir = root / "outputs"

    def _hash_dir(self, digest: str) -> Path:
        """Return the directory that holds the blob for ``digest``."""
        return self._outputs_dir / digest

    def capture(
        self,
        patterns: list[str] | tuple[str, ...],
        *,
        root: Path,
    ) -> str | None:
        """Capture matching files into the content-addressed store, return the hash.

        Returns ``None`` when ``patterns`` is empty or matches no regular
        file under ``root``. The hash covers each file's root-relative path
        and content digest. If a blob with that hash already exists it is
        reused as is.

        If populating a new blob fails part-way, the partial directory is
        removed before the error propagates, so a later capture cannot
        mistake it for a complete blob.
        """
        if not patterns:
            return None
        spec = pathspec.PathSpec.from_lines("gitwildmatch", patterns)
        matches: list[Path] = [
            p
            for p in root.rglob("*")
            if p.is_file() and spec.match_file(p.relative_to(root).as_posix())
        ]
        if not matches:
            return None

        file_hashes = hash_many_files(sorted(matches))
        parts: list[bytes] = []
        for fh in file_hashes:
            parts.append(fh.path.relative_to(root).as_posix().encode())
            parts.append(fh.digest.encode())
        digest = hash_many(parts)

        target = self._hash_dir(digest)
        if target.exists():
            return digest

        target.mkdir(parents=True, exist_ok=True)
        try:
            for fh in file_hashes:
                dest = target / fh.path.relative_to(root)
                dest.parent.mkdir(parents=True, exist_ok=True)
                self._link_or_copy(fh.path, dest)
        except BaseException:
            # Re-raised below; this only guarantees no half-filled blob is
            # left behind, including on interrupt.
            shutil.rmtree(target, ignore_errors=True)
            raise
        return digest

    def restore(self, digest: str, *, root: Path) -> None:
        """Restore a previously captured output tree into ``root``.

        Existing destination entries are replaced. A destination that is a
        symlink, including a dangling one, is removed rather than followed,
        so restored content is never written through it to another path.

        Raises:
            FileNotFoundError: if no blob directory exists for ``digest``.
        """
        src = self._hash_dir(digest)
        if not src.is_dir():
            raise FileNotFoundError(f"no output blob for hash {digest}")
        for p in src.rglob("*"):
            if not p.is_file():
                continue
            dest = root / p.relative_to(src)
            dest.parent.mkdir(parents=True, exist_ok=True)
            # lexists() is also true for dangling symlinks, which
            # Path.exists() reports as absent.
            if os.path.lexists(dest):
                dest.unlink()
            self._link_or_copy(p, dest)

    @staticmethod
    def _link_or_copy(src: Path, dest: Path) -> None:
        """Hard-link ``src`` to ``dest``; copy with metadata if linking fails."""
        try:
            os.link(src, dest)
        except (OSError, NotImplementedError):
            shutil.copy2(src, dest)