ntrk 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ntrk-0.1.0/LICENSE +21 -0
- ntrk-0.1.0/PKG-INFO +99 -0
- ntrk-0.1.0/README.md +86 -0
- ntrk-0.1.0/pyproject.toml +30 -0
- ntrk-0.1.0/src/ntrk/__init__.py +3 -0
- ntrk-0.1.0/src/ntrk/__main__.py +6 -0
- ntrk-0.1.0/src/ntrk/cli.py +206 -0
- ntrk-0.1.0/src/ntrk/config.py +38 -0
- ntrk-0.1.0/src/ntrk/gitutil.py +84 -0
- ntrk-0.1.0/src/ntrk/hashing.py +15 -0
- ntrk-0.1.0/src/ntrk/parse.py +83 -0
- ntrk-0.1.0/src/ntrk/record.py +31 -0
- ntrk-0.1.0/src/ntrk/store.py +196 -0
ntrk-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 tim-hudelmaier
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
ntrk-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ntrk
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Wrap a command; later, ask how any file was made. A minimal experiment & lineage tracker.
|
|
5
|
+
Keywords: lineage,provenance,experiment,reproducibility,cli
|
|
6
|
+
Author: tim-hudelmaier
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Python: >=3.11
|
|
10
|
+
Project-URL: Homepage, https://github.com/tim-hudelmaier/ntrk
|
|
11
|
+
Project-URL: Repository, https://github.com/tim-hudelmaier/ntrk
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# ntrk
|
|
15
|
+
|
|
16
|
+
**Wrap a command. Later, ask how any file was made.**
|
|
17
|
+
|
|
18
|
+
ntrk records how each output was produced — the exact command, the git commit, and md5
|
|
19
|
+
hashes of every input and output — so you can point at a file weeks later and see its full
|
|
20
|
+
lineage, back to the raw inputs. Pure Python standard library, no dependencies, two commands.
|
|
21
|
+
|
|
22
|
+
## Install
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
uv tool install ntrk
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Commands: `nt` (alias `ntrk`). Run `uv tool update-shell` once if `nt` isn't on your `PATH`.
|
|
29
|
+
|
|
30
|
+
## Add to Claude
|
|
31
|
+
|
|
32
|
+
This repo is also a Claude Code plugin marketplace shipping the `ntrk` skill, so your coding
|
|
33
|
+
agent knows when to record runs and trace outputs:
|
|
34
|
+
|
|
35
|
+
```
|
|
36
|
+
/plugin marketplace add tim-hudelmaier/ntrk
|
|
37
|
+
/plugin install ntrk@ntrk
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Use
|
|
41
|
+
|
|
42
|
+
Two commands, no flags.
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
nt track python main.py -i in.csv -o out.csv # run it, record how
|
|
46
|
+
nt trace out.csv # how was this made?
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
`nt track` refuses to run on a dirty repo, so every result maps to a real commit. `nt trace`
|
|
50
|
+
prints one line per step — raw inputs first, the file you asked about last — so you can pipe it
|
|
51
|
+
through `head` / `grep` / `awk`. A `[modified]` marker appears if a file changed since it was made:
|
|
52
|
+
|
|
53
|
+
```console
|
|
54
|
+
$ nt trace out_final.csv
|
|
55
|
+
in.csv -> step1.py @ 9c4b2a1 -> out.csv
|
|
56
|
+
out.csv -> step2.py @ 3a1f9c8 -> out_final.csv
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Inputs and outputs are detected from the conventional flags
|
|
60
|
+
`-i -o --in --out --input --output` (repeated, space-, or comma-separated lists work). If your
|
|
61
|
+
scripts use other flag names, add a small `ntrk.toml`:
|
|
62
|
+
|
|
63
|
+
```toml
|
|
64
|
+
[flags]
|
|
65
|
+
inputs = ["--source"]
|
|
66
|
+
outputs = ["--dest"]
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## How it works
|
|
70
|
+
|
|
71
|
+
Each run is appended as one JSON line to `.ntrk/runs.jsonl` (commit it — it's your lineage
|
|
72
|
+
history), and each output also gets an invisible self-contained sidecar (`.NAME.ntrk`) so a file
|
|
73
|
+
stays traceable even when copied outside the repo. `trace` walks the chain by content hash, so it
|
|
74
|
+
still works after intermediate files are renamed or deleted.
|
|
75
|
+
|
|
76
|
+
## Development
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
uv sync # set up the environment
|
|
80
|
+
uv run pytest # run the tests
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Releasing (maintainer)
|
|
84
|
+
|
|
85
|
+
CI (`.github/workflows/ci.yml`) runs the test matrix on every push/PR and publishes to PyPI on
|
|
86
|
+
push to `main` via **trusted publishing** (OIDC, no API token), using `skip-existing` so an
|
|
87
|
+
unchanged version is a no-op. One-time PyPI setup (a *pending publisher*, since the project is not
|
|
88
|
+
yet on PyPI):
|
|
89
|
+
|
|
90
|
+
1. PyPI → *Your account* → *Publishing* → add a pending GitHub publisher:
|
|
91
|
+
- Project: `ntrk` · Owner: `tim-hudelmaier` · Repo: `ntrk`
|
|
92
|
+
- Workflow: `ci.yml` · Environment: `pypi`
|
|
93
|
+
2. In GitHub → *Settings → Environments*, create an environment named `pypi`.
|
|
94
|
+
3. Bump the version in `pyproject.toml` and push to `main` — the first successful publish creates
|
|
95
|
+
the project and converts the pending publisher.
|
|
96
|
+
|
|
97
|
+
## License
|
|
98
|
+
|
|
99
|
+
MIT
|
ntrk-0.1.0/README.md
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# ntrk
|
|
2
|
+
|
|
3
|
+
**Wrap a command. Later, ask how any file was made.**
|
|
4
|
+
|
|
5
|
+
ntrk records how each output was produced — the exact command, the git commit, and md5
|
|
6
|
+
hashes of every input and output — so you can point at a file weeks later and see its full
|
|
7
|
+
lineage, back to the raw inputs. Pure Python standard library, no dependencies, two commands.
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
uv tool install ntrk
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Commands: `nt` (alias `ntrk`). Run `uv tool update-shell` once if `nt` isn't on your `PATH`.
|
|
16
|
+
|
|
17
|
+
## Add to Claude
|
|
18
|
+
|
|
19
|
+
This repo is also a Claude Code plugin marketplace shipping the `ntrk` skill, so your coding
|
|
20
|
+
agent knows when to record runs and trace outputs:
|
|
21
|
+
|
|
22
|
+
```
|
|
23
|
+
/plugin marketplace add tim-hudelmaier/ntrk
|
|
24
|
+
/plugin install ntrk@ntrk
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Use
|
|
28
|
+
|
|
29
|
+
Two commands, no flags.
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
nt track python main.py -i in.csv -o out.csv # run it, record how
|
|
33
|
+
nt trace out.csv # how was this made?
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
`nt track` refuses to run on a dirty repo, so every result maps to a real commit. `nt trace`
|
|
37
|
+
prints one line per step — raw inputs first, the file you asked about last — so you can pipe it
|
|
38
|
+
through `head` / `grep` / `awk`. A `[modified]` marker appears if a file changed since it was made:
|
|
39
|
+
|
|
40
|
+
```console
|
|
41
|
+
$ nt trace out_final.csv
|
|
42
|
+
in.csv -> step1.py @ 9c4b2a1 -> out.csv
|
|
43
|
+
out.csv -> step2.py @ 3a1f9c8 -> out_final.csv
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Inputs and outputs are detected from the conventional flags
|
|
47
|
+
`-i -o --in --out --input --output` (repeated, space-, or comma-separated lists work). If your
|
|
48
|
+
scripts use other flag names, add a small `ntrk.toml`:
|
|
49
|
+
|
|
50
|
+
```toml
|
|
51
|
+
[flags]
|
|
52
|
+
inputs = ["--source"]
|
|
53
|
+
outputs = ["--dest"]
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## How it works
|
|
57
|
+
|
|
58
|
+
Each run is appended as one JSON line to `.ntrk/runs.jsonl` (commit it — it's your lineage
|
|
59
|
+
history), and each output also gets an invisible self-contained sidecar (`.NAME.ntrk`) so a file
|
|
60
|
+
stays traceable even when copied outside the repo. `trace` walks the chain by content hash, so it
|
|
61
|
+
still works after intermediate files are renamed or deleted.
|
|
62
|
+
|
|
63
|
+
## Development
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
uv sync # set up the environment
|
|
67
|
+
uv run pytest # run the tests
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Releasing (maintainer)
|
|
71
|
+
|
|
72
|
+
CI (`.github/workflows/ci.yml`) runs the test matrix on every push/PR and publishes to PyPI on
|
|
73
|
+
push to `main` via **trusted publishing** (OIDC, no API token), using `skip-existing` so an
|
|
74
|
+
unchanged version is a no-op. One-time PyPI setup (a *pending publisher*, since the project is not
|
|
75
|
+
yet on PyPI):
|
|
76
|
+
|
|
77
|
+
1. PyPI → *Your account* → *Publishing* → add a pending GitHub publisher:
|
|
78
|
+
- Project: `ntrk` · Owner: `tim-hudelmaier` · Repo: `ntrk`
|
|
79
|
+
- Workflow: `ci.yml` · Environment: `pypi`
|
|
80
|
+
2. In GitHub → *Settings → Environments*, create an environment named `pypi`.
|
|
81
|
+
3. Bump the version in `pyproject.toml` and push to `main` — the first successful publish creates
|
|
82
|
+
the project and converts the pending publisher.
|
|
83
|
+
|
|
84
|
+
## License
|
|
85
|
+
|
|
86
|
+
MIT
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "ntrk"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Wrap a command; later, ask how any file was made. A minimal experiment & lineage tracker."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
license = "MIT"
|
|
8
|
+
license-files = ["LICENSE"]
|
|
9
|
+
authors = [{ name = "tim-hudelmaier" }]
|
|
10
|
+
keywords = ["lineage", "provenance", "experiment", "reproducibility", "cli"]
|
|
11
|
+
dependencies = []
|
|
12
|
+
|
|
13
|
+
[project.urls]
|
|
14
|
+
Homepage = "https://github.com/tim-hudelmaier/ntrk"
|
|
15
|
+
Repository = "https://github.com/tim-hudelmaier/ntrk"
|
|
16
|
+
|
|
17
|
+
[project.scripts]
|
|
18
|
+
nt = "ntrk.cli:main"
|
|
19
|
+
ntrk = "ntrk.cli:main"
|
|
20
|
+
|
|
21
|
+
[build-system]
|
|
22
|
+
requires = ["uv_build>=0.11.3,<0.12"]
|
|
23
|
+
build-backend = "uv_build"
|
|
24
|
+
|
|
25
|
+
[dependency-groups]
|
|
26
|
+
dev = ["pytest>=8"]
|
|
27
|
+
|
|
28
|
+
[tool.pytest.ini_options]
|
|
29
|
+
testpaths = ["tests"]
|
|
30
|
+
pythonpath = ["src"]
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
"""ntrk CLI: two verbs, no options.
|
|
2
|
+
|
|
3
|
+
nt track <command...> run a command, record how each output was made
|
|
4
|
+
nt trace <file> print how a file was made, back to raw inputs
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import subprocess
|
|
8
|
+
import sys
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from . import config, gitutil, hashing, parse, record, store
|
|
12
|
+
|
|
13
|
+
USAGE = (
|
|
14
|
+
"usage: nt <track|trace> ...\n"
|
|
15
|
+
" nt track <command...> run a command and record its lineage\n"
|
|
16
|
+
" nt trace <file> show how a file was made\n"
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def main(argv=None):
    """Entry point for the ``nt`` / ``ntrk`` command.

    ``argv`` is the argument list without the program name; when it is
    ``None`` the arguments are taken from ``sys.argv[1:]``.

    Returns the exit status of the selected verb, or 2 (after printing the
    usage text to stderr) when no verb or an unknown verb is given.
    """
    args = list(sys.argv[1:]) if argv is None else list(argv)
    handlers = {"track": cmd_track, "trace": cmd_trace}
    handler = handlers.get(args[0]) if args else None
    if handler is None:
        sys.stderr.write(USAGE)
        return 2
    return handler(args[1:])
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# --- track -----------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
def _to_rel(root, given):
    """Map a path argument of the wrapped command onto the repository.

    ``given`` is interpreted relative to the current working directory.

    Returns ``(rel, abspath)`` where ``abspath`` is the resolved absolute
    path and ``rel`` is its posix-style path relative to ``root``, or
    ``None`` when the path does not live under ``root``.
    """
    absolute = Path.cwd().joinpath(given).resolve()
    try:
        inside = absolute.relative_to(Path(root).resolve())
    except ValueError:
        # relative_to() signals "not under root" with ValueError.
        return None, absolute
    return inside.as_posix(), absolute
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _collect(root, tokens, kind):
    """Fingerprint the files named by ``tokens`` that exist inside the repo.

    ``kind`` ("input" / "output") is only used in the warnings. A token that
    lies outside ``root``, does not exist, or cannot be read is reported on
    stderr and skipped.

    Returns a list of ``{"path", "md5", "size"}`` dicts, ``path`` being
    root-relative.
    """
    hashed = []
    for tok in tokens:
        rel_path, abs_path = _to_rel(root, tok)
        if rel_path is None:
            sys.stderr.write(f"nt: {kind} outside repo, not tracked: {tok}\n")
        elif not abs_path.exists():
            sys.stderr.write(f"nt: {kind} not found, not tracked: {tok}\n")
        else:
            try:
                digest, nbytes = hashing.md5_file(abs_path)
            except OSError as e:
                sys.stderr.write(f"nt: cannot read {kind} {tok}: {e}\n")
            else:
                hashed.append({"path": rel_path, "md5": digest, "size": nbytes})
    return hashed
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def cmd_track(rest):
    """Run ``rest`` as a command and record how its outputs were made.

    Steps: require a git repository with a clean tree and at least one
    commit, classify the command's tokens into inputs/outputs/script, hash
    the inputs, run the command, hash the outputs, then append one record to
    the log and write a sidecar next to every output.

    Returns:
        2   usage error (no command given)
        1   not a git repository, dirty tree, or no commits yet
        127 the command could not be found
        126 the command could not be started for another OS-level reason
        the command's own exit status when it is non-zero
        0   otherwise (including "ran fine but produced no tracked outputs")
    """
    if not rest:
        sys.stderr.write("usage: nt track <command...>\n")
        return 2

    try:
        root = gitutil.repo_root()
    except gitutil.GitError:
        sys.stderr.write("nt: not a git repository\n")
        return 1

    dirty = gitutil.dirty_paths(root)
    if dirty:
        sys.stderr.write(
            "nt: refusing to run — commit your changes first "
            "(every run maps to a commit):\n"
        )
        for p in dirty:
            sys.stderr.write(f"  {p}\n")
        return 1

    try:
        commit = gitutil.head_commit(root)
        branch_name = gitutil.branch(root)
    except gitutil.GitError:
        sys.stderr.write("nt: repository has no commits yet\n")
        return 1

    cfg = config.load(root)
    parsed = parse.parse(rest, cfg)

    # Inputs are hashed before the run so the record reflects what was read.
    inputs = _collect(root, parsed.inputs, "input")

    try:
        proc = subprocess.run(rest)
    except OSError as e:
        # Missing or non-executable command: report it instead of a
        # traceback, using the conventional shell exit codes.
        sys.stderr.write(f"nt: cannot run {rest[0]}: {e}\n")
        return 127 if isinstance(e, FileNotFoundError) else 126
    if proc.returncode != 0:
        return proc.returncode

    outputs = _collect(root, parsed.outputs, "output")
    if not outputs:
        sys.stderr.write("nt: no tracked outputs produced; nothing recorded.\n")
        return proc.returncode

    script_meta = None
    if parsed.script:
        rel, ap = _to_rel(root, parsed.script)
        # Placeholder used whenever the script token is not a readable
        # in-repo file (e.g. a program on PATH, or a directory that happens
        # to share the command's name).
        script_meta = {"path": parsed.script, "blob": None, "md5": None}
        if rel is not None and ap.is_file():
            try:
                md5, _ = hashing.md5_file(ap)
            except OSError:
                # The command already ran; an unreadable script must not
                # prevent the run from being recorded.
                pass
            else:
                script_meta = {
                    "path": rel,
                    "blob": gitutil.blob_hash(root, rel),
                    "md5": md5,
                }

    cwd_rel = _to_rel(root, ".")[0] or "."
    rec = record.build(commit, branch_name, cwd_rel, script_meta, rest, inputs, outputs)
    store.append(root, rec)
    for o in outputs:
        store.write_sidecar(root, o["path"], rec)

    sys.stderr.write(
        f"nt: recorded {rec['id'][:8]} ({len(outputs)} output(s)); "
        "commit .ntrk/runs.jsonl to keep it.\n"
    )
    return 0
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
# --- trace -----------------------------------------------------------------
|
|
131
|
+
|
|
132
|
+
def cmd_trace(rest):
    """Print the lineage of a single file, one step per line, on stdout.

    ``rest`` must contain exactly one path. Being outside a git repository
    is tolerated: the trace then runs with ``root`` set to ``None``.

    Returns 2 on a usage error, 1 when no lineage is found, 0 otherwise.
    """
    if len(rest) != 1:
        sys.stderr.write("usage: nt trace <file>\n")
        return 2
    (target,) = rest

    try:
        root = gitutil.repo_root()
    except gitutil.GitError:
        root = None

    lines = _trace_chain(root, target)
    if lines:
        sys.stdout.writelines(line + "\n" for line in lines)
        return 0
    sys.stderr.write(f"nt: no lineage found for {target}\n")
    return 1
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _trace_chain(root, target):
    """Build the formatted lineage lines for ``target``.

    Starting from the run that produced ``target``, every input is linked by
    md5 to the newest earlier run that emitted the same bytes, depth-first.
    Each (run id, output path) pair is visited once.

    Returns the steps formatted by ``_format_step`` in reverse discovery
    order (so the requested file comes last), or ``[]`` when no producing
    run is found.
    """
    records = store.read_records(root)
    first, out_rel = store.resolve(root, target)
    if first is None:
        return []

    position = {}
    for pos, entry in enumerate(records):
        position[entry.get("id")] = pos
    # A record that is not in the log (e.g. one read from a sidecar) is
    # treated as newer than everything in the log.
    start_idx = position.get(first.get("id"), len(records))

    seen = set()
    chain = []  # (record, output_rel), newest -> oldest

    def visit(run, produced, upper):
        run_id = run.get("id")
        marker = (run_id, produced)
        if marker in seen:
            return
        seen.add(marker)
        chain.append((run, produced))
        for consumed in run.get("inputs", []):
            found = store.producer_before(
                records, consumed.get("md5"), upper, exclude_id=run_id
            )
            if found[0] is not None:
                visit(*found)

    visit(first, out_rel, start_idx)
    return [_format_step(root, run, produced) for run, produced in reversed(chain)]
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _marker(root, path_rel, recorded_md5):
    """Return a status suffix comparing a file on disk with its record.

    Returns ``" [missing]"`` when ``root / path_rel`` does not exist,
    ``" [modified]"`` when its current md5 differs from ``recorded_md5``,
    and ``""`` when it matches, cannot be read, or any argument is ``None``.
    """
    if any(value is None for value in (root, path_rel, recorded_md5)):
        return ""
    location = Path(root) / path_rel
    if not location.exists():
        return " [missing]"
    try:
        current = hashing.md5_file(location)[0]
    except OSError:
        return ""
    if current == recorded_md5:
        return ""
    return " [modified]"
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _format_step(root, rec, output_rel):
    """Render one lineage step as ``inputs -> script @ commit -> output``.

    Inputs are comma-separated and, like the output, carry the suffix
    returned by ``_marker``. The commit is shortened to 7 characters.
    Missing *or null* fields fall back to ``"?"`` (paths, script) or an
    empty string (commit), so an incomplete record still prints.
    """
    ins = rec.get("inputs") or []
    if ins:
        in_str = ", ".join(
            # `or "?"` also covers a path that is present but null, which
            # `.get("path", "?")` would pass on as None and crash the `+`.
            (i.get("path") or "?") + _marker(root, i.get("path"), i.get("md5"))
            for i in ins
        )
    else:
        in_str = "(no tracked inputs)"
    script = (rec.get("script") or {}).get("path") or "?"
    commit = ((rec.get("git") or {}).get("commit") or "")[:7]
    out_str = (output_rel or "?") + _marker(root, output_rel, store.md5_for_output(rec, output_rel))
    return f"{in_str} -> {script} @ {commit} -> {out_str}"
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Built-in input/output flag convention, plus optional per-repo extensions.
|
|
2
|
+
|
|
3
|
+
ntrk knows the conventional flags out of the box. A repo only needs a
|
|
4
|
+
``ntrk.toml`` if its scripts use other flag names:
|
|
5
|
+
|
|
6
|
+
[flags]
|
|
7
|
+
inputs = ["--source"]
|
|
8
|
+
outputs = ["--dest"]
|
|
9
|
+
|
|
10
|
+
These are *added* to the built-ins (the defaults always keep working). There is
|
|
11
|
+
no other configuration: strict-on-dirty and md5 are not negotiable.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import tomllib
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
DEFAULT_INPUT_FLAGS = ("-i", "--in", "--input")
|
|
18
|
+
DEFAULT_OUTPUT_FLAGS = ("-o", "--out", "--output")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Config:
    """The flag names that mark a command's inputs and outputs."""

    def __init__(self, input_flags, output_flags):
        # Both collections are copied into fresh sets.
        self.input_flags = {*input_flags}
        self.output_flags = {*output_flags}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def load(root):
    """Return a Config: built-in flags merged with optional ``ntrk.toml``.

    ``root`` is the directory searched for ``ntrk.toml``. When the file
    exists, the ``inputs`` / ``outputs`` entries of its ``[flags]`` table are
    added to the built-in flags. Each entry may be a list of flag names or a
    single string. A file that is not valid TOML propagates the error raised
    by ``tomllib.load``.
    """

    def _flag_set(value):
        # A bare string is one flag; set("--source") would instead yield
        # its individual characters.
        return {value} if isinstance(value, str) else set(value)

    inputs = set(DEFAULT_INPUT_FLAGS)
    outputs = set(DEFAULT_OUTPUT_FLAGS)
    cfg = Path(root) / "ntrk.toml"
    if cfg.exists():
        with open(cfg, "rb") as f:
            data = tomllib.load(f)
        flags = data.get("flags", {})
        inputs |= _flag_set(flags.get("inputs", []))
        outputs |= _flag_set(flags.get("outputs", []))
    return Config(inputs, outputs)
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Thin, read-only git interface. ntrk never mutates git state."""
|
|
2
|
+
|
|
3
|
+
import subprocess
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class GitError(Exception):
    """Raised when a git command fails or git cannot be run at all."""


def _git(args, cwd):
    """Run ``git <args>`` in ``cwd`` and return its stdout as text.

    Raises:
        GitError: git exited non-zero (message is git's stderr, or a generic
            one when stderr is empty), or the process could not be started
            (git not installed, ``cwd`` missing, ...). Callers only catch
            GitError, so start-up failures are folded into it as well.
    """
    try:
        result = subprocess.run(
            ["git", *args], cwd=str(cwd), capture_output=True, text=True
        )
    except OSError as exc:
        raise GitError(f"cannot run git: {exc}") from exc
    if result.returncode != 0:
        raise GitError(result.stderr.strip() or f"git {' '.join(args)} failed")
    return result.stdout
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def repo_root(cwd="."):
    """Return the work-tree root reported by ``git rev-parse --show-toplevel``.

    The command is run in ``cwd``; a GitError from it (e.g. outside a
    repository) propagates.
    """
    toplevel = _git(["rev-parse", "--show-toplevel"], cwd).strip()
    return Path(toplevel)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def head_commit(root):
    """Return the output of ``git rev-parse HEAD`` (the HEAD commit sha).

    A GitError propagates, e.g. when the repository has no commits.
    """
    sha = _git(["rev-parse", "HEAD"], root)
    return sha.strip()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def branch(root):
    """Return the output of ``git rev-parse --abbrev-ref HEAD`` for ``root``."""
    ref_name = _git(["rev-parse", "--abbrev-ref", "HEAD"], root)
    return ref_name.strip()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def blob_hash(root, relpath):
    """Return the git object id of ``HEAD:<relpath>``.

    Returns ``None`` when git cannot resolve it (for example an untracked
    path, or a repository without commits).
    """
    spec = f"HEAD:{relpath}"
    try:
        raw = _git(["rev-parse", spec], root)
    except GitError:
        return None
    return raw.strip()
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _is_ntrk_file(path):
    """Tell whether ``path`` (posix, repo-relative) is one of ntrk's own files.

    Matches the ``.ntrk`` directory and anything beneath it, plus
    dot-prefixed ``.NAME.ntrk`` sidecars in any directory. A regular file
    such as ``model.ntrk`` does not match and therefore still counts toward
    a dirty tree.
    """
    if path == ".ntrk" or path.startswith(".ntrk/"):
        return True
    basename = path.rpartition("/")[2]
    return basename.startswith(".") and basename.endswith(".ntrk")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def dirty_paths(root):
    """List the paths git reports as changed, minus ntrk's own files.

    Parses ``git status --porcelain -z -uall``: NUL-separated fields avoid
    quoting/escaping of unusual paths, and ``-uall`` lists untracked files
    one by one so a sidecar inside a new directory can be filtered by name.
    For rename/copy entries the original path arrives as the following
    field and is skipped.
    """
    out = _git(["status", "--porcelain", "-z", "-uall"], root)
    fields = iter(out.split("\0"))
    changed = []
    for entry in fields:
        if not entry:
            continue
        # Entry layout: two status characters, a space, then the path.
        status, path = entry[:2], entry[3:]
        if "R" in status or "C" in status:
            next(fields, None)  # drop the rename/copy source path
        if not _is_ntrk_file(path):
            changed.append(path)
    return changed
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def is_clean(root):
    """Return True when ``dirty_paths(root)`` reports nothing."""
    return len(dirty_paths(root)) == 0
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""md5 file fingerprints. md5 is a content fingerprint, not tamper-evidence."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def md5_file(path):
|
|
8
|
+
"""Return ``(hexdigest, size_bytes)`` for ``path``, streamed.
|
|
9
|
+
|
|
10
|
+
Raises FileNotFoundError if the path does not exist.
|
|
11
|
+
"""
|
|
12
|
+
p = Path(path)
|
|
13
|
+
with open(p, "rb") as f:
|
|
14
|
+
digest = hashlib.file_digest(f, lambda: hashlib.md5(usedforsecurity=False))
|
|
15
|
+
return digest.hexdigest(), p.stat().st_size
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Classify a wrapped command's tokens into inputs, outputs, and the script.
|
|
2
|
+
|
|
3
|
+
An input/output flag is *greedy*: it consumes every following token up to the
|
|
4
|
+
next ``-``-prefixed token, and each consumed token is also split on commas. So
|
|
5
|
+
``-i a.csv b.csv,c.csv -o out1,out2`` -> inputs ``[a,b,c]``, outputs ``[out1,out2]``.
|
|
6
|
+
|
|
7
|
+
Accepted caveat: a filename containing a comma, or starting with ``-``, can't be
|
|
8
|
+
expressed.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
INTERPRETERS = {
|
|
12
|
+
"python", "python3", "python2", "Rscript", "bash", "sh",
|
|
13
|
+
"node", "ruby", "perl", "julia",
|
|
14
|
+
}
|
|
15
|
+
SOURCE_EXTS = (".py", ".R", ".r", ".sh", ".jl", ".rb", ".js", ".pl")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ParsedCommand:
    """Result of classifying a wrapped command's tokens."""

    def __init__(self, command, script, inputs, outputs):
        # command: list[str], verbatim; script: str or None
        self.command, self.script = command, script
        # inputs / outputs: list[str]
        self.inputs, self.outputs = inputs, outputs
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _role(flag, config):
    """Classify ``flag``: ``"in"``, ``"out"``, or ``None`` if it is neither.

    Input flags are checked first, so a flag listed in both sets is an input.
    """
    if flag in config.input_flags:
        role = "in"
    elif flag in config.output_flags:
        role = "out"
    else:
        role = None
    return role
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _split(token):
    """Split ``token`` on commas and drop the empty pieces."""
    return list(filter(None, token.split(",")))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def parse(argv, config):
    """Classify the tokens of a wrapped command.

    ``--flag=value`` is self-contained: the value is comma-split and nothing
    else is consumed. A bare input/output flag is greedy: it takes every
    following token up to the next ``-``-prefixed one, comma-splitting each.

    Returns a ParsedCommand holding a copy of ``argv``, the detected script,
    and the input and output lists.
    """
    buckets = {"in": [], "out": []}
    pos, total = 0, len(argv)
    while pos < total:
        name, sep, value = argv[pos].partition("=")
        pos += 1
        role = _role(name, config)
        if role is None:
            continue
        dest = buckets[role]
        if sep:
            dest.extend(_split(value))
            continue
        while pos < total and not argv[pos].startswith("-"):
            dest.extend(_split(argv[pos]))
            pos += 1
    inputs, outputs = buckets["in"], buckets["out"]
    classified = set(inputs) | set(outputs)
    return ParsedCommand(list(argv), _detect_script(argv, classified), inputs, outputs)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _is_interpreter(tok):
    """Tell whether ``tok`` names an interpreter.

    Only the part after the last ``/`` is examined: it must be listed in
    INTERPRETERS or start with ``python`` or ``Rscript``.
    """
    program = tok.rpartition("/")[2]
    if program in INTERPRETERS:
        return True
    return program.startswith(("python", "Rscript"))
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _detect_script(argv, classified=()):
    """Pick the token of ``argv`` that most likely names the script.

    Tokens in ``classified`` (already identified as inputs or outputs) and
    tokens starting with ``-`` are never chosen by the three rules below.
    The rules are tried in order:

    1. the first token ending in a known source extension;
    2. the first token after the first recognised interpreter;
    3. the first remaining token.

    Falls back to ``argv[0]``, or ``None`` for an empty ``argv``.
    """
    classified = set(classified)

    def eligible(tok):
        # Flags are excluded in every rule; otherwise `--in=prep.py` or
        # `--config=x.py` would be reported as the script by rule 1.
        return not tok.startswith("-") and tok not in classified

    # 1. first eligible token ending in a known source extension
    for tok in argv:
        if eligible(tok) and tok.endswith(SOURCE_EXTS):
            return tok
    # 2. first eligible token after a known interpreter
    for idx, tok in enumerate(argv):
        if _is_interpreter(tok):
            for nxt in argv[idx + 1:]:
                if eligible(nxt):
                    return nxt
            break  # only the first interpreter is considered
    # 3. fall back to the first eligible token
    for tok in argv:
        if eligible(tok):
            return tok
    return argv[0] if argv else None
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""The lineage record schema (v1)."""
|
|
2
|
+
|
|
3
|
+
import uuid
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
|
|
6
|
+
SCHEMA_VERSION = 1
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def now_iso():
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``."""
    stamp = datetime.now(tz=timezone.utc)
    return f"{stamp:%Y-%m-%dT%H:%M:%S}Z"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def new_id():
    """Return a fresh random (uuid4) identifier as 32 lowercase hex digits."""
    return "%032x" % uuid.uuid4().int
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def build(commit, branch_name, cwd_rel, script, command, inputs, outputs,
          ts=None, rid=None):
    """Assemble the dict describing one run.

    ``script`` is ``{path, blob, md5}`` or None; ``inputs`` and ``outputs``
    are lists of ``{path, md5, size}``. ``command`` is copied into a new
    list. A falsy ``rid`` / ``ts`` is replaced by a freshly generated id /
    the current timestamp.
    """
    rec = {"v": SCHEMA_VERSION}
    rec["id"] = rid or new_id()
    rec["ts"] = ts or now_iso()
    rec["git"] = {"commit": commit, "branch": branch_name}
    rec["cwd"] = cwd_rel
    rec["script"] = script
    rec["command"] = list(command)
    rec["inputs"] = inputs
    rec["outputs"] = outputs
    return rec
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""Storage: the append-only central log plus per-output sidecars.
|
|
2
|
+
|
|
3
|
+
The log (``.ntrk/runs.jsonl``) is the source of truth — committable,
|
|
4
|
+
greppable, rename-robust by md5. Each output also gets an invisible
|
|
5
|
+
self-contained sidecar (``.NAME.ntrk``) so a file can be traced even when copied
|
|
6
|
+
outside the repo.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import fcntl
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from . import hashing
|
|
15
|
+
|
|
16
|
+
STORE_DIR = ".ntrk"
|
|
17
|
+
LOG_NAME = "runs.jsonl"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# --- central log ----------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
def store_dir(root):
    """Return the path of the store directory (STORE_DIR) under ``root``."""
    return Path(root).joinpath(STORE_DIR)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def log_path(root):
    """Return the path of the run log (LOG_NAME) inside the store directory."""
    return store_dir(root).joinpath(LOG_NAME)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def ensure_store(root):
    """Make sure the store directory, the log and its .gitattributes exist.

    Existing files are left untouched. A new ``.gitattributes`` marks the
    log with ``merge=union``.

    Returns the path of the log file.
    """
    folder = store_dir(root)
    folder.mkdir(exist_ok=True)

    log_file = folder / LOG_NAME
    if not log_file.exists():
        log_file.touch()

    attrs = folder / ".gitattributes"
    if not attrs.exists():
        attrs.write_text(f"{LOG_NAME} merge=union\n", encoding="utf-8")

    return log_file
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def append(root, rec):
    """Append ``rec`` to the log as one JSON line.

    The write, flush and fsync happen while an exclusive ``flock`` is held
    on the log file; the lock is released even if writing fails.
    """
    ensure_store(root)
    payload = json.dumps(rec, sort_keys=True, ensure_ascii=False)
    with open(log_path(root), "a", encoding="utf-8") as handle:
        fd = handle.fileno()
        fcntl.flock(fd, fcntl.LOCK_EX)
        try:
            handle.write(payload + "\n")
            handle.flush()
            os.fsync(fd)
        finally:
            fcntl.flock(fd, fcntl.LOCK_UN)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def read_records(root):
    """Load every run record from the log, oldest first.

    Returns ``[]`` when ``root`` is None or the log does not exist. Blank
    lines, lines that are not valid JSON, and JSON values that are not
    objects are skipped: callers treat every record as a dict, the same
    guarantee ``read_sidecar`` gives for sidecars.
    """
    if root is None:
        return []
    log = log_path(root)
    if not log.exists():
        return []
    records = []
    with open(log, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError:
                continue
            # e.g. a stray `null` or `[]` line would otherwise crash the
            # `.get(...)` calls made on every record downstream.
            if isinstance(rec, dict):
                records.append(rec)
    return records
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# --- sidecars --------------------------------------------------------------
|
|
78
|
+
|
|
79
|
+
def sidecar_path(output_path):
    """Return the sidecar path for ``output_path``: ``.NAME.ntrk`` in the same directory."""
    target = Path(output_path)
    hidden_name = f".{target.name}.ntrk"
    return target.with_name(hidden_name)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def write_sidecar(root, output_rel, rec):
    """Write the sidecar for the output at ``root / output_rel``.

    The sidecar is a copy of ``rec`` with an added ``sidecar_for`` key,
    serialised as indented JSON. It is written to a ``.tmp`` sibling first
    and then moved into place with ``os.replace``.
    """
    final = sidecar_path(Path(root) / output_rel)
    scratch = final.with_name(final.name + ".tmp")
    payload = {**rec, "sidecar_for": output_rel}
    text = json.dumps(payload, sort_keys=True, ensure_ascii=False, indent=2)
    scratch.write_text(text, encoding="utf-8")
    os.replace(scratch, final)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def read_sidecar(target):
    """Load the sidecar that sits next to ``target``.

    Returns the decoded dict, or ``None`` when there is no sidecar, it cannot
    be read, it is not valid UTF-8 JSON, or its top-level value is not an
    object.
    """
    side = sidecar_path(Path(target))
    if not side.exists():
        return None
    try:
        data = json.loads(side.read_text(encoding="utf-8"))
    except (ValueError, OSError):
        # ValueError covers json.JSONDecodeError and also the
        # UnicodeDecodeError raised for a sidecar that is not UTF-8.
        return None
    return data if isinstance(data, dict) else None
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# --- resolution & lineage links -------------------------------------------
|
|
108
|
+
|
|
109
|
+
def md5_for_output(rec, output_rel):
    """Look up the md5 recorded in ``rec`` for the output ``output_rel``.

    The first output whose path equals ``output_rel`` wins. When no path was
    asked for (``output_rel`` is None) and the record has exactly one output,
    that output's md5 is returned. Otherwise the result is ``None``.
    """
    outs = rec.get("outputs", [])
    match = next((o for o in outs if o.get("path") == output_rel), None)
    if match is not None:
        return match.get("md5")
    # only guess the sole output when no specific path was asked for
    if output_rel is None and len(outs) == 1:
        return outs[0].get("md5")
    return None
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def producer_of(records, md5):
    """Find the newest record that lists an output with digest ``md5``.

    Returns ``(record, output_path)``, or ``(None, None)`` when no record
    matches. ``records`` is expected oldest-first.
    """
    hits = (
        (rec, out.get("path"))
        for rec in reversed(records)
        for out in rec.get("outputs", [])
        if out.get("md5") == md5
    )
    return next(hits, (None, None))
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def producer_before(records, md5, before_idx, exclude_id=None):
    """Find the newest record at an index below ``before_idx`` that lists an
    output with digest ``md5``, ignoring records whose id is ``exclude_id``.

    Returns ``(record, output_path, index)`` or ``(None, None, None)``.

    The trace walk relies on the index bound so an input is only ever linked
    to an earlier run, never to the consuming run or a later one that
    re-emits the same bytes.
    """
    upper = min(before_idx, len(records))
    for pos in reversed(range(upper)):
        candidate = records[pos]
        if candidate.get("id") == exclude_id:
            continue
        hit = next(
            (out for out in candidate.get("outputs", []) if out.get("md5") == md5),
            None,
        )
        if hit is not None:
            return candidate, hit.get("path"), pos
    return None, None, None
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _rel_to_root(root, target):
    """Express ``target`` as a posix path relative to ``root``.

    Both paths are resolved first. Returns ``None`` when ``root`` is None or
    ``target`` does not lie under it.
    """
    if root is None:
        return None
    resolved = Path(target).resolve()
    try:
        inside = resolved.relative_to(Path(root).resolve())
    except ValueError:
        return None
    return inside.as_posix()
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def resolve(root, target):
    """Locate the run that produced ``target`` -> ``(record, output_rel)``.

    Lookup order:

    1. a sidecar next to the file (its ``sidecar_for`` value is returned as
       the output path);
    2. newest log record with the same root-relative path AND the same md5;
    3. newest log record with the same md5 (the file may have been renamed);
    4. newest log record with the same path (the file may have changed).

    Returns ``(None, None)`` when nothing matches.
    """
    target_path = Path(target)

    sidecar = read_sidecar(target_path)
    if sidecar is not None:
        return sidecar, sidecar.get("sidecar_for")

    records = read_records(root)
    rel = _rel_to_root(root, target_path)

    # Current fingerprint of the file; stays None if it is absent or unreadable.
    digest = None
    if target_path.exists():
        try:
            digest = hashing.md5_file(target_path)[0]
        except OSError:
            pass

    def newest(matches):
        """Newest record owning an output accepted by ``matches``, else None."""
        for rec in reversed(records):
            for out in rec.get("outputs", []):
                if matches(out):
                    return rec
        return None

    # exact: same recorded path AND same content (most specific)
    if rel is not None and digest is not None:
        hit = newest(lambda o: o.get("path") == rel and o.get("md5") == digest)
        if hit is not None:
            return hit, rel
    # content match (survives renames of unchanged files)
    if digest is not None:
        hit, hit_path = producer_of(records, digest)
        if hit is not None:
            return hit, hit_path
    # path match (file changed since it was produced)
    if rel is not None:
        hit = newest(lambda o: o.get("path") == rel)
        if hit is not None:
            return hit, rel
    return None, None
|