dorian-vwp 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dorian/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """dorian — Validity Warrant Protocol (VWP) reference implementation.
2
+
3
+ PyPI distribution: `dorian-vwp`; import package: `dorian`; CLI: `dorian`.
4
+ """
5
+
6
+ __version__ = "1.0.0"
dorian/__main__.py ADDED
@@ -0,0 +1,10 @@
1
+ """Enable `python -m dorian ...` as an alias for the `dorian` console script."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+
7
+ from dorian.cli import main
8
+
9
if __name__ == "__main__":
    # Only when executed as a script/module (not on import): run the CLI and
    # hand main()'s return value to sys.exit so it becomes the exit status.
    sys.exit(main())
@@ -0,0 +1,30 @@
1
+ """Isolated regex matcher for C3 ``regex:`` checks.
2
+
3
+ Run in a separate (spawned) process so a catastrophic-backtracking pattern can be
4
+ stopped by a wall-clock timeout. A thread cannot do this: a single C-level
5
+ ``re.search()`` never yields the GIL back to the interpreter, so neither a timer
6
+ thread nor a Python-level signal handler runs until the match returns. A child
7
+ process, by contrast, can be killed by the OS mid-match (SIGTERM/SIGKILL).
8
+
9
+ Kept deliberately import-light (stdlib ``re`` only, no dorian imports) so the
10
+ ``spawn`` start method — which re-imports this module in the child — stays cheap.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import re
16
+
17
# Outcome codes stored in result.value (a signed char, typecode "b"). The parent
# pre-loads -1 and only the worker ever replaces it, so a surviving -1 means the
# process was killed before it could report.
MATCH = 1
NO_MATCH = 0
WORKER_ERROR = -2


def search_worker(pattern: str, flags: int, text: str, result) -> None:
    """Run ``re.search(pattern, text, flags)`` and report through ``result.value``.

    Writes MATCH when the pattern is found and NO_MATCH when it is not. Any
    unexpected exception is reported as WORKER_ERROR instead of propagating
    (defensive only: the parent has already compiled the pattern).
    """
    try:
        if re.search(pattern, text, flags):
            result.value = MATCH
        else:
            result.value = NO_MATCH
    except Exception:
        result.value = WORKER_ERROR
dorian/bindings.py ADDED
@@ -0,0 +1,366 @@
1
+ """Binding-quality diagnostics: surface weak or suspicious claim->file bindings.
2
+
3
+ The v0.0 benchmark's one recall miss was a claim whose checker watched file A
4
+ while the breaking commit changed file B — the claim text mentioned an
5
+ identifier that also lived in B, but the binding never covered B. `analyze`
6
+ flags that shape (plus other weak-binding smells) without auto-fixing anything;
7
+ `dorian bindings` is a diagnostic, never a gate.
8
+
9
+ Content-free invariant: findings carry repo-relative file PATHS only — never
10
+ matched lines or any other file content.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import fnmatch
16
+ import re
17
+ import shlex
18
+ from collections.abc import Mapping, Sequence
19
+ from pathlib import Path, PurePosixPath
20
+
21
+ from dorian import gitio
22
+ from dorian.model import Claim, Warrant
23
+
24
# --- scan / report limits ---------------------------------------------------------
_MAX_FILE_BYTES = 1 << 20  # tracked files larger than 1 MiB are skipped
_MAX_CANDIDATES = 8  # candidate tokens extracted per claim (bounds scan WORK, not just report)
_MAX_TOKENS = 5  # reported mention tokens per claim
_MAX_FILES = 5  # reported unwatched files per token
_MIN_LITERAL = 6  # string:/shell-grep operands shorter than this are suspect
_SNIFF_BYTES = 8192  # null-byte binary sniff window
_GLOB_CHARS = ("*", "?", "[")  # mirrors store.claims_for_paths

# --- token extraction patterns (applied to claim TEXT, never to file content) -----
# A single-line `backtick span`; group 1 is the text between the backticks.
_BACKTICK_RE = re.compile(r"`([^`\n]+)`")
_PATH_RE = re.compile(r"\b(?:[\w.-]+/)+[\w.-]*\.\w+\b")  # has '/' + dot-extension
# Letter-led word containing at least one underscore (snake_case).
_SNAKE_RE = re.compile(r"\b[A-Za-z][A-Za-z0-9]*_[A-Za-z0-9_]+\b")
# A leading hump followed by one or more Capitalised humps (camelCase / CamelCase).
_CAMEL_RE = re.compile(r"\b[A-Za-z][a-z0-9]+(?:[A-Z][a-z0-9]+)+\b")
_MIN_IDENT = 4  # snake/Camel identifiers shorter than this are noise
_IDENT_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]*$")  # a single identifier span

# Bare common words inside backticks are markup ("the `config` file", "a `list`"),
# not symbol references; binding one to a same-named one-definer symbol is a false
# BROKEN / false restricted-scope-refusal risk. snake_case / CamelCase spans are still
# admitted (real identifiers); this set blocks only bare common words.
# Entries are lower-case: _backtick_binds compares against tok.lower().
_BACKTICK_STOPWORDS = frozenset(
    {
        "list",
        "run",
        "get",
        "set",
        "put",
        "post",
        "config",
        "class",
        "function",
        "token",
        "route",
        "handler",
    }
)

# Executable basenames that _grep_pattern treats as a grep-style command.
_GREP_NAMES = frozenset({"grep", "egrep", "fgrep", "rg"})
61
+
62
+
63
def analyze(repo: Path, artifact_uri: str) -> list[dict]:
    """File-backed binding diagnostics for one warranted artifact.

    Loads the `<artifact_uri>.warrant` sidecar from the resolved repo, maps each
    read-set entry id to its uri, and hands the warrant's claims to
    analyze_candidate (the same logic the seal-time --binding-gate runs in
    memory). Returns one dict per claim, in claim order:
    {claim_id, text, watch, flags, mentions}. Flags (fixed order): 'unbacked' |
    'single-file' | 'short-literal' | 'ambiguous-mention' | 'trigger-only-symbol' |
    'unwatched-mention'.
    """
    root = repo.resolve()
    sidecar = root / (artifact_uri + ".warrant")
    warrant = Warrant.load(sidecar)

    uri_by_entry_id = {}
    for entry in warrant.read_set:
        uri_by_entry_id[entry.id] = entry.uri

    return analyze_candidate(
        root,
        artifact_uri=artifact_uri,
        claims=list(warrant.claims),
        entry_uris=uri_by_entry_id,
    )
75
+
76
+
77
+ def _claim_input_sidecars(artifact_uri: str) -> set[str]:
78
+ """Likely human/agent authoring inputs for a warrant.
79
+
80
+ These files are not evidence for the claim; scanning them would make a
81
+ committed claims file self-referentially satisfy or weaken its own binding.
82
+ """
83
+ p = PurePosixPath(artifact_uri)
84
+ out = {
85
+ (p.parent / "claims.json").as_posix(),
86
+ p.with_suffix(".claims.json").as_posix(),
87
+ f"{artifact_uri}.claims.json",
88
+ }
89
+ return {x for x in out if x and x != "."}
90
+
91
+
92
def analyze_candidate(
    repo: Path,
    *,
    artifact_uri: str,
    claims: Sequence[Claim],
    entry_uris: Mapping[str, str],
) -> list[dict]:
    """The diagnostic core, over CANDIDATE data (the claims plus their read-set
    entry uris) rather than a written sidecar — so it can run at seal time, before
    any `.warrant` is dumped, for the opt-in --binding-gate. `analyze` is the
    file-backed wrapper around this. Output shape and flag set are identical.

    Returns one dict per claim, in input order, with keys `claim_id`, `text`,
    `watch` (sorted binding cover), `flags` (appended in the fixed order
    'unbacked' | 'single-file', 'short-literal', 'ambiguous-mention',
    'trigger-only-symbol', 'unwatched-mention') and `mentions` (at most
    _MAX_TOKENS entries of {token, unwatched_files}, each file list capped at
    _MAX_FILES)."""
    from dorian import symbol_index  # lazy: symbol_index imports _tokens from here (cycle)

    repo = repo.resolve()
    claims = list(claims)  # materialize once: the sequence is iterated several times
    # NOTE(review): looks like claim id -> {symbol: candidate files}; confirm in symbol_index.
    ambiguous = symbol_index.ambiguous_symbol_mentions(repo, claims)

    claim_tokens = {c.id: _tokens(c.text) for c in claims}
    # Union of every claim's tokens, so the repo is scanned once for all claims.
    all_tokens = sorted({t for toks in claim_tokens.values() for t in toks})
    tracked = gitio.ls_files(repo) if all_tokens else []  # no tokens: skip the git listing
    # Never scan the artifact itself or its likely claims-authoring inputs.
    skip = {artifact_uri, *_claim_input_sidecars(artifact_uri)}
    hits = _scan_files(repo, tracked, all_tokens, skip=skip)

    diags: list[dict] = []
    for claim in claims:
        cover = _watch_support(claim, entry_uris)
        flags: list[str] = []
        if not claim.checkers:
            flags.append("unbacked")
        elif len({w for spec in claim.checkers for w in spec.watch}) == 1:
            # exactly one distinct watch path across all of the claim's checkers
            flags.append("single-file")
        if _short_literal(claim):
            flags.append("short-literal")
        amb = ambiguous.get(claim.id, {})
        # Flag when some ambiguous mention has NONE of its files inside the cover.
        if any(not any(_covered(f, cover) for f in files) for files in amb.values()):
            flags.append("ambiguous-mention")
        named = _checker_named_files(claim, entry_uris)
        # A load-bearing claim watching a path that no checker program names.
        if claim.load_bearing and any(
            w not in named for spec in claim.checkers for w in spec.watch
        ):
            flags.append("trigger-only-symbol")
        mentions: list[dict] = []
        for tok in claim_tokens[claim.id]:
            if len(mentions) == _MAX_TOKENS:
                break
            # Files containing the token that the cover does not reach, capped for the report.
            unwatched = [f for f in hits[tok] if not _covered(f, cover)][:_MAX_FILES]
            if unwatched:
                mentions.append({"token": tok, "unwatched_files": unwatched})
        if mentions:
            flags.append("unwatched-mention")
        diags.append(
            {
                "claim_id": claim.id,
                "text": claim.text,
                "watch": sorted(cover),
                "flags": flags,
                "mentions": mentions,
            }
        )
    return diags
152
+
153
+
154
# --- opt-in weak-binding gate policy (seal-time review only; never trust state) ----

# Modes of the --binding-gate option (e.g. --binding-gate=fail).
GATE_MODES = ("off", "warn", "fail")

# 'single-file' is the EXPECTED shape of an honest one-checker C3 path/symbol/regex
# claim (the launch-train Dorian-on-Dorian warrant carries five), so it is warn-only —
# never a default seal refusal. The rest are weak-binding smells worth blocking a
# strict review gate. Weak binding is a false-CONFIDENCE risk, never proof a claim is
# false: the gate maps to the seal-refused path (exit 4), never to a claim/trust state.
# This is every flag analyze_candidate can emit except 'single-file'.
HIGH_RISK_FLAGS = frozenset(
    {"unbacked", "short-literal", "ambiguous-mention", "trigger-only-symbol", "unwatched-mention"}
)
166
+
167
+
168
def blocking_findings(diags: list[dict]) -> list[dict]:
    """Select the diagnostics that --binding-gate=fail refuses on.

    A diagnostic is blocking when at least one of its flags is in
    HIGH_RISK_FLAGS. A claim flagged only 'single-file' is never blocking, so
    honest one-checker C3 warrants still seal under `fail`. Input order is kept.
    """
    blocking: list[dict] = []
    for diag in diags:
        for flag in diag["flags"]:
            if flag in HIGH_RISK_FLAGS:
                blocking.append(diag)
                break  # one high-risk flag is enough
    return blocking
173
+
174
+
175
def weak_binding_lines(diags: list[dict]) -> list[str]:
    """Render --binding-gate output: one deterministic line per FLAGGED claim.

    Each line holds the claim id, its flags, its watch paths ('-' when there
    are none) and one `unwatched[token]=paths` field per mention. Claims with
    no flags produce no line. Content-free: only repo-relative paths,
    claim-text tokens, flags and claim ids appear — never a matched line or any
    file content.
    """

    def render(diag: dict) -> str:
        fields = [
            f"claim {diag['claim_id']!r}",
            f"flags={','.join(diag['flags'])}",
            f"watch={','.join(diag['watch']) or '-'}",
        ]
        fields.extend(
            f"unwatched[{mention['token']}]={','.join(mention['unwatched_files'])}"
            for mention in diag["mentions"]
        )
        return " ".join(fields)

    return [render(diag) for diag in diags if diag["flags"]]
193
+
194
+
195
def _checker_named_files(claim: Claim, entry_uris: Mapping[str, str]) -> set[str]:
    """The files a claim's checker PROGRAMS name (the truth they verify), independent of
    symbol-definer watch paths added at verify time. A watch path NOT in this set is a
    re-check TRIGGER that no checker exercises — the binding fix's trigger != truth gap,
    which the 'trigger-only-symbol' flag surfaces.

    `entry_uris` maps read-set entry ids to uris. It is only read (via `.get`), so any
    Mapping is accepted — matching what `analyze_candidate` passes in. Empty names are
    dropped from the result."""
    # lazy: reuse seal's canonical C3 file-operand form set and C5 path grammar
    from dorian.seal import _C3_FILE_OPERAND_FORMS, _c5_data_paths

    named: set[str] = set()
    for spec in claim.checkers:
        # "<prefix>:<rest>"; with no ':' the whole program lands in prefix and rest is "".
        prefix, _, rest = spec.program.partition(":")
        if spec.type == "C1":
            # C1 programs are looked up as read-set entry ids; unknown ids name nothing.
            uri = entry_uris.get(spec.program)
            if uri:
                named.add(uri)
        elif spec.type == "C3":
            # File-operand forms are "<file>::<operand>"; other forms use rest as-is.
            named.add(rest.partition("::")[0] if prefix in _C3_FILE_OPERAND_FORMS else rest)
        elif spec.type == "C4" and prefix == "pytest":
            named.add(rest.partition("::")[0].strip())  # parity with seal._derive_watch
        elif spec.type == "C5":
            # typed C5 derives its data path; a shell checker derives none, so its
            # EXPLICIT watch is what it exercises (else a load-bearing shell claim
            # gets a spurious 'trigger-only-symbol' flag).
            named.update(_c5_data_paths(prefix, rest) or spec.watch)
    return {f for f in named if f}
220
+
221
+
222
def _backtick_binds(tok: str) -> bool:
    """Decide whether a backtick span counts as a candidate identifier.

    It must be one identifier-shaped token of at least _MIN_IDENT characters.
    snake_case / CamelCase spans are always accepted (real identifiers); any
    other span is accepted unless its lower-cased form is a bare common word
    ('`config`', '`token`'), which is markup and a false-positive risk to bind.
    """
    if len(tok) >= _MIN_IDENT and _IDENT_RE.match(tok):
        looks_cased = any(rx.fullmatch(tok) for rx in (_SNAKE_RE, _CAMEL_RE))
        return looks_cased or tok.lower() not in _BACKTICK_STOPWORDS
    return False
232
+
233
+
234
def _tokens(text: str) -> list[str]:
    """Extract candidate tokens from claim text.

    Classes are visited in a fixed order — backtick spans, path-like tokens,
    snake_case identifiers, CamelCase identifiers — and each token is kept at
    its first appearance only. The result is cut to the first _MAX_CANDIDATES
    tokens; that cap bounds the repo scan itself, so a token-stuffed claim
    cannot blow up analyze().
    """
    ordered: dict[str, None] = {}  # insertion-ordered set of accepted tokens

    def remember(raw: str) -> None:
        cleaned = raw.strip()
        if cleaned:
            ordered.setdefault(cleaned, None)

    for match in _BACKTICK_RE.finditer(text):
        inner = match[1].strip()
        if _backtick_binds(inner):
            remember(inner)

    for match in _PATH_RE.finditer(text):
        remember(match[0])

    for pattern in (_SNAKE_RE, _CAMEL_RE):
        for match in pattern.finditer(text):
            ident = match[0]
            if len(ident) >= _MIN_IDENT:
                remember(ident)

    return list(ordered)[:_MAX_CANDIDATES]
259
+
260
+
261
# ASCII identifier characters — the same class as the [A-Za-z0-9_] lookarounds in
# _combined_pattern; _scan_files uses it to decide which prefix tokens a hit implies.
_WORD_CHARS = frozenset("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_")
262
+
263
+
264
+ def _combined_pattern(tokens: list[str]) -> re.Pattern[str]:
265
+ """One alternation over all tokens, found via a zero-width lookahead so every
266
+ start position is tested independently (overlaps cannot hide a token).
267
+ Longest-first ordering so a short token never shadows a longer one that
268
+ starts at the same position."""
269
+ ordered = sorted(tokens, key=lambda t: (-len(t), t))
270
+ alts = "|".join(re.escape(t) + r"(?![A-Za-z0-9_])" for t in ordered)
271
+ return re.compile(r"(?<![A-Za-z0-9_])(?=(" + alts + r"))")
272
+
273
+
274
def _scan_files(
    repo: Path, paths: list[str], tokens: list[str], skip: set[str]
) -> dict[str, list[str]]:
    """token -> tracked files with a whole-word match, in `paths` order. Each
    file is searched ONCE with a combined alternation pattern (not once per
    token). Skips the artifact (`skip`), *.warrant sidecars, binaries
    (null-byte sniff), oversized and unreadable files. File content never
    leaves this function.

    Every token gets a key in the result, even with no matches. `tokens` is
    assumed to hold no duplicates: the early exit compares set size to its length."""
    hits: dict[str, list[str]] = {t: [] for t in tokens}
    if not tokens:
        return hits
    combined = _combined_pattern(tokens)
    # the one residual shadow: tokens starting at the SAME position, where the
    # longer one wins the alternation — but then the shorter is a prefix of the
    # longer with a non-word boundary char, so finding B always implies A
    implied = {
        # a != b plus startswith guarantees len(b) > len(a), so b[len(a)] is in range
        b: [a for a in tokens if a != b and b.startswith(a) and b[len(a)] not in _WORD_CHARS]
        for b in tokens
    }
    for rel in paths:
        if rel in skip or rel.endswith(".warrant"):
            continue
        p = repo / rel
        try:
            # size is checked before reading, so oversized files are never loaded
            if not p.is_file() or p.stat().st_size > _MAX_FILE_BYTES:
                continue
            data = p.read_bytes()
        except OSError:
            continue  # vanished or unreadable: diagnostics never hard-fail on a file
        if b"\0" in data[:_SNIFF_BYTES]:
            continue  # a NUL byte in the sniff window: treated as binary
        # lossy decode: undecodable bytes are replaced rather than failing the scan
        text = data.decode("utf-8", errors="replace")
        found: set[str] = set()
        for m in combined.finditer(text):
            tok = m.group(1)
            if tok in found:
                continue
            found.add(tok)
            found.update(implied[tok])
            if len(found) == len(tokens):
                break  # every token already seen in this file: stop scanning it
        # iterate `tokens` (not `found`) so per-file appends follow a fixed order
        for tok in tokens:
            if tok in found:
                hits[tok].append(rel)
    return hits
319
+
320
+
321
+ def _watch_support(claim: Claim, entry_uris: dict[str, str]) -> set[str]:
322
+ """The claim's binding cover: checker watch paths/globs + support entry uris."""
323
+ cover = {w for spec in claim.checkers for w in spec.watch}
324
+ cover.update(uri for sid in claim.supports if (uri := entry_uris.get(sid)))
325
+ return cover
326
+
327
+
328
+ def _covered(path: str, cover: set[str]) -> bool:
329
+ if path in cover:
330
+ return True
331
+ return any(any(ch in pat for ch in _GLOB_CHARS) and fnmatch.fnmatch(path, pat) for pat in cover)
332
+
333
+
334
+ def _short_literal(claim: Claim) -> bool:
335
+ """Any string:/shell-grep program whose literal/pattern operand is < 6 chars
336
+ (over-tight or trivially matchable)."""
337
+ for spec in claim.checkers:
338
+ prefix, _, rest = spec.program.partition(":")
339
+ if prefix == "string":
340
+ _, sep, literal = rest.partition("::")
341
+ if sep and literal and len(literal) < _MIN_LITERAL:
342
+ return True
343
+ elif prefix == "shell":
344
+ pattern = _grep_pattern(rest)
345
+ if pattern is not None and len(pattern) < _MIN_LITERAL:
346
+ return True
347
+ return False
348
+
349
+
350
def _grep_pattern(cmd: str) -> str | None:
    """The pattern operand of a shell grep command; None when the command is
    not a grep (or is unparseable — diagnostics never guess).

    Recognised spellings of the pattern, checked left to right after the
    command name:
      - `-e PAT` / `--regexp PAT` / `-- PAT`: the following argument
        (None if there is none);
      - `--regexp=PAT` / `-ePAT`: the value attached to the option;
      - otherwise the first argument that does not start with '-'.
    Any other option is skipped.
    """
    try:
        tokens = shlex.split(cmd)
    except ValueError:
        return None  # e.g. unbalanced quotes
    # Compare the executable's basename so '/usr/bin/grep' counts as grep.
    if not tokens or Path(tokens[0]).name not in _GREP_NAMES:
        return None
    it = iter(tokens[1:])
    for tok in it:
        if tok == "--" or tok in ("-e", "--regexp"):
            return next(it, None)
        # Attached-value forms: without these the option would be skipped as a
        # plain flag and the FILE operand would be reported as the pattern.
        if tok.startswith("--regexp="):
            return tok[len("--regexp=") :]
        if tok.startswith("-e") and not tok.startswith("--"):
            return tok[2:]
        if tok.startswith("-"):
            continue
        return tok
    return None
dorian/blast.py ADDED
@@ -0,0 +1,119 @@
1
+ """Blast radius: downstream warrants reachable from a path or warrant.
2
+
3
+ Walks the store's `derives` table, whose rows are (from_warrant, to_warrant) =
4
+ (downstream child, upstream parent) — `store.upsert_warrant` inserts one row per
5
+ `Warrant.derives_from` parent. "Who depends on the seed" therefore selects
6
+ from_warrant where to_warrant is in the frontier. The table is rebuilt from
7
+ sidecars by `dorian sync`, so blast output is a pure index query: deterministic
8
+ and reconstructible (no local-history rows involved).
9
+
10
+ Supersede lineage: a re-seal with --supersede replaces the upstream sidecar, so
11
+ a downstream `derives_from` keeps the now-dead predecessor id. Each frontier
12
+ warrant is therefore expanded with the predecessor ids reachable through the
13
+ stored `supersedes` edges (current warrants only — replaced sidecars are gone,
14
+ so the chain is as deep as the surviving rows record), and downstream hits are
15
+ reported `via` the live successor. Without this, one routine doc re-seal would
16
+ permanently blind blast/recall for the whole downstream graph.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import sqlite3
22
+ from pathlib import Path
23
+
24
+ from dorian import store
25
+
26
+
27
def blast(repo: Path, target: str, max_depth: int = 8) -> list[dict]:
    """List the downstream warrants of `target` in the repo's existing index.

    `target` is a repo-relative path or a warrant id. Opens the store for the
    resolved repo, runs blast_conn, and always closes the connection. Each hit
    is a dict of {warrant_id, artifact_uri, via, depth, trust_state}; the seeds
    themselves are not listed. No sync is performed.
    """
    root = Path(repo).resolve()
    conn = store.connect(root)
    try:
        downstream = blast_conn(conn, target, max_depth)
    finally:
        conn.close()
    return downstream
36
+
37
+
38
+ def _predecessors(conn: sqlite3.Connection) -> dict[str, list[str]]:
39
+ """Reverse supersedes edges from CURRENT warrant rows: successor id ->
40
+ superseded ids. Replaced sidecars no longer exist, so the map holds exactly
41
+ the lineage the surviving sidecars record (rebuildable by `dorian sync`)."""
42
+ preds: dict[str, list[str]] = {}
43
+ for r in conn.execute("SELECT id, supersedes FROM warrant WHERE supersedes IS NOT NULL"):
44
+ preds.setdefault(r["id"], []).append(r["supersedes"])
45
+ return preds
46
+
47
+
48
+ def _lineage(warrant_id: str, predecessors: dict[str, list[str]]) -> list[str]:
49
+ """The warrant id plus every id it transitively supersedes, sorted for a
50
+ deterministic edge scan; cycle-safe via the seen set."""
51
+ seen = {warrant_id}
52
+ queue = [warrant_id]
53
+ while queue:
54
+ for pred in predecessors.get(queue.pop(), ()):
55
+ if pred not in seen:
56
+ seen.add(pred)
57
+ queue.append(pred)
58
+ return sorted(seen)
59
+
60
+
61
def blast_conn(conn: sqlite3.Connection, target: str, max_depth: int = 8) -> list[dict]:
    """`blast` over an already-open store connection.

    Seeds: a `sha256:` target is a warrant id; anything else is matched against
    read-set resource uris (every warrant that read the path). The walk is
    breadth-first, cycle-safe via a seen set, bounded by `max_depth`, and
    deterministic: each frontier is sorted, and a warrant linked in by several
    upstreams at the same depth gets the lexicographically first `via`. Each
    frontier warrant also matches derives edges pointing at any id it
    supersedes (transitively), so downstream warrants sealed against a
    predecessor stay reachable after a --supersede re-seal.

    Returns hits in discovery order as dicts of {warrant_id, artifact_uri, via,
    depth, trust_state}. A downstream id with no `warrant` row is still listed,
    with artifact_uri None and trust_state "UNKNOWN". Seeds are never listed.
    """
    # NOTE(review): rows are read by column name, so `conn` presumably uses a
    # name-addressable row factory (e.g. sqlite3.Row) — confirm in store.connect.
    if target.startswith("sha256:"):
        seeds = [target]
    else:
        seeds = sorted(
            {
                r["warrant_id"]
                for r in conn.execute(
                    "SELECT DISTINCT warrant_id FROM resource WHERE uri = ?", (target,)
                )
            }
        )
    predecessors = _predecessors(conn)
    seen = set(seeds)  # seeds are pre-marked, so they can never appear as hits
    hits: list[dict] = []
    frontier = seeds
    depth = 0
    while frontier and depth < max_depth:
        depth += 1
        edges: list[tuple[str, str]] = []  # (downstream, live upstream that linked it)
        for upstream in frontier:
            # Query under the live id AND every id it supersedes, but always
            # attribute the edge to the live `upstream`.
            for alias in _lineage(upstream, predecessors):
                edges.extend(
                    (r["from_warrant"], upstream)
                    for r in conn.execute(
                        "SELECT from_warrant FROM derives WHERE to_warrant = ?",
                        (alias,),
                    )
                )
        frontier = []
        # Sorted by (downstream, via): the first tuple per downstream wins, which
        # yields the lexicographically first `via`.
        for wid, via in sorted(edges):
            if wid in seen:
                continue
            seen.add(wid)
            w = conn.execute(
                "SELECT artifact_uri, trust_state FROM warrant WHERE id = ?", (wid,)
            ).fetchone()
            hits.append(
                {
                    "warrant_id": wid,
                    "artifact_uri": w["artifact_uri"] if w else None,
                    "via": via,
                    "depth": depth,
                    "trust_state": w["trust_state"] if w else "UNKNOWN",
                }
            )
            frontier.append(wid)  # expanded at the next depth
    return hits
File without changes
@@ -0,0 +1,53 @@
1
+ """Manual read-set capture from explicit 'path' or 'path:Lx-y' specs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import UTC, datetime
6
+ from pathlib import Path
7
+
8
+ from dorian import gitio
9
+ from dorian.model import ProducedBy, ReadSet, ReadSetEntry, parse_selector
10
+
11
+
12
def parse_manual(specs: list[str], repo: Path) -> ReadSet:
    """Turn explicit 'path' / 'path:selector' specs into a ReadSet.

    Raises ValueError for a malformed selector, a path that resolves outside
    the repo, or a missing file. Manual capture is treated as exhaustive
    (coverage 1.0).

    Known limits, both deliberate for explicit user input:
      - The last ':' always starts a selector, so a filename containing a literal
        ':' cannot be expressed — it fails loudly with a bad-selector error.
      - Specs are taken literally: unlike transcript capture (which dedupes on
        (uri, selector)), duplicate specs yield duplicate entries.
    """
    repo = repo.resolve()
    head = gitio.head_ref(repo)
    entries: list[ReadSetEntry] = []
    # Every spec appends exactly one entry or raises, so the loop index is
    # also the running entry count.
    for index, spec in enumerate(specs):
        raw, sep, tail = spec.rpartition(":")
        if sep:
            parse_selector(tail)  # raises ValueError on a bad selector
            selector = tail
        else:
            raw, selector = spec, None

        # Resolve both absolute and relative specs and require repo containment,
        # so '../x' or 'src/../../x' cannot escape and be hashed as 'project'.
        candidate = Path(raw)
        if not candidate.is_absolute():
            candidate = repo / candidate
        resolved = candidate.resolve()
        try:
            uri = resolved.relative_to(repo).as_posix()
        except ValueError:
            raise ValueError(f"path outside repo: {raw}") from None

        digest = gitio.working_hash(repo, uri, selector)
        if digest is None:
            raise ValueError(f"missing file: {raw}")

        entry = ReadSetEntry(
            id=f"rs-{index}",
            uri=uri,
            selector=selector,
            hash=digest,
            version=head,
            scope="project",
        )
        entries.append(entry)

    captured_at = datetime.now(UTC).isoformat()
    produced_by = ProducedBy(runner="manual", captured_at=captured_at)
    return ReadSet(entries=tuple(entries), produced_by=produced_by, coverage=1.0)