scriptoria 0.3.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {scriptoria-0.3.0 → scriptoria-0.4.0}/PKG-INFO +1 -1
  2. {scriptoria-0.3.0 → scriptoria-0.4.0}/pyproject.toml +1 -1
  3. {scriptoria-0.3.0 → scriptoria-0.4.0}/src/scrip/__init__.py +1 -1
  4. {scriptoria-0.3.0 → scriptoria-0.4.0}/src/scrip/cli.py +67 -0
  5. scriptoria-0.4.0/src/scrip/similar.py +168 -0
  6. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/conftest.py +8 -4
  7. scriptoria-0.4.0/tests/test_similar_cmd.py +234 -0
  8. scriptoria-0.4.0/tests/test_version.py +11 -0
  9. {scriptoria-0.3.0 → scriptoria-0.4.0}/uv.lock +1 -1
  10. {scriptoria-0.3.0 → scriptoria-0.4.0}/.gitignore +0 -0
  11. {scriptoria-0.3.0 → scriptoria-0.4.0}/README.md +0 -0
  12. {scriptoria-0.3.0 → scriptoria-0.4.0}/pyrightconfig.json +0 -0
  13. {scriptoria-0.3.0 → scriptoria-0.4.0}/src/scrip/anchors.py +0 -0
  14. {scriptoria-0.3.0 → scriptoria-0.4.0}/src/scrip/blocks.py +0 -0
  15. {scriptoria-0.3.0 → scriptoria-0.4.0}/src/scrip/embeddings.py +0 -0
  16. {scriptoria-0.3.0 → scriptoria-0.4.0}/src/scrip/errors.py +0 -0
  17. {scriptoria-0.3.0 → scriptoria-0.4.0}/src/scrip/facts.py +0 -0
  18. {scriptoria-0.3.0 → scriptoria-0.4.0}/src/scrip/frontmatter.py +0 -0
  19. {scriptoria-0.3.0 → scriptoria-0.4.0}/src/scrip/graph.py +0 -0
  20. {scriptoria-0.3.0 → scriptoria-0.4.0}/src/scrip/hashing.py +0 -0
  21. {scriptoria-0.3.0 → scriptoria-0.4.0}/src/scrip/ingest.py +0 -0
  22. {scriptoria-0.3.0 → scriptoria-0.4.0}/src/scrip/lock.py +0 -0
  23. {scriptoria-0.3.0 → scriptoria-0.4.0}/src/scrip/manifest.py +0 -0
  24. {scriptoria-0.3.0 → scriptoria-0.4.0}/src/scrip/query.py +0 -0
  25. {scriptoria-0.3.0 → scriptoria-0.4.0}/src/scrip/retrieval.py +0 -0
  26. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_anchor_cmd.py +0 -0
  27. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_anchors.py +0 -0
  28. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_blocks.py +0 -0
  29. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_embeddings.py +0 -0
  30. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_fact_cmd.py +0 -0
  31. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_graph_status.py +0 -0
  32. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_hashing.py +0 -0
  33. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_index_cmd.py +0 -0
  34. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_ingest.py +0 -0
  35. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_json_shapes.py +0 -0
  36. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_lock.py +0 -0
  37. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_manifest.py +0 -0
  38. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_new_cmd.py +0 -0
  39. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_query.py +0 -0
  40. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_retrieval.py +0 -0
  41. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_stamp.py +0 -0
  42. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_status_cmd.py +0 -0
  43. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_unlock_cmd.py +0 -0
  44. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_verify.py +0 -0
  45. {scriptoria-0.3.0 → scriptoria-0.4.0}/tests/test_watch.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scriptoria
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Deterministic scriptorium-keeper (the `scrip` CLI): staleness, provenance integrity, and fact queries for an agent-compiled knowledge base
5
5
  Project-URL: Homepage, https://github.com/coredipper/scriptorium
6
6
  Project-URL: Changelog, https://github.com/coredipper/scriptorium/blob/main/CHANGELOG.md
@@ -2,7 +2,7 @@
2
2
  # Distribution name on PyPI is `scriptoria` (scrip/scriptorium were taken); the
3
3
  # CLI command and the import package both remain `scrip`.
4
4
  name = "scriptoria"
5
- version = "0.3.0"
5
+ version = "0.4.0"
6
6
  description = "Deterministic scriptorium-keeper (the `scrip` CLI): staleness, provenance integrity, and fact queries for an agent-compiled knowledge base"
7
7
  readme = "README.md"
8
8
  requires-python = ">=3.10"
@@ -13,7 +13,7 @@ from __future__ import annotations
13
13
 
14
14
  from pathlib import Path
15
15
 
16
- __version__ = "0.3.0"
16
+ __version__ = "0.4.0"
17
17
 
18
18
  # --- canonical vault layout ------------------------------------------------
19
19
  # ``root`` is the repo/instance root: the directory containing ``vault/``.
@@ -336,6 +336,42 @@ def cmd_new(args: argparse.Namespace) -> int:
336
336
  return 0
337
337
 
338
338
 
339
+ def _parse_source_ids(raw: str) -> list[str]:
340
+ """Parse a comma-separated `--from` value into validated source ids, WITHOUT
341
+ requiring the sources to exist (unlike `cmd_new`): scoring a not-yet-ingested
342
+ proposed topic is legitimate. Keeps the traversal-safety check."""
343
+ ids: list[str] = []
344
+ for s in (part.strip() for part in raw.split(",")):
345
+ if not s:
346
+ continue
347
+ sid = s if s.startswith("raw/") else f"raw/{s}"
348
+ _safe_slug(sid.split("#", 1)[0][len("raw/") :], "source")
349
+ ids.append(sid)
350
+ if not ids:
351
+ raise errors.UsageError("--from requires at least one source id")
352
+ return ids
353
+
354
+
355
+ def cmd_similar(args: argparse.Namespace) -> int:
356
+ from . import similar
357
+
358
+ root = resolve_root(args.root)
359
+ sources = _parse_source_ids(args.sources)
360
+ result = similar.compute_similar(
361
+ root,
362
+ title=args.title,
363
+ sources=sources,
364
+ kind=args.kind,
365
+ exclude=set(args.exclude),
366
+ top=args.top,
367
+ )
368
+ if args.json:
369
+ _emit(result)
370
+ else:
371
+ similar.print_similar(result)
372
+ return 0
373
+
374
+
339
375
  def cmd_fact_add(args: argparse.Namespace) -> int:
340
376
  from . import facts
341
377
 
@@ -549,6 +585,37 @@ def build_parser() -> argparse.ArgumentParser:
549
585
  pn.add_argument("--title", help="human title (default: the slug)")
550
586
  pn.set_defaults(func=cmd_new)
551
587
 
588
+ psim = sub.add_parser(
589
+ "similar",
590
+ parents=[common],
591
+ help="score existing wiki pages by topic overlap with a proposed page (PROMOTE step 1)",
592
+ )
593
+ psim.add_argument(
594
+ "--title", required=True, help="proposed page title (tokenized for title overlap)"
595
+ )
596
+ psim.add_argument(
597
+ "--from",
598
+ dest="sources",
599
+ required=True,
600
+ metavar="raw/a,raw/b",
601
+ help="comma-separated source ids the proposed page would derive from",
602
+ )
603
+ psim.add_argument(
604
+ "--kind",
605
+ choices=["concept", "entity"],
606
+ default="concept",
607
+ help="score only candidates of this kind (default: concept)",
608
+ )
609
+ psim.add_argument(
610
+ "--exclude",
611
+ metavar="ID",
612
+ action="append",
613
+ default=[],
614
+ help="page id to skip (repeatable); use when re-scoring an existing page",
615
+ )
616
+ psim.add_argument("--top", type=int, metavar="N", help="limit to the N highest-scoring candidates")
617
+ psim.set_defaults(func=cmd_similar)
618
+
552
619
  pfact = sub.add_parser(
553
620
  "fact",
554
621
  help="validated writers for the facts/ layer (claims mint verified anchors)",
@@ -0,0 +1,168 @@
1
+ """Deterministic topic-overlap scoring for PROMOTE step 1 (`scrip similar`).
2
+
3
+ Ranks existing wiki pages by how much a proposed topic overlaps each, from three
4
+ file-derived signals:
5
+
6
+ - **title** — Jaccard of normalized title tokens (the §6 normalization).
7
+ - **sources** — Jaccard of `derived-from` source ids (block suffix stripped).
8
+ - **tags** — Jaccard of tag sets. Pages carry no `tags` frontmatter (SPEC §4),
9
+ so a page's tags are *derived*: the union of `tags` over claims whose
10
+ `source_id` is one of the page's sources.
11
+
12
+ `combined` is a weighted sum (sources dominates — shared sources is the strongest
13
+ same-topic signal). This is **purely informational**: it reports scores and
14
+ leaves the High/Middle/Low merge decision of AGENT.md PROMOTE to the caller,
15
+ exactly as `query contradictions` leaves adjudication to the agent. No lock, no
16
+ model, no DuckDB.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ from collections.abc import Iterable, Mapping
23
+ from pathlib import Path
24
+
25
+ from . import facts_dir, frontmatter
26
+ from .errors import DataError
27
+ from .graph import scan_derived
28
+ from .hashing import normalize
29
+
30
+ DEFAULT_WEIGHTS = {"title": 0.25, "sources": 0.5, "tags": 0.25}
31
+
32
+
33
+ def _tokens(title: str) -> set[str]:
34
+ return set(normalize(title).split())
35
+
36
+
37
+ def _strip_block(dep: str) -> str:
38
+ """`raw/x#b3` -> `raw/x` (block-scoped deps share their whole source)."""
39
+ return dep.split("#", 1)[0]
40
+
41
+
42
+ def _source_set(derived_from: Iterable[str]) -> set[str]:
43
+ return {_strip_block(d) for d in derived_from}
44
+
45
+
46
+ def _jaccard(a: set[str], b: set[str]) -> float:
47
+ if not a and not b:
48
+ return 0.0
49
+ return len(a & b) / len(a | b)
50
+
51
+
52
+ def _source_tags(root: Path) -> dict[str, set[str]]:
53
+ """Map each `source_id` to the union of `tags` over its claims. Built once
54
+ per run from facts/claims.ndjson (parsed directly — no DuckDB dependency)."""
55
+ out: dict[str, set[str]] = {}
56
+ p = facts_dir(root) / "claims.ndjson"
57
+ if not p.exists():
58
+ return out
59
+ for lineno, raw_line in enumerate(p.read_text(encoding="utf-8").splitlines(), start=1):
60
+ line = raw_line.strip()
61
+ if not line:
62
+ continue
63
+ try:
64
+ rec = json.loads(line)
65
+ except json.JSONDecodeError as e:
66
+ raise DataError(f"claims.ndjson:{lineno}: invalid JSON: {e}") from e
67
+ if not isinstance(rec, dict):
68
+ raise DataError(f"claims.ndjson:{lineno}: expected a JSON object")
69
+ sid = rec.get("source_id")
70
+ if not isinstance(sid, str):
71
+ raise DataError(f"claims.ndjson:{lineno}: 'source_id' must be a string")
72
+ tags = rec.get("tags")
73
+ if tags is None:
74
+ continue
75
+ if not isinstance(tags, list) or any(not isinstance(t, str) for t in tags):
76
+ raise DataError(f"claims.ndjson:{lineno}: 'tags' must be a list of strings")
77
+ out.setdefault(sid, set()).update(tags)
78
+ return out
79
+
80
+
81
+ def _page_tags(sources: set[str], source_tags: Mapping[str, set[str]]) -> set[str]:
82
+ out: set[str] = set()
83
+ for s in sources:
84
+ out |= source_tags.get(s, set())
85
+ return out
86
+
87
+
88
+ def compute_similar(
89
+ root: str | Path,
90
+ *,
91
+ title: str,
92
+ sources: Iterable[str],
93
+ kind: str = "concept",
94
+ exclude: Iterable[str] | None = None,
95
+ top: int | None = None,
96
+ weights: Mapping[str, float] | None = None,
97
+ ) -> dict:
98
+ """Score existing `kind` wiki pages against the proposed (title, sources).
99
+
100
+ Returns ``{proposed, weights, candidates}`` with candidates sorted by
101
+ ``combined`` desc then id asc, truncated to ``top``.
102
+ """
103
+ root = Path(root)
104
+ w = dict(weights or DEFAULT_WEIGHTS)
105
+ skip = set(exclude or ())
106
+ prop_sources = _source_set(sources)
107
+ prop_tokens = _tokens(title)
108
+ source_tags = _source_tags(root)
109
+ prop_tags = _page_tags(prop_sources, source_tags)
110
+
111
+ want_type = f"wiki.{kind}"
112
+ candidates: list[dict] = []
113
+ for cid, d in scan_derived(root).items():
114
+ if d.get("type") != want_type or cid in skip:
115
+ continue # other-kind pages and the facts.set row are dropped here
116
+ c_sources = _source_set(d["derived_from"])
117
+ meta, _ = frontmatter.load(root / d["path"])
118
+ c_title = (meta.get("title") if meta else "") or ""
119
+ c_tags = _page_tags(c_sources, source_tags)
120
+
121
+ title_s = _jaccard(prop_tokens, _tokens(c_title))
122
+ sources_s = _jaccard(prop_sources, c_sources)
123
+ tags_s = _jaccard(prop_tags, c_tags)
124
+ combined = w["title"] * title_s + w["sources"] * sources_s + w["tags"] * tags_s
125
+ candidates.append(
126
+ {
127
+ "id": cid,
128
+ "title": c_title,
129
+ "path": d["path"],
130
+ "kind": kind,
131
+ "scores": {
132
+ "title": round(title_s, 6),
133
+ "sources": round(sources_s, 6),
134
+ "tags": round(tags_s, 6),
135
+ "combined": round(combined, 6),
136
+ },
137
+ "shared": {
138
+ "sources": sorted(prop_sources & c_sources),
139
+ "tags": sorted(prop_tags & c_tags),
140
+ },
141
+ }
142
+ )
143
+
144
+ candidates.sort(key=lambda c: (-c["scores"]["combined"], c["id"]))
145
+ if top is not None:
146
+ candidates = candidates[:top]
147
+ return {
148
+ "proposed": {"title": title, "derived_from": list(sources), "kind": kind},
149
+ "weights": w,
150
+ "candidates": candidates,
151
+ }
152
+
153
+
154
+ def print_similar(result: dict) -> None:
155
+ p = result["proposed"]
156
+ print(f'proposed: "{p["title"]}" ({p["kind"]}, from {len(p["derived_from"])} source(s))')
157
+ cands = result["candidates"]
158
+ if not cands:
159
+ print(f"no existing {p['kind']} pages to compare.")
160
+ return
161
+ for c in cands:
162
+ s = c["scores"]
163
+ print(f' {s["combined"]:.3f} {c["id"]} "{c["title"]}"')
164
+ print(
165
+ f' sources {s["sources"]:.2f} tags {s["tags"]:.2f} title {s["title"]:.2f}'
166
+ f' shared sources: {len(c["shared"]["sources"])}, tags: {len(c["shared"]["tags"])}'
167
+ )
168
+ print(f"({len(cands)} candidate(s))")
@@ -70,6 +70,8 @@ class KB:
70
70
  *,
71
71
  stamp: bool = True,
72
72
  body: str = "Body.\n",
73
+ title: str | None = None,
74
+ kind: str = "concept",
73
75
  ) -> str:
74
76
  deps = {
75
77
  sid: h
@@ -77,16 +79,18 @@ class KB:
77
79
  if (h := self._dep_hash(sid)) is not None
78
80
  }
79
81
  meta: dict = {
80
- "id": f"concept/{slug}",
81
- "type": "wiki.concept",
82
- "title": slug,
82
+ "id": f"{kind}/{slug}",
83
+ "type": f"wiki.{kind}",
84
+ "title": title or slug,
83
85
  "derived-from": list(derived_from),
84
86
  }
85
87
  if stamp:
86
88
  meta["input-hash"] = hashing.input_hash(deps)
87
89
  meta["last-compiled"] = "2026-01-01T00:00:00Z"
88
90
  meta["confidence"] = 0.9
89
- path = self.root / "vault" / "wiki" / "concepts" / f"{slug}.md"
91
+ subdir = "concepts" if kind == "concept" else "entities"
92
+ path = self.root / "vault" / "wiki" / subdir / f"{slug}.md"
93
+ path.parent.mkdir(parents=True, exist_ok=True)
90
94
  path.write_text(frontmatter.dump(meta, body), encoding="utf-8")
91
95
  return meta["id"]
92
96
 
@@ -0,0 +1,234 @@
1
+ """`scrip similar` — deterministic topic-overlap scorer for PROMOTE step 1.
2
+ Ranks existing wiki pages by title-token + shared-source + derived-tag overlap.
3
+ Purely informational (exit 0 whatever the scores; usage and data errors still exit 2/3); no model, no lock."""
4
+
5
+ import json
6
+
7
+ import pytest
8
+
9
+ from scrip import cli
10
+
11
+
12
+ def _similar(kb, *args):
13
+ """Run `scrip similar … --json` and return its exit code (read the payload with `_json`)."""
14
+ rc = cli.main(["similar", *args, "--json", "--root", str(kb.root)])
15
+ return rc
16
+
17
+
18
+ def _json(capsys):
19
+ return json.loads(capsys.readouterr().out)
20
+
21
+
22
+ # --------------------------------------------------------------------------- #
23
+ # Ranking
24
+ # --------------------------------------------------------------------------- #
25
+ def test_similar_high_overlap_ranks_first(kb, capsys):
26
+ kb.add_raw("a", "# A\n\nAlpha.\n")
27
+ kb.add_raw("b", "# B\n\nBeta.\n")
28
+ kb.add_raw("c", "# C\n\nGamma.\n")
29
+ kb.add_wiki("twin", ["raw/a", "raw/b"], title="Alpha and Beta")
30
+ kb.add_wiki("unrelated", ["raw/c"], title="Gamma only")
31
+
32
+ rc = _similar(kb, "--title", "Alpha and Beta", "--from", "raw/a,raw/b")
33
+ assert rc == 0
34
+ data = _json(capsys)
35
+ assert data["candidates"][0]["id"] == "concept/twin"
36
+ assert data["candidates"][0]["scores"]["sources"] == 1.0
37
+ assert data["candidates"][0]["scores"]["title"] == 1.0
38
+ # the unrelated page is present but ranked strictly lower
39
+ assert data["candidates"][-1]["id"] == "concept/unrelated"
40
+ assert data["candidates"][-1]["scores"]["combined"] < data["candidates"][0]["scores"]["combined"]
41
+
42
+
43
+ def test_similar_partial_source_overlap_scores_between(kb, capsys):
44
+ kb.add_raw("a", "# A\n\nAlpha.\n")
45
+ kb.add_raw("b", "# B\n\nBeta.\n")
46
+ kb.add_wiki("both", ["raw/a", "raw/b"], title="Shared")
47
+ kb.add_wiki("half", ["raw/a"], title="Partial")
48
+
49
+ _similar(kb, "--title", "Shared", "--from", "raw/a,raw/b")
50
+ data = _json(capsys)
51
+ by_id = {c["id"]: c for c in data["candidates"]}
52
+ assert by_id["concept/both"]["scores"]["sources"] == 1.0
53
+ # half shares 1 of {a,b} → Jaccard 1/2
54
+ assert by_id["concept/half"]["scores"]["sources"] == 0.5
55
+ assert by_id["concept/half"]["scores"]["combined"] < by_id["concept/both"]["scores"]["combined"]
56
+
57
+
58
+ def test_similar_no_candidates_exits_0(kb, capsys):
59
+ kb.add_raw("a", "# A\n\nAlpha.\n")
60
+ rc = _similar(kb, "--title", "Anything", "--from", "raw/a")
61
+ assert rc == 0
62
+ assert _json(capsys)["candidates"] == []
63
+
64
+
65
+ # --------------------------------------------------------------------------- #
66
+ # Self-exclusion
67
+ # --------------------------------------------------------------------------- #
68
+ def test_similar_excludes_named_ids(kb, capsys):
69
+ kb.add_raw("a", "# A\n\nAlpha.\n")
70
+ kb.add_wiki("self", ["raw/a"], title="Self Page")
71
+ kb.add_wiki("peer", ["raw/a"], title="Self Page")
72
+
73
+ _similar(kb, "--title", "Self Page", "--from", "raw/a", "--exclude", "concept/self")
74
+ ids = [c["id"] for c in _json(capsys)["candidates"]]
75
+ assert "concept/self" not in ids
76
+ assert "concept/peer" in ids
77
+
78
+
79
+ # --------------------------------------------------------------------------- #
80
+ # Tags derived from claims (pages carry no tags frontmatter)
81
+ # --------------------------------------------------------------------------- #
82
+ def test_similar_tags_derived_from_claims(kb, capsys):
83
+ kb.add_raw("shared", "# S\n\nA cited sentence about caching.\n")
84
+ kb.add_raw("bare", "# B\n\nUncited content here.\n")
85
+ kb.add_claim("clm_0001", "shared", "A cited sentence about caching.", tags=["caching", "cost"])
86
+ kb.add_wiki("with-tags", ["raw/shared"], title="Tagged")
87
+ kb.add_wiki("no-tags", ["raw/bare"], title="Untagged")
88
+
89
+ _similar(kb, "--title", "Proposed", "--from", "raw/shared")
90
+ by_id = {c["id"]: c for c in _json(capsys)["candidates"]}
91
+ # proposed derives tags {caching,cost} from raw/shared's claim → perfect tag match
92
+ assert by_id["concept/with-tags"]["scores"]["tags"] == 1.0
93
+ assert by_id["concept/with-tags"]["shared"]["tags"] == ["caching", "cost"]
94
+ # raw/bare has no claims → no tags → tag score 0
95
+ assert by_id["concept/no-tags"]["scores"]["tags"] == 0.0
96
+
97
+
98
+ # --------------------------------------------------------------------------- #
99
+ # Block-scoped derived-from is stripped to the whole source
100
+ # --------------------------------------------------------------------------- #
101
+ def test_similar_strips_block_suffix(kb, capsys):
102
+ kb.add_raw("a", "# Heading\n\nFirst paragraph body.\n\nSecond paragraph body.\n")
103
+ bid = kb.block_id("a", "First paragraph")
104
+ kb.add_wiki("whole", ["raw/a"], title="Whole source page")
105
+
106
+ _similar(kb, "--title", "Block page", "--from", f"raw/a#{bid}")
107
+ [cand] = _json(capsys)["candidates"]
108
+ assert cand["id"] == "concept/whole"
109
+ assert cand["scores"]["sources"] == 1.0 # raw/a#<block> stripped to raw/a
110
+
111
+
112
+ # --------------------------------------------------------------------------- #
113
+ # Kind filter
114
+ # --------------------------------------------------------------------------- #
115
+ def test_similar_scores_only_same_kind(kb, capsys):
116
+ kb.add_raw("a", "# A\n\nAlpha.\n")
117
+ kb.add_wiki("a-concept", ["raw/a"], title="Topic", kind="concept")
118
+ kb.add_wiki("an-entity", ["raw/a"], title="Topic", kind="entity")
119
+ # a facts.set row also exists in scan_derived once _meta is present:
120
+ (kb.root / "vault" / "facts" / "_meta.yaml").write_text(
121
+ "id: facts/core\ntype: facts.set\nderived-from:\n- raw/a\n", encoding="utf-8"
122
+ )
123
+
124
+ _similar(kb, "--title", "Topic", "--from", "raw/a", "--kind", "concept")
125
+ ids = [c["id"] for c in _json(capsys)["candidates"]]
126
+ assert ids == ["concept/a-concept"]
127
+ assert "entity/an-entity" not in ids
128
+ assert "facts/core" not in ids
129
+
130
+
131
+ def test_similar_entity_kind_scores_entities(kb, capsys):
132
+ kb.add_raw("a", "# A\n\nAlpha.\n")
133
+ kb.add_wiki("a-concept", ["raw/a"], title="Topic", kind="concept")
134
+ kb.add_wiki("an-entity", ["raw/a"], title="Topic", kind="entity")
135
+
136
+ _similar(kb, "--title", "Topic", "--from", "raw/a", "--kind", "entity")
137
+ ids = [c["id"] for c in _json(capsys)["candidates"]]
138
+ assert ids == ["entity/an-entity"]
139
+
140
+
141
+ # --------------------------------------------------------------------------- #
142
+ # --top + shape
143
+ # --------------------------------------------------------------------------- #
144
+ def test_similar_top_limits_results(kb, capsys):
145
+ kb.add_raw("a", "# A\n\nAlpha.\n")
146
+ kb.add_raw("b", "# B\n\nBeta.\n")
147
+ kb.add_wiki("p1", ["raw/a", "raw/b"], title="Best match")
148
+ kb.add_wiki("p2", ["raw/a"], title="Partial")
149
+ kb.add_wiki("p3", ["raw/b"], title="Other")
150
+
151
+ _similar(kb, "--title", "Best match", "--from", "raw/a,raw/b", "--top", "1")
152
+ data = _json(capsys)
153
+ assert len(data["candidates"]) == 1
154
+ assert data["candidates"][0]["id"] == "concept/p1"
155
+
156
+
157
+ def test_similar_json_shape(kb, capsys):
158
+ kb.add_raw("a", "# A\n\nAlpha.\n")
159
+ kb.add_wiki("p", ["raw/a"], title="A Page")
160
+ _similar(kb, "--title", "Proposed", "--from", "raw/a")
161
+ data = _json(capsys)
162
+ assert set(data) == {"proposed", "weights", "candidates"}
163
+ assert set(data["proposed"]) == {"title", "derived_from", "kind"}
164
+ assert set(data["weights"]) == {"title", "sources", "tags"}
165
+ [cand] = data["candidates"]
166
+ assert set(cand) == {"id", "title", "path", "kind", "scores", "shared"}
167
+ assert set(cand["scores"]) == {"title", "sources", "tags", "combined"}
168
+ assert set(cand["shared"]) == {"sources", "tags"}
169
+
170
+
171
+ def test_similar_human_output_lists_candidate(kb, capsys):
172
+ kb.add_raw("a", "# A\n\nAlpha.\n")
173
+ kb.add_wiki("twin", ["raw/a"], title="Twin")
174
+ rc = cli.main(["similar", "--title", "Twin", "--from", "raw/a", "--root", str(kb.root)])
175
+ assert rc == 0
176
+ assert "concept/twin" in capsys.readouterr().out
177
+
178
+
179
+ # --------------------------------------------------------------------------- #
180
+ # Errors
181
+ # --------------------------------------------------------------------------- #
182
+ def test_similar_missing_title_is_usage_error(kb):
183
+ with pytest.raises(SystemExit) as e:
184
+ cli.main(["similar", "--from", "raw/a", "--root", str(kb.root)])
185
+ assert e.value.code == 2
186
+
187
+
188
+ def test_similar_missing_from_is_usage_error(kb):
189
+ with pytest.raises(SystemExit) as e:
190
+ cli.main(["similar", "--title", "X", "--root", str(kb.root)])
191
+ assert e.value.code == 2
192
+
193
+
194
+ def test_similar_empty_from_is_usage_error(kb):
195
+ assert cli.main(["similar", "--title", "X", "--from", " , ", "--root", str(kb.root)]) == 2
196
+
197
+
198
+ def test_similar_outside_a_vault_is_usage_error(tmp_path):
199
+ assert cli.main(["similar", "--title", "X", "--from", "raw/a", "--root", str(tmp_path / "no")]) == 2
200
+
201
+
202
+ def test_similar_malformed_claims_is_data_error(kb):
203
+ kb.add_raw("a", "# A\n\nAlpha.\n")
204
+ kb.add_wiki("p", ["raw/a"], title="P")
205
+ (kb.root / "vault" / "facts" / "claims.ndjson").write_text("{not json}\n", encoding="utf-8")
206
+ assert cli.main(["similar", "--title", "X", "--from", "raw/a", "--root", str(kb.root)]) == 3
207
+
208
+
209
+ def test_similar_non_object_claim_line_is_data_error(kb):
210
+ # a valid-JSON non-object must be a clean data error (3), not an internal one (4)
211
+ kb.add_raw("a", "# A\n\nAlpha.\n")
212
+ kb.add_wiki("p", ["raw/a"], title="P")
213
+ (kb.root / "vault" / "facts" / "claims.ndjson").write_text("[]\n", encoding="utf-8")
214
+ assert cli.main(["similar", "--title", "X", "--from", "raw/a", "--root", str(kb.root)]) == 3
215
+
216
+
217
+ def test_similar_bad_source_id_is_data_error(kb):
218
+ # a non-string source_id is malformed facts data → exit 3, not a silent skip
219
+ kb.add_raw("a", "# A\n\nAlpha.\n")
220
+ kb.add_wiki("p", ["raw/a"], title="P")
221
+ (kb.root / "vault" / "facts" / "claims.ndjson").write_text(
222
+ '{"source_id": 1, "tags": "oops"}\n', encoding="utf-8"
223
+ )
224
+ assert cli.main(["similar", "--title", "X", "--from", "raw/a", "--root", str(kb.root)]) == 3
225
+
226
+
227
+ def test_similar_bad_tags_shape_is_data_error(kb):
228
+ kb.add_raw("a", "# A\n\nAlpha.\n")
229
+ kb.add_wiki("p", ["raw/a"], title="P")
230
+ (kb.root / "vault" / "facts" / "claims.ndjson").write_text(
231
+ '{"claim_id": "clm_0001", "source_id": "raw/a", "anchor": "qh:x|loc:0|len:1", "tags": "oops"}\n',
232
+ encoding="utf-8",
233
+ )
234
+ assert cli.main(["similar", "--title", "X", "--from", "raw/a", "--root", str(kb.root)]) == 3
@@ -0,0 +1,11 @@
1
+ """The public __version__ must match the installed package metadata, so a release
2
+ that bumps pyproject can't leave the runtime attribute stale. (Dist name is
3
+ ``scriptoria``; the import package is ``scrip``.)"""
4
+
5
+ import importlib.metadata
6
+
7
+ import scrip
8
+
9
+
10
+ def test_version_matches_package_metadata():
11
+ assert scrip.__version__ == importlib.metadata.version("scriptoria")
@@ -1141,7 +1141,7 @@ wheels = [
1141
1141
 
1142
1142
  [[package]]
1143
1143
  name = "scriptoria"
1144
- version = "0.3.0"
1144
+ version = "0.4.0"
1145
1145
  source = { editable = "." }
1146
1146
  dependencies = [
1147
1147
  { name = "duckdb" },
File without changes
File without changes
File without changes