scriptoria 0.3.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. {scriptoria-0.3.0 → scriptoria-0.5.0}/PKG-INFO +1 -1
  2. {scriptoria-0.3.0 → scriptoria-0.5.0}/pyproject.toml +1 -1
  3. {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/__init__.py +1 -1
  4. {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/anchors.py +27 -0
  5. {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/cli.py +110 -3
  6. {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/facts.py +97 -7
  7. {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/query.py +21 -3
  8. scriptoria-0.5.0/src/scrip/similar.py +168 -0
  9. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/conftest.py +8 -4
  10. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_fact_cmd.py +116 -0
  11. scriptoria-0.5.0/tests/test_query.py +109 -0
  12. scriptoria-0.5.0/tests/test_similar_cmd.py +234 -0
  13. scriptoria-0.5.0/tests/test_span_cmd.py +86 -0
  14. scriptoria-0.5.0/tests/test_version.py +11 -0
  15. {scriptoria-0.3.0 → scriptoria-0.5.0}/uv.lock +1 -1
  16. scriptoria-0.3.0/tests/test_query.py +0 -58
  17. {scriptoria-0.3.0 → scriptoria-0.5.0}/.gitignore +0 -0
  18. {scriptoria-0.3.0 → scriptoria-0.5.0}/README.md +0 -0
  19. {scriptoria-0.3.0 → scriptoria-0.5.0}/pyrightconfig.json +0 -0
  20. {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/blocks.py +0 -0
  21. {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/embeddings.py +0 -0
  22. {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/errors.py +0 -0
  23. {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/frontmatter.py +0 -0
  24. {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/graph.py +0 -0
  25. {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/hashing.py +0 -0
  26. {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/ingest.py +0 -0
  27. {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/lock.py +0 -0
  28. {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/manifest.py +0 -0
  29. {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/retrieval.py +0 -0
  30. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_anchor_cmd.py +0 -0
  31. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_anchors.py +0 -0
  32. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_blocks.py +0 -0
  33. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_embeddings.py +0 -0
  34. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_graph_status.py +0 -0
  35. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_hashing.py +0 -0
  36. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_index_cmd.py +0 -0
  37. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_ingest.py +0 -0
  38. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_json_shapes.py +0 -0
  39. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_lock.py +0 -0
  40. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_manifest.py +0 -0
  41. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_new_cmd.py +0 -0
  42. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_retrieval.py +0 -0
  43. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_stamp.py +0 -0
  44. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_status_cmd.py +0 -0
  45. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_unlock_cmd.py +0 -0
  46. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_verify.py +0 -0
  47. {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_watch.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scriptoria
3
- Version: 0.3.0
3
+ Version: 0.5.0
4
4
  Summary: Deterministic scriptorium-keeper (the `scrip` CLI): staleness, provenance integrity, and fact queries for an agent-compiled knowledge base
5
5
  Project-URL: Homepage, https://github.com/coredipper/scriptorium
6
6
  Project-URL: Changelog, https://github.com/coredipper/scriptorium/blob/main/CHANGELOG.md
@@ -2,7 +2,7 @@
2
2
  # Distribution name on PyPI is `scriptoria` (scrip/scriptorium were taken); the
3
3
  # CLI command and the import package both remain `scrip`.
4
4
  name = "scriptoria"
5
- version = "0.3.0"
5
+ version = "0.5.0"
6
6
  description = "Deterministic scriptorium-keeper (the `scrip` CLI): staleness, provenance integrity, and fact queries for an agent-compiled knowledge base"
7
7
  readme = "README.md"
8
8
  requires-python = ">=3.10"
@@ -13,7 +13,7 @@ from __future__ import annotations
13
13
 
14
14
  from pathlib import Path
15
15
 
16
- __version__ = "0.3.0"
16
+ __version__ = "0.5.0"
17
17
 
18
18
  # --- canonical vault layout ------------------------------------------------
19
19
  # ``root`` is the repo/instance root: the directory containing ``vault/``.
@@ -90,6 +90,33 @@ def resolve(source_text: str, anchor: str) -> str:
90
90
  return "OK" if matches == 1 else "AMBIGUOUS"
91
91
 
92
92
 
93
def span(source_text: str, anchor: str) -> tuple[str, str | None]:
    """Resolve ``anchor`` against ``source_text`` and return what it cites.

    The source is normalized, then every window of the anchor's ``len``
    characters is hashed (SHA-256 of its UTF-8 bytes) and compared with the
    anchor's ``qh`` digest.

    Returns:
        ``(status, cited_text)`` where ``status`` is one of:

        - ``"OK"`` -- exactly one window matches; ``cited_text`` is that window.
        - ``"AMBIGUOUS"`` -- several windows match; ``cited_text`` is the one
          whose start offset is closest to ``loc * len(normalized source)``.
        - ``"BROKEN"`` -- no window matches, or the anchor length is
          non-positive or longer than the normalized source; ``cited_text``
          is ``None``.
    """
    parsed = parse_anchor(anchor)
    normalized = normalize(source_text)
    width = parsed["len"]
    digest = parsed["qh"]
    hint = parsed["loc"]
    total = len(normalized)

    # An impossible window size can never match anything.
    if not 0 < width <= total:
        return "BROKEN", None

    matches: list[int] = []
    for offset in range(total - width + 1):
        window = normalized[offset : offset + width]
        if hashlib.sha256(window.encode("utf-8")).hexdigest() == digest:
            matches.append(offset)

    if not matches:
        return "BROKEN", None
    if len(matches) > 1:
        # Break the tie with the anchor's location hint, scaled to an offset.
        pivot = hint * total
        best = min(matches, key=lambda offset: abs(offset - pivot))
        return "AMBIGUOUS", normalized[best : best + width]
    only = matches[0]
    return "OK", normalized[only : only + width]
118
+
119
+
93
120
  # --------------------------------------------------------------------------- #
94
121
  # Vault-wide verification
95
122
  # --------------------------------------------------------------------------- #
@@ -336,6 +336,67 @@ def cmd_new(args: argparse.Namespace) -> int:
336
336
  return 0
337
337
 
338
338
 
339
def _parse_source_ids(raw: str) -> list[str]:
    """Split a comma-separated ``--from`` value into validated source ids.

    Unlike ``cmd_new`` this does NOT require the sources to exist: scoring a
    proposed topic whose sources are not ingested yet is legitimate. The
    traversal-safety check on the slug is still applied.

    Each non-empty entry gets a ``raw/`` prefix if it lacks one; the slug that
    is checked is the part after ``raw/`` and before any ``#`` suffix.

    Raises:
        errors.UsageError: if no non-empty source id was supplied.
    """
    parsed: list[str] = []
    for chunk in raw.split(","):
        name = chunk.strip()
        if name:
            source_id = name if name.startswith("raw/") else f"raw/{name}"
            slug = source_id.split("#", 1)[0][len("raw/") :]
            _safe_slug(slug, "source")
            parsed.append(source_id)
    if parsed:
        return parsed
    raise errors.UsageError("--from requires at least one source id")
353
+
354
+
355
def cmd_span(args: argparse.Namespace) -> int:
    """Resolve an anchor and print the text it cites (``scrip span``).

    The anchor comes either from a claim record (``--claim ID``) or from a
    positional ``raw/<slug>#<anchor>`` target. Output is JSON when
    ``args.json`` is set, otherwise a ``[STATUS] source`` line followed by the
    cited text when there is any.

    Returns:
        ``0`` when the anchor resolves with status ``OK``, ``1`` otherwise.

    Raises:
        errors.UsageError: if the positional target is missing or has no
            ``#<anchor>`` part.
    """
    from . import anchors

    root = resolve_root(args.root)
    # Test against None rather than truthiness: `--claim ""` must be handled
    # as a (non-existent) claim id, not fall through to the positional branch
    # where `args.target` is None.
    if args.claim is not None:
        from . import facts

        source_id, anchor = facts.claim_source_anchor(root, args.claim)
    else:
        target = args.target or ""
        if "#" not in target:
            raise errors.UsageError("target must be raw/<slug>#<anchor>")
        source_id, anchor = target.split("#", 1)
        source_id = source_id if source_id.startswith("raw/") else f"raw/{source_id}"
        # Reject path separators / '..' before the slug is used to find a file.
        _safe_slug(source_id[len("raw/") :], "source")
    text = anchors.source_text(root, source_id)
    status, cited = anchors.span(text, anchor)
    if args.json:
        _emit({"target": f"{source_id}#{anchor}", "status": status, "text": cited})
    else:
        print(f"[{status}] {source_id}")
        if cited is not None:
            print(cited)
    return 0 if status == "OK" else 1
378
+
379
+
380
def cmd_similar(args: argparse.Namespace) -> int:
    """Score existing wiki pages against a proposed page (``scrip similar``).

    Parses ``--from`` into source ids, delegates scoring to
    ``similar.compute_similar`` and prints the result as JSON (``--json``) or
    as a human-readable listing.

    Returns:
        ``0`` always; failures surface as exceptions.

    Raises:
        errors.UsageError: if ``--top`` is negative, or ``--from`` holds no
            source id.
    """
    from . import similar

    # A negative N would be used as a slice bound downstream and silently drop
    # candidates from the *end* of the ranking; reject it up front instead.
    if args.top is not None and args.top < 0:
        raise errors.UsageError("--top must be a non-negative integer")

    root = resolve_root(args.root)
    sources = _parse_source_ids(args.sources)
    result = similar.compute_similar(
        root,
        title=args.title,
        sources=sources,
        kind=args.kind,
        exclude=set(args.exclude),
        top=args.top,
    )
    if args.json:
        _emit(result)
    else:
        similar.print_similar(result)
    return 0
398
+
399
+
339
400
  def cmd_fact_add(args: argparse.Namespace) -> int:
340
401
  from . import facts
341
402
 
@@ -352,7 +413,12 @@ def cmd_fact_add(args: argparse.Namespace) -> int:
352
413
  _emit(result)
353
414
  else:
354
415
  for r in result["appended"]:
355
- ident = r.get("claim_id") or r.get("entity_id") or f"{r['src']} -> {r['dst']}"
416
+ ident = (
417
+ r.get("claim_id")
418
+ or r.get("entity_id")
419
+ or r.get("reconciliation_id")
420
+ or f"{r.get('src')} -> {r.get('dst')}"
421
+ )
356
422
  print(f" appended {ident}")
357
423
  for s in result["skipped"]:
358
424
  print(f" = record {s['index']} skipped (duplicate)")
@@ -481,7 +547,7 @@ def build_parser() -> argparse.ArgumentParser:
481
547
  pq.add_argument(
482
548
  "name",
483
549
  nargs="?",
484
- choices=["claims", "entities", "edges", "contradictions"],
550
+ choices=["claims", "entities", "edges", "contradictions", "reconciliations"],
485
551
  help="a named query (omit when using --sql)",
486
552
  )
487
553
  pq.add_argument("--sql", help="raw DuckDB SQL (views: claims, entities, edges)")
@@ -549,6 +615,47 @@ def build_parser() -> argparse.ArgumentParser:
549
615
  pn.add_argument("--title", help="human title (default: the slug)")
550
616
  pn.set_defaults(func=cmd_new)
551
617
 
618
+ psp = sub.add_parser(
619
+ "span",
620
+ parents=[common],
621
+ help="resolve an anchor and print the cited text (read both sides of a contradiction)",
622
+ )
623
+ span_src = psp.add_mutually_exclusive_group(required=True)
624
+ span_src.add_argument("target", nargs="?", metavar="raw/<slug>#<anchor>", help="anchor target")
625
+ span_src.add_argument("--claim", metavar="ID", help="resolve this claim's anchor instead")
626
+ psp.set_defaults(func=cmd_span)
627
+
628
+ psim = sub.add_parser(
629
+ "similar",
630
+ parents=[common],
631
+ help="score existing wiki pages by topic overlap with a proposed page (PROMOTE step 1)",
632
+ )
633
+ psim.add_argument(
634
+ "--title", required=True, help="proposed page title (tokenized for title overlap)"
635
+ )
636
+ psim.add_argument(
637
+ "--from",
638
+ dest="sources",
639
+ required=True,
640
+ metavar="raw/a,raw/b",
641
+ help="comma-separated source ids the proposed page would derive from",
642
+ )
643
+ psim.add_argument(
644
+ "--kind",
645
+ choices=["concept", "entity"],
646
+ default="concept",
647
+ help="score only candidates of this kind (default: concept)",
648
+ )
649
+ psim.add_argument(
650
+ "--exclude",
651
+ metavar="ID",
652
+ action="append",
653
+ default=[],
654
+ help="page id to skip (repeatable); use when re-scoring an existing page",
655
+ )
656
+ psim.add_argument("--top", type=int, metavar="N", help="limit to the N highest-scoring candidates")
657
+ psim.set_defaults(func=cmd_similar)
658
+
552
659
  pfact = sub.add_parser(
553
660
  "fact",
554
661
  help="validated writers for the facts/ layer (claims mint verified anchors)",
@@ -562,7 +669,7 @@ def build_parser() -> argparse.ArgumentParser:
562
669
  )
563
670
  pfa.add_argument(
564
671
  "--table",
565
- choices=["claims", "entities", "edges"],
672
+ choices=["claims", "entities", "edges", "reconciliations"],
566
673
  default="claims",
567
674
  help="facts table to append to (default: claims)",
568
675
  )
@@ -36,10 +36,12 @@ _FILES = {
36
36
  "claims": "claims.ndjson",
37
37
  "entities": "entities.ndjson",
38
38
  "edges": "graph.ndjson",
39
+ "reconciliations": "reconciliations.ndjson",
39
40
  }
40
41
 
41
42
  # Fields scrip mints itself; proposing them is a schema error, not a finding.
42
43
  _SCRIP_OWNED = ("claim_id", "anchor", "extracted_at")
44
+ _RECON_OWNED = ("reconciliation_id", "at")
43
45
 
44
46
  _CLAIM_REQUIRED = ("quote", "source_id", "subject", "predicate", "object", "polarity", "confidence")
45
47
  _CLAIM_ALLOWED = frozenset((*_CLAIM_REQUIRED, "claim_text", "tags"))
@@ -47,12 +49,16 @@ _ENTITY_REQUIRED = ("entity_id", "name", "kind")
47
49
  _ENTITY_ALLOWED = frozenset((*_ENTITY_REQUIRED, "tags"))
48
50
  _EDGE_REQUIRED = ("src", "dst", "kind")
49
51
  _EDGE_ALLOWED = frozenset(_EDGE_REQUIRED)
52
+ _DECISIONS = ("supersede", "qualify", "keep-both")
53
+ _RECON_REQUIRED = ("decision", "claim_a", "claim_b")
54
+ _RECON_ALLOWED = frozenset((*_RECON_REQUIRED, "winner", "rationale"))
50
55
 
51
56
  # Same conservative shape ``cli._safe_slug`` enforces — no path separators,
52
57
  # '..', or leading dot — applied to source ids arriving as record *data*.
53
58
  _SLUG_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]*$")
54
59
 
55
60
  _CLAIM_ID_RE = re.compile(r"clm_(\d+)")
61
+ _RECON_ID_RE = re.compile(r"rec_(\d+)")
56
62
 
57
63
 
58
64
  def _now() -> str:
@@ -96,12 +102,18 @@ def _check_tags(rec: dict, index: int) -> None:
96
102
  raise DataError(f"record {index}: 'tags' must be a list of strings")
97
103
 
98
104
 
99
- def _check_shape(rec: dict, index: int, required: tuple[str, ...], allowed: frozenset[str]) -> None:
100
- owned = [k for k in _SCRIP_OWNED if k in rec]
101
- if owned:
105
+ def _check_shape(
106
+ rec: dict,
107
+ index: int,
108
+ required: tuple[str, ...],
109
+ allowed: frozenset[str],
110
+ owned: tuple[str, ...] = _SCRIP_OWNED,
111
+ ) -> None:
112
+ present = [k for k in owned if k in rec]
113
+ if present:
102
114
  raise DataError(
103
- f"record {index}: scrip mints {', '.join(owned)} itself — propose a "
104
- f"verbatim 'quote', not precomputed ids/anchors/timestamps"
115
+ f"record {index}: scrip mints {', '.join(present)} itself — do not "
116
+ f"propose precomputed ids/anchors/timestamps"
105
117
  )
106
118
  unknown = sorted(rec.keys() - allowed)
107
119
  if unknown:
@@ -136,10 +148,27 @@ def _validate(table: str, rec: dict, index: int) -> None:
136
148
  if not (eid.startswith("entity/") and _SLUG_RE.fullmatch(eid[len("entity/") :])):
137
149
  raise DataError(f"record {index}: entity_id must look like entity/<slug>")
138
150
  _check_tags(rec, index)
139
- else: # edges
151
+ elif table == "edges":
140
152
  _check_shape(rec, index, _EDGE_REQUIRED, _EDGE_ALLOWED)
141
153
  for key in _EDGE_REQUIRED:
142
154
  _check_str(rec, key, index)
155
+ else: # reconciliations
156
+ _check_shape(rec, index, _RECON_REQUIRED, _RECON_ALLOWED, owned=_RECON_OWNED)
157
+ for key in ("decision", "claim_a", "claim_b"):
158
+ _check_str(rec, key, index)
159
+ if rec["decision"] not in _DECISIONS:
160
+ raise DataError(f"record {index}: decision must be one of {', '.join(_DECISIONS)}")
161
+ if "rationale" in rec:
162
+ _check_str(rec, "rationale", index, allow_blank=True)
163
+ # winner is required for (and only for) supersede, and must be one of the pair
164
+ if rec["decision"] == "supersede":
165
+ winner = rec.get("winner")
166
+ if winner not in (rec["claim_a"], rec["claim_b"]):
167
+ raise DataError(
168
+ f"record {index}: supersede needs 'winner' = claim_a or claim_b"
169
+ )
170
+ elif "winner" in rec:
171
+ raise DataError(f"record {index}: 'winner' is only valid for decision 'supersede'")
143
172
 
144
173
 
145
174
  # --------------------------------------------------------------------------- #
@@ -218,6 +247,19 @@ def _read_table(path: Path) -> tuple[list[dict], str]:
218
247
  return records, text
219
248
 
220
249
 
250
def claim_source_anchor(root: Path, claim_id: str) -> tuple[str, str]:
    """Look up a claim and return its ``(source_id, anchor)`` pair.

    Used by `scrip span --claim`. The first record in ``claims.ndjson`` whose
    ``claim_id`` equals ``claim_id`` is the one consulted.

    Raises:
        DataError: if no such claim exists, or if its ``source_id`` or
            ``anchor`` is not a string.
    """
    claims, _unused_text = _read_table(facts_dir(root) / "claims.ndjson")
    found = next((c for c in claims if c.get("claim_id") == claim_id), None)
    if found is None:
        raise DataError(f"no such claim: {claim_id}")
    source_id = found.get("source_id")
    anchor = found.get("anchor")
    if isinstance(source_id, str) and isinstance(anchor, str):
        return source_id, anchor
    raise DataError(f"claim {claim_id} is missing source_id/anchor")
261
+
262
+
221
263
  def _claim_key(source_id: str, qh: str, rec: dict) -> tuple:
222
264
  return (
223
265
  source_id,
@@ -251,6 +293,17 @@ def _next_claim_id(existing: list[dict]) -> tuple[int, int]:
251
293
  return highest + 1, max(4, len(str(highest)))
252
294
 
253
295
 
296
def _next_recon_id(existing: list[dict]) -> tuple[int, int]:
    """Return ``(next_number, pad_width)`` continuing the ``rec_NNNN`` sequence.

    Only records whose ``reconciliation_id`` fully matches ``rec_<digits>``
    are counted. With none present the sequence starts at 1. The pad width is
    at least 4, and wider when the highest existing number has more digits.
    """
    highest = 0
    for rec in existing:
        found = _RECON_ID_RE.fullmatch(str(rec.get("reconciliation_id", "")))
        if found is not None:
            highest = max(highest, int(found.group(1)))
    return highest + 1, max(4, len(str(highest)))
305
+
306
+
254
307
  # --------------------------------------------------------------------------- #
255
308
  # facts/_meta.yaml: merge derived-from, never stamp
256
309
  # --------------------------------------------------------------------------- #
@@ -394,7 +447,7 @@ def add(root: Path, table: str, proposals: list[dict]) -> dict:
394
447
  "detail": "an entity with this id already exists with different fields",
395
448
  }
396
449
  )
397
- else: # edges
450
+ elif table == "edges":
398
451
  seen_edges = {
399
452
  (rec.get("src"), rec.get("dst"), rec.get("kind")) for rec in existing
400
453
  }
@@ -405,6 +458,43 @@ def add(root: Path, table: str, proposals: list[dict]) -> dict:
405
458
  continue
406
459
  seen_edges.add(key)
407
460
  appended.append({"src": rec["src"], "dst": rec["dst"], "kind": rec["kind"]})
461
+ else: # reconciliations
462
+ claim_ids = {c.get("claim_id") for c in _read_table(facts_dir(root) / "claims.ndjson")[0]}
463
+ for i, rec in enumerate(proposals):
464
+ refs = [rec["claim_a"], rec["claim_b"]]
465
+ if rec["decision"] == "supersede":
466
+ refs.append(rec["winner"])
467
+ missing = next((r for r in refs if r not in claim_ids), None)
468
+ if missing is not None:
469
+ failures.append({
470
+ "index": i, "status": "MISSING_CLAIM", "claim": missing,
471
+ "detail": f"{missing!r} is not a claim in claims.ndjson",
472
+ })
473
+ if failures:
474
+ return {"table": table, "appended": [], "skipped": [], "failures": failures}
475
+ seen_pairs = {frozenset((r.get("claim_a"), r.get("claim_b"))) for r in existing}
476
+ number, width = _next_recon_id(existing)
477
+ now = _now()
478
+ for i, rec in enumerate(proposals):
479
+ pair = frozenset((rec["claim_a"], rec["claim_b"]))
480
+ if pair in seen_pairs:
481
+ skipped.append({"index": i, "reason": "duplicate", "existing_id": None})
482
+ continue
483
+ seen_pairs.add(pair)
484
+ rid = f"rec_{number:0{width}d}"
485
+ number += 1
486
+ full = {
487
+ "reconciliation_id": rid,
488
+ "decision": rec["decision"],
489
+ "claim_a": rec["claim_a"],
490
+ "claim_b": rec["claim_b"],
491
+ }
492
+ if rec["decision"] == "supersede":
493
+ full["winner"] = rec["winner"]
494
+ if rec.get("rationale"):
495
+ full["rationale"] = rec["rationale"]
496
+ full["at"] = now
497
+ appended.append(full)
408
498
 
409
499
  if failures:
410
500
  return {"table": table, "appended": [], "skipped": skipped, "failures": failures}
@@ -22,15 +22,18 @@ _VIEWS = {
22
22
  "claims": "claims.ndjson",
23
23
  "entities": "entities.ndjson",
24
24
  "edges": "graph.ndjson",
25
+ "reconciliations": "reconciliations.ndjson",
25
26
  }
26
27
 
27
28
  _NAMED = {
28
29
  "claims": "SELECT * FROM claims",
29
30
  "entities": "SELECT * FROM entities",
30
31
  "edges": "SELECT * FROM edges",
32
+ "reconciliations": "SELECT * FROM reconciliations",
31
33
  # contradiction *candidates*: same subject+predicate, opposing polarity,
32
- # from different sources. Detection is deterministic; adjudication is the
33
- # agent's job.
34
+ # from different sources, AND not yet adjudicated (no reconciliation record
35
+ # for the pair, either order) — so RECONCILE makes the set converge.
36
+ # Detection is deterministic; adjudication is the agent's job.
34
37
  "contradictions": """
35
38
  SELECT a.claim_id AS claim_a, b.claim_id AS claim_b,
36
39
  a.subject, a.predicate,
@@ -41,10 +44,15 @@ _NAMED = {
41
44
  WHERE a.polarity = 'asserts'
42
45
  AND b.polarity = 'denies'
43
46
  AND a.source_id <> b.source_id
47
+ AND NOT EXISTS (
48
+ SELECT 1 FROM reconciliations r
49
+ WHERE (r.claim_a = a.claim_id AND r.claim_b = b.claim_id)
50
+ OR (r.claim_a = b.claim_id AND r.claim_b = a.claim_id)
51
+ )
44
52
  """,
45
53
  }
46
54
 
47
- _FILTERABLE = {"claims", "entities", "edges"}
55
+ _FILTERABLE = {"claims", "entities", "edges", "reconciliations"}
48
56
 
49
57
 
50
58
  def _connect(root: Path) -> duckdb.DuckDBPyConnection:
@@ -57,6 +65,16 @@ def _connect(root: Path) -> duckdb.DuckDBPyConnection:
57
65
  f"CREATE VIEW {view} AS "
58
66
  f"SELECT * FROM read_ndjson_auto('{p.as_posix()}')"
59
67
  )
68
+ elif view == "reconciliations":
69
+ # Always present (empty stub) so `contradictions` can anti-join it and
70
+ # raw SQL over its columns works even before any reconciliation exists.
71
+ con.execute(
72
+ "CREATE VIEW reconciliations AS SELECT "
73
+ "NULL::VARCHAR AS reconciliation_id, NULL::VARCHAR AS decision, "
74
+ "NULL::VARCHAR AS claim_a, NULL::VARCHAR AS claim_b, "
75
+ "NULL::VARCHAR AS winner, NULL::VARCHAR AS rationale, "
76
+ "NULL::VARCHAR AS at WHERE FALSE"
77
+ )
60
78
  return con
61
79
 
62
80
 
@@ -0,0 +1,168 @@
1
+ """Deterministic topic-overlap scoring for PROMOTE step 1 (`scrip similar`).
2
+
3
+ Ranks existing wiki pages by how much a proposed topic overlaps each, from three
4
+ file-derived signals:
5
+
6
+ - **title** — Jaccard of normalized title tokens (the §6 normalization).
7
+ - **sources** — Jaccard of `derived-from` source ids (block suffix stripped).
8
+ - **tags** — Jaccard of tag sets. Pages carry no `tags` frontmatter (SPEC §4),
9
+ so a page's tags are *derived*: the union of `tags` over claims whose
10
+ `source_id` is one of the page's sources.
11
+
12
+ `combined` is a weighted sum (sources dominates — shared sources is the strongest
13
+ same-topic signal). This is **purely informational**: it reports scores and
14
+ leaves the High/Middle/Low merge decision of AGENT.md PROMOTE to the caller,
15
+ exactly as `query contradictions` leaves adjudication to the agent. No lock, no
16
+ model, no DuckDB.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ from collections.abc import Iterable, Mapping
23
+ from pathlib import Path
24
+
25
+ from . import facts_dir, frontmatter
26
+ from .errors import DataError
27
+ from .graph import scan_derived
28
+ from .hashing import normalize
29
+
30
+ DEFAULT_WEIGHTS = {"title": 0.25, "sources": 0.5, "tags": 0.25}
31
+
32
+
33
def _tokens(title: str) -> set[str]:
    """Return the set of whitespace-separated tokens of the normalized title."""
    words = normalize(title).split()
    return set(words)
35
+
36
+
37
+ def _strip_block(dep: str) -> str:
38
+ """`raw/x#b3` -> `raw/x` (block-scoped deps share their whole source)."""
39
+ return dep.split("#", 1)[0]
40
+
41
+
42
def _source_set(derived_from: Iterable[str]) -> set[str]:
    """Collapse `derived-from` entries to the set of their source ids
    (each entry's block suffix is stripped first)."""
    return set(map(_strip_block, derived_from))
44
+
45
+
46
+ def _jaccard(a: set[str], b: set[str]) -> float:
47
+ if not a and not b:
48
+ return 0.0
49
+ return len(a & b) / len(a | b)
50
+
51
+
52
def _source_tags(root: Path) -> dict[str, set[str]]:
    """Map each `source_id` to the union of `tags` over its claims.

    Built once per run from facts/claims.ndjson, parsed directly (no DuckDB
    dependency). Blank lines are skipped; a missing file yields an empty map.
    Claims with no `tags` key (or a null value) contribute nothing, so a source
    whose claims are all untagged does not appear in the result.

    Raises:
        DataError: on a line that is not valid JSON, is not a JSON object, has
            a non-string `source_id`, or has `tags` that is not a list of
            strings. The message carries the 1-based line number.
    """
    claims_path = facts_dir(root) / "claims.ndjson"
    by_source: dict[str, set[str]] = {}
    if not claims_path.exists():
        return by_source

    content = claims_path.read_text(encoding="utf-8")
    for lineno, raw_line in enumerate(content.splitlines(), start=1):
        stripped = raw_line.strip()
        if stripped == "":
            continue
        try:
            record = json.loads(stripped)
        except json.JSONDecodeError as e:
            raise DataError(f"claims.ndjson:{lineno}: invalid JSON: {e}") from e
        if not isinstance(record, dict):
            raise DataError(f"claims.ndjson:{lineno}: expected a JSON object")

        source_id = record.get("source_id")
        if not isinstance(source_id, str):
            raise DataError(f"claims.ndjson:{lineno}: 'source_id' must be a string")

        tags = record.get("tags")
        if tags is None:
            continue
        well_formed = isinstance(tags, list) and all(isinstance(t, str) for t in tags)
        if not well_formed:
            raise DataError(f"claims.ndjson:{lineno}: 'tags' must be a list of strings")
        by_source.setdefault(source_id, set()).update(tags)
    return by_source
79
+
80
+
81
+ def _page_tags(sources: set[str], source_tags: Mapping[str, set[str]]) -> set[str]:
82
+ out: set[str] = set()
83
+ for s in sources:
84
+ out |= source_tags.get(s, set())
85
+ return out
86
+
87
+
88
def compute_similar(
    root: str | Path,
    *,
    title: str,
    sources: Iterable[str],
    kind: str = "concept",
    exclude: Iterable[str] | None = None,
    top: int | None = None,
    weights: Mapping[str, float] | None = None,
) -> dict:
    """Score existing `kind` wiki pages against the proposed (title, sources).

    Args:
        root: instance root; coerced to ``Path``.
        title: proposed page title, tokenized for the title score.
        sources: source ids the proposed page would derive from. Any iterable
            is accepted, including a one-shot iterator.
        kind: only pages whose type is ``wiki.<kind>`` are scored.
        exclude: page ids to leave out of the ranking.
        top: when given, keep only the first ``top`` candidates after sorting
            (callers should pass a non-negative value).
        weights: per-signal weights with keys ``title``, ``sources`` and
            ``tags``; ``DEFAULT_WEIGHTS`` is used when omitted or empty.

    Returns:
        ``{proposed, weights, candidates}`` with candidates sorted by
        ``combined`` desc then id asc, truncated to ``top``. Each candidate
        carries its rounded per-signal ``scores`` and the ``shared`` sources
        and tags.
    """
    root = Path(root)
    w = dict(weights or DEFAULT_WEIGHTS)
    skip = set(exclude or ())
    # Materialize once: `sources` is needed both for scoring and for the echo
    # in the result, and a one-shot iterator would be empty the second time.
    source_list = list(sources)
    prop_sources = _source_set(source_list)
    prop_tokens = _tokens(title)
    source_tags = _source_tags(root)
    prop_tags = _page_tags(prop_sources, source_tags)

    want_type = f"wiki.{kind}"
    candidates: list[dict] = []
    for cid, d in scan_derived(root).items():
        if d.get("type") != want_type or cid in skip:
            continue  # other-kind pages and the facts.set row are dropped here
        c_sources = _source_set(d["derived_from"])
        meta, _ = frontmatter.load(root / d["path"])
        # A page with no frontmatter or no title scores as an empty title.
        c_title = (meta.get("title") if meta else "") or ""
        c_tags = _page_tags(c_sources, source_tags)

        title_s = _jaccard(prop_tokens, _tokens(c_title))
        sources_s = _jaccard(prop_sources, c_sources)
        tags_s = _jaccard(prop_tags, c_tags)
        combined = w["title"] * title_s + w["sources"] * sources_s + w["tags"] * tags_s
        candidates.append(
            {
                "id": cid,
                "title": c_title,
                "path": d["path"],
                "kind": kind,
                "scores": {
                    "title": round(title_s, 6),
                    "sources": round(sources_s, 6),
                    "tags": round(tags_s, 6),
                    "combined": round(combined, 6),
                },
                "shared": {
                    "sources": sorted(prop_sources & c_sources),
                    "tags": sorted(prop_tags & c_tags),
                },
            }
        )

    # Highest combined score first; ties broken by id for a stable order.
    candidates.sort(key=lambda c: (-c["scores"]["combined"], c["id"]))
    if top is not None:
        candidates = candidates[:top]
    return {
        "proposed": {"title": title, "derived_from": source_list, "kind": kind},
        "weights": w,
        "candidates": candidates,
    }
152
+
153
+
154
def print_similar(result: dict) -> None:
    """Print a ``compute_similar`` result as a human-readable listing.

    Emits a header describing the proposed page, then either a notice that
    there is nothing to compare against, or two lines per candidate (combined
    score with id and title, then the per-signal scores and shared counts)
    followed by a candidate count.
    """
    proposed = result["proposed"]
    print(
        f'proposed: "{proposed["title"]}" '
        f'({proposed["kind"]}, from {len(proposed["derived_from"])} source(s))'
    )
    candidates = result["candidates"]
    if candidates:
        for cand in candidates:
            scores = cand["scores"]
            print(f' {scores["combined"]:.3f} {cand["id"]} "{cand["title"]}"')
            print(
                f' sources {scores["sources"]:.2f} tags {scores["tags"]:.2f} title {scores["title"]:.2f}'
                f' shared sources: {len(cand["shared"]["sources"])}, tags: {len(cand["shared"]["tags"])}'
            )
        print(f"({len(candidates)} candidate(s))")
    else:
        print(f"no existing {proposed['kind']} pages to compare.")
@@ -70,6 +70,8 @@ class KB:
70
70
  *,
71
71
  stamp: bool = True,
72
72
  body: str = "Body.\n",
73
+ title: str | None = None,
74
+ kind: str = "concept",
73
75
  ) -> str:
74
76
  deps = {
75
77
  sid: h
@@ -77,16 +79,18 @@ class KB:
77
79
  if (h := self._dep_hash(sid)) is not None
78
80
  }
79
81
  meta: dict = {
80
- "id": f"concept/{slug}",
81
- "type": "wiki.concept",
82
- "title": slug,
82
+ "id": f"{kind}/{slug}",
83
+ "type": f"wiki.{kind}",
84
+ "title": title or slug,
83
85
  "derived-from": list(derived_from),
84
86
  }
85
87
  if stamp:
86
88
  meta["input-hash"] = hashing.input_hash(deps)
87
89
  meta["last-compiled"] = "2026-01-01T00:00:00Z"
88
90
  meta["confidence"] = 0.9
89
- path = self.root / "vault" / "wiki" / "concepts" / f"{slug}.md"
91
+ subdir = "concepts" if kind == "concept" else "entities"
92
+ path = self.root / "vault" / "wiki" / subdir / f"{slug}.md"
93
+ path.parent.mkdir(parents=True, exist_ok=True)
90
94
  path.write_text(frontmatter.dump(meta, body), encoding="utf-8")
91
95
  return meta["id"]
92
96