scriptoria 0.3.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scriptoria-0.3.0 → scriptoria-0.5.0}/PKG-INFO +1 -1
- {scriptoria-0.3.0 → scriptoria-0.5.0}/pyproject.toml +1 -1
- {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/__init__.py +1 -1
- {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/anchors.py +27 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/cli.py +110 -3
- {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/facts.py +97 -7
- {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/query.py +21 -3
- scriptoria-0.5.0/src/scrip/similar.py +168 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/conftest.py +8 -4
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_fact_cmd.py +116 -0
- scriptoria-0.5.0/tests/test_query.py +109 -0
- scriptoria-0.5.0/tests/test_similar_cmd.py +234 -0
- scriptoria-0.5.0/tests/test_span_cmd.py +86 -0
- scriptoria-0.5.0/tests/test_version.py +11 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/uv.lock +1 -1
- scriptoria-0.3.0/tests/test_query.py +0 -58
- {scriptoria-0.3.0 → scriptoria-0.5.0}/.gitignore +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/README.md +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/pyrightconfig.json +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/blocks.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/embeddings.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/errors.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/frontmatter.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/graph.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/hashing.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/ingest.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/lock.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/manifest.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/src/scrip/retrieval.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_anchor_cmd.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_anchors.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_blocks.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_embeddings.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_graph_status.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_hashing.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_index_cmd.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_ingest.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_json_shapes.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_lock.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_manifest.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_new_cmd.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_retrieval.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_stamp.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_status_cmd.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_unlock_cmd.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_verify.py +0 -0
- {scriptoria-0.3.0 → scriptoria-0.5.0}/tests/test_watch.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scriptoria
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Deterministic scriptorium-keeper (the `scrip` CLI): staleness, provenance integrity, and fact queries for an agent-compiled knowledge base
|
|
5
5
|
Project-URL: Homepage, https://github.com/coredipper/scriptorium
|
|
6
6
|
Project-URL: Changelog, https://github.com/coredipper/scriptorium/blob/main/CHANGELOG.md
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
# Distribution name on PyPI is `scriptoria` (scrip/scriptorium were taken); the
|
|
3
3
|
# CLI command and the import package both remain `scrip`.
|
|
4
4
|
name = "scriptoria"
|
|
5
|
-
version = "0.3.0"
|
|
5
|
+
version = "0.5.0"
|
|
6
6
|
description = "Deterministic scriptorium-keeper (the `scrip` CLI): staleness, provenance integrity, and fact queries for an agent-compiled knowledge base"
|
|
7
7
|
readme = "README.md"
|
|
8
8
|
requires-python = ">=3.10"
|
|
@@ -13,7 +13,7 @@ from __future__ import annotations
|
|
|
13
13
|
|
|
14
14
|
from pathlib import Path
|
|
15
15
|
|
|
16
|
-
__version__ = "0.
|
|
16
|
+
__version__ = "0.5.0"
|
|
17
17
|
|
|
18
18
|
# --- canonical vault layout ------------------------------------------------
|
|
19
19
|
# ``root`` is the repo/instance root: the directory containing ``vault/``.
|
|
@@ -90,6 +90,33 @@ def resolve(source_text: str, anchor: str) -> str:
|
|
|
90
90
|
return "OK" if matches == 1 else "AMBIGUOUS"
|
|
91
91
|
|
|
92
92
|
|
|
93
|
+
def span(source_text: str, anchor: str) -> tuple[str, str | None]:
    """Resolve ``anchor`` in ``source_text`` and return ``(status, cited_text)``.

    The source is normalized first, then every window of the anchor's length
    is hashed (SHA-256 of its UTF-8 bytes) and compared with the anchor's
    quote hash.

    Returns:
        ``("OK", text)`` when exactly one window matches,
        ``("AMBIGUOUS", text)`` when several match (``text`` is the window
        whose start is closest to ``loc * len(normalized_source)``), and
        ``("BROKEN", None)`` when the length is out of range or nothing
        matches. ``text`` is always a slice of the *normalized* source.
    """
    parsed = parse_anchor(anchor)
    normalized = normalize(source_text)
    window, digest, hint = parsed["len"], parsed["qh"], parsed["loc"]
    total = len(normalized)
    if not 0 < window <= total:
        return "BROKEN", None

    matches: list[int] = []
    for offset in range(total - window + 1):
        candidate = normalized[offset : offset + window]
        if hashlib.sha256(candidate.encode("utf-8")).hexdigest() == digest:
            matches.append(offset)

    if not matches:
        return "BROKEN", None
    if len(matches) > 1:
        # Several identical quotes: prefer the one nearest the location hint.
        begin = min(matches, key=lambda pos: abs(pos - hint * total))
        return "AMBIGUOUS", normalized[begin : begin + window]
    begin = matches[0]
    return "OK", normalized[begin : begin + window]
|
|
118
|
+
|
|
119
|
+
|
|
93
120
|
# --------------------------------------------------------------------------- #
|
|
94
121
|
# Vault-wide verification
|
|
95
122
|
# --------------------------------------------------------------------------- #
|
|
@@ -336,6 +336,67 @@ def cmd_new(args: argparse.Namespace) -> int:
|
|
|
336
336
|
return 0
|
|
337
337
|
|
|
338
338
|
|
|
339
|
+
def _parse_source_ids(raw: str) -> list[str]:
    """Turn a comma-separated `--from` value into a list of `raw/...` source ids.

    Blank items are ignored and a missing `raw/` prefix is added. The slug of
    each id (the text after `raw/` and before any `#`) is passed through
    `_safe_slug`, but the source is NOT required to exist: scoring a proposed
    topic whose sources are not ingested yet is legitimate.

    Raises `errors.UsageError` when no id remains after dropping blanks.
    """
    prefix = "raw/"
    parsed: list[str] = []
    for chunk in raw.split(","):
        token = chunk.strip()
        if token:
            source_id = token if token.startswith(prefix) else f"raw/{token}"
            slug = source_id.split("#", 1)[0][len(prefix) :]
            _safe_slug(slug, "source")
            parsed.append(source_id)
    if parsed:
        return parsed
    raise errors.UsageError("--from requires at least one source id")
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
def cmd_span(args: argparse.Namespace) -> int:
    """Handle `scrip span`: resolve an anchor and show the text it cites.

    The anchor comes either from `--claim ID` (looked up through
    `facts.claim_source_anchor`) or from a positional `raw/<slug>#<anchor>`
    target, whose `raw/` prefix is optional and whose slug is checked with
    `_safe_slug`.

    Output is a JSON object (`target`, `status`, `text`) with `--json`,
    otherwise a `[STATUS] source_id` line followed by the cited text when there
    is one. Returns 0 only when the status is "OK", else 1.
    """
    from . import anchors

    root = resolve_root(args.root)
    if not args.claim:
        if "#" not in args.target:
            raise errors.UsageError("target must be raw/<slug>#<anchor>")
        source_id, anchor = args.target.split("#", 1)
        if not source_id.startswith("raw/"):
            source_id = f"raw/{source_id}"
        _safe_slug(source_id[len("raw/") :], "source")
    else:
        from . import facts

        source_id, anchor = facts.claim_source_anchor(root, args.claim)

    status, cited = anchors.span(anchors.source_text(root, source_id), anchor)

    if args.json:
        _emit({"target": f"{source_id}#{anchor}", "status": status, "text": cited})
        return 0 if status == "OK" else 1

    print(f"[{status}] {source_id}")
    if cited is not None:
        print(cited)
    return 0 if status == "OK" else 1
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def cmd_similar(args: argparse.Namespace) -> int:
    """Handle `scrip similar`: score existing pages against a proposed page.

    Parses `--from` into source ids, delegates scoring to
    `similar.compute_similar`, then emits the result as JSON (`--json`) or as
    the human-readable listing from `similar.print_similar`. Always returns 0
    on success.

    Raises `errors.UsageError` for a negative `--top`, or (via
    `_parse_source_ids`) when `--from` contains no source id.
    """
    from . import similar

    # A negative N would reach a slice (`candidates[:N]`) and silently drop the
    # lowest-ranked entries instead of limiting the list — reject it up front.
    if args.top is not None and args.top < 0:
        raise errors.UsageError("--top must be a non-negative integer")

    root = resolve_root(args.root)
    sources = _parse_source_ids(args.sources)
    result = similar.compute_similar(
        root,
        title=args.title,
        sources=sources,
        kind=args.kind,
        exclude=set(args.exclude),
        top=args.top,
    )
    if args.json:
        _emit(result)
    else:
        similar.print_similar(result)
    return 0
|
|
398
|
+
|
|
399
|
+
|
|
339
400
|
def cmd_fact_add(args: argparse.Namespace) -> int:
|
|
340
401
|
from . import facts
|
|
341
402
|
|
|
@@ -352,7 +413,12 @@ def cmd_fact_add(args: argparse.Namespace) -> int:
|
|
|
352
413
|
_emit(result)
|
|
353
414
|
else:
|
|
354
415
|
for r in result["appended"]:
|
|
355
|
-
ident =
|
|
416
|
+
ident = (
|
|
417
|
+
r.get("claim_id")
|
|
418
|
+
or r.get("entity_id")
|
|
419
|
+
or r.get("reconciliation_id")
|
|
420
|
+
or f"{r.get('src')} -> {r.get('dst')}"
|
|
421
|
+
)
|
|
356
422
|
print(f" appended {ident}")
|
|
357
423
|
for s in result["skipped"]:
|
|
358
424
|
print(f" = record {s['index']} skipped (duplicate)")
|
|
@@ -481,7 +547,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
481
547
|
pq.add_argument(
|
|
482
548
|
"name",
|
|
483
549
|
nargs="?",
|
|
484
|
-
choices=["claims", "entities", "edges", "contradictions"],
|
|
550
|
+
choices=["claims", "entities", "edges", "contradictions", "reconciliations"],
|
|
485
551
|
help="a named query (omit when using --sql)",
|
|
486
552
|
)
|
|
487
553
|
pq.add_argument("--sql", help="raw DuckDB SQL (views: claims, entities, edges)")
|
|
@@ -549,6 +615,47 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
549
615
|
pn.add_argument("--title", help="human title (default: the slug)")
|
|
550
616
|
pn.set_defaults(func=cmd_new)
|
|
551
617
|
|
|
618
|
+
psp = sub.add_parser(
|
|
619
|
+
"span",
|
|
620
|
+
parents=[common],
|
|
621
|
+
help="resolve an anchor and print the cited text (read both sides of a contradiction)",
|
|
622
|
+
)
|
|
623
|
+
span_src = psp.add_mutually_exclusive_group(required=True)
|
|
624
|
+
span_src.add_argument("target", nargs="?", metavar="raw/<slug>#<anchor>", help="anchor target")
|
|
625
|
+
span_src.add_argument("--claim", metavar="ID", help="resolve this claim's anchor instead")
|
|
626
|
+
psp.set_defaults(func=cmd_span)
|
|
627
|
+
|
|
628
|
+
psim = sub.add_parser(
|
|
629
|
+
"similar",
|
|
630
|
+
parents=[common],
|
|
631
|
+
help="score existing wiki pages by topic overlap with a proposed page (PROMOTE step 1)",
|
|
632
|
+
)
|
|
633
|
+
psim.add_argument(
|
|
634
|
+
"--title", required=True, help="proposed page title (tokenized for title overlap)"
|
|
635
|
+
)
|
|
636
|
+
psim.add_argument(
|
|
637
|
+
"--from",
|
|
638
|
+
dest="sources",
|
|
639
|
+
required=True,
|
|
640
|
+
metavar="raw/a,raw/b",
|
|
641
|
+
help="comma-separated source ids the proposed page would derive from",
|
|
642
|
+
)
|
|
643
|
+
psim.add_argument(
|
|
644
|
+
"--kind",
|
|
645
|
+
choices=["concept", "entity"],
|
|
646
|
+
default="concept",
|
|
647
|
+
help="score only candidates of this kind (default: concept)",
|
|
648
|
+
)
|
|
649
|
+
psim.add_argument(
|
|
650
|
+
"--exclude",
|
|
651
|
+
metavar="ID",
|
|
652
|
+
action="append",
|
|
653
|
+
default=[],
|
|
654
|
+
help="page id to skip (repeatable); use when re-scoring an existing page",
|
|
655
|
+
)
|
|
656
|
+
psim.add_argument("--top", type=int, metavar="N", help="limit to the N highest-scoring candidates")
|
|
657
|
+
psim.set_defaults(func=cmd_similar)
|
|
658
|
+
|
|
552
659
|
pfact = sub.add_parser(
|
|
553
660
|
"fact",
|
|
554
661
|
help="validated writers for the facts/ layer (claims mint verified anchors)",
|
|
@@ -562,7 +669,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
562
669
|
)
|
|
563
670
|
pfa.add_argument(
|
|
564
671
|
"--table",
|
|
565
|
-
choices=["claims", "entities", "edges"],
|
|
672
|
+
choices=["claims", "entities", "edges", "reconciliations"],
|
|
566
673
|
default="claims",
|
|
567
674
|
help="facts table to append to (default: claims)",
|
|
568
675
|
)
|
|
@@ -36,10 +36,12 @@ _FILES = {
|
|
|
36
36
|
"claims": "claims.ndjson",
|
|
37
37
|
"entities": "entities.ndjson",
|
|
38
38
|
"edges": "graph.ndjson",
|
|
39
|
+
"reconciliations": "reconciliations.ndjson",
|
|
39
40
|
}
|
|
40
41
|
|
|
41
42
|
# Fields scrip mints itself; proposing them is a schema error, not a finding.
|
|
42
43
|
_SCRIP_OWNED = ("claim_id", "anchor", "extracted_at")
|
|
44
|
+
_RECON_OWNED = ("reconciliation_id", "at")
|
|
43
45
|
|
|
44
46
|
_CLAIM_REQUIRED = ("quote", "source_id", "subject", "predicate", "object", "polarity", "confidence")
|
|
45
47
|
_CLAIM_ALLOWED = frozenset((*_CLAIM_REQUIRED, "claim_text", "tags"))
|
|
@@ -47,12 +49,16 @@ _ENTITY_REQUIRED = ("entity_id", "name", "kind")
|
|
|
47
49
|
_ENTITY_ALLOWED = frozenset((*_ENTITY_REQUIRED, "tags"))
|
|
48
50
|
_EDGE_REQUIRED = ("src", "dst", "kind")
|
|
49
51
|
_EDGE_ALLOWED = frozenset(_EDGE_REQUIRED)
|
|
52
|
+
_DECISIONS = ("supersede", "qualify", "keep-both")
|
|
53
|
+
_RECON_REQUIRED = ("decision", "claim_a", "claim_b")
|
|
54
|
+
_RECON_ALLOWED = frozenset((*_RECON_REQUIRED, "winner", "rationale"))
|
|
50
55
|
|
|
51
56
|
# Same conservative shape ``cli._safe_slug`` enforces — no path separators,
|
|
52
57
|
# '..', or leading dot — applied to source ids arriving as record *data*.
|
|
53
58
|
_SLUG_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]*$")
|
|
54
59
|
|
|
55
60
|
_CLAIM_ID_RE = re.compile(r"clm_(\d+)")
|
|
61
|
+
_RECON_ID_RE = re.compile(r"rec_(\d+)")
|
|
56
62
|
|
|
57
63
|
|
|
58
64
|
def _now() -> str:
|
|
@@ -96,12 +102,18 @@ def _check_tags(rec: dict, index: int) -> None:
|
|
|
96
102
|
raise DataError(f"record {index}: 'tags' must be a list of strings")
|
|
97
103
|
|
|
98
104
|
|
|
99
|
-
def _check_shape(
|
|
100
|
-
|
|
101
|
-
|
|
105
|
+
def _check_shape(
|
|
106
|
+
rec: dict,
|
|
107
|
+
index: int,
|
|
108
|
+
required: tuple[str, ...],
|
|
109
|
+
allowed: frozenset[str],
|
|
110
|
+
owned: tuple[str, ...] = _SCRIP_OWNED,
|
|
111
|
+
) -> None:
|
|
112
|
+
present = [k for k in owned if k in rec]
|
|
113
|
+
if present:
|
|
102
114
|
raise DataError(
|
|
103
|
-
f"record {index}: scrip mints {', '.join(
|
|
104
|
-
f"
|
|
115
|
+
f"record {index}: scrip mints {', '.join(present)} itself — do not "
|
|
116
|
+
f"propose precomputed ids/anchors/timestamps"
|
|
105
117
|
)
|
|
106
118
|
unknown = sorted(rec.keys() - allowed)
|
|
107
119
|
if unknown:
|
|
@@ -136,10 +148,27 @@ def _validate(table: str, rec: dict, index: int) -> None:
|
|
|
136
148
|
if not (eid.startswith("entity/") and _SLUG_RE.fullmatch(eid[len("entity/") :])):
|
|
137
149
|
raise DataError(f"record {index}: entity_id must look like entity/<slug>")
|
|
138
150
|
_check_tags(rec, index)
|
|
139
|
-
|
|
151
|
+
elif table == "edges":
|
|
140
152
|
_check_shape(rec, index, _EDGE_REQUIRED, _EDGE_ALLOWED)
|
|
141
153
|
for key in _EDGE_REQUIRED:
|
|
142
154
|
_check_str(rec, key, index)
|
|
155
|
+
else: # reconciliations
|
|
156
|
+
_check_shape(rec, index, _RECON_REQUIRED, _RECON_ALLOWED, owned=_RECON_OWNED)
|
|
157
|
+
for key in ("decision", "claim_a", "claim_b"):
|
|
158
|
+
_check_str(rec, key, index)
|
|
159
|
+
if rec["decision"] not in _DECISIONS:
|
|
160
|
+
raise DataError(f"record {index}: decision must be one of {', '.join(_DECISIONS)}")
|
|
161
|
+
if "rationale" in rec:
|
|
162
|
+
_check_str(rec, "rationale", index, allow_blank=True)
|
|
163
|
+
# winner is required for (and only for) supersede, and must be one of the pair
|
|
164
|
+
if rec["decision"] == "supersede":
|
|
165
|
+
winner = rec.get("winner")
|
|
166
|
+
if winner not in (rec["claim_a"], rec["claim_b"]):
|
|
167
|
+
raise DataError(
|
|
168
|
+
f"record {index}: supersede needs 'winner' = claim_a or claim_b"
|
|
169
|
+
)
|
|
170
|
+
elif "winner" in rec:
|
|
171
|
+
raise DataError(f"record {index}: 'winner' is only valid for decision 'supersede'")
|
|
143
172
|
|
|
144
173
|
|
|
145
174
|
# --------------------------------------------------------------------------- #
|
|
@@ -218,6 +247,19 @@ def _read_table(path: Path) -> tuple[list[dict], str]:
|
|
|
218
247
|
return records, text
|
|
219
248
|
|
|
220
249
|
|
|
250
|
+
def claim_source_anchor(root: Path, claim_id: str) -> tuple[str, str]:
    """Look up a claim in ``claims.ndjson`` and return its ``(source_id, anchor)``.

    Used by `scrip span --claim`. Only the first record whose ``claim_id``
    equals ``claim_id`` is considered.

    Raises :class:`DataError` when no record matches, or when the matching
    record's ``source_id`` or ``anchor`` is not a string.
    """
    records, _ = _read_table(facts_dir(root) / "claims.ndjson")
    found = next((rec for rec in records if rec.get("claim_id") == claim_id), None)
    if found is None:
        raise DataError(f"no such claim: {claim_id}")
    source_id = found.get("source_id")
    anchor = found.get("anchor")
    if isinstance(source_id, str) and isinstance(anchor, str):
        return source_id, anchor
    raise DataError(f"claim {claim_id} is missing source_id/anchor")
|
|
261
|
+
|
|
262
|
+
|
|
221
263
|
def _claim_key(source_id: str, qh: str, rec: dict) -> tuple:
|
|
222
264
|
return (
|
|
223
265
|
source_id,
|
|
@@ -251,6 +293,17 @@ def _next_claim_id(existing: list[dict]) -> tuple[int, int]:
|
|
|
251
293
|
return highest + 1, max(4, len(str(highest)))
|
|
252
294
|
|
|
253
295
|
|
|
296
|
+
def _next_recon_id(existing: list[dict]) -> tuple[int, int]:
    """Return ``(next_number, pad_width)`` for the next ``rec_NNNN`` id.

    ``next_number`` is one more than the largest number found among
    ``reconciliation_id`` values that fully match ``_RECON_ID_RE`` (1 when
    there are none). ``pad_width`` is the digit count of that largest number,
    never less than 4.
    """
    highest = 0
    for rec in existing:
        hit = _RECON_ID_RE.fullmatch(str(rec.get("reconciliation_id", "")))
        if hit:
            highest = max(highest, int(hit.group(1)))
    width = max(4, len(str(highest)))
    return highest + 1, width
|
|
305
|
+
|
|
306
|
+
|
|
254
307
|
# --------------------------------------------------------------------------- #
|
|
255
308
|
# facts/_meta.yaml: merge derived-from, never stamp
|
|
256
309
|
# --------------------------------------------------------------------------- #
|
|
@@ -394,7 +447,7 @@ def add(root: Path, table: str, proposals: list[dict]) -> dict:
|
|
|
394
447
|
"detail": "an entity with this id already exists with different fields",
|
|
395
448
|
}
|
|
396
449
|
)
|
|
397
|
-
|
|
450
|
+
elif table == "edges":
|
|
398
451
|
seen_edges = {
|
|
399
452
|
(rec.get("src"), rec.get("dst"), rec.get("kind")) for rec in existing
|
|
400
453
|
}
|
|
@@ -405,6 +458,43 @@ def add(root: Path, table: str, proposals: list[dict]) -> dict:
|
|
|
405
458
|
continue
|
|
406
459
|
seen_edges.add(key)
|
|
407
460
|
appended.append({"src": rec["src"], "dst": rec["dst"], "kind": rec["kind"]})
|
|
461
|
+
else: # reconciliations
|
|
462
|
+
claim_ids = {c.get("claim_id") for c in _read_table(facts_dir(root) / "claims.ndjson")[0]}
|
|
463
|
+
for i, rec in enumerate(proposals):
|
|
464
|
+
refs = [rec["claim_a"], rec["claim_b"]]
|
|
465
|
+
if rec["decision"] == "supersede":
|
|
466
|
+
refs.append(rec["winner"])
|
|
467
|
+
missing = next((r for r in refs if r not in claim_ids), None)
|
|
468
|
+
if missing is not None:
|
|
469
|
+
failures.append({
|
|
470
|
+
"index": i, "status": "MISSING_CLAIM", "claim": missing,
|
|
471
|
+
"detail": f"{missing!r} is not a claim in claims.ndjson",
|
|
472
|
+
})
|
|
473
|
+
if failures:
|
|
474
|
+
return {"table": table, "appended": [], "skipped": [], "failures": failures}
|
|
475
|
+
seen_pairs = {frozenset((r.get("claim_a"), r.get("claim_b"))) for r in existing}
|
|
476
|
+
number, width = _next_recon_id(existing)
|
|
477
|
+
now = _now()
|
|
478
|
+
for i, rec in enumerate(proposals):
|
|
479
|
+
pair = frozenset((rec["claim_a"], rec["claim_b"]))
|
|
480
|
+
if pair in seen_pairs:
|
|
481
|
+
skipped.append({"index": i, "reason": "duplicate", "existing_id": None})
|
|
482
|
+
continue
|
|
483
|
+
seen_pairs.add(pair)
|
|
484
|
+
rid = f"rec_{number:0{width}d}"
|
|
485
|
+
number += 1
|
|
486
|
+
full = {
|
|
487
|
+
"reconciliation_id": rid,
|
|
488
|
+
"decision": rec["decision"],
|
|
489
|
+
"claim_a": rec["claim_a"],
|
|
490
|
+
"claim_b": rec["claim_b"],
|
|
491
|
+
}
|
|
492
|
+
if rec["decision"] == "supersede":
|
|
493
|
+
full["winner"] = rec["winner"]
|
|
494
|
+
if rec.get("rationale"):
|
|
495
|
+
full["rationale"] = rec["rationale"]
|
|
496
|
+
full["at"] = now
|
|
497
|
+
appended.append(full)
|
|
408
498
|
|
|
409
499
|
if failures:
|
|
410
500
|
return {"table": table, "appended": [], "skipped": skipped, "failures": failures}
|
|
@@ -22,15 +22,18 @@ _VIEWS = {
|
|
|
22
22
|
"claims": "claims.ndjson",
|
|
23
23
|
"entities": "entities.ndjson",
|
|
24
24
|
"edges": "graph.ndjson",
|
|
25
|
+
"reconciliations": "reconciliations.ndjson",
|
|
25
26
|
}
|
|
26
27
|
|
|
27
28
|
_NAMED = {
|
|
28
29
|
"claims": "SELECT * FROM claims",
|
|
29
30
|
"entities": "SELECT * FROM entities",
|
|
30
31
|
"edges": "SELECT * FROM edges",
|
|
32
|
+
"reconciliations": "SELECT * FROM reconciliations",
|
|
31
33
|
# contradiction *candidates*: same subject+predicate, opposing polarity,
|
|
32
|
-
# from different sources
|
|
33
|
-
#
|
|
34
|
+
# from different sources, AND not yet adjudicated (no reconciliation record
|
|
35
|
+
# for the pair, either order) — so RECONCILE makes the set converge.
|
|
36
|
+
# Detection is deterministic; adjudication is the agent's job.
|
|
34
37
|
"contradictions": """
|
|
35
38
|
SELECT a.claim_id AS claim_a, b.claim_id AS claim_b,
|
|
36
39
|
a.subject, a.predicate,
|
|
@@ -41,10 +44,15 @@ _NAMED = {
|
|
|
41
44
|
WHERE a.polarity = 'asserts'
|
|
42
45
|
AND b.polarity = 'denies'
|
|
43
46
|
AND a.source_id <> b.source_id
|
|
47
|
+
AND NOT EXISTS (
|
|
48
|
+
SELECT 1 FROM reconciliations r
|
|
49
|
+
WHERE (r.claim_a = a.claim_id AND r.claim_b = b.claim_id)
|
|
50
|
+
OR (r.claim_a = b.claim_id AND r.claim_b = a.claim_id)
|
|
51
|
+
)
|
|
44
52
|
""",
|
|
45
53
|
}
|
|
46
54
|
|
|
47
|
-
_FILTERABLE = {"claims", "entities", "edges"}
|
|
55
|
+
_FILTERABLE = {"claims", "entities", "edges", "reconciliations"}
|
|
48
56
|
|
|
49
57
|
|
|
50
58
|
def _connect(root: Path) -> duckdb.DuckDBPyConnection:
|
|
@@ -57,6 +65,16 @@ def _connect(root: Path) -> duckdb.DuckDBPyConnection:
|
|
|
57
65
|
f"CREATE VIEW {view} AS "
|
|
58
66
|
f"SELECT * FROM read_ndjson_auto('{p.as_posix()}')"
|
|
59
67
|
)
|
|
68
|
+
elif view == "reconciliations":
|
|
69
|
+
# Always present (empty stub) so `contradictions` can anti-join it and
|
|
70
|
+
# raw SQL over its columns works even before any reconciliation exists.
|
|
71
|
+
con.execute(
|
|
72
|
+
"CREATE VIEW reconciliations AS SELECT "
|
|
73
|
+
"NULL::VARCHAR AS reconciliation_id, NULL::VARCHAR AS decision, "
|
|
74
|
+
"NULL::VARCHAR AS claim_a, NULL::VARCHAR AS claim_b, "
|
|
75
|
+
"NULL::VARCHAR AS winner, NULL::VARCHAR AS rationale, "
|
|
76
|
+
"NULL::VARCHAR AS at WHERE FALSE"
|
|
77
|
+
)
|
|
60
78
|
return con
|
|
61
79
|
|
|
62
80
|
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""Deterministic topic-overlap scoring for PROMOTE step 1 (`scrip similar`).
|
|
2
|
+
|
|
3
|
+
Ranks existing wiki pages by how much a proposed topic overlaps each, from three
|
|
4
|
+
file-derived signals:
|
|
5
|
+
|
|
6
|
+
- **title** — Jaccard of normalized title tokens (the §6 normalization).
|
|
7
|
+
- **sources** — Jaccard of `derived-from` source ids (block suffix stripped).
|
|
8
|
+
- **tags** — Jaccard of tag sets. Pages carry no `tags` frontmatter (SPEC §4),
|
|
9
|
+
so a page's tags are *derived*: the union of `tags` over claims whose
|
|
10
|
+
`source_id` is one of the page's sources.
|
|
11
|
+
|
|
12
|
+
`combined` is a weighted sum (sources dominates — shared sources is the strongest
|
|
13
|
+
same-topic signal). This is **purely informational**: it reports scores and
|
|
14
|
+
leaves the High/Middle/Low merge decision of AGENT.md PROMOTE to the caller,
|
|
15
|
+
exactly as `query contradictions` leaves adjudication to the agent. No lock, no
|
|
16
|
+
model, no DuckDB.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import json
|
|
22
|
+
from collections.abc import Iterable, Mapping
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
from . import facts_dir, frontmatter
|
|
26
|
+
from .errors import DataError
|
|
27
|
+
from .graph import scan_derived
|
|
28
|
+
from .hashing import normalize
|
|
29
|
+
|
|
30
|
+
DEFAULT_WEIGHTS = {"title": 0.25, "sources": 0.5, "tags": 0.25}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _tokens(title: str) -> set[str]:
    """Return the distinct whitespace-separated tokens of the normalized title."""
    words = normalize(title).split()
    return set(words)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _strip_block(dep: str) -> str:
|
|
38
|
+
"""`raw/x#b3` -> `raw/x` (block-scoped deps share their whole source)."""
|
|
39
|
+
return dep.split("#", 1)[0]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _source_set(derived_from: Iterable[str]) -> set[str]:
    """Collapse dependency ids to the set of whole-source ids they refer to."""
    return set(map(_strip_block, derived_from))
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _jaccard(a: set[str], b: set[str]) -> float:
|
|
47
|
+
if not a and not b:
|
|
48
|
+
return 0.0
|
|
49
|
+
return len(a & b) / len(a | b)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _source_tags(root: Path) -> dict[str, set[str]]:
    """Map each `source_id` to the union of `tags` over its claims.

    Built once per run from facts/claims.ndjson, parsed directly (no DuckDB
    dependency). A missing file yields an empty mapping; blank lines and claims
    without `tags` are skipped.

    Raises `DataError` (with the 1-based line number) when a line is not valid
    JSON, is not a JSON object, has a non-string `source_id`, or has `tags`
    that is not a list of strings.
    """
    out: dict[str, set[str]] = {}
    p = facts_dir(root) / "claims.ndjson"
    if not p.exists():
        return out
    # Iterate the file instead of `read_text().splitlines()`: `str.splitlines`
    # also breaks on U+0085, U+2028 and U+2029, which may appear unescaped
    # inside a JSON string (e.g. in a quote) and would cut one NDJSON record in
    # two, producing a spurious "invalid JSON" error. File iteration only
    # splits on real newlines.
    with p.open(encoding="utf-8") as fh:
        for lineno, raw_line in enumerate(fh, start=1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError as e:
                raise DataError(f"claims.ndjson:{lineno}: invalid JSON: {e}") from e
            if not isinstance(rec, dict):
                raise DataError(f"claims.ndjson:{lineno}: expected a JSON object")
            sid = rec.get("source_id")
            if not isinstance(sid, str):
                raise DataError(f"claims.ndjson:{lineno}: 'source_id' must be a string")
            tags = rec.get("tags")
            if tags is None:
                continue
            if not isinstance(tags, list) or any(not isinstance(t, str) for t in tags):
                raise DataError(f"claims.ndjson:{lineno}: 'tags' must be a list of strings")
            out.setdefault(sid, set()).update(tags)
    return out
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _page_tags(sources: set[str], source_tags: Mapping[str, set[str]]) -> set[str]:
|
|
82
|
+
out: set[str] = set()
|
|
83
|
+
for s in sources:
|
|
84
|
+
out |= source_tags.get(s, set())
|
|
85
|
+
return out
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def compute_similar(
    root: str | Path,
    *,
    title: str,
    sources: Iterable[str],
    kind: str = "concept",
    exclude: Iterable[str] | None = None,
    top: int | None = None,
    weights: Mapping[str, float] | None = None,
) -> dict:
    """Score existing `kind` wiki pages against the proposed (title, sources).

    Every derived page whose type is ``wiki.<kind>`` and whose id is not in
    ``exclude`` gets three Jaccard scores — title tokens, source ids (block
    suffix stripped) and derived tags — plus their weighted sum ``combined``.
    Scores are rounded to 6 decimals. A page's title is read from its
    frontmatter; a missing title counts as the empty string.

    ``sources`` may be any iterable, including a one-shot iterator. ``weights``
    may be partial: keys it omits fall back to ``DEFAULT_WEIGHTS``. ``top``
    limits the result to the N best candidates; ``None`` means no limit and a
    negative value is treated as 0.

    Returns ``{proposed, weights, candidates}`` with candidates sorted by
    ``combined`` desc then id asc, truncated to ``top``.
    """
    root = Path(root)
    # Materialize once: `sources` is needed both for scoring and for the echoed
    # `proposed` payload, and a one-shot iterator would be empty the second time.
    source_ids = list(sources)
    w = dict(weights or DEFAULT_WEIGHTS)
    # Fill in any signal the caller left out so the weighted sum cannot KeyError.
    for name, default in DEFAULT_WEIGHTS.items():
        w.setdefault(name, default)
    skip = set(exclude or ())
    prop_sources = _source_set(source_ids)
    prop_tokens = _tokens(title)
    source_tags = _source_tags(root)
    prop_tags = _page_tags(prop_sources, source_tags)

    want_type = f"wiki.{kind}"
    candidates: list[dict] = []
    for cid, d in scan_derived(root).items():
        if d.get("type") != want_type or cid in skip:
            continue  # other-kind pages and the facts.set row are dropped here
        c_sources = _source_set(d["derived_from"])
        meta, _ = frontmatter.load(root / d["path"])
        c_title = (meta.get("title") if meta else "") or ""
        c_tags = _page_tags(c_sources, source_tags)

        title_s = _jaccard(prop_tokens, _tokens(c_title))
        sources_s = _jaccard(prop_sources, c_sources)
        tags_s = _jaccard(prop_tags, c_tags)
        combined = w["title"] * title_s + w["sources"] * sources_s + w["tags"] * tags_s
        candidates.append(
            {
                "id": cid,
                "title": c_title,
                "path": d["path"],
                "kind": kind,
                "scores": {
                    "title": round(title_s, 6),
                    "sources": round(sources_s, 6),
                    "tags": round(tags_s, 6),
                    "combined": round(combined, 6),
                },
                "shared": {
                    "sources": sorted(prop_sources & c_sources),
                    "tags": sorted(prop_tags & c_tags),
                },
            }
        )

    candidates.sort(key=lambda c: (-c["scores"]["combined"], c["id"]))
    if top is not None:
        # Clamp: a negative slice bound would drop entries from the *end*
        # instead of limiting the list.
        candidates = candidates[: max(top, 0)]
    return {
        "proposed": {"title": title, "derived_from": source_ids, "kind": kind},
        "weights": w,
        "candidates": candidates,
    }
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def print_similar(result: dict) -> None:
    """Print a `compute_similar` result as a human-readable listing.

    Emits a header describing the proposed page, then either a note that there
    is nothing to compare, or two lines per candidate (combined score with id
    and title, then the per-signal scores and shared counts) followed by a
    candidate count.
    """
    proposed = result["proposed"]
    n_sources = len(proposed["derived_from"])
    print(f'proposed: "{proposed["title"]}" ({proposed["kind"]}, from {n_sources} source(s))')

    candidates = result["candidates"]
    if candidates:
        for cand in candidates:
            scores = cand["scores"]
            shared = cand["shared"]
            print(f' {scores["combined"]:.3f} {cand["id"]} "{cand["title"]}"')
            detail = f' sources {scores["sources"]:.2f} tags {scores["tags"]:.2f} title {scores["title"]:.2f}'
            overlap = f' shared sources: {len(shared["sources"])}, tags: {len(shared["tags"])}'
            print(detail + overlap)
        print(f"({len(candidates)} candidate(s))")
    else:
        print(f"no existing {proposed['kind']} pages to compare.")
|
|
@@ -70,6 +70,8 @@ class KB:
|
|
|
70
70
|
*,
|
|
71
71
|
stamp: bool = True,
|
|
72
72
|
body: str = "Body.\n",
|
|
73
|
+
title: str | None = None,
|
|
74
|
+
kind: str = "concept",
|
|
73
75
|
) -> str:
|
|
74
76
|
deps = {
|
|
75
77
|
sid: h
|
|
@@ -77,16 +79,18 @@ class KB:
|
|
|
77
79
|
if (h := self._dep_hash(sid)) is not None
|
|
78
80
|
}
|
|
79
81
|
meta: dict = {
|
|
80
|
-
"id": f"
|
|
81
|
-
"type": "wiki.
|
|
82
|
-
"title": slug,
|
|
82
|
+
"id": f"{kind}/{slug}",
|
|
83
|
+
"type": f"wiki.{kind}",
|
|
84
|
+
"title": title or slug,
|
|
83
85
|
"derived-from": list(derived_from),
|
|
84
86
|
}
|
|
85
87
|
if stamp:
|
|
86
88
|
meta["input-hash"] = hashing.input_hash(deps)
|
|
87
89
|
meta["last-compiled"] = "2026-01-01T00:00:00Z"
|
|
88
90
|
meta["confidence"] = 0.9
|
|
89
|
-
|
|
91
|
+
subdir = "concepts" if kind == "concept" else "entities"
|
|
92
|
+
path = self.root / "vault" / "wiki" / subdir / f"{slug}.md"
|
|
93
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
90
94
|
path.write_text(frontmatter.dump(meta, body), encoding="utf-8")
|
|
91
95
|
return meta["id"]
|
|
92
96
|
|