scriptoria 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scriptoria-0.2.0 → scriptoria-0.3.0}/.gitignore +5 -2
- {scriptoria-0.2.0 → scriptoria-0.3.0}/PKG-INFO +16 -1
- {scriptoria-0.2.0 → scriptoria-0.3.0}/pyproject.toml +23 -2
- {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/__init__.py +1 -1
- {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/cli.py +58 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/embeddings.py +3 -3
- scriptoria-0.3.0/src/scrip/facts.py +426 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/graph.py +1 -2
- {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/query.py +1 -1
- scriptoria-0.3.0/tests/test_fact_cmd.py +362 -0
- scriptoria-0.3.0/tests/test_index_cmd.py +77 -0
- scriptoria-0.3.0/tests/test_json_shapes.py +57 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_manifest.py +1 -0
- scriptoria-0.3.0/tests/test_status_cmd.py +56 -0
- scriptoria-0.3.0/tests/test_unlock_cmd.py +41 -0
- scriptoria-0.3.0/tests/test_watch.py +48 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/uv.lock +7 -2
- scriptoria-0.2.0/tests/test_watch.py +0 -22
- {scriptoria-0.2.0 → scriptoria-0.3.0}/README.md +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/pyrightconfig.json +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/anchors.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/blocks.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/errors.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/frontmatter.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/hashing.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/ingest.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/lock.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/manifest.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/retrieval.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/conftest.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_anchor_cmd.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_anchors.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_blocks.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_embeddings.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_graph_status.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_hashing.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_ingest.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_lock.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_new_cmd.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_query.py +1 -1
- {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_retrieval.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_stamp.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_verify.py +1 -1
|
@@ -21,8 +21,11 @@ build/
|
|
|
21
21
|
# Advisory write lock: ephemeral runtime state, never committed (see SPEC §11)
|
|
22
22
|
/.kb/lock
|
|
23
23
|
|
|
24
|
-
#
|
|
25
|
-
#
|
|
24
|
+
# Manifest: a regenerable speed cache. SPEC §8 says it *may* be committed; we
|
|
25
|
+
# choose not to — it stores (mtime, size) that are wrong on every fresh clone
|
|
26
|
+
# anyway, and its hashes/timestamps churn diffs. Rebuild any time with
|
|
26
27
|
# `scrip status --rebuild-manifest`.
|
|
28
|
+
/.kb/manifest.json
|
|
29
|
+
|
|
27
30
|
# roborev snapshots
|
|
28
31
|
/.roborev/
|
|
@@ -1,8 +1,23 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scriptoria
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Deterministic scriptorium-keeper (the `scrip` CLI): staleness, provenance integrity, and fact queries for an agent-compiled knowledge base
|
|
5
|
+
Project-URL: Homepage, https://github.com/coredipper/scriptorium
|
|
6
|
+
Project-URL: Changelog, https://github.com/coredipper/scriptorium/blob/main/CHANGELOG.md
|
|
7
|
+
Project-URL: Issues, https://github.com/coredipper/scriptorium/issues
|
|
5
8
|
License: MIT
|
|
9
|
+
Keywords: agent,knowledge-base,markdown,provenance,staleness,wiki
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
6
21
|
Requires-Python: >=3.10
|
|
7
22
|
Requires-Dist: duckdb>=1.0
|
|
8
23
|
Requires-Dist: pyyaml>=6.0
|
|
@@ -2,16 +2,35 @@
|
|
|
2
2
|
# Distribution name on PyPI is `scriptoria` (scrip/scriptorium were taken); the
|
|
3
3
|
# CLI command and the import package both remain `scrip`.
|
|
4
4
|
name = "scriptoria"
|
|
5
|
-
version = "0.2.0"
|
|
5
|
+
version = "0.3.0"
|
|
6
6
|
description = "Deterministic scriptorium-keeper (the `scrip` CLI): staleness, provenance integrity, and fact queries for an agent-compiled knowledge base"
|
|
7
7
|
readme = "README.md"
|
|
8
8
|
requires-python = ">=3.10"
|
|
9
9
|
license = { text = "MIT" }
|
|
10
|
+
keywords = ["knowledge-base", "provenance", "staleness", "agent", "markdown", "wiki"]
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Development Status :: 4 - Beta",
|
|
13
|
+
"Environment :: Console",
|
|
14
|
+
"Intended Audience :: Developers",
|
|
15
|
+
"Operating System :: OS Independent",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.10",
|
|
18
|
+
"Programming Language :: Python :: 3.11",
|
|
19
|
+
"Programming Language :: Python :: 3.12",
|
|
20
|
+
"Programming Language :: Python :: 3.13",
|
|
21
|
+
"Programming Language :: Python :: 3.14",
|
|
22
|
+
"Topic :: Text Processing :: Markup :: Markdown",
|
|
23
|
+
]
|
|
10
24
|
dependencies = [
|
|
11
25
|
"duckdb>=1.0",
|
|
12
26
|
"pyyaml>=6.0",
|
|
13
27
|
]
|
|
14
28
|
|
|
29
|
+
[project.urls]
|
|
30
|
+
Homepage = "https://github.com/coredipper/scriptorium"
|
|
31
|
+
Changelog = "https://github.com/coredipper/scriptorium/blob/main/CHANGELOG.md"
|
|
32
|
+
Issues = "https://github.com/coredipper/scriptorium/issues"
|
|
33
|
+
|
|
15
34
|
[project.scripts]
|
|
16
35
|
scrip = "scrip.cli:main"
|
|
17
36
|
|
|
@@ -26,7 +45,9 @@ embeddings = ["model2vec>=0.3", "numpy>=1.24"]
|
|
|
26
45
|
ingest = ["trafilatura>=1.8", "pypdf>=4.0"]
|
|
27
46
|
|
|
28
47
|
[dependency-groups]
|
|
29
|
-
|
|
48
|
+
# numpy is here so the embeddings index/search path is testable with a toy
|
|
49
|
+
# encoder — the real backend (model2vec) stays an optional [embeddings] extra.
|
|
50
|
+
dev = ["pytest>=8", "numpy>=1.24"]
|
|
30
51
|
|
|
31
52
|
[build-system]
|
|
32
53
|
requires = ["hatchling"]
|
|
@@ -13,7 +13,7 @@ from __future__ import annotations
|
|
|
13
13
|
|
|
14
14
|
from pathlib import Path
|
|
15
15
|
|
|
16
|
-
__version__ = "0.2.0"
|
|
16
|
+
__version__ = "0.3.0"
|
|
17
17
|
|
|
18
18
|
# --- canonical vault layout ------------------------------------------------
|
|
19
19
|
# ``root`` is the repo/instance root: the directory containing ``vault/``.
|
|
@@ -336,6 +336,40 @@ def cmd_new(args: argparse.Namespace) -> int:
|
|
|
336
336
|
return 0
|
|
337
337
|
|
|
338
338
|
|
|
339
|
+
def cmd_fact_add(args: argparse.Namespace) -> int:
    """Handle ``scrip fact add``: read proposed NDJSON records and append them.

    The records are read from ``args.file`` when it is set, otherwise from
    stdin, parsed with ``facts.parse_ndjson`` and handed to ``facts.add`` for
    the table named by ``args.table``. With ``args.json`` the raw result is
    emitted via ``_emit``; otherwise one line is printed per appended, skipped
    and failed record, followed by a summary.

    Returns:
        ``1`` when the result carries any failures, else ``0``.

    Raises:
        errors.UsageError: when ``--file`` cannot be read or is not valid UTF-8.
    """
    from . import facts

    root = resolve_root(args.root)
    if args.file:
        try:
            text = Path(args.file).read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            # UnicodeDecodeError is a ValueError, not an OSError: without it a
            # non-UTF-8 file would escape as a raw traceback instead of the
            # same clean "cannot read" error a missing file produces.
            raise errors.UsageError(f"cannot read --file: {e}") from e
    else:
        text = sys.stdin.read()
    result = facts.add(root, args.table, facts.parse_ndjson(text))
    if args.json:
        _emit(result)
    else:
        for r in result["appended"]:
            # claims carry claim_id, entities carry entity_id, edges have neither
            ident = r.get("claim_id") or r.get("entity_id") or f"{r['src']} -> {r['dst']}"
            print(f" appended {ident}")
        for s in result["skipped"]:
            print(f" = record {s['index']} skipped (duplicate)")
        for f in result["failures"]:
            print(f" ✗ record {f['index']}: {f['status']} — {f['detail']}")
        if result["failures"]:
            print(
                f"nothing appended: {len(result['failures'])} record(s) failed "
                f"(the batch is all-or-nothing)"
            )
        else:
            print(f"{len(result['appended'])} record(s) appended to facts/")
            if result["appended"]:
                print(" next: `scrip stamp vault/facts/_meta.yaml`, then `scrip verify`")
    return 1 if result["failures"] else 0
|
|
371
|
+
|
|
372
|
+
|
|
339
373
|
def cmd_ingest(args: argparse.Namespace) -> int:
|
|
340
374
|
from . import ingest, lock
|
|
341
375
|
|
|
@@ -515,6 +549,30 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
515
549
|
pn.add_argument("--title", help="human title (default: the slug)")
|
|
516
550
|
pn.set_defaults(func=cmd_new)
|
|
517
551
|
|
|
552
|
+
pfact = sub.add_parser(
|
|
553
|
+
"fact",
|
|
554
|
+
help="validated writers for the facts/ layer (claims mint verified anchors)",
|
|
555
|
+
)
|
|
556
|
+
fact_sub = pfact.add_subparsers(dest="fact_command", required=True, metavar="<action>")
|
|
557
|
+
pfa = fact_sub.add_parser(
|
|
558
|
+
"add",
|
|
559
|
+
parents=[common],
|
|
560
|
+
help="validate proposed NDJSON records and append them all-or-nothing; "
|
|
561
|
+
"claims carry a verbatim `quote` and scrip mints the anchor/id/timestamp",
|
|
562
|
+
)
|
|
563
|
+
pfa.add_argument(
|
|
564
|
+
"--table",
|
|
565
|
+
choices=["claims", "entities", "edges"],
|
|
566
|
+
default="claims",
|
|
567
|
+
help="facts table to append to (default: claims)",
|
|
568
|
+
)
|
|
569
|
+
fact_in = pfa.add_mutually_exclusive_group(required=True)
|
|
570
|
+
fact_in.add_argument("--file", metavar="NDJSON", help="read proposed records from a file")
|
|
571
|
+
fact_in.add_argument(
|
|
572
|
+
"--stdin", action="store_true", help="read proposed records from stdin"
|
|
573
|
+
)
|
|
574
|
+
pfa.set_defaults(func=cmd_fact_add)
|
|
575
|
+
|
|
518
576
|
pin = sub.add_parser(
|
|
519
577
|
"ingest",
|
|
520
578
|
parents=[common],
|
|
@@ -47,7 +47,7 @@ def _get_model():
|
|
|
47
47
|
|
|
48
48
|
os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
|
|
49
49
|
try:
|
|
50
|
-
from model2vec import StaticModel
|
|
50
|
+
from model2vec import StaticModel # pyright: ignore[reportMissingImports]
|
|
51
51
|
except Exception:
|
|
52
52
|
return None
|
|
53
53
|
try:
|
|
@@ -95,7 +95,7 @@ def build_index(root: Path) -> int:
|
|
|
95
95
|
model = _get_model()
|
|
96
96
|
if model is None:
|
|
97
97
|
raise RuntimeError("no embeddings backend available")
|
|
98
|
-
import numpy as np
|
|
98
|
+
import numpy as np # pyright: ignore[reportMissingImports]
|
|
99
99
|
|
|
100
100
|
items = list(_iter_blocks(root))
|
|
101
101
|
if items:
|
|
@@ -126,7 +126,7 @@ def vector_search(root: Path, query: str, k: int = 5):
|
|
|
126
126
|
d = _embeddings_dir(root)
|
|
127
127
|
if model is None or not (d / "vectors.npy").exists() or not (d / "meta.json").exists():
|
|
128
128
|
return None
|
|
129
|
-
import numpy as np
|
|
129
|
+
import numpy as np # pyright: ignore[reportMissingImports]
|
|
130
130
|
|
|
131
131
|
meta = json.loads((d / "meta.json").read_text(encoding="utf-8"))
|
|
132
132
|
items = meta["items"]
|
|
@@ -0,0 +1,426 @@
|
|
|
1
|
+
"""Validated, locked writers for the facts/ layer — behind ``scrip fact add``.
|
|
2
|
+
|
|
3
|
+
The agent (or harness) *proposes* records; scrip owns everything checkable,
|
|
4
|
+
mirroring how ``scrip anchor`` mints citations for wiki prose:
|
|
5
|
+
|
|
6
|
+
- a proposed claim carries a verbatim ``quote`` — never an ``anchor``,
|
|
7
|
+
``claim_id``, or ``extracted_at``; those are minted here, and the anchor is
|
|
8
|
+
verified to resolve uniquely in the stored source text;
|
|
9
|
+
- the batch is **all-or-nothing**: one unresolvable quote (or conflicting
|
|
10
|
+
entity id) means nothing is appended, and every failure is reported with its
|
|
11
|
+
input index so the caller can retry just the failing records;
|
|
12
|
+
- exact duplicates (same source, normalized quote, triple, and polarity) are
|
|
13
|
+
skipped and reported, so re-running an extraction is safe;
|
|
14
|
+
- quote verification, ids, and the append all happen under the advisory write
|
|
15
|
+
lock; claim sources are merged into ``facts/_meta.yaml`` ``derived-from`` and
|
|
16
|
+
every append (any table) drops the set's ``input-hash`` — the facts set
|
|
17
|
+
honestly shows STALE until ``scrip stamp`` re-blesses it.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import json
|
|
23
|
+
import re
|
|
24
|
+
from datetime import datetime, timezone
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
|
|
27
|
+
import yaml
|
|
28
|
+
|
|
29
|
+
from . import anchors, facts_dir, lock
|
|
30
|
+
from .errors import DataError, UsageError
|
|
31
|
+
|
|
32
|
+
# Accepted values for a claim's ``polarity`` field.
_POLARITIES = ("asserts", "denies", "qualifies")

# table name -> file under vault/facts/
_FILES = {
    "claims": "claims.ndjson",
    "entities": "entities.ndjson",
    "edges": "graph.ndjson",
}

# Fields scrip mints itself; proposing them is a schema error, not a finding.
_SCRIP_OWNED = ("claim_id", "anchor", "extracted_at")

# Per-table schemas for *proposed* records: ``*_REQUIRED`` keys must be
# present, ``*_ALLOWED`` is the complete set of accepted keys (the required
# ones plus any optional ones); anything else is rejected as unknown.
_CLAIM_REQUIRED = ("quote", "source_id", "subject", "predicate", "object", "polarity", "confidence")
_CLAIM_ALLOWED = frozenset((*_CLAIM_REQUIRED, "claim_text", "tags"))
_ENTITY_REQUIRED = ("entity_id", "name", "kind")
_ENTITY_ALLOWED = frozenset((*_ENTITY_REQUIRED, "tags"))
_EDGE_REQUIRED = ("src", "dst", "kind")
_EDGE_ALLOWED = frozenset(_EDGE_REQUIRED)

# Same conservative shape ``cli._safe_slug`` enforces — no path separators,
# '..', or leading dot — applied to source ids arriving as record *data*.
_SLUG_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]*$")

# A minted claim id: the ``clm_`` prefix followed by digits; group 1 captures
# the numeric part so the sequence can be continued.
_CLAIM_ID_RE = re.compile(r"clm_(\d+)")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _now() -> str:
|
|
59
|
+
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# --------------------------------------------------------------------------- #
|
|
63
|
+
# Input parsing & structural validation (DataError, exit 3)
|
|
64
|
+
# --------------------------------------------------------------------------- #
|
|
65
|
+
def parse_ndjson(text: str) -> list[dict]:
    """Parse proposed records, one JSON object per line.

    Blank lines are skipped. Line numbers in error messages are 1-based and
    count every physical line, including blank ones.

    Raises:
        DataError: a line is not valid JSON, or is valid JSON but not an object.
        UsageError: the input contains no records at all.
    """
    records: list[dict] = []
    # Split on real newlines only (\r\n, \r, \n). ``str.splitlines`` also breaks
    # on U+2028, U+2029 and U+0085, which JSON allows *unescaped* inside string
    # values — a verbatim quote carrying one of them would be cut in half and
    # rejected as invalid JSON. A raw \r or \n can never occur inside a JSON
    # string, so splitting on those is always safe.
    for lineno, raw_line in enumerate(re.split(r"\r\n|\r|\n", text), start=1):
        line = raw_line.strip()
        if not line:
            continue
        try:
            rec = json.loads(line)
        except json.JSONDecodeError as e:
            raise DataError(f"input line {lineno}: invalid JSON: {e}") from e
        if not isinstance(rec, dict):
            raise DataError(f"input line {lineno}: expected a JSON object")
        records.append(rec)
    if not records:
        raise UsageError("no records in input")
    return records
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _check_str(rec: dict, key: str, index: int, *, allow_blank: bool = False) -> None:
|
|
86
|
+
v = rec[key]
|
|
87
|
+
if not isinstance(v, str) or (not allow_blank and not v.strip()):
|
|
88
|
+
raise DataError(f"record {index}: '{key}' must be a non-empty string")
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _check_tags(rec: dict, index: int) -> None:
|
|
92
|
+
tags = rec.get("tags")
|
|
93
|
+
if tags is None:
|
|
94
|
+
return
|
|
95
|
+
if not isinstance(tags, list) or any(not isinstance(t, str) for t in tags):
|
|
96
|
+
raise DataError(f"record {index}: 'tags' must be a list of strings")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _check_shape(rec: dict, index: int, required: tuple[str, ...], allowed: frozenset[str]) -> None:
    """Check a proposed record's key set against a table schema.

    Checks run in a fixed order and the first violation raises
    :class:`DataError`: scrip-owned fields present, then unknown fields, then
    missing required fields.
    """
    minted = [name for name in _SCRIP_OWNED if name in rec]
    if minted:
        raise DataError(
            f"record {index}: scrip mints {', '.join(minted)} itself — propose a "
            f"verbatim 'quote', not precomputed ids/anchors/timestamps"
        )

    extras = sorted(set(rec) - allowed)
    if extras:
        raise DataError(f"record {index}: unknown field(s): {', '.join(extras)}")

    absent = sorted(name for name in required if name not in rec)
    if absent:
        raise DataError(f"record {index}: missing required field(s): {', '.join(absent)}")
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _validate(table: str, rec: dict, index: int) -> None:
    """Structurally validate one proposed record for ``table``.

    ``"claims"`` and ``"entities"`` get their own rules; any other table name
    is validated as an edge. Every violation raises :class:`DataError`.
    """
    if table == "entities":
        _check_shape(rec, index, _ENTITY_REQUIRED, _ENTITY_ALLOWED)
        for field in _ENTITY_REQUIRED:
            _check_str(rec, field, index)
        entity_id = rec["entity_id"]
        prefix = "entity/"
        if not entity_id.startswith(prefix) or not _SLUG_RE.fullmatch(entity_id[len(prefix) :]):
            raise DataError(f"record {index}: entity_id must look like entity/<slug>")
        _check_tags(rec, index)
        return

    if table != "claims":  # edges
        _check_shape(rec, index, _EDGE_REQUIRED, _EDGE_ALLOWED)
        for field in _EDGE_REQUIRED:
            _check_str(rec, field, index)
        return

    _check_shape(rec, index, _CLAIM_REQUIRED, _CLAIM_ALLOWED)
    # a blank quote is allowed here: its emptiness is reported later as a
    # per-record finding rather than raised as a schema error
    _check_str(rec, "quote", index, allow_blank=True)
    for field in ("source_id", "subject", "predicate", "object"):
        _check_str(rec, field, index)
    if "claim_text" in rec:
        _check_str(rec, "claim_text", index, allow_blank=True)
    if rec["polarity"] not in _POLARITIES:
        raise DataError(
            f"record {index}: polarity must be one of {', '.join(_POLARITIES)}"
        )
    confidence = rec["confidence"]
    # bool is a subclass of int, so it has to be excluded explicitly
    is_number = isinstance(confidence, (int, float)) and not isinstance(confidence, bool)
    if not (is_number and 0 <= confidence <= 1):
        raise DataError(f"record {index}: confidence must be a number in [0, 1]")
    _check_tags(rec, index)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# --------------------------------------------------------------------------- #
|
|
146
|
+
# Claim content checks (per-record findings, exit 1)
|
|
147
|
+
# --------------------------------------------------------------------------- #
|
|
148
|
+
def _resolve_claim(
    root: Path, rec: dict, index: int, src_cache: dict[str, str | None]
) -> tuple[dict | None, dict | None]:
    """Mint and verify the anchor for one proposed claim.

    Returns ``(failure, resolved)`` with exactly one of the two non-None.
    ``failure`` is a finding dict (``index``, ``status``, ``source_id``,
    ``quote``, ``detail``); ``resolved`` carries the ``raw/``-prefixed
    ``source_id``, the minted ``anchor`` and its ``qh``. ``src_cache`` maps
    source ids to their text (``None`` when loading raised ``DataError``) and
    is filled in as sources are looked up.
    """

    def finding(status: str, detail: str, reported_id: str) -> dict:
        return {
            "index": index,
            "status": status,
            "source_id": reported_id,
            "quote": rec["quote"],
            "detail": detail,
        }

    proposed = rec["source_id"]
    source_id = proposed if proposed.startswith("raw/") else f"raw/{proposed}"
    if _SLUG_RE.fullmatch(source_id[len("raw/") :]) is None:
        # report the id exactly as it was proposed, not the prefixed form
        return finding("INVALID_SOURCE", "unsafe source id (path separators or '..')", proposed), None

    try:
        text = src_cache[source_id]
    except KeyError:
        try:
            text = anchors.source_text(root, source_id)
        except DataError:
            text = None
        src_cache[source_id] = text
    if text is None:
        return finding("MISSING_SOURCE", "source does not exist in vault/raw/", source_id), None

    if not anchors.normalize(rec["quote"]):
        return finding("EMPTY_QUOTE", "quote is empty after normalization", source_id), None

    anchor = anchors.make_anchor(text, rec["quote"])
    status = anchors.resolve(text, anchor)
    if status != "OK":
        if status == "AMBIGUOUS":
            remedy = "lengthen the quote until it is unique"
        else:
            remedy = "the quote must appear verbatim in the source"
        return finding(status, remedy, source_id), None

    quote_hash = anchors.parse_anchor(anchor)["qh"]
    return None, {"source_id": source_id, "anchor": anchor, "qh": quote_hash}
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
# --------------------------------------------------------------------------- #
|
|
200
|
+
# Existing-table reads & keys
|
|
201
|
+
# --------------------------------------------------------------------------- #
|
|
202
|
+
def _read_table(path: Path) -> tuple[list[dict], str]:
|
|
203
|
+
"""Read an NDJSON table, returning ``(records, raw_text)``. Malformed lines
|
|
204
|
+
are a :class:`DataError` — the vault on disk violates the contract."""
|
|
205
|
+
if not path.exists():
|
|
206
|
+
return [], ""
|
|
207
|
+
text = path.read_text(encoding="utf-8")
|
|
208
|
+
records: list[dict] = []
|
|
209
|
+
for lineno, raw_line in enumerate(text.splitlines(), start=1):
|
|
210
|
+
line = raw_line.strip()
|
|
211
|
+
if not line:
|
|
212
|
+
continue
|
|
213
|
+
try:
|
|
214
|
+
rec = json.loads(line)
|
|
215
|
+
except json.JSONDecodeError as e:
|
|
216
|
+
raise DataError(f"{path.name}:{lineno}: invalid JSON: {e}") from e
|
|
217
|
+
records.append(rec)
|
|
218
|
+
return records, text
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _claim_key(source_id: str, qh: str, rec: dict) -> tuple:
|
|
222
|
+
return (
|
|
223
|
+
source_id,
|
|
224
|
+
qh,
|
|
225
|
+
rec.get("subject"),
|
|
226
|
+
rec.get("predicate"),
|
|
227
|
+
rec.get("object"),
|
|
228
|
+
rec.get("polarity"),
|
|
229
|
+
)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _existing_claim_keys(existing: list[dict]) -> dict[tuple, str]:
|
|
233
|
+
keys: dict[tuple, str] = {}
|
|
234
|
+
for rec in existing:
|
|
235
|
+
anchor = rec.get("anchor")
|
|
236
|
+
if not isinstance(anchor, str):
|
|
237
|
+
continue
|
|
238
|
+
qh = anchors.parse_anchor(anchor)["qh"]
|
|
239
|
+
keys[_claim_key(rec.get("source_id", ""), qh, rec)] = str(rec.get("claim_id", ""))
|
|
240
|
+
return keys
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _next_claim_id(existing: list[dict]) -> tuple[int, int]:
|
|
244
|
+
"""Return ``(next_number, pad_width)`` continuing the ``clm_NNNN`` sequence."""
|
|
245
|
+
numbers = [
|
|
246
|
+
int(m.group(1))
|
|
247
|
+
for rec in existing
|
|
248
|
+
if (m := _CLAIM_ID_RE.fullmatch(str(rec.get("claim_id", ""))))
|
|
249
|
+
]
|
|
250
|
+
highest = max(numbers, default=0)
|
|
251
|
+
return highest + 1, max(4, len(str(highest)))
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
# --------------------------------------------------------------------------- #
|
|
255
|
+
# facts/_meta.yaml: merge derived-from, never stamp
|
|
256
|
+
# --------------------------------------------------------------------------- #
|
|
257
|
+
def _load_meta(root: Path) -> dict:
    """Parse (or default) ``facts/_meta.yaml``.

    Called *before* any append so a malformed file fails the whole add —
    claims must never land with their source missing from ``derived-from``
    (an undetectable staleness lie).

    Returns:
        The parsed mapping; a default facts-set mapping when the file does not
        exist; an empty dict when the file is empty.

    Raises:
        DataError: the file is not valid YAML, or its top level is not a mapping.
    """
    p = facts_dir(root) / "_meta.yaml"
    if not p.exists():
        return {
            "id": "facts/core",
            "type": "facts.set",
            "derived-from": [],
            "members": [
                "facts/entities.ndjson",
                "facts/claims.ndjson",
                "facts/graph.ndjson",
            ],
            "confidence": 0.0,
        }
    try:
        data = yaml.safe_load(p.read_text(encoding="utf-8"))
    except yaml.YAMLError as e:
        raise DataError(f"invalid facts/_meta.yaml: {e}") from e
    if data is None:
        # an empty document; treated as an empty mapping
        return {}
    # Only ``None`` is defaulted above: a falsy non-mapping document such as
    # ``[]``, ``0`` or ``false`` must be rejected here, not silently replaced
    # by ``{}`` and then overwritten by the next append.
    if not isinstance(data, dict):
        raise DataError("invalid facts/_meta.yaml: expected a mapping")
    return data
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def _write_meta(root: Path, data: dict, new_sources: list[str]) -> None:
    """Merge ``new_sources`` into ``derived-from``, drop the stamp, and write.

    ``data`` is mutated in place: ``derived-from`` becomes a list holding the
    previous entries followed by any of ``new_sources`` not already present,
    and ``input-hash`` is removed. The result is written to
    ``facts/_meta.yaml`` with key order preserved.
    """
    current = data.get("derived-from")
    if isinstance(current, str):
        # A scalar ``derived-from: raw/x`` is a single source; ``list()`` on a
        # string would explode it into one entry per character.
        derived = [current]
    else:
        derived = list(current or [])
    for sid in new_sources:
        if sid not in derived:
            derived.append(sid)
    data["derived-from"] = derived
    # Drop the stamp on EVERY append: with an unchanged derived-from the
    # recomputed input-hash would still match the stored one, and status would
    # report OK over facts nobody has blessed. Removing input-hash forces STALE
    # ("no input-hash recorded") until `scrip stamp vault/facts/_meta.yaml`.
    # last-compiled is kept as the historical record of the last bless.
    data.pop("input-hash", None)
    (facts_dir(root) / "_meta.yaml").write_text(
        yaml.safe_dump(data, sort_keys=False, allow_unicode=True), encoding="utf-8"
    )
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
# --------------------------------------------------------------------------- #
|
|
301
|
+
# The writer
|
|
302
|
+
# --------------------------------------------------------------------------- #
|
|
303
|
+
def add(root: Path, table: str, proposals: list[dict]) -> dict:
    """Validate ``proposals`` and append them to ``facts/`` all-or-nothing.

    Every proposal is structurally validated first. Then, under the write
    lock: claim quotes are resolved to anchors, the existing table and
    ``_meta.yaml`` are read, duplicates are separated from new records, and —
    only if no record failed — the new records are appended and the meta file
    is rewritten.

    Returns ``{"table", "appended", "skipped", "failures"}``; the caller maps a
    non-empty ``failures`` to exit 1. Structural problems raise
    :class:`DataError`/:class:`UsageError` instead.
    """
    if table not in _FILES:
        raise UsageError(f"unknown facts table: {table}")
    # structural validation of the whole batch happens before the lock is taken
    for i, rec in enumerate(proposals):
        _validate(table, rec, i)

    failures: list[dict] = []
    # per-proposal anchor resolution, filled in for claims only
    resolved: list[dict | None] = [None] * len(proposals)
    path = facts_dir(root) / _FILES[table]
    with lock.write_lock(root):
        if table == "claims":
            # Resolve quotes INSIDE the lock: raw/ only changes via a *locked*
            # `ingest --reingest`, so holding the lock from verification through
            # append closes the window where a re-ingest could land between the
            # two and silently break the just-minted anchors.
            src_cache: dict[str, str | None] = {}
            for i, rec in enumerate(proposals):
                fail, res = _resolve_claim(root, rec, i, src_cache)
                if fail:
                    failures.append(fail)
                else:
                    resolved[i] = res
            # all-or-nothing: report every failing record, append none
            if failures:
                return {"table": table, "appended": [], "skipped": [], "failures": failures}

        existing, existing_text = _read_table(path)
        meta = _load_meta(root)  # parse before appending: fail whole, not half
        appended: list[dict] = []
        skipped: list[dict] = []

        if table == "claims":
            keys = _existing_claim_keys(existing)
            number, width = _next_claim_id(existing)
            now = _now()  # one timestamp shared by the whole batch
            for i, rec in enumerate(proposals):
                res = resolved[i]
                assert res is not None  # failures returned above
                key = _claim_key(res["source_id"], res["qh"], rec)
                if key in keys:
                    skipped.append({"index": i, "reason": "duplicate", "existing_id": keys[key]})
                    continue
                cid = f"clm_{number:0{width}d}"
                number += 1
                full = {
                    "claim_id": cid,
                    "subject": rec["subject"],
                    "predicate": rec["predicate"],
                    "object": rec["object"],
                    # a missing or empty claim_text falls back to the quote
                    "claim_text": rec.get("claim_text") or rec["quote"],
                    "source_id": res["source_id"],
                    "anchor": res["anchor"],
                    "confidence": rec["confidence"],
                    "polarity": rec["polarity"],
                    "extracted_at": now,
                    "tags": rec.get("tags") or [],
                }
                # registering the key also de-duplicates within this batch
                keys[key] = cid
                appended.append(full)
        elif table == "entities":
            def canon(rec: dict) -> dict:
                # canonical field set used both for comparison and for writing
                return {
                    "entity_id": rec["entity_id"],
                    "name": rec["name"],
                    "kind": rec["kind"],
                    "tags": rec.get("tags") or [],
                }

            # existing records lacking an entity_id or a string name/kind are left out
            byid = {rec.get("entity_id"): canon(rec) for rec in existing if "entity_id" in rec
                    and isinstance(rec.get("name"), str) and isinstance(rec.get("kind"), str)}
            for i, rec in enumerate(proposals):
                new = canon(rec)
                seen = byid.get(new["entity_id"])
                if seen is None:
                    byid[new["entity_id"]] = new
                    appended.append(new)
                elif seen == new:
                    skipped.append(
                        {"index": i, "reason": "duplicate", "existing_id": new["entity_id"]}
                    )
                else:
                    # same id, different canonical fields: a finding, not a duplicate
                    failures.append(
                        {
                            "index": i,
                            "status": "ID_CONFLICT",
                            "entity_id": new["entity_id"],
                            "detail": "an entity with this id already exists with different fields",
                        }
                    )
        else:  # edges
            seen_edges = {
                (rec.get("src"), rec.get("dst"), rec.get("kind")) for rec in existing
            }
            for i, rec in enumerate(proposals):
                key = (rec["src"], rec["dst"], rec["kind"])
                if key in seen_edges:
                    skipped.append({"index": i, "reason": "duplicate", "existing_id": None})
                    continue
                seen_edges.add(key)
                appended.append({"src": rec["src"], "dst": rec["dst"], "kind": rec["kind"]})

        # all-or-nothing again: at this point only entity id conflicts can be present
        if failures:
            return {"table": table, "appended": [], "skipped": skipped, "failures": failures}

        if appended:
            path.parent.mkdir(parents=True, exist_ok=True)
            payload = "".join(json.dumps(r, ensure_ascii=False) + "\n" for r in appended)
            with open(path, "a", encoding="utf-8") as f:
                # keep one record per line when the existing file lacks a final newline
                if existing_text and not existing_text.endswith("\n"):
                    f.write("\n")
                f.write(payload)
            # sources of the newly appended claims, in first-seen order
            new_sources: list[str] = []
            if table == "claims":
                for rec in appended:
                    if rec["source_id"] not in new_sources:
                        new_sources.append(rec["source_id"])
            # NOTE(review): the nesting of this call under `if appended:` is
            # reconstructed from a flattened listing — confirm against the file.
            _write_meta(root, meta, new_sources)

    return {"table": table, "appended": appended, "skipped": skipped, "failures": []}
|
|
@@ -13,9 +13,8 @@ from pathlib import Path
|
|
|
13
13
|
|
|
14
14
|
import yaml
|
|
15
15
|
|
|
16
|
-
from . import facts_dir, hashing, raw_dir, wiki_dir
|
|
17
16
|
from . import blocks as blocks_mod
|
|
18
|
-
from . import frontmatter
|
|
17
|
+
from . import facts_dir, frontmatter, hashing, raw_dir, wiki_dir
|
|
19
18
|
from . import manifest as manifest_mod
|
|
20
19
|
from .errors import DataError
|
|
21
20
|
|
|
@@ -94,7 +94,7 @@ def run(
|
|
|
94
94
|
raise DataError(f"query failed: {e}") from e
|
|
95
95
|
|
|
96
96
|
columns = [d[0] for d in cur.description] if cur.description else []
|
|
97
|
-
rows = [dict(zip(columns, r)) for r in cur.fetchall()]
|
|
97
|
+
rows = [dict(zip(columns, r, strict=True)) for r in cur.fetchall()]
|
|
98
98
|
return columns, rows
|
|
99
99
|
finally:
|
|
100
100
|
con.close()
|