scriptoria 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. {scriptoria-0.2.0 → scriptoria-0.3.0}/.gitignore +5 -2
  2. {scriptoria-0.2.0 → scriptoria-0.3.0}/PKG-INFO +16 -1
  3. {scriptoria-0.2.0 → scriptoria-0.3.0}/pyproject.toml +23 -2
  4. {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/__init__.py +1 -1
  5. {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/cli.py +58 -0
  6. {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/embeddings.py +3 -3
  7. scriptoria-0.3.0/src/scrip/facts.py +426 -0
  8. {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/graph.py +1 -2
  9. {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/query.py +1 -1
  10. scriptoria-0.3.0/tests/test_fact_cmd.py +362 -0
  11. scriptoria-0.3.0/tests/test_index_cmd.py +77 -0
  12. scriptoria-0.3.0/tests/test_json_shapes.py +57 -0
  13. {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_manifest.py +1 -0
  14. scriptoria-0.3.0/tests/test_status_cmd.py +56 -0
  15. scriptoria-0.3.0/tests/test_unlock_cmd.py +41 -0
  16. scriptoria-0.3.0/tests/test_watch.py +48 -0
  17. {scriptoria-0.2.0 → scriptoria-0.3.0}/uv.lock +7 -2
  18. scriptoria-0.2.0/tests/test_watch.py +0 -22
  19. {scriptoria-0.2.0 → scriptoria-0.3.0}/README.md +0 -0
  20. {scriptoria-0.2.0 → scriptoria-0.3.0}/pyrightconfig.json +0 -0
  21. {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/anchors.py +0 -0
  22. {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/blocks.py +0 -0
  23. {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/errors.py +0 -0
  24. {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/frontmatter.py +0 -0
  25. {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/hashing.py +0 -0
  26. {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/ingest.py +0 -0
  27. {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/lock.py +0 -0
  28. {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/manifest.py +0 -0
  29. {scriptoria-0.2.0 → scriptoria-0.3.0}/src/scrip/retrieval.py +0 -0
  30. {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/conftest.py +0 -0
  31. {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_anchor_cmd.py +0 -0
  32. {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_anchors.py +0 -0
  33. {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_blocks.py +0 -0
  34. {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_embeddings.py +0 -0
  35. {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_graph_status.py +0 -0
  36. {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_hashing.py +0 -0
  37. {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_ingest.py +0 -0
  38. {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_lock.py +0 -0
  39. {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_new_cmd.py +0 -0
  40. {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_query.py +1 -1
  41. {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_retrieval.py +0 -0
  42. {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_stamp.py +0 -0
  43. {scriptoria-0.2.0 → scriptoria-0.3.0}/tests/test_verify.py +1 -1
@@ -21,8 +21,11 @@ build/
21
21
  # Advisory write lock: ephemeral runtime state, never committed (see SPEC §11)
22
22
  /.kb/lock
23
23
 
24
- # NOTE: .kb/manifest.json is intentionally NOT ignored it is a committed,
25
- # regenerable cache (see SPEC §8). On a merge conflict, discard it and run
24
+ # Manifest: a regenerable speed cache. SPEC §8 says it *may* be committed; we
25
+ # choose not to — it stores (mtime, size) that are wrong on every fresh clone
26
+ # anyway, and its hashes/timestamps churn diffs. Rebuild any time with
26
27
  # `scrip status --rebuild-manifest`.
28
+ /.kb/manifest.json
29
+
27
30
  # roborev snapshots
28
31
  /.roborev/
@@ -1,8 +1,23 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scriptoria
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Deterministic scriptorium-keeper (the `scrip` CLI): staleness, provenance integrity, and fact queries for an agent-compiled knowledge base
5
+ Project-URL: Homepage, https://github.com/coredipper/scriptorium
6
+ Project-URL: Changelog, https://github.com/coredipper/scriptorium/blob/main/CHANGELOG.md
7
+ Project-URL: Issues, https://github.com/coredipper/scriptorium/issues
5
8
  License: MIT
9
+ Keywords: agent,knowledge-base,markdown,provenance,staleness,wiki
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Environment :: Console
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Programming Language :: Python :: 3.14
20
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
6
21
  Requires-Python: >=3.10
7
22
  Requires-Dist: duckdb>=1.0
8
23
  Requires-Dist: pyyaml>=6.0
@@ -2,16 +2,35 @@
2
2
  # Distribution name on PyPI is `scriptoria` (scrip/scriptorium were taken); the
3
3
  # CLI command and the import package both remain `scrip`.
4
4
  name = "scriptoria"
5
- version = "0.2.0"
5
+ version = "0.3.0"
6
6
  description = "Deterministic scriptorium-keeper (the `scrip` CLI): staleness, provenance integrity, and fact queries for an agent-compiled knowledge base"
7
7
  readme = "README.md"
8
8
  requires-python = ">=3.10"
9
9
  license = { text = "MIT" }
10
+ keywords = ["knowledge-base", "provenance", "staleness", "agent", "markdown", "wiki"]
11
+ classifiers = [
12
+ "Development Status :: 4 - Beta",
13
+ "Environment :: Console",
14
+ "Intended Audience :: Developers",
15
+ "Operating System :: OS Independent",
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.10",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Programming Language :: Python :: 3.13",
21
+ "Programming Language :: Python :: 3.14",
22
+ "Topic :: Text Processing :: Markup :: Markdown",
23
+ ]
10
24
  dependencies = [
11
25
  "duckdb>=1.0",
12
26
  "pyyaml>=6.0",
13
27
  ]
14
28
 
29
+ [project.urls]
30
+ Homepage = "https://github.com/coredipper/scriptorium"
31
+ Changelog = "https://github.com/coredipper/scriptorium/blob/main/CHANGELOG.md"
32
+ Issues = "https://github.com/coredipper/scriptorium/issues"
33
+
15
34
  [project.scripts]
16
35
  scrip = "scrip.cli:main"
17
36
 
@@ -26,7 +45,9 @@ embeddings = ["model2vec>=0.3", "numpy>=1.24"]
26
45
  ingest = ["trafilatura>=1.8", "pypdf>=4.0"]
27
46
 
28
47
  [dependency-groups]
29
- dev = ["pytest>=8"]
48
+ # numpy is here so the embeddings index/search path is testable with a toy
49
+ # encoder — the real backend (model2vec) stays an optional [embeddings] extra.
50
+ dev = ["pytest>=8", "numpy>=1.24"]
30
51
 
31
52
  [build-system]
32
53
  requires = ["hatchling"]
@@ -13,7 +13,7 @@ from __future__ import annotations
13
13
 
14
14
  from pathlib import Path
15
15
 
16
- __version__ = "0.2.0"
16
+ __version__ = "0.3.0"
17
17
 
18
18
  # --- canonical vault layout ------------------------------------------------
19
19
  # ``root`` is the repo/instance root: the directory containing ``vault/``.
@@ -336,6 +336,40 @@ def cmd_new(args: argparse.Namespace) -> int:
336
336
  return 0
337
337
 
338
338
 
339
def cmd_fact_add(args: argparse.Namespace) -> int:
    """Run ``scrip fact add``: read proposed NDJSON records and append them.

    Records are read from ``--file`` when it is given, otherwise from stdin.
    The parsed records are handed to ``facts.add``; its result is either
    emitted through ``_emit`` (``--json``) or printed as a per-record report.

    Returns 1 when the result carries any failures, otherwise 0. A ``--file``
    that cannot be read raises ``errors.UsageError``.
    """
    from . import facts

    root = resolve_root(args.root)
    if not args.file:
        text = sys.stdin.read()
    else:
        try:
            text = Path(args.file).read_text(encoding="utf-8")
        except OSError as e:
            raise errors.UsageError(f"cannot read --file: {e}") from e

    result = facts.add(root, args.table, facts.parse_ndjson(text))

    if args.json:
        _emit(result)
        return 1 if result["failures"] else 0

    appended = result["appended"]
    failures = result["failures"]

    # Human-readable report: one line per appended / skipped / failed record.
    for rec in appended:
        ident = rec.get("claim_id") or rec.get("entity_id") or f"{rec['src']} -> {rec['dst']}"
        print(f" appended {ident}")
    for dup in result["skipped"]:
        print(f" = record {dup['index']} skipped (duplicate)")
    for bad in failures:
        print(f" ✗ record {bad['index']}: {bad['status']} — {bad['detail']}")

    if failures:
        print(
            f"nothing appended: {len(failures)} record(s) failed "
            f"(the batch is all-or-nothing)"
        )
        return 1

    print(f"{len(appended)} record(s) appended to facts/")
    if appended:
        print(" next: `scrip stamp vault/facts/_meta.yaml`, then `scrip verify`")
    return 0
371
+
372
+
339
373
  def cmd_ingest(args: argparse.Namespace) -> int:
340
374
  from . import ingest, lock
341
375
 
@@ -515,6 +549,30 @@ def build_parser() -> argparse.ArgumentParser:
515
549
  pn.add_argument("--title", help="human title (default: the slug)")
516
550
  pn.set_defaults(func=cmd_new)
517
551
 
552
+ pfact = sub.add_parser(
553
+ "fact",
554
+ help="validated writers for the facts/ layer (claims mint verified anchors)",
555
+ )
556
+ fact_sub = pfact.add_subparsers(dest="fact_command", required=True, metavar="<action>")
557
+ pfa = fact_sub.add_parser(
558
+ "add",
559
+ parents=[common],
560
+ help="validate proposed NDJSON records and append them all-or-nothing; "
561
+ "claims carry a verbatim `quote` and scrip mints the anchor/id/timestamp",
562
+ )
563
+ pfa.add_argument(
564
+ "--table",
565
+ choices=["claims", "entities", "edges"],
566
+ default="claims",
567
+ help="facts table to append to (default: claims)",
568
+ )
569
+ fact_in = pfa.add_mutually_exclusive_group(required=True)
570
+ fact_in.add_argument("--file", metavar="NDJSON", help="read proposed records from a file")
571
+ fact_in.add_argument(
572
+ "--stdin", action="store_true", help="read proposed records from stdin"
573
+ )
574
+ pfa.set_defaults(func=cmd_fact_add)
575
+
518
576
  pin = sub.add_parser(
519
577
  "ingest",
520
578
  parents=[common],
@@ -47,7 +47,7 @@ def _get_model():
47
47
 
48
48
  os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
49
49
  try:
50
- from model2vec import StaticModel
50
+ from model2vec import StaticModel # pyright: ignore[reportMissingImports]
51
51
  except Exception:
52
52
  return None
53
53
  try:
@@ -95,7 +95,7 @@ def build_index(root: Path) -> int:
95
95
  model = _get_model()
96
96
  if model is None:
97
97
  raise RuntimeError("no embeddings backend available")
98
- import numpy as np
98
+ import numpy as np # pyright: ignore[reportMissingImports]
99
99
 
100
100
  items = list(_iter_blocks(root))
101
101
  if items:
@@ -126,7 +126,7 @@ def vector_search(root: Path, query: str, k: int = 5):
126
126
  d = _embeddings_dir(root)
127
127
  if model is None or not (d / "vectors.npy").exists() or not (d / "meta.json").exists():
128
128
  return None
129
- import numpy as np
129
+ import numpy as np # pyright: ignore[reportMissingImports]
130
130
 
131
131
  meta = json.loads((d / "meta.json").read_text(encoding="utf-8"))
132
132
  items = meta["items"]
@@ -0,0 +1,426 @@
1
+ """Validated, locked writers for the facts/ layer — behind ``scrip fact add``.
2
+
3
+ The agent (or harness) *proposes* records; scrip owns everything checkable,
4
+ mirroring how ``scrip anchor`` mints citations for wiki prose:
5
+
6
+ - a proposed claim carries a verbatim ``quote`` — never an ``anchor``,
7
+ ``claim_id``, or ``extracted_at``; those are minted here, and the anchor is
8
+ verified to resolve uniquely in the stored source text;
9
+ - the batch is **all-or-nothing**: one unresolvable quote (or conflicting
10
+ entity id) means nothing is appended, and every failure is reported with its
11
+ input index so the caller can retry just the failing records;
12
+ - exact duplicates (same source, normalized quote, triple, and polarity) are
13
+ skipped and reported, so re-running an extraction is safe;
14
+ - quote verification, ids, and the append all happen under the advisory write
15
+ lock; claim sources are merged into ``facts/_meta.yaml`` ``derived-from`` and
16
+ every append (any table) drops the set's ``input-hash`` — the facts set
17
+ honestly shows STALE until ``scrip stamp`` re-blesses it.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import json
23
+ import re
24
+ from datetime import datetime, timezone
25
+ from pathlib import Path
26
+
27
+ import yaml
28
+
29
+ from . import anchors, facts_dir, lock
30
+ from .errors import DataError, UsageError
31
+
32
+ _POLARITIES = ("asserts", "denies", "qualifies")
33
+
34
+ # table name -> file under vault/facts/
35
+ _FILES = {
36
+ "claims": "claims.ndjson",
37
+ "entities": "entities.ndjson",
38
+ "edges": "graph.ndjson",
39
+ }
40
+
41
+ # Fields scrip mints itself; proposing them is a schema error, not a finding.
42
+ _SCRIP_OWNED = ("claim_id", "anchor", "extracted_at")
43
+
44
+ _CLAIM_REQUIRED = ("quote", "source_id", "subject", "predicate", "object", "polarity", "confidence")
45
+ _CLAIM_ALLOWED = frozenset((*_CLAIM_REQUIRED, "claim_text", "tags"))
46
+ _ENTITY_REQUIRED = ("entity_id", "name", "kind")
47
+ _ENTITY_ALLOWED = frozenset((*_ENTITY_REQUIRED, "tags"))
48
+ _EDGE_REQUIRED = ("src", "dst", "kind")
49
+ _EDGE_ALLOWED = frozenset(_EDGE_REQUIRED)
50
+
51
+ # Same conservative shape ``cli._safe_slug`` enforces — no path separators,
52
+ # '..', or leading dot — applied to source ids arriving as record *data*.
53
+ _SLUG_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]*$")
54
+
55
+ _CLAIM_ID_RE = re.compile(r"clm_(\d+)")
56
+
57
+
58
def _now() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    stamp = datetime.now(tz=timezone.utc)
    return stamp.strftime("%Y-%m-%dT%H:%M:%SZ")
60
+
61
+
62
+ # --------------------------------------------------------------------------- #
63
+ # Input parsing & structural validation (DataError, exit 3)
64
+ # --------------------------------------------------------------------------- #
65
# Line boundaries for proposed NDJSON input. This is every separator that
# ``str.splitlines()`` honours EXCEPT U+0085, U+2028 and U+2029: those three
# are legal unescaped inside a JSON string, so splitting on them would cut a
# valid record (e.g. a verbatim quote copied from a web page) in half.
_INPUT_LINE_BREAK_RE = re.compile(r"\r\n|[\n\r\x0b\x0c\x1c\x1d\x1e]")


def parse_ndjson(text: str) -> list[dict]:
    """Parse proposed records, one JSON object per line.

    Blank lines are skipped. Line numbers in error messages are 1-based and
    count blank lines too.

    Raises:
        DataError: a line is not valid JSON, or is valid JSON but not an
            object; the message carries the line number.
        UsageError: the input contains no records at all.
    """
    records: list[dict] = []
    for lineno, raw_line in enumerate(_INPUT_LINE_BREAK_RE.split(text), start=1):
        line = raw_line.strip()
        if not line:
            continue
        try:
            rec = json.loads(line)
        except json.JSONDecodeError as e:
            raise DataError(f"input line {lineno}: invalid JSON: {e}") from e
        if not isinstance(rec, dict):
            raise DataError(f"input line {lineno}: expected a JSON object")
        records.append(rec)
    if not records:
        raise UsageError("no records in input")
    return records
83
+
84
+
85
def _check_str(rec: dict, key: str, index: int, *, allow_blank: bool = False) -> None:
    """Require ``rec[key]`` to be a string.

    Unless ``allow_blank`` is set, the string must also contain something
    other than whitespace. Raises ``DataError`` naming the record ``index``
    and the offending ``key`` otherwise.
    """
    value = rec[key]
    if isinstance(value, str) and (allow_blank or value.strip()):
        return
    raise DataError(f"record {index}: '{key}' must be a non-empty string")
89
+
90
+
91
def _check_tags(rec: dict, index: int) -> None:
    """Require ``rec["tags"]``, when present and not ``None``, to be a list of
    strings; raise ``DataError`` naming the record ``index`` otherwise."""
    tags = rec.get("tags")
    if tags is None:
        return
    if isinstance(tags, list) and all(isinstance(tag, str) for tag in tags):
        return
    raise DataError(f"record {index}: 'tags' must be a list of strings")
97
+
98
+
99
def _check_shape(rec: dict, index: int, required: tuple[str, ...], allowed: frozenset[str]) -> None:
    """Check a proposed record's field names against a table's schema.

    Three checks run in order, each raising ``DataError`` for record
    ``index`` on the first violation:

    1. none of the scrip-minted fields (``_SCRIP_OWNED``) may be proposed;
    2. every field must be in ``allowed``;
    3. every field in ``required`` must be present.
    """
    presumptuous = [name for name in _SCRIP_OWNED if name in rec]
    if presumptuous:
        raise DataError(
            f"record {index}: scrip mints {', '.join(presumptuous)} itself — propose a "
            f"verbatim 'quote', not precomputed ids/anchors/timestamps"
        )

    extra = sorted(set(rec) - allowed)
    if extra:
        raise DataError(f"record {index}: unknown field(s): {', '.join(extra)}")

    absent = sorted(name for name in required if name not in rec)
    if absent:
        raise DataError(f"record {index}: missing required field(s): {', '.join(absent)}")
112
+
113
+
114
def _validate(table: str, rec: dict, index: int) -> None:
    """Structurally validate one proposed record for ``table``.

    ``table`` is ``"claims"`` or ``"entities"``; any other value is treated
    as edges. Every violation raises ``DataError`` naming the record
    ``index``.
    """
    if table == "entities":
        _check_shape(rec, index, _ENTITY_REQUIRED, _ENTITY_ALLOWED)
        for field in _ENTITY_REQUIRED:
            _check_str(rec, field, index)
        prefix = "entity/"
        entity_id = rec["entity_id"]
        well_formed = entity_id.startswith(prefix) and _SLUG_RE.fullmatch(
            entity_id[len(prefix):]
        )
        if not well_formed:
            raise DataError(f"record {index}: entity_id must look like entity/<slug>")
        _check_tags(rec, index)
        return

    if table != "claims":  # edges
        _check_shape(rec, index, _EDGE_REQUIRED, _EDGE_ALLOWED)
        for field in _EDGE_REQUIRED:
            _check_str(rec, field, index)
        return

    # --- claims ---
    _check_shape(rec, index, _CLAIM_REQUIRED, _CLAIM_ALLOWED)
    # A blank quote is allowed through here: its emptiness is reported later
    # as a per-record finding rather than as a schema error.
    _check_str(rec, "quote", index, allow_blank=True)
    for field in ("source_id", "subject", "predicate", "object"):
        _check_str(rec, field, index)
    if "claim_text" in rec:
        _check_str(rec, "claim_text", index, allow_blank=True)

    if rec["polarity"] not in _POLARITIES:
        raise DataError(
            f"record {index}: polarity must be one of {', '.join(_POLARITIES)}"
        )

    confidence = rec["confidence"]
    # bool is a subclass of int, so it has to be excluded explicitly.
    is_number = isinstance(confidence, (int, float)) and not isinstance(confidence, bool)
    if not (is_number and 0 <= confidence <= 1):
        raise DataError(f"record {index}: confidence must be a number in [0, 1]")

    _check_tags(rec, index)
143
+
144
+
145
+ # --------------------------------------------------------------------------- #
146
+ # Claim content checks (per-record findings, exit 1)
147
+ # --------------------------------------------------------------------------- #
148
def _resolve_claim(
    root: Path, rec: dict, index: int, src_cache: dict[str, str | None]
) -> tuple[dict | None, dict | None]:
    """Mint and verify the anchor for a single proposed claim.

    Returns ``(failure, resolved)`` with exactly one side non-None. A failure
    dict carries ``index``, ``status``, ``source_id``, ``quote`` and
    ``detail``. ``resolved`` carries the ``raw/``-prefixed ``source_id``, the
    minted ``anchor`` and its ``qh``.

    ``src_cache`` maps source ids to their text (``None`` when loading the
    source raised ``DataError``) and is filled in as sources are first seen.
    """
    quote = rec["quote"]

    def failure(status: str, detail: str, source_id: str) -> tuple[dict, None]:
        report = {
            "index": index,
            "status": status,
            "source_id": source_id,
            "quote": quote,
            "detail": detail,
        }
        return report, None

    given = rec["source_id"]
    source_id = given if given.startswith("raw/") else f"raw/{given}"
    if _SLUG_RE.fullmatch(source_id.removeprefix("raw/")) is None:
        return failure("INVALID_SOURCE", "unsafe source id (path separators or '..')", given)

    # Load each source at most once per batch; a failed load is cached as None.
    try:
        text = src_cache[source_id]
    except KeyError:
        try:
            text = anchors.source_text(root, source_id)
        except DataError:
            text = None
        src_cache[source_id] = text
    if text is None:
        return failure("MISSING_SOURCE", "source does not exist in vault/raw/", source_id)

    if not anchors.normalize(quote):
        return failure("EMPTY_QUOTE", "quote is empty after normalization", source_id)

    anchor = anchors.make_anchor(text, quote)
    status = anchors.resolve(text, anchor)
    if status != "OK":
        if status == "AMBIGUOUS":
            remedy = "lengthen the quote until it is unique"
        else:
            remedy = "the quote must appear verbatim in the source"
        return failure(status, remedy, source_id)

    resolved = {
        "source_id": source_id,
        "anchor": anchor,
        "qh": anchors.parse_anchor(anchor)["qh"],
    }
    return None, resolved
197
+
198
+
199
+ # --------------------------------------------------------------------------- #
200
+ # Existing-table reads & keys
201
+ # --------------------------------------------------------------------------- #
202
# Line boundaries for stored NDJSON tables. This is every separator that
# ``str.splitlines()`` honours EXCEPT U+0085, U+2028 and U+2029: records are
# written with ``ensure_ascii=False``, so those characters can sit unescaped
# inside a stored string and must not be mistaken for the end of a record.
_TABLE_LINE_BREAK_RE = re.compile(r"\r\n|[\n\r\x0b\x0c\x1c\x1d\x1e]")


def _read_table(path: Path) -> tuple[list[dict], str]:
    """Read an NDJSON table, returning ``(records, raw_text)``.

    A missing file yields ``([], "")``. Blank lines are skipped. ``raw_text``
    is the file content exactly as read, so the caller can tell whether it
    ends with a newline before appending.

    Raises:
        DataError: a line is not valid JSON, or is valid JSON but not an
            object — the vault on disk violates the contract. Rejecting
            non-objects here keeps callers' ``rec.get(...)`` lookups from
            failing later with an unrelated ``AttributeError``.
    """
    if not path.exists():
        return [], ""
    text = path.read_text(encoding="utf-8")
    records: list[dict] = []
    for lineno, raw_line in enumerate(_TABLE_LINE_BREAK_RE.split(text), start=1):
        line = raw_line.strip()
        if not line:
            continue
        try:
            rec = json.loads(line)
        except json.JSONDecodeError as e:
            raise DataError(f"{path.name}:{lineno}: invalid JSON: {e}") from e
        if not isinstance(rec, dict):
            raise DataError(f"{path.name}:{lineno}: expected a JSON object")
        records.append(rec)
    return records, text
219
+
220
+
221
def _claim_key(source_id: str, qh: str, rec: dict) -> tuple:
    """Build the duplicate-detection key for a claim: the source id, the
    anchor's ``qh``, then the record's subject, predicate, object and
    polarity (``None`` for any of those that are absent)."""
    fields = ("subject", "predicate", "object", "polarity")
    return (source_id, qh, *(rec.get(name) for name in fields))
230
+
231
+
232
def _existing_claim_keys(existing: list[dict]) -> dict[tuple, str]:
    """Map each stored claim's duplicate key to its ``claim_id`` (as a string).

    Records whose ``anchor`` is not a string are ignored. When several
    records share a key, the last one wins.
    """
    return {
        _claim_key(
            rec.get("source_id", ""),
            anchors.parse_anchor(rec["anchor"])["qh"],
            rec,
        ): str(rec.get("claim_id", ""))
        for rec in existing
        if isinstance(rec.get("anchor"), str)
    }
241
+
242
+
243
def _next_claim_id(existing: list[dict]) -> tuple[int, int]:
    """Return ``(next_number, pad_width)`` continuing the ``clm_NNNN`` sequence.

    Only ``claim_id`` values that fully match ``_CLAIM_ID_RE`` are counted.
    The pad width is at least 4 and grows with the highest number seen.
    """
    highest = 0
    for rec in existing:
        match = _CLAIM_ID_RE.fullmatch(str(rec.get("claim_id", "")))
        if match is not None:
            highest = max(highest, int(match.group(1)))
    width = max(4, len(str(highest)))
    return highest + 1, width
252
+
253
+
254
+ # --------------------------------------------------------------------------- #
255
+ # facts/_meta.yaml: merge derived-from, never stamp
256
+ # --------------------------------------------------------------------------- #
257
def _load_meta(root: Path) -> dict:
    """Parse ``facts/_meta.yaml``, or return a default mapping when it is absent.

    This is called *before* any append so that a malformed file fails the
    whole add: claims must never land with their source missing from
    ``derived-from`` (an undetectable staleness lie).

    Raises ``DataError`` when the file is not valid YAML or does not hold a
    mapping. An empty file is treated as an empty mapping.
    """
    meta_path = facts_dir(root) / "_meta.yaml"
    if meta_path.exists():
        try:
            loaded = yaml.safe_load(meta_path.read_text(encoding="utf-8")) or {}
        except yaml.YAMLError as e:
            raise DataError(f"invalid facts/_meta.yaml: {e}") from e
        if isinstance(loaded, dict):
            return loaded
        raise DataError("invalid facts/_meta.yaml: expected a mapping")

    # No meta file yet: start from the default facts-set description.
    members = [
        "facts/entities.ndjson",
        "facts/claims.ndjson",
        "facts/graph.ndjson",
    ]
    return {
        "id": "facts/core",
        "type": "facts.set",
        "derived-from": [],
        "members": members,
        "confidence": 0.0,
    }
281
+
282
+
283
def _write_meta(root: Path, data: dict, new_sources: list[str]) -> None:
    """Merge ``new_sources`` into ``derived-from``, drop the stamp, and write
    ``facts/_meta.yaml``.

    ``data`` is updated in place. Sources already listed keep their position;
    new ones are added at the end in the order given.
    """
    merged = list(data.get("derived-from") or [])
    for source_id in new_sources:
        if source_id in merged:
            continue
        merged.append(source_id)
    data["derived-from"] = merged

    # The stamp is dropped on EVERY append. With an unchanged derived-from the
    # recomputed input-hash would still match the stored one, and status would
    # report OK over facts nobody has blessed. Without input-hash the set is
    # STALE ("no input-hash recorded") until
    # `scrip stamp vault/facts/_meta.yaml`. last-compiled is left alone as the
    # historical record of the last bless.
    data.pop("input-hash", None)

    rendered = yaml.safe_dump(data, sort_keys=False, allow_unicode=True)
    meta_path = facts_dir(root) / "_meta.yaml"
    meta_path.write_text(rendered, encoding="utf-8")
298
+
299
+
300
+ # --------------------------------------------------------------------------- #
301
+ # The writer
302
+ # --------------------------------------------------------------------------- #
303
def add(root: Path, table: str, proposals: list[dict]) -> dict:
    """Validate ``proposals`` and append them to ``facts/`` all-or-nothing.

    Args:
        root: the instance root passed on to ``facts_dir`` and the write lock.
        table: one of the keys of ``_FILES`` (claims, entities, edges).
        proposals: parsed proposed records, validated here before any I/O.

    Returns:
        ``{"table", "appended", "skipped", "failures"}``. The caller maps a
        non-empty ``failures`` to exit 1; in that case ``appended`` is empty
        and nothing was written.

    Raises:
        UsageError: ``table`` is not a known facts table.
        DataError: a proposal is structurally invalid, or the table or
            ``facts/_meta.yaml`` on disk cannot be parsed.

    Write order: ``facts/_meta.yaml`` is rewritten (sources merged into
    ``derived-from``, ``input-hash`` dropped) *before* the records are
    appended. If the run dies between the two writes, the set is left STALE
    with nothing new in it. The opposite order could leave new records on
    disk under a still-valid stamp and with their sources missing from
    ``derived-from``.
    """
    if table not in _FILES:
        raise UsageError(f"unknown facts table: {table}")
    for i, rec in enumerate(proposals):
        _validate(table, rec, i)

    failures: list[dict] = []
    resolved: list[dict | None] = [None] * len(proposals)
    path = facts_dir(root) / _FILES[table]
    with lock.write_lock(root):
        if table == "claims":
            # Resolve quotes INSIDE the lock: raw/ only changes via a *locked*
            # `ingest --reingest`, so holding the lock from verification through
            # append closes the window where a re-ingest could land between the
            # two and silently break the just-minted anchors.
            src_cache: dict[str, str | None] = {}
            for i, rec in enumerate(proposals):
                fail, res = _resolve_claim(root, rec, i, src_cache)
                if fail:
                    failures.append(fail)
                else:
                    resolved[i] = res
            if failures:
                return {"table": table, "appended": [], "skipped": [], "failures": failures}

        existing, existing_text = _read_table(path)
        meta = _load_meta(root)  # parse before appending: fail whole, not half
        appended: list[dict] = []
        skipped: list[dict] = []

        if table == "claims":
            keys = _existing_claim_keys(existing)
            number, width = _next_claim_id(existing)
            now = _now()  # one timestamp for the whole batch
            for i, rec in enumerate(proposals):
                res = resolved[i]
                assert res is not None  # failures returned above
                key = _claim_key(res["source_id"], res["qh"], rec)
                if key in keys:
                    skipped.append({"index": i, "reason": "duplicate", "existing_id": keys[key]})
                    continue
                cid = f"clm_{number:0{width}d}"
                number += 1
                full = {
                    "claim_id": cid,
                    "subject": rec["subject"],
                    "predicate": rec["predicate"],
                    "object": rec["object"],
                    "claim_text": rec.get("claim_text") or rec["quote"],
                    "source_id": res["source_id"],
                    "anchor": res["anchor"],
                    "confidence": rec["confidence"],
                    "polarity": rec["polarity"],
                    "extracted_at": now,
                    "tags": rec.get("tags") or [],
                }
                # Register the key so a repeat later in this batch is skipped too.
                keys[key] = cid
                appended.append(full)
        elif table == "entities":
            def canon(rec: dict) -> dict:
                # Canonical field set and order used for comparison and storage.
                return {
                    "entity_id": rec["entity_id"],
                    "name": rec["name"],
                    "kind": rec["kind"],
                    "tags": rec.get("tags") or [],
                }

            byid = {rec.get("entity_id"): canon(rec) for rec in existing if "entity_id" in rec
                    and isinstance(rec.get("name"), str) and isinstance(rec.get("kind"), str)}
            for i, rec in enumerate(proposals):
                new = canon(rec)
                seen = byid.get(new["entity_id"])
                if seen is None:
                    byid[new["entity_id"]] = new
                    appended.append(new)
                elif seen == new:
                    skipped.append(
                        {"index": i, "reason": "duplicate", "existing_id": new["entity_id"]}
                    )
                else:
                    failures.append(
                        {
                            "index": i,
                            "status": "ID_CONFLICT",
                            "entity_id": new["entity_id"],
                            "detail": "an entity with this id already exists with different fields",
                        }
                    )
        else:  # edges
            seen_edges = {
                (rec.get("src"), rec.get("dst"), rec.get("kind")) for rec in existing
            }
            for i, rec in enumerate(proposals):
                key = (rec["src"], rec["dst"], rec["kind"])
                if key in seen_edges:
                    skipped.append({"index": i, "reason": "duplicate", "existing_id": None})
                    continue
                seen_edges.add(key)
                appended.append({"src": rec["src"], "dst": rec["dst"], "kind": rec["kind"]})

        if failures:
            return {"table": table, "appended": [], "skipped": skipped, "failures": failures}

        if appended:
            path.parent.mkdir(parents=True, exist_ok=True)

            # Only claims contribute sources to derived-from.
            new_sources: list[str] = []
            if table == "claims":
                for rec in appended:
                    if rec["source_id"] not in new_sources:
                        new_sources.append(rec["source_id"])
            # Meta first: a failure after this point leaves the set STALE
            # rather than leaving unblessed records under a valid stamp.
            _write_meta(root, meta, new_sources)

            payload = "".join(json.dumps(r, ensure_ascii=False) + "\n" for r in appended)
            with open(path, "a", encoding="utf-8") as f:
                # Repair a missing trailing newline so records never share a line.
                if existing_text and not existing_text.endswith("\n"):
                    f.write("\n")
                f.write(payload)

    return {"table": table, "appended": appended, "skipped": skipped, "failures": []}
@@ -13,9 +13,8 @@ from pathlib import Path
13
13
 
14
14
  import yaml
15
15
 
16
- from . import facts_dir, hashing, raw_dir, wiki_dir
17
16
  from . import blocks as blocks_mod
18
- from . import frontmatter
17
+ from . import facts_dir, frontmatter, hashing, raw_dir, wiki_dir
19
18
  from . import manifest as manifest_mod
20
19
  from .errors import DataError
21
20
 
@@ -94,7 +94,7 @@ def run(
94
94
  raise DataError(f"query failed: {e}") from e
95
95
 
96
96
  columns = [d[0] for d in cur.description] if cur.description else []
97
- rows = [dict(zip(columns, r)) for r in cur.fetchall()]
97
+ rows = [dict(zip(columns, r, strict=True)) for r in cur.fetchall()]
98
98
  return columns, rows
99
99
  finally:
100
100
  con.close()