scriptoria 0.2.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. {scriptoria-0.2.0 → scriptoria-0.4.0}/.gitignore +5 -2
  2. {scriptoria-0.2.0 → scriptoria-0.4.0}/PKG-INFO +16 -1
  3. {scriptoria-0.2.0 → scriptoria-0.4.0}/pyproject.toml +23 -2
  4. {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/__init__.py +1 -1
  5. {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/cli.py +125 -0
  6. {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/embeddings.py +3 -3
  7. scriptoria-0.4.0/src/scrip/facts.py +426 -0
  8. {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/graph.py +1 -2
  9. {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/query.py +1 -1
  10. scriptoria-0.4.0/src/scrip/similar.py +168 -0
  11. {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/conftest.py +8 -4
  12. scriptoria-0.4.0/tests/test_fact_cmd.py +362 -0
  13. scriptoria-0.4.0/tests/test_index_cmd.py +77 -0
  14. scriptoria-0.4.0/tests/test_json_shapes.py +57 -0
  15. {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_manifest.py +1 -0
  16. scriptoria-0.4.0/tests/test_similar_cmd.py +234 -0
  17. scriptoria-0.4.0/tests/test_status_cmd.py +56 -0
  18. scriptoria-0.4.0/tests/test_unlock_cmd.py +41 -0
  19. scriptoria-0.4.0/tests/test_version.py +11 -0
  20. scriptoria-0.4.0/tests/test_watch.py +48 -0
  21. {scriptoria-0.2.0 → scriptoria-0.4.0}/uv.lock +7 -2
  22. scriptoria-0.2.0/tests/test_watch.py +0 -22
  23. {scriptoria-0.2.0 → scriptoria-0.4.0}/README.md +0 -0
  24. {scriptoria-0.2.0 → scriptoria-0.4.0}/pyrightconfig.json +0 -0
  25. {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/anchors.py +0 -0
  26. {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/blocks.py +0 -0
  27. {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/errors.py +0 -0
  28. {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/frontmatter.py +0 -0
  29. {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/hashing.py +0 -0
  30. {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/ingest.py +0 -0
  31. {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/lock.py +0 -0
  32. {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/manifest.py +0 -0
  33. {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/retrieval.py +0 -0
  34. {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_anchor_cmd.py +0 -0
  35. {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_anchors.py +0 -0
  36. {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_blocks.py +0 -0
  37. {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_embeddings.py +0 -0
  38. {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_graph_status.py +0 -0
  39. {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_hashing.py +0 -0
  40. {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_ingest.py +0 -0
  41. {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_lock.py +0 -0
  42. {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_new_cmd.py +0 -0
  43. {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_query.py +1 -1
  44. {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_retrieval.py +0 -0
  45. {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_stamp.py +0 -0
  46. {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_verify.py +1 -1
@@ -21,8 +21,11 @@ build/
21
21
  # Advisory write lock: ephemeral runtime state, never committed (see SPEC §11)
22
22
  /.kb/lock
23
23
 
24
- # NOTE: .kb/manifest.json is intentionally NOT ignored it is a committed,
25
- # regenerable cache (see SPEC §8). On a merge conflict, discard it and run
24
+ # Manifest: a regenerable speed cache. SPEC §8 says it *may* be committed; we
25
+ # choose not to — it stores (mtime, size) that are wrong on every fresh clone
26
+ # anyway, and its hashes/timestamps churn diffs. Rebuild any time with
26
27
  # `scrip status --rebuild-manifest`.
28
+ /.kb/manifest.json
29
+
27
30
  # roborev snapshots
28
31
  /.roborev/
@@ -1,8 +1,23 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scriptoria
3
- Version: 0.2.0
3
+ Version: 0.4.0
4
4
  Summary: Deterministic scriptorium-keeper (the `scrip` CLI): staleness, provenance integrity, and fact queries for an agent-compiled knowledge base
5
+ Project-URL: Homepage, https://github.com/coredipper/scriptorium
6
+ Project-URL: Changelog, https://github.com/coredipper/scriptorium/blob/main/CHANGELOG.md
7
+ Project-URL: Issues, https://github.com/coredipper/scriptorium/issues
5
8
  License: MIT
9
+ Keywords: agent,knowledge-base,markdown,provenance,staleness,wiki
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Environment :: Console
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Programming Language :: Python :: 3.14
20
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
6
21
  Requires-Python: >=3.10
7
22
  Requires-Dist: duckdb>=1.0
8
23
  Requires-Dist: pyyaml>=6.0
@@ -2,16 +2,35 @@
2
2
  # Distribution name on PyPI is `scriptoria` (scrip/scriptorium were taken); the
3
3
  # CLI command and the import package both remain `scrip`.
4
4
  name = "scriptoria"
5
- version = "0.2.0"
5
+ version = "0.4.0"
6
6
  description = "Deterministic scriptorium-keeper (the `scrip` CLI): staleness, provenance integrity, and fact queries for an agent-compiled knowledge base"
7
7
  readme = "README.md"
8
8
  requires-python = ">=3.10"
9
9
  license = { text = "MIT" }
10
+ keywords = ["knowledge-base", "provenance", "staleness", "agent", "markdown", "wiki"]
11
+ classifiers = [
12
+ "Development Status :: 4 - Beta",
13
+ "Environment :: Console",
14
+ "Intended Audience :: Developers",
15
+ "Operating System :: OS Independent",
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.10",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Programming Language :: Python :: 3.13",
21
+ "Programming Language :: Python :: 3.14",
22
+ "Topic :: Text Processing :: Markup :: Markdown",
23
+ ]
10
24
  dependencies = [
11
25
  "duckdb>=1.0",
12
26
  "pyyaml>=6.0",
13
27
  ]
14
28
 
29
+ [project.urls]
30
+ Homepage = "https://github.com/coredipper/scriptorium"
31
+ Changelog = "https://github.com/coredipper/scriptorium/blob/main/CHANGELOG.md"
32
+ Issues = "https://github.com/coredipper/scriptorium/issues"
33
+
15
34
  [project.scripts]
16
35
  scrip = "scrip.cli:main"
17
36
 
@@ -26,7 +45,9 @@ embeddings = ["model2vec>=0.3", "numpy>=1.24"]
26
45
  ingest = ["trafilatura>=1.8", "pypdf>=4.0"]
27
46
 
28
47
  [dependency-groups]
29
- dev = ["pytest>=8"]
48
+ # numpy is here so the embeddings index/search path is testable with a toy
49
+ # encoder — the real backend (model2vec) stays an optional [embeddings] extra.
50
+ dev = ["pytest>=8", "numpy>=1.24"]
30
51
 
31
52
  [build-system]
32
53
  requires = ["hatchling"]
@@ -13,7 +13,7 @@ from __future__ import annotations
13
13
 
14
14
  from pathlib import Path
15
15
 
16
- __version__ = "0.2.0"
16
+ __version__ = "0.4.0"
17
17
 
18
18
  # --- canonical vault layout ------------------------------------------------
19
19
  # ``root`` is the repo/instance root: the directory containing ``vault/``.
@@ -336,6 +336,76 @@ def cmd_new(args: argparse.Namespace) -> int:
336
336
  return 0
337
337
 
338
338
 
339
+ def _parse_source_ids(raw: str) -> list[str]:
340
+ """Parse a comma-separated `--from` value into validated source ids, WITHOUT
341
+ requiring the sources to exist (unlike `cmd_new`): scoring a not-yet-ingested
342
+ proposed topic is legitimate. Keeps the traversal-safety check."""
343
+ ids: list[str] = []
344
+ for s in (part.strip() for part in raw.split(",")):
345
+ if not s:
346
+ continue
347
+ sid = s if s.startswith("raw/") else f"raw/{s}"
348
+ _safe_slug(sid.split("#", 1)[0][len("raw/") :], "source")
349
+ ids.append(sid)
350
+ if not ids:
351
+ raise errors.UsageError("--from requires at least one source id")
352
+ return ids
353
+
354
+
355
+ def cmd_similar(args: argparse.Namespace) -> int:
356
+ from . import similar
357
+
358
+ root = resolve_root(args.root)
359
+ sources = _parse_source_ids(args.sources)
360
+ result = similar.compute_similar(
361
+ root,
362
+ title=args.title,
363
+ sources=sources,
364
+ kind=args.kind,
365
+ exclude=set(args.exclude),
366
+ top=args.top,
367
+ )
368
+ if args.json:
369
+ _emit(result)
370
+ else:
371
+ similar.print_similar(result)
372
+ return 0
373
+
374
+
375
+ def cmd_fact_add(args: argparse.Namespace) -> int:
376
+ from . import facts
377
+
378
+ root = resolve_root(args.root)
379
+ if args.file:
380
+ try:
381
+ text = Path(args.file).read_text(encoding="utf-8")
382
+ except OSError as e:
383
+ raise errors.UsageError(f"cannot read --file: {e}") from e
384
+ else:
385
+ text = sys.stdin.read()
386
+ result = facts.add(root, args.table, facts.parse_ndjson(text))
387
+ if args.json:
388
+ _emit(result)
389
+ else:
390
+ for r in result["appended"]:
391
+ ident = r.get("claim_id") or r.get("entity_id") or f"{r['src']} -> {r['dst']}"
392
+ print(f" appended {ident}")
393
+ for s in result["skipped"]:
394
+ print(f" = record {s['index']} skipped (duplicate)")
395
+ for f in result["failures"]:
396
+ print(f" ✗ record {f['index']}: {f['status']} — {f['detail']}")
397
+ if result["failures"]:
398
+ print(
399
+ f"nothing appended: {len(result['failures'])} record(s) failed "
400
+ f"(the batch is all-or-nothing)"
401
+ )
402
+ else:
403
+ print(f"{len(result['appended'])} record(s) appended to facts/")
404
+ if result["appended"]:
405
+ print(" next: `scrip stamp vault/facts/_meta.yaml`, then `scrip verify`")
406
+ return 1 if result["failures"] else 0
407
+
408
+
339
409
  def cmd_ingest(args: argparse.Namespace) -> int:
340
410
  from . import ingest, lock
341
411
 
@@ -515,6 +585,61 @@ def build_parser() -> argparse.ArgumentParser:
515
585
  pn.add_argument("--title", help="human title (default: the slug)")
516
586
  pn.set_defaults(func=cmd_new)
517
587
 
588
+ psim = sub.add_parser(
589
+ "similar",
590
+ parents=[common],
591
+ help="score existing wiki pages by topic overlap with a proposed page (PROMOTE step 1)",
592
+ )
593
+ psim.add_argument(
594
+ "--title", required=True, help="proposed page title (tokenized for title overlap)"
595
+ )
596
+ psim.add_argument(
597
+ "--from",
598
+ dest="sources",
599
+ required=True,
600
+ metavar="raw/a,raw/b",
601
+ help="comma-separated source ids the proposed page would derive from",
602
+ )
603
+ psim.add_argument(
604
+ "--kind",
605
+ choices=["concept", "entity"],
606
+ default="concept",
607
+ help="score only candidates of this kind (default: concept)",
608
+ )
609
+ psim.add_argument(
610
+ "--exclude",
611
+ metavar="ID",
612
+ action="append",
613
+ default=[],
614
+ help="page id to skip (repeatable); use when re-scoring an existing page",
615
+ )
616
+ psim.add_argument("--top", type=int, metavar="N", help="limit to the N highest-scoring candidates")
617
+ psim.set_defaults(func=cmd_similar)
618
+
619
+ pfact = sub.add_parser(
620
+ "fact",
621
+ help="validated writers for the facts/ layer (claims mint verified anchors)",
622
+ )
623
+ fact_sub = pfact.add_subparsers(dest="fact_command", required=True, metavar="<action>")
624
+ pfa = fact_sub.add_parser(
625
+ "add",
626
+ parents=[common],
627
+ help="validate proposed NDJSON records and append them all-or-nothing; "
628
+ "claims carry a verbatim `quote` and scrip mints the anchor/id/timestamp",
629
+ )
630
+ pfa.add_argument(
631
+ "--table",
632
+ choices=["claims", "entities", "edges"],
633
+ default="claims",
634
+ help="facts table to append to (default: claims)",
635
+ )
636
+ fact_in = pfa.add_mutually_exclusive_group(required=True)
637
+ fact_in.add_argument("--file", metavar="NDJSON", help="read proposed records from a file")
638
+ fact_in.add_argument(
639
+ "--stdin", action="store_true", help="read proposed records from stdin"
640
+ )
641
+ pfa.set_defaults(func=cmd_fact_add)
642
+
518
643
  pin = sub.add_parser(
519
644
  "ingest",
520
645
  parents=[common],
@@ -47,7 +47,7 @@ def _get_model():
47
47
 
48
48
  os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
49
49
  try:
50
- from model2vec import StaticModel
50
+ from model2vec import StaticModel # pyright: ignore[reportMissingImports]
51
51
  except Exception:
52
52
  return None
53
53
  try:
@@ -95,7 +95,7 @@ def build_index(root: Path) -> int:
95
95
  model = _get_model()
96
96
  if model is None:
97
97
  raise RuntimeError("no embeddings backend available")
98
- import numpy as np
98
+ import numpy as np # pyright: ignore[reportMissingImports]
99
99
 
100
100
  items = list(_iter_blocks(root))
101
101
  if items:
@@ -126,7 +126,7 @@ def vector_search(root: Path, query: str, k: int = 5):
126
126
  d = _embeddings_dir(root)
127
127
  if model is None or not (d / "vectors.npy").exists() or not (d / "meta.json").exists():
128
128
  return None
129
- import numpy as np
129
+ import numpy as np # pyright: ignore[reportMissingImports]
130
130
 
131
131
  meta = json.loads((d / "meta.json").read_text(encoding="utf-8"))
132
132
  items = meta["items"]