scriptoria 0.2.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scriptoria-0.2.0 → scriptoria-0.4.0}/.gitignore +5 -2
- {scriptoria-0.2.0 → scriptoria-0.4.0}/PKG-INFO +16 -1
- {scriptoria-0.2.0 → scriptoria-0.4.0}/pyproject.toml +23 -2
- {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/__init__.py +1 -1
- {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/cli.py +125 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/embeddings.py +3 -3
- scriptoria-0.4.0/src/scrip/facts.py +426 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/graph.py +1 -2
- {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/query.py +1 -1
- scriptoria-0.4.0/src/scrip/similar.py +168 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/conftest.py +8 -4
- scriptoria-0.4.0/tests/test_fact_cmd.py +362 -0
- scriptoria-0.4.0/tests/test_index_cmd.py +77 -0
- scriptoria-0.4.0/tests/test_json_shapes.py +57 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_manifest.py +1 -0
- scriptoria-0.4.0/tests/test_similar_cmd.py +234 -0
- scriptoria-0.4.0/tests/test_status_cmd.py +56 -0
- scriptoria-0.4.0/tests/test_unlock_cmd.py +41 -0
- scriptoria-0.4.0/tests/test_version.py +11 -0
- scriptoria-0.4.0/tests/test_watch.py +48 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/uv.lock +7 -2
- scriptoria-0.2.0/tests/test_watch.py +0 -22
- {scriptoria-0.2.0 → scriptoria-0.4.0}/README.md +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/pyrightconfig.json +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/anchors.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/blocks.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/errors.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/frontmatter.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/hashing.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/ingest.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/lock.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/manifest.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/src/scrip/retrieval.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_anchor_cmd.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_anchors.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_blocks.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_embeddings.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_graph_status.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_hashing.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_ingest.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_lock.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_new_cmd.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_query.py +1 -1
- {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_retrieval.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_stamp.py +0 -0
- {scriptoria-0.2.0 → scriptoria-0.4.0}/tests/test_verify.py +1 -1
|
@@ -21,8 +21,11 @@ build/
|
|
|
21
21
|
# Advisory write lock: ephemeral runtime state, never committed (see SPEC §11)
|
|
22
22
|
/.kb/lock
|
|
23
23
|
|
|
24
|
-
#
|
|
25
|
-
#
|
|
24
|
+
# Manifest: a regenerable speed cache. SPEC §8 says it *may* be committed; we
|
|
25
|
+
# choose not to — it stores (mtime, size) that are wrong on every fresh clone
|
|
26
|
+
# anyway, and its hashes/timestamps churn diffs. Rebuild any time with
|
|
26
27
|
# `scrip status --rebuild-manifest`.
|
|
28
|
+
/.kb/manifest.json
|
|
29
|
+
|
|
27
30
|
# roborev snapshots
|
|
28
31
|
/.roborev/
|
|
@@ -1,8 +1,23 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scriptoria
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Deterministic scriptorium-keeper (the `scrip` CLI): staleness, provenance integrity, and fact queries for an agent-compiled knowledge base
|
|
5
|
+
Project-URL: Homepage, https://github.com/coredipper/scriptorium
|
|
6
|
+
Project-URL: Changelog, https://github.com/coredipper/scriptorium/blob/main/CHANGELOG.md
|
|
7
|
+
Project-URL: Issues, https://github.com/coredipper/scriptorium/issues
|
|
5
8
|
License: MIT
|
|
9
|
+
Keywords: agent,knowledge-base,markdown,provenance,staleness,wiki
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
6
21
|
Requires-Python: >=3.10
|
|
7
22
|
Requires-Dist: duckdb>=1.0
|
|
8
23
|
Requires-Dist: pyyaml>=6.0
|
|
@@ -2,16 +2,35 @@
|
|
|
2
2
|
# Distribution name on PyPI is `scriptoria` (scrip/scriptorium were taken); the
|
|
3
3
|
# CLI command and the import package both remain `scrip`.
|
|
4
4
|
name = "scriptoria"
|
|
5
|
-
version = "0.2.0"
|
|
5
|
+
version = "0.4.0"
|
|
6
6
|
description = "Deterministic scriptorium-keeper (the `scrip` CLI): staleness, provenance integrity, and fact queries for an agent-compiled knowledge base"
|
|
7
7
|
readme = "README.md"
|
|
8
8
|
requires-python = ">=3.10"
|
|
9
9
|
license = { text = "MIT" }
|
|
10
|
+
keywords = ["knowledge-base", "provenance", "staleness", "agent", "markdown", "wiki"]
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Development Status :: 4 - Beta",
|
|
13
|
+
"Environment :: Console",
|
|
14
|
+
"Intended Audience :: Developers",
|
|
15
|
+
"Operating System :: OS Independent",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.10",
|
|
18
|
+
"Programming Language :: Python :: 3.11",
|
|
19
|
+
"Programming Language :: Python :: 3.12",
|
|
20
|
+
"Programming Language :: Python :: 3.13",
|
|
21
|
+
"Programming Language :: Python :: 3.14",
|
|
22
|
+
"Topic :: Text Processing :: Markup :: Markdown",
|
|
23
|
+
]
|
|
10
24
|
dependencies = [
|
|
11
25
|
"duckdb>=1.0",
|
|
12
26
|
"pyyaml>=6.0",
|
|
13
27
|
]
|
|
14
28
|
|
|
29
|
+
[project.urls]
|
|
30
|
+
Homepage = "https://github.com/coredipper/scriptorium"
|
|
31
|
+
Changelog = "https://github.com/coredipper/scriptorium/blob/main/CHANGELOG.md"
|
|
32
|
+
Issues = "https://github.com/coredipper/scriptorium/issues"
|
|
33
|
+
|
|
15
34
|
[project.scripts]
|
|
16
35
|
scrip = "scrip.cli:main"
|
|
17
36
|
|
|
@@ -26,7 +45,9 @@ embeddings = ["model2vec>=0.3", "numpy>=1.24"]
|
|
|
26
45
|
ingest = ["trafilatura>=1.8", "pypdf>=4.0"]
|
|
27
46
|
|
|
28
47
|
[dependency-groups]
|
|
29
|
-
|
|
48
|
+
# numpy is here so the embeddings index/search path is testable with a toy
|
|
49
|
+
# encoder — the real backend (model2vec) stays an optional [embeddings] extra.
|
|
50
|
+
dev = ["pytest>=8", "numpy>=1.24"]
|
|
30
51
|
|
|
31
52
|
[build-system]
|
|
32
53
|
requires = ["hatchling"]
|
|
@@ -13,7 +13,7 @@ from __future__ import annotations
|
|
|
13
13
|
|
|
14
14
|
from pathlib import Path
|
|
15
15
|
|
|
16
|
-
__version__ = "0.2.0"
|
|
16
|
+
__version__ = "0.4.0"
|
|
17
17
|
|
|
18
18
|
# --- canonical vault layout ------------------------------------------------
|
|
19
19
|
# ``root`` is the repo/instance root: the directory containing ``vault/``.
|
|
@@ -336,6 +336,76 @@ def cmd_new(args: argparse.Namespace) -> int:
|
|
|
336
336
|
return 0
|
|
337
337
|
|
|
338
338
|
|
|
339
|
+
def _parse_source_ids(raw: str) -> list[str]:
    """Turn a comma-separated `--from` value into a list of validated source ids.

    Blank entries are dropped and a missing `raw/` prefix is added. The slug of
    each id (the part after `raw/`, up to any `#` fragment) is passed through
    `_safe_slug`, which keeps the traversal-safety check. Unlike `cmd_new`, the
    sources are NOT required to exist: scoring a proposed topic whose sources
    have not been ingested yet is legitimate.

    Raises `errors.UsageError` when no id is left after dropping blanks.
    """
    prefix = "raw/"
    collected: list[str] = []
    for piece in raw.split(","):
        token = piece.strip()
        if token:
            source_id = token if token.startswith(prefix) else f"{prefix}{token}"
            # Only the slug is validated: strip the "#fragment" and the prefix.
            base, _, _ = source_id.partition("#")
            _safe_slug(base[len(prefix):], "source")
            collected.append(source_id)
    if collected:
        return collected
    raise errors.UsageError("--from requires at least one source id")
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
def cmd_similar(args: argparse.Namespace) -> int:
    """Handle `scrip similar`: score existing pages against a proposed page.

    Resolves the root, parses `args.sources` with `_parse_source_ids`, and
    forwards title/sources/kind/exclude/top to `similar.compute_similar`. The
    result goes through `_emit` when `args.json` is set, otherwise through
    `similar.print_similar`. Always returns 0; errors propagate as exceptions.
    """
    from . import similar

    # Resolve the root first so its errors take precedence over `--from` ones.
    root_dir = resolve_root(args.root)
    options = {
        "title": args.title,
        "sources": _parse_source_ids(args.sources),
        "kind": args.kind,
        "exclude": set(args.exclude),  # repeatable flag -> de-duplicated set
        "top": args.top,
    }
    scored = similar.compute_similar(root_dir, **options)

    render = _emit if args.json else similar.print_similar
    render(scored)
    return 0
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def cmd_fact_add(args: argparse.Namespace) -> int:
    """Handle `scrip fact add`: validate NDJSON records and append them.

    Reads the proposed records from `args.file` when given, otherwise from
    stdin, parses them with `facts.parse_ndjson`, and hands them to `facts.add`
    for `args.table`. With `args.json` the raw result is emitted via `_emit`;
    otherwise a per-record report (appended / skipped / failures) is printed.

    Returns 1 when the result contains any failures, else 0.

    Raises `errors.UsageError` when `--file` cannot be read, including when it
    is not valid UTF-8.
    """
    from . import facts

    root = resolve_root(args.root)
    if args.file:
        try:
            text = Path(args.file).read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            # UnicodeDecodeError is a ValueError, not an OSError: without it a
            # non-UTF-8 file would escape as a traceback instead of a usage error.
            raise errors.UsageError(f"cannot read --file: {e}") from e
    else:
        text = sys.stdin.read()

    result = facts.add(root, args.table, facts.parse_ndjson(text))
    failures = result["failures"]

    if args.json:
        _emit(result)
    else:
        for r in result["appended"]:
            # Claims and entities carry an id; edges are shown by endpoints.
            ident = r.get("claim_id") or r.get("entity_id") or f"{r['src']} -> {r['dst']}"
            print(f" appended {ident}")
        for s in result["skipped"]:
            print(f" = record {s['index']} skipped (duplicate)")
        for f in failures:
            print(f" ✗ record {f['index']}: {f['status']} — {f['detail']}")
        if failures:
            print(
                f"nothing appended: {len(failures)} record(s) failed "
                f"(the batch is all-or-nothing)"
            )
        else:
            print(f"{len(result['appended'])} record(s) appended to facts/")
            if result["appended"]:
                print(" next: `scrip stamp vault/facts/_meta.yaml`, then `scrip verify`")
    return 1 if failures else 0
|
|
407
|
+
|
|
408
|
+
|
|
339
409
|
def cmd_ingest(args: argparse.Namespace) -> int:
|
|
340
410
|
from . import ingest, lock
|
|
341
411
|
|
|
@@ -515,6 +585,61 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
515
585
|
pn.add_argument("--title", help="human title (default: the slug)")
|
|
516
586
|
pn.set_defaults(func=cmd_new)
|
|
517
587
|
|
|
588
|
+
psim = sub.add_parser(
|
|
589
|
+
"similar",
|
|
590
|
+
parents=[common],
|
|
591
|
+
help="score existing wiki pages by topic overlap with a proposed page (PROMOTE step 1)",
|
|
592
|
+
)
|
|
593
|
+
psim.add_argument(
|
|
594
|
+
"--title", required=True, help="proposed page title (tokenized for title overlap)"
|
|
595
|
+
)
|
|
596
|
+
psim.add_argument(
|
|
597
|
+
"--from",
|
|
598
|
+
dest="sources",
|
|
599
|
+
required=True,
|
|
600
|
+
metavar="raw/a,raw/b",
|
|
601
|
+
help="comma-separated source ids the proposed page would derive from",
|
|
602
|
+
)
|
|
603
|
+
psim.add_argument(
|
|
604
|
+
"--kind",
|
|
605
|
+
choices=["concept", "entity"],
|
|
606
|
+
default="concept",
|
|
607
|
+
help="score only candidates of this kind (default: concept)",
|
|
608
|
+
)
|
|
609
|
+
psim.add_argument(
|
|
610
|
+
"--exclude",
|
|
611
|
+
metavar="ID",
|
|
612
|
+
action="append",
|
|
613
|
+
default=[],
|
|
614
|
+
help="page id to skip (repeatable); use when re-scoring an existing page",
|
|
615
|
+
)
|
|
616
|
+
psim.add_argument("--top", type=int, metavar="N", help="limit to the N highest-scoring candidates")
|
|
617
|
+
psim.set_defaults(func=cmd_similar)
|
|
618
|
+
|
|
619
|
+
pfact = sub.add_parser(
|
|
620
|
+
"fact",
|
|
621
|
+
help="validated writers for the facts/ layer (claims mint verified anchors)",
|
|
622
|
+
)
|
|
623
|
+
fact_sub = pfact.add_subparsers(dest="fact_command", required=True, metavar="<action>")
|
|
624
|
+
pfa = fact_sub.add_parser(
|
|
625
|
+
"add",
|
|
626
|
+
parents=[common],
|
|
627
|
+
help="validate proposed NDJSON records and append them all-or-nothing; "
|
|
628
|
+
"claims carry a verbatim `quote` and scrip mints the anchor/id/timestamp",
|
|
629
|
+
)
|
|
630
|
+
pfa.add_argument(
|
|
631
|
+
"--table",
|
|
632
|
+
choices=["claims", "entities", "edges"],
|
|
633
|
+
default="claims",
|
|
634
|
+
help="facts table to append to (default: claims)",
|
|
635
|
+
)
|
|
636
|
+
fact_in = pfa.add_mutually_exclusive_group(required=True)
|
|
637
|
+
fact_in.add_argument("--file", metavar="NDJSON", help="read proposed records from a file")
|
|
638
|
+
fact_in.add_argument(
|
|
639
|
+
"--stdin", action="store_true", help="read proposed records from stdin"
|
|
640
|
+
)
|
|
641
|
+
pfa.set_defaults(func=cmd_fact_add)
|
|
642
|
+
|
|
518
643
|
pin = sub.add_parser(
|
|
519
644
|
"ingest",
|
|
520
645
|
parents=[common],
|
|
@@ -47,7 +47,7 @@ def _get_model():
|
|
|
47
47
|
|
|
48
48
|
os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
|
|
49
49
|
try:
|
|
50
|
-
from model2vec import StaticModel
|
|
50
|
+
from model2vec import StaticModel # pyright: ignore[reportMissingImports]
|
|
51
51
|
except Exception:
|
|
52
52
|
return None
|
|
53
53
|
try:
|
|
@@ -95,7 +95,7 @@ def build_index(root: Path) -> int:
|
|
|
95
95
|
model = _get_model()
|
|
96
96
|
if model is None:
|
|
97
97
|
raise RuntimeError("no embeddings backend available")
|
|
98
|
-
import numpy as np
|
|
98
|
+
import numpy as np # pyright: ignore[reportMissingImports]
|
|
99
99
|
|
|
100
100
|
items = list(_iter_blocks(root))
|
|
101
101
|
if items:
|
|
@@ -126,7 +126,7 @@ def vector_search(root: Path, query: str, k: int = 5):
|
|
|
126
126
|
d = _embeddings_dir(root)
|
|
127
127
|
if model is None or not (d / "vectors.npy").exists() or not (d / "meta.json").exists():
|
|
128
128
|
return None
|
|
129
|
-
import numpy as np
|
|
129
|
+
import numpy as np # pyright: ignore[reportMissingImports]
|
|
130
130
|
|
|
131
131
|
meta = json.loads((d / "meta.json").read_text(encoding="utf-8"))
|
|
132
132
|
items = meta["items"]
|