PyPI - scriptoria - Versions diffs - 0.3.0__tar.gz → 0.4.0__tar.gz - Mend

scriptoria 0.3.0 (tar.gz) → 0.4.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

{scriptoria-0.3.0 → scriptoria-0.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scriptoria
-Version: 0.3.0
+Version: 0.4.0
 Summary: Deterministic scriptorium-keeper (the `scrip` CLI): staleness, provenance integrity, and fact queries for an agent-compiled knowledge base
 Project-URL: Homepage, https://github.com/coredipper/scriptorium
 Project-URL: Changelog, https://github.com/coredipper/scriptorium/blob/main/CHANGELOG.md

{scriptoria-0.3.0 → scriptoria-0.4.0}/pyproject.toml RENAMED Viewed

@@ -2,7 +2,7 @@
 # Distribution name on PyPI is `scriptoria` (scrip/scriptorium were taken); the
 # CLI command and the import package both remain `scrip`.
 name = "scriptoria"
-version = "0.3.0"
+version = "0.4.0"
 description = "Deterministic scriptorium-keeper (the `scrip` CLI): staleness, provenance integrity, and fact queries for an agent-compiled knowledge base"
 readme = "README.md"
 requires-python = ">=3.10"

{scriptoria-0.3.0 → scriptoria-0.4.0}/src/scrip/__init__.py RENAMED Viewed

@@ -13,7 +13,7 @@ from __future__ import annotations
 from pathlib import Path
-__version__ = "0.3.0"
+__version__ = "0.4.0"
 # --- canonical vault layout ------------------------------------------------
 # ``root`` is the repo/instance root: the directory containing ``vault/``.

{scriptoria-0.3.0 → scriptoria-0.4.0}/src/scrip/cli.py RENAMED Viewed

@@ -336,6 +336,42 @@ def cmd_new(args: argparse.Namespace) -> int:
     return 0
+def _parse_source_ids(raw: str) -> list[str]:
+    """Parse a comma-separated `--from` value into validated source ids, WITHOUT
+    requiring the sources to exist (unlike `cmd_new`): scoring a not-yet-ingested
+    proposed topic is legitimate. Keeps the traversal-safety check."""
+    ids: list[str] = []
+    for s in (part.strip() for part in raw.split(",")):
+        if not s:
+            continue
+        sid = s if s.startswith("raw/") else f"raw/{s}"
+        _safe_slug(sid.split("#", 1)[0][len("raw/") :], "source")
+        ids.append(sid)
+    if not ids:
+        raise errors.UsageError("--from requires at least one source id")
+    return ids
+def cmd_similar(args: argparse.Namespace) -> int:
+    from . import similar
+    root = resolve_root(args.root)
+    sources = _parse_source_ids(args.sources)
+    result = similar.compute_similar(
+        root,
+        title=args.title,
+        sources=sources,
+        kind=args.kind,
+        exclude=set(args.exclude),
+        top=args.top,
+    )
+    if args.json:
+        _emit(result)
+    else:
+        similar.print_similar(result)
+    return 0
 def cmd_fact_add(args: argparse.Namespace) -> int:
     from . import facts
@@ -549,6 +585,37 @@ def build_parser() -> argparse.ArgumentParser:
     pn.add_argument("--title", help="human title (default: the slug)")
     pn.set_defaults(func=cmd_new)
+    psim = sub.add_parser(
+        "similar",
+        parents=[common],
+        help="score existing wiki pages by topic overlap with a proposed page (PROMOTE step 1)",
+    )
+    psim.add_argument(
+        "--title", required=True, help="proposed page title (tokenized for title overlap)"
+    )
+    psim.add_argument(
+        "--from",
+        dest="sources",
+        required=True,
+        metavar="raw/a,raw/b",
+        help="comma-separated source ids the proposed page would derive from",
+    )
+    psim.add_argument(
+        "--kind",
+        choices=["concept", "entity"],
+        default="concept",
+        help="score only candidates of this kind (default: concept)",
+    )
+    psim.add_argument(
+        "--exclude",
+        metavar="ID",
+        action="append",
+        default=[],
+        help="page id to skip (repeatable); use when re-scoring an existing page",
+    )
+    psim.add_argument("--top", type=int, metavar="N", help="limit to the N highest-scoring candidates")
+    psim.set_defaults(func=cmd_similar)
     pfact = sub.add_parser(
         "fact",
         help="validated writers for the facts/ layer (claims mint verified anchors)",

scriptoria-0.4.0/src/scrip/similar.py ADDED Viewed

@@ -0,0 +1,168 @@
+"""Deterministic topic-overlap scoring for PROMOTE step 1 (`scrip similar`).
+Ranks existing wiki pages by how much a proposed topic overlaps each, from three
+file-derived signals:
+- **title** — Jaccard of normalized title tokens (the §6 normalization).
+- **sources** — Jaccard of `derived-from` source ids (block suffix stripped).
+- **tags** — Jaccard of tag sets. Pages carry no `tags` frontmatter (SPEC §4),
+  so a page's tags are *derived*: the union of `tags` over claims whose
+  `source_id` is one of the page's sources.
+`combined` is a weighted sum (sources dominates — shared sources is the strongest
+same-topic signal). This is **purely informational**: it reports scores and
+leaves the High/Middle/Low merge decision of AGENT.md PROMOTE to the caller,
+exactly as `query contradictions` leaves adjudication to the agent. No lock, no
+model, no DuckDB.
+"""
+from __future__ import annotations
+import json
+from collections.abc import Iterable, Mapping
+from pathlib import Path
+from . import facts_dir, frontmatter
+from .errors import DataError
+from .graph import scan_derived
+from .hashing import normalize
+DEFAULT_WEIGHTS = {"title": 0.25, "sources": 0.5, "tags": 0.25}
+def _tokens(title: str) -> set[str]:
+    return set(normalize(title).split())
+def _strip_block(dep: str) -> str:
+    """`raw/x#b3` -> `raw/x` (block-scoped deps share their whole source)."""
+    return dep.split("#", 1)[0]
+def _source_set(derived_from: Iterable[str]) -> set[str]:
+    return {_strip_block(d) for d in derived_from}
+def _jaccard(a: set[str], b: set[str]) -> float:
+    if not a and not b:
+        return 0.0
+    return len(a & b) / len(a | b)
+def _source_tags(root: Path) -> dict[str, set[str]]:
+    """Map each `source_id` to the union of `tags` over its claims. Built once
+    per run from facts/claims.ndjson (parsed directly — no DuckDB dependency)."""
+    out: dict[str, set[str]] = {}
+    p = facts_dir(root) / "claims.ndjson"
+    if not p.exists():
+        return out
+    for lineno, raw_line in enumerate(p.read_text(encoding="utf-8").splitlines(), start=1):
+        line = raw_line.strip()
+        if not line:
+            continue
+        try:
+            rec = json.loads(line)
+        except json.JSONDecodeError as e:
+            raise DataError(f"claims.ndjson:{lineno}: invalid JSON: {e}") from e
+        if not isinstance(rec, dict):
+            raise DataError(f"claims.ndjson:{lineno}: expected a JSON object")
+        sid = rec.get("source_id")
+        if not isinstance(sid, str):
+            raise DataError(f"claims.ndjson:{lineno}: 'source_id' must be a string")
+        tags = rec.get("tags")
+        if tags is None:
+            continue
+        if not isinstance(tags, list) or any(not isinstance(t, str) for t in tags):
+            raise DataError(f"claims.ndjson:{lineno}: 'tags' must be a list of strings")
+        out.setdefault(sid, set()).update(tags)
+    return out
+def _page_tags(sources: set[str], source_tags: Mapping[str, set[str]]) -> set[str]:
+    out: set[str] = set()
+    for s in sources:
+        out |= source_tags.get(s, set())
+    return out
+def compute_similar(
+    root: str | Path,
+    *,
+    title: str,
+    sources: Iterable[str],
+    kind: str = "concept",
+    exclude: Iterable[str] | None = None,
+    top: int | None = None,
+    weights: Mapping[str, float] | None = None,
+) -> dict:
+    """Score existing `kind` wiki pages against the proposed (title, sources).
+    Returns ``{proposed, weights, candidates}`` with candidates sorted by
+    ``combined`` desc then id asc, truncated to ``top``.
+    """
+    root = Path(root)
+    w = dict(weights or DEFAULT_WEIGHTS)
+    skip = set(exclude or ())
+    prop_sources = _source_set(sources)
+    prop_tokens = _tokens(title)
+    source_tags = _source_tags(root)
+    prop_tags = _page_tags(prop_sources, source_tags)
+    want_type = f"wiki.{kind}"
+    candidates: list[dict] = []
+    for cid, d in scan_derived(root).items():
+        if d.get("type") != want_type or cid in skip:
+            continue  # other-kind pages and the facts.set row are dropped here
+        c_sources = _source_set(d["derived_from"])
+        meta, _ = frontmatter.load(root / d["path"])
+        c_title = (meta.get("title") if meta else "") or ""
+        c_tags = _page_tags(c_sources, source_tags)
+        title_s = _jaccard(prop_tokens, _tokens(c_title))
+        sources_s = _jaccard(prop_sources, c_sources)
+        tags_s = _jaccard(prop_tags, c_tags)
+        combined = w["title"] * title_s + w["sources"] * sources_s + w["tags"] * tags_s
+        candidates.append(
+            {
+                "id": cid,
+                "title": c_title,
+                "path": d["path"],
+                "kind": kind,
+                "scores": {
+                    "title": round(title_s, 6),
+                    "sources": round(sources_s, 6),
+                    "tags": round(tags_s, 6),
+                    "combined": round(combined, 6),
+                },
+                "shared": {
+                    "sources": sorted(prop_sources & c_sources),
+                    "tags": sorted(prop_tags & c_tags),
+                },
+            }
+        )
+    candidates.sort(key=lambda c: (-c["scores"]["combined"], c["id"]))
+    if top is not None:
+        candidates = candidates[:top]
+    return {
+        "proposed": {"title": title, "derived_from": list(sources), "kind": kind},
+        "weights": w,
+        "candidates": candidates,
+    }
+def print_similar(result: dict) -> None:
+    p = result["proposed"]
+    print(f'proposed: "{p["title"]}"  ({p["kind"]}, from {len(p["derived_from"])} source(s))')
+    cands = result["candidates"]
+    if not cands:
+        print(f"no existing {p['kind']} pages to compare.")
+        return
+    for c in cands:
+        s = c["scores"]
+        print(f'  {s["combined"]:.3f}  {c["id"]}  "{c["title"]}"')
+        print(
+            f'         sources {s["sources"]:.2f}  tags {s["tags"]:.2f}  title {s["title"]:.2f}'
+            f'   shared sources: {len(c["shared"]["sources"])}, tags: {len(c["shared"]["tags"])}'
+        )
+    print(f"({len(cands)} candidate(s))")

{scriptoria-0.3.0 → scriptoria-0.4.0}/tests/conftest.py RENAMED Viewed

@@ -70,6 +70,8 @@ class KB:
         *,
         stamp: bool = True,
         body: str = "Body.\n",
+        title: str | None = None,
+        kind: str = "concept",
     ) -> str:
         deps = {
             sid: h
@@ -77,16 +79,18 @@ class KB:
             if (h := self._dep_hash(sid)) is not None
         }
         meta: dict = {
-            "id": f"concept/{slug}",
-            "type": "wiki.concept",
-            "title": slug,
+            "id": f"{kind}/{slug}",
+            "type": f"wiki.{kind}",
+            "title": title or slug,
             "derived-from": list(derived_from),
         }
         if stamp:
             meta["input-hash"] = hashing.input_hash(deps)
         meta["last-compiled"] = "2026-01-01T00:00:00Z"
         meta["confidence"] = 0.9
-        path = self.root / "vault" / "wiki" / "concepts" / f"{slug}.md"
+        subdir = "concepts" if kind == "concept" else "entities"
+        path = self.root / "vault" / "wiki" / subdir / f"{slug}.md"
+        path.parent.mkdir(parents=True, exist_ok=True)
         path.write_text(frontmatter.dump(meta, body), encoding="utf-8")
         return meta["id"]

scriptoria-0.4.0/tests/test_similar_cmd.py ADDED Viewed

@@ -0,0 +1,234 @@
+"""`scrip similar` — deterministic topic-overlap scorer for PROMOTE step 1.
+Ranks existing wiki pages by title-token + shared-source + derived-tag overlap.
+Pure informational (always exit 0); no model, no lock."""
+import json
+import pytest
+from scrip import cli
+def _similar(kb, *args):
+    """Run `scrip similar … --json` and return the parsed payload."""
+    rc = cli.main(["similar", *args, "--json", "--root", str(kb.root)])
+    return rc
+def _json(capsys):
+    return json.loads(capsys.readouterr().out)
+# --------------------------------------------------------------------------- #
+# Ranking
+# --------------------------------------------------------------------------- #
+def test_similar_high_overlap_ranks_first(kb, capsys):
+    kb.add_raw("a", "# A\n\nAlpha.\n")
+    kb.add_raw("b", "# B\n\nBeta.\n")
+    kb.add_raw("c", "# C\n\nGamma.\n")
+    kb.add_wiki("twin", ["raw/a", "raw/b"], title="Alpha and Beta")
+    kb.add_wiki("unrelated", ["raw/c"], title="Gamma only")
+    rc = _similar(kb, "--title", "Alpha and Beta", "--from", "raw/a,raw/b")
+    assert rc == 0
+    data = _json(capsys)
+    assert data["candidates"][0]["id"] == "concept/twin"
+    assert data["candidates"][0]["scores"]["sources"] == 1.0
+    assert data["candidates"][0]["scores"]["title"] == 1.0
+    # the unrelated page is present but ranked strictly lower
+    assert data["candidates"][-1]["id"] == "concept/unrelated"
+    assert data["candidates"][-1]["scores"]["combined"] < data["candidates"][0]["scores"]["combined"]
+def test_similar_partial_source_overlap_scores_between(kb, capsys):
+    kb.add_raw("a", "# A\n\nAlpha.\n")
+    kb.add_raw("b", "# B\n\nBeta.\n")
+    kb.add_wiki("both", ["raw/a", "raw/b"], title="Shared")
+    kb.add_wiki("half", ["raw/a"], title="Partial")
+    _similar(kb, "--title", "Shared", "--from", "raw/a,raw/b")
+    data = _json(capsys)
+    by_id = {c["id"]: c for c in data["candidates"]}
+    assert by_id["concept/both"]["scores"]["sources"] == 1.0
+    # half shares 1 of {a,b} → Jaccard 1/2
+    assert by_id["concept/half"]["scores"]["sources"] == 0.5
+    assert by_id["concept/half"]["scores"]["combined"] < by_id["concept/both"]["scores"]["combined"]
+def test_similar_no_candidates_exits_0(kb, capsys):
+    kb.add_raw("a", "# A\n\nAlpha.\n")
+    rc = _similar(kb, "--title", "Anything", "--from", "raw/a")
+    assert rc == 0
+    assert _json(capsys)["candidates"] == []
+# --------------------------------------------------------------------------- #
+# Self-exclusion
+# --------------------------------------------------------------------------- #
+def test_similar_excludes_named_ids(kb, capsys):
+    kb.add_raw("a", "# A\n\nAlpha.\n")
+    kb.add_wiki("self", ["raw/a"], title="Self Page")
+    kb.add_wiki("peer", ["raw/a"], title="Self Page")
+    _similar(kb, "--title", "Self Page", "--from", "raw/a", "--exclude", "concept/self")
+    ids = [c["id"] for c in _json(capsys)["candidates"]]
+    assert "concept/self" not in ids
+    assert "concept/peer" in ids
+# --------------------------------------------------------------------------- #
+# Tags derived from claims (pages carry no tags frontmatter)
+# --------------------------------------------------------------------------- #
+def test_similar_tags_derived_from_claims(kb, capsys):
+    kb.add_raw("shared", "# S\n\nA cited sentence about caching.\n")
+    kb.add_raw("bare", "# B\n\nUncited content here.\n")
+    kb.add_claim("clm_0001", "shared", "A cited sentence about caching.", tags=["caching", "cost"])
+    kb.add_wiki("with-tags", ["raw/shared"], title="Tagged")
+    kb.add_wiki("no-tags", ["raw/bare"], title="Untagged")
+    _similar(kb, "--title", "Proposed", "--from", "raw/shared")
+    by_id = {c["id"]: c for c in _json(capsys)["candidates"]}
+    # proposed derives tags {caching,cost} from raw/shared's claim → perfect tag match
+    assert by_id["concept/with-tags"]["scores"]["tags"] == 1.0
+    assert by_id["concept/with-tags"]["shared"]["tags"] == ["caching", "cost"]
+    # raw/bare has no claims → no tags → tag score 0
+    assert by_id["concept/no-tags"]["scores"]["tags"] == 0.0
+# --------------------------------------------------------------------------- #
+# Block-scoped derived-from is stripped to the whole source
+# --------------------------------------------------------------------------- #
+def test_similar_strips_block_suffix(kb, capsys):
+    kb.add_raw("a", "# Heading\n\nFirst paragraph body.\n\nSecond paragraph body.\n")
+    bid = kb.block_id("a", "First paragraph")
+    kb.add_wiki("whole", ["raw/a"], title="Whole source page")
+    _similar(kb, "--title", "Block page", "--from", f"raw/a#{bid}")
+    [cand] = _json(capsys)["candidates"]
+    assert cand["id"] == "concept/whole"
+    assert cand["scores"]["sources"] == 1.0  # raw/a#<block> stripped to raw/a
+# --------------------------------------------------------------------------- #
+# Kind filter
+# --------------------------------------------------------------------------- #
+def test_similar_scores_only_same_kind(kb, capsys):
+    kb.add_raw("a", "# A\n\nAlpha.\n")
+    kb.add_wiki("a-concept", ["raw/a"], title="Topic", kind="concept")
+    kb.add_wiki("an-entity", ["raw/a"], title="Topic", kind="entity")
+    # a facts.set row also exists in scan_derived once _meta is present:
+    (kb.root / "vault" / "facts" / "_meta.yaml").write_text(
+        "id: facts/core\ntype: facts.set\nderived-from:\n- raw/a\n", encoding="utf-8"
+    )
+    _similar(kb, "--title", "Topic", "--from", "raw/a", "--kind", "concept")
+    ids = [c["id"] for c in _json(capsys)["candidates"]]
+    assert ids == ["concept/a-concept"]
+    assert "entity/an-entity" not in ids
+    assert "facts/core" not in ids
+def test_similar_entity_kind_scores_entities(kb, capsys):
+    kb.add_raw("a", "# A\n\nAlpha.\n")
+    kb.add_wiki("a-concept", ["raw/a"], title="Topic", kind="concept")
+    kb.add_wiki("an-entity", ["raw/a"], title="Topic", kind="entity")
+    _similar(kb, "--title", "Topic", "--from", "raw/a", "--kind", "entity")
+    ids = [c["id"] for c in _json(capsys)["candidates"]]
+    assert ids == ["entity/an-entity"]
+# --------------------------------------------------------------------------- #
+# --top + shape
+# --------------------------------------------------------------------------- #
+def test_similar_top_limits_results(kb, capsys):
+    kb.add_raw("a", "# A\n\nAlpha.\n")
+    kb.add_raw("b", "# B\n\nBeta.\n")
+    kb.add_wiki("p1", ["raw/a", "raw/b"], title="Best match")
+    kb.add_wiki("p2", ["raw/a"], title="Partial")
+    kb.add_wiki("p3", ["raw/b"], title="Other")
+    _similar(kb, "--title", "Best match", "--from", "raw/a,raw/b", "--top", "1")
+    data = _json(capsys)
+    assert len(data["candidates"]) == 1
+    assert data["candidates"][0]["id"] == "concept/p1"
+def test_similar_json_shape(kb, capsys):
+    kb.add_raw("a", "# A\n\nAlpha.\n")
+    kb.add_wiki("p", ["raw/a"], title="A Page")
+    _similar(kb, "--title", "Proposed", "--from", "raw/a")
+    data = _json(capsys)
+    assert set(data) == {"proposed", "weights", "candidates"}
+    assert set(data["proposed"]) == {"title", "derived_from", "kind"}
+    assert set(data["weights"]) == {"title", "sources", "tags"}
+    [cand] = data["candidates"]
+    assert set(cand) == {"id", "title", "path", "kind", "scores", "shared"}
+    assert set(cand["scores"]) == {"title", "sources", "tags", "combined"}
+    assert set(cand["shared"]) == {"sources", "tags"}
+def test_similar_human_output_lists_candidate(kb, capsys):
+    kb.add_raw("a", "# A\n\nAlpha.\n")
+    kb.add_wiki("twin", ["raw/a"], title="Twin")
+    rc = cli.main(["similar", "--title", "Twin", "--from", "raw/a", "--root", str(kb.root)])
+    assert rc == 0
+    assert "concept/twin" in capsys.readouterr().out
+# --------------------------------------------------------------------------- #
+# Errors
+# --------------------------------------------------------------------------- #
+def test_similar_missing_title_is_usage_error(kb):
+    with pytest.raises(SystemExit) as e:
+        cli.main(["similar", "--from", "raw/a", "--root", str(kb.root)])
+    assert e.value.code == 2
+def test_similar_missing_from_is_usage_error(kb):
+    with pytest.raises(SystemExit) as e:
+        cli.main(["similar", "--title", "X", "--root", str(kb.root)])
+    assert e.value.code == 2
+def test_similar_empty_from_is_usage_error(kb):
+    assert cli.main(["similar", "--title", "X", "--from", " , ", "--root", str(kb.root)]) == 2
+def test_similar_outside_a_vault_is_usage_error(tmp_path):
+    assert cli.main(["similar", "--title", "X", "--from", "raw/a", "--root", str(tmp_path / "no")]) == 2
+def test_similar_malformed_claims_is_data_error(kb):
+    kb.add_raw("a", "# A\n\nAlpha.\n")
+    kb.add_wiki("p", ["raw/a"], title="P")
+    (kb.root / "vault" / "facts" / "claims.ndjson").write_text("{not json}\n", encoding="utf-8")
+    assert cli.main(["similar", "--title", "X", "--from", "raw/a", "--root", str(kb.root)]) == 3
+def test_similar_non_object_claim_line_is_data_error(kb):
+    # a valid-JSON non-object must be a clean data error (3), not an internal one (4)
+    kb.add_raw("a", "# A\n\nAlpha.\n")
+    kb.add_wiki("p", ["raw/a"], title="P")
+    (kb.root / "vault" / "facts" / "claims.ndjson").write_text("[]\n", encoding="utf-8")
+    assert cli.main(["similar", "--title", "X", "--from", "raw/a", "--root", str(kb.root)]) == 3
+def test_similar_bad_source_id_is_data_error(kb):
+    # a non-string source_id is malformed facts data → exit 3, not a silent skip
+    kb.add_raw("a", "# A\n\nAlpha.\n")
+    kb.add_wiki("p", ["raw/a"], title="P")
+    (kb.root / "vault" / "facts" / "claims.ndjson").write_text(
+        '{"source_id": 1, "tags": "oops"}\n', encoding="utf-8"
+    )
+    assert cli.main(["similar", "--title", "X", "--from", "raw/a", "--root", str(kb.root)]) == 3
+def test_similar_bad_tags_shape_is_data_error(kb):
+    kb.add_raw("a", "# A\n\nAlpha.\n")
+    kb.add_wiki("p", ["raw/a"], title="P")
+    (kb.root / "vault" / "facts" / "claims.ndjson").write_text(
+        '{"claim_id": "clm_0001", "source_id": "raw/a", "anchor": "qh:x|loc:0|len:1", "tags": "oops"}\n',
+        encoding="utf-8",
+    )
+    assert cli.main(["similar", "--title", "X", "--from", "raw/a", "--root", str(kb.root)]) == 3

scriptoria-0.4.0/tests/test_version.py ADDED Viewed

@@ -0,0 +1,11 @@
+"""The public __version__ must match the installed package metadata, so a release
+that bumps pyproject can't leave the runtime attribute stale. (Dist name is
+``scriptoria``; the import package is ``scrip``.)"""
+import importlib.metadata
+import scrip
+def test_version_matches_package_metadata():
+    assert scrip.__version__ == importlib.metadata.version("scriptoria")

{scriptoria-0.3.0 → scriptoria-0.4.0}/uv.lock RENAMED Viewed

@@ -1141,7 +1141,7 @@ wheels = [
 [[package]]
 name = "scriptoria"
-version = "0.3.0"
+version = "0.4.0"
 source = { editable = "." }
 dependencies = [
     { name = "duckdb" },