npm - medsci-skills - Versions diffs - 4.1.0 - Mend

medsci-skills 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (702) hide show

package/skills/verify-refs/skill.yml ADDED Viewed

@@ -0,0 +1,44 @@
+schema_version: 2
+name: verify-refs
+layer: A
+owner_domain: reference_integrity
+when_to_use:
+  - Audit-only verification of manuscript references against PubMed and CrossRef
+  - Pre-submission citation hallucination check (PostToolUse hook trigger on circulation/submission docx)
+  - Detecting author hallucination (DOI real but a cited author name wrong at any position — full-author cross-check against PubMed efetch, v1.3.0)
+  - LLM-assisted drafting gate — `--strict` mode required when AI generated or rewrote citations
+when_NOT_to_use:
+  - Adding new references (use /search-lit + /lit-sync)
+  - Rendering references list (use /manage-refs render_pandoc.sh)
+  - Modifying refs.bib or library.bib (audit-only — never writes back)
+inputs:
+  - manuscript.md
+  - manuscript.docx
+  - references.bib
+outputs:
+  - qc/reference_audit.json
+deterministic_scripts:
+  - scripts/verify_refs.py
+  - scripts/verify_cli.sh
+side_effects:
+  - writes_project_artifacts
+downstream_consumers:
+  - write-paper
+  - sync-submission
+  - orchestrate
+forbidden_actions:
+  - generate_references_from_memory
+  - silently_include_unverified_references
+# v2.1 quality card
+purpose: "Audit-only verification of manuscript references against PubMed and CrossRef (full-author cross-check); writes qc/reference_audit.json. Does not modify references."
+safety_boundaries:
+  - "Audit-only: never edits references/ or refs.bib; never generates references from memory."
+  - "Unverified references are flagged, not silently included."
+known_limitations:
+  - "Confirms DOI/PMID and author identity, not topical appropriateness of the citation."
+  - "CrossRef given-name errors are possible; PubMed efetch is treated as authoritative."
+validation_commands:
+  - "bash scripts/verify_cli.sh <refs.bib>"
+  - "confirm qc/reference_audit.json submission_safe: true"
+evidence_surface: bundled_script

package/skills/verify-refs/tests/fixtures/pagination_placeholder.bib ADDED Viewed

@@ -0,0 +1,17 @@
+@article{methodref_inpress,
+  title = {A method paper still in press},
+  author = {Smith, John and Doe, Jane},
+  year = {2026},
+  journal = {Journal of Examples},
+  pages = {e000--e000},
+  note = {in press}
+}
+@article{normalref_2025,
+  title = {A normal complete reference},
+  author = {Jones, Alice and Brown, Bob},
+  year = {2025},
+  journal = {Journal of Examples},
+  volume = {12},
+  pages = {123--130}
+}

package/skills/verify-refs/tests/test_pagination_placeholder.sh ADDED Viewed

@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# Regression test for verify-refs Gate 6 (pagination-placeholder detection).
+# Offline (no network): a bib entry whose pages are "e000--e000" with an "in press"
+# note must get note="pagination_placeholder"; a normal entry must not. verify-refs
+# stays manuscript-agnostic — it only flags; the P0/centrality call is /self-review's.
+# Stdlib-only (python3).
+set -u
+# Locate the script under test and the fixture relative to this file.
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SCRIPT="$HERE/../scripts/verify_refs.py"
+BIB="$HERE/fixtures/pagination_placeholder.bib"
+# Scratch project root for the audit output; removed when the script exits.
+ROOT="$(mktemp -d -t vrp_XXXX)"
+trap 'rm -rf "$ROOT"' EXIT
+fail=0
+# check LABEL CMD...: run CMD with output discarded, print PASS/FAIL for LABEL,
+# and count failures in $fail.
+check() { local label="$1"; shift
+    if "$@" >/dev/null 2>&1; then printf '  PASS  %s\n' "$label"
+    else printf '  FAIL  %s\n' "$label"; fail=$((fail+1)); fi
+}
+# Exit 2 (environment error, not a test failure) when the script is absent.
+[[ -f "$SCRIPT" ]] || { echo "ENV-ERR: script missing" >&2; exit 2; }
+# The run's own exit status is not inspected; the checks below look at the
+# audit file expected under $ROOT/qc instead.
+python3 "$SCRIPT" "$BIB" --project-root "$ROOT" --offline >/dev/null 2>&1
+AUDIT="$ROOT/qc/reference_audit.json"
+check "audit JSON written" test -s "$AUDIT"
+# assert_py SNIPPET: load the audit JSON, index its records by ref_id as `recs`,
+# then execute SNIPPET (a Python statement, normally an assert).
+assert_py() { python3 -c "
+import json
+d = json.load(open('$AUDIT'))
+recs = {r['ref_id']: r for r in d['records']}
+$1
+"; }
+check "placeholder entry flagged note=pagination_placeholder" \
+    assert_py "assert 'pagination_placeholder' in recs['methodref_inpress'].get('note',''), recs['methodref_inpress']"
+check "placeholder entry status UNVERIFIED" \
+    assert_py "assert recs['methodref_inpress']['status']=='UNVERIFIED', recs['methodref_inpress']['status']"
+check "normal entry NOT flagged" \
+    assert_py "assert 'pagination_placeholder' not in recs['normalref_2025'].get('note',''), recs['normalref_2025']"
+# The exit status is the number of failed checks (0 means all passed).
+echo "fail=$fail"; [[ "$fail" -eq 0 ]] && echo "ALL PASS" || echo "FAILURES: $fail"
+exit "$fail"

package/skills/version-dataset/SKILL.md ADDED Viewed

@@ -0,0 +1,143 @@
+---
+name: version-dataset
+description: Dataset version control for research reproducibility. Builds a deterministic content-hash manifest of a dataset (file SHA-256 + tabular schema + per-column value hashes), verifies a later copy against it to detect drift (schema change, row-count change, value changes), and diffs two manifests. Use to prove an analysis ran on the intended data, lock a dataset version, or reproducibility-lock bundled demos.
+triggers: version dataset, dataset version, data manifest, data hash, dataset drift, reproducibility lock, verify dataset, data provenance, did my data change, manifest.lock
+tools: Read, Write, Edit, Bash, Grep, Glob
+model: inherit
+---
+# Version Dataset Skill
+You help a medical researcher put a dataset under version control: fingerprint it,
+detect when it changes, and lock a reproducible version. This guards the
+data-integrity rule — an analysis must run on the data it claims to, with a fixed
+seed — by making any drift between runs loud instead of silent.
+## Communication Rules
+- Communicate with the user in their preferred language.
+- Manifest fields, drift reports, and provenance notes are in English.
+## Philosophy
+A dataset is an input to a result; if it changes silently, every downstream
+number is suspect. This skill records a deterministic fingerprint (file SHA-256
+plus, for tabular files, schema and per-column value hashes) so a later run can
+*prove* the inputs are unchanged. It does not alter data, and it records nothing
+non-deterministic (no timestamps unless explicitly passed), so the same data
+always yields the same manifest.
+## Reference Files
+- **Manifest schema + workflow**: `${CLAUDE_SKILL_DIR}/references/manifest_schema.md` —
+  the manifest.json structure, what each drift category means, and the non-
+  deterministic-artifact policy (PPTX/DOCX timestamps). Read before interpreting drift.
+## Deterministic Script
+```bash
+# Build a manifest (record the analysis seed + provenance)
+python "${CLAUDE_SKILL_DIR}/scripts/version_dataset.py" manifest data.csv \
+  --out manifest.json --seed 42 --provenance "KNHANES 2018 extract v1"
+# Verify a later copy against it (CI / pre-analysis gate)
+python "${CLAUDE_SKILL_DIR}/scripts/version_dataset.py" verify --manifest manifest.json --strict
+# Compare two manifests (what changed between versions)
+python "${CLAUDE_SKILL_DIR}/scripts/version_dataset.py" diff --old v1.json --new v2.json
+```
+File hashing is stdlib-only; tabular schema/column hashing uses pandas when present.
+`--ignore-cols` excludes volatile columns; `--base` makes manifest keys relative.
+## Workflow
+### Step 1: Lock the version (gate)
+Build the manifest at the moment the dataset is frozen for analysis. **Gate:**
+confirm with the user the seed and provenance note are correct before locking —
+the manifest is the record they will cite as "this is the data the results came from."
+### Step 2: Verify before each run (gate)
+Before re-running an analysis (or in CI), `verify --strict`. **Gate:** if drift is
+reported, stop and show the user the drift report; do not proceed on changed data
+without their explicit acknowledgement and a re-lock. Silent re-run on drifted data
+is the failure this skill exists to prevent.
+### Step 3: Diff across versions
+When a dataset is intentionally updated, `diff` the old and new manifests and
+present the change set (added/removed/changed columns, row-count delta) so the
+user can record what changed and re-lock. **Gate:** the user approves the new
+version before it replaces the locked one.
+## Non-Deterministic Artifacts
+Some outputs (PPTX/DOCX with embedded timestamps, figures with render metadata)
+change byte-for-byte on every build even when the analysis is identical. Do not
+put these under strict byte verification — manifest only the deterministic inputs
+and tabular outputs (data files, result CSVs), or use `--ignore-cols` for volatile
+columns. See references for the policy.
+## Scope Limitations
+### Supported
+- Content-hash manifest of any file; schema + per-column hashes for tabular files
+  (CSV/TSV/Parquet/Stata/SAS/Excel).
+- Drift verification and manifest-to-manifest diff.
+### NOT Supported
+- Storing or transmitting the data itself (manifests hold hashes, not contents).
+- Cleaning, profiling, or de-identifying — use `/clean-data`, `/generate-codebook`, `/deidentify`.
+- Full pipeline-output reproducibility for non-deterministic binaries (see above).
+## Cross-Skill Integration
+- **/generate-codebook** documents *what* is in the data; version-dataset locks *which version*.
+- **/deidentify** should run before a manifest is shared (example values are not stored, but provenance notes may carry context).
+- Demo reproducibility: each bundled `demo/*/` carries a `manifest.lock.json` (input data + deterministic result tables) that `verify --strict` checks.
+## Worked Example
+Lock a freshly-frozen extract:
+```bash
+python "${CLAUDE_SKILL_DIR}/scripts/version_dataset.py" manifest cohort.csv \
+  --out manifest.json --seed 42 --provenance "KNHANES 2018 extract, frozen 2026-05"
+# -> {"files": 1, "out": "manifest.json"}
+```
+Before re-running the analysis next month:
+```bash
+python "${CLAUDE_SKILL_DIR}/scripts/version_dataset.py" verify --manifest manifest.json --strict
+# OK: 1 file(s) match the manifest.   (exit 0 — safe to run)
+```
+If someone silently re-exported the data with three extra rows:
+```text
+=========================================
+ Dataset Manifest Verify
+=========================================
+DRIFT (3):
+  ROW COUNT cohort.csv: 3457 -> 3460
+  CHANGED column cohort.csv:bmi
+  CHANGED column cohort.csv:hba1c
+MANIFEST_DRIFT: dataset differs from manifest.   (exit 1 — STOP)
+```
+The analysis does **not** proceed: the result the manuscript will cite would no
+longer match the locked data. The researcher reviews the drift, decides whether
+the change is intended, and only then re-locks (`manifest` again) and records the
+new provenance. A tabular file is compared on its **logical content** (schema +
+per-column value hashes), not raw bytes — re-saving the same data, reordering
+columns, or an `--ignore-cols` volatile timestamp column does not trip a false drift.
+## Anti-Hallucination
+- Never claim a dataset is unchanged without running `verify`.
+- Manifests record only observed hashes/schema; no provenance is invented — the
+  `provenance` note is user-supplied text.
+- Report drift exactly as computed; do not downplay a changed column hash.

package/skills/version-dataset/references/manifest_schema.md ADDED Viewed

@@ -0,0 +1,72 @@
+# Manifest Schema & Drift Categories
+`version_dataset.py` produces a deterministic `manifest.json`. This documents the
+structure, the drift categories `verify`/`diff` report, and the non-deterministic
+artifact policy.
+## manifest.json schema (schema_version 1)
+```jsonc
+{
+  "schema_version": 1,
+  "seed": 42,                          // analysis seed, user-supplied (null if none)
+  "provenance": "KNHANES 2018 v1",     // user-supplied note (null if none)
+  "stamp": "2026-05-01",               // optional: key is written only when --stamp is passed, otherwise absent
+  "files": {
+    "data/cohort.csv": {
+      "sha256": "…",                   // byte hash of the file
+      "bytes": 12345,
+      "tabular": {                     // present only for CSV/TSV/Parquet/Stata/SAS/Excel
+        "n_rows": 200,
+        "n_cols": 9,
+        "column_hashes": {"age": "…", "bmi": "…"}   // sha256 of the column's literal
+                                                    // cell strings (row order)
+      }
+    }
+  }
+}
+```
+Determinism: no timestamp is written unless `--stamp` is passed, so the same bytes
+always yield the same manifest. `--base` stores file keys relative to a directory
+(portable manifests); `--ignore-cols` omits volatile columns from `column_hashes`.
+## Drift categories (verify / diff)
+| Category | Meaning |
+|---|---|
+| `CHANGED bytes: F` | A **non-tabular** file's SHA-256 differs. Tabular files are compared on logical content (below), not raw bytes, since re-save / float formatting / an `--ignore-cols` column would otherwise produce spurious byte drift. |
+| `MISSING file: F` | F was in the manifest but is absent now. |
+| `UNEXPECTED file: F` | F is present now but not in the manifest. |
+| `ROW COUNT F: a -> b` | Tabular row count changed. |
+| `ADDED column F:c` / `REMOVED column F:c` | Schema change. |
+| `CHANGED column F:c` | Column c's values changed (compared as stringified cells; the pandas dtype itself is not hashed), even if row count is stable. |
+`verify --strict` exits non-zero if any drift is found; without `--strict` it
+reports and exits 0 (for advisory runs).
+## Non-deterministic artifact policy
+Byte-for-byte hashing is correct for data files and result tables (CSV), but
+**not** for artifacts that embed timestamps or render metadata:
+- **PPTX / DOCX** embed creation/modification timestamps → hash changes every build.
+- **PDF / PNG figures** may embed render metadata.
+Policy: manifest only the **deterministic** surface — input data files and
+tabular result outputs. Do not put PPTX/DOCX/figure binaries under `verify --strict`.
+For tabular files with a volatile column (e.g. an export timestamp column), use
+`--ignore-cols <name>` so the rest of the table is still verified.
+## Demo reproducibility (codex Improvement E)
+Each bundled `demo/<name>/manifest.lock.json` fingerprints the demo's input data
+and deterministic result tables. Verify a demo reproduces with:
+```bash
+python skills/version-dataset/scripts/version_dataset.py verify \
+  --manifest demo/01_wisconsin_bc/manifest.lock.json --base demo/01_wisconsin_bc --strict
+```
+The locks intentionally exclude the manuscript `.docx` and `.pptx` (timestamped)
+and cover the input dataset plus the `analysis/tables/*.csv` outputs.

package/skills/version-dataset/scripts/version_dataset.py ADDED Viewed

@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+"""Dataset version control: content-hash manifest, drift verification, and diff.
+Records a deterministic fingerprint of a dataset (or any analysis artifact) so a
+later run can prove the inputs/outputs are unchanged, and so two versions can be
+compared. Serves two needs:
+  1. Dataset versioning for research — detect that an extract changed between
+     analysis runs (schema drift, row-count change, value changes) instead of
+     silently re-running on different data (data-integrity rule, seed=42).
+  2. Reproducibility lock for bundled demos — hash input data + deterministic
+     outputs into a manifest.lock so CI can verify a demo still reproduces.
+Subcommands:
+  manifest <paths...> --out FILE   build a manifest (file SHA-256 + tabular schema)
+  verify   --manifest FILE [paths] recompute and compare; --strict exits non-zero on drift
+  diff     --old A --new B         compare two manifests
+File-level SHA-256 works with the stdlib alone. Tabular schema/column hashing
+uses pandas when available; without it, files are still hashed at the byte level.
+Deterministic by design: no timestamps are written unless passed via --stamp.
+"""
+from __future__ import annotations
+import argparse
+import hashlib
+import json
+import sys
+from pathlib import Path
+try:
+    import pandas as pd
+    _HAVE_PANDAS = True
+except ImportError:
+    _HAVE_PANDAS = False
+TABULAR = {".csv", ".tsv", ".parquet", ".pq", ".dta", ".sas7bdat", ".xlsx"}
+def file_sha256(path: Path) -> str:
+    h = hashlib.sha256()
+    with open(path, "rb") as f:
+        for chunk in iter(lambda: f.read(1 << 20), b""):
+            h.update(chunk)
+    return h.hexdigest()
+def _read_str(path: Path):
+    """Read a tabular file with every cell as its literal string.
+    Hashing must depend only on the data, not on the reader's environment. Native
+    dtype inference (int64 vs int32, object vs string, NaN coercion, float repr)
+    varies across pandas versions and platforms, which made manifests fail to
+    reproduce in CI. Reading CSV/TSV with dtype=str + keep_default_na=False
+    captures the exact textual content; other formats are read then stringified.
+    """
+    suf = path.suffix.lower()
+    if suf in (".csv", ".txt"):
+        return pd.read_csv(path, dtype=str, keep_default_na=False)
+    if suf == ".tsv":
+        return pd.read_csv(path, sep="\t", dtype=str, keep_default_na=False)
+    if suf in (".parquet", ".pq"):
+        df = pd.read_parquet(path)
+    elif suf == ".dta":
+        df = pd.read_stata(path)
+    elif suf == ".sas7bdat":
+        df = pd.read_sas(path)
+    elif suf == ".xlsx":
+        df = pd.read_excel(path, dtype=str)
+    else:
+        return None
+    return df.astype(str)
+def column_hashes(path: Path, ignore_cols: set[str]) -> dict | None:
+    if not _HAVE_PANDAS or path.suffix.lower() not in TABULAR:
+        return None
+    try:
+        df = _read_str(path)
+    except Exception:
+        return None
+    if df is None:
+        return None
+    cols = {}
+    for c in df.columns:
+        if c in ignore_cols:
+            continue
+        # Cells are already canonical strings (environment-independent); the
+        # pandas dtype is deliberately NOT part of the digest.
+        payload = ("\x1e".join(df[c].tolist())).encode("utf-8")
+        cols[str(c)] = hashlib.sha256(payload).hexdigest()
+    return {
+        "n_rows": int(len(df)),
+        "n_cols": int(df.shape[1]),
+        "column_hashes": cols,
+    }
+def build_entry(path: Path, ignore_cols: set[str]) -> dict:
+    entry = {"sha256": file_sha256(path), "bytes": path.stat().st_size}
+    tab = column_hashes(path, ignore_cols)
+    if tab is not None:
+        entry["tabular"] = tab
+    return entry
+def cmd_manifest(args: argparse.Namespace) -> int:
+    ignore = set(args.ignore_cols or [])
+    files = sorted(Path(p) for p in args.paths)
+    missing = [str(p) for p in files if not p.is_file()]
+    if missing:
+        print(f"ERROR: not found: {', '.join(missing)}", file=sys.stderr)
+        return 2
+    base = Path(args.base).resolve() if args.base else None
+    entries = {}
+    for p in files:
+        key = p.resolve().relative_to(base).as_posix() if base else p.as_posix()
+        entries[key] = build_entry(p, ignore)
+    manifest = {
+        "schema_version": 1,
+        "seed": args.seed,
+        "provenance": args.provenance,
+        "files": entries,
+    }
+    if args.stamp:
+        manifest["stamp"] = args.stamp
+    out = Path(args.out)
+    out.write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8")
+    print(json.dumps({"files": len(entries), "out": str(out)}, indent=2))
+    return 0
+def _compare(expected: dict, actual: dict) -> list[str]:
+    drift: list[str] = []
+    exp_files, act_files = expected.get("files", {}), actual.get("files", {})
+    for name in sorted(set(exp_files) | set(act_files)):
+        if name not in act_files:
+            drift.append(f"MISSING file: {name}")
+            continue
+        if name not in exp_files:
+            drift.append(f"UNEXPECTED file: {name}")
+            continue
+        e, a = exp_files[name], act_files[name]
+        et, at = e.get("tabular"), a.get("tabular")
+        if et and at:
+            # Tabular: compare LOGICAL content (schema + column hashes), not raw
+            # bytes. Byte hash is over-sensitive (re-save, float formatting, an
+            # --ignore-cols column) and the column hashes fully characterize the
+            # data; only flag a byte change for non-tabular files below.
+            if et["n_rows"] != at["n_rows"]:
+                drift.append(f"ROW COUNT {name}: {et['n_rows']} -> {at['n_rows']}")
+            ec, ac = set(et["column_hashes"]), set(at["column_hashes"])
+            for col in sorted(ec - ac):
+                drift.append(f"REMOVED column {name}:{col}")
+            for col in sorted(ac - ec):
+                drift.append(f"ADDED column {name}:{col}")
+            for col in sorted(ec & ac):
+                if et["column_hashes"][col] != at["column_hashes"][col]:
+                    drift.append(f"CHANGED column {name}:{col}")
+        else:
+            # Non-tabular (or no longer readable as tabular): byte hash is the
+            # only available signal.
+            if e.get("sha256") != a.get("sha256"):
+                drift.append(f"CHANGED bytes: {name}")
+    return drift
+def cmd_verify(args: argparse.Namespace) -> int:
+    expected = json.loads(Path(args.manifest).read_text(encoding="utf-8"))
+    ignore = set(args.ignore_cols or [])
+    base = Path(args.base).resolve() if args.base else None
+    actual_files = {}
+    for name in expected.get("files", {}):
+        p = (base / name) if base else Path(name)
+        if not p.is_file():
+            actual_files[name] = {"sha256": None}
+            continue
+        actual_files[name] = build_entry(p, ignore)
+    actual = {"files": actual_files}
+    drift = _compare(expected, actual)
+    print("=" * 41)
+    print(" Dataset Manifest Verify")
+    print("=" * 41)
+    if not drift:
+        print(f"OK: {len(expected.get('files', {}))} file(s) match the manifest.")
+        return 0
+    print(f"DRIFT ({len(drift)}):")
+    for d in drift:
+        print(f"  {d}")
+    if args.strict:
+        print("\nMANIFEST_DRIFT: dataset differs from manifest.", file=sys.stderr)
+        return 1
+    print("\n(non-strict: reported only; rerun with --strict to fail.)")
+    return 0
+def cmd_diff(args: argparse.Namespace) -> int:
+    old = json.loads(Path(args.old).read_text(encoding="utf-8"))
+    new = json.loads(Path(args.new).read_text(encoding="utf-8"))
+    drift = _compare(old, new)
+    if not drift:
+        print("No differences between manifests.")
+        return 0
+    print(f"Differences ({len(drift)}):")
+    for d in drift:
+        print(f"  {d}")
+    return 0
+def main() -> int:
+    ap = argparse.ArgumentParser(description="Dataset version control: manifest / verify / diff.")
+    sub = ap.add_subparsers(dest="cmd", required=True)
+    m = sub.add_parser("manifest", help="Build a content-hash manifest.")
+    m.add_argument("paths", nargs="+", help="Data/artifact files to fingerprint.")
+    m.add_argument("--out", default="manifest.json", help="Output manifest path.")
+    m.add_argument("--base", help="Base dir; manifest keys are paths relative to it.")
+    m.add_argument("--seed", type=int, default=None, help="Analysis seed to record (e.g. 42).")
+    m.add_argument("--provenance", default=None, help="Free-text provenance note.")
+    m.add_argument("--stamp", default=None, help="Optional timestamp string to record (omitted by default for determinism).")
+    m.add_argument("--ignore-cols", nargs="*", help="Column names excluded from hashing.")
+    m.set_defaults(func=cmd_manifest)
+    v = sub.add_parser("verify", help="Recompute and compare against a manifest.")
+    v.add_argument("--manifest", required=True)
+    v.add_argument("--base", help="Base dir for resolving manifest file keys.")
+    v.add_argument("--ignore-cols", nargs="*")
+    v.add_argument("--strict", action="store_true", help="Exit non-zero on any drift.")
+    v.set_defaults(func=cmd_verify)
+    d = sub.add_parser("diff", help="Compare two manifests.")
+    d.add_argument("--old", required=True)
+    d.add_argument("--new", required=True)
+    d.set_defaults(func=cmd_diff)
+    args = ap.parse_args()
+    return args.func(args)
+if __name__ == "__main__":
+    sys.exit(main())

package/skills/version-dataset/skill.yml ADDED Viewed

@@ -0,0 +1,35 @@
+schema_version: 2
+name: version-dataset
+layer: A
+owner_domain: dataset_versioning
+when_to_use: "Version-control a dataset with SHA-256 manifests for reproducibility."
+when_NOT_to_use: "Cleaning the data (use clean-data); documenting variables (use generate-codebook)."
+inputs:
+  - "analysis dataset"
+  - "prior manifest (optional)"
+outputs:
+  - "dataset manifest.lock with SHA-256 hashes"
+deterministic_scripts:
+  - scripts/version_dataset.py
+side_effects:
+  - writes_manifest_artifacts
+downstream_consumers:
+  - analyze-stats
+  - self-review
+forbidden_actions:
+  - alter_data_to_match_a_manifest
+  - report_verification_pass_without_running_verify
+# v2.1 quality card
+purpose: "Pin a dataset's contents with SHA-256 manifests so analyses are reproducible and drift is detectable."
+safety_boundaries:
+  - "Hashes are computed from the actual files; a manifest is never edited to force a pass."
+  - "Verification is deterministic and re-runnable in CI."
+known_limitations:
+  - "Detects content drift (raw bytes for non-tabular files; schema and per-column values for tabular files), not semantic correctness of the data."
+  - "A manifest is only as trustworthy as the moment it was locked."
+validation_commands:
+  - "python3 scripts/version_dataset.py verify --manifest <lock> --strict"
+evidence_surface: ci_validator

package/skills/version-dataset/tests/test_version_dataset.sh ADDED Viewed

@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Regression tests for version-dataset/scripts/version_dataset.py.
+# Self-contained: builds synthetic CSVs (no committed data).
+# The steps are order-dependent: each one rewrites the fixture that the next
+# one verifies against the manifest built earlier.
+set -uo pipefail
+REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
+VS="$REPO_ROOT/skills/version-dataset/scripts/version_dataset.py"
+# Scratch directory for fixtures and manifests; removed when the script exits.
+TMP="$(mktemp -d -t versionds.XXXXXX)"
+trap 'rm -rf "$TMP"' EXIT
+# Exit 2 (environment error, not a test failure) when the script is absent.
+[[ -f "$VS" ]] || { echo "ENV-ERR: script missing" >&2; exit 2; }
+fail=0; ran=0
+# check LABEL EXPECTED ACTUAL: PASS when the two strings are equal; counts
+# every call in $ran and every mismatch in $fail.
+check() {
+    local label="$1" expected="$2" actual="$3"
+    ran=$((ran+1))
+    if [[ "$expected" == "$actual" ]]; then printf '  PASS  %-48s %s\n' "$label" "$actual"
+    else printf '  FAIL  %-48s expected=%s actual=%s\n' "$label" "$expected" "$actual"; fail=$((fail+1)); fi
+}
+# ec CMD...: run CMD with output discarded and print its exit status.
+ec() { "$@" >/dev/null 2>&1; echo $?; }
+printf 'id,age,grp\n1,50,A\n2,61,B\n3,47,A\n' > "$TMP/d.csv"
+# manifest builds (exit 0)
+check "manifest build" 0 "$(ec python3 "$VS" manifest "$TMP/d.csv" --out "$TMP/m.json" --seed 42 --provenance test)"
+check "manifest file written" 0 "$([[ -f "$TMP/m.json" ]] && echo 0 || echo 1)"
+# verify clean (exit 0)
+check "verify clean --strict" 0 "$(ec python3 "$VS" verify --manifest "$TMP/m.json" --strict)"
+# mutate a value -> drift (exit 1) + CHANGED column reported
+printf 'id,age,grp\n1,50,A\n2,99,B\n3,47,A\n' > "$TMP/d.csv"
+check "verify value-change --strict" 1 "$(ec python3 "$VS" verify --manifest "$TMP/m.json" --strict)"
+out="$(python3 "$VS" verify --manifest "$TMP/m.json" 2>&1)"
+check "drift reports CHANGED column age" 0 "$([[ "$out" == *"CHANGED column"*":age"* ]] && echo 0 || echo 1)"
+check "non-strict drift exits 0" 0 "$(ec python3 "$VS" verify --manifest "$TMP/m.json")"
+# add a row -> new manifest + diff reports ROW COUNT
+printf 'id,age,grp\n1,50,A\n2,99,B\n3,47,A\n4,55,C\n' > "$TMP/d.csv"
+python3 "$VS" manifest "$TMP/d.csv" --out "$TMP/m2.json" >/dev/null 2>&1
+dout="$(python3 "$VS" diff --old "$TMP/m.json" --new "$TMP/m2.json" 2>&1)"
+check "diff reports ROW COUNT 3 -> 4" 0 "$([[ "$dout" == *"ROW COUNT"*"3 -> 4"* ]] && echo 0 || echo 1)"
+# --ignore-cols excludes a volatile column from hashing
+printf 'id,age,ts\n1,50,t1\n2,61,t2\n' > "$TMP/v.csv"
+python3 "$VS" manifest "$TMP/v.csv" --out "$TMP/vm.json" --ignore-cols ts >/dev/null 2>&1
+printf 'id,age,ts\n1,50,t9\n2,61,t8\n' > "$TMP/v.csv"   # only ts changes
+check "verify ignores volatile col" 0 "$(ec python3 "$VS" verify --manifest "$TMP/vm.json" --ignore-cols ts --strict)"
+# Summary line, then exit 1 if any check failed.
+printf '\n%d/%d checks passed\n' "$((ran-fail))" "$ran"
+[[ "$fail" -eq 0 ]] || exit 1