npm - trantor - Versions diffs - 0.15.0 → 0.16.0 - Mend

trantor 0.15.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/deploy/setup.sh +3 -3
package/engine/LICENSE +21 -0
package/engine/README.md +5 -0
package/engine/bin/scrooge +1276 -0
package/engine/bin/scrooge-capabilities +209 -0
package/engine/bin/scrooge-diverge +263 -0
package/engine/bin/scrooge-drift +126 -0
package/engine/bin/scrooge-verify +190 -0
package/engine/capabilities.seed.json +112 -0
package/engine/install.sh +138 -0
package/engine/lessons.seed.json +17 -0
package/engine/registry.template.json +329 -0
package/package.json +3 -2

package/engine/bin/scrooge-verify ADDED Viewed

@@ -0,0 +1,190 @@
+#!/usr/bin/env python3
+"""
+verify — harness-backed verification gate (Tier 3 executor).
+Two-part verification, cheapest-first:
+  1. DETERMINISTIC (free, ground truth): detect the toolchain and actually run
+     build / typecheck / test, capturing real exit codes + output tails. A
+     non-zero exit is an objective FAIL — no LLM can override it.
+  2. JUDGMENT (cheap LLM via the sibling `scrooge` router, --task verify): only if
+     the deterministic steps pass AND a --claim is given, ask a cheap model whether
+     the evidence actually SUPPORTS the claim and what it does NOT cover
+     (e.g. "tests pass but none exercise the new function"). Opus adjudicates last.
+The agent that wrote the code is never the judge: this re-runs everything itself.
+Usage:
+  verify --dir . --claim "added POST /v1/refunds that 409s on duplicate"
+  verify --dir path/to/repo                 # deterministic only (no claim)
+  verify --cmd "npm test" --claim "..."     # explicit command instead of autodetect
+  verify --cmd "make build" --cmd "make test"   # --cmd may be repeated; runs in order
+  verify --no-llm                           # deterministic only, skip judgment
+  verify --judge kimi --json                # choose judge model / JSON output
+  verify --timeout 120                      # per-command timeout in seconds (default 600)
+Verdict: VERIFIED (built+tested+judgment supports) · FAILED (a step errored or
+judge refutes) · INCONCLUSIVE (passes but judge finds gaps / nothing to run).
+Exit code: 0 VERIFIED, 1 INCONCLUSIVE, 2 FAILED (2 is also used when --dir does
+not exist).
+"""
+import sys, os, json, argparse, subprocess, shutil
+# Resolve the cheap-model router (sibling `scrooge`), with PATH fallback.
+# realpath() follows symlinks, so the sibling lookup happens next to the real
+# script file rather than next to a symlink pointing at it.
+_HERE = os.path.dirname(os.path.realpath(__file__))
+LLM = os.path.join(_HERE, "scrooge")
+if not os.path.exists(LLM):
+    # If `scrooge` is not on PATH either, LLM keeps the (non-existent) sibling path.
+    LLM = shutil.which("scrooge") or LLM
+def sh(cmd, cwd, timeout=600):
+    try:
+        r = subprocess.run(cmd, cwd=cwd, shell=True, stdout=subprocess.PIPE,
+                           stderr=subprocess.STDOUT, text=True, timeout=timeout)
+        return r.returncode, r.stdout
+    except subprocess.TimeoutExpired:
+        return 124, "(timed out after %ds)" % timeout
+    except Exception as e:
+        return 1, str(e)
+def tail(s, n=30):
+    lines = (s or "").strip().splitlines()
+    return "\n".join(lines[-n:])
+def has(cwd, *names):
+    return any(os.path.exists(os.path.join(cwd, n)) for n in names)
+def pkg_scripts(cwd):
+    try:
+        return json.load(open(os.path.join(cwd, "package.json"))).get("scripts", {}) or {}
+    except Exception:
+        return {}
+def detect_steps(cwd):
+    """Return [(name, cmd), ...] of build/typecheck/test commands that exist."""
+    steps = []
+    if has(cwd, "package.json"):
+        sc = pkg_scripts(cwd)
+        runner = "npm run"
+        pm = "npm"
+        if has(cwd, "pnpm-lock.yaml"): runner, pm = "pnpm", "pnpm"
+        elif has(cwd, "yarn.lock"): runner, pm = "yarn", "yarn"
+        if "build" in sc: steps.append(("build", "%s build" % runner))
+        if "typecheck" in sc: steps.append(("typecheck", "%s typecheck" % runner))
+        elif "type-check" in sc: steps.append(("typecheck", "%s type-check" % runner))
+        elif has(cwd, "tsconfig.json") and shutil.which("npx"): steps.append(("typecheck", "npx tsc --noEmit"))
+        if "test" in sc: steps.append(("test", "npm test" if pm == "npm" else "%s test" % runner))
+        return "node", steps
+    if has(cwd, "Cargo.toml"):
+        return "rust", [("build", "cargo build"), ("test", "cargo test")]
+    if has(cwd, "go.mod"):
+        return "go", [("build", "go build ./..."), ("test", "go test ./...")]
+    if has(cwd, "pyproject.toml", "setup.py", "pytest.ini", "tox.ini"):
+        steps = []
+        if shutil.which("ruff"): steps.append(("lint", "ruff check ."))
+        steps.append(("test", "python3 -m pytest -q"))
+        return "python", steps
+    if has(cwd, "Makefile"):
+        steps = []
+        mk = open(os.path.join(cwd, "Makefile")).read()
+        if "\nbuild:" in mk or mk.startswith("build:"): steps.append(("build", "make build"))
+        if "\ntest:" in mk or mk.startswith("test:"): steps.append(("test", "make test"))
+        return "make", steps
+    return "unknown", []
+def judge(claim, steps, judge_model):
+    ev = "\n".join("- %s: `%s` → exit %d\n  output tail:\n%s" %
+                   (s["name"], s["cmd"], s["exit"], "\n".join("    " + l for l in s["tail"].splitlines()[-12:]))
+                   for s in steps)
+    prompt = (
+        "You are a skeptical verification judge. A claim of completed work is below, "
+        "with the ACTUAL build/test commands that were run and their real output.\n\n"
+        "CLAIM: %s\n\nEVIDENCE (commands actually executed):\n%s\n\n"
+        "Decide, strictly from the evidence, whether it SUPPORTS the claim. Passing tests "
+        "on unrelated code do NOT support a specific claim. If nothing here actually exercises "
+        "the claimed behavior, say so. Respond as JSON: "
+        '{"supports": true|false, "gaps": ["what is not proven by this evidence"], '
+        '"verdict": "VERIFIED|INCONCLUSIVE|FAILED", "reasoning": "one or two sentences"}'
+    ) % (claim, ev)
+    try:
+        out = subprocess.run([LLM, "--task", "verify", "--model", judge_model, "--json",
+                              "--max-tokens", "700", prompt],
+                             stdout=subprocess.PIPE, stderr=None, text=True).stdout
+        import re
+        m = re.search(r"\{.*\}", out, re.DOTALL)
+        return json.loads(m.group(0)) if m else None
+    except Exception as e:
+        sys.stderr.write("[verify] judge error: %s\n" % e)
+        return None
+def main():
+    ap = argparse.ArgumentParser(prog="verify")
+    ap.add_argument("--dir", default=".")
+    ap.add_argument("--claim")
+    ap.add_argument("--cmd", action="append", help="explicit command(s) to run instead of autodetect")
+    ap.add_argument("--judge", default="deepseek-chat")
+    ap.add_argument("--no-llm", action="store_true")
+    ap.add_argument("--json", action="store_true")
+    ap.add_argument("--timeout", type=int, default=600)
+    args = ap.parse_args()
+    cwd = os.path.abspath(args.dir)
+    if not os.path.isdir(cwd):
+        sys.stderr.write("no such dir: %s\n" % cwd); sys.exit(2)
+    if args.cmd:
+        toolchain, plan = "custom", [("cmd%d" % i, c) for i, c in enumerate(args.cmd, 1)]
+    else:
+        toolchain, plan = detect_steps(cwd)
+    sys.stderr.write("\033[1m◆ VERIFY\033[0m %s  [toolchain: %s]\n" % (cwd, toolchain))
+    steps = []
+    any_fail = False
+    for name, cmd in plan:
+        sys.stderr.write("  ▶ %-10s %s\n" % (name, cmd))
+        code, out = sh(cmd, cwd, args.timeout)
+        ok = code == 0
+        any_fail = any_fail or not ok
+        steps.append({"name": name, "cmd": cmd, "exit": code, "ok": ok, "tail": tail(out)})
+        sys.stderr.write("    %s exit %d\n" % ("✓" if ok else "✗", code))
+    built = any(s["name"] in ("build", "typecheck") and s["ok"] for s in steps) or \
+            not any(s["name"] in ("build", "typecheck") for s in steps)
+    tested = any(s["name"] == "test" and s["ok"] for s in steps)
+    result = {"dir": cwd, "toolchain": toolchain, "steps": steps,
+              "built": built, "tested": tested, "ran_anything": bool(steps)}
+    if any_fail:
+        verdict = "FAILED"
+        result["blockingIssues"] = ["%s failed (exit %d)" % (s["name"], s["exit"]) for s in steps if not s["ok"]]
+    elif not steps:
+        verdict = "INCONCLUSIVE"
+        result["blockingIssues"] = ["no build/test commands detected — nothing was actually run"]
+    elif args.claim and not args.no_llm:
+        j = judge(args.claim, steps, args.judge)
+        result["llm_judgment"] = j
+        if not j:
+            verdict = "INCONCLUSIVE"
+            result["blockingIssues"] = ["judge unavailable; deterministic steps passed but claim not independently assessed"]
+        elif j.get("verdict") == "FAILED" or j.get("supports") is False:
+            verdict = "FAILED"
+            result["blockingIssues"] = j.get("gaps", []) or ["judge refuted the claim"]
+        elif j.get("verdict") == "INCONCLUSIVE" or j.get("gaps"):
+            verdict = "INCONCLUSIVE"
+            result["blockingIssues"] = j.get("gaps", [])
+        else:
+            verdict = "VERIFIED"
+    else:
+        verdict = "VERIFIED"  # steps passed, no claim to judge
+    result["verdict"] = verdict
+    if args.json:
+        json.dump(result, sys.stdout, indent=2); sys.stdout.write("\n")
+    else:
+        icon = {"VERIFIED": "✅", "INCONCLUSIVE": "⚠️", "FAILED": "❌"}[verdict]
+        sys.stderr.write("\n%s \033[1mVERDICT: %s\033[0m\n" % (icon, verdict))
+        for b in result.get("blockingIssues", []):
+            sys.stderr.write("   • %s\n" % b)
+        if result.get("llm_judgment", {}).get("reasoning"):
+            sys.stderr.write("   judge: %s\n" % result["llm_judgment"]["reasoning"])
+    sys.exit({"VERIFIED": 0, "INCONCLUSIVE": 1, "FAILED": 2}[verdict])
+if __name__ == "__main__":
+    main()

package/engine/capabilities.seed.json ADDED Viewed

@@ -0,0 +1,112 @@
+{
+  "_comment": "Token Scrooge capability seed \u2014 per-model quality scores used by the weighted router (quality-for-task / cost), gated by difficulty. Numbers are Artificial Analysis (artificialanalysis.ai) metrics, snapshot 2026-06-04: intelligence=AA Intelligence Index, coding=AA Coding Index, math=AA Math Index, reasoning=GPQA Diamond x100, speed_tps=median output tokens/sec (all 0-100 except speed). Committed as a starter set; refresh/override the user-local ~/.token-scrooge/capabilities.json weekly via `scrooge-capabilities` (AA + OpenRouter). null = AA had no score for that eval; the router falls back to the intelligence index.",
+  "_meta": {
+    "source": "artificialanalysis.ai",
+    "snapshot": "2026-06-04",
+    "attribution": "https://artificialanalysis.ai/"
+  },
+  "deepseek-v4-flash": {
+    "intelligence": 46.5,
+    "coding": 38.7,
+    "math": null,
+    "reasoning": 89.4,
+    "speed_tps": 119.722,
+    "aa_slug": "deepseek-v4-flash"
+  },
+  "deepseek-v4-pro": {
+    "intelligence": 51.5,
+    "coding": 47.5,
+    "math": null,
+    "reasoning": 88.8,
+    "speed_tps": 46.223,
+    "aa_slug": "deepseek-v4-pro"
+  },
+  "kimi-k2.6": {
+    "intelligence": 53.9,
+    "coding": 47.1,
+    "math": null,
+    "reasoning": 91.1,
+    "speed_tps": 41.575,
+    "aa_slug": "kimi-k2-6"
+  },
+  "glm-4.5-air": {
+    "intelligence": 23.2,
+    "coding": 23.8,
+    "math": 80.7,
+    "reasoning": 73.3,
+    "speed_tps": 74.495,
+    "aa_slug": "glm-4-5-air"
+  },
+  "glm-4.7": {
+    "intelligence": 42.1,
+    "coding": 36.3,
+    "math": 95,
+    "reasoning": 85.9,
+    "speed_tps": 79.245,
+    "aa_slug": "glm-4-7"
+  },
+  "glm-5": {
+    "intelligence": 49.8,
+    "coding": 44.2,
+    "math": null,
+    "reasoning": 82.0,
+    "speed_tps": 79.532,
+    "aa_slug": "glm-5"
+  },
+  "gemini-2.5-flash-lite": {
+    "intelligence": 12.7,
+    "coding": 7.4,
+    "math": 35.3,
+    "reasoning": 47.4,
+    "speed_tps": 229.515,
+    "aa_slug": "gemini-2-5-flash-lite"
+  },
+  "gemini-2.5-flash": {
+    "intelligence": 20.6,
+    "coding": 17.8,
+    "math": 60.3,
+    "reasoning": 68.3,
+    "speed_tps": 185.129,
+    "aa_slug": "gemini-2-5-flash"
+  },
+  "gemini-3-flash-preview": {
+    "intelligence": 35,
+    "coding": 37.8,
+    "math": 55.7,
+    "reasoning": 81.2,
+    "speed_tps": 181.264,
+    "aa_slug": "gemini-3-flash"
+  },
+  "gpt-5-nano": {
+    "intelligence": 26.8,
+    "coding": 20.3,
+    "math": 83.7,
+    "reasoning": 67.6,
+    "speed_tps": 150.373,
+    "aa_slug": "gpt-5-nano"
+  },
+  "gpt-5-mini": {
+    "intelligence": 41.2,
+    "coding": 35.3,
+    "math": 90.7,
+    "reasoning": 82.8,
+    "speed_tps": 87.426,
+    "aa_slug": "gpt-5-mini"
+  },
+  "gpt-4.1-mini": {
+    "intelligence": 22.9,
+    "coding": 18.5,
+    "math": 46.3,
+    "reasoning": 66.4,
+    "speed_tps": 79.254,
+    "aa_slug": "gpt-4-1-mini"
+  },
+  "grok-4.3": {
+    "intelligence": 53.2,
+    "coding": 41,
+    "math": null,
+    "reasoning": 90.1,
+    "speed_tps": 125.405,
+    "aa_slug": "grok-4-3"
+  }
+}

package/engine/install.sh ADDED Viewed

@@ -0,0 +1,138 @@
+#!/usr/bin/env bash
+# Token Scrooge installer — make the cheap models do the grunt work.
+# Usage:
+#   git clone https://github.com/sashabogi/token-scrooge && cd token-scrooge && ./install.sh
+#   curl -fsSL https://raw.githubusercontent.com/sashabogi/token-scrooge/main/install.sh | bash
+# Abort on any failing command, on use of an unset variable, and on a failure
+# anywhere inside a pipeline.
+set -euo pipefail
+# Each setting below can be overridden from the environment; the value after
+# `:-` is the default.
+REPO_URL="${SCROOGE_REPO_URL:-https://github.com/sashabogi/token-scrooge}"
+BIN_DIR="${SCROOGE_BIN_DIR:-$HOME/.local/bin}"
+SCROOGE_HOME="${SCROOGE_HOME:-$HOME/.token-scrooge}"
+# Print all arguments on one line.
+say() { printf '%s\n' "$*"; }
+# --- prerequisites -------------------------------------------------------
+command -v python3 >/dev/null 2>&1 || { say "✗ python3 is required (3.8+)."; exit 1; }
+# --- locate the repo (clone if piped via curl) ---------------------------
+# SRC is the directory holding this script, or empty if it cannot be resolved
+# (`|| true` keeps `set -e` from aborting on a failed cd).
+SRC="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" 2>/dev/null && pwd || true)"
+# No bin/scrooge next to the script means we are not running from a checkout:
+# fetch one into $SCROOGE_HOME/repo and install from there.
+if [ -z "${SRC:-}" ] || [ ! -f "$SRC/bin/scrooge" ]; then
+  command -v git >/dev/null 2>&1 || { say "✗ git is required to bootstrap (or run ./install.sh from a clone)."; exit 1; }
+  SRC="$SCROOGE_HOME/repo"
+  say "▸ Fetching Token Scrooge into $SRC ..."
+  # Reuse an existing clone (fast-forward only); otherwise make a shallow clone.
+  if [ -d "$SRC/.git" ]; then git -C "$SRC" pull --ff-only --quiet; else git clone --depth 1 "$REPO_URL" "$SRC" --quiet; fi
+fi
+# --- install -------------------------------------------------------------
+mkdir -p "$BIN_DIR" "$SCROOGE_HOME"
+for b in scrooge scrooge-diverge scrooge-verify scrooge-drift scrooge-capabilities; do
+  chmod +x "$SRC/bin/$b"
+  ln -sf "$SRC/bin/$b" "$BIN_DIR/$b"   # symlink → `git pull` keeps tools current
+done
+# --- registry: refresh untouched copies, never clobber local edits ----------
+# We keep the last-shipped template at $SCROOGE_HOME/registry.template.json as a
+# baseline. If your live registry.json is byte-identical to that baseline you
+# never edited it, so it's safe to roll forward to the new template. If it
+# differs, you (or a manual sync) changed it — we preserve it, and flag a newer
+# template only when the shipped template actually changed since the last run.
+NEW_TPL="$SRC/registry.template.json"
+OLD_TPL="$SCROOGE_HOME/registry.template.json"
+REG="$SCROOGE_HOME/registry.json"
+if [ ! -f "$REG" ]; then
+  cp "$NEW_TPL" "$REG"                                   # fresh install
+  say "✓ Registry installed."
+elif cmp -s "$REG" "$NEW_TPL"; then
+  : # already current — nothing to do
+elif [ -f "$OLD_TPL" ] && cmp -s "$REG" "$OLD_TPL"; then
+  cp "$NEW_TPL" "$REG"                                   # untouched copy → roll forward
+  say "✓ Registry auto-refreshed to the latest models (no local edits detected)."
+elif [ -f "$OLD_TPL" ] && cmp -s "$NEW_TPL" "$OLD_TPL"; then
+  : # template unchanged since the last install — registry.json only differs by
+    # local edits, so there is no newer template to announce
+else
+  say "⚠ A newer registry template is available, but your registry.json has local"
+  say "  edits — leaving it untouched. Compare with:"
+  say "      diff \"$REG\" \"$NEW_TPL\"     (or run: scrooge-drift)"
+fi
+cp "$NEW_TPL" "$OLD_TPL"                                 # update baseline for next run
+# --- seeds: keep current copies of the committed seed files in $SCROOGE_HOME ---
+# lessons.seed.json      — live-training starter guardrails. The user-local
+#                          lessons.json (gitignored) is created from it on first
+#                          use and never clobbered.
+# capabilities.seed.json — quality scores for the weighted router (refreshed by
+#                          scrooge-capabilities).
+# A seed that is absent from the source tree is skipped.
+for seed_name in lessons.seed.json capabilities.seed.json; do
+  if [ -f "$SRC/$seed_name" ]; then
+    cp "$SRC/$seed_name" "$SCROOGE_HOME/$seed_name"
+  fi
+done
+say "✓ Installed: scrooge, scrooge-diverge, scrooge-verify, scrooge-drift, scrooge-capabilities → $BIN_DIR"
+# --- weekly self-maintenance: refresh model quality scores (capability routing) ----------
+# macOS uses a user LaunchAgent (no Full Disk Access needed, unlike crontab); Linux uses cron.
+# Idempotent and non-fatal — a failure here never blocks the install.
+setup_weekly_refresh() {
+  local tool="$BIN_DIR/scrooge-capabilities"
+  local log="$SCROOGE_HOME/capabilities-refresh.log"
+  # Nothing to schedule if the tool was not installed as an executable.
+  [ -x "$tool" ] || return 0
+  case "$(uname -s)" in
+    Darwin)
+      local label="com.tokenscrooge.capabilities"
+      local plist="$HOME/Library/LaunchAgents/$label.plist"
+      local py; py="$(command -v python3 || echo /usr/bin/python3)"
+      mkdir -p "$HOME/Library/LaunchAgents"
+      # The plist is rewritten on every run so path changes are picked up.
+      cat > "$plist" <<PLIST
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+  <key>Label</key><string>$label</string>
+  <key>ProgramArguments</key>
+  <array><string>$py</string><string>$tool</string></array>
+  <key>StartCalendarInterval</key>
+  <dict><key>Weekday</key><integer>1</integer><key>Hour</key><integer>9</integer><key>Minute</key><integer>5</integer></dict>
+  <key>StandardOutPath</key><string>$log</string>
+  <key>StandardErrorPath</key><string>$log</string>
+  <key>RunAtLoad</key><false/>
+</dict>
+</plist>
+PLIST
+      # Unload any previous copy first so the bootstrap below loads the fresh plist.
+      launchctl bootout "gui/$(id -u)/$label" 2>/dev/null || true
+      if launchctl bootstrap "gui/$(id -u)" "$plist" 2>/dev/null; then
+        say "✓ Weekly capability refresh scheduled (LaunchAgent · Mondays 09:05)."
+      else
+        say "ℹ LaunchAgent written to $plist — load it with: launchctl bootstrap gui/$(id -u) \"$plist\""
+      fi
+      ;;
+    *)
+      # Both paths are quoted: cron hands the command to a shell, which would
+      # otherwise split a path containing spaces.
+      local line="5 9 * * 1 \"$tool\" > \"$log\" 2>&1"
+      if command -v crontab >/dev/null 2>&1; then
+        # Read the crontab once and match in-shell. Piping `crontab -l` into
+        # `grep -q` under `set -o pipefail` can report failure even on a match
+        # (grep exits early, the writer gets SIGPIPE), which would append a
+        # duplicate entry. An absent crontab simply yields an empty string.
+        local current; current="$(crontab -l 2>/dev/null || true)"
+        if [[ "$current" == *scrooge-capabilities* ]]; then
+          say "✓ Weekly capability refresh already in crontab."
+        elif { [ -z "$current" ] || printf '%s\n' "$current"; printf '%s\n' "$line"; } | crontab - 2>/dev/null; then
+          say "✓ Weekly capability refresh added to crontab (Mondays 09:05)."
+        else
+          say "ℹ Could not edit crontab automatically. Add this line yourself:"
+          say "    $line"
+        fi
+      else
+        say "ℹ No crontab found — schedule '$tool' weekly however you prefer."
+      fi
+      ;;
+  esac
+}
+# Scheduling is best-effort: `|| true` keeps a failure from aborting the install.
+setup_weekly_refresh || true
+# Warn when $BIN_DIR is not one of the colon-separated PATH entries; wrapping
+# both sides in ':' makes the match exact for first, middle and last entries.
+case ":$PATH:" in
+  *":$BIN_DIR:"*) ;;
+  *) say "⚠ $BIN_DIR is not on your PATH. Add it:"
+     say "    echo 'export PATH=\"$BIN_DIR:\$PATH\"' >> ~/.zshrc && source ~/.zshrc" ;;
+esac
+# --- first-run setup -----------------------------------------------------
+# Skip the wizard when asked to (--no-setup as the first argument) or when stdin
+# is not a terminal (e.g. the script was piped into bash); print the command to
+# run later instead.
+if [ "${1:-}" = "--no-setup" ] || [ ! -t 0 ]; then
+  say ""
+  say "Next: run the setup wizard to pick your orchestrator and add API keys:"
+  say "    scrooge setup"
+else
+  say ""
+  "$SRC/bin/scrooge" setup
+fi

package/engine/lessons.seed.json ADDED Viewed

@@ -0,0 +1,17 @@
+{
+  "_comment": "Token Scrooge live-training seed lessons. Short corrective guardrails injected into a cheap model's system prompt at routing time. Shape: { \"<model-id-or-alias>\": { \"<task>\"|\"*\": [\"one-liner\", ...] } }, plus a top-level \"*\" model bucket for lessons that apply to EVERY routed (cheap, execution-only) model. This file is COMMITTED and read-only at runtime; on first use it is copied to the user-local $SCROOGE_HOME/lessons.json, which then overrides/extends it (edit there via `scrooge learn` / `scrooge forget`, or re-merge new seeds with `scrooge learn --seed`). Keys starting with \"_\" are metadata and ignored by the loader.",
+  "_generalization_note": "The three deepseek-v4-flash/code lessons come from the 2026-06-04 polymarket dogfood. Decision: the order-book ordering pitfall is kept model-specific (deepseek-v4-flash, where it was observed), while the two model-agnostic correctness rules (absent numerics -> 0.0/schema default, exact schema key names) are promoted to the top-level \"*\"/code bucket so EVERY cheap code model inherits them. Scrooge only ever routes cheap execution models, so a \"*\" lesson never reaches an orchestrator. Exact-text de-dup means deepseek/code still shows all three with no duplication.",
+  "deepseek-v4-flash": {
+    "code": [
+      "Never assume API array ordering — sort order-book bids/asks explicitly by price.",
+      "Use 0.0 (or the schema default) for absent numeric values, not None.",
+      "Use the exact key names from the provided schema/example; do not invent fields."
+    ]
+  },
+  "*": {
+    "code": [
+      "Use 0.0 (or the schema default) for absent numeric values, not None.",
+      "Use the exact key names from the provided schema/example; do not invent fields."
+    ]
+  }
+}