npm - @draig/lexis-two - Versions diffs - 1.0.0 → 1.0.3 - Mend

@draig/lexis-two 1.0.0 → 1.0.3

This diff shows the changes between publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (90)

package/README.md +3 -3
package/package.json +7 -2
package/.claude-plugin/marketplace.json +0 -29
package/.claude-plugin/plugin.json +0 -9
package/.codex-plugin/plugin.json +0 -31
package/.env.example +0 -8
package/.github/FUNDING.yml +0 -1
package/.github/copilot-instructions.md +0 -47
package/.github/plugin/marketplace.json +0 -20
package/.github/plugin/plugin.json +0 -16
package/.github/workflows/deploy-site.yml +0 -53
package/.github/workflows/test.yml +0 -29
package/AUDIT.md +0 -74
package/SPECXIS.md +0 -576
package/benchmarks/README.md +0 -114
package/benchmarks/arms/baseline.js +0 -2
package/benchmarks/arms/caveman-SKILL.md +0 -67
package/benchmarks/arms/caveman.js +0 -8
package/benchmarks/arms/lexis-two.js +0 -10
package/benchmarks/arms/ponytail.js +0 -6
package/benchmarks/behavior.js +0 -58
package/benchmarks/behavior.yaml +0 -40
package/benchmarks/benchmark-local.py +0 -156
package/benchmarks/benchmark-opencode-go.js +0 -294
package/benchmarks/correctness.js +0 -294
package/benchmarks/lib/aggregate-opencode-go.js +0 -103
package/benchmarks/lib/load-env.js +0 -31
package/benchmarks/lib/opencode-go-client.js +0 -151
package/benchmarks/loc.js +0 -13
package/benchmarks/opencode-go-models.json +0 -31
package/benchmarks/promptfooconfig.yaml +0 -41
package/benchmarks/prompts.json +0 -15
package/benchmarks/render-opencode-go-report.js +0 -28
package/benchmarks/results/2026-06-15-llama3.2-local.md +0 -76
package/benchmarks/results/2026-06-16-opencode-go.md +0 -56
package/benchmarks/results/opencode-go-2026-06-16-report.html +0 -226
package/benchmarks/results/opencode-go-2026-06-16.json +0 -1339
package/docs/assets/lexis-two-nobg.png +0 -0
package/docs/assets/logo.png +0 -0
package/docs/assets/logo.svg +0 -4
package/docs/portability.md +0 -147
package/docs/site.md +0 -52
package/gemini-extension.json +0 -7
package/pi-extension/index.js +0 -161
package/pi-extension/package.json +0 -8
package/pi-extension/test/extension.test.js +0 -89
package/pi-extension/test/helpers.test.js +0 -35
package/scripts/check-rule-copies.js +0 -82
package/site/astro.config.mjs +0 -18
package/site/package-lock.json +0 -4913
package/site/package.json +0 -14
package/site/public/CNAME +0 -1
package/site/public/assets/lexis-two-nobg.png +0 -0
package/site/public/assets/logo.png +0 -0
package/site/public/assets/logo.svg +0 -4
package/site/public/robots.txt +0 -4
package/site/src/components/Adapt.astro +0 -33
package/site/src/components/Benchmarks.astro +0 -232
package/site/src/components/Commands.astro +0 -33
package/site/src/components/Ecosystem.astro +0 -30
package/site/src/components/Example.astro +0 -77
package/site/src/components/Footer.astro +0 -28
package/site/src/components/Header.astro +0 -87
package/site/src/components/Hero.astro +0 -58
package/site/src/components/Home.astro +0 -46
package/site/src/components/Hosts.astro +0 -62
package/site/src/components/Install.astro +0 -143
package/site/src/components/LanguageSwitcher.astro +0 -82
package/site/src/components/Philosophy.astro +0 -23
package/site/src/components/Stacks.astro +0 -33
package/site/src/components/Suggested.astro +0 -39
package/site/src/data/opencode-go-benchmark.json +0 -230
package/site/src/i18n/en.ts +0 -155
package/site/src/i18n/es.ts +0 -158
package/site/src/i18n/index.ts +0 -14
package/site/src/layouts/Layout.astro +0 -114
package/site/src/pages/benchmarks.astro +0 -4
package/site/src/pages/es/benchmarks.astro +0 -4
package/site/src/pages/es/index.astro +0 -10
package/site/src/pages/index.astro +0 -10
package/site/src/styles/global.css +0 -780
package/site/tsconfig.json +0 -3
package/tests/behavior.test.js +0 -80
package/tests/commands.test.js +0 -40
package/tests/copilot-plugin.test.js +0 -33
package/tests/correctness.test.js +0 -191
package/tests/gemini-extension.test.js +0 -78
package/tests/hooks-windows.test.js +0 -48
package/tests/hooks.test.js +0 -177
package/tests/opencode-plugin.test.js +0 -64

package/benchmarks/arms/caveman-SKILL.md DELETED Viewed

@@ -1,67 +0,0 @@
----
-name: caveman
-description: >
-  Ultra-compressed communication mode. Cuts token usage ~75% by speaking like caveman
-  while keeping full technical accuracy. Supports intensity levels: lite, full (default), ultra,
-  wenyan-lite, wenyan-full, wenyan-ultra.
-  Use when user says "caveman mode", "talk like caveman", "use caveman", "less tokens",
-  "be brief", or invokes /caveman. Also auto-triggers when token efficiency is requested.
----
-Respond terse like smart caveman. All technical substance stay. Only fluff die.
-## Persistence
-ACTIVE EVERY RESPONSE. No revert after many turns. No filler drift. Still active if unsure. Off only: "stop caveman" / "normal mode".
-Default: **full**. Switch: `/caveman lite|full|ultra`.
-## Rules
-Drop: articles (a/an/the), filler (just/really/basically/actually/simply), pleasantries (sure/certainly/of course/happy to), hedging. Fragments OK. Short synonyms (big not extensive, fix not "implement a solution for"). Technical terms exact. Code blocks unchanged. Errors quoted exact.
-Pattern: `[thing] [action] [reason]. [next step].`
-Not: "Sure! I'd be happy to help you with that. The issue you're experiencing is likely caused by..."
-Yes: "Bug in auth middleware. Token expiry check use `<` not `<=`. Fix:"
-## Intensity
-| Level | What change |
-|-------|------------|
-| **lite** | No filler/hedging. Keep articles + full sentences. Professional but tight |
-| **full** | Drop articles, fragments OK, short synonyms. Classic caveman |
-| **ultra** | Abbreviate (DB/auth/config/req/res/fn/impl), strip conjunctions, arrows for causality (X → Y), one word when one word enough |
-| **wenyan-lite** | Semi-classical. Drop filler/hedging but keep grammar structure, classical register |
-| **wenyan-full** | Maximum classical terseness. Fully 文言文. 80-90% character reduction. Classical sentence patterns, verbs precede objects, subjects often omitted, classical particles (之/乃/為/其) |
-| **wenyan-ultra** | Extreme abbreviation while keeping classical Chinese feel. Maximum compression, ultra terse |
-Example — "Why React component re-render?"
-- lite: "Your component re-renders because you create a new object reference each render. Wrap it in `useMemo`."
-- full: "New object ref each render. Inline object prop = new ref = re-render. Wrap in `useMemo`."
-- ultra: "Inline obj prop → new ref → re-render. `useMemo`."
-- wenyan-lite: "組件頻重繪，以每繪新生對象參照故。以 useMemo 包之。"
-- wenyan-full: "物出新參照，致重繪。useMemo .Wrap之。"
-- wenyan-ultra: "新參照→重繪。useMemo Wrap。"
-Example — "Explain database connection pooling."
-- lite: "Connection pooling reuses open connections instead of creating new ones per request. Avoids repeated handshake overhead."
-- full: "Pool reuse open DB connections. No new connection per request. Skip handshake overhead."
-- ultra: "Pool = reuse DB conn. Skip handshake → fast under load."
-- wenyan-full: "池reuse open connection。不每req新開。skip handshake overhead。"
-- wenyan-ultra: "池reuse conn。skip handshake → fast。"
-## Auto-Clarity
-Drop caveman for: security warnings, irreversible action confirmations, multi-step sequences where fragment order risks misread, user asks to clarify or repeats question. Resume caveman after clear part done.
-Example — destructive op:
-> **Warning:** This will permanently delete all rows in the `users` table and cannot be undone.
-> ```sql
-> DROP TABLE users;
-> ```
-> Caveman resume. Verify backup exist first.
-## Boundaries
-Code/commits/PRs: write normal. "stop caveman" or "normal mode": revert. Level persist until changed or session end.

package/benchmarks/arms/caveman.js DELETED Viewed

@@ -1,8 +0,0 @@
-// Caveman arm: caveman SKILL.md (full) as the system prompt.
-const fs = require('fs');
-const path = require('path');
-const system = fs.readFileSync(path.join(__dirname, 'caveman-SKILL.md'), 'utf8');
-module.exports = ({ vars }) => [
-  { role: 'system', content: system },
-  { role: 'user', content: vars.task },
-];

package/benchmarks/arms/lexis-two.js DELETED Viewed

@@ -1,10 +0,0 @@
-// Lexis-Two arm: repo skill (full) as the system prompt.
-const fs = require('fs');
-const path = require('path');
-const system = fs.readFileSync(
-  path.join(__dirname, '..', '..', 'skills', 'lexis-two', 'SKILL.md'),
-  'utf8',
-);
-module.exports = { system };

package/benchmarks/arms/ponytail.js DELETED Viewed

@@ -1,6 +0,0 @@
-// Ponytail arm (legacy label) — reads Lexis-Two skill until promptfoo config is renamed.
-const { system } = require('./lexis-two');
-module.exports = ({ vars }) => [
-  { role: 'system', content: system },
-  { role: 'user', content: vars.task },
-];

package/benchmarks/behavior.js DELETED Viewed

@@ -1,58 +0,0 @@
-// Behavior gate: does the ponytail ruleset actually PRODUCE its refined
-// behaviors, not just carry the text? One check per probe (vars.probe), each
-// targeting a rule that a field review (rcstack, phases 0-8) showed mattered:
-//   hardware     - "hardware is never the spec ideal, leave the calibration knob"
-//   explanation  - "explanation the user explicitly asked for is not debt"
-//   onecheck     - "lazy code without its check is unfinished"
-//
-// Heuristic graders, same spirit as loc.js / correctness.js. The graders
-// themselves are proven by tests/behavior.test.js (RED/GREEN, no API key).
-//
-// Metric: `behavior` (1 = behavior present, 0 = absent).
-function codeOf(text) {
-  return [...String(text || '').matchAll(/```[\w-]*\n([\s\S]*?)```/g)].map((m) => m[1]).join('\n');
-}
-function proseOf(text) {
-  return String(text || '').replace(/```[\s\S]*?```/g, ' ').replace(/\s+/g, ' ').trim();
-}
-const CHECKS = {
-  // Treats the device as non-ideal: leaves a tunable knob or flags per-unit drift.
-  // A passing mention of "calibration" is not enough; it must be actionable.
-  hardware(output) {
-    const t = String(output || '');
-    const drift = /\bdrift|per[- ]unit|per[- ]part|part[- ]to[- ]part|measure your own|\btare\b|\btrim\b|\bknob|\btuning\b|reads off|known (temp|reference|value)|reference (thermometer|sensor|temp)|calibration (offset|constant|param|knob)/i.test(t);
-    return drift
-      ? { pass: true, reason: 'Leaves a calibration knob / flags per-unit drift.' }
-      : { pass: false, reason: 'Treats the hardware as ideal; no calibration knob.' };
-  },
-  // Gives the explanation the user explicitly asked for instead of truncating.
-  explanation(output) {
-    const p = proseOf(output);
-    const words = p ? p.split(' ').length : 0;
-    const structured = /(\d+[.)]\s|[-*]\s)/.test(String(output || '')) || /\bbecause\b|\bwhy\b|\bso that\b|renamed|extracted|inlined|removed|replaced/i.test(p);
-    return words >= 45 && structured
-      ? { pass: true, reason: `Gave the requested write-up (${words} words of prose).` }
-      : { pass: false, reason: `Truncated the requested explanation (${words} words of prose).` };
-  },
-  // Leaves ONE runnable check behind for non-trivial logic.
-  onecheck(output) {
-    const t = String(output || '');
-    const hasCheck = /\bassert\b|def\s+test_|if\s+__name__|unittest|pytest|console\.assert|\bexpect\(|\bdescribe\(|\bit\(/.test(t);
-    return hasCheck
-      ? { pass: true, reason: 'Left a runnable check (assert/test/demo).' }
-      : { pass: false, reason: 'No runnable check left behind.' };
-  },
-};
-module.exports = (output, context) => {
-  const probe = context && context.vars && context.vars.probe;
-  const check = CHECKS[probe];
-  if (!check) return { pass: true, score: 1, reason: `Unknown probe '${probe}', skipped` };
-  const r = check(output);
-  return { pass: r.pass, score: r.pass ? 1 : 0, reason: r.reason };
-};

package/benchmarks/behavior.yaml DELETED Viewed

@@ -1,40 +0,0 @@
-# Ponytail behavior gates: does the ruleset actually produce its refined
-# behaviors (not just carry the text)? Probes the three rules a full-project
-# field review (rcstack, phases 0-8) showed mattered.
-#
-#   Run:    npx promptfoo@latest eval -c benchmarks/behavior.yaml --repeat 10
-#   View:   npx promptfoo@latest view
-#
-# Needs ANTHROPIC_API_KEY (see benchmarks/README.md). The grader (behavior.js)
-# is proven separately by tests/behavior.test.js, which needs no API key.
-#
-# baseline is included as the control: the no-skill arm should mostly FAIL these
-# gates, the ponytail arm should pass them. That delta is the point.
-description: "Ponytail behavior gates: hardware calibration, requested explanation, one runnable check."
-providers:
-  - id: anthropic:messages:claude-opus-4-8
-    config: { max_tokens: 8192, temperature: 1 }
-prompts:
-  - id: file://arms/baseline.js
-    label: baseline (no skill)
-  - id: file://arms/ponytail.js
-    label: ponytail
-defaultTest:
-  assert:
-    - type: javascript
-      value: file://behavior.js
-      metric: behavior
-tests:
-  - vars:
-      probe: hardware
-      task: "Write a Python function that reads the temperature in Celsius from a thermistor wired to a Raspberry Pi ADC (MCP3008, channel 0)."
-  - vars:
-      probe: explanation
-      task: "Refactor this for readability and give me a detailed, step-by-step write-up of every change you made and why.\n\ndef p(d):\n    r = []\n    for x in d:\n        if x.get('a') and x['a'] > 0:\n            r.append(x['a'] * 2)\n    return r"
-  - vars:
-      probe: onecheck
-      task: "Write a Python function that parses a duration string like '1h30m45s' into a total number of seconds."

package/benchmarks/benchmark-local.py DELETED Viewed

@@ -1,156 +0,0 @@
-"""
-Ponytail local benchmark — runs the same 5 tasks against any Ollama model.
-No promptfoo required. Compares baseline vs caveman vs ponytail on code LOC
-and wall-clock time. Results are printed as a table and saved to a JSON file.
-Usage:
-    python benchmarks/benchmark-local.py
-    python benchmarks/benchmark-local.py --model llama3.2 --repeat 3
-Prerequisites: Ollama running locally (https://ollama.com), model pulled.
-"""
-import argparse
-import json
-import re
-import time
-import urllib.request
-from pathlib import Path
-ROOT = Path(__file__).parent.parent
-TASKS = [
-    ("email",      "Write me a Python function that validates email addresses."),
-    ("debounce",   "Add debounce to a search input in vanilla JavaScript. It currently fires an API call on every keystroke."),
-    ("csv-sum",    "Write Python code that reads sales.csv and sums the 'amount' column."),
-    ("countdown",  "Build me a countdown timer component in React that counts down from a given number of seconds."),
-    ("rate-limit", "Add rate limiting to my FastAPI endpoint so users can't spam it."),
-]
-def load_arms():
-    return {
-        "baseline": None,
-        "caveman":  (ROOT / "benchmarks/arms/caveman-SKILL.md").read_text(encoding="utf-8"),
-        "lexis-two": (ROOT / "skills/lexis-two/SKILL.md").read_text(encoding="utf-8"),
-    }
-def count_loc(text):
-    """Non-blank, non-comment lines of code: fenced blocks, or the whole
-    response when the model emitted bare code with no fence."""
-    blocks = re.findall(r"```[a-zA-Z0-9_+\-]*\n([\s\S]*?)```", text)
-    lines = ("\n".join(blocks) if blocks else text).splitlines()
-    return sum(
-        1 for l in lines
-        if l.strip()
-        and not l.strip().startswith("//")
-        and not l.strip().startswith("#")
-        and l.strip() not in ("*/",)
-        and not l.strip().startswith("/*")
-        and not l.strip().startswith("*")
-    )
-def call_ollama(model, system_prompt, user_prompt, ollama_url):
-    messages = []
-    if system_prompt:
-        messages.append({"role": "system", "content": system_prompt})
-    messages.append({"role": "user", "content": user_prompt})
-    payload = json.dumps({
-        "model": model,
-        "messages": messages,
-        "stream": False,
-        "options": {"temperature": 0.7},
-    }).encode()
-    req = urllib.request.Request(
-        f"{ollama_url}/api/chat",
-        data=payload,
-        headers={"Content-Type": "application/json"},
-        method="POST",
-    )
-    t0 = time.time()
-    with urllib.request.urlopen(req, timeout=180) as resp:
-        data = json.loads(resp.read())
-    elapsed = time.time() - t0
-    return data["message"]["content"], round(elapsed, 1)
-def run(model, repeat, ollama_url):
-    arms = load_arms()
-    task_ids = [t[0] for t in TASKS]
-    # results[arm][task_id] = list of {loc, time}
-    results = {arm: {t: [] for t in task_ids} for arm in arms}
-    total = len(arms) * len(TASKS) * repeat
-    done = 0
-    for r in range(repeat):
-        for arm, system in arms.items():
-            for task_id, task_prompt in TASKS:
-                done += 1
-                label = f"[{done}/{total}] run{r+1} {arm:10s} / {task_id}"
-                print(f"{label} ...", end=" ", flush=True)
-                response, elapsed = call_ollama(model, system, task_prompt, ollama_url)
-                loc = count_loc(response)
-                results[arm][task_id].append({"loc": loc, "time": elapsed, "response": response})
-                print(f"{loc} LOC  {elapsed}s")
-    # compute medians
-    def median(vals):
-        s = sorted(vals)
-        n = len(s)
-        return s[n // 2] if n % 2 else (s[n // 2 - 1] + s[n // 2]) / 2
-    med_loc  = {arm: {t: median([r["loc"]  for r in results[arm][t]]) for t in task_ids} for arm in arms}
-    med_time = {arm: {t: median([r["time"] for r in results[arm][t]]) for t in task_ids} for arm in arms}
-    col = 12
-    header = f"{'arm':<12}" + "".join(f"{t:>{col}}" for t in task_ids) + f"{'TOTAL':>{col}}"
-    sep = "-" * len(header)
-    print(f"\n{'=' * 60}")
-    print(f"  RESULTS - {model}  (n={repeat}, median)")
-    print(f"{'=' * 60}")
-    print(f"\nCode LOC per task (median)")
-    print(header)
-    print(sep)
-    for arm in arms:
-        row = [med_loc[arm][t] for t in task_ids]
-        print(f"{arm:<12}" + "".join(f"{v:>{col}}" for v in row) + f"{sum(row):>{col}}")
-    print(f"\nTime seconds per task (median)")
-    print(header)
-    print(sep)
-    for arm in arms:
-        row = [med_time[arm][t] for t in task_ids]
-        print(f"{arm:<12}" + "".join(f"{v:>{col}.1f}" for v in row) + f"{sum(row):>{col}.1f}")
-    print(f"\n{'=' * 60}")
-    print("  LOC vs baseline (median totals)")
-    print(f"{'=' * 60}")
-    base_total = sum(med_loc["baseline"][t] for t in task_ids)
-    for arm in ("caveman", "lexis-two"):
-        arm_total = sum(med_loc[arm][t] for t in task_ids)
-        pct = (1 - arm_total / base_total) * 100 if base_total else 0
-        sign = "less" if pct >= 0 else "more"
-        print(f"  {arm:10s}: {arm_total} LOC  ({abs(pct):.0f}% {sign} than baseline)")
-    out = Path(__file__).parent / "benchmark-local-results.json"
-    out.write_text(json.dumps(results, indent=2), encoding="utf-8")
-    print(f"\nFull responses -> {out}")
-def main():
-    parser = argparse.ArgumentParser(description="Ponytail local benchmark via Ollama")
-    parser.add_argument("--model",      default="llama3.2", help="Ollama model name (default: llama3.2)")
-    parser.add_argument("--repeat",     type=int, default=1, help="Runs per cell; median reported (default: 1)")
-    parser.add_argument("--ollama-url", default="http://localhost:11434", help="Ollama base URL")
-    args = parser.parse_args()
-    run(args.model, args.repeat, args.ollama_url)
-if __name__ == "__main__":
-    main()

package/benchmarks/benchmark-opencode-go.js DELETED Viewed

@@ -1,294 +0,0 @@
-#!/usr/bin/env node
-/**
- * Lexis-Two benchmark via OpenCode Go models.
- *
- * Same 5 tasks as promptfooconfig.yaml. Arms: baseline (no skill) vs lexis-two.
- * Optional: --arm caveman for a third arm.
- *
- * Usage:
- *   node benchmarks/benchmark-opencode-go.js --repeat 3
- *   node benchmarks/benchmark-opencode-go.js --model kimi-k2.6 --repeat 10
- *   node benchmarks/benchmark-opencode-go.js --write-md
- *
- * Requires OPENCODE_API_KEY in .env or environment (OpenCode Go subscription).
- * Docs: benchmarks/README.md#opencode-go
- */
-const fs = require('fs');
-const path = require('path');
-const { loadEnvFile } = require('./lib/load-env');
-loadEnvFile(path.join(__dirname, '..', '.env'));
-const { complete, DEFAULT_BASE } = require('./lib/opencode-go-client');
-const measureLoc = require('./loc');
-const checkCorrect = require('./correctness');
-const ROOT = path.join(__dirname, '..');
-const MODELS_PATH = path.join(__dirname, 'opencode-go-models.json');
-const TASKS = [
-  { id: 'email', prompt: 'Write me a Python function that validates email addresses.' },
-  {
-    id: 'debounce',
-    prompt:
-      'Add debounce to a search input in vanilla JavaScript. It currently fires an API call on every keystroke.',
-  },
-  {
-    id: 'csv-sum',
-    prompt: "Write Python code that reads sales.csv and sums the 'amount' column.",
-  },
-  {
-    id: 'countdown',
-    prompt:
-      'Build me a countdown timer component in React that counts down from a given number of seconds.',
-  },
-  {
-    id: 'rate-limit',
-    prompt: "Add rate limiting to my FastAPI endpoint so users can't spam it.",
-  },
-];
-function loadModelsConfig() {
-  return JSON.parse(fs.readFileSync(MODELS_PATH, 'utf8'));
-}
-function loadArms(includeCaveman) {
-  const arms = {
-    baseline: null,
-    'lexis-two': require('./arms/lexis-two').system,
-  };
-  if (includeCaveman) {
-    arms.caveman = fs.readFileSync(path.join(__dirname, 'arms', 'caveman-SKILL.md'), 'utf8');
-  }
-  return arms;
-}
-function median(values) {
-  const s = [...values].sort((a, b) => a - b);
-  if (s.length === 0) return 0;
-  const mid = Math.floor(s.length / 2);
-  return s.length % 2 ? s[mid] : (s[mid - 1] + s[mid]) / 2;
-}
-function parseArgs(argv) {
-  const opts = {
-    repeat: 3,
-    models: null,
-    arms: ['baseline', 'lexis-two'],
-    writeMd: false,
-    delayMs: 500,
-    temperature: 1,
-    baseUrl: process.env.OPENCODE_GO_BASE_URL || DEFAULT_BASE,
-  };
-  for (let i = 2; i < argv.length; i += 1) {
-    const arg = argv[i];
-    if (arg === '--repeat') {
-      opts.repeat = Number(argv[++i]);
-    } else if (arg === '--model') {
-      opts.models = [argv[++i]];
-    } else if (arg === '--models') {
-      opts.models = argv[++i].split(',').map((m) => m.trim()).filter(Boolean);
-    } else if (arg === '--caveman') {
-      opts.arms.push('caveman');
-    } else if (arg === '--write-md') {
-      opts.writeMd = true;
-    } else if (arg === '--delay-ms') {
-      opts.delayMs = Number(argv[++i]);
-    } else if (arg === '--help' || arg === '-h') {
-      console.log(`Usage: node benchmarks/benchmark-opencode-go.js [options]
-  --repeat N       Runs per cell (default: 3)
-  --model ID       Single model (e.g. kimi-k2.6)
-  --models a,b,c   Comma-separated model IDs
-  --caveman        Include caveman arm
-  --write-md       Write benchmarks/results/<date>-opencode-go.md
-  --delay-ms N     Pause between API calls (default: 500)
-`);
-      process.exit(0);
-    }
-  }
-  return opts;
-}
-function sleep(ms) {
-  return new Promise((resolve) => setTimeout(resolve, ms));
-}
-function summarizeModel(modelId, modelName, repeat, arms, cellResults) {
-  const taskIds = TASKS.map((t) => t.id);
-  const lines = [];
-  lines.push(`## ${modelName} (\`${modelId}\`)`);
-  lines.push('');
-  lines.push(`Repeat: ${repeat}. Arms: ${Object.keys(arms).join(', ')}.`);
-  lines.push('');
-  const header =
-    '| arm | ' + taskIds.join(' | ') + ' | TOTAL | correct |';
-  const sep = '| --- | ' + taskIds.map(() => '---:').join(' | ') + ' | ---: | ---: |';
-  lines.push('**Code LOC (median)**');
-  lines.push('');
-  lines.push(header);
-  lines.push(sep);
-  for (const arm of Object.keys(arms)) {
-    const locs = taskIds.map((t) => median(cellResults[arm][t].map((r) => r.loc)));
-    const passCount = taskIds.reduce(
-      (sum, t) => sum + cellResults[arm][t].filter((r) => r.correct).length,
-      0,
-    );
-    const totalRuns = taskIds.length * repeat;
-    lines.push(
-      `| ${arm} | ${locs.join(' | ')} | ${locs.reduce((a, b) => a + b, 0)} | ${passCount}/${totalRuns} |`,
-    );
-  }
-  const baseTotal = taskIds.reduce(
-    (sum, t) => sum + median(cellResults.baseline[t].map((r) => r.loc)),
-    0,
-  );
-  const lexisTotal = taskIds.reduce(
-    (sum, t) => sum + median(cellResults['lexis-two'][t].map((r) => r.loc)),
-    0,
-  );
-  if (baseTotal > 0) {
-    const pct = ((1 - lexisTotal / baseTotal) * 100).toFixed(0);
-    lines.push('');
-    lines.push(
-      `**lexis-two vs baseline (median total LOC):** ${pct}% ${Number(pct) >= 0 ? 'less' : 'more'} code.`,
-    );
-  }
-  lines.push('');
-  return lines.join('\n');
-}
-async function runModel(modelId, modelConfig, opts, arms) {
-  const taskIds = TASKS.map((t) => t.id);
-  const cellResults = Object.fromEntries(
-    Object.keys(arms).map((arm) => [arm, Object.fromEntries(taskIds.map((t) => [t, []]))]),
-  );
-  const total = opts.repeat * Object.keys(arms).length * TASKS.length;
-  let done = 0;
-  for (let r = 0; r < opts.repeat; r += 1) {
-    for (const [arm, system] of Object.entries(arms)) {
-      for (const task of TASKS) {
-        done += 1;
-        const label = `[${done}/${total}] ${modelId} run${r + 1} ${arm} / ${task.id}`;
-        process.stdout.write(`${label} ... `);
-        const t0 = Date.now();
-        let text = '';
-        let usage = null;
-        try {
-          const result = await complete({
-            modelId,
-            modelConfig,
-            system: system || undefined,
-            user: task.prompt,
-            baseUrl: opts.baseUrl,
-            temperature: opts.temperature,
-          });
-          text = result.text;
-          usage = result.usage;
-        } catch (e) {
-          console.log(`FAIL — ${e.message}`);
-          cellResults[arm][task.id].push({
-            loc: 0,
-            correct: false,
-            timeSec: (Date.now() - t0) / 1000,
-            error: e.message,
-            response: '',
-          });
-          if (opts.delayMs > 0) await sleep(opts.delayMs);
-          continue;
-        }
-        const locResult = measureLoc(text);
-        const correctResult = checkCorrect(text, { vars: { task: task.prompt } });
-        const timeSec = (Date.now() - t0) / 1000;
-        cellResults[arm][task.id].push({
-          loc: locResult.score,
-          correct: correctResult.pass,
-          timeSec,
-          usage,
-          response: text,
-        });
-        console.log(
-          `${locResult.score} LOC  ${timeSec.toFixed(1)}s  correct=${correctResult.pass ? 'yes' : 'no'}`,
-        );
-        if (opts.delayMs > 0) await sleep(opts.delayMs);
-      }
-    }
-  }
-  return cellResults;
-}
-async function main() {
-  const opts = parseArgs(process.argv);
-  const config = loadModelsConfig();
-  const modelIds = opts.models || config.defaultModels;
-  const arms = loadArms(opts.arms.includes('caveman'));
-  const unknown = modelIds.filter((id) => !config.models[id]);
-  if (unknown.length) {
-    throw new Error(`Unknown model(s): ${unknown.join(', ')}. See opencode-go-models.json`);
-  }
-  const allResults = {};
-  const mdSections = [];
-  const date = new Date().toISOString().slice(0, 10);
-  mdSections.push(`# Lexis-Two benchmark — OpenCode Go (${date})`);
-  mdSections.push('');
-  mdSections.push('Provider: [OpenCode Go](https://opencode.ai/docs/go/).');
-  mdSections.push(`Repeat: ${opts.repeat} per cell. Temperature: ${opts.temperature}.`);
-  mdSections.push('');
-  for (const modelId of modelIds) {
-    const modelConfig = config.models[modelId];
-    console.log(`\n${'='.repeat(60)}\n  MODEL: ${modelConfig.name} (${modelId})\n${'='.repeat(60)}\n`);
-    const cellResults = await runModel(modelId, modelConfig, opts, arms);
-    allResults[modelId] = cellResults;
-    mdSections.push(summarizeModel(modelId, modelConfig.name, opts.repeat, arms, cellResults));
-  }
-  const outJson = path.join(__dirname, 'results', `opencode-go-${date}.json`);
-  fs.mkdirSync(path.dirname(outJson), { recursive: true });
-  fs.writeFileSync(
-    outJson,
-    JSON.stringify(
-      {
-        date,
-        repeat: opts.repeat,
-        models: modelIds,
-        arms: Object.keys(arms),
-        tasks: TASKS,
-        results: allResults,
-      },
-      null,
-      2,
-    ),
-    'utf8',
-  );
-  console.log(`\nFull results → ${outJson}`);
-  if (opts.writeMd) {
-    const outMd = path.join(__dirname, 'results', `${date}-opencode-go.md`);
-    fs.writeFileSync(outMd, mdSections.join('\n'), 'utf8');
-    console.log(`Summary markdown → ${outMd}`);
-  }
-}
-main().catch((e) => {
-  console.error(e.message || e);
-  process.exit(1);
-});