@draig/lexis-two 1.0.0 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/README.md +3 -3
  2. package/package.json +7 -2
  3. package/.claude-plugin/marketplace.json +0 -29
  4. package/.claude-plugin/plugin.json +0 -9
  5. package/.codex-plugin/plugin.json +0 -31
  6. package/.env.example +0 -8
  7. package/.github/FUNDING.yml +0 -1
  8. package/.github/copilot-instructions.md +0 -47
  9. package/.github/plugin/marketplace.json +0 -20
  10. package/.github/plugin/plugin.json +0 -16
  11. package/.github/workflows/deploy-site.yml +0 -53
  12. package/.github/workflows/test.yml +0 -29
  13. package/AUDIT.md +0 -74
  14. package/SPECXIS.md +0 -576
  15. package/benchmarks/README.md +0 -114
  16. package/benchmarks/arms/baseline.js +0 -2
  17. package/benchmarks/arms/caveman-SKILL.md +0 -67
  18. package/benchmarks/arms/caveman.js +0 -8
  19. package/benchmarks/arms/lexis-two.js +0 -10
  20. package/benchmarks/arms/ponytail.js +0 -6
  21. package/benchmarks/behavior.js +0 -58
  22. package/benchmarks/behavior.yaml +0 -40
  23. package/benchmarks/benchmark-local.py +0 -156
  24. package/benchmarks/benchmark-opencode-go.js +0 -294
  25. package/benchmarks/correctness.js +0 -294
  26. package/benchmarks/lib/aggregate-opencode-go.js +0 -103
  27. package/benchmarks/lib/load-env.js +0 -31
  28. package/benchmarks/lib/opencode-go-client.js +0 -151
  29. package/benchmarks/loc.js +0 -13
  30. package/benchmarks/opencode-go-models.json +0 -31
  31. package/benchmarks/promptfooconfig.yaml +0 -41
  32. package/benchmarks/prompts.json +0 -15
  33. package/benchmarks/render-opencode-go-report.js +0 -28
  34. package/benchmarks/results/2026-06-15-llama3.2-local.md +0 -76
  35. package/benchmarks/results/2026-06-16-opencode-go.md +0 -56
  36. package/benchmarks/results/opencode-go-2026-06-16-report.html +0 -226
  37. package/benchmarks/results/opencode-go-2026-06-16.json +0 -1339
  38. package/docs/assets/lexis-two-nobg.png +0 -0
  39. package/docs/assets/logo.png +0 -0
  40. package/docs/assets/logo.svg +0 -4
  41. package/docs/portability.md +0 -147
  42. package/docs/site.md +0 -52
  43. package/gemini-extension.json +0 -7
  44. package/pi-extension/index.js +0 -161
  45. package/pi-extension/package.json +0 -8
  46. package/pi-extension/test/extension.test.js +0 -89
  47. package/pi-extension/test/helpers.test.js +0 -35
  48. package/scripts/check-rule-copies.js +0 -82
  49. package/site/astro.config.mjs +0 -18
  50. package/site/package-lock.json +0 -4913
  51. package/site/package.json +0 -14
  52. package/site/public/CNAME +0 -1
  53. package/site/public/assets/lexis-two-nobg.png +0 -0
  54. package/site/public/assets/logo.png +0 -0
  55. package/site/public/assets/logo.svg +0 -4
  56. package/site/public/robots.txt +0 -4
  57. package/site/src/components/Adapt.astro +0 -33
  58. package/site/src/components/Benchmarks.astro +0 -232
  59. package/site/src/components/Commands.astro +0 -33
  60. package/site/src/components/Ecosystem.astro +0 -30
  61. package/site/src/components/Example.astro +0 -77
  62. package/site/src/components/Footer.astro +0 -28
  63. package/site/src/components/Header.astro +0 -87
  64. package/site/src/components/Hero.astro +0 -58
  65. package/site/src/components/Home.astro +0 -46
  66. package/site/src/components/Hosts.astro +0 -62
  67. package/site/src/components/Install.astro +0 -143
  68. package/site/src/components/LanguageSwitcher.astro +0 -82
  69. package/site/src/components/Philosophy.astro +0 -23
  70. package/site/src/components/Stacks.astro +0 -33
  71. package/site/src/components/Suggested.astro +0 -39
  72. package/site/src/data/opencode-go-benchmark.json +0 -230
  73. package/site/src/i18n/en.ts +0 -155
  74. package/site/src/i18n/es.ts +0 -158
  75. package/site/src/i18n/index.ts +0 -14
  76. package/site/src/layouts/Layout.astro +0 -114
  77. package/site/src/pages/benchmarks.astro +0 -4
  78. package/site/src/pages/es/benchmarks.astro +0 -4
  79. package/site/src/pages/es/index.astro +0 -10
  80. package/site/src/pages/index.astro +0 -10
  81. package/site/src/styles/global.css +0 -780
  82. package/site/tsconfig.json +0 -3
  83. package/tests/behavior.test.js +0 -80
  84. package/tests/commands.test.js +0 -40
  85. package/tests/copilot-plugin.test.js +0 -33
  86. package/tests/correctness.test.js +0 -191
  87. package/tests/gemini-extension.test.js +0 -78
  88. package/tests/hooks-windows.test.js +0 -48
  89. package/tests/hooks.test.js +0 -177
  90. package/tests/opencode-plugin.test.js +0 -64
@@ -1,67 +0,0 @@
1
- ---
2
- name: caveman
3
- description: >
4
- Ultra-compressed communication mode. Cuts token usage ~75% by speaking like caveman
5
- while keeping full technical accuracy. Supports intensity levels: lite, full (default), ultra,
6
- wenyan-lite, wenyan-full, wenyan-ultra.
7
- Use when user says "caveman mode", "talk like caveman", "use caveman", "less tokens",
8
- "be brief", or invokes /caveman. Also auto-triggers when token efficiency is requested.
9
- ---
10
-
11
- Respond terse like smart caveman. All technical substance stay. Only fluff die.
12
-
13
- ## Persistence
14
-
15
- ACTIVE EVERY RESPONSE. No revert after many turns. No filler drift. Still active if unsure. Off only: "stop caveman" / "normal mode".
16
-
17
- Default: **full**. Switch: `/caveman lite|full|ultra`.
18
-
19
- ## Rules
20
-
21
- Drop: articles (a/an/the), filler (just/really/basically/actually/simply), pleasantries (sure/certainly/of course/happy to), hedging. Fragments OK. Short synonyms (big not extensive, fix not "implement a solution for"). Technical terms exact. Code blocks unchanged. Errors quoted exact.
22
-
23
- Pattern: `[thing] [action] [reason]. [next step].`
24
-
25
- Not: "Sure! I'd be happy to help you with that. The issue you're experiencing is likely caused by..."
26
- Yes: "Bug in auth middleware. Token expiry check use `<` not `<=`. Fix:"
27
-
28
- ## Intensity
29
-
30
- | Level | What change |
31
- |-------|------------|
32
- | **lite** | No filler/hedging. Keep articles + full sentences. Professional but tight |
33
- | **full** | Drop articles, fragments OK, short synonyms. Classic caveman |
34
- | **ultra** | Abbreviate (DB/auth/config/req/res/fn/impl), strip conjunctions, arrows for causality (X → Y), one word when one word enough |
35
- | **wenyan-lite** | Semi-classical. Drop filler/hedging but keep grammar structure, classical register |
36
- | **wenyan-full** | Maximum classical terseness. Fully 文言文. 80-90% character reduction. Classical sentence patterns, verbs precede objects, subjects often omitted, classical particles (之/乃/為/其) |
37
- | **wenyan-ultra** | Extreme abbreviation while keeping classical Chinese feel. Maximum compression, ultra terse |
38
-
39
- Example — "Why React component re-render?"
40
- - lite: "Your component re-renders because you create a new object reference each render. Wrap it in `useMemo`."
41
- - full: "New object ref each render. Inline object prop = new ref = re-render. Wrap in `useMemo`."
42
- - ultra: "Inline obj prop → new ref → re-render. `useMemo`."
43
- - wenyan-lite: "組件頻重繪,以每繪新生對象參照故。以 useMemo 包之。"
44
- - wenyan-full: "物出新參照,致重繪。useMemo .Wrap之。"
45
- - wenyan-ultra: "新參照→重繪。useMemo Wrap。"
46
-
47
- Example — "Explain database connection pooling."
48
- - lite: "Connection pooling reuses open connections instead of creating new ones per request. Avoids repeated handshake overhead."
49
- - full: "Pool reuse open DB connections. No new connection per request. Skip handshake overhead."
50
- - ultra: "Pool = reuse DB conn. Skip handshake → fast under load."
51
- - wenyan-full: "池reuse open connection。不每req新開。skip handshake overhead。"
52
- - wenyan-ultra: "池reuse conn。skip handshake → fast。"
53
-
54
- ## Auto-Clarity
55
-
56
- Drop caveman for: security warnings, irreversible action confirmations, multi-step sequences where fragment order risks misread, user asks to clarify or repeats question. Resume caveman after clear part done.
57
-
58
- Example — destructive op:
59
- > **Warning:** This will permanently delete all rows in the `users` table and cannot be undone.
60
- > ```sql
61
- > DROP TABLE users;
62
- > ```
63
- > Caveman resume. Verify backup exist first.
64
-
65
- ## Boundaries
66
-
67
- Code/commits/PRs: write normal. "stop caveman" or "normal mode": revert. Level persist until changed or session end.
@@ -1,8 +0,0 @@
1
- // Caveman arm: caveman SKILL.md (full) as the system prompt.
2
- const fs = require('fs');
3
- const path = require('path');
4
- const system = fs.readFileSync(path.join(__dirname, 'caveman-SKILL.md'), 'utf8');
5
- module.exports = ({ vars }) => [
6
- { role: 'system', content: system },
7
- { role: 'user', content: vars.task },
8
- ];
@@ -1,10 +0,0 @@
1
// Lexis-Two arm: exposes the repository skill file (in full) as the system prompt.
const { readFileSync } = require('fs');
const { join } = require('path');

// The skill file sits two directories above this one, under skills/lexis-two/.
const skillFile = join(__dirname, '..', '..', 'skills', 'lexis-two', 'SKILL.md');
const system = readFileSync(skillFile, 'utf8');

module.exports = { system };
@@ -1,6 +0,0 @@
1
- // Ponytail arm (legacy label) — reads Lexis-Two skill until promptfoo config is renamed.
2
- const { system } = require('./lexis-two');
3
- module.exports = ({ vars }) => [
4
- { role: 'system', content: system },
5
- { role: 'user', content: vars.task },
6
- ];
@@ -1,58 +0,0 @@
1
- // Behavior gate: does the ponytail ruleset actually PRODUCE its refined
2
- // behaviors, not just carry the text? One check per probe (vars.probe), each
3
- // targeting a rule that a field review (rcstack, phases 0-8) showed mattered:
4
- // hardware - "hardware is never the spec ideal, leave the calibration knob"
5
- // explanation - "explanation the user explicitly asked for is not debt"
6
- // onecheck - "lazy code without its check is unfinished"
7
- //
8
- // Heuristic graders, same spirit as loc.js / correctness.js. The graders
9
- // themselves are proven by tests/behavior.test.js (RED/GREEN, no API key).
10
- //
11
- // Metric: `behavior` (1 = behavior present, 0 = absent).
12
-
13
/**
 * Extracts the contents of every fenced code block in a response.
 *
 * The opening fence may carry any info string (`js`, `c++`, `c#`, ...) and may
 * end in CRLF. Previously only `[\w-]*` directly followed by `\n` was accepted,
 * so such a fence was skipped and the closing fence of one block could be
 * paired with the opening fence of the next.
 *
 * @param {*} text - Raw response; falsy values are treated as the empty string.
 * @returns {string} The bodies of all fenced blocks joined with '\n' ('' when there are none).
 */
function codeOf(text) {
  // Info string: anything up to the end of the line except a backtick, so an
  // inline ```snippet``` on a single line is still not treated as a block.
  const fencedBlock = /```[^\n`]*\n([\s\S]*?)```/g;
  return [...String(text || '').matchAll(fencedBlock)].map((m) => m[1]).join('\n');
}
16
-
17
/**
 * Returns the prose of a response: fenced code blocks are replaced by a space,
 * every run of whitespace is collapsed to one space, and the ends are trimmed.
 *
 * @param {*} text - Raw response; falsy values are treated as the empty string.
 * @returns {string} Single-spaced prose with no code fences.
 */
function proseOf(text) {
  const raw = String(text || '');
  const withoutFences = raw.replace(/```[\s\S]*?```/g, ' ');
  const singleSpaced = withoutFences.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
20
-
21
/**
 * Heuristic graders, one per probe name. Each takes the raw model output and
 * returns `{ pass: boolean, reason: string }`.
 */
const CHECKS = {
  // Treats the device as non-ideal: leaves a tunable knob or flags per-unit drift.
  // A passing mention of "calibration" is not enough; it must be actionable.
  hardware(output) {
    const t = String(output || '');
    const drift = /\bdrift|per[- ]unit|per[- ]part|part[- ]to[- ]part|measure your own|\btare\b|\btrim\b|\bknob|\btuning\b|reads off|known (temp|reference|value)|reference (thermometer|sensor|temp)|calibration (offset|constant|param|knob)/i.test(t);
    return drift
      ? { pass: true, reason: 'Leaves a calibration knob / flags per-unit drift.' }
      : { pass: false, reason: 'Treats the hardware as ideal; no calibration knob.' };
  },

  // Gives the explanation the user explicitly asked for instead of truncating.
  // Passes when there are at least 45 words of prose AND the prose is either a
  // list or uses rationale/change vocabulary.
  explanation(output) {
    const p = proseOf(output);
    const words = p ? p.split(' ').length : 0;
    // A list marker only counts at the start of a line outside fenced code.
    // Unanchored, `[-*]\s` also matched arithmetic such as `x['a'] * 2` inside
    // the answer's code and any inline " - ", so almost every output looked
    // structured.
    const outsideCode = String(output || '').replace(/```[\s\S]*?```/g, '\n');
    const hasList = /^[ \t]*(?:\d+[.)]|[-*])[ \t]/m.test(outsideCode);
    const hasRationale = /\bbecause\b|\bwhy\b|\bso that\b|renamed|extracted|inlined|removed|replaced/i.test(p);
    const structured = hasList || hasRationale;
    return words >= 45 && structured
      ? { pass: true, reason: `Gave the requested write-up (${words} words of prose).` }
      : { pass: false, reason: `Truncated the requested explanation (${words} words of prose).` };
  },

  // Leaves ONE runnable check behind for non-trivial logic.
  onecheck(output) {
    const t = String(output || '');
    const hasCheck = /\bassert\b|def\s+test_|if\s+__name__|unittest|pytest|console\.assert|\bexpect\(|\bdescribe\(|\bit\(/.test(t);
    return hasCheck
      ? { pass: true, reason: 'Left a runnable check (assert/test/demo).' }
      : { pass: false, reason: 'No runnable check left behind.' };
  },
};
51
-
52
- module.exports = (output, context) => {
53
- const probe = context && context.vars && context.vars.probe;
54
- const check = CHECKS[probe];
55
- if (!check) return { pass: true, score: 1, reason: `Unknown probe '${probe}', skipped` };
56
- const r = check(output);
57
- return { pass: r.pass, score: r.pass ? 1 : 0, reason: r.reason };
58
- };
@@ -1,40 +0,0 @@
1
- # Ponytail behavior gates: does the ruleset actually produce its refined
2
- # behaviors (not just carry the text)? Probes the three rules a full-project
3
- # field review (rcstack, phases 0-8) showed mattered.
4
- #
5
- # Run: npx promptfoo@latest eval -c benchmarks/behavior.yaml --repeat 10
6
- # View: npx promptfoo@latest view
7
- #
8
- # Needs ANTHROPIC_API_KEY (see benchmarks/README.md). The grader (behavior.js)
9
- # is proven separately by tests/behavior.test.js, which needs no API key.
10
- #
11
- # baseline is included as the control: the no-skill arm should mostly FAIL these
12
- # gates, the ponytail arm should pass them. That delta is the point.
13
- description: "Ponytail behavior gates: hardware calibration, requested explanation, one runnable check."
14
-
15
- providers:
16
- - id: anthropic:messages:claude-opus-4-8
17
- config: { max_tokens: 8192, temperature: 1 }
18
-
19
- prompts:
20
- - id: file://arms/baseline.js
21
- label: baseline (no skill)
22
- - id: file://arms/ponytail.js
23
- label: ponytail
24
-
25
- defaultTest:
26
- assert:
27
- - type: javascript
28
- value: file://behavior.js
29
- metric: behavior
30
-
31
- tests:
32
- - vars:
33
- probe: hardware
34
- task: "Write a Python function that reads the temperature in Celsius from a thermistor wired to a Raspberry Pi ADC (MCP3008, channel 0)."
35
- - vars:
36
- probe: explanation
37
- task: "Refactor this for readability and give me a detailed, step-by-step write-up of every change you made and why.\n\ndef p(d):\n r = []\n for x in d:\n if x.get('a') and x['a'] > 0:\n r.append(x['a'] * 2)\n return r"
38
- - vars:
39
- probe: onecheck
40
- task: "Write a Python function that parses a duration string like '1h30m45s' into a total number of seconds."
@@ -1,156 +0,0 @@
1
- """
2
- Ponytail local benchmark — runs the same 5 tasks against any Ollama model.
3
- No promptfoo required. Compares baseline vs caveman vs lexis-two on code LOC
4
- and wall-clock time. Results are printed as a table and saved to a JSON file.
5
-
6
- Usage:
7
- python benchmarks/benchmark-local.py
8
- python benchmarks/benchmark-local.py --model llama3.2 --repeat 3
9
-
10
- Prerequisites: Ollama running locally (https://ollama.com), model pulled.
11
- """
12
-
13
- import argparse
14
- import json
15
- import re
16
- import time
17
- import urllib.request
18
- from pathlib import Path
19
-
20
- ROOT = Path(__file__).parent.parent
21
-
22
- TASKS = [
23
- ("email", "Write me a Python function that validates email addresses."),
24
- ("debounce", "Add debounce to a search input in vanilla JavaScript. It currently fires an API call on every keystroke."),
25
- ("csv-sum", "Write Python code that reads sales.csv and sums the 'amount' column."),
26
- ("countdown", "Build me a countdown timer component in React that counts down from a given number of seconds."),
27
- ("rate-limit", "Add rate limiting to my FastAPI endpoint so users can't spam it."),
28
- ]
29
-
30
-
31
def load_arms():
    """Return the system prompt for each benchmark arm, keyed by arm name.

    ``baseline`` maps to ``None`` (no system prompt); the other arms map to
    the UTF-8 text of their skill file under ``ROOT``.
    """
    skill_files = {
        "caveman": ROOT / "benchmarks/arms/caveman-SKILL.md",
        "lexis-two": ROOT / "skills/lexis-two/SKILL.md",
    }
    arms = {"baseline": None}
    for name, skill_path in skill_files.items():
        arms[name] = skill_path.read_text(encoding="utf-8")
    return arms
37
-
38
-
39
def count_loc(text):
    """Count non-blank, non-comment lines of code in a model response.

    Code is taken from the fenced blocks; when the response has no fence at
    all, the whole response is treated as code. A line counts as a comment
    when, after stripping, it starts with ``//``, ``#``, ``/*`` or ``*``
    (which also covers a closing ``*/``).

    The opening fence may end in CRLF or carry trailing spaces. Previously
    such fences did not match, so the function fell back to counting the
    whole response, fence lines included.

    Args:
        text: The raw response text.

    Returns:
        The number of counted lines (0 for an empty response).
    """
    blocks = re.findall(r"```[a-zA-Z0-9_+\-]*[ \t]*\r?\n([\s\S]*?)```", text)
    code = "\n".join(blocks) if blocks else text
    comment_prefixes = ("//", "#", "/*", "*")
    return sum(
        1
        for stripped in (line.strip() for line in code.splitlines())
        if stripped and not stripped.startswith(comment_prefixes)
    )
53
-
54
-
55
def call_ollama(model, system_prompt, user_prompt, ollama_url):
    """Send one non-streaming chat request to Ollama and time it.

    Args:
        model: Ollama model name.
        system_prompt: System message text; omitted from the request when falsy.
        user_prompt: User message text.
        ollama_url: Ollama base URL; a trailing slash is tolerated.

    Returns:
        ``(content, seconds)``: the reply's ``message.content`` and the
        elapsed time rounded to one decimal place.

    Raises:
        Whatever ``urllib.request.urlopen`` or ``json.loads`` raise (nothing
        is caught here), and ``KeyError`` when the reply has no
        ``message.content``.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": user_prompt})

    payload = json.dumps({
        "model": model,
        "messages": messages,
        "stream": False,
        "options": {"temperature": 0.7},
    }).encode()

    req = urllib.request.Request(
        # Strip a trailing slash so "http://host:11434/" does not become "//api/chat".
        f"{ollama_url.rstrip('/')}/api/chat",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    # perf_counter is monotonic, so a system clock adjustment during the
    # request cannot skew (or turn negative) the measured interval.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=180) as resp:
        data = json.loads(resp.read())
    elapsed = time.perf_counter() - t0
    return data["message"]["content"], round(elapsed, 1)
79
-
80
-
81
def _median(vals):
    """Median of a list of numbers; 0 for an empty list (e.g. ``--repeat 0``)."""
    if not vals:
        return 0
    s = sorted(vals)
    n = len(s)
    return s[n // 2] if n % 2 else (s[n // 2 - 1] + s[n // 2]) / 2


def run(model, repeat, ollama_url):
    """Run every task against every arm ``repeat`` times and report medians.

    Prints one progress line per call, then the median LOC and median time
    tables, then the LOC delta of each skill arm against baseline. Finally
    writes every raw result (responses included) to
    ``benchmark-local-results.json`` next to this script.

    Args:
        model: Ollama model name.
        repeat: Number of runs per (arm, task) cell.
        ollama_url: Ollama base URL.
    """
    arms = load_arms()
    task_ids = [t[0] for t in TASKS]
    # results[arm][task_id] = list of {loc, time, response}
    results = {arm: {t: [] for t in task_ids} for arm in arms}
    total = len(arms) * len(TASKS) * repeat

    done = 0
    for r in range(repeat):
        for arm, system in arms.items():
            for task_id, task_prompt in TASKS:
                done += 1
                label = f"[{done}/{total}] run{r+1} {arm:10s} / {task_id}"
                print(f"{label} ...", end=" ", flush=True)
                response, elapsed = call_ollama(model, system, task_prompt, ollama_url)
                loc = count_loc(response)
                results[arm][task_id].append({"loc": loc, "time": elapsed, "response": response})
                print(f"{loc} LOC {elapsed}s")

    # Median per (arm, task) cell for each metric.
    med_loc = {arm: {t: _median([r["loc"] for r in results[arm][t]]) for t in task_ids} for arm in arms}
    med_time = {arm: {t: _median([r["time"] for r in results[arm][t]]) for t in task_ids} for arm in arms}

    col = 12
    header = f"{'arm':<12}" + "".join(f"{t:>{col}}" for t in task_ids) + f"{'TOTAL':>{col}}"
    sep = "-" * len(header)

    print(f"\n{'=' * 60}")
    print(f" RESULTS - {model} (n={repeat}, median)")
    print(f"{'=' * 60}")

    print("\nCode LOC per task (median)")
    print(header)
    print(sep)
    for arm in arms:
        row = [med_loc[arm][t] for t in task_ids]
        print(f"{arm:<12}" + "".join(f"{v:>{col}}" for v in row) + f"{sum(row):>{col}}")

    print("\nTime seconds per task (median)")
    print(header)
    print(sep)
    for arm in arms:
        row = [med_time[arm][t] for t in task_ids]
        print(f"{arm:<12}" + "".join(f"{v:>{col}.1f}" for v in row) + f"{sum(row):>{col}.1f}")

    print(f"\n{'=' * 60}")
    print(" LOC vs baseline (median totals)")
    print(f"{'=' * 60}")
    base_total = sum(med_loc["baseline"][t] for t in task_ids)
    for arm in ("caveman", "lexis-two"):
        arm_total = sum(med_loc[arm][t] for t in task_ids)
        # A zero baseline would divide by zero; report 0% in that case.
        pct = (1 - arm_total / base_total) * 100 if base_total else 0
        sign = "less" if pct >= 0 else "more"
        print(f" {arm:10s}: {arm_total} LOC ({abs(pct):.0f}% {sign} than baseline)")

    out = Path(__file__).parent / "benchmark-local-results.json"
    out.write_text(json.dumps(results, indent=2), encoding="utf-8")
    print(f"\nFull responses -> {out}")
144
-
145
-
146
def main():
    """Parse the command-line options and run the benchmark with them."""
    cli = argparse.ArgumentParser(description="Ponytail local benchmark via Ollama")
    option_specs = (
        ("--model", {"default": "llama3.2", "help": "Ollama model name (default: llama3.2)"}),
        ("--repeat", {"type": int, "default": 1, "help": "Runs per cell; median reported (default: 1)"}),
        ("--ollama-url", {"default": "http://localhost:11434", "help": "Ollama base URL"}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    options = cli.parse_args()
    run(options.model, options.repeat, options.ollama_url)


if __name__ == "__main__":
    main()
@@ -1,294 +0,0 @@
1
- #!/usr/bin/env node
2
- /**
3
- * Lexis-Two benchmark via OpenCode Go models.
4
- *
5
- * Same 5 tasks as promptfooconfig.yaml. Arms: baseline (no skill) vs lexis-two.
6
- * Optional: pass --caveman to add caveman as a third arm.
7
- *
8
- * Usage:
9
- * node benchmarks/benchmark-opencode-go.js --repeat 3
10
- * node benchmarks/benchmark-opencode-go.js --model kimi-k2.6 --repeat 10
11
- * node benchmarks/benchmark-opencode-go.js --write-md
12
- *
13
- * Requires OPENCODE_API_KEY in .env or environment (OpenCode Go subscription).
14
- * Docs: benchmarks/README.md#opencode-go
15
- */
16
-
17
- const fs = require('fs');
18
- const path = require('path');
19
-
20
- const { loadEnvFile } = require('./lib/load-env');
21
- loadEnvFile(path.join(__dirname, '..', '.env'));
22
-
23
- const { complete, DEFAULT_BASE } = require('./lib/opencode-go-client');
24
- const measureLoc = require('./loc');
25
- const checkCorrect = require('./correctness');
26
-
27
- const ROOT = path.join(__dirname, '..');
28
- const MODELS_PATH = path.join(__dirname, 'opencode-go-models.json');
29
-
30
- const TASKS = [
31
- { id: 'email', prompt: 'Write me a Python function that validates email addresses.' },
32
- {
33
- id: 'debounce',
34
- prompt:
35
- 'Add debounce to a search input in vanilla JavaScript. It currently fires an API call on every keystroke.',
36
- },
37
- {
38
- id: 'csv-sum',
39
- prompt: "Write Python code that reads sales.csv and sums the 'amount' column.",
40
- },
41
- {
42
- id: 'countdown',
43
- prompt:
44
- 'Build me a countdown timer component in React that counts down from a given number of seconds.',
45
- },
46
- {
47
- id: 'rate-limit',
48
- prompt: "Add rate limiting to my FastAPI endpoint so users can't spam it.",
49
- },
50
- ];
51
-
52
/**
 * Reads the JSON file at MODELS_PATH and returns the parsed value.
 *
 * @returns {object} The parsed models configuration.
 */
function loadModelsConfig() {
  const raw = fs.readFileSync(MODELS_PATH, 'utf8');
  return JSON.parse(raw);
}
55
-
56
/**
 * Builds the arm name → system prompt map.
 *
 * @param {boolean} includeCaveman - When truthy, also load the caveman skill file.
 * @returns {Object<string, string|null>} `baseline` (null, i.e. no system prompt),
 *   `lexis-two`, and optionally `caveman`.
 */
function loadArms(includeCaveman) {
  const lexisTwoPrompt = require('./arms/lexis-two').system;
  if (!includeCaveman) {
    return { baseline: null, 'lexis-two': lexisTwoPrompt };
  }
  const cavemanFile = path.join(__dirname, 'arms', 'caveman-SKILL.md');
  return {
    baseline: null,
    'lexis-two': lexisTwoPrompt,
    caveman: fs.readFileSync(cavemanFile, 'utf8'),
  };
}
66
-
67
/**
 * Median of a collection of numbers, computed on a sorted copy.
 *
 * @param {Iterable<number>} values - Numbers to summarise; not modified.
 * @returns {number} The middle value, the mean of the two middle values for an
 *   even count, or 0 when there are no values.
 */
function median(values) {
  const sorted = [...values];
  sorted.sort((lo, hi) => lo - hi);
  const count = sorted.length;
  if (count === 0) return 0;
  const upper = Math.floor(count / 2);
  if (count % 2 === 1) return sorted[upper];
  return (sorted[upper - 1] + sorted[upper]) / 2;
}
73
-
74
/**
 * Parses the benchmark's command-line flags.
 *
 * @param {string[]} argv - A process.argv-style array; flags start at index 2.
 * @returns {{ repeat: number, models: (string[]|null), arms: string[], writeMd: boolean,
 *   delayMs: number, temperature: number, baseUrl: string }} Options with defaults applied.
 *   `baseUrl` comes from OPENCODE_GO_BASE_URL when set, otherwise DEFAULT_BASE.
 * @throws {Error} When a flag that takes a value is the last argument, when
 *   `--repeat` is not a positive integer, when `--delay-ms` is not a
 *   non-negative number, or when `--models` contains no IDs. Previously these
 *   produced NaN options (a silent zero-run benchmark) or a TypeError.
 *
 * Unrecognised arguments are ignored. `--help` / `-h` prints usage and exits
 * the process with code 0.
 */
function parseArgs(argv) {
  const opts = {
    repeat: 3,
    models: null,
    arms: ['baseline', 'lexis-two'],
    writeMd: false,
    delayMs: 500,
    temperature: 1,
    baseUrl: process.env.OPENCODE_GO_BASE_URL || DEFAULT_BASE,
  };

  // Returns the token following the flag at `index`; a flag given as the last
  // argument has no value and is an error.
  const valueFor = (flag, index) => {
    const value = argv[index + 1];
    if (value === undefined) {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  };

  for (let i = 2; i < argv.length; i += 1) {
    const arg = argv[i];
    if (arg === '--repeat') {
      const raw = valueFor(arg, i);
      i += 1;
      const repeat = Number(raw);
      if (!Number.isInteger(repeat) || repeat < 1) {
        throw new Error(`Invalid value for --repeat: "${raw}" (expected a positive integer)`);
      }
      opts.repeat = repeat;
    } else if (arg === '--model') {
      opts.models = [valueFor(arg, i)];
      i += 1;
    } else if (arg === '--models') {
      const raw = valueFor(arg, i);
      i += 1;
      const ids = raw.split(',').map((m) => m.trim()).filter(Boolean);
      if (ids.length === 0) {
        throw new Error(`Invalid value for --models: "${raw}" (expected comma-separated model IDs)`);
      }
      opts.models = ids;
    } else if (arg === '--caveman') {
      // Repeating the flag must not list the arm twice.
      if (!opts.arms.includes('caveman')) opts.arms.push('caveman');
    } else if (arg === '--write-md') {
      opts.writeMd = true;
    } else if (arg === '--delay-ms') {
      const raw = valueFor(arg, i);
      i += 1;
      const delayMs = Number(raw);
      if (!Number.isFinite(delayMs) || delayMs < 0) {
        throw new Error(`Invalid value for --delay-ms: "${raw}" (expected a non-negative number)`);
      }
      opts.delayMs = delayMs;
    } else if (arg === '--help' || arg === '-h') {
      console.log(`Usage: node benchmarks/benchmark-opencode-go.js [options]

--repeat N Runs per cell (default: 3)
--model ID Single model (e.g. kimi-k2.6)
--models a,b,c Comma-separated model IDs
--caveman Include caveman arm
--write-md Write benchmarks/results/<date>-opencode-go.md
--delay-ms N Pause between API calls (default: 500)
`);
      process.exit(0);
    }
  }

  return opts;
}
115
-
116
/**
 * Returns a promise that resolves (with no value) once a `ms`-millisecond timer fires.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
119
-
120
/**
 * Renders one model's results as a markdown section: a heading, a median-LOC
 * table with one row per arm (plus total and correct-run count), and the
 * lexis-two vs baseline LOC delta.
 *
 * @param {string} modelId - Model identifier shown in backticks.
 * @param {string} modelName - Human-readable model name for the heading.
 * @param {number} repeat - Runs per cell; used for the "correct" denominator.
 * @param {Object<string, *>} arms - Arm map; only its keys (arm names) are used.
 * @param {Object<string, Object<string, Array<{loc: number, correct: boolean}>>>} cellResults
 *   Results indexed as cellResults[arm][taskId]; must contain `baseline` and `lexis-two`.
 * @returns {string} The markdown section, ending with a blank line.
 */
function summarizeModel(modelId, modelName, repeat, arms, cellResults) {
  const taskIds = TASKS.map((t) => t.id);
  const armNames = Object.keys(arms);
  const sum = (nums) => nums.reduce((a, b) => a + b, 0);
  // Median LOC per task for one arm, in task order.
  const medianLocs = (arm) => taskIds.map((t) => median(cellResults[arm][t].map((r) => r.loc)));

  const lines = [
    `## ${modelName} (\`${modelId}\`)`,
    '',
    `Repeat: ${repeat}. Arms: ${armNames.join(', ')}.`,
    '',
    '**Code LOC (median)**',
    '',
    `| arm | ${taskIds.join(' | ')} | TOTAL | correct |`,
    `| --- | ${taskIds.map(() => '---:').join(' | ')} | ---: | ---: |`,
  ];

  const totalRuns = taskIds.length * repeat;
  for (const arm of armNames) {
    const locs = medianLocs(arm);
    const passCount = sum(taskIds.map((t) => cellResults[arm][t].filter((r) => r.correct).length));
    lines.push(`| ${arm} | ${locs.join(' | ')} | ${sum(locs)} | ${passCount}/${totalRuns} |`);
  }

  const baseTotal = sum(medianLocs('baseline'));
  const lexisTotal = sum(medianLocs('lexis-two'));
  // Skip the delta when the baseline total is zero (nothing to compare against).
  if (baseTotal > 0) {
    const rounded = Number(((1 - lexisTotal / baseTotal) * 100).toFixed(0));
    // Print the magnitude and let the word carry the direction; previously a
    // negative delta rendered as e.g. "-25% more code".
    const direction = rounded >= 0 ? 'less' : 'more';
    lines.push('');
    lines.push(
      `**lexis-two vs baseline (median total LOC):** ${Math.abs(rounded)}% ${direction} code.`,
    );
  }

  lines.push('');
  return lines.join('\n');
}
167
-
168
/**
 * Runs every task against every arm `opts.repeat` times for one model,
 * sequentially, printing one progress line per call.
 *
 * A failed API call is logged and recorded as
 * `{ loc: 0, correct: false, timeSec, error, response: '' }`; it does not
 * abort the run. Errors thrown by the graders are not caught.
 *
 * @param {string} modelId - Model identifier passed to `complete`.
 * @param {object} modelConfig - Model configuration passed through to `complete`.
 * @param {{ repeat: number, delayMs: number, baseUrl: string, temperature: number }} opts
 *   Run options; a pause of `delayMs` follows every call when it is > 0.
 * @param {Object<string, string|null>} arms - Arm name → system prompt (falsy = none).
 * @returns {Promise<Object<string, Object<string, object[]>>>} Records indexed as
 *   result[arm][taskId], one per run.
 */
async function runModel(modelId, modelConfig, opts, arms) {
  const taskIds = TASKS.map((t) => t.id);
  const cellResults = Object.fromEntries(
    Object.keys(arms).map((arm) => [arm, Object.fromEntries(taskIds.map((t) => [t, []]))]),
  );

  const total = opts.repeat * Object.keys(arms).length * TASKS.length;
  let done = 0;

  for (let r = 0; r < opts.repeat; r += 1) {
    for (const [arm, system] of Object.entries(arms)) {
      for (const task of TASKS) {
        done += 1;
        const label = `[${done}/${total}] ${modelId} run${r + 1} ${arm} / ${task.id}`;
        process.stdout.write(`${label} ... `);

        const t0 = Date.now();
        let result = null;
        let failed = false;
        let failure;
        try {
          result = await complete({
            modelId,
            modelConfig,
            system: system || undefined,
            user: task.prompt,
            baseUrl: opts.baseUrl,
            temperature: opts.temperature,
          });
        } catch (e) {
          failed = true;
          failure = e;
        }
        // Stop the clock as soon as the call settles so timeSec measures the
        // API call only. Previously the success path included grading time
        // while the failure path did not.
        const timeSec = (Date.now() - t0) / 1000;

        if (failed) {
          console.log(`FAIL — ${failure.message}`);
          cellResults[arm][task.id].push({
            loc: 0,
            correct: false,
            timeSec,
            error: failure.message,
            response: '',
          });
        } else {
          const text = result.text;
          const locResult = measureLoc(text);
          const correctResult = checkCorrect(text, { vars: { task: task.prompt } });

          cellResults[arm][task.id].push({
            loc: locResult.score,
            correct: correctResult.pass,
            timeSec,
            usage: result.usage,
            response: text,
          });

          console.log(
            `${locResult.score} LOC ${timeSec.toFixed(1)}s correct=${correctResult.pass ? 'yes' : 'no'}`,
          );
        }

        if (opts.delayMs > 0) await sleep(opts.delayMs);
      }
    }
  }

  return cellResults;
}
233
-
234
/**
 * CLI entry point: runs the benchmark for each selected model, writes the full
 * results as JSON under results/, and, with --write-md, a markdown summary.
 *
 * @returns {Promise<void>}
 * @throws {Error} When a selected model ID is missing from the models config.
 */
async function main() {
  const opts = parseArgs(process.argv);
  const config = loadModelsConfig();
  const modelIds = opts.models || config.defaultModels;
  const arms = loadArms(opts.arms.includes('caveman'));

  // Reject unknown IDs before any API call is made.
  const unknown = [];
  for (const id of modelIds) {
    if (!config.models[id]) unknown.push(id);
  }
  if (unknown.length) {
    throw new Error(`Unknown model(s): ${unknown.join(', ')}. See opencode-go-models.json`);
  }

  // YYYY-MM-DD taken from the ISO (UTC) timestamp; used in titles and file names.
  const date = new Date().toISOString().slice(0, 10);
  const banner = '='.repeat(60);
  const allResults = {};
  const mdSections = [
    `# Lexis-Two benchmark — OpenCode Go (${date})`,
    '',
    'Provider: [OpenCode Go](https://opencode.ai/docs/go/).',
    `Repeat: ${opts.repeat} per cell. Temperature: ${opts.temperature}.`,
    '',
  ];

  for (const modelId of modelIds) {
    const modelConfig = config.models[modelId];
    console.log(`\n${banner}\n MODEL: ${modelConfig.name} (${modelId})\n${banner}\n`);

    const cellResults = await runModel(modelId, modelConfig, opts, arms);
    allResults[modelId] = cellResults;
    mdSections.push(summarizeModel(modelId, modelConfig.name, opts.repeat, arms, cellResults));
  }

  const resultsDir = path.join(__dirname, 'results');
  const outJson = path.join(resultsDir, `opencode-go-${date}.json`);
  const report = {
    date,
    repeat: opts.repeat,
    models: modelIds,
    arms: Object.keys(arms),
    tasks: TASKS,
    results: allResults,
  };
  fs.mkdirSync(resultsDir, { recursive: true });
  fs.writeFileSync(outJson, JSON.stringify(report, null, 2), 'utf8');
  console.log(`\nFull results → ${outJson}`);

  if (opts.writeMd) {
    const outMd = path.join(resultsDir, `${date}-opencode-go.md`);
    fs.writeFileSync(outMd, mdSections.join('\n'), 'utf8');
    console.log(`Summary markdown → ${outMd}`);
  }
}

// Any failure is reported on stderr and turned into a non-zero exit code.
main().catch((err) => {
  console.error(err.message || err);
  process.exit(1);
});