@draig/lexis-two 1.0.0 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/package.json +7 -2
- package/.claude-plugin/marketplace.json +0 -29
- package/.claude-plugin/plugin.json +0 -9
- package/.codex-plugin/plugin.json +0 -31
- package/.env.example +0 -8
- package/.github/FUNDING.yml +0 -1
- package/.github/copilot-instructions.md +0 -47
- package/.github/plugin/marketplace.json +0 -20
- package/.github/plugin/plugin.json +0 -16
- package/.github/workflows/deploy-site.yml +0 -53
- package/.github/workflows/test.yml +0 -29
- package/AUDIT.md +0 -74
- package/SPECXIS.md +0 -576
- package/benchmarks/README.md +0 -114
- package/benchmarks/arms/baseline.js +0 -2
- package/benchmarks/arms/caveman-SKILL.md +0 -67
- package/benchmarks/arms/caveman.js +0 -8
- package/benchmarks/arms/lexis-two.js +0 -10
- package/benchmarks/arms/ponytail.js +0 -6
- package/benchmarks/behavior.js +0 -58
- package/benchmarks/behavior.yaml +0 -40
- package/benchmarks/benchmark-local.py +0 -156
- package/benchmarks/benchmark-opencode-go.js +0 -294
- package/benchmarks/correctness.js +0 -294
- package/benchmarks/lib/aggregate-opencode-go.js +0 -103
- package/benchmarks/lib/load-env.js +0 -31
- package/benchmarks/lib/opencode-go-client.js +0 -151
- package/benchmarks/loc.js +0 -13
- package/benchmarks/opencode-go-models.json +0 -31
- package/benchmarks/promptfooconfig.yaml +0 -41
- package/benchmarks/prompts.json +0 -15
- package/benchmarks/render-opencode-go-report.js +0 -28
- package/benchmarks/results/2026-06-15-llama3.2-local.md +0 -76
- package/benchmarks/results/2026-06-16-opencode-go.md +0 -56
- package/benchmarks/results/opencode-go-2026-06-16-report.html +0 -226
- package/benchmarks/results/opencode-go-2026-06-16.json +0 -1339
- package/docs/assets/lexis-two-nobg.png +0 -0
- package/docs/assets/logo.png +0 -0
- package/docs/assets/logo.svg +0 -4
- package/docs/portability.md +0 -147
- package/docs/site.md +0 -52
- package/gemini-extension.json +0 -7
- package/pi-extension/index.js +0 -161
- package/pi-extension/package.json +0 -8
- package/pi-extension/test/extension.test.js +0 -89
- package/pi-extension/test/helpers.test.js +0 -35
- package/scripts/check-rule-copies.js +0 -82
- package/site/astro.config.mjs +0 -18
- package/site/package-lock.json +0 -4913
- package/site/package.json +0 -14
- package/site/public/CNAME +0 -1
- package/site/public/assets/lexis-two-nobg.png +0 -0
- package/site/public/assets/logo.png +0 -0
- package/site/public/assets/logo.svg +0 -4
- package/site/public/robots.txt +0 -4
- package/site/src/components/Adapt.astro +0 -33
- package/site/src/components/Benchmarks.astro +0 -232
- package/site/src/components/Commands.astro +0 -33
- package/site/src/components/Ecosystem.astro +0 -30
- package/site/src/components/Example.astro +0 -77
- package/site/src/components/Footer.astro +0 -28
- package/site/src/components/Header.astro +0 -87
- package/site/src/components/Hero.astro +0 -58
- package/site/src/components/Home.astro +0 -46
- package/site/src/components/Hosts.astro +0 -62
- package/site/src/components/Install.astro +0 -143
- package/site/src/components/LanguageSwitcher.astro +0 -82
- package/site/src/components/Philosophy.astro +0 -23
- package/site/src/components/Stacks.astro +0 -33
- package/site/src/components/Suggested.astro +0 -39
- package/site/src/data/opencode-go-benchmark.json +0 -230
- package/site/src/i18n/en.ts +0 -155
- package/site/src/i18n/es.ts +0 -158
- package/site/src/i18n/index.ts +0 -14
- package/site/src/layouts/Layout.astro +0 -114
- package/site/src/pages/benchmarks.astro +0 -4
- package/site/src/pages/es/benchmarks.astro +0 -4
- package/site/src/pages/es/index.astro +0 -10
- package/site/src/pages/index.astro +0 -10
- package/site/src/styles/global.css +0 -780
- package/site/tsconfig.json +0 -3
- package/tests/behavior.test.js +0 -80
- package/tests/commands.test.js +0 -40
- package/tests/copilot-plugin.test.js +0 -33
- package/tests/correctness.test.js +0 -191
- package/tests/gemini-extension.test.js +0 -78
- package/tests/hooks-windows.test.js +0 -48
- package/tests/hooks.test.js +0 -177
- package/tests/opencode-plugin.test.js +0 -64
|
@@ -1,294 +0,0 @@
|
|
|
1
|
-
// Functional correctness assertion: runs generated code against lightweight test
|
|
2
|
-
// cases per task. Proves "less code" is not "broken code". Spawns python/node
|
|
3
|
-
// with the extracted code + appended assertions; returns pass/fail + score.
|
|
4
|
-
//
|
|
5
|
-
// Metric: `correct` (1 = all checks pass, 0 = at least one fails).
|
|
6
|
-
// Unlike loc.js (measurement-only), this one is a gate — a wrong answer is a
|
|
7
|
-
// wrong answer regardless of how few lines produced it.
|
|
8
|
-
|
|
9
|
-
const { execSync } = require('child_process');
|
|
10
|
-
const fs = require('fs');
|
|
11
|
-
const os = require('os');
|
|
12
|
-
const path = require('path');
|
|
13
|
-
|
|
14
|
-
// Extract fenced code blocks, tagged by language.
|
|
15
|
-
// Extract fenced code blocks from a model response, tagged by language.
//
// text: the raw response (string).
// Returns an array of { lang, code } in document order. `lang` is the fence's
// info string lower-cased ('' when the fence is untagged); `code` is everything
// between the opening fence line and the closing fence.
//
// The info string may contain `+` and `-` (c++, objective-c), may be followed
// by trailing blanks, and the fence line may end in CRLF. Rejecting those would
// skip the opening fence and then treat the *closing* fence as an opener,
// pairing the wrong text as "code".
function extractBlocks(text) {
  const fence = /```([\w+-]*)[ \t]*\r?\n([\s\S]*?)```/g;
  return Array.from(text.matchAll(fence), ([, lang, code]) => ({
    lang: lang.toLowerCase(),
    code,
  }));
}
|
|
19
|
-
|
|
20
|
-
// Identify which task we're evaluating from vars.task.
|
|
21
|
-
// Map a task prompt (vars.task) to the key of the check that evaluates it.
//
// task: the prompt text; any non-string (including null/undefined) is coerced
//       so a missing var yields "unknown task" instead of a TypeError.
// Returns 'email' | 'debounce' | 'csv' | 'countdown' | 'ratelimit', or null
// when no rule matches. Rules are tried in order; the first match wins.
function identifyTask(task) {
  const t = String(task ?? '').toLowerCase();
  const mentions = (...words) => words.every((word) => t.includes(word));

  if (mentions('email', 'valid')) return 'email';
  if (mentions('debounce')) return 'debounce';
  if (mentions('csv', 'sum')) return 'csv';
  if (mentions('countdown', 'react')) return 'countdown';
  // Accept "rate limit", "rate-limit" and the one-word "ratelimit(er)".
  if (/rate[ -]?limit/.test(t)) return 'ratelimit';
  return null;
}
|
|
30
|
-
|
|
31
|
-
// Run a command, return { ok, stderr }.
|
|
32
|
-
// Run a shell command synchronously and report whether it succeeded.
//
// cmd:  command line handed to execSync.
// opts: extra execSync options; they override the defaults (10 s timeout,
//       utf8 encoding, piped stdio).
// Returns { ok, stderr }. On success `stderr` is ''. On failure it holds at
// most 500 characters of diagnostic text: the child's stderr if it wrote any,
// otherwise its stdout (harnesses that `print("FAIL: ...")` report there),
// otherwise the error message (e.g. a timeout or spawn failure).
function exec(cmd, opts = {}) {
  try {
    execSync(cmd, { timeout: 10_000, encoding: 'utf8', stdio: 'pipe', ...opts });
    return { ok: true, stderr: '' };
  } catch (e) {
    // String() also covers Buffers, which appear when opts overrides `encoding`.
    const detail = [e.stderr, e.stdout, e.message]
      .map((part) => String(part ?? '').trim())
      .find((part) => part !== '');
    return { ok: false, stderr: (detail ?? '').slice(0, 500) };
  }
}
|
|
40
|
-
|
|
41
|
-
// ponytail: macOS and many Linux images ship python3 only, so the interpreter
// name is probed — lazily, on the first call — and then remembered.
let pythonCmd;

// Return the command used to launch Python: the first of `python3` / `python`
// that can run `import sys`, or 'python3' when neither probe succeeds.
function python() {
  if (!pythonCmd) {
    const working = ['python3', 'python'].find(
      (candidate) => exec(`${candidate} -c "import sys"`).ok,
    );
    pythonCmd = working || 'python3';
  }
  return pythonCmd;
}
|
|
54
|
-
|
|
55
|
-
// Write content to a temp file, return the path.
|
|
56
|
-
// Write `content` to a new file in the OS temp directory and return its path.
//
// ext:     file extension including the dot (e.g. '.py').
// content: data passed to fs.writeFileSync.
// Throws (EEXIST) if the generated name is already taken: the file is created
// exclusively ('wx') so a pre-existing file or symlink under the guessable
// timestamp+random name is never followed or overwritten, and it is created
// owner-only (0600) because the temp directory is shared.
function tmpFile(ext, content) {
  const unique = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
  const p = path.join(os.tmpdir(), `ponytail-bench-${unique}${ext}`);
  fs.writeFileSync(p, content, { flag: 'wx', mode: 0o600 });
  return p;
}
|
|
61
|
-
|
|
62
|
-
// --- Per-task test harnesses ---
|
|
63
|
-
|
|
64
|
-
// Write `source` to a temp file with extension `ext`, run the command built by
// `toCommand(file)`, and always delete the temp file afterwards.
// Returns exec()'s { ok, stderr }.
function runHarness(ext, source, toCommand) {
  const file = tmpFile(ext, source);
  try {
    return exec(toCommand(file));
  } finally {
    try { fs.unlinkSync(file); } catch { /* best-effort cleanup */ }
  }
}

// Per-task checks. Each takes the fenced blocks from extractBlocks() and
// returns { pass, reason }. `email`, `debounce` and `csv` execute the generated
// code inside a small harness; `countdown` and `ratelimit` are structural
// (regex) checks because the code cannot run without a framework.
// Harness failures are written to stderr so exec() surfaces them as `reason`.
const CHECKS = {
  // Python email validator: locate the function, then probe valid/invalid inputs.
  email(blocks) {
    const code = blocks.find((b) => b.lang === 'python' || b.lang === 'py' || (!b.lang && b.code.includes('def ')));
    if (!code) return { pass: false, reason: 'No Python code block found' };

    // Append assertions that call the generated function by common names.
    const harness = `
${code.code}

# Find the validator function
import sys
fn = None
for name in ['validate_email', 'is_valid_email', 'email_validator', 'is_valid', 'validate']:
    if callable(globals().get(name)):
        fn = globals()[name]
        break

if fn is None:
    # Try any function that takes one arg
    import inspect
    for name, obj in list(globals().items()):
        if callable(obj) and not name.startswith('_'):
            try:
                sig = inspect.signature(obj)
                if len(sig.parameters) == 1:
                    fn = obj
                    break
            except (ValueError, TypeError):
                pass

if fn is None:
    print("FAIL: no validator function found", file=sys.stderr)
    sys.exit(1)

# Test cases
failures = []
if not fn("user@example.com"):
    failures.append("rejected valid: user@example.com")
if not fn("a@b.co"):
    failures.append("rejected valid: a@b.co")
if fn("no-at-sign"):
    failures.append("accepted invalid: no-at-sign")
if fn(""):
    failures.append("accepted invalid: empty string")
if fn("@missing-local.com"):
    failures.append("accepted invalid: @missing-local.com")

if failures:
    print("FAIL: " + "; ".join(failures), file=sys.stderr)
    sys.exit(1)
print("PASS")
`;
    const result = runHarness('.py', harness, (file) => `${python()} "${file}"`);
    if (result.ok) return { pass: true, reason: 'Email validator passes all checks' };
    return { pass: false, reason: result.stderr || 'Email validator failed' };
  },

  // JavaScript debounce: must not fire synchronously and must fire exactly once
  // after the delay when called three times in a row.
  debounce(blocks) {
    const code = blocks.find((b) => b.lang === 'javascript' || b.lang === 'js' || (!b.lang && b.code.includes('function')));
    if (!code) return { pass: false, reason: 'No JavaScript code block found' };

    const harness = `
${code.code}

// Find the debounce function
const fn = typeof debounce === 'function' ? debounce
  : typeof module !== 'undefined' && typeof module.exports === 'function' ? module.exports
  : null;

if (!fn) {
  console.error("FAIL: no debounce function found");
  process.exit(1);
}

// Test: debounced function should not fire immediately
let callCount = 0;
const debounced = fn(() => { callCount++; }, 50);
debounced();
debounced();
debounced();

if (callCount > 0) {
  console.error("FAIL: debounce fired immediately (should wait)");
  process.exit(1);
}

// Test: should fire after the delay
setTimeout(() => {
  if (callCount !== 1) {
    console.error("FAIL: expected 1 call after delay, got " + callCount);
    process.exit(1);
  }
  console.log("PASS");
}, 120);
`;
    const result = runHarness('.mjs', harness, (file) => `node "${file}"`);
    if (result.ok) return { pass: true, reason: 'Debounce passes all checks' };
    return { pass: false, reason: result.stderr || 'Debounce failed' };
  },

  // Python CSV sum: run the code against a fixture whose 'amount' column sums
  // to 351 and require that number in the captured stdout.
  csv(blocks) {
    const code = blocks.find((b) => b.lang === 'python' || b.lang === 'py' || (!b.lang && b.code.includes('csv') && b.code.includes('sum')));
    if (!code) return { pass: false, reason: 'No Python code block found' };

    // Create a test CSV and wrap the generated code so it reads it.
    const csvContent = 'name,amount\nAlice,100.5\nBob,200.0\nCharlie,50.5\n';
    const csvPath = tmpFile('.csv', csvContent).replace(/\\/g, '/');

    try {
      // The generated code likely reads 'sales.csv'; patch every quoted
      // occurrence (this also covers open('sales.csv', ...) calls).
      const patched = code.code.replace(/['"]sales\.csv['"]/g, `'${csvPath}'`);
      const indented = patched.split('\n').map((l) => '    ' + l).join('\n');

      const harness = `
import sys, os
os.chdir(r"${path.dirname(csvPath)}")

# Stand-in for pandas when it is not installed. It reads the real file so a
# wrong column name or path still fails instead of passing on a canned total.
try:
    import pandas
except ImportError:
    import csv as _csv
    from types import ModuleType

    class _MockSeries:
        def __init__(self, values):
            self._values = values
        def sum(self):
            return sum(self._values)

    class _MockDataFrame:
        def __init__(self, rows):
            self._rows = rows
        def __getitem__(self, key):
            return _MockSeries([float(row[key]) for row in self._rows])

    def _mock_read_csv(filepath, *args, **kwargs):
        with open(filepath, newline='') as fh:
            return _MockDataFrame(list(_csv.DictReader(fh)))

    pandas_mock = ModuleType('pandas')
    pandas_mock.read_csv = _mock_read_csv
    sys.modules['pandas'] = pandas_mock

# Capture print output. The buffer is kept in its own variable so it can still
# be read after sys.stdout has been restored.
import io
_stdout = sys.stdout
_buffer = io.StringIO()
_error = None
sys.stdout = _buffer

try:
${indented}
except SystemExit as e:
    # A clean sys.exit() must not skip the output check below.
    if e.code not in (None, 0):
        _error = e
except Exception as e:
    _error = e
finally:
    sys.stdout = _stdout

if _error is not None:
    print("FAIL: generated code raised " + repr(_error), file=sys.stderr)
    sys.exit(1)

output = _buffer.getvalue()

# Check output contains the number 351 (100.5 + 200.0 + 50.5)
# Match as a standalone number (not as substring of e.g. 13510)
import re
if re.search(r'(?<![\\d])351(?:\\.0)?(?![\\d])', output):
    print("PASS")
else:
    print("FAIL: output was: " + repr(output[:200]), file=sys.stderr)
    sys.exit(1)
`;
      const result = runHarness('.py', harness, (file) => `${python()} "${file}"`);
      if (result.ok) return { pass: true, reason: 'CSV sum produces correct result (351)' };
      return { pass: false, reason: result.stderr || 'CSV sum failed' };
    } finally {
      try { fs.unlinkSync(csvPath); } catch { /* best-effort cleanup */ }
    }
  },

  countdown(blocks) {
    // React components can't run in bare Node without a bundler. Structural check:
    // the code must contain timer/countdown logic (useState/useEffect/setInterval/setTimeout).
    const code = blocks.find((b) => b.code.includes('ount') || b.code.includes('timer') || b.code.includes('Timer'));
    if (!code) return { pass: false, reason: 'No countdown component found' };

    const src = code.code;
    const hasState = /useState|useReducer|this\.state/.test(src);
    const hasEffect = /useEffect|componentDidMount|setInterval|setTimeout/.test(src);
    const hasDecrement = /- 1|-= 1|prev - 1|count - 1|seconds - 1|time - 1/.test(src);

    const failures = [];
    if (!hasState) failures.push('no state management (useState/useReducer)');
    if (!hasEffect) failures.push('no timer setup (useEffect/setInterval/setTimeout)');
    if (!hasDecrement) failures.push('no countdown decrement logic');

    if (failures.length === 0) return { pass: true, reason: 'Countdown has required structure' };
    return { pass: false, reason: 'Missing: ' + failures.join(', ') };
  },

  ratelimit(blocks) {
    const code = blocks.find((b) => b.lang === 'python' || b.lang === 'py' || (!b.lang && (b.code.includes('rate') || b.code.includes('limit'))));
    if (!code) return { pass: false, reason: 'No Python code block found' };

    // Structural check for rate limiting: the code must mention limit logic and
    // FastAPI. Time tracking is deliberately not required (library-based
    // limiters keep it internal), so it is not tested here.
    const src = code.code;
    const hasLimitLogic = /limit|max_requests|rate|429|Too Many|HTTPException|RateLimiter/.test(src);
    const hasFastAPI = /fastapi|FastAPI|app\s*=|@app\./.test(src);

    const failures = [];
    if (!hasLimitLogic) failures.push('no rate limit logic');
    if (!hasFastAPI) failures.push('no FastAPI usage');

    if (failures.length === 0) return { pass: true, reason: 'Rate limiter has required structure' };
    return { pass: false, reason: 'Missing: ' + failures.join(', ') };
  },
};
|
|
273
|
-
|
|
274
|
-
// --- Main assertion entry point ---
|
|
275
|
-
|
|
276
|
-
module.exports = (output, context) => {
|
|
277
|
-
const task = identifyTask(context.vars.task || '');
|
|
278
|
-
if (!task) {
|
|
279
|
-
return { pass: true, score: 1, reason: 'Unknown task, skipped correctness check' };
|
|
280
|
-
}
|
|
281
|
-
|
|
282
|
-
const blocks = extractBlocks(String(output || ''));
|
|
283
|
-
if (blocks.length === 0) {
|
|
284
|
-
return { pass: false, score: 0, reason: 'No code blocks in output' };
|
|
285
|
-
}
|
|
286
|
-
|
|
287
|
-
const check = CHECKS[task];
|
|
288
|
-
const result = check(blocks);
|
|
289
|
-
return {
|
|
290
|
-
pass: result.pass,
|
|
291
|
-
score: result.pass ? 1 : 0,
|
|
292
|
-
reason: result.reason,
|
|
293
|
-
};
|
|
294
|
-
};
|
|
@@ -1,103 +0,0 @@
|
|
|
1
|
-
const fs = require('fs');
|
|
2
|
-
const path = require('path');
|
|
3
|
-
|
|
4
|
-
const RESULTS_DIR = path.join(__dirname, '..', 'results');
|
|
5
|
-
|
|
6
|
-
// Median of a list of numbers; 0 for an empty list. The input is copied before
// sorting, so the caller's array is left untouched.
function median(values) {
  const ordered = [...values];
  ordered.sort((x, y) => x - y);

  const count = ordered.length;
  if (count === 0) return 0;

  const upper = Math.floor(count / 2);
  if (count % 2 === 1) return ordered[upper];
  return (ordered[upper - 1] + ordered[upper]) / 2;
}
|
|
12
|
-
|
|
13
|
-
// Return the path of the newest `opencode-go-*.json` file in `resultsDir`,
// where "newest" means the name that sorts last as a string.
// Throws an Error when the directory holds no such file.
function findLatestJson(resultsDir = RESULTS_DIR) {
  let latest = null;
  for (const name of fs.readdirSync(resultsDir)) {
    const isResultFile = name.startsWith('opencode-go-') && name.endsWith('.json');
    if (isResultFile && (latest === null || name > latest)) {
      latest = name;
    }
  }

  if (latest === null) {
    throw new Error(`No opencode-go-*.json in ${resultsDir}. Run benchmark first.`);
  }
  return path.join(resultsDir, latest);
}
|
|
24
|
-
|
|
25
|
-
// Turn a model id into its display label by substituting the first occurrence
// of each known slug; ids containing no known slug are returned unchanged.
function modelLabel(id) {
  const displayNames = [
    ['deepseek-v4-pro', 'DeepSeek V4'],
    ['qwen3.7-max', 'Qwen3.7 Max'],
    ['minimax-m3', 'MiniMax M3'],
    ['kimi-k2.6', 'Kimi K2.6'],
  ];

  let label = id;
  for (const [slug, display] of displayNames) {
    label = label.replace(slug, display);
  }
  return label;
}
|
|
32
|
-
|
|
33
|
-
// Condense a raw benchmark result file into the structure used for charts.
//
// data: parsed result JSON. Reads data.tasks[].id, data.arms, data.models,
//       data.date, data.repeat and data.results[model][arm][task] — a list of
//       runs, each with `loc`, `timeSec` and `correct`.
// Returns { source, date, repeat, models, tasks, arms }. Each models[] entry
// carries per-arm/per-task median LOC and time, per-arm pass/total counts, and
// baseline-vs-lexis-two totals (sums of the per-task medians).
function aggregateOpencodeGo(data) {
  const taskIds = data.tasks.map((task) => task.id);
  const { arms, models } = data;

  // Sum one arm's per-task values in task order.
  const sumOverTasks = (byTask) => taskIds.reduce((acc, id) => acc + byTask[id], 0);
  const roundToTenth = (x) => Math.round(x * 10) / 10;

  const modelSummaries = [];
  for (const modelId of models) {
    const perModel = data.results[modelId];
    const locByArmTask = {};
    const timeByArmTask = {};
    const correctByArm = {};

    for (const arm of arms) {
      const locs = {};
      const times = {};
      locByArmTask[arm] = locs;
      timeByArmTask[arm] = times;
      const tally = { pass: 0, total: 0 };

      for (const taskId of taskIds) {
        const runs = perModel[arm][taskId];
        locs[taskId] = median(runs.map((run) => run.loc));
        times[taskId] = median(runs.map((run) => run.timeSec));
        tally.pass += runs.filter((run) => run.correct).length;
        tally.total += runs.length;
      }

      correctByArm[arm] = tally;
    }

    const baselineLoc = sumOverTasks(locByArmTask.baseline);
    const lexisLoc = sumOverTasks(locByArmTask['lexis-two']);
    // Percentage of baseline LOC saved by lexis-two; 0 when the baseline is empty.
    let reductionPct = 0;
    if (baselineLoc > 0) {
      reductionPct = Math.round((1 - lexisLoc / baselineLoc) * 100);
    }

    const baselineTime = sumOverTasks(timeByArmTask.baseline);
    const lexisTime = sumOverTasks(timeByArmTask['lexis-two']);

    modelSummaries.push({
      id: modelId,
      label: modelLabel(modelId),
      locByArmTask,
      timeByArmTask,
      correctByArm,
      totals: {
        baselineLoc,
        lexisLoc,
        reductionPct,
        baselineTimeSec: roundToTenth(baselineTime),
        lexisTimeSec: roundToTenth(lexisTime),
      },
    });
  }

  return {
    source: `opencode-go-${data.date}.json`,
    date: data.date,
    repeat: data.repeat,
    models: modelSummaries,
    tasks: taskIds,
    arms,
  };
}
|
|
96
|
-
|
|
97
|
-
module.exports = {
|
|
98
|
-
aggregateOpencodeGo,
|
|
99
|
-
findLatestJson,
|
|
100
|
-
modelLabel,
|
|
101
|
-
median,
|
|
102
|
-
RESULTS_DIR,
|
|
103
|
-
};
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
// Load KEY=VALUE lines from a .env file into process.env (no dotenv dependency).
|
|
2
|
-
const fs = require('fs');
|
|
3
|
-
|
|
4
|
-
// Load KEY=VALUE lines from `envPath` into process.env.
//
// Returns false when the file does not exist, true otherwise. Blank lines,
// `#` comment lines and lines without a key before `=` are ignored; variables
// already present in process.env are never overwritten; one pair of matching
// surrounding quotes is stripped from the value. A leading BOM is dropped.
function loadEnvFile(envPath) {
  if (!fs.existsSync(envPath)) return false;

  const stripQuotes = (value) => {
    const quote = value[0];
    const isWrapped = (quote === '"' || quote === "'") && value.endsWith(quote);
    return isWrapped ? value.slice(1, -1) : value;
  };

  const text = fs.readFileSync(envPath, 'utf8').replace(/^\uFEFF/, '');
  text.split(/\r?\n/).forEach((rawLine) => {
    const entry = rawLine.trim();
    if (entry === '' || entry.startsWith('#')) return;

    const separator = entry.indexOf('=');
    if (separator < 1) return;

    const name = entry.slice(0, separator).trim();
    if (process.env[name] !== undefined) return;

    process.env[name] = stripQuotes(entry.slice(separator + 1).trim());
  });

  return true;
}
|
|
30
|
-
|
|
31
|
-
module.exports = { loadEnvFile };
|
|
@@ -1,151 +0,0 @@
|
|
|
1
|
-
// OpenCode Go API client — OpenAI chat/completions + Anthropic /messages transports.
|
|
2
|
-
// Docs: https://opencode.ai/docs/go/#endpoints
|
|
3
|
-
|
|
4
|
-
const fs = require('fs');
|
|
5
|
-
const path = require('path');
|
|
6
|
-
|
|
7
|
-
const DEFAULT_BASE = 'https://opencode.ai/zen/go/v1';
|
|
8
|
-
const ENV_PATH = path.join(__dirname, '..', '..', '.env');
|
|
9
|
-
|
|
10
|
-
// Return the OpenCode API key from the environment: OPENCODE_API_KEY if it is
// non-empty, else OPENCODE_GO_API_KEY, with surrounding whitespace trimmed.
// Throws an Error with a setup hint (which depends on whether the .env file at
// ENV_PATH exists) when neither variable yields a key.
function getApiKey() {
  const { OPENCODE_API_KEY, OPENCODE_GO_API_KEY } = process.env;
  const key = (OPENCODE_API_KEY || OPENCODE_GO_API_KEY || '').trim();
  if (key) return key;

  let hint;
  if (fs.existsSync(ENV_PATH)) {
    hint = `OPENCODE_API_KEY is empty in ${ENV_PATH}. Paste your OpenCode Go key after the = sign.`;
  } else {
    hint = `Create ${ENV_PATH} from .env.example and set OPENCODE_API_KEY=your-key`;
  }
  throw new Error(`Missing OPENCODE_API_KEY. ${hint}`);
}
|
|
20
|
-
|
|
21
|
-
// Read a fetch Response and return its decoded body.
//
// An empty body decodes to {}; a body that is not valid JSON decodes to
// { raw: <text> }. For a non-OK response an Error `HTTP <status>: <message>`
// is thrown, where <message> is the first non-empty value among
// body.error.message, body.message, a string body.error, the first 500
// characters of the raw text, and finally res.statusText.
async function readJsonResponse(res) {
  const text = await res.text();

  let body = {};
  if (text) {
    try {
      body = JSON.parse(text);
    } catch {
      body = { raw: text };
    }
  }

  if (res.ok) return body;

  const candidates = [
    body?.error?.message,
    body?.message,
    typeof body?.error === 'string' ? body.error : null,
    text.slice(0, 500),
  ];
  // When every candidate is empty, fall back to statusText whatever it holds.
  const firstUsable = candidates.find((candidate) => Boolean(candidate));
  const msg = firstUsable === undefined ? res.statusText : firstUsable;
  throw new Error(`HTTP ${res.status}: ${msg}`);
}
|
|
40
|
-
|
|
41
|
-
// Pull the assistant text out of an OpenAI chat/completions response body:
// the first choice's message content, or '' when that is missing or null.
function extractOpenAiText(body) {
  const firstChoice = body?.choices?.[0];
  const content = firstChoice?.message?.content;
  return content ?? '';
}
|
|
44
|
-
|
|
45
|
-
// Concatenate the text of every `text` content block in an Anthropic
// /messages response body. Returns '' when body.content is not an array;
// non-text blocks and blocks whose `text` is not a string are skipped.
function extractAnthropicText(body) {
  const content = body?.content;
  if (!Array.isArray(content)) return '';

  let text = '';
  for (const block of content) {
    if (block?.type === 'text' && typeof block.text === 'string') {
      text += block.text;
    }
  }
  return text;
}
|
|
53
|
-
|
|
54
|
-
// Normalise token usage from a response body to { inputTokens, outputTokens }.
//
// transport: 'openai-chat' reads prompt_tokens / completion_tokens; any other
//            value reads input_tokens / output_tokens.
// Returns null when the body has no `usage`; a missing counter becomes 0.
function extractUsage(body, transport) {
  const usage = body?.usage;
  if (!usage) return null;

  const [inputField, outputField] =
    transport === 'openai-chat'
      ? ['prompt_tokens', 'completion_tokens']
      : ['input_tokens', 'output_tokens'];

  return {
    inputTokens: usage[inputField] ?? 0,
    outputTokens: usage[outputField] ?? 0,
  };
}
|
|
70
|
-
|
|
71
|
-
// Send one chat turn through the OpenAI-style `${baseUrl}/chat/completions`
// endpoint, authenticating with a Bearer token.
//
// The system message is included only when `system` is truthy.
// Returns { text, usage, raw }: the assistant text, normalised token usage
// (or null) and the decoded response body. Rejects on a non-OK HTTP response.
async function chatOpenAi({ baseUrl, apiKey, model, system, user, maxTokens, temperature }) {
  const messages = [
    ...(system ? [{ role: 'system', content: system }] : []),
    { role: 'user', content: user },
  ];

  const request = {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${apiKey}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model,
      messages,
      max_tokens: maxTokens,
      temperature,
    }),
  };

  const res = await fetch(`${baseUrl}/chat/completions`, request);
  const body = await readJsonResponse(res);

  const text = extractOpenAiText(body);
  const usage = extractUsage(body, 'openai-chat');
  return { text, usage, raw: body };
}
|
|
97
|
-
|
|
98
|
-
// Send one chat turn through the Anthropic-style `${baseUrl}/messages` endpoint.
//
// `system` is sent only when truthy and `temperature` only when it is a number.
// Returns { text, usage, raw }: the concatenated text blocks, normalised token
// usage (or null) and the decoded response body. Rejects on a non-OK response.
async function chatAnthropic({ baseUrl, apiKey, model, system, user, maxTokens, temperature }) {
  const payload = {
    model,
    max_tokens: maxTokens,
    messages: [{ role: 'user', content: user }],
    ...(system ? { system } : {}),
    ...(typeof temperature === 'number' ? { temperature } : {}),
  };

  // lexis: Go /messages rejects Bearer — x-api-key only (qwen3.7-max, minimax-m3)
  const headers = {
    'x-api-key': apiKey,
    'Content-Type': 'application/json',
    'anthropic-version': '2023-06-01',
  };

  const res = await fetch(`${baseUrl}/messages`, {
    method: 'POST',
    headers,
    body: JSON.stringify(payload),
  });
  const body = await readJsonResponse(res);

  const text = extractAnthropicText(body);
  const usage = extractUsage(body, 'anthropic-messages');
  return { text, usage, raw: body };
}
|
|
125
|
-
|
|
126
|
-
// Run a single completion against OpenCode Go using the transport named in
// the model's config.
//
// modelId:     model identifier sent to the API.
// modelConfig: { transport, maxTokens? }; maxTokens defaults to 8192.
// system/user: prompt parts.
// baseUrl, apiKey, temperature: default to DEFAULT_BASE, getApiKey() and 1.
// Returns the { text, usage, raw } produced by the selected transport.
// Throws an Error for a transport other than 'anthropic-messages' or
// 'openai-chat'.
async function complete({
  modelId,
  modelConfig,
  system,
  user,
  baseUrl = DEFAULT_BASE,
  apiKey = getApiKey(),
  temperature = 1,
}) {
  const request = {
    baseUrl,
    apiKey,
    model: modelId,
    system,
    user,
    maxTokens: modelConfig.maxTokens ?? 8192,
    temperature,
  };

  switch (modelConfig.transport) {
    case 'anthropic-messages':
      return chatAnthropic(request);
    case 'openai-chat':
      return chatOpenAi(request);
    default:
      throw new Error(`Unknown transport for ${modelId}: ${modelConfig.transport}`);
  }
}
|
|
146
|
-
|
|
147
|
-
module.exports = {
|
|
148
|
-
DEFAULT_BASE,
|
|
149
|
-
getApiKey,
|
|
150
|
-
complete,
|
|
151
|
-
};
|
package/benchmarks/loc.js
DELETED
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
// Deterministic code-size metric: non-blank, non-comment lines of code. Counts
|
|
2
|
-
// fenced blocks, or the whole response when the model emitted bare code unfenced.
|
|
3
|
-
// Recorded as the `code_loc` metric per arm (always passes; it is a measurement, not a gate).
|
|
4
|
-
module.exports = (output) => {
|
|
5
|
-
const text = String(output || '');
|
|
6
|
-
const blocks = [...text.matchAll(/```[a-zA-Z0-9_+-]*\n([\s\S]*?)```/g)].map((m) => m[1]);
|
|
7
|
-
const code = blocks.length ? blocks.join('\n') : text;
|
|
8
|
-
const loc = code
|
|
9
|
-
.split('\n')
|
|
10
|
-
.map((l) => l.trim())
|
|
11
|
-
.filter((l) => l && !l.startsWith('//') && !l.startsWith('#') && l !== '*/' && !l.startsWith('/*') && !l.startsWith('*')).length;
|
|
12
|
-
return { pass: true, score: loc, reason: loc + ' code LOC' };
|
|
13
|
-
};
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"baseUrl": "https://opencode.ai/zen/go/v1",
|
|
3
|
-
"defaultModels": [
|
|
4
|
-
"kimi-k2.6",
|
|
5
|
-
"deepseek-v4-pro",
|
|
6
|
-
"qwen3.7-max",
|
|
7
|
-
"minimax-m3"
|
|
8
|
-
],
|
|
9
|
-
"models": {
|
|
10
|
-
"kimi-k2.6": {
|
|
11
|
-
"name": "Kimi K2.6",
|
|
12
|
-
"transport": "openai-chat",
|
|
13
|
-
"maxTokens": 8192
|
|
14
|
-
},
|
|
15
|
-
"deepseek-v4-pro": {
|
|
16
|
-
"name": "DeepSeek V4 Pro",
|
|
17
|
-
"transport": "openai-chat",
|
|
18
|
-
"maxTokens": 8192
|
|
19
|
-
},
|
|
20
|
-
"qwen3.7-max": {
|
|
21
|
-
"name": "Qwen3.7 Max",
|
|
22
|
-
"transport": "anthropic-messages",
|
|
23
|
-
"maxTokens": 8192
|
|
24
|
-
},
|
|
25
|
-
"minimax-m3": {
|
|
26
|
-
"name": "MiniMax M3",
|
|
27
|
-
"transport": "anthropic-messages",
|
|
28
|
-
"maxTokens": 8192
|
|
29
|
-
}
|
|
30
|
-
}
|
|
31
|
-
}
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
# Ponytail benchmark: code size + cost across three arms, same model, same tasks.
|
|
2
|
-
#
|
|
3
|
-
# Run: npx promptfoo@latest eval -c benchmarks/promptfooconfig.yaml
|
|
4
|
-
# View: npx promptfoo@latest view
|
|
5
|
-
# Share: npx promptfoo@latest share (publishes a hosted report URL)
|
|
6
|
-
#
|
|
7
|
-
# Needs ANTHROPIC_API_KEY in the environment or a .env file (see benchmarks/README.md).
|
|
8
|
-
# Caveman arm uses JuliusBrussee/caveman SKILL.md (MIT), vendored at arms/caveman-SKILL.md.
|
|
9
|
-
description: "Ponytail vs caveman vs no-skill: same model, same tasks. Measures code LOC (deterministic) and tokens/cost (API telemetry)."
|
|
10
|
-
|
|
11
|
-
providers:
|
|
12
|
-
- id: anthropic:messages:claude-haiku-4-5-20251001
|
|
13
|
-
config: { max_tokens: 8192, temperature: 1 }
|
|
14
|
-
- id: anthropic:messages:claude-sonnet-4-6
|
|
15
|
-
config: { max_tokens: 8192, temperature: 1 }
|
|
16
|
-
- id: anthropic:messages:claude-opus-4-8
|
|
17
|
-
config: { max_tokens: 8192, temperature: 1 }
|
|
18
|
-
|
|
19
|
-
prompts:
|
|
20
|
-
- id: file://arms/baseline.js
|
|
21
|
-
label: baseline (no skill)
|
|
22
|
-
- id: file://arms/caveman.js
|
|
23
|
-
label: caveman
|
|
24
|
-
- id: file://arms/ponytail.js
|
|
25
|
-
label: ponytail
|
|
26
|
-
|
|
27
|
-
defaultTest:
|
|
28
|
-
assert:
|
|
29
|
-
- type: javascript
|
|
30
|
-
value: file://loc.js
|
|
31
|
-
metric: code_loc
|
|
32
|
-
- type: javascript
|
|
33
|
-
value: file://correctness.js
|
|
34
|
-
metric: correct
|
|
35
|
-
|
|
36
|
-
tests:
|
|
37
|
-
- vars: { task: "Write me a Python function that validates email addresses." }
|
|
38
|
-
- vars: { task: "Add debounce to a search input in vanilla JavaScript. It currently fires an API call on every keystroke." }
|
|
39
|
-
- vars: { task: "Write Python code that reads sales.csv and sums the 'amount' column." }
|
|
40
|
-
- vars: { task: "Build me a countdown timer component in React that counts down from a given number of seconds." }
|
|
41
|
-
- vars: { task: "Add rate limiting to my FastAPI endpoint so users can't spam it." }
|
package/benchmarks/prompts.json
DELETED
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"method": "One fresh Claude Code subagent per task x config, same model, no file outputs. Metrics from task telemetry: total tokens (includes thinking), duration. Code lines counted from fenced blocks in the deliverable.",
|
|
3
|
-
"configs": [
|
|
4
|
-
"baseline — no skill",
|
|
5
|
-
"caveman — caveman SKILL.md (full) as operating instructions",
|
|
6
|
-
"ponytail — ponytail SKILL.md (full) as operating instructions"
|
|
7
|
-
],
|
|
8
|
-
"tasks": [
|
|
9
|
-
{ "id": "email", "prompt": "Write me a Python function that validates email addresses." },
|
|
10
|
-
{ "id": "debounce", "prompt": "Add debounce to a search input in vanilla JavaScript — it currently fires an API call on every keystroke." },
|
|
11
|
-
{ "id": "csv-sum", "prompt": "Write Python code that reads sales.csv and sums the 'amount' column." },
|
|
12
|
-
{ "id": "react-countdown", "prompt": "Build me a countdown timer component in React that counts down from a given number of seconds." },
|
|
13
|
-
{ "id": "rate-limit", "prompt": "Add rate limiting to my FastAPI endpoint so users can't spam it." }
|
|
14
|
-
]
|
|
15
|
-
}
|