@draig/lexis-two 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +7 -1
- package/.agents/plugins/marketplace.json +0 -21
- package/.claude-plugin/marketplace.json +0 -29
- package/.claude-plugin/plugin.json +0 -9
- package/.clinerules/lexis-two.md +0 -163
- package/.codex-plugin/plugin.json +0 -31
- package/.cursor/rules/lexis-two.mdc +0 -169
- package/.env.example +0 -8
- package/.github/FUNDING.yml +0 -1
- package/.github/copilot-instructions.md +0 -47
- package/.github/plugin/marketplace.json +0 -20
- package/.github/plugin/plugin.json +0 -16
- package/.github/workflows/deploy-site.yml +0 -53
- package/.github/workflows/test.yml +0 -29
- package/.kiro/steering/lexis-two.md +0 -167
- package/.nojekyll +0 -0
- package/.windsurf/rules/lexis-two.md +0 -163
- package/AGENTS.md +0 -163
- package/AUDIT.md +0 -74
- package/CNAME +0 -1
- package/SPECXIS.md +0 -576
- package/assets/benchmark-3model.svg +0 -21
- package/assets/lexis-two-complete.webp +0 -0
- package/assets/lexis-two-nobg.png +0 -0
- package/assets/logo.png +0 -0
- package/assets/social-preview.png +0 -0
- package/benchmarks/README.md +0 -114
- package/benchmarks/arms/baseline.js +0 -2
- package/benchmarks/arms/caveman-SKILL.md +0 -67
- package/benchmarks/arms/caveman.js +0 -8
- package/benchmarks/arms/lexis-two.js +0 -10
- package/benchmarks/arms/ponytail.js +0 -6
- package/benchmarks/behavior.js +0 -58
- package/benchmarks/behavior.yaml +0 -40
- package/benchmarks/benchmark-local.py +0 -156
- package/benchmarks/benchmark-opencode-go.js +0 -294
- package/benchmarks/correctness.js +0 -294
- package/benchmarks/lib/aggregate-opencode-go.js +0 -103
- package/benchmarks/lib/load-env.js +0 -31
- package/benchmarks/lib/opencode-go-client.js +0 -151
- package/benchmarks/loc.js +0 -13
- package/benchmarks/opencode-go-models.json +0 -31
- package/benchmarks/promptfooconfig.yaml +0 -41
- package/benchmarks/prompts.json +0 -15
- package/benchmarks/render-opencode-go-report.js +0 -28
- package/benchmarks/results/2026-06-15-llama3.2-local.md +0 -76
- package/benchmarks/results/2026-06-16-opencode-go.md +0 -56
- package/benchmarks/results/opencode-go-2026-06-16-report.html +0 -226
- package/benchmarks/results/opencode-go-2026-06-16.json +0 -1339
- package/commands/lexis-two-audit.toml +0 -3
- package/commands/lexis-two-debt.toml +0 -3
- package/commands/lexis-two-help.toml +0 -3
- package/commands/lexis-two-plan.toml +0 -3
- package/commands/lexis-two-review.toml +0 -3
- package/commands/lexis-two-security.toml +0 -3
- package/commands/lexis-two.toml +0 -3
- package/docs/assets/lexis-two-nobg.png +0 -0
- package/docs/assets/logo.png +0 -0
- package/docs/assets/logo.svg +0 -4
- package/docs/portability.md +0 -147
- package/docs/site.md +0 -52
- package/examples/api-endpoint.md +0 -68
- package/examples/caching.md +0 -74
- package/examples/date-picker.md +0 -48
- package/examples/email-validation.md +0 -51
- package/examples/sorting.md +0 -42
- package/gemini-extension.json +0 -7
- package/opencode.json +0 -4
- package/pi-extension/index.js +0 -161
- package/pi-extension/package.json +0 -8
- package/pi-extension/test/extension.test.js +0 -89
- package/pi-extension/test/helpers.test.js +0 -35
- package/scripts/check-rule-copies.js +0 -82
- package/site/astro.config.mjs +0 -18
- package/site/package-lock.json +0 -4913
- package/site/package.json +0 -14
- package/site/public/CNAME +0 -1
- package/site/public/assets/lexis-two-nobg.png +0 -0
- package/site/public/assets/logo.png +0 -0
- package/site/public/assets/logo.svg +0 -4
- package/site/public/robots.txt +0 -4
- package/site/src/components/Adapt.astro +0 -33
- package/site/src/components/Benchmarks.astro +0 -232
- package/site/src/components/Commands.astro +0 -33
- package/site/src/components/Ecosystem.astro +0 -30
- package/site/src/components/Example.astro +0 -77
- package/site/src/components/Footer.astro +0 -28
- package/site/src/components/Header.astro +0 -87
- package/site/src/components/Hero.astro +0 -58
- package/site/src/components/Home.astro +0 -46
- package/site/src/components/Hosts.astro +0 -62
- package/site/src/components/Install.astro +0 -139
- package/site/src/components/LanguageSwitcher.astro +0 -82
- package/site/src/components/Philosophy.astro +0 -23
- package/site/src/components/Stacks.astro +0 -33
- package/site/src/components/Suggested.astro +0 -39
- package/site/src/data/opencode-go-benchmark.json +0 -230
- package/site/src/i18n/en.ts +0 -155
- package/site/src/i18n/es.ts +0 -158
- package/site/src/i18n/index.ts +0 -14
- package/site/src/layouts/Layout.astro +0 -114
- package/site/src/pages/benchmarks.astro +0 -4
- package/site/src/pages/es/benchmarks.astro +0 -4
- package/site/src/pages/es/index.astro +0 -10
- package/site/src/pages/index.astro +0 -10
- package/site/src/styles/global.css +0 -780
- package/site/tsconfig.json +0 -3
- package/tests/behavior.test.js +0 -80
- package/tests/commands.test.js +0 -40
- package/tests/copilot-plugin.test.js +0 -33
- package/tests/correctness.test.js +0 -191
- package/tests/gemini-extension.test.js +0 -78
- package/tests/hooks-windows.test.js +0 -48
- package/tests/hooks.test.js +0 -177
- package/tests/opencode-plugin.test.js +0 -64
|
@@ -1,294 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
/**
|
|
3
|
-
* Lexis-Two benchmark via OpenCode Go models.
|
|
4
|
-
*
|
|
5
|
-
* Same 5 tasks as promptfooconfig.yaml. Arms: baseline (no skill) vs lexis-two.
|
|
6
|
-
* Optional: pass --caveman to add a third arm (caveman).
|
|
7
|
-
*
|
|
8
|
-
* Usage:
|
|
9
|
-
* node benchmarks/benchmark-opencode-go.js --repeat 3
|
|
10
|
-
* node benchmarks/benchmark-opencode-go.js --model kimi-k2.6 --repeat 10
|
|
11
|
-
* node benchmarks/benchmark-opencode-go.js --write-md
|
|
12
|
-
*
|
|
13
|
-
* Requires OPENCODE_API_KEY in .env or environment (OpenCode Go subscription).
|
|
14
|
-
* Docs: benchmarks/README.md#opencode-go
|
|
15
|
-
*/
|
|
16
|
-
|
|
17
|
-
const fs = require('fs');
|
|
18
|
-
const path = require('path');
|
|
19
|
-
|
|
20
|
-
const { loadEnvFile } = require('./lib/load-env');
|
|
21
|
-
loadEnvFile(path.join(__dirname, '..', '.env'));
|
|
22
|
-
|
|
23
|
-
const { complete, DEFAULT_BASE } = require('./lib/opencode-go-client');
|
|
24
|
-
const measureLoc = require('./loc');
|
|
25
|
-
const checkCorrect = require('./correctness');
|
|
26
|
-
|
|
27
|
-
const ROOT = path.join(__dirname, '..');
|
|
28
|
-
const MODELS_PATH = path.join(__dirname, 'opencode-go-models.json');
|
|
29
|
-
|
|
30
|
-
const TASKS = [
|
|
31
|
-
{ id: 'email', prompt: 'Write me a Python function that validates email addresses.' },
|
|
32
|
-
{
|
|
33
|
-
id: 'debounce',
|
|
34
|
-
prompt:
|
|
35
|
-
'Add debounce to a search input in vanilla JavaScript. It currently fires an API call on every keystroke.',
|
|
36
|
-
},
|
|
37
|
-
{
|
|
38
|
-
id: 'csv-sum',
|
|
39
|
-
prompt: "Write Python code that reads sales.csv and sums the 'amount' column.",
|
|
40
|
-
},
|
|
41
|
-
{
|
|
42
|
-
id: 'countdown',
|
|
43
|
-
prompt:
|
|
44
|
-
'Build me a countdown timer component in React that counts down from a given number of seconds.',
|
|
45
|
-
},
|
|
46
|
-
{
|
|
47
|
-
id: 'rate-limit',
|
|
48
|
-
prompt: "Add rate limiting to my FastAPI endpoint so users can't spam it.",
|
|
49
|
-
},
|
|
50
|
-
];
|
|
51
|
-
|
|
52
|
-
/**
 * Loads the model catalogue.
 *
 * @returns {object} Parsed contents of the JSON file at MODELS_PATH.
 */
function loadModelsConfig() {
  const raw = fs.readFileSync(MODELS_PATH, 'utf8');
  return JSON.parse(raw);
}
|
|
55
|
-
|
|
56
|
-
function loadArms(includeCaveman) {
|
|
57
|
-
const arms = {
|
|
58
|
-
baseline: null,
|
|
59
|
-
'lexis-two': require('./arms/lexis-two').system,
|
|
60
|
-
};
|
|
61
|
-
if (includeCaveman) {
|
|
62
|
-
arms.caveman = fs.readFileSync(path.join(__dirname, 'arms', 'caveman-SKILL.md'), 'utf8');
|
|
63
|
-
}
|
|
64
|
-
return arms;
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
/**
 * Median of a list of numbers.
 *
 * @param {number[]} values - Numbers to summarise; not modified.
 * @returns {number} The middle value (mean of the two middle values for an
 *   even count), or 0 when the list is empty.
 */
function median(values) {
  const ordered = [...values];
  ordered.sort((x, y) => x - y);
  const count = ordered.length;
  if (count === 0) {
    return 0;
  }
  const upper = Math.floor(count / 2);
  if (count % 2 !== 0) {
    return ordered[upper];
  }
  return (ordered[upper - 1] + ordered[upper]) / 2;
}
|
|
73
|
-
|
|
74
|
-
/**
 * Parses command-line flags into benchmark options.
 *
 * Recognised flags: --repeat N, --model ID, --models a,b,c, --caveman,
 * --write-md, --delay-ms N, --help/-h. Unrecognised arguments are ignored.
 *
 * @param {string[]} argv - Full argument vector (flags start at index 2).
 * @returns {{repeat: number, models: (string[]|null), arms: string[],
 *   writeMd: boolean, delayMs: number, temperature: number, baseUrl: string}}
 * @throws {Error} When a flag that needs a value has none, or a numeric flag
 *   is not a valid number (previously this silently produced NaN and a run
 *   that executed zero iterations).
 */
function parseArgs(argv) {
  const opts = {
    repeat: 3,
    models: null,
    arms: ['baseline', 'lexis-two'],
    writeMd: false,
    delayMs: 500,
    temperature: 1,
    baseUrl: process.env.OPENCODE_GO_BASE_URL || DEFAULT_BASE,
  };

  // Fetches the value that follows a flag, rejecting "missing" and "next flag".
  const valueAt = (flag, index) => {
    const value = argv[index];
    if (value === undefined || value.startsWith('--')) {
      throw new Error(`${flag} requires a value`);
    }
    return value;
  };

  for (let i = 2; i < argv.length; i += 1) {
    const arg = argv[i];
    if (arg === '--repeat') {
      const repeat = Number(valueAt(arg, ++i));
      if (!Number.isInteger(repeat) || repeat < 1) {
        throw new Error('--repeat must be a positive integer');
      }
      opts.repeat = repeat;
    } else if (arg === '--model') {
      opts.models = [valueAt(arg, ++i)];
    } else if (arg === '--models') {
      opts.models = valueAt(arg, ++i)
        .split(',')
        .map((m) => m.trim())
        .filter(Boolean);
    } else if (arg === '--caveman') {
      // Guard against duplicates when the flag is passed more than once.
      if (!opts.arms.includes('caveman')) opts.arms.push('caveman');
    } else if (arg === '--write-md') {
      opts.writeMd = true;
    } else if (arg === '--delay-ms') {
      const delayMs = Number(valueAt(arg, ++i));
      if (!Number.isFinite(delayMs) || delayMs < 0) {
        throw new Error('--delay-ms must be a non-negative number');
      }
      opts.delayMs = delayMs;
    } else if (arg === '--help' || arg === '-h') {
      console.log(`Usage: node benchmarks/benchmark-opencode-go.js [options]

  --repeat N       Runs per cell (default: 3)
  --model ID       Single model (e.g. kimi-k2.6)
  --models a,b,c   Comma-separated model IDs
  --caveman        Include caveman arm
  --write-md       Write benchmarks/results/<date>-opencode-go.md
  --delay-ms N     Pause between API calls (default: 500)
`);
      process.exit(0);
    }
  }

  return opts;
}
|
|
115
|
-
|
|
116
|
-
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
119
|
-
|
|
120
|
-
/**
 * Renders one model's results as a markdown section: a median-LOC table with
 * one row per arm, followed by a lexis-two vs baseline comparison line.
 *
 * @param {string} modelId - Model identifier (shown in backticks).
 * @param {string} modelName - Human-readable model name.
 * @param {number} repeat - Runs per cell; used for the "correct" denominator.
 * @param {object} arms - Arm name -> system prompt; only the keys are used.
 * @param {object} cellResults - cellResults[arm][taskId] is an array of run
 *   records carrying `loc` and `correct`.
 * @returns {string} Markdown text, lines joined with "\n".
 */
function summarizeModel(modelId, modelName, repeat, arms, cellResults) {
  const taskIds = TASKS.map((t) => t.id);
  const armNames = Object.keys(arms);

  const medianLoc = (arm, taskId) => median(cellResults[arm][taskId].map((r) => r.loc));
  const totalLoc = (arm) => taskIds.reduce((sum, t) => sum + medianLoc(arm, t), 0);

  const lines = [
    `## ${modelName} (\`${modelId}\`)`,
    '',
    `Repeat: ${repeat}. Arms: ${armNames.join(', ')}.`,
    '',
    '**Code LOC (median)**',
    '',
    `| arm | ${taskIds.join(' | ')} | TOTAL | correct |`,
    `| --- | ${taskIds.map(() => '---:').join(' | ')} | ---: | ---: |`,
  ];

  const totalRuns = taskIds.length * repeat;
  for (const arm of armNames) {
    const locs = taskIds.map((t) => medianLoc(arm, t));
    const locSum = locs.reduce((a, b) => a + b, 0);
    const passCount = taskIds.reduce(
      (sum, t) => sum + cellResults[arm][t].filter((r) => r.correct).length,
      0,
    );
    lines.push(`| ${arm} | ${locs.join(' | ')} | ${locSum} | ${passCount}/${totalRuns} |`);
  }

  const baseTotal = totalLoc('baseline');
  const lexisTotal = totalLoc('lexis-two');
  if (baseTotal > 0) {
    // Report the magnitude and let the word carry the direction; the old
    // output could read "-20% more code" or "-0% less code".
    const pct = Math.round((1 - lexisTotal / baseTotal) * 100);
    const direction = pct >= 0 ? 'less' : 'more';
    lines.push('');
    lines.push(
      `**lexis-two vs baseline (median total LOC):** ${Math.abs(pct)}% ${direction} code.`,
    );
  }

  lines.push('');
  return lines.join('\n');
}
|
|
167
|
-
|
|
168
|
-
/**
 * Runs every (repeat x arm x task) cell for one model, sequentially.
 *
 * For each cell the task prompt is sent through `complete`; a successful
 * reply is scored with `measureLoc` and `checkCorrect`. A failed API call is
 * recorded as loc 0 / correct false with the error message, and the run
 * continues.
 *
 * @param {string} modelId - Model identifier passed to `complete`.
 * @param {object} modelConfig - Model entry passed through to `complete`.
 * @param {object} opts - Uses `repeat`, `delayMs`, `baseUrl`, `temperature`.
 * @param {object} arms - Arm name -> system prompt (falsy means no system prompt).
 * @returns {Promise<object>} cellResults[arm][taskId] -> array of run records.
 */
async function runModel(modelId, modelConfig, opts, arms) {
  const armNames = Object.keys(arms);
  const cellResults = Object.fromEntries(
    armNames.map((arm) => [arm, Object.fromEntries(TASKS.map((t) => [t.id, []]))]),
  );

  const total = opts.repeat * armNames.length * TASKS.length;
  let done = 0;

  for (let r = 0; r < opts.repeat; r += 1) {
    for (const [arm, system] of Object.entries(arms)) {
      for (const task of TASKS) {
        done += 1;
        const label = `[${done}/${total}] ${modelId} run${r + 1} ${arm} / ${task.id}`;
        process.stdout.write(`${label} ... `);

        const t0 = Date.now();
        let reply = null;
        let failure = null;
        let failed = false;
        try {
          reply = await complete({
            modelId,
            modelConfig,
            system: system || undefined,
            user: task.prompt,
            baseUrl: opts.baseUrl,
            temperature: opts.temperature,
          });
        } catch (e) {
          failed = true;
          failure = e;
        }
        // Stop the clock before scoring: the correctness checks spawn
        // subprocesses and must not be counted as model latency.
        const timeSec = (Date.now() - t0) / 1000;

        if (failed) {
          console.log(`FAIL — ${failure.message}`);
          cellResults[arm][task.id].push({
            loc: 0,
            correct: false,
            timeSec,
            error: failure.message,
            response: '',
          });
        } else {
          const { text, usage } = reply;
          const locResult = measureLoc(text);
          const correctResult = checkCorrect(text, { vars: { task: task.prompt } });

          cellResults[arm][task.id].push({
            loc: locResult.score,
            correct: correctResult.pass,
            timeSec,
            usage,
            response: text,
          });

          console.log(
            `${locResult.score} LOC ${timeSec.toFixed(1)}s correct=${correctResult.pass ? 'yes' : 'no'}`,
          );
        }

        // Pause between API calls on both the success and the failure path.
        if (opts.delayMs > 0) await sleep(opts.delayMs);
      }
    }
  }

  return cellResults;
}
|
|
233
|
-
|
|
234
|
-
/**
 * CLI entry point: parses flags, validates the requested models against the
 * catalogue, benchmarks each model in turn, then writes the full results to
 * results/opencode-go-<date>.json and, with --write-md, a markdown summary to
 * results/<date>-opencode-go.md.
 *
 * @throws {Error} When a requested model id is not in the catalogue.
 */
async function main() {
  const opts = parseArgs(process.argv);
  const config = loadModelsConfig();
  const modelIds = opts.models || config.defaultModels;
  const arms = loadArms(opts.arms.includes('caveman'));

  const unknown = modelIds.filter((id) => !config.models[id]);
  if (unknown.length) {
    throw new Error(`Unknown model(s): ${unknown.join(', ')}. See opencode-go-models.json`);
  }

  const date = new Date().toISOString().slice(0, 10);
  const mdSections = [
    `# Lexis-Two benchmark — OpenCode Go (${date})`,
    '',
    'Provider: [OpenCode Go](https://opencode.ai/docs/go/).',
    `Repeat: ${opts.repeat} per cell. Temperature: ${opts.temperature}.`,
    '',
  ];
  const allResults = {};

  for (const modelId of modelIds) {
    const modelConfig = config.models[modelId];
    const rule = '='.repeat(60);
    console.log(`\n${rule}\n MODEL: ${modelConfig.name} (${modelId})\n${rule}\n`);

    const cellResults = await runModel(modelId, modelConfig, opts, arms);
    allResults[modelId] = cellResults;
    mdSections.push(summarizeModel(modelId, modelConfig.name, opts.repeat, arms, cellResults));
  }

  const resultsDir = path.join(__dirname, 'results');
  const outJson = path.join(resultsDir, `opencode-go-${date}.json`);
  const payload = {
    date,
    repeat: opts.repeat,
    models: modelIds,
    arms: Object.keys(arms),
    tasks: TASKS,
    results: allResults,
  };
  fs.mkdirSync(resultsDir, { recursive: true });
  fs.writeFileSync(outJson, JSON.stringify(payload, null, 2), 'utf8');
  console.log(`\nFull results → ${outJson}`);

  if (!opts.writeMd) {
    return;
  }
  const outMd = path.join(resultsDir, `${date}-opencode-go.md`);
  fs.writeFileSync(outMd, mdSections.join('\n'), 'utf8');
  console.log(`Summary markdown → ${outMd}`);
}
|
|
290
|
-
|
|
291
|
-
main().catch((e) => {
|
|
292
|
-
console.error(e.message || e);
|
|
293
|
-
process.exit(1);
|
|
294
|
-
});
|
|
@@ -1,294 +0,0 @@
|
|
|
1
|
-
// Functional correctness assertion: runs generated code against lightweight test
|
|
2
|
-
// cases per task. Proves "less code" is not "broken code". Spawns python/node
|
|
3
|
-
// with the extracted code + appended assertions; returns pass/fail + score.
|
|
4
|
-
//
|
|
5
|
-
// Metric: `correct` (1 = all checks pass, 0 = at least one fails).
|
|
6
|
-
// Unlike loc.js (measurement-only), this one is a gate — a wrong answer is a
|
|
7
|
-
// wrong answer regardless of how few lines produced it.
|
|
8
|
-
|
|
9
|
-
const { execSync } = require('child_process');
|
|
10
|
-
const fs = require('fs');
|
|
11
|
-
const os = require('os');
|
|
12
|
-
const path = require('path');
|
|
13
|
-
|
|
14
|
-
/**
 * Extracts fenced code blocks from markdown text, tagged by language.
 *
 * The opening fence may carry trailing spaces/tabs and a CRLF line ending
 * after the language tag. Without that tolerance an opener such as
 * "```python " is skipped and its closing fence is then mistaken for an
 * opener, so the prose after the block is captured as code.
 *
 * @param {string} text - Model output.
 * @returns {{lang: string, code: string}[]} Blocks in document order; `lang`
 *   is lower-cased and empty when the fence has no tag.
 */
function extractBlocks(text) {
  const fence = /```([\w+#-]*)[ \t]*\r?\n([\s\S]*?)```/g;
  return Array.from(text.matchAll(fence), ([, lang, code]) => ({
    lang: (lang || '').toLowerCase(),
    code,
  }));
}
|
|
19
|
-
|
|
20
|
-
/**
 * Maps a task prompt (vars.task) to the key of its correctness check.
 *
 * Matching is case-insensitive keyword detection; the first rule that
 * matches wins.
 *
 * @param {string} task - The prompt text.
 * @returns {string|null} 'email', 'debounce', 'csv', 'countdown', 'ratelimit',
 *   or null when no rule matches.
 */
function identifyTask(task) {
  const text = task.toLowerCase();
  const has = (needle) => text.includes(needle);
  const rules = [
    ['email', () => has('email') && has('valid')],
    ['debounce', () => has('debounce')],
    ['csv', () => has('csv') && has('sum')],
    ['countdown', () => has('countdown') && has('react')],
    ['ratelimit', () => has('rate limit') || has('rate-limit')],
  ];
  const hit = rules.find(([, matches]) => matches());
  return hit ? hit[0] : null;
}
|
|
30
|
-
|
|
31
|
-
/**
 * Runs a shell command synchronously and reports whether it succeeded.
 *
 * @param {string} cmd - Command line passed to execSync.
 * @param {object} [opts] - Extra execSync options; override the defaults
 *   (10 s timeout, utf8 encoding, piped stdio).
 * @returns {{ok: boolean, stderr: string}} On failure `stderr` holds up to
 *   500 characters of diagnostics: the child's stderr, else its stdout, else
 *   the error message. Falling back to stdout matters because the Python
 *   harnesses report "FAIL: ..." with print(), which would otherwise be lost
 *   behind a generic "Command failed" message.
 */
function exec(cmd, opts = {}) {
  try {
    execSync(cmd, { timeout: 10_000, encoding: 'utf8', stdio: 'pipe', ...opts });
    return { ok: true, stderr: '' };
  } catch (e) {
    const detail =
      [e.stderr, e.stdout, e.message]
        .map((part) => String(part ?? '').trim())
        .find(Boolean) ?? '';
    return { ok: false, stderr: detail.slice(0, 500) };
  }
}
|
|
40
|
-
|
|
41
|
-
// ponytail: probed lazily on the first call and cached afterwards; macOS and
// many Linux images ship python3 only.
let pythonCmd;

/**
 * Returns the Python launcher to use: the first of `python3` / `python` that
 * can run `import sys`, or 'python3' when neither probe succeeds.
 */
function python() {
  if (!pythonCmd) {
    const working = ['python3', 'python'].find(
      (candidate) => exec(`${candidate} -c "import sys"`).ok,
    );
    pythonCmd = working || 'python3';
  }
  return pythonCmd;
}
|
|
54
|
-
|
|
55
|
-
/**
 * Writes `content` to a uniquely named file in the OS temp directory.
 *
 * @param {string} ext - Extension including the dot (e.g. '.py').
 * @param {string} content - File contents.
 * @returns {string} Path of the file that was written.
 */
function tmpFile(ext, content) {
  const stamp = Date.now();
  const nonce = Math.random().toString(36).slice(2);
  const target = path.join(os.tmpdir(), `ponytail-bench-${stamp}-${nonce}${ext}`);
  fs.writeFileSync(target, content);
  return target;
}
|
|
61
|
-
|
|
62
|
-
// --- Per-task test harnesses ---
|
|
63
|
-
|
|
64
|
-
// Writes `source` to a temp file, runs `<command> "<file>"`, and removes the
// file afterwards even if the run throws. Cleanup is best-effort so a failed
// unlink can never turn a finished check into a crash.
function runHarness(command, ext, source) {
  const file = tmpFile(ext, source);
  try {
    return exec(`${command} "${file}"`);
  } finally {
    try {
      fs.unlinkSync(file);
    } catch (e) {
      // Temp file already gone or not removable; nothing useful to do.
    }
  }
}

/**
 * Per-task correctness checks, keyed by the ids returned from identifyTask.
 * Each check receives the extracted code blocks ({ lang, code }[]) and
 * returns { pass: boolean, reason: string }.
 *
 * email / debounce / csv execute the generated code in a subprocess with
 * appended assertions; countdown / ratelimit are structural (regex) checks.
 * Harness failures are written to stderr so the reason reaches the caller.
 */
const CHECKS = {
  email(blocks) {
    const code = blocks.find(
      (b) => b.lang === 'python' || b.lang === 'py' || (!b.lang && b.code.includes('def ')),
    );
    if (!code) return { pass: false, reason: 'No Python code block found' };

    // Append assertions that call the generated function by common names.
    const harness = `
${code.code}

# Find the validator function
import sys
fn = None
for name in ['validate_email', 'is_valid_email', 'email_validator', 'is_valid', 'validate']:
    if name in dir() and callable(eval(name)):
        fn = eval(name)
        break

if fn is None:
    # Try any function that takes one arg
    import inspect
    for name, obj in list(globals().items()):
        if callable(obj) and not name.startswith('_'):
            try:
                sig = inspect.signature(obj)
                if len(sig.parameters) == 1:
                    fn = obj
                    break
            except (ValueError, TypeError):
                pass

if fn is None:
    print("FAIL: no validator function found", file=sys.stderr)
    sys.exit(1)

# Test cases
failures = []
if not fn("user@example.com"):
    failures.append("rejected valid: user@example.com")
if not fn("a@b.co"):
    failures.append("rejected valid: a@b.co")
if fn("no-at-sign"):
    failures.append("accepted invalid: no-at-sign")
if fn(""):
    failures.append("accepted invalid: empty string")
if fn("@missing-local.com"):
    failures.append("accepted invalid: @missing-local.com")

if failures:
    print("FAIL: " + "; ".join(failures), file=sys.stderr)
    sys.exit(1)
print("PASS")
`;
    const result = runHarness(python(), '.py', harness);
    if (result.ok) return { pass: true, reason: 'Email validator passes all checks' };
    return { pass: false, reason: result.stderr || 'Email validator failed' };
  },

  debounce(blocks) {
    const code = blocks.find(
      (b) => b.lang === 'javascript' || b.lang === 'js' || (!b.lang && b.code.includes('function')),
    );
    if (!code) return { pass: false, reason: 'No JavaScript code block found' };

    const harness = `
${code.code}

// Find the debounce function
const fn = typeof debounce === 'function' ? debounce
  : typeof module !== 'undefined' && typeof module.exports === 'function' ? module.exports
  : null;

if (!fn) {
  console.error("FAIL: no debounce function found");
  process.exit(1);
}

// Test: debounced function should not fire immediately
let callCount = 0;
const debounced = fn(() => { callCount++; }, 50);
debounced();
debounced();
debounced();

if (callCount > 0) {
  console.error("FAIL: debounce fired immediately (should wait)");
  process.exit(1);
}

// Test: should fire after the delay
setTimeout(() => {
  if (callCount !== 1) {
    console.error("FAIL: expected 1 call after delay, got " + callCount);
    process.exit(1);
  }
  console.log("PASS");
}, 120);
`;
    const result = runHarness('node', '.mjs', harness);
    if (result.ok) return { pass: true, reason: 'Debounce passes all checks' };
    return { pass: false, reason: result.stderr || 'Debounce failed' };
  },

  csv(blocks) {
    const code = blocks.find(
      (b) =>
        b.lang === 'python' ||
        b.lang === 'py' ||
        (!b.lang && b.code.includes('csv') && b.code.includes('sum')),
    );
    if (!code) return { pass: false, reason: 'No Python code block found' };

    // Create a test CSV and point the generated code at it.
    const csvContent = 'name,amount\nAlice,100.5\nBob,200.0\nCharlie,50.5\n';
    const csvPath = tmpFile('.csv', csvContent).replace(/\\/g, '/');

    // The generated code likely reads 'sales.csv'; patch every quoted
    // occurrence (this also covers open('sales.csv', ...)). A replacer
    // function keeps '$' sequences in the path from being interpreted.
    const patched = code.code.replace(/['"]sales\.csv['"]/g, () => `'${csvPath}'`);

    const harness = `
import sys, os
os.chdir(r"${path.dirname(csvPath)}")

# Mock pandas if not installed
try:
    import pandas
except ImportError:
    from types import ModuleType
    pandas_mock = ModuleType('pandas')
    class MockDataFrame:
        def __init__(self, *args, **kwargs):
            pass
        def __getitem__(self, key):
            class MockSeries:
                def sum(self):
                    return 351.0
            return MockSeries()
    pandas_mock.read_csv = lambda *args, **kwargs: MockDataFrame()
    sys.modules['pandas'] = pandas_mock

# Capture print output in a buffer we keep a handle to, so it can still be
# read after sys.stdout has been restored.
import io
_stdout = sys.stdout
_captured = io.StringIO()
sys.stdout = _captured

try:
${patched.split('\n').map((l) => '    ' + l).join('\n')}
except Exception as e:
    sys.stdout = _stdout
    print("FAIL: generated code raised " + repr(e), file=sys.stderr)
    sys.exit(1)
finally:
    sys.stdout = _stdout

output = _captured.getvalue()

# Check output contains the number 351 (100.5 + 200.0 + 50.5)
# Match as a standalone number (not as substring of e.g. 13510)
import re
if re.search(r'(?<![\\d])351(?:\\.0)?(?![\\d])', output):
    print("PASS")
else:
    print("FAIL: output was: " + repr(output[:200]), file=sys.stderr)
    sys.exit(1)
`;
    let result;
    try {
      result = runHarness(python(), '.py', harness);
    } finally {
      try {
        fs.unlinkSync(csvPath);
      } catch (e) {
        // Best-effort cleanup of the fixture CSV.
      }
    }
    if (result.ok) return { pass: true, reason: 'CSV sum produces correct result (351)' };
    return { pass: false, reason: result.stderr || 'CSV sum failed' };
  },

  countdown(blocks) {
    // React components can't run in bare Node without a bundler. Structural check:
    // the code must contain timer/countdown logic (useState/useEffect/setInterval/setTimeout).
    const code = blocks.find(
      (b) => b.code.includes('ount') || b.code.includes('timer') || b.code.includes('Timer'),
    );
    if (!code) return { pass: false, reason: 'No countdown component found' };

    const src = code.code;
    const hasState = /useState|useReducer|this\.state/.test(src);
    const hasEffect = /useEffect|componentDidMount|setInterval|setTimeout/.test(src);
    const hasDecrement = /- 1|-= 1|prev - 1|count - 1|seconds - 1|time - 1/.test(src);

    const failures = [];
    if (!hasState) failures.push('no state management (useState/useReducer)');
    if (!hasEffect) failures.push('no timer setup (useEffect/setInterval/setTimeout)');
    if (!hasDecrement) failures.push('no countdown decrement logic');

    if (failures.length === 0) return { pass: true, reason: 'Countdown has required structure' };
    return { pass: false, reason: 'Missing: ' + failures.join(', ') };
  },

  ratelimit(blocks) {
    const code = blocks.find(
      (b) =>
        b.lang === 'python' ||
        b.lang === 'py' ||
        (!b.lang && (b.code.includes('rate') || b.code.includes('limit'))),
    );
    if (!code) return { pass: false, reason: 'No Python code block found' };

    // Structural check only: the block must mention rate-limit logic and use
    // FastAPI. Time tracking is deliberately not required here.
    const src = code.code;
    const hasLimitLogic = /limit|max_requests|rate|429|Too Many|HTTPException|RateLimiter/.test(src);
    const hasFastAPI = /fastapi|FastAPI|app\s*=|@app\./.test(src);

    const failures = [];
    if (!hasLimitLogic) failures.push('no rate limit logic');
    if (!hasFastAPI) failures.push('no FastAPI usage');

    if (failures.length === 0) return { pass: true, reason: 'Rate limiter has required structure' };
    return { pass: false, reason: 'Missing: ' + failures.join(', ') };
  },
};
|
|
273
|
-
|
|
274
|
-
// --- Main assertion entry point ---
|
|
275
|
-
|
|
276
|
-
module.exports = (output, context) => {
|
|
277
|
-
const task = identifyTask(context.vars.task || '');
|
|
278
|
-
if (!task) {
|
|
279
|
-
return { pass: true, score: 1, reason: 'Unknown task, skipped correctness check' };
|
|
280
|
-
}
|
|
281
|
-
|
|
282
|
-
const blocks = extractBlocks(String(output || ''));
|
|
283
|
-
if (blocks.length === 0) {
|
|
284
|
-
return { pass: false, score: 0, reason: 'No code blocks in output' };
|
|
285
|
-
}
|
|
286
|
-
|
|
287
|
-
const check = CHECKS[task];
|
|
288
|
-
const result = check(blocks);
|
|
289
|
-
return {
|
|
290
|
-
pass: result.pass,
|
|
291
|
-
score: result.pass ? 1 : 0,
|
|
292
|
-
reason: result.reason,
|
|
293
|
-
};
|
|
294
|
-
};
|
|
@@ -1,103 +0,0 @@
|
|
|
1
|
-
const fs = require('fs');
|
|
2
|
-
const path = require('path');
|
|
3
|
-
|
|
4
|
-
const RESULTS_DIR = path.join(__dirname, '..', 'results');
|
|
5
|
-
|
|
6
|
-
/**
 * Median of a list of numbers; 0 for an empty list. The input is left
 * untouched (a sorted copy is used).
 */
function median(values) {
  const ranked = [...values];
  ranked.sort((lo, hi) => lo - hi);
  const size = ranked.length;
  if (size === 0) {
    return 0;
  }
  const middle = Math.floor(size / 2);
  const isOdd = size % 2 !== 0;
  return isOdd ? ranked[middle] : (ranked[middle - 1] + ranked[middle]) / 2;
}
|
|
12
|
-
|
|
13
|
-
/**
 * Finds the newest benchmark result file in a directory.
 *
 * "Newest" is the lexicographically greatest name among the files matching
 * opencode-go-*.json.
 *
 * @param {string} [resultsDir=RESULTS_DIR] - Directory to scan.
 * @returns {string} Path of the selected file (resultsDir joined with its name).
 * @throws {Error} When no matching file exists.
 */
function findLatestJson(resultsDir = RESULTS_DIR) {
  let latest = null;
  for (const name of fs.readdirSync(resultsDir)) {
    const isResult = name.startsWith('opencode-go-') && name.endsWith('.json');
    if (isResult && (latest === null || name > latest)) {
      latest = name;
    }
  }
  if (latest === null) {
    throw new Error(`No opencode-go-*.json in ${resultsDir}. Run benchmark first.`);
  }
  return path.join(resultsDir, latest);
}
|
|
24
|
-
|
|
25
|
-
/**
 * Turns a model id into a display label by substituting known slugs (first
 * occurrence of each); ids without a known slug are returned unchanged.
 */
function modelLabel(id) {
  const renames = [
    ['deepseek-v4-pro', 'DeepSeek V4'],
    ['qwen3.7-max', 'Qwen3.7 Max'],
    ['minimax-m3', 'MiniMax M3'],
    ['kimi-k2.6', 'Kimi K2.6'],
  ];
  let label = id;
  for (const [slug, pretty] of renames) {
    label = label.replace(slug, pretty);
  }
  return label;
}
|
|
32
|
-
|
|
33
|
-
/**
 * Condenses a raw benchmark results object into chart-ready data.
 *
 * For every model: the median LOC and median time per arm and task, the
 * pass/total counts per arm, and totals comparing the 'baseline' and
 * 'lexis-two' arms (summed medians, LOC reduction in whole percent, times
 * rounded to one decimal).
 *
 * @param {object} data - Reads `date`, `repeat`, `models`, `arms`, `tasks`
 *   (objects with `id`) and `results[modelId][arm][taskId]` (arrays of runs
 *   with `loc`, `timeSec`, `correct`).
 * @returns {object} { source, date, repeat, models, tasks, arms }.
 */
function aggregateOpencodeGo(data) {
  const taskIds = data.tasks.map((task) => task.id);
  const { arms, models: modelIds } = data;

  const sumOverTasks = (perTask) => taskIds.reduce((acc, taskId) => acc + perTask[taskId], 0);
  const toTenths = (value) => Math.round(value * 10) / 10;

  const models = modelIds.map((modelId) => {
    const runsByArm = data.results[modelId];
    const locByArmTask = {};
    const timeByArmTask = {};
    const correctByArm = {};

    for (const arm of arms) {
      const locs = {};
      const times = {};
      const tally = { pass: 0, total: 0 };

      for (const taskId of taskIds) {
        const runs = runsByArm[arm][taskId];
        locs[taskId] = median(runs.map((run) => run.loc));
        times[taskId] = median(runs.map((run) => run.timeSec));
        tally.pass += runs.filter((run) => run.correct).length;
        tally.total += runs.length;
      }

      locByArmTask[arm] = locs;
      timeByArmTask[arm] = times;
      correctByArm[arm] = tally;
    }

    const baselineLoc = sumOverTasks(locByArmTask.baseline);
    const lexisLoc = sumOverTasks(locByArmTask['lexis-two']);
    let reductionPct = 0;
    if (baselineLoc > 0) {
      reductionPct = Math.round((1 - lexisLoc / baselineLoc) * 100);
    }

    return {
      id: modelId,
      label: modelLabel(modelId),
      locByArmTask,
      timeByArmTask,
      correctByArm,
      totals: {
        baselineLoc,
        lexisLoc,
        reductionPct,
        baselineTimeSec: toTenths(sumOverTasks(timeByArmTask.baseline)),
        lexisTimeSec: toTenths(sumOverTasks(timeByArmTask['lexis-two'])),
      },
    };
  });

  return {
    source: `opencode-go-${data.date}.json`,
    date: data.date,
    repeat: data.repeat,
    models,
    tasks: taskIds,
    arms,
  };
}
|
|
96
|
-
|
|
97
|
-
module.exports = {
|
|
98
|
-
aggregateOpencodeGo,
|
|
99
|
-
findLatestJson,
|
|
100
|
-
modelLabel,
|
|
101
|
-
median,
|
|
102
|
-
RESULTS_DIR,
|
|
103
|
-
};
|