@draig/lexis-two 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. package/package.json +1 -1
  2. package/.claude-plugin/marketplace.json +0 -29
  3. package/.claude-plugin/plugin.json +0 -9
  4. package/.codex-plugin/plugin.json +0 -31
  5. package/.env.example +0 -8
  6. package/.github/FUNDING.yml +0 -1
  7. package/.github/copilot-instructions.md +0 -47
  8. package/.github/plugin/marketplace.json +0 -20
  9. package/.github/plugin/plugin.json +0 -16
  10. package/.github/workflows/deploy-site.yml +0 -53
  11. package/.github/workflows/test.yml +0 -29
  12. package/AUDIT.md +0 -74
  13. package/SPECXIS.md +0 -576
  14. package/benchmarks/README.md +0 -114
  15. package/benchmarks/arms/baseline.js +0 -2
  16. package/benchmarks/arms/caveman-SKILL.md +0 -67
  17. package/benchmarks/arms/caveman.js +0 -8
  18. package/benchmarks/arms/lexis-two.js +0 -10
  19. package/benchmarks/arms/ponytail.js +0 -6
  20. package/benchmarks/behavior.js +0 -58
  21. package/benchmarks/behavior.yaml +0 -40
  22. package/benchmarks/benchmark-local.py +0 -156
  23. package/benchmarks/benchmark-opencode-go.js +0 -294
  24. package/benchmarks/correctness.js +0 -294
  25. package/benchmarks/lib/aggregate-opencode-go.js +0 -103
  26. package/benchmarks/lib/load-env.js +0 -31
  27. package/benchmarks/lib/opencode-go-client.js +0 -151
  28. package/benchmarks/loc.js +0 -13
  29. package/benchmarks/opencode-go-models.json +0 -31
  30. package/benchmarks/promptfooconfig.yaml +0 -41
  31. package/benchmarks/prompts.json +0 -15
  32. package/benchmarks/render-opencode-go-report.js +0 -28
  33. package/benchmarks/results/2026-06-15-llama3.2-local.md +0 -76
  34. package/benchmarks/results/2026-06-16-opencode-go.md +0 -56
  35. package/benchmarks/results/opencode-go-2026-06-16-report.html +0 -226
  36. package/benchmarks/results/opencode-go-2026-06-16.json +0 -1339
  37. package/docs/assets/lexis-two-nobg.png +0 -0
  38. package/docs/assets/logo.png +0 -0
  39. package/docs/assets/logo.svg +0 -4
  40. package/docs/portability.md +0 -147
  41. package/docs/site.md +0 -52
  42. package/gemini-extension.json +0 -7
  43. package/pi-extension/index.js +0 -161
  44. package/pi-extension/package.json +0 -8
  45. package/pi-extension/test/extension.test.js +0 -89
  46. package/pi-extension/test/helpers.test.js +0 -35
  47. package/scripts/check-rule-copies.js +0 -82
  48. package/site/astro.config.mjs +0 -18
  49. package/site/package-lock.json +0 -4913
  50. package/site/package.json +0 -14
  51. package/site/public/CNAME +0 -1
  52. package/site/public/assets/lexis-two-nobg.png +0 -0
  53. package/site/public/assets/logo.png +0 -0
  54. package/site/public/assets/logo.svg +0 -4
  55. package/site/public/robots.txt +0 -4
  56. package/site/src/components/Adapt.astro +0 -33
  57. package/site/src/components/Benchmarks.astro +0 -232
  58. package/site/src/components/Commands.astro +0 -33
  59. package/site/src/components/Ecosystem.astro +0 -30
  60. package/site/src/components/Example.astro +0 -77
  61. package/site/src/components/Footer.astro +0 -28
  62. package/site/src/components/Header.astro +0 -87
  63. package/site/src/components/Hero.astro +0 -58
  64. package/site/src/components/Home.astro +0 -46
  65. package/site/src/components/Hosts.astro +0 -62
  66. package/site/src/components/Install.astro +0 -139
  67. package/site/src/components/LanguageSwitcher.astro +0 -82
  68. package/site/src/components/Philosophy.astro +0 -23
  69. package/site/src/components/Stacks.astro +0 -33
  70. package/site/src/components/Suggested.astro +0 -39
  71. package/site/src/data/opencode-go-benchmark.json +0 -230
  72. package/site/src/i18n/en.ts +0 -155
  73. package/site/src/i18n/es.ts +0 -158
  74. package/site/src/i18n/index.ts +0 -14
  75. package/site/src/layouts/Layout.astro +0 -114
  76. package/site/src/pages/benchmarks.astro +0 -4
  77. package/site/src/pages/es/benchmarks.astro +0 -4
  78. package/site/src/pages/es/index.astro +0 -10
  79. package/site/src/pages/index.astro +0 -10
  80. package/site/src/styles/global.css +0 -780
  81. package/site/tsconfig.json +0 -3
  82. package/tests/behavior.test.js +0 -80
  83. package/tests/commands.test.js +0 -40
  84. package/tests/copilot-plugin.test.js +0 -33
  85. package/tests/correctness.test.js +0 -191
  86. package/tests/gemini-extension.test.js +0 -78
  87. package/tests/hooks-windows.test.js +0 -48
  88. package/tests/hooks.test.js +0 -177
  89. package/tests/opencode-plugin.test.js +0 -64
@@ -1,294 +0,0 @@
1
- // Functional correctness assertion: runs generated code against lightweight test
2
- // cases per task. Proves "less code" is not "broken code". Spawns python/node
3
- // with the extracted code + appended assertions; returns pass/fail + score.
4
- //
5
- // Metric: `correct` (1 = all checks pass, 0 = at least one fails).
6
- // Unlike loc.js (measurement-only), this one is a gate — a wrong answer is a
7
- // wrong answer regardless of how few lines produced it.
8
-
9
- const { execSync } = require('child_process');
10
- const fs = require('fs');
11
- const os = require('os');
12
- const path = require('path');
13
-
14
// Extract fenced code blocks, tagged by language.
//
// Returns one `{ lang, code }` entry per fenced block, in document order.
// `lang` is the lower-cased fence tag ('' when the fence is untagged) and
// `code` is the raw text between the fences, including its trailing newline.
//
// The tag character class matches the one used by the LOC metric, so both
// assertions agree on what counts as a block. A narrower class (`\w*`) would
// reject tags such as `c++` and then mistake that block's closing fence for
// the opening fence of the prose that follows it.
function extractBlocks(text) {
  const fence = /```([a-zA-Z0-9_+-]*)\n([\s\S]*?)```/g;
  return Array.from(text.matchAll(fence), ([, lang, code]) => ({
    lang: lang.toLowerCase(),
    code,
  }));
}
19
-
20
// Map the free-text task prompt (vars.task) to the key of its checker.
// Rules are tried in order and the first match wins; matching is
// case-insensitive. Returns null when no rule recognises the prompt.
function identifyTask(task) {
  const text = task.toLowerCase();
  const mentions = (needle) => text.includes(needle);

  const rules = [
    ['email', () => mentions('email') && mentions('valid')],
    ['debounce', () => mentions('debounce')],
    ['csv', () => mentions('csv') && mentions('sum')],
    ['countdown', () => mentions('countdown') && mentions('react')],
    ['ratelimit', () => mentions('rate limit') || mentions('rate-limit')],
  ];

  const hit = rules.find(([, applies]) => applies());
  return hit ? hit[0] : null;
}
30
-
31
// Run a shell command synchronously and report the outcome without throwing.
// Defaults: 10 s timeout, utf8 output, piped stdio; `opts` overrides any of them.
// Returns { ok: true, stderr: '' } on success, otherwise { ok: false, stderr }
// where stderr is the first 500 characters of the child's stderr (or of the
// error message when stderr is empty).
function exec(cmd, opts = {}) {
  let failure = null;
  try {
    const settings = { timeout: 10_000, encoding: 'utf8', stdio: 'pipe', ...opts };
    execSync(cmd, settings);
  } catch (err) {
    failure = err;
  }

  if (failure === null) {
    return { ok: true, stderr: '' };
  }
  const detail = failure.stderr || failure.message || '';
  return { ok: false, stderr: detail.slice(0, 500) };
}
40
-
41
// Name of the Python interpreter to use. Probed lazily on the first call and
// cached afterwards; macOS and many Linux images ship python3 only, so that
// name is tried first and is also the fallback when neither probe succeeds.
let pythonCmd;
function python() {
  if (!pythonCmd) {
    const working = ['python3', 'python'].find(
      (candidate) => exec(`${candidate} -c "import sys"`).ok,
    );
    pythonCmd = working || 'python3';
  }
  return pythonCmd;
}
54
-
55
// Create a uniquely named file in the OS temp directory holding `content`.
// `ext` is appended verbatim (include the leading dot). Returns the file path;
// the caller is responsible for deleting it.
function tmpFile(ext, content) {
  const stamp = Date.now();
  const nonce = Math.random().toString(36).slice(2);
  const target = path.join(os.tmpdir(), `ponytail-bench-${stamp}-${nonce}${ext}`);
  fs.writeFileSync(target, content);
  return target;
}
61
-
62
- // --- Per-task test harnesses ---
63
-
64
// Fence tags accepted as Python.
const PYTHON_TAGS = ['python', 'py'];

// Pick the first block whose fence tag is in `tags`, or - for untagged
// blocks only - whose code satisfies `looksRight`.
function pickBlock(blocks, tags, looksRight) {
  return blocks.find((b) => tags.includes(b.lang) || (!b.lang && looksRight(b.code)));
}

// Write `source` to a temp file, run it with `interpreter`, and always remove
// the temp file afterwards. Returns the { ok, stderr } result of exec().
function runHarness(ext, source, interpreter) {
  const file = tmpFile(ext, source);
  try {
    return exec(`${interpreter} "${file}"`);
  } finally {
    fs.rmSync(file, { force: true });
  }
}

// One checker per task id. Each takes the extracted code blocks and returns
// { pass, reason }. email/debounce/csv execute the generated code in a child
// process; countdown/ratelimit are structural (regex) checks only.
//
// Harness failures are reported on stderr because exec() only surfaces the
// child's stderr; anything printed to stdout would be lost.
const CHECKS = {
  email(blocks) {
    const code = pickBlock(blocks, PYTHON_TAGS, (src) => src.includes('def '));
    if (!code) return { pass: false, reason: 'No Python code block found' };

    // Append assertions that call the generated function by common names.
    const harness = `
${code.code}

# Find the validator function
import sys
fn = None
for name in ['validate_email', 'is_valid_email', 'email_validator', 'is_valid', 'validate']:
    if name in dir() and callable(eval(name)):
        fn = eval(name)
        break

if fn is None:
    # Try any function that takes one arg
    import inspect
    for name, obj in list(globals().items()):
        if callable(obj) and not name.startswith('_'):
            try:
                sig = inspect.signature(obj)
                if len(sig.parameters) == 1:
                    fn = obj
                    break
            except (ValueError, TypeError):
                pass

if fn is None:
    print("FAIL: no validator function found", file=sys.stderr)
    sys.exit(1)

# Test cases
failures = []
if not fn("user@example.com"):
    failures.append("rejected valid: user@example.com")
if not fn("a@b.co"):
    failures.append("rejected valid: a@b.co")
if fn("no-at-sign"):
    failures.append("accepted invalid: no-at-sign")
if fn(""):
    failures.append("accepted invalid: empty string")
if fn("@missing-local.com"):
    failures.append("accepted invalid: @missing-local.com")

if failures:
    print("FAIL: " + "; ".join(failures), file=sys.stderr)
    sys.exit(1)
print("PASS")
`;
    const result = runHarness('.py', harness, python());
    if (result.ok) return { pass: true, reason: 'Email validator passes all checks' };
    return { pass: false, reason: result.stderr || 'Email validator failed' };
  },

  debounce(blocks) {
    const code = pickBlock(blocks, ['javascript', 'js'], (src) => src.includes('function'));
    if (!code) return { pass: false, reason: 'No JavaScript code block found' };

    const harness = `
${code.code}

// Find the debounce function
const fn = typeof debounce === 'function' ? debounce
  : typeof module !== 'undefined' && typeof module.exports === 'function' ? module.exports
  : null;

if (!fn) {
  console.error("FAIL: no debounce function found");
  process.exit(1);
}

// Test: debounced function should not fire immediately
let callCount = 0;
const debounced = fn(() => { callCount++; }, 50);
debounced();
debounced();
debounced();

if (callCount > 0) {
  console.error("FAIL: debounce fired immediately (should wait)");
  process.exit(1);
}

// Test: should fire after the delay
setTimeout(() => {
  if (callCount !== 1) {
    console.error("FAIL: expected 1 call after delay, got " + callCount);
    process.exit(1);
  }
  console.log("PASS");
}, 120);
`;
    const result = runHarness('.mjs', harness, 'node');
    if (result.ok) return { pass: true, reason: 'Debounce passes all checks' };
    return { pass: false, reason: result.stderr || 'Debounce failed' };
  },

  csv(blocks) {
    const code = pickBlock(
      blocks,
      PYTHON_TAGS,
      (src) => src.includes('csv') && src.includes('sum'),
    );
    if (!code) return { pass: false, reason: 'No Python code block found' };

    // Create a test CSV and wrap the generated code so it reads it.
    const csvContent = 'name,amount\nAlice,100.5\nBob,200.0\nCharlie,50.5\n';
    const csvPath = tmpFile('.csv', csvContent).replace(/\\/g, '/');

    try {
      // The generated code likely reads 'sales.csv'; patch every quoted
      // occurrence (this also covers open('sales.csv') calls). A replacer
      // function keeps `$` sequences in the path from being interpreted.
      const patched = code.code.replace(/['"]sales\.csv['"]/g, () => `'${csvPath}'`);
      const indented = patched.split('\n').map((l) => '    ' + l).join('\n');

      // NOTE: when pandas is not installed the mock below returns 351.0
      // unconditionally, so pandas-based answers are not really verified
      // in that environment.
      const harness = `
import sys, os
os.chdir(r"${path.dirname(csvPath)}")

# Mock pandas if not installed
try:
    import pandas
except ImportError:
    from types import ModuleType
    pandas_mock = ModuleType('pandas')
    class MockDataFrame:
        def __init__(self, *args, **kwargs):
            pass
        def __getitem__(self, key):
            class MockSeries:
                def sum(self):
                    return 351.0
            return MockSeries()
    pandas_mock.read_csv = lambda *args, **kwargs: MockDataFrame()
    sys.modules['pandas'] = pandas_mock

# Capture print output. The buffer is kept in its own variable so it can
# still be read after sys.stdout has been restored.
import io
_stdout = sys.stdout
_buf = io.StringIO()
_error = None
sys.stdout = _buf

try:
${indented}
except Exception as e:
    _error = e
finally:
    sys.stdout = _stdout

output = _buf.getvalue()

# Check output contains the number 351 (100.5 + 200.0 + 50.5)
# Match as a standalone number (not as substring of e.g. 13510)
import re
if re.search(r'(?<![\\d])351(?:\\.0)?(?![\\d])', output):
    print("PASS")
else:
    detail = "output was: " + repr(output[:200])
    if _error is not None:
        detail += "; raised: " + repr(_error)
    print("FAIL: " + detail, file=sys.stderr)
    sys.exit(1)
`;
      const result = runHarness('.py', harness, python());
      if (result.ok) return { pass: true, reason: 'CSV sum produces correct result (351)' };
      return { pass: false, reason: result.stderr || 'CSV sum failed' };
    } finally {
      fs.rmSync(csvPath, { force: true });
    }
  },

  countdown(blocks) {
    // React components can't run in bare Node without a bundler. Structural check:
    // the code must contain state, a timer/effect, and some decrement-by-one.
    const code = blocks.find(
      (b) => b.code.includes('ount') || b.code.includes('timer') || b.code.includes('Timer'),
    );
    if (!code) return { pass: false, reason: 'No countdown component found' };

    const src = code.code;
    const hasState = /useState|useReducer|this\.state/.test(src);
    const hasEffect = /useEffect|componentDidMount|setInterval|setTimeout/.test(src);
    // Accept `x - 1`, `x-1`, `x -= 1` and `x--`; spacing is a style choice.
    const hasDecrement = /-=?\s*1|--/.test(src);

    const failures = [];
    if (!hasState) failures.push('no state management (useState/useReducer)');
    if (!hasEffect) failures.push('no timer setup (useEffect/setInterval/setTimeout)');
    if (!hasDecrement) failures.push('no countdown decrement logic');

    if (failures.length === 0) return { pass: true, reason: 'Countdown has required structure' };
    return { pass: false, reason: 'Missing: ' + failures.join(', ') };
  },

  ratelimit(blocks) {
    const code = pickBlock(
      blocks,
      PYTHON_TAGS,
      (src) => src.includes('rate') || src.includes('limit'),
    );
    if (!code) return { pass: false, reason: 'No Python code block found' };

    // Structural check only: the code must mention some limiting construct
    // and must be wired into FastAPI. Time tracking is deliberately not
    // required, since library-based limiters do not expose it.
    const src = code.code;
    const hasLimitLogic = /limit|max_requests|rate|429|Too Many|HTTPException|RateLimiter/.test(src);
    const hasFastAPI = /fastapi|FastAPI|app\s*=|@app\./.test(src);

    const failures = [];
    if (!hasLimitLogic) failures.push('no rate limit logic');
    if (!hasFastAPI) failures.push('no FastAPI usage');

    if (failures.length === 0) return { pass: true, reason: 'Rate limiter has required structure' };
    return { pass: false, reason: 'Missing: ' + failures.join(', ') };
  },
};
273
-
274
- // --- Main assertion entry point ---
275
-
276
- module.exports = (output, context) => {
277
- const task = identifyTask(context.vars.task || '');
278
- if (!task) {
279
- return { pass: true, score: 1, reason: 'Unknown task, skipped correctness check' };
280
- }
281
-
282
- const blocks = extractBlocks(String(output || ''));
283
- if (blocks.length === 0) {
284
- return { pass: false, score: 0, reason: 'No code blocks in output' };
285
- }
286
-
287
- const check = CHECKS[task];
288
- const result = check(blocks);
289
- return {
290
- pass: result.pass,
291
- score: result.pass ? 1 : 0,
292
- reason: result.reason,
293
- };
294
- };
@@ -1,103 +0,0 @@
1
- const fs = require('fs');
2
- const path = require('path');
3
-
4
- const RESULTS_DIR = path.join(__dirname, '..', 'results');
5
-
6
// Median of a list of numbers; 0 for an empty list. The input is not mutated.
// For an even count the two middle values are averaged.
function median(values) {
  const ordered = [...values];
  ordered.sort((x, y) => x - y);

  const count = ordered.length;
  if (count === 0) {
    return 0;
  }

  const upper = Math.floor(count / 2);
  if (count % 2) {
    return ordered[upper];
  }
  return (ordered[upper - 1] + ordered[upper]) / 2;
}
12
-
13
// Path of the opencode-go-*.json file in `resultsDir` whose name sorts last
// (names embed the run date, so that is the most recent run).
// Throws when the directory contains no such file.
function findLatestJson(resultsDir = RESULTS_DIR) {
  const candidates = [];
  for (const entry of fs.readdirSync(resultsDir)) {
    if (entry.startsWith('opencode-go-') && entry.endsWith('.json')) {
      candidates.push(entry);
    }
  }

  if (candidates.length === 0) {
    throw new Error(`No opencode-go-*.json in ${resultsDir}. Run benchmark first.`);
  }

  candidates.sort();
  const latest = candidates[candidates.length - 1];
  return path.join(resultsDir, latest);
}
24
-
25
// Turn a model id into its display label by substituting the known id
// fragments in order. Text around a fragment is kept, and ids without a
// known fragment are returned unchanged.
function modelLabel(id) {
  const substitutions = [
    ['deepseek-v4-pro', 'DeepSeek V4'],
    ['qwen3.7-max', 'Qwen3.7 Max'],
    ['minimax-m3', 'MiniMax M3'],
    ['kimi-k2.6', 'Kimi K2.6'],
  ];
  return substitutions.reduce(
    (label, [fragment, display]) => label.replace(fragment, display),
    id,
  );
}
32
-
33
// Condense raw benchmark results into the structure used for charts.
//
// Reads data.tasks[].id, data.arms, data.models, data.date, data.repeat and
// data.results[model][arm][task] (an array of runs with loc, timeSec and
// correct). For every model it records the per-arm, per-task median LOC and
// median time, the pass/total correctness counts per arm, and totals that
// compare the 'baseline' arm against the 'lexis-two' arm.
function aggregateOpencodeGo(data) {
  const taskIds = data.tasks.map((task) => task.id);
  const { arms, models } = data;

  // Sum one arm's per-task values across all tasks.
  const sumAcrossTasks = (perTask) => taskIds.reduce((acc, id) => acc + perTask[id], 0);
  const toOneDecimal = (x) => Math.round(x * 10) / 10;

  const chart = {
    source: `opencode-go-${data.date}.json`,
    date: data.date,
    repeat: data.repeat,
    models: [],
    tasks: taskIds,
    arms,
  };

  models.forEach((modelId) => {
    const perModel = data.results[modelId];
    const locByArmTask = {};
    const timeByArmTask = {};
    const correctByArm = {};

    arms.forEach((arm) => {
      const locs = {};
      const times = {};
      locByArmTask[arm] = locs;
      timeByArmTask[arm] = times;

      const tally = { pass: 0, total: 0 };
      taskIds.forEach((taskId) => {
        const runs = perModel[arm][taskId];
        locs[taskId] = median(runs.map((run) => run.loc));
        times[taskId] = median(runs.map((run) => run.timeSec));
        tally.pass += runs.filter((run) => run.correct).length;
        tally.total += runs.length;
      });
      correctByArm[arm] = tally;
    });

    const baselineLoc = sumAcrossTasks(locByArmTask.baseline);
    const lexisLoc = sumAcrossTasks(locByArmTask['lexis-two']);
    // Percentage of baseline LOC saved; 0 when the baseline produced no code.
    let reductionPct = 0;
    if (baselineLoc > 0) {
      reductionPct = Math.round((1 - lexisLoc / baselineLoc) * 100);
    }

    const baselineTime = sumAcrossTasks(timeByArmTask.baseline);
    const lexisTime = sumAcrossTasks(timeByArmTask['lexis-two']);

    chart.models.push({
      id: modelId,
      label: modelLabel(modelId),
      locByArmTask,
      timeByArmTask,
      correctByArm,
      totals: {
        baselineLoc,
        lexisLoc,
        reductionPct,
        baselineTimeSec: toOneDecimal(baselineTime),
        lexisTimeSec: toOneDecimal(lexisTime),
      },
    });
  });

  return chart;
}
96
-
97
- module.exports = {
98
- aggregateOpencodeGo,
99
- findLatestJson,
100
- modelLabel,
101
- median,
102
- RESULTS_DIR,
103
- };
@@ -1,31 +0,0 @@
1
- // Load KEY=VALUE lines from a .env file into process.env (no dotenv dependency).
2
- const fs = require('fs');
3
-
4
// Load KEY=VALUE lines from the file at `envPath` into process.env.
//
// Returns false when the file does not exist, true otherwise.
// - A leading UTF-8 BOM, blank lines and `#` comment lines are ignored.
// - Lines without `=` or with an empty key are skipped.
// - A leading `export ` (shell style) is accepted and dropped.
// - Variables already present in process.env are never overwritten.
// - One pair of matching surrounding quotes (single or double) is removed.
function loadEnvFile(envPath) {
  if (!fs.existsSync(envPath)) return false;

  const raw = fs.readFileSync(envPath, 'utf8').replace(/^\uFEFF/, '');
  for (const line of raw.split(/\r?\n/)) {
    const entry = line.trim().replace(/^export\s+/, '');
    if (!entry || entry.startsWith('#')) continue;

    const eq = entry.indexOf('=');
    if (eq <= 0) continue;

    const key = entry.slice(0, eq).trim();
    if (process.env[key] !== undefined) continue;

    let value = entry.slice(eq + 1).trim();
    const quote = value[0];
    // Require at least two characters so a lone quote character is kept as a
    // value instead of being "unquoted" into an empty string.
    if (value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote)) {
      value = value.slice(1, -1);
    }
    process.env[key] = value;
  }

  return true;
}
30
-
31
- module.exports = { loadEnvFile };
@@ -1,151 +0,0 @@
1
- // OpenCode Go API client — OpenAI chat/completions + Anthropic /messages transports.
2
- // Docs: https://opencode.ai/docs/go/#endpoints
3
-
4
- const fs = require('fs');
5
- const path = require('path');
6
-
7
- const DEFAULT_BASE = 'https://opencode.ai/zen/go/v1';
8
- const ENV_PATH = path.join(__dirname, '..', '..', '.env');
9
-
10
// Return the OpenCode Go API key from the environment, trimmed.
// OPENCODE_API_KEY is preferred; OPENCODE_GO_API_KEY is the fallback.
// Throws with a setup hint (worded according to whether the .env file
// exists) when neither variable holds a non-blank value.
function getApiKey() {
  const configured = process.env.OPENCODE_API_KEY || process.env.OPENCODE_GO_API_KEY || '';
  const key = configured.trim();
  if (key) {
    return key;
  }

  let hint;
  if (fs.existsSync(ENV_PATH)) {
    hint = `OPENCODE_API_KEY is empty in ${ENV_PATH}. Paste your OpenCode Go key after the = sign.`;
  } else {
    hint = `Create ${ENV_PATH} from .env.example and set OPENCODE_API_KEY=your-key`;
  }
  throw new Error(`Missing OPENCODE_API_KEY. ${hint}`);
}
20
-
21
// Read a fetch Response and return its parsed JSON body.
// An empty body yields {}; a body that is not valid JSON yields { raw: text }.
// For non-2xx responses, throws Error('HTTP <status>: <message>') where the
// message is the first available of: body.error.message, body.message, a
// string body.error, the first 500 characters of the body, the status text.
async function readJsonResponse(res) {
  const text = await res.text();

  let body = {};
  if (text) {
    try {
      body = JSON.parse(text);
    } catch {
      body = { raw: text };
    }
  }

  if (res.ok) {
    return body;
  }

  const candidates = [
    body?.error?.message,
    body?.message,
    typeof body?.error === 'string' ? body.error : null,
    text.slice(0, 500),
  ];
  const msg = candidates.find(Boolean) || res.statusText;
  throw new Error(`HTTP ${res.status}: ${msg}`);
}
40
-
41
// Text of the first choice in an OpenAI chat/completions response body,
// or '' when any part of that path is missing or null.
function extractOpenAiText(body) {
  const firstChoice = body?.choices?.[0];
  const content = firstChoice?.message?.content;
  return content ?? '';
}
44
-
45
// Concatenated text of all `text` content blocks in an Anthropic /messages
// response body. Blocks of other types, or without a string `text`, are
// skipped. Returns '' when body.content is not an array.
function extractAnthropicText(body) {
  const blocks = body?.content;
  if (!Array.isArray(blocks)) {
    return '';
  }

  let combined = '';
  for (const block of blocks) {
    if (block?.type === 'text' && typeof block.text === 'string') {
      combined += block.text;
    }
  }
  return combined;
}
53
-
54
// Normalise token usage from a response body to { inputTokens, outputTokens }.
// The 'openai-chat' transport reports prompt_tokens/completion_tokens; every
// other transport is read as input_tokens/output_tokens. Missing counters
// default to 0. Returns null when the body carries no usage object.
function extractUsage(body, transport) {
  const usage = body?.usage;
  if (!usage) {
    return null;
  }

  const [inputField, outputField] =
    transport === 'openai-chat'
      ? ['prompt_tokens', 'completion_tokens']
      : ['input_tokens', 'output_tokens'];

  return {
    inputTokens: usage[inputField] ?? 0,
    outputTokens: usage[outputField] ?? 0,
  };
}
70
-
71
// Send one chat turn through the OpenAI-compatible /chat/completions endpoint.
// `system` is optional and, when present, is sent as a leading system message.
// Authenticates with a Bearer token. Resolves to { text, usage, raw };
// rejects with the error from readJsonResponse on a non-2xx reply.
async function chatOpenAi({ baseUrl, apiKey, model, system, user, maxTokens, temperature }) {
  const userTurn = { role: 'user', content: user };
  const messages = system ? [{ role: 'system', content: system }, userTurn] : [userTurn];

  const request = {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${apiKey}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model,
      messages,
      max_tokens: maxTokens,
      temperature,
    }),
  };

  const res = await fetch(`${baseUrl}/chat/completions`, request);
  const body = await readJsonResponse(res);

  const text = extractOpenAiText(body);
  const usage = extractUsage(body, 'openai-chat');
  return { text, usage, raw: body };
}
97
-
98
// Send one chat turn through the Anthropic-compatible /messages endpoint.
// `system` is included only when truthy and `temperature` only when it is a
// number. Resolves to { text, usage, raw }; rejects with the error from
// readJsonResponse on a non-2xx reply.
async function chatAnthropic({ baseUrl, apiKey, model, system, user, maxTokens, temperature }) {
  const payload = {
    model,
    max_tokens: maxTokens,
    messages: [{ role: 'user', content: user }],
    ...(system ? { system } : {}),
    ...(typeof temperature === 'number' ? { temperature } : {}),
  };

  // lexis: Go /messages rejects Bearer — x-api-key only (qwen3.7-max, minimax-m3)
  const request = {
    method: 'POST',
    headers: {
      'x-api-key': apiKey,
      'Content-Type': 'application/json',
      'anthropic-version': '2023-06-01',
    },
    body: JSON.stringify(payload),
  };

  const res = await fetch(`${baseUrl}/messages`, request);
  const body = await readJsonResponse(res);

  const text = extractAnthropicText(body);
  const usage = extractUsage(body, 'anthropic-messages');
  return { text, usage, raw: body };
}
125
-
126
// Run one completion for `modelId` over the transport named in
// modelConfig.transport ('anthropic-messages' or 'openai-chat').
// maxTokens comes from modelConfig (default 8192); baseUrl, apiKey and
// temperature fall back to DEFAULT_BASE, getApiKey() and 1.
// Throws for an unrecognised transport.
async function complete({
  modelId,
  modelConfig,
  system,
  user,
  baseUrl = DEFAULT_BASE,
  apiKey = getApiKey(),
  temperature = 1,
}) {
  const maxTokens = modelConfig.maxTokens ?? 8192;
  const args = { baseUrl, apiKey, model: modelId, system, user, maxTokens, temperature };

  const senders = new Map([
    ['anthropic-messages', chatAnthropic],
    ['openai-chat', chatOpenAi],
  ]);
  const send = senders.get(modelConfig.transport);
  if (!send) {
    throw new Error(`Unknown transport for ${modelId}: ${modelConfig.transport}`);
  }
  return send(args);
}
146
-
147
- module.exports = {
148
- DEFAULT_BASE,
149
- getApiKey,
150
- complete,
151
- };
package/benchmarks/loc.js DELETED
@@ -1,13 +0,0 @@
1
- // Deterministic code-size metric: non-blank, non-comment lines of code. Counts
2
- // fenced blocks, or the whole response when the model emitted bare code unfenced.
3
- // Recorded as the `code_loc` metric per arm (always passes; it is a measurement, not a gate).
4
- module.exports = (output) => {
5
- const text = String(output || '');
6
- const blocks = [...text.matchAll(/```[a-zA-Z0-9_+-]*\n([\s\S]*?)```/g)].map((m) => m[1]);
7
- const code = blocks.length ? blocks.join('\n') : text;
8
- const loc = code
9
- .split('\n')
10
- .map((l) => l.trim())
11
- .filter((l) => l && !l.startsWith('//') && !l.startsWith('#') && l !== '*/' && !l.startsWith('/*') && !l.startsWith('*')).length;
12
- return { pass: true, score: loc, reason: loc + ' code LOC' };
13
- };
@@ -1,31 +0,0 @@
1
- {
2
- "baseUrl": "https://opencode.ai/zen/go/v1",
3
- "defaultModels": [
4
- "kimi-k2.6",
5
- "deepseek-v4-pro",
6
- "qwen3.7-max",
7
- "minimax-m3"
8
- ],
9
- "models": {
10
- "kimi-k2.6": {
11
- "name": "Kimi K2.6",
12
- "transport": "openai-chat",
13
- "maxTokens": 8192
14
- },
15
- "deepseek-v4-pro": {
16
- "name": "DeepSeek V4 Pro",
17
- "transport": "openai-chat",
18
- "maxTokens": 8192
19
- },
20
- "qwen3.7-max": {
21
- "name": "Qwen3.7 Max",
22
- "transport": "anthropic-messages",
23
- "maxTokens": 8192
24
- },
25
- "minimax-m3": {
26
- "name": "MiniMax M3",
27
- "transport": "anthropic-messages",
28
- "maxTokens": 8192
29
- }
30
- }
31
- }
@@ -1,41 +0,0 @@
1
- # Ponytail benchmark: code size + cost across three arms, same model, same tasks.
2
- #
3
- # Run: npx promptfoo@latest eval -c benchmarks/promptfooconfig.yaml
4
- # View: npx promptfoo@latest view
5
- # Share: npx promptfoo@latest share (publishes a hosted report URL)
6
- #
7
- # Needs ANTHROPIC_API_KEY in the environment or a .env file (see benchmarks/README.md).
8
- # Caveman arm uses JuliusBrussee/caveman SKILL.md (MIT), vendored at arms/caveman-SKILL.md.
9
- description: "Ponytail vs caveman vs no-skill: same model, same tasks. Measures code LOC (deterministic) and tokens/cost (API telemetry)."
10
-
11
- providers:
12
- - id: anthropic:messages:claude-haiku-4-5-20251001
13
- config: { max_tokens: 8192, temperature: 1 }
14
- - id: anthropic:messages:claude-sonnet-4-6
15
- config: { max_tokens: 8192, temperature: 1 }
16
- - id: anthropic:messages:claude-opus-4-8
17
- config: { max_tokens: 8192, temperature: 1 }
18
-
19
- prompts:
20
- - id: file://arms/baseline.js
21
- label: baseline (no skill)
22
- - id: file://arms/caveman.js
23
- label: caveman
24
- - id: file://arms/ponytail.js
25
- label: ponytail
26
-
27
- defaultTest:
28
- assert:
29
- - type: javascript
30
- value: file://loc.js
31
- metric: code_loc
32
- - type: javascript
33
- value: file://correctness.js
34
- metric: correct
35
-
36
- tests:
37
- - vars: { task: "Write me a Python function that validates email addresses." }
38
- - vars: { task: "Add debounce to a search input in vanilla JavaScript. It currently fires an API call on every keystroke." }
39
- - vars: { task: "Write Python code that reads sales.csv and sums the 'amount' column." }
40
- - vars: { task: "Build me a countdown timer component in React that counts down from a given number of seconds." }
41
- - vars: { task: "Add rate limiting to my FastAPI endpoint so users can't spam it." }
@@ -1,15 +0,0 @@
1
- {
2
- "method": "One fresh Claude Code subagent per task x config, same model, no file outputs. Metrics from task telemetry: total tokens (includes thinking), duration. Code lines counted from fenced blocks in the deliverable.",
3
- "configs": [
4
- "baseline — no skill",
5
- "caveman — caveman SKILL.md (full) as operating instructions",
6
- "ponytail — ponytail SKILL.md (full) as operating instructions"
7
- ],
8
- "tasks": [
9
- { "id": "email", "prompt": "Write me a Python function that validates email addresses." },
10
- { "id": "debounce", "prompt": "Add debounce to a search input in vanilla JavaScript — it currently fires an API call on every keystroke." },
11
- { "id": "csv-sum", "prompt": "Write Python code that reads sales.csv and sums the 'amount' column." },
12
- { "id": "react-countdown", "prompt": "Build me a countdown timer component in React that counts down from a given number of seconds." },
13
- { "id": "rate-limit", "prompt": "Add rate limiting to my FastAPI endpoint so users can't spam it." }
14
- ]
15
- }