@draig/lexis-two 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agents/plugins/marketplace.json +21 -0
- package/.claude-plugin/marketplace.json +29 -0
- package/.claude-plugin/plugin.json +9 -0
- package/.clinerules/lexis-two.md +163 -0
- package/.codex-plugin/plugin.json +31 -0
- package/.cursor/rules/lexis-two.mdc +169 -0
- package/.env.example +8 -0
- package/.github/FUNDING.yml +1 -0
- package/.github/copilot-instructions.md +47 -0
- package/.github/plugin/marketplace.json +20 -0
- package/.github/plugin/plugin.json +16 -0
- package/.github/workflows/deploy-site.yml +53 -0
- package/.github/workflows/test.yml +29 -0
- package/.kiro/steering/lexis-two.md +167 -0
- package/.nojekyll +0 -0
- package/.opencode/command/lexis-two-audit.md +5 -0
- package/.opencode/command/lexis-two-debt.md +5 -0
- package/.opencode/command/lexis-two-help.md +5 -0
- package/.opencode/command/lexis-two-plan.md +5 -0
- package/.opencode/command/lexis-two-review.md +5 -0
- package/.opencode/command/lexis-two-security.md +5 -0
- package/.opencode/command/lexis-two.md +5 -0
- package/.opencode/plugins/lexis-two.mjs +74 -0
- package/.windsurf/rules/lexis-two.md +163 -0
- package/AGENTS.md +163 -0
- package/AUDIT.md +74 -0
- package/CNAME +1 -0
- package/LICENSE +23 -0
- package/README.md +301 -0
- package/SPECXIS.md +576 -0
- package/assets/benchmark-3model.svg +21 -0
- package/assets/lexis-two-complete.webp +0 -0
- package/assets/lexis-two-nobg.png +0 -0
- package/assets/logo.png +0 -0
- package/assets/social-preview.png +0 -0
- package/benchmarks/README.md +114 -0
- package/benchmarks/arms/baseline.js +2 -0
- package/benchmarks/arms/caveman-SKILL.md +67 -0
- package/benchmarks/arms/caveman.js +8 -0
- package/benchmarks/arms/lexis-two.js +10 -0
- package/benchmarks/arms/ponytail.js +6 -0
- package/benchmarks/behavior.js +58 -0
- package/benchmarks/behavior.yaml +40 -0
- package/benchmarks/benchmark-local.py +156 -0
- package/benchmarks/benchmark-opencode-go.js +294 -0
- package/benchmarks/correctness.js +294 -0
- package/benchmarks/lib/aggregate-opencode-go.js +103 -0
- package/benchmarks/lib/load-env.js +31 -0
- package/benchmarks/lib/opencode-go-client.js +151 -0
- package/benchmarks/loc.js +13 -0
- package/benchmarks/opencode-go-models.json +31 -0
- package/benchmarks/promptfooconfig.yaml +41 -0
- package/benchmarks/prompts.json +15 -0
- package/benchmarks/render-opencode-go-report.js +28 -0
- package/benchmarks/results/2026-06-15-llama3.2-local.md +76 -0
- package/benchmarks/results/2026-06-16-opencode-go.md +56 -0
- package/benchmarks/results/opencode-go-2026-06-16-report.html +226 -0
- package/benchmarks/results/opencode-go-2026-06-16.json +1339 -0
- package/commands/lexis-two-audit.toml +3 -0
- package/commands/lexis-two-debt.toml +3 -0
- package/commands/lexis-two-help.toml +3 -0
- package/commands/lexis-two-plan.toml +3 -0
- package/commands/lexis-two-review.toml +3 -0
- package/commands/lexis-two-security.toml +3 -0
- package/commands/lexis-two.toml +3 -0
- package/docs/assets/lexis-two-nobg.png +0 -0
- package/docs/assets/logo.png +0 -0
- package/docs/assets/logo.svg +4 -0
- package/docs/portability.md +147 -0
- package/docs/site.md +52 -0
- package/examples/api-endpoint.md +68 -0
- package/examples/caching.md +74 -0
- package/examples/date-picker.md +48 -0
- package/examples/email-validation.md +51 -0
- package/examples/sorting.md +42 -0
- package/gemini-extension.json +7 -0
- package/hooks/copilot-hooks.json +21 -0
- package/hooks/hooks.json +31 -0
- package/hooks/lexis-two-activate.js +72 -0
- package/hooks/lexis-two-config.js +101 -0
- package/hooks/lexis-two-instructions.js +126 -0
- package/hooks/lexis-two-mode-tracker.js +55 -0
- package/hooks/lexis-two-runtime.js +50 -0
- package/hooks/lexis-two-statusline.ps1 +19 -0
- package/hooks/lexis-two-statusline.sh +11 -0
- package/opencode.json +4 -0
- package/package.json +31 -0
- package/pi-extension/index.js +161 -0
- package/pi-extension/package.json +8 -0
- package/pi-extension/test/extension.test.js +89 -0
- package/pi-extension/test/helpers.test.js +35 -0
- package/scripts/check-rule-copies.js +82 -0
- package/site/astro.config.mjs +18 -0
- package/site/package-lock.json +4913 -0
- package/site/package.json +14 -0
- package/site/public/CNAME +1 -0
- package/site/public/assets/lexis-two-nobg.png +0 -0
- package/site/public/assets/logo.png +0 -0
- package/site/public/assets/logo.svg +4 -0
- package/site/public/robots.txt +4 -0
- package/site/src/components/Adapt.astro +33 -0
- package/site/src/components/Benchmarks.astro +232 -0
- package/site/src/components/Commands.astro +33 -0
- package/site/src/components/Ecosystem.astro +30 -0
- package/site/src/components/Example.astro +77 -0
- package/site/src/components/Footer.astro +28 -0
- package/site/src/components/Header.astro +87 -0
- package/site/src/components/Hero.astro +58 -0
- package/site/src/components/Home.astro +46 -0
- package/site/src/components/Hosts.astro +62 -0
- package/site/src/components/Install.astro +143 -0
- package/site/src/components/LanguageSwitcher.astro +82 -0
- package/site/src/components/Philosophy.astro +23 -0
- package/site/src/components/Stacks.astro +33 -0
- package/site/src/components/Suggested.astro +39 -0
- package/site/src/data/opencode-go-benchmark.json +230 -0
- package/site/src/i18n/en.ts +155 -0
- package/site/src/i18n/es.ts +158 -0
- package/site/src/i18n/index.ts +14 -0
- package/site/src/layouts/Layout.astro +114 -0
- package/site/src/pages/benchmarks.astro +4 -0
- package/site/src/pages/es/benchmarks.astro +4 -0
- package/site/src/pages/es/index.astro +10 -0
- package/site/src/pages/index.astro +10 -0
- package/site/src/styles/global.css +780 -0
- package/site/tsconfig.json +3 -0
- package/skills/lexis-two/SKILL.md +109 -0
- package/skills/lexis-two-audit/SKILL.md +21 -0
- package/skills/lexis-two-debt/SKILL.md +22 -0
- package/skills/lexis-two-plan/SKILL.md +25 -0
- package/skills/lexis-two-review/SKILL.md +24 -0
- package/skills/lexis-two-security/SKILL.md +24 -0
- package/tests/behavior.test.js +80 -0
- package/tests/commands.test.js +40 -0
- package/tests/copilot-plugin.test.js +33 -0
- package/tests/correctness.test.js +191 -0
- package/tests/gemini-extension.test.js +78 -0
- package/tests/hooks-windows.test.js +48 -0
- package/tests/hooks.test.js +177 -0
- package/tests/opencode-plugin.test.js +64 -0
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# Benchmark
|
|
2
|
+
|
|
3
|
+
Five everyday tasks, **baseline vs lexis-two** (`skills/lexis-two/SKILL.md`). Code LOC from fenced blocks; correctness gate via `correctness.js`.
|
|
4
|
+
|
|
5
|
+
## OpenCode Go (recommended)
|
|
6
|
+
|
|
7
|
+
Uses the same models you run in OpenCode today. Default set in `opencode-go-models.json`:
|
|
8
|
+
|
|
9
|
+
| Model ID | Name | Transport |
|
|
10
|
+
| -------- | ---- | --------- |
|
|
11
|
+
| `kimi-k2.6` | Kimi K2.6 | OpenAI `/chat/completions` |
|
|
12
|
+
| `deepseek-v4-pro` | DeepSeek V4 Pro | OpenAI `/chat/completions` |
|
|
13
|
+
| `qwen3.7-max` | Qwen3.7 Max | Anthropic `/messages` |
|
|
14
|
+
| `minimax-m3` | MiniMax M3 | Anthropic `/messages` |
|
|
15
|
+
|
|
16
|
+
[OpenCode Go docs](https://opencode.ai/docs/go/) — subscribe, copy API key, `/connect` in TUI.
|
|
17
|
+
|
|
18
|
+
### Run
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
cp .env.example .env # OPENCODE_API_KEY=...
|
|
22
|
+
node benchmarks/benchmark-opencode-go.js --repeat 3 --write-md
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Single model:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
node benchmarks/benchmark-opencode-go.js --model kimi-k2.6 --repeat 10 --write-md
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Subset:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
node benchmarks/benchmark-opencode-go.js --models kimi-k2.6,deepseek-v4-pro --repeat 5
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Optional third arm:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
node benchmarks/benchmark-opencode-go.js --caveman --repeat 3
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Outputs:
|
|
44
|
+
|
|
45
|
+
- `benchmarks/results/opencode-go-YYYY-MM-DD.json` — full responses + usage
|
|
46
|
+
- `benchmarks/results/YYYY-MM-DD-opencode-go.md` — summary tables (with `--write-md`)
|
|
47
|
+
- `site/src/data/opencode-go-benchmark.json` — chart data for the Astro site (`npm run benchmark:report`)
|
|
48
|
+
- Live charts: `https://lexis-two.excelso.xyz/benchmarks/` (after `npm run site:build` + deploy)
|
|
49
|
+
|
|
50
|
+
**Publish site/README metrics only from a committed `results/*-opencode-go.md` for the lexis-two arm.**
|
|
51
|
+
|
|
52
|
+
### Adding models later (Gemini, Claude, OpenAI)
|
|
53
|
+
|
|
54
|
+
1. Add entry to `opencode-go-models.json` (or a new `providers/*.json` when you split harnesses).
|
|
55
|
+
2. Set `transport`: `openai-chat` or `anthropic-messages` per [Go endpoints](https://opencode.ai/docs/go/#endpoints).
|
|
56
|
+
3. Re-run and commit a new `results/` file.
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Legacy: Claude via promptfoo
|
|
61
|
+
|
|
62
|
+
Historical ponytail-arm results (2026-06-13) — **not Lexis-Two**. `arms/ponytail.js` now reads `skills/lexis-two/SKILL.md` until the promptfoo labels are renamed.
|
|
63
|
+
|
|
64
|
+
Requires `ANTHROPIC_API_KEY` and **Node.js ≥ 22.22.0**:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
npx promptfoo@latest eval -c benchmarks/promptfooconfig.yaml --repeat 10
|
|
68
|
+
npx promptfoo@latest view
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Median results (10 runs, 2026-06-13) — ponytail arm only
|
|
72
|
+
|
|
73
|
+
| arm | Haiku LOC | Sonnet LOC | Opus LOC |
|
|
74
|
+
| --- | --------: | ---------: | -------: |
|
|
75
|
+
| baseline | 518 | 693 | 256 |
|
|
76
|
+
| caveman | 116 | 120 | 67 |
|
|
77
|
+
| ponytail | 39 | 44 | 51 |
|
|
78
|
+
|
|
79
|
+
Versus baseline, ponytail wrote **80-94% less code** in that run. Do not cite as Lexis-Two until re-run via `benchmark-opencode-go.js`.
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## Local models (Ollama)
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
python benchmarks/benchmark-local.py --model llama3.2 --repeat 3
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Arms: baseline, caveman, lexis-two.
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## Tasks
|
|
94
|
+
|
|
95
|
+
Email validator, JS debounce, CSV sum, React countdown, FastAPI rate-limit (`promptfooconfig.yaml`).
|
|
96
|
+
|
|
97
|
+
## Metrics
|
|
98
|
+
|
|
99
|
+
| File | Metric | Behavior |
|
|
100
|
+
| ---- | ------ | -------- |
|
|
101
|
+
| `loc.js` | `loc` | Line count (measurement) |
|
|
102
|
+
| `correctness.js` | `correct` | Gate — broken one-liners fail |
|
|
103
|
+
|
|
104
|
+
### Prerequisites
|
|
105
|
+
|
|
106
|
+
- **OpenCode Go:** Node.js 18+, `OPENCODE_API_KEY`
|
|
107
|
+
- **promptfoo:** Node.js ≥ 22.22.0, `ANTHROPIC_API_KEY`, Python 3 for correctness checks
|
|
108
|
+
- **Ollama:** Python 3, local Ollama
|
|
109
|
+
|
|
110
|
+
## Notes
|
|
111
|
+
|
|
112
|
+
- Go usage limits apply ($12/5h etc.) — use `--delay-ms` and start with `--repeat 3`.
|
|
113
|
+
- `qwen3.7-max` and `minimax-m3` use Anthropic `/messages` with header **`x-api-key`** (not `Authorization: Bearer`).
|
|
114
|
+
- Real sessions inject the skill once (cached); benchmark re-sends the full skill each call — pessimistic on tokens.
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: caveman
|
|
3
|
+
description: >
|
|
4
|
+
Ultra-compressed communication mode. Cuts token usage ~75% by speaking like caveman
|
|
5
|
+
while keeping full technical accuracy. Supports intensity levels: lite, full (default), ultra,
|
|
6
|
+
wenyan-lite, wenyan-full, wenyan-ultra.
|
|
7
|
+
Use when user says "caveman mode", "talk like caveman", "use caveman", "less tokens",
|
|
8
|
+
"be brief", or invokes /caveman. Also auto-triggers when token efficiency is requested.
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
Respond terse like smart caveman. All technical substance stay. Only fluff die.
|
|
12
|
+
|
|
13
|
+
## Persistence
|
|
14
|
+
|
|
15
|
+
ACTIVE EVERY RESPONSE. No revert after many turns. No filler drift. Still active if unsure. Off only: "stop caveman" / "normal mode".
|
|
16
|
+
|
|
17
|
+
Default: **full**. Switch: `/caveman lite|full|ultra`.
|
|
18
|
+
|
|
19
|
+
## Rules
|
|
20
|
+
|
|
21
|
+
Drop: articles (a/an/the), filler (just/really/basically/actually/simply), pleasantries (sure/certainly/of course/happy to), hedging. Fragments OK. Short synonyms (big not extensive, fix not "implement a solution for"). Technical terms exact. Code blocks unchanged. Errors quoted exact.
|
|
22
|
+
|
|
23
|
+
Pattern: `[thing] [action] [reason]. [next step].`
|
|
24
|
+
|
|
25
|
+
Not: "Sure! I'd be happy to help you with that. The issue you're experiencing is likely caused by..."
|
|
26
|
+
Yes: "Bug in auth middleware. Token expiry check use `<` not `<=`. Fix:"
|
|
27
|
+
|
|
28
|
+
## Intensity
|
|
29
|
+
|
|
30
|
+
| Level | What change |
|
|
31
|
+
|-------|------------|
|
|
32
|
+
| **lite** | No filler/hedging. Keep articles + full sentences. Professional but tight |
|
|
33
|
+
| **full** | Drop articles, fragments OK, short synonyms. Classic caveman |
|
|
34
|
+
| **ultra** | Abbreviate (DB/auth/config/req/res/fn/impl), strip conjunctions, arrows for causality (X → Y), one word when one word enough |
|
|
35
|
+
| **wenyan-lite** | Semi-classical. Drop filler/hedging but keep grammar structure, classical register |
|
|
36
|
+
| **wenyan-full** | Maximum classical terseness. Fully 文言文. 80-90% character reduction. Classical sentence patterns, verbs precede objects, subjects often omitted, classical particles (之/乃/為/其) |
|
|
37
|
+
| **wenyan-ultra** | Extreme abbreviation while keeping classical Chinese feel. Maximum compression, ultra terse |
|
|
38
|
+
|
|
39
|
+
Example — "Why React component re-render?"
|
|
40
|
+
- lite: "Your component re-renders because you create a new object reference each render. Wrap it in `useMemo`."
|
|
41
|
+
- full: "New object ref each render. Inline object prop = new ref = re-render. Wrap in `useMemo`."
|
|
42
|
+
- ultra: "Inline obj prop → new ref → re-render. `useMemo`."
|
|
43
|
+
- wenyan-lite: "組件頻重繪,以每繪新生對象參照故。以 useMemo 包之。"
|
|
44
|
+
- wenyan-full: "物出新參照,致重繪。useMemo .Wrap之。"
|
|
45
|
+
- wenyan-ultra: "新參照→重繪。useMemo Wrap。"
|
|
46
|
+
|
|
47
|
+
Example — "Explain database connection pooling."
|
|
48
|
+
- lite: "Connection pooling reuses open connections instead of creating new ones per request. Avoids repeated handshake overhead."
|
|
49
|
+
- full: "Pool reuse open DB connections. No new connection per request. Skip handshake overhead."
|
|
50
|
+
- ultra: "Pool = reuse DB conn. Skip handshake → fast under load."
|
|
51
|
+
- wenyan-full: "池reuse open connection。不每req新開。skip handshake overhead。"
|
|
52
|
+
- wenyan-ultra: "池reuse conn。skip handshake → fast。"
|
|
53
|
+
|
|
54
|
+
## Auto-Clarity
|
|
55
|
+
|
|
56
|
+
Drop caveman for: security warnings, irreversible action confirmations, multi-step sequences where fragment order risks misread, user asks to clarify or repeats question. Resume caveman after clear part done.
|
|
57
|
+
|
|
58
|
+
Example — destructive op:
|
|
59
|
+
> **Warning:** This will permanently delete all rows in the `users` table and cannot be undone.
|
|
60
|
+
> ```sql
|
|
61
|
+
> DROP TABLE users;
|
|
62
|
+
> ```
|
|
63
|
+
> Caveman resume. Verify backup exist first.
|
|
64
|
+
|
|
65
|
+
## Boundaries
|
|
66
|
+
|
|
67
|
+
Code/commits/PRs: write normal. "stop caveman" or "normal mode": revert. Level persist until changed or session end.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
// Caveman arm: caveman SKILL.md (full) as the system prompt.
const fs = require('fs');
const path = require('path');

// Read synchronously, once, at module load; the skill file sits next to this script.
const system = fs.readFileSync(path.join(__dirname, 'caveman-SKILL.md'), 'utf8');

/**
 * Build the chat messages for one benchmark task.
 *
 * @param {{ vars: { task: * } }} ctx - only `vars.task` is read; it becomes the
 *   user message content (presumably the task prompt string; confirm against the config).
 * @returns {Array<{role: string, content: *}>} the system message (skill text)
 *   followed by the user message.
 */
module.exports = ({ vars }) => [
  { role: 'system', content: system },
  { role: 'user', content: vars.task },
];
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
// Lexis-Two arm: repo skill (full) as the system prompt.
const fs = require('fs');
const path = require('path');

// Path is resolved relative to this file: two directories up, then
// skills/lexis-two/SKILL.md. Read synchronously, once, at module load.
const system = fs.readFileSync(
  path.join(__dirname, '..', '..', 'skills', 'lexis-two', 'SKILL.md'),
  'utf8',
);

// Exports the raw skill text under the `system` key (an object, not a function).
module.exports = { system };
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
// Behavior gate: does the ponytail ruleset actually PRODUCE its refined
|
|
2
|
+
// behaviors, not just carry the text? One check per probe (vars.probe), each
|
|
3
|
+
// targeting a rule that a field review (rcstack, phases 0-8) showed mattered:
|
|
4
|
+
// hardware - "hardware is never the spec ideal, leave the calibration knob"
|
|
5
|
+
// explanation - "explanation the user explicitly asked for is not debt"
|
|
6
|
+
// onecheck - "lazy code without its check is unfinished"
|
|
7
|
+
//
|
|
8
|
+
// Heuristic graders, same spirit as loc.js / correctness.js. The graders
|
|
9
|
+
// themselves are proven by tests/behavior.test.js (RED/GREEN, no API key).
|
|
10
|
+
//
|
|
11
|
+
// Metric: `behavior` (1 = behavior present, 0 = absent).
|
|
12
|
+
|
|
13
|
+
// Collect the body of every fenced code block in `text` and join the bodies
// with a newline. Non-string / empty input is treated as the empty string.
function codeOf(text) {
  const source = String(text || '');
  const fence = /```[\w-]*\n([\s\S]*?)```/g;
  const bodies = [];
  for (const match of source.matchAll(fence)) {
    bodies.push(match[1]);
  }
  return bodies.join('\n');
}
|
|
16
|
+
|
|
17
|
+
// Return the prose part of `text`: fenced code blocks are replaced by a space,
// every whitespace run is collapsed to a single space, and the ends are trimmed.
function proseOf(text) {
  const raw = String(text || '');
  const withoutCode = raw.replace(/```[\s\S]*?```/g, ' ');
  const collapsed = withoutCode.replace(/\s+/g, ' ');
  return collapsed.trim();
}
|
|
20
|
+
|
|
21
|
+
// Heuristic graders keyed by probe name. Each grader takes the raw model
// output and returns { pass, reason }.
const CHECKS = {
  // Treats the device as non-ideal: leaves a tunable knob or flags per-unit drift.
  // A passing mention of "calibration" is not enough; it must be actionable.
  hardware(output) {
    const t = String(output || '');
    // Case-insensitive keyword scan over the whole output (prose and code alike).
    // "calibration" only counts when followed by offset/constant/param/knob.
    const drift = /\bdrift|per[- ]unit|per[- ]part|part[- ]to[- ]part|measure your own|\btare\b|\btrim\b|\bknob|\btuning\b|reads off|known (temp|reference|value)|reference (thermometer|sensor|temp)|calibration (offset|constant|param|knob)/i.test(t);
    return drift
      ? { pass: true, reason: 'Leaves a calibration knob / flags per-unit drift.' }
      : { pass: false, reason: 'Treats the hardware as ideal; no calibration knob.' };
  },

  // Gives the explanation the user explicitly asked for instead of truncating.
  explanation(output) {
    // Word count is taken over prose only (fenced code removed by proseOf).
    const p = proseOf(output);
    const words = p ? p.split(' ').length : 0;
    // "Structured" = a list marker anywhere in the raw output, OR a reasoning /
    // change verb in the prose.
    // NOTE(review): the list-marker pattern is unanchored and runs on the raw
    // output, so "* " or "- " inside a code fence also satisfies it; confirm
    // this leniency is intended.
    const structured = /(\d+[.)]\s|[-*]\s)/.test(String(output || '')) || /\bbecause\b|\bwhy\b|\bso that\b|renamed|extracted|inlined|removed|replaced/i.test(p);
    // Pass requires both: at least 45 prose words and the structure signal.
    return words >= 45 && structured
      ? { pass: true, reason: `Gave the requested write-up (${words} words of prose).` }
      : { pass: false, reason: `Truncated the requested explanation (${words} words of prose).` };
  },

  // Leaves ONE runnable check behind for non-trivial logic.
  onecheck(output) {
    const t = String(output || '');
    // Case-sensitive scan of the whole output (prose included) for Python or
    // JavaScript test/assert markers.
    const hasCheck = /\bassert\b|def\s+test_|if\s+__name__|unittest|pytest|console\.assert|\bexpect\(|\bdescribe\(|\bit\(/.test(t);
    return hasCheck
      ? { pass: true, reason: 'Left a runnable check (assert/test/demo).' }
      : { pass: false, reason: 'No runnable check left behind.' };
  },
};
|
|
51
|
+
|
|
52
|
+
module.exports = (output, context) => {
|
|
53
|
+
const probe = context && context.vars && context.vars.probe;
|
|
54
|
+
const check = CHECKS[probe];
|
|
55
|
+
if (!check) return { pass: true, score: 1, reason: `Unknown probe '${probe}', skipped` };
|
|
56
|
+
const r = check(output);
|
|
57
|
+
return { pass: r.pass, score: r.pass ? 1 : 0, reason: r.reason };
|
|
58
|
+
};
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Ponytail behavior gates: does the ruleset actually produce its refined
|
|
2
|
+
# behaviors (not just carry the text)? Probes the three rules a full-project
|
|
3
|
+
# field review (rcstack, phases 0-8) showed mattered.
|
|
4
|
+
#
|
|
5
|
+
# Run: npx promptfoo@latest eval -c benchmarks/behavior.yaml --repeat 10
|
|
6
|
+
# View: npx promptfoo@latest view
|
|
7
|
+
#
|
|
8
|
+
# Needs ANTHROPIC_API_KEY (see benchmarks/README.md). The grader (behavior.js)
|
|
9
|
+
# is proven separately by tests/behavior.test.js, which needs no API key.
|
|
10
|
+
#
|
|
11
|
+
# baseline is included as the control: the no-skill arm should mostly FAIL these
|
|
12
|
+
# gates, the ponytail arm should pass them. That delta is the point.
|
|
13
|
+
description: "Ponytail behavior gates: hardware calibration, requested explanation, one runnable check."
|
|
14
|
+
|
|
15
|
+
providers:
|
|
16
|
+
- id: anthropic:messages:claude-opus-4-8
|
|
17
|
+
config: { max_tokens: 8192, temperature: 1 }
|
|
18
|
+
|
|
19
|
+
prompts:
|
|
20
|
+
- id: file://arms/baseline.js
|
|
21
|
+
label: baseline (no skill)
|
|
22
|
+
- id: file://arms/ponytail.js
|
|
23
|
+
label: ponytail
|
|
24
|
+
|
|
25
|
+
defaultTest:
|
|
26
|
+
assert:
|
|
27
|
+
- type: javascript
|
|
28
|
+
value: file://behavior.js
|
|
29
|
+
metric: behavior
|
|
30
|
+
|
|
31
|
+
tests:
|
|
32
|
+
- vars:
|
|
33
|
+
probe: hardware
|
|
34
|
+
task: "Write a Python function that reads the temperature in Celsius from a thermistor wired to a Raspberry Pi ADC (MCP3008, channel 0)."
|
|
35
|
+
- vars:
|
|
36
|
+
probe: explanation
|
|
37
|
+
task: "Refactor this for readability and give me a detailed, step-by-step write-up of every change you made and why.\n\ndef p(d):\n r = []\n for x in d:\n if x.get('a') and x['a'] > 0:\n r.append(x['a'] * 2)\n return r"
|
|
38
|
+
- vars:
|
|
39
|
+
probe: onecheck
|
|
40
|
+
task: "Write a Python function that parses a duration string like '1h30m45s' into a total number of seconds."
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Ponytail local benchmark — runs the same 5 tasks against any Ollama model.
|
|
3
|
+
No promptfoo required. Compares baseline vs caveman vs lexis-two on code LOC
|
|
4
|
+
and wall-clock time. Results are printed as a table and saved to a JSON file.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
python benchmarks/benchmark-local.py
|
|
8
|
+
python benchmarks/benchmark-local.py --model llama3.2 --repeat 3
|
|
9
|
+
|
|
10
|
+
Prerequisites: Ollama running locally (https://ollama.com), model pulled.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
import json
|
|
15
|
+
import re
|
|
16
|
+
import time
|
|
17
|
+
import urllib.request
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
# Directory two levels above this script (the parent of its containing folder).
ROOT = Path(__file__).parent.parent

# Benchmark tasks as (task_id, user_prompt) pairs. run() sends every prompt to
# every arm and keys the results by task_id.
TASKS = [
    ("email", "Write me a Python function that validates email addresses."),
    ("debounce", "Add debounce to a search input in vanilla JavaScript. It currently fires an API call on every keystroke."),
    ("csv-sum", "Write Python code that reads sales.csv and sums the 'amount' column."),
    ("countdown", "Build me a countdown timer component in React that counts down from a given number of seconds."),
    ("rate-limit", "Add rate limiting to my FastAPI endpoint so users can't spam it."),
]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def load_arms():
    """Return the system prompt for each benchmark arm, keyed by arm name.

    ``baseline`` maps to ``None`` (no system prompt). The other arms map to
    the UTF-8 text of their skill file, resolved under ``ROOT``. Insertion
    order is baseline, caveman, lexis-two.
    """
    skill_files = {
        "caveman": ROOT / "benchmarks/arms/caveman-SKILL.md",
        "lexis-two": ROOT / "skills/lexis-two/SKILL.md",
    }
    arms = {"baseline": None}
    for name, skill_path in skill_files.items():
        arms[name] = skill_path.read_text(encoding="utf-8")
    return arms
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def count_loc(text):
    """Count non-blank, non-comment lines of code in a model response.

    If the response contains fenced code blocks, only the fence bodies are
    counted; otherwise the whole response is treated as bare code.

    A line is skipped when, after stripping whitespace, it is empty or starts
    with ``//``, ``#``, ``/*`` or ``*`` (the last also covers a closing
    ``*/``). This is a heuristic: Python docstrings are still counted, and a
    code line that begins with ``*`` is not.

    Args:
        text: Raw model response.

    Returns:
        Number of counted lines as an int.
    """
    # Tolerate trailing spaces/tabs after the info string and CRLF line
    # endings; without this such fences are missed and the fallback counts
    # the entire response, fence lines and prose included.
    blocks = re.findall(r"```[a-zA-Z0-9_+\-]*[ \t]*\r?\n([\s\S]*?)```", text)
    source = "\n".join(blocks) if blocks else text
    comment_prefixes = ("//", "#", "/*", "*")
    return sum(
        1
        for line in map(str.strip, source.splitlines())
        if line and not line.startswith(comment_prefixes)
    )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def call_ollama(model, system_prompt, user_prompt, ollama_url):
    """Send one non-streaming chat request to Ollama and time it.

    Args:
        model: Ollama model name placed in the request payload.
        system_prompt: Text for a leading system message; omitted when falsy.
        user_prompt: Text of the user message.
        ollama_url: Base URL of the Ollama server; a trailing slash is
            tolerated.

    Returns:
        Tuple ``(content, elapsed)``: the ``message.content`` field of the
        JSON response and the wall-clock seconds for the request, rounded to
        one decimal place.

    Raises:
        urllib.error.URLError: If the request fails or times out (180 s).
        KeyError: If the response JSON has no ``message.content``.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": user_prompt})

    payload = json.dumps({
        "model": model,
        "messages": messages,
        "stream": False,
        "options": {"temperature": 0.7},
    }).encode("utf-8")

    req = urllib.request.Request(
        # Strip a trailing slash so "http://host:11434/" does not become "//api/chat".
        f"{ollama_url.rstrip('/')}/api/chat",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    # perf_counter is monotonic; time.time() can jump if the system clock is
    # adjusted mid-request, which would corrupt the timing measurement.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=180) as resp:
        data = json.loads(resp.read())
    elapsed = time.perf_counter() - t0
    return data["message"]["content"], round(elapsed, 1)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def run(model, repeat, ollama_url):
    """Run every task against every arm, print median tables, save raw results.

    For each of ``repeat`` rounds, each arm from ``load_arms()`` is given each
    prompt in ``TASKS``. Median LOC and median time per (arm, task) are
    printed, followed by each non-baseline arm's LOC total relative to the
    baseline. All raw responses are written to ``benchmark-local-results.json``
    next to this script.

    Args:
        model: Ollama model name.
        repeat: Number of rounds; must be at least 1.
        ollama_url: Base URL of the Ollama server.

    Raises:
        ValueError: If ``repeat`` is less than 1 (no samples to take a median of).
    """
    import statistics  # local import: only this function needs it

    if repeat < 1:
        raise ValueError(f"repeat must be >= 1, got {repeat}")

    arms = load_arms()
    task_ids = [task_id for task_id, _ in TASKS]
    # results[arm][task_id] = list of {loc, time, response}
    results = {arm: {task_id: [] for task_id in task_ids} for arm in arms}
    total = len(arms) * len(TASKS) * repeat

    done = 0
    for run_index in range(repeat):
        for arm, system in arms.items():
            for task_id, task_prompt in TASKS:
                done += 1
                label = f"[{done}/{total}] run{run_index + 1} {arm:10s} / {task_id}"
                print(f"{label} ...", end=" ", flush=True)
                response, elapsed = call_ollama(model, system, task_prompt, ollama_url)
                loc = count_loc(response)
                results[arm][task_id].append({"loc": loc, "time": elapsed, "response": response})
                print(f"{loc} LOC {elapsed}s")

    # Medians per (arm, task). statistics.median averages the two middle
    # values for an even sample count, so LOC medians may be floats.
    med_loc = {
        arm: {t: statistics.median(s["loc"] for s in results[arm][t]) for t in task_ids}
        for arm in arms
    }
    med_time = {
        arm: {t: statistics.median(s["time"] for s in results[arm][t]) for t in task_ids}
        for arm in arms
    }

    col = 12
    header = f"{'arm':<12}" + "".join(f"{t:>{col}}" for t in task_ids) + f"{'TOTAL':>{col}}"
    sep = "-" * len(header)

    print(f"\n{'=' * 60}")
    print(f" RESULTS - {model} (n={repeat}, median)")
    print(f"{'=' * 60}")

    print("\nCode LOC per task (median)")
    print(header)
    print(sep)
    for arm in arms:
        row = [med_loc[arm][t] for t in task_ids]
        print(f"{arm:<12}" + "".join(f"{v:>{col}}" for v in row) + f"{sum(row):>{col}}")

    print("\nTime seconds per task (median)")
    print(header)
    print(sep)
    for arm in arms:
        row = [med_time[arm][t] for t in task_ids]
        print(f"{arm:<12}" + "".join(f"{v:>{col}.1f}" for v in row) + f"{sum(row):>{col}.1f}")

    print(f"\n{'=' * 60}")
    print(" LOC vs baseline (median totals)")
    print(f"{'=' * 60}")
    base_total = sum(med_loc["baseline"][t] for t in task_ids)
    # Compare every non-baseline arm, so adding an arm in load_arms() needs no
    # change here.
    for arm in (name for name in arms if name != "baseline"):
        arm_total = sum(med_loc[arm][t] for t in task_ids)
        pct = (1 - arm_total / base_total) * 100 if base_total else 0
        sign = "less" if pct >= 0 else "more"
        print(f" {arm:10s}: {arm_total} LOC ({abs(pct):.0f}% {sign} than baseline)")

    out = Path(__file__).parent / "benchmark-local-results.json"
    out.write_text(json.dumps(results, indent=2), encoding="utf-8")
    print(f"\nFull responses -> {out}")
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def main(argv=None):
    """Parse command-line options and run the benchmark.

    Args:
        argv: Optional list of argument strings; when ``None`` argparse reads
            ``sys.argv[1:]``, which preserves the original ``main()`` call.

    Exits with status 2 (via ``parser.error``) when ``--repeat`` is below 1,
    since a median cannot be taken over zero runs.
    """
    parser = argparse.ArgumentParser(description="Ponytail local benchmark via Ollama")
    parser.add_argument("--model", default="llama3.2", help="Ollama model name (default: llama3.2)")
    parser.add_argument("--repeat", type=int, default=1, help="Runs per cell; median reported (default: 1)")
    parser.add_argument("--ollama-url", default="http://localhost:11434", help="Ollama base URL")
    args = parser.parse_args(argv)
    if args.repeat < 1:
        parser.error("--repeat must be at least 1")
    run(args.model, args.repeat, args.ollama_url)


if __name__ == "__main__":
    main()
|