@draig/lexis-two 1.0.0 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/package.json +7 -2
- package/.claude-plugin/marketplace.json +0 -29
- package/.claude-plugin/plugin.json +0 -9
- package/.codex-plugin/plugin.json +0 -31
- package/.env.example +0 -8
- package/.github/FUNDING.yml +0 -1
- package/.github/copilot-instructions.md +0 -47
- package/.github/plugin/marketplace.json +0 -20
- package/.github/plugin/plugin.json +0 -16
- package/.github/workflows/deploy-site.yml +0 -53
- package/.github/workflows/test.yml +0 -29
- package/AUDIT.md +0 -74
- package/SPECXIS.md +0 -576
- package/benchmarks/README.md +0 -114
- package/benchmarks/arms/baseline.js +0 -2
- package/benchmarks/arms/caveman-SKILL.md +0 -67
- package/benchmarks/arms/caveman.js +0 -8
- package/benchmarks/arms/lexis-two.js +0 -10
- package/benchmarks/arms/ponytail.js +0 -6
- package/benchmarks/behavior.js +0 -58
- package/benchmarks/behavior.yaml +0 -40
- package/benchmarks/benchmark-local.py +0 -156
- package/benchmarks/benchmark-opencode-go.js +0 -294
- package/benchmarks/correctness.js +0 -294
- package/benchmarks/lib/aggregate-opencode-go.js +0 -103
- package/benchmarks/lib/load-env.js +0 -31
- package/benchmarks/lib/opencode-go-client.js +0 -151
- package/benchmarks/loc.js +0 -13
- package/benchmarks/opencode-go-models.json +0 -31
- package/benchmarks/promptfooconfig.yaml +0 -41
- package/benchmarks/prompts.json +0 -15
- package/benchmarks/render-opencode-go-report.js +0 -28
- package/benchmarks/results/2026-06-15-llama3.2-local.md +0 -76
- package/benchmarks/results/2026-06-16-opencode-go.md +0 -56
- package/benchmarks/results/opencode-go-2026-06-16-report.html +0 -226
- package/benchmarks/results/opencode-go-2026-06-16.json +0 -1339
- package/docs/assets/lexis-two-nobg.png +0 -0
- package/docs/assets/logo.png +0 -0
- package/docs/assets/logo.svg +0 -4
- package/docs/portability.md +0 -147
- package/docs/site.md +0 -52
- package/gemini-extension.json +0 -7
- package/pi-extension/index.js +0 -161
- package/pi-extension/package.json +0 -8
- package/pi-extension/test/extension.test.js +0 -89
- package/pi-extension/test/helpers.test.js +0 -35
- package/scripts/check-rule-copies.js +0 -82
- package/site/astro.config.mjs +0 -18
- package/site/package-lock.json +0 -4913
- package/site/package.json +0 -14
- package/site/public/CNAME +0 -1
- package/site/public/assets/lexis-two-nobg.png +0 -0
- package/site/public/assets/logo.png +0 -0
- package/site/public/assets/logo.svg +0 -4
- package/site/public/robots.txt +0 -4
- package/site/src/components/Adapt.astro +0 -33
- package/site/src/components/Benchmarks.astro +0 -232
- package/site/src/components/Commands.astro +0 -33
- package/site/src/components/Ecosystem.astro +0 -30
- package/site/src/components/Example.astro +0 -77
- package/site/src/components/Footer.astro +0 -28
- package/site/src/components/Header.astro +0 -87
- package/site/src/components/Hero.astro +0 -58
- package/site/src/components/Home.astro +0 -46
- package/site/src/components/Hosts.astro +0 -62
- package/site/src/components/Install.astro +0 -143
- package/site/src/components/LanguageSwitcher.astro +0 -82
- package/site/src/components/Philosophy.astro +0 -23
- package/site/src/components/Stacks.astro +0 -33
- package/site/src/components/Suggested.astro +0 -39
- package/site/src/data/opencode-go-benchmark.json +0 -230
- package/site/src/i18n/en.ts +0 -155
- package/site/src/i18n/es.ts +0 -158
- package/site/src/i18n/index.ts +0 -14
- package/site/src/layouts/Layout.astro +0 -114
- package/site/src/pages/benchmarks.astro +0 -4
- package/site/src/pages/es/benchmarks.astro +0 -4
- package/site/src/pages/es/index.astro +0 -10
- package/site/src/pages/index.astro +0 -10
- package/site/src/styles/global.css +0 -780
- package/site/tsconfig.json +0 -3
- package/tests/behavior.test.js +0 -80
- package/tests/commands.test.js +0 -40
- package/tests/copilot-plugin.test.js +0 -33
- package/tests/correctness.test.js +0 -191
- package/tests/gemini-extension.test.js +0 -78
- package/tests/hooks-windows.test.js +0 -48
- package/tests/hooks.test.js +0 -177
- package/tests/opencode-plugin.test.js +0 -64
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
/**
|
|
3
|
-
* Sync aggregated benchmark data for the Astro site.
|
|
4
|
-
*
|
|
5
|
-
* Usage:
|
|
6
|
-
* node benchmarks/render-opencode-go-report.js
|
|
7
|
-
* node benchmarks/render-opencode-go-report.js benchmarks/results/opencode-go-2026-06-16.json
|
|
8
|
-
*/
|
|
9
|
-
|
|
10
|
-
const fs = require('fs');
|
|
11
|
-
const path = require('path');
|
|
12
|
-
const { aggregateOpencodeGo, findLatestJson } = require('./lib/aggregate-opencode-go.js');
|
|
13
|
-
|
|
14
|
-
const SITE_DATA = path.join(__dirname, '..', 'site', 'src', 'data', 'opencode-go-benchmark.json');
|
|
15
|
-
|
|
16
|
-
function main() {
|
|
17
|
-
const input = process.argv[2] ? path.resolve(process.argv[2]) : findLatestJson();
|
|
18
|
-
const data = JSON.parse(fs.readFileSync(input, 'utf8'));
|
|
19
|
-
const chart = aggregateOpencodeGo(data);
|
|
20
|
-
|
|
21
|
-
fs.mkdirSync(path.dirname(SITE_DATA), { recursive: true });
|
|
22
|
-
fs.writeFileSync(SITE_DATA, `${JSON.stringify(chart, null, 2)}\n`, 'utf8');
|
|
23
|
-
|
|
24
|
-
console.log(`Site data → ${SITE_DATA}`);
|
|
25
|
-
console.log('Preview charts: npm run site:dev → http://localhost:4321/benchmarks/');
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
main();
|
|
@@ -1,76 +0,0 @@
|
|
|
1
|
-
# Local model benchmark: llama3.2 via Ollama — 2026-06-15
|
|
2
|
-
|
|
3
|
-
Same 5 tasks as the Claude benchmark, same three arms (baseline / caveman / ponytail),
|
|
4
|
-
run against a local **llama3.2:latest** (3.2B, Q4_K_M) via Ollama on a Windows 11 machine.
|
|
5
|
-
Tooling: `benchmarks/benchmark-local.py` (no promptfoo needed).
|
|
6
|
-
|
|
7
|
-
> **Updated 2026-06-15:** the LOC counter now counts bare, unfenced code. It
|
|
8
|
-
> previously counted only fenced code blocks and scored everything else as 0,
|
|
9
|
-
> which silently deflated any arm whose output happened to skip the fences (small
|
|
10
|
-
> models do this often). Numbers below use the corrected counter at n=5 median.
|
|
11
|
-
> Absolute times reflect this machine (GPU-accelerated); compare arms within a
|
|
12
|
-
> run, not against an earlier CPU-bound machine.
|
|
13
|
-
|
|
14
|
-
## Results (n=5, median)
|
|
15
|
-
|
|
16
|
-
**Code LOC**
|
|
17
|
-
|
|
18
|
-
| arm | email | debounce | csv-sum | countdown | rate-limit | **TOTAL** |
|
|
19
|
-
|---|--:|--:|--:|--:|--:|--:|
|
|
20
|
-
| baseline | 16 | 18 | 22 | 37 | 16 | **109** |
|
|
21
|
-
| caveman | 16 | 21 | 18 | 46 | 32 | **133** |
|
|
22
|
-
| ponytail | 17 | 22 | 18 | 52 | 28 | **137** |
|
|
23
|
-
|
|
24
|
-
**Time (seconds)**
|
|
25
|
-
|
|
26
|
-
| arm | email | debounce | csv-sum | countdown | rate-limit | **TOTAL** |
|
|
27
|
-
|---|--:|--:|--:|--:|--:|--:|
|
|
28
|
-
| baseline | 3.1 | 3.7 | 3.6 | 4.2 | 4.8 | **19.4** |
|
|
29
|
-
| caveman | 4.1 | 4.2 | 3.6 | 4.4 | 4.8 | **21.1** |
|
|
30
|
-
| ponytail | 4.1 | 4.2 | 3.8 | 4.8 | 4.9 | **21.8** |
|
|
31
|
-
|
|
32
|
-
## Key findings
|
|
33
|
-
|
|
34
|
-
**On llama3.2 the LOC effect is inside the noise floor.** At temperature 0.7 the
|
|
35
|
-
per-run totals swing hard: across the five runs, ponytail landed anywhere from
|
|
36
|
-
17% *below* baseline to 50% *above* it. The n=5 median came out +26%; a separate
|
|
37
|
-
n=3 median came out −17%. The aggregate itself flips sign depending on the
|
|
38
|
-
sample, and the countdown task alone ranged 19 to 74 LOC on baseline. There is no
|
|
39
|
-
stable LOC reduction to report.
|
|
40
|
-
|
|
41
|
-
**Ponytail does not transfer to llama3.2.** The 80-94% LOC reduction seen on
|
|
42
|
-
Claude is simply absent: the signal is lost in run-to-run variance. The one
|
|
43
|
-
consistent effect is on time, and it goes the wrong way: ponytail is ~10-15%
|
|
44
|
-
*slower* than baseline (more system-prompt tokens to process), never the 3-6x
|
|
45
|
-
speedup seen on Claude.
|
|
46
|
-
|
|
47
|
-
**Why:** ponytail is a prompt-engineering skill calibrated on Claude models,
|
|
48
|
-
which are trained to follow detailed system instructions. A 3.2B quantised model
|
|
49
|
-
absorbs the rules only partially and adds prose justifying its choices, paying
|
|
50
|
-
the instruction-following cost without reliably converting it into less code.
|
|
51
|
-
|
|
52
|
-
## Reproduce
|
|
53
|
-
|
|
54
|
-
Install Ollama and pull a model, then run from the repo root:
|
|
55
|
-
|
|
56
|
-
```bash
|
|
57
|
-
ollama pull llama3.2
|
|
58
|
-
python benchmarks/benchmark-local.py --model llama3.2 --repeat 5
|
|
59
|
-
```
|
|
60
|
-
|
|
61
|
-
At this model size the LOC signal is noisy; raise `--repeat` (or lower the
|
|
62
|
-
sampling temperature in the script) before reading anything into the totals.
|
|
63
|
-
|
|
64
|
-
Optional flags:
|
|
65
|
-
|
|
66
|
-
```
|
|
67
|
-
--repeat N Runs per cell; median is reported (default: 1)
|
|
68
|
-
--ollama-url URL Ollama base URL (default: http://localhost:11434)
|
|
69
|
-
```
|
|
70
|
-
|
|
71
|
-
## Takeaway
|
|
72
|
-
|
|
73
|
-
The benchmark claims in the README are accurate for the models tested (Haiku,
|
|
74
|
-
Sonnet, Opus). For local/small models, expect the gains to shrink into the noise
|
|
75
|
-
until instruction-following reaches a threshold comparable to Claude Haiku or
|
|
76
|
-
better.
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
# Lexis-Two benchmark — OpenCode Go (2026-06-16)
|
|
2
|
-
|
|
3
|
-
Provider: [OpenCode Go](https://opencode.ai/docs/go/).
|
|
4
|
-
Repeat: 3 per cell. Temperature: 1.
|
|
5
|
-
|
|
6
|
-
## Kimi K2.6 (`kimi-k2.6`)
|
|
7
|
-
|
|
8
|
-
Repeat: 3. Arms: baseline, lexis-two.
|
|
9
|
-
|
|
10
|
-
**Code LOC (median)**
|
|
11
|
-
|
|
12
|
-
| arm | email | debounce | csv-sum | countdown | rate-limit | TOTAL | correct |
|
|
13
|
-
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
|
|
14
|
-
| baseline | 46 | 63 | 18 | 413 | 62 | 602 | 12/15 |
|
|
15
|
-
| lexis-two | 13 | 10 | 4 | 13 | 23 | 63 | 12/15 |
|
|
16
|
-
|
|
17
|
-
**lexis-two vs baseline (median total LOC):** 90% less code.
|
|
18
|
-
|
|
19
|
-
## DeepSeek V4 Pro (`deepseek-v4-pro`)
|
|
20
|
-
|
|
21
|
-
Repeat: 3. Arms: baseline, lexis-two.
|
|
22
|
-
|
|
23
|
-
**Code LOC (median)**
|
|
24
|
-
|
|
25
|
-
| arm | email | debounce | csv-sum | countdown | rate-limit | TOTAL | correct |
|
|
26
|
-
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
|
|
27
|
-
| baseline | 36 | 61 | 25 | 113 | 53 | 288 | 14/15 |
|
|
28
|
-
| lexis-two | 9 | 12 | 4 | 12 | 20 | 57 | 13/15 |
|
|
29
|
-
|
|
30
|
-
**lexis-two vs baseline (median total LOC):** 80% less code.
|
|
31
|
-
|
|
32
|
-
## Qwen3.7 Max (`qwen3.7-max`)
|
|
33
|
-
|
|
34
|
-
Repeat: 3. Arms: baseline, lexis-two.
|
|
35
|
-
|
|
36
|
-
**Code LOC (median)**
|
|
37
|
-
|
|
38
|
-
| arm | email | debounce | csv-sum | countdown | rate-limit | TOTAL | correct |
|
|
39
|
-
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
|
|
40
|
-
| baseline | 39 | 48 | 19 | 124 | 40 | 270 | 12/15 |
|
|
41
|
-
| lexis-two | 14 | 9 | 4 | 10 | 17 | 54 | 13/15 |
|
|
42
|
-
|
|
43
|
-
**lexis-two vs baseline (median total LOC):** 80% less code.
|
|
44
|
-
|
|
45
|
-
## MiniMax M3 (`minimax-m3`)
|
|
46
|
-
|
|
47
|
-
Repeat: 3. Arms: baseline, lexis-two.
|
|
48
|
-
|
|
49
|
-
**Code LOC (median)**
|
|
50
|
-
|
|
51
|
-
| arm | email | debounce | csv-sum | countdown | rate-limit | TOTAL | correct |
|
|
52
|
-
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
|
|
53
|
-
| baseline | 55 | 66 | 33 | 112 | 59 | 325 | 11/15 |
|
|
54
|
-
| lexis-two | 9 | 10 | 4 | 18 | 15 | 56 | 15/15 |
|
|
55
|
-
|
|
56
|
-
**lexis-two vs baseline (median total LOC):** 83% less code.
|
|
@@ -1,226 +0,0 @@
|
|
|
1
|
-
<!DOCTYPE html>
|
|
2
|
-
<html lang="en">
|
|
3
|
-
<head>
|
|
4
|
-
<meta charset="utf-8" />
|
|
5
|
-
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
|
6
|
-
<title>Lexis-Two benchmark — OpenCode Go (2026-06-16)</title>
|
|
7
|
-
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
|
|
8
|
-
<style>
|
|
9
|
-
:root {
|
|
10
|
-
--bg: #0b0d0c;
|
|
11
|
-
--panel: #121614;
|
|
12
|
-
--border: #1e2420;
|
|
13
|
-
--text: #e8ece9;
|
|
14
|
-
--muted: #8a948d;
|
|
15
|
-
--accent: #7cba8a;
|
|
16
|
-
--baseline: #c97a7a;
|
|
17
|
-
}
|
|
18
|
-
* { box-sizing: border-box; }
|
|
19
|
-
body {
|
|
20
|
-
margin: 0;
|
|
21
|
-
font-family: system-ui, sans-serif;
|
|
22
|
-
background: var(--bg);
|
|
23
|
-
color: var(--text);
|
|
24
|
-
line-height: 1.5;
|
|
25
|
-
}
|
|
26
|
-
.wrap { max-width: 1100px; margin: 0 auto; padding: 2rem 1rem 3rem; }
|
|
27
|
-
h1 { font-size: 1.75rem; margin: 0 0 0.25rem; }
|
|
28
|
-
.sub { color: var(--muted); margin-bottom: 2rem; font-size: 0.95rem; }
|
|
29
|
-
.grid {
|
|
30
|
-
display: grid;
|
|
31
|
-
gap: 1.25rem;
|
|
32
|
-
grid-template-columns: repeat(auto-fit, minmax(320px, 1fr));
|
|
33
|
-
}
|
|
34
|
-
.card {
|
|
35
|
-
background: var(--panel);
|
|
36
|
-
border: 1px solid var(--border);
|
|
37
|
-
border-radius: 0.5rem;
|
|
38
|
-
padding: 1rem 1rem 0.5rem;
|
|
39
|
-
}
|
|
40
|
-
.card h2 {
|
|
41
|
-
font-size: 0.85rem;
|
|
42
|
-
text-transform: uppercase;
|
|
43
|
-
letter-spacing: 0.06em;
|
|
44
|
-
color: var(--muted);
|
|
45
|
-
margin: 0 0 0.75rem;
|
|
46
|
-
}
|
|
47
|
-
.card canvas { max-height: 280px; }
|
|
48
|
-
.wide { grid-column: 1 / -1; }
|
|
49
|
-
table {
|
|
50
|
-
width: 100%;
|
|
51
|
-
border-collapse: collapse;
|
|
52
|
-
font-size: 0.875rem;
|
|
53
|
-
}
|
|
54
|
-
th, td {
|
|
55
|
-
padding: 0.5rem 0.75rem;
|
|
56
|
-
border-bottom: 1px solid var(--border);
|
|
57
|
-
text-align: right;
|
|
58
|
-
}
|
|
59
|
-
th:first-child, td:first-child { text-align: left; }
|
|
60
|
-
th { color: var(--muted); font-weight: 600; }
|
|
61
|
-
.good { color: var(--accent); }
|
|
62
|
-
footer { margin-top: 2rem; color: var(--muted); font-size: 0.8rem; }
|
|
63
|
-
a { color: var(--accent); }
|
|
64
|
-
</style>
|
|
65
|
-
</head>
|
|
66
|
-
<body>
|
|
67
|
-
<div class="wrap">
|
|
68
|
-
<h1>Lexis-Two × OpenCode Go</h1>
|
|
69
|
-
<p class="sub">
|
|
70
|
-
Source: <code>opencode-go-2026-06-16.json</code> · 3 runs/cell · median LOC ·
|
|
71
|
-
arms: baseline, lexis-two
|
|
72
|
-
</p>
|
|
73
|
-
|
|
74
|
-
<div class="grid">
|
|
75
|
-
<div class="card wide">
|
|
76
|
-
<h2>Total code LOC (median, 5 tasks)</h2>
|
|
77
|
-
<canvas id="chart-total-loc"></canvas>
|
|
78
|
-
</div>
|
|
79
|
-
<div class="card">
|
|
80
|
-
<h2>LOC reduction vs baseline</h2>
|
|
81
|
-
<canvas id="chart-reduction"></canvas>
|
|
82
|
-
</div>
|
|
83
|
-
<div class="card">
|
|
84
|
-
<h2>Wall time (median total seconds)</h2>
|
|
85
|
-
<canvas id="chart-time"></canvas>
|
|
86
|
-
</div>
|
|
87
|
-
<div class="card wide">
|
|
88
|
-
<h2>LOC by task — lexis-two arm</h2>
|
|
89
|
-
<canvas id="chart-by-task"></canvas>
|
|
90
|
-
</div>
|
|
91
|
-
<div class="card wide">
|
|
92
|
-
<h2>Summary table</h2>
|
|
93
|
-
<table id="summary-table">
|
|
94
|
-
<thead>
|
|
95
|
-
<tr>
|
|
96
|
-
<th>Model</th>
|
|
97
|
-
<th>Baseline LOC</th>
|
|
98
|
-
<th>Lexis-Two LOC</th>
|
|
99
|
-
<th>Reduction</th>
|
|
100
|
-
<th>Correct (lexis)</th>
|
|
101
|
-
</tr>
|
|
102
|
-
</thead>
|
|
103
|
-
<tbody></tbody>
|
|
104
|
-
</table>
|
|
105
|
-
</div>
|
|
106
|
-
</div>
|
|
107
|
-
|
|
108
|
-
<footer>
|
|
109
|
-
Regenerate: <code>node benchmarks/render-opencode-go-report.js</code>
|
|
110
|
-
</footer>
|
|
111
|
-
</div>
|
|
112
|
-
|
|
113
|
-
<script>
|
|
114
|
-
const DATA = {"date":"2026-06-16","repeat":3,"models":[{"id":"kimi-k2.6","locByArmTask":{"baseline":{"email":56,"debounce":71,"csv-sum":19,"countdown":366,"rate-limit":49},"lexis-two":{"email":10,"debounce":5,"csv-sum":4,"countdown":16,"rate-limit":14}},"timeByArmTask":{"baseline":{"email":12.973,"debounce":14.098,"csv-sum":5.08,"countdown":21.987,"rate-limit":16.836},"lexis-two":{"email":16.23,"debounce":8.363,"csv-sum":9.846,"countdown":18.389,"rate-limit":16.189}},"correctByArm":{"baseline":{"pass":11,"total":15},"lexis-two":{"pass":10,"total":15}},"totals":{"baselineLoc":561,"lexisLoc":49,"reductionPct":91,"baselineTimeSec":71,"lexisTimeSec":69}},{"id":"deepseek-v4-pro","locByArmTask":{"baseline":{"email":60,"debounce":39,"csv-sum":26,"countdown":90,"rate-limit":66},"lexis-two":{"email":7,"debounce":7,"csv-sum":4,"countdown":10,"rate-limit":18}},"timeByArmTask":{"baseline":{"email":35.051,"debounce":14.509,"csv-sum":13.422,"countdown":46.762,"rate-limit":32.401},"lexis-two":{"email":27.456,"debounce":16.982,"csv-sum":24.931,"countdown":36.191,"rate-limit":39.952}},"correctByArm":{"baseline":{"pass":13,"total":15},"lexis-two":{"pass":12,"total":15}},"totals":{"baselineLoc":281,"lexisLoc":46,"reductionPct":84,"baselineTimeSec":142.1,"lexisTimeSec":145.5}},{"id":"qwen3.7-max","locByArmTask":{"baseline":{"email":34,"debounce":44,"csv-sum":17,"countdown":128,"rate-limit":47},"lexis-two":{"email":7,"debounce":5,"csv-sum":3,"countdown":10,"rate-limit":13}},"timeByArmTask":{"baseline":{"email":43.542,"debounce":17.58,"csv-sum":25.876,"countdown":32.586,"rate-limit":41.602},"lexis-two":{"email":39.323,"debounce":28.275,"csv-sum":23.769,"countdown":28.775,"rate-limit":40.786}},"correctByArm":{"baseline":{"pass":14,"total":15},"lexis-two":{"pass":11,"total":15}},"totals":{"baselineLoc":270,"lexisLoc":38,"reductionPct":86,"baselineTimeSec":161.2,"lexisTimeSec":160.9}},{"id":"minimax-m3","locByArmTask":{"baseline":{"email":62,"debounce":58,"csv-sum":43,"countdown":117,"rate-limit":76},"lexis-two":{"email":12,"debounce":6,"csv-sum":2,"countdown":13,"rate-limit":14}},"timeByArmTask":{"baseline":{"email":18.398,"debounce":14.916,"csv-sum":10.237,"countdown":19.516,"rate-limit":22.131},"lexis-two":{"email":8.002,"debounce":2.393,"csv-sum":2.423,"countdown":2.753,"rate-limit":4.209}},"correctByArm":{"baseline":{"pass":12,"total":15},"lexis-two":{"pass":9,"total":15}},"totals":{"baselineLoc":356,"lexisLoc":47,"reductionPct":87,"baselineTimeSec":85.2,"lexisTimeSec":19.8}}],"tasks":["email","debounce","csv-sum","countdown","rate-limit"],"arms":["baseline","lexis-two"]};
|
|
115
|
-
const labels = DATA.models.map((m) => m.id.replace('kimi-k2.6','Kimi K2.6').replace('deepseek-v4-pro','DeepSeek V4').replace('qwen3.7-max','Qwen3.7 Max').replace('minimax-m3','MiniMax M3'));
|
|
116
|
-
|
|
117
|
-
Chart.defaults.color = '#8a948d';
|
|
118
|
-
Chart.defaults.borderColor = '#1e2420';
|
|
119
|
-
Chart.defaults.font.family = 'system-ui, sans-serif';
|
|
120
|
-
|
|
121
|
-
new Chart(document.getElementById('chart-total-loc'), {
|
|
122
|
-
type: 'bar',
|
|
123
|
-
data: {
|
|
124
|
-
labels,
|
|
125
|
-
datasets: [
|
|
126
|
-
{
|
|
127
|
-
label: 'baseline',
|
|
128
|
-
data: DATA.models.map((m) => m.totals.baselineLoc),
|
|
129
|
-
backgroundColor: '#c97a7a',
|
|
130
|
-
},
|
|
131
|
-
{
|
|
132
|
-
label: 'lexis-two',
|
|
133
|
-
data: DATA.models.map((m) => m.totals.lexisLoc),
|
|
134
|
-
backgroundColor: '#7cba8a',
|
|
135
|
-
},
|
|
136
|
-
],
|
|
137
|
-
},
|
|
138
|
-
options: {
|
|
139
|
-
responsive: true,
|
|
140
|
-
plugins: { legend: { position: 'bottom' } },
|
|
141
|
-
scales: { y: { beginAtZero: true, title: { display: true, text: 'lines of code' } } },
|
|
142
|
-
},
|
|
143
|
-
});
|
|
144
|
-
|
|
145
|
-
new Chart(document.getElementById('chart-reduction'), {
|
|
146
|
-
type: 'bar',
|
|
147
|
-
data: {
|
|
148
|
-
labels,
|
|
149
|
-
datasets: [{
|
|
150
|
-
label: '% less code',
|
|
151
|
-
data: DATA.models.map((m) => m.totals.reductionPct),
|
|
152
|
-
backgroundColor: '#7cba8a',
|
|
153
|
-
}],
|
|
154
|
-
},
|
|
155
|
-
options: {
|
|
156
|
-
indexAxis: 'y',
|
|
157
|
-
responsive: true,
|
|
158
|
-
plugins: { legend: { display: false } },
|
|
159
|
-
scales: {
|
|
160
|
-
x: { beginAtZero: true, max: 100, ticks: { callback: (v) => v + '%' } },
|
|
161
|
-
},
|
|
162
|
-
},
|
|
163
|
-
});
|
|
164
|
-
|
|
165
|
-
new Chart(document.getElementById('chart-time'), {
|
|
166
|
-
type: 'bar',
|
|
167
|
-
data: {
|
|
168
|
-
labels,
|
|
169
|
-
datasets: [
|
|
170
|
-
{
|
|
171
|
-
label: 'baseline',
|
|
172
|
-
data: DATA.models.map((m) => m.totals.baselineTimeSec),
|
|
173
|
-
backgroundColor: '#c97a7a',
|
|
174
|
-
},
|
|
175
|
-
{
|
|
176
|
-
label: 'lexis-two',
|
|
177
|
-
data: DATA.models.map((m) => m.totals.lexisTimeSec),
|
|
178
|
-
backgroundColor: '#7cba8a',
|
|
179
|
-
},
|
|
180
|
-
],
|
|
181
|
-
},
|
|
182
|
-
options: {
|
|
183
|
-
responsive: true,
|
|
184
|
-
plugins: { legend: { position: 'bottom' } },
|
|
185
|
-
scales: { y: { beginAtZero: true, title: { display: true, text: 'seconds' } } },
|
|
186
|
-
},
|
|
187
|
-
});
|
|
188
|
-
|
|
189
|
-
const taskLabels = DATA.tasks;
|
|
190
|
-
const taskColors = ['#7cba8a', '#5a9a6a', '#9fd4a8', '#4a7356', '#3d5f48'];
|
|
191
|
-
new Chart(document.getElementById('chart-by-task'), {
|
|
192
|
-
type: 'bar',
|
|
193
|
-
data: {
|
|
194
|
-
labels,
|
|
195
|
-
datasets: taskLabels.map((taskId, i) => ({
|
|
196
|
-
label: taskId,
|
|
197
|
-
data: DATA.models.map((m) => m.locByArmTask['lexis-two'][taskId]),
|
|
198
|
-
backgroundColor: taskColors[i % taskColors.length],
|
|
199
|
-
})),
|
|
200
|
-
},
|
|
201
|
-
options: {
|
|
202
|
-
responsive: true,
|
|
203
|
-
plugins: { legend: { position: 'bottom' } },
|
|
204
|
-
scales: {
|
|
205
|
-
x: { stacked: false },
|
|
206
|
-
y: { beginAtZero: true, title: { display: true, text: 'LOC (lexis-two)' } },
|
|
207
|
-
},
|
|
208
|
-
},
|
|
209
|
-
});
|
|
210
|
-
|
|
211
|
-
const tbody = document.querySelector('#summary-table tbody');
|
|
212
|
-
for (const m of DATA.models) {
|
|
213
|
-
const c = m.correctByArm['lexis-two'];
|
|
214
|
-
const tr = document.createElement('tr');
|
|
215
|
-
tr.innerHTML = `
|
|
216
|
-
<td>${m.id}</td>
|
|
217
|
-
<td>${m.totals.baselineLoc}</td>
|
|
218
|
-
<td class="good">${m.totals.lexisLoc}</td>
|
|
219
|
-
<td class="good">${m.totals.reductionPct}%</td>
|
|
220
|
-
<td>${c.pass}/${c.total}</td>
|
|
221
|
-
`;
|
|
222
|
-
tbody.appendChild(tr);
|
|
223
|
-
}
|
|
224
|
-
</script>
|
|
225
|
-
</body>
|
|
226
|
-
</html>
|