@tryinget/pi-evalset-lab 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json ADDED
@@ -0,0 +1,85 @@
1
+ {
2
+ "name": "@tryinget/pi-evalset-lab",
3
+ "version": "0.2.0",
4
+ "description": "pi extension for fixed-task-set eval runs and prompt/system comparisons",
5
+ "type": "module",
6
+ "license": "SEE LICENSE IN LICENSE",
7
+ "repository": {
8
+ "type": "git",
9
+ "url": "git+https://github.com/tryingET/pi-extensions.git",
10
+ "directory": "packages/pi-evalset-lab"
11
+ },
12
+ "bugs": {
13
+ "url": "https://github.com/tryingET/pi-extensions/issues"
14
+ },
15
+ "homepage": "https://github.com/tryingET/pi-extensions/tree/main/packages/pi-evalset-lab",
16
+ "keywords": [
17
+ "pi-package",
18
+ "pi-extension",
19
+ "evalset",
20
+ "llm-eval",
21
+ "prompt-evaluation",
22
+ "fixed-task-set",
23
+ "ux-observability",
24
+ "safety-governance",
25
+ "review-quality-loops",
26
+ "model-prompt-management",
27
+ "monorepo"
28
+ ],
29
+ "publishConfig": {
30
+ "registry": "https://registry.npmjs.org/",
31
+ "access": "public"
32
+ },
33
+ "engines": {
34
+ "node": ">=22"
35
+ },
36
+ "scripts": {
37
+ "fix": "bash ./scripts/quality-gate.sh fix",
38
+ "lint": "bash ./scripts/quality-gate.sh lint",
39
+ "typecheck": "bash ./scripts/quality-gate.sh typecheck",
40
+ "quality:pre-commit": "bash ./scripts/quality-gate.sh pre-commit",
41
+ "quality:pre-push": "bash ./scripts/quality-gate.sh pre-push",
42
+ "quality:ci": "bash ./scripts/quality-gate.sh ci",
43
+ "check": "npm run quality:ci",
44
+ "test": "npm run quality:ci",
45
+ "docs:list": "bash ./scripts/docs-list.sh",
46
+ "docs:list:workspace": "bash ./scripts/docs-list.sh --workspace --discover",
47
+ "docs:list:json": "bash ./scripts/docs-list.sh --json",
48
+ "release:check": "bash ./scripts/release-check.sh",
49
+ "release:check:quick": "SKIP_PI_SMOKE=1 bash ./scripts/release-check.sh",
50
+ "evalset:export-html": "node ./scripts/export-evalset-report-html.mjs"
51
+ },
52
+ "files": [
53
+ "extensions/evalset.ts",
54
+ "prompts",
55
+ "examples",
56
+ "scripts/export-evalset-report-html.mjs",
57
+ "CHANGELOG.md",
58
+ "policy/security-policy.json",
59
+ "policy/stack-lane.json"
60
+ ],
61
+ "pi": {
62
+ "extensions": [
63
+ "./extensions/evalset.ts"
64
+ ],
65
+ "prompts": [
66
+ "./prompts"
67
+ ]
68
+ },
69
+ "x-pi-template": {
70
+ "scaffoldMode": "simple-package",
71
+ "workspacePath": "packages/pi-evalset-lab",
72
+ "releaseComponent": "pi-evalset-lab",
73
+ "releaseConfigMode": "component"
74
+ },
75
+ "devDependencies": {
76
+ "@biomejs/biome": "2.3.14"
77
+ },
78
+ "overrides": {
79
+ "fast-xml-parser": "5.3.6"
80
+ },
81
+ "peerDependencies": {
82
+ "@mariozechner/pi-coding-agent": "*",
83
+ "@mariozechner/pi-ai": "*"
84
+ }
85
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "default": {
3
+ "maxRiskScore": 70,
4
+ "requireIntegrity": true,
5
+ "requireSignatures": false,
6
+ "allowNpmDiffFallback": false,
7
+ "minReleaseAgeHours": 0
8
+ },
9
+ "packages": {}
10
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "lane": "ts",
3
+ "tech_stack_core": {
4
+ "tool": "tech-stack-core",
5
+ "lane": "pi-ts",
6
+ "repository": "https://github.com/lightningralf/tech-stack-core",
7
+ "ref": "4bd813dbb9ebbd9cb279c7293cd941fd1154d59e",
8
+ "command": "uv tool run --from ~/ai-society/core/tech-stack-core tech-stack-core show pi-ts --prefer-repo"
9
+ }
10
+ }
@@ -0,0 +1,21 @@
1
+ ---
2
+ summary: "Draft an implementation plan for a requested change"
3
+ read_when:
4
+ - "Using or maintaining this package prompt template."
5
+ - "Checking prompt metadata for pi package discovery or docs-list validation."
6
+ description: Draft an implementation plan for a requested change
7
+ system4d:
8
+ container: "Prompt template for implementation planning."
9
+ compass: "Turn requests into actionable, risk-aware plans."
10
+ engine: "Scope -> tasks -> validation -> rollout."
11
+ fog: "Hidden constraints unless assumptions are surfaced."
12
+ ---
13
+
14
+ Create an implementation plan for this request: $@
15
+
16
+ Include:
17
+ - Scope and non-goals
18
+ - Key risks and mitigations
19
+ - Step-by-step implementation tasks
20
+ - Validation commands and expected outcomes
21
+ - Rollout and rollback notes
@@ -0,0 +1,21 @@
1
+ ---
2
+ summary: "Review a change for security risks and mitigations"
3
+ read_when:
4
+ - "Using or maintaining this package prompt template."
5
+ - "Checking prompt metadata for pi package discovery or docs-list validation."
6
+ description: Review a change for security risks and mitigations
7
+ system4d:
8
+ container: "Prompt template for security-focused review."
9
+ compass: "Identify practical vulnerabilities before release."
10
+ engine: "Threats -> impact -> mitigations -> verification."
11
+ fog: "Partial context can hide exploit paths."
12
+ ---
13
+
14
+ Review this change for security concerns: $@
15
+
16
+ Focus on:
17
+ - Input validation and injection risk
18
+ - Privilege boundaries and secret handling
19
+ - Dependency and supply-chain risk
20
+ - Safe failure modes and logging
21
+ - Concrete remediations with priority
@@ -0,0 +1,364 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { mkdir, readFile, writeFile } from "node:fs/promises";
4
+ import { dirname, resolve } from "node:path";
5
+
6
// Usage text printed for --help / -h. Lists every spelling the argument
// parser accepts, including the long --input / --output aliases.
const HELP_TEXT = `Export evalset JSON report to a standalone HTML file.

Usage:
  node ./scripts/export-evalset-report-html.mjs --in <report.json> [--out <report.html>] [--title <text>]

Options:
  --in, --input, -i    Input report path (.json) [required]
  --out, --output, -o  Output HTML path (default: input path with .html extension)
  --title              Custom report title
  --help, -h           Show this help
`;
17
+
18
/**
 * Parse command-line tokens into exporter options.
 *
 * Recognised flags: `--help`/`-h`, `--in`/`--input`/`-i`,
 * `--out`/`--output`/`-o` and `--title`. Every flag except help consumes the
 * following token as its value.
 *
 * @param {string[]} argv - Tokens after the script name (e.g. `process.argv.slice(2)`).
 * @returns {{input?: string; output?: string; title?: string; help?: boolean}}
 * @throws {Error} On an unknown option, or when a value-taking flag is the
 *   last token and therefore has no value.
 */
function parseArgs(argv) {
  // Maps each value-taking flag spelling to the option it fills.
  const valueFlags = {
    "--in": "input",
    "--input": "input",
    "-i": "input",
    "--out": "output",
    "--output": "output",
    "-o": "output",
    "--title": "title",
  };

  /** @type {{input?: string; output?: string; title?: string; help?: boolean}} */
  const result = {};

  for (let i = 0; i < argv.length; i += 1) {
    const token = argv[i];

    if (token === "--help" || token === "-h") {
      result.help = true;
      continue;
    }

    // hasOwn keeps inherited keys such as "constructor" from passing as flags.
    if (!Object.hasOwn(valueFlags, token)) {
      throw new Error(`Unknown option: ${token}`);
    }

    const value = argv[i + 1];
    // A trailing `--out` / `--title` used to be dropped silently; report it.
    // When help was already requested the caller only prints usage, so a
    // dangling flag is tolerated there.
    if (value === undefined && !result.help) {
      throw new Error(`Missing value for ${token}`);
    }

    result[valueFlags[token]] = value;
    i += 1; // skip the value that was just consumed
  }

  return result;
}
53
+
54
/**
 * Return `value` unchanged when it is a usable flag argument.
 *
 * A value is rejected when it is falsy (missing or empty) or starts with
 * "-", i.e. looks like another flag.
 *
 * @param {string | undefined} value - Raw value captured for the flag.
 * @param {string} flag - Flag name used in the error message.
 * @returns {string} The validated value.
 * @throws {Error} `Missing value for <flag>` when the value is rejected.
 */
function requireValue(value, flag) {
  const isUsable = Boolean(value) && !value.startsWith("-");
  if (isUsable) {
    return value;
  }
  throw new Error(`Missing value for ${flag}`);
}
60
+
61
/**
 * Tell whether `value` is a non-null, non-array object.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isRecord(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
64
+
65
/**
 * Pass arrays through untouched; replace anything else with a new empty array.
 *
 * @param {unknown} value
 * @returns {unknown[]}
 */
function toArray(value) {
  if (Array.isArray(value)) {
    return value;
  }
  return [];
}
68
+
69
/**
 * Escape a value for safe inclusion in HTML text or attribute content.
 *
 * `null` and `undefined` become the empty string; everything else is
 * stringified first. The characters & < > " ' are replaced by entities.
 *
 * @param {unknown} value
 * @returns {string}
 */
function esc(value) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  const text = String(value ?? "");
  // One pass over the five special characters gives the same result as
  // chained replacements that handle "&" first.
  return text.replace(/[&<>"']/g, (ch) => entities[ch]);
}
77
+
78
/**
 * Format a fractional number as a percentage with one decimal place.
 *
 * @param {unknown} value - Fraction such as 0.5; anything that is not a number yields "n/a".
 * @returns {string} For example "50.0%", or "n/a".
 */
function pct(value) {
  if (typeof value !== "number") {
    return "n/a";
  }
  const scaled = value * 100;
  return `${scaled.toFixed(1)}%`;
}
81
+
82
/**
 * Format a cost as a dollar amount with six decimal places.
 *
 * @param {unknown} value - Amount; `null`/`undefined` are treated as 0.
 * @returns {string} For example "$0.001235".
 */
function money(value) {
  const amount = Number(value ?? 0);
  const fixed = amount.toFixed(6);
  return `$${fixed}`;
}
85
+
86
/**
 * Format a duration as whole milliseconds.
 *
 * @param {unknown} value - Duration; `null`/`undefined` are treated as 0.
 * @returns {string} For example "123 ms".
 */
function ms(value) {
  const duration = Number(value ?? 0);
  const rounded = duration.toFixed(0);
  return `${rounded} ms`;
}
89
+
90
/**
 * Render the PASS / FAIL badge markup for a case result.
 *
 * @param {unknown} pass - Truthy for a passing case.
 * @returns {string} HTML `<span>` badge.
 */
function passPill(pass) {
  if (pass) {
    return '<span class="pill ok">PASS</span>';
  }
  return '<span class="pill bad">FAIL</span>';
}
93
+
94
/**
 * Render the badge markup describing how a case changed between two runs.
 *
 * @param {unknown} outcome - "improved", "regressed", or anything else for no change.
 * @returns {string} HTML `<span>` badge.
 */
function outcomePill(outcome) {
  switch (outcome) {
    case "improved":
      return '<span class="pill imp">Improved</span>';
    case "regressed":
      return '<span class="pill reg">Regressed</span>';
    default:
      return '<span class="pill same">No change</span>';
  }
}
103
+
104
/**
 * Summarise the checks of one case as plain text, one check per line.
 *
 * Each line is a pass/fail marker followed by the check's `details`
 * (empty when absent). Returns "None" when the entry has no checks array
 * or the array is empty.
 *
 * @param {{checks?: unknown} | null | undefined} entry - Case entry from a report.
 * @returns {string}
 */
function checksText(entry) {
  const describeCheck = (check) => {
    const marker = check?.pass ? "✅" : "❌";
    const details = check?.details ?? "";
    return `${marker} ${details}`;
  };

  const lines = toArray(entry?.checks).map(describeCheck);
  if (lines.length === 0) {
    return "None";
  }
  return lines.join("\n");
}
111
+
112
/**
 * Assemble the complete standalone HTML document for a report.
 *
 * `title` (used for both <title> and the <h1>) and `subtitle` are passed
 * through esc() here. `summaryCards`, `tableHeader` and `tableRows` are
 * inserted verbatim, so callers must supply already-escaped HTML for them.
 *
 * @param {{title: unknown, subtitle: unknown, summaryCards: string, tableHeader: string, tableRows: string}} parts
 * @returns {string} Full HTML page with inline CSS.
 */
+ function pageTemplate({ title, subtitle, summaryCards, tableHeader, tableRows }) {
113
+ return `<!doctype html>
114
+ <html lang="en">
115
+ <head>
116
+ <meta charset="utf-8" />
117
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
118
+ <title>${esc(title)}</title>
119
+ <style>
120
+ :root {
121
+ --bg: #0b1020;
122
+ --panel: #111a33;
123
+ --line: #24335f;
124
+ --txt: #e8eeff;
125
+ --muted: #9fb0d8;
126
+ --ok: #2ecc71;
127
+ --bad: #ff6b6b;
128
+ --same: #7f8ea3;
129
+ --imp: #49dcb1;
130
+ --reg: #ff8f70;
131
+ }
132
+ * { box-sizing: border-box; }
133
+ body { margin: 0; padding: 24px; font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Ubuntu; background: var(--bg); color: var(--txt); }
134
+ .wrap { max-width: 1240px; margin: 0 auto; }
135
+ .panel { background: var(--panel); border: 1px solid var(--line); border-radius: 12px; padding: 14px; margin-bottom: 14px; }
136
+ .muted { color: var(--muted); font-size: 0.92rem; }
137
+ .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 10px; }
138
+ .card { background: #0e1630; border: 1px solid #2a3a66; border-radius: 10px; padding: 10px; }
139
+ .label { color: var(--muted); text-transform: uppercase; letter-spacing: .04em; font-size: .75rem; }
140
+ .val { margin-top: 4px; font-size: 1.06rem; font-weight: 600; }
141
+ table { width: 100%; border-collapse: collapse; font-size: .91rem; }
142
+ th, td { text-align: left; padding: 10px 8px; border-bottom: 1px solid var(--line); vertical-align: top; }
143
+ th { color: var(--muted); }
144
+ .pill { display: inline-block; border-radius: 999px; padding: 2px 10px; font-size: .74rem; font-weight: 700; border: 1px solid transparent; }
145
+ .ok { background: rgba(46, 204, 113, .14); color: #9df7c5; border-color: rgba(46, 204, 113, .45); }
146
+ .bad { background: rgba(255, 107, 107, .14); color: #ffc2c2; border-color: rgba(255, 107, 107, .45); }
147
+ .same { background: rgba(127, 142, 163, .14); color: #d4dcf2; border-color: rgba(127, 142, 163, .45); }
148
+ .imp { background: rgba(73, 220, 177, .14); color: #abf6df; border-color: rgba(73, 220, 177, .45); }
149
+ .reg { background: rgba(255, 143, 112, .14); color: #ffd0c2; border-color: rgba(255, 143, 112, .45); }
150
+ .meta { color: var(--muted); font-size: .8rem; margin-top: 6px; }
151
+ pre { white-space: pre-wrap; background: #0d152b; border: 1px solid #2a3a66; border-radius: 8px; padding: 8px; max-height: 220px; overflow: auto; }
152
+ details summary { cursor: pointer; color: #9dc2ff; }
153
+ .split { display: grid; grid-template-columns: 1fr 1fr; gap: 10px; }
154
+ @media (max-width: 980px) { .split { grid-template-columns: 1fr; } }
155
+ </style>
156
+ </head>
157
+ <body>
158
+ <div class="wrap">
159
+ <div class="panel">
160
+ <h1 style="margin:0 0 8px; font-size:1.32rem;">${esc(title)}</h1>
161
+ <div class="muted">${esc(subtitle)}</div>
162
+ </div>
163
+
164
+ <div class="panel grid">
165
+ ${summaryCards}
166
+ </div>
167
+
168
+ <div class="panel">
169
+ <table>
170
+ <thead>${tableHeader}</thead>
171
+ <tbody>${tableRows}</tbody>
172
+ </table>
173
+ </div>
174
+ </div>
175
+ </body>
176
+ </html>`;
177
+ }
178
+
179
/**
 * Render one summary tile: a label above its value.
 *
 * Both arguments are HTML-escaped before insertion.
 *
 * @param {unknown} label - Caption shown above the value.
 * @param {unknown} value - Text shown as the tile's value.
 * @returns {string} HTML for a single `.card` element.
 */
function summaryCard(label, value) {
  const labelHtml = `<div class="label">${esc(label)}</div>`;
  const valueHtml = `<div class="val">${esc(value)}</div>`;
  return `<div class="card">${labelHtml}${valueHtml}</div>`;
}
182
+
183
/**
 * Build the HTML page for a single-run report.
 *
 * Produces six summary tiles (pass rate, average latency, total cost,
 * dataset, variant, run id) and one table row per entry in `report.cases`,
 * each with a collapsible section holding the checks and output preview.
 *
 * @param {Record<string, any>} report - Parsed report object; missing fields fall back to defaults.
 * @param {{title?: string}} options - `title` overrides the generated page title when truthy.
 * @returns {string} Complete HTML document.
 */
function renderRun(report, options) {
  const totals = report?.totals;
  const runId = report?.run?.runId ?? "n/a";

  const passSummary = `${pct(totals?.passRate)} (${totals?.passedCases ?? 0}/${totals?.scoredCases ?? 0})`;
  const summaryCards = [
    summaryCard("Pass rate", passSummary),
    summaryCard("Avg latency", ms(totals?.avgLatencyMs)),
    summaryCard("Total cost", money(totals?.usage?.cost?.total)),
    summaryCard("Dataset", report?.dataset?.name ?? "n/a"),
    summaryCard("Variant", report?.variant?.name ?? "n/a"),
    summaryCard("Run ID", runId),
  ].join("\n");

  // One table row per case; cases without an id get a positional label.
  const renderRow = (entry, index) => {
    const position = index + 1;
    const caseId = entry?.id ?? `case-${position}`;
    return `<tr>
<td>${position}</td>
<td><code>${esc(caseId)}</code></td>
<td>${passPill(Boolean(entry?.pass))}<div class="meta">${ms(entry?.latencyMs)}</div></td>
<td><details><summary>checks + output</summary>
<h4 style="margin:8px 0 6px;">Checks</h4><pre>${esc(checksText(entry))}</pre>
<h4 style="margin:8px 0 6px;">Output preview</h4><pre>${esc(entry?.outputPreview ?? "")}</pre>
</details></td>
</tr>`;
  };
  const tableRows = toArray(report?.cases).map(renderRow).join("\n");

  const title = options.title || `Evalset run report: ${report?.dataset?.name ?? "dataset"}`;

  // The dataset path is optional; filter(Boolean) drops it when absent.
  const subtitleParts = [
    report?.dataset?.path,
    `model ${report?.model?.provider ?? "unknown"}/${report?.model?.id ?? "unknown"}`,
    `run ${runId}`,
  ];
  const subtitle = subtitleParts.filter(Boolean).join(" · ");

  return pageTemplate({
    title,
    subtitle,
    summaryCards,
    tableHeader: `<tr><th>#</th><th>Case</th><th>Result</th><th>Details</th></tr>`,
    tableRows,
  });
}
230
+
231
/**
 * Build the HTML page for a baseline-vs-candidate comparison report.
 *
 * Produces six summary tiles (baseline pass, candidate pass, the three
 * deltas, dataset hash prefix) and one table row per baseline case. Each
 * baseline case is paired with the candidate case carrying the same `id`.
 * A baseline case without an `id` is paired with the candidate case at the
 * same position, provided that candidate also has no `id`. Previously all
 * id-less cases shared one lookup key and collapsed onto a single candidate.
 *
 * NOTE(review): candidate cases with no baseline counterpart are not listed,
 * and a baseline case with no candidate counterpart is shown as a candidate
 * FAIL. Confirm whether that presentation is intended.
 *
 * @param {Record<string, any>} report - Parsed report object; missing fields fall back to defaults.
 * @param {{title?: string}} options - `title` overrides the generated page title when truthy.
 * @returns {string} Complete HTML document.
 */
function renderCompare(report, options) {
  const baselineCases = toArray(report?.baseline?.cases);
  const candidateCases = toArray(report?.candidate?.cases);

  // Index only candidates that actually carry an id (last duplicate wins).
  const candidateById = new Map();
  for (const entry of candidateCases) {
    if (entry?.id != null) {
      candidateById.set(String(entry.id), entry);
    }
  }

  // Resolve the candidate counterpart of a baseline case; {} when none exists.
  const findCandidate = (baselineEntry, index) => {
    if (baselineEntry?.id != null) {
      return candidateById.get(String(baselineEntry.id)) ?? {};
    }
    const positional = candidateCases[index];
    return positional != null && positional.id == null ? positional : {};
  };

  const passSummary = (totals) =>
    `${pct(totals?.passRate)} (${totals?.passedCases ?? 0}/${totals?.scoredCases ?? 0})`;

  const summaryCards = [
    summaryCard("Baseline pass", passSummary(report?.baseline?.totals)),
    summaryCard("Candidate pass", passSummary(report?.candidate?.totals)),
    summaryCard("Δ pass rate", pct(report?.delta?.passRate)),
    summaryCard("Δ avg latency", ms(report?.delta?.avgLatencyMs)),
    summaryCard("Δ total cost", money(report?.delta?.totalCost)),
    // String() guards against a non-string hash, which has no .slice().
    summaryCard("Dataset hash", String(report?.run?.datasetHash ?? "n/a").slice(0, 12)),
  ].join("\n");

  const rows = baselineCases
    .map((baselineEntry, index) => {
      const candidateEntry = findCandidate(baselineEntry, index);

      let outcome = "same";
      if (!baselineEntry?.pass && candidateEntry?.pass) {
        outcome = "improved";
      } else if (baselineEntry?.pass && !candidateEntry?.pass) {
        outcome = "regressed";
      }

      return `<tr>
<td>${index + 1}</td>
<td><code>${esc(baselineEntry?.id ?? `case-${index + 1}`)}</code></td>
<td>${passPill(Boolean(baselineEntry?.pass))}<div class="meta">${ms(baselineEntry?.latencyMs)}</div></td>
<td>${passPill(Boolean(candidateEntry?.pass))}<div class="meta">${ms(candidateEntry?.latencyMs)}</div></td>
<td>${outcomePill(outcome)}</td>
<td><details><summary>checks + output</summary>
<div class="split">
<div>
<h4 style="margin:8px 0 6px;">Baseline checks</h4><pre>${esc(checksText(baselineEntry))}</pre>
<h4 style="margin:8px 0 6px;">Baseline output</h4><pre>${esc(baselineEntry?.outputPreview ?? "")}</pre>
</div>
<div>
<h4 style="margin:8px 0 6px;">Candidate checks</h4><pre>${esc(checksText(candidateEntry))}</pre>
<h4 style="margin:8px 0 6px;">Candidate output</h4><pre>${esc(candidateEntry?.outputPreview ?? "")}</pre>
</div>
</div>
</details></td>
</tr>`;
    })
    .join("\n");

  const title = options.title || `Evalset compare report: ${report?.dataset?.name ?? "dataset"}`;

  // The dataset path is optional; filter(Boolean) drops it when absent.
  const subtitle = [
    report?.dataset?.path,
    `model ${report?.model?.provider ?? "unknown"}/${report?.model?.id ?? "unknown"}`,
    `run ${report?.run?.runId ?? "n/a"}`,
  ]
    .filter(Boolean)
    .join(" · ");

  return pageTemplate({
    title,
    subtitle,
    summaryCards,
    tableHeader: `<tr><th>#</th><th>Case</th><th>Baseline</th><th>Candidate</th><th>Outcome</th><th>Details</th></tr>`,
    tableRows: rows,
  });
}
302
+
303
/**
 * Build a fallback HTML page for a report whose `kind` is not recognised.
 *
 * The page shows the kind in a single summary tile and the whole report as
 * pretty-printed, HTML-escaped JSON.
 *
 * @param {Record<string, any>} report - Parsed report object.
 * @param {{title?: string}} options - `title` overrides the default page title when truthy.
 * @returns {string} Complete HTML document.
 */
function renderUnknown(report, options) {
  const kind = report?.kind ?? "unknown";
  const prettyJson = JSON.stringify(report, null, 2);

  return pageTemplate({
    title: options.title || "Evalset report (raw JSON)",
    subtitle: `Unsupported report kind: ${kind}`,
    summaryCards: summaryCard("Kind", kind),
    tableHeader: `<tr><th>Report JSON</th></tr>`,
    tableRows: `<tr><td><pre>${esc(prettyJson)}</pre></td></tr>`,
  });
}
316
+
317
/**
 * Derive the HTML output path from the input path.
 *
 * A trailing ".json" (any letter case) is swapped for ".html"; any other
 * path simply gets ".html" appended.
 *
 * @param {string} inputPath
 * @returns {string}
 */
function defaultOutputPath(inputPath) {
  const jsonSuffix = /\.json$/i;
  if (!jsonSuffix.test(inputPath)) {
    return `${inputPath}.html`;
  }
  return inputPath.replace(jsonSuffix, ".html");
}
323
+
324
/**
 * CLI entry point: read a report JSON file, render it, and write the HTML.
 *
 * Steps: parse arguments (printing usage and returning on --help), resolve
 * input and output paths against the current working directory, parse the
 * report, pick a renderer by `report.kind`, create the output directory if
 * needed, then write the page followed by a trailing newline.
 *
 * @returns {Promise<void>}
 * @throws {Error} When --in is missing, the output path equals the input
 *   path, the file cannot be read, the JSON is invalid, or the parsed value
 *   is not an object.
 */
async function main() {
  const cli = parseArgs(process.argv.slice(2));

  if (cli.help) {
    console.log(HELP_TEXT.trimEnd());
    return;
  }

  const input = resolve(process.cwd(), requireValue(cli.input, "--in"));
  const output = resolve(
    process.cwd(),
    cli.output ? requireValue(cli.output, "--out") : defaultOutputPath(input),
  );

  // An explicit --out pointing at the input would replace the source report
  // with HTML; refuse rather than destroy the data.
  if (output === input) {
    throw new Error(`Output path must differ from input path: ${input}`);
  }

  const raw = await readFile(input, "utf8");

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (error) {
    // Name the offending file; the bare parser message does not.
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Invalid JSON in ${input}: ${reason}`, { cause: error });
  }

  if (!isRecord(parsed)) {
    throw new Error("Report JSON must be an object.");
  }

  let html;
  if (parsed.kind === "evalset-run") {
    html = renderRun(parsed, { title: cli.title });
  } else if (parsed.kind === "evalset-compare") {
    html = renderCompare(parsed, { title: cli.title });
  } else {
    html = renderUnknown(parsed, { title: cli.title });
  }

  await mkdir(dirname(output), { recursive: true });
  await writeFile(output, `${html}\n`, "utf8");

  console.log(`Exported HTML report: ${output}`);
}
359
+
360
/**
 * Print a one-line failure message to stderr and exit with status 1.
 *
 * @param {unknown} error - Rejection reason from main().
 */
function reportFatalError(error) {
  let message;
  if (error instanceof Error) {
    message = error.message;
  } else {
    message = String(error);
  }
  console.error(`export-evalset-report-html error: ${message}`);
  process.exit(1);
}

// Run the CLI; any rejection is reported and turned into a non-zero exit.
main().catch(reportFatalError);