agent-regression-lab 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -123
- package/dist/agent/httpAdapter.js +78 -0
- package/dist/agent/mockAdapter.js +210 -13
- package/dist/config.js +37 -1
- package/dist/conversationEvaluators.js +167 -0
- package/dist/conversationRunner.js +199 -0
- package/dist/index.js +287 -102
- package/dist/lib/id.js +3 -0
- package/dist/scenarios.js +121 -9
- package/dist/storage.js +193 -29
- package/dist/tools.js +246 -0
- package/dist/ui/App.js +39 -3
- package/dist/ui/server.js +47 -3
- package/dist/ui-assets/client.css +174 -0
- package/dist/ui-assets/client.js +22185 -0
- package/docs/agents.md +152 -0
- package/docs/release-checklist.md +64 -0
- package/docs/scenarios.md +172 -0
- package/docs/tools.md +102 -0
- package/docs/troubleshooting.md +158 -0
- package/package.json +6 -3
package/dist/ui/server.js
CHANGED
|
@@ -1,12 +1,15 @@
|
|
|
1
1
|
import { build } from "esbuild";
|
|
2
2
|
import { createServer } from "node:http";
|
|
3
|
-
import { readFileSync, existsSync } from "node:fs";
|
|
4
|
-
import { extname, resolve } from "node:path";
|
|
3
|
+
import { readFileSync, existsSync, writeFileSync } from "node:fs";
|
|
4
|
+
import { dirname, extname, resolve } from "node:path";
|
|
5
|
+
import { fileURLToPath } from "node:url";
|
|
5
6
|
import { ensureDir } from "../lib/fs.js";
|
|
6
7
|
import { getRunErrorDetail } from "../runOutput.js";
|
|
7
8
|
import { Storage } from "../storage.js";
|
|
8
9
|
const UI_ROOT = resolve("artifacts", "ui");
|
|
9
10
|
const ASSETS_ROOT = resolve(UI_ROOT, "assets");
|
|
11
|
+
const PACKAGED_ASSETS_ROOT = resolve(dirname(fileURLToPath(import.meta.url)), "..", "ui-assets");
|
|
12
|
+
const SOURCE_UI_ENTRY = resolve("src", "ui", "client.tsx");
|
|
10
13
|
const PORT = 4173;
|
|
11
14
|
export async function startUiServer() {
|
|
12
15
|
await buildUiAssets();
|
|
@@ -77,6 +80,10 @@ function handleApi(url, response) {
|
|
|
77
80
|
...comparison.candidate,
|
|
78
81
|
errorDetail: getRunErrorDetail(comparison.candidate),
|
|
79
82
|
},
|
|
83
|
+
classification: comparison.classification,
|
|
84
|
+
verdictDelta: comparison.verdictDelta,
|
|
85
|
+
terminationDelta: comparison.terminationDelta,
|
|
86
|
+
outputChanged: comparison.outputChanged,
|
|
80
87
|
notes: comparison.notes,
|
|
81
88
|
deltas: comparison.deltas,
|
|
82
89
|
evaluatorDiffs: comparison.evaluatorDiffs,
|
|
@@ -84,16 +91,38 @@ function handleApi(url, response) {
|
|
|
84
91
|
});
|
|
85
92
|
return;
|
|
86
93
|
}
|
|
94
|
+
if (url.pathname === "/api/compare-suite") {
|
|
95
|
+
const baselineBatch = url.searchParams.get("baselineBatch");
|
|
96
|
+
const candidateBatch = url.searchParams.get("candidateBatch");
|
|
97
|
+
if (!baselineBatch || !candidateBatch) {
|
|
98
|
+
sendJson(response, 400, { error: "Both 'baselineBatch' and 'candidateBatch' query params are required." });
|
|
99
|
+
return;
|
|
100
|
+
}
|
|
101
|
+
const comparison = storage.compareSuites(baselineBatch, candidateBatch);
|
|
102
|
+
sendJson(response, 200, comparison);
|
|
103
|
+
return;
|
|
104
|
+
}
|
|
87
105
|
sendJson(response, 404, { error: "Not found." });
|
|
88
106
|
}
|
|
89
107
|
catch (error) {
|
|
90
108
|
sendJson(response, 500, { error: error instanceof Error ? error.message : String(error) });
|
|
91
109
|
}
|
|
110
|
+
finally {
|
|
111
|
+
storage.close();
|
|
112
|
+
}
|
|
92
113
|
}
|
|
93
114
|
async function buildUiAssets() {
|
|
115
|
+
if (existsSync(PACKAGED_ASSETS_ROOT)) {
|
|
116
|
+
ensureDir(ASSETS_ROOT);
|
|
117
|
+
writePackagedAssetCopies();
|
|
118
|
+
return;
|
|
119
|
+
}
|
|
120
|
+
if (!existsSync(SOURCE_UI_ENTRY)) {
|
|
121
|
+
throw new Error("UI assets are unavailable. Install a package build that includes dist/ui-assets or run from the repo root.");
|
|
122
|
+
}
|
|
94
123
|
ensureDir(ASSETS_ROOT);
|
|
95
124
|
await build({
|
|
96
|
-
entryPoints: [
|
|
125
|
+
entryPoints: [SOURCE_UI_ENTRY],
|
|
97
126
|
outdir: ASSETS_ROOT,
|
|
98
127
|
bundle: true,
|
|
99
128
|
format: "esm",
|
|
@@ -106,6 +135,21 @@ async function buildUiAssets() {
|
|
|
106
135
|
},
|
|
107
136
|
});
|
|
108
137
|
}
|
|
138
|
+
function writePackagedAssetCopies() {
|
|
139
|
+
for (const assetName of ["client.js", "client.css"]) {
|
|
140
|
+
const sourcePath = resolve(PACKAGED_ASSETS_ROOT, assetName);
|
|
141
|
+
const targetPath = resolve(ASSETS_ROOT, assetName);
|
|
142
|
+
if (!existsSync(sourcePath)) {
|
|
143
|
+
throw new Error(`Packaged UI asset '${assetName}' is missing.`);
|
|
144
|
+
}
|
|
145
|
+
responseSafeCopy(sourcePath, targetPath);
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
function responseSafeCopy(sourcePath, targetPath) {
|
|
149
|
+
ensureDir(dirname(targetPath));
|
|
150
|
+
const contents = readFileSync(sourcePath);
|
|
151
|
+
writeFileSync(targetPath, contents);
|
|
152
|
+
}
|
|
109
153
|
function serveStatic(path, response) {
|
|
110
154
|
if (!existsSync(path)) {
|
|
111
155
|
response.writeHead(404, { "Content-Type": "text/plain; charset=utf-8" });
|
|
package/dist/ui-assets/client.css
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
/* src/ui/styles.css */
|
|
2
|
+
:root {
|
|
3
|
+
color-scheme: light;
|
|
4
|
+
--bg: #f1ede4;
|
|
5
|
+
--panel: #fffdf7;
|
|
6
|
+
--ink: #1c1a16;
|
|
7
|
+
--muted: #665f54;
|
|
8
|
+
--line: #d6ccbc;
|
|
9
|
+
--accent: #9e3d22;
|
|
10
|
+
--pass: #1e6a42;
|
|
11
|
+
--fail: #9a2c1f;
|
|
12
|
+
--error: #5b1e72;
|
|
13
|
+
}
|
|
14
|
+
* {
|
|
15
|
+
box-sizing: border-box;
|
|
16
|
+
}
|
|
17
|
+
body {
|
|
18
|
+
margin: 0;
|
|
19
|
+
background:
|
|
20
|
+
radial-gradient(
|
|
21
|
+
circle at top,
|
|
22
|
+
#f8f3ea 0,
|
|
23
|
+
var(--bg) 45%,
|
|
24
|
+
#e4dccd 100%);
|
|
25
|
+
color: var(--ink);
|
|
26
|
+
font-family:
|
|
27
|
+
"IBM Plex Sans",
|
|
28
|
+
"Helvetica Neue",
|
|
29
|
+
sans-serif;
|
|
30
|
+
}
|
|
31
|
+
a {
|
|
32
|
+
color: var(--accent);
|
|
33
|
+
text-decoration: none;
|
|
34
|
+
}
|
|
35
|
+
pre {
|
|
36
|
+
white-space: pre-wrap;
|
|
37
|
+
word-break: break-word;
|
|
38
|
+
background: #f7f1e6;
|
|
39
|
+
border: 1px solid var(--line);
|
|
40
|
+
padding: 0.8rem;
|
|
41
|
+
border-radius: 10px;
|
|
42
|
+
}
|
|
43
|
+
.shell {
|
|
44
|
+
min-height: 100vh;
|
|
45
|
+
}
|
|
46
|
+
.topbar {
|
|
47
|
+
position: sticky;
|
|
48
|
+
top: 0;
|
|
49
|
+
backdrop-filter: blur(10px);
|
|
50
|
+
background: rgba(241, 237, 228, 0.92);
|
|
51
|
+
border-bottom: 1px solid var(--line);
|
|
52
|
+
padding: 1rem 1.25rem;
|
|
53
|
+
}
|
|
54
|
+
.brand {
|
|
55
|
+
font-family: "IBM Plex Mono", monospace;
|
|
56
|
+
font-size: 0.95rem;
|
|
57
|
+
text-transform: uppercase;
|
|
58
|
+
letter-spacing: 0.08em;
|
|
59
|
+
color: var(--ink);
|
|
60
|
+
}
|
|
61
|
+
.page {
|
|
62
|
+
max-width: 1200px;
|
|
63
|
+
margin: 0 auto;
|
|
64
|
+
padding: 1.25rem;
|
|
65
|
+
}
|
|
66
|
+
.hero {
|
|
67
|
+
margin-bottom: 1rem;
|
|
68
|
+
}
|
|
69
|
+
.hero h1 {
|
|
70
|
+
margin: 0 0 0.35rem;
|
|
71
|
+
font-size: 2rem;
|
|
72
|
+
}
|
|
73
|
+
.hero p,
|
|
74
|
+
.muted {
|
|
75
|
+
color: var(--muted);
|
|
76
|
+
}
|
|
77
|
+
.filters,
|
|
78
|
+
.stats,
|
|
79
|
+
.panel-grid,
|
|
80
|
+
.compare-grid {
|
|
81
|
+
display: grid;
|
|
82
|
+
gap: 1rem;
|
|
83
|
+
}
|
|
84
|
+
.filters {
|
|
85
|
+
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
|
|
86
|
+
margin-bottom: 1rem;
|
|
87
|
+
}
|
|
88
|
+
input,
|
|
89
|
+
select {
|
|
90
|
+
width: 100%;
|
|
91
|
+
padding: 0.75rem 0.85rem;
|
|
92
|
+
border: 1px solid var(--line);
|
|
93
|
+
border-radius: 10px;
|
|
94
|
+
background: var(--panel);
|
|
95
|
+
}
|
|
96
|
+
.stats {
|
|
97
|
+
grid-template-columns: repeat(auto-fit, minmax(160px, 1fr));
|
|
98
|
+
margin-bottom: 1rem;
|
|
99
|
+
}
|
|
100
|
+
.stat,
|
|
101
|
+
.panel,
|
|
102
|
+
.empty {
|
|
103
|
+
background: var(--panel);
|
|
104
|
+
border: 1px solid var(--line);
|
|
105
|
+
border-radius: 16px;
|
|
106
|
+
padding: 1rem;
|
|
107
|
+
}
|
|
108
|
+
.stat-value {
|
|
109
|
+
font-size: 1.4rem;
|
|
110
|
+
margin-top: 0.25rem;
|
|
111
|
+
}
|
|
112
|
+
.panel-grid,
|
|
113
|
+
.compare-grid {
|
|
114
|
+
grid-template-columns: repeat(auto-fit, minmax(320px, 1fr));
|
|
115
|
+
margin-bottom: 1rem;
|
|
116
|
+
}
|
|
117
|
+
.table {
|
|
118
|
+
width: 100%;
|
|
119
|
+
border-collapse: collapse;
|
|
120
|
+
background: var(--panel);
|
|
121
|
+
border: 1px solid var(--line);
|
|
122
|
+
border-radius: 16px;
|
|
123
|
+
overflow: hidden;
|
|
124
|
+
}
|
|
125
|
+
.table th,
|
|
126
|
+
.table td {
|
|
127
|
+
text-align: left;
|
|
128
|
+
padding: 0.85rem;
|
|
129
|
+
border-bottom: 1px solid var(--line);
|
|
130
|
+
vertical-align: top;
|
|
131
|
+
}
|
|
132
|
+
.table th {
|
|
133
|
+
font-family: "IBM Plex Mono", monospace;
|
|
134
|
+
font-size: 0.8rem;
|
|
135
|
+
text-transform: uppercase;
|
|
136
|
+
letter-spacing: 0.04em;
|
|
137
|
+
color: var(--muted);
|
|
138
|
+
}
|
|
139
|
+
.pill {
|
|
140
|
+
display: inline-block;
|
|
141
|
+
padding: 0.2rem 0.55rem;
|
|
142
|
+
border-radius: 999px;
|
|
143
|
+
font-size: 0.8rem;
|
|
144
|
+
font-weight: 700;
|
|
145
|
+
text-transform: uppercase;
|
|
146
|
+
letter-spacing: 0.04em;
|
|
147
|
+
}
|
|
148
|
+
.pill.pass {
|
|
149
|
+
background: rgba(30, 106, 66, 0.12);
|
|
150
|
+
color: var(--pass);
|
|
151
|
+
}
|
|
152
|
+
.pill.fail {
|
|
153
|
+
background: rgba(154, 44, 31, 0.12);
|
|
154
|
+
color: var(--fail);
|
|
155
|
+
}
|
|
156
|
+
.pill.error {
|
|
157
|
+
background: rgba(91, 30, 114, 0.12);
|
|
158
|
+
color: var(--error);
|
|
159
|
+
}
|
|
160
|
+
.stack,
|
|
161
|
+
.timeline {
|
|
162
|
+
display: grid;
|
|
163
|
+
gap: 0.75rem;
|
|
164
|
+
padding-left: 1rem;
|
|
165
|
+
}
|
|
166
|
+
.timeline.compact {
|
|
167
|
+
gap: 0.35rem;
|
|
168
|
+
}
|
|
169
|
+
@media (max-width: 720px) {
|
|
170
|
+
.table {
|
|
171
|
+
display: block;
|
|
172
|
+
overflow-x: auto;
|
|
173
|
+
}
|
|
174
|
+
}
|