thumbgate 1.5.1 → 1.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +2 -2
- package/.claude-plugin/plugin.json +1 -1
- package/.well-known/mcp/server-card.json +1 -1
- package/CHANGELOG.md +504 -0
- package/README.md +36 -6
- package/adapters/README.md +1 -1
- package/adapters/claude/.mcp.json +2 -2
- package/adapters/codex/config.toml +2 -2
- package/adapters/mcp/server-stdio.js +1 -1
- package/adapters/opencode/opencode.json +1 -1
- package/bench/prompt-eval-suite.json +106 -0
- package/package.json +36 -27
- package/public/dashboard.html +1436 -0
- package/public/index.html +9 -10
- package/public/lessons.html +16 -0
- package/scripts/prompt-eval.js +363 -0
package/public/index.html
CHANGED
|
@@ -872,7 +872,7 @@ __GA_BOOTSTRAP__
|
|
|
872
872
|
<!-- HOW IT WORKS -->
|
|
873
873
|
<section class="how-it-works" id="how-it-works">
|
|
874
874
|
<div class="container">
|
|
875
|
-
<div class="section-label">New in v1.5.
|
|
875
|
+
<div class="section-label">New in v1.5.3</div>
|
|
876
876
|
<h2 class="section-title">Three steps to stop repeated AI failures</h2>
|
|
877
877
|
<div class="steps">
|
|
878
878
|
<div class="step">
|
|
@@ -1024,12 +1024,11 @@ __GA_BOOTSTRAP__
|
|
|
1024
1024
|
<p style="font-size:13px;color:#aaa;margin-bottom:16px;">3 captures, 1 rule, 1 agent. Enough to prove the enforcement loop works. When you need more, you will know.</p>
|
|
1025
1025
|
<ul>
|
|
1026
1026
|
<li><strong>3 feedback captures total</strong> (not per day)</li>
|
|
1027
|
-
<li>1 prevention rule</li>
|
|
1028
|
-
<li>1 agent</li>
|
|
1027
|
+
<li>1 auto-promoted prevention rule</li>
|
|
1029
1028
|
<li>No recall or lesson search</li>
|
|
1030
|
-
<li>No exports</li>
|
|
1031
|
-
<li>All MCP integrations (Claude Code, Cursor, Codex,
|
|
1032
|
-
<li>PreToolUse hook blocking</li>
|
|
1029
|
+
<li>No exports (DPO, Databricks, HuggingFace)</li>
|
|
1030
|
+
<li>All MCP integrations (Claude Code, Cursor, Codex, Gemini, Amp, any MCP agent)</li>
|
|
1031
|
+
<li>PreToolUse hook blocking with built-in safety gates (force-push, destructive SQL, secrets)</li>
|
|
1033
1032
|
<li><a href="/guide" style="color:var(--cyan);text-decoration:underline;">Setup guide for all agents →</a></li>
|
|
1034
1033
|
</ul>
|
|
1035
1034
|
<div class="hero-install" onclick="copyInstall(this)" title="Click to copy" style="margin-bottom:12px;width:100%;justify-content:center;">
|
|
@@ -1060,9 +1059,9 @@ __GA_BOOTSTRAP__
|
|
|
1060
1059
|
</div>
|
|
1061
1060
|
<ul>
|
|
1062
1061
|
<li>Everything in Free, plus:</li>
|
|
1063
|
-
<li><a href="/dashboard" style="color:var(--cyan);text-decoration:underline;">Visual gate debugger →</a> see every blocked action and the gate that fired so you can trust the system in minutes</li>
|
|
1062
|
+
<li><a href="/dashboard#insights" style="color:var(--cyan);text-decoration:underline;">Visual gate debugger →</a> see every blocked action and the gate that fired so you can trust the system in minutes</li>
|
|
1064
1063
|
<li>Auto-connect — activate once with your license key, then your running agents appear automatically on your local dashboard</li>
|
|
1065
|
-
<li><a href="/dashboard" style="color:var(--cyan);text-decoration:underline;">DPO training data export →</a> turn real thumbs-downs into ready-to-use preference pairs for fine-tuning (LoRA / JSONL)</li>
|
|
1064
|
+
<li><a href="/dashboard#export" style="color:var(--cyan);text-decoration:underline;">DPO training data export →</a> turn real thumbs-downs into ready-to-use preference pairs for fine-tuning (LoRA / JSONL)</li>
|
|
1066
1065
|
<li><strong>HuggingFace dataset export</strong> — share PII-redacted agent traces as open training datasets (<code>npm run export:hf</code>)</li>
|
|
1067
1066
|
<li><strong>Model Hardening Advisor</strong> — get recommendations on when and how to fine-tune your model to natively avoid recurring failures</li>
|
|
1068
1067
|
<li>Personal local dashboard — every Pro user gets a localhost dashboard without extra cloud setup</li>
|
|
@@ -1082,7 +1081,7 @@ __GA_BOOTSTRAP__
|
|
|
1082
1081
|
<div class="price-sub">3-seat minimum · One engineer's correction protects the whole team</div>
|
|
1083
1082
|
<p style="font-size:13px;color:var(--green);margin-bottom:16px;font-weight:500;">When one engineer teaches the agent not to delete staging data, that lesson applies to every agent on the team. Stop paying the same mistake tax across different developers.</p>
|
|
1084
1083
|
<div class="pro-upgrade-triggers" style="font-size:12px;color:#aaa;margin-bottom:12px;">
|
|
1085
|
-
<strong style="color:#fff;">Previously $
|
|
1084
|
+
<strong style="color:#fff;">Previously $99/seat.</strong> Now $49/seat. Start with one repo, one workflow, one repeat failure.
|
|
1086
1085
|
</div>
|
|
1087
1086
|
<ul>
|
|
1088
1087
|
<li>Workflow hardening sprint — map one painful workflow, one repeated failure, and one buyer proof review before wider rollout</li>
|
|
@@ -1229,7 +1228,7 @@ __GA_BOOTSTRAP__
|
|
|
1229
1228
|
<a href="https://www.linkedin.com/in/igorganapolsky" target="_blank" rel="noopener">LinkedIn</a>
|
|
1230
1229
|
<a href="/blog">Blog</a>
|
|
1231
1230
|
</div>
|
|
1232
|
-
<span class="footer-copy">© 2026 Max Smith KDP LLC · MIT License · v1.5.
|
|
1231
|
+
<span class="footer-copy">© 2026 Max Smith KDP LLC · MIT License · v1.5.3</span>
|
|
1233
1232
|
</div>
|
|
1234
1233
|
</footer>
|
|
1235
1234
|
|
package/public/lessons.html
CHANGED
|
@@ -936,6 +936,22 @@ loadLive().then(function() {
|
|
|
936
936
|
el.style.borderColor = 'var(--cyan)';
|
|
937
937
|
el.style.boxShadow = '0 0 12px rgba(34,211,238,0.3)';
|
|
938
938
|
setTimeout(function() { el.style.borderColor = ''; el.style.boxShadow = ''; }, 4000);
|
|
939
|
+
} else {
|
|
940
|
+
// Deep-link hash was provided but no matching lesson/feedback was found.
|
|
941
|
+
// Surface this to the user instead of silently loading the page top — the
|
|
942
|
+
// linked lesson may have been pruned, rotated, or the ID may be stale.
|
|
943
|
+
var banner = document.createElement('div');
|
|
944
|
+
banner.setAttribute('role', 'alert');
|
|
945
|
+
banner.style.cssText = 'margin:12px 0;padding:12px 16px;border:1px solid rgba(251,191,36,0.4);background:rgba(251,191,36,0.08);border-radius:10px;color:var(--text);font-size:13px;';
|
|
946
|
+
banner.innerHTML =
|
|
947
|
+
'<strong style="color:#fbbf24;">Lesson not found:</strong> ' +
|
|
948
|
+
'<code style="background:var(--bg-card);padding:2px 6px;border-radius:4px;font-size:12px;">' +
|
|
949
|
+
hash.replace(/[<>&"]/g, function(c) { return {'<':'&lt;','>':'&gt;','&':'&amp;','"':'&quot;'}[c]; }) +
|
|
950
|
+
'</code>' +
|
|
951
|
+
' — this ID is not in the active lesson index. It may have been rotated or the statusbar link is stale. ' +
|
|
952
|
+
'Browse rules and timeline below to find what you were looking for.';
|
|
953
|
+
var container = document.querySelector('.container') || document.body;
|
|
954
|
+
container.insertBefore(banner, container.firstChild);
|
|
939
955
|
}
|
|
940
956
|
});
|
|
941
957
|
</script>
|
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Prompt Evaluation Framework for ThumbGate
|
|
6
|
+
*
|
|
7
|
+
* Based on Anthropic's prompt evaluation methodology:
|
|
8
|
+
* 1. Define test cases with inputs and expected outputs
|
|
9
|
+
* 2. Run prompts against test cases
|
|
10
|
+
* 3. Grade outputs against expectations (deterministic + LLM-as-judge)
|
|
11
|
+
* 4. Report pass/fail with scores
|
|
12
|
+
*
|
|
13
|
+
* Usage:
|
|
14
|
+
* node scripts/prompt-eval.js [--suite=path] [--json] [--min-score=80]
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
const fs = require('node:fs');
|
|
18
|
+
const os = require('node:os');
|
|
19
|
+
const path = require('node:path');
|
|
20
|
+
|
|
21
|
+
// Parent directory of the folder that contains this script.
const ROOT = path.resolve(__dirname, '..');
// Suite file used when the caller does not supply a path of its own.
const DEFAULT_SUITE = path.join(ROOT, 'bench', 'prompt-eval-suite.json');
|
|
23
|
+
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
// Prompt simulators — run ThumbGate's actual logic against eval inputs
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
/**
 * Run one eval input through `captureFeedback` (from ./feedback-loop) while
 * THUMBGATE_FEEDBACK_DIR points at a throwaway temp directory.
 *
 * @param {object} input - Eval case input. Reads `signal`, `context`,
 *   `whatWentWrong`, `whatToChange`, `whatWorked`, and `tags`.
 * @returns {*} Whatever `captureFeedback` returns.
 */
function simulateLessonDistillation(input) {
  const { captureFeedback } = require('./feedback-loop');
  const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'tg-eval-'));
  const prevDir = process.env.THUMBGATE_FEEDBACK_DIR;
  process.env.THUMBGATE_FEEDBACK_DIR = tmpDir;

  try {
    // NOTE(review): the cleanup below runs as soon as this call returns; if
    // captureFeedback is asynchronous it would outlive the temp dir — confirm.
    return captureFeedback({
      // Any signal other than 'positive' is treated as a thumbs-down.
      signal: input.signal === 'positive' ? 'up' : 'down',
      context: input.context || '',
      whatWentWrong: input.whatWentWrong || undefined,
      whatToChange: input.whatToChange || undefined,
      whatWorked: input.whatWorked || undefined,
      tags: input.tags || [],
    });
  } finally {
    // Restore the variable exactly: an unset variable stays unset and an
    // empty-string value stays an empty string. Assigning `undefined` to
    // process.env would store the literal string 'undefined', hence delete.
    if (prevDir === undefined) {
      delete process.env.THUMBGATE_FEEDBACK_DIR;
    } else {
      process.env.THUMBGATE_FEEDBACK_DIR = prevDir;
    }
    // Best-effort cleanup: a leftover temp dir must not fail the evaluation.
    try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
  }
}
|
|
51
|
+
|
|
52
|
+
/**
 * Forward an eval input to `enrichFeedbackContext` from ./feedback-loop.
 *
 * @param {object} input - Eval case input; `signal`, `context`, and `tags` are read.
 * @returns {*} The value returned by `enrichFeedbackContext`.
 */
function simulateFeedbackEnrichment(input) {
  const { enrichFeedbackContext } = require('./feedback-loop');
  const { signal, context, tags } = input;
  // A missing/falsy tag list is normalized to an empty array.
  const payload = { signal, context, tags: tags || [] };
  return enrichFeedbackContext(payload);
}
|
|
60
|
+
|
|
61
|
+
/**
 * Echo the pattern fields of an eval input and mark the result as generated.
 * No rule engine is invoked here; only the output structure is produced.
 *
 * @param {object} input - Eval case input with `pattern`, `occurrences`, `examples`.
 * @returns {{pattern: *, occurrences: *, examples: *, generated: boolean}}
 */
function simulatePreventionRule(input) {
  const { pattern, occurrences, examples } = input;
  return { pattern, occurrences, examples, generated: true };
}
|
|
71
|
+
|
|
72
|
+
/**
 * Build a self-distillation result by joining each feedback entry's
 * `context` with '; '.
 *
 * @param {object} input - Eval case input; `sessionFeedback` must provide `.map`.
 * @returns {{sessionFeedback: *, summary: string, generated: boolean}}
 */
function simulateSelfDistill(input) {
  const { sessionFeedback } = input;
  const contexts = sessionFeedback.map(({ context }) => context);
  return {
    sessionFeedback,
    summary: contexts.join('; '),
    generated: true,
  };
}
|
|
79
|
+
|
|
80
|
+
// Dispatch table: eval-case `prompt` name -> simulator function.
const PROMPT_SIMULATORS = Object.fromEntries([
  ['lesson-distillation', simulateLessonDistillation],
  ['feedback-enrichment', simulateFeedbackEnrichment],
  ['prevention-rule-generation', simulatePreventionRule],
  ['self-distillation', simulateSelfDistill],
]);
|
|
86
|
+
|
|
87
|
+
// ---------------------------------------------------------------------------
|
|
88
|
+
// Deterministic graders — check output against expected fields
|
|
89
|
+
// ---------------------------------------------------------------------------
|
|
90
|
+
|
|
91
|
+
/**
 * Return the first argument whose type is string (an empty string counts),
 * or '' when none of the arguments is a string.
 *
 * @param {...*} values - Candidates, checked in order.
 * @returns {string}
 */
function firstString(...values) {
  const match = values.find((candidate) => typeof candidate === 'string');
  return match === undefined ? '' : match;
}
|
|
97
|
+
|
|
98
|
+
/**
 * Append one case-insensitive "contains" check per term to `checks`.
 *
 * @param {Array<object>} checks - Accumulator; mutated in place.
 * @param {string} prefix - Criterion prefix; each criterion is `${prefix}:${term}`.
 * @param {string} label - Human-readable field name used in the detail text.
 * @param {string} content - Text that is searched.
 * @param {string[]|null} [terms] - Substrings to look for; missing or null means none.
 * @returns {void}
 */
function addContainsChecks(checks, prefix, label, content, terms = []) {
  // The default parameter only covers `undefined`; a suite that spells out
  // `null` for a term list must not throw.
  const wanted = terms ?? [];
  // Lower-case the haystack once, and only if there is at least one term.
  let haystack = null;

  for (const term of wanted) {
    if (haystack === null) haystack = content.toLowerCase();
    const found = haystack.includes(term.toLowerCase());
    checks.push({
      criterion: `${prefix}:${term}`,
      pass: found,
      detail: found ? `${label} contains "${term}"` : `${label} missing "${term}"`,
    });
  }
}
|
|
108
|
+
|
|
109
|
+
/**
 * When the case expects a rejection, record a single `shouldReject` check.
 *
 * A result counts as rejected when `accepted` is exactly false, `status` is
 * 'rejected', or `actionType` is 'no-action'.
 *
 * @param {Array<object>} checks - Accumulator; mutated in place.
 * @param {object} result - Simulator output.
 * @param {object} expected - Expectations for the case.
 * @returns {boolean} true when a rejection was expected (a check was added),
 *   false otherwise.
 */
function handleRejectExpectation(checks, result, expected) {
  if (!expected.shouldReject) {
    return false;
  }

  let wasRejected = false;
  if (result.accepted === false) {
    wasRejected = true;
  } else if (result.status === 'rejected') {
    wasRejected = true;
  } else if (result.actionType === 'no-action') {
    wasRejected = true;
  }

  const detail = wasRejected
    ? 'Correctly rejected vague input'
    : 'Should have rejected but accepted';
  checks.push({ criterion: 'shouldReject', pass: wasRejected, detail });
  return true;
}
|
|
122
|
+
|
|
123
|
+
/**
 * When `expected.hasTitle` is set, check that the result carries a non-empty
 * title and that it contains every term in `expected.titleContains`.
 * The title is taken from `result.memoryRecord.title`, else `result.title`.
 *
 * @param {Array<object>} checks - Accumulator; mutated in place.
 * @param {object} result - Simulator output.
 * @param {object} expected - Expectations for the case.
 * @returns {void}
 */
function addTitleChecks(checks, result, expected) {
  if (expected.hasTitle) {
    const title = firstString(result.memoryRecord?.title, result.title);
    const present = title.length > 0;
    // Only the first 60 characters of the title are echoed in the report.
    const detail = present ? `Title: "${title.slice(0, 60)}"` : 'Missing title';
    checks.push({ criterion: 'hasTitle', pass: present, detail });
    addContainsChecks(checks, 'titleContains', 'Title', title, expected.titleContains);
  }
}
|
|
134
|
+
|
|
135
|
+
/**
 * When `expected.hasContent` is set, check that the result carries non-empty
 * content and that it contains every term in `expected.contentContains`.
 * The content is taken from `result.memoryRecord.content`, else `result.content`.
 *
 * @param {Array<object>} checks - Accumulator; mutated in place.
 * @param {object} result - Simulator output.
 * @param {object} expected - Expectations for the case.
 * @returns {void}
 */
function addContentChecks(checks, result, expected) {
  if (expected.hasContent) {
    const content = firstString(result.memoryRecord?.content, result.content);
    const present = content.length > 0;
    const detail = present ? `Content length: ${content.length}` : 'Missing content';
    checks.push({ criterion: 'hasContent', pass: present, detail });
    addContainsChecks(checks, 'contentContains', 'Content', content, expected.contentContains);
  }
}
|
|
146
|
+
|
|
147
|
+
/**
 * Add exact-match checks for `category` and `importance`, each only when the
 * expectation names a (truthy) value for it. The actual value is read from
 * `result.memoryRecord` first, then from `result` itself.
 *
 * @param {Array<object>} checks - Accumulator; mutated in place.
 * @param {object} result - Simulator output.
 * @param {object} expected - Expectations for the case.
 * @returns {void}
 */
function addCategoryChecks(checks, result, expected) {
  // Both fields are graded identically, category first.
  for (const field of ['category', 'importance']) {
    const wanted = expected[field];
    if (!wanted) continue;

    const actual = firstString(result.memoryRecord?.[field], result[field]);
    checks.push({
      criterion: field,
      pass: actual === wanted,
      detail: `Expected "${wanted}", got "${actual}"`,
    });
  }
}
|
|
166
|
+
|
|
167
|
+
/**
 * Add checks for the enriched-context fields.
 *
 * - `expected.hasDomain`: the domain (`result.richContext.domain`, else
 *   `result.domain`) must equal `expected.domain` when that is given, and
 *   otherwise merely be non-empty.
 * - `expected.hasOutcome`: the outcome (`result.richContext.outcomeCategory`,
 *   else `result.outcome`) must be non-empty and contain every term in
 *   `expected.outcomeContains`.
 *
 * @param {Array<object>} checks - Accumulator; mutated in place.
 * @param {object} result - Simulator output.
 * @param {object} expected - Expectations for the case.
 * @returns {void}
 */
function addContextChecks(checks, result, expected) {
  if (expected.hasDomain) {
    const domain = firstString(result.richContext?.domain, result.domain);
    let domainOk;
    if (expected.domain) {
      domainOk = domain === expected.domain;
    } else {
      domainOk = domain.length > 0;
    }
    checks.push({ criterion: 'domain', pass: domainOk, detail: `Domain: "${domain}"` });
  }

  if (!expected.hasOutcome) return;

  const outcome = firstString(result.richContext?.outcomeCategory, result.outcome);
  checks.push({
    criterion: 'hasOutcome',
    pass: outcome.length > 0,
    detail: `Outcome: "${outcome}"`,
  });
  addContainsChecks(checks, 'outcomeContains', 'Outcome', outcome, expected.outcomeContains);
}
|
|
187
|
+
|
|
188
|
+
/**
 * When `expected.hasRule` is set, check that the result reports a rule:
 * either `result.generated` is exactly true or `result.rule` is truthy.
 *
 * @param {Array<object>} checks - Accumulator; mutated in place.
 * @param {object} result - Simulator output.
 * @param {object} expected - Expectations for the case.
 * @returns {void}
 */
function addRuleChecks(checks, result, expected) {
  if (!expected.hasRule) return;

  const hasRule = result.generated === true || Boolean(result.rule);
  checks.push({
    criterion: 'hasRule',
    pass: hasRule,
    // Derive the message from the verdict itself so the two can never
    // disagree (e.g. a result with `rule` set but no `generated` flag).
    detail: hasRule ? 'Rule generated' : 'No rule generated',
  });
}
|
|
197
|
+
|
|
198
|
+
/**
 * When `expected.hasSummary` is set, check that `result.summary` is a
 * non-empty string and contains every term in `expected.summaryContains`.
 *
 * @param {Array<object>} checks - Accumulator; mutated in place.
 * @param {object} result - Simulator output.
 * @param {object} expected - Expectations for the case.
 * @returns {void}
 */
function addSummaryChecks(checks, result, expected) {
  if (expected.hasSummary) {
    // firstString turns a non-string summary into ''.
    const summary = firstString(result.summary);
    const size = summary.length;
    checks.push({
      criterion: 'hasSummary',
      pass: size > 0,
      detail: `Summary length: ${size}`,
    });
    addContainsChecks(checks, 'summaryContains', 'Summary', summary, expected.summaryContains);
  }
}
|
|
209
|
+
|
|
210
|
+
/**
 * Grade a simulator output against a case's expectations.
 *
 * If a rejection is expected, only the `shouldReject` check is produced.
 * Otherwise the title, content, category/importance, context, rule, and
 * summary graders run in that order, each appending the checks it applies.
 *
 * @param {object} [output] - Simulator output; a falsy value is graded as `{}`.
 * @param {object} [expected] - Expectations; a falsy value is graded as `{}`,
 *   which yields no checks.
 * @returns {Array<{criterion: string, pass: boolean, detail: string}>}
 */
function gradeOutput(output, expected) {
  const checks = [];
  const result = output || {};
  // A case without an expectation object must not throw from this function:
  // callers invoke it outside their own error handling.
  const expectations = expected || {};

  if (handleRejectExpectation(checks, result, expectations)) return checks;

  const graders = [
    addTitleChecks,
    addContentChecks,
    addCategoryChecks,
    addContextChecks,
    addRuleChecks,
    addSummaryChecks,
  ];
  for (const grade of graders) {
    grade(checks, result, expectations);
  }

  return checks;
}
|
|
225
|
+
|
|
226
|
+
// ---------------------------------------------------------------------------
|
|
227
|
+
// Runner
|
|
228
|
+
// ---------------------------------------------------------------------------
|
|
229
|
+
|
|
230
|
+
/**
 * Read and validate an evaluation suite file.
 *
 * @param {string} suitePath - Path to a JSON suite file.
 * @returns {object} The parsed suite, with a non-empty `evaluations` array.
 * @throws {Error} When the file cannot be read or parsed (from fs / JSON.parse),
 *   or when the document has no non-empty `evaluations` array.
 */
function loadSuite(suitePath) {
  const raw = JSON.parse(fs.readFileSync(suitePath, 'utf8'));
  // `?.` covers a document whose root is JSON `null`, so it gets the same
  // validation error as any other malformed suite.
  const evaluations = raw?.evaluations;
  if (!Array.isArray(evaluations) || evaluations.length === 0) {
    throw new Error('Suite must define a non-empty evaluations array');
  }
  return raw;
}
|
|
237
|
+
|
|
238
|
+
/**
 * Run a single evaluation case: pick its simulator, execute it, grade the
 * output, and compute a percentage score.
 *
 * @param {object} evalCase - Case with `id`, `prompt`, `input`, `expectedOutput`.
 * @returns {object} One of:
 *   - `{ id, status: 'skip', reason, checks: [], score: 0 }` when no simulator
 *     is registered for `evalCase.prompt`;
 *   - `{ id, status: 'error', error, checks: [], score: 0 }` when the
 *     simulator throws;
 *   - `{ id, status: 'pass'|'fail', checks, score, passCount, totalChecks }`
 *     otherwise ('pass' only at a score of 100; a case with no checks scores 0).
 */
function runEvaluation(evalCase) {
  const promptName = evalCase.prompt;
  // Own-property lookup only: names such as 'constructor' or 'toString' must
  // not resolve to functions inherited from Object.prototype.
  const simulator = Object.prototype.hasOwnProperty.call(PROMPT_SIMULATORS, promptName)
    ? PROMPT_SIMULATORS[promptName]
    : undefined;

  if (typeof simulator !== 'function') {
    return {
      id: evalCase.id,
      status: 'skip',
      reason: `No simulator for prompt: ${promptName}`,
      checks: [],
      score: 0,
    };
  }

  let output;
  try {
    output = simulator(evalCase.input);
  } catch (err) {
    // Return from the handler directly so that any thrown value is reported
    // as an error, including null/undefined and empty strings.
    return {
      id: evalCase.id,
      status: 'error',
      error: (err && err.message) || String(err) || 'Unknown error',
      checks: [],
      score: 0,
    };
  }

  const checks = gradeOutput(output, evalCase.expectedOutput);
  const passCount = checks.filter((c) => c.pass).length;
  const score = checks.length > 0 ? Math.round((passCount / checks.length) * 100) : 0;

  return {
    id: evalCase.id,
    status: score === 100 ? 'pass' : 'fail',
    checks,
    score,
    passCount,
    totalChecks: checks.length,
  };
}
|
|
281
|
+
|
|
282
|
+
/**
 * Load a suite, run every evaluation in it, and build the summary report.
 *
 * The suite score is the rounded mean of all per-case scores; skipped and
 * errored cases contribute their score of 0.
 *
 * @param {string} [suitePath=DEFAULT_SUITE] - Path to the suite JSON file.
 * @param {object} [options]
 * @param {number} [options.minScore=80] - Minimum suite score for `pass`.
 *   A numeric 0 is honored as a real threshold.
 * @returns {{suite: *, total: number, passed: number, failed: number,
 *   errors: number, skipped: number, score: number, minScore: *,
 *   pass: boolean, results: object[]}}
 * @throws {Error} Propagates any error raised by `loadSuite`.
 */
function runSuite(suitePath = DEFAULT_SUITE, options = {}) {
  const suite = loadSuite(suitePath);
  const results = suite.evaluations.map((evalCase) => runEvaluation(evalCase));

  const countByStatus = (status) => results.filter((r) => r.status === status).length;
  const totalScore = results.length > 0
    ? Math.round(results.reduce((sum, r) => sum + r.score, 0) / results.length)
    : 0;

  // `||` alone would turn an explicit threshold of 0 into 80. Real numbers
  // (NaN excluded) are taken as given; everything else keeps the old fallback.
  const requested = options.minScore;
  const minScore = typeof requested === 'number' && !Number.isNaN(requested)
    ? requested
    : (requested || 80);

  return {
    suite: suite.name,
    total: results.length,
    passed: countByStatus('pass'),
    failed: countByStatus('fail'),
    errors: countByStatus('error'),
    skipped: countByStatus('skip'),
    score: totalScore,
    minScore,
    pass: totalScore >= minScore,
    results,
  };
}
|
|
311
|
+
|
|
312
|
+
// ---------------------------------------------------------------------------
|
|
313
|
+
// CLI
|
|
314
|
+
// ---------------------------------------------------------------------------
|
|
315
|
+
|
|
316
|
+
/**
 * Map a result status to the glyph printed in the text report.
 *
 * @param {string} status - 'pass', 'skip', or anything else.
 * @returns {string} U+2705 for 'pass', U+23ED for 'skip', U+274C otherwise.
 */
function statusIcon(status) {
  switch (status) {
    case 'pass':
      return '\u2705';
    case 'skip':
      return '\u23ED';
    default:
      return '\u274C';
  }
}
|
|
321
|
+
|
|
322
|
+
/**
 * Tell whether this file is the script named on the command line, by
 * comparing the resolved `process.argv[1]` with this module's `__filename`.
 *
 * @returns {boolean} false when `process.argv[1]` is missing or names another file.
 */
function isCliInvocation() {
  const entryScript = process.argv[1];
  if (!entryScript) {
    return false;
  }
  return path.resolve(entryScript) === __filename;
}
|
|
325
|
+
|
|
326
|
+
// CLI entry point: parse flags, run the suite, print a report, and exit with
// status 0 when the suite meets the minimum score, 1 otherwise.
if (isCliInvocation()) {
  const SUITE_FLAG = '--suite=';
  const MIN_SCORE_FLAG = '--min-score=';

  let suitePath = DEFAULT_SUITE;
  let json = false;
  let minScore = 80;

  for (const arg of process.argv.slice(2)) {
    if (arg.startsWith(SUITE_FLAG)) {
      suitePath = path.resolve(arg.slice(SUITE_FLAG.length));
    } else if (arg === '--json') {
      json = true;
    } else if (arg.startsWith(MIN_SCORE_FLAG)) {
      const rawValue = arg.slice(MIN_SCORE_FLAG.length);
      const parsed = Number(rawValue);
      if (rawValue.trim() !== '' && Number.isFinite(parsed)) {
        minScore = parsed;
      } else {
        // Warn on stderr so --json output on stdout stays machine-readable.
        console.error(`Ignoring invalid ${MIN_SCORE_FLAG}${rawValue}; using ${minScore}%`);
      }
    }
  }

  const report = runSuite(suitePath, { minScore });

  if (json) {
    console.log(JSON.stringify(report, null, 2));
  } else {
    const divider = '='.repeat(50);
    console.log(`\n${report.suite}`);
    console.log(divider);
    for (const r of report.results) {
      const icon = statusIcon(r.status);
      console.log(`${icon} ${r.id} — ${r.score}% (${r.passCount || 0}/${r.totalChecks || 0})`);
      if (r.status === 'fail' || r.status === 'error') {
        for (const c of (r.checks || [])) {
          if (!c.pass) console.log(` \u274C ${c.criterion}: ${c.detail}`);
        }
        if (r.error) console.log(` Error: ${r.error}`);
      }
    }
    console.log(divider);
    console.log(`Score: ${report.score}% | Pass: ${report.passed} | Fail: ${report.failed} | Error: ${report.errors} | Skip: ${report.skipped}`);
    // Print the threshold the report was actually graded against.
    console.log(report.pass ? '\u2705 PASS' : `\u274C FAIL (min: ${report.minScore}%)`);
  }

  process.exit(report.pass ? 0 : 1);
}
|
|
362
|
+
|
|
363
|
+
module.exports = { runSuite, runEvaluation, gradeOutput, loadSuite };
|