@intentsolutions/jrig-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -0
- package/README.md +82 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.js +4234 -0
- package/package.json +73 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,4234 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// src/index.ts
|
|
4
|
+
import { Command } from "commander";
|
|
5
|
+
|
|
6
|
+
// ../core/dist/index.js
|
|
7
|
+
import { z } from "zod";
|
|
8
|
+
import { z as z2 } from "zod";
|
|
9
|
+
import { z as z3 } from "zod";
|
|
10
|
+
import { z as z4 } from "zod";
|
|
11
|
+
import { z as z5 } from "zod";
|
|
12
|
+
import {
|
|
13
|
+
SkillFrontmatterSchema,
|
|
14
|
+
skillFrontmatterIssues,
|
|
15
|
+
SKILL_FRONTMATTER_BASE_REQUIRED,
|
|
16
|
+
SKILL_FRONTMATTER_OVERLAY_REQUIRED,
|
|
17
|
+
SKILL_FRONTMATTER_REQUIRED_FIELDS,
|
|
18
|
+
SKILL_NAME_PATTERN,
|
|
19
|
+
SKILL_NAME_MAX,
|
|
20
|
+
SKILL_COMPATIBILITY_MAX,
|
|
21
|
+
SKILL_DESCRIPTION_MAX
|
|
22
|
+
} from "@intentsolutions/core/validators/v1/authoring";
|
|
23
|
+
import {
|
|
24
|
+
SkillFrontmatterSchema as KernelSkillFrontmatterSchemaInternal,
|
|
25
|
+
attach,
|
|
26
|
+
universalFoldsIssues
|
|
27
|
+
} from "@intentsolutions/core/validators/v1/authoring";
|
|
28
|
+
import { upstreamBaseIssues } from "@intentsolutions/core/validators/v1/authoring/skill-frontmatter";
|
|
29
|
+
import { parse as parseYaml } from "yaml";
|
|
30
|
+
import matter from "gray-matter";
|
|
31
|
+
import matter2 from "gray-matter";
|
|
32
|
+
import { existsSync, readFileSync, statSync } from "fs";
|
|
33
|
+
import { join, resolve } from "path";
|
|
34
|
+
import { z as z6 } from "zod";
|
|
35
|
+
import {
|
|
36
|
+
EvidenceStatementSchema,
|
|
37
|
+
EvidenceBundlePayloadSchema,
|
|
38
|
+
IN_TOTO_STATEMENT_V1_TYPE
|
|
39
|
+
} from "@intentsolutions/core/validators/v1/evidence-statement";
|
|
40
|
+
import {
|
|
41
|
+
GateResultV1Schema,
|
|
42
|
+
GateDecisionSchema,
|
|
43
|
+
AdvisorySeveritySchema,
|
|
44
|
+
ReplayFidelityLevelSchema,
|
|
45
|
+
SubjectSideSchema,
|
|
46
|
+
GATE_RESULT_V1_URI
|
|
47
|
+
} from "@intentsolutions/core/validators/v1/gate-result-v1";
|
|
48
|
+
import { InTotoSubjectSchema } from "@intentsolutions/core/validators/v1/evidence-bundle";
|
|
49
|
+
import {
|
|
50
|
+
EvidenceStatementSchema as KernelEvidenceStatementSchemaInternal,
|
|
51
|
+
EvidenceBundlePayloadSchema as EvidenceBundlePayloadSchema2,
|
|
52
|
+
IN_TOTO_STATEMENT_V1_TYPE as IN_TOTO_STATEMENT_V1_TYPE2
|
|
53
|
+
} from "@intentsolutions/core/validators/v1/evidence-statement";
|
|
54
|
+
import {
|
|
55
|
+
GateResultV1Schema as GateResultV1Schema2,
|
|
56
|
+
GateDecisionSchema as GateDecisionSchema2,
|
|
57
|
+
AdvisorySeveritySchema as AdvisorySeveritySchema2,
|
|
58
|
+
GATE_RESULT_V1_URI as GATE_RESULT_V1_URI2
|
|
59
|
+
} from "@intentsolutions/core/validators/v1/gate-result-v1";
|
|
60
|
+
import { InTotoSubjectSchema as InTotoSubjectSchema2 } from "@intentsolutions/core/validators/v1/evidence-bundle";
|
|
61
|
+
import { trace } from "@opentelemetry/api";
|
|
62
|
+
import {
|
|
63
|
+
BasicTracerProvider,
|
|
64
|
+
InMemorySpanExporter,
|
|
65
|
+
SimpleSpanProcessor
|
|
66
|
+
} from "@opentelemetry/sdk-trace-base";
|
|
67
|
+
import { z as z7 } from "zod";
|
|
68
|
+
import { z as z8 } from "zod";
|
|
69
|
+
import { trace as trace2, SpanStatusCode } from "@opentelemetry/api";
|
|
70
|
+
import { randomBytes } from "crypto";
|
|
71
|
+
/** The two evaluation methods a criterion may declare. */
var CriterionMethod = z.enum(["deterministic", "judge"]);

/**
 * Zod schema for a single criterion of an eval spec.
 *
 * `id`, `description` and `method` are required; the four boolean flags
 * default to `false`; the judge/deterministic fields are optional.
 */
var CriterionSchema = (() => {
  // Small builders for the field shapes that repeat below.
  const requiredText = (about) => z.string().min(1).describe(about);
  const optionalText = (about) => z.string().optional().describe(about);
  const offByDefault = (about) => z.boolean().default(false).describe(about);

  return z.object({
    id: requiredText("Unique identifier within the spec"),
    description: requiredText("Human-readable description of what is being checked"),
    method: CriterionMethod,
    blocker: offByDefault("If true, failure blocks release"),
    regression_critical: offByDefault("If true, regression on this criterion blocks release"),
    baseline_sensitive: offByDefault("If true, compare against naked model performance"),
    pack_sensitive: offByDefault("If true, evaluate in context of sibling skills"),
    judge_prompt: optionalText("Prompt template for judge-based criteria"),
    deterministic_check: optionalText("Check identifier for deterministic criteria (e.g. 'file_exists', 'regex_match')"),
    deterministic_check_params: z.record(z.string(), z.unknown()).optional().describe(
      "Parameters forwarded to the deterministic check (e.g. { value: 'needle' } for 'contains', { pattern: '\\\\d+' } for 'regex_match')"
    )
  });
})();
|
|
86
|
+
/** Tier labels a test case can carry. */
var TestCaseTier = z2.enum(["core", "edge", "regression", "adversarial"]);

/** Whether a prompt is expected to activate the skill. */
var TriggerExpectation = z2.enum(["should_trigger", "should_not_trigger"]);

/**
 * Zod schema for one test case of an eval spec.
 *
 * `id`, `description`, `tier` and `prompt` are required; every other field
 * is optional.
 */
var TestCaseSchema = (() => {
  // Builders for the two field shapes used more than once.
  const requiredText = (about) => z2.string().min(1).describe(about);
  const optionalStrings = (about) => z2.array(z2.string()).optional().describe(about);

  return z2.object({
    id: requiredText("Unique identifier within the spec"),
    description: requiredText("What this test case checks"),
    tier: TestCaseTier,
    prompt: requiredText("The user prompt to send"),
    trigger_expectation: TriggerExpectation.optional().describe(
      "Whether the skill should or should not trigger"
    ),
    expected_artifacts: optionalStrings("Expected output files or artifacts"),
    expected_output_contains: optionalStrings("Strings that must appear in the output"),
    context_hints: z2.record(z2.string(), z2.unknown()).optional().describe("Additional context for the test runner"),
    criteria_ids: optionalStrings("Which criteria this test case evaluates (defaults to all)")
  });
})();
|
|
101
|
+
/** A non-empty model identifier string. */
var ModelTarget = z3.string().min(1).describe(
  "Model id: a Claude alias (haiku|sonnet|opus) or any concrete provider model id (e.g. deepseek-chat, kimi-k2-0711-preview, deepseek/deepseek-chat, claude-sonnet-4-5)."
);

/** Zod schema describing a sibling skill entry. */
var SiblingSkillSchema = z3.object({
  name: z3.string().min(1).describe("Sibling skill name (kebab-case)"),
  description: z3.string().min(1).describe("What the sibling does"),
  trigger_patterns: z3.array(z3.string()).optional().describe("Prompts that should trigger the sibling instead")
});

/**
 * Zod schema for a complete eval spec document (spec_version "1.0").
 *
 * Requires at least one criterion and one test case; `models` defaults to
 * `["sonnet"]`; `siblings` and `tags` are optional.
 */
var EvalSpecSchema = (() => {
  const requiredText = (about) => z3.string().min(1).describe(about);
  // At least one element is required for both list fields built with this.
  const nonEmptyListOf = (item, about) => z3.array(item).min(1).describe(about);

  return z3.object({
    spec_version: z3.literal("1.0").describe("Schema version for forward compatibility"),
    skill_name: z3.string().min(1).regex(/^[a-z][a-z0-9-]*[a-z0-9]$/, "Must be kebab-case").describe("Name of the skill being evaluated"),
    description: requiredText("What this eval spec covers"),
    criteria: nonEmptyListOf(CriterionSchema, "Binary criteria to evaluate"),
    test_cases: nonEmptyListOf(TestCaseSchema, "Test cases to run"),
    models: z3.array(ModelTarget).default(["sonnet"]).describe("Models to test independently"),
    siblings: z3.array(SiblingSkillSchema).optional().describe("Sibling skills for pack-sensitive evaluation"),
    tags: z3.array(z3.string()).optional().describe("Categorization tags")
  });
})();
|
|
119
|
+
/**
 * Zod schema for an eval contract document (contract_version "1.0").
 *
 * The trigger boundary lists, `success_criteria` and `blockers` must each
 * contain at least one string; `safety_boundaries`, `baseline_expectation`
 * and `evidence_rules` are optional.
 */
var EvalContractSchema = (() => {
  // A list of strings with at least one entry.
  const nonEmptyStrings = (about) => z4.array(z4.string()).min(1).describe(about);

  const triggerBoundary = z4.object({
    should_trigger: nonEmptyStrings("Prompt patterns that should activate the skill"),
    should_not_trigger: nonEmptyStrings("Prompt patterns that should NOT activate the skill")
  });

  const evidenceRules = z4.object({
    require_artifacts: z4.boolean().default(false).describe("Whether the skill must produce artifacts"),
    require_output_validation: z4.boolean().default(true).describe("Whether output must be validated against expectations")
  });

  return z4.object({
    contract_version: z4.literal("1.0").describe("Schema version for forward compatibility"),
    skill_name: z4.string().min(1).regex(/^[a-z][a-z0-9-]*[a-z0-9]$/, "Must be kebab-case").describe("Name of the skill this contract governs"),
    purpose: z4.string().min(1).describe("What the skill is for \u2014 one clear sentence"),
    trigger_boundary: triggerBoundary,
    success_criteria: nonEmptyStrings("What counts as successful execution \u2014 observable outcomes"),
    blockers: nonEmptyStrings("Sacred failures that block release regardless of average score"),
    safety_boundaries: z4.array(z4.string()).optional().describe("What the skill must never do (prompt leakage, overreach, etc.)"),
    baseline_expectation: z4.string().optional().describe("What the naked model does without this skill \u2014 for obsolete review"),
    evidence_rules: evidenceRules.optional()
  });
})();
|
|
136
|
+
/** Enumeration of accepted skill model values. */
var SkillModel = z5.enum([
  "inherit",
  "sonnet",
  "haiku",
  "opus"
]);

/** Enumeration of accepted skill effort values. */
var SkillEffort = z5.enum([
  "low",
  "medium",
  "high",
  "max"
]);
|
|
138
|
+
/** First- and second-person phrasings that a description must not contain. */
var DESCRIPTION_ANTI_PATTERNS = [
  /\b(I can|I will|I'm|I help)\b/i,
  /\b(You can|You should|You will)\b/i
];

/**
 * Reports a single issue when `artifact.description` is a string that
 * matches one of DESCRIPTION_ANTI_PATTERNS.
 *
 * @param {object} artifact - Object whose `description` property is inspected.
 * @returns {Array<{message: string, path: string[]}>} One issue, or an empty
 *   array when the description is not a string or matches no pattern.
 */
function thirdPersonDescriptionIssues(artifact) {
  const { description } = artifact;
  const usesFirstOrSecondPerson =
    typeof description === "string" &&
    DESCRIPTION_ANTI_PATTERNS.some((pattern) => pattern.test(description));
  if (!usesFirstOrSecondPerson) {
    return [];
  }
  return [
    {
      message: "Description must use third person \u2014 avoid 'I can', 'You should', etc.",
      path: ["description"]
    }
  ];
}
|
|
157
|
+
/**
 * Wraps `schema` with a `superRefine` step that converts every issue from
 * thirdPersonDescriptionIssues into a "custom" issue on the refinement
 * context.
 *
 * @param {object} schema - Schema exposing `superRefine`.
 * @returns {object} Whatever `schema.superRefine` returns.
 */
function withEvalDomainChecks(schema) {
  const reportDomainIssues = (artifact, ctx) => {
    thirdPersonDescriptionIssues(artifact).forEach(({ message, path }) => {
      // Copy the path so the issue does not share the helper's array.
      ctx.addIssue({ code: "custom", message, path: [...path] });
    });
  };
  return schema.superRefine(reportDomainIssues);
}
|
|
164
|
+
/**
 * Concatenates the issues reported by `upstreamBaseIssues` and
 * `universalFoldsIssues` for the same artifact, in that order.
 *
 * @param {object} artifact - Value handed unchanged to both issue producers.
 * @returns {Array} Combined issue list.
 */
function kernelBaseCompositionIssues(artifact) {
  const combined = [];
  combined.push(...upstreamBaseIssues(artifact));
  combined.push(...universalFoldsIssues(artifact));
  return combined;
}
|
|
167
|
+
/**
 * Frontmatter schema built from `attach(kernelBaseCompositionIssues)` plus
 * the eval-domain checks, piped into `z5.custom()`.
 * NOTE(review): `attach` is imported from the core authoring validators; its
 * exact semantics are not visible in this file.
 */
var SkillFrontmatterSchema2 = withEvalDomainChecks(attach(kernelBaseCompositionIssues))
  .pipe(z5.custom());

/**
 * Frontmatter schema built from the core package's SkillFrontmatterSchema
 * plus the eval-domain checks, piped into `z5.custom()`.
 */
var SkillFrontmatterEnterpriseSchema = withEvalDomainChecks(KernelSkillFrontmatterSchemaInternal)
  .pipe(z5.custom());
|
|
173
|
+
/**
 * Parses a YAML string and validates the resulting value against `schema`.
 *
 * @param {string} yamlString - Raw YAML text.
 * @param {object} schema - Schema exposing `safeParse`.
 * @returns {{success: true, data: *} | {success: false, errors: Array<{path: string, message: string}>}}
 *   The validated data, or a list of errors for invalid YAML, an empty
 *   document, or schema violations (issue paths joined with ".").
 */
function parseAndValidateYaml(yamlString, schema) {
  const failure = (errors) => ({ success: false, errors });

  let document;
  try {
    document = parseYaml(yamlString);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return failure([{ path: "", message: `Invalid YAML: ${reason}` }]);
  }

  // Matches both null and undefined.
  if (document == null) {
    return failure([{ path: "", message: "YAML document is empty" }]);
  }

  const outcome = schema.safeParse(document);
  if (!outcome.success) {
    return failure(
      outcome.error.issues.map(({ path, message }) => ({ path: path.join("."), message }))
    );
  }
  return { success: true, data: outcome.data };
}
|
|
204
|
+
/**
 * Shared implementation behind parseSkillMd and parseSkillMdEnterprise:
 * splits a SKILL.md document into frontmatter and body with gray-matter and
 * validates the frontmatter against `schema`.
 *
 * @param {string} content - Full text of a SKILL.md file.
 * @param {object} schema - Schema exposing `safeParse`.
 * @returns {{success: true, data: {frontmatter: *, body: string}} | {success: false, errors: Array<{path: string, message: string}>}}
 */
function parseSkillMdWithSchema(content, schema) {
  // gray-matter caches results keyed by the input string when called without
  // options. parseAgentsMd clears that cache before parsing; do the same here
  // so every call re-parses the document instead of reusing a cached object.
  matter.clearCache?.();

  let parsed;
  try {
    parsed = matter(content);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return {
      success: false,
      errors: [{ path: "", message: `Failed to parse SKILL.md frontmatter: ${reason}` }]
    };
  }

  if (!parsed.data || Object.keys(parsed.data).length === 0) {
    return {
      success: false,
      errors: [
        {
          path: "",
          message: "SKILL.md has no frontmatter. Expected YAML frontmatter between --- delimiters."
        }
      ]
    };
  }

  const result = schema.safeParse(parsed.data);
  if (!result.success) {
    const errors = result.error.issues.map((issue) => ({
      // Joining with the prefix avoids a dangling "frontmatter." when the
      // issue is attached to the frontmatter object itself (empty path).
      path: ["frontmatter", ...issue.path].join("."),
      message: issue.message
    }));
    return { success: false, errors };
  }

  return {
    success: true,
    data: {
      frontmatter: result.data,
      body: parsed.content.trim()
    }
  };
}

/**
 * Parses a SKILL.md document and validates its frontmatter against
 * SkillFrontmatterSchema2.
 *
 * @param {string} content - Full text of a SKILL.md file.
 * @returns {object} `{ success: true, data: { frontmatter, body } }` or
 *   `{ success: false, errors }`.
 */
function parseSkillMd(content) {
  return parseSkillMdWithSchema(content, SkillFrontmatterSchema2);
}

/**
 * Parses a SKILL.md document and validates its frontmatter against
 * SkillFrontmatterEnterpriseSchema.
 *
 * @param {string} content - Full text of a SKILL.md file.
 * @returns {object} `{ success: true, data: { frontmatter, body } }` or
 *   `{ success: false, errors }`.
 */
function parseSkillMdEnterprise(content) {
  return parseSkillMdWithSchema(content, SkillFrontmatterEnterpriseSchema);
}
|
|
288
|
+
/** Heading patterns, keyed by command kind, used to locate command sections. */
var COMMAND_HEADING_REGEXES = {
  build: /\bbuild\b/i,
  test: /\btest(s|ing)?\b/i,
  lint: /\blint(ing|er)?\b/i,
  setup: /\b(setup|install|bootstrap|getting started)\b/i,
  style: /\b(code\s*style|conventions?)\b/i,
  format: /\b(format|formatter|formatting|prettier)\b/i
};

/** Heading pattern for sections whose bullets are collected as tools. */
var TOOL_HEADING_RE = /\btools?\b|\bcapabilit(y|ies)\b/i;

/** Heading pattern for sections whose bullets are collected as capabilities. */
var CAPABILITY_HEADING_RE = /\bcapabilit(y|ies)\b|\bbehaviors?\b|\babilit(y|ies)\b/i;

/** Heading pattern for sections whose bullets are collected as constraints. */
var CONSTRAINT_HEADING_RE = /\bconstraints?\b|\bdon'?ts?\b|\bmust\s*not\b|\bguardrails?\b/i;

/**
 * Parses an AGENTS.md document into frontmatter, title, sections, shell
 * commands per kind, and tool/capability/constraint bullet lists.
 *
 * @param {string} content - Full text of an AGENTS.md file.
 * @returns {{success: true, data: object} | {success: false, errors: Array<{path: string, message: string}>}}
 *   Parsed data, or a single error when gray-matter throws.
 */
function parseAgentsMd(content) {
  matter2.clearCache?.();

  let parsed;
  try {
    parsed = matter2(content);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return {
      success: false,
      errors: [{ path: "", message: `Failed to parse AGENTS.md frontmatter: ${reason}` }]
    };
  }

  const frontmatter = parsed.data ?? {};
  const body = parsed.content;
  const title = extractTitle(body);
  const sections = extractSections(body);

  // Only kinds that yielded at least one command get a key.
  const commands = Object.fromEntries(
    Object.keys(COMMAND_HEADING_REGEXES)
      .map((kind) => [kind, extractCommandsForKind(sections, kind)])
      .filter(([, found]) => found.length > 0)
  );

  const [tools, capabilities, constraints] = [
    TOOL_HEADING_RE,
    CAPABILITY_HEADING_RE,
    CONSTRAINT_HEADING_RE
  ].map((headingRe) => extractBulletsForHeading(sections, headingRe));

  return {
    success: true,
    data: { frontmatter, title, sections, commands, tools, capabilities, constraints, body }
  };
}
|
|
330
|
+
/**
 * Returns the text of the first level-1 heading (`# Title`) in a markdown
 * body, or "" when there is none.
 *
 * Lines inside backtick-fenced code blocks are skipped so that a shell
 * comment such as `# install deps` is not mistaken for the document title,
 * and headings are matched one line at a time so a bare `#` line cannot
 * borrow its text from a following line.
 *
 * @param {string} body - Markdown text (frontmatter already removed).
 * @returns {string} Trimmed heading text, or "".
 */
function extractTitle(body) {
  let fenceMarker = ""; // backtick run that opened the current fence; "" outside
  for (const line of body.split(/\r?\n/)) {
    const marker = /^`{3,}/.exec(line)?.[0];
    if (marker !== undefined) {
      const rest = line.slice(marker.length);
      if (fenceMarker === "") {
        // An opening fence may carry an info string, but no further backticks.
        if (!rest.includes("`")) {
          fenceMarker = marker;
          continue;
        }
      } else if (rest.trim() === "" && marker.length >= fenceMarker.length) {
        // A closing fence is backticks only and at least as long as the opener.
        fenceMarker = "";
        continue;
      }
    }
    if (fenceMarker !== "") continue;
    const heading = /^#\s+(.+?)\s*$/.exec(line);
    if (heading) return heading[1].trim();
  }
  return "";
}
|
|
334
|
+
/**
 * Splits a markdown body into its level-2 and level-3 sections.
 *
 * Each `##` / `###` heading outside a backtick-fenced code block starts a new
 * section whose content runs up to the next such heading (or the end of the
 * body). Text before the first heading is not part of any section.
 *
 * Fence handling: an opening fence may carry any info string, including one
 * with spaces (e.g. "```bash title=x"); only a line made of backticks alone,
 * at least as long as the opener, closes it. This keeps headings inside code
 * blocks from being read as section boundaries.
 *
 * @param {string} body - Markdown text.
 * @returns {Array<{heading: string, level: number, content: string}>}
 *   Sections in document order; `content` is trimmed.
 */
function extractSections(body) {
  const lines = body.split(/\r?\n/);
  const sections = [];
  let current = null;
  let fenceMarker = ""; // backtick run that opened the current fence; "" outside

  const flush = (endLineExclusive) => {
    if (current === null) return;
    const content = lines.slice(current.startLine + 1, endLineExclusive).join("\n").trim();
    sections.push({ heading: current.heading, level: current.level, content });
  };

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    const marker = /^`{3,}/.exec(line)?.[0];
    if (marker !== undefined) {
      const rest = line.slice(marker.length);
      if (fenceMarker === "") {
        // Backticks after the marker mean inline code, not a fence opener.
        if (!rest.includes("`")) {
          fenceMarker = marker;
          continue;
        }
      } else if (rest.trim() === "" && marker.length >= fenceMarker.length) {
        fenceMarker = "";
        continue;
      }
    }
    if (fenceMarker !== "") continue;

    // Exactly two or three hashes followed by whitespace; "####" does not match.
    const heading = /^(#{2,3})\s+(.+?)\s*$/.exec(line);
    if (heading) {
      flush(i);
      current = { heading: heading[2].trim(), level: heading[1].length, startLine: i };
    }
  }
  flush(lines.length);
  return sections;
}
|
|
373
|
+
/**
 * Collects the shell commands found in every section whose heading matches
 * the pattern registered for `kind` in COMMAND_HEADING_REGEXES.
 *
 * @param {Array<{heading: string, content: string}>} sections - Parsed sections.
 * @param {string} kind - Key of COMMAND_HEADING_REGEXES.
 * @returns {string[]} Commands in section order.
 */
function extractCommandsForKind(sections, kind) {
  const headingRe = COMMAND_HEADING_REGEXES[kind];
  return sections
    .filter(({ heading }) => headingRe.test(heading))
    .flatMap(({ content }) => extractShellCommandsFromBlock(content));
}
|
|
382
|
+
/** Fence info strings (lower-cased) treated as shell; "" covers unlabeled fences. */
var SHELL_INFO_STRINGS = /* @__PURE__ */ new Set(["", "bash", "sh", "shell", "zsh", "console"]);

/**
 * Extracts command lines from the shell code fences of a markdown fragment.
 *
 * Every backtick fence is tracked, not only shell ones, so the closing fence
 * of a non-shell block (e.g. ```json) is never mistaken for the opening of an
 * unlabeled shell block. The first word of the info string decides whether a
 * fence is shell, so "```bash title=x" counts as shell.
 *
 * Inside a shell fence, a leading "$ " or "> " prompt is stripped, and blank
 * lines and lines starting with "#" are dropped.
 *
 * @param {string} md - Markdown text of one section.
 * @returns {string[]} Command lines in document order.
 */
function extractShellCommandsFromBlock(md) {
  const cmds = [];
  let fenceMarker = ""; // backtick run that opened the current fence; "" outside
  let inShellBlock = false;

  for (const line of md.split(/\r?\n/)) {
    const marker = /^`{3,}/.exec(line)?.[0];
    if (marker !== undefined) {
      const rest = line.slice(marker.length);
      if (fenceMarker === "") {
        // Backticks after the marker mean inline code, not a fence opener.
        if (!rest.includes("`")) {
          const info = rest.trim().split(/\s+/)[0].toLowerCase();
          fenceMarker = marker;
          inShellBlock = SHELL_INFO_STRINGS.has(info);
          continue;
        }
      } else if (rest.trim() === "" && marker.length >= fenceMarker.length) {
        // Closing fence: backticks only, at least as long as the opener.
        fenceMarker = "";
        inShellBlock = false;
        continue;
      }
    }
    if (!inShellBlock) continue;

    const stripped = line.replace(/^[\s]*[$>]\s+/, "").trim();
    if (!stripped || stripped.startsWith("#")) continue;
    cmds.push(stripped);
  }
  return cmds;
}
|
|
412
|
+
/**
 * Collects the text of unindented bullet lines (`-`, `*` or `+` followed by
 * whitespace) from every section whose heading matches `headingRe`.
 *
 * @param {Array<{heading: string, content: string}>} sections - Parsed sections.
 * @param {RegExp} headingRe - Pattern tested against each section heading.
 * @returns {string[]} Trimmed bullet texts in document order.
 */
function extractBulletsForHeading(sections, headingRe) {
  const bulletRe = /^[-*+]\s+(.+?)\s*$/;
  return sections
    .filter(({ heading }) => headingRe.test(heading))
    .flatMap(({ content }) => content.split(/\r?\n/))
    .map((line) => bulletRe.exec(line))
    .filter((hit) => hit !== null)
    .map((hit) => hit[1].trim());
}
|
|
423
|
+
/**
 * Counts check results by severity.
 *
 * @param {Array<{severity: string}>} results - Check results.
 * @returns {{total: number, passed: number, warnings: number, errors: number}}
 *   Totals; results with any other severity only count toward `total`.
 */
function summarize(results) {
  let passed = 0;
  let warnings = 0;
  let errors = 0;
  for (const { severity } of results) {
    switch (severity) {
      case "pass":
        passed += 1;
        break;
      case "warning":
        warnings += 1;
        break;
      case "error":
        errors += 1;
        break;
      default:
        break;
    }
  }
  return { total: results.length, passed, warnings, errors };
}
|
|
431
|
+
/** Matches `${CLAUDE_SKILL_DIR}/<path>` and captures `<path>`. */
var FILE_REF_PATTERN = /\$\{CLAUDE_SKILL_DIR\}\/([\w./-]+)/g;
/** Matches a `./relative/path` at line start or after whitespace. */
var PATH_LIKE_PATTERN = /(?:^|\s)(\.\/[\w./-]+)/gm;
/** Thresholds used by the description and body heuristics. */
var MIN_DESCRIPTION_LENGTH = 20;
var MIN_DESCRIPTION_WORDS = 4;
var MAX_BODY_LINES = 500;
var MIN_BODY_LINES = 3;
/** Any angle bracket counts as an XML tag character. */
var XML_TAG_PATTERN = /[<>]/;
var TIME_SENSITIVE_PATTERNS = [
  /\b(20\d{2}[-/]\d{2}[-/]\d{2})\b/,
  // dates like 2025-01-01
  /\b(v\d+\.\d+\.\d+)\b/i,
  // version numbers like v1.2.3 (outside code blocks)
  /\b(as of|since|after|before) (January|February|March|April|May|June|July|August|September|October|November|December)\b/i
];

/**
 * Runs every package-level check against the skill package in `packageDir`
 * and returns the report built by `buildReport`.
 *
 * The run stops early, with `skill_name` null, when SKILL.md is missing,
 * cannot be read, or fails to parse. A read failure (for example SKILL.md
 * being a directory, or a permission error) is reported as an error result
 * instead of escaping as an exception.
 *
 * @param {string} packageDir - Directory expected to contain SKILL.md.
 * @returns {object} Report produced by `buildReport(skillName, results)`.
 */
function checkPackage(packageDir) {
  const results = [];
  const absDir = resolve(packageDir);
  const skillPath = join(absDir, "SKILL.md");

  if (!existsSync(skillPath)) {
    results.push({
      id: "pkg:skill-md-exists",
      description: "SKILL.md file exists in package",
      severity: "error",
      message: "SKILL.md not found",
      details: `Expected at: ${skillPath}`
    });
    return buildReport(null, results);
  }
  results.push({
    id: "pkg:skill-md-exists",
    description: "SKILL.md file exists in package",
    severity: "pass",
    message: "SKILL.md found"
  });

  let content;
  try {
    content = readFileSync(skillPath, "utf-8");
  } catch (err) {
    // existsSync is also true for directories and unreadable files.
    results.push({
      id: "pkg:skill-md-readable",
      description: "SKILL.md file can be read",
      severity: "error",
      message: "SKILL.md could not be read",
      details: err instanceof Error ? err.message : String(err)
    });
    return buildReport(null, results);
  }

  const parseResult = parseSkillMd(content);
  if (!parseResult.success) {
    results.push({
      id: "pkg:skill-md-parses",
      description: "SKILL.md frontmatter parses successfully",
      severity: "error",
      message: "SKILL.md failed to parse",
      details: parseResult.errors.map((e) => `${e.path}: ${e.message}`).join("; ")
    });
    return buildReport(null, results);
  }

  const { frontmatter, body } = parseResult.data;
  results.push({
    id: "pkg:skill-md-parses",
    description: "SKILL.md frontmatter parses successfully",
    severity: "pass",
    message: "SKILL.md parsed successfully"
  });

  // Order matters: results are reported in the order the checks run.
  results.push(...checkRequiredFields(frontmatter));
  results.push(...checkDescriptionQuality(frontmatter.description));
  results.push(...checkBodySize(body));
  results.push(...checkReferencedFiles(body, absDir));
  results.push(...checkXmlTags(frontmatter));
  results.push(...checkTimeSensitiveInfo(body));
  return buildReport(frontmatter.name, results);
}
|
|
492
|
+
/**
 * Reports whether the frontmatter carries a non-blank `name` and
 * `description`.
 *
 * @param {{name?: string, description?: string}} fm - Parsed frontmatter.
 * @returns {Array<object>} Two results: name presence, then description
 *   presence, each with severity "pass" or "error".
 */
function checkRequiredFields(fm) {
  const isBlank = (value) => !value || value.trim().length === 0;

  const nameResult = isBlank(fm.name)
    ? {
        id: "pkg:name-present",
        description: "Skill name is present",
        severity: "error",
        message: "Skill name is missing or empty"
      }
    : {
        id: "pkg:name-present",
        description: "Skill name is present",
        severity: "pass",
        message: `Skill name: ${fm.name}`
      };

  const descriptionResult = isBlank(fm.description)
    ? {
        id: "pkg:description-present",
        description: "Skill description is present",
        severity: "error",
        message: "Skill description is missing or empty"
      }
    : {
        id: "pkg:description-present",
        description: "Skill description is present",
        severity: "pass",
        message: "Skill description present"
      };

  return [nameResult, descriptionResult];
}
|
|
526
|
+
/**
 * Applies three heuristics to a skill description: minimum character
 * length, minimum word count, and a vagueness check.
 *
 * A missing or non-string description is treated as an empty string, so the
 * heuristics report warnings instead of throwing; the missing field itself
 * is reported separately by checkRequiredFields.
 *
 * @param {string} description - Description text from the frontmatter.
 * @returns {Array<object>} Three results (length, words, specificity), each
 *   with severity "pass" or "warning".
 */
function checkDescriptionQuality(description) {
  const text = typeof description === "string" ? description : "";
  const results = [];

  if (text.length < MIN_DESCRIPTION_LENGTH) {
    results.push({
      id: "heuristic:description-length",
      description: "Description meets minimum length",
      severity: "warning",
      message: `Description is very short (${text.length} chars, min recommended: ${MIN_DESCRIPTION_LENGTH})`
    });
  } else {
    results.push({
      id: "heuristic:description-length",
      description: "Description meets minimum length",
      severity: "pass",
      message: "Description length is adequate"
    });
  }

  // filter(Boolean) drops the empty strings produced by leading/trailing whitespace.
  const wordCount = text.split(/\s+/).filter(Boolean).length;
  if (wordCount < MIN_DESCRIPTION_WORDS) {
    results.push({
      id: "heuristic:description-words",
      description: "Description has enough words",
      severity: "warning",
      message: `Description has only ${wordCount} words (min recommended: ${MIN_DESCRIPTION_WORDS})`
    });
  } else {
    results.push({
      id: "heuristic:description-words",
      description: "Description has enough words",
      severity: "pass",
      message: "Description word count is adequate"
    });
  }

  const vaguePatterns = [
    /^(does|handles|manages|processes|works with)\b/i,
    /^(a|an|the) (tool|helper|utility)\b/i,
    /\b(stuff|things|etc\.?)\b/i
  ];
  if (vaguePatterns.some((p) => p.test(text))) {
    results.push({
      id: "heuristic:description-specificity",
      description: "Description is specific enough",
      severity: "warning",
      message: "Description appears vague or generic",
      details: "Consider adding what the skill does, when it triggers, and what it produces"
    });
  } else {
    results.push({
      id: "heuristic:description-specificity",
      description: "Description is specific enough",
      severity: "pass",
      message: "Description appears specific"
    });
  }
  return results;
}
|
|
583
|
+
/**
 * Checks the body's line count against MAX_BODY_LINES and MIN_BODY_LINES.
 *
 * @param {string} body - Markdown body of SKILL.md.
 * @returns {Array<object>} Two results: the oversized check, then the
 *   underspecified check, each with severity "pass" or "warning".
 */
function checkBodySize(body) {
  const lineCount = body.split("\n").length;
  const tooLong = lineCount > MAX_BODY_LINES;
  const tooShort = lineCount < MIN_BODY_LINES;

  const oversized = tooLong
    ? {
        id: "heuristic:body-oversized",
        description: "Body is not excessively large",
        severity: "warning",
        message: `Body is ${lineCount} lines (max recommended: ${MAX_BODY_LINES})`,
        details: "Very large skill bodies may indicate scope creep or embedded data that should be externalized"
      }
    : {
        id: "heuristic:body-oversized",
        description: "Body is not excessively large",
        severity: "pass",
        message: `Body size is reasonable (${lineCount} lines)`
      };

  const underspecified = tooShort
    ? {
        id: "heuristic:body-underspecified",
        description: "Body has sufficient content",
        severity: "warning",
        message: `Body is very thin (${lineCount} lines, min recommended: ${MIN_BODY_LINES})`,
        details: "Skills with minimal instructions may not provide enough guidance for the model"
      }
    : {
        id: "heuristic:body-underspecified",
        description: "Body has sufficient content",
        severity: "pass",
        message: "Body has sufficient content"
      };

  return [oversized, underspecified];
}
|
|
620
|
+
/**
 * Verifies that every file the body refers to exists inside the package.
 *
 * References are collected with FILE_REF_PATTERN (`${CLAUDE_SKILL_DIR}/path`)
 * and PATH_LIKE_PATTERN (`./path`), de-duplicated, and resolved against
 * `packageDir`.
 *
 * Both patterns allow "." in a path, so a reference that ends a sentence
 * ("see ./notes.md.") is captured with the trailing dot. Unless the literal
 * path exists, trailing dots are dropped before the lookup; "." and ".."
 * path segments are left alone.
 *
 * @param {string} body - Markdown body of SKILL.md.
 * @param {string} packageDir - Directory references are resolved against.
 * @returns {Array<object>} One result per distinct reference ("pass" when it
 *   exists, "error" otherwise); empty when the body has no references.
 */
function checkReferencedFiles(body, packageDir) {
  const withoutSentenceDots = (ref) => {
    const trimmed = ref.replace(/\.+$/, "");
    // Nothing to strip, or the final segment is "." / "..": keep as written.
    if (trimmed === ref || trimmed === "" || trimmed.endsWith("/")) return ref;
    // A file really named with a trailing dot wins over the heuristic.
    return existsSync(resolve(packageDir, ref)) ? ref : trimmed;
  };

  const refs = /* @__PURE__ */ new Set();
  for (const pattern of [FILE_REF_PATTERN, PATH_LIKE_PATTERN]) {
    for (const match of body.matchAll(pattern)) {
      refs.add(withoutSentenceDots(match[1]));
    }
  }

  const results = [];
  for (const ref of refs) {
    const absPath = resolve(packageDir, ref);
    if (!existsSync(absPath)) {
      results.push({
        id: `ref:${ref}`,
        description: `Referenced file exists: ${ref}`,
        severity: "error",
        message: `Referenced file not found: ${ref}`,
        details: `Expected at: ${absPath}`
      });
      continue;
    }
    let kind = "path";
    try {
      kind = statSync(absPath).isDirectory() ? "directory" : "file";
    } catch {
      // statSync failed after existsSync succeeded; still report existence.
    }
    results.push({
      id: `ref:${ref}`,
      description: `Referenced file exists: ${ref}`,
      severity: "pass",
      message: `Referenced ${kind} exists: ${ref}`
    });
  }
  return results;
}
|
|
663
|
+
/**
 * Checks that the frontmatter `name` and `description` contain no `<` or `>`
 * characters (XML_TAG_PATTERN).
 *
 * @param {{name?: string, description?: string}} fm - Parsed frontmatter.
 * @returns {Array<object>} Two results: name, then description, each with
 *   severity "pass" or "error". A missing field passes.
 */
function checkXmlTags(fm) {
  const inspect = (key, label) => {
    const value = fm[key];
    const id = `anthropic:${key}-no-xml`;
    const description = `${label} does not contain XML tags`;
    const hasTag = Boolean(value) && XML_TAG_PATTERN.test(value);
    return hasTag
      ? {
          id,
          description,
          severity: "error",
          message: `${label} contains XML tags (< or >) \u2014 prohibited by Anthropic spec`
        }
      : {
          id,
          description,
          severity: "pass",
          message: `${label} is free of XML tags`
        };
  };
  return [inspect("name", "Name"), inspect("description", "Description")];
}
|
|
697
|
+
/**
 * Warns when the body, with fenced and inline code removed, matches any of
 * TIME_SENSITIVE_PATTERNS.
 *
 * @param {string} body - Markdown body of SKILL.md.
 * @returns {Array<object>} A single result with severity "warning" or "pass".
 */
function checkTimeSensitiveInfo(body) {
  const fencedCode = /```[\s\S]*?```/g;
  const inlineCode = /`[^`]+`/g;
  // Fenced blocks go first so their backticks are not consumed as inline code.
  const prose = body.replace(fencedCode, "").replace(inlineCode, "");

  const id = "anthropic:no-time-sensitive";
  const description = "No time-sensitive information in body";

  for (const pattern of TIME_SENSITIVE_PATTERNS) {
    if (pattern.test(prose)) {
      return [
        {
          id,
          description,
          severity: "warning",
          message: "Body may contain time-sensitive information (dates, versions) that could go stale",
          details: 'Consider using an "old patterns" section or the compatibility field instead'
        }
      ];
    }
  }
  return [
    {
      id,
      description,
      severity: "pass",
      message: "No time-sensitive information detected"
    }
  ];
}
|
|
720
|
+
/**
 * Assembles a check report for one skill.
 *
 * @param {string} skillName - Name recorded as `skill_name`.
 * @param {Array<object>} results - Individual check results.
 * @returns {object} Report with the skill name, an ISO-8601 timestamp taken at
 *   call time, the results as given, and `summarize(results)` as the summary.
 */
function buildReport(skillName, results) {
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
  const summary = summarize(results);
  return { skill_name: skillName, timestamp, results, summary };
}
|
|
728
|
+
// Lookup table: check name -> predicate `(input, params) => truthy | falsy`.
var registry = /* @__PURE__ */ new Map();

/**
 * Stores `fn` under `name`, replacing any predicate previously registered
 * with the same name.
 */
function registerCheck(name, fn) {
  registry.set(name, fn);
}

/**
 * Runs a registered deterministic check and wraps the outcome in a result
 * record.
 *
 * @param {string} checkName - Name the check was registered under.
 * @param {*} input - Value handed to the predicate.
 * @param {object} [params] - Optional parameters handed to the predicate.
 * @returns {object} Result with `id`, `description`, `severity` and `message`.
 *   Severity is "pass" when the predicate returns a truthy value and "error"
 *   when it returns a falsy value, throws, or is not registered (the latter
 *   also lists the available check names in `details`).
 */
function runCheck(checkName, input, params) {
  const base = {
    id: `deterministic:${checkName}`,
    description: `Deterministic check: ${checkName}`
  };
  const predicate = registry.get(checkName);
  if (!predicate) {
    return {
      ...base,
      severity: "error",
      message: `Unknown deterministic check: "${checkName}"`,
      details: `Available checks: ${[...registry.keys()].join(", ")}`
    };
  }
  let ok;
  try {
    ok = predicate(input, params);
  } catch (err) {
    // A throwing predicate is reported as an error result, never re-thrown.
    const reason = err instanceof Error ? err.message : String(err);
    return {
      ...base,
      severity: "error",
      message: `Check "${checkName}" errored: ${reason}`
    };
  }
  if (ok) {
    return { ...base, severity: "pass", message: `Check "${checkName}" passed` };
  }
  return { ...base, severity: "error", message: `Check "${checkName}" failed` };
}
|
|
761
|
+
/**
 * Reads a mandatory parameter for a deterministic check.
 *
 * @param {object|null|undefined} params - Parameter bag (may be absent).
 * @param {string} key - Name of the required parameter.
 * @param {string} checkName - Check name, used only in the error message.
 * @returns {*} The parameter value; falsy values such as 0 or "" are accepted.
 * @throws {Error} When the value is `undefined` or `null`.
 */
function requireParam(params, key, checkName) {
  const candidate = params?.[key];
  const isMissing = candidate === null || candidate === void 0;
  if (!isMissing) {
    return candidate;
  }
  throw new Error(`check "${checkName}" requires params.${key}; refusing to evaluate without it`);
}
|
|
768
|
+
// Built-in deterministic checks. Each predicate receives the text under test
// plus the criterion's params and returns a boolean; required params are read
// through requireParam, which throws when one is missing.

// Passes when the input contains params.value (coerced to a string).
registerCheck("contains", (input, params) => {
  const needle = String(requireParam(params, "value", "contains"));
  return input.includes(needle);
});

// Passes when the input does NOT contain params.value (coerced to a string).
registerCheck("not_contains", (input, params) => {
  const needle = String(requireParam(params, "value", "not_contains"));
  return !input.includes(needle);
});

// Passes when new RegExp(params.pattern, params.flags) matches the input.
// An invalid pattern or flag string is a configuration error, so it is thrown
// with context instead of being reported as an ordinary non-match.
registerCheck("regex_match", (input, params) => {
  const pattern = String(requireParam(params, "pattern", "regex_match"));
  const flags = String(params?.["flags"] ?? "");
  let matcher;
  try {
    matcher = new RegExp(pattern, flags);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`check "regex_match" received an invalid pattern or flags: ${reason}`);
  }
  return matcher.test(input);
});

// Passes when input.length is at least Number(params.min).
registerCheck("min_length", (input, params) => {
  const min = Number(requireParam(params, "min", "min_length"));
  return input.length >= min;
});

// Passes when input.length is at most Number(params.max).
registerCheck("max_length", (input, params) => {
  const max = Number(requireParam(params, "max", "max_length"));
  return input.length <= max;
});

// Passes when the input has at least one non-whitespace character.
registerCheck("not_empty", (input) => {
  return input.trim().length > 0;
});
|
|
796
|
+
/**
 * Builds the skill roster used for trigger testing.
 *
 * @param {{name: *, description: *}} targetFrontmatter - Frontmatter of the
 *   skill under test.
 * @param {Array<{name: *, description: *}>} [siblings] - Other skills; a
 *   nullish value is treated as an empty list.
 * @returns {{target: object, siblings: object[], all: object[]}} `all` lists
 *   the target first, followed by the siblings in their given order.
 */
function buildRoster(targetFrontmatter, siblings) {
  const toSiblingEntry = (s) => ({
    name: s.name,
    description: s.description,
    isTarget: false
  });
  const target = {
    name: targetFrontmatter.name,
    description: targetFrontmatter.description,
    isTarget: true
  };
  const siblingEntries = (siblings ?? []).map(toSiblingEntry);
  const all = [target, ...siblingEntries];
  return { target, siblings: siblingEntries, all };
}
|
|
813
|
+
/**
 * Runs every test case that declares a `trigger_expectation` through the
 * provider's skill selector, one case at a time.
 *
 * @param {Array<object>} testCases - Cases; those without a truthy
 *   `trigger_expectation` are ignored.
 * @param {{all: object[], target: {name: string}}} roster - Roster whose
 *   entries are offered to the provider as `{ name, description }` pairs.
 * @param {{selectSkill: Function}} provider - Must expose an async
 *   `selectSkill(prompt, candidates)` resolving to `{ selected, reasoning }`.
 * @returns {Promise<Array<object>>} One record per trigger case. A provider
 *   failure yields outcome "error" with the error text as `reasoning`.
 */
async function runTriggerTests(testCases, roster, provider) {
  const collected = [];
  const withExpectation = testCases.filter((tc) => tc.trigger_expectation);
  for (const tc of withExpectation) {
    let record;
    try {
      const candidates = roster.all.map((entry) => ({
        name: entry.name,
        description: entry.description
      }));
      const { selected, reasoning } = await provider.selectSkill(tc.prompt, candidates);
      record = {
        test_case_id: tc.id,
        prompt: tc.prompt,
        expected: tc.trigger_expectation,
        outcome: classifyOutcome(selected, tc.trigger_expectation, roster.target.name),
        selected_skill: selected,
        reasoning
      };
    } catch (err) {
      record = {
        test_case_id: tc.id,
        prompt: tc.prompt,
        expected: tc.trigger_expectation,
        outcome: "error",
        selected_skill: null,
        reasoning: err instanceof Error ? err.message : String(err)
      };
    }
    collected.push(record);
  }
  return collected;
}
|
|
844
|
+
/**
 * Classifies one trigger-test result.
 *
 * @param {string|null|undefined} selectedSkill - Name chosen by the provider;
 *   `null` or `undefined` both mean "no skill selected".
 * @param {string} expected - "should_trigger" for positive cases; any other
 *   value is handled as a case where the target should not trigger.
 * @param {string} targetName - Name of the skill under test.
 * @returns {string} "correct_trigger", "sibling_confusion" or
 *   "false_negative" for positive cases; "false_positive" or
 *   "correct_no_trigger" otherwise.
 */
function classifyOutcome(selectedSkill, expected, targetName) {
  // `== null` deliberately matches both null and undefined so that a provider
  // returning no value is not mistaken for having picked a sibling.
  const noneSelected = selectedSkill == null;
  const targetSelected = !noneSelected && selectedSkill === targetName;
  if (expected === "should_trigger") {
    if (targetSelected) return "correct_trigger";
    return noneSelected ? "false_negative" : "sibling_confusion";
  }
  // When the target should stay silent, picking nothing or a sibling are both fine.
  return targetSelected ? "false_positive" : "correct_no_trigger";
}
|
|
857
|
+
/**
 * Aggregates trigger-test outcomes into counts and rates.
 *
 * Outcomes other than the six known labels are ignored by the counters but
 * still contribute to `total_cases`. Sibling confusions and errors are counted
 * separately and do not enter any of the rates.
 *
 * @param {Array<{outcome: string}>} results - Trigger-test records.
 * @returns {object} Counts plus precision, recall, false-positive rate and
 *   false-negative rate. With a zero denominator, precision and recall are 1
 *   and the two error rates are 0.
 */
function computeMetrics(results) {
  const tally = /* @__PURE__ */ new Map([
    ["correct_trigger", 0],
    ["correct_no_trigger", 0],
    ["false_positive", 0],
    ["false_negative", 0],
    ["sibling_confusion", 0],
    ["error", 0]
  ]);
  for (const record of results) {
    if (tally.has(record.outcome)) {
      tally.set(record.outcome, tally.get(record.outcome) + 1);
    }
  }
  const truePositives = tally.get("correct_trigger");
  const trueNegatives = tally.get("correct_no_trigger");
  const falsePositives = tally.get("false_positive");
  const falseNegatives = tally.get("false_negative");
  const ratio = (numerator, denominator, whenEmpty) => denominator > 0 ? numerator / denominator : whenEmpty;
  return {
    total_cases: results.length,
    true_positives: truePositives,
    true_negatives: trueNegatives,
    false_positives: falsePositives,
    false_negatives: falseNegatives,
    sibling_confusions: tally.get("sibling_confusion"),
    errors: tally.get("error"),
    precision: ratio(truePositives, truePositives + falsePositives, 1),
    recall: ratio(truePositives, truePositives + falseNegatives, 1),
    false_positive_rate: ratio(falsePositives, falsePositives + trueNegatives, 0),
    false_negative_rate: ratio(falseNegatives, falseNegatives + truePositives, 0)
  };
}
|
|
899
|
+
/**
 * Executes functional test cases against a provider, one at a time.
 *
 * A case is run unless its tier is "adversarial" and it declares neither
 * `expected_output_contains` nor `expected_artifacts`.
 *
 * @param {Array<object>} testCases - Candidate test cases.
 * @param {{body: string}} skill - Skill whose body is passed as `skill_body`.
 * @param {{execute: Function}} provider - Must expose an async
 *   `execute(prompt, context, { timeout_ms, model })`.
 * @param {object} [options] - Optional `base_path`, `file_contents`,
 *   `timeout_ms` and `model`.
 * @returns {Promise<Array<object>>} One outcome per executed case with status
 *   "completed", "timed_out" (when `meta.timed_out` is truthy) or "failed"
 *   (when anything throws; the error text lands in `output.error`).
 */
async function runFunctionalTests(testCases, skill, provider, options) {
  const isFunctional = (tc) => tc.tier !== "adversarial" || tc.expected_output_contains || tc.expected_artifacts;
  const outcomes = [];
  for (const tc of testCases.filter(isFunctional)) {
    const context = {
      skill_body: skill.body,
      base_path: options?.base_path,
      file_contents: options?.file_contents,
      context_hints: tc.context_hints
    };
    let entry;
    try {
      const result = await provider.execute(tc.prompt, context, {
        timeout_ms: options?.timeout_ms,
        model: options?.model
      });
      const { text, artifacts, tool_calls, error, meta } = result;
      entry = {
        test_case_id: tc.id,
        prompt: tc.prompt,
        output: { text, artifacts, tool_calls, error },
        meta,
        status: meta.timed_out ? "timed_out" : "completed"
      };
    } catch (err) {
      // No provider timing is available here, so a single timestamp marks both
      // ends and the duration is recorded as zero.
      const stamp = (/* @__PURE__ */ new Date()).toISOString();
      entry = {
        test_case_id: tc.id,
        prompt: tc.prompt,
        output: {
          text: "",
          artifacts: [],
          tool_calls: 0,
          error: err instanceof Error ? err.message : String(err)
        },
        meta: {
          started_at: stamp,
          completed_at: stamp,
          duration_ms: 0,
          timed_out: false
        },
        status: "failed"
      };
    }
    outcomes.push(entry);
  }
  return outcomes;
}
|
|
951
|
+
/**
 * Judges every criterion against one test outcome, in order.
 *
 * Criteria whose `method` is "deterministic" go through judgeDeterministic;
 * all others are sent to the judge provider via judgeWithLLM, awaited one at
 * a time.
 *
 * @param {Array<object>} criteria - Criteria to evaluate.
 * @param {object} outcome - Outcome record of one executed test case.
 * @param {object} judgeProvider - Provider forwarded to judgeWithLLM.
 * @param {{model?: string}} [options] - Optional judge model name.
 * @returns {Promise<Array<object>>} One verdict record per criterion.
 */
async function judgeCriteria(criteria, outcome, judgeProvider, options) {
  const verdicts = [];
  for (const criterion of criteria) {
    const verdict = criterion.method === "deterministic"
      ? judgeDeterministic(criterion, outcome)
      : await judgeWithLLM(criterion, outcome, judgeProvider, options?.model);
    verdicts.push(verdict);
  }
  return verdicts;
}
|
|
962
|
+
/**
 * Evaluates a deterministic criterion against the outcome's output text.
 *
 * @param {object} criterion - Criterion with `id`, `deterministic_check` and
 *   optional `deterministic_check_params`.
 * @param {{output: {text: string}}} outcome - Outcome whose text is checked.
 * @returns {object} Verdict record with confidence 1. The verdict is "yes"
 *   only when runCheck reports severity "pass"; a criterion without a check
 *   name is answered "no" without running anything.
 */
function judgeDeterministic(criterion, outcome) {
  const verdictOf = (verdict, reasoning) => ({
    criterion_id: criterion.id,
    verdict,
    confidence: 1,
    reasoning,
    method: "deterministic"
  });
  const checkName = criterion.deterministic_check;
  if (!checkName) {
    return verdictOf("no", "Deterministic criterion has no check defined");
  }
  const checkResult = runCheck(checkName, outcome.output.text, criterion.deterministic_check_params);
  return verdictOf(checkResult.severity === "pass" ? "yes" : "no", checkResult.message);
}
|
|
985
|
+
/**
 * Asks the judge provider to evaluate one criterion.
 *
 * @param {object} criterion - Criterion with `id`, `description` and an
 *   optional `judge_prompt`.
 * @param {{prompt: string, output: {text: string}}} outcome - Test outcome.
 * @param {{judge: Function}} provider - Must expose an async
 *   `judge(description, prompt, outputText, judgePrompt)`.
 * @param {string} [model] - Recorded as `judge_model`; it is not forwarded to
 *   the provider by this function.
 * @returns {Promise<object>} Verdict record. If the provider throws, the
 *   verdict is "unsure" with confidence 0 and the error text in `reasoning`.
 */
async function judgeWithLLM(criterion, outcome, provider, model) {
  let judged;
  try {
    judged = await provider.judge(
      criterion.description,
      outcome.prompt,
      outcome.output.text,
      criterion.judge_prompt
    );
    const { verdict, confidence, reasoning } = judged;
    return {
      criterion_id: criterion.id,
      verdict,
      confidence,
      reasoning,
      method: "judge",
      judge_model: model
    };
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    return {
      criterion_id: criterion.id,
      verdict: "unsure",
      confidence: 0,
      reasoning: `Judge error: ${detail}`,
      method: "judge",
      judge_model: model
    };
  }
}
|
|
1012
|
+
/**
 * Summarises verdicts into a score card.
 *
 * @param {Array<{criterion_id: *, verdict: string}>} results - Verdicts.
 *   "yes" counts as passed, "no" as failed, anything else as unsure.
 * @param {Array<{id: *, blocker?: boolean}>} criteria - Criteria definitions,
 *   used to detect failed blockers.
 * @param {Array<{is_sacred?: boolean}>} [regressions=[]] - Regressions; those
 *   with a truthy `is_sacred` are counted.
 * @returns {object} Counts plus `pass_rate` (0 when there are no results).
 */
function computeScoreCard(results, criteria, regressions = []) {
  const criteriaById = new Map(criteria.map((c) => [c.id, c]));
  const counts = { passed: 0, failed: 0, unsure: 0, blockerFailures: 0 };
  for (const { verdict, criterion_id: criterionId } of results) {
    if (verdict === "yes") {
      counts.passed += 1;
      continue;
    }
    if (verdict !== "no") {
      counts.unsure += 1;
      continue;
    }
    counts.failed += 1;
    if (criteriaById.get(criterionId)?.blocker) {
      counts.blockerFailures += 1;
    }
  }
  const sacredRegressions = regressions.filter((r) => r.is_sacred).length;
  const total = results.length;
  return {
    total_criteria: total,
    passed: counts.passed,
    failed: counts.failed,
    unsure: counts.unsure,
    blocker_failures: counts.blockerFailures,
    sacred_regressions: sacredRegressions,
    pass_rate: total > 0 ? counts.passed / total : 0
  };
}
|
|
1036
|
+
/**
 * Maps a score card to a rollout decision.
 *
 * Precedence: any blocker failure or sacred regression -> "block"; otherwise
 * an obsolete flag -> "obsolete_review"; otherwise any failed or unsure
 * criterion -> "warn"; otherwise "ship".
 *
 * @param {object} score - Score card with `blocker_failures`,
 *   `sacred_regressions`, `failed` and `unsure` counts.
 * @param {boolean} [isObsolete=false] - Whether the skill looks obsolete.
 * @returns {"block"|"obsolete_review"|"warn"|"ship"} The decision.
 */
function decideRollout(score, isObsolete = false) {
  if (score.blocker_failures > 0) return "block";
  if (score.sacred_regressions > 0) return "block";
  if (isObsolete) return "obsolete_review";
  if (score.failed > 0 || score.unsure > 0) return "warn";
  return "ship";
}

/**
 * Builds the launch report for a skill from its score card.
 *
 * @param {string} skillName - Recorded as `skill_name`.
 * @param {object} score - Score card (see decideRollout for the fields used).
 * @param {Array<object>} regressions - Passed through unchanged.
 * @param {*} baseline - Passed through unchanged.
 * @param {boolean} isObsolete - Whether the skill looks obsolete.
 * @param {{now?: string, adoptionVerdict?: *}} [opts={}] - `now` overrides the
 *   timestamp; `adoptionVerdict` is copied into the report only when defined.
 * @returns {object} Report with decision, blockers, warnings and reasoning.
 */
function buildLaunchReport(skillName, score, regressions, baseline, isObsolete, opts = {}) {
  const decision = decideRollout(score, isObsolete);
  const blockers = [];
  const warnings = [];
  if (score.blocker_failures > 0) {
    blockers.push(`${score.blocker_failures} blocker criteria failed`);
  }
  if (score.sacred_regressions > 0) {
    blockers.push(`${score.sacred_regressions} sacred regressions detected`);
  }
  if (isObsolete) {
    warnings.push("Skill may be obsolete \u2014 baseline matches skill on most criteria");
  }
  if (score.unsure > 0) {
    warnings.push(`${score.unsure} criteria could not be judged (unsure)`);
  }
  // `failed` includes blocker failures; subtract them so the warning reports
  // only the non-blocker ones, even when a blocker has also failed.
  const nonBlockerFailures = Math.max(0, score.failed - score.blocker_failures);
  if (nonBlockerFailures > 0) {
    warnings.push(`${nonBlockerFailures} non-blocker criteria failed`);
  }
  let reasoning;
  if (decision === "ship") {
    reasoning = `All ${score.total_criteria} criteria passed. Ready to ship.`;
  } else if (decision === "block") {
    reasoning = `Release blocked: ${blockers.join("; ")}`;
  } else if (decision === "obsolete_review") {
    reasoning = "Skill flagged for obsolete review \u2014 baseline model matches skill performance.";
  } else {
    reasoning = `${score.passed}/${score.total_criteria} criteria passed with warnings.`;
  }
  return {
    skill_name: skillName,
    // DR-103 D5 B5.1: injected clock for replayability; falls back to wall clock
    // only for legacy callers that have not yet adopted injection.
    timestamp: opts.now ?? (/* @__PURE__ */ new Date()).toISOString(),
    decision,
    score,
    regressions,
    baseline,
    blockers,
    warnings,
    reasoning,
    // DR-103 D4 B4.2: additive advisory field; present only when opted in.
    ...opts.adoptionVerdict !== void 0 ? { adoptionVerdict: opts.adoptionVerdict } : {}
  };
}
|
|
1079
|
+
/**
 * Groups failed verdicts by the judging method of their criterion.
 *
 * @param {Array<{criterion_id: *, verdict: string}>} results - Verdicts; only
 *   those with verdict "no" are clustered.
 * @param {Array<{id: *, method?: string, blocker?: boolean}>} criteria -
 *   Criteria definitions. Failures whose criterion is not found, or has no
 *   method, fall into the "unknown" group.
 * @returns {Array<object>} Clusters sorted by severity (critical, high,
 *   medium) and then by descending count. A cluster is "critical" when it
 *   contains a blocker, "high" when it has more than two failures, otherwise
 *   "medium".
 */
function clusterFailures(results, criteria) {
  const severityRank = { critical: 0, high: 1, medium: 2 };
  const criteriaById = new Map(criteria.map((c) => [c.id, c]));
  const buckets = /* @__PURE__ */ new Map();
  for (const failure of results) {
    if (failure.verdict !== "no") continue;
    const criterion = criteriaById.get(failure.criterion_id);
    const method = criterion?.method ?? "unknown";
    if (!buckets.has(method)) {
      buckets.set(method, { ids: [], hasBlocker: false });
    }
    const bucket = buckets.get(method);
    bucket.ids.push(failure.criterion_id);
    if (criterion?.blocker) {
      bucket.hasBlocker = true;
    }
  }
  const clusters = [];
  for (const [method, { ids, hasBlocker }] of buckets) {
    let severity = "medium";
    if (hasBlocker) {
      severity = "critical";
    } else if (ids.length > 2) {
      severity = "high";
    }
    clusters.push({
      pattern: `${method} failures`,
      criterion_ids: ids,
      count: ids.length,
      severity
    });
  }
  return clusters.sort((a, b) => {
    const bySeverity = severityRank[a.severity] - severityRank[b.severity];
    return bySeverity !== 0 ? bySeverity : b.count - a.count;
  });
}
|
|
1110
|
+
/**
 * Picks the failed criterion that should be addressed first.
 *
 * Priority: the first failed blocker, then the first failed
 * regression-critical criterion, then simply the first failure.
 *
 * @param {Array<{criterion_id: *, verdict: string}>} results - Verdicts.
 * @param {Array<{id: *, blocker?: boolean, regression_critical?: boolean}>}
 *   criteria - Criteria definitions.
 * @returns {*} The chosen criterion id, or `null` when nothing failed.
 */
function selectWeakest(results, criteria) {
  const failures = results.filter((r) => r.verdict === "no");
  if (failures.length === 0) {
    return null;
  }
  const criteriaById = new Map(criteria.map((c) => [c.id, c]));
  const firstFlagged = (flag) => failures.find((f) => criteriaById.get(f.criterion_id)?.[flag]);
  const chosen = firstFlagged("blocker") ?? firstFlagged("regression_critical") ?? failures[0];
  return chosen.criterion_id;
}
|
|
1122
|
+
/**
 * Tells whether the last evaluation is too old to be trusted.
 *
 * @param {string|number|Date} lastRunTimestamp - Anything accepted by the
 *   `Date` constructor.
 * @param {number} [maxAgeDays=30] - Maximum accepted age in days.
 * @returns {boolean} `true` when the last run is older than `maxAgeDays`, or
 *   when the timestamp cannot be parsed (an unknown age is treated as stale
 *   rather than silently reported as fresh).
 */
function needsReevaluation(lastRunTimestamp, maxAgeDays = 30) {
  const MS_PER_DAY = 1e3 * 60 * 60 * 24;
  const lastRunMs = new Date(lastRunTimestamp).getTime();
  // An invalid date yields NaN, and every comparison with NaN is false, which
  // would otherwise make an unparseable timestamp look up to date.
  if (Number.isNaN(lastRunMs)) {
    return true;
  }
  const ageDays = (Date.now() - lastRunMs) / MS_PER_DAY;
  return ageDays > maxAgeDays;
}
|
|
1129
|
+
// Local aliases for identifiers defined elsewhere in this bundle, so the code
// below can refer to them by shorter names.
var PREDICATE_URI = GATE_RESULT_V1_URI2;
var STATEMENT_TYPE = IN_TOTO_STATEMENT_V1_TYPE2;
var GateResultEnum = GateDecisionSchema2;
var AdvisorySeverityEnum = AdvisorySeveritySchema2;
// Enum schema accepting exactly these five pipeline-side labels.
var PipelineSideEnum = z6.enum(["client", "server", "ci", "sandbox", "local"]);
|
|
1134
|
+
// Evidence statement schema: the kernel schema plus two j-rig cross-field
// checks between the first subject and the predicate.
//   I1: subject[0].name must equal predicate.gate_id.
//   I2: subject[0].digest.sha256 must equal predicate.input_hash with its
//       leading "sha256:" characters sliced off.
// Both checks are skipped when the statement has no first subject.
var EvidenceStatementSchema2 = KernelEvidenceStatementSchemaInternal.superRefine(
  (stmt, ctx) => {
    const primarySubject = stmt.subject[0];
    if (primarySubject === void 0) return;
    const report = (path, message) => ctx.addIssue({ code: "custom", path, message });
    if (primarySubject.name !== stmt.predicate.gate_id) {
      report(
        ["subject", 0, "name"],
        "j-rig secondary check I1: subject[0].name must equal predicate.gate_id"
      );
    }
    const bareInputHash = stmt.predicate.input_hash.slice("sha256:".length);
    if (primarySubject.digest.sha256 !== bareInputHash) {
      report(
        ["subject", 0, "digest", "sha256"],
        "j-rig secondary check I2: subject[0].digest.sha256 must equal predicate.input_hash without sha256: prefix"
      );
    }
  }
);
|
|
1155
|
+
// Strict container schema: `bundle_format` must be the literal "json-array"
// and `rows` must be an array of statements that each satisfy
// EvidenceStatementSchema2; unknown keys are rejected by .strict().
var LegacyBundleContainerSchema = z6.object({
  bundle_format: z6.literal("json-array"),
  rows: z6.array(EvidenceStatementSchema2)
}).strict();
|
|
1159
|
+
/**
 * Builds an evidence statement from camelCase input and validates it.
 *
 * The single subject is named after `input.gateId` and carries the input hash
 * without its "sha256:" prefix as digest. `evaluated_at` defaults to the
 * current time when `input.evaluatedAt` is nullish. `metadata`,
 * `failure_mode` and `advisory_severity` are added to the predicate only when
 * the corresponding input field is defined.
 *
 * @param {object} input - Gate result fields (gateId, gateName, gateVersion,
 *   gateDecision, gateReasons, coverage, policyRef, policyHash, inputHash,
 *   runner, commitSha, plus the optional fields above).
 * @returns {object} The parsed statement returned by EvidenceStatementSchema2.
 * @throws {Error} When `inputHash` lacks the "sha256:" prefix or the composed
 *   statement fails schema validation (issues are listed as `path: message`).
 */
function composeStatement(input) {
  const HASH_PREFIX = "sha256:";
  if (!input.inputHash.startsWith(HASH_PREFIX)) {
    throw new Error(
      `composeStatement: inputHash must be sha256:-prefixed (got: ${input.inputHash})`
    );
  }
  const predicate = {
    gate_id: input.gateId,
    gate_name: input.gateName,
    gate_version: input.gateVersion,
    gate_decision: input.gateDecision,
    gate_reasons: input.gateReasons,
    coverage: {
      dimensions_evaluated: input.coverage.dimensionsEvaluated,
      dimensions_skipped: input.coverage.dimensionsSkipped
    },
    policy_ref: input.policyRef,
    policy_hash: input.policyHash,
    input_hash: input.inputHash,
    evaluated_at: input.evaluatedAt ?? (/* @__PURE__ */ new Date()).toISOString(),
    runner: input.runner,
    commit_sha: input.commitSha,
    ...input.metadata !== void 0 ? { metadata: input.metadata } : {},
    ...input.failureMode !== void 0 ? { failure_mode: input.failureMode } : {},
    ...input.advisorySeverity !== void 0 ? { advisory_severity: input.advisorySeverity } : {}
  };
  const candidate = {
    _type: STATEMENT_TYPE,
    subject: [
      {
        name: input.gateId,
        digest: { sha256: input.inputHash.slice(HASH_PREFIX.length) }
      }
    ],
    predicateType: PREDICATE_URI,
    predicate
  };
  const parsed = EvidenceStatementSchema2.safeParse(candidate);
  if (parsed.success) {
    return parsed.data;
  }
  const problems = parsed.error.issues
    .map((issue) => `${issue.path.join(".") || "<root>"}: ${issue.message}`)
    .join("; ");
  throw new Error(`composeStatement: validation failed: ${problems}`);
}
|
|
1206
|
+
/**
 * Validates a statement and serialises it to JSON.
 *
 * @param {object} stmt - Statement expected to satisfy EvidenceStatementSchema2.
 * @returns {string} `JSON.stringify(stmt)` of the statement exactly as given
 *   (not of the parsed copy).
 * @throws {Error} When validation fails. Each issue is reported as
 *   `path: message`, the same format composeStatement uses, so the offending
 *   field can be located.
 */
function serializeStatement(stmt) {
  const check = EvidenceStatementSchema2.safeParse(stmt);
  if (!check.success) {
    const problems = check.error.issues
      .map((issue) => `${issue.path.join(".") || "<root>"}: ${issue.message}`)
      .join("; ");
    throw new Error(`serializeStatement: validation failed: ${problems}`);
  }
  return JSON.stringify(stmt);
}
|
|
1215
|
+
/**
 * Error raised for provider failures, tagged with a category and the name of
 * the provider that produced it.
 *
 * Constructor argument fields:
 *   - message: error message.
 *   - category: failure category label.
 *   - providerName: name of the provider.
 *   - retryable (optional): when nullish, defaultRetryableFor(category) is used.
 *   - originalError (optional): underlying error; kept on `originalError` and,
 *     when defined, also exposed through the standard `cause` property.
 */
var ProviderError = class _ProviderError extends Error {
  category;
  providerName;
  retryable;
  originalError;
  constructor(args) {
    // Forward the underlying error as `cause` only when one was supplied, so
    // errors without it do not gain an own `cause: undefined` property.
    super(args.message, args.originalError !== void 0 ? { cause: args.originalError } : void 0);
    this.name = "ProviderError";
    this.category = args.category;
    this.providerName = args.providerName;
    this.retryable = args.retryable ?? defaultRetryableFor(args.category);
    this.originalError = args.originalError;
    // Explicitly set the prototype so `instanceof` holds for this class.
    Object.setPrototypeOf(this, _ProviderError.prototype);
  }
};
|
|
1230
|
+
/**
 * Default retry policy for a provider error category.
 *
 * @param {string} c - Error category.
 * @returns {boolean} `true` for "rate_limit" and "network_timeout"; `false`
 *   for every other category, including ones this function does not know, so
 *   callers always receive a boolean.
 */
function defaultRetryableFor(c) {
  switch (c) {
    case "rate_limit":
    case "network_timeout":
      return true;
    case "authentication":
    case "model_not_found":
    case "content_policy_refusal":
    case "schema_violation":
    case "unknown":
      return false;
    default:
      // Unrecognised categories are not retried.
      return false;
  }
}
|
|
1243
|
+
// Strict schema for a verdict response: `verdict` must be "yes", "no" or
// "unsure" and `reasoning` must be a string; unknown keys are rejected.
var RESPONSE_ZOD = z7.object({
  verdict: z7.enum(["yes", "no", "unsure"]),
  reasoning: z7.string()
}).strict();
// Strict schema for sample tool arguments: a string `title`, a `start_time`
// that must be a date-time string, and an integer `duration_minutes` between
// 1 and 480 inclusive; unknown keys are rejected.
var SAMPLE_TOOL_ARGS_ZOD = z8.object({
  title: z8.string(),
  start_time: z8.string().datetime({ message: "start_time must be RFC 3339 date-time" }),
  duration_minutes: z8.number().int().min(1).max(480)
}).strict();
|
|
1252
|
+
// Event names passed to emitOtelEvent, grouped by category prefix
// (`runtime.*`, `judge.*`, `gate.*`).
var OtelEvents = {
  // EXECUTION events (067 § 1.1, category `runtime.*`).
  /** A worker leases an EvalRun and transitions it to `running`. */
  RUNTIME_RUN_STARTED: "runtime.run.started",
  /** The EvalRun reaches a terminal state. */
  RUNTIME_RUN_FINISHED: "runtime.run.finished",
  /** One matcher/criterion is scored within a SessionTrace. */
  RUNTIME_CRITERION_EVALUATED: "runtime.criterion.evaluated",
  // JUDGE events (067 § 1.2, category `judge.*`).
  /** An LLM judge is dialed for a matching event. */
  JUDGE_INVOKED: "judge.invoked",
  /** A JudgeDecision is finalized for a matching event. */
  JUDGE_VERDICT: "judge.verdict",
  // GOVERNANCE events (067 § 2.2, category `gate.*`).
  /**
   * A RolloutGate decision row is emitted under `gate-result/v1`. The NORMATIVE
   * end-of-evaluation event a ship-gate dashboard alerts on. Identical spelling
   * to the audit-harness iah-E07 emitter.
   */
  GATE_DECISION_EMITTED: "gate.decision.emitted"
};
|
|
1273
|
+
// Attribute keys used in event payloads. Note that JUDGE_VERDICT here has the
// same string value ("judge.verdict") as the event name OtelEvents.JUDGE_VERDICT.
var OtelAttrs = {
  // Shared correlation metadata — every event MUST carry eval.run_id (067 § 4.2;
  // kernel YAML `shared_attributes`). An event without it is malformed, not
  // "partial" (067 § 5).
  EVAL_RUN_ID: "eval.run_id",
  EVAL_SESSION_TRACE_ID: "eval.session_trace_id",
  TRACE_ID: "trace.id",
  // runtime.run.started payload (067 § 1.1).
  RUNTIME_RUN_SPEC_CONTENT_HASH: "runtime.run.spec_content_hash",
  RUNTIME_RUN_SKILL_SNAPSHOT_SHA: "runtime.run.skill_snapshot_sha",
  // runtime.run.finished payload (067 § 1.1).
  RUNTIME_RUN_TERMINAL_STATE: "runtime.run.terminal_state",
  RUNTIME_RUN_DURATION_MS: "runtime.run.duration_ms",
  // runtime.criterion.evaluated payload (067 § 1.1).
  RUNTIME_CRITERION_MATCHER_CLASS: "runtime.criterion.matcher_class",
  RUNTIME_CRITERION_OUTCOME: "runtime.criterion.outcome",
  // judge.invoked payload (067 § 1.2).
  JUDGE_ID: "judge.id",
  JUDGE_MODEL_ID: "judge.model_id",
  JUDGE_MODEL_VERSION: "judge.model_version",
  // judge.verdict payload (067 § 1.2).
  JUDGE_VERDICT: "judge.verdict",
  JUDGE_VERDICT_SOURCE: "judge.verdict_source",
  JUDGE_SEED: "judge.seed",
  // gate.decision.emitted payload (067 § 2.2). Identical to iah-E07 spelling.
  GATE_NAME: "gate.name",
  GATE_DECISION: "gate.decision",
  GATE_POLICY_REF: "gate.policy_ref"
};
|
|
1302
|
+
// Terminal-state labels (presumably the values carried by
// OtelAttrs.RUNTIME_RUN_TERMINAL_STATE — confirm against the emitters).
var RuntimeTerminalState = {
  JUDGED: "judged",
  ARCHIVED_SUCCESS: "archived_success",
  ARCHIVED_FAILED: "archived_failed"
};
// Per-criterion outcome labels (presumably for
// OtelAttrs.RUNTIME_CRITERION_OUTCOME — confirm against the emitters).
var CriterionOutcome = {
  PASS: "pass",
  FAIL: "fail",
  SKIP: "skip"
};
// Labels describing how a judge verdict was produced (presumably for
// OtelAttrs.JUDGE_VERDICT_SOURCE — confirm against the emitters).
var JudgeVerdictSource = {
  LLM_WITH_SEED: "llm_with_seed",
  LLM_NO_SEED: "llm_no_seed",
  DETERMINISTIC: "deterministic"
};
// Gate decision labels (presumably for OtelAttrs.GATE_DECISION — confirm
// against the emitters).
var GateDecision = {
  PASS: "pass",
  FAIL: "fail",
  ADVISORY: "advisory",
  ERROR: "error"
};
// Name and version passed to trace2.getTracer() by emitOtelEvent.
var TRACER_NAME = "@j-rig/core";
var TRACER_VERSION = "2.1.0";
|
|
1325
|
+
/**
 * Reports whether events should also be written to stderr.
 *
 * @returns {boolean} `true` when the environment variable `J_RIG_OTEL` is
 *   exactly "1" or `OTEL_EXPORTER_OTLP_ENDPOINT` is set to a non-empty string.
 */
function stderrEmissionActive() {
  if (process.env.J_RIG_OTEL === "1") {
    return true;
  }
  const endpointLength = process.env.OTEL_EXPORTER_OTLP_ENDPOINT?.length ?? 0;
  return endpointLength > 0;
}
|
|
1328
|
+
/**
 * Copies a payload into a fresh attribute object, leaving out entries whose
 * value is `null` or `undefined`. Other falsy values (0, "", false) are kept.
 *
 * @param {object} payload - Source key/value pairs.
 * @returns {object} New object holding only the non-nullish entries.
 */
function toAttributes(payload) {
  const attrs = {};
  Object.entries(payload)
    .filter(([, value]) => value !== null && value !== void 0)
    .forEach(([key, value]) => {
      attrs[key] = value;
    });
  return attrs;
}
|
|
1336
|
+
/**
 * Emits one named event with the given payload as attributes.
 *
 * The payload must carry a non-empty string under OtelAttrs.EVAL_RUN_ID;
 * otherwise the event is dropped (with an `[OTEL-DROP]` line on stderr when
 * stderr emission is active). The event is added to `span` if given, else to
 * the active span, else to a span started and ended just for this event. When
 * stderr emission is active, an `[OTEL]` JSON line is written as well.
 *
 * Any exception raised along the way is swallowed, so this function never
 * throws.
 *
 * @param {string} name - Event name.
 * @param {object} payload - Attribute key/value pairs; nullish values are
 *   dropped via toAttributes.
 * @param {object} [span] - Span to attach the event to.
 * @returns {void}
 */
function emitOtelEvent(name, payload, span) {
  try {
    const runId = payload[OtelAttrs.EVAL_RUN_ID];
    const hasRunId = typeof runId === "string" && runId.length !== 0;
    if (!hasRunId) {
      if (stderrEmissionActive()) {
        process.stderr.write(
          `[OTEL-DROP] event '${name}' missing required eval.run_id; dropped (067 \xA7 4.2)\n`
        );
      }
      return;
    }
    const attrs = toAttributes(payload);
    const targetSpan = span ?? trace2.getActiveSpan();
    if (targetSpan) {
      targetSpan.addEvent(name, attrs);
    } else {
      // No span available: open a short-lived one that only carries this event.
      const oneShot = trace2.getTracer(TRACER_NAME, TRACER_VERSION).startSpan(name);
      oneShot.addEvent(name, attrs);
      oneShot.end();
    }
    if (stderrEmissionActive()) {
      const line = JSON.stringify({ name, attributes: attrs });
      process.stderr.write(`[OTEL] ${line}\n`);
    }
  } catch {
    // Intentionally empty: emission failures are ignored.
  }
}
|
|
1366
|
+
/**
 * Maps a correlation context onto the three shared correlation attribute keys.
 *
 * @param {{evalRunId: *, sessionTraceId: *, traceId: *}} c - Correlation ids.
 * @returns {object} Object keyed by OtelAttrs.EVAL_RUN_ID,
 *   OtelAttrs.EVAL_SESSION_TRACE_ID and OtelAttrs.TRACE_ID.
 */
function correlationAttrs(c) {
  const attrs = {};
  attrs[OtelAttrs.EVAL_RUN_ID] = c.evalRunId;
  attrs[OtelAttrs.EVAL_SESSION_TRACE_ID] = c.sessionTraceId;
  attrs[OtelAttrs.TRACE_ID] = c.traceId;
  return attrs;
}
|
|
1373
|
+
/**
 * Emits `runtime.run.started` with the correlation ids plus the spec content
 * hash and skill snapshot SHA taken from `payload`.
 *
 * @param {object} c - Correlation context (see correlationAttrs).
 * @param {{specContentHash: *, skillSnapshotSha: *}} payload - Event data.
 */
function emitRuntimeRunStarted(c, payload) {
  const attrs = {
    ...correlationAttrs(c),
    [OtelAttrs.RUNTIME_RUN_SPEC_CONTENT_HASH]: payload.specContentHash,
    [OtelAttrs.RUNTIME_RUN_SKILL_SNAPSHOT_SHA]: payload.skillSnapshotSha
  };
  emitOtelEvent(OtelEvents.RUNTIME_RUN_STARTED, attrs);
}
|
|
1380
|
+
/**
 * Emits `runtime.run.finished` with the correlation ids plus the terminal
 * state and duration taken from `payload`.
 *
 * @param {object} c - Correlation context (see correlationAttrs).
 * @param {{terminalState: *, durationMs: *}} payload - Event data.
 */
function emitRuntimeRunFinished(c, payload) {
  const attrs = {
    ...correlationAttrs(c),
    [OtelAttrs.RUNTIME_RUN_TERMINAL_STATE]: payload.terminalState,
    [OtelAttrs.RUNTIME_RUN_DURATION_MS]: payload.durationMs
  };
  emitOtelEvent(OtelEvents.RUNTIME_RUN_FINISHED, attrs);
}
|
|
1387
|
+
/**
 * Emits `runtime.criterion.evaluated` with the correlation ids plus the
 * matcher class and outcome taken from `payload`.
 *
 * @param {object} c - Correlation context (see correlationAttrs).
 * @param {{matcherClass: *, outcome: *}} payload - Event data.
 */
function emitRuntimeCriterionEvaluated(c, payload) {
  const attrs = {
    ...correlationAttrs(c),
    [OtelAttrs.RUNTIME_CRITERION_MATCHER_CLASS]: payload.matcherClass,
    [OtelAttrs.RUNTIME_CRITERION_OUTCOME]: payload.outcome
  };
  emitOtelEvent(OtelEvents.RUNTIME_CRITERION_EVALUATED, attrs);
}
|
|
1394
|
+
/**
 * Emits `judge.invoked` with the correlation ids plus the judge id, model id
 * and model version taken from `payload`.
 *
 * @param {object} c - Correlation context (see correlationAttrs).
 * @param {{judgeId: *, modelId: *, modelVersion: *}} payload - Event data.
 */
function emitJudgeInvoked(c, payload) {
  const attrs = {
    ...correlationAttrs(c),
    [OtelAttrs.JUDGE_ID]: payload.judgeId,
    [OtelAttrs.JUDGE_MODEL_ID]: payload.modelId,
    [OtelAttrs.JUDGE_MODEL_VERSION]: payload.modelVersion
  };
  emitOtelEvent(OtelEvents.JUDGE_INVOKED, attrs);
}
|
|
1402
|
+
/**
 * Emits `judge.verdict` with the correlation ids plus the verdict, verdict
 * source and seed taken from `payload`.
 *
 * @param {object} c - Correlation context (see correlationAttrs).
 * @param {{verdict: *, verdictSource: *, seed: *}} payload - Event data.
 */
function emitJudgeVerdict(c, payload) {
  const attrs = {
    ...correlationAttrs(c),
    [OtelAttrs.JUDGE_VERDICT]: payload.verdict,
    [OtelAttrs.JUDGE_VERDICT_SOURCE]: payload.verdictSource,
    [OtelAttrs.JUDGE_SEED]: payload.seed
  };
  emitOtelEvent(OtelEvents.JUDGE_VERDICT, attrs);
}
|
|
1410
|
+
/**
 * Emits `gate.decision.emitted` with the correlation ids plus the gate name,
 * decision and policy reference taken from `payload`.
 *
 * @param {object} c - Correlation context (see correlationAttrs).
 * @param {{gateName: *, decision: *, policyRef: *}} payload - Event data.
 */
function emitGateDecisionEmitted(c, payload) {
  const attrs = {
    ...correlationAttrs(c),
    [OtelAttrs.GATE_NAME]: payload.gateName,
    [OtelAttrs.GATE_DECISION]: payload.decision,
    [OtelAttrs.GATE_POLICY_REF]: payload.policyRef
  };
  emitOtelEvent(OtelEvents.GATE_DECISION_EMITTED, attrs);
}
|
|
1418
|
+
/**
 * Generates a UUID string whose first 48 bits hold a millisecond timestamp
 * and whose remaining bits are random, with the version nibble set to 7 and
 * the two top bits of byte 8 set to `10`.
 *
 * @param {number} [nowMs=Date.now()] - Timestamp in milliseconds; it is
 *   floored and negative values are clamped to 0.
 * @returns {string} Lower-case hex string in 8-4-4-4-12 layout.
 */
function uuidv7(nowMs = Date.now()) {
  const bytes = randomBytes(16);
  const ts = Math.max(0, Math.floor(nowMs));
  // Write the timestamp big-endian into bytes 0..5. Division is used instead
  // of a shift because bitwise shifts work on 32 bits only.
  for (let i = 0; i < 6; i += 1) {
    bytes[i] = ts / 2 ** (8 * (5 - i)) & 255;
  }
  bytes[6] = bytes[6] & 15 | 112; // high nibble -> 0x7
  bytes[8] = bytes[8] & 63 | 128; // top two bits -> 10
  const hex = bytes.toString("hex");
  const groups = [
    hex.slice(0, 8),
    hex.slice(8, 12),
    hex.slice(12, 16),
    hex.slice(16, 20),
    hex.slice(20, 32)
  ];
  return groups.join("-");
}
|
|
1432
|
+
|
|
1433
|
+
// src/lib/output.ts
|
|
1434
|
+
import chalk from "chalk";
|
|
1435
|
+
/**
 * Returns the coloured one-character marker for a result severity.
 *
 * @param {string} severity - "pass", "error", "warning" or "info".
 * @returns {string} Green check mark, red cross, yellow "!" or blue middle
 *   dot. Any other value is returned unchanged.
 */
function icon(severity) {
  if (severity === "pass") return chalk.green("\u2713");
  if (severity === "error") return chalk.red("\u2717");
  if (severity === "warning") return chalk.yellow("!");
  if (severity === "info") return chalk.blue("\xB7");
  // Not one of the known severities: hand the value back as-is.
  return severity;
}
|
|
1451
|
+
/**
 * Formats a duration for display.
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} `<ms>ms` below one second, otherwise seconds with one
 *   decimal place followed by `s`.
 */
function formatDuration(ms) {
  return ms < 1e3 ? `${ms}ms` : `${(ms / 1e3).toFixed(1)}s`;
}
|
|
1455
|
+
/**
 * Returns the coloured, bold, upper-case label for a rollout decision.
 *
 * @param {string} decision - "ship", "block", "warn" or "obsolete_review".
 * @returns {string} Green SHIP, red BLOCK, yellow WARN or magenta
 *   OBSOLETE_REVIEW. Any other value is returned unchanged.
 */
function formatDecision(decision) {
  if (decision === "ship") return chalk.green.bold("SHIP");
  if (decision === "block") return chalk.red.bold("BLOCK");
  if (decision === "warn") return chalk.yellow.bold("WARN");
  if (decision === "obsolete_review") return chalk.magenta.bold("OBSOLETE_REVIEW");
  // Not one of the known decisions: hand the value back as-is.
  return decision;
}
|
|
1471
|
+
/**
 * Formats a pass count as `passed/total (pct%)`, coloured by result.
 *
 * @param {number} passed - Number of passed items.
 * @param {number} total - Total number of items.
 * @returns {string} Green when `passed === total`, yellow when at least half
 *   passed, red otherwise. The percentage is rounded to a whole number and is
 *   "0" when `total` is not positive.
 */
function formatScore(passed, total) {
  const pct = total > 0 ? (passed / total * 100).toFixed(0) : "0";
  let paint;
  if (passed === total) {
    paint = chalk.green;
  } else if (passed / total >= 0.5) {
    paint = chalk.yellow;
  } else {
    paint = chalk.red;
  }
  return paint(`${passed}/${total} (${pct}%)`);
}
|
|
1476
|
+
/**
 * Styles a heading for terminal output.
 *
 * @param {string} text2 - Heading text.
 * @returns {string} The text wrapped in chalk's bold and underline styles.
 */
function header(text2) {
  return chalk.bold.underline(text2);
}
|
|
1479
|
+
/**
 * Print a human-readable package-check report to stdout.
 *
 * Prints a heading, the pass/warning/error counts, one line per non-passing
 * result (plus its details when present), and a closing verdict that depends
 * only on the error count.
 *
 * @param {object} report - Report with `skill_name`, `summary` and `results`.
 */
function printReport(report) {
  const { summary } = report;
  const skillLabel = report.skill_name ?? "unknown";
  console.log(header(`Package Check: ${skillLabel}`));
  console.log(
    ` ${summary.passed} passed, ${summary.warnings} warnings, ${summary.errors} errors
`
  );
  for (const entry of report.results) {
    // Passing checks are counted in the summary but not listed individually.
    if (entry.severity !== "pass") {
      console.log(` ${icon(entry.severity)} ${chalk.dim(entry.id)}: ${entry.message}`);
      if (entry.details) {
        console.log(` ${chalk.dim(entry.details)}`);
      }
    }
  }
  if (summary.errors !== 0) {
    console.log(chalk.red(`
${summary.errors} error(s) must be fixed.`));
  } else {
    console.log(chalk.green("\n All checks passed."));
  }
}
|
|
1499
|
+
|
|
1500
|
+
// src/commands/check.ts
|
|
1501
|
+
/**
 * Register the `check <skill-dir>` subcommand on a commander program.
 *
 * The action runs `checkPackage`, prints the report (JSON with `--json`,
 * otherwise via `printReport`) and exits with 1 when the report contains
 * errors or when anything throws, 0 otherwise.
 *
 * @param {object} program - The commander program to extend.
 */
function registerCheckCommand(program) {
  const command = program.command("check");
  command.description("Run package integrity checks on a skill directory");
  command.argument("<skill-dir>", "Path to skill directory containing SKILL.md");
  command.option("--json", "Output as JSON");
  command.action(async (skillDir, opts) => {
    try {
      const report = checkPackage(skillDir);
      if (!opts.json) {
        printReport(report);
      } else {
        console.log(JSON.stringify(report, null, 2));
      }
      const hasErrors = report.summary.errors > 0;
      process.exit(hasErrors ? 1 : 0);
    } catch (err) {
      console.error(`Error: ${err instanceof Error ? err.message : err}`);
      process.exit(1);
    }
  });
}
|
|
1517
|
+
|
|
1518
|
+
// src/commands/validate.ts
|
|
1519
|
+
import { readFileSync as readFileSync2 } from "fs";
|
|
1520
|
+
import { resolve as resolve2 } from "path";
|
|
1521
|
+
import chalk2 from "chalk";
|
|
1522
|
+
/**
 * Register the `validate <file>` subcommand on a commander program.
 *
 * The file is validated as an eval contract when `--contract` is passed or
 * when its text contains `contract_version:`; otherwise as an eval spec.
 * With `--json` a `{ valid, type, errors }` object is printed; without it a
 * colored verdict is printed (with a short summary for specs). The process
 * exits 0 on success and 1 on validation failure or any thrown error.
 *
 * @param {object} program - The commander program to extend.
 */
function registerValidateCommand(program) {
  const command = program.command("validate");
  command.description("Validate an eval spec or eval contract YAML file");
  command.argument("<file>", "Path to YAML file");
  command.option("--contract", "Validate as eval contract instead of eval spec");
  command.option("--json", "Output as JSON");
  command.action(async (file, opts) => {
    try {
      const content = readFileSync2(resolve2(file), "utf-8");
      // The explicit flag wins; otherwise sniff the raw text for the contract marker.
      const isContract = opts.contract === true || content.includes("contract_version:");
      const label = isContract ? "eval contract" : "eval spec";
      const schema = isContract ? EvalContractSchema : EvalSpecSchema;
      const result = parseAndValidateYaml(content, schema);

      if (opts.json) {
        const payload = {
          valid: result.success,
          type: label,
          errors: result.success ? [] : result.errors
        };
        console.log(JSON.stringify(payload, null, 2));
        process.exit(result.success ? 0 : 1);
        return;
      }

      if (!result.success) {
        console.error(chalk2.red(`\u2717 Invalid ${label}:`));
        for (const issue of result.errors) {
          console.error(` ${issue.path}: ${issue.message}`);
        }
        process.exit(1);
        return;
      }

      const data = result.data;
      const name = data.skill_name || "unknown";
      console.log(chalk2.green(`\u2713 Valid ${label}: ${name}`));
      if (!isContract) {
        const criteria = data.criteria?.length ?? 0;
        const cases = data.test_cases?.length ?? 0;
        const models = (data.models ?? ["sonnet"]).join(", ");
        console.log(` ${criteria} criteria, ${cases} test cases, models: ${models}`);
      }
      process.exit(0);
    } catch (err) {
      console.error(`Error: ${err instanceof Error ? err.message : err}`);
      process.exit(1);
    }
  });
}
|
|
1569
|
+
|
|
1570
|
+
// src/commands/eval.ts
|
|
1571
|
+
import chalk3 from "chalk";
|
|
1572
|
+
import { resolve as resolve4 } from "path";
|
|
1573
|
+
import { createHash as createHash2 } from "crypto";
|
|
1574
|
+
|
|
1575
|
+
// ../db/dist/index.js
|
|
1576
|
+
import Database from "better-sqlite3";
|
|
1577
|
+
import { drizzle } from "drizzle-orm/better-sqlite3";
|
|
1578
|
+
import { sqliteTable, text, integer, real } from "drizzle-orm/sqlite-core";
|
|
1579
|
+
import { eq, desc, sql as sqlTag } from "drizzle-orm";
|
|
1580
|
+
import { createHash } from "crypto";
|
|
1581
|
+
import { eq as eq2 } from "drizzle-orm";
|
|
1582
|
+
// Bundler helper: expose each entry of `all` on `target` as an enumerable,
// getter-backed property so the value is resolved lazily on access.
var __defProp = Object.defineProperty;
var __export = (target, all) => {
  for (var key in all) {
    __defProp(target, key, { get: all[key], enumerable: true });
  }
};

// Namespace object handed to drizzle as the schema; the getters defer the
// lookups until the table bindings below have been initialized.
var schema_exports = {};
__export(schema_exports, {
  artifacts: () => artifacts,
  criterionResults: () => criterionResults,
  runSummaries: () => runSummaries,
  runs: () => runs,
  skillHumanReviews: () => skillHumanReviews,
  skillUsageEvents: () => skillUsageEvents,
  skillVersions: () => skillVersions
});
|
|
1597
|
+
// Drizzle table definitions. The physical tables are created by the raw DDL
// in CREATE_TABLES; these objects describe the same columns for the query
// builder.
//
// The `created_at` defaults use a SQL expression (`sqlTag`) rather than a
// plain string: a string default is treated as a literal value, so rows
// inserted without `created_at` would store the text "(datetime('now'))"
// instead of a timestamp.

// One row per distinct SKILL.md content hash.
var skillVersions = sqliteTable("skill_versions", {
  id: integer("id").primaryKey({ autoIncrement: true }),
  skill_name: text("skill_name").notNull(),
  version: text("version").notNull(),
  skill_md_hash: text("skill_md_hash").notNull(),
  created_at: text("created_at").notNull().default(sqlTag`(datetime('now'))`)
});

// One row per evaluation run of a skill version.
var runs = sqliteTable("runs", {
  id: integer("id").primaryKey({ autoIncrement: true }),
  skill_version_id: integer("skill_version_id").notNull(),
  status: text("status").notNull().$type().default("pending"),
  run_type: text("run_type").notNull().default("deterministic"),
  model: text("model"),
  started_at: text("started_at"),
  completed_at: text("completed_at"),
  duration_ms: integer("duration_ms"),
  created_at: text("created_at").notNull().default(sqlTag`(datetime('now'))`),
  error_message: text("error_message")
});

// Per-criterion outcome rows belonging to a run.
var criterionResults = sqliteTable("criterion_results", {
  id: integer("id").primaryKey({ autoIncrement: true }),
  run_id: integer("run_id").notNull(),
  criterion_id: text("criterion_id").notNull(),
  passed: integer("passed", { mode: "boolean" }).notNull(),
  severity: text("severity").notNull(),
  message: text("message").notNull(),
  details: text("details"),
  method: text("method")
});

// Aggregate counts for a run; `run_id` is unique, so at most one per run.
var runSummaries = sqliteTable("run_summaries", {
  id: integer("id").primaryKey({ autoIncrement: true }),
  run_id: integer("run_id").notNull().unique(),
  total: integer("total").notNull(),
  passed: integer("passed").notNull(),
  warnings: integer("warnings").notNull(),
  errors: integer("errors").notNull(),
  score: real("score")
});

// Files recorded against a run.
var artifacts = sqliteTable("artifacts", {
  id: integer("id").primaryKey({ autoIncrement: true }),
  run_id: integer("run_id").notNull(),
  artifact_type: text("artifact_type").notNull(),
  filename: text("filename").notNull(),
  relative_path: text("relative_path").notNull(),
  size_bytes: integer("size_bytes"),
  created_at: text("created_at").notNull().default(sqlTag`(datetime('now'))`)
});

// Skill usage events with their CASS score; `tenant_id` is nullable.
var skillUsageEvents = sqliteTable("skill_usage_events", {
  id: integer("id").primaryKey({ autoIncrement: true }),
  skill_id: text("skill_id").notNull(),
  session_id: text("session_id").notNull(),
  source: text("source").notNull().$type(),
  cass_score: real("cass_score").notNull(),
  cass_passed: integer("cass_passed", { mode: "boolean" }).notNull(),
  tenant_id: text("tenant_id"),
  recorded_at: text("recorded_at").notNull()
});

// Human thumbs-up/down reviews of a skill; `tenant_id` is nullable.
var skillHumanReviews = sqliteTable("skill_human_reviews", {
  id: integer("id").primaryKey({ autoIncrement: true }),
  skill_id: text("skill_id").notNull(),
  thumbs_up: integer("thumbs_up", { mode: "boolean" }).notNull(),
  rationale: text("rationale"),
  reviewer: text("reviewer").notNull(),
  governance_class: text("governance_class").notNull().$type(),
  tenant_id: text("tenant_id"),
  recorded_at: text("recorded_at").notNull()
});
|
|
1664
|
+
var CREATE_TABLES = `
|
|
1665
|
+
CREATE TABLE IF NOT EXISTS skill_versions (
|
|
1666
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
1667
|
+
skill_name TEXT NOT NULL,
|
|
1668
|
+
version TEXT NOT NULL,
|
|
1669
|
+
skill_md_hash TEXT NOT NULL,
|
|
1670
|
+
created_at TEXT NOT NULL DEFAULT (datetime('now'))
|
|
1671
|
+
);
|
|
1672
|
+
|
|
1673
|
+
CREATE TABLE IF NOT EXISTS runs (
|
|
1674
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
1675
|
+
skill_version_id INTEGER NOT NULL,
|
|
1676
|
+
status TEXT NOT NULL DEFAULT 'pending',
|
|
1677
|
+
run_type TEXT NOT NULL DEFAULT 'deterministic',
|
|
1678
|
+
model TEXT,
|
|
1679
|
+
started_at TEXT,
|
|
1680
|
+
completed_at TEXT,
|
|
1681
|
+
duration_ms INTEGER,
|
|
1682
|
+
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
|
1683
|
+
error_message TEXT,
|
|
1684
|
+
FOREIGN KEY (skill_version_id) REFERENCES skill_versions(id)
|
|
1685
|
+
);
|
|
1686
|
+
|
|
1687
|
+
CREATE TABLE IF NOT EXISTS criterion_results (
|
|
1688
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
1689
|
+
run_id INTEGER NOT NULL,
|
|
1690
|
+
criterion_id TEXT NOT NULL,
|
|
1691
|
+
passed INTEGER NOT NULL,
|
|
1692
|
+
severity TEXT NOT NULL,
|
|
1693
|
+
message TEXT NOT NULL,
|
|
1694
|
+
details TEXT,
|
|
1695
|
+
method TEXT,
|
|
1696
|
+
FOREIGN KEY (run_id) REFERENCES runs(id)
|
|
1697
|
+
);
|
|
1698
|
+
|
|
1699
|
+
CREATE TABLE IF NOT EXISTS run_summaries (
|
|
1700
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
1701
|
+
run_id INTEGER NOT NULL UNIQUE,
|
|
1702
|
+
total INTEGER NOT NULL,
|
|
1703
|
+
passed INTEGER NOT NULL,
|
|
1704
|
+
warnings INTEGER NOT NULL,
|
|
1705
|
+
errors INTEGER NOT NULL,
|
|
1706
|
+
score REAL,
|
|
1707
|
+
FOREIGN KEY (run_id) REFERENCES runs(id)
|
|
1708
|
+
);
|
|
1709
|
+
|
|
1710
|
+
CREATE TABLE IF NOT EXISTS artifacts (
|
|
1711
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
1712
|
+
run_id INTEGER NOT NULL,
|
|
1713
|
+
artifact_type TEXT NOT NULL,
|
|
1714
|
+
filename TEXT NOT NULL,
|
|
1715
|
+
relative_path TEXT NOT NULL,
|
|
1716
|
+
size_bytes INTEGER,
|
|
1717
|
+
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
|
1718
|
+
FOREIGN KEY (run_id) REFERENCES runs(id)
|
|
1719
|
+
);
|
|
1720
|
+
|
|
1721
|
+
-- Skill usage events \u2014 intake fact table for the j-rig ingest-skill verb
|
|
1722
|
+
-- (ISEDC DR-103 D1/D2/D5). The tenant_id column lands in this FIRST CREATE
|
|
1723
|
+
-- TABLE per DR-103 D2 B2.1 (database.ts is CREATE-IF-NOT-EXISTS only \u2014 no ALTER
|
|
1724
|
+
-- path \u2014 so a future column add cannot retrofit cleanly; the multi-tenancy slot
|
|
1725
|
+
-- is reserved now). NULL tenant_id = the single-tenant/global bucket, never
|
|
1726
|
+
-- pooled cross-tenant (D2 B2.2). cass_passed = 0 rows are persisted-but-excluded.
|
|
1727
|
+
CREATE TABLE IF NOT EXISTS skill_usage_events (
|
|
1728
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
1729
|
+
skill_id TEXT NOT NULL,
|
|
1730
|
+
session_id TEXT NOT NULL,
|
|
1731
|
+
source TEXT NOT NULL,
|
|
1732
|
+
cass_score REAL NOT NULL,
|
|
1733
|
+
cass_passed INTEGER NOT NULL,
|
|
1734
|
+
tenant_id TEXT,
|
|
1735
|
+
recorded_at TEXT NOT NULL
|
|
1736
|
+
);
|
|
1737
|
+
|
|
1738
|
+
-- Skill human reviews \u2014 intake fact table for the j-rig review verb.
|
|
1739
|
+
-- governance_class is always 'curated-signal' (NOT a signed human-review/v1
|
|
1740
|
+
-- predicate; DR-103 D3 B3.2 / doc 072 R6). tenant_id reserved here per D2 B2.1.
|
|
1741
|
+
CREATE TABLE IF NOT EXISTS skill_human_reviews (
|
|
1742
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
1743
|
+
skill_id TEXT NOT NULL,
|
|
1744
|
+
thumbs_up INTEGER NOT NULL,
|
|
1745
|
+
rationale TEXT,
|
|
1746
|
+
reviewer TEXT NOT NULL,
|
|
1747
|
+
governance_class TEXT NOT NULL,
|
|
1748
|
+
tenant_id TEXT,
|
|
1749
|
+
recorded_at TEXT NOT NULL
|
|
1750
|
+
);
|
|
1751
|
+
|
|
1752
|
+
CREATE INDEX IF NOT EXISTS idx_runs_skill_version ON runs(skill_version_id);
|
|
1753
|
+
CREATE INDEX IF NOT EXISTS idx_runs_status ON runs(status);
|
|
1754
|
+
CREATE INDEX IF NOT EXISTS idx_criterion_results_run ON criterion_results(run_id);
|
|
1755
|
+
CREATE INDEX IF NOT EXISTS idx_artifacts_run ON artifacts(run_id);
|
|
1756
|
+
CREATE INDEX IF NOT EXISTS idx_skill_usage_skill ON skill_usage_events(skill_id);
|
|
1757
|
+
CREATE INDEX IF NOT EXISTS idx_skill_usage_skill_passed ON skill_usage_events(skill_id, cass_passed);
|
|
1758
|
+
CREATE INDEX IF NOT EXISTS idx_skill_reviews_skill ON skill_human_reviews(skill_id);
|
|
1759
|
+
`;
|
|
1760
|
+
/**
 * Open (or create) the SQLite database at `dbPath` and prepare it for use.
 *
 * Enables WAL journaling and foreign-key enforcement, runs the idempotent
 * CREATE_TABLES DDL, and wraps the connection in a drizzle instance.
 *
 * @param {string} dbPath - Path handed to the better-sqlite3 `Database` constructor.
 * @returns {{ db: object, sqlite: object, close: () => void }} The drizzle
 *   handle, the raw connection, and a function that closes the connection.
 * @throws Rethrows any error from the pragmas, the schema DDL or drizzle
 *   setup, after closing the connection so the file handle is not leaked.
 */
function createDatabase(dbPath) {
  const sqlite = new Database(dbPath);
  try {
    sqlite.pragma("journal_mode = WAL");
    sqlite.pragma("foreign_keys = ON");
    sqlite.exec(CREATE_TABLES);
    const db = drizzle(sqlite, { schema: schema_exports });
    return {
      db,
      sqlite,
      close: () => sqlite.close()
    };
  } catch (err) {
    // Setup failed part-way: release the connection before propagating.
    sqlite.close();
    throw err;
  }
}
|
|
1772
|
+
// Allowed run-status transitions: each key lists the statuses it may move to.
// Statuses with an empty list are terminal.
var TRANSITIONS = {
  pending: ["running", "canceled"],
  running: ["completed", "failed", "timed_out", "canceled"],
  completed: [],
  failed: [],
  timed_out: [],
  canceled: []
};

/**
 * Report whether a run may move from status `from` to status `to`.
 *
 * @param {string} from - Current status.
 * @param {string} to - Requested status.
 * @returns {boolean} True only when `from` is a known status that lists `to`.
 */
function isValidTransition(from, to) {
  // Own-property check: a plain lookup would resolve names such as
  // "constructor" to inherited Object.prototype members and then throw on
  // `.includes` instead of returning false.
  if (!Object.hasOwn(TRANSITIONS, from)) return false;
  return TRANSITIONS[from].includes(to);
}
|
|
1783
|
+
/**
 * Return the id of the skill-version row for the given SKILL.md content,
 * inserting a new row when none exists.
 *
 * The lookup key is the first 16 hex characters of the SHA-256 of the
 * content; `skillName` and `version` are only used when inserting.
 *
 * @param {{ db: object }} database - Object carrying the drizzle handle.
 * @param {string} skillName - Skill name stored on a newly created row.
 * @param {string} version - Version stored on a newly created row.
 * @param {string} skillMdContent - SKILL.md content to hash.
 * @returns {number} The id of the existing or newly inserted row.
 */
function getOrCreateSkillVersion({ db }, skillName, version, skillMdContent) {
  const fullDigest = createHash("sha256").update(skillMdContent).digest("hex");
  const hash = fullDigest.slice(0, 16);
  const match = db.select().from(skillVersions).where(eq(skillVersions.skill_md_hash, hash)).get();
  if (match) {
    return match.id;
  }
  const inserted = db
    .insert(skillVersions)
    .values({ skill_name: skillName, version, skill_md_hash: hash })
    .returning({ id: skillVersions.id })
    .get();
  return inserted.id;
}
|
|
1790
|
+
/**
 * Insert a new run in the "pending" state.
 *
 * @param {{ db: object }} database - Object carrying the drizzle handle.
 * @param {number} skillVersionId - Id of the skill version being evaluated.
 * @param {string} [runType="deterministic"] - Stored as `run_type`.
 * @param {string} [model] - Model name; stored as null when omitted.
 * @returns {number} The id of the inserted run.
 */
function createRun({ db }, skillVersionId, runType = "deterministic", model) {
  const row = {
    skill_version_id: skillVersionId,
    run_type: runType,
    model: model ?? null,
    status: "pending"
  };
  const inserted = db.insert(runs).values(row).returning({ id: runs.id }).get();
  return inserted.id;
}
|
|
1799
|
+
/**
 * Move a run to a new status, stamping timing columns as it goes.
 *
 * Entering "running" sets `started_at`. Entering "completed", "failed",
 * "timed_out" or "canceled" sets `completed_at` and, when the run has a
 * parsable `started_at`, `duration_ms`.
 *
 * @param {{ db: object }} database - Object carrying the drizzle handle.
 * @param {number} runId - Id of the run to update.
 * @param {string} newStatus - Target status.
 * @throws {Error} When the run does not exist or the transition is not allowed.
 */
function transitionRun({ db }, runId, newStatus) {
  const run = db.select().from(runs).where(eq(runs.id, runId)).get();
  if (!run) throw new Error(`Run ${runId} not found`);
  const currentStatus = run.status;
  if (!isValidTransition(currentStatus, newStatus)) {
    throw new Error(`Invalid transition: ${currentStatus} \u2192 ${newStatus}`);
  }

  // One clock reading for the whole transition, so that
  // completed_at - started_at always equals duration_ms.
  const now = new Date();
  const updates = { status: newStatus };
  if (newStatus === "running") {
    updates.started_at = now.toISOString();
  }
  if (["completed", "failed", "timed_out", "canceled"].includes(newStatus)) {
    updates.completed_at = now.toISOString();
    if (run.started_at) {
      const elapsed = now.getTime() - new Date(run.started_at).getTime();
      // An unparsable started_at yields NaN; leave duration_ms unset rather
      // than writing a non-number into an INTEGER column.
      if (Number.isFinite(elapsed)) {
        updates.duration_ms = elapsed;
      }
    }
  }
  db.update(runs).set(updates).where(eq(runs.id, runId)).run();
}
|
|
1818
|
+
/**
 * Persist one `criterion_results` row per entry of `results`.
 *
 * Rows are inserted one at a time, in iteration order; missing `details`
 * and `method` are stored as null.
 *
 * @param {{ db: object }} database - Object carrying the drizzle handle.
 * @param {number} runId - Run the results belong to.
 * @param {Iterable<object>} results - Criterion results to store.
 */
function storeCriterionResults({ db }, runId, results) {
  for (const { criterion_id, passed, severity, message, details, method } of results) {
    const row = {
      run_id: runId,
      criterion_id,
      passed,
      severity,
      message,
      details: details ?? null,
      method: method ?? null
    };
    db.insert(criterionResults).values(row).run();
  }
}
|
|
1831
|
+
/**
 * Persist the summary row for a run, adding a derived `score`.
 *
 * `score` is `passed / total`, or 0 when `total` is not positive.
 *
 * @param {{ db: object }} database - Object carrying the drizzle handle.
 * @param {number} runId - Run the summary belongs to.
 * @param {object} summary - Summary fields; spread into the inserted row.
 */
function storeRunSummary({ db }, runId, summary) {
  const hasItems = summary.total > 0;
  const score = hasItems ? summary.passed / summary.total : 0;
  const row = { run_id: runId, ...summary, score };
  db.insert(runSummaries).values(row).run();
}
|
|
1835
|
+
/**
 * Load a run together with its summary.
 *
 * @param {{ db: object }} database - Object carrying the drizzle handle.
 * @param {number} runId - Id of the run to load.
 * @returns {object|null} The run row with a `summary` property (null when no
 *   summary row exists), or null when the run itself does not exist.
 */
function getRun({ db }, runId) {
  const runRow = db.select().from(runs).where(eq(runs.id, runId)).get();
  if (!runRow) {
    return null;
  }
  const summaryRow = db.select().from(runSummaries).where(eq(runSummaries.run_id, runId)).get();
  return { ...runRow, summary: summaryRow ?? null };
}
|
|
1841
|
+
/**
 * List the most recent runs (highest id first), joined with their skill version.
 *
 * @param {{ db: object }} database - Object carrying the drizzle handle.
 * @param {{ limit?: number, skillName?: string }} [options] - `limit`
 *   defaults to 10; a truthy `skillName` restricts the result to that skill.
 * @returns {object[]} The joined rows.
 */
function getRecentRuns(database, options = {}) {
  const { db } = database;
  const limit = options.limit ?? 10;
  const joined = db
    .select()
    .from(runs)
    .innerJoin(skillVersions, eq(runs.skill_version_id, skillVersions.id));
  // The skill-name filter is the only difference between the two query shapes.
  const scoped = options.skillName
    ? joined.where(eq(skillVersions.skill_name, options.skillName))
    : joined;
  return scoped.orderBy(desc(runs.id)).limit(limit).all();
}
|
|
1849
|
+
/**
 * Fetch every criterion result stored for a run.
 *
 * @param {{ db: object }} database - Object carrying the drizzle handle.
 * @param {number} runId - Run whose results are wanted.
 * @returns {object[]} The matching `criterion_results` rows.
 */
function getRunResults({ db }, runId) {
  const belongsToRun = eq(criterionResults.run_id, runId);
  return db.select().from(criterionResults).where(belongsToRun).all();
}
|
|
1852
|
+
/**
 * Fetch every artifact recorded for a run.
 *
 * @param {{ db: object }} database - Object carrying the drizzle handle.
 * @param {number} runId - Run whose artifacts are wanted.
 * @returns {object[]} The matching `artifacts` rows.
 */
function getRunArtifacts({ db }, runId) {
  const belongsToRun = eq(artifacts.run_id, runId);
  return db.select().from(artifacts).where(belongsToRun).all();
}
|
|
1855
|
+
// Minimum CASS score for a usage event to count as passed.
var CASS_PASS_THRESHOLD = 0.3;

// Contribution of each boolean session signal to the CASS score.
// Positive signals add, negative signals subtract.
var CASS_WEIGHTS = {
  testsPassed: 0.25,
  clearResolution: 0.25,
  codeChanges: 0.15,
  userConfirmed: 0.15,
  backtracking: -0.1,
  abandoned: -0.2
};

/**
 * Compute the CASS score for a set of session signals.
 *
 * @param {object} inputs - Flags keyed like CASS_WEIGHTS; truthy flags count.
 * @returns {{ score: number, passed: boolean }} The summed weights and
 *   whether the sum reaches CASS_PASS_THRESHOLD.
 */
function scoreCass(inputs) {
  let total = 0;
  // Weights are added in declaration order, so the floating-point sum is stable.
  for (const [signal, weight] of Object.entries(CASS_WEIGHTS)) {
    if (inputs[signal]) {
      total += weight;
    }
  }
  return { score: total, passed: total >= CASS_PASS_THRESHOLD };
}
|
|
1874
|
+
/**
 * Score a skill usage event and persist it.
 *
 * @param {{ db: object }} database - Object carrying the drizzle handle.
 * @param {object} input - `skillId`, `sessionId`, `source`, `cass` (signals
 *   passed to scoreCass), optional `tenantId`, and `recordedAt`.
 * @returns {object} The stored row converted by toUsageRecord.
 */
function recordSkillUsage({ db }, input) {
  const { score, passed } = scoreCass(input.cass);
  const values = {
    skill_id: input.skillId,
    session_id: input.sessionId,
    source: input.source,
    cass_score: score,
    cass_passed: passed,
    tenant_id: input.tenantId ?? null,
    recorded_at: input.recordedAt
  };
  const stored = db.insert(skillUsageEvents).values(values).returning().get();
  return toUsageRecord(stored);
}
|
|
1887
|
+
/**
 * Persist a human review of a skill.
 *
 * `governance_class` is always written as "curated-signal".
 *
 * @param {{ db: object }} database - Object carrying the drizzle handle.
 * @param {object} input - `skillId`, `thumbsUp`, optional `rationale`,
 *   `reviewer`, optional `tenantId`, and `recordedAt`.
 * @returns {object} The stored row converted by toReviewRecord.
 */
function recordSkillReview({ db }, input) {
  const values = {
    skill_id: input.skillId,
    thumbs_up: input.thumbsUp,
    rationale: input.rationale ?? null,
    reviewer: input.reviewer,
    governance_class: "curated-signal",
    tenant_id: input.tenantId ?? null,
    recorded_at: input.recordedAt
  };
  const stored = db.insert(skillHumanReviews).values(values).returning().get();
  return toReviewRecord(stored);
}
|
|
1899
|
+
/**
 * Convert a `skill_usage_events` row (snake_case) to its camelCase record.
 *
 * @param {object} row - Database row.
 * @returns {object} The record; a nullish `tenant_id` becomes null.
 */
function toUsageRecord(row) {
  const {
    id,
    skill_id: skillId,
    session_id: sessionId,
    source,
    cass_score: cassScore,
    cass_passed: cassPassed,
    tenant_id: tenantId,
    recorded_at: recordedAt
  } = row;
  return {
    id,
    skillId,
    sessionId,
    source,
    cassScore,
    cassPassed,
    tenantId: tenantId ?? null,
    recordedAt
  };
}
|
|
1911
|
+
/**
 * Convert a `skill_human_reviews` row (snake_case) to its camelCase record.
 *
 * `governanceClass` is always reported as "curated-signal" regardless of the
 * stored column value.
 *
 * @param {object} row - Database row.
 * @returns {object} The record; nullish `rationale` and `tenant_id` become null.
 */
function toReviewRecord(row) {
  const {
    id,
    skill_id: skillId,
    thumbs_up: thumbsUp,
    rationale,
    reviewer,
    tenant_id: tenantId,
    recorded_at: recordedAt
  } = row;
  return {
    id,
    skillId,
    thumbsUp,
    rationale: rationale ?? null,
    reviewer,
    governanceClass: "curated-signal",
    tenantId: tenantId ?? null,
    recordedAt
  };
}
|
|
1923
|
+
|
|
1924
|
+
// src/lib/db.ts
|
|
1925
|
+
// Database file used when the caller does not supply a path.
var DEFAULT_DB_PATH = "j-rig.db";

/**
 * Open the j-rig database.
 *
 * @param {string} [path] - Database file; a nullish value selects DEFAULT_DB_PATH.
 * @returns {object} Whatever createDatabase returns for the chosen path.
 */
function openDb(path) {
  const target = path ?? DEFAULT_DB_PATH;
  return createDatabase(target);
}
|
|
1929
|
+
|
|
1930
|
+
// src/lib/loaders.ts
|
|
1931
|
+
import { readFileSync as readFileSync3, existsSync as existsSync2 } from "fs";
|
|
1932
|
+
import { join as join2, resolve as resolve3 } from "path";
|
|
1933
|
+
/**
 * Load and validate an eval spec YAML file.
 *
 * An explicit `specPath` wins. Otherwise `skillDir` is searched for
 * `eval-spec.yaml`, then `eval-spec.yml`.
 *
 * @param {string} [specPath] - Path to the spec file.
 * @param {string} [skillDir] - Skill directory to search when no path is given.
 * @returns {object} The validated spec data.
 * @throws {Error} When neither argument is given, no candidate file exists,
 *   or the file fails schema validation.
 */
function loadEvalSpec(specPath, skillDir) {
  if (!specPath && !skillDir) {
    throw new Error("Either specPath or skillDir must be provided");
  }

  let filePath;
  if (specPath) {
    filePath = resolve3(specPath);
  } else {
    const candidates = ["eval-spec.yaml", "eval-spec.yml"];
    let found;
    for (const fileName of candidates) {
      const candidatePath = join2(resolve3(skillDir), fileName);
      if (existsSync2(candidatePath)) {
        found = candidatePath;
        break;
      }
    }
    if (!found) {
      throw new Error(
        `No eval spec found. Tried: ${candidates.join(", ")} in ${skillDir}. Use --spec to provide a path.`
      );
    }
    filePath = found;
  }

  const content = readFileSync3(filePath, "utf-8");
  const result = parseAndValidateYaml(content, EvalSpecSchema);
  if (result.success) {
    return result.data;
  }
  const msgs = result.errors.map((issue) => ` ${issue.path}: ${issue.message}`).join("\n");
  throw new Error(`Invalid eval spec:
${msgs}`);
}
|
|
1958
|
+
/**
 * Read and parse `SKILL.md` from a skill directory.
 *
 * @param {string} skillDir - Directory expected to contain SKILL.md.
 * @param {boolean} [enterprise=false] - Use parseSkillMdEnterprise instead of parseSkillMd.
 * @returns {{ parsed: object, raw: string }} The parsed data and the raw file text.
 * @throws {Error} When SKILL.md is missing or the parser reports errors.
 */
function loadSkillMd(skillDir, enterprise = false) {
  const skillPath = join2(resolve3(skillDir), "SKILL.md");
  if (!existsSync2(skillPath)) {
    throw new Error(`SKILL.md not found at: ${skillPath}`);
  }
  const raw = readFileSync3(skillPath, "utf-8");
  const result = enterprise ? parseSkillMdEnterprise(raw) : parseSkillMd(raw);
  if (result.success) {
    return { parsed: result.data, raw };
  }
  const msgs = result.errors.map((issue) => ` ${issue.path}: ${issue.message}`).join("\n");
  throw new Error(`SKILL.md parse error:
${msgs}`);
}
|
|
1974
|
+
|
|
1975
|
+
// src/providers/anthropic.ts
|
|
1976
|
+
var stubBannerEmitted = false;
|
|
1977
|
+
function emitStubBanner() {
|
|
1978
|
+
if (stubBannerEmitted) return;
|
|
1979
|
+
stubBannerEmitted = true;
|
|
1980
|
+
process.stderr.write(
|
|
1981
|
+
[
|
|
1982
|
+
"",
|
|
1983
|
+
"\u2554\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2557",
|
|
1984
|
+
"\u2551 WARNING \u2014 j-rig STUB PROVIDER MODE \u2551",
|
|
1985
|
+
"\u2551 \u2551",
|
|
1986
|
+
"\u2551 This run is using STUB providers. Output is NOT ground truth. \u2551",
|
|
1987
|
+
"\u2551 Stub Trigger: always selects the first available skill \u2551",
|
|
1988
|
+
"\u2551 Stub Execution: returns a synthetic response with zero latency \u2551",
|
|
1989
|
+
"\u2551 Stub Judgment: always returns 'yes' with confidence 0.7 \u2551",
|
|
1990
|
+
"\u2551 \u2551",
|
|
1991
|
+
"\u2551 These outputs are placeholder values for pipeline plumbing only. \u2551",
|
|
1992
|
+
"\u2551 Do NOT treat any metric, decision, or rollout verdict from this run as \u2551",
|
|
1993
|
+
"\u2551 evidence of skill quality. CI gates that consume j-rig output MUST \u2551",
|
|
1994
|
+
"\u2551 refuse rows produced under stub mode. \u2551",
|
|
1995
|
+
"\u2551 \u2551",
|
|
1996
|
+
"\u2551 To run against a real provider: implement the Anthropic SDK adapter \u2551",
|
|
1997
|
+
"\u2551 (see STUB-PROVIDERS.md). To acknowledge stub mode: set the env var \u2551",
|
|
1998
|
+
"\u2551 J_RIG_ALLOW_STUB=1 before invocation. \u2551",
|
|
1999
|
+
"\u255A\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u255D",
|
|
2000
|
+
""
|
|
2001
|
+
].join("\n")
|
|
2002
|
+
);
|
|
2003
|
+
}
|
|
2004
|
+
/**
 * Guard for stub providers: refuse to continue unless stub mode was
 * explicitly acknowledged.
 *
 * @throws {Error} Unless the environment variable J_RIG_ALLOW_STUB is exactly "1".
 */
function assertStubAllowed() {
  const acknowledged = process.env.J_RIG_ALLOW_STUB === "1";
  if (!acknowledged) {
    const refusal = [
      "REFUSED: j-rig cannot run without a real provider implementation.",
      "",
      "The Anthropic SDK adapter is not yet wired (iaj-stub-provider, PB-7). To opt",
      "into stub mode for pipeline-plumbing development, set:",
      "",
      " J_RIG_ALLOW_STUB=1",
      "",
      "BEFORE running the CLI. Stub-mode results are NOT ground truth \u2014 see",
      "STUB-PROVIDERS.md for the full discipline."
    ];
    throw new Error(refusal.join("\n"));
  }
}
|
|
2020
|
+
/**
 * Stub trigger provider: never calls a model and always picks the first
 * available skill. Construction throws unless stub mode is allowed and
 * emits the stub banner.
 */
var StubTriggerProvider = class {
  /**
   * @param {string} model - Model name, used only in the reasoning text.
   */
  constructor(model) {
    this.model = model;
    assertStubAllowed();
    emitStubBanner();
  }

  /**
   * @param {string} prompt - User prompt; its first 50 characters are echoed.
   * @param {Array<{ name: string }>} availableSkills - Candidate skills.
   * @returns {Promise<{ selected: string|null, reasoning: string }>} The
   *   first skill's name (null when unavailable) and a "[stub]" explanation.
   */
  async selectSkill(prompt, availableSkills) {
    const selected = availableSkills[0]?.name ?? null;
    const skillNames = availableSkills.map((skill) => skill.name).join(", ");
    const promptPreview = prompt.slice(0, 50);
    const reasoning = `[stub] Would call ${this.model} to select from [${skillNames}] for: "${promptPreview}..."`;
    return { selected, reasoning };
  }
};
|
|
2034
|
+
/**
 * Stub execution provider: returns a synthetic response instead of calling
 * a model. Construction throws unless stub mode is allowed and emits the
 * stub banner.
 */
var StubExecutionProvider = class {
  /**
   * @param {string} model - Default model name, used only in the response text.
   */
  constructor(model) {
    this.model = model;
    assertStubAllowed();
    emitStubBanner();
  }

  /**
   * @param {string} prompt - User prompt; its first 50 characters are echoed.
   * @param {{ skill_body: string }} context - Only the skill body length is reported.
   * @param {{ model?: string }} [options] - `model` overrides the default model.
   * @returns {Promise<object>} A "[stub]" text, no artifacts or tool calls,
   *   and zero-duration timing metadata.
   */
  async execute(prompt, context, options) {
    const timestamp = (/* @__PURE__ */ new Date()).toISOString();
    const effectiveModel = options?.model ?? this.model;
    const bodyLength = context.skill_body.length;
    const promptPreview = prompt.slice(0, 50);
    // The stub does no work, so start and completion share one timestamp.
    const meta = {
      started_at: timestamp,
      completed_at: timestamp,
      duration_ms: 0,
      timed_out: false
    };
    return {
      text: `[stub] Would call ${effectiveModel} with skill body (${bodyLength} chars) and prompt: "${promptPreview}..."`,
      artifacts: [],
      tool_calls: 0,
      meta
    };
  }
};
|
|
2056
|
+
/**
 * Stub judge provider: always answers "yes" with confidence 0.7.
 * Construction throws unless stub mode is allowed and emits the stub banner.
 */
var StubJudgeProvider = class {
  /**
   * @param {string} model - Model name, used only in the reasoning text.
   */
  constructor(model) {
    this.model = model;
    assertStubAllowed();
    emitStubBanner();
  }

  /**
   * @param {string} criterion_description - Criterion text; its first 60 characters are echoed.
   * @param {string} prompt - Ignored by the stub.
   * @param {string} output - Ignored by the stub.
   * @param {string} [judge_prompt] - Ignored by the stub.
   * @returns {Promise<{ verdict: string, confidence: number, reasoning: string }>}
   */
  async judge(criterion_description, prompt, output, judge_prompt) {
    const criterionPreview = criterion_description.slice(0, 60);
    const reasoning = `[stub] Would call ${this.model} to judge: "${criterionPreview}...". Defaulting to yes.`;
    return { verdict: "yes", confidence: 0.7, reasoning };
  }
};
|
|
2073
|
+
|
|
2074
|
+
// src/providers/transport.ts
|
|
2075
|
+
/**
 * Build a transport function backed by the global `fetch`.
 *
 * The returned function sends `req.body` JSON-encoded and resolves to
 * `{ status, json }`, where `json` is the parsed response body or null when
 * the body cannot be parsed as JSON. Errors thrown by `fetch` itself propagate.
 *
 * @returns {(req: object) => Promise<{ status: number, json: any }>}
 */
function createFetchTransport() {
  // Best-effort body parse: a non-JSON body is reported as null, not as an error.
  const readJson = async (response) => {
    try {
      return await response.json();
    } catch {
      return null;
    }
  };
  return async (req) => {
    const init = {
      method: req.method,
      headers: req.headers,
      body: JSON.stringify(req.body),
      signal: req.signal
    };
    const res = await fetch(req.url, init);
    const json = await readJson(res);
    return { status: res.status, json };
  };
}
|
|
2092
|
+
|
|
2093
|
+
// src/providers/anthropic-real.ts
|
|
2094
|
+
// Identity of this provider adapter.
var ADAPTER_NAME = "anthropic";
var ADAPTER_VERSION = "1.0.0";
// Default Messages endpoint and API version string.
var DEFAULT_BASE_URL = "https://api.anthropic.com/v1/messages";
var ANTHROPIC_VERSION = "2023-06-01";
var DEFAULT_MAX_TOKENS = 1024;
// Short model aliases and the model ids they expand to.
var MODEL_ALIASES = {
  haiku: "claude-haiku-4-5",
  sonnet: "claude-sonnet-4-5",
  opus: "claude-opus-4-1"
};

/**
 * Resolve a model name or alias to the model id to send.
 *
 * Names starting with "claude-" are returned unchanged. Otherwise a leading
 * "anthropic/" is removed and the remainder is expanded through
 * MODEL_ALIASES; names that are not aliases are returned as-is.
 *
 * @param {string} model - Model id, alias, or "anthropic/"-prefixed form.
 * @returns {string} The resolved model id.
 */
function resolveAnthropicModel(model) {
  if (model.startsWith("claude-")) return model;
  const providerPrefix = "anthropic/";
  const stripped = model.startsWith(providerPrefix) ? model.slice(providerPrefix.length) : model;
  // Own-property check: a plain lookup would resolve names such as
  // "constructor" to inherited Object.prototype members and return a
  // function instead of a model id.
  return Object.hasOwn(MODEL_ALIASES, stripped) ? MODEL_ALIASES[stripped] : stripped;
}
|
|
2109
|
+
/**
 * Translate an Anthropic stop reason into the adapter's finish reason.
 *
 * @param {*} raw - Stop reason from the API response.
 * @returns {string} "stop", "length", "tool_use" or "refusal"; anything
 *   unrecognized (including a missing value) maps to "stop".
 */
function mapFinishReason(raw) {
  const finishReasons = new Map([
    ["end_turn", "stop"],
    ["stop_sequence", "stop"],
    ["max_tokens", "length"],
    ["tool_use", "tool_use"],
    ["refusal", "refusal"]
  ]);
  return finishReasons.get(raw) ?? "stop";
}
|
|
2124
|
+
/**
 * Normalize an Anthropic `usage` object to camelCase token counts.
 *
 * @param {object|null|undefined} raw - Usage object from the API response.
 * @returns {{ inputTokens: number, outputTokens: number, cachedInputTokens?: number }}
 *   Non-numeric input/output counts become 0; `cachedInputTokens` is present
 *   only when `cache_read_input_tokens` is a number.
 */
function mapUsage(raw) {
  const source = raw ?? {};
  const countOrZero = (value) => (typeof value === "number" ? value : 0);
  const usage = {
    inputTokens: countOrZero(source.input_tokens),
    outputTokens: countOrZero(source.output_tokens)
  };
  const cachedReads = source.cache_read_input_tokens;
  if (typeof cachedReads === "number") {
    usage.cachedInputTokens = cachedReads;
  }
  return usage;
}
|
|
2135
|
+
/**
 * Convert generic chat messages into the `system` + `messages` shape.
 *
 * System messages are removed from the list and joined with blank lines
 * into `system`. Tool messages become user messages carrying a single
 * `tool_result` block. All other messages pass through as `{ role, content }`.
 *
 * @param {Iterable<{ role: string, content: any, toolCallId?: string }>} messages
 * @returns {{ system: string|undefined, messages: object[] }} `system` is
 *   undefined when there were no system messages.
 */
function toAnthropicPayload(messages) {
  const systemParts = [];
  const out = [];
  for (const message of messages) {
    switch (message.role) {
      case "system":
        systemParts.push(message.content);
        break;
      case "tool":
        out.push({
          role: "user",
          content: [
            {
              type: "tool_result",
              // A missing tool-call id is sent as an empty string.
              tool_use_id: message.toolCallId ?? "",
              content: message.content
            }
          ]
        });
        break;
      default:
        out.push({ role: message.role, content: message.content });
    }
  }
  const system = systemParts.length > 0 ? systemParts.join("\n\n") : void 0;
  return { system, messages: out };
}
|
|
2163
|
+
/**
 * Build a ProviderError for a non-success HTTP status.
 *
 * Category mapping: 401/403 -> "authentication", 404 -> "model_not_found",
 * 429 -> "rate_limit", 408/504/529 -> "network_timeout", anything else ->
 * "unknown". The message is `body.error.message` when present, otherwise a
 * generic "HTTP <status>" text.
 *
 * @param {number} status - HTTP status code.
 * @param {object|null|undefined} body - Parsed response body, if any.
 * @returns {ProviderError}
 */
function errorForStatus(status, body) {
  const message = body?.error?.message ?? `Anthropic API returned HTTP ${status}`;
  let category = "unknown";
  if (status === 401 || status === 403) {
    category = "authentication";
  } else if (status === 404) {
    category = "model_not_found";
  } else if (status === 429) {
    category = "rate_limit";
  } else if (status === 408 || status === 504 || status === 529) {
    category = "network_timeout";
  }
  return new ProviderError({ category, providerName: ADAPTER_NAME, message });
}
|
|
2179
|
+
/**
 * Wrap anything thrown while talking to the API in a ProviderError.
 *
 * @param {unknown} err - The thrown value.
 * @returns {ProviderError} `err` itself when it already is a ProviderError;
 *   otherwise a new error categorized as "network_timeout" (abort/timeout
 *   errors) or "unknown", with the thrown value kept as `originalError`.
 */
function errorForThrow(err) {
  if (err instanceof ProviderError) return err;
  const isError = err instanceof Error;
  // Either the error name or its message text can identify an abort/timeout.
  const looksLikeTimeout = isError && (err.name === "AbortError" || /abort|timeout/i.test(err.message));
  return new ProviderError({
    category: looksLikeTimeout ? "network_timeout" : "unknown",
    providerName: ADAPTER_NAME,
    message: isError ? err.message : String(err),
    originalError: err
  });
}
|
|
2196
|
+
/**
 * Concatenate the text of every `text` content block in a response body.
 *
 * @param {object|null|undefined} json - Parsed response body.
 * @returns {string} Joined text (no separator); "" when `json.content` is not an array.
 */
function extractText(json) {
  const blocks = json?.content;
  if (!Array.isArray(blocks)) return "";
  let combined = "";
  for (const block of blocks) {
    // Skip anything that is not a non-null object before reading its fields.
    if (typeof block !== "object" || block === null) continue;
    if (block.type === "text" && typeof block.text === "string") {
      combined += block.text;
    }
  }
  return combined;
}
|
|
2201
|
+
/**
 * Provider adapter that sends requests to the Anthropic API through an
 * injectable transport function.
 *
 * All failures surface as ProviderError: transport exceptions are wrapped by
 * `errorForThrow`, non-2xx responses by `errorForStatus`.
 */
var RealAnthropicProvider = class {
  name = ADAPTER_NAME;
  version = ADAPTER_VERSION;
  #apiKey;
  #baseUrl;
  #transport;
  /**
   * @param {object} opts
   * @param {string} opts.apiKey - API key sent in the `x-api-key` header.
   * @param {string} [opts.baseUrl] - Request URL; defaults to DEFAULT_BASE_URL.
   * @param {Function} [opts.transport] - Function performing the HTTP call;
   *   defaults to `createFetchTransport()`.
   */
  constructor(opts) {
    this.#apiKey = opts.apiKey;
    this.#baseUrl = opts.baseUrl ?? DEFAULT_BASE_URL;
    this.#transport = opts.transport ?? createFetchTransport();
  }
  /** Headers attached to every request. */
  #headers() {
    return {
      "content-type": "application/json",
      "x-api-key": this.#apiKey,
      "anthropic-version": ANTHROPIC_VERSION
    };
  }
  /**
   * Run a single completion.
   *
   * @param {object} req - Request with `model`, `messages` and optional
   *   `maxTokens`, `temperature`, `stop`, `signal`, `responseSchema`.
   * @returns {Promise<object>} `{ text, model, usage, finishReason }`, plus
   *   `structuredOutput` (the text parsed as JSON) when `responseSchema` is set.
   * @throws {ProviderError} On a bad key, transport failure, non-2xx status,
   *   or (with `responseSchema`) output that is not valid JSON.
   */
  async complete(req) {
    this.#assertKey();
    const { system, messages } = toAnthropicPayload(req.messages);
    const body = {
      model: resolveAnthropicModel(req.model),
      max_tokens: req.maxTokens ?? DEFAULT_MAX_TOKENS,
      messages,
      ...system !== void 0 ? { system } : {},
      ...req.temperature !== void 0 ? { temperature: req.temperature } : {},
      ...req.stop !== void 0 ? { stop_sequences: req.stop } : {}
    };
    const res = await this.#send(body, req.signal);
    if (res.status < 200 || res.status >= 300) {
      throw errorForStatus(res.status, res.json);
    }
    // A 2xx reply without a parsed body must not crash on `res.json.usage`;
    // callTool() applies the same guard.
    const json = res.json ?? {};
    const text2 = extractText(json);
    const result = {
      text: text2,
      model: req.model,
      usage: mapUsage(json.usage),
      finishReason: mapFinishReason(json.stop_reason)
    };
    if (req.responseSchema !== void 0) {
      result.structuredOutput = this.#parseStructured(text2);
    }
    return result;
  }
  /**
   * Streaming-shaped wrapper around `complete()`: the full completion is
   * awaited first, then emitted as at most one `text_delta` event followed by
   * a `finish` event.
   *
   * @param {object} req - Same request shape as `complete()`.
   */
  async *completeStream(req) {
    const completion = await this.complete(req);
    if (completion.text.length > 0) {
      yield { type: "text_delta", delta: completion.text };
    }
    yield { type: "finish", finishReason: completion.finishReason, usage: completion.usage };
  }
  /**
   * Run a completion with tools offered to the model and report the first
   * `tool_use` block of the response, if any.
   *
   * @param {object} req - Request with `model`, `messages`, `tools`
   *   (`name`, `description`, `inputSchema`) and optional `maxTokens`, `signal`.
   * @returns {Promise<object>} `{ toolName, toolArguments, toolCallId, text,
   *   finishReason, usage }`; the three tool fields are `null` when the
   *   response contains no `tool_use` block.
   * @throws {ProviderError} On a bad key, transport failure or non-2xx status.
   */
  async callTool(req) {
    this.#assertKey();
    const { system, messages } = toAnthropicPayload(req.messages);
    const body = {
      model: resolveAnthropicModel(req.model),
      max_tokens: req.maxTokens ?? DEFAULT_MAX_TOKENS,
      messages,
      ...system !== void 0 ? { system } : {},
      tools: req.tools.map((t) => ({
        name: t.name,
        description: t.description,
        input_schema: t.inputSchema
      }))
    };
    const res = await this.#send(body, req.signal);
    if (res.status < 200 || res.status >= 300) {
      throw errorForStatus(res.status, res.json);
    }
    const json = res.json ?? {};
    const text2 = extractText(json);
    const usage = mapUsage(json.usage);
    const blocks = Array.isArray(json.content) ? json.content : [];
    const toolUse = blocks.find(
      (b) => typeof b === "object" && b !== null && b.type === "tool_use"
    );
    if (!toolUse) {
      return {
        toolName: null,
        toolArguments: null,
        toolCallId: null,
        text: text2,
        finishReason: mapFinishReason(json.stop_reason),
        usage
      };
    }
    return {
      toolName: typeof toolUse.name === "string" ? toolUse.name : null,
      toolArguments: typeof toolUse.input === "object" && toolUse.input !== null ? toolUse.input : null,
      toolCallId: typeof toolUse.id === "string" ? toolUse.id : null,
      text: text2,
      // A tool_use block was found, so report tool_use regardless of stop_reason.
      finishReason: "tool_use",
      usage
    };
  }
  /**
   * Run several completions concurrently. A failed request does not reject the
   * batch: its slot holds the ProviderError instead of a result.
   *
   * @param {Array<object>} reqs - Requests in `complete()` shape.
   * @returns {Promise<Array<object|ProviderError>>} One entry per request, in order.
   */
  async batch(reqs) {
    return Promise.all(
      reqs.map(
        (r) => this.complete(r).catch(
          (err) => err instanceof ProviderError ? err : errorForThrow(err)
        )
      )
    );
  }
  // --- internals ---------------------------------------------------------
  /**
   * Reject obviously unusable keys before any network traffic.
   * A missing or non-string key is reported as an authentication error too,
   * instead of crashing on `.length`.
   */
  #assertKey() {
    if (typeof this.#apiKey !== "string" || this.#apiKey.length < 8) {
      throw new ProviderError({
        category: "authentication",
        providerName: this.name,
        message: "apiKey missing or too short"
      });
    }
  }
  /** POST `body` to the configured URL, wrapping transport exceptions in ProviderError. */
  async #send(body, signal) {
    try {
      return await this.#transport({
        url: this.#baseUrl,
        method: "POST",
        headers: this.#headers(),
        body,
        signal
      });
    } catch (err) {
      throw errorForThrow(err);
    }
  }
  /** Parse model output as JSON, reporting invalid JSON as a schema violation. */
  #parseStructured(text2) {
    try {
      return JSON.parse(text2);
    } catch {
      throw new ProviderError({
        category: "schema_violation",
        providerName: this.name,
        message: "responseSchema requested but model output was not valid JSON"
      });
    }
  }
};
|
|
2341
|
+
/**
 * Trigger provider: asks the model which skill (if any) should handle a prompt.
 */
var AnthropicTriggerProvider = class {
  #provider;
  #model;
  /**
   * @param {string} model - Model id used for every routing request.
   * @param {object} provider - Object exposing `complete(req)`.
   */
  constructor(model, provider) {
    this.#provider = provider;
    this.#model = model;
  }
  /**
   * Ask the model to choose one skill from the roster.
   *
   * @param {string} prompt - The user prompt to route.
   * @param {Array<{name: string, description: string}>} availableSkills
   * @returns {Promise<{selected: string|null, reasoning: string}>}
   *   `selected` is null unless the reply's `selected` is a non-empty string
   *   other than "null"; `reasoning` falls back to the first 200 characters of
   *   the raw reply when the reply carries no string `reasoning`.
   */
  async selectSkill(prompt, availableSkills) {
    const systemPrompt = 'You are a skill router. Given a user prompt and a roster of available skills, decide which single skill (if any) should handle the prompt. Respond ONLY with a JSON object {"selected": "<skill-name-or-null>", "reasoning": "<one sentence>"}. Use null for selected when no skill fits.';
    const roster = availableSkills.map((s) => `- ${s.name}: ${s.description}`).join("\n");
    const userPrompt = ["Available skills:", roster, "", `User prompt: "${prompt}"`].join("\n");
    const { text } = await this.#provider.complete({
      model: this.#model,
      messages: [
        { role: "system", content: systemPrompt },
        { role: "user", content: userPrompt }
      ],
      maxTokens: 256,
      temperature: 0
    });
    const reply = parseJsonObject(text);
    const candidate = reply?.selected;
    let selected = null;
    // Models sometimes answer with the literal string "null" instead of JSON null.
    if (typeof candidate === "string" && candidate.length > 0 && candidate !== "null") {
      selected = candidate;
    }
    const reasoning = typeof reply?.reasoning === "string" ? reply.reasoning : text.slice(0, 200);
    return { selected, reasoning };
  }
};
|
|
2371
|
+
/**
 * Execution provider: runs a prompt with the skill body as the system message.
 */
var AnthropicExecutionProvider = class {
  #provider;
  #model;
  /**
   * @param {string} model - Default model id.
   * @param {object} provider - Object exposing `complete(req)`.
   */
  constructor(model, provider) {
    this.#provider = provider;
    this.#model = model;
  }
  /**
   * Execute `prompt` against the skill in `context`.
   *
   * @param {string} prompt - User prompt.
   * @param {{skill_body: string}} context - Supplies the system message.
   * @param {{model?: string, timeout_ms?: number}} [options] - `model` overrides
   *   the default; a truthy `timeout_ms` aborts the request after that many ms.
   * @returns {Promise<{text: string, artifacts: Array, tool_calls: number, meta: object}>}
   *   `meta` holds ISO start/end timestamps, `duration_ms` and `timed_out: false`.
   */
  async execute(prompt, context, options) {
    const started = /* @__PURE__ */ new Date();
    const request = {
      model: options?.model ?? this.#model,
      messages: [
        { role: "system", content: context.skill_body },
        { role: "user", content: prompt }
      ],
      maxTokens: 1024
    };
    let timer;
    if (options?.timeout_ms) {
      const controller = new AbortController();
      timer = setTimeout(() => controller.abort(), options.timeout_ms);
      request.signal = controller.signal;
    }
    try {
      const { text } = await this.#provider.complete(request);
      const completed = /* @__PURE__ */ new Date();
      return {
        text,
        artifacts: [],
        tool_calls: 0,
        meta: {
          started_at: started.toISOString(),
          completed_at: completed.toISOString(),
          duration_ms: completed.getTime() - started.getTime(),
          timed_out: false
        }
      };
    } finally {
      // Always cancel the pending abort so the timer cannot outlive the call.
      if (timer) clearTimeout(timer);
    }
  }
};
|
|
2406
|
+
/**
 * Judge provider: asks the model for a yes/no/unsure verdict on an output.
 */
var AnthropicJudgeProvider = class {
  #provider;
  #model;
  /**
   * @param {string} model - Model id used for every judging request.
   * @param {object} provider - Object exposing `complete(req)`.
   */
  constructor(model, provider) {
    this.#provider = provider;
    this.#model = model;
  }
  /**
   * Judge whether `output` satisfies a criterion for `prompt`.
   *
   * @param {string} criterion_description - The criterion being checked.
   * @param {string} prompt - The prompt that produced `output`.
   * @param {string} output - The text under evaluation.
   * @param {string} [judge_prompt] - Custom question; a default is derived
   *   from the criterion when omitted.
   * @returns {Promise<{verdict: "yes"|"no"|"unsure", confidence: number, reasoning: string}>}
   *   Anything other than yes/no becomes "unsure"; `confidence` is clamped to
   *   [0, 1] and defaults to 0.5 when the reply has no numeric confidence.
   */
  async judge(criterion_description, prompt, output, judge_prompt) {
    const systemPrompt = 'You are a strict binary evaluator. Decide whether the OUTPUT satisfies the CRITERION for the given PROMPT. Respond ONLY with a JSON object {"verdict": "yes"|"no"|"unsure", "confidence": <0..1>, "reasoning": "<one sentence>"}.';
    const question = judge_prompt ?? `Does the output satisfy: ${criterion_description}?`;
    const userPrompt = [
      `CRITERION: ${criterion_description}`,
      "",
      `QUESTION: ${question}`,
      "",
      `PROMPT: ${prompt}`,
      "",
      "OUTPUT:",
      `${output}`
    ].join("\n");
    const { text } = await this.#provider.complete({
      model: this.#model,
      messages: [
        { role: "system", content: systemPrompt },
        { role: "user", content: userPrompt }
      ],
      maxTokens: 256,
      temperature: 0
    });
    const reply = parseJsonObject(text);
    const rawVerdict = typeof reply?.verdict === "string" ? reply.verdict.toLowerCase() : "";
    const verdict = rawVerdict === "yes" || rawVerdict === "no" ? rawVerdict : "unsure";
    let confidence = 0.5;
    if (typeof reply?.confidence === "number") {
      confidence = Math.max(0, Math.min(1, reply.confidence));
    }
    const reasoning = typeof reply?.reasoning === "string" ? reply.reasoning : text.slice(0, 200);
    return { verdict, confidence, reasoning };
  }
};
|
|
2441
|
+
/**
 * Best-effort extraction of a JSON object from model output.
 *
 * If the text contains a ``` fenced block (optionally tagged `json`), only the
 * first fence's content is searched. The candidate is the span from the first
 * "{" to the last "}".
 *
 * @param {string} text2 - Raw model output.
 * @returns {object|null} The parsed object, or null when no braces are found
 *   or the span is not valid JSON.
 */
function parseJsonObject(text2) {
  const fenceMatch = text2.match(/```(?:json)?\s*([\s\S]*?)```/);
  const haystack = fenceMatch ? fenceMatch[1] : text2;
  const open = haystack.indexOf("{");
  const close = haystack.lastIndexOf("}");
  // A missing "{" or a "}" located before it both mean there is no object span.
  if (open === -1 || close < open) return null;
  let value;
  try {
    value = JSON.parse(haystack.slice(open, close + 1));
  } catch {
    return null;
  }
  if (typeof value !== "object" || value === null) return null;
  return value;
}
|
|
2454
|
+
|
|
2455
|
+
// src/providers/openai-compatible.ts
|
|
2456
|
+
var ADAPTER_VERSION2 = "1.0.0";
/**
 * Built-in presets for OpenAI-compatible vendors, keyed by preset id.
 * Each preset names its base URL, default model and the environment variable
 * holding its API key.
 */
var PROVIDER_PRESETS = {
  deepseek: {
    id: "deepseek",
    baseUrl: "https://api.deepseek.com",
    // deepseek-v4-flash (V4 Lite) is the current fast coding/general model
    // (1M context, 13B-active MoE); deepseek-reasoner is the reasoning model.
    // The legacy "deepseek-chat" alias deprecates 2026-07-24 — it just maps to
    // v4-flash non-thinking mode. Override via LLM_MODEL/--model to switch.
    defaultModel: "deepseek-v4-flash",
    keyEnv: "DEEPSEEK_API_KEY"
  },
  kimi: {
    id: "kimi",
    // Moonshot's international endpoint. platform.kimi.ai is the console;
    // api.moonshot.ai/v1 is the OpenAI-compatible API surface.
    baseUrl: "https://api.moonshot.ai/v1",
    // Kimi K2.6 (latest, April 2026 — 1M ctx, coding/agentic). Model ids churn
    // on the vendor side — override via LLM_MODEL/--model when needed.
    defaultModel: "kimi-k2.6",
    keyEnv: "MOONSHOT_API_KEY"
  },
  openrouter: {
    id: "openrouter",
    baseUrl: "https://openrouter.ai/api/v1",
    // OpenRouter namespaces models as <org>/<model>. Pick Kimi or DeepSeek by
    // overriding LLM_MODEL/--model.
    defaultModel: "deepseek/deepseek-chat",
    keyEnv: "OPENROUTER_API_KEY"
  }
};
/** Alternative names accepted for a preset id. */
var PRESET_ALIASES = {
  moonshot: "kimi"
};
/**
 * Resolve the OpenAI-compatible provider configuration from environment
 * variables.
 *
 * Resolution order:
 *  1. `preferred` given: only that preset (or its alias) is considered.
 *  2. `LLM_API_KEY` (8+ chars) together with `LLM_BASE_URL`: a generic
 *     configuration named by `LLM_PROVIDER` (default "openai-compatible").
 *  3. The first of deepseek, kimi, openrouter that has a usable key.
 *
 * For presets, `LLM_BASE_URL` / `LLM_MODEL` override the preset values, and
 * `LLM_API_KEY` is used when the vendor-specific key variable is unset, empty
 * or whitespace-only.
 *
 * @param {Record<string, string|undefined>} [env=process.env] - Environment map.
 * @param {string} [preferred] - Preset id or alias to force.
 * @returns {{name: string, baseUrl: string, apiKey: string, defaultModel: string}|null}
 *   The resolved configuration, or null when nothing usable is configured.
 */
function resolveOpenAICompatConfig(env = process.env, preferred) {
  const llmBase = env.LLM_BASE_URL?.trim();
  const llmModel = env.LLM_MODEL?.trim();
  const llmKey = env.LLM_API_KEY?.trim();
  const fromPreset = (presetId) => {
    const id = PRESET_ALIASES[presetId] ?? presetId;
    const preset = PROVIDER_PRESETS[id];
    if (!preset) return null;
    // Use `||`, not `??`: a vendor key that is set but blank (e.g. an empty
    // placeholder in a .env file) must still fall back to LLM_API_KEY, the
    // same way blank LLM_BASE_URL / LLM_MODEL fall back to the preset below.
    const key = env[preset.keyEnv]?.trim() || llmKey;
    if (!key || key.length < 8) return null;
    return {
      name: preset.id,
      baseUrl: llmBase || preset.baseUrl,
      apiKey: key,
      defaultModel: llmModel || preset.defaultModel
    };
  };
  if (preferred) {
    // An explicit choice never falls through to other presets.
    return fromPreset(preferred);
  }
  if (llmKey && llmKey.length >= 8 && llmBase) {
    return {
      name: env.LLM_PROVIDER?.trim() || "openai-compatible",
      baseUrl: llmBase,
      apiKey: llmKey,
      defaultModel: llmModel || ""
    };
  }
  for (const presetId of ["deepseek", "kimi", "openrouter"]) {
    const cfg = fromPreset(presetId);
    if (cfg) return cfg;
  }
  return null;
}
|
|
2526
|
+
/**
 * Normalize an OpenAI-style `finish_reason` into the adapter's finish reason.
 *
 * @param {unknown} raw - The `finish_reason` of the selected choice.
 * @returns {"stop"|"length"|"tool_use"|"refusal"} The normalized finish reason.
 */
function mapFinishReason2(raw) {
  if (raw === "length") return "length";
  if (raw === "tool_calls" || raw === "function_call") return "tool_use";
  if (raw === "content_filter") return "refusal";
  // "stop" and every unrecognized value normalize to "stop".
  return "stop";
}
|
|
2541
|
+
/**
 * Convert an OpenAI-style `usage` object into the adapter's usage record.
 *
 * @param {object|null|undefined} raw - The `usage` field of the response body.
 * @returns {{inputTokens: number, outputTokens: number, cachedInputTokens?: number}}
 *   Non-numeric prompt/completion counts become 0; `cachedInputTokens` is set
 *   only when `prompt_tokens_details.cached_tokens` is a number.
 */
function mapUsage2(raw) {
  const source = raw ?? {};
  const numberOrZero = (value) => (typeof value === "number" ? value : 0);
  const cachedTokens = source.prompt_tokens_details?.cached_tokens;
  return {
    inputTokens: numberOrZero(source.prompt_tokens),
    outputTokens: numberOrZero(source.completion_tokens),
    ...(typeof cachedTokens === "number" ? { cachedInputTokens: cachedTokens } : {})
  };
}
|
|
2553
|
+
/**
 * Convert provider-neutral messages to the chat-completions wire format.
 *
 * Tool messages gain `tool_call_id` (empty string when absent) and, when a
 * tool name is known, `name`; all other messages are reduced to
 * `{ role, content }`.
 *
 * @param {Array<{role: string, content: unknown, toolCallId?: string, toolName?: string}>} messages
 * @returns {Array<object>} Wire messages in the same order.
 */
function toWireMessages(messages) {
  return messages.map(({ role, content, toolCallId, toolName }) => {
    if (role !== "tool") {
      return { role, content };
    }
    const wire = { role: "tool", content, tool_call_id: toolCallId ?? "" };
    if (toolName) {
      wire.name = toolName;
    }
    return wire;
  });
}
|
|
2566
|
+
/**
 * Build a ProviderError for a non-2xx HTTP response from an
 * OpenAI-compatible endpoint.
 *
 * @param {string} name - Provider name recorded on the error and used in the
 *   fallback message.
 * @param {number} status - HTTP status code.
 * @param {object|null|undefined} body - Parsed response body; `body.error.message`
 *   is used as the message when present.
 * @returns {ProviderError} Error whose `category` is derived from the status code.
 */
function errorForStatus2(name, status, body) {
  const message = body?.error?.message ?? `${name} API returned HTTP ${status}`;
  let category;
  switch (status) {
    case 401:
    case 403:
      category = "authentication";
      break;
    case 404:
      category = "model_not_found";
      break;
    case 429:
      category = "rate_limit";
      break;
    case 408:
    case 503:
    case 504:
    case 529:
      category = "network_timeout";
      break;
    default:
      category = "unknown";
  }
  return new ProviderError({ category, providerName: name, message });
}
|
|
2582
|
+
/**
 * Wrap anything thrown while talking to an OpenAI-compatible endpoint in a
 * ProviderError.
 *
 * @param {string} name - Provider name recorded on the error.
 * @param {unknown} err - The thrown value.
 * @returns {ProviderError} `err` itself when it already is a ProviderError;
 *   otherwise a new error categorized as "network_timeout" (abort/timeout
 *   errors) or "unknown", with the thrown value kept as `originalError`.
 */
function errorForThrow2(name, err) {
  if (err instanceof ProviderError) return err;
  const isError = err instanceof Error;
  // Either the error name or its message text can identify an abort/timeout.
  const looksLikeTimeout = isError && (err.name === "AbortError" || /abort|timeout/i.test(err.message));
  return new ProviderError({
    category: looksLikeTimeout ? "network_timeout" : "unknown",
    providerName: name,
    message: isError ? err.message : String(err),
    originalError: err
  });
}
|
|
2599
|
+
/**
 * Provider adapter for OpenAI-compatible chat-completions endpoints, using an
 * injectable transport function.
 *
 * All failures surface as ProviderError: transport exceptions are wrapped by
 * `errorForThrow2`, non-2xx responses by `errorForStatus2`.
 */
var RealOpenAICompatProvider = class {
  name;
  version = ADAPTER_VERSION2;
  #apiKey;
  #baseUrl;
  #transport;
  /**
   * @param {object} opts
   * @param {string} opts.apiKey - Key sent as a Bearer token.
   * @param {string} opts.baseUrl - API base URL; trailing slashes are removed.
   * @param {string} [opts.name] - Provider name; defaults to "openai-compatible".
   * @param {Function} [opts.transport] - Function performing the HTTP call;
   *   defaults to `createFetchTransport()`.
   */
  constructor(opts) {
    this.#apiKey = opts.apiKey;
    this.#baseUrl = opts.baseUrl.replace(/\/+$/, "");
    this.name = opts.name ?? "openai-compatible";
    this.#transport = opts.transport ?? createFetchTransport();
  }
  /** Headers attached to every request. */
  #headers() {
    return {
      "content-type": "application/json",
      authorization: `Bearer ${this.#apiKey}`
    };
  }
  /**
   * Run a single chat completion.
   *
   * @param {object} req - Request with `model`, `messages` and optional
   *   `maxTokens`, `temperature`, `stop`, `signal`, `responseSchema`
   *   (sent as a strict `json_schema` response format).
   * @returns {Promise<object>} `{ text, model, usage, finishReason }`, plus
   *   `structuredOutput` (the text parsed as JSON) when `responseSchema` is set.
   * @throws {ProviderError} On a bad key, transport failure, non-2xx status,
   *   a response without usable choices, or (with `responseSchema`) output
   *   that is not valid JSON.
   */
  async complete(req) {
    this.#assertKey();
    const body = {
      model: req.model,
      messages: toWireMessages(req.messages),
      ...req.maxTokens !== void 0 ? { max_tokens: req.maxTokens } : {},
      ...req.temperature !== void 0 ? { temperature: req.temperature } : {},
      ...req.stop !== void 0 ? { stop: req.stop } : {},
      ...req.responseSchema !== void 0 ? {
        response_format: {
          type: "json_schema",
          json_schema: { name: "response", schema: req.responseSchema, strict: true }
        }
      } : {}
    };
    const res = await this.#send(body, req.signal);
    if (res.status < 200 || res.status >= 300) {
      throw errorForStatus2(this.name, res.status, res.json);
    }
    const choice = this.#firstChoice(res.json);
    const message = choice.message ?? {};
    // Non-string content (e.g. null alongside tool calls) is reported as "".
    const text2 = typeof message.content === "string" ? message.content : "";
    const result = {
      text: text2,
      model: req.model,
      usage: mapUsage2(res.json.usage),
      finishReason: mapFinishReason2(choice.finish_reason)
    };
    if (req.responseSchema !== void 0) {
      result.structuredOutput = this.#parseStructured(text2);
    }
    return result;
  }
  /**
   * Streaming-shaped wrapper around `complete()`: the full completion is
   * awaited first, then emitted as at most one `text_delta` event followed by
   * a `finish` event.
   *
   * @param {object} req - Same request shape as `complete()`.
   */
  async *completeStream(req) {
    const completion = await this.complete(req);
    if (completion.text.length > 0) {
      yield { type: "text_delta", delta: completion.text };
    }
    yield { type: "finish", finishReason: completion.finishReason, usage: completion.usage };
  }
  /**
   * Run a completion with function tools offered to the model and report the
   * first tool call of the first choice, if any.
   *
   * @param {object} req - Request with `model`, `messages`, `tools`
   *   (`name`, `description`, `inputSchema`) and optional `maxTokens`, `signal`.
   * @returns {Promise<object>} `{ toolName, toolArguments, toolCallId, text,
   *   finishReason, usage }`; the three tool fields are `null` when the
   *   message carries no tool calls.
   * @throws {ProviderError} On a bad key, transport failure, non-2xx status
   *   or a response without usable choices.
   */
  async callTool(req) {
    this.#assertKey();
    const body = {
      model: req.model,
      messages: toWireMessages(req.messages),
      tools: req.tools.map((t) => ({
        type: "function",
        function: { name: t.name, description: t.description, parameters: t.inputSchema }
      })),
      ...req.maxTokens !== void 0 ? { max_tokens: req.maxTokens } : {}
    };
    const res = await this.#send(body, req.signal);
    if (res.status < 200 || res.status >= 300) {
      throw errorForStatus2(this.name, res.status, res.json);
    }
    const choice = this.#firstChoice(res.json);
    const message = choice.message ?? {};
    const text2 = typeof message.content === "string" ? message.content : "";
    const usage = mapUsage2(res.json.usage);
    const toolCalls = Array.isArray(message.tool_calls) ? message.tool_calls : [];
    const first = toolCalls[0];
    if (!first) {
      return {
        toolName: null,
        toolArguments: null,
        toolCallId: null,
        text: text2,
        finishReason: mapFinishReason2(choice.finish_reason),
        usage
      };
    }
    const fn = first.function ?? {};
    return {
      toolName: typeof fn.name === "string" ? fn.name : null,
      toolArguments: this.#parseToolArgs(fn.arguments),
      toolCallId: typeof first.id === "string" ? first.id : null,
      text: text2,
      // A tool call was found, so report tool_use regardless of finish_reason.
      finishReason: "tool_use",
      usage
    };
  }
  /**
   * Run several completions concurrently. A failed request does not reject the
   * batch: its slot holds the ProviderError instead of a result.
   *
   * @param {Array<object>} reqs - Requests in `complete()` shape.
   * @returns {Promise<Array<object|ProviderError>>} One entry per request, in order.
   */
  async batch(reqs) {
    return Promise.all(
      reqs.map(
        (r) => this.complete(r).catch(
          (err) => err instanceof ProviderError ? err : errorForThrow2(this.name, err)
        )
      )
    );
  }
  // --- internals ---------------------------------------------------------
  /**
   * Reject obviously unusable keys before any network traffic.
   * A missing or non-string key is reported as an authentication error too,
   * instead of crashing on `.length`.
   */
  #assertKey() {
    if (typeof this.#apiKey !== "string" || this.#apiKey.length < 8) {
      throw new ProviderError({
        category: "authentication",
        providerName: this.name,
        message: "apiKey missing or too short"
      });
    }
  }
  /** POST `body` to `<baseUrl>/chat/completions`, wrapping transport exceptions. */
  async #send(body, signal) {
    try {
      return await this.#transport({
        url: `${this.#baseUrl}/chat/completions`,
        method: "POST",
        headers: this.#headers(),
        body,
        signal
      });
    } catch (err) {
      throw errorForThrow2(this.name, err);
    }
  }
  /**
   * Return the first entry of `json.choices`.
   * A null or non-object first entry is treated like an empty list so callers
   * can read `choice.message` safely.
   */
  #firstChoice(json) {
    const choices = json?.choices;
    const first = Array.isArray(choices) ? choices[0] : void 0;
    if (typeof first !== "object" || first === null) {
      throw new ProviderError({
        category: "unknown",
        providerName: this.name,
        message: `${this.name} response contained no choices`
      });
    }
    return first;
  }
  /** Parse model output as JSON, reporting invalid JSON as a schema violation. */
  #parseStructured(text2) {
    try {
      return JSON.parse(text2);
    } catch {
      throw new ProviderError({
        category: "schema_violation",
        providerName: this.name,
        message: "responseSchema requested but model output was not valid JSON"
      });
    }
  }
  /**
   * Normalize a tool call's `arguments` field into an object.
   * Objects pass through; strings are parsed as JSON, and unparseable or
   * non-object JSON deliberately yields `{}` rather than failing the call;
   * null/undefined and other types yield null.
   */
  #parseToolArgs(raw) {
    if (raw == null) return null;
    if (typeof raw === "object") return raw;
    if (typeof raw === "string") {
      try {
        const parsed = JSON.parse(raw);
        return typeof parsed === "object" && parsed !== null ? parsed : {};
      } catch {
        return {};
      }
    }
    return null;
  }
};
|
|
2766
|
+
/**
 * Best-effort extraction of a JSON object from model output.
 *
 * If the text contains a ``` fenced block (optionally tagged `json`), only the
 * first fence's content is searched. The candidate is the span from the first
 * "{" to the last "}".
 *
 * @param {string} text2 - Raw model output.
 * @returns {object|null} The parsed object, or null when no braces are found
 *   or the span is not valid JSON.
 */
function parseJsonObject2(text2) {
  const fenceMatch = text2.match(/```(?:json)?\s*([\s\S]*?)```/);
  const haystack = fenceMatch ? fenceMatch[1] : text2;
  const open = haystack.indexOf("{");
  const close = haystack.lastIndexOf("}");
  // A missing "{" or a "}" located before it both mean there is no object span.
  if (open === -1 || close < open) return null;
  let value;
  try {
    value = JSON.parse(haystack.slice(open, close + 1));
  } catch {
    return null;
  }
  if (typeof value !== "object" || value === null) return null;
  return value;
}
|
|
2779
|
+
/**
 * Trigger provider: asks the model which skill (if any) should handle a prompt.
 */
var OpenAICompatTriggerProvider = class {
  #provider;
  #model;
  /**
   * @param {string} model - Model id used for every routing request.
   * @param {object} provider - Object exposing `complete(req)`.
   */
  constructor(model, provider) {
    this.#provider = provider;
    this.#model = model;
  }
  /**
   * Ask the model to choose one skill from the roster.
   *
   * @param {string} prompt - The user prompt to route.
   * @param {Array<{name: string, description: string}>} availableSkills
   * @returns {Promise<{selected: string|null, reasoning: string}>}
   *   `selected` is null unless the reply's `selected` is a non-empty string
   *   other than "null"; `reasoning` falls back to the first 200 characters of
   *   the raw reply when the reply carries no string `reasoning`.
   */
  async selectSkill(prompt, availableSkills) {
    const systemPrompt = 'You are a skill router. Given a user prompt and a roster of available skills, decide which single skill (if any) should handle the prompt. Respond ONLY with a JSON object {"selected": "<skill-name-or-null>", "reasoning": "<one sentence>"}. Use null for selected when no skill fits.';
    const roster = availableSkills.map((s) => `- ${s.name}: ${s.description}`).join("\n");
    const userPrompt = ["Available skills:", roster, "", `User prompt: "${prompt}"`].join("\n");
    const { text } = await this.#provider.complete({
      model: this.#model,
      messages: [
        { role: "system", content: systemPrompt },
        { role: "user", content: userPrompt }
      ],
      maxTokens: 256,
      temperature: 0
    });
    const reply = parseJsonObject2(text);
    const candidate = reply?.selected;
    let selected = null;
    // Models sometimes answer with the literal string "null" instead of JSON null.
    if (typeof candidate === "string" && candidate.length > 0 && candidate !== "null") {
      selected = candidate;
    }
    const reasoning = typeof reply?.reasoning === "string" ? reply.reasoning : text.slice(0, 200);
    return { selected, reasoning };
  }
};
|
|
2809
|
+
/**
 * Execution provider: runs a prompt with the skill body as the system message.
 */
var OpenAICompatExecutionProvider = class {
  #provider;
  #model;
  /**
   * @param {string} model - Default model id.
   * @param {object} provider - Object exposing `complete(req)`.
   */
  constructor(model, provider) {
    this.#provider = provider;
    this.#model = model;
  }
  /**
   * Execute `prompt` against the skill in `context`.
   *
   * @param {string} prompt - User prompt.
   * @param {{skill_body: string}} context - Supplies the system message.
   * @param {{model?: string, timeout_ms?: number}} [options] - `model` overrides
   *   the default; a truthy `timeout_ms` aborts the request after that many ms.
   * @returns {Promise<{text: string, artifacts: Array, tool_calls: number, meta: object}>}
   *   `meta` holds ISO start/end timestamps, `duration_ms` and `timed_out: false`.
   */
  async execute(prompt, context, options) {
    const started = /* @__PURE__ */ new Date();
    const request = {
      model: options?.model ?? this.#model,
      messages: [
        { role: "system", content: context.skill_body },
        { role: "user", content: prompt }
      ],
      maxTokens: 1024
    };
    let timer;
    if (options?.timeout_ms) {
      const controller = new AbortController();
      timer = setTimeout(() => controller.abort(), options.timeout_ms);
      request.signal = controller.signal;
    }
    try {
      const { text } = await this.#provider.complete(request);
      const completed = /* @__PURE__ */ new Date();
      return {
        text,
        artifacts: [],
        tool_calls: 0,
        meta: {
          started_at: started.toISOString(),
          completed_at: completed.toISOString(),
          duration_ms: completed.getTime() - started.getTime(),
          timed_out: false
        }
      };
    } finally {
      // Always cancel the pending abort so the timer cannot outlive the call.
      if (timer) clearTimeout(timer);
    }
  }
};
|
|
2844
|
+
/**
 * Judge provider: asks the model for a yes/no/unsure verdict on an output.
 */
var OpenAICompatJudgeProvider = class {
  #provider;
  #model;
  /**
   * @param {string} model - Model id used for every judging request.
   * @param {object} provider - Object exposing `complete(req)`.
   */
  constructor(model, provider) {
    this.#provider = provider;
    this.#model = model;
  }
  /**
   * Judge whether `output` satisfies a criterion for `prompt`.
   *
   * @param {string} criterion_description - The criterion being checked.
   * @param {string} prompt - The prompt that produced `output`.
   * @param {string} output - The text under evaluation.
   * @param {string} [judge_prompt] - Custom question; a default is derived
   *   from the criterion when omitted.
   * @returns {Promise<{verdict: "yes"|"no"|"unsure", confidence: number, reasoning: string}>}
   *   Anything other than yes/no becomes "unsure"; `confidence` is clamped to
   *   [0, 1] and defaults to 0.5 when the reply has no numeric confidence.
   */
  async judge(criterion_description, prompt, output, judge_prompt) {
    const systemPrompt = 'You are a strict binary evaluator. Decide whether the OUTPUT satisfies the CRITERION for the given PROMPT. Respond ONLY with a JSON object {"verdict": "yes"|"no"|"unsure", "confidence": <0..1>, "reasoning": "<one sentence>"}.';
    const question = judge_prompt ?? `Does the output satisfy: ${criterion_description}?`;
    const userPrompt = [
      `CRITERION: ${criterion_description}`,
      "",
      `QUESTION: ${question}`,
      "",
      `PROMPT: ${prompt}`,
      "",
      "OUTPUT:",
      `${output}`
    ].join("\n");
    const { text } = await this.#provider.complete({
      model: this.#model,
      messages: [
        { role: "system", content: systemPrompt },
        { role: "user", content: userPrompt }
      ],
      maxTokens: 256,
      temperature: 0
    });
    const reply = parseJsonObject2(text);
    const rawVerdict = typeof reply?.verdict === "string" ? reply.verdict.toLowerCase() : "";
    const verdict = rawVerdict === "yes" || rawVerdict === "no" ? rawVerdict : "unsure";
    let confidence = 0.5;
    if (typeof reply?.confidence === "number") {
      confidence = Math.max(0, Math.min(1, reply.confidence));
    }
    const reasoning = typeof reply?.reasoning === "string" ? reply.reasoning : text.slice(0, 200);
    return { verdict, confidence, reasoning };
  }
};
|
|
2879
|
+
|
|
2880
|
+
// src/commands/eval.ts
|
|
2881
|
+
/**
 * Choose the trigger/execution/judge provider trio for an eval run.
 *
 * Selection order:
 *  1. `preferred` is "stub": stub providers.
 *  2. `preferred` is not "anthropic": an OpenAI-compatible configuration
 *     resolved from the environment, if one exists.
 *  3. `ANTHROPIC_API_KEY` with at least 8 characters: Anthropic providers.
 *  4. Otherwise: stub providers.
 *
 * @param {string} model - Model id requested by the caller. For
 *   OpenAI-compatible providers the resolved configuration's non-empty
 *   `defaultModel` takes precedence over it.
 * @param {string} [preferred] - Provider name to force (case-insensitive, trimmed).
 * @returns {{trigger: object, execution: object, judge: object, real: boolean, providerName: string}}
 */
function selectProviders(model, preferred) {
  const want = preferred?.trim().toLowerCase();
  const stubSelection = () => ({
    trigger: new StubTriggerProvider(model),
    execution: new StubExecutionProvider(model),
    judge: new StubJudgeProvider(model),
    real: false,
    providerName: "stub"
  });
  if (want === "stub") {
    return stubSelection();
  }
  const compat = want === "anthropic" ? null : resolveOpenAICompatConfig(process.env, want);
  if (compat) {
    const hasDefaultModel = compat.defaultModel && compat.defaultModel.length > 0;
    const effectiveModel = hasDefaultModel ? compat.defaultModel : model;
    const provider = new RealOpenAICompatProvider({
      apiKey: compat.apiKey,
      baseUrl: compat.baseUrl,
      name: compat.name
    });
    return {
      trigger: new OpenAICompatTriggerProvider(effectiveModel, provider),
      execution: new OpenAICompatExecutionProvider(effectiveModel, provider),
      judge: new OpenAICompatJudgeProvider(effectiveModel, provider),
      real: true,
      providerName: compat.name
    };
  }
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey || apiKey.length < 8) {
    return stubSelection();
  }
  const provider = new RealAnthropicProvider({ apiKey });
  return {
    trigger: new AnthropicTriggerProvider(model, provider),
    execution: new AnthropicExecutionProvider(model, provider),
    judge: new AnthropicJudgeProvider(model, provider),
    real: true,
    providerName: "anthropic"
  };
}
|
|
2929
|
+
/**
 * Report whether a real (non-stub) provider key is available for the
 * requested provider preference.
 *
 * - `"stub"` never counts as real.
 * - Any preference other than `"anthropic"` counts as real when
 *   `resolveOpenAICompatConfig` yields a config for it.
 * - ANTHROPIC_API_KEY (8+ characters) counts only when the preference is
 *   `"anthropic"`, unset, or empty.
 *
 * @param {string} [preferred] - Value of `--provider`; trimmed and lower-cased here.
 * @returns {boolean}
 */
function hasAnyRealKey(preferred) {
  const choice = preferred?.trim().toLowerCase();
  if (choice === "stub") {
    return false;
  }

  const forcedAnthropic = choice === "anthropic";
  if (!forcedAnthropic && resolveOpenAICompatConfig(process.env, choice)) {
    return true;
  }

  const anthropicEligible = forcedAnthropic || choice === void 0 || choice === "";
  if (!anthropicEligible) {
    return false;
  }
  const keyLength = process.env.ANTHROPIC_API_KEY?.length ?? 0;
  return keyLength >= 8;
}
|
|
2938
|
+
/**
 * Register the `eval` subcommand on a commander program.
 *
 * For each requested model the action:
 *   1. records a run in the database and marks it "running";
 *   2. optionally runs trigger tests and prints precision / recall;
 *   3. optionally runs functional tests, judges every criterion, stores the
 *      per-criterion results and a summary, computes a score card, a rollout
 *      decision and a launch report;
 *   4. marks the run "completed" and emits the run-finished event.
 *
 * A package-integrity check runs once before any model and aborts the command
 * (exit code 1) when it reports errors. Any thrown error is printed to stderr
 * and exits with code 1.
 *
 * @param {object} program - commander `Command` the subcommand is attached to.
 * @returns {void}
 */
function registerEvalCommand(program) {
  program
    .command("eval")
    .description("Run full 7-layer binary evaluation on a skill")
    .argument("<skill-dir>", "Path to skill directory containing SKILL.md")
    .option("--spec <path>", "Path to eval spec YAML")
    .option("--models <list>", "Comma-separated model list", "sonnet")
    .option("--db <path>", "SQLite DB path", "j-rig.db")
    .option("--json", "Output as JSON")
    .option("--no-trigger", "Skip trigger tests")
    .option("--no-functional", "Skip functional tests")
    .option(
      "--provider <name>",
      "Force a provider: deepseek | kimi | moonshot | openrouter | anthropic | stub (default: auto-detect from env keys, preferring an OpenAI-compatible endpoint)"
    )
    .action(async (skillDir, opts) => {
      const startTime = Date.now();
      try {
        // Validate the model list before touching keys, files or the DB.
        // Empty entries (e.g. "sonnet, ," or a trailing comma) used to become
        // runs for a model named "".
        const models = opts.models
          .split(",")
          .map((m) => m.trim())
          .filter((m) => m.length > 0);
        if (models.length === 0) {
          throw new Error("--models must name at least one model");
        }

        const hasRealKey = hasAnyRealKey(opts.provider);
        if (!hasRealKey) assertStubAllowed();

        const absDir = resolve4(skillDir);
        const { parsed: skill, raw: skillContent } = loadSkillMd(absDir);
        const spec = loadEvalSpec(opts.spec, absDir);
        const database = openDb(opts.db);
        const skillName = skill.frontmatter.name;
        const skillVersion = typeof skill.frontmatter.version === "string" ? skill.frontmatter.version : "0.0.0";
        // Content hashes used to correlate emitted events with the exact
        // skill text and spec that were evaluated.
        const skillSnapshotSha = "sha256:" + createHash2("sha256").update(skillContent).digest("hex");
        const specContentHash = "sha256:" + createHash2("sha256").update(JSON.stringify(spec)).digest("hex");

        if (!opts.json) {
          console.log(header(`j-rig eval: ${skillName}`));
          console.log(` Models: ${models.join(", ")}\n`);
        }

        const pkgReport = checkPackage(absDir);
        if (!opts.json) {
          console.log(header("--- Package Integrity ---"));
          const { summary } = pkgReport;
          console.log(` ${summary.passed}/${summary.passed + summary.errors} checks passed`);
        }
        if (pkgReport.summary.errors > 0) {
          if (!opts.json) {
            printReport(pkgReport);
            console.error(chalk3.red("\nPackage integrity failed. Fix errors before evaluation."));
          } else {
            console.log(JSON.stringify({ error: "Package integrity failed", pkgReport }, null, 2));
          }
          process.exit(1);
        }
        if (!opts.json) console.log("");

        const allResults = {};
        for (const model of models) {
          const modelStart = Date.now();
          const svId = getOrCreateSkillVersion(database, skillName, skillVersion, skillContent);
          const runId = createRun(database, svId, "full", model);
          transitionRun(database, runId, "running");
          const providers = selectProviders(model, opts.provider);
          const correlation = { evalRunId: uuidv7() };
          emitRuntimeRunStarted(correlation, {
            specContentHash,
            skillSnapshotSha
          });
          let runHadFailure = false;

          if (!opts.json) {
            console.log(header(`--- Model: ${model} ---`));
            console.log(
              ` Provider: ${providers.real ? `${providers.providerName} (REAL \u2014 ground truth)` : `${providers.providerName.toUpperCase()} (not ground truth)`}`
            );
          }

          // commander turns --no-trigger / --no-functional into `false`;
          // anything else (including undefined) means "run it".
          if (opts.trigger !== false) {
            const roster = buildRoster(skill.frontmatter, spec.siblings);
            const triggerResults = await runTriggerTests(
              spec.test_cases,
              roster,
              providers.trigger
            );
            const metrics = computeMetrics(triggerResults);
            if (!opts.json) {
              console.log(
                ` Trigger: precision=${metrics.precision.toFixed(2)} recall=${metrics.recall.toFixed(2)} (${metrics.total_cases} cases)`
              );
            }
          }

          if (opts.functional !== false) {
            const outcomes = await runFunctionalTests(
              spec.test_cases,
              skill,
              providers.execution,
              { model }
            );
            if (!opts.json) {
              console.log(
                ` Functional: ${outcomes.length}/${spec.test_cases.length} test case(s) executed`
              );
            }

            const allJudgments = [];
            for (const outcome of outcomes) {
              const judgments = await judgeCriteria(spec.criteria, outcome, providers.judge, {
                model
              });
              for (const j of judgments) {
                if (j.method === "judge") {
                  emitJudgeInvoked(correlation, {
                    judgeId: `j-rig:judge:${j.criterion_id}`,
                    modelId: j.judge_model ?? model,
                    // NOTE(review): this passes the *skill* version as the
                    // judge's modelVersion — confirm that is intended.
                    modelVersion: skillVersion
                  });
                  emitJudgeVerdict(correlation, {
                    verdict: j.verdict,
                    verdictSource: JudgeVerdictSource.LLM_NO_SEED,
                    seed: null
                  });
                }
                // yes -> PASS, unsure -> SKIP, anything else -> FAIL.
                const criterionOutcome = j.verdict === "yes" ? CriterionOutcome.PASS : j.verdict === "unsure" ? CriterionOutcome.SKIP : CriterionOutcome.FAIL;
                if (criterionOutcome === CriterionOutcome.FAIL) runHadFailure = true;
                emitRuntimeCriterionEvaluated(correlation, {
                  matcherClass: j.method,
                  outcome: criterionOutcome
                });
              }
              allJudgments.push(...judgments);
            }

            const passed = allJudgments.filter((j) => j.verdict === "yes").length;
            const total = allJudgments.length;
            if (!opts.json) {
              console.log(` Judgment: ${formatScore(passed, total)}`);
              for (const j of allJudgments) {
                const verdictIcon = j.verdict === "yes" ? icon("pass") : j.verdict === "unsure" ? icon("warning") : icon("error");
                console.log(` ${verdictIcon} ${j.criterion_id}: ${j.reasoning.slice(0, 80)}`);
              }
            }

            // Persist per-criterion rows; "unsure" is stored as a non-passing
            // row with severity "warning".
            const dbResults = allJudgments.map((j) => ({
              criterion_id: j.criterion_id,
              passed: j.verdict === "yes",
              severity: j.verdict === "yes" ? "pass" : j.verdict === "unsure" ? "warning" : "error",
              message: j.reasoning,
              method: j.method
            }));
            storeCriterionResults(database, runId, dbResults);
            const errors = allJudgments.filter((j) => j.verdict === "no").length;
            const warnings = allJudgments.filter((j) => j.verdict === "unsure").length;
            storeRunSummary(database, runId, {
              total,
              passed,
              warnings,
              errors
            });

            const scoreCard = computeScoreCard(allJudgments, spec.criteria);
            const decision = decideRollout(scoreCard);
            const report = buildLaunchReport(
              skillName,
              scoreCard,
              [],
              // regressions: none in a standalone run
              [],
              // baseline: none without a baseline comparison run
              false,
              // isObsolete: not computed here
              // DR-103 D5 B5.1: inject `now` so the launch-report artifact is
              // replayable (the determinism the adoption signal's bandit-rejection
              // rests on). One timestamp per model run.
              { now: new Date(modelStart).toISOString() }
            );
            allResults[model] = {
              provider: providers.providerName,
              model,
              ground_truth: providers.real,
              pkgReport,
              scoreCard,
              decision,
              report
            };

            // ship -> PASS, block -> FAIL, every other decision -> ADVISORY.
            const gateDecisionValue = report.decision === "ship" ? GateDecision.PASS : report.decision === "block" ? GateDecision.FAIL : GateDecision.ADVISORY;
            if (gateDecisionValue === GateDecision.FAIL) runHadFailure = true;
            emitGateDecisionEmitted(correlation, {
              gateName: "j-rig-rollout-gate",
              decision: gateDecisionValue,
              policyRef: specContentHash
            });

            if (!opts.json) {
              console.log(` Decision: ${formatDecision(report.decision)}`);
              if (report.blockers.length > 0) {
                for (const b of report.blockers) {
                  console.log(` ${icon("error")} ${b}`);
                }
              }
              if (report.warnings.length > 0) {
                for (const w of report.warnings) {
                  console.log(` ${icon("warning")} ${w}`);
                }
              }
              console.log("");
            }
          }

          // The run is marked completed whether or not functional tests ran.
          transitionRun(database, runId, "completed");
          emitRuntimeRunFinished(correlation, {
            terminalState: runHadFailure ? RuntimeTerminalState.ARCHIVED_FAILED : RuntimeTerminalState.JUDGED,
            durationMs: Date.now() - modelStart
          });
        }

        const duration = Date.now() - startTime;
        if (opts.json) {
          console.log(JSON.stringify(allResults, null, 2));
        } else {
          console.log(chalk3.dim(`Duration: ${formatDuration(duration)} | DB: ${opts.db}`));
        }
      } catch (err) {
        console.error(`Error: ${err instanceof Error ? err.message : String(err)}`);
        process.exit(1);
      }
    });
}
|
|
3141
|
+
|
|
3142
|
+
// src/commands/report.ts
|
|
3143
|
+
import chalk4 from "chalk";
|
|
3144
|
+
/**
 * Register the `report` subcommand on a commander program.
 *
 * With `--run-id` it prints one run (status, optional duration and score,
 * per-criterion results and artifacts); otherwise it lists recent runs,
 * optionally filtered by `--skill` and capped by `--limit`. `--json` switches
 * both views to JSON output. Errors are printed to stderr and exit with 1.
 *
 * @param {object} program - commander `Command` the subcommand is attached to.
 * @returns {void}
 */
function registerReportCommand(program) {
  // commander calls option parsers as (value, previousValue). Passing bare
  // `parseInt` therefore uses the previous/default value as the radix, so the
  // radix is pinned here.
  const parseBase10 = (value) => Number.parseInt(value, 10);

  // One formatter for both the header and the data rows keeps the columns
  // aligned with each other.
  const formatRunRow = (id, skill, model, status, date) =>
    ` ${id.padEnd(5)} ${skill.padEnd(28)} ${model.padEnd(9)} ${status.padEnd(12)} ${date}`;

  program
    .command("report")
    .description("Show evaluation results from the database")
    .option("--db <path>", "SQLite DB path", "j-rig.db")
    .option("--skill <name>", "Filter by skill name")
    .option("--run-id <id>", "Show detailed results for a specific run", parseBase10)
    .option("--limit <n>", "Max runs to show", parseBase10, 10)
    .option("--json", "Output as JSON")
    .action(async (opts) => {
      try {
        // A non-numeric --run-id parses to NaN, which used to be treated as
        // "no run id" and silently showed the list view instead.
        const wantsDetail = opts.runId !== undefined;
        if (wantsDetail && !Number.isInteger(opts.runId)) {
          throw new Error("--run-id must be an integer");
        }
        if (Number.isNaN(opts.limit)) {
          throw new Error("--limit must be an integer");
        }

        const database = openDb(opts.db);

        if (wantsDetail) {
          const run = getRun(database, opts.runId);
          if (!run) {
            console.error(`Run #${opts.runId} not found`);
            process.exit(1);
          }
          const results = getRunResults(database, opts.runId);
          const arts = getRunArtifacts(database, opts.runId);
          if (opts.json) {
            console.log(JSON.stringify({ run, results, artifacts: arts }, null, 2));
            return;
          }

          const summary = run.summary;
          console.log(header(`Run #${run.id}`));
          console.log(
            ` Status: ${run.status} | Model: ${run.model ?? "n/a"} | Type: ${run.run_type}`
          );
          if (run.duration_ms != null) {
            console.log(` Duration: ${formatDuration(run.duration_ms)}`);
          }
          if (summary) {
            console.log(` Score: ${formatScore(summary.passed, summary.total)}`);
          }
          console.log(`\n${header("Criterion Results:")}`);
          for (const r of results) {
            const sev = r.severity;
            const ic = r.passed ? icon("pass") : icon("error");
            // Rows stored with severity "warning" get an extra warning marker.
            const sevIcon = sev === "warning" ? ` ${icon("warning")}` : "";
            console.log(` ${ic}${sevIcon} ${r.criterion_id}: ${r.message}`);
          }
          if (arts.length > 0) {
            console.log(`\n${header("Artifacts:")}`);
            for (const a of arts) {
              console.log(` ${a.filename} (${a.artifact_type})`);
            }
          }
          return;
        }

        const rows = getRecentRuns(database, {
          limit: opts.limit,
          skillName: opts.skill
        });
        if (opts.json) {
          console.log(JSON.stringify(rows, null, 2));
          return;
        }

        console.log(header("Recent Runs:"));
        console.log(chalk4.dim(formatRunRow("ID", "Skill", "Model", "Status", "Date")));
        for (const row of rows) {
          const r = row.runs;
          const sv = row.skill_versions;
          console.log(
            formatRunRow(
              String(r.id),
              sv.skill_name,
              r.model ?? "n/a",
              r.status,
              r.created_at?.slice(0, 10) ?? ""
            )
          );
        }
        if (rows.length === 0) {
          console.log(chalk4.dim(" No runs found."));
        }
      } catch (err) {
        console.error(`Error: ${err instanceof Error ? err.message : err}`);
        process.exit(1);
      }
    });
}
|
|
3218
|
+
|
|
3219
|
+
// src/commands/optimize.ts
|
|
3220
|
+
import chalk5 from "chalk";
|
|
3221
|
+
/**
 * Convert a stored criterion-result row into the judgment shape passed to
 * `clusterFailures` / `selectWeakest`.
 *
 * The verdict is rebuilt from the boolean `passed` column only ("yes" or
 * "no"), confidence is fixed at 1, and any method other than "judge" is
 * reported as "deterministic".
 *
 * @param {{criterion_id: string, passed: *, message: string, method: string}} row
 * @returns {{criterion_id: string, verdict: string, confidence: number, reasoning: string, method: string}}
 */
function toJudgmentResult(row) {
  let verdict = "no";
  if (row.passed) {
    verdict = "yes";
  }
  let method = "deterministic";
  if (row.method === "judge") {
    method = "judge";
  }
  return {
    criterion_id: row.criterion_id,
    verdict,
    confidence: 1,
    reasoning: row.message,
    method
  };
}
|
|
3230
|
+
/**
 * Build a minimal criterion definition from a stored criterion-result row.
 *
 * Only the id and method are taken from the row; the description repeats the
 * id and every flag (blocker, regression_critical, baseline_sensitive,
 * pack_sensitive) is set to false.
 *
 * @param {{criterion_id: string, method: string}} row
 * @returns {{id: string, description: string, method: string, blocker: boolean, regression_critical: boolean, baseline_sensitive: boolean, pack_sensitive: boolean}}
 */
function toCriterion(row) {
  const criterionId = row.criterion_id;
  const isJudge = row.method === "judge";
  const criterion = {
    id: criterionId,
    description: criterionId,
    method: isJudge ? "judge" : "deterministic",
    blocker: false,
    regression_critical: false,
    baseline_sensitive: false,
    pack_sensitive: false
  };
  return criterion;
}
|
|
3241
|
+
/**
 * Register the `optimize` subcommand on a commander program.
 *
 * Loads the criterion results of one run (the run given by `--run-id`, or the
 * most recent run of `--skill`), clusters the failures and picks the weakest
 * criterion, then prints the analysis as text or JSON. Exits with code 1 when
 * no run or no results are found, or on any thrown error.
 *
 * @param {object} program - commander `Command` the subcommand is attached to.
 * @returns {void}
 */
function registerOptimizeCommand(program) {
  // commander calls option parsers as (value, previousValue); bare `parseInt`
  // would treat the previous value as the radix, so the radix is pinned here.
  const parseBase10 = (value) => Number.parseInt(value, 10);

  program
    .command("optimize")
    .description("Analyze failures and suggest improvements for a skill")
    .requiredOption("--skill <name>", "Skill name to optimize")
    .option("--db <path>", "SQLite DB path", "j-rig.db")
    .option("--run-id <id>", "Optimize from a specific run", parseBase10)
    .option("--json", "Output as JSON")
    .action(async (opts) => {
      try {
        // A non-numeric --run-id parses to NaN; reject it rather than
        // querying the database for run "NaN".
        if (opts.runId !== undefined && !Number.isInteger(opts.runId)) {
          throw new Error("--run-id must be an integer");
        }

        const database = openDb(opts.db);
        let runId = opts.runId;
        if (runId == null) {
          // No explicit run: fall back to the latest run recorded for the skill.
          const runs2 = getRecentRuns(database, { skillName: opts.skill, limit: 1 });
          if (runs2.length === 0) {
            console.error(`No runs found for skill: ${opts.skill}`);
            process.exit(1);
          }
          runId = runs2[0].runs.id;
        }

        const dbResults = getRunResults(database, runId);
        if (dbResults.length === 0) {
          console.error(`No results found for run #${runId}`);
          process.exit(1);
        }

        const judgmentResults = dbResults.map(toJudgmentResult);
        const criteria = dbResults.map(toCriterion);
        const clusters = clusterFailures(judgmentResults, criteria);
        const weakest = selectWeakest(judgmentResults, criteria);

        if (opts.json) {
          console.log(JSON.stringify({ runId, clusters, weakest }, null, 2));
          return;
        }

        console.log(header(`Optimization Analysis: ${opts.skill} (Run #${runId})`));
        if (clusters.length === 0) {
          console.log(chalk5.green("\n No failures to cluster. All criteria passed."));
        } else {
          console.log("\n Failure Clusters:");
          for (const c of clusters) {
            const color = c.severity === "critical" ? chalk5.red : c.severity === "high" ? chalk5.yellow : chalk5.dim;
            console.log(
              ` ${color(`[${c.severity.toUpperCase()}]`)} ${c.pattern} (${c.criterion_ids.length} criteria)`
            );
            for (const id of c.criterion_ids) {
              console.log(` - ${id}`);
            }
          }
        }
        if (weakest != null) {
          console.log(`\nWeakest Criterion: ${chalk5.red.bold(weakest)}`);
          console.log(chalk5.dim(" Focus improvement efforts here first."));
        } else {
          console.log(chalk5.dim("\n No single weakest criterion identified."));
        }
      } catch (err) {
        console.error(`Error: ${err instanceof Error ? err.message : err}`);
        process.exit(1);
      }
    });
}
|
|
3295
|
+
|
|
3296
|
+
// src/commands/drift.ts
|
|
3297
|
+
import chalk6 from "chalk";
|
|
3298
|
+
/**
 * Register the `drift` subcommand on a commander program.
 *
 * Looks up the most recent run of `--skill` and reports whether it is older
 * than `--max-age` days (as decided by `needsReevaluation`). With no recorded
 * runs it reports "no_data" and exits with code 0. Output is text or JSON.
 *
 * @param {object} program - commander `Command` the subcommand is attached to.
 * @returns {void}
 */
function registerDriftCommand(program) {
  // commander calls option parsers as (value, previousValue). With bare
  // `parseInt` and a default of 30 the value was parsed in base 30
  // (`--max-age 45` became 125), so the radix is pinned here.
  const parseBase10 = (value) => Number.parseInt(value, 10);

  program
    .command("drift")
    .description("Check if a skill needs reevaluation")
    .requiredOption("--skill <name>", "Skill name to check")
    .option("--db <path>", "SQLite DB path", "j-rig.db")
    .option("--max-age <days>", "Days before flagging stale", parseBase10, 30)
    .option("--json", "Output as JSON")
    .action(async (opts) => {
      try {
        if (Number.isNaN(opts.maxAge)) {
          throw new Error("--max-age must be an integer number of days");
        }

        const database = openDb(opts.db);
        const runs2 = getRecentRuns(database, { skillName: opts.skill, limit: 2 });
        if (runs2.length === 0) {
          if (opts.json) {
            console.log(JSON.stringify({ skill: opts.skill, status: "no_data", stale: true }));
          } else {
            console.log(header(`Drift Check: ${opts.skill}`));
            console.log(chalk6.yellow("\n No evaluation history found."));
            console.log(chalk6.dim(" Run: j-rig eval <skill-dir>"));
          }
          process.exit(0);
          return;
        }

        const latest = runs2[0].runs;
        // A run without a creation timestamp is treated as created "now".
        const createdAt = latest.created_at ?? (/* @__PURE__ */ new Date()).toISOString();
        const stale = needsReevaluation(createdAt, opts.maxAge);
        // 864e5 ms = one day.
        const daysAgo = Math.floor((Date.now() - new Date(createdAt).getTime()) / 864e5);

        if (opts.json) {
          console.log(
            JSON.stringify(
              {
                skill: opts.skill,
                lastEval: createdAt,
                daysAgo,
                stale,
                maxAge: opts.maxAge,
                status: latest.status
              },
              null,
              2
            )
          );
          return;
        }

        console.log(header(`Drift Check: ${opts.skill}`));
        console.log(`\nLast Eval: ${createdAt.slice(0, 10)} (${daysAgo} days ago)`);
        console.log(
          ` Status: ${stale ? chalk6.red("STALE") : chalk6.green("CURRENT")} (threshold: ${opts.maxAge} days)`
        );
        console.log(` Last Run: #${latest.id} (${latest.status})`);
        if (stale) {
          console.log(chalk6.yellow("\n Recommendation: Re-run evaluation with `j-rig eval`"));
        }
      } catch (err) {
        console.error(`Error: ${err instanceof Error ? err.message : err}`);
        process.exit(1);
      }
    });
}
|
|
3351
|
+
|
|
3352
|
+
// src/commands/emit-evidence.ts
|
|
3353
|
+
import { readFileSync as readFileSync4, writeFileSync, mkdirSync, existsSync as existsSync3, mkdtempSync, rmSync } from "fs";
|
|
3354
|
+
import { dirname, resolve as resolve5, join as join3 } from "path";
|
|
3355
|
+
import { tmpdir } from "os";
|
|
3356
|
+
import { execSync, spawnSync } from "child_process";
|
|
3357
|
+
import { createHash as createHash3 } from "crypto";
|
|
3358
|
+
// Literal "NOT_APPLICABLE" decision value. The --gate-decision help text
// describes it as a backward-compat input that routes to
// coverage.dimensions_skipped.
// NOTE(review): the code that consumes this constant is outside this excerpt — confirm.
var NOT_APPLICABLE_SENTINEL = "NOT_APPLICABLE";
|
|
3359
|
+
// Runner identifier used by buildComposeInput when --runner-version is not given.
// NOTE(review): the --runner-version help text advertises "j-rig@<package version>",
// but this is a fixed dev placeholder — confirm whether a build-time substitution is intended.
var DEFAULT_RUNNER_VERSION = "j-rig@0.0.0-dev";
|
|
3360
|
+
/**
 * Register the `emit-evidence` subcommand on a commander program.
 *
 * The action builds a compose input (from direct-mode flags or a gate-result
 * JSON envelope), composes the Statement, optionally writes an OTEL-style
 * event line to stderr, and then either:
 *   - writes the unsigned Statement (or only its predicate with
 *     `--predicate-body-only`) and exits 0, or
 *   - when any of --sign / --key / --keyless / --rekor-url is present, hands
 *     off to `signAndEmit` and exits with its return code.
 * Any thrown error is reported on stderr and exits with code 1.
 *
 * @param {object} program - commander `Command` the subcommand is attached to.
 * @returns {void}
 */
function registerEmitEvidenceCommand(program) {
  // Accumulator for repeatable options. It returns a new array instead of
  // pushing into `acc`: commander passes the option's default array as `acc`
  // on first use, and mutating it would leak values into the default (and
  // into any later parse that reuses this command).
  const collect = (val, acc) => [...acc, val];

  program
    .command("emit-evidence")
    .description(
      "Wrap a gate-result envelope into a signed in-toto Statement v1 (https://evals.intentsolutions.io/gate-result/v1)"
    )
    .option("--input <path>", "Read gate-result JSON from <path> instead of stdin")
    .option("--output <path>", "Write Statement to <path> instead of stdout")
    .option(
      "--runner-version <ver>",
      'Override runner identifier (default: "j-rig@<package version>")'
    )
    .option("--commit-sha <sha>", "Override commit SHA (default: git rev-parse HEAD)")
    .option("--gate-id <id>", "Direct mode: gate id (e.g. 'j-rig:server:MM-1')")
    .option(
      "--gate-decision <d>",
      "Direct mode: pass|fail|advisory|error (or NOT_APPLICABLE for backward-compat; routes to coverage.dimensions_skipped)"
    )
    .option(
      "--gate-name <name>",
      "Direct mode: gate name in lowercase kebab-case (e.g. 'coverage-check')"
    )
    .option("--gate-version <ver>", "Direct mode: gate SemVer (e.g. '2.0.0')")
    .option(
      "--gate-reason <reason>",
      "Direct mode: reason string (repeatable; at least one for non-pass decisions)",
      collect,
      []
    )
    .option(
      "--coverage-evaluated <dim>",
      "Direct mode: dimension that was evaluated (repeatable)",
      collect,
      []
    )
    .option(
      "--coverage-skipped <dim>",
      "Direct mode: dimension that was skipped / not applicable (repeatable)",
      collect,
      []
    )
    .option("--policy-ref <ref>", "Direct mode: policy reference sha256:<hex>:<path>")
    .option("--input-hash <h>", "Direct mode: sha256:<64-hex>")
    .option("--policy-hash <h>", "Direct mode: sha256:<64-hex>")
    .option("--failure-mode <m>", "Direct mode: failure_mode (when gate-decision=fail)")
    .option("--advisory-severity <s>", "Direct mode: info|warn|error (when gate-decision=advisory)")
    .option("--metadata <json>", "Direct mode: free-form metadata as a JSON object string")
    .option(
      "--sign",
      "Sign the Statement via cosign (requires --key OR --keyless). Without this flag, emits unsigned Statement."
    )
    .option("--key <ref>", "cosign key reference (file path, KMS URI, etc). Implies --sign.")
    .option(
      "--keyless",
      "cosign keyless signing via Fulcio OIDC (requires terminal). Implies --sign."
    )
    .option(
      "--rekor-url [url]",
      "Push the signed attestation to Rekor at <url> (defaults to https://rekor.sigstore.dev when flag is used without a value). Implies --sign."
    )
    .option(
      "--predicate-body-only",
      "Plain (unsigned) mode: emit ONLY the predicate body instead of the full v1 Statement. The signing path ALWAYS sends the predicate body to cosign (which wraps it in its own Statement envelope) unless --full-statement is given."
    )
    .option(
      "--full-statement",
      "Signing mode: pass the full pre-formed in-toto Statement to cosign's --predicate instead of the predicate body. cosign attest-blob will then NEST it inside its own Statement (double-wrapped); only for consumers that expect the nested form."
    )
    .option("--cosign-bin <path>", "Path to cosign binary (default: cosign on PATH).", "cosign")
    .option(
      "--artifact <path>",
      "Path to the artifact whose sha256 must equal predicate.input_hash. Required when --sign is requested so the DSSE envelope's subject digest is cryptographically bound to the gate's input. Without this, the link between attestation and artifact cannot be verified by standard tooling."
    )
    .action(async (opts) => {
      try {
        const composed = await buildComposeInput(opts);
        const statement = composeStatement(composed);

        // Opt-in event line on stderr, enabled by either environment variable.
        if (process.env.AUDIT_HARNESS_OTEL === "1" || process.env.OTEL_EXPORTER_OTLP_ENDPOINT) {
          const evt = {
            name: "agent.rollout.gate.evaluated",
            attributes: {
              "gate.id": composed.gateId,
              "gate.decision": composed.gateDecision,
              "gate.runner": composed.runner,
              "gate.commit_sha": composed.commitSha
            },
            timestamp: statement.predicate.evaluated_at
          };
          process.stderr.write(`[OTEL] ${JSON.stringify(evt)}\n`);
        }

        // --key, --keyless and --rekor-url each imply --sign.
        const wantsSigning = opts.sign === true || opts.key !== void 0 || opts.keyless === true || opts.rekorUrl !== void 0;
        if (!wantsSigning) {
          const out = opts.predicateBodyOnly ? JSON.stringify(statement.predicate) : serializeStatement(statement);
          writeOut(out, opts);
          process.exit(0);
        }
        const exitCode = signAndEmit(statement, composed, opts);
        process.exit(exitCode);
      } catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        process.stderr.write(`j-rig emit-evidence: ${msg}\n`);
        process.exit(1);
      }
    });
}
|
|
3448
|
+
/**
 * Emit `content` followed by a newline.
 *
 * With `opts.output` set, the content is written to that file (creating the
 * parent directory when it does not exist) and a confirmation line goes to
 * stderr; otherwise the content goes to stdout.
 *
 * @param {string} content - Text to emit, without trailing newline.
 * @param {{output?: string}} opts - Command options; only `output` is read.
 * @returns {void}
 */
function writeOut(content, opts) {
  const payload = content + "\n";
  if (!opts.output) {
    process.stdout.write(payload);
    return;
  }

  const target = resolve5(opts.output);
  const parentDir = dirname(target);
  if (!existsSync3(parentDir)) {
    mkdirSync(parentDir, { recursive: true });
  }
  writeFileSync(target, payload);
  process.stderr.write(`emit-evidence: wrote ${target}\n`);
}
|
|
3459
|
+
/**
 * Sign the statement with `cosign attest-blob` and emit the resulting signature.
 *
 * Pre-flight checks (each failure prints a message to stderr and returns 1):
 * a key or keyless mode must be chosen, --full-statement and
 * --predicate-body-only must not be combined, --artifact must be given, must
 * exist, and its sha256 must equal `composed.inputHash`.
 *
 * The predicate (or the whole statement with --full-statement) is written to
 * a temporary directory, cosign is run synchronously against the artifact,
 * and the signature file it produces is passed to `writeOut`. The temporary
 * directory is always removed.
 *
 * @param {object} statement - Composed statement; `statement.predicate` is read.
 * @param {object} composed - Compose input; `composed.inputHash` is read.
 * @param {object} opts - Command options (key, keyless, rekorUrl, artifact,
 *   fullStatement, predicateBodyOnly, cosignBin, output).
 * @returns {number} 0 on success, 1 on a failed pre-flight check, 2 when
 *   cosign could not be spawned, 3 when cosign exited non-zero.
 */
function signAndEmit(statement, composed, opts) {
  // Print a diagnostic and hand back the exit code in one step.
  const fail = (code, message) => {
    process.stderr.write(message);
    return code;
  };

  if (!(opts.key || opts.keyless)) {
    return fail(1, "j-rig emit-evidence: --sign requires --key <ref> OR --keyless\n");
  }
  if (opts.fullStatement && opts.predicateBodyOnly) {
    return fail(
      1,
      "j-rig emit-evidence: --full-statement and --predicate-body-only are mutually exclusive\n"
    );
  }
  if (!opts.artifact) {
    return fail(
      1,
      "j-rig emit-evidence: --sign requires --artifact <path> pointing at the file whose sha256 equals predicate.input_hash. Without --artifact the attestation's subject digest cannot match the predicate, breaking standard verification.\n"
    );
  }

  const artifactAbs = resolve5(opts.artifact);
  if (!existsSync3(artifactAbs)) {
    return fail(1, `j-rig emit-evidence: --artifact path does not exist: ${artifactAbs}\n`);
  }

  // The artifact handed to cosign must be the exact bytes the gate hashed.
  const digest = createHash3("sha256").update(readFileSync4(artifactAbs)).digest("hex");
  const actualHash = `sha256:${digest}`;
  if (actualHash !== composed.inputHash) {
    const lines = [
      "j-rig emit-evidence: --artifact sha256 mismatch:",
      `computed: ${actualHash}`,
      `predicate.input_hash: ${composed.inputHash}`,
      "The artifact passed to --sign must be the exact file whose hash the gate recorded.",
      ""
    ];
    return fail(1, lines.join("\n"));
  }

  // `--rekor-url` without a value arrives as `true` and selects the public instance.
  let rekorUrlStr;
  if (opts.rekorUrl === true) {
    rekorUrlStr = "https://rekor.sigstore.dev";
  } else if (typeof opts.rekorUrl === "string") {
    rekorUrlStr = opts.rekorUrl;
  }

  const workDir = mkdtempSync(join3(tmpdir(), "j-rig-emit-evidence-"));
  try {
    const predicatePath = join3(workDir, "predicate.json");
    const predicateSource = opts.fullStatement ? statement : statement.predicate;
    writeFileSync(predicatePath, JSON.stringify(predicateSource, null, 2));

    const sigPath = join3(workDir, "attestation.sig");
    // Transparency-log upload is on for keyless signing or when a Rekor URL was given.
    const uploadToTlog = rekorUrlStr || opts.keyless ? "true" : "false";

    let credentialArgs = [];
    if (opts.key) {
      credentialArgs = ["--key", opts.key];
    } else if (opts.keyless) {
      credentialArgs = ["--yes"];
    }
    const rekorArgs = rekorUrlStr ? ["--rekor-url", rekorUrlStr] : [];

    const args = [
      "attest-blob",
      "--predicate",
      predicatePath,
      "--type",
      "https://evals.intentsolutions.io/gate-result/v1",
      "--output-signature",
      sigPath,
      `--tlog-upload=${uploadToTlog}`,
      ...credentialArgs,
      ...rekorArgs,
      artifactAbs
    ];

    const cosignBin = opts.cosignBin ?? "cosign";
    const result = spawnSync(cosignBin, args, {
      env: process.env,
      stdio: ["inherit", "pipe", "pipe"]
    });

    if (result.error) {
      return fail(
        2,
        `j-rig emit-evidence: failed to spawn cosign (${cosignBin}): ${result.error.message}\n`
      );
    }
    if (result.status !== 0) {
      return fail(
        3,
        `j-rig emit-evidence: cosign signing failed (exit ${result.status}):\n${result.stderr.toString()}\n`
      );
    }

    const sig = readFileSync4(sigPath, "utf-8").trim();
    writeOut(sig, opts);
    const rekorNote = rekorUrlStr ? ` (Rekor: ${rekorUrlStr})` : "";
    process.stderr.write(`emit-evidence: signed envelope emitted${rekorNote}\n`);
    return 0;
  } finally {
    rmSync(workDir, { recursive: true, force: true });
  }
}
|
|
3546
|
+
/**
 * Assemble the compose input for `emit-evidence`.
 *
 * Two sources are supported:
 *  - direct mode: selected as soon as any gate/policy flag is set; every
 *    required flag must then be present;
 *  - pipeline mode: a gate-result JSON envelope read through `readInputJson`
 *    (`--input <path>` or stdin), which must carry the v2 fields.
 *
 * @param {object} opts Parsed CLI options.
 * @returns {Promise<object>} Compose input (gateId, gateDecision, coverage, ...).
 * @throws {Error} When direct-mode flags are incomplete, no input is received,
 *   the input is not valid JSON, the JSON value is not an object, or required
 *   v2 fields are missing.
 */
async function buildComposeInput(opts) {
  const runner = opts.runnerVersion ?? DEFAULT_RUNNER_VERSION;
  const commitSha = opts.commitSha ?? safeGitHead();

  // Option key -> CLI flag, in the order the flags are reported when missing.
  const directFlags = [
    ["gateId", "--gate-id"],
    ["gateDecision", "--gate-decision"],
    ["inputHash", "--input-hash"],
    ["policyHash", "--policy-hash"],
    ["gateName", "--gate-name"],
    ["gateVersion", "--gate-version"],
    ["policyRef", "--policy-ref"]
  ];
  if (directFlags.some(([key]) => opts[key])) {
    const missingFlags = directFlags.filter(([key]) => !opts[key]).map(([, flag]) => flag);
    if (missingFlags.length) {
      throw new Error(`direct mode requires: ${missingFlags.join(", ")}`);
    }
    return buildFromDirectFlags(opts, runner, commitSha);
  }

  const raw = await readInputJson(opts.input);
  if (!raw) {
    throw new Error(
      "no input received \u2014 pipe a gate-result JSON envelope on stdin OR pass --input <path> OR use direct-mode flags"
    );
  }
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    throw new Error(`input is not valid JSON: ${err.message}`);
  }
  // JSON.parse also accepts scalars, null and arrays. The `in` checks below
  // would raise a bare TypeError on primitives, so reject anything that is
  // not a JSON object with an explicit message.
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    const got = parsed === null ? "null" : Array.isArray(parsed) ? "array" : typeof parsed;
    throw new Error(`input must be a JSON object (got ${got})`);
  }

  const required = ["gate_id", "input_hash", "policy_hash"];
  const missing = required.filter((k) => !(k in parsed));
  // A v1 `result` field is accepted in place of `gate_decision`.
  if (!("gate_decision" in parsed) && !("result" in parsed)) {
    missing.push("gate_decision");
  }
  if (!("gate_name" in parsed)) missing.push("gate_name");
  if (!("gate_version" in parsed)) missing.push("gate_version");
  if (!("gate_reasons" in parsed)) missing.push("gate_reasons");
  if (!("policy_ref" in parsed)) missing.push("policy_ref");
  if (missing.length) {
    throw new Error(
      `gate-result envelope missing required v2 field(s): ${missing.join(", ")}. A v1-shaped envelope (lacking gate_name/gate_version/gate_reasons/policy_ref) must be re-emitted via the gate that produced it with the --gate-name/--gate-version/--gate-reasons/--policy-ref flags set. Pipeline mode will not synthesize these fields because doing so produces fabricated provenance in the signed statement.`
    );
  }

  const rawDecision = "gate_decision" in parsed ? String(parsed.gate_decision) : mapV1ResultToV2Decision(String(parsed.result ?? ""));
  const { gateDecision, extraSkipped, extraReasons } = resolveDecision(rawDecision);

  // Coverage may arrive nested (`coverage.dimensions_*`) or flat (`coverage_*`).
  const coverageObj = parsed.coverage && typeof parsed.coverage === "object" ? parsed.coverage : void 0;
  const coverageEvaluated = Array.isArray(coverageObj?.dimensions_evaluated) ? coverageObj.dimensions_evaluated : Array.isArray(parsed.coverage_evaluated) ? parsed.coverage_evaluated : [];
  const coverageSkipped = [
    ...extraSkipped,
    ...Array.isArray(coverageObj?.dimensions_skipped) ? coverageObj.dimensions_skipped : Array.isArray(parsed.coverage_skipped) ? parsed.coverage_skipped : []
  ];
  const gateReasons = [
    ...Array.isArray(parsed.gate_reasons) ? parsed.gate_reasons : [],
    ...extraReasons
  ];
  return {
    gateId: String(parsed.gate_id),
    gateDecision,
    gateName: String(parsed.gate_name),
    gateVersion: String(parsed.gate_version),
    gateReasons,
    coverage: { dimensionsEvaluated: coverageEvaluated, dimensionsSkipped: coverageSkipped },
    policyRef: String(parsed.policy_ref),
    policyHash: String(parsed.policy_hash),
    inputHash: String(parsed.input_hash),
    runner,
    commitSha,
    metadata: parsed.metadata,
    failureMode: parsed.failure_mode,
    advisorySeverity: typeof parsed.advisory_severity === "string" ? parseSeverity(parsed.advisory_severity) : void 0
  };
}
|
|
3618
|
+
/**
 * Build the compose input from direct-mode CLI flags.
 *
 * The raw decision goes through `resolveDecision`; any skipped dimensions it
 * contributes are placed before the user-supplied ones, and any reasons it
 * contributes are placed after the user-supplied ones.
 *
 * @param {object} opts Parsed CLI options.
 * @param {string} runner Runner identifier to embed.
 * @param {string} commitSha Commit SHA to embed.
 * @returns {object} Compose input.
 */
function buildFromDirectFlags(opts, runner, commitSha) {
  const resolved = resolveDecision(opts.gateDecision);
  const userSkipped = opts.coverageSkipped ?? [];
  const userReasons = opts.gateReason ?? [];
  const coverage = {
    dimensionsEvaluated: opts.coverageEvaluated ?? [],
    dimensionsSkipped: [...resolved.extraSkipped, ...userSkipped]
  };
  const gateReasons = [...userReasons, ...resolved.extraReasons];

  let metadata = void 0;
  if (opts.metadata) {
    metadata = parseMetadata(opts.metadata);
  }
  let advisorySeverity = void 0;
  if (opts.advisorySeverity) {
    advisorySeverity = parseSeverity(opts.advisorySeverity);
  }

  return {
    gateId: opts.gateId,
    gateDecision: resolved.gateDecision,
    gateName: opts.gateName,
    gateVersion: opts.gateVersion,
    gateReasons,
    coverage,
    policyRef: opts.policyRef,
    policyHash: opts.policyHash,
    inputHash: opts.inputHash,
    runner,
    commitSha,
    metadata,
    failureMode: opts.failureMode,
    advisorySeverity
  };
}
|
|
3643
|
+
// Token added to the skipped dimensions when a NOT_APPLICABLE decision is routed.
var NOT_APPLICABLE_SKIPPED_TOKEN = "__not_applicable__";

/**
 * Turn a raw decision string into a gate decision plus routing extras.
 *
 * The NOT_APPLICABLE sentinel (matched as-is or after upper-casing) is emitted
 * as "pass" together with a skipped-dimension token and an explanatory reason.
 * Every other value is validated by `parseDecision`.
 *
 * @param {string} raw Raw decision value.
 * @returns {{gateDecision: string, extraSkipped: string[], extraReasons: string[]}}
 */
function resolveDecision(raw) {
  const isNotApplicable = raw === NOT_APPLICABLE_SENTINEL || raw.toUpperCase() === NOT_APPLICABLE_SENTINEL;
  if (!isNotApplicable) {
    return { gateDecision: parseDecision(raw), extraSkipped: [], extraReasons: [] };
  }
  const routed = {
    gateDecision: "pass",
    extraSkipped: [NOT_APPLICABLE_SKIPPED_TOKEN],
    extraReasons: ["routed from NOT_APPLICABLE per DR-018 \xA7279 \u2014 non-verdict, not a pass"]
  };
  return routed;
}
|
|
3654
|
+
/**
 * Translate a v1 `result` string into the v2 decision vocabulary.
 *
 * Matching is case-insensitive. Unrecognised values are returned lower-cased
 * so that later validation can reject them.
 *
 * @param {string} v1 The v1 result value.
 * @returns {string} The v2 decision, or the NOT_APPLICABLE sentinel.
 */
function mapV1ResultToV2Decision(v1) {
  const upper = v1.toUpperCase();
  if (upper === "PASS") {
    return "pass";
  }
  if (upper === "FAIL") {
    return "fail";
  }
  if (upper === "ADVISORY") {
    return "advisory";
  }
  if (upper === "NOT_APPLICABLE") {
    return NOT_APPLICABLE_SENTINEL;
  }
  return v1.toLowerCase();
}
|
|
3668
|
+
/**
 * Validate a gate decision string against `GateResultEnum`.
 *
 * @param {string} s Candidate decision.
 * @returns {string} The validated decision as returned by the enum parser.
 * @throws {Error} When the value is not one of the enum options.
 */
function parseDecision(s) {
  const outcome = GateResultEnum.safeParse(s);
  if (outcome.success) {
    return outcome.data;
  }
  const allowed = GateResultEnum.options.join(", ");
  throw new Error(`invalid gate_decision '${s}' (expected one of: ${allowed})`);
}
|
|
3677
|
+
/**
 * Validate an advisory severity string against `AdvisorySeverityEnum`.
 *
 * @param {string} s Candidate severity.
 * @returns {string} The validated severity as returned by the enum parser.
 * @throws {Error} When the value is not one of the enum options.
 */
function parseSeverity(s) {
  const outcome = AdvisorySeverityEnum.safeParse(s);
  if (outcome.success) {
    return outcome.data;
  }
  const allowed = AdvisorySeverityEnum.options.join(", ");
  throw new Error(`invalid advisory_severity '${s}' (expected one of: ${allowed})`);
}
|
|
3686
|
+
/**
 * Parse the `--metadata` flag value, which must be a JSON object.
 *
 * @param {string} s Raw flag value.
 * @returns {object} The parsed object.
 * @throws {Error} When the text is not valid JSON, or parses to null, an
 *   array, or a primitive. All failures share the same message prefix.
 */
function parseMetadata(s) {
  const reject = (reason) => new Error(`--metadata is not a valid JSON object: ${reason}`);
  let candidate;
  try {
    candidate = JSON.parse(s);
  } catch (err) {
    throw reject(err.message);
  }
  const isPlainObject = candidate !== null && typeof candidate === "object" && !Array.isArray(candidate);
  if (!isPlainObject) {
    throw reject("metadata must be a JSON object");
  }
  return candidate;
}
|
|
3697
|
+
/**
 * Read the raw gate-result text.
 *
 * A given path is read from disk as UTF-8. Without a path, an interactive
 * (TTY) stdin yields an empty string; otherwise stdin is read to its end.
 *
 * @param {string} [inputPath] Path passed via `--input`.
 * @returns {Promise<string>} The raw text, possibly empty.
 */
async function readInputJson(inputPath) {
  if (inputPath) {
    const absolute = resolve5(inputPath);
    return readFileSync4(absolute, "utf-8");
  }
  const { stdin } = process;
  if (stdin.isTTY) {
    return "";
  }
  return new Promise((resolveP, rejectP) => {
    const received = [];
    const onData = (piece) => {
      received.push(Buffer.from(piece));
    };
    const onEnd = () => {
      resolveP(Buffer.concat(received).toString("utf-8"));
    };
    stdin.on("data", onData);
    stdin.on("end", onEnd);
    stdin.on("error", rejectP);
  });
}
|
|
3711
|
+
/**
 * Resolve the current commit via `git rev-parse HEAD`.
 *
 * If the command fails, a warning is written to stderr and the sentinel
 * "0000000" is returned instead of throwing.
 *
 * @returns {string} The trimmed command output, or "0000000".
 */
function safeGitHead() {
  let head;
  try {
    const stdout = execSync("git rev-parse HEAD", { stdio: ["ignore", "pipe", "ignore"] });
    head = stdout.toString().trim();
  } catch {
    process.stderr.write(
      "j-rig emit-evidence: warning: could not resolve git HEAD (not a git repository?); embedding sentinel commit_sha '0000000' \u2014 pass --commit-sha to record the real commit\n"
    );
    head = "0000000";
  }
  return head;
}
|
|
3721
|
+
|
|
3722
|
+
// src/commands/parse-agents.ts
|
|
3723
|
+
import { readFileSync as readFileSync5 } from "fs";
|
|
3724
|
+
import { resolve as resolve6 } from "path";
|
|
3725
|
+
import chalk7 from "chalk";
|
|
3726
|
+
/**
 * Register the `parse-agents` subcommand on the given program.
 *
 * The action exits the process with the code returned by `runParseAgents`.
 *
 * @param {object} program Command builder to attach the subcommand to.
 */
function registerParseAgentsCommand(program) {
  const onAction = (file, opts) => {
    const exitCode = runParseAgents(file, opts);
    process.exit(exitCode);
  };
  program
    .command("parse-agents")
    .description("Parse an AGENTS.md file into a typed structure")
    .argument("[file]", "Path to AGENTS.md", "AGENTS.md")
    .option("--json", "Output the full parsed structure as JSON")
    .action(onAction);
}
|
|
3731
|
+
/**
 * Parse an AGENTS.md file and print either a JSON report or a short summary.
 *
 * @param {string} file Path to the file to parse.
 * @param {{json?: boolean}} opts Command options.
 * @returns {number} 0 on success; 1 when the file cannot be read or parsed.
 */
function runParseAgents(file, opts) {
  let text;
  try {
    text = readFileSync5(resolve6(file), "utf-8");
  } catch (err) {
    console.error(`Error: ${err instanceof Error ? err.message : err}`);
    return 1;
  }

  const parsed = parseAgentsMd(text);
  const ok = parsed.success;

  if (opts.json) {
    const payload = ok ? { ok, data: parsed.data, errors: [] } : { ok, data: null, errors: parsed.errors };
    console.log(JSON.stringify(payload, null, 2));
    return ok ? 0 : 1;
  }

  if (!ok) {
    console.error(chalk7.red(`\u2717 Failed to parse ${file}:`));
    parsed.errors.forEach((issue) => {
      console.error(` ${issue.path || "<root>"}: ${issue.message}`);
    });
    return 1;
  }

  const doc = parsed.data;
  console.log(chalk7.green(`\u2713 Parsed ${file}`));
  console.log(` Title: ${doc.title || chalk7.dim("(none)")}`);
  console.log(` Sections: ${doc.sections.length}`);

  const kinds = Object.keys(doc.commands);
  if (kinds.length > 0) {
    console.log(` Commands:`);
    kinds.forEach((kind) => {
      const count = (doc.commands[kind] ?? []).length;
      const suffix = count === 1 ? "" : "s";
      console.log(` ${kind}: ${count} command${suffix}`);
    });
  }

  // Optional sections are listed only when non-empty.
  if (doc.tools.length > 0) {
    console.log(` Tools: ${doc.tools.length}`);
  }
  if (doc.capabilities.length > 0) {
    console.log(` Capabilities: ${doc.capabilities.length}`);
  }
  if (doc.constraints.length > 0) {
    console.log(` Constraints: ${doc.constraints.length}`);
  }
  return 0;
}
|
|
3778
|
+
|
|
3779
|
+
// src/commands/migrate.ts
|
|
3780
|
+
import { resolve as resolve7 } from "path";
|
|
3781
|
+
import chalk8 from "chalk";
|
|
3782
|
+
|
|
3783
|
+
// ../migrate/dist/index.js
|
|
3784
|
+
import { readFileSync as readFileSync6, writeFileSync as writeFileSync2, readdirSync, statSync as statSync2 } from "fs";
|
|
3785
|
+
import { join as join4 } from "path";
|
|
3786
|
+
// Token that migratePredicate pushes into coverage.dimensions_skipped when a
// v1 row carried the result NOT_APPLICABLE.
var NOT_APPLICABLE_TOKEN = "__not_applicable__";
// Reason string appended to gate_reasons for a routed NOT_APPLICABLE row.
var NOT_APPLICABLE_REASON = "routed from NOT_APPLICABLE per DR-018 \xA7279 \u2014 non-verdict, not a pass";
// Reason string added when a migrated non-pass row has no reasons of its own.
var MIGRATED_NON_PASS_REASON = "migrated from v1 result field \u2014 original row predated gate_reasons requirement";
// v1 `result` value -> v2 `gate_decision`. NOT_APPLICABLE is intentionally not
// listed here; migratePredicate handles it as a separate branch.
var V1_DECISION_MAP = {
  PASS: "pass",
  FAIL: "fail",
  ADVISORY: "advisory"
};
|
|
3794
|
+
/**
 * Report whether a value is a non-null, non-array object.
 *
 * @param {unknown} v Value to test.
 * @returns {boolean} True for objects other than null and arrays.
 */
function isObject(v) {
  if (v === null || Array.isArray(v)) {
    return false;
  }
  return typeof v === "object";
}
|
|
3797
|
+
/**
 * Migrate a parsed fixture, which may take one of three shapes:
 * a bare array of statements, an object with a `rows` array, or a single
 * statement.
 *
 * @param {unknown} input Parsed JSON value.
 * @returns {{migrated: unknown, rows: object[], changed: boolean}} The
 *   migrated value in the same shape as the input, one report per statement,
 *   and whether any statement was actually migrated.
 */
function migrateBundle(input) {
  // Migrates every entry of a list and collects the per-entry reports.
  const migrateList = (list) => {
    const reports = [];
    const values = list.map((entry, position) => {
      const outcome = migrateStatement(entry, position);
      reports.push(outcome.report);
      return outcome.value;
    });
    const changed = reports.some((r) => r.outcome === "migrated");
    return { values, reports, changed };
  };

  if (Array.isArray(input)) {
    const { values, reports, changed } = migrateList(input);
    return { migrated: values, rows: reports, changed };
  }

  if (isObject(input) && Array.isArray(input.rows)) {
    const { values, reports, changed } = migrateList(input.rows);
    return { migrated: { ...input, rows: values }, rows: reports, changed };
  }

  const { value, report } = migrateStatement(input, 0);
  return {
    migrated: value,
    rows: [report],
    changed: report.outcome === "migrated"
  };
}
|
|
3824
|
+
/**
 * Migrate one statement and describe what happened to it.
 *
 * Reported outcomes: "not-a-statement" (no object predicate), "already-v2"
 * (predicate has `gate_decision`), "error" (no `result`, or the predicate
 * could not be migrated) and "migrated". The input is returned unchanged for
 * every outcome except "migrated".
 *
 * @param {unknown} input Candidate statement.
 * @param {number} index Position of the statement within its bundle.
 * @returns {{value: unknown, report: {index: number, outcome: string, gateId: (string|null), note: (string|null)}}}
 */
function migrateStatement(input, index) {
  const untouched = (outcome, gateId, note) => ({
    value: input,
    report: { index, outcome, gateId, note }
  });

  if (!isObject(input) || !isObject(input.predicate)) {
    return untouched("not-a-statement", null, null);
  }

  const { predicate } = input;
  const gateId = typeof predicate.gate_id === "string" ? predicate.gate_id : null;

  if ("gate_decision" in predicate) {
    return untouched("already-v2", gateId, null);
  }
  if (!("result" in predicate)) {
    return untouched(
      "error",
      gateId,
      "predicate has neither gate_decision (v2) nor result (v1) \u2014 cannot migrate"
    );
  }

  const outcome = migratePredicate(predicate);
  if (outcome.error !== null) {
    return untouched("error", gateId, outcome.error);
  }
  return {
    value: { ...input, predicate: outcome.predicate },
    report: { index, outcome: "migrated", gateId, note: outcome.note }
  };
}
|
|
3862
|
+
/**
 * Convert a v1 predicate (carrying `result`) into the v2 shape.
 *
 * NOT_APPLICABLE is emitted as a "pass" decision with a skipped-dimension
 * token and an explanatory reason. PASS/FAIL/ADVISORY are mapped through
 * `V1_DECISION_MAP`; any other value is reported as an error. A non-pass row
 * without string reasons receives a placeholder reason. `timestamp` is
 * carried over as `evaluated_at`.
 *
 * @param {object} v1 The v1 predicate.
 * @returns {{predicate: object, error: (string|null), note: (string|null)}}
 *   On error, `predicate` is the untouched input and `error` explains why.
 */
function migratePredicate(v1) {
  const rawResult = v1.result;
  if (typeof rawResult !== "string") {
    return {
      predicate: v1,
      error: `v1 'result' must be a string (got ${typeof rawResult})`,
      note: null
    };
  }
  // Copy so the reasons pushed below never mutate the caller's array.
  const gateReasons = Array.isArray(v1.gate_reasons) ? v1.gate_reasons.filter((r) => typeof r === "string") : [];
  const dimsEvaluated = [];
  const dimsSkipped = [];
  let note = null;
  let gateDecision;
  if (rawResult === "NOT_APPLICABLE") {
    gateDecision = "pass";
    dimsSkipped.push(NOT_APPLICABLE_TOKEN);
    gateReasons.push(NOT_APPLICABLE_REASON);
    note = "routed NOT_APPLICABLE via coverage.dimensions_skipped";
  } else {
    // Own-property check: a plain lookup would also resolve inherited keys
    // such as "constructor", "toString" or "__proto__" to non-undefined
    // values and let them through as a "known" result.
    if (!Object.prototype.hasOwnProperty.call(V1_DECISION_MAP, rawResult)) {
      return {
        predicate: v1,
        error: `unknown v1 result value '${rawResult}' (expected PASS|FAIL|ADVISORY|NOT_APPLICABLE)`,
        note: null
      };
    }
    gateDecision = V1_DECISION_MAP[rawResult];
    if (gateDecision !== "pass" && gateReasons.length === 0) {
      gateReasons.push(MIGRATED_NON_PASS_REASON);
    }
  }
  const gateId = typeof v1.gate_id === "string" ? v1.gate_id : "";
  const runner = typeof v1.runner === "string" ? v1.runner : "";
  const v2 = {
    gate_id: gateId,
    gate_name: deriveGateName(gateId),
    gate_version: deriveGateVersion(runner),
    gate_decision: gateDecision,
    gate_reasons: gateReasons,
    coverage: {
      dimensions_evaluated: dimsEvaluated,
      dimensions_skipped: dimsSkipped
    },
    policy_ref: derivePolicyRef(v1.policy_hash),
    policy_hash: v1.policy_hash,
    input_hash: v1.input_hash,
    // RENAME: timestamp → evaluated_at (value unchanged).
    evaluated_at: v1.timestamp,
    runner: v1.runner,
    commit_sha: v1.commit_sha
  };
  // Optional fields are copied only when the v1 predicate carried them.
  if ("metadata" in v1) v2.metadata = v1.metadata;
  if ("failure_mode" in v1) v2.failure_mode = v1.failure_mode;
  if ("advisory_severity" in v1) v2.advisory_severity = v1.advisory_severity;
  return { predicate: v2, error: null, note };
}
|
|
3920
|
+
/**
 * Derive a kebab-case gate name from a colon-separated gate id.
 *
 * Only ids with at least three colon-separated parts contribute a name (their
 * last part); anything that reduces to an empty slug becomes "migrated-gate".
 *
 * @param {string} gateId Gate id.
 * @returns {string} The derived kebab-case name.
 */
function deriveGateName(gateId) {
  const segments = gateId.split(":");
  let tail = "";
  if (segments.length >= 3) {
    tail = segments[segments.length - 1];
  }
  // Split camelCase humps, then collapse every non-alphanumeric run to "-".
  const humped = tail.replace(/([a-z0-9])([A-Z])/g, "$1-$2");
  const dashed = humped.toLowerCase().replace(/[^a-z0-9]+/g, "-");
  const slug = dashed.replace(/^-+|-+$/g, "");
  if (slug.length === 0) {
    return "migrated-gate";
  }
  return slug;
}
|
|
3926
|
+
/**
 * Extract a trailing `@<major>.<minor>.<patch>` version (with optional
 * pre-release and build suffixes) from a runner string.
 *
 * @param {string} runner Runner identifier, e.g. "name@1.2.3".
 * @returns {string} The version, or "0.0.0" when the runner has no such suffix.
 */
function deriveGateVersion(runner) {
  const trailingVersion = /@(\d+\.\d+\.\d+(?:-[A-Za-z0-9.-]+)?(?:\+[A-Za-z0-9.-]+)?)$/;
  const found = runner.match(trailingVersion);
  if (!found) {
    return "0.0.0";
  }
  return found[1];
}
|
|
3930
|
+
/**
 * Build a placeholder policy reference for a migrated row.
 *
 * @param {unknown} policyHash The v1 policy hash.
 * @returns {string} `<hash>:unknown` when the hash is a string starting with
 *   "sha256:", otherwise an all-zero sha256 reference with the same suffix.
 */
function derivePolicyRef(policyHash) {
  const usable = typeof policyHash === "string" && policyHash.startsWith("sha256:");
  const prefix = usable ? policyHash : `sha256:${"0".repeat(64)}`;
  return `${prefix}:unknown`;
}
|
|
3936
|
+
// Default file filter: paths ending in ".json".
var DEFAULT_INCLUDE = (p) => p.endsWith(".json");

/**
 * Run the migration over every included file under a directory.
 *
 * @param {string} dir Directory to scan.
 * @param {{walk: Function, read: Function, write: Function}} fs File-system
 *   adapter; `walk(dir)` yields candidate paths.
 * @param {{include?: Function, write?: boolean}} [options] `include` filters
 *   paths (default: *.json); files are written only when `write === true`.
 * @returns {{files: object[], changedCount: number, errorCount: number}}
 *   Per-file results, the number of changed files, and the number of files
 *   that failed to parse.
 */
function runCodemod(dir, fs, options = {}) {
  const accepts = options.include ?? DEFAULT_INCLUDE;
  const applyWrites = options.write === true;
  const files = [];
  let changedCount = 0;
  let errorCount = 0;
  for (const candidate of fs.walk(dir)) {
    if (accepts(candidate)) {
      const outcome = migrateFile(candidate, fs, applyWrites);
      files.push(outcome);
      if (outcome.changed) {
        changedCount += 1;
      }
      if (outcome.parseError !== null) {
        errorCount += 1;
      }
    }
  }
  return { files, changedCount, errorCount };
}
|
|
3951
|
+
/**
 * Migrate a single JSON file through the file-system adapter.
 *
 * The file is re-serialised with two-space indentation and keeps a trailing
 * newline only when the original had one. It is written back only when the
 * content changed and `doWrite` is set.
 *
 * @param {string} path File path.
 * @param {{read: Function, write: Function}} fs File-system adapter.
 * @param {boolean} doWrite Whether to write the migrated content back.
 * @returns {{path: string, changed: boolean, parseError: (string|null), rows: object[], diff: string, written: boolean}}
 */
function migrateFile(path, fs, doWrite) {
  const source = fs.read(path);

  let document;
  try {
    document = JSON.parse(source);
  } catch (err) {
    const parseError = err instanceof Error ? err.message : String(err);
    return { path, changed: false, parseError, rows: [], diff: "", written: false };
  }

  const { migrated, rows, changed } = migrateBundle(document);
  if (!changed) {
    return { path, changed: false, parseError: null, rows, diff: "", written: false };
  }

  const ending = source.endsWith("\n") ? "\n" : "";
  const rewritten = `${JSON.stringify(migrated, null, 2)}${ending}`;
  const diff = unifiedDiff(path, source, rewritten);
  const written = Boolean(doWrite);
  if (written) {
    fs.write(path, rewritten);
  }
  return { path, changed: true, parseError: null, rows, diff, written };
}
|
|
3980
|
+
/**
 * Render a simple positional line diff between two texts.
 *
 * Lines are compared index by index with no alignment search: equal lines are
 * emitted with a leading space, differing lines as a `-` line followed by a
 * `+` line. The output starts with `--- a/<path>` and `+++ b/<path>` headers.
 *
 * @param {string} path Path shown in the headers.
 * @param {string} before Original text.
 * @param {string} after New text.
 * @returns {string} The diff text, joined with "\n".
 */
function unifiedDiff(path, before, after) {
  const oldLines = before.split("\n");
  const newLines = after.split("\n");
  const output = [`--- a/${path}`, `+++ b/${path}`];
  const total = Math.max(oldLines.length, newLines.length);
  let row = 0;
  while (row < total) {
    const removed = oldLines[row];
    const added = newLines[row];
    row += 1;
    if (removed !== added) {
      // Past the end of either side the entry is undefined and is skipped.
      if (removed !== void 0) output.push(`-${removed}`);
      if (added !== void 0) output.push(`+${added}`);
    } else if (removed !== void 0) {
      output.push(` ${removed}`);
    }
  }
  return output.join("\n");
}
|
|
3997
|
+
// File-system adapter handed to runCodemod: a sorted recursive listing plus
// UTF-8 read and plain write.
var nodeFs = {
  // Collects file paths under `dir` via walkInto and returns them sorted.
  walk: (dir) => {
    const found = [];
    walkInto(dir, found);
    found.sort();
    return found;
  },
  // Reads a file as UTF-8 text.
  read: (path) => readFileSync6(path, "utf-8"),
  // Writes `content` to `path`.
  write: (path, content) => {
    writeFileSync2(path, content);
  }
};
|
|
4010
|
+
/**
 * Recursively append every file path under `dir` to `out`.
 *
 * Entries named "node_modules" or ".git" are skipped. Directories that cannot
 * be listed and entries that cannot be stat'ed are silently ignored.
 *
 * @param {string} dir Directory to scan.
 * @param {string[]} out Accumulator that receives the file paths.
 */
function walkInto(dir, out) {
  let names;
  try {
    names = readdirSync(dir);
  } catch {
    return;
  }
  names.forEach((name) => {
    if (name === "node_modules" || name === ".git") {
      return;
    }
    const target = join4(dir, name);
    let descend;
    try {
      descend = statSync2(target).isDirectory();
    } catch {
      return;
    }
    if (descend) {
      walkInto(target, out);
      return;
    }
    out.push(target);
  });
}
|
|
4033
|
+
|
|
4034
|
+
// src/commands/migrate.ts
|
|
4035
|
+
/**
 * Register the `migrate` subcommand on the given program.
 *
 * The action exits the process with the code returned by `runMigrate`.
 *
 * @param {object} program Command builder to attach the subcommand to.
 */
function registerMigrateCommand(program) {
  const onAction = (dir, opts) => {
    const exitCode = runMigrate(dir, opts);
    process.exit(exitCode);
  };
  program
    .command("migrate")
    .description("Rewrite v0.1.0-draft Evidence Bundle fixtures to the v2.0 gate-result/v1 shape")
    .argument("<dir>", "Directory to scan for *.json Evidence Bundle fixtures")
    .option("--write", "Apply the migration in place (default: dry run / diff only)")
    .option("--json", "Output the per-file report as JSON")
    .action(onAction);
}
|
|
4040
|
+
/**
 * Run the fixture migration over a directory and report the outcome.
 *
 * With `--json` a machine-readable report is printed. Otherwise each changed
 * file is listed (with its diff in dry-run mode), followed by a summary line.
 * Statement-level migration errors are printed to stderr so that rows which
 * could not be migrated are not silently dropped from the human-readable
 * report; they do not affect the exit code.
 *
 * @param {string} dir Directory to scan.
 * @param {{write?: boolean, json?: boolean}} opts Command options.
 * @returns {number} 1 when the codemod throws or any file failed to parse,
 *   otherwise 0.
 */
function runMigrate(dir, opts) {
  let result;
  const target = resolve7(dir);
  try {
    result = runCodemod(target, nodeFs, { write: opts.write === true });
  } catch (err) {
    console.error(`Error: ${err instanceof Error ? err.message : err}`);
    return 1;
  }
  if (opts.json) {
    console.log(
      JSON.stringify(
        {
          dir: target,
          wrote: opts.write === true,
          changedCount: result.changedCount,
          errorCount: result.errorCount,
          files: result.files.map((f) => ({
            path: f.path,
            changed: f.changed,
            written: f.written,
            parseError: f.parseError,
            rows: f.rows
          }))
        },
        null,
        2
      )
    );
    return result.errorCount > 0 ? 1 : 0;
  }
  if (result.files.length === 0) {
    console.log(chalk8.yellow(`No JSON fixtures found under ${target}`));
    return 0;
  }
  for (const f of result.files) {
    if (f.parseError !== null) {
      console.error(chalk8.red(`\u2717 ${f.path}: parse error \u2014 ${f.parseError}`));
      continue;
    }
    // Surface statements the migration could not convert. This runs before
    // the `changed` check because a file whose only rows are errors is
    // reported as unchanged.
    for (const row of f.rows ?? []) {
      if (row.outcome !== "error") continue;
      const gate = row.gateId ? ` (${row.gateId})` : "";
      console.error(
        chalk8.yellow(`! ${f.path}: statement ${row.index}${gate} not migrated \u2014 ${row.note}`)
      );
    }
    if (!f.changed) continue;
    console.log(
      opts.write ? chalk8.green(`\u2713 migrated ${f.path}`) : chalk8.cyan(`~ would migrate ${f.path}`)
    );
    if (!opts.write) console.log(f.diff);
  }
  const verb = opts.write ? "migrated" : "would migrate";
  console.log(
    `
${result.changedCount} file(s) ${verb}` + (result.errorCount > 0 ? chalk8.red(`, ${result.errorCount} parse error(s)`) : "") + (opts.write ? "" : chalk8.dim(" (dry run \u2014 pass --write to apply)"))
  );
  return result.errorCount > 0 ? 1 : 0;
}
|
|
4093
|
+
|
|
4094
|
+
// src/commands/skill-signals.ts
|
|
4095
|
+
import chalk9 from "chalk";
|
|
4096
|
+
/**
 * Normalise the CASS signal flags from the parsed CLI options.
 *
 * A signal counts as set only when its option is strictly `true`.
 *
 * @param {object} opts Parsed CLI options.
 * @returns {{testsPassed: boolean, clearResolution: boolean, codeChanges: boolean, userConfirmed: boolean, backtracking: boolean, abandoned: boolean}}
 */
function collectCass(opts) {
  const isSet = (flag) => opts[flag] === true;
  return {
    testsPassed: isSet("testsPassed"),
    clearResolution: isSet("clearResolution"),
    codeChanges: isSet("codeChanges"),
    userConfirmed: isSet("userConfirmed"),
    backtracking: isSet("backtracking"),
    abandoned: isSet("abandoned")
  };
}
|
|
4106
|
+
/**
 * Register the `ingest-skill` and `review` subcommands on the given program.
 *
 * Both actions validate their enumerated option, open the SQLite database
 * named by `--db`, record one row, print it (as JSON with `--json`, otherwise
 * as a short summary) and close the database in a `finally` block. Any thrown
 * error is printed to stderr and the process exits with code 1.
 *
 * @param {object} program Command builder to attach the subcommands to.
 */
function registerSkillSignalCommands(program) {
  program.command("ingest-skill").description(
    "Record one CASS-gated skill usage event (verified-session-gated; never raw loads)"
  ).argument("<skill-id>", "kebab-slug skill id the usage is for").requiredOption("--session-id <id>", "Opaque id of the CASS-gated session").option(
    "--source <ci|plugin>",
    "Provenance: ci (gate-anchored, trusted) | plugin (unverified)",
    "plugin"
  ).option("--tests-passed", "CASS signal: tests ran and passed (+0.25)").option("--clear-resolution", "CASS signal: session reached a clear resolution (+0.25)").option("--code-changes", "CASS signal: session produced code changes (+0.15)").option("--user-confirmed", "CASS signal: user confirmed the result was useful (+0.15)").option("--backtracking", "CASS signal: session involved backtracking (-0.10)").option("--abandoned", "CASS signal: session was abandoned (-0.20)").option("--tenant <id>", "Tenant bucket (omit for the single-tenant/global bucket)").option("--db <path>", "SQLite DB path", "j-rig.db").option("--json", "Output as JSON").action(
    (skillId, opts) => {
      try {
        // --source is compared after trimming and lower-casing.
        const source = (opts.source ?? "plugin").trim().toLowerCase();
        if (source !== "ci" && source !== "plugin") {
          console.error(`Error: --source must be 'ci' or 'plugin' (got '${opts.source}')`);
          process.exit(1);
        }
        const cass = collectCass(opts);
        const database = openDb(opts.db);
        try {
          const rec = recordSkillUsage(database, {
            skillId,
            sessionId: opts.sessionId,
            source,
            cass,
            // tenantId is only included when --tenant was supplied.
            ...opts.tenant !== void 0 ? { tenantId: opts.tenant } : {},
            // Timestamp at the I/O edge — the CLI is the wall-clock boundary; the
            // persistence + rollup layers stay deterministic on injected values.
            recordedAt: (/* @__PURE__ */ new Date()).toISOString()
          });
          if (opts.json) {
            console.log(JSON.stringify(rec, null, 2));
            // The enclosing finally still closes the database on this early return.
            return;
          }
          console.log(header(`j-rig ingest-skill: ${skillId}`));
          console.log(
            ` CASS: ${rec.cassScore.toFixed(2)} (threshold ${CASS_PASS_THRESHOLD}) \u2014 ` + (rec.cassPassed ? `${icon("pass")} PASS \u2014 counts toward verified adoption` : `${icon("warning")} FAIL \u2014 persisted but EXCLUDED from adoption (anti-gaming)`)
          );
          console.log(
            ` source: ${rec.source}${rec.tenantId ? ` | tenant: ${rec.tenantId}` : ""}`
          );
          if (!rec.cassPassed) {
            console.log(
              chalk9.dim(
                " Note: a low-quality session is recorded but never counted. There is no force-count flag."
              )
            );
          }
        } finally {
          database.close();
        }
      } catch (err) {
        console.error(`Error: ${err instanceof Error ? err.message : String(err)}`);
        process.exit(1);
      }
    }
  );
  program.command("review").description(
    "Record a curated-signal human thumb + open-ended rationale (NOT a signed predicate)"
  ).argument("<skill-id>", "kebab-slug skill id the review is for").requiredOption("--verdict <up|down>", "Coarse thumb: up | down").option("--rationale <text>", "Open-ended free-text rationale (non-comparable; never parsed)").option("--reviewer <id>", "Reviewer identity (email/handle)", "unknown").option("--tenant <id>", "Tenant bucket (omit for the single-tenant/global bucket)").option("--db <path>", "SQLite DB path", "j-rig.db").option("--json", "Output as JSON").action(
    (skillId, opts) => {
      try {
        // --verdict is compared after trimming and lower-casing.
        const v = opts.verdict.trim().toLowerCase();
        if (v !== "up" && v !== "down") {
          console.error(`Error: --verdict must be 'up' or 'down' (got '${opts.verdict}')`);
          process.exit(1);
        }
        const database = openDb(opts.db);
        try {
          const rec = recordSkillReview(database, {
            skillId,
            thumbsUp: v === "up",
            // rationale and tenantId are only included when their flags were supplied.
            ...opts.rationale !== void 0 ? { rationale: opts.rationale } : {},
            reviewer: opts.reviewer,
            ...opts.tenant !== void 0 ? { tenantId: opts.tenant } : {},
            recordedAt: (/* @__PURE__ */ new Date()).toISOString()
          });
          if (opts.json) {
            console.log(JSON.stringify(rec, null, 2));
            // The enclosing finally still closes the database on this early return.
            return;
          }
          console.log(header(`j-rig review: ${skillId}`));
          console.log(
            ` ${rec.thumbsUp ? icon("pass") : icon("error")} thumb ${rec.thumbsUp ? "up" : "down"} by ${rec.reviewer} (${rec.governanceClass})`
          );
          if (rec.rationale) console.log(` rationale: ${rec.rationale}`);
          console.log(
            chalk9.dim(
              " Note: curated-signal \u2014 NOT a signed human-review/v1 predicate, never a trust root."
            )
          );
        } finally {
          database.close();
        }
      } catch (err) {
        console.error(`Error: ${err instanceof Error ? err.message : String(err)}`);
        process.exit(1);
      }
    }
  );
}
|
|
4205
|
+
|
|
4206
|
+
// src/index.ts
|
|
4207
|
+
import { registerRefineCommand } from "@intentsolutions/refiner";
|
|
4208
|
+
/**
 * Build the `j-rig` CLI program with every subcommand registered.
 *
 * @returns {Command} The configured program; the caller is expected to parse argv.
 */
function createProgram() {
  const program = new Command();
  program.name("j-rig").description("Seven-layer binary evaluation harness for Claude Skills").version("0.1.0");

  // Applied in this order, which is the order the subcommands are registered.
  const registrars = [
    registerCheckCommand,
    registerValidateCommand,
    registerEvalCommand,
    registerReportCommand,
    registerOptimizeCommand,
    registerDriftCommand,
    registerEmitEvidenceCommand,
    registerParseAgentsCommand,
    registerMigrateCommand,
    registerSkillSignalCommands,
    registerRefineCommand
  ];
  for (const register of registrars) {
    register(program);
  }
  return program;
}
|
|
4224
|
+
/**
 * CLI entry point: build the program and parse the given arguments.
 *
 * @param {string[]} [argv] Argument vector; `process.argv` is used when it is
 *   null or undefined.
 */
function main(argv) {
  const args = argv ?? process.argv;
  createProgram().parse(args);
}
|
|
4228
|
+
// Entry-point heuristic: run the CLI only when the script path in
// process.argv[1] ends with "index.js", "index.ts" or "j-rig". Importing this
// module from an entry script with any other name leaves main() uncalled.
// NOTE(review): an importing script that is itself named index.js/index.ts
// would also match this suffix test — confirm that is acceptable.
var isDirectRun = process.argv[1]?.endsWith("index.js") || process.argv[1]?.endsWith("index.ts") || process.argv[1]?.endsWith("j-rig");
if (isDirectRun) {
  main();
}
|
|
4232
|
+
export {
|
|
4233
|
+
main
|
|
4234
|
+
};
|