@intentsolutions/jrig-cli 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +39 -2
- package/package.json +2 -2
package/dist/index.js
CHANGED
|
@@ -82,6 +82,9 @@ var CriterionSchema = z.object({
|
|
|
82
82
|
deterministic_check_params: z.record(z.string(), z.unknown()).optional().describe(
|
|
83
83
|
"Parameters forwarded to the deterministic check (e.g. { value: 'needle' } for 'contains', { pattern: '\\\\d+' } for 'regex_match')"
|
|
84
84
|
)
|
|
85
|
+
}).refine((c) => c.method !== "deterministic" || !!c.deterministic_check, {
|
|
86
|
+
message: "deterministic criteria must define deterministic_check",
|
|
87
|
+
path: ["deterministic_check"]
|
|
85
88
|
});
|
|
86
89
|
var TestCaseTier = z2.enum(["core", "edge", "regression", "adversarial"]);
|
|
87
90
|
var TriggerExpectation = z2.enum(["should_trigger", "should_not_trigger"]);
|
|
@@ -115,6 +118,19 @@ var EvalSpecSchema = z3.object({
|
|
|
115
118
|
models: z3.array(ModelTarget).default(["sonnet"]).describe("Models to test independently"),
|
|
116
119
|
siblings: z3.array(SiblingSkillSchema).optional().describe("Sibling skills for pack-sensitive evaluation"),
|
|
117
120
|
tags: z3.array(z3.string()).optional().describe("Categorization tags")
|
|
121
|
+
}).superRefine((spec, ctx) => {
|
|
122
|
+
const knownCriteria = new Set(spec.criteria.map((c) => c.id));
|
|
123
|
+
spec.test_cases.forEach((tc, ti) => {
|
|
124
|
+
tc.criteria_ids?.forEach((cid, ci) => {
|
|
125
|
+
if (!knownCriteria.has(cid)) {
|
|
126
|
+
ctx.addIssue({
|
|
127
|
+
code: z3.ZodIssueCode.custom,
|
|
128
|
+
message: `test case "${tc.id}" references unknown criterion id "${cid}"`,
|
|
129
|
+
path: ["test_cases", ti, "criteria_ids", ci]
|
|
130
|
+
});
|
|
131
|
+
}
|
|
132
|
+
});
|
|
133
|
+
});
|
|
118
134
|
});
|
|
119
135
|
var EvalContractSchema = z4.object({
|
|
120
136
|
contract_version: z4.literal("1.0").describe("Schema version for forward compatibility"),
|
|
@@ -1009,6 +1025,16 @@ async function judgeWithLLM(criterion, outcome, provider, model) {
|
|
|
1009
1025
|
};
|
|
1010
1026
|
}
|
|
1011
1027
|
}
|
|
1028
|
+
function selectCriteriaForTestCase(criteria, criteriaIds) {
|
|
1029
|
+
if (criteriaIds === void 0) return criteria;
|
|
1030
|
+
const available = new Set(criteria.map((c) => c.id));
|
|
1031
|
+
const unknown = criteriaIds.filter((id) => !available.has(id));
|
|
1032
|
+
if (unknown.length > 0) {
|
|
1033
|
+
throw new Error(`Test case references unknown criteria_ids: ${unknown.join(", ")}`);
|
|
1034
|
+
}
|
|
1035
|
+
const wanted = new Set(criteriaIds);
|
|
1036
|
+
return criteria.filter((c) => wanted.has(c.id));
|
|
1037
|
+
}
|
|
1012
1038
|
function computeScoreCard(results, criteria, regressions = []) {
|
|
1013
1039
|
const criteriaMap = new Map(criteria.map((c) => [c.id, c]));
|
|
1014
1040
|
let passed = 0, failed = 0, unsure = 0, blockerFailures = 0;
|
|
@@ -3019,9 +3045,20 @@ function registerEvalCommand(program) {
|
|
|
3019
3045
|
` Functional: ${outcomes.length}/${spec.test_cases.length} test case(s) executed`
|
|
3020
3046
|
);
|
|
3021
3047
|
}
|
|
3048
|
+
const testCaseById = new Map(spec.test_cases.map((tc) => [tc.id, tc]));
|
|
3022
3049
|
const allJudgments = [];
|
|
3023
3050
|
for (const outcome of outcomes) {
|
|
3024
|
-
const judgments = await judgeCriteria(spec.criteria, outcome, providers.judge, {
|
|
3051
|
+
const testCase = testCaseById.get(outcome.test_case_id);
|
|
3052
|
+
if (!testCase) {
|
|
3053
|
+
throw new Error(
|
|
3054
|
+
`Outcome references unknown test case id: "${outcome.test_case_id}"`
|
|
3055
|
+
);
|
|
3056
|
+
}
|
|
3057
|
+
const applicableCriteria = selectCriteriaForTestCase(
|
|
3058
|
+
spec.criteria,
|
|
3059
|
+
testCase.criteria_ids
|
|
3060
|
+
);
|
|
3061
|
+
const judgments = await judgeCriteria(applicableCriteria, outcome, providers.judge, {
|
|
3025
3062
|
model
|
|
3026
3063
|
});
|
|
3027
3064
|
for (const j of judgments) {
|
|
@@ -4207,7 +4244,7 @@ function registerSkillSignalCommands(program) {
|
|
|
4207
4244
|
import { registerRefineCommand } from "@intentsolutions/refiner";
|
|
4208
4245
|
function createProgram() {
|
|
4209
4246
|
const program = new Command();
|
|
4210
|
-
program.name("j-rig").description("Seven-layer binary evaluation harness for Claude Skills").version("0.1.0");
|
|
4247
|
+
program.name("j-rig").description("Seven-layer binary evaluation harness for Claude Skills").version("0.1.1");
|
|
4211
4248
|
registerCheckCommand(program);
|
|
4212
4249
|
registerValidateCommand(program);
|
|
4213
4250
|
registerEvalCommand(program);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@intentsolutions/jrig-cli",
|
|
3
|
-
"version": "0.1.0",
|
|
3
|
+
"version": "0.1.1",
|
|
4
4
|
"private": false,
|
|
5
5
|
"description": "J-Rig seven-layer binary eval CLI for Claude Skills — the `j-rig` command: package integrity, trigger/functional/regression/baseline scoring, optimizer, and rollout-gate evidence. Self-contained (bundles the internal eval engine).",
|
|
6
6
|
"keywords": [
|
|
@@ -61,8 +61,8 @@
|
|
|
61
61
|
"zod": "^4.4.3"
|
|
62
62
|
},
|
|
63
63
|
"devDependencies": {
|
|
64
|
-
"@j-rig/core": "2.1.0",
|
|
65
64
|
"@j-rig/db": "2.1.0",
|
|
65
|
+
"@j-rig/core": "2.1.0",
|
|
66
66
|
"@j-rig/migrate": "2.1.0"
|
|
67
67
|
},
|
|
68
68
|
"scripts": {
|