@mcptoolshop/research-os 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +134 -0
- package/README.es.md +25 -2
- package/README.fr.md +24 -1
- package/README.hi.md +39 -1
- package/README.it.md +25 -2
- package/README.ja.md +24 -1
- package/README.md +37 -1
- package/README.pt-BR.md +24 -1
- package/README.zh.md +25 -2
- package/dist/calibration/aggregate-receipt-schema.d.ts +509 -0
- package/dist/calibration/aggregate-receipt-schema.js +143 -0
- package/dist/calibration/aggregate-receipt-schema.js.map +1 -0
- package/dist/calibration/aggregate.d.ts +35 -0
- package/dist/calibration/aggregate.js +454 -0
- package/dist/calibration/aggregate.js.map +1 -0
- package/dist/calibration/receipt-schema.d.ts +317 -0
- package/dist/calibration/receipt-schema.js +68 -0
- package/dist/calibration/receipt-schema.js.map +1 -0
- package/dist/calibration/receipt.d.ts +31 -0
- package/dist/calibration/receipt.js +151 -0
- package/dist/calibration/receipt.js.map +1 -0
- package/dist/cli.js +136 -9
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +10 -1
- package/dist/index.js +13 -6
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
|
|
3
|
+
/** Zod enum of the four status labels a calibration receipt can carry. */
declare const StatusLabelSchema: z.ZodEnum<["trusted_baseline", "conditional_pass", "failed", "comparison_only"]>;
|
|
4
|
+
/** Zod enum of the two critic architectures a receipt can describe: `single-pass` or `two-pass`. */
declare const ArchitectureSchema: z.ZodEnum<["single-pass", "two-pass"]>;
|
|
5
|
+
declare const RecallSchema: z.ZodObject<{
|
|
6
|
+
matched: z.ZodNumber;
|
|
7
|
+
total: z.ZodNumber;
|
|
8
|
+
ratio: z.ZodNumber;
|
|
9
|
+
}, "strip", z.ZodTypeAny, {
|
|
10
|
+
matched: number;
|
|
11
|
+
total: number;
|
|
12
|
+
ratio: number;
|
|
13
|
+
}, {
|
|
14
|
+
matched: number;
|
|
15
|
+
total: number;
|
|
16
|
+
ratio: number;
|
|
17
|
+
}>;
|
|
18
|
+
declare const PerCategoryRecallSchema: z.ZodRecord<z.ZodString, z.ZodObject<{
|
|
19
|
+
matched: z.ZodNumber;
|
|
20
|
+
total: z.ZodNumber;
|
|
21
|
+
ratio: z.ZodNumber;
|
|
22
|
+
}, "strip", z.ZodTypeAny, {
|
|
23
|
+
matched: number;
|
|
24
|
+
total: number;
|
|
25
|
+
ratio: number;
|
|
26
|
+
}, {
|
|
27
|
+
matched: number;
|
|
28
|
+
total: number;
|
|
29
|
+
ratio: number;
|
|
30
|
+
}>>;
|
|
31
|
+
declare const PassFailSchema: z.ZodObject<{
|
|
32
|
+
fp_ceiling: z.ZodEnum<["PASS", "FAIL"]>;
|
|
33
|
+
any_flag_recall_floor: z.ZodEnum<["PASS", "FAIL"]>;
|
|
34
|
+
per_category_any_flag_floor: z.ZodEnum<["PASS", "FAIL"]>;
|
|
35
|
+
strict_recall_floor: z.ZodEnum<["PASS", "FAIL"]>;
|
|
36
|
+
decision_vocab_completeness: z.ZodEnum<["PASS", "FAIL"]>;
|
|
37
|
+
latency_soft: z.ZodEnum<["PASS", "WARN"]>;
|
|
38
|
+
latency_hard: z.ZodEnum<["PASS", "FAIL"]>;
|
|
39
|
+
empty_or_malformed: z.ZodEnum<["PASS", "FAIL"]>;
|
|
40
|
+
overall: z.ZodEnum<["PASS", "FAIL"]>;
|
|
41
|
+
}, "strip", z.ZodTypeAny, {
|
|
42
|
+
fp_ceiling: "PASS" | "FAIL";
|
|
43
|
+
any_flag_recall_floor: "PASS" | "FAIL";
|
|
44
|
+
per_category_any_flag_floor: "PASS" | "FAIL";
|
|
45
|
+
strict_recall_floor: "PASS" | "FAIL";
|
|
46
|
+
decision_vocab_completeness: "PASS" | "FAIL";
|
|
47
|
+
latency_soft: "PASS" | "WARN";
|
|
48
|
+
latency_hard: "PASS" | "FAIL";
|
|
49
|
+
empty_or_malformed: "PASS" | "FAIL";
|
|
50
|
+
overall: "PASS" | "FAIL";
|
|
51
|
+
}, {
|
|
52
|
+
fp_ceiling: "PASS" | "FAIL";
|
|
53
|
+
any_flag_recall_floor: "PASS" | "FAIL";
|
|
54
|
+
per_category_any_flag_floor: "PASS" | "FAIL";
|
|
55
|
+
strict_recall_floor: "PASS" | "FAIL";
|
|
56
|
+
decision_vocab_completeness: "PASS" | "FAIL";
|
|
57
|
+
latency_soft: "PASS" | "WARN";
|
|
58
|
+
latency_hard: "PASS" | "FAIL";
|
|
59
|
+
empty_or_malformed: "PASS" | "FAIL";
|
|
60
|
+
overall: "PASS" | "FAIL";
|
|
61
|
+
}>;
|
|
62
|
+
declare const DecisionVocabBarSchema: z.ZodObject<{
|
|
63
|
+
architecture: z.ZodEnum<["single-pass", "two-pass"]>;
|
|
64
|
+
required: z.ZodNumber;
|
|
65
|
+
produced: z.ZodNumber;
|
|
66
|
+
passed: z.ZodBoolean;
|
|
67
|
+
}, "strip", z.ZodTypeAny, {
|
|
68
|
+
required: number;
|
|
69
|
+
architecture: "single-pass" | "two-pass";
|
|
70
|
+
produced: number;
|
|
71
|
+
passed: boolean;
|
|
72
|
+
}, {
|
|
73
|
+
required: number;
|
|
74
|
+
architecture: "single-pass" | "two-pass";
|
|
75
|
+
produced: number;
|
|
76
|
+
passed: boolean;
|
|
77
|
+
}>;
|
|
78
|
+
declare const CalibrationReceiptSchema: z.ZodObject<{
|
|
79
|
+
schema_version: z.ZodLiteral<1>;
|
|
80
|
+
profile_name: z.ZodString;
|
|
81
|
+
status: z.ZodEnum<["trusted_baseline", "conditional_pass", "failed", "comparison_only"]>;
|
|
82
|
+
model: z.ZodString;
|
|
83
|
+
architecture: z.ZodEnum<["single-pass", "two-pass"]>;
|
|
84
|
+
fixture: z.ZodString;
|
|
85
|
+
fixture_total_claims: z.ZodNumber;
|
|
86
|
+
fixture_good_claims: z.ZodNumber;
|
|
87
|
+
fixture_bad_claims: z.ZodNumber;
|
|
88
|
+
calibrated_at: z.ZodString;
|
|
89
|
+
research_os_version: z.ZodString;
|
|
90
|
+
runtime_ms: z.ZodNumber;
|
|
91
|
+
good_fp_count: z.ZodNumber;
|
|
92
|
+
any_flag_recall: z.ZodObject<{
|
|
93
|
+
matched: z.ZodNumber;
|
|
94
|
+
total: z.ZodNumber;
|
|
95
|
+
ratio: z.ZodNumber;
|
|
96
|
+
}, "strip", z.ZodTypeAny, {
|
|
97
|
+
matched: number;
|
|
98
|
+
total: number;
|
|
99
|
+
ratio: number;
|
|
100
|
+
}, {
|
|
101
|
+
matched: number;
|
|
102
|
+
total: number;
|
|
103
|
+
ratio: number;
|
|
104
|
+
}>;
|
|
105
|
+
strict_recall: z.ZodObject<{
|
|
106
|
+
matched: z.ZodNumber;
|
|
107
|
+
total: z.ZodNumber;
|
|
108
|
+
ratio: z.ZodNumber;
|
|
109
|
+
}, "strip", z.ZodTypeAny, {
|
|
110
|
+
matched: number;
|
|
111
|
+
total: number;
|
|
112
|
+
ratio: number;
|
|
113
|
+
}, {
|
|
114
|
+
matched: number;
|
|
115
|
+
total: number;
|
|
116
|
+
ratio: number;
|
|
117
|
+
}>;
|
|
118
|
+
per_category_any_flag: z.ZodRecord<z.ZodString, z.ZodObject<{
|
|
119
|
+
matched: z.ZodNumber;
|
|
120
|
+
total: z.ZodNumber;
|
|
121
|
+
ratio: z.ZodNumber;
|
|
122
|
+
}, "strip", z.ZodTypeAny, {
|
|
123
|
+
matched: number;
|
|
124
|
+
total: number;
|
|
125
|
+
ratio: number;
|
|
126
|
+
}, {
|
|
127
|
+
matched: number;
|
|
128
|
+
total: number;
|
|
129
|
+
ratio: number;
|
|
130
|
+
}>>;
|
|
131
|
+
per_category_strict: z.ZodRecord<z.ZodString, z.ZodObject<{
|
|
132
|
+
matched: z.ZodNumber;
|
|
133
|
+
total: z.ZodNumber;
|
|
134
|
+
ratio: z.ZodNumber;
|
|
135
|
+
}, "strip", z.ZodTypeAny, {
|
|
136
|
+
matched: number;
|
|
137
|
+
total: number;
|
|
138
|
+
ratio: number;
|
|
139
|
+
}, {
|
|
140
|
+
matched: number;
|
|
141
|
+
total: number;
|
|
142
|
+
ratio: number;
|
|
143
|
+
}>>;
|
|
144
|
+
decision_vocabulary: z.ZodRecord<z.ZodString, z.ZodNumber>;
|
|
145
|
+
decisions_produced_count: z.ZodNumber;
|
|
146
|
+
decision_vocab_bar: z.ZodObject<{
|
|
147
|
+
architecture: z.ZodEnum<["single-pass", "two-pass"]>;
|
|
148
|
+
required: z.ZodNumber;
|
|
149
|
+
produced: z.ZodNumber;
|
|
150
|
+
passed: z.ZodBoolean;
|
|
151
|
+
}, "strip", z.ZodTypeAny, {
|
|
152
|
+
required: number;
|
|
153
|
+
architecture: "single-pass" | "two-pass";
|
|
154
|
+
produced: number;
|
|
155
|
+
passed: boolean;
|
|
156
|
+
}, {
|
|
157
|
+
required: number;
|
|
158
|
+
architecture: "single-pass" | "two-pass";
|
|
159
|
+
produced: number;
|
|
160
|
+
passed: boolean;
|
|
161
|
+
}>;
|
|
162
|
+
unreachable_decisions: z.ZodArray<z.ZodString, "many">;
|
|
163
|
+
empty_or_malformed_responses: z.ZodNumber;
|
|
164
|
+
pass_fail: z.ZodObject<{
|
|
165
|
+
fp_ceiling: z.ZodEnum<["PASS", "FAIL"]>;
|
|
166
|
+
any_flag_recall_floor: z.ZodEnum<["PASS", "FAIL"]>;
|
|
167
|
+
per_category_any_flag_floor: z.ZodEnum<["PASS", "FAIL"]>;
|
|
168
|
+
strict_recall_floor: z.ZodEnum<["PASS", "FAIL"]>;
|
|
169
|
+
decision_vocab_completeness: z.ZodEnum<["PASS", "FAIL"]>;
|
|
170
|
+
latency_soft: z.ZodEnum<["PASS", "WARN"]>;
|
|
171
|
+
latency_hard: z.ZodEnum<["PASS", "FAIL"]>;
|
|
172
|
+
empty_or_malformed: z.ZodEnum<["PASS", "FAIL"]>;
|
|
173
|
+
overall: z.ZodEnum<["PASS", "FAIL"]>;
|
|
174
|
+
}, "strip", z.ZodTypeAny, {
|
|
175
|
+
fp_ceiling: "PASS" | "FAIL";
|
|
176
|
+
any_flag_recall_floor: "PASS" | "FAIL";
|
|
177
|
+
per_category_any_flag_floor: "PASS" | "FAIL";
|
|
178
|
+
strict_recall_floor: "PASS" | "FAIL";
|
|
179
|
+
decision_vocab_completeness: "PASS" | "FAIL";
|
|
180
|
+
latency_soft: "PASS" | "WARN";
|
|
181
|
+
latency_hard: "PASS" | "FAIL";
|
|
182
|
+
empty_or_malformed: "PASS" | "FAIL";
|
|
183
|
+
overall: "PASS" | "FAIL";
|
|
184
|
+
}, {
|
|
185
|
+
fp_ceiling: "PASS" | "FAIL";
|
|
186
|
+
any_flag_recall_floor: "PASS" | "FAIL";
|
|
187
|
+
per_category_any_flag_floor: "PASS" | "FAIL";
|
|
188
|
+
strict_recall_floor: "PASS" | "FAIL";
|
|
189
|
+
decision_vocab_completeness: "PASS" | "FAIL";
|
|
190
|
+
latency_soft: "PASS" | "WARN";
|
|
191
|
+
latency_hard: "PASS" | "FAIL";
|
|
192
|
+
empty_or_malformed: "PASS" | "FAIL";
|
|
193
|
+
overall: "PASS" | "FAIL";
|
|
194
|
+
}>;
|
|
195
|
+
notes: z.ZodArray<z.ZodString, "many">;
|
|
196
|
+
}, "strip", z.ZodTypeAny, {
|
|
197
|
+
research_os_version: string;
|
|
198
|
+
status: "trusted_baseline" | "conditional_pass" | "failed" | "comparison_only";
|
|
199
|
+
notes: string[];
|
|
200
|
+
schema_version: 1;
|
|
201
|
+
profile_name: string;
|
|
202
|
+
model: string;
|
|
203
|
+
architecture: "single-pass" | "two-pass";
|
|
204
|
+
fixture: string;
|
|
205
|
+
fixture_total_claims: number;
|
|
206
|
+
fixture_good_claims: number;
|
|
207
|
+
fixture_bad_claims: number;
|
|
208
|
+
calibrated_at: string;
|
|
209
|
+
runtime_ms: number;
|
|
210
|
+
good_fp_count: number;
|
|
211
|
+
any_flag_recall: {
|
|
212
|
+
matched: number;
|
|
213
|
+
total: number;
|
|
214
|
+
ratio: number;
|
|
215
|
+
};
|
|
216
|
+
strict_recall: {
|
|
217
|
+
matched: number;
|
|
218
|
+
total: number;
|
|
219
|
+
ratio: number;
|
|
220
|
+
};
|
|
221
|
+
per_category_any_flag: Record<string, {
|
|
222
|
+
matched: number;
|
|
223
|
+
total: number;
|
|
224
|
+
ratio: number;
|
|
225
|
+
}>;
|
|
226
|
+
per_category_strict: Record<string, {
|
|
227
|
+
matched: number;
|
|
228
|
+
total: number;
|
|
229
|
+
ratio: number;
|
|
230
|
+
}>;
|
|
231
|
+
decision_vocabulary: Record<string, number>;
|
|
232
|
+
decisions_produced_count: number;
|
|
233
|
+
decision_vocab_bar: {
|
|
234
|
+
required: number;
|
|
235
|
+
architecture: "single-pass" | "two-pass";
|
|
236
|
+
produced: number;
|
|
237
|
+
passed: boolean;
|
|
238
|
+
};
|
|
239
|
+
unreachable_decisions: string[];
|
|
240
|
+
empty_or_malformed_responses: number;
|
|
241
|
+
pass_fail: {
|
|
242
|
+
fp_ceiling: "PASS" | "FAIL";
|
|
243
|
+
any_flag_recall_floor: "PASS" | "FAIL";
|
|
244
|
+
per_category_any_flag_floor: "PASS" | "FAIL";
|
|
245
|
+
strict_recall_floor: "PASS" | "FAIL";
|
|
246
|
+
decision_vocab_completeness: "PASS" | "FAIL";
|
|
247
|
+
latency_soft: "PASS" | "WARN";
|
|
248
|
+
latency_hard: "PASS" | "FAIL";
|
|
249
|
+
empty_or_malformed: "PASS" | "FAIL";
|
|
250
|
+
overall: "PASS" | "FAIL";
|
|
251
|
+
};
|
|
252
|
+
}, {
|
|
253
|
+
research_os_version: string;
|
|
254
|
+
status: "trusted_baseline" | "conditional_pass" | "failed" | "comparison_only";
|
|
255
|
+
notes: string[];
|
|
256
|
+
schema_version: 1;
|
|
257
|
+
profile_name: string;
|
|
258
|
+
model: string;
|
|
259
|
+
architecture: "single-pass" | "two-pass";
|
|
260
|
+
fixture: string;
|
|
261
|
+
fixture_total_claims: number;
|
|
262
|
+
fixture_good_claims: number;
|
|
263
|
+
fixture_bad_claims: number;
|
|
264
|
+
calibrated_at: string;
|
|
265
|
+
runtime_ms: number;
|
|
266
|
+
good_fp_count: number;
|
|
267
|
+
any_flag_recall: {
|
|
268
|
+
matched: number;
|
|
269
|
+
total: number;
|
|
270
|
+
ratio: number;
|
|
271
|
+
};
|
|
272
|
+
strict_recall: {
|
|
273
|
+
matched: number;
|
|
274
|
+
total: number;
|
|
275
|
+
ratio: number;
|
|
276
|
+
};
|
|
277
|
+
per_category_any_flag: Record<string, {
|
|
278
|
+
matched: number;
|
|
279
|
+
total: number;
|
|
280
|
+
ratio: number;
|
|
281
|
+
}>;
|
|
282
|
+
per_category_strict: Record<string, {
|
|
283
|
+
matched: number;
|
|
284
|
+
total: number;
|
|
285
|
+
ratio: number;
|
|
286
|
+
}>;
|
|
287
|
+
decision_vocabulary: Record<string, number>;
|
|
288
|
+
decisions_produced_count: number;
|
|
289
|
+
decision_vocab_bar: {
|
|
290
|
+
required: number;
|
|
291
|
+
architecture: "single-pass" | "two-pass";
|
|
292
|
+
produced: number;
|
|
293
|
+
passed: boolean;
|
|
294
|
+
};
|
|
295
|
+
unreachable_decisions: string[];
|
|
296
|
+
empty_or_malformed_responses: number;
|
|
297
|
+
pass_fail: {
|
|
298
|
+
fp_ceiling: "PASS" | "FAIL";
|
|
299
|
+
any_flag_recall_floor: "PASS" | "FAIL";
|
|
300
|
+
per_category_any_flag_floor: "PASS" | "FAIL";
|
|
301
|
+
strict_recall_floor: "PASS" | "FAIL";
|
|
302
|
+
decision_vocab_completeness: "PASS" | "FAIL";
|
|
303
|
+
latency_soft: "PASS" | "WARN";
|
|
304
|
+
latency_hard: "PASS" | "FAIL";
|
|
305
|
+
empty_or_malformed: "PASS" | "FAIL";
|
|
306
|
+
overall: "PASS" | "FAIL";
|
|
307
|
+
};
|
|
308
|
+
}>;
|
|
309
|
+
type StatusLabel = z.infer<typeof StatusLabelSchema>;
|
|
310
|
+
type Architecture = z.infer<typeof ArchitectureSchema>;
|
|
311
|
+
type Recall = z.infer<typeof RecallSchema>;
|
|
312
|
+
type PerCategoryRecall = z.infer<typeof PerCategoryRecallSchema>;
|
|
313
|
+
type PassFail = z.infer<typeof PassFailSchema>;
|
|
314
|
+
type DecisionVocabBar = z.infer<typeof DecisionVocabBarSchema>;
|
|
315
|
+
type CalibrationReceipt = z.infer<typeof CalibrationReceiptSchema>;
|
|
316
|
+
|
|
317
|
+
export { type Architecture, ArchitectureSchema, type CalibrationReceipt, CalibrationReceiptSchema, type DecisionVocabBar, DecisionVocabBarSchema, type PassFail, PassFailSchema, type PerCategoryRecall, PerCategoryRecallSchema, type Recall, RecallSchema, type StatusLabel, StatusLabelSchema };
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
// src/calibration/receipt-schema.ts
|
|
2
|
+
import { z } from "zod";
|
|
3
|
+
// Factories for the leaf schemas that repeat throughout the receipt. Each call
// builds a fresh Zod instance, so no schema object is shared between fields.
const nonNegativeInt = () => z.number().int().nonnegative();
const positiveInt = () => z.number().int().positive();
const passOrFail = () => z.enum(["PASS", "FAIL"]);

// Status label recorded on a receipt.
var StatusLabelSchema = z.enum(["trusted_baseline", "conditional_pass", "failed", "comparison_only"]);

// Critic architecture the receipt was produced with.
var ArchitectureSchema = z.enum([
  "single-pass",
  "two-pass"
]);

// A recall measurement: integer counts plus a ratio constrained to [0, 1].
var RecallSchema = z.object({
  matched: nonNegativeInt(),
  total: nonNegativeInt(),
  ratio: z.number().min(0).max(1)
});

// Recall measurements keyed by category name.
var PerCategoryRecallSchema = z.record(z.string(), RecallSchema);

// Result of every bar. latency_soft is the only bar that reports "WARN"
// instead of "FAIL"; all others are strictly PASS/FAIL.
var PassFailSchema = z.object({
  fp_ceiling: passOrFail(),
  any_flag_recall_floor: passOrFail(),
  per_category_any_flag_floor: passOrFail(),
  strict_recall_floor: passOrFail(),
  decision_vocab_completeness: passOrFail(),
  latency_soft: z.enum(["PASS", "WARN"]),
  latency_hard: passOrFail(),
  empty_or_malformed: passOrFail(),
  overall: passOrFail()
});

// Decision-vocabulary bar: how many distinct decisions were required for the
// architecture, how many were produced, and whether the bar was met.
var DecisionVocabBarSchema = z.object({
  architecture: ArchitectureSchema,
  required: positiveInt(),
  produced: nonNegativeInt(),
  passed: z.boolean()
});

// Full calibration receipt (schema_version 1). Field order is kept as-is
// because it determines the key order of parsed receipts.
var CalibrationReceiptSchema = z.object({
  schema_version: z.literal(1),
  profile_name: z.string(),
  status: StatusLabelSchema,
  model: z.string(),
  architecture: ArchitectureSchema,
  fixture: z.string(),
  fixture_total_claims: positiveInt(),
  fixture_good_claims: nonNegativeInt(),
  fixture_bad_claims: nonNegativeInt(),
  calibrated_at: z.string(),
  research_os_version: z.string(),
  runtime_ms: nonNegativeInt(),
  good_fp_count: nonNegativeInt(),
  any_flag_recall: RecallSchema,
  strict_recall: RecallSchema,
  per_category_any_flag: PerCategoryRecallSchema,
  per_category_strict: PerCategoryRecallSchema,
  decision_vocabulary: z.record(z.string(), nonNegativeInt()),
  decisions_produced_count: nonNegativeInt(),
  decision_vocab_bar: DecisionVocabBarSchema,
  unreachable_decisions: z.array(z.string()),
  empty_or_malformed_responses: nonNegativeInt(),
  pass_fail: PassFailSchema,
  notes: z.array(z.string())
});
|
|
59
|
+
export {
|
|
60
|
+
ArchitectureSchema,
|
|
61
|
+
CalibrationReceiptSchema,
|
|
62
|
+
DecisionVocabBarSchema,
|
|
63
|
+
PassFailSchema,
|
|
64
|
+
PerCategoryRecallSchema,
|
|
65
|
+
RecallSchema,
|
|
66
|
+
StatusLabelSchema
|
|
67
|
+
};
|
|
68
|
+
//# sourceMappingURL=receipt-schema.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/calibration/receipt-schema.ts"],"sourcesContent":["import { z } from 'zod';\n\nexport const StatusLabelSchema = z.enum([\n 'trusted_baseline',\n 'conditional_pass',\n 'failed',\n 'comparison_only',\n]);\n\nexport const ArchitectureSchema = z.enum(['single-pass', 'two-pass']);\n\nexport const RecallSchema = z.object({\n matched: z.number().int().nonnegative(),\n total: z.number().int().nonnegative(),\n ratio: z.number().min(0).max(1),\n});\n\nexport const PerCategoryRecallSchema = z.record(z.string(), RecallSchema);\n\nexport const PassFailSchema = z.object({\n fp_ceiling: z.enum(['PASS', 'FAIL']),\n any_flag_recall_floor: z.enum(['PASS', 'FAIL']),\n per_category_any_flag_floor: z.enum(['PASS', 'FAIL']),\n strict_recall_floor: z.enum(['PASS', 'FAIL']),\n decision_vocab_completeness: z.enum(['PASS', 'FAIL']),\n latency_soft: z.enum(['PASS', 'WARN']),\n latency_hard: z.enum(['PASS', 'FAIL']),\n empty_or_malformed: z.enum(['PASS', 'FAIL']),\n overall: z.enum(['PASS', 'FAIL']),\n});\n\nexport const DecisionVocabBarSchema = z.object({\n architecture: ArchitectureSchema,\n required: z.number().int().positive(),\n produced: z.number().int().nonnegative(),\n passed: z.boolean(),\n});\n\nexport const CalibrationReceiptSchema = z.object({\n schema_version: z.literal(1),\n profile_name: z.string(),\n status: StatusLabelSchema,\n model: z.string(),\n architecture: ArchitectureSchema,\n fixture: z.string(),\n fixture_total_claims: z.number().int().positive(),\n fixture_good_claims: z.number().int().nonnegative(),\n fixture_bad_claims: z.number().int().nonnegative(),\n calibrated_at: z.string(),\n research_os_version: z.string(),\n runtime_ms: z.number().int().nonnegative(),\n good_fp_count: z.number().int().nonnegative(),\n any_flag_recall: RecallSchema,\n strict_recall: RecallSchema,\n per_category_any_flag: PerCategoryRecallSchema,\n per_category_strict: PerCategoryRecallSchema,\n decision_vocabulary: z.record(z.string(), 
z.number().int().nonnegative()),\n decisions_produced_count: z.number().int().nonnegative(),\n decision_vocab_bar: DecisionVocabBarSchema,\n unreachable_decisions: z.array(z.string()),\n empty_or_malformed_responses: z.number().int().nonnegative(),\n pass_fail: PassFailSchema,\n notes: z.array(z.string()),\n});\n\nexport type StatusLabel = z.infer<typeof StatusLabelSchema>;\nexport type Architecture = z.infer<typeof ArchitectureSchema>;\nexport type Recall = z.infer<typeof RecallSchema>;\nexport type PerCategoryRecall = z.infer<typeof PerCategoryRecallSchema>;\nexport type PassFail = z.infer<typeof PassFailSchema>;\nexport type DecisionVocabBar = z.infer<typeof DecisionVocabBarSchema>;\nexport type CalibrationReceipt = z.infer<typeof CalibrationReceiptSchema>;\n"],"mappings":";AAAA,SAAS,SAAS;AAEX,IAAM,oBAAoB,EAAE,KAAK;AAAA,EACtC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAEM,IAAM,qBAAqB,EAAE,KAAK,CAAC,eAAe,UAAU,CAAC;AAE7D,IAAM,eAAe,EAAE,OAAO;AAAA,EACnC,SAAS,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACtC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACpC,OAAO,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAChC,CAAC;AAEM,IAAM,0BAA0B,EAAE,OAAO,EAAE,OAAO,GAAG,YAAY;AAEjE,IAAM,iBAAiB,EAAE,OAAO;AAAA,EACrC,YAAY,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACnC,uBAAuB,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC9C,6BAA6B,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,qBAAqB,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC5C,6BAA6B,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,cAAc,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,cAAc,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,oBAAoB,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC3C,SAAS,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAClC,CAAC;AAEM,IAAM,yBAAyB,EAAE,OAAO;AAAA,EAC7C,cAAc;AAAA,EACd,UAAU,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACpC,UAAU,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACvC,QAAQ,EAAE,QAAQ;AACpB,CAAC;AAEM,IAAM,2BAA2B,EAAE,OAAO;AAAA,EAC/C,gBAAgB,EAAE,QAAQ,CAAC;AAAA,EAC3B,cAAc,EAAE,OAAO;AAAA,EACvB,QAAQ;AAAA,EACR,OAAO,EAAE,OAAO;AAAA,EAChB,cAAc;AAAA,EACd,SAAS,EAAE,OAAO;AAAA,EAClB,sBAAsB,EAAE,OAAO,EAAE,IAAI,
EAAE,SAAS;AAAA,EAChD,qBAAqB,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAClD,oBAAoB,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACjD,eAAe,EAAE,OAAO;AAAA,EACxB,qBAAqB,EAAE,OAAO;AAAA,EAC9B,YAAY,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACzC,eAAe,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAC5C,iBAAiB;AAAA,EACjB,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,qBAAqB;AAAA,EACrB,qBAAqB,EAAE,OAAO,EAAE,OAAO,GAAG,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY,CAAC;AAAA,EACxE,0BAA0B,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACvD,oBAAoB;AAAA,EACpB,uBAAuB,EAAE,MAAM,EAAE,OAAO,CAAC;AAAA,EACzC,8BAA8B,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAC3D,WAAW;AAAA,EACX,OAAO,EAAE,MAAM,EAAE,OAAO,CAAC;AAC3B,CAAC;","names":[]}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import { CalibrationReceipt, Architecture, DecisionVocabBar, Recall, PerCategoryRecall, PassFail, StatusLabel } from './receipt-schema.js';
|
|
2
|
+
import 'zod';
|
|
3
|
+
|
|
4
|
+
/**
 * Builds the decision-vocabulary bar for a calibration run.
 *
 * The required number of distinct decisions is 3 for the `two-pass`
 * architecture and 4 otherwise; the bar passes when
 * `decisionsProducedCount` is at least that number.
 *
 * @param architecture - Architecture the run used.
 * @param decisionsProducedCount - Number of distinct decisions produced.
 * @returns The bar with `architecture`, `required`, `produced` and `passed`.
 */
declare function computeDecisionVocabBar(architecture: Architecture, decisionsProducedCount: number): DecisionVocabBar;
|
|
5
|
+
/**
 * Scores a calibration run against each bar and derives the overall verdict.
 *
 * Bars, as applied by the accompanying implementation:
 * - `fp_ceiling`: PASS when `good_fp_count` is at most 1.
 * - `any_flag_recall_floor`: PASS when the any-flag ratio is at least 0.65.
 * - `per_category_any_flag_floor`: FAIL when any category with `total >= 2`
 *   has a ratio below 0.5; categories with fewer than 2 seeds are not checked.
 * - `strict_recall_floor`: PASS when the strict ratio is at least 0.2.
 * - `decision_vocab_completeness`: PASS when `decision_vocab_bar.passed` is truthy.
 * - `latency_soft`: WARN (never FAIL) when `runtime_ms` exceeds 600000.
 * - `latency_hard`: FAIL when `runtime_ms` exceeds 1200000.
 * - `empty_or_malformed`: PASS only when `empty_or_malformed_responses` is 0.
 *
 * `overall` is PASS only when every bar other than `latency_soft` is PASS.
 */
declare function computePassFail(input: {
|
|
6
|
+
good_fp_count: number;
|
|
7
|
+
any_flag_recall: Recall;
|
|
8
|
+
per_category_any_flag: PerCategoryRecall;
|
|
9
|
+
strict_recall: Recall;
|
|
10
|
+
decision_vocab_bar: DecisionVocabBar;
|
|
11
|
+
runtime_ms: number;
|
|
12
|
+
empty_or_malformed_responses: number;
|
|
13
|
+
}): PassFail;
|
|
14
|
+
/**
 * Chooses the status label for a receipt.
 *
 * Rules are applied in this order by the accompanying implementation:
 * 1. `modeOverride === 'comparison_only'` yields `comparison_only`.
 * 2. A `single-pass` run whose `profileName` contains "hermes"
 *    (case-insensitive) yields `comparison_only`.
 * 3. `passFail.overall === 'FAIL'` yields `failed`.
 * 4. A `two-pass` run whose `profileName` contains "hermes" and whose
 *    `goodFpCount` is 0 yields `trusted_baseline`.
 * 5. Anything else yields `conditional_pass`.
 */
declare function computeStatusLabel(input: {
|
|
15
|
+
profileName: string;
|
|
16
|
+
architecture: Architecture;
|
|
17
|
+
passFail: PassFail;
|
|
18
|
+
goodFpCount: number;
|
|
19
|
+
modeOverride?: 'comparison_only';
|
|
20
|
+
}): StatusLabel;
|
|
21
|
+
/**
 * Condenses a receipt into a flat summary of display strings.
 *
 * Rate and recall fields are formatted as `matched/total (NN%)`.
 * `unsupported_claim_recall` is taken from the `unsupported_claim` entry of
 * `per_category_any_flag`. `notes` is a single line of `key=value` pairs
 * (status, model, arch, overall, decisions).
 *
 * NOTE(review): every field is typed `string | null`, but in the
 * implementation shipped with this declaration only
 * `unsupported_claim_recall` is ever `null` (when the receipt has no
 * `unsupported_claim` category).
 */
declare function receiptToCalibrationSummary(receipt: CalibrationReceipt): {
|
|
22
|
+
fixture: string | null;
|
|
23
|
+
good_false_positive_rate: string | null;
|
|
24
|
+
bad_any_flag_recall: string | null;
|
|
25
|
+
strict_category_recall: string | null;
|
|
26
|
+
unsupported_claim_recall: string | null;
|
|
27
|
+
notes: string | null;
|
|
28
|
+
};
|
|
29
|
+
/**
 * Renders a receipt as a Markdown report: a title and metadata bullets,
 * headline metrics, the PASS/FAIL table, the per-category recall table, the
 * decision-vocabulary table, and a "Notes" section when the receipt has notes.
 *
 * @param r - Receipt to render.
 * @returns The Markdown text.
 */
declare function buildReceiptMarkdown(r: CalibrationReceipt): string;
|
|
30
|
+
|
|
31
|
+
export { buildReceiptMarkdown, computeDecisionVocabBar, computePassFail, computeStatusLabel, receiptToCalibrationSummary };
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
// src/calibration/receipt.ts
|
|
2
|
+
// Architecture-aware decision-vocabulary bar.
// A "two-pass" run must produce at least 3 distinct decisions; any other
// architecture must produce at least 4. Returns the bar together with the
// observed count and whether the bar was met.
function computeDecisionVocabBar(architecture, decisionsProducedCount) {
  let required = 4;
  if (architecture === "two-pass") {
    required = 3;
  }
  const passed = decisionsProducedCount >= required;
  return { architecture, required, produced: decisionsProducedCount, passed };
}
|
|
11
|
+
// Per-category any-flag floor.
// Returns "FAIL" as soon as one category with at least 2 seeded claims has a
// ratio below 0.5, otherwise "PASS". Categories with fewer than 2 seeds are
// ignored: there is not enough signal to enforce a floor on them.
function computePerCategoryFloor(perCategoryAnyFlag) {
  const someCategoryBelowFloor = Object.values(perCategoryAnyFlag).some(
    (recall) => recall.total >= 2 && recall.ratio < 0.5
  );
  return someCategoryBelowFloor ? "FAIL" : "PASS";
}
|
|
17
|
+
// Scores one calibration run against every bar and derives the overall verdict.
// latency_soft is advisory: it reports "WARN" rather than "FAIL" and does not
// take part in the overall result. Every other bar is a hard bar, and overall
// is "PASS" only when all hard bars are "PASS".
function computePassFail(input) {
  const grade = (ok) => (ok ? "PASS" : "FAIL");
  const bars = {
    fp_ceiling: grade(input.good_fp_count <= 1),
    any_flag_recall_floor: grade(input.any_flag_recall.ratio >= 0.65),
    per_category_any_flag_floor: computePerCategoryFloor(input.per_category_any_flag),
    strict_recall_floor: grade(input.strict_recall.ratio >= 0.2),
    decision_vocab_completeness: grade(input.decision_vocab_bar.passed),
    // 6e5 ms = 10 minutes; going over only warns.
    latency_soft: input.runtime_ms <= 6e5 ? "PASS" : "WARN",
    // 12e5 ms = 20 minutes; going over fails the run.
    latency_hard: grade(input.runtime_ms <= 12e5),
    empty_or_malformed: grade(input.empty_or_malformed_responses === 0)
  };
  const hardBarFailed = Object.entries(bars).some(
    ([name, result]) => name !== "latency_soft" && result !== "PASS"
  );
  return { ...bars, overall: hardBarFailed ? "FAIL" : "PASS" };
}
|
|
48
|
+
// Picks the status label for a receipt. Rules, in priority order:
//   1. an explicit "comparison_only" override wins;
//   2. a single-pass run of a profile whose name contains "hermes"
//      (case-insensitive) is comparison_only;
//   3. an overall FAIL is "failed";
//   4. a two-pass hermes run with zero good-claim false positives is
//      "trusted_baseline";
//   5. everything else is "conditional_pass".
function computeStatusLabel(input) {
  if (input.modeOverride === "comparison_only") {
    return "comparison_only";
  }
  const hermesProfile = /hermes/i.test(input.profileName);
  if (hermesProfile && input.architecture === "single-pass") {
    return "comparison_only";
  }
  if (input.passFail.overall === "FAIL") {
    return "failed";
  }
  const cleanHermesTwoPass =
    hermesProfile && input.architecture === "two-pass" && input.goodFpCount === 0;
  return cleanHermesTwoPass ? "trusted_baseline" : "conditional_pass";
}
|
|
58
|
+
// Condenses a receipt into a flat summary of display strings.
// Rates and recalls are rendered as "matched/total (NN%)". The false-positive
// percentage is 0 when the fixture has no good claims. unsupported_claim_recall
// is null when the receipt has no "unsupported_claim" category.
function receiptToCalibrationSummary(receipt) {
  const describeRecall = ({ matched, total, ratio }) =>
    `${matched}/${total} (${Math.round(ratio * 100)}%)`;

  const falsePositives = receipt.good_fp_count;
  const goodClaims = receipt.fixture_good_claims;
  let fpPercent = 0;
  if (goodClaims > 0) {
    fpPercent = Math.round(falsePositives / goodClaims * 100);
  }

  const unsupported = receipt.per_category_any_flag["unsupported_claim"];
  const noteParts = [
    `status=${receipt.status}`,
    `model=${receipt.model}`,
    `arch=${receipt.architecture}`,
    `overall=${receipt.pass_fail.overall}`,
    `decisions=${receipt.decisions_produced_count}/6`
  ];

  return {
    fixture: receipt.fixture,
    good_false_positive_rate: `${falsePositives}/${goodClaims} (${fpPercent}%)`,
    bad_any_flag_recall: describeRecall(receipt.any_flag_recall),
    strict_category_recall: describeRecall(receipt.strict_recall),
    unsupported_claim_recall: unsupported ? describeRecall(unsupported) : null,
    notes: noteParts.join(" ")
  };
}
|
|
74
|
+
// Renders a calibration receipt as a Markdown report: title and metadata
// bullets, headline metrics, the PASS/FAIL table, the per-category recall
// table, the decision-vocabulary table and, when the receipt has notes, a
// trailing "Notes" section.
//
// Category names and the fixture name are free-form strings. Wherever they are
// placed inside a table cell they are escaped ("|" becomes "\|", line breaks
// become spaces) so that they cannot split or terminate the table row. Output
// for values without those characters is unchanged.
function buildReceiptMarkdown(r) {
  // The decision vocabulary, in the order the rows are listed.
  const decisions = [
    "accepted_for_synthesis",
    "rejected",
    "needs_scope_repair",
    "needs_source_repair",
    "needs_contradiction_mapping",
    "needs_human_review"
  ];
  const pct = (ratio) => `${Math.round(ratio * 100)}%`;
  // Makes arbitrary text safe to embed in a single Markdown table cell.
  const cell = (text) => String(text).replace(/\r\n|\r|\n/g, " ").replace(/\|/g, "\\|");
  const runtimeSec = (r.runtime_ms / 1e3).toFixed(1);
  const perCatRows = Object.entries(r.per_category_any_flag).map(([cat, af]) => {
    // A category with no strict entry is shown as 0 matched out of the any-flag total.
    const st = r.per_category_strict[cat] ?? { matched: 0, total: af.total, ratio: 0 };
    return `| ${cell(cat)} | ${af.matched}/${af.total} (${pct(af.ratio)}) | ${st.matched}/${st.total} (${pct(st.ratio)}) |`;
  }).join("\n");
  const dvRows = decisions.map((d) => {
    const count = r.decision_vocabulary[d] ?? 0;
    const unreachable = r.unreachable_decisions.includes(d) ? ` (unreachable from ${cell(r.fixture)})` : "";
    return `| ${d} | ${count}${unreachable} |`;
  }).join("\n");
  const pf = r.pass_fail;
  const bar = r.decision_vocab_bar;
  const notesSection = r.notes.length > 0 ? `
## Notes

${r.notes.map((n) => `- ${n}`).join("\n")}
` : "";
  return `# Calibration Receipt \u2014 ${r.profile_name}

- **Model:** ${r.model}
- **Architecture:** ${r.architecture}
- **Status:** ${r.status}
- **Fixture:** ${r.fixture} (${r.fixture_total_claims} claims = ${r.fixture_good_claims} good + ${r.fixture_bad_claims} bad)
- **Calibrated at:** ${r.calibrated_at}
- **Research-OS version:** ${r.research_os_version}
- **Runtime:** ${runtimeSec} seconds

## Headline metrics

- FP: ${r.good_fp_count} / ${r.fixture_good_claims}
- Any-flag recall: ${r.any_flag_recall.matched} / ${r.any_flag_recall.total} (${pct(r.any_flag_recall.ratio)})
- Strict recall: ${r.strict_recall.matched} / ${r.strict_recall.total} (${pct(r.strict_recall.ratio)})
- Decisions produced: ${r.decisions_produced_count} / ${decisions.length}

## PASS / FAIL

| Bar | Result |
|---|---|
| FP ceiling (\u22641) | ${pf.fp_ceiling} |
| Any-flag recall (\u226565%) | ${pf.any_flag_recall_floor} |
| Per-category any-flag (\u226550%) | ${pf.per_category_any_flag_floor} |
| Strict recall (\u226520%) | ${pf.strict_recall_floor} |
| Decision vocab (${bar.architecture} \u2265 ${bar.required}) | ${pf.decision_vocab_completeness} |
| Latency soft (\u226410 min) | ${pf.latency_soft} |
| Latency hard (\u226420 min) | ${pf.latency_hard} |
| Empty/malformed (=0) | ${pf.empty_or_malformed} |
| **OVERALL** | **${pf.overall}** |

## Per-category recall

| Category | Any-flag | Strict |
|---|---|---|
${perCatRows}

## Decision vocabulary

| Decision | Count |
|---|---:|
${dvRows}
${notesSection}`;
}
|
|
144
|
+
export {
|
|
145
|
+
buildReceiptMarkdown,
|
|
146
|
+
computeDecisionVocabBar,
|
|
147
|
+
computePassFail,
|
|
148
|
+
computeStatusLabel,
|
|
149
|
+
receiptToCalibrationSummary
|
|
150
|
+
};
|
|
151
|
+
//# sourceMappingURL=receipt.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/calibration/receipt.ts"],"sourcesContent":["import type {\n Architecture,\n CalibrationReceipt,\n DecisionVocabBar,\n PassFail,\n PerCategoryRecall,\n Recall,\n StatusLabel,\n} from './receipt-schema.js';\n\n// Architecture-aware decision-vocab bar.\n// single-pass: narrow_critic pass is absent, so model uses full 6-decision\n// vocabulary. Bar: >= 4.\n// two-pass: narrow_critic collapses needs_human_review into harder decisions,\n// reducing diversity. Bar: >= 3 (F-49 resolution).\nexport function computeDecisionVocabBar(\n architecture: Architecture,\n decisionsProducedCount: number,\n): DecisionVocabBar {\n const required = architecture === 'two-pass' ? 3 : 4;\n return {\n architecture,\n required,\n produced: decisionsProducedCount,\n passed: decisionsProducedCount >= required,\n };\n}\n\n// Per-category any-flag floor: seeded categories with total >= 2 must have\n// ratio >= 0.50. Categories with fewer than 2 seeds are excluded (not enough\n// signal to enforce a floor — e.g. a 1-seed category with 0 misses is fine).\nfunction computePerCategoryFloor(perCategoryAnyFlag: PerCategoryRecall): 'PASS' | 'FAIL' {\n for (const [, recall] of Object.entries(perCategoryAnyFlag)) {\n if (recall.total >= 2 && recall.ratio < 0.5) return 'FAIL';\n }\n return 'PASS';\n}\n\nexport function computePassFail(input: {\n good_fp_count: number;\n any_flag_recall: Recall;\n per_category_any_flag: PerCategoryRecall;\n strict_recall: Recall;\n decision_vocab_bar: DecisionVocabBar;\n runtime_ms: number;\n empty_or_malformed_responses: number;\n}): PassFail {\n const fp_ceiling = input.good_fp_count <= 1 ? 'PASS' : 'FAIL';\n const any_flag_recall_floor = input.any_flag_recall.ratio >= 0.65 ? 'PASS' : 'FAIL';\n const per_category_any_flag_floor = computePerCategoryFloor(input.per_category_any_flag);\n const strict_recall_floor = input.strict_recall.ratio >= 0.2 ? 'PASS' : 'FAIL';\n const decision_vocab_completeness = input.decision_vocab_bar.passed ? 
'PASS' : 'FAIL';\n // Latency soft: warn-only, never FAIL\n const latency_soft = input.runtime_ms <= 600_000 ? 'PASS' : 'WARN';\n const latency_hard = input.runtime_ms <= 1_200_000 ? 'PASS' : 'FAIL';\n const empty_or_malformed = input.empty_or_malformed_responses === 0 ? 'PASS' : 'FAIL';\n\n const hardBars: Array<'PASS' | 'FAIL'> = [\n fp_ceiling,\n any_flag_recall_floor,\n per_category_any_flag_floor,\n strict_recall_floor,\n decision_vocab_completeness,\n latency_hard,\n empty_or_malformed,\n ];\n const overall = hardBars.every((v) => v === 'PASS') ? 'PASS' : 'FAIL';\n\n return {\n fp_ceiling,\n any_flag_recall_floor,\n per_category_any_flag_floor,\n strict_recall_floor,\n decision_vocab_completeness,\n latency_soft,\n latency_hard,\n empty_or_malformed,\n overall,\n };\n}\n\n// Status-label assignment (advisor-locked predicates).\n//\n// Priority order:\n// 1. comparison_only — explicit flag OR single-pass Hermes (architectural side-run)\n// 2. failed — any hard bar FAIL\n// 3. trusted_baseline — canonical Hermes two-pass with PASS + FP=0\n// 4. 
conditional_pass — everything else that passes bars\n//\n// trusted_baseline encodes the canonical Hermes two-pass admission: the profile\n// is a named Hermes model run in two-pass architecture, all bars pass, and FP=0.\n// Hermes family is detected by case-insensitive substring match on profile name.\n//\n// conditional_pass is the admission status for non-baseline profiles that pass\n// all hard bars but carry a caution (FP at ceiling, non-hermes model, etc.).\n// mistral-nemo:12b two-pass = conditional_pass (FP=1, passes recalibrated bars).\nexport function computeStatusLabel(input: {\n profileName: string;\n architecture: Architecture;\n passFail: PassFail;\n goodFpCount: number;\n modeOverride?: 'comparison_only';\n}): StatusLabel {\n // comparison_only: explicit operator flag\n if (input.modeOverride === 'comparison_only') return 'comparison_only';\n\n // comparison_only: single-pass Hermes is an architectural side-run by design\n // (the canonical profile is two-pass; single-pass exists only for comparison)\n if (input.architecture === 'single-pass' && /hermes/i.test(input.profileName)) {\n return 'comparison_only';\n }\n\n // failed: any hard bar fails (latency_soft is WARN-only, never blocks)\n if (input.passFail.overall === 'FAIL') return 'failed';\n\n // trusted_baseline: canonical Hermes two-pass profile with perfect FP\n // Predicate: profile name contains \"hermes\" (case-insensitive) AND\n // architecture is two-pass AND all bars pass AND FP = 0\n const isHermesTwoPass =\n /hermes/i.test(input.profileName) && input.architecture === 'two-pass';\n if (isHermesTwoPass && input.goodFpCount === 0) return 'trusted_baseline';\n\n // conditional_pass: passes all bars but carries caution\n // (FP at ceiling, non-baseline profile, or non-hermes model)\n return 'conditional_pass';\n}\n\n// Map a receipt to the PromotionCalibrationSummary string shape used by\n// review-active.json. 
Called by the review-promote CLI when auto-populating\n// calibration_summary from a persisted receipt.\nexport function receiptToCalibrationSummary(receipt: CalibrationReceipt): {\n fixture: string | null;\n good_false_positive_rate: string | null;\n bad_any_flag_recall: string | null;\n strict_category_recall: string | null;\n unsupported_claim_recall: string | null;\n notes: string | null;\n} {\n const fp = receipt.good_fp_count;\n const fpTotal = receipt.fixture_good_claims;\n const fpPct = fpTotal > 0 ? Math.round((fp / fpTotal) * 100) : 0;\n\n const af = receipt.any_flag_recall;\n const sr = receipt.strict_recall;\n const unsupported = receipt.per_category_any_flag['unsupported_claim'];\n\n return {\n fixture: receipt.fixture,\n good_false_positive_rate: `${fp}/${fpTotal} (${fpPct}%)`,\n bad_any_flag_recall: `${af.matched}/${af.total} (${Math.round(af.ratio * 100)}%)`,\n strict_category_recall: `${sr.matched}/${sr.total} (${Math.round(sr.ratio * 100)}%)`,\n unsupported_claim_recall: unsupported\n ? `${unsupported.matched}/${unsupported.total} (${Math.round(unsupported.ratio * 100)}%)`\n : null,\n notes: `status=${receipt.status} model=${receipt.model} arch=${receipt.architecture} overall=${receipt.pass_fail.overall} decisions=${receipt.decisions_produced_count}/6`,\n };\n}\n\n// Render a compact Markdown receipt. Operator proof artifact — no prose.\nexport function buildReceiptMarkdown(r: CalibrationReceipt): string {\n const pct = (ratio: number) => `${Math.round(ratio * 100)}%`;\n const runtimeSec = (r.runtime_ms / 1000).toFixed(1);\n\n const perCatRows = Object.entries(r.per_category_any_flag)\n .map(([cat, af]) => {\n const st = r.per_category_strict[cat] ?? 
{ matched: 0, total: af.total, ratio: 0 };\n return `| ${cat} | ${af.matched}/${af.total} (${pct(af.ratio)}) | ${st.matched}/${st.total} (${pct(st.ratio)}) |`;\n })\n .join('\\n');\n\n const dvRows = [\n 'accepted_for_synthesis',\n 'rejected',\n 'needs_scope_repair',\n 'needs_source_repair',\n 'needs_contradiction_mapping',\n 'needs_human_review',\n ]\n .map((d) => {\n const count = r.decision_vocabulary[d] ?? 0;\n const unreachable = r.unreachable_decisions.includes(d) ? ` (unreachable from ${r.fixture})` : '';\n return `| ${d} | ${count}${unreachable} |`;\n })\n .join('\\n');\n\n const pf = r.pass_fail;\n const bar = r.decision_vocab_bar;\n\n const notesSection =\n r.notes.length > 0 ? `\\n## Notes\\n\\n${r.notes.map((n) => `- ${n}`).join('\\n')}\\n` : '';\n\n return `# Calibration Receipt — ${r.profile_name}\n\n- **Model:** ${r.model}\n- **Architecture:** ${r.architecture}\n- **Status:** ${r.status}\n- **Fixture:** ${r.fixture} (${r.fixture_total_claims} claims = ${r.fixture_good_claims} good + ${r.fixture_bad_claims} bad)\n- **Calibrated at:** ${r.calibrated_at}\n- **Research-OS version:** ${r.research_os_version}\n- **Runtime:** ${runtimeSec} seconds\n\n## Headline metrics\n\n- FP: ${r.good_fp_count} / ${r.fixture_good_claims}\n- Any-flag recall: ${r.any_flag_recall.matched} / ${r.any_flag_recall.total} (${pct(r.any_flag_recall.ratio)})\n- Strict recall: ${r.strict_recall.matched} / ${r.strict_recall.total} (${pct(r.strict_recall.ratio)})\n- Decisions produced: ${r.decisions_produced_count} / 6\n\n## PASS / FAIL\n\n| Bar | Result |\n|---|---|\n| FP ceiling (≤1) | ${pf.fp_ceiling} |\n| Any-flag recall (≥65%) | ${pf.any_flag_recall_floor} |\n| Per-category any-flag (≥50%) | ${pf.per_category_any_flag_floor} |\n| Strict recall (≥20%) | ${pf.strict_recall_floor} |\n| Decision vocab (${bar.architecture} ≥ ${bar.required}) | ${pf.decision_vocab_completeness} |\n| Latency soft (≤10 min) | ${pf.latency_soft} |\n| Latency hard (≤20 min) | ${pf.latency_hard} |\n| 
Empty/malformed (=0) | ${pf.empty_or_malformed} |\n| **OVERALL** | **${pf.overall}** |\n\n## Per-category recall\n\n| Category | Any-flag | Strict |\n|---|---|---|\n${perCatRows}\n\n## Decision vocabulary\n\n| Decision | Count |\n|---|---:|\n${dvRows}\n${notesSection}`;\n}\n"],"mappings":";AAeO,SAAS,wBACd,cACA,wBACkB;AAClB,QAAM,WAAW,iBAAiB,aAAa,IAAI;AACnD,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,UAAU;AAAA,IACV,QAAQ,0BAA0B;AAAA,EACpC;AACF;AAKA,SAAS,wBAAwB,oBAAwD;AACvF,aAAW,CAAC,EAAE,MAAM,KAAK,OAAO,QAAQ,kBAAkB,GAAG;AAC3D,QAAI,OAAO,SAAS,KAAK,OAAO,QAAQ,IAAK,QAAO;AAAA,EACtD;AACA,SAAO;AACT;AAEO,SAAS,gBAAgB,OAQnB;AACX,QAAM,aAAa,MAAM,iBAAiB,IAAI,SAAS;AACvD,QAAM,wBAAwB,MAAM,gBAAgB,SAAS,OAAO,SAAS;AAC7E,QAAM,8BAA8B,wBAAwB,MAAM,qBAAqB;AACvF,QAAM,sBAAsB,MAAM,cAAc,SAAS,MAAM,SAAS;AACxE,QAAM,8BAA8B,MAAM,mBAAmB,SAAS,SAAS;AAE/E,QAAM,eAAe,MAAM,cAAc,MAAU,SAAS;AAC5D,QAAM,eAAe,MAAM,cAAc,OAAY,SAAS;AAC9D,QAAM,qBAAqB,MAAM,iCAAiC,IAAI,SAAS;AAE/E,QAAM,WAAmC;AAAA,IACvC;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,QAAM,UAAU,SAAS,MAAM,CAAC,MAAM,MAAM,MAAM,IAAI,SAAS;AAE/D,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AAiBO,SAAS,mBAAmB,OAMnB;AAEd,MAAI,MAAM,iBAAiB,kBAAmB,QAAO;AAIrD,MAAI,MAAM,iBAAiB,iBAAiB,UAAU,KAAK,MAAM,WAAW,GAAG;AAC7E,WAAO;AAAA,EACT;AAGA,MAAI,MAAM,SAAS,YAAY,OAAQ,QAAO;AAK9C,QAAM,kBACJ,UAAU,KAAK,MAAM,WAAW,KAAK,MAAM,iBAAiB;AAC9D,MAAI,mBAAmB,MAAM,gBAAgB,EAAG,QAAO;AAIvD,SAAO;AACT;AAKO,SAAS,4BAA4B,SAO1C;AACA,QAAM,KAAK,QAAQ;AACnB,QAAM,UAAU,QAAQ;AACxB,QAAM,QAAQ,UAAU,IAAI,KAAK,MAAO,KAAK,UAAW,GAAG,IAAI;AAE/D,QAAM,KAAK,QAAQ;AACnB,QAAM,KAAK,QAAQ;AACnB,QAAM,cAAc,QAAQ,sBAAsB,mBAAmB;AAErE,SAAO;AAAA,IACL,SAAS,QAAQ;AAAA,IACjB,0BAA0B,GAAG,EAAE,IAAI,OAAO,KAAK,KAAK;AAAA,IACpD,qBAAqB,GAAG,GAAG,OAAO,IAAI,GAAG,KAAK,KAAK,KAAK,MAAM,GAAG,QAAQ,GAAG,CAAC;AAAA,IAC7E,wBAAwB,GAAG,GAAG,OAAO,IAAI,GAAG,KAAK,KAAK,KAAK,MAAM,GAAG,QAAQ,GAAG,CAAC;AAAA,IAChF,0BAA0B,cACtB,GAAG,YAAY,OAAO,IAAI,YAAY,KAAK,KAAK,KAAK,MAAM,YAAY,QAAQ,GAAG,CAAC
,OACnF;AAAA,IACJ,OAAO,UAAU,QAAQ,MAAM,UAAU,QAAQ,KAAK,SAAS,QAAQ,YAAY,YAAY,QAAQ,UAAU,OAAO,cAAc,QAAQ,wBAAwB;AAAA,EACxK;AACF;AAGO,SAAS,qBAAqB,GAA+B;AAClE,QAAM,MAAM,CAAC,UAAkB,GAAG,KAAK,MAAM,QAAQ,GAAG,CAAC;AACzD,QAAM,cAAc,EAAE,aAAa,KAAM,QAAQ,CAAC;AAElD,QAAM,aAAa,OAAO,QAAQ,EAAE,qBAAqB,EACtD,IAAI,CAAC,CAAC,KAAK,EAAE,MAAM;AAClB,UAAM,KAAK,EAAE,oBAAoB,GAAG,KAAK,EAAE,SAAS,GAAG,OAAO,GAAG,OAAO,OAAO,EAAE;AACjF,WAAO,KAAK,GAAG,MAAM,GAAG,OAAO,IAAI,GAAG,KAAK,KAAK,IAAI,GAAG,KAAK,CAAC,OAAO,GAAG,OAAO,IAAI,GAAG,KAAK,KAAK,IAAI,GAAG,KAAK,CAAC;AAAA,EAC9G,CAAC,EACA,KAAK,IAAI;AAEZ,QAAM,SAAS;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF,EACG,IAAI,CAAC,MAAM;AACV,UAAM,QAAQ,EAAE,oBAAoB,CAAC,KAAK;AAC1C,UAAM,cAAc,EAAE,sBAAsB,SAAS,CAAC,IAAI,sBAAsB,EAAE,OAAO,MAAM;AAC/F,WAAO,KAAK,CAAC,MAAM,KAAK,GAAG,WAAW;AAAA,EACxC,CAAC,EACA,KAAK,IAAI;AAEZ,QAAM,KAAK,EAAE;AACb,QAAM,MAAM,EAAE;AAEd,QAAM,eACJ,EAAE,MAAM,SAAS,IAAI;AAAA;AAAA;AAAA,EAAiB,EAAE,MAAM,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,EAAE,KAAK,IAAI,CAAC;AAAA,IAAO;AAEtF,SAAO,gCAA2B,EAAE,YAAY;AAAA;AAAA,eAEnC,EAAE,KAAK;AAAA,sBACA,EAAE,YAAY;AAAA,gBACpB,EAAE,MAAM;AAAA,iBACP,EAAE,OAAO,KAAK,EAAE,oBAAoB,aAAa,EAAE,mBAAmB,WAAW,EAAE,kBAAkB;AAAA,uBAC/F,EAAE,aAAa;AAAA,6BACT,EAAE,mBAAmB;AAAA,iBACjC,UAAU;AAAA;AAAA;AAAA;AAAA,QAInB,EAAE,aAAa,MAAM,EAAE,mBAAmB;AAAA,qBAC7B,EAAE,gBAAgB,OAAO,MAAM,EAAE,gBAAgB,KAAK,KAAK,IAAI,EAAE,gBAAgB,KAAK,CAAC;AAAA,mBACzF,EAAE,cAAc,OAAO,MAAM,EAAE,cAAc,KAAK,KAAK,IAAI,EAAE,cAAc,KAAK,CAAC;AAAA,wBAC5E,EAAE,wBAAwB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,2BAM5B,GAAG,UAAU;AAAA,kCACN,GAAG,qBAAqB;AAAA,wCAClB,GAAG,2BAA2B;AAAA,gCACtC,GAAG,mBAAmB;AAAA,oBAC7B,IAAI,YAAY,WAAM,IAAI,QAAQ,OAAO,GAAG,2BAA2B;AAAA,kCAC9D,GAAG,YAAY;AAAA,kCACf,GAAG,YAAY;AAAA,2BACjB,GAAG,kBAAkB;AAAA,oBAC5B,GAAG,OAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAM5B,UAAU;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAMV,MAAM;AAAA,EACN,YAAY;AACd;","names":[]}
|