@mcptoolshop/research-os 0.4.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +253 -0
- package/README.es.md +33 -2
- package/README.fr.md +32 -1
- package/README.hi.md +52 -1
- package/README.it.md +33 -2
- package/README.ja.md +32 -1
- package/README.md +53 -1
- package/README.pt-BR.md +32 -1
- package/README.zh.md +33 -2
- package/dist/calibration/aggregate-receipt-schema.d.ts +547 -0
- package/dist/calibration/aggregate-receipt-schema.js +160 -0
- package/dist/calibration/aggregate-receipt-schema.js.map +1 -0
- package/dist/calibration/aggregate.d.ts +37 -0
- package/dist/calibration/aggregate.js +493 -0
- package/dist/calibration/aggregate.js.map +1 -0
- package/dist/calibration/receipt-schema.d.ts +356 -0
- package/dist/calibration/receipt-schema.js +83 -0
- package/dist/calibration/receipt-schema.js.map +1 -0
- package/dist/calibration/receipt.d.ts +32 -0
- package/dist/calibration/receipt.js +170 -0
- package/dist/calibration/receipt.js.map +1 -0
- package/dist/cli.js +1041 -851
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +154 -49
- package/dist/index.js +881 -818
- package/dist/index.js.map +1 -1
- package/dist/reviewer-options-schema-PZacF_MO.d.ts +27 -0
- package/package.json +1 -1
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
export { R as ReviewerOptions, a as ReviewerOptionsSchema } from '../reviewer-options-schema-PZacF_MO.js';
|
|
3
|
+
|
|
4
|
+
/** Zod enum schema listing the four status labels: `trusted_baseline`, `conditional_pass`, `failed`, `comparison_only`. */
declare const StatusLabelSchema: z.ZodEnum<["trusted_baseline", "conditional_pass", "failed", "comparison_only"]>;
|
|
5
|
+
/** Zod enum schema for the reviewer architecture: `single-pass` or `two-pass`. */
declare const ArchitectureSchema: z.ZodEnum<["single-pass", "two-pass"]>;
|
|
6
|
+
/**
 * Zod object schema for one recall measurement: `matched`, `total` and `ratio`,
 * all numbers.
 *
 * Only the shape is written out; the parsed and input object types are the
 * ones zod derives from that shape for a "strip" object.
 */
declare const RecallSchema: z.ZodObject<
    {
        matched: z.ZodNumber;
        total: z.ZodNumber;
        ratio: z.ZodNumber;
    },
    "strip",
    z.ZodTypeAny
>;
|
|
19
|
+
/**
 * Zod record schema mapping a string key to a recall measurement
 * (`matched`, `total`, `ratio`).
 *
 * The value object's parsed and input types are the ones zod derives from its
 * shape for a "strip" object.
 */
declare const PerCategoryRecallSchema: z.ZodRecord<
    z.ZodString,
    z.ZodObject<{ matched: z.ZodNumber; total: z.ZodNumber; ratio: z.ZodNumber }, "strip", z.ZodTypeAny>
>;
|
|
32
|
+
/**
 * Zod object schema for the per-bar verdicts of a calibration run.
 *
 * Every bar is `"PASS" | "FAIL"` except `latency_soft`, which is
 * `"PASS" | "WARN"`. The parsed and input object types are the ones zod
 * derives from this shape for a "strip" object.
 */
declare const PassFailSchema: z.ZodObject<
    {
        fp_ceiling: z.ZodEnum<["PASS", "FAIL"]>;
        any_flag_recall_floor: z.ZodEnum<["PASS", "FAIL"]>;
        per_category_any_flag_floor: z.ZodEnum<["PASS", "FAIL"]>;
        strict_recall_floor: z.ZodEnum<["PASS", "FAIL"]>;
        decision_vocab_completeness: z.ZodEnum<["PASS", "FAIL"]>;
        latency_soft: z.ZodEnum<["PASS", "WARN"]>;
        latency_hard: z.ZodEnum<["PASS", "FAIL"]>;
        empty_or_malformed: z.ZodEnum<["PASS", "FAIL"]>;
        overall: z.ZodEnum<["PASS", "FAIL"]>;
    },
    "strip",
    z.ZodTypeAny
>;
|
|
63
|
+
/**
 * Zod object schema for the decision-vocabulary bar: the architecture, the
 * `required` and `produced` counts, and whether the bar `passed`.
 *
 * The parsed and input object types are the ones zod derives from this shape
 * for a "strip" object.
 */
declare const DecisionVocabBarSchema: z.ZodObject<
    {
        architecture: z.ZodEnum<["single-pass", "two-pass"]>;
        required: z.ZodNumber;
        produced: z.ZodNumber;
        passed: z.ZodBoolean;
    },
    "strip",
    z.ZodTypeAny
>;
|
|
79
|
+
/**
 * Zod object schema for a calibration receipt (`schema_version` is the literal `1`).
 *
 * `reviewer_options` is the only optional key; all of its own fields are
 * optional numbers. Only the shape is written out here: the parsed and input
 * object types are the ones zod derives from the shape for a "strip" object.
 */
declare const CalibrationReceiptSchema: z.ZodObject<
    {
        schema_version: z.ZodLiteral<1>;
        profile_name: z.ZodString;
        status: z.ZodEnum<["trusted_baseline", "conditional_pass", "failed", "comparison_only"]>;
        model: z.ZodString;
        architecture: z.ZodEnum<["single-pass", "two-pass"]>;
        fixture: z.ZodString;
        fixture_total_claims: z.ZodNumber;
        fixture_good_claims: z.ZodNumber;
        fixture_bad_claims: z.ZodNumber;
        calibrated_at: z.ZodString;
        research_os_version: z.ZodString;
        runtime_ms: z.ZodNumber;
        good_fp_count: z.ZodNumber;
        any_flag_recall: z.ZodObject<{ matched: z.ZodNumber; total: z.ZodNumber; ratio: z.ZodNumber }, "strip", z.ZodTypeAny>;
        strict_recall: z.ZodObject<{ matched: z.ZodNumber; total: z.ZodNumber; ratio: z.ZodNumber }, "strip", z.ZodTypeAny>;
        per_category_any_flag: z.ZodRecord<
            z.ZodString,
            z.ZodObject<{ matched: z.ZodNumber; total: z.ZodNumber; ratio: z.ZodNumber }, "strip", z.ZodTypeAny>
        >;
        per_category_strict: z.ZodRecord<
            z.ZodString,
            z.ZodObject<{ matched: z.ZodNumber; total: z.ZodNumber; ratio: z.ZodNumber }, "strip", z.ZodTypeAny>
        >;
        decision_vocabulary: z.ZodRecord<z.ZodString, z.ZodNumber>;
        decisions_produced_count: z.ZodNumber;
        decision_vocab_bar: z.ZodObject<
            {
                architecture: z.ZodEnum<["single-pass", "two-pass"]>;
                required: z.ZodNumber;
                produced: z.ZodNumber;
                passed: z.ZodBoolean;
            },
            "strip",
            z.ZodTypeAny
        >;
        unreachable_decisions: z.ZodArray<z.ZodString, "many">;
        empty_or_malformed_responses: z.ZodNumber;
        pass_fail: z.ZodObject<
            {
                fp_ceiling: z.ZodEnum<["PASS", "FAIL"]>;
                any_flag_recall_floor: z.ZodEnum<["PASS", "FAIL"]>;
                per_category_any_flag_floor: z.ZodEnum<["PASS", "FAIL"]>;
                strict_recall_floor: z.ZodEnum<["PASS", "FAIL"]>;
                decision_vocab_completeness: z.ZodEnum<["PASS", "FAIL"]>;
                latency_soft: z.ZodEnum<["PASS", "WARN"]>;
                latency_hard: z.ZodEnum<["PASS", "FAIL"]>;
                empty_or_malformed: z.ZodEnum<["PASS", "FAIL"]>;
                overall: z.ZodEnum<["PASS", "FAIL"]>;
            },
            "strip",
            z.ZodTypeAny
        >;
        notes: z.ZodArray<z.ZodString, "many">;
        reviewer_options: z.ZodOptional<
            z.ZodObject<
                {
                    num_ctx: z.ZodOptional<z.ZodNumber>;
                    temperature: z.ZodOptional<z.ZodNumber>;
                    seed: z.ZodOptional<z.ZodNumber>;
                    top_p: z.ZodOptional<z.ZodNumber>;
                    top_k: z.ZodOptional<z.ZodNumber>;
                    repeat_penalty: z.ZodOptional<z.ZodNumber>;
                },
                "strip",
                z.ZodTypeAny
            >
        >;
    },
    "strip",
    z.ZodTypeAny
>;
|
|
348
|
+
/** Union of the status label strings, inferred from `StatusLabelSchema`. */
type StatusLabel = z.infer<typeof StatusLabelSchema>;
|
|
349
|
+
/** Union of the architecture strings, inferred from `ArchitectureSchema`. */
type Architecture = z.infer<typeof ArchitectureSchema>;
|
|
350
|
+
/** Parsed recall measurement (`matched`, `total`, `ratio`), inferred from `RecallSchema`. */
type Recall = z.infer<typeof RecallSchema>;
|
|
351
|
+
/** String-keyed map of recall measurements, inferred from `PerCategoryRecallSchema`. */
type PerCategoryRecall = z.infer<typeof PerCategoryRecallSchema>;
|
|
352
|
+
/** Per-bar verdict object, inferred from `PassFailSchema`. */
type PassFail = z.infer<typeof PassFailSchema>;
|
|
353
|
+
/** Decision-vocabulary bar object, inferred from `DecisionVocabBarSchema`. */
type DecisionVocabBar = z.infer<typeof DecisionVocabBarSchema>;
|
|
354
|
+
/** Parsed calibration receipt, inferred from `CalibrationReceiptSchema`. */
type CalibrationReceipt = z.infer<typeof CalibrationReceiptSchema>;
|
|
355
|
+
|
|
356
|
+
export { type Architecture, ArchitectureSchema, type CalibrationReceipt, CalibrationReceiptSchema, type DecisionVocabBar, DecisionVocabBarSchema, type PassFail, PassFailSchema, type PerCategoryRecall, PerCategoryRecallSchema, type Recall, RecallSchema, type StatusLabel, StatusLabelSchema };
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
// src/calibration/receipt-schema.ts
|
|
2
|
+
import { z as z2 } from "zod";
|
|
3
|
+
|
|
4
|
+
// src/review/reviewer-options-schema.ts
|
|
5
|
+
import { z } from "zod";
|
|
6
|
+
// Sampling parameters for the reviewer. Every key is optional.
// `temperature` has a minimum of 0, so 0 is a valid value: code that consumes
// these options must compare against `undefined` rather than test truthiness.
var ReviewerOptionsSchema = (() => {
  // Small factories so each key still gets its own validator instance.
  const integer = () => z.number().int();
  const atLeastZero = () => z.number().min(0);
  return z.object({
    num_ctx: integer().positive().optional(),
    temperature: atLeastZero().max(2).optional(),
    seed: integer().optional(),
    top_p: atLeastZero().max(1).optional(),
    top_k: integer().nonnegative().optional(),
    repeat_penalty: atLeastZero().optional()
  });
})();
|
|
14
|
+
|
|
15
|
+
// src/calibration/receipt-schema.ts
|
|
16
|
+
// Status labels a calibration receipt may carry, in declaration order.
var StatusLabelSchema = z2.enum(["trusted_baseline", "conditional_pass", "failed", "comparison_only"]);
|
|
22
|
+
// Reviewer architecture: either "single-pass" or "two-pass".
var ArchitectureSchema = z2.enum(["single-pass", "two-pass"]);
|
|
23
|
+
// One recall measurement: `matched` and `total` are non-negative integers,
// `ratio` is a number between 0 and 1 inclusive.
var RecallSchema = (() => {
  const count = () => z2.number().int().nonnegative();
  const shape = {
    matched: count(),
    total: count(),
    ratio: z2.number().min(0).max(1)
  };
  return z2.object(shape);
})();
|
|
28
|
+
// Record schema: string keys mapped to values validated by RecallSchema.
var PerCategoryRecallSchema = z2.record(z2.string(), RecallSchema);
|
|
29
|
+
// Per-bar verdicts for a calibration run. Every bar is "PASS" | "FAIL" except
// `latency_soft`, which is "PASS" | "WARN".
var PassFailSchema = (() => {
  // Factory (not a shared instance) so each key keeps its own enum schema.
  const hardBar = () => z2.enum(["PASS", "FAIL"]);
  return z2.object({
    fp_ceiling: hardBar(),
    any_flag_recall_floor: hardBar(),
    per_category_any_flag_floor: hardBar(),
    strict_recall_floor: hardBar(),
    decision_vocab_completeness: hardBar(),
    latency_soft: z2.enum(["PASS", "WARN"]),
    latency_hard: hardBar(),
    empty_or_malformed: hardBar(),
    overall: hardBar()
  });
})();
|
|
40
|
+
// Decision-vocabulary bar: `required` is a positive integer, `produced` a
// non-negative integer, `passed` a boolean.
var DecisionVocabBarSchema = (() => {
  const integer = () => z2.number().int();
  return z2.object({
    architecture: ArchitectureSchema,
    required: integer().positive(),
    produced: integer().nonnegative(),
    passed: z2.boolean()
  });
})();
|
|
46
|
+
// Calibration receipt, schema_version 1.
// `reviewer_options` is the only optional key; a receipt without it still
// validates. Key order below is kept exactly as declared.
var CalibrationReceiptSchema = (() => {
  // Factories so every key gets its own validator instance.
  const text = () => z2.string();
  const count = () => z2.number().int().nonnegative();
  return z2.object({
    schema_version: z2.literal(1),
    profile_name: text(),
    status: StatusLabelSchema,
    model: text(),
    architecture: ArchitectureSchema,
    fixture: text(),
    // The total must be strictly positive; the good/bad splits may be zero.
    fixture_total_claims: z2.number().int().positive(),
    fixture_good_claims: count(),
    fixture_bad_claims: count(),
    calibrated_at: text(),
    research_os_version: text(),
    runtime_ms: count(),
    good_fp_count: count(),
    any_flag_recall: RecallSchema,
    strict_recall: RecallSchema,
    per_category_any_flag: PerCategoryRecallSchema,
    per_category_strict: PerCategoryRecallSchema,
    decision_vocabulary: z2.record(text(), count()),
    decisions_produced_count: count(),
    decision_vocab_bar: DecisionVocabBarSchema,
    unreachable_decisions: z2.array(text()),
    empty_or_malformed_responses: count(),
    pass_fail: PassFailSchema,
    notes: z2.array(text()),
    reviewer_options: ReviewerOptionsSchema.optional()
  });
})();
|
|
73
|
+
export {
|
|
74
|
+
ArchitectureSchema,
|
|
75
|
+
CalibrationReceiptSchema,
|
|
76
|
+
DecisionVocabBarSchema,
|
|
77
|
+
PassFailSchema,
|
|
78
|
+
PerCategoryRecallSchema,
|
|
79
|
+
RecallSchema,
|
|
80
|
+
ReviewerOptionsSchema,
|
|
81
|
+
StatusLabelSchema
|
|
82
|
+
};
|
|
83
|
+
//# sourceMappingURL=receipt-schema.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/calibration/receipt-schema.ts","../../src/review/reviewer-options-schema.ts"],"sourcesContent":["import { z } from 'zod';\nimport { ReviewerOptionsSchema } from '../review/reviewer-options-schema.js';\nexport { ReviewerOptionsSchema };\nexport type { ReviewerOptions } from '../review/reviewer-options-schema.js';\n\nexport const StatusLabelSchema = z.enum([\n 'trusted_baseline',\n 'conditional_pass',\n 'failed',\n 'comparison_only',\n]);\n\nexport const ArchitectureSchema = z.enum(['single-pass', 'two-pass']);\n\nexport const RecallSchema = z.object({\n matched: z.number().int().nonnegative(),\n total: z.number().int().nonnegative(),\n ratio: z.number().min(0).max(1),\n});\n\nexport const PerCategoryRecallSchema = z.record(z.string(), RecallSchema);\n\nexport const PassFailSchema = z.object({\n fp_ceiling: z.enum(['PASS', 'FAIL']),\n any_flag_recall_floor: z.enum(['PASS', 'FAIL']),\n per_category_any_flag_floor: z.enum(['PASS', 'FAIL']),\n strict_recall_floor: z.enum(['PASS', 'FAIL']),\n decision_vocab_completeness: z.enum(['PASS', 'FAIL']),\n latency_soft: z.enum(['PASS', 'WARN']),\n latency_hard: z.enum(['PASS', 'FAIL']),\n empty_or_malformed: z.enum(['PASS', 'FAIL']),\n overall: z.enum(['PASS', 'FAIL']),\n});\n\nexport const DecisionVocabBarSchema = z.object({\n architecture: ArchitectureSchema,\n required: z.number().int().positive(),\n produced: z.number().int().nonnegative(),\n passed: z.boolean(),\n});\n\n// schema_version: 1 — additive-optional additions (Exp6 Session 2):\n// reviewer_options: optional sampling params used during this calibration run.\n// Absent = stochastic run (pre-v0.6 compat preserved). 
Present = keys explicitly set.\nexport const CalibrationReceiptSchema = z.object({\n schema_version: z.literal(1),\n profile_name: z.string(),\n status: StatusLabelSchema,\n model: z.string(),\n architecture: ArchitectureSchema,\n fixture: z.string(),\n fixture_total_claims: z.number().int().positive(),\n fixture_good_claims: z.number().int().nonnegative(),\n fixture_bad_claims: z.number().int().nonnegative(),\n calibrated_at: z.string(),\n research_os_version: z.string(),\n runtime_ms: z.number().int().nonnegative(),\n good_fp_count: z.number().int().nonnegative(),\n any_flag_recall: RecallSchema,\n strict_recall: RecallSchema,\n per_category_any_flag: PerCategoryRecallSchema,\n per_category_strict: PerCategoryRecallSchema,\n decision_vocabulary: z.record(z.string(), z.number().int().nonnegative()),\n decisions_produced_count: z.number().int().nonnegative(),\n decision_vocab_bar: DecisionVocabBarSchema,\n unreachable_decisions: z.array(z.string()),\n empty_or_malformed_responses: z.number().int().nonnegative(),\n pass_fail: PassFailSchema,\n notes: z.array(z.string()),\n reviewer_options: ReviewerOptionsSchema.optional(),\n});\n\nexport type StatusLabel = z.infer<typeof StatusLabelSchema>;\nexport type Architecture = z.infer<typeof ArchitectureSchema>;\nexport type Recall = z.infer<typeof RecallSchema>;\nexport type PerCategoryRecall = z.infer<typeof PerCategoryRecallSchema>;\nexport type PassFail = z.infer<typeof PassFailSchema>;\nexport type DecisionVocabBar = z.infer<typeof DecisionVocabBarSchema>;\nexport type CalibrationReceipt = z.infer<typeof CalibrationReceiptSchema>;\n","import { z } from 'zod';\n\n// Sampling parameters passed verbatim to the Ollama /api/chat `options` field.\n// Used by OllamaInternReviewer to control determinism. All fields optional —\n// omitted keys fall back to Ollama/model defaults. 
Introduced in Experiment 6\n// Session 2 to make reviewer conditions explicit in calibration receipts.\n//\n// LOAD-BEARING: temperature: 0 is valid and must not be dropped. All merges\n// in OllamaInternReviewer use `!== undefined` checks, NOT truthiness.\nexport const ReviewerOptionsSchema = z.object({\n num_ctx: z.number().int().positive().optional(),\n temperature: z.number().min(0).max(2).optional(),\n seed: z.number().int().optional(),\n top_p: z.number().min(0).max(1).optional(),\n top_k: z.number().int().nonnegative().optional(),\n repeat_penalty: z.number().min(0).optional(),\n});\n\nexport type ReviewerOptions = z.infer<typeof ReviewerOptionsSchema>;\n"],"mappings":";AAAA,SAAS,KAAAA,UAAS;;;ACAlB,SAAS,SAAS;AASX,IAAM,wBAAwB,EAAE,OAAO;AAAA,EAC5C,SAAS,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS;AAAA,EAC9C,aAAa,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,SAAS;AAAA,EAC/C,MAAM,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAChC,OAAO,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,SAAS;AAAA,EACzC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY,EAAE,SAAS;AAAA,EAC/C,gBAAgB,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,SAAS;AAC7C,CAAC;;;ADXM,IAAM,oBAAoBC,GAAE,KAAK;AAAA,EACtC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAEM,IAAM,qBAAqBA,GAAE,KAAK,CAAC,eAAe,UAAU,CAAC;AAE7D,IAAM,eAAeA,GAAE,OAAO;AAAA,EACnC,SAASA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACtC,OAAOA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACpC,OAAOA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAChC,CAAC;AAEM,IAAM,0BAA0BA,GAAE,OAAOA,GAAE,OAAO,GAAG,YAAY;AAEjE,IAAM,iBAAiBA,GAAE,OAAO;AAAA,EACrC,YAAYA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACnC,uBAAuBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC9C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,qBAAqBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC5C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,oBAAoBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC3C,SAASA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAClC,CAAC;AAEM,IAAM,yBAAyBA,GAAE,OAAO;AAAA,EAC7C,cAAc;AAAA,EACd,
UAAUA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACpC,UAAUA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACvC,QAAQA,GAAE,QAAQ;AACpB,CAAC;AAKM,IAAM,2BAA2BA,GAAE,OAAO;AAAA,EAC/C,gBAAgBA,GAAE,QAAQ,CAAC;AAAA,EAC3B,cAAcA,GAAE,OAAO;AAAA,EACvB,QAAQ;AAAA,EACR,OAAOA,GAAE,OAAO;AAAA,EAChB,cAAc;AAAA,EACd,SAASA,GAAE,OAAO;AAAA,EAClB,sBAAsBA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAChD,qBAAqBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAClD,oBAAoBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACjD,eAAeA,GAAE,OAAO;AAAA,EACxB,qBAAqBA,GAAE,OAAO;AAAA,EAC9B,YAAYA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACzC,eAAeA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAC5C,iBAAiB;AAAA,EACjB,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,qBAAqB;AAAA,EACrB,qBAAqBA,GAAE,OAAOA,GAAE,OAAO,GAAGA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY,CAAC;AAAA,EACxE,0BAA0BA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACvD,oBAAoB;AAAA,EACpB,uBAAuBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EACzC,8BAA8BA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAC3D,WAAW;AAAA,EACX,OAAOA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EACzB,kBAAkB,sBAAsB,SAAS;AACnD,CAAC;","names":["z","z"]}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import { CalibrationReceipt, Architecture, DecisionVocabBar, Recall, PerCategoryRecall, PassFail, StatusLabel } from './receipt-schema.js';
|
|
2
|
+
import 'zod';
|
|
3
|
+
import '../reviewer-options-schema-PZacF_MO.js';
|
|
4
|
+
|
|
5
|
+
/** Builds the decision-vocabulary bar for the given architecture and count of decisions produced. */
declare function computeDecisionVocabBar(architecture: Architecture, decisionsProducedCount: number): DecisionVocabBar;
|
|
6
|
+
/**
 * Evaluates every calibration bar for one run.
 *
 * @param input Metrics of the run; see the per-field notes.
 * @returns One verdict per bar plus the combined `overall` verdict.
 */
declare function computePassFail(input: {
    /** Count checked against the false-positive ceiling. */
    good_fp_count: number;
    /** Recall checked against the any-flag floor. */
    any_flag_recall: Recall;
    /** Per-category recalls checked against the per-category floor. */
    per_category_any_flag: PerCategoryRecall;
    /** Recall checked against the strict floor. */
    strict_recall: Recall;
    /** Decision-vocabulary bar whose `passed` flag is reported. */
    decision_vocab_bar: DecisionVocabBar;
    /** Run time in milliseconds, used for both latency bars. */
    runtime_ms: number;
    /** Count of empty or malformed responses. */
    empty_or_malformed_responses: number;
}): PassFail;
|
|
15
|
+
/**
 * Derives the status label of a calibration run.
 *
 * @param input Profile name, architecture, bar verdicts and false-positive
 *   count of the run; `modeOverride`, when given, can only be `'comparison_only'`.
 * @returns The status label for the run.
 */
declare function computeStatusLabel(input: {
    profileName: string;
    architecture: Architecture;
    passFail: PassFail;
    goodFpCount: number;
    modeOverride?: 'comparison_only';
}): StatusLabel;
|
|
22
|
+
/**
 * Condenses a calibration receipt into a flat summary.
 *
 * @param receipt The receipt to summarise.
 * @returns An object with six keys, each holding a string or `null`.
 */
declare function receiptToCalibrationSummary(receipt: CalibrationReceipt): Record<
    | "fixture"
    | "good_false_positive_rate"
    | "bad_any_flag_recall"
    | "strict_category_recall"
    | "unsupported_claim_recall"
    | "notes",
    string | null
>;
|
|
30
|
+
/** Renders a calibration receipt as a Markdown string. */
declare function buildReceiptMarkdown(r: CalibrationReceipt): string;
|
|
31
|
+
|
|
32
|
+
export { buildReceiptMarkdown, computeDecisionVocabBar, computePassFail, computeStatusLabel, receiptToCalibrationSummary };
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
// src/calibration/receipt.ts
|
|
2
|
+
/**
 * Build the decision-vocabulary bar for a calibration run.
 *
 * @param architecture Reviewer architecture; "two-pass" lowers the requirement.
 * @param decisionsProducedCount Number of decisions the run produced.
 * @returns `{ architecture, required, produced, passed }`, where `passed` is
 *   true when the produced count reaches the required count.
 */
function computeDecisionVocabBar(architecture, decisionsProducedCount) {
  // "two-pass" needs 3 decisions; any other architecture value needs 4.
  let required = 4;
  if (architecture === "two-pass") {
    required = 3;
  }
  const passed = decisionsProducedCount >= required;
  return { architecture, required, produced: decisionsProducedCount, passed };
}
|
|
11
|
+
/**
 * Check the per-category any-flag floor.
 *
 * A category breaks the floor when its `total` is at least 2 and its `ratio`
 * is below 0.5. Categories with a `total` under 2 never count against it.
 *
 * @param perCategoryAnyFlag Object mapping category name to `{ matched, total, ratio }`.
 * @returns "FAIL" if any category breaks the floor, otherwise "PASS"
 *   (including when there are no categories).
 */
function computePerCategoryFloor(perCategoryAnyFlag) {
  // Category names are irrelevant to the verdict, so look at the values only.
  const floorBroken = Object.values(perCategoryAnyFlag).some(
    (recall) => recall.total >= 2 && recall.ratio < 0.5
  );
  return floorBroken ? "FAIL" : "PASS";
}
|
|
17
|
+
/**
 * Evaluate every calibration bar for one run.
 *
 * Bars and thresholds:
 * - `fp_ceiling`: `good_fp_count` of at most 1
 * - `any_flag_recall_floor`: any-flag ratio of at least 0.65
 * - `per_category_any_flag_floor`: delegated to `computePerCategoryFloor`
 * - `strict_recall_floor`: strict ratio of at least 0.2
 * - `decision_vocab_completeness`: `decision_vocab_bar.passed`
 * - `latency_soft`: at most 600000 ms, otherwise "WARN" (never "FAIL")
 * - `latency_hard`: at most 1200000 ms
 * - `empty_or_malformed`: exactly 0 empty or malformed responses
 *
 * @param input Metrics of the run.
 * @returns One verdict per bar plus `overall`, which is "PASS" only when every
 *   bar other than `latency_soft` is "PASS".
 */
function computePassFail(input) {
  const verdict = (ok) => (ok ? "PASS" : "FAIL");
  const SOFT_LATENCY_LIMIT_MS = 10 * 60 * 1000;
  const HARD_LATENCY_LIMIT_MS = 20 * 60 * 1000;

  const bars = {
    fp_ceiling: verdict(input.good_fp_count <= 1),
    any_flag_recall_floor: verdict(input.any_flag_recall.ratio >= 0.65),
    per_category_any_flag_floor: computePerCategoryFloor(input.per_category_any_flag),
    strict_recall_floor: verdict(input.strict_recall.ratio >= 0.2),
    decision_vocab_completeness: verdict(input.decision_vocab_bar.passed),
    latency_soft: input.runtime_ms <= SOFT_LATENCY_LIMIT_MS ? "PASS" : "WARN",
    latency_hard: verdict(input.runtime_ms <= HARD_LATENCY_LIMIT_MS),
    empty_or_malformed: verdict(input.empty_or_malformed_responses === 0)
  };

  // The soft latency bar is advisory, so it is left out of the overall verdict.
  const hardVerdicts = Object.entries(bars)
    .filter(([name]) => name !== "latency_soft")
    .map(([, value]) => value);
  const overall = hardVerdicts.every((value) => value === "PASS") ? "PASS" : "FAIL";

  return { ...bars, overall };
}
|
|
48
|
+
/**
 * Derive the status label for a calibration run.
 *
 * Rules, applied in order:
 * 1. `modeOverride === "comparison_only"` gives "comparison_only".
 * 2. A single-pass run whose profile name contains "hermes" (any letter case)
 *    gives "comparison_only".
 * 3. An overall "FAIL" gives "failed".
 * 4. A two-pass run whose profile name contains "hermes" and has zero
 *    false positives gives "trusted_baseline".
 * 5. Anything else gives "conditional_pass".
 *
 * @param input `{ profileName, architecture, passFail, goodFpCount, modeOverride? }`.
 * @returns The status label string.
 */
function computeStatusLabel(input) {
  const { modeOverride, architecture, profileName } = input;
  if (modeOverride === "comparison_only") {
    return "comparison_only";
  }

  const isHermesProfile = /hermes/i.test(profileName);
  if (isHermesProfile && architecture === "single-pass") {
    return "comparison_only";
  }
  if (input.passFail.overall === "FAIL") {
    return "failed";
  }

  const qualifiesAsBaseline =
    isHermesProfile && architecture === "two-pass" && input.goodFpCount === 0;
  return qualifiesAsBaseline ? "trusted_baseline" : "conditional_pass";
}
|
|
58
|
+
/**
 * Condense a calibration receipt into a flat summary of display strings.
 *
 * Rates are rendered as `matched/total (NN%)` with the percentage rounded to
 * the nearest integer. The false-positive percentage is 0 when the receipt has
 * no good claims. `unsupported_claim_recall` is `null` when the receipt has no
 * "unsupported_claim" entry in `per_category_any_flag`.
 *
 * @param receipt The calibration receipt to summarise.
 * @returns `{ fixture, good_false_positive_rate, bad_any_flag_recall,
 *   strict_category_recall, unsupported_claim_recall, notes }`.
 */
function receiptToCalibrationSummary(receipt) {
  // Shared "matched/total (NN%)" rendering for recall objects.
  const renderRecall = (recall) =>
    `${recall.matched}/${recall.total} (${Math.round(recall.ratio * 100)}%)`;

  const falsePositives = receipt.good_fp_count;
  const goodClaims = receipt.fixture_good_claims;
  let falsePositivePercent = 0;
  if (goodClaims > 0) {
    falsePositivePercent = Math.round(falsePositives / goodClaims * 100);
  }

  const anyFlagText = renderRecall(receipt.any_flag_recall);
  const strictText = renderRecall(receipt.strict_recall);
  const unsupportedRecall = receipt.per_category_any_flag["unsupported_claim"];

  return {
    fixture: receipt.fixture,
    good_false_positive_rate: `${falsePositives}/${goodClaims} (${falsePositivePercent}%)`,
    bad_any_flag_recall: anyFlagText,
    strict_category_recall: strictText,
    unsupported_claim_recall: unsupportedRecall ? renderRecall(unsupportedRecall) : null,
    notes: `status=${receipt.status} model=${receipt.model} arch=${receipt.architecture} overall=${receipt.pass_fail.overall} decisions=${receipt.decisions_produced_count}/6`
  };
}
|
|
74
|
+
// Fixed order in which reviewer option keys are listed in the Markdown receipt.
var REVIEWER_OPTIONS_KEY_ORDER = ["num_ctx", "temperature", "seed", "top_p", "top_k", "repeat_penalty"];

/**
 * Render the "Reviewer options" Markdown section.
 *
 * @param opts Reviewer options object, or a falsy value when none were recorded.
 * @returns An empty string when `opts` is falsy or none of the known keys is
 *   set; otherwise a section that starts with a blank line, followed by the
 *   heading, a blank line, one `- key: value` bullet per set key in
 *   `REVIEWER_OPTIONS_KEY_ORDER`, and a trailing newline.
 */
function buildReviewerOptionsSection(opts) {
  if (!opts) {
    return "";
  }
  const bullets = [];
  for (const key of REVIEWER_OPTIONS_KEY_ORDER) {
    const value = opts[key];
    // Compare against undefined, not truthiness, so a value of 0 is still listed.
    if (value !== undefined) {
      bullets.push(`- ${key}: ${value}`);
    }
  }
  if (bullets.length === 0) {
    return "";
  }
  return ["", "## Reviewer options", "", ...bullets, ""].join("\n");
}
|
|
92
|
+
/**
 * Render a calibration receipt as a Markdown document.
 *
 * Sections, in order: title and run metadata, optional reviewer options,
 * headline metrics, the PASS / FAIL table, per-category recall, the decision
 * vocabulary table and, when the receipt has notes, a notes list.
 *
 * Details worth knowing:
 * - Per-category rows follow the keys of `per_category_any_flag`; a category
 *   with no strict entry is shown as 0 matched out of the any-flag total.
 * - The decision table always lists the same six decisions; a decision missing
 *   from `decision_vocabulary` is shown with a count of 0.
 * - Percentages are rounded to the nearest integer; runtime is shown in
 *   seconds with one decimal.
 *
 * @param r The calibration receipt.
 * @returns The Markdown text.
 */
function buildReceiptMarkdown(r) {
  const asPercent = (ratio) => `${Math.round(ratio * 100)}%`;
  const recallCell = (recall) => `${recall.matched}/${recall.total} (${asPercent(recall.ratio)})`;
  const runtimeSeconds = (r.runtime_ms / 1e3).toFixed(1);

  const categoryRows = [];
  for (const [category, anyFlag] of Object.entries(r.per_category_any_flag)) {
    // Fall back to an all-zero strict row that reuses the any-flag total.
    const strict = r.per_category_strict[category] ?? { matched: 0, total: anyFlag.total, ratio: 0 };
    categoryRows.push(`| ${category} | ${recallCell(anyFlag)} | ${recallCell(strict)} |`);
  }

  const listedDecisions = [
    "accepted_for_synthesis",
    "rejected",
    "needs_scope_repair",
    "needs_source_repair",
    "needs_contradiction_mapping",
    "needs_human_review"
  ];
  const decisionRows = listedDecisions.map((decision) => {
    const count = r.decision_vocabulary[decision] ?? 0;
    const suffix = r.unreachable_decisions.includes(decision) ? ` (unreachable from ${r.fixture})` : "";
    return `| ${decision} | ${count}${suffix} |`;
  });

  const verdicts = r.pass_fail;
  const vocabBar = r.decision_vocab_bar;

  let notesSection = "";
  if (r.notes.length > 0) {
    notesSection = ["", "## Notes", "", ...r.notes.map((note) => `- ${note}`), ""].join("\n");
  }
  const reviewerOptionsSection = buildReviewerOptionsSection(r.reviewer_options);

  // One array element per output line; optional sections supply their own
  // surrounding newlines, so they are inserted as single elements.
  const documentLines = [
    `# Calibration Receipt \u2014 ${r.profile_name}`,
    "",
    `- **Model:** ${r.model}`,
    `- **Architecture:** ${r.architecture}`,
    `- **Status:** ${r.status}`,
    `- **Fixture:** ${r.fixture} (${r.fixture_total_claims} claims = ${r.fixture_good_claims} good + ${r.fixture_bad_claims} bad)`,
    `- **Calibrated at:** ${r.calibrated_at}`,
    `- **Research-OS version:** ${r.research_os_version}`,
    `- **Runtime:** ${runtimeSeconds} seconds`,
    reviewerOptionsSection,
    "## Headline metrics",
    "",
    `- FP: ${r.good_fp_count} / ${r.fixture_good_claims}`,
    `- Any-flag recall: ${r.any_flag_recall.matched} / ${r.any_flag_recall.total} (${asPercent(r.any_flag_recall.ratio)})`,
    `- Strict recall: ${r.strict_recall.matched} / ${r.strict_recall.total} (${asPercent(r.strict_recall.ratio)})`,
    `- Decisions produced: ${r.decisions_produced_count} / 6`,
    "",
    "## PASS / FAIL",
    "",
    "| Bar | Result |",
    "|---|---|",
    `| FP ceiling (\u22641) | ${verdicts.fp_ceiling} |`,
    `| Any-flag recall (\u226565%) | ${verdicts.any_flag_recall_floor} |`,
    `| Per-category any-flag (\u226550%) | ${verdicts.per_category_any_flag_floor} |`,
    `| Strict recall (\u226520%) | ${verdicts.strict_recall_floor} |`,
    `| Decision vocab (${vocabBar.architecture} \u2265 ${vocabBar.required}) | ${verdicts.decision_vocab_completeness} |`,
    `| Latency soft (\u226410 min) | ${verdicts.latency_soft} |`,
    `| Latency hard (\u226420 min) | ${verdicts.latency_hard} |`,
    `| Empty/malformed (=0) | ${verdicts.empty_or_malformed} |`,
    `| **OVERALL** | **${verdicts.overall}** |`,
    "",
    "## Per-category recall",
    "",
    "| Category | Any-flag | Strict |",
    "|---|---|---|",
    categoryRows.join("\n"),
    "",
    "## Decision vocabulary",
    "",
    "| Decision | Count |",
    "|---|---:|",
    decisionRows.join("\n"),
    notesSection
  ];
  return documentLines.join("\n");
}
|
|
163
|
+
export {
|
|
164
|
+
buildReceiptMarkdown,
|
|
165
|
+
computeDecisionVocabBar,
|
|
166
|
+
computePassFail,
|
|
167
|
+
computeStatusLabel,
|
|
168
|
+
receiptToCalibrationSummary
|
|
169
|
+
};
|
|
170
|
+
//# sourceMappingURL=receipt.js.map
|