@roleplay-sh/cli 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -35,6 +35,25 @@ declare const reportSchema: z.ZodObject<{
35
35
  recommendations: z.ZodArray<z.ZodString, "many">;
36
36
  startedAt: z.ZodEffects<z.ZodString, string, string>;
37
37
  endedAt: z.ZodEffects<z.ZodString, string, string>;
38
+ judgeMetadata: z.ZodOptional<z.ZodObject<{
39
+ mode: z.ZodEnum<["rules", "semantic", "hybrid"]>;
40
+ provider: z.ZodOptional<z.ZodString>;
41
+ model: z.ZodOptional<z.ZodString>;
42
+ rulesApplied: z.ZodDefault<z.ZodBoolean>;
43
+ deterministicFindingsAdded: z.ZodDefault<z.ZodNumber>;
44
+ }, "strict", z.ZodTypeAny, {
45
+ mode: "rules" | "semantic" | "hybrid";
46
+ rulesApplied: boolean;
47
+ deterministicFindingsAdded: number;
48
+ provider?: string | undefined;
49
+ model?: string | undefined;
50
+ }, {
51
+ mode: "rules" | "semantic" | "hybrid";
52
+ provider?: string | undefined;
53
+ model?: string | undefined;
54
+ rulesApplied?: boolean | undefined;
55
+ deterministicFindingsAdded?: number | undefined;
56
+ }>>;
38
57
  rawJudgeOutput: z.ZodOptional<z.ZodUnknown>;
39
58
  }, "strict", z.ZodTypeAny, {
40
59
  summary: string;
@@ -55,6 +74,13 @@ declare const reportSchema: z.ZodObject<{
55
74
  recommendations: string[];
56
75
  startedAt: string;
57
76
  endedAt: string;
77
+ judgeMetadata?: {
78
+ mode: "rules" | "semantic" | "hybrid";
79
+ rulesApplied: boolean;
80
+ deterministicFindingsAdded: number;
81
+ provider?: string | undefined;
82
+ model?: string | undefined;
83
+ } | undefined;
58
84
  rawJudgeOutput?: unknown;
59
85
  }, {
60
86
  summary: string;
@@ -75,6 +101,13 @@ declare const reportSchema: z.ZodObject<{
75
101
  recommendations: string[];
76
102
  startedAt: string;
77
103
  endedAt: string;
104
+ judgeMetadata?: {
105
+ mode: "rules" | "semantic" | "hybrid";
106
+ provider?: string | undefined;
107
+ model?: string | undefined;
108
+ rulesApplied?: boolean | undefined;
109
+ deterministicFindingsAdded?: number | undefined;
110
+ } | undefined;
78
111
  rawJudgeOutput?: unknown;
79
112
  }>;
80
113
  type Report = z.infer<typeof reportSchema>;
@@ -111,12 +144,12 @@ declare const scenarioSchema: z.ZodObject<{
111
144
  sessionField: string;
112
145
  };
113
146
  type: "http";
114
- url: string;
115
- method: "POST" | "PUT" | "PATCH";
116
- headers: Record<string, string>;
117
147
  output: {
118
148
  responseField: string;
119
149
  };
150
+ url: string;
151
+ method: "POST" | "PUT" | "PATCH";
152
+ headers: Record<string, string>;
120
153
  timeoutMs: number;
121
154
  }, {
122
155
  type: "http";
@@ -125,11 +158,11 @@ declare const scenarioSchema: z.ZodObject<{
125
158
  messageField?: string | undefined;
126
159
  sessionField?: string | undefined;
127
160
  } | undefined;
128
- method?: "POST" | "PUT" | "PATCH" | undefined;
129
- headers?: Record<string, string> | undefined;
130
161
  output?: {
131
162
  responseField?: string | undefined;
132
163
  } | undefined;
164
+ method?: "POST" | "PUT" | "PATCH" | undefined;
165
+ headers?: Record<string, string> | undefined;
133
166
  timeoutMs?: number | undefined;
134
167
  }>, z.ZodObject<{
135
168
  type: z.ZodLiteral<"cli">;
@@ -140,14 +173,14 @@ declare const scenarioSchema: z.ZodObject<{
140
173
  }, "strip", z.ZodTypeAny, {
141
174
  command: string;
142
175
  type: "cli";
143
- timeoutMs: number;
144
176
  mode: "stdin" | "arg";
177
+ timeoutMs: number;
145
178
  shell: boolean;
146
179
  }, {
147
180
  command: string;
148
181
  type: "cli";
149
- timeoutMs?: number | undefined;
150
182
  mode?: "stdin" | "arg" | undefined;
183
+ timeoutMs?: number | undefined;
151
184
  shell?: boolean | undefined;
152
185
  }>, z.ZodObject<{
153
186
  type: z.ZodLiteral<"mock">;
@@ -204,11 +237,11 @@ declare const scenarioSchema: z.ZodObject<{
204
237
  model: z.ZodOptional<z.ZodString>;
205
238
  baseUrl: z.ZodOptional<z.ZodString>;
206
239
  }, "strip", z.ZodTypeAny, {
207
- provider: "mock" | "openai" | "anthropic" | "google" | "openai-compatible";
240
+ provider: "openai" | "anthropic" | "google" | "openai-compatible" | "mock";
208
241
  model?: string | undefined;
209
242
  baseUrl?: string | undefined;
210
243
  }, {
211
- provider?: "mock" | "openai" | "anthropic" | "google" | "openai-compatible" | undefined;
244
+ provider?: "openai" | "anthropic" | "google" | "openai-compatible" | "mock" | undefined;
212
245
  model?: string | undefined;
213
246
  baseUrl?: string | undefined;
214
247
  }>>;
@@ -220,12 +253,12 @@ declare const scenarioSchema: z.ZodObject<{
220
253
  type: z.ZodDefault<z.ZodEnum<["mock", "openai", "anthropic", "google", "openai-compatible"]>>;
221
254
  rubric: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
222
255
  }, "provider">, "strip", z.ZodTypeAny, {
223
- type: "mock" | "openai" | "anthropic" | "google" | "openai-compatible";
256
+ type: "openai" | "anthropic" | "google" | "openai-compatible" | "mock";
224
257
  model?: string | undefined;
225
258
  baseUrl?: string | undefined;
226
259
  rubric?: Record<string, number> | undefined;
227
260
  }, {
228
- type?: "mock" | "openai" | "anthropic" | "google" | "openai-compatible" | undefined;
261
+ type?: "openai" | "anthropic" | "google" | "openai-compatible" | "mock" | undefined;
229
262
  model?: string | undefined;
230
263
  baseUrl?: string | undefined;
231
264
  rubric?: Record<string, number> | undefined;
@@ -240,24 +273,30 @@ declare const scenarioSchema: z.ZodObject<{
240
273
  }, "strip", z.ZodTypeAny, {
241
274
  name: string;
242
275
  description: string;
276
+ judge: {
277
+ type: "openai" | "anthropic" | "google" | "openai-compatible" | "mock";
278
+ model?: string | undefined;
279
+ baseUrl?: string | undefined;
280
+ rubric?: Record<string, number> | undefined;
281
+ };
243
282
  target: {
244
283
  input: {
245
284
  messageField: string;
246
285
  sessionField: string;
247
286
  };
248
287
  type: "http";
249
- url: string;
250
- method: "POST" | "PUT" | "PATCH";
251
- headers: Record<string, string>;
252
288
  output: {
253
289
  responseField: string;
254
290
  };
291
+ url: string;
292
+ method: "POST" | "PUT" | "PATCH";
293
+ headers: Record<string, string>;
255
294
  timeoutMs: number;
256
295
  } | {
257
296
  command: string;
258
297
  type: "cli";
259
- timeoutMs: number;
260
298
  mode: "stdin" | "arg";
299
+ timeoutMs: number;
261
300
  shell: boolean;
262
301
  } | {
263
302
  type: "mock";
@@ -277,17 +316,11 @@ declare const scenarioSchema: z.ZodObject<{
277
316
  hiddenContext: string[];
278
317
  successCriteria: string[];
279
318
  failureCriteria: string[];
280
- judge: {
281
- type: "mock" | "openai" | "anthropic" | "google" | "openai-compatible";
282
- model?: string | undefined;
283
- baseUrl?: string | undefined;
284
- rubric?: Record<string, number> | undefined;
285
- };
286
319
  output?: {
287
320
  expectations: string[];
288
321
  } | undefined;
289
322
  attacker?: {
290
- provider: "mock" | "openai" | "anthropic" | "google" | "openai-compatible";
323
+ provider: "openai" | "anthropic" | "google" | "openai-compatible" | "mock";
291
324
  model?: string | undefined;
292
325
  baseUrl?: string | undefined;
293
326
  } | undefined;
@@ -300,17 +333,17 @@ declare const scenarioSchema: z.ZodObject<{
300
333
  messageField?: string | undefined;
301
334
  sessionField?: string | undefined;
302
335
  } | undefined;
303
- method?: "POST" | "PUT" | "PATCH" | undefined;
304
- headers?: Record<string, string> | undefined;
305
336
  output?: {
306
337
  responseField?: string | undefined;
307
338
  } | undefined;
339
+ method?: "POST" | "PUT" | "PATCH" | undefined;
340
+ headers?: Record<string, string> | undefined;
308
341
  timeoutMs?: number | undefined;
309
342
  } | {
310
343
  command: string;
311
344
  type: "cli";
312
- timeoutMs?: number | undefined;
313
345
  mode?: "stdin" | "arg" | undefined;
346
+ timeoutMs?: number | undefined;
314
347
  shell?: boolean | undefined;
315
348
  } | {
316
349
  type: "mock";
@@ -325,6 +358,12 @@ declare const scenarioSchema: z.ZodObject<{
325
358
  };
326
359
  successCriteria: string[];
327
360
  description?: string | undefined;
361
+ judge?: {
362
+ type?: "openai" | "anthropic" | "google" | "openai-compatible" | "mock" | undefined;
363
+ model?: string | undefined;
364
+ baseUrl?: string | undefined;
365
+ rubric?: Record<string, number> | undefined;
366
+ } | undefined;
328
367
  output?: {
329
368
  expectations?: string[] | undefined;
330
369
  } | undefined;
@@ -335,15 +374,9 @@ declare const scenarioSchema: z.ZodObject<{
335
374
  hiddenContext?: string[] | undefined;
336
375
  failureCriteria?: string[] | undefined;
337
376
  attacker?: {
338
- provider?: "mock" | "openai" | "anthropic" | "google" | "openai-compatible" | undefined;
339
- model?: string | undefined;
340
- baseUrl?: string | undefined;
341
- } | undefined;
342
- judge?: {
343
- type?: "mock" | "openai" | "anthropic" | "google" | "openai-compatible" | undefined;
377
+ provider?: "openai" | "anthropic" | "google" | "openai-compatible" | "mock" | undefined;
344
378
  model?: string | undefined;
345
379
  baseUrl?: string | undefined;
346
- rubric?: Record<string, number> | undefined;
347
380
  } | undefined;
348
381
  }>;
349
382
  type Scenario = z.infer<typeof scenarioSchema>;
@@ -425,6 +458,10 @@ declare const transcriptSchema: z.ZodEffects<z.ZodObject<{
425
458
  }>;
426
459
  type Transcript = z.infer<typeof transcriptSchema>;
427
460
 
461
+ type LlmProviderName = 'mock' | 'openai' | 'anthropic' | 'google' | 'openai-compatible';
462
+
463
+ type JudgeMode = 'rules' | 'semantic' | 'hybrid';
464
+
428
465
  interface RunPaths {
429
466
  runId: string;
430
467
  runDir: string;
@@ -435,8 +472,6 @@ interface RunPaths {
435
472
  metadataPath: string;
436
473
  }
437
474
 
438
- type LlmProviderName = 'mock' | 'openai' | 'anthropic' | 'google' | 'openai-compatible';
439
-
440
475
  interface RunOptions {
441
476
  scenarioRef: string;
442
477
  maxTurns?: number;
@@ -445,6 +480,7 @@ interface RunOptions {
445
480
  metadata?: Record<string, unknown>;
446
481
  attackerProvider?: LlmProviderName;
447
482
  judgeProvider?: LlmProviderName;
483
+ judgeMode?: JudgeMode;
448
484
  attackerModel?: string;
449
485
  judgeModel?: string;
450
486
  llmBaseUrl?: string;
@@ -505,6 +541,25 @@ declare const cloudUploadSchema: z.ZodEffects<z.ZodObject<{
505
541
  recommendations: z.ZodArray<z.ZodString, "many">;
506
542
  startedAt: z.ZodEffects<z.ZodString, string, string>;
507
543
  endedAt: z.ZodEffects<z.ZodString, string, string>;
544
+ judgeMetadata: z.ZodOptional<z.ZodObject<{
545
+ mode: z.ZodEnum<["rules", "semantic", "hybrid"]>;
546
+ provider: z.ZodOptional<z.ZodString>;
547
+ model: z.ZodOptional<z.ZodString>;
548
+ rulesApplied: z.ZodDefault<z.ZodBoolean>;
549
+ deterministicFindingsAdded: z.ZodDefault<z.ZodNumber>;
550
+ }, "strict", z.ZodTypeAny, {
551
+ mode: "rules" | "semantic" | "hybrid";
552
+ rulesApplied: boolean;
553
+ deterministicFindingsAdded: number;
554
+ provider?: string | undefined;
555
+ model?: string | undefined;
556
+ }, {
557
+ mode: "rules" | "semantic" | "hybrid";
558
+ provider?: string | undefined;
559
+ model?: string | undefined;
560
+ rulesApplied?: boolean | undefined;
561
+ deterministicFindingsAdded?: number | undefined;
562
+ }>>;
508
563
  rawJudgeOutput: z.ZodOptional<z.ZodUnknown>;
509
564
  }, "strict", z.ZodTypeAny, {
510
565
  summary: string;
@@ -525,6 +580,13 @@ declare const cloudUploadSchema: z.ZodEffects<z.ZodObject<{
525
580
  recommendations: string[];
526
581
  startedAt: string;
527
582
  endedAt: string;
583
+ judgeMetadata?: {
584
+ mode: "rules" | "semantic" | "hybrid";
585
+ rulesApplied: boolean;
586
+ deterministicFindingsAdded: number;
587
+ provider?: string | undefined;
588
+ model?: string | undefined;
589
+ } | undefined;
528
590
  rawJudgeOutput?: unknown;
529
591
  }, {
530
592
  summary: string;
@@ -545,6 +607,13 @@ declare const cloudUploadSchema: z.ZodEffects<z.ZodObject<{
545
607
  recommendations: string[];
546
608
  startedAt: string;
547
609
  endedAt: string;
610
+ judgeMetadata?: {
611
+ mode: "rules" | "semantic" | "hybrid";
612
+ provider?: string | undefined;
613
+ model?: string | undefined;
614
+ rulesApplied?: boolean | undefined;
615
+ deterministicFindingsAdded?: number | undefined;
616
+ } | undefined;
548
617
  rawJudgeOutput?: unknown;
549
618
  }>;
550
619
  transcript: z.ZodOptional<z.ZodEffects<z.ZodObject<{
@@ -642,6 +711,13 @@ declare const cloudUploadSchema: z.ZodEffects<z.ZodObject<{
642
711
  recommendations: string[];
643
712
  startedAt: string;
644
713
  endedAt: string;
714
+ judgeMetadata?: {
715
+ mode: "rules" | "semantic" | "hybrid";
716
+ rulesApplied: boolean;
717
+ deterministicFindingsAdded: number;
718
+ provider?: string | undefined;
719
+ model?: string | undefined;
720
+ } | undefined;
645
721
  rawJudgeOutput?: unknown;
646
722
  };
647
723
  transcript?: {
@@ -679,6 +755,13 @@ declare const cloudUploadSchema: z.ZodEffects<z.ZodObject<{
679
755
  recommendations: string[];
680
756
  startedAt: string;
681
757
  endedAt: string;
758
+ judgeMetadata?: {
759
+ mode: "rules" | "semantic" | "hybrid";
760
+ provider?: string | undefined;
761
+ model?: string | undefined;
762
+ rulesApplied?: boolean | undefined;
763
+ deterministicFindingsAdded?: number | undefined;
764
+ } | undefined;
682
765
  rawJudgeOutput?: unknown;
683
766
  };
684
767
  transcript?: {
@@ -718,6 +801,13 @@ declare const cloudUploadSchema: z.ZodEffects<z.ZodObject<{
718
801
  recommendations: string[];
719
802
  startedAt: string;
720
803
  endedAt: string;
804
+ judgeMetadata?: {
805
+ mode: "rules" | "semantic" | "hybrid";
806
+ rulesApplied: boolean;
807
+ deterministicFindingsAdded: number;
808
+ provider?: string | undefined;
809
+ model?: string | undefined;
810
+ } | undefined;
721
811
  rawJudgeOutput?: unknown;
722
812
  };
723
813
  transcript?: {
@@ -767,6 +857,13 @@ declare const cloudUploadSchema: z.ZodEffects<z.ZodObject<{
767
857
  recommendations: string[];
768
858
  startedAt: string;
769
859
  endedAt: string;
860
+ judgeMetadata?: {
861
+ mode: "rules" | "semantic" | "hybrid";
862
+ provider?: string | undefined;
863
+ model?: string | undefined;
864
+ rulesApplied?: boolean | undefined;
865
+ deterministicFindingsAdded?: number | undefined;
866
+ } | undefined;
770
867
  rawJudgeOutput?: unknown;
771
868
  };
772
869
  transcript?: {
@@ -816,6 +913,13 @@ declare const cloudUploadSchema: z.ZodEffects<z.ZodObject<{
816
913
  recommendations: string[];
817
914
  startedAt: string;
818
915
  endedAt: string;
916
+ judgeMetadata?: {
917
+ mode: "rules" | "semantic" | "hybrid";
918
+ rulesApplied: boolean;
919
+ deterministicFindingsAdded: number;
920
+ provider?: string | undefined;
921
+ model?: string | undefined;
922
+ } | undefined;
819
923
  rawJudgeOutput?: unknown;
820
924
  };
821
925
  transcript?: {
@@ -865,6 +969,13 @@ declare const cloudUploadSchema: z.ZodEffects<z.ZodObject<{
865
969
  recommendations: string[];
866
970
  startedAt: string;
867
971
  endedAt: string;
972
+ judgeMetadata?: {
973
+ mode: "rules" | "semantic" | "hybrid";
974
+ provider?: string | undefined;
975
+ model?: string | undefined;
976
+ rulesApplied?: boolean | undefined;
977
+ deterministicFindingsAdded?: number | undefined;
978
+ } | undefined;
868
979
  rawJudgeOutput?: unknown;
869
980
  };
870
981
  transcript?: {
package/dist/index.js CHANGED
@@ -179,6 +179,16 @@ async function loadScenarioFile(path) {
179
179
  }
180
180
  }
181
181
 
182
+ // src/core/scoring.ts
183
+ function statusFromScore(score, failures) {
184
+ if (failures.some((failure) => failure.severity === "high" || failure.severity === "critical")) {
185
+ return "failed";
186
+ }
187
+ if (score < 60) return "failed";
188
+ if (score < 80) return "warning";
189
+ return "passed";
190
+ }
191
+
182
192
  // src/providers/llm/client.ts
183
193
  var defaultModels = {
184
194
  openai: "gpt-4.1-mini",
@@ -352,18 +362,6 @@ function invalidProviderResponse(provider, raw) {
352
362
 
353
363
  // src/providers/judge/llm-judge.ts
354
364
  import { z as z2 } from "zod";
355
-
356
- // src/core/scoring.ts
357
- function statusFromScore(score, failures) {
358
- if (failures.some((failure) => failure.severity === "high" || failure.severity === "critical")) {
359
- return "failed";
360
- }
361
- if (score < 60) return "failed";
362
- if (score < 80) return "warning";
363
- return "passed";
364
- }
365
-
366
- // src/providers/judge/llm-judge.ts
367
365
  var criterionSchema = z2.object({
368
366
  criterion: z2.string().min(1),
369
367
  result: z2.enum(["passed", "failed", "unclear"]),
@@ -424,6 +422,13 @@ var LlmJudge = class {
424
422
  recommendations: parsed.data.recommendations.length ? parsed.data.recommendations : ["Review the failed criteria and strengthen verification before state-changing actions."],
425
423
  startedAt: input.transcript.startedAt,
426
424
  endedAt: input.transcript.endedAt ?? (/* @__PURE__ */ new Date()).toISOString(),
425
+ judgeMetadata: {
426
+ mode: "semantic",
427
+ provider: this.provider.provider,
428
+ model: this.provider.model,
429
+ rulesApplied: false,
430
+ deterministicFindingsAdded: 0
431
+ },
427
432
  rawJudgeOutput: {
428
433
  provider: this.provider.provider,
429
434
  model: this.provider.model,
@@ -550,16 +555,80 @@ var MockJudge = class {
550
555
  "Escalate out-of-policy requests instead of inventing exceptions."
551
556
  ] : ["Keep using scenario coverage for policy and tone regressions."],
552
557
  startedAt: input.transcript.startedAt,
553
- endedAt
558
+ endedAt,
559
+ judgeMetadata: {
560
+ mode: "rules",
561
+ rulesApplied: true,
562
+ deterministicFindingsAdded: failures.length
563
+ }
554
564
  };
555
565
  }
556
566
  };
557
567
 
558
568
  // src/providers/judge/index.ts
559
569
  function createJudge(options = {}) {
570
+ const mode = options.mode ?? (options.provider && options.provider !== "mock" ? "semantic" : "rules");
571
+ if (mode === "rules") return new MockJudge();
560
572
  const provider = options.provider ?? "mock";
561
573
  if (provider === "mock") return new MockJudge();
562
- return new LlmJudge(resolveProviderOptions({ provider, model: options.model, baseUrl: options.baseUrl }));
574
+ const semantic = new LlmJudge(resolveProviderOptions({ provider, model: options.model, baseUrl: options.baseUrl }));
575
+ if (mode === "hybrid") return new HybridJudge(semantic, new MockJudge(), provider, options.model);
576
+ return semantic;
577
+ }
578
+ var HybridJudge = class {
579
+ constructor(semantic, rules, provider, model) {
580
+ this.semantic = semantic;
581
+ this.rules = rules;
582
+ this.provider = provider;
583
+ this.model = model;
584
+ }
585
+ semantic;
586
+ rules;
587
+ provider;
588
+ model;
589
+ async judge(input) {
590
+ const semantic = await this.semantic.judge(input);
591
+ const rules = await this.rules.judge(input);
592
+ const addedFailures = mergeFailures(semantic.failures, rules.failures);
593
+ const addedCriteria = mergeCriteria(semantic.criteria, rules.criteria);
594
+ const failures = [...semantic.failures, ...addedFailures];
595
+ const criteria = [...semantic.criteria, ...addedCriteria];
596
+ const recommendations = [...semantic.recommendations];
597
+ for (const recommendation of rules.recommendations) {
598
+ if (!recommendations.includes(recommendation)) recommendations.push(recommendation);
599
+ }
600
+ return {
601
+ ...semantic,
602
+ score: Math.min(semantic.score, rules.score),
603
+ status: statusFromScore(Math.min(semantic.score, rules.score), failures),
604
+ criteria,
605
+ failures,
606
+ recommendations,
607
+ judgeMetadata: {
608
+ mode: "hybrid",
609
+ provider: this.provider,
610
+ model: this.model ?? semantic.judgeMetadata?.model,
611
+ rulesApplied: true,
612
+ deterministicFindingsAdded: addedFailures.length
613
+ },
614
+ rawJudgeOutput: {
615
+ semantic: semantic.rawJudgeOutput,
616
+ rules: {
617
+ score: rules.score,
618
+ failures: rules.failures,
619
+ criteria: rules.criteria
620
+ }
621
+ }
622
+ };
623
+ }
624
+ };
625
+ function mergeFailures(existing, candidates) {
626
+ const seen = new Set(existing.map((failure) => `${failure.type}:${failure.message}`));
627
+ return candidates.filter((failure) => !seen.has(`${failure.type}:${failure.message}`));
628
+ }
629
+ function mergeCriteria(existing, candidates) {
630
+ const seen = new Set(existing.map((criterion) => criterion.criterion));
631
+ return candidates.filter((criterion) => criterion.result === "failed" && !seen.has(criterion.criterion));
563
632
  }
564
633
 
565
634
  // src/providers/user-simulator/llm-user-simulator.ts
@@ -1034,6 +1103,7 @@ ${redactSecrets(
1034
1103
  - Run ID: ${safeReport.runId}
1035
1104
  - Status: ${safeReport.status}
1036
1105
  - Score: ${safeReport.score}/100
1106
+ - Evaluation: ${evaluationSummary(safeReport)}
1037
1107
  - Started: ${safeReport.startedAt}
1038
1108
  - Ended: ${safeReport.endedAt}
1039
1109
 
@@ -1059,6 +1129,14 @@ ${safeReport.recommendations.length ? safeReport.recommendations.map((item) => `
1059
1129
  ${safeTurns}
1060
1130
  `;
1061
1131
  }
1132
+ function evaluationSummary(report) {
1133
+ const metadata = report.judgeMetadata;
1134
+ if (!metadata) return "not recorded";
1135
+ const provider = metadata.provider ? ` via ${metadata.provider}` : "";
1136
+ const model = metadata.model ? ` (${metadata.model})` : "";
1137
+ const rules = metadata.rulesApplied ? `, deterministic guardrails applied${metadata.deterministicFindingsAdded ? `, ${metadata.deterministicFindingsAdded} added finding(s)` : ""}` : "";
1138
+ return `${metadata.mode}${provider}${model}${rules}`;
1139
+ }
1062
1140
 
1063
1141
  // src/core/engine.ts
1064
1142
  async function runScenario(options) {
@@ -1067,7 +1145,7 @@ async function runScenario(options) {
1067
1145
  const maxTurns = options.maxTurns ?? scenario.simulation.maxTurns;
1068
1146
  const paths = await createRunPaths(options.outDir);
1069
1147
  const transcript = createTranscript(paths.runId, scenario.name);
1070
- const defaultProvider = scenario.target.type === "mock" ? "mock" : "openai";
1148
+ const defaultProvider = scenario.target.type === "mock" ? "mock" : void 0;
1071
1149
  const scenarioJudgeProvider = scenario.judge.type === "mock" ? defaultProvider : scenario.judge.type;
1072
1150
  const scenarioAttackerProvider = scenario.attacker?.provider ?? scenarioJudgeProvider;
1073
1151
  const attackerProvider = options.attackerProvider ?? scenarioAttackerProvider;
@@ -1079,6 +1157,7 @@ async function runScenario(options) {
1079
1157
  });
1080
1158
  const target = createTargetAgent(scenario.target, { allowCliExecution: options.yes });
1081
1159
  const judge = createJudge({
1160
+ mode: options.judgeMode,
1082
1161
  provider: judgeProvider,
1083
1162
  model: options.judgeModel ?? scenario.judge.model,
1084
1163
  baseUrl: options.llmBaseUrl ?? scenario.judge.baseUrl
@@ -1130,6 +1209,13 @@ async function runScenario(options) {
1130
1209
  ],
1131
1210
  startedAt: transcript.startedAt,
1132
1211
  endedAt: transcript.endedAt ?? (/* @__PURE__ */ new Date()).toISOString(),
1212
+ judgeMetadata: {
1213
+ mode: options.judgeMode ?? (judgeProvider && judgeProvider !== "mock" ? "semantic" : "rules"),
1214
+ provider: judgeProvider,
1215
+ model: options.judgeModel ?? scenario.judge.model,
1216
+ rulesApplied: options.judgeMode !== "semantic",
1217
+ deterministicFindingsAdded: 0
1218
+ },
1133
1219
  rawJudgeOutput: appError.toJSON()
1134
1220
  };
1135
1221
  const markdown = generateMarkdownReport(report, transcript);
@@ -1151,6 +1237,13 @@ var failureSchema2 = z4.object({
1151
1237
  severity: z4.enum(["low", "medium", "high", "critical"]),
1152
1238
  message: requiredString("run.report.failures[].message is required")
1153
1239
  }).strict();
1240
+ var judgeMetadataSchema = z4.object({
1241
+ mode: z4.enum(["rules", "semantic", "hybrid"]),
1242
+ provider: z4.string().optional(),
1243
+ model: z4.string().optional(),
1244
+ rulesApplied: z4.boolean().default(false),
1245
+ deterministicFindingsAdded: z4.number().int().nonnegative().default(0)
1246
+ }).strict();
1154
1247
  var reportSchema = z4.object({
1155
1248
  runId: requiredString("run.report.runId is required"),
1156
1249
  scenario: requiredString("run.report.scenario is required"),
@@ -1162,6 +1255,7 @@ var reportSchema = z4.object({
1162
1255
  recommendations: z4.array(z4.string()),
1163
1256
  startedAt: requiredString("run.report.startedAt is required"),
1164
1257
  endedAt: requiredString("run.report.endedAt is required"),
1258
+ judgeMetadata: judgeMetadataSchema.optional(),
1165
1259
  rawJudgeOutput: z4.unknown().optional()
1166
1260
  }).strict();
1167
1261