@principles/pd-cli 1.96.0 → 1.98.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. package/dist/commands/quality-scorecard.d.ts +9 -0
  2. package/dist/commands/quality-scorecard.d.ts.map +1 -0
  3. package/dist/commands/quality-scorecard.js +241 -0
  4. package/dist/commands/quality-scorecard.js.map +1 -0
  5. package/dist/commands/runtime-internalization-queue.d.ts.map +1 -1
  6. package/dist/commands/runtime-internalization-queue.js +35 -3
  7. package/dist/commands/runtime-internalization-queue.js.map +1 -1
  8. package/dist/index.js +21 -0
  9. package/dist/index.js.map +1 -1
  10. package/dist/services/quality-scorecard/data-extractor.d.ts +28 -0
  11. package/dist/services/quality-scorecard/data-extractor.d.ts.map +1 -0
  12. package/dist/services/quality-scorecard/data-extractor.js +118 -0
  13. package/dist/services/quality-scorecard/data-extractor.js.map +1 -0
  14. package/dist/services/quality-scorecard/local-evaluator.d.ts +18 -0
  15. package/dist/services/quality-scorecard/local-evaluator.d.ts.map +1 -0
  16. package/dist/services/quality-scorecard/local-evaluator.js +112 -0
  17. package/dist/services/quality-scorecard/local-evaluator.js.map +1 -0
  18. package/dist/services/quality-scorecard/strong-model-gate.d.ts +14 -0
  19. package/dist/services/quality-scorecard/strong-model-gate.d.ts.map +1 -0
  20. package/dist/services/quality-scorecard/strong-model-gate.js +128 -0
  21. package/dist/services/quality-scorecard/strong-model-gate.js.map +1 -0
  22. package/package.json +1 -1
  23. package/src/commands/quality-scorecard.ts +272 -0
  24. package/src/commands/runtime-internalization-queue.ts +37 -3
  25. package/src/index.ts +24 -0
  26. package/src/services/quality-scorecard/data-extractor.ts +150 -0
  27. package/src/services/quality-scorecard/local-evaluator.ts +142 -0
  28. package/src/services/quality-scorecard/strong-model-gate.ts +160 -0
  29. package/tests/commands/runtime-internalization-queue.test.ts +140 -0
@@ -0,0 +1,160 @@
1
+ /**
2
+ * PRI-361 — Strong Model Adjudication Gate (I/O layer in pd-cli)
3
+ *
4
+ * Calls cloud model for adjudication. Uses core validation
5
+ * to parse responses — no unsafe casts.
6
+ */
7
+
8
+ import type {
9
+ PainEpisode,
10
+ LocalEvaluation,
11
+ StrongModelAdjudication,
12
+ AdjudicationStatus,
13
+ } from '@principles/core/quality-scorecard';
14
+ import {
15
+ RUBRIC_LABELS,
16
+ RUBRIC_DIMENSIONS as DIMS,
17
+ meetsMvpThreshold,
18
+ validateAdjudicationResponse,
19
+ extractJsonFromLlmResponse,
20
+ } from '@principles/core/quality-scorecard';
21
+
22
/**
 * Renders the user prompt for strong-model adjudication of one pain episode.
 *
 * The prompt lists the episode's fields, the local model's per-dimension
 * scores and rationales plus its flags, the adjudication task, and the strict
 * JSON shape expected in the reply.
 *
 * @param episode - Pain episode under review.
 * @param localEval - Scores, rationales and flags produced by the local model.
 * @returns The prompt text as a single newline-joined string.
 */
function buildAdjudicationPrompt(
  episode: PainEpisode,
  localEval: LocalEvaluation
): string {
  // One bullet per rubric dimension: "- <id> (<label>): <score>/2 — <rationale>".
  const scoreBullets: string[] = [];
  for (const dim of DIMS) {
    scoreBullets.push(
      `- ${dim} (${RUBRIC_LABELS[dim]}): ${localEval.dimensionScores[dim]}/2 — ${localEval.dimensionRationales[dim]}`
    );
  }

  // Optional / possibly-empty fields are rendered as the literal "none".
  const resolution = episode.evolutionTaskResolution ?? 'none';
  const principles =
    episode.linkedPrinciples.length > 0 ? episode.linkedPrinciples.join(', ') : 'none';
  const flags = localEval.flags.length > 0 ? localEval.flags.join(', ') : 'none';

  const promptLines = [
    `You are a senior quality adjudicator for an AI agent evolution pipeline.`,
    `Your job is to independently re-evaluate a pain episode that was first scored by a local (smaller) model.`,
    `You must provide your own scores — do NOT simply copy the local model's scores.`,
    ``,
    `## Pain Episode`,
    `- ID: ${episode.episodeId}`,
    `- Source: ${episode.source}`,
    `- Pain Score: ${episode.score}`,
    `- Severity: ${episode.severity}`,
    `- Summary: ${episode.summary}`,
    `- Evolution Task Resolution: ${resolution}`,
    `- Linked Principles: ${principles}`,
    ``,
    `## Local Model Scores (${localEval.model})`,
    scoreBullets.join('\n'),
    `Flags: ${flags}`,
    ``,
    `## Your Task`,
    `1. Independently score each dimension (0/1/2) based on the evidence.`,
    `2. Check for: language inconsistency, over-abstraction, fabricated evidence.`,
    `3. If your scores differ from the local model by >=2 points on any dimension, explain why.`,
    `4. Give a final verdict: pass, fail, or needs-review.`,
    ``,
    `## Output Format (STRICT JSON)`,
    `{`,
    `"scores": { "G1": 0-2, "G2": 0-2, "G3": 0-2, "G4": 0-2, "G5": 0-2, "G6": 0-2, "G7": 0-2 },`,
    `"rationale": "Overall assessment...",`,
    `"verdict": "pass" | "fail" | "needs-review"`,
    `}`,
    ``,
    `Do NOT output anything other than this JSON object.`,
  ];

  return promptLines.join('\n');
}
62
+
63
/** Type guard: true for any non-null object, so its properties can be read as `unknown`. */
function isJsonRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null;
}

/**
 * Reads `choices[0].message.content` from a chat-completions payload without
 * casting. Returns '' when any step of the path is missing or `content` is
 * not a string, so the caller reports it as a non-JSON reply.
 */
function readFirstChoiceContent(payload: unknown): string {
  if (!isJsonRecord(payload) || !Array.isArray(payload.choices)) return '';
  const firstChoice: unknown = payload.choices[0];
  if (!isJsonRecord(firstChoice) || !isJsonRecord(firstChoice.message)) return '';
  const { content } = firstChoice.message;
  return typeof content === 'string' ? content : '';
}

/**
 * Asks the strong model to independently re-score a pain episode.
 *
 * Reads `OPENAI_API_KEY` and `OPENAI_BASE_URL` (default
 * `https://api.openai.com/v1`) from the environment and POSTs to
 * `<baseUrl>/chat/completions` with a 120 s timeout. The reply is parsed with
 * `extractJsonFromLlmResponse` and checked with `validateAdjudicationResponse`.
 *
 * Request, parsing and validation failures do not reject: they are reported via
 * `config.log` and returned as a `needs-review` adjudication with null scores.
 *
 * @param episode - Pain episode to adjudicate.
 * @param localEval - The local model's evaluation, shown to the strong model.
 * @param config - `modelId` of the strong model and a `log` sink for errors.
 * @returns The adjudication; `needs-review` when the key is missing or the call fails.
 */
export async function adjudicate(
  episode: PainEpisode,
  localEval: LocalEvaluation,
  config: { modelId: string; log: (msg: string) => void }
): Promise<StrongModelAdjudication> {
  const { modelId: strongModelId, log } = config;
  const apiKey = process.env.OPENAI_API_KEY;

  // Check the key first: without it there is no point building the prompt.
  if (!apiKey) {
    return {
      model: strongModelId,
      adjudicationStatus: 'needs-review',
      confirmedScores: null,
      confirmedMvpMet: null,
      rationale: 'OPENAI_API_KEY not set — cannot run strong-model adjudication',
      nextAction: 'Set OPENAI_API_KEY and re-run with --strong-model',
    };
  }

  // `||` (not `??`) so an empty OPENAI_BASE_URL also falls back to the default.
  // Trailing slashes are stripped so the request path never becomes "//chat/completions".
  const baseUrl = (process.env.OPENAI_BASE_URL || 'https://api.openai.com/v1').replace(/\/+$/, '');
  const prompt = buildAdjudicationPrompt(episode, localEval);

  try {
    const resp = await fetch(`${baseUrl}/chat/completions`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${apiKey}`,
      },
      body: JSON.stringify({
        model: strongModelId,
        messages: [
          { role: 'system', content: 'You are a precise JSON-output quality adjudicator. Output only valid JSON.' },
          { role: 'user', content: prompt },
        ],
        temperature: 0.1,
        max_tokens: 2000,
      }),
      signal: AbortSignal.timeout(120_000),
    });

    if (!resp.ok) {
      throw new Error(`Strong model request failed: ${resp.status}`);
    }

    // Keep the payload as `unknown` and narrow it; no cast to an assumed shape.
    const payload: unknown = await resp.json();
    const parsed = extractJsonFromLlmResponse(readFirstChoiceContent(payload));
    if (parsed === null) {
      throw new Error('Strong model returned non-JSON');
    }

    const validated = validateAdjudicationResponse(parsed);
    const { scores, verdict } = validated;

    return {
      model: strongModelId,
      adjudicationStatus: verdict,
      confirmedScores: scores,
      confirmedMvpMet: meetsMvpThreshold(scores),
      rationale: validated.rationale,
      nextAction: null,
    };
  } catch (err: unknown) {
    // Any failure above degrades to a manual-review result instead of rejecting.
    const msg = err instanceof Error ? err.message : String(err);
    log(`Adjudication error: ${msg}`);
    return {
      model: strongModelId,
      adjudicationStatus: 'needs-review',
      confirmedScores: null,
      confirmedMvpMet: null,
      rationale: `Adjudication failed: ${msg}`,
      nextAction: 'Retry with strong model or manually review',
    };
  }
}
138
+
139
/**
 * Builds the placeholder adjudication used when the strong model was not run.
 *
 * @param reason - Why adjudication was skipped; stored as the rationale.
 * @returns An adjudication with status `skipped`, model `none` and null scores.
 */
export function skippedAdjudication(reason: string): StrongModelAdjudication {
  // Field order is kept stable so serialised output does not change.
  const placeholder: StrongModelAdjudication = {
    model: 'none',
    adjudicationStatus: 'skipped',
    confirmedScores: null,
    confirmedMvpMet: null,
    rationale: reason,
    nextAction: 'Configure and run strong-model adjudication for final quality verdict',
  };
  return placeholder;
}
149
+
150
/**
 * Picks the final quality label for an episode.
 *
 * When a strong-model adjudication exists and was not skipped, its status is
 * returned unchanged. Otherwise the label is derived from the local evaluation:
 * `local-pass` when the MVP threshold is met and the total is at least 12,
 * `local-fail` when the total is 6 or lower, and `needs-review` in between.
 *
 * @param localEval - Local model evaluation (uses `mvpMet` and `totalScore`).
 * @param adjudication - Strong-model result, or null when none was produced.
 * @returns The final label.
 */
export function determineFinalLabel(
  localEval: LocalEvaluation,
  adjudication: StrongModelAdjudication | null
): AdjudicationStatus {
  // A real strong-model verdict always wins over the local heuristic.
  if (adjudication && adjudication.adjudicationStatus !== 'skipped') {
    return adjudication.adjudicationStatus;
  }

  const { mvpMet, totalScore } = localEval;
  if (mvpMet && totalScore >= 12) {
    return 'local-pass';
  }
  return totalScore <= 6 ? 'local-fail' : 'needs-review';
}
@@ -37,6 +37,22 @@ vi.mock('../../src/services/feature-flag-loader.js', () => ({
37
37
  }),
38
38
  }));
39
39
 
40
// The mock fns are created inside vi.hoisted so they already exist when the
// vi.mock factory for pd-config-loader (below) is evaluated.
const { mockLoadPdConfig, mockComputeFlagsFromLoadResult } = vi.hoisted(() => ({
  // Default load result uses the same shape that beforeEach installs
  // ({ ok, effective, source }), so the two defaults cannot drift apart.
  mockLoadPdConfig: vi.fn().mockReturnValue({ ok: true, effective: {}, source: 'defaults' }),
  // Default flags: auto-consumer enabled; individual tests override this to disable it.
  mockComputeFlagsFromLoadResult: vi.fn().mockReturnValue({
    flags: {
      internalization_auto_consumer: { id: 'internalization_auto_consumer', enabled: true, category: 'quiet' },
    },
    source: 'defaults',
    errors: [],
  }),
}));

// Replace the real config loader with the hoisted mocks for the module under test.
vi.mock('../../src/services/pd-config-loader.js', () => ({
  loadPdConfig: mockLoadPdConfig,
  computeFlagsFromLoadResult: mockComputeFlagsFromLoadResult,
}));
55
+
40
56
  import { handleRuntimeInternalizationQueue } from '../../src/commands/runtime-internalization-queue.js';
41
57
 
42
58
  const WS = '/fake/workspace';
@@ -68,6 +84,14 @@ describe('handleRuntimeInternalizationQueue', () => {
68
84
  vi.clearAllMocks();
69
85
  consoleLogSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
70
86
  consoleErrorSpy = vi.spyOn(console, 'error').mockImplementation(() => {});
87
+ mockLoadPdConfig.mockReturnValue({ ok: true, effective: {}, source: 'defaults' });
88
+ mockComputeFlagsFromLoadResult.mockReturnValue({
89
+ flags: {
90
+ internalization_auto_consumer: { id: 'internalization_auto_consumer', enabled: true, category: 'quiet' },
91
+ },
92
+ source: 'defaults',
93
+ errors: [],
94
+ });
71
95
  });
72
96
 
73
97
  afterEach(() => {
@@ -250,4 +274,120 @@ describe('handleRuntimeInternalizationQueue', () => {
250
274
  expect(output.countsByTaskKind.scribe).toBe(1);
251
275
  expect(output.countsByChannel.prompt).toBe(3);
252
276
  });
277
+
278
+ // ── nextAction / consumerStatus (PRI-381) ──────────────────────────────────
279
+
280
it('ready tasks + auto-consumer (core flag default) → consumerStatus=auto_consumer_enabled in JSON', async () => {
  // Two ready dreamer tasks; flags stay at the default installed in beforeEach (enabled).
  const readyTasks = [
    { taskId: 'task_dreamer_1', taskKind: 'dreamer', channel: 'prompt' },
    { taskId: 'task_dreamer_2', taskKind: 'dreamer', channel: 'prompt' },
  ];
  const snapshot = { ...emptySnapshot(), pendingCount: 3, readyTasks, noReadyTasks: null };
  mockGetSnapshot.mockResolvedValue(snapshot);

  await handleRuntimeInternalizationQueue({ workspace: WS, json: true });

  // The JSON document is the first argument of the first console.log call.
  const [[rawJson]] = consoleLogSpy.mock.calls;
  const output = JSON.parse(rawJson);
  expect(output.consumerStatus).toBe('auto_consumer_enabled');
  expect(output.nextAction).toBeUndefined();
});
297
+
298
it('ready tasks + auto-consumer enabled via config → consumerStatus=auto_consumer_enabled in JSON', async () => {
  // One ready task, with the flag explicitly enabled and sourced from config.
  const snapshot = {
    ...emptySnapshot(),
    pendingCount: 3,
    readyTasks: [{ taskId: 'task_dreamer_1', taskKind: 'dreamer', channel: 'prompt' }],
    noReadyTasks: null,
  };
  const enabledFromConfig = {
    flags: {
      internalization_auto_consumer: { id: 'internalization_auto_consumer', enabled: true, category: 'quiet' },
    },
    source: 'config',
    errors: [],
  };
  mockGetSnapshot.mockResolvedValue(snapshot);
  mockComputeFlagsFromLoadResult.mockReturnValue(enabledFromConfig);

  await handleRuntimeInternalizationQueue({ workspace: WS, json: true });

  const [[rawJson]] = consoleLogSpy.mock.calls;
  const output = JSON.parse(rawJson);
  expect(output.consumerStatus).toBe('auto_consumer_enabled');
  expect(output.nextAction).toBeUndefined();
});
321
+
322
it('ready tasks + auto-consumer disabled via config → consumerStatus=manual_action_required + nextAction in JSON', async () => {
  // One ready task, but the auto-consumer flag is switched off via config.
  const snapshot = {
    ...emptySnapshot(),
    pendingCount: 3,
    readyTasks: [{ taskId: 'task_dreamer_1', taskKind: 'dreamer', channel: 'prompt' }],
    noReadyTasks: null,
  };
  const disabledFromConfig = {
    flags: {
      internalization_auto_consumer: { id: 'internalization_auto_consumer', enabled: false, category: 'quiet' },
    },
    source: 'config',
    errors: [],
  };
  mockGetSnapshot.mockResolvedValue(snapshot);
  mockComputeFlagsFromLoadResult.mockReturnValue(disabledFromConfig);

  await handleRuntimeInternalizationQueue({ workspace: WS, json: true });

  // With the consumer off, the JSON must name the manual command to run.
  const [[rawJson]] = consoleLogSpy.mock.calls;
  const output = JSON.parse(rawJson);
  expect(output.consumerStatus).toBe('manual_action_required');
  expect(output.nextAction).toContain('pd runtime internalization run-once');
});
345
+
346
it('no ready tasks → no consumerStatus or nextAction in JSON', async () => {
  // Empty snapshot: neither field should appear in the JSON output.
  const snapshot = emptySnapshot();
  mockGetSnapshot.mockResolvedValue(snapshot);

  await handleRuntimeInternalizationQueue({ workspace: WS, json: true });

  const [[rawJson]] = consoleLogSpy.mock.calls;
  const output = JSON.parse(rawJson);
  expect(output.consumerStatus).toBeUndefined();
  expect(output.nextAction).toBeUndefined();
});
355
+
356
it('ready tasks in text output show auto_consumer status (not manual nextAction when enabled)', async () => {
  // One ready task; flags stay at the default installed in beforeEach (enabled).
  const snapshot = {
    ...emptySnapshot(),
    pendingCount: 1,
    readyTasks: [{ taskId: 'task_003', taskKind: 'dreamer', channel: 'prompt' }],
    noReadyTasks: null,
  };
  mockGetSnapshot.mockResolvedValue(snapshot);

  await handleRuntimeInternalizationQueue({ workspace: WS, json: false });

  // Flatten every console.log call into one block of text, one call per line.
  const printedLines: string[] = [];
  for (const args of consoleLogSpy.mock.calls) {
    printedLines.push(args.join(' '));
  }
  const text = printedLines.join('\n');
  expect(text).toContain('auto_consumer_enabled');
  expect(text).not.toContain('nextAction:');
});
370
+
371
it('ready tasks + auto-consumer disabled in text output shows manual_action_required + nextAction', async () => {
  // One ready task, with the auto-consumer flag switched off via config.
  const snapshot = {
    ...emptySnapshot(),
    pendingCount: 1,
    readyTasks: [{ taskId: 'task_003', taskKind: 'dreamer', channel: 'prompt' }],
    noReadyTasks: null,
  };
  const disabledFromConfig = {
    flags: {
      internalization_auto_consumer: { id: 'internalization_auto_consumer', enabled: false, category: 'quiet' },
    },
    source: 'config',
    errors: [],
  };
  mockGetSnapshot.mockResolvedValue(snapshot);
  mockComputeFlagsFromLoadResult.mockReturnValue(disabledFromConfig);

  await handleRuntimeInternalizationQueue({ workspace: WS, json: false });

  // Flatten every console.log call into one block of text, one call per line.
  const printedLines: string[] = [];
  for (const args of consoleLogSpy.mock.calls) {
    printedLines.push(args.join(' '));
  }
  const text = printedLines.join('\n');
  expect(text).toContain('manual_action_required');
  expect(text).toContain('nextAction:');
  expect(text).toContain('pd runtime internalization run-once');
});
253
393
  });