@principles/pd-cli 1.95.0 → 1.97.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/dist/commands/diagnose.d.ts +1 -0
  2. package/dist/commands/diagnose.d.ts.map +1 -1
  3. package/dist/commands/diagnose.js +44 -0
  4. package/dist/commands/diagnose.js.map +1 -1
  5. package/dist/commands/pain-record.d.ts.map +1 -1
  6. package/dist/commands/pain-record.js +4 -1
  7. package/dist/commands/pain-record.js.map +1 -1
  8. package/dist/commands/quality-scorecard.d.ts +9 -0
  9. package/dist/commands/quality-scorecard.d.ts.map +1 -0
  10. package/dist/commands/quality-scorecard.js +241 -0
  11. package/dist/commands/quality-scorecard.js.map +1 -0
  12. package/dist/index.js +22 -0
  13. package/dist/index.js.map +1 -1
  14. package/dist/services/quality-scorecard/data-extractor.d.ts +28 -0
  15. package/dist/services/quality-scorecard/data-extractor.d.ts.map +1 -0
  16. package/dist/services/quality-scorecard/data-extractor.js +118 -0
  17. package/dist/services/quality-scorecard/data-extractor.js.map +1 -0
  18. package/dist/services/quality-scorecard/local-evaluator.d.ts +18 -0
  19. package/dist/services/quality-scorecard/local-evaluator.d.ts.map +1 -0
  20. package/dist/services/quality-scorecard/local-evaluator.js +112 -0
  21. package/dist/services/quality-scorecard/local-evaluator.js.map +1 -0
  22. package/dist/services/quality-scorecard/strong-model-gate.d.ts +14 -0
  23. package/dist/services/quality-scorecard/strong-model-gate.d.ts.map +1 -0
  24. package/dist/services/quality-scorecard/strong-model-gate.js +128 -0
  25. package/dist/services/quality-scorecard/strong-model-gate.js.map +1 -0
  26. package/package.json +1 -1
  27. package/src/commands/diagnose.ts +45 -0
  28. package/src/commands/pain-record.ts +5 -2
  29. package/src/commands/quality-scorecard.ts +272 -0
  30. package/src/index.ts +25 -0
  31. package/src/services/quality-scorecard/data-extractor.ts +150 -0
  32. package/src/services/quality-scorecard/local-evaluator.ts +142 -0
  33. package/src/services/quality-scorecard/strong-model-gate.ts +160 -0
  34. package/tests/commands/diagnose.test.ts +69 -1
  35. package/tests/commands/pain-record-async.test.ts +4 -0
  36. package/tests/commands/product-path-regression.test.ts +81 -0
@@ -0,0 +1,160 @@
1
+ /**
2
+ * PRI-361 — Strong Model Adjudication Gate (I/O layer in pd-cli)
3
+ *
4
+ * Calls cloud model for adjudication. Uses core validation
5
+ * to parse responses — no unsafe casts.
6
+ */
7
+
8
+ import type {
9
+ PainEpisode,
10
+ LocalEvaluation,
11
+ StrongModelAdjudication,
12
+ AdjudicationStatus,
13
+ } from '@principles/core/quality-scorecard';
14
+ import {
15
+ RUBRIC_LABELS,
16
+ RUBRIC_DIMENSIONS as DIMS,
17
+ meetsMvpThreshold,
18
+ validateAdjudicationResponse,
19
+ extractJsonFromLlmResponse,
20
+ } from '@principles/core/quality-scorecard';
21
+
22
/**
 * Renders the prompt that asks the strong model to re-score one pain episode.
 *
 * The text contains the episode's fields, the local model's per-dimension
 * scores with their rationales and flags, the adjudication instructions, and
 * the strict JSON shape the reply must use.
 *
 * @param episode - Pain episode under adjudication.
 * @param localEval - The local model's evaluation of that episode.
 * @returns The complete prompt text.
 */
function buildAdjudicationPrompt(
  episode: PainEpisode,
  localEval: LocalEvaluation
): string {
  const { dimensionScores, dimensionRationales, flags } = localEval;

  // One bullet per rubric dimension: "<id> (<label>): <score>/2 — <rationale>".
  const scoreLines: string[] = [];
  for (const dim of DIMS) {
    scoreLines.push(
      `- ${dim} (${RUBRIC_LABELS[dim]}): ${dimensionScores[dim]}/2 — ${dimensionRationales[dim]}`
    );
  }
  const localScores = scoreLines.join('\n');

  // Missing or empty values are rendered as the literal word "none".
  const resolutionText = episode.evolutionTaskResolution ?? 'none';
  const principlesText =
    episode.linkedPrinciples.length > 0 ? episode.linkedPrinciples.join(', ') : 'none';
  const flagsText = flags.length > 0 ? flags.join(', ') : 'none';

  return `You are a senior quality adjudicator for an AI agent evolution pipeline.
Your job is to independently re-evaluate a pain episode that was first scored by a local (smaller) model.
You must provide your own scores — do NOT simply copy the local model's scores.

## Pain Episode
- ID: ${episode.episodeId}
- Source: ${episode.source}
- Pain Score: ${episode.score}
- Severity: ${episode.severity}
- Summary: ${episode.summary}
- Evolution Task Resolution: ${resolutionText}
- Linked Principles: ${principlesText}

## Local Model Scores (${localEval.model})
${localScores}
Flags: ${flagsText}

## Your Task
1. Independently score each dimension (0/1/2) based on the evidence.
2. Check for: language inconsistency, over-abstraction, fabricated evidence.
3. If your scores differ from the local model by >=2 points on any dimension, explain why.
4. Give a final verdict: pass, fail, or needs-review.

## Output Format (STRICT JSON)
{
"scores": { "G1": 0-2, "G2": 0-2, "G3": 0-2, "G4": 0-2, "G5": 0-2, "G6": 0-2, "G7": 0-2 },
"rationale": "Overall assessment...",
"verdict": "pass" | "fail" | "needs-review"
}

Do NOT output anything other than this JSON object.`;
}
62
+
63
/**
 * Assembles the `needs-review` result used whenever no strong-model verdict
 * could be obtained (missing API key, failed request, unusable reply).
 */
function needsReviewAdjudication(
  model: string,
  rationale: string,
  nextAction: string
): StrongModelAdjudication {
  return {
    model,
    adjudicationStatus: 'needs-review',
    confirmedScores: null,
    confirmedMvpMet: null,
    rationale,
    nextAction,
  };
}

/** Reads `value[key]` when `value` is a non-null object; otherwise yields `undefined`. */
function readProp(value: unknown, key: string): unknown {
  if (typeof value !== 'object' || value === null) return undefined;
  return (value as Record<string, unknown>)[key];
}

/**
 * Extracts `choices[0].message.content` from a chat-completions payload by
 * checking each level at runtime instead of asserting the payload's shape.
 * Returns an empty string when the field is absent or is not a string.
 */
function firstChoiceContent(payload: unknown): string {
  const choices = readProp(payload, 'choices');
  const first: unknown = Array.isArray(choices) ? choices[0] : undefined;
  const content = readProp(readProp(first, 'message'), 'content');
  return typeof content === 'string' ? content : '';
}

/**
 * Asks the strong model to independently re-score a pain episode.
 *
 * Sends one POST to `<base>/chat/completions`, where `<base>` is
 * `OPENAI_BASE_URL` (default `https://api.openai.com/v1`), authenticated with
 * `OPENAI_API_KEY`. The reply is parsed with `extractJsonFromLlmResponse` and
 * checked with `validateAdjudicationResponse`.
 *
 * Request, parsing and validation failures are caught, passed to `config.log`,
 * and reported as a `needs-review` result rather than thrown. A missing API key
 * yields a `needs-review` result without any request being made.
 *
 * @param episode - Pain episode to adjudicate.
 * @param localEval - The local model's evaluation, included in the prompt.
 * @param config - `modelId` of the strong model and a `log` sink for error messages.
 * @returns The adjudication: the model's verdict and scores on success,
 *          otherwise a `needs-review` placeholder describing what went wrong.
 */
export async function adjudicate(
  episode: PainEpisode,
  localEval: LocalEvaluation,
  config: { modelId: string; log: (msg: string) => void }
): Promise<StrongModelAdjudication> {
  const { modelId: strongModelId, log } = config;

  // Check credentials first so no prompt is built when the call cannot be made.
  const apiKey = process.env.OPENAI_API_KEY;
  if (!apiKey) {
    return needsReviewAdjudication(
      strongModelId,
      'OPENAI_API_KEY not set — cannot run strong-model adjudication',
      'Set OPENAI_API_KEY and re-run with --strong-model'
    );
  }

  // `||` (not `??`) so an empty OPENAI_BASE_URL also falls back to the default.
  // Trailing slashes are dropped so ".../v1/" does not produce ".../v1//chat/completions".
  const baseUrl = (process.env.OPENAI_BASE_URL || 'https://api.openai.com/v1').replace(/\/+$/, '');
  const prompt = buildAdjudicationPrompt(episode, localEval);

  try {
    const resp = await fetch(`${baseUrl}/chat/completions`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${apiKey}`,
      },
      body: JSON.stringify({
        model: strongModelId,
        messages: [
          { role: 'system', content: 'You are a precise JSON-output quality adjudicator. Output only valid JSON.' },
          { role: 'user', content: prompt },
        ],
        temperature: 0.1,
        max_tokens: 2000,
      }),
      // Abort the request if the model has not answered within two minutes.
      signal: AbortSignal.timeout(120_000),
    });

    if (!resp.ok) {
      throw new Error(`Strong model request failed: ${resp.status}`);
    }

    const payload: unknown = await resp.json();
    const parsed = extractJsonFromLlmResponse(firstChoiceContent(payload));
    if (parsed === null) {
      throw new Error('Strong model returned non-JSON');
    }

    const validated = validateAdjudicationResponse(parsed);
    const { scores, verdict } = validated;

    return {
      model: strongModelId,
      adjudicationStatus: verdict,
      confirmedScores: scores,
      confirmedMvpMet: meetsMvpThreshold(scores),
      rationale: validated.rationale,
      nextAction: null,
    };
  } catch (err: unknown) {
    const msg = err instanceof Error ? err.message : String(err);
    log(`Adjudication error: ${msg}`);
    return needsReviewAdjudication(
      strongModelId,
      `Adjudication failed: ${msg}`,
      'Retry with strong model or manually review'
    );
  }
}
138
+
139
/**
 * Produces the placeholder adjudication recorded when the strong model was not run.
 *
 * @param reason - Why adjudication was skipped; returned verbatim as the rationale.
 * @returns A `skipped` adjudication with model `'none'`, no confirmed scores,
 *          and a next action pointing at strong-model adjudication.
 */
export function skippedAdjudication(reason: string): StrongModelAdjudication {
  const placeholder: StrongModelAdjudication = {
    model: 'none',
    adjudicationStatus: 'skipped',
    confirmedScores: null,
    confirmedMvpMet: null,
    rationale: reason,
    nextAction: 'Configure and run strong-model adjudication for final quality verdict',
  };
  return placeholder;
}
149
+
150
/**
 * Chooses the final label for an episode.
 *
 * When an adjudication exists and was not skipped, its status is the label.
 * Otherwise the label is derived from the local evaluation alone:
 * `local-pass` when the MVP threshold is met and the total score is at least 12,
 * `local-fail` when the total score is 6 or lower, and `needs-review` in between.
 *
 * @param localEval - The local model's evaluation.
 * @param adjudication - The strong-model result, or `null` if none was produced.
 * @returns The final status label.
 */
export function determineFinalLabel(
  localEval: LocalEvaluation,
  adjudication: StrongModelAdjudication | null
): AdjudicationStatus {
  // A strong-model result that was actually produced always wins.
  if (adjudication && adjudication.adjudicationStatus !== 'skipped') {
    return adjudication.adjudicationStatus;
  }

  // No usable adjudication: fall back to the local scores.
  const { mvpMet, totalScore } = localEval;
  const locallyPassing = mvpMet && totalScore >= 12;
  if (locallyPassing) {
    return 'local-pass';
  }
  return totalScore <= 6 ? 'local-fail' : 'needs-review';
}
@@ -122,7 +122,7 @@ vi.mock('../../src/services/pd-config-loader.js', () => ({
122
122
  computeFlagsFromLoadResult: vi.fn().mockReturnValue({}),
123
123
  }));
124
124
 
125
- import { handleDiagnoseRun, type DiagnoseRunOptions } from '../../src/commands/diagnose.js';
125
+ import { handleDiagnoseRun, handleDiagnoseStatus, type DiagnoseRunOptions } from '../../src/commands/diagnose.js';
126
126
 
127
127
  const SUCCEEDED_RESULT = {
128
128
  status: 'succeeded' as const,
@@ -811,3 +811,71 @@ describe('Commander wiring for --no-intake', () => {
811
811
  ).rejects.toThrow();
812
812
  });
813
813
  });
814
+
815
// Validation of the `stalledThreshold` option accepted by handleDiagnoseStatus.
//
// Every test replaces process.exit with a no-op (so the handler's exit call can
// be observed without ending the test run) and silences one console method.
// Spies are restored in `finally` so that a failing assertion cannot leave the
// mocked process.exit / console in place for later tests.
describe('pd status stalled-threshold validation', () => {
  it('accepts valid positive integers', async () => {
    const exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => undefined) as () => never);
    const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});

    try {
      const runtimeV2 = await import('@principles/core/runtime-v2');
      vi.mocked(runtimeV2.status).mockResolvedValueOnce({
        taskId: 'test-task-1',
        status: 'pending',
        attemptCount: 0,
        maxAttempts: 3,
        lastError: null,
        commitId: null,
        artifactId: null,
        candidateCount: null,
      });

      await handleDiagnoseStatus({
        taskId: 'test-task-1',
        stalledThreshold: '123',
      });

      expect(exitSpy).not.toHaveBeenCalled();
    } finally {
      exitSpy.mockRestore();
      consoleSpy.mockRestore();
    }
  });

  it('rejects invalid inputs (0, negative, decimals, NaN, empty)', async () => {
    const invalidInputs = ['0', '-10', '1.5', 'abc', 'NaN', ''];
    for (const input of invalidInputs) {
      // Fresh spies per input so call records from one input never satisfy the next.
      const exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => undefined) as () => never);
      const consoleErrorSpy = vi.spyOn(console, 'error').mockImplementation(() => {});

      try {
        await handleDiagnoseStatus({
          taskId: 'test-task-1',
          stalledThreshold: input,
        });

        expect(exitSpy).toHaveBeenCalledWith(1);
        expect(consoleErrorSpy).toHaveBeenCalledWith(expect.stringContaining('positive integer'));
      } finally {
        exitSpy.mockRestore();
        consoleErrorSpy.mockRestore();
      }
    }
  });

  it('rejects invalid inputs in JSON mode', async () => {
    const exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => undefined) as () => never);
    const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});

    try {
      await handleDiagnoseStatus({
        taskId: 'test-task-1',
        stalledThreshold: '0',
        json: true,
      });

      expect(exitSpy).toHaveBeenCalledWith(1);
      expect(consoleSpy).toHaveBeenCalled();
      // The first console.log call carries the JSON error payload.
      const output = JSON.parse((consoleSpy.mock.calls[0] as string[])[0]);
      expect(output.ok).toBe(false);
      expect(output.reason).toBe('invalid_stalled_threshold');
      expect(output.nextAction).toContain('positive integer');
    } finally {
      exitSpy.mockRestore();
      consoleSpy.mockRestore();
    }
  });
});
@@ -137,6 +137,10 @@ describe('pd pain record async mode (PRI-369)', () => {
137
137
  expect(jsonOutput.ledgerEntryIds).toEqual([]);
138
138
  expect(jsonOutput.latencyMs).toBe(120);
139
139
  expect(jsonOutput.message).toContain('pd task show');
140
+ expect(jsonOutput.reason).toContain('pd task show');
141
+ expect(jsonOutput.nextAction).toContain('pd diagnose run');
142
+ expect(jsonOutput.nextAction).toContain('--runtime pi-ai');
143
+ expect(jsonOutput.nextAction).toContain('--json');
140
144
  // submitted should NOT cause exit(1)
141
145
  expect(exitSpy).not.toHaveBeenCalledWith(1);
142
146
 
@@ -0,0 +1,81 @@
1
+ import { describe, it, expect } from 'vitest';
2
+ import * as fs from 'fs';
3
+ import * as path from 'path';
4
+ import * as os from 'os';
5
+ import { execSync } from 'child_process';
6
+ import { fileURLToPath } from 'url';
7
+
8
+ // Resolve __dirname in ESM
9
+ const __filename = fileURLToPath(import.meta.url);
10
+ const __dirname = path.dirname(__filename);
11
+
12
+ describe('Real CLI JSON product-path regression test (PRI-376)', () => {
13
+ it('outputs exactly one parseable JSON object on async pain record', () => {
14
+ const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pd-regression-'));
15
+ const pdDir = path.join(tmpDir, '.pd');
16
+ fs.mkdirSync(pdDir, { recursive: true });
17
+ // The temp directory is passed to the CLI as --workspace; its .pd/ folder holds the config written below.
18
+ // Write a config to enable async mode (the diagnostician_async_cli feature entry is set to enabled: true)
19
+ const configContent = `
20
+ version: 1
21
+ features:
22
+ diagnostician_async_cli:
23
+ category: quiet
24
+ enabled: true
25
+ runtimeProfiles:
26
+ openclaw.default:
27
+ type: openclaw
28
+ source: default
29
+ internalAgents:
30
+ defaultRuntime: openclaw.default
31
+ agents:
32
+ diagnostician:
33
+ enabled: true
34
+ runtimeProfile: openclaw.default
35
+ ui:
36
+ diagnostics:
37
+ mode: simple
38
+ `;
39
+ fs.writeFileSync(path.join(pdDir, 'config.yaml'), configContent.trim(), 'utf8');
40
+
41
+ // Resolve CLI binary path relative to this file to be workspace-independent (this is the built dist/index.js, not the TypeScript source)
42
+ const cliBin = path.resolve(__dirname, '../../dist/index.js');
43
+ const cmd = `node "${cliBin}" pain record --reason "Regression test frustration" --json --workspace "${tmpDir}"`;
44
+
45
+ let stdoutStr: string;
46
+ try {
47
+ stdoutStr = execSync(cmd, { encoding: 'utf8', stdio: ['pipe', 'pipe', 'inherit'] }); // stderr is inherited, so only stdout is captured here
48
+ } finally { // remove the temp workspace whether or not the CLI call threw
49
+ fs.rmSync(tmpDir, { recursive: true, force: true });
50
+ }
51
+
52
+ // Ensure output is non-empty
53
+ expect(stdoutStr.trim()).not.toBe('');
54
+
55
+ // Ensure it's exactly one parseable JSON object (JSON.parse rejects any extra text around the object)
56
+ let parsed: Record<string, unknown>;
57
+ try {
58
+ parsed = JSON.parse(stdoutStr.trim()) as Record<string, unknown>;
59
+ } catch (err) {
60
+ throw new Error(`Stdout was not a single parseable JSON object:\n${stdoutStr}`, {
61
+ cause: err,
62
+ });
63
+ }
64
+
65
+ // Ensure required fields of the "submitted" response are present
66
+ expect(parsed).toBeDefined();
67
+ expect(parsed.status).toBe('submitted');
68
+ expect(parsed.taskId).toMatch(/^diagnosis_/);
69
+ expect(parsed.message).toBeDefined();
70
+ expect(parsed.reason).toBeTypeOf('string');
71
+ expect(parsed.nextAction).toBeTypeOf('string');
72
+
73
+ // Verify nextAction structure: pd diagnose run --task-id ... --runtime pi-ai --json
74
+ const nextAction = parsed.nextAction as string;
75
+ expect(nextAction).toContain('pd diagnose run');
76
+ expect(nextAction).toContain(`--task-id ${parsed.taskId}`); // must reference the same task id the CLI just returned
77
+ expect(nextAction).toContain('--runtime pi-ai');
78
+ expect(nextAction).toContain('--json');
79
+ }, 20000); // 20 s test timeout; the test spawns a real node process
80
+ });
81
+