@principles/pd-cli 1.95.0 → 1.97.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/diagnose.d.ts +1 -0
- package/dist/commands/diagnose.d.ts.map +1 -1
- package/dist/commands/diagnose.js +44 -0
- package/dist/commands/diagnose.js.map +1 -1
- package/dist/commands/pain-record.d.ts.map +1 -1
- package/dist/commands/pain-record.js +4 -1
- package/dist/commands/pain-record.js.map +1 -1
- package/dist/commands/quality-scorecard.d.ts +9 -0
- package/dist/commands/quality-scorecard.d.ts.map +1 -0
- package/dist/commands/quality-scorecard.js +241 -0
- package/dist/commands/quality-scorecard.js.map +1 -0
- package/dist/index.js +22 -0
- package/dist/index.js.map +1 -1
- package/dist/services/quality-scorecard/data-extractor.d.ts +28 -0
- package/dist/services/quality-scorecard/data-extractor.d.ts.map +1 -0
- package/dist/services/quality-scorecard/data-extractor.js +118 -0
- package/dist/services/quality-scorecard/data-extractor.js.map +1 -0
- package/dist/services/quality-scorecard/local-evaluator.d.ts +18 -0
- package/dist/services/quality-scorecard/local-evaluator.d.ts.map +1 -0
- package/dist/services/quality-scorecard/local-evaluator.js +112 -0
- package/dist/services/quality-scorecard/local-evaluator.js.map +1 -0
- package/dist/services/quality-scorecard/strong-model-gate.d.ts +14 -0
- package/dist/services/quality-scorecard/strong-model-gate.d.ts.map +1 -0
- package/dist/services/quality-scorecard/strong-model-gate.js +128 -0
- package/dist/services/quality-scorecard/strong-model-gate.js.map +1 -0
- package/package.json +1 -1
- package/src/commands/diagnose.ts +45 -0
- package/src/commands/pain-record.ts +5 -2
- package/src/commands/quality-scorecard.ts +272 -0
- package/src/index.ts +25 -0
- package/src/services/quality-scorecard/data-extractor.ts +150 -0
- package/src/services/quality-scorecard/local-evaluator.ts +142 -0
- package/src/services/quality-scorecard/strong-model-gate.ts +160 -0
- package/tests/commands/diagnose.test.ts +69 -1
- package/tests/commands/pain-record-async.test.ts +4 -0
- package/tests/commands/product-path-regression.test.ts +81 -0
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PRI-361 — Strong Model Adjudication Gate (I/O layer in pd-cli)
|
|
3
|
+
*
|
|
4
|
+
* Calls cloud model for adjudication. Uses core validation
|
|
5
|
+
* to parse responses — no unsafe casts.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type {
|
|
9
|
+
PainEpisode,
|
|
10
|
+
LocalEvaluation,
|
|
11
|
+
StrongModelAdjudication,
|
|
12
|
+
AdjudicationStatus,
|
|
13
|
+
} from '@principles/core/quality-scorecard';
|
|
14
|
+
import {
|
|
15
|
+
RUBRIC_LABELS,
|
|
16
|
+
RUBRIC_DIMENSIONS as DIMS,
|
|
17
|
+
meetsMvpThreshold,
|
|
18
|
+
validateAdjudicationResponse,
|
|
19
|
+
extractJsonFromLlmResponse,
|
|
20
|
+
} from '@principles/core/quality-scorecard';
|
|
21
|
+
|
|
22
|
+
/**
 * Renders the user prompt sent to the strong model for one pain episode.
 *
 * The prompt contains the episode's identifying fields, the local model's
 * per-dimension scores and rationales (one bullet per rubric dimension, in
 * `DIMS` order), the local flags, and the strict JSON output contract.
 *
 * @param episode - Pain episode being adjudicated.
 * @param localEval - The local model's evaluation of that episode.
 * @returns The complete prompt text, lines separated by `\n`.
 */
function buildAdjudicationPrompt(
  episode: PainEpisode,
  localEval: LocalEvaluation
): string {
  // One bullet per rubric dimension: "- <dim> (<label>): <score>/2 — <rationale>".
  const scoreBullets: string[] = [];
  for (const dim of DIMS) {
    scoreBullets.push(
      `- ${dim} (${RUBRIC_LABELS[dim]}): ${localEval.dimensionScores[dim]}/2 — ${localEval.dimensionRationales[dim]}`
    );
  }

  // Empty lists and a missing resolution are all rendered as the literal "none".
  const resolutionText = episode.evolutionTaskResolution ?? 'none';
  const principlesText =
    episode.linkedPrinciples.length > 0 ? episode.linkedPrinciples.join(', ') : 'none';
  const flagsText = localEval.flags.length > 0 ? localEval.flags.join(', ') : 'none';

  const promptLines: string[] = [
    'You are a senior quality adjudicator for an AI agent evolution pipeline.',
    'Your job is to independently re-evaluate a pain episode that was first scored by a local (smaller) model.',
    "You must provide your own scores — do NOT simply copy the local model's scores.",
    '',
    '## Pain Episode',
    `- ID: ${episode.episodeId}`,
    `- Source: ${episode.source}`,
    `- Pain Score: ${episode.score}`,
    `- Severity: ${episode.severity}`,
    `- Summary: ${episode.summary}`,
    `- Evolution Task Resolution: ${resolutionText}`,
    `- Linked Principles: ${principlesText}`,
    '',
    `## Local Model Scores (${localEval.model})`,
    scoreBullets.join('\n'),
    `Flags: ${flagsText}`,
    '',
    '## Your Task',
    '1. Independently score each dimension (0/1/2) based on the evidence.',
    '2. Check for: language inconsistency, over-abstraction, fabricated evidence.',
    '3. If your scores differ from the local model by >=2 points on any dimension, explain why.',
    '4. Give a final verdict: pass, fail, or needs-review.',
    '',
    '## Output Format (STRICT JSON)',
    '{',
    '"scores": { "G1": 0-2, "G2": 0-2, "G3": 0-2, "G4": 0-2, "G5": 0-2, "G6": 0-2, "G7": 0-2 },',
    '"rationale": "Overall assessment...",',
    '"verdict": "pass" | "fail" | "needs-review"',
    '}',
    '',
    'Do NOT output anything other than this JSON object.',
  ];

  return promptLines.join('\n');
}
|
|
62
|
+
|
|
63
|
+
/**
 * Asks a strong (cloud) model to independently re-score a pain episode.
 *
 * Sends an OpenAI-style `POST {baseUrl}/chat/completions` request. The base
 * URL comes from `OPENAI_BASE_URL` (default `https://api.openai.com/v1`) and
 * the bearer token from `OPENAI_API_KEY`.
 *
 * This function does not reject on request or parsing failures. A missing API
 * key, a non-2xx response, a timeout, non-JSON model output, or an error thrown
 * while validating the payload all resolve to a `needs-review` adjudication
 * with `confirmedScores`/`confirmedMvpMet` set to `null`.
 *
 * @param episode - Pain episode to adjudicate.
 * @param localEval - Local model evaluation shown to the strong model.
 * @param config - `modelId` is the strong model to call; `log` receives a
 *   one-line message when adjudication fails inside the request/parse path.
 * @returns The adjudication result; on success it carries the model's verdict,
 *   its scores, and whether those scores meet the MVP threshold.
 */
export async function adjudicate(
  episode: PainEpisode,
  localEval: LocalEvaluation,
  config: { modelId: string; log: (msg: string) => void }
): Promise<StrongModelAdjudication> {
  const { modelId: strongModelId, log } = config;
  const prompt = buildAdjudicationPrompt(episode, localEval);
  // `||` (not `??`) is deliberate: an empty OPENAI_BASE_URL falls back to the default.
  // Trailing slashes are stripped so "https://host/v1/" does not yield
  // "https://host/v1//chat/completions", which some gateways reject.
  const baseUrl = (process.env.OPENAI_BASE_URL || 'https://api.openai.com/v1').replace(/\/+$/, '');
  const apiKey = process.env.OPENAI_API_KEY;

  if (!apiKey) {
    // No network call is attempted without credentials.
    return {
      model: strongModelId,
      adjudicationStatus: 'needs-review',
      confirmedScores: null,
      confirmedMvpMet: null,
      rationale: 'OPENAI_API_KEY not set — cannot run strong-model adjudication',
      nextAction: 'Set OPENAI_API_KEY and re-run with --strong-model',
    };
  }

  try {
    const resp = await fetch(`${baseUrl}/chat/completions`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${apiKey}`,
      },
      body: JSON.stringify({
        model: strongModelId,
        messages: [
          { role: 'system', content: 'You are a precise JSON-output quality adjudicator. Output only valid JSON.' },
          { role: 'user', content: prompt },
        ],
        temperature: 0.1,
        max_tokens: 2000,
      }),
      // Abort the request after two minutes; the abort surfaces in the catch below.
      signal: AbortSignal.timeout(120_000),
    });

    if (!resp.ok) {
      throw new Error(`Strong model request failed: ${resp.status}`);
    }

    // Every level is optional in the declared shape because the body is
    // untrusted; a missing choice or content degrades to the empty string.
    const data = (await resp.json()) as {
      choices?: { message?: { content?: string | null } }[];
    };
    const content = data.choices?.[0]?.message?.content ?? '';
    const parsed = extractJsonFromLlmResponse(content);
    if (parsed === null) {
      throw new Error('Strong model returned non-JSON');
    }

    // NOTE(review): presumably throws when the payload does not match the
    // expected schema — confirm in @principles/core/quality-scorecard.
    const validated = validateAdjudicationResponse(parsed);
    const { scores, verdict } = validated;

    return {
      model: strongModelId,
      adjudicationStatus: verdict,
      confirmedScores: scores,
      confirmedMvpMet: meetsMvpThreshold(scores),
      rationale: validated.rationale,
      nextAction: null,
    };
  } catch (err: unknown) {
    // Any failure above is reported as "needs-review" rather than propagated.
    const msg = err instanceof Error ? err.message : String(err);
    log(`Adjudication error: ${msg}`);
    return {
      model: strongModelId,
      adjudicationStatus: 'needs-review',
      confirmedScores: null,
      confirmedMvpMet: null,
      rationale: `Adjudication failed: ${msg}`,
      nextAction: 'Retry with strong model or manually review',
    };
  }
}
|
|
138
|
+
|
|
139
|
+
/**
 * Builds the placeholder adjudication used when the strong model was not run.
 *
 * @param reason - Why adjudication was skipped; stored verbatim as the rationale.
 * @returns An adjudication with status `'skipped'`, model `'none'`, and no
 *   confirmed scores.
 */
export function skippedAdjudication(reason: string): StrongModelAdjudication {
  const placeholder: StrongModelAdjudication = {
    model: 'none',
    adjudicationStatus: 'skipped',
    confirmedScores: null,
    confirmedMvpMet: null,
    rationale: reason,
    nextAction: 'Configure and run strong-model adjudication for final quality verdict',
  };
  return placeholder;
}
|
|
149
|
+
|
|
150
|
+
/**
 * Picks the final quality label for an episode.
 *
 * A strong-model adjudication that actually ran (present and not `'skipped'`)
 * decides the label outright. Otherwise the label is derived from the local
 * evaluation alone:
 *  - `'local-pass'` when the MVP threshold is met and the total score is >= 12;
 *  - `'local-fail'` when the total score is <= 6;
 *  - `'needs-review'` for everything in between.
 *
 * @param localEval - Local model evaluation (uses `mvpMet` and `totalScore`).
 * @param adjudication - Strong-model result, or `null` if none was produced.
 * @returns The resulting adjudication status label.
 */
export function determineFinalLabel(
  localEval: LocalEvaluation,
  adjudication: StrongModelAdjudication | null
): AdjudicationStatus {
  // The strong model's verdict wins whenever it was really executed.
  if (adjudication && adjudication.adjudicationStatus !== 'skipped') {
    return adjudication.adjudicationStatus;
  }

  const { mvpMet, totalScore } = localEval;
  if (mvpMet && totalScore >= 12) {
    return 'local-pass';
  }
  return totalScore <= 6 ? 'local-fail' : 'needs-review';
}
|
|
@@ -122,7 +122,7 @@ vi.mock('../../src/services/pd-config-loader.js', () => ({
|
|
|
122
122
|
computeFlagsFromLoadResult: vi.fn().mockReturnValue({}),
|
|
123
123
|
}));
|
|
124
124
|
|
|
125
|
-
import { handleDiagnoseRun, type DiagnoseRunOptions } from '../../src/commands/diagnose.js';
|
|
125
|
+
import { handleDiagnoseRun, handleDiagnoseStatus, type DiagnoseRunOptions } from '../../src/commands/diagnose.js';
|
|
126
126
|
|
|
127
127
|
const SUCCEEDED_RESULT = {
|
|
128
128
|
status: 'succeeded' as const,
|
|
@@ -811,3 +811,71 @@ describe('Commander wiring for --no-intake', () => {
|
|
|
811
811
|
).rejects.toThrow();
|
|
812
812
|
});
|
|
813
813
|
});
|
|
814
|
+
|
|
815
|
+
// Validation of the `stalledThreshold` option accepted by handleDiagnoseStatus.
// Each case stubs process.exit (so the runner is not terminated) and a console
// method, and always restores both in `finally` so a failing assertion cannot
// leak a mocked process.exit into later tests.
describe('pd status stalled-threshold validation', () => {
  it('accepts valid positive integers', async () => {
    const exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => undefined) as () => never);
    const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});

    try {
      const runtimeV2 = await import('@principles/core/runtime-v2');
      vi.mocked(runtimeV2.status).mockResolvedValueOnce({
        taskId: 'test-task-1',
        status: 'pending',
        attemptCount: 0,
        maxAttempts: 3,
        lastError: null,
        commitId: null,
        artifactId: null,
        candidateCount: null,
      });

      await handleDiagnoseStatus({
        taskId: 'test-task-1',
        stalledThreshold: '123',
      });

      expect(exitSpy).not.toHaveBeenCalled();
    } finally {
      exitSpy.mockRestore();
      consoleSpy.mockRestore();
    }
  });

  it('rejects invalid inputs (0, negative, decimals, NaN, empty)', async () => {
    const invalidInputs = ['0', '-10', '1.5', 'abc', 'NaN', ''];
    for (const input of invalidInputs) {
      // Fresh spies per input so call history from one value cannot satisfy the next.
      const exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => undefined) as () => never);
      const consoleErrorSpy = vi.spyOn(console, 'error').mockImplementation(() => {});

      try {
        await handleDiagnoseStatus({
          taskId: 'test-task-1',
          stalledThreshold: input,
        });

        expect(exitSpy).toHaveBeenCalledWith(1);
        expect(consoleErrorSpy).toHaveBeenCalledWith(expect.stringContaining('positive integer'));
      } finally {
        exitSpy.mockRestore();
        consoleErrorSpy.mockRestore();
      }
    }
  });

  it('rejects invalid inputs in JSON mode', async () => {
    const exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => undefined) as () => never);
    const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});

    try {
      await handleDiagnoseStatus({
        taskId: 'test-task-1',
        stalledThreshold: '0',
        json: true,
      });

      expect(exitSpy).toHaveBeenCalledWith(1);
      expect(consoleSpy).toHaveBeenCalled();
      // The first console.log call is expected to carry the JSON error payload.
      const output = JSON.parse((consoleSpy.mock.calls[0] as string[])[0]);
      expect(output.ok).toBe(false);
      expect(output.reason).toBe('invalid_stalled_threshold');
      expect(output.nextAction).toContain('positive integer');
    } finally {
      exitSpy.mockRestore();
      consoleSpy.mockRestore();
    }
  });
});
|
|
@@ -137,6 +137,10 @@ describe('pd pain record async mode (PRI-369)', () => {
|
|
|
137
137
|
expect(jsonOutput.ledgerEntryIds).toEqual([]);
|
|
138
138
|
expect(jsonOutput.latencyMs).toBe(120);
|
|
139
139
|
expect(jsonOutput.message).toContain('pd task show');
|
|
140
|
+
expect(jsonOutput.reason).toContain('pd task show');
|
|
141
|
+
expect(jsonOutput.nextAction).toContain('pd diagnose run');
|
|
142
|
+
expect(jsonOutput.nextAction).toContain('--runtime pi-ai');
|
|
143
|
+
expect(jsonOutput.nextAction).toContain('--json');
|
|
140
144
|
// submitted should NOT cause exit(1)
|
|
141
145
|
expect(exitSpy).not.toHaveBeenCalledWith(1);
|
|
142
146
|
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import * as fs from 'fs';
|
|
3
|
+
import * as path from 'path';
|
|
4
|
+
import * as os from 'os';
|
|
5
|
+
import { execSync } from 'child_process';
|
|
6
|
+
import { fileURLToPath } from 'url';
|
|
7
|
+
|
|
8
|
+
// Resolve __dirname in ESM
|
|
9
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
10
|
+
const __dirname = path.dirname(__filename);
|
|
11
|
+
|
|
12
|
+
// End-to-end regression: runs the built CLI (dist/index.js) as a child process
// against a throwaway workspace and checks that `pain record --json` prints
// exactly one JSON object on stdout with the async-submission fields.
describe('Real CLI JSON product-path regression test (PRI-376)', () => {
  it('outputs exactly one parseable JSON object on async pain record', () => {
    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pd-regression-'));
    const pdDir = path.join(tmpDir, '.pd');
    fs.mkdirSync(pdDir, { recursive: true });

    // Write a config to enable async mode
    // NOTE(review): the nesting indentation of this YAML is not visible in the
    // reviewed view of the file — confirm it matches the intended structure.
    const configContent = `
version: 1
features:
diagnostician_async_cli:
category: quiet
enabled: true
runtimeProfiles:
openclaw.default:
type: openclaw
source: default
internalAgents:
defaultRuntime: openclaw.default
agents:
diagnostician:
enabled: true
runtimeProfile: openclaw.default
ui:
diagnostics:
mode: simple
`;
    fs.writeFileSync(path.join(pdDir, 'config.yaml'), configContent.trim(), 'utf8');

    // Resolve CLI binary path relative to this file to be workspace-independent
    const cliBin = path.resolve(__dirname, '../../dist/index.js');
    // Use the interpreter running this test rather than whichever `node` is
    // first on PATH, so the CLI is exercised under the same Node version.
    const cmd = `"${process.execPath}" "${cliBin}" pain record --reason "Regression test frustration" --json --workspace "${tmpDir}"`;

    let stdoutStr: string;
    try {
      // stderr is inherited so CLI diagnostics show up in the test log without
      // polluting the captured stdout.
      stdoutStr = execSync(cmd, { encoding: 'utf8', stdio: ['pipe', 'pipe', 'inherit'] });
    } finally {
      // Always remove the temporary workspace, even when the CLI exits non-zero.
      fs.rmSync(tmpDir, { recursive: true, force: true });
    }

    // Ensure output is non-empty
    expect(stdoutStr.trim()).not.toBe('');

    // Ensure it's exactly one parseable JSON object
    let parsed: Record<string, unknown>;
    try {
      parsed = JSON.parse(stdoutStr.trim()) as Record<string, unknown>;
    } catch (err) {
      throw new Error(`Stdout was not a single parseable JSON object:\n${stdoutStr}`, {
        cause: err,
      });
    }

    // Ensure required fields
    expect(parsed).toBeDefined();
    expect(parsed.status).toBe('submitted');
    expect(parsed.taskId).toMatch(/^diagnosis_/);
    expect(parsed.message).toBeDefined();
    expect(parsed.reason).toBeTypeOf('string');
    expect(parsed.nextAction).toBeTypeOf('string');

    // Verify nextAction structure: pd diagnose run --task-id ... --runtime pi-ai --json
    const nextAction = parsed.nextAction as string;
    expect(nextAction).toContain('pd diagnose run');
    expect(nextAction).toContain(`--task-id ${parsed.taskId}`);
    expect(nextAction).toContain('--runtime pi-ai');
    expect(nextAction).toContain('--json');
  }, 20000);
});
|
|
81
|
+
|