@principles/pd-cli 1.95.0 → 1.97.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/diagnose.d.ts +1 -0
- package/dist/commands/diagnose.d.ts.map +1 -1
- package/dist/commands/diagnose.js +44 -0
- package/dist/commands/diagnose.js.map +1 -1
- package/dist/commands/pain-record.d.ts.map +1 -1
- package/dist/commands/pain-record.js +4 -1
- package/dist/commands/pain-record.js.map +1 -1
- package/dist/commands/quality-scorecard.d.ts +9 -0
- package/dist/commands/quality-scorecard.d.ts.map +1 -0
- package/dist/commands/quality-scorecard.js +241 -0
- package/dist/commands/quality-scorecard.js.map +1 -0
- package/dist/index.js +22 -0
- package/dist/index.js.map +1 -1
- package/dist/services/quality-scorecard/data-extractor.d.ts +28 -0
- package/dist/services/quality-scorecard/data-extractor.d.ts.map +1 -0
- package/dist/services/quality-scorecard/data-extractor.js +118 -0
- package/dist/services/quality-scorecard/data-extractor.js.map +1 -0
- package/dist/services/quality-scorecard/local-evaluator.d.ts +18 -0
- package/dist/services/quality-scorecard/local-evaluator.d.ts.map +1 -0
- package/dist/services/quality-scorecard/local-evaluator.js +112 -0
- package/dist/services/quality-scorecard/local-evaluator.js.map +1 -0
- package/dist/services/quality-scorecard/strong-model-gate.d.ts +14 -0
- package/dist/services/quality-scorecard/strong-model-gate.d.ts.map +1 -0
- package/dist/services/quality-scorecard/strong-model-gate.js +128 -0
- package/dist/services/quality-scorecard/strong-model-gate.js.map +1 -0
- package/package.json +1 -1
- package/src/commands/diagnose.ts +45 -0
- package/src/commands/pain-record.ts +5 -2
- package/src/commands/quality-scorecard.ts +272 -0
- package/src/index.ts +25 -0
- package/src/services/quality-scorecard/data-extractor.ts +150 -0
- package/src/services/quality-scorecard/local-evaluator.ts +142 -0
- package/src/services/quality-scorecard/strong-model-gate.ts +160 -0
- package/tests/commands/diagnose.test.ts +69 -1
- package/tests/commands/pain-record-async.test.ts +4 -0
- package/tests/commands/product-path-regression.test.ts +81 -0
package/src/commands/diagnose.ts
CHANGED
|
@@ -40,10 +40,26 @@ import { loadPdConfig, computeFlagsFromLoadResult } from '../services/pd-config-
|
|
|
40
40
|
import { isFeatureEnabled, SPLIT_PIPELINE_TOTAL_TIMEOUT_MS } from '@principles/core/runtime-v2';
|
|
41
41
|
import * as path from 'path';
|
|
42
42
|
|
|
43
|
+
/**
 * Parses the raw `--stalled-threshold` option into a positive integer.
 *
 * @param val - Option value as received from the CLI parser; `undefined` means the flag was not given.
 * @returns The threshold as a number, or `undefined` when `val` is `undefined`.
 * @throws Error when the trimmed value is not a plain decimal positive integer,
 *   or is too large to be represented exactly as a JavaScript number.
 */
function validateStalledThreshold(val: unknown): number | undefined {
  if (val === undefined) {
    return undefined;
  }
  const str = String(val).trim();
  // Digits only, no sign and no leading zero: rejects "0", "-5", "1.5", "1e3", "" and "abc".
  if (!/^[1-9]\d*$/.test(str)) {
    throw new Error('stalled-threshold must be a positive integer.');
  }
  const num = Number(str);
  // The pattern accepts arbitrarily long digit runs; past 2^53 - 1 the value would be
  // silently rounded (or become Infinity), so refuse it instead of passing it on.
  if (!Number.isSafeInteger(num)) {
    throw new Error('stalled-threshold must be a positive integer.');
  }
  return num;
}
|
|
57
|
+
|
|
43
58
|
/** Options accepted by `handleDiagnoseStatus`. */
interface DiagnoseStatusOptions {
  /** Identifier of the diagnostician task to inspect. */
  taskId: string;
  /** Workspace directory; handed to `resolveWorkspaceDir`, which decides what happens when it is omitted. */
  workspace?: string;
  /** When set, results and errors are printed as JSON. */
  json?: boolean;
  /**
   * Raw `--stalled-threshold` value (seconds). Typed `unknown` because it is
   * checked by `validateStalledThreshold` before being used.
   */
  stalledThreshold?: unknown;
}
|
|
48
64
|
|
|
49
65
|
interface DiagnoseRunOptions {
|
|
@@ -69,6 +85,25 @@ interface DiagnoseRunOptions {
|
|
|
69
85
|
* Inspects the current status of a diagnostician task.
|
|
70
86
|
*/
|
|
71
87
|
export async function handleDiagnoseStatus(opts: DiagnoseStatusOptions): Promise<void> {
|
|
88
|
+
let stalledThresholdSeconds: number | undefined;
|
|
89
|
+
try {
|
|
90
|
+
stalledThresholdSeconds = validateStalledThreshold(opts.stalledThreshold);
|
|
91
|
+
} catch (err) {
|
|
92
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
93
|
+
if (opts.json) {
|
|
94
|
+
console.log(JSON.stringify({
|
|
95
|
+
ok: false,
|
|
96
|
+
reason: 'invalid_stalled_threshold',
|
|
97
|
+
nextAction: 'Provide a valid positive integer for --stalled-threshold (e.g., --stalled-threshold 300).',
|
|
98
|
+
}));
|
|
99
|
+
} else {
|
|
100
|
+
console.error(`error: ${msg}`);
|
|
101
|
+
console.error('nextAction: Provide a valid positive integer for --stalled-threshold (e.g., --stalled-threshold 300).');
|
|
102
|
+
}
|
|
103
|
+
process.exit(1);
|
|
104
|
+
return;
|
|
105
|
+
}
|
|
106
|
+
|
|
72
107
|
const workspaceDir = resolveWorkspaceDir(opts.workspace);
|
|
73
108
|
const stateManager = new RuntimeStateManager({ workspaceDir });
|
|
74
109
|
|
|
@@ -77,6 +112,7 @@ export async function handleDiagnoseStatus(opts: DiagnoseStatusOptions): Promise
|
|
|
77
112
|
const result = await diagnoseStatus({
|
|
78
113
|
taskId: opts.taskId,
|
|
79
114
|
stateManager,
|
|
115
|
+
stalledThresholdSeconds,
|
|
80
116
|
});
|
|
81
117
|
|
|
82
118
|
if (!result) {
|
|
@@ -102,6 +138,15 @@ export async function handleDiagnoseStatus(opts: DiagnoseStatusOptions): Promise
|
|
|
102
138
|
if (result.lastError) {
|
|
103
139
|
console.log(` Last Error: ${result.lastError}`);
|
|
104
140
|
}
|
|
141
|
+
if (result.reason) {
|
|
142
|
+
console.log(` Reason: ${result.reason}`);
|
|
143
|
+
}
|
|
144
|
+
if (result.age !== undefined && result.age !== null) {
|
|
145
|
+
console.log(` Age: ${result.age}s`);
|
|
146
|
+
}
|
|
147
|
+
if (result.nextAction) {
|
|
148
|
+
console.log(` Next Action: ${result.nextAction}`);
|
|
149
|
+
}
|
|
105
150
|
console.log('');
|
|
106
151
|
} finally {
|
|
107
152
|
await stateManager.close();
|
|
@@ -168,11 +168,14 @@ export async function handlePainRecord(opts: RecordOptions): Promise<void> {
|
|
|
168
168
|
}
|
|
169
169
|
|
|
170
170
|
if (opts.json) {
|
|
171
|
-
const out = { ...result };
|
|
171
|
+
const out: Record<string, unknown> = { ...result };
|
|
172
172
|
// Ensure nextAction is present for actionable states
|
|
173
173
|
if (out.status === 'submitted') {
|
|
174
174
|
if (!out.nextAction) {
|
|
175
|
-
out.nextAction = `pd diagnose run --task-id ${out.taskId} --workspace "${workspaceDir}"`;
|
|
175
|
+
out.nextAction = `pd diagnose run --task-id ${out.taskId} --workspace "${workspaceDir}" --runtime pi-ai --json`;
|
|
176
|
+
}
|
|
177
|
+
if (!out.reason) {
|
|
178
|
+
out.reason = out.message;
|
|
176
179
|
}
|
|
177
180
|
}
|
|
178
181
|
console.log(JSON.stringify(out, null, 2));
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pd quality scorecard — CLI command (PRI-361)
|
|
3
|
+
*
|
|
4
|
+
* JSON contract: --json mode outputs EXACTLY one JSON object to stdout.
|
|
5
|
+
* All progress/diagnostic output goes to stderr.
|
|
6
|
+
* Errors produce structured JSON: { ok: false, error, nextAction }.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { mkdirSync, writeFileSync } from 'fs';
|
|
10
|
+
import { dirname } from 'path';
|
|
11
|
+
import type {
|
|
12
|
+
EpisodeEvaluation,
|
|
13
|
+
QualityScorecardReport,
|
|
14
|
+
StrongModelAdjudication,
|
|
15
|
+
} from '@principles/core/quality-scorecard';
|
|
16
|
+
import {
|
|
17
|
+
validateCliOptions,
|
|
18
|
+
needsAdjudication,
|
|
19
|
+
generateMarkdownReport,
|
|
20
|
+
generateHtmlReport,
|
|
21
|
+
generateJsonReport,
|
|
22
|
+
} from '@principles/core/quality-scorecard';
|
|
23
|
+
import { extractEpisodes, extractLogStats } from '../services/quality-scorecard/data-extractor.js';
|
|
24
|
+
import { evaluateWithLocalModel, checkLmStudioAvailable } from '../services/quality-scorecard/local-evaluator.js';
|
|
25
|
+
import { adjudicate, skippedAdjudication, determineFinalLabel } from '../services/quality-scorecard/strong-model-gate.js';
|
|
26
|
+
|
|
27
|
+
// ── Logging: stderr only, silent in JSON mode ──────────────────────
|
|
28
|
+
|
|
29
|
+
let jsonMode = false;
|
|
30
|
+
|
|
31
|
+
/** Writes one progress line to stderr; emits nothing while JSON mode is active. */
function log(msg: string): void {
  if (jsonMode) return;
  process.stderr.write(`${msg}\n`);
}
|
|
36
|
+
|
|
37
|
+
// ── Structured JSON output helpers ─────────────────────────────────
|
|
38
|
+
|
|
39
|
+
/** Serializes `data` as 2-space-indented JSON and writes it, newline-terminated, to stdout. */
function writeJsonOutput(data: unknown): void {
  const serialized = JSON.stringify(data, null, 2);
  process.stdout.write(`${serialized}\n`);
}
|
|
42
|
+
|
|
43
|
+
/** Emits the structured failure object `{ ok: false, error, nextAction }` on stdout. */
function writeJsonError(error: string, nextAction: string): void {
  const payload = { ok: false, error, nextAction };
  writeJsonOutput(payload);
}
|
|
46
|
+
|
|
47
|
+
// ── Summary computation ────────────────────────────────────────────
|
|
48
|
+
|
|
49
|
+
/**
 * Aggregates per-episode evaluations into the report's summary counters.
 *
 * `localPassCount` / `localFailCount` reflect only the local model's own scores
 * (MVP met with total >= 12, respectively total <= 6); the `final*`,
 * `needsReviewCount` and `localOnlyCount` counters are driven by `finalLabel`.
 * `averageLocalScore` is 0 for an empty input.
 */
function computeSummary(evaluations: EpisodeEvaluation[]) {
  const totalEpisodes = evaluations.length;

  let localPassCount = 0;
  let localFailCount = 0;
  let strongModelReviewedCount = 0;
  let finalPassCount = 0;
  let finalFailCount = 0;
  let needsReviewCount = 0;
  let localOnlyCount = 0;
  let mvpThresholdMetCount = 0;
  let localScoreSum = 0;

  for (const { localEvaluation, strongModelAdjudication, finalLabel } of evaluations) {
    const { mvpMet, totalScore } = localEvaluation;
    localScoreSum += totalScore;

    if (mvpMet) {
      mvpThresholdMetCount += 1;
      if (totalScore >= 12) localPassCount += 1;
    }
    if (totalScore <= 6) localFailCount += 1;

    // An adjudication record only counts as a review when the strong model actually ran.
    if (strongModelAdjudication && strongModelAdjudication.adjudicationStatus !== 'skipped') {
      strongModelReviewedCount += 1;
    }

    switch (finalLabel) {
      case 'pass':
        finalPassCount += 1;
        break;
      case 'fail':
        finalFailCount += 1;
        break;
      case 'needs-review':
        needsReviewCount += 1;
        break;
      case 'local-pass':
      case 'local-fail':
        localOnlyCount += 1;
        break;
      default:
        break;
    }
  }

  const averageLocalScore = totalEpisodes > 0 ? localScoreSum / totalEpisodes : 0;

  return {
    totalEpisodes, localPassCount, localFailCount, strongModelReviewedCount,
    finalPassCount, finalFailCount, needsReviewCount, localOnlyCount,
    averageLocalScore, mvpThresholdMetCount,
  };
}
|
|
73
|
+
|
|
74
|
+
// ── Main handler ───────────────────────────────────────────────────
|
|
75
|
+
|
|
76
|
+
/**
 * Reports a fatal scorecard error in the active output mode and marks the process as failed.
 *
 * @param isJson - Whether `--json` mode is active.
 * @param error - Message placed in the JSON `error` field.
 * @param nextAction - Remediation hint placed in the JSON `nextAction` field.
 * @param humanMessage - Text written verbatim to stderr when not in JSON mode.
 */
function failScorecard(isJson: boolean, error: string, nextAction: string, humanMessage: string): void {
  if (isJson) {
    writeJsonError(error, nextAction);
  } else {
    process.stderr.write(humanMessage);
  }
  // Set the exit code rather than exiting so pending output can flush.
  process.exitCode = 1;
}

/**
 * Entry point for `pd quality scorecard`.
 *
 * Validates options, checks that trajectory.db and the local model endpoint are
 * usable, extracts pain episodes, scores each one with the local model
 * (optionally adjudicating with a strong model), then writes the report file.
 * In `--json` mode exactly one JSON object is written to stdout: either
 * `{ ok: true, report }` or `{ ok: false, error, nextAction }`.
 *
 * Failures are reported through `process.exitCode = 1`. Errors raised while
 * extracting, evaluating or writing the report are caught and reported the same
 * way instead of escaping as an unhandled rejection.
 *
 * @param opts - Raw option bag from the CLI parser.
 */
export async function handleQualityScorecard(opts: Record<string, unknown>): Promise<void> {
  const isJson = Boolean(opts.json);
  jsonMode = isJson;

  // Resolve workspace paths
  const { resolveWorkspaceDir } = await import('../resolve-workspace.js');
  const { join } = await import('path');
  const { existsSync } = await import('fs');
  const workspace = resolveWorkspaceDir(opts.workspace as string | undefined);
  const dbPath = join(workspace, '.state', 'trajectory.db');
  const logsDir = join(workspace, '.state', 'logs');

  // 1. Validate CLI options
  const { options, errors } = validateCliOptions({
    dbPath,
    logsDir,
    localModelBaseUrl: opts.localUrl ?? 'http://localhost:12341/v1',
    localModelId: opts.localModel ?? 'qwen3.6-27b-mtp',
    strongModelId: opts.strongModel ?? null,
    limit: opts.limit ?? '0',
    // --json always forces the JSON report format.
    format: isJson ? 'json' : (opts.format ?? 'markdown'),
    output: opts.output,
    minPainScore: opts.minScore ?? '50',
    skipStrongModel: opts.skipStrongModel ?? false,
  });

  if (errors.length > 0) {
    failScorecard(
      isJson,
      errors.map(e => `${e.field}: ${e.message}`).join('; '),
      'Fix the invalid options and retry',
      `❌ Invalid options:\n${errors.map(e => ` - ${e.field}: ${e.message}`).join('\n')}\n`,
    );
    return;
  }

  // 2. Check files exist
  if (!existsSync(options.dbPath)) {
    const msg = `trajectory.db not found at: ${options.dbPath}`;
    failScorecard(
      isJson,
      msg,
      'Ensure the workspace has PD data (run PD first to generate trajectory.db)',
      `❌ ${msg}\n`,
    );
    return;
  }

  // 3. Ensure output directory exists
  const outputDir = dirname(options.output);
  if (outputDir && !existsSync(outputDir)) {
    try {
      mkdirSync(outputDir, { recursive: true });
      log(`Created output directory: ${outputDir}`);
    } catch (err: unknown) {
      const msg = err instanceof Error ? err.message : String(err);
      failScorecard(
        isJson,
        `Cannot create output directory: ${msg}`,
        'Ensure the output path is writable',
        `❌ Cannot create output directory: ${msg}\n`,
      );
      return;
    }
  }

  // 4. Check LM Studio
  log('🔍 PD Quality Scorecard — Starting...');
  log(` DB: ${options.dbPath}`);
  log(` Local Model: ${options.localModelId} @ ${options.localModelBaseUrl}`);
  log(` Strong Model: ${options.strongModelId ?? 'skipped'}`);

  const lmStatus = await checkLmStudioAvailable(options.localModelBaseUrl);
  if (!lmStatus.available) {
    failScorecard(
      isJson,
      `LM Studio not available: ${lmStatus.error}`,
      'Start LM Studio or check --local-url',
      `❌ LM Studio not available at ${options.localModelBaseUrl}: ${lmStatus.error}\n`,
    );
    return;
  }

  if (!lmStatus.models.includes(options.localModelId)) {
    const msg = `Model "${options.localModelId}" not found. Available: ${lmStatus.models.join(', ')}`;
    failScorecard(isJson, msg, 'Use --local-model with an available model', `❌ ${msg}\n`);
    return;
  }

  let report: QualityScorecardReport;
  try {
    // 5. Extract data
    log('\n📊 Extracting dogfood data...');
    const { episodes, stats: extractStats } = await extractEpisodes(options.dbPath, {
      minScore: options.minPainScore,
      limit: options.limit,
    });
    log(` Found ${episodes.length} unique episodes (total pain events: ${extractStats.total})`);

    const logStats = extractLogStats(options.logsDir);
    log(` Event logs: ${logStats.totalEvents} events (${logStats.painSignalCount} pain signals)`);
    if (logStats.degradedReasons.length > 0) {
      // Problems reading the event logs do not abort the run, but they should be visible.
      log(` Event log issues: ${logStats.degradedReasons.join(' | ')}`);
    }

    // 6. Evaluate each episode
    log('\n🤖 Running local model evaluation...');
    const evaluations: EpisodeEvaluation[] = [];

    for (const [i, ep] of episodes.entries()) {
      if (!ep) continue;
      log(` [${i + 1}/${episodes.length}] ${ep.episodeId} (score=${ep.score})...`);

      const localEval = await evaluateWithLocalModel(ep, {
        baseUrl: options.localModelBaseUrl,
        model: options.localModelId,
      }, (msg: string) => log(` ${msg}`));
      log(` Local: ${localEval.totalScore}/14 MVP=${localEval.mvpMet} flags=[${localEval.flags.join(',')}]`);

      // 7. Strong model adjudication
      let adjudication: StrongModelAdjudication;
      if (options.skipStrongModel || !options.strongModelId) {
        adjudication = skippedAdjudication(
          options.skipStrongModel
            ? 'Strong model skipped by --skip-strong-model flag'
            : 'No strong model configured'
        );
      } else {
        const decision = needsAdjudication(ep, localEval);
        if (decision.shouldAdjudicate) {
          log(` Adjudicating (${decision.priority}: ${decision.reason})...`);
          adjudication = await adjudicate(ep, localEval, {
            modelId: options.strongModelId,
            log: (msg: string) => log(` ${msg}`),
          });
          log(` Adjudication: ${adjudication.adjudicationStatus}`);
        } else {
          adjudication = skippedAdjudication(decision.reason);
          log(` Adjudication skipped: ${decision.reason}`);
        }
      }

      const finalLabel = determineFinalLabel(localEval, adjudication);
      evaluations.push({ episode: ep, localEvaluation: localEval, strongModelAdjudication: adjudication, finalLabel });
    }

    // 8. Build and write report
    log('\n📝 Generating report...');
    const summary = computeSummary(evaluations);
    report = {
      generatedAt: new Date().toISOString(),
      dataSource: {
        painEventCount: extractStats.total,
        // NOTE(review): these three counters are hard-coded to 0 — the extractor does not expose them.
        evolutionTaskCount: 0,
        principleEventCount: 0,
        gateBlockCount: 0,
        dateRange: extractStats.dateRange,
      },
      localEvaluatorConfig: {
        model: options.localModelId,
        baseUrl: options.localModelBaseUrl.replace(/\/v\d+$/, '/...'),
        apiKeyStatus: 'not-required',
      },
      strongModelConfig: {
        model: options.strongModelId,
        status: options.skipStrongModel || !options.strongModelId ? 'skipped' : 'configured',
      },
      evaluations,
      summary,
      knownLimitations: [
        'Local model scores are advisory only — not final quality conclusions.',
        'Without strong-model adjudication, samples are marked local-pass/local-fail/needs-review.',
        'Deduplication is based on reason text similarity — may miss distinct episodes.',
        'Local model output is non-deterministic despite temperature=0.1.',
      ],
    };

    let content: string;
    switch (options.format) {
      case 'html': content = generateHtmlReport(report); break;
      case 'json': content = generateJsonReport(report); break;
      case 'markdown':
      default: content = generateMarkdownReport(report); break;
    }

    writeFileSync(options.output, content, 'utf-8');

    log(`\n✅ Report written to: ${options.output}`);
    log(` Format: ${options.format}`);
    log(` Episodes: ${summary.totalEpisodes}`);
    log(` Local Pass: ${summary.localPassCount} | Local Fail: ${summary.localFailCount}`);
    log(` Strong Model Reviewed: ${summary.strongModelReviewedCount}`);
    log(` Final Pass: ${summary.finalPassCount} | Final Fail: ${summary.finalFailCount} | Needs Review: ${summary.needsReviewCount}`);
  } catch (err: unknown) {
    // Database, model or filesystem failures must still honour the JSON error contract.
    const msg = err instanceof Error ? err.message : String(err);
    failScorecard(
      isJson,
      `Scorecard generation failed: ${msg}`,
      'Check that trajectory.db is a valid PD database, the models are reachable and the output path is writable, then retry',
      `❌ Scorecard generation failed: ${msg}\n`,
    );
    return;
  }

  // JSON mode: output exactly one JSON object to stdout
  if (isJson) {
    writeJsonOutput({ ok: true, report });
  }
}
|
package/src/index.ts
CHANGED
|
@@ -306,6 +306,7 @@ diagnoseCmd
|
|
|
306
306
|
.description('Inspect diagnostician task status')
|
|
307
307
|
.requiredOption('-t, --task-id <taskId>', 'Task ID to inspect')
|
|
308
308
|
.option('-w, --workspace <path>', 'Workspace directory')
|
|
309
|
+
.option('--stalled-threshold <seconds>', 'Age threshold in seconds for classifying task as stalled')
|
|
309
310
|
.option('--json', 'Output raw JSON')
|
|
310
311
|
.action(async (opts) => {
|
|
311
312
|
await handleDiagnoseStatus(opts);
|
|
@@ -926,4 +927,28 @@ consoleCmd.action(async (opts) => {
|
|
|
926
927
|
});
|
|
927
928
|
});
|
|
928
929
|
|
|
930
|
+
// ─── Quality Scorecard (PRI-361) ──────────────────────────────────
|
|
931
|
+
|
|
932
|
+
const qualityCmd = program
|
|
933
|
+
.command('quality')
|
|
934
|
+
.description('Quality scoring and evaluation');
|
|
935
|
+
|
|
936
|
+
qualityCmd
|
|
937
|
+
.command('scorecard')
|
|
938
|
+
.description('Generate quality scorecard report for PD pain→diagnosis→principle chain')
|
|
939
|
+
.option('-w, --workspace <path>', 'Workspace directory')
|
|
940
|
+
.option('--local-model <id>', 'LM Studio model ID', 'qwen3.6-27b-mtp')
|
|
941
|
+
.option('--local-url <url>', 'LM Studio base URL', 'http://localhost:12341/v1')
|
|
942
|
+
.option('--strong-model <id>', 'Strong model for adjudication (provider/model)')
|
|
943
|
+
.option('--skip-strong-model', 'Skip strong model adjudication', false)
|
|
944
|
+
.option('--min-score <n>', 'Minimum pain score to evaluate', '50')
|
|
945
|
+
.option('--limit <n>', 'Max episodes to evaluate (0=all)', '0')
|
|
946
|
+
.option('--format <fmt>', 'Output format: json, markdown, html', 'markdown')
|
|
947
|
+
.option('--output <path>', 'Output file path')
|
|
948
|
+
.option('--json', 'Output as JSON', false)
|
|
949
|
+
.action(async (opts) => {
|
|
950
|
+
const { handleQualityScorecard } = await import('./commands/quality-scorecard.js');
|
|
951
|
+
await handleQualityScorecard(opts);
|
|
952
|
+
});
|
|
953
|
+
|
|
929
954
|
program.parse();
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PRI-361 — Data Extractor (I/O layer in pd-cli)
|
|
3
|
+
*
|
|
4
|
+
* Reads trajectory.db and event logs. Uses runtime validation
|
|
5
|
+
* from @principles/core — no `as RawPainEvent[]` casts.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { PainEpisode } from '@principles/core/quality-scorecard';
|
|
9
|
+
import {
|
|
10
|
+
validatePainRow,
|
|
11
|
+
validateEvolutionRow,
|
|
12
|
+
validatePrincipleEventRow,
|
|
13
|
+
validateGateRow,
|
|
14
|
+
sanitize,
|
|
15
|
+
truncate,
|
|
16
|
+
} from '@principles/core/quality-scorecard';
|
|
17
|
+
import { readdirSync, readFileSync } from 'fs';
|
|
18
|
+
import { join } from 'path';
|
|
19
|
+
|
|
20
|
+
/** Result of reading pain episodes out of trajectory.db. */
export interface ExtractionResult {
  /** Deduplicated episodes, newest first. */
  episodes: PainEpisode[];
  /** Count and `created_at` range of the validated pain events that matched the score filter. */
  stats: { total: number; dateRange: { from: string; to: string } };
}

/**
 * Reads pain events from trajectory.db and turns them into deduplicated episodes.
 *
 * Pain events are read newest first and deduplicated on the first 80 characters
 * of their sanitized reason. Each episode is linked to the first evolution task
 * created within one hour with a score within 10 points, to the principle ids
 * of principle events within two hours, and to the gate-block count of its session.
 *
 * @param dbPath - Path to the SQLite database; opened read-only.
 * @param options - `minScore`: only pain events with `score >= minScore` are read
 *   (ignored when <= 0). `limit`: maximum number of episodes returned (0 = no limit).
 *   The limit is applied after deduplication, so duplicates do not use it up and
 *   `stats` always describes every matching pain event.
 * @returns The episodes plus statistics over the matching pain events.
 */
export async function extractEpisodes(
  dbPath: string,
  options: { minScore?: number; limit?: number } = {}
): Promise<ExtractionResult> {
  const Database = (await import('better-sqlite3')).default;
  const db = new Database(dbPath, { readonly: true });

  const minScore = options.minScore ?? 0;
  const limit = options.limit ?? 0;

  // Linking tolerances between a pain event and related records.
  const TASK_WINDOW_MS = 3_600_000;
  const TASK_SCORE_TOLERANCE = 10;
  const PRINCIPLE_WINDOW_MS = 7_200_000;

  try {
    // Fetch pain events with parameterized query
    let painQuery = 'SELECT id, session_id, source, score, reason, severity, created_at FROM pain_events WHERE 1=1';
    const params: (string | number)[] = [];
    if (minScore > 0) {
      painQuery += ' AND score >= ?';
      params.push(minScore);
    }
    painQuery += ' ORDER BY created_at DESC';
    // No SQL LIMIT: `limit` caps episodes, and episodes only exist after deduplication.

    const rawPainRows = db.prepare(painQuery).all(...params);
    // Validate each row — no unsafe cast
    const painEvents = rawPainRows.map(validatePainRow).filter((r): r is NonNullable<typeof r> => r !== null);

    // Fetch evolution tasks
    const rawEvoRows = db.prepare('SELECT task_id, score, status, resolution, created_at FROM evolution_tasks').all();
    const evoTasks = rawEvoRows.map(validateEvolutionRow).filter((r): r is NonNullable<typeof r> => r !== null);

    // Fetch principle events
    const rawPeRows = db.prepare('SELECT principle_id, event_type, created_at FROM principle_events').all();
    const prEvents = rawPeRows.map(validatePrincipleEventRow).filter((r): r is NonNullable<typeof r> => r !== null);

    // Gate blocks count per session — validate each row
    const rawGateRows = db.prepare('SELECT session_id, COUNT(*) as cnt FROM gate_blocks GROUP BY session_id').all();
    const gateRows = rawGateRows.map(validateGateRow).filter((r): r is NonNullable<typeof r> => r !== null);
    const gateBlockMap = new Map(gateRows.map(g => [g.session_id, g.cnt]));

    // Parse timestamps once instead of once per pain event inside the loop.
    const timedTasks = evoTasks.map(task => ({ task, time: new Date(task.created_at).getTime() }));
    const timedPrinciples: { id: string; time: number }[] = [];
    for (const ev of prEvents) {
      if (!ev.principle_id) continue;
      timedPrinciples.push({ id: ev.principle_id, time: new Date(ev.created_at).getTime() });
    }

    // Deduplicate by sanitized reason
    const seen = new Set<string>();
    const episodes: PainEpisode[] = [];

    for (const pe of painEvents) {
      if (limit > 0 && episodes.length >= limit) break;

      const dedupKey = sanitize(pe.reason).substring(0, 80);
      if (seen.has(dedupKey)) continue;
      seen.add(dedupKey);

      // An unparsable created_at gives NaN, which fails both window checks below.
      const peTime = new Date(pe.created_at).getTime();

      const linkedTask = timedTasks.find(({ task, time }) =>
        Math.abs(time - peTime) < TASK_WINDOW_MS && Math.abs(task.score - pe.score) <= TASK_SCORE_TOLERANCE
      )?.task;

      // A Set keeps first-seen order, which matches a first-occurrence dedupe.
      const linkedPrinciples = [...new Set(
        timedPrinciples
          .filter(p => Math.abs(p.time - peTime) < PRINCIPLE_WINDOW_MS)
          .map(p => p.id)
      )];

      episodes.push({
        episodeId: `EP-${pe.id}`,
        summary: truncate(sanitize(pe.reason)),
        source: pe.source,
        score: pe.score,
        severity: pe.severity,
        createdAt: pe.created_at,
        evolutionTaskResolution: linkedTask?.resolution ?? null,
        linkedPrinciples,
        gateBlockCount: gateBlockMap.get(pe.session_id) ?? 0,
      });
    }

    // Lexicographic sort; chronological only if created_at is ISO-8601 — TODO confirm.
    const dates = painEvents.map(e => e.created_at).sort();
    return {
      episodes,
      stats: {
        total: painEvents.length,
        dateRange: {
          from: dates[0] ?? new Date().toISOString(),
          to: dates[dates.length - 1] ?? new Date().toISOString(),
        },
      },
    };
  } finally {
    db.close();
  }
}
|
|
119
|
+
|
|
120
|
+
/** Aggregate counters gathered from the `*.jsonl` event logs. */
export interface LogStats {
  /** Number of lines that parsed to a JSON object. */
  totalEvents: number;
  /** Number of those objects whose `type` is `'pain_signal'`. */
  painSignalCount: number;
  /** One entry per problem met while reading: unreadable directory, unreadable file, or bad line. */
  degradedReasons: string[];
}

/**
 * Counts events in every `*.jsonl` file directly inside `logsDir`.
 *
 * Never throws: problems are collected in `degradedReasons` so callers can still
 * report partial numbers. Each file is handled on its own, so one unreadable
 * entry does not hide the events in the others. Blank lines (including the
 * `\r`-only lines of CRLF files) are skipped, and a line that is valid JSON but
 * not an object is reported as a parse failure rather than counted.
 *
 * @param logsDir - Directory containing the event logs.
 * @returns The accumulated statistics.
 */
export function extractLogStats(logsDir: string): LogStats {
  const stats: LogStats = { totalEvents: 0, painSignalCount: 0, degradedReasons: [] };
  const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  let files: string[];
  try {
    files = readdirSync(logsDir).filter(f => f.endsWith('.jsonl'));
  } catch (dirErr: unknown) {
    stats.degradedReasons.push(`logs-dir-unreadable:${describe(dirErr)}`);
    return stats;
  }

  for (const file of files) {
    let text: string;
    try {
      text = readFileSync(join(logsDir, file), 'utf-8');
    } catch (fileErr: unknown) {
      // e.g. a directory named *.jsonl or a permission error: skip this entry only.
      stats.degradedReasons.push(`log-file-unreadable:${file}:${describe(fileErr)}`);
      continue;
    }

    for (const rawLine of text.split(/\r?\n/)) {
      const line = rawLine.trim();
      if (line === '') continue;

      let ev: unknown;
      try {
        ev = JSON.parse(line);
      } catch (parseErr: unknown) {
        stats.degradedReasons.push(`jsonl-parse-fail:${file}:${describe(parseErr)}`);
        continue;
      }

      if (typeof ev !== 'object' || ev === null || Array.isArray(ev)) {
        stats.degradedReasons.push(`jsonl-parse-fail:${file}:not a JSON object`);
        continue;
      }

      stats.totalEvents++;
      if ((ev as Record<string, unknown>).type === 'pain_signal') stats.painSignalCount++;
    }
  }

  return stats;
}
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PRI-361 — Local Evaluator (I/O layer in pd-cli)
|
|
3
|
+
*
|
|
4
|
+
* Calls LM Studio for advisory scoring. Uses core validation
|
|
5
|
+
* to parse LLM responses — no unsafe casts.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type {
|
|
9
|
+
PainEpisode,
|
|
10
|
+
LocalEvaluation,
|
|
11
|
+
RubricDimension,
|
|
12
|
+
RubricScore,
|
|
13
|
+
} from '@principles/core/quality-scorecard';
|
|
14
|
+
import {
|
|
15
|
+
RUBRIC_LABELS,
|
|
16
|
+
RUBRIC_PROMPTS,
|
|
17
|
+
RUBRIC_DIMENSIONS as DIMS,
|
|
18
|
+
meetsMvpThreshold,
|
|
19
|
+
sumScores,
|
|
20
|
+
validateLlmScoreResponse,
|
|
21
|
+
extractJsonFromLlmResponse,
|
|
22
|
+
} from '@principles/core/quality-scorecard';
|
|
23
|
+
|
|
24
|
+
/**
 * Builds the user prompt asking the local model to score one pain episode.
 *
 * The prompt lists the episode's fields, one line per rubric dimension
 * (`<dimension> (<label>): <prompt>`), three additional checks, and the strict
 * JSON shape the model must answer with.
 */
function buildEvaluationPrompt(episode: PainEpisode): string {
  const rubricLines: string[] = [];
  for (const dim of DIMS) {
    rubricLines.push(`${dim} (${RUBRIC_LABELS[dim]}): ${RUBRIC_PROMPTS[dim]}`);
  }

  const resolution = episode.evolutionTaskResolution ?? 'none';
  const principles = episode.linkedPrinciples.length > 0 ? episode.linkedPrinciples.join(', ') : 'none';

  // Every entry is already a string, so join() reproduces the single-template layout.
  const promptLines: string[] = [
    "You are a quality evaluator for an AI agent's pain-signal -> diagnosis -> principle pipeline.",
    '',
    '## Task',
    'Evaluate this pain episode on a 7-dimension rubric. Each dimension scores 0 (fail), 1 (partial), or 2 (pass).',
    '',
    '## Pain Episode',
    `- ID: ${episode.episodeId}`,
    `- Source: ${episode.source}`,
    `- Pain Score: ${episode.score}`,
    `- Severity: ${episode.severity}`,
    `- Summary: ${episode.summary}`,
    `- Created: ${episode.createdAt}`,
    `- Evolution Task Resolution: ${resolution}`,
    `- Linked Principles: ${principles}`,
    `- Gate Blocks: ${episode.gateBlockCount}`,
    '',
    '## Rubric Dimensions',
    rubricLines.join('\n'),
    '',
    '## Additional Checks',
    '- Is the language consistent (not mixing Chinese and English incoherently)?',
    '- Is the diagnosis/principle overly abstract (no concrete actionable guidance)?',
    '- Does it fabricate non-existent evidence, axioms, or references?',
    '',
    '## Output Format (STRICT JSON)',
    'Respond with ONLY a JSON object:',
    '{',
    '"scores": { "G1": 0-2, "G2": 0-2, "G3": 0-2, "G4": 0-2, "G5": 0-2, "G6": 0-2, "G7": 0-2 },',
    '"rationales": { "G1": "...", "G2": "...", "G3": "...", "G4": "...", "G5": "...", "G6": "...", "G7": "..." },',
    '"flags": ["list of issues found"]',
    '}',
    '',
    'Do NOT output anything other than this JSON object.',
  ];

  return promptLines.join('\n');
}
|
|
61
|
+
|
|
62
|
+
/** Connection settings for the local chat-completions endpoint. */
export interface LocalEvaluatorConfig {
  /** Base URL of the API; trailing slashes are ignored. */
  baseUrl: string;
  /** Model identifier sent in the request body. */
  model: string;
}

/**
 * Scores one pain episode with the local model.
 *
 * Posts the evaluation prompt to `<baseUrl>/chat/completions` with a 120 s
 * timeout, extracts and validates the JSON answer, and derives the total score
 * and MVP flag. Request, parsing or validation failures do not reject: they are
 * passed to `log` and an all-zero evaluation flagged `evaluation_error` is
 * returned, with the failure message as every dimension's rationale.
 *
 * @param episode - Episode to evaluate.
 * @param config - Endpoint and model to use.
 * @param log - Sink for diagnostic messages.
 * @returns The local evaluation, or the zero-score fallback on failure.
 */
export async function evaluateWithLocalModel(
  episode: PainEpisode,
  config: LocalEvaluatorConfig,
  log: (msg: string) => void
): Promise<LocalEvaluation> {
  const userPrompt = buildEvaluationPrompt(episode);
  const endpoint = config.baseUrl.replace(/\/+$/, '') + '/chat/completions';

  try {
    const requestBody = JSON.stringify({
      model: config.model,
      messages: [
        { role: 'system', content: 'You are a precise JSON-output quality evaluator. Output only valid JSON.' },
        { role: 'user', content: userPrompt },
      ],
      temperature: 0.1,
      max_tokens: 2000,
    });

    const response = await fetch(endpoint, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: requestBody,
      signal: AbortSignal.timeout(120_000),
    });
    if (!response.ok) {
      throw new Error(`LM Studio request failed: ${response.status}`);
    }

    const payload = (await response.json()) as { choices: { message: { content: string } }[] };
    const reply = payload.choices?.[0]?.message?.content ?? '';

    const json = extractJsonFromLlmResponse(reply);
    if (json === null) {
      throw new Error(`LM Studio returned non-JSON response`);
    }

    const verdict = validateLlmScoreResponse(json);
    return {
      model: config.model,
      dimensionScores: verdict.scores,
      dimensionRationales: verdict.rationales,
      totalScore: sumScores(verdict.scores),
      maxScore: 14,
      mvpMet: meetsMvpThreshold(verdict.scores),
      flags: verdict.flags,
    };
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    log(`Evaluation error for ${episode.episodeId}: ${reason}`);

    // Fallback: every dimension scores 0 and carries the failure message.
    const zeroScores = {} as Record<RubricDimension, RubricScore>;
    const failureNotes = {} as Record<RubricDimension, string>;
    for (const dim of DIMS) {
      zeroScores[dim] = 0;
      failureNotes[dim] = `Evaluation failed: ${reason}`;
    }

    return {
      model: config.model,
      dimensionScores: zeroScores,
      dimensionRationales: failureNotes,
      totalScore: 0,
      maxScore: 14,
      mvpMet: false,
      flags: ['evaluation_error'],
    };
  }
}
|
|
130
|
+
|
|
131
|
+
/**
 * Probes `<baseUrl>/models` (5 s timeout) to see whether the local model server
 * is reachable and which model ids it reports.
 *
 * Never rejects. Network errors, non-2xx statuses and malformed bodies all
 * produce `{ available: false, models: [], error }`. The body is narrowed at
 * runtime: a missing or falsy `data` field yields an empty model list, and
 * entries without a string `id` are skipped, so `models` only ever holds strings.
 *
 * @param baseUrl - Base URL of the API; trailing slashes are ignored.
 * @returns Availability, the reported model ids, and an error message on failure.
 */
export async function checkLmStudioAvailable(baseUrl: string): Promise<{ available: boolean; models: string[]; error?: string }> {
  try {
    const url = `${baseUrl.replace(/\/+$/, '')}/models`;
    const resp = await fetch(url, { signal: AbortSignal.timeout(5000) });
    if (!resp.ok) return { available: false, models: [], error: `HTTP ${resp.status}` };

    const payload: unknown = await resp.json();
    if (typeof payload !== 'object' || payload === null) {
      return { available: false, models: [], error: 'Unexpected /models response: not a JSON object' };
    }

    const entries = (payload as { data?: unknown }).data;
    // A missing/empty `data` field means "reachable, no models listed".
    if (!entries) return { available: true, models: [] };
    if (!Array.isArray(entries)) {
      return { available: false, models: [], error: 'Unexpected /models response: "data" is not an array' };
    }

    const models: string[] = [];
    for (const entry of entries as unknown[]) {
      if (typeof entry !== 'object' || entry === null) continue;
      const id = (entry as { id?: unknown }).id;
      if (typeof id === 'string') models.push(id);
    }
    return { available: true, models };
  } catch (err: unknown) {
    return { available: false, models: [], error: err instanceof Error ? err.message : String(err) };
  }
}
|