@principles/pd-cli 1.96.0 → 1.98.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/quality-scorecard.d.ts +9 -0
- package/dist/commands/quality-scorecard.d.ts.map +1 -0
- package/dist/commands/quality-scorecard.js +241 -0
- package/dist/commands/quality-scorecard.js.map +1 -0
- package/dist/commands/runtime-internalization-queue.d.ts.map +1 -1
- package/dist/commands/runtime-internalization-queue.js +35 -3
- package/dist/commands/runtime-internalization-queue.js.map +1 -1
- package/dist/index.js +21 -0
- package/dist/index.js.map +1 -1
- package/dist/services/quality-scorecard/data-extractor.d.ts +28 -0
- package/dist/services/quality-scorecard/data-extractor.d.ts.map +1 -0
- package/dist/services/quality-scorecard/data-extractor.js +118 -0
- package/dist/services/quality-scorecard/data-extractor.js.map +1 -0
- package/dist/services/quality-scorecard/local-evaluator.d.ts +18 -0
- package/dist/services/quality-scorecard/local-evaluator.d.ts.map +1 -0
- package/dist/services/quality-scorecard/local-evaluator.js +112 -0
- package/dist/services/quality-scorecard/local-evaluator.js.map +1 -0
- package/dist/services/quality-scorecard/strong-model-gate.d.ts +14 -0
- package/dist/services/quality-scorecard/strong-model-gate.d.ts.map +1 -0
- package/dist/services/quality-scorecard/strong-model-gate.js +128 -0
- package/dist/services/quality-scorecard/strong-model-gate.js.map +1 -0
- package/package.json +1 -1
- package/src/commands/quality-scorecard.ts +272 -0
- package/src/commands/runtime-internalization-queue.ts +37 -3
- package/src/index.ts +24 -0
- package/src/services/quality-scorecard/data-extractor.ts +150 -0
- package/src/services/quality-scorecard/local-evaluator.ts +142 -0
- package/src/services/quality-scorecard/strong-model-gate.ts +160 -0
- package/tests/commands/runtime-internalization-queue.test.ts +140 -0
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pd quality scorecard — CLI command (PRI-361)
|
|
3
|
+
*
|
|
4
|
+
* JSON contract: --json mode outputs EXACTLY one JSON object to stdout.
|
|
5
|
+
* All progress/diagnostic output goes to stderr.
|
|
6
|
+
* Errors produce structured JSON: { ok: false, error, nextAction }.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { mkdirSync, writeFileSync } from 'fs';
|
|
10
|
+
import { dirname } from 'path';
|
|
11
|
+
import type {
|
|
12
|
+
EpisodeEvaluation,
|
|
13
|
+
QualityScorecardReport,
|
|
14
|
+
StrongModelAdjudication,
|
|
15
|
+
} from '@principles/core/quality-scorecard';
|
|
16
|
+
import {
|
|
17
|
+
validateCliOptions,
|
|
18
|
+
needsAdjudication,
|
|
19
|
+
generateMarkdownReport,
|
|
20
|
+
generateHtmlReport,
|
|
21
|
+
generateJsonReport,
|
|
22
|
+
} from '@principles/core/quality-scorecard';
|
|
23
|
+
import { extractEpisodes, extractLogStats } from '../services/quality-scorecard/data-extractor.js';
|
|
24
|
+
import { evaluateWithLocalModel, checkLmStudioAvailable } from '../services/quality-scorecard/local-evaluator.js';
|
|
25
|
+
import { adjudicate, skippedAdjudication, determineFinalLabel } from '../services/quality-scorecard/strong-model-gate.js';
|
|
26
|
+
|
|
27
|
+
// ── Logging: stderr only, silent in JSON mode ──────────────────────
|
|
28
|
+
|
|
29
|
+
let jsonMode = false;

/**
 * Writes one newline-terminated diagnostic line to stderr.
 *
 * Nothing is written while the module-level `jsonMode` flag is set.
 *
 * @param msg - Text to emit (the trailing newline is appended here).
 */
function log(msg: string): void {
  if (jsonMode) return;
  process.stderr.write(`${msg}\n`);
}
|
|
36
|
+
|
|
37
|
+
// ── Structured JSON output helpers ─────────────────────────────────
|
|
38
|
+
|
|
39
|
+
/**
 * Serializes `data` as 2-space-indented JSON and writes it to stdout,
 * followed by a single newline.
 *
 * @param data - Any JSON-serializable value.
 */
function writeJsonOutput(data: unknown): void {
  const serialized = JSON.stringify(data, null, 2);
  process.stdout.write(`${serialized}\n`);
}
|
|
42
|
+
|
|
43
|
+
/**
 * Writes the structured failure object `{ ok: false, error, nextAction }`
 * to stdout as 2-space-indented JSON followed by a newline.
 *
 * @param error - Description of what went wrong.
 * @param nextAction - Suggested remediation for the caller.
 */
function writeJsonError(error: string, nextAction: string): void {
  const payload = { ok: false, error, nextAction };
  process.stdout.write(`${JSON.stringify(payload, null, 2)}\n`);
}
|
|
46
|
+
|
|
47
|
+
// ── Summary computation ────────────────────────────────────────────
|
|
48
|
+
|
|
49
|
+
/**
 * Aggregates per-episode evaluations into the report's summary counters.
 *
 * The "local" counters look only at the local evaluation:
 * a local pass needs `mvpMet` and `totalScore >= 12`, a local fail is
 * `totalScore <= 6`. The "final" counters are driven by `finalLabel`.
 *
 * @param evaluations - Evaluated episodes; may be empty.
 * @returns Counter object; `averageLocalScore` is 0 for an empty input.
 */
function computeSummary(evaluations: EpisodeEvaluation[]) {
  const totalEpisodes = evaluations.length;

  let localPassCount = 0;
  let localFailCount = 0;
  let strongModelReviewedCount = 0;
  let finalPassCount = 0;
  let finalFailCount = 0;
  let needsReviewCount = 0;
  let localOnlyCount = 0;
  let mvpThresholdMetCount = 0;
  let localScoreSum = 0;

  for (const entry of evaluations) {
    const local = entry.localEvaluation;
    const strong = entry.strongModelAdjudication;

    localScoreSum += local.totalScore;

    if (local.mvpMet) {
      mvpThresholdMetCount++;
      if (local.totalScore >= 12) localPassCount++;
    }
    if (local.totalScore <= 6) localFailCount++;

    // An adjudication only counts as a review when it was not skipped.
    if (strong && strong.adjudicationStatus !== 'skipped') strongModelReviewedCount++;

    switch (entry.finalLabel) {
      case 'pass':
        finalPassCount++;
        break;
      case 'fail':
        finalFailCount++;
        break;
      case 'needs-review':
        needsReviewCount++;
        break;
      case 'local-pass':
      case 'local-fail':
        localOnlyCount++;
        break;
      default:
        break;
    }
  }

  const averageLocalScore = totalEpisodes > 0 ? localScoreSum / totalEpisodes : 0;

  return {
    totalEpisodes, localPassCount, localFailCount, strongModelReviewedCount,
    finalPassCount, finalFailCount, needsReviewCount, localOnlyCount,
    averageLocalScore, mvpThresholdMetCount,
  };
}
|
|
73
|
+
|
|
74
|
+
// ── Main handler ───────────────────────────────────────────────────
|
|
75
|
+
|
|
76
|
+
/**
 * Reports a fatal scorecard error in the active output mode and marks the
 * process as failed (`process.exitCode = 1`).
 *
 * @param isJson - When true, a structured JSON error goes to stdout.
 * @param jsonError - Error text used in the JSON error object.
 * @param nextAction - Remediation hint used in the JSON error object.
 * @param textError - Complete text written to stderr in non-JSON mode.
 */
function failScorecard(isJson: boolean, jsonError: string, nextAction: string, textError: string): void {
  if (isJson) {
    writeJsonError(jsonError, nextAction);
  } else {
    process.stderr.write(textError);
  }
  process.exitCode = 1;
}

/**
 * Entry point for `pd quality scorecard`.
 *
 * Steps: validate options, check that trajectory.db exists, ensure the output
 * directory exists, check the local model endpoint, extract episodes, evaluate
 * each one (optionally adjudicating with a strong model), then build and write
 * the report.
 *
 * The failures handled here are reported through `failScorecard`, which sets
 * `process.exitCode = 1`; the function then returns instead of throwing. This
 * includes failures while extracting episodes and while writing the report
 * file, so `--json` runs still emit a structured error object for them.
 *
 * @param opts - Raw option bag from the CLI parser.
 */
export async function handleQualityScorecard(opts: Record<string, unknown>): Promise<void> {
  const isJson = Boolean(opts.json);
  jsonMode = isJson;

  // Resolve workspace paths
  const { resolveWorkspaceDir } = await import('../resolve-workspace.js');
  const { join } = await import('path');
  const { existsSync } = await import('fs');
  const workspace = resolveWorkspaceDir(opts.workspace as string | undefined);
  const dbPath = join(workspace, '.state', 'trajectory.db');
  const logsDir = join(workspace, '.state', 'logs');

  // 1. Validate CLI options
  const { options, errors } = validateCliOptions({
    dbPath,
    logsDir,
    localModelBaseUrl: opts.localUrl ?? 'http://localhost:12341/v1',
    localModelId: opts.localModel ?? 'qwen3.6-27b-mtp',
    strongModelId: opts.strongModel ?? null,
    limit: opts.limit ?? '0',
    // --json always forces the JSON report format.
    format: isJson ? 'json' : (opts.format ?? 'markdown'),
    output: opts.output,
    minPainScore: opts.minScore ?? '50',
    skipStrongModel: opts.skipStrongModel ?? false,
  });

  if (errors.length > 0) {
    failScorecard(
      isJson,
      errors.map(e => `${e.field}: ${e.message}`).join('; '),
      'Fix the invalid options and retry',
      `❌ Invalid options:\n${errors.map(e => ` - ${e.field}: ${e.message}`).join('\n')}\n`,
    );
    return;
  }

  // 2. Check files exist
  if (!existsSync(options.dbPath)) {
    const msg = `trajectory.db not found at: ${options.dbPath}`;
    failScorecard(
      isJson,
      msg,
      'Ensure the workspace has PD data (run PD first to generate trajectory.db)',
      `❌ ${msg}\n`,
    );
    return;
  }

  // 3. Ensure output directory exists
  const outputDir = dirname(options.output);
  if (outputDir && !existsSync(outputDir)) {
    try {
      mkdirSync(outputDir, { recursive: true });
      log(`Created output directory: ${outputDir}`);
    } catch (err: unknown) {
      const msg = err instanceof Error ? err.message : String(err);
      failScorecard(
        isJson,
        `Cannot create output directory: ${msg}`,
        'Ensure the output path is writable',
        `❌ Cannot create output directory: ${msg}\n`,
      );
      return;
    }
  }

  // 4. Check LM Studio
  log('🔍 PD Quality Scorecard — Starting...');
  log(` DB: ${options.dbPath}`);
  log(` Local Model: ${options.localModelId} @ ${options.localModelBaseUrl}`);
  log(` Strong Model: ${options.strongModelId ?? 'skipped'}`);

  const lmStatus = await checkLmStudioAvailable(options.localModelBaseUrl);
  if (!lmStatus.available) {
    failScorecard(
      isJson,
      `LM Studio not available: ${lmStatus.error}`,
      'Start LM Studio or check --local-url',
      `❌ LM Studio not available at ${options.localModelBaseUrl}: ${lmStatus.error}\n`,
    );
    return;
  }

  if (!lmStatus.models.includes(options.localModelId)) {
    const msg = `Model "${options.localModelId}" not found. Available: ${lmStatus.models.join(', ')}`;
    failScorecard(isJson, msg, 'Use --local-model with an available model', `❌ ${msg}\n`);
    return;
  }

  // 5. Extract data
  log('\n📊 Extracting dogfood data...');
  let extraction: Awaited<ReturnType<typeof extractEpisodes>>;
  try {
    extraction = await extractEpisodes(options.dbPath, {
      minScore: options.minPainScore,
      limit: options.limit,
    });
  } catch (err: unknown) {
    // Without this, a database failure would surface as an unhandled rejection
    // and --json callers would get no JSON object at all.
    const msg = err instanceof Error ? err.message : String(err);
    failScorecard(
      isJson,
      `Failed to extract episodes: ${msg}`,
      'Ensure trajectory.db is a readable PD database',
      `❌ Failed to extract episodes: ${msg}\n`,
    );
    return;
  }
  const { episodes, stats: extractStats } = extraction;
  log(` Found ${episodes.length} unique episodes (total pain events: ${extractStats.total})`);

  const logStats = extractLogStats(options.logsDir);
  log(` Event logs: ${logStats.totalEvents} events (${logStats.painSignalCount} pain signals)`);
  if (logStats.degradedReasons.length > 0) {
    // Surface log-reading problems instead of silently dropping them.
    log(` Event log warnings: ${logStats.degradedReasons.length} (first: ${logStats.degradedReasons[0]})`);
  }

  // 6. Evaluate each episode
  log('\n🤖 Running local model evaluation...');
  const evaluations: EpisodeEvaluation[] = [];

  for (let i = 0; i < episodes.length; i++) {
    const ep = episodes[i];
    if (!ep) continue;
    log(` [${i + 1}/${episodes.length}] ${ep.episodeId} (score=${ep.score})...`);

    const localEval = await evaluateWithLocalModel(ep, {
      baseUrl: options.localModelBaseUrl,
      model: options.localModelId,
    }, (msg: string) => log(` ${msg}`));
    log(` Local: ${localEval.totalScore}/14 MVP=${localEval.mvpMet} flags=[${localEval.flags.join(',')}]`);

    // 7. Strong model adjudication
    let adjudication: StrongModelAdjudication;
    if (options.skipStrongModel || !options.strongModelId) {
      adjudication = skippedAdjudication(
        options.skipStrongModel
          ? 'Strong model skipped by --skip-strong-model flag'
          : 'No strong model configured'
      );
    } else {
      const decision = needsAdjudication(ep, localEval);
      if (decision.shouldAdjudicate) {
        log(` Adjudicating (${decision.priority}: ${decision.reason})...`);
        adjudication = await adjudicate(ep, localEval, { modelId: options.strongModelId, log: (msg: string) => log(` ${msg}`) });
        log(` Adjudication: ${adjudication.adjudicationStatus}`);
      } else {
        adjudication = skippedAdjudication(decision.reason);
        log(` Adjudication skipped: ${decision.reason}`);
      }
    }

    const finalLabel = determineFinalLabel(localEval, adjudication);
    evaluations.push({ episode: ep, localEvaluation: localEval, strongModelAdjudication: adjudication, finalLabel });
  }

  // 8. Build and write report
  log('\n📝 Generating report...');
  const summary = computeSummary(evaluations);
  const report: QualityScorecardReport = {
    generatedAt: new Date().toISOString(),
    dataSource: {
      painEventCount: extractStats.total,
      // Only the pain-event total is exposed by the extractor; the remaining
      // counters are reported as 0.
      evolutionTaskCount: 0,
      principleEventCount: 0,
      gateBlockCount: 0,
      dateRange: extractStats.dateRange,
    },
    localEvaluatorConfig: {
      model: options.localModelId,
      // A trailing "/v<digits>" path segment is replaced with "/...".
      baseUrl: options.localModelBaseUrl.replace(/\/v\d+$/, '/...'),
      apiKeyStatus: 'not-required',
    },
    strongModelConfig: {
      model: options.strongModelId,
      status: options.skipStrongModel || !options.strongModelId ? 'skipped' : 'configured',
    },
    evaluations,
    summary,
    knownLimitations: [
      'Local model scores are advisory only — not final quality conclusions.',
      'Without strong-model adjudication, samples are marked local-pass/local-fail/needs-review.',
      'Deduplication is based on reason text similarity — may miss distinct episodes.',
      'Local model output is non-deterministic despite temperature=0.1.',
    ],
  };

  let content: string;
  switch (options.format) {
    case 'html': content = generateHtmlReport(report); break;
    case 'json': content = generateJsonReport(report); break;
    case 'markdown':
    default: content = generateMarkdownReport(report); break;
  }

  try {
    writeFileSync(options.output, content, 'utf-8');
  } catch (err: unknown) {
    const msg = err instanceof Error ? err.message : String(err);
    failScorecard(
      isJson,
      `Cannot write report to ${options.output}: ${msg}`,
      'Ensure the output path is writable',
      `❌ Cannot write report to ${options.output}: ${msg}\n`,
    );
    return;
  }

  log(`\n✅ Report written to: ${options.output}`);
  log(` Format: ${options.format}`);
  log(` Episodes: ${summary.totalEpisodes}`);
  log(` Local Pass: ${summary.localPassCount} | Local Fail: ${summary.localFailCount}`);
  log(` Strong Model Reviewed: ${summary.strongModelReviewedCount}`);
  log(` Final Pass: ${summary.finalPassCount} | Final Fail: ${summary.finalFailCount} | Needs Review: ${summary.needsReviewCount}`);

  // JSON mode: output exactly one JSON object to stdout
  if (isJson) {
    writeJsonOutput({ ok: true, report });
  }
}
|
|
@@ -12,13 +12,14 @@ import { createInternalizationQueueReadModel } from '@principles/core/runtime-v2
|
|
|
12
12
|
import type { InternalizationQueueSnapshot } from '@principles/core/runtime-v2';
|
|
13
13
|
import { resolveWorkspaceDir } from '../resolve-workspace.js';
|
|
14
14
|
import { loadEffectiveFeatureFlags } from '../services/feature-flag-loader.js';
|
|
15
|
+
import { loadPdConfig, computeFlagsFromLoadResult } from '../services/pd-config-loader.js';
|
|
15
16
|
|
|
16
17
|
interface QueueOptions {
|
|
17
18
|
workspace?: string;
|
|
18
19
|
json?: boolean;
|
|
19
20
|
}
|
|
20
21
|
|
|
21
|
-
function formatTextOutput(snap: InternalizationQueueSnapshot): string {
|
|
22
|
+
function formatTextOutput(snap: InternalizationQueueSnapshot, workspaceDir: string, autoConsumerEnabled: boolean): string {
|
|
22
23
|
const lines: string[] = [];
|
|
23
24
|
lines.push(`Internalization Queue Snapshot`);
|
|
24
25
|
lines.push(` pending: ${snap.pendingCount} retry_wait: ${snap.retryWaitCount}`);
|
|
@@ -78,6 +79,16 @@ function formatTextOutput(snap: InternalizationQueueSnapshot): string {
|
|
|
78
79
|
}
|
|
79
80
|
}
|
|
80
81
|
|
|
82
|
+
if (snap.readyTasks.length > 0) {
|
|
83
|
+
if (autoConsumerEnabled) {
|
|
84
|
+
lines.push(` consumerStatus: auto_consumer_enabled`);
|
|
85
|
+
} else {
|
|
86
|
+
const nextAction = `pd runtime internalization run-once --workspace "${workspaceDir}" --runner dreamer --runtime config --json`;
|
|
87
|
+
lines.push(` consumerStatus: manual_action_required`);
|
|
88
|
+
lines.push(` nextAction: ${nextAction}`);
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
|
|
81
92
|
return lines.join('\n');
|
|
82
93
|
}
|
|
83
94
|
|
|
@@ -95,10 +106,33 @@ export async function handleRuntimeInternalizationQueue(opts: QueueOptions): Pro
|
|
|
95
106
|
try {
|
|
96
107
|
const snapshot = await readModel.getSnapshot();
|
|
97
108
|
|
|
109
|
+
const pdConfigResult = loadPdConfig(workspaceDir);
|
|
110
|
+
if (!pdConfigResult.ok) {
|
|
111
|
+
const configWarning = JSON.stringify({
|
|
112
|
+
level: 'warning',
|
|
113
|
+
source: 'pd_config',
|
|
114
|
+
errors: pdConfigResult.errors.map(e => ({ reason: e.reason, nextAction: e.nextAction })),
|
|
115
|
+
});
|
|
116
|
+
process.stderr.write(`${configWarning}\n`);
|
|
117
|
+
}
|
|
118
|
+
const pdFlags = computeFlagsFromLoadResult(pdConfigResult);
|
|
119
|
+
const autoConsumerEnabled = pdFlags.flags.internalization_auto_consumer?.enabled ?? false;
|
|
120
|
+
|
|
98
121
|
if (opts.json) {
|
|
99
|
-
|
|
122
|
+
const output: Record<string, unknown> = { ...snapshot };
|
|
123
|
+
|
|
124
|
+
if (snapshot.readyTasks.length > 0) {
|
|
125
|
+
if (autoConsumerEnabled) {
|
|
126
|
+
output.consumerStatus = 'auto_consumer_enabled';
|
|
127
|
+
} else {
|
|
128
|
+
output.nextAction = `pd runtime internalization run-once --workspace "${workspaceDir}" --runner dreamer --runtime config --json`;
|
|
129
|
+
output.consumerStatus = 'manual_action_required';
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
console.log(JSON.stringify(output, null, 2));
|
|
100
134
|
} else {
|
|
101
|
-
console.log(formatTextOutput(snapshot));
|
|
135
|
+
console.log(formatTextOutput(snapshot, workspaceDir, autoConsumerEnabled));
|
|
102
136
|
}
|
|
103
137
|
} finally {
|
|
104
138
|
await close();
|
package/src/index.ts
CHANGED
|
@@ -927,4 +927,28 @@ consoleCmd.action(async (opts) => {
|
|
|
927
927
|
});
|
|
928
928
|
});
|
|
929
929
|
|
|
930
|
+
// ─── Quality Scorecard (PRI-361) ──────────────────────────────────
|
|
931
|
+
|
|
932
|
+
// Parent `quality` command; the scorecard subcommand is attached to it below.
const qualityCmd = program.command('quality').description('Quality scoring and evaluation');

// `quality scorecard`: the handler module is loaded on demand inside the action.
qualityCmd
  .command('scorecard')
  .description('Generate quality scorecard report for PD pain→diagnosis→principle chain')
  .option('-w, --workspace <path>', 'Workspace directory')
  .option('--local-model <id>', 'LM Studio model ID', 'qwen3.6-27b-mtp')
  .option('--local-url <url>', 'LM Studio base URL', 'http://localhost:12341/v1')
  .option('--strong-model <id>', 'Strong model for adjudication (provider/model)')
  .option('--skip-strong-model', 'Skip strong model adjudication', false)
  .option('--min-score <n>', 'Minimum pain score to evaluate', '50')
  .option('--limit <n>', 'Max episodes to evaluate (0=all)', '0')
  .option('--format <fmt>', 'Output format: json, markdown, html', 'markdown')
  .option('--output <path>', 'Output file path')
  .option('--json', 'Output as JSON', false)
  .action(async (opts) => {
    const scorecardModule = await import('./commands/quality-scorecard.js');
    await scorecardModule.handleQualityScorecard(opts);
  });
|
|
953
|
+
|
|
930
954
|
program.parse();
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PRI-361 — Data Extractor (I/O layer in pd-cli)
|
|
3
|
+
*
|
|
4
|
+
* Reads trajectory.db and event logs. Uses runtime validation
|
|
5
|
+
* from @principles/core — no `as RawPainEvent[]` casts.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { PainEpisode } from '@principles/core/quality-scorecard';
|
|
9
|
+
import {
|
|
10
|
+
validatePainRow,
|
|
11
|
+
validateEvolutionRow,
|
|
12
|
+
validatePrincipleEventRow,
|
|
13
|
+
validateGateRow,
|
|
14
|
+
sanitize,
|
|
15
|
+
truncate,
|
|
16
|
+
} from '@principles/core/quality-scorecard';
|
|
17
|
+
import { readdirSync, readFileSync } from 'fs';
|
|
18
|
+
import { join } from 'path';
|
|
19
|
+
|
|
20
|
+
/** Result of reading pain episodes out of trajectory.db. */
export interface ExtractionResult {
  /** De-duplicated episodes, most recent first. */
  episodes: PainEpisode[];
  /** `total` counts validated pain events before de-duplication. */
  stats: { total: number; dateRange: { from: string; to: string } };
}

/**
 * Reads pain events from the SQLite database at `dbPath` and turns them into
 * de-duplicated episodes, each linked to nearby evolution tasks, principle
 * events and per-session gate-block counts.
 *
 * `limit` caps the number of returned episodes and is applied after
 * de-duplication, so duplicate pain events do not consume the limit.
 * `stats.total` and `stats.dateRange` therefore describe every validated pain
 * event matching `minScore`, not just the ones that became episodes.
 *
 * @param dbPath - Path to the SQLite database; opened read-only.
 * @param options - `minScore`: minimum pain score (values <= 0 disable the
 *   filter). `limit`: maximum number of episodes (values <= 0 mean no limit).
 * @returns Episodes plus extraction statistics.
 * @throws Whatever better-sqlite3 throws when opening or querying the database.
 */
export async function extractEpisodes(
  dbPath: string,
  options: { minScore?: number; limit?: number } = {}
): Promise<ExtractionResult> {
  const Database = (await import('better-sqlite3')).default;
  const db = new Database(dbPath, { readonly: true });

  const minScore = options.minScore ?? 0;
  const limit = options.limit ?? 0;

  try {
    // Fetch pain events with a parameterized query. No SQL LIMIT: the limit
    // applies to unique episodes and is enforced in the loop below.
    let painQuery = 'SELECT id, session_id, source, score, reason, severity, created_at FROM pain_events WHERE 1=1';
    const params: (string | number)[] = [];
    if (minScore > 0) {
      painQuery += ' AND score >= ?';
      params.push(minScore);
    }
    painQuery += ' ORDER BY created_at DESC';

    const rawPainRows = db.prepare(painQuery).all(...params);
    // Validate each row — no unsafe cast
    const painEvents = rawPainRows.map(validatePainRow).filter((r): r is NonNullable<typeof r> => r !== null);

    // Fetch evolution tasks
    const rawEvoRows = db.prepare('SELECT task_id, score, status, resolution, created_at FROM evolution_tasks').all();
    const evoTasks = rawEvoRows.map(validateEvolutionRow).filter((r): r is NonNullable<typeof r> => r !== null);

    // Fetch principle events
    const rawPeRows = db.prepare('SELECT principle_id, event_type, created_at FROM principle_events').all();
    const prEvents = rawPeRows.map(validatePrincipleEventRow).filter((r): r is NonNullable<typeof r> => r !== null);

    // Gate blocks count per session — validate each row
    const rawGateRows = db.prepare('SELECT session_id, COUNT(*) as cnt FROM gate_blocks GROUP BY session_id').all();
    const gateRows = rawGateRows.map(validateGateRow).filter((r): r is NonNullable<typeof r> => r !== null);
    const gateBlockMap = new Map(gateRows.map(g => [g.session_id, g.cnt]));

    // Parse timestamps once instead of once per (pain event, candidate) pair.
    const timedTasks = evoTasks.map(task => ({ task, time: new Date(task.created_at).getTime() }));
    const timedPrinciples: { id: string; time: number }[] = [];
    for (const event of prEvents) {
      if (event.principle_id) {
        timedPrinciples.push({ id: event.principle_id, time: new Date(event.created_at).getTime() });
      }
    }

    // Deduplicate by the first 80 characters of the sanitized reason
    const seen = new Set<string>();
    const episodes: PainEpisode[] = [];

    for (const pe of painEvents) {
      if (limit > 0 && episodes.length >= limit) break;

      const dedupKey = sanitize(pe.reason).substring(0, 80);
      if (seen.has(dedupKey)) continue;
      seen.add(dedupKey);

      const peTime = new Date(pe.created_at).getTime();

      // First task created within one hour whose score is within 10 points.
      const linkedTask = timedTasks.find(
        ({ task, time }) => Math.abs(time - peTime) < 3600000 && Math.abs(task.score - pe.score) <= 10
      )?.task;

      // Distinct principle ids with an event within two hours, first-seen order.
      const linkedPrinciples = [
        ...new Set(
          timedPrinciples
            .filter(({ time }) => Math.abs(time - peTime) < 7200000)
            .map(({ id }) => id)
        ),
      ];

      episodes.push({
        episodeId: `EP-${pe.id}`,
        summary: truncate(sanitize(pe.reason)),
        source: pe.source,
        score: pe.score,
        severity: pe.severity,
        createdAt: pe.created_at,
        evolutionTaskResolution: linkedTask?.resolution ?? null,
        linkedPrinciples,
        gateBlockCount: gateBlockMap.get(pe.session_id) ?? 0,
      });
    }

    const dates = painEvents.map(e => e.created_at).sort();
    return {
      episodes,
      stats: {
        total: painEvents.length,
        dateRange: {
          // With no pain events, both bounds fall back to the current time.
          from: dates[0] ?? new Date().toISOString(),
          to: dates[dates.length - 1] ?? new Date().toISOString(),
        },
      },
    };
  } finally {
    db.close();
  }
}
|
|
119
|
+
|
|
120
|
+
/** Aggregate counters gathered from the `.jsonl` event logs. */
export interface LogStats {
  /** Number of lines that parsed to a JSON object. */
  totalEvents: number;
  /** Number of events whose `type` is `'pain_signal'`. */
  painSignalCount: number;
  /** One entry per problem met while reading the logs. */
  degradedReasons: string[];
}

/**
 * Counts events in every `*.jsonl` file directly inside `logsDir`.
 *
 * This function never throws; every problem is recorded in `degradedReasons`:
 * - `logs-dir-unreadable:<msg>` when the directory cannot be listed
 * - `log-file-unreadable:<file>:<msg>` when one file cannot be read
 *   (the remaining files are still processed)
 * - `jsonl-parse-fail:<file>:<msg>` for a line that is not valid JSON or is
 *   not a JSON object
 *
 * Blank lines, including the ones left by CRLF line endings, are ignored.
 *
 * @param logsDir - Directory containing the `.jsonl` event logs.
 * @returns Event counters and the list of degradation reasons.
 */
export function extractLogStats(logsDir: string): LogStats {
  const stats: LogStats = { totalEvents: 0, painSignalCount: 0, degradedReasons: [] };

  let files: string[];
  try {
    files = readdirSync(logsDir).filter(f => f.endsWith('.jsonl'));
  } catch (dirErr: unknown) {
    const msg = dirErr instanceof Error ? dirErr.message : String(dirErr);
    stats.degradedReasons.push(`logs-dir-unreadable:${msg}`);
    return stats;
  }

  for (const file of files) {
    let text: string;
    try {
      text = readFileSync(join(logsDir, file), 'utf-8');
    } catch (fileErr: unknown) {
      // A single bad entry must not hide the events in the other files.
      const msg = fileErr instanceof Error ? fileErr.message : String(fileErr);
      stats.degradedReasons.push(`log-file-unreadable:${file}:${msg}`);
      continue;
    }

    for (const rawLine of text.split(/\r?\n/)) {
      const line = rawLine.trim();
      if (line === '') continue;

      let parsed: unknown;
      try {
        parsed = JSON.parse(line);
      } catch (parseErr: unknown) {
        const msg = parseErr instanceof Error ? parseErr.message : String(parseErr);
        stats.degradedReasons.push(`jsonl-parse-fail:${file}:${msg}`);
        continue;
      }

      // Valid JSON that is not an object (null, number, array, ...) is not an event.
      if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
        stats.degradedReasons.push(`jsonl-parse-fail:${file}:not a JSON object`);
        continue;
      }

      stats.totalEvents++;
      if ((parsed as Record<string, unknown>).type === 'pain_signal') stats.painSignalCount++;
    }
  }

  return stats;
}
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PRI-361 — Local Evaluator (I/O layer in pd-cli)
|
|
3
|
+
*
|
|
4
|
+
* Calls LM Studio for advisory scoring. Uses core validation
|
|
5
|
+
* to parse LLM responses — no unsafe casts.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type {
|
|
9
|
+
PainEpisode,
|
|
10
|
+
LocalEvaluation,
|
|
11
|
+
RubricDimension,
|
|
12
|
+
RubricScore,
|
|
13
|
+
} from '@principles/core/quality-scorecard';
|
|
14
|
+
import {
|
|
15
|
+
RUBRIC_LABELS,
|
|
16
|
+
RUBRIC_PROMPTS,
|
|
17
|
+
RUBRIC_DIMENSIONS as DIMS,
|
|
18
|
+
meetsMvpThreshold,
|
|
19
|
+
sumScores,
|
|
20
|
+
validateLlmScoreResponse,
|
|
21
|
+
extractJsonFromLlmResponse,
|
|
22
|
+
} from '@principles/core/quality-scorecard';
|
|
23
|
+
|
|
24
|
+
/**
 * Renders the user prompt sent to the local model for one pain episode.
 *
 * The prompt lists the episode's fields, one line per rubric dimension
 * (`<dimension> (<label>): <prompt>`), a few extra checks, and the strict JSON
 * output shape the model must answer with.
 *
 * @param episode - Episode to describe in the prompt.
 * @returns The prompt text, lines joined with `\n`.
 */
function buildEvaluationPrompt(episode: PainEpisode): string {
  const rubricLines = DIMS.map(dim => `${dim} (${RUBRIC_LABELS[dim]}): ${RUBRIC_PROMPTS[dim]}`);
  const principles = episode.linkedPrinciples.length > 0 ? episode.linkedPrinciples.join(', ') : 'none';

  const promptLines = [
    "You are a quality evaluator for an AI agent's pain-signal -> diagnosis -> principle pipeline.",
    '',
    '## Task',
    'Evaluate this pain episode on a 7-dimension rubric. Each dimension scores 0 (fail), 1 (partial), or 2 (pass).',
    '',
    '## Pain Episode',
    `- ID: ${episode.episodeId}`,
    `- Source: ${episode.source}`,
    `- Pain Score: ${episode.score}`,
    `- Severity: ${episode.severity}`,
    `- Summary: ${episode.summary}`,
    `- Created: ${episode.createdAt}`,
    `- Evolution Task Resolution: ${episode.evolutionTaskResolution ?? 'none'}`,
    `- Linked Principles: ${principles}`,
    `- Gate Blocks: ${episode.gateBlockCount}`,
    '',
    '## Rubric Dimensions',
    // Kept as one element so an empty rubric still yields an empty line.
    rubricLines.join('\n'),
    '',
    '## Additional Checks',
    '- Is the language consistent (not mixing Chinese and English incoherently)?',
    '- Is the diagnosis/principle overly abstract (no concrete actionable guidance)?',
    '- Does it fabricate non-existent evidence, axioms, or references?',
    '',
    '## Output Format (STRICT JSON)',
    'Respond with ONLY a JSON object:',
    '{',
    '"scores": { "G1": 0-2, "G2": 0-2, "G3": 0-2, "G4": 0-2, "G5": 0-2, "G6": 0-2, "G7": 0-2 },',
    '"rationales": { "G1": "...", "G2": "...", "G3": "...", "G4": "...", "G5": "...", "G6": "...", "G7": "..." },',
    '"flags": ["list of issues found"]',
    '}',
    '',
    'Do NOT output anything other than this JSON object.',
  ];

  return promptLines.join('\n');
}
|
|
61
|
+
|
|
62
|
+
/** Connection settings for the local chat-completions endpoint. */
export interface LocalEvaluatorConfig {
  /** Base URL; `/chat/completions` is appended after trimming trailing slashes. */
  baseUrl: string;
  /** Model identifier sent in the request body. */
  model: string;
}

/**
 * Scores one episode by POSTing the evaluation prompt to
 * `<baseUrl>/chat/completions` and validating the JSON found in the reply.
 *
 * Failures inside the request/parse/validate path (non-2xx status, timeout
 * after 120 s, non-JSON reply, validation error) do not reject: the error is
 * passed to `log` and an all-zero evaluation flagged `evaluation_error` is
 * returned.
 *
 * @param episode - Episode to evaluate.
 * @param config - Endpoint and model to use.
 * @param log - Sink for diagnostic messages.
 * @returns The parsed evaluation, or the zero-score fallback on failure.
 */
export async function evaluateWithLocalModel(
  episode: PainEpisode,
  config: LocalEvaluatorConfig,
  log: (msg: string) => void
): Promise<LocalEvaluation> {
  const prompt = buildEvaluationPrompt(episode);
  const endpoint = `${config.baseUrl.replace(/\/+$/, '')}/chat/completions`;

  try {
    const requestPayload = {
      model: config.model,
      messages: [
        { role: 'system', content: 'You are a precise JSON-output quality evaluator. Output only valid JSON.' },
        { role: 'user', content: prompt },
      ],
      temperature: 0.1,
      max_tokens: 2000,
    };

    const response = await fetch(endpoint, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(requestPayload),
      signal: AbortSignal.timeout(120_000),
    });
    if (!response.ok) {
      throw new Error(`LM Studio request failed: ${response.status}`);
    }

    const completion = (await response.json()) as { choices: { message: { content: string } }[] };
    const firstChoice = completion.choices?.[0];
    const replyText = firstChoice?.message?.content ?? '';

    const replyJson = extractJsonFromLlmResponse(replyText);
    if (replyJson === null) {
      throw new Error(`LM Studio returned non-JSON response`);
    }

    const validated = validateLlmScoreResponse(replyJson);

    return {
      model: config.model,
      dimensionScores: validated.scores,
      dimensionRationales: validated.rationales,
      totalScore: sumScores(validated.scores),
      maxScore: 14,
      mvpMet: meetsMvpThreshold(validated.scores),
      flags: validated.flags,
    };
  } catch (err: unknown) {
    const msg = err instanceof Error ? err.message : String(err);
    log(`Evaluation error for ${episode.episodeId}: ${msg}`);

    // Fallback: every dimension scores 0 and carries the failure message.
    const zeroScores = {} as Record<RubricDimension, RubricScore>;
    const failureRationales = {} as Record<RubricDimension, string>;
    for (const dim of DIMS) {
      zeroScores[dim] = 0;
      failureRationales[dim] = `Evaluation failed: ${msg}`;
    }

    return {
      model: config.model,
      dimensionScores: zeroScores,
      dimensionRationales: failureRationales,
      totalScore: 0,
      maxScore: 14,
      mvpMet: false,
      flags: ['evaluation_error'],
    };
  }
}
|
|
130
|
+
|
|
131
|
+
/**
 * Probes `<baseUrl>/models` (5 s timeout) and lists the model ids it reports.
 *
 * The response body is validated instead of being trusted:
 * - a body that is not a JSON object, or whose `data` is present but not an
 *   array, yields `available: false` with a descriptive `error`
 * - a missing or falsy `data` yields an empty model list
 * - entries without a string `id` are skipped
 *
 * This function never rejects; network, timeout and parse failures are
 * returned as `available: false` with the error message.
 *
 * @param baseUrl - Base URL of the API; trailing slashes are ignored.
 * @returns Availability flag, model ids, and an error message on failure.
 */
export async function checkLmStudioAvailable(baseUrl: string): Promise<{ available: boolean; models: string[]; error?: string }> {
  try {
    const url = `${baseUrl.replace(/\/+$/, '')}/models`;
    const resp = await fetch(url, { signal: AbortSignal.timeout(5000) });
    if (!resp.ok) return { available: false, models: [], error: `HTTP ${resp.status}` };

    const payload: unknown = await resp.json();
    if (typeof payload !== 'object' || payload === null) {
      return { available: false, models: [], error: 'Unexpected /models response: not a JSON object' };
    }

    const rawList = (payload as { data?: unknown }).data;
    if (rawList && !Array.isArray(rawList)) {
      return { available: false, models: [], error: 'Unexpected /models response: "data" is not an array' };
    }

    const entries: unknown[] = Array.isArray(rawList) ? rawList : [];
    const models: string[] = [];
    for (const entry of entries) {
      if (typeof entry !== 'object' || entry === null) continue;
      const id = (entry as { id?: unknown }).id;
      if (typeof id === 'string') models.push(id);
    }
    return { available: true, models };
  } catch (err: unknown) {
    return { available: false, models: [], error: err instanceof Error ? err.message : String(err) };
  }
}
|