@principles/pd-cli 1.96.0 → 1.97.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/quality-scorecard.d.ts +9 -0
- package/dist/commands/quality-scorecard.d.ts.map +1 -0
- package/dist/commands/quality-scorecard.js +241 -0
- package/dist/commands/quality-scorecard.js.map +1 -0
- package/dist/index.js +21 -0
- package/dist/index.js.map +1 -1
- package/dist/services/quality-scorecard/data-extractor.d.ts +28 -0
- package/dist/services/quality-scorecard/data-extractor.d.ts.map +1 -0
- package/dist/services/quality-scorecard/data-extractor.js +118 -0
- package/dist/services/quality-scorecard/data-extractor.js.map +1 -0
- package/dist/services/quality-scorecard/local-evaluator.d.ts +18 -0
- package/dist/services/quality-scorecard/local-evaluator.d.ts.map +1 -0
- package/dist/services/quality-scorecard/local-evaluator.js +112 -0
- package/dist/services/quality-scorecard/local-evaluator.js.map +1 -0
- package/dist/services/quality-scorecard/strong-model-gate.d.ts +14 -0
- package/dist/services/quality-scorecard/strong-model-gate.d.ts.map +1 -0
- package/dist/services/quality-scorecard/strong-model-gate.js +128 -0
- package/dist/services/quality-scorecard/strong-model-gate.js.map +1 -0
- package/package.json +1 -1
- package/src/commands/quality-scorecard.ts +272 -0
- package/src/index.ts +24 -0
- package/src/services/quality-scorecard/data-extractor.ts +150 -0
- package/src/services/quality-scorecard/local-evaluator.ts +142 -0
- package/src/services/quality-scorecard/strong-model-gate.ts +160 -0
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PRI-361 — Local Evaluator (I/O layer in pd-cli)
|
|
3
|
+
*
|
|
4
|
+
* Calls LM Studio for advisory scoring. Uses core validation
|
|
5
|
+
* to parse LLM responses — no unsafe casts.
|
|
6
|
+
*/
|
|
7
|
+
import { RUBRIC_LABELS, RUBRIC_PROMPTS, RUBRIC_DIMENSIONS as DIMS, meetsMvpThreshold, sumScores, validateLlmScoreResponse, extractJsonFromLlmResponse, } from '@principles/core/quality-scorecard';
|
|
8
|
+
/**
 * Builds the user prompt sent to the local model for one pain episode.
 *
 * The prompt contains the episode's fields, one line per rubric dimension
 * (id, label and scoring guidance), a short list of extra checks, and the
 * strict JSON shape the model is asked to answer with.
 *
 * @param {object} episode - Pain episode whose fields are interpolated below.
 * @returns {string} The complete prompt text.
 */
function buildEvaluationPrompt(episode) {
    const describeDimension = (dim) => `${dim} (${RUBRIC_LABELS[dim]}): ${RUBRIC_PROMPTS[dim]}`;
    const dimensions = DIMS.map(describeDimension).join('\n');
    // One array entry per prompt line; '' entries are the blank separator lines.
    const promptLines = [
        "You are a quality evaluator for an AI agent's pain-signal -> diagnosis -> principle pipeline.",
        '',
        '## Task',
        'Evaluate this pain episode on a 7-dimension rubric. Each dimension scores 0 (fail), 1 (partial), or 2 (pass).',
        '',
        '## Pain Episode',
        `- ID: ${episode.episodeId}`,
        `- Source: ${episode.source}`,
        `- Pain Score: ${episode.score}`,
        `- Severity: ${episode.severity}`,
        `- Summary: ${episode.summary}`,
        `- Created: ${episode.createdAt}`,
        `- Evolution Task Resolution: ${episode.evolutionTaskResolution ?? 'none'}`,
        `- Linked Principles: ${episode.linkedPrinciples.length > 0 ? episode.linkedPrinciples.join(', ') : 'none'}`,
        `- Gate Blocks: ${episode.gateBlockCount}`,
        '',
        '## Rubric Dimensions',
        `${dimensions}`,
        '',
        '## Additional Checks',
        '- Is the language consistent (not mixing Chinese and English incoherently)?',
        '- Is the diagnosis/principle overly abstract (no concrete actionable guidance)?',
        '- Does it fabricate non-existent evidence, axioms, or references?',
        '',
        '## Output Format (STRICT JSON)',
        'Respond with ONLY a JSON object:',
        '{',
        '"scores": { "G1": 0-2, "G2": 0-2, "G3": 0-2, "G4": 0-2, "G5": 0-2, "G6": 0-2, "G7": 0-2 },',
        '"rationales": { "G1": "...", "G2": "...", "G3": "...", "G4": "...", "G5": "...", "G6": "...", "G7": "..." },',
        '"flags": ["list of issues found"]',
        '}',
        '',
        'Do NOT output anything other than this JSON object.',
    ];
    return promptLines.join('\n');
}
|
|
44
|
+
/**
 * Scores one pain episode with the local model behind an OpenAI-style
 * `/chat/completions` endpoint.
 *
 * Failures while calling the model, parsing its reply or validating it do not
 * reject: they are reported through `log` and converted into an all-zero
 * evaluation flagged `evaluation_error`. Errors raised while building the
 * prompt or the URL happen before the `try` and do propagate.
 *
 * @param {object} episode - Pain episode to evaluate.
 * @param {{ baseUrl: string, model: string }} config - Endpoint base URL (trailing slashes are stripped) and model id.
 * @param {(msg: string) => void} log - Sink for diagnostic messages.
 * @returns {Promise<object>} Evaluation with per-dimension scores and rationales, total, max score, MVP flag and flags.
 */
export async function evaluateWithLocalModel(episode, config, log) {
    const prompt = buildEvaluationPrompt(episode);
    const endpoint = `${config.baseUrl.replace(/\/+$/, '')}/chat/completions`;
    try {
        const response = await fetch(endpoint, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                model: config.model,
                messages: [
                    { role: 'system', content: 'You are a precise JSON-output quality evaluator. Output only valid JSON.' },
                    { role: 'user', content: prompt },
                ],
                temperature: 0.1,
                max_tokens: 2000,
            }),
            // Abort the request if the model has not answered within two minutes.
            signal: AbortSignal.timeout(120_000),
        });
        if (!response.ok) {
            throw new Error(`LM Studio request failed: ${response.status}`);
        }
        const payload = await response.json();
        const replyText = payload.choices?.[0]?.message?.content ?? '';
        const parsed = extractJsonFromLlmResponse(replyText);
        if (parsed === null) {
            throw new Error(`LM Studio returned non-JSON response`);
        }
        const validated = validateLlmScoreResponse(parsed);
        const totalScore = sumScores(validated.scores);
        return {
            model: config.model,
            dimensionScores: validated.scores,
            dimensionRationales: validated.rationales,
            totalScore,
            maxScore: 14,
            mvpMet: meetsMvpThreshold(validated.scores),
            flags: validated.flags,
        };
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        log(`Evaluation error for ${episode.episodeId}: ${reason}`);
        // Fallback result: every dimension scores 0 and carries the failure reason.
        const zeroScores = Object.fromEntries(DIMS.map((dim) => [dim, 0]));
        const failureRationales = Object.fromEntries(DIMS.map((dim) => [dim, `Evaluation failed: ${reason}`]));
        return {
            model: config.model,
            dimensionScores: zeroScores,
            dimensionRationales: failureRationales,
            totalScore: 0,
            maxScore: 14,
            mvpMet: false,
            flags: ['evaluation_error'],
        };
    }
}
|
|
98
|
+
/**
 * Probes the `/models` endpoint of an OpenAI-style server and lists the model
 * ids it reports.
 *
 * Never rejects: network errors, timeouts and invalid JSON are returned as
 * `{ available: false, models: [], error }`.
 *
 * @param {string} baseUrl - Server base URL; trailing slashes are stripped.
 * @returns {Promise<{ available: boolean, models: string[], error?: string }>}
 *   `available` is true when the server answered with a 2xx status and a JSON body.
 */
export async function checkLmStudioAvailable(baseUrl) {
    try {
        const url = `${baseUrl.replace(/\/+$/, '')}/models`;
        // Short timeout: this is only a reachability probe.
        const resp = await fetch(url, { signal: AbortSignal.timeout(5000) });
        if (!resp.ok) {
            return { available: false, models: [], error: `HTTP ${resp.status}` };
        }
        const data = await resp.json();
        // A reachable server with an unexpected payload shape is still "available";
        // tolerate a missing/non-array list and drop entries without a string id
        // instead of throwing or leaking `undefined` into the model list.
        const entries = Array.isArray(data?.data) ? data.data : [];
        const models = entries
            .map((entry) => entry?.id)
            .filter((id) => typeof id === 'string');
        return { available: true, models };
    }
    catch (err) {
        return { available: false, models: [], error: err instanceof Error ? err.message : String(err) };
    }
}
|
|
112
|
+
//# sourceMappingURL=local-evaluator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"local-evaluator.js","sourceRoot":"","sources":["../../../src/services/quality-scorecard/local-evaluator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAQH,OAAO,EACL,aAAa,EACb,cAAc,EACd,iBAAiB,IAAI,IAAI,EACzB,iBAAiB,EACjB,SAAS,EACT,wBAAwB,EACxB,0BAA0B,GAC3B,MAAM,oCAAoC,CAAC;AAE5C,SAAS,qBAAqB,CAAC,OAAoB;IACjD,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,CAAC,KAAK,aAAa,CAAC,CAAC,CAAC,MAAM,cAAc,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEhG,OAAO;;;;;;QAMD,OAAO,CAAC,SAAS;YACb,OAAO,CAAC,MAAM;gBACV,OAAO,CAAC,KAAK;cACf,OAAO,CAAC,QAAQ;aACjB,OAAO,CAAC,OAAO;aACf,OAAO,CAAC,SAAS;+BACC,OAAO,CAAC,uBAAuB,IAAI,MAAM;uBACjD,OAAO,CAAC,gBAAgB,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM;iBACxF,OAAO,CAAC,cAAc;;;EAGrC,UAAU;;;;;;;;;;;;;;;oDAewC,CAAC;AACrD,CAAC;AAOD,MAAM,CAAC,KAAK,UAAU,sBAAsB,CAC1C,OAAoB,EACpB,MAA4B,EAC5B,GAA0B;IAE1B,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;IAC9C,MAAM,GAAG,GAAG,GAAG,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,mBAAmB,CAAC;IAErE,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAC5B,MAAM,EAAE,MAAM;YACd,OAAO,EAAE,EAAE,cAAc,EAAE,kBAAkB,EAAE;YAC/C,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;gBACnB,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,QAAQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,0EAA0E,EAAE;oBACvG,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE;iBAClC;gBACD,WAAW,EAAE,GAAG;gBAChB,UAAU,EAAE,IAAI;aACjB,CAAC;YACF,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC;SACrC,CAAC,CAAC;QAEH,IAAI,CAAC,IAAI,CAAC,EAAE,EAAE,CAAC;YACb,MAAM,IAAI,KAAK,CAAC,6BAA6B,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;QAC9D,CAAC;QAED,MAAM,IAAI,GAAG,CAAC,MAAM,IAAI,CAAC,IAAI,EAAE,CAAoD,CAAC;QACpF,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,IAAI,EAAE,CAAC;QAE1D,MAAM,MAAM,GAAG,0BAA0B,CAAC,OAAO,CAAC,CAAC;QACnD,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;YACpB,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;QAC1D,CAAC;QAED,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,KAAK,EAAE,GAAG,wBAAwB,CAAC,MAAM,CAAC,CAAC;QACvE,MAAM,UAAU,GAAG,SAAS,CAAC,MAAM
,CAAC,CAAC;QAErC,OAAO;YACL,KAAK,EAAE,MAAM,CAAC,KAAK;YACnB,eAAe,EAAE,MAAM;YACvB,mBAAmB,EAAE,UAAU;YAC/B,UAAU;YACV,QAAQ,EAAE,EAAE;YACZ,MAAM,EAAE,iBAAiB,CAAC,MAAM,CAAC;YACjC,KAAK,EAAE,KAAK;SACb,CAAC;IACJ,CAAC;IAAC,OAAO,GAAY,EAAE,CAAC;QACtB,MAAM,GAAG,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;QAC7D,GAAG,CAAC,wBAAwB,OAAO,CAAC,SAAS,KAAK,GAAG,EAAE,CAAC,CAAC;QACzD,MAAM,UAAU,GAAG,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAyC,CAAC;QACrG,OAAO;YACL,KAAK,EAAE,MAAM,CAAC,KAAK;YACnB,eAAe,EAAE,UAAU;YAC3B,mBAAmB,EAAE,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,sBAAsB,GAAG,EAAE,CAAC,CAAC,CAAoC;YAC3H,UAAU,EAAE,CAAC;YACb,QAAQ,EAAE,EAAE;YACZ,MAAM,EAAE,KAAK;YACb,KAAK,EAAE,CAAC,kBAAkB,CAAC;SAC5B,CAAC;IACJ,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,sBAAsB,CAAC,OAAe;IAC1D,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,SAAS,CAAC;QACpD,MAAM,IAAI,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE,EAAE,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACrE,IAAI,CAAC,IAAI,CAAC,EAAE;YAAE,OAAO,EAAE,SAAS,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE,EAAE,KAAK,EAAE,QAAQ,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC;QACpF,MAAM,IAAI,GAAG,CAAC,MAAM,IAAI,CAAC,IAAI,EAAE,CAA+B,CAAC;QAC/D,MAAM,MAAM,GAAG,CAAC,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QAClD,OAAO,EAAE,SAAS,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC;IACrC,CAAC;IAAC,OAAO,GAAY,EAAE,CAAC;QACtB,OAAO,EAAE,SAAS,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE,EAAE,KAAK,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC;IACnG,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PRI-361 — Strong Model Adjudication Gate (I/O layer in pd-cli)
|
|
3
|
+
*
|
|
4
|
+
* Calls cloud model for adjudication. Uses core validation
|
|
5
|
+
* to parse responses — no unsafe casts.
|
|
6
|
+
*/
|
|
7
|
+
import type { PainEpisode, LocalEvaluation, StrongModelAdjudication, AdjudicationStatus } from '@principles/core/quality-scorecard';
|
|
8
|
+
/**
 * Asks the strong (cloud) model to adjudicate a pain episode that was
 * already scored by the local model.
 *
 * @param episode - The pain episode under review.
 * @param localEval - The local model's evaluation of that episode.
 * @param config - `modelId` of the strong model and a `log` sink for diagnostics.
 * @returns A promise for the strong model's adjudication record.
 */
export declare function adjudicate(episode: PainEpisode, localEval: LocalEvaluation, config: {
|
|
9
|
+
modelId: string;
|
|
|
10
|
+
log: (msg: string) => void;
|
|
|
11
|
+
}): Promise<StrongModelAdjudication>;
|
|
12
|
+
/**
 * Builds the adjudication record used when strong-model adjudication did not run.
 *
 * @param reason - Why adjudication was skipped; stored as the record's rationale.
 */
export declare function skippedAdjudication(reason: string): StrongModelAdjudication;
|
|
13
|
+
/**
 * Chooses the final label for an episode from the local evaluation and the
 * (possibly absent or skipped) strong-model adjudication.
 *
 * @param localEval - The local model's evaluation.
 * @param adjudication - The strong-model adjudication, or `null` when there is none.
 */
export declare function determineFinalLabel(localEval: LocalEvaluation, adjudication: StrongModelAdjudication | null): AdjudicationStatus;
|
|
14
|
+
//# sourceMappingURL=strong-model-gate.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"strong-model-gate.d.ts","sourceRoot":"","sources":["../../../src/services/quality-scorecard/strong-model-gate.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EACV,WAAW,EACX,eAAe,EACf,uBAAuB,EACvB,kBAAkB,EACnB,MAAM,oCAAoC,CAAC;AAkD5C,wBAAsB,UAAU,CAC9B,OAAO,EAAE,WAAW,EACpB,SAAS,EAAE,eAAe,EAC1B,MAAM,EAAE;IAAE,OAAO,EAAE,MAAM,CAAC;IAAC,GAAG,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,IAAI,CAAA;CAAE,GACtD,OAAO,CAAC,uBAAuB,CAAC,CAsElC;AAED,wBAAgB,mBAAmB,CAAC,MAAM,EAAE,MAAM,GAAG,uBAAuB,CAS3E;AAED,wBAAgB,mBAAmB,CACjC,SAAS,EAAE,eAAe,EAC1B,YAAY,EAAE,uBAAuB,GAAG,IAAI,GAC3C,kBAAkB,CAOpB"}
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PRI-361 — Strong Model Adjudication Gate (I/O layer in pd-cli)
|
|
3
|
+
*
|
|
4
|
+
* Calls cloud model for adjudication. Uses core validation
|
|
5
|
+
* to parse responses — no unsafe casts.
|
|
6
|
+
*/
|
|
7
|
+
import { RUBRIC_LABELS, RUBRIC_DIMENSIONS as DIMS, meetsMvpThreshold, validateAdjudicationResponse, extractJsonFromLlmResponse, } from '@principles/core/quality-scorecard';
|
|
8
|
+
/**
 * Builds the user prompt sent to the strong model for adjudication.
 *
 * The prompt shows the episode's fields, the local model's score and
 * rationale for every rubric dimension plus its flags, the adjudication task,
 * and the strict JSON shape the model is asked to answer with.
 *
 * @param {object} episode - Pain episode under review.
 * @param {object} localEval - Local evaluation (model, dimensionScores, dimensionRationales, flags).
 * @returns {string} The complete prompt text.
 */
function buildAdjudicationPrompt(episode, localEval) {
    const describeLocalScore = (dim) => `- ${dim} (${RUBRIC_LABELS[dim]}): ${localEval.dimensionScores[dim]}/2 — ${localEval.dimensionRationales[dim]}`;
    const localScores = DIMS.map(describeLocalScore).join('\n');
    // One array entry per prompt line; '' entries are the blank separator lines.
    const promptLines = [
        'You are a senior quality adjudicator for an AI agent evolution pipeline.',
        'Your job is to independently re-evaluate a pain episode that was first scored by a local (smaller) model.',
        "You must provide your own scores — do NOT simply copy the local model's scores.",
        '',
        '## Pain Episode',
        `- ID: ${episode.episodeId}`,
        `- Source: ${episode.source}`,
        `- Pain Score: ${episode.score}`,
        `- Severity: ${episode.severity}`,
        `- Summary: ${episode.summary}`,
        `- Evolution Task Resolution: ${episode.evolutionTaskResolution ?? 'none'}`,
        `- Linked Principles: ${episode.linkedPrinciples.length > 0 ? episode.linkedPrinciples.join(', ') : 'none'}`,
        '',
        `## Local Model Scores (${localEval.model})`,
        `${localScores}`,
        `Flags: ${localEval.flags.length > 0 ? localEval.flags.join(', ') : 'none'}`,
        '',
        '## Your Task',
        '1. Independently score each dimension (0/1/2) based on the evidence.',
        '2. Check for: language inconsistency, over-abstraction, fabricated evidence.',
        '3. If your scores differ from the local model by >=2 points on any dimension, explain why.',
        '4. Give a final verdict: pass, fail, or needs-review.',
        '',
        '## Output Format (STRICT JSON)',
        '{',
        '"scores": { "G1": 0-2, "G2": 0-2, "G3": 0-2, "G4": 0-2, "G5": 0-2, "G6": 0-2, "G7": 0-2 },',
        '"rationale": "Overall assessment...",',
        '"verdict": "pass" | "fail" | "needs-review"',
        '}',
        '',
        'Do NOT output anything other than this JSON object.',
    ];
    return promptLines.join('\n');
}
|
|
42
|
+
/**
 * Asks the strong model (OpenAI-style `/chat/completions`) to adjudicate a
 * pain episode that was already scored by the local model.
 *
 * Reads `OPENAI_API_KEY` (required) and `OPENAI_BASE_URL` (optional, defaults
 * to `https://api.openai.com/v1`) from the environment. Trailing slashes on
 * the base URL are stripped so `…/v1/` and `…/v1` hit the same endpoint.
 *
 * Request, parse and validation failures do not reject: they are reported
 * through `config.log` and returned as a `needs-review` record. A missing API
 * key also yields `needs-review` without making any request.
 *
 * @param {object} episode - Pain episode under review.
 * @param {object} localEval - The local model's evaluation of the episode.
 * @param {{ modelId: string, log: (msg: string) => void }} config - Strong model id and diagnostic sink.
 * @returns {Promise<object>} Adjudication record (model, adjudicationStatus, confirmedScores, confirmedMvpMet, rationale, nextAction).
 */
export async function adjudicate(episode, localEval, config) {
    const { modelId: strongModelId, log } = config;
    const apiKey = process.env.OPENAI_API_KEY;
    if (!apiKey) {
        return {
            model: strongModelId,
            adjudicationStatus: 'needs-review',
            confirmedScores: null,
            confirmedMvpMet: null,
            rationale: 'OPENAI_API_KEY not set — cannot run strong-model adjudication',
            nextAction: 'Set OPENAI_API_KEY and re-run with --strong-model',
        };
    }
    // `||` on purpose: an empty OPENAI_BASE_URL should fall back to the default.
    const baseUrl = (process.env.OPENAI_BASE_URL || 'https://api.openai.com/v1').replace(/\/+$/, '');
    // Built only once a request will actually be made.
    const prompt = buildAdjudicationPrompt(episode, localEval);
    try {
        const resp = await fetch(`${baseUrl}/chat/completions`, {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                Authorization: `Bearer ${apiKey}`,
            },
            body: JSON.stringify({
                model: strongModelId,
                messages: [
                    { role: 'system', content: 'You are a precise JSON-output quality adjudicator. Output only valid JSON.' },
                    { role: 'user', content: prompt },
                ],
                temperature: 0.1,
                max_tokens: 2000,
            }),
            // Abort the request if the model has not answered within two minutes.
            signal: AbortSignal.timeout(120_000),
        });
        if (!resp.ok) {
            throw new Error(`Strong model request failed: ${resp.status}`);
        }
        const data = await resp.json();
        const content = data.choices?.[0]?.message?.content ?? '';
        const parsed = extractJsonFromLlmResponse(content);
        if (parsed === null) {
            throw new Error('Strong model returned non-JSON');
        }
        const validated = validateAdjudicationResponse(parsed);
        const { scores, verdict } = validated;
        return {
            model: strongModelId,
            adjudicationStatus: verdict,
            confirmedScores: scores,
            confirmedMvpMet: meetsMvpThreshold(scores),
            rationale: validated.rationale,
            nextAction: null,
        };
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        log(`Adjudication error: ${msg}`);
        return {
            model: strongModelId,
            adjudicationStatus: 'needs-review',
            confirmedScores: null,
            confirmedMvpMet: null,
            rationale: `Adjudication failed: ${msg}`,
            nextAction: 'Retry with strong model or manually review',
        };
    }
}
|
|
108
|
+
/**
 * Builds the adjudication record used when strong-model adjudication did not run.
 *
 * @param {string} reason - Why adjudication was skipped; stored as the rationale.
 * @returns {object} Record with status `skipped`, model `none` and no confirmed scores.
 */
export function skippedAdjudication(reason) {
    const followUp = 'Configure and run strong-model adjudication for final quality verdict';
    const record = {
        model: 'none',
        adjudicationStatus: 'skipped',
        confirmedScores: null,
        confirmedMvpMet: null,
        rationale: reason,
        nextAction: followUp,
    };
    return record;
}
|
|
118
|
+
/**
 * Chooses the final label for an episode.
 *
 * When a strong-model adjudication exists and was not skipped, its status is
 * the label. Otherwise the label is derived from the local evaluation:
 *   - `needs-review` when the local evaluation itself failed (flag
 *     `evaluation_error`): its all-zero scores describe an infrastructure
 *     failure, not the episode's quality, so it must not count as `local-fail`;
 *   - `local-pass` when the MVP threshold is met and the total is at least 12;
 *   - `local-fail` when the total is 6 or less;
 *   - `needs-review` for everything in between.
 *
 * @param {object} localEval - Local evaluation (mvpMet, totalScore, flags).
 * @param {object|null} adjudication - Strong-model adjudication, or null/undefined when none.
 * @returns {string} The final label.
 */
export function determineFinalLabel(localEval, adjudication) {
    if (adjudication && adjudication.adjudicationStatus !== 'skipped') {
        return adjudication.adjudicationStatus;
    }
    // `flags` is checked defensively so partial objects from older callers still work.
    const evaluationFailed = Array.isArray(localEval.flags) && localEval.flags.includes('evaluation_error');
    if (evaluationFailed) {
        return 'needs-review';
    }
    if (localEval.mvpMet && localEval.totalScore >= 12) {
        return 'local-pass';
    }
    if (localEval.totalScore <= 6) {
        return 'local-fail';
    }
    return 'needs-review';
}
|
|
128
|
+
//# sourceMappingURL=strong-model-gate.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"strong-model-gate.js","sourceRoot":"","sources":["../../../src/services/quality-scorecard/strong-model-gate.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAQH,OAAO,EACL,aAAa,EACb,iBAAiB,IAAI,IAAI,EACzB,iBAAiB,EACjB,4BAA4B,EAC5B,0BAA0B,GAC3B,MAAM,oCAAoC,CAAC;AAE5C,SAAS,uBAAuB,CAC9B,OAAoB,EACpB,SAA0B;IAE1B,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAC/B,KAAK,CAAC,KAAK,aAAa,CAAC,CAAC,CAAC,MAAM,SAAS,CAAC,eAAe,CAAC,CAAC,CAAC,QAAQ,SAAS,CAAC,mBAAmB,CAAC,CAAC,CAAC,EAAE,CACxG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEb,OAAO;;;;;QAKD,OAAO,CAAC,SAAS;YACb,OAAO,CAAC,MAAM;gBACV,OAAO,CAAC,KAAK;cACf,OAAO,CAAC,QAAQ;aACjB,OAAO,CAAC,OAAO;+BACG,OAAO,CAAC,uBAAuB,IAAI,MAAM;uBACjD,OAAO,CAAC,gBAAgB,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM;;yBAEhF,SAAS,CAAC,KAAK;EACtC,WAAW;SACJ,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM;;;;;;;;;;;;;;;oDAerB,CAAC;AACrD,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,UAAU,CAC9B,OAAoB,EACpB,SAA0B,EAC1B,MAAuD;IAEvD,MAAM,EAAE,OAAO,EAAE,aAAa,EAAE,GAAG,EAAE,GAAG,MAAM,CAAC;IAC/C,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC;IAC3D,MAAM,OAAO,GAAG,OAAO,CAAC,GAAG,CAAC,eAAe,IAAI,2BAA2B,CAAC;IAC3E,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC;IAE1C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,OAAO;YACL,KAAK,EAAE,aAAa;YACpB,kBAAkB,EAAE,cAAc;YAClC,eAAe,EAAE,IAAI;YACrB,eAAe,EAAE,IAAI;YACrB,SAAS,EAAE,+DAA+D;YAC1E,UAAU,EAAE,mDAAmD;SAChE,CAAC;IACJ,CAAC;IAED,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,MAAM,KAAK,CAAC,GAAG,OAAO,mBAAmB,EAAE;YACtD,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,cAAc,EAAE,kBAAkB;gBAClC,aAAa,EAAE,UAAU,MAAM,EAAE;aAClC;YACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;gBACnB,KAAK,EAAE,aAAa;gBACpB,QAAQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,4EAA4E,EAAE;oBACzG,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE;iBAClC;gBACD,WAAW,EAAE,GAAG;gBAChB,UAAU,EAAE,IAAI;aACjB,CAAC;YACF,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC;SACrC,CAAC,CAAC;QAEH,IAAI,CAAC,IAAI,CAAC,EAAE,EAAE,CAAC;YACb,MAAM,IAAI,
KAAK,CAAC,gCAAgC,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;QACjE,CAAC;QAED,MAAM,IAAI,GAAG,CAAC,MAAM,IAAI,CAAC,IAAI,EAAE,CAAoD,CAAC;QACpF,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,IAAI,EAAE,CAAC;QAC1D,MAAM,MAAM,GAAG,0BAA0B,CAAC,OAAO,CAAC,CAAC;QACnD,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;YACpB,MAAM,IAAI,KAAK,CAAC,gCAAgC,CAAC,CAAC;QACpD,CAAC;QAED,MAAM,SAAS,GAAG,4BAA4B,CAAC,MAAM,CAAC,CAAC;QACvD,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,GAAG,SAAS,CAAC;QAEtC,OAAO;YACL,KAAK,EAAE,aAAa;YACpB,kBAAkB,EAAE,OAAO;YAC3B,eAAe,EAAE,MAAM;YACvB,eAAe,EAAE,iBAAiB,CAAC,MAAM,CAAC;YAC1C,SAAS,EAAE,SAAS,CAAC,SAAS;YAC9B,UAAU,EAAE,IAAI;SACjB,CAAC;IACJ,CAAC;IAAC,OAAO,GAAY,EAAE,CAAC;QACtB,MAAM,GAAG,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;QAC7D,GAAG,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;QAClC,OAAO;YACL,KAAK,EAAE,aAAa;YACpB,kBAAkB,EAAE,cAAc;YAClC,eAAe,EAAE,IAAI;YACrB,eAAe,EAAE,IAAI;YACrB,SAAS,EAAE,wBAAwB,GAAG,EAAE;YACxC,UAAU,EAAE,4CAA4C;SACzD,CAAC;IACJ,CAAC;AACH,CAAC;AAED,MAAM,UAAU,mBAAmB,CAAC,MAAc;IAChD,OAAO;QACL,KAAK,EAAE,MAAM;QACb,kBAAkB,EAAE,SAAS;QAC7B,eAAe,EAAE,IAAI;QACrB,eAAe,EAAE,IAAI;QACrB,SAAS,EAAE,MAAM;QACjB,UAAU,EAAE,uEAAuE;KACpF,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,mBAAmB,CACjC,SAA0B,EAC1B,YAA4C;IAE5C,IAAI,CAAC,YAAY,IAAI,YAAY,CAAC,kBAAkB,KAAK,SAAS,EAAE,CAAC;QACnE,IAAI,SAAS,CAAC,MAAM,IAAI,SAAS,CAAC,UAAU,IAAI,EAAE;YAAE,OAAO,YAAY,CAAC;QACxE,IAAI,SAAS,CAAC,UAAU,IAAI,CAAC;YAAE,OAAO,YAAY,CAAC;QACnD,OAAO,cAAc,CAAC;IACxB,CAAC;IACD,OAAO,YAAY,CAAC,kBAAkB,CAAC;AACzC,CAAC"}
|
package/package.json
CHANGED
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pd quality scorecard — CLI command (PRI-361)
|
|
3
|
+
*
|
|
4
|
+
* JSON contract: --json mode outputs EXACTLY one JSON object to stdout.
|
|
5
|
+
* All progress/diagnostic output goes to stderr.
|
|
6
|
+
* Errors produce structured JSON: { ok: false, error, nextAction }.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { mkdirSync, writeFileSync } from 'fs';
|
|
10
|
+
import { dirname } from 'path';
|
|
11
|
+
import type {
|
|
12
|
+
EpisodeEvaluation,
|
|
13
|
+
QualityScorecardReport,
|
|
14
|
+
StrongModelAdjudication,
|
|
15
|
+
} from '@principles/core/quality-scorecard';
|
|
16
|
+
import {
|
|
17
|
+
validateCliOptions,
|
|
18
|
+
needsAdjudication,
|
|
19
|
+
generateMarkdownReport,
|
|
20
|
+
generateHtmlReport,
|
|
21
|
+
generateJsonReport,
|
|
22
|
+
} from '@principles/core/quality-scorecard';
|
|
23
|
+
import { extractEpisodes, extractLogStats } from '../services/quality-scorecard/data-extractor.js';
|
|
24
|
+
import { evaluateWithLocalModel, checkLmStudioAvailable } from '../services/quality-scorecard/local-evaluator.js';
|
|
25
|
+
import { adjudicate, skippedAdjudication, determineFinalLabel } from '../services/quality-scorecard/strong-model-gate.js';
|
|
26
|
+
|
|
27
|
+
// ── Logging: stderr only, silent in JSON mode ──────────────────────
|
|
28
|
+
|
|
29
|
+
let jsonMode = false;
|
|
30
|
+
|
|
31
|
+
/**
 * Writes a progress/diagnostic line to stderr.
 * Does nothing while `jsonMode` is on.
 */
function log(msg: string): void {
  if (jsonMode) {
    return;
  }
  process.stderr.write(`${msg}\n`);
}
|
|
36
|
+
|
|
37
|
+
// ── Structured JSON output helpers ─────────────────────────────────
|
|
38
|
+
|
|
39
|
+
/** Serialises `data` as 2-space-indented JSON and writes it, newline-terminated, to stdout. */
function writeJsonOutput(data: unknown): void {
  const serialized = JSON.stringify(data, null, 2);
  process.stdout.write(`${serialized}\n`);
}
|
|
42
|
+
|
|
43
|
+
/**
 * Emits the structured error object `{ ok: false, error, nextAction }` on stdout.
 *
 * @param error - What went wrong.
 * @param nextAction - What the caller should do about it.
 */
function writeJsonError(error: string, nextAction: string): void {
  const payload = { ok: false, error, nextAction };
  writeJsonOutput(payload);
}
|
|
46
|
+
|
|
47
|
+
// ── Summary computation ────────────────────────────────────────────
|
|
48
|
+
|
|
49
|
+
/**
 * Aggregates per-episode evaluations into the report summary counters.
 *
 * `localPassCount` / `localFailCount` are based strictly on the local model's
 * own conclusion (`mvpMet` and `totalScore`), not on `finalLabel`, which may
 * incorporate strong-model adjudication.
 *
 * @param evaluations - One entry per evaluated episode.
 * @returns Counters plus the average local total score (0 when there are no episodes).
 */
function computeSummary(evaluations: EpisodeEvaluation[]) {
  const totalEpisodes = evaluations.length;
  let localPassCount = 0;
  let localFailCount = 0;
  let strongModelReviewedCount = 0;
  let finalPassCount = 0;
  let finalFailCount = 0;
  let needsReviewCount = 0;
  let localOnlyCount = 0;
  let mvpThresholdMetCount = 0;
  let localScoreSum = 0;

  // Single pass over the evaluations; each counter mirrors one predicate.
  evaluations.forEach((entry) => {
    const local = entry.localEvaluation;
    const strong = entry.strongModelAdjudication;

    if (local.mvpMet && local.totalScore >= 12) {
      localPassCount += 1;
    }
    if (local.totalScore <= 6) {
      localFailCount += 1;
    }
    if (local.mvpMet) {
      mvpThresholdMetCount += 1;
    }
    if (strong && strong.adjudicationStatus !== 'skipped') {
      strongModelReviewedCount += 1;
    }

    switch (entry.finalLabel) {
      case 'pass':
        finalPassCount += 1;
        break;
      case 'fail':
        finalFailCount += 1;
        break;
      case 'needs-review':
        needsReviewCount += 1;
        break;
      case 'local-pass':
      case 'local-fail':
        localOnlyCount += 1;
        break;
    }

    localScoreSum += local.totalScore;
  });

  const averageLocalScore = totalEpisodes > 0 ? localScoreSum / totalEpisodes : 0;

  return {
    totalEpisodes, localPassCount, localFailCount, strongModelReviewedCount,
    finalPassCount, finalFailCount, needsReviewCount, localOnlyCount,
    averageLocalScore, mvpThresholdMetCount,
  };
}
|
|
73
|
+
|
|
74
|
+
// ── Main handler ───────────────────────────────────────────────────
|
|
75
|
+
|
|
76
|
+
export async function handleQualityScorecard(opts: Record<string, unknown>): Promise<void> {
|
|
77
|
+
const isJson = Boolean(opts.json);
|
|
78
|
+
jsonMode = isJson;
|
|
79
|
+
|
|
80
|
+
// Resolve workspace paths
|
|
81
|
+
const { resolveWorkspaceDir } = await import('../resolve-workspace.js');
|
|
82
|
+
const { join } = await import('path');
|
|
83
|
+
const { existsSync } = await import('fs');
|
|
84
|
+
const workspace = resolveWorkspaceDir(opts.workspace as string | undefined);
|
|
85
|
+
const dbPath = join(workspace, '.state', 'trajectory.db');
|
|
86
|
+
const logsDir = join(workspace, '.state', 'logs');
|
|
87
|
+
|
|
88
|
+
// 1. Validate CLI options
|
|
89
|
+
const { options, errors } = validateCliOptions({
|
|
90
|
+
dbPath,
|
|
91
|
+
logsDir,
|
|
92
|
+
localModelBaseUrl: opts.localUrl ?? 'http://localhost:12341/v1',
|
|
93
|
+
localModelId: opts.localModel ?? 'qwen3.6-27b-mtp',
|
|
94
|
+
strongModelId: opts.strongModel ?? null,
|
|
95
|
+
limit: opts.limit ?? '0',
|
|
96
|
+
format: isJson ? 'json' : (opts.format ?? 'markdown'),
|
|
97
|
+
output: opts.output,
|
|
98
|
+
minPainScore: opts.minScore ?? '50',
|
|
99
|
+
skipStrongModel: opts.skipStrongModel ?? false,
|
|
100
|
+
});
|
|
101
|
+
|
|
102
|
+
if (errors.length > 0) {
|
|
103
|
+
const msg = errors.map(e => `${e.field}: ${e.message}`).join('; ');
|
|
104
|
+
if (isJson) {
|
|
105
|
+
writeJsonError(msg, 'Fix the invalid options and retry');
|
|
106
|
+
} else {
|
|
107
|
+
process.stderr.write(`❌ Invalid options:\n${errors.map(e => ` - ${e.field}: ${e.message}`).join('\n')}\n`);
|
|
108
|
+
}
|
|
109
|
+
process.exitCode = 1;
|
|
110
|
+
return;
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
// 2. Check files exist
|
|
114
|
+
if (!existsSync(options.dbPath)) {
|
|
115
|
+
const msg = `trajectory.db not found at: ${options.dbPath}`;
|
|
116
|
+
if (isJson) {
|
|
117
|
+
writeJsonError(msg, 'Ensure the workspace has PD data (run PD first to generate trajectory.db)');
|
|
118
|
+
} else {
|
|
119
|
+
process.stderr.write(`❌ ${msg}\n`);
|
|
120
|
+
}
|
|
121
|
+
process.exitCode = 1;
|
|
122
|
+
return;
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
// 3. Ensure output directory exists
|
|
126
|
+
const outputDir = dirname(options.output);
|
|
127
|
+
if (outputDir && !existsSync(outputDir)) {
|
|
128
|
+
try {
|
|
129
|
+
mkdirSync(outputDir, { recursive: true });
|
|
130
|
+
log(`Created output directory: ${outputDir}`);
|
|
131
|
+
} catch (err: unknown) {
|
|
132
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
133
|
+
if (isJson) {
|
|
134
|
+
writeJsonError(`Cannot create output directory: ${msg}`, 'Ensure the output path is writable');
|
|
135
|
+
} else {
|
|
136
|
+
process.stderr.write(`❌ Cannot create output directory: ${msg}\n`);
|
|
137
|
+
}
|
|
138
|
+
process.exitCode = 1;
|
|
139
|
+
return;
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
// 4. Check LM Studio
|
|
144
|
+
log('🔍 PD Quality Scorecard — Starting...');
|
|
145
|
+
log(` DB: ${options.dbPath}`);
|
|
146
|
+
log(` Local Model: ${options.localModelId} @ ${options.localModelBaseUrl}`);
|
|
147
|
+
log(` Strong Model: ${options.strongModelId ?? 'skipped'}`);
|
|
148
|
+
|
|
149
|
+
const lmStatus = await checkLmStudioAvailable(options.localModelBaseUrl);
|
|
150
|
+
if (!lmStatus.available) {
|
|
151
|
+
if (isJson) {
|
|
152
|
+
writeJsonError(`LM Studio not available: ${lmStatus.error}`, 'Start LM Studio or check --local-url');
|
|
153
|
+
} else {
|
|
154
|
+
process.stderr.write(`❌ LM Studio not available at ${options.localModelBaseUrl}: ${lmStatus.error}\n`);
|
|
155
|
+
}
|
|
156
|
+
process.exitCode = 1;
|
|
157
|
+
return;
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
if (!lmStatus.models.includes(options.localModelId)) {
|
|
161
|
+
if (isJson) {
|
|
162
|
+
writeJsonError(`Model "${options.localModelId}" not found. Available: ${lmStatus.models.join(', ')}`, 'Use --local-model with an available model');
|
|
163
|
+
} else {
|
|
164
|
+
process.stderr.write(`❌ Model "${options.localModelId}" not found. Available: ${lmStatus.models.join(', ')}\n`);
|
|
165
|
+
}
|
|
166
|
+
process.exitCode = 1;
|
|
167
|
+
return;
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
// 5. Extract data
|
|
171
|
+
log('\n📊 Extracting dogfood data...');
|
|
172
|
+
const { episodes, stats: extractStats } = await extractEpisodes(options.dbPath, {
|
|
173
|
+
minScore: options.minPainScore,
|
|
174
|
+
limit: options.limit,
|
|
175
|
+
});
|
|
176
|
+
log(` Found ${episodes.length} unique episodes (total pain events: ${extractStats.total})`);
|
|
177
|
+
|
|
178
|
+
const logStats = extractLogStats(options.logsDir);
|
|
179
|
+
log(` Event logs: ${logStats.totalEvents} events (${logStats.painSignalCount} pain signals)`);
|
|
180
|
+
|
|
181
|
+
// 6. Evaluate each episode
|
|
182
|
+
log('\n🤖 Running local model evaluation...');
|
|
183
|
+
const evaluations: EpisodeEvaluation[] = [];
|
|
184
|
+
|
|
185
|
+
for (let i = 0; i < episodes.length; i++) {
|
|
186
|
+
const ep = episodes[i];
|
|
187
|
+
if (!ep) continue;
|
|
188
|
+
log(` [${i + 1}/${episodes.length}] ${ep.episodeId} (score=${ep.score})...`);
|
|
189
|
+
|
|
190
|
+
const localEval = await evaluateWithLocalModel(ep, {
|
|
191
|
+
baseUrl: options.localModelBaseUrl,
|
|
192
|
+
model: options.localModelId,
|
|
193
|
+
}, (msg: string) => log(` ${msg}`));
|
|
194
|
+
log(` Local: ${localEval.totalScore}/14 MVP=${localEval.mvpMet} flags=[${localEval.flags.join(',')}]`);
|
|
195
|
+
|
|
196
|
+
// 7. Strong model adjudication
|
|
197
|
+
let adjudication: StrongModelAdjudication;
|
|
198
|
+
if (options.skipStrongModel || !options.strongModelId) {
|
|
199
|
+
adjudication = skippedAdjudication(
|
|
200
|
+
options.skipStrongModel
|
|
201
|
+
? 'Strong model skipped by --skip-strong-model flag'
|
|
202
|
+
: 'No strong model configured'
|
|
203
|
+
);
|
|
204
|
+
} else {
|
|
205
|
+
const decision = needsAdjudication(ep, localEval);
|
|
206
|
+
if (decision.shouldAdjudicate) {
|
|
207
|
+
log(` Adjudicating (${decision.priority}: ${decision.reason})...`);
|
|
208
|
+
adjudication = await adjudicate(ep, localEval, { modelId: options.strongModelId, log: (msg: string) => log(` ${msg}`) });
|
|
209
|
+
log(` Adjudication: ${adjudication.adjudicationStatus}`);
|
|
210
|
+
} else {
|
|
211
|
+
adjudication = skippedAdjudication(decision.reason);
|
|
212
|
+
log(` Adjudication skipped: ${decision.reason}`);
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
const finalLabel = determineFinalLabel(localEval, adjudication);
|
|
217
|
+
evaluations.push({ episode: ep, localEvaluation: localEval, strongModelAdjudication: adjudication, finalLabel });
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
// 8. Build and write report
|
|
221
|
+
log('\n📝 Generating report...');
|
|
222
|
+
const summary = computeSummary(evaluations);
|
|
223
|
+
const report: QualityScorecardReport = {
|
|
224
|
+
generatedAt: new Date().toISOString(),
|
|
225
|
+
dataSource: {
|
|
226
|
+
painEventCount: extractStats.total,
|
|
227
|
+
evolutionTaskCount: 0,
|
|
228
|
+
principleEventCount: 0,
|
|
229
|
+
gateBlockCount: 0,
|
|
230
|
+
dateRange: extractStats.dateRange,
|
|
231
|
+
},
|
|
232
|
+
localEvaluatorConfig: {
|
|
233
|
+
model: options.localModelId,
|
|
234
|
+
baseUrl: options.localModelBaseUrl.replace(/\/v\d+$/, '/...'),
|
|
235
|
+
apiKeyStatus: 'not-required',
|
|
236
|
+
},
|
|
237
|
+
strongModelConfig: {
|
|
238
|
+
model: options.strongModelId,
|
|
239
|
+
status: options.skipStrongModel || !options.strongModelId ? 'skipped' : 'configured',
|
|
240
|
+
},
|
|
241
|
+
evaluations,
|
|
242
|
+
summary,
|
|
243
|
+
knownLimitations: [
|
|
244
|
+
'Local model scores are advisory only — not final quality conclusions.',
|
|
245
|
+
'Without strong-model adjudication, samples are marked local-pass/local-fail/needs-review.',
|
|
246
|
+
'Deduplication is based on reason text similarity — may miss distinct episodes.',
|
|
247
|
+
'Local model output is non-deterministic despite temperature=0.1.',
|
|
248
|
+
],
|
|
249
|
+
};
|
|
250
|
+
|
|
251
|
+
let content: string;
|
|
252
|
+
switch (options.format) {
|
|
253
|
+
case 'html': content = generateHtmlReport(report); break;
|
|
254
|
+
case 'json': content = generateJsonReport(report); break;
|
|
255
|
+
case 'markdown':
|
|
256
|
+
default: content = generateMarkdownReport(report); break;
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
writeFileSync(options.output, content, 'utf-8');
|
|
260
|
+
|
|
261
|
+
log(`\n✅ Report written to: ${options.output}`);
|
|
262
|
+
log(` Format: ${options.format}`);
|
|
263
|
+
log(` Episodes: ${summary.totalEpisodes}`);
|
|
264
|
+
log(` Local Pass: ${summary.localPassCount} | Local Fail: ${summary.localFailCount}`);
|
|
265
|
+
log(` Strong Model Reviewed: ${summary.strongModelReviewedCount}`);
|
|
266
|
+
log(` Final Pass: ${summary.finalPassCount} | Final Fail: ${summary.finalFailCount} | Needs Review: ${summary.needsReviewCount}`);
|
|
267
|
+
|
|
268
|
+
// JSON mode: output exactly one JSON object to stdout
|
|
269
|
+
if (isJson) {
|
|
270
|
+
writeJsonOutput({ ok: true, report });
|
|
271
|
+
}
|
|
272
|
+
}
|
package/src/index.ts
CHANGED
|
@@ -927,4 +927,28 @@ consoleCmd.action(async (opts) => {
|
|
|
927
927
|
});
|
|
928
928
|
});
|
|
929
929
|
|
|
930
|
+
// ─── Quality Scorecard (PRI-361) ──────────────────────────────────
|
|
931
|
+
|
|
932
|
+
// Parent `quality` command group, registered on the shared `program` object.
// It declares no action of its own in this hunk; subcommands are attached to
// the returned `qualityCmd` handle.
const qualityCmd = program
|
|
933
|
+
.command('quality')
|
|
934
|
+
.description('Quality scoring and evaluation');
|
|
935
|
+
|
|
936
|
+
// `quality scorecard` subcommand: declares the CLI options below and forwards
// the parsed options object, unchanged, to `handleQualityScorecard`.
// NOTE(review): the defaults for --min-score and --limit are string literals
// ('50', '0'); any numeric conversion is not visible here — confirm it happens
// in the handler.
// NOTE(review): --workspace, --strong-model and --output declare no default
// here; presumably the handler supplies fallbacks — verify.
qualityCmd
|
|
937
|
+
.command('scorecard')
|
|
938
|
+
.description('Generate quality scorecard report for PD pain→diagnosis→principle chain')
|
|
939
|
+
.option('-w, --workspace <path>', 'Workspace directory')
|
|
940
|
+
.option('--local-model <id>', 'LM Studio model ID', 'qwen3.6-27b-mtp')
|
|
941
|
+
.option('--local-url <url>', 'LM Studio base URL', 'http://localhost:12341/v1')
|
|
942
|
+
.option('--strong-model <id>', 'Strong model for adjudication (provider/model)')
|
|
943
|
+
.option('--skip-strong-model', 'Skip strong model adjudication', false)
|
|
944
|
+
.option('--min-score <n>', 'Minimum pain score to evaluate', '50')
|
|
945
|
+
.option('--limit <n>', 'Max episodes to evaluate (0=all)', '0')
|
|
946
|
+
.option('--format <fmt>', 'Output format: json, markdown, html', 'markdown')
|
|
947
|
+
.option('--output <path>', 'Output file path')
|
|
948
|
+
.option('--json', 'Output as JSON', false)
|
|
949
|
+
.action(async (opts) => {
|
|
950
|
+
// Dynamic import: the handler module is loaded only when this action runs.
const { handleQualityScorecard } = await import('./commands/quality-scorecard.js');
|
|
951
|
+
// Awaited so a rejection from the handler propagates out of this action callback.
await handleQualityScorecard(opts);
|
|
952
|
+
});
|
|
953
|
+
|
|
930
954
|
// NOTE(review): called after the command registrations above; presumably this
// parses the process arguments and dispatches to the matching action —
// confirm against the CLI library that provides `program`.
program.parse();
|