npm - @principles/pd-cli - Versions diffs - 1.95.0 → 1.97.0 - Mend

@principles/pd-cli 1.95.0 → 1.97.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

package/dist/commands/diagnose.d.ts +1 -0
package/dist/commands/diagnose.d.ts.map +1 -1
package/dist/commands/diagnose.js +44 -0
package/dist/commands/diagnose.js.map +1 -1
package/dist/commands/pain-record.d.ts.map +1 -1
package/dist/commands/pain-record.js +4 -1
package/dist/commands/pain-record.js.map +1 -1
package/dist/commands/quality-scorecard.d.ts +9 -0
package/dist/commands/quality-scorecard.d.ts.map +1 -0
package/dist/commands/quality-scorecard.js +241 -0
package/dist/commands/quality-scorecard.js.map +1 -0
package/dist/index.js +22 -0
package/dist/index.js.map +1 -1
package/dist/services/quality-scorecard/data-extractor.d.ts +28 -0
package/dist/services/quality-scorecard/data-extractor.d.ts.map +1 -0
package/dist/services/quality-scorecard/data-extractor.js +118 -0
package/dist/services/quality-scorecard/data-extractor.js.map +1 -0
package/dist/services/quality-scorecard/local-evaluator.d.ts +18 -0
package/dist/services/quality-scorecard/local-evaluator.d.ts.map +1 -0
package/dist/services/quality-scorecard/local-evaluator.js +112 -0
package/dist/services/quality-scorecard/local-evaluator.js.map +1 -0
package/dist/services/quality-scorecard/strong-model-gate.d.ts +14 -0
package/dist/services/quality-scorecard/strong-model-gate.d.ts.map +1 -0
package/dist/services/quality-scorecard/strong-model-gate.js +128 -0
package/dist/services/quality-scorecard/strong-model-gate.js.map +1 -0
package/package.json +1 -1
package/src/commands/diagnose.ts +45 -0
package/src/commands/pain-record.ts +5 -2
package/src/commands/quality-scorecard.ts +272 -0
package/src/index.ts +25 -0
package/src/services/quality-scorecard/data-extractor.ts +150 -0
package/src/services/quality-scorecard/local-evaluator.ts +142 -0
package/src/services/quality-scorecard/strong-model-gate.ts +160 -0
package/tests/commands/diagnose.test.ts +69 -1
package/tests/commands/pain-record-async.test.ts +4 -0
package/tests/commands/product-path-regression.test.ts +81 -0

package/src/services/quality-scorecard/strong-model-gate.ts ADDED Viewed

@@ -0,0 +1,160 @@
+/**
+ * PRI-361 — Strong Model Adjudication Gate (I/O layer in pd-cli)
+ *
+ * Calls cloud model for adjudication. Uses core validation
+ * to parse responses — no unsafe casts.
+ */
+import type {
+  PainEpisode,
+  LocalEvaluation,
+  StrongModelAdjudication,
+  AdjudicationStatus,
+} from '@principles/core/quality-scorecard';
+import {
+  RUBRIC_LABELS,
+  RUBRIC_DIMENSIONS as DIMS,
+  meetsMvpThreshold,
+  validateAdjudicationResponse,
+  extractJsonFromLlmResponse,
+} from '@principles/core/quality-scorecard';
+/**
+ * Renders the user prompt that asks the strong model to re-score one episode.
+ *
+ * The prompt lists the episode's fields, one line per rubric dimension with the
+ * local model's score and rationale, the local flags, and the strict JSON shape
+ * the reply must follow.
+ *
+ * @param episode - Pain episode whose fields are interpolated into the prompt.
+ * @param localEval - Local evaluation supplying per-dimension scores, rationales, flags and model name.
+ * @returns The complete prompt text.
+ */
+function buildAdjudicationPrompt(
+  episode: PainEpisode,
+  localEval: LocalEvaluation
+): string {
+  // One bullet per rubric dimension, in the order DIMS defines.
+  const scoreLines: string[] = [];
+  for (const dim of DIMS) {
+    scoreLines.push(
+      `- ${dim} (${RUBRIC_LABELS[dim]}): ${localEval.dimensionScores[dim]}/2 — ${localEval.dimensionRationales[dim]}`
+    );
+  }
+  const localScores = scoreLines.join('\n');
+  return `You are a senior quality adjudicator for an AI agent evolution pipeline.
+Your job is to independently re-evaluate a pain episode that was first scored by a local (smaller) model.
+You must provide your own scores — do NOT simply copy the local model's scores.
+## Pain Episode
+- ID: ${episode.episodeId}
+- Source: ${episode.source}
+- Pain Score: ${episode.score}
+- Severity: ${episode.severity}
+- Summary: ${episode.summary}
+- Evolution Task Resolution: ${episode.evolutionTaskResolution ?? 'none'}
+- Linked Principles: ${episode.linkedPrinciples.length > 0 ? episode.linkedPrinciples.join(', ') : 'none'}
+## Local Model Scores (${localEval.model})
+${localScores}
+Flags: ${localEval.flags.length > 0 ? localEval.flags.join(', ') : 'none'}
+## Your Task
+1. Independently score each dimension (0/1/2) based on the evidence.
+2. Check for: language inconsistency, over-abstraction, fabricated evidence.
+3. If your scores differ from the local model by >=2 points on any dimension, explain why.
+4. Give a final verdict: pass, fail, or needs-review.
+## Output Format (STRICT JSON)
+{
+  "scores": { "G1": 0-2, "G2": 0-2, "G3": 0-2, "G4": 0-2, "G5": 0-2, "G6": 0-2, "G7": 0-2 },
+  "rationale": "Overall assessment...",
+  "verdict": "pass" | "fail" | "needs-review"
+}
+Do NOT output anything other than this JSON object.`;
+}
+/**
+ * Asks the strong model to independently re-score one pain episode.
+ *
+ * POSTs a chat-completions request to `OPENAI_BASE_URL` (default
+ * `https://api.openai.com/v1`) authenticated with `OPENAI_API_KEY`, then runs
+ * the reply through `extractJsonFromLlmResponse` and
+ * `validateAdjudicationResponse`.
+ *
+ * Failure handling: a missing API key, and any error thrown while sending the
+ * request or handling the reply (non-2xx status, abort after 120 s, non-JSON
+ * content, anything thrown by validation), resolves to a `needs-review`
+ * result with null scores instead of rejecting. Only the prompt construction
+ * runs outside that guard.
+ *
+ * @param episode - Pain episode under adjudication.
+ * @param localEval - Local model's evaluation, included in the prompt.
+ * @param config - `modelId` is sent as the request `model` and echoed in the
+ *   result; `log` receives one message when the request path fails.
+ * @returns The strong model's verdict and scores, or a `needs-review` placeholder.
+ */
+export async function adjudicate(
+  episode: PainEpisode,
+  localEval: LocalEvaluation,
+  config: { modelId: string; log: (msg: string) => void }
+): Promise<StrongModelAdjudication> {
+  const { modelId: strongModelId, log } = config;
+  const prompt = buildAdjudicationPrompt(episode, localEval);
+  // `||` rather than `??` so an empty OPENAI_BASE_URL also falls back to the
+  // default. Trailing slashes are trimmed so the join below cannot produce
+  // `//chat/completions`.
+  const baseUrl = (process.env.OPENAI_BASE_URL || 'https://api.openai.com/v1').replace(/\/+$/, '');
+  const apiKey = process.env.OPENAI_API_KEY;
+  if (!apiKey) {
+    return {
+      model: strongModelId,
+      adjudicationStatus: 'needs-review',
+      confirmedScores: null,
+      confirmedMvpMet: null,
+      rationale: 'OPENAI_API_KEY not set — cannot run strong-model adjudication',
+      nextAction: 'Set OPENAI_API_KEY and re-run with --strong-model',
+    };
+  }
+  try {
+    const resp = await fetch(`${baseUrl}/chat/completions`, {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        Authorization: `Bearer ${apiKey}`,
+      },
+      body: JSON.stringify({
+        model: strongModelId,
+        messages: [
+          { role: 'system', content: 'You are a precise JSON-output quality adjudicator. Output only valid JSON.' },
+          { role: 'user', content: prompt },
+        ],
+        temperature: 0.1,
+        max_tokens: 2000,
+      }),
+      signal: AbortSignal.timeout(120_000),
+    });
+    if (!resp.ok) {
+      throw new Error(`Strong model request failed: ${resp.status}`);
+    }
+    // The envelope is untrusted: every level is optional and `content` stays
+    // unknown until it is checked to be a string.
+    const data = (await resp.json()) as {
+      choices?: { message?: { content?: unknown } }[];
+    };
+    const rawContent = data.choices?.[0]?.message?.content;
+    // Missing or non-string content is treated as an empty reply.
+    const content = typeof rawContent === 'string' ? rawContent : '';
+    const parsed = extractJsonFromLlmResponse(content);
+    if (parsed === null) {
+      throw new Error('Strong model returned non-JSON');
+    }
+    const validated = validateAdjudicationResponse(parsed);
+    const { scores, verdict } = validated;
+    return {
+      model: strongModelId,
+      adjudicationStatus: verdict,
+      confirmedScores: scores,
+      confirmedMvpMet: meetsMvpThreshold(scores),
+      rationale: validated.rationale,
+      nextAction: null,
+    };
+  } catch (err: unknown) {
+    const msg = err instanceof Error ? err.message : String(err);
+    log(`Adjudication error: ${msg}`);
+    return {
+      model: strongModelId,
+      adjudicationStatus: 'needs-review',
+      confirmedScores: null,
+      confirmedMvpMet: null,
+      rationale: `Adjudication failed: ${msg}`,
+      nextAction: 'Retry with strong model or manually review',
+    };
+  }
+}
+/**
+ * Builds the placeholder adjudication used when the strong model was not run.
+ *
+ * @param reason - Why adjudication was skipped; copied verbatim into `rationale`.
+ * @returns An adjudication with status `skipped`, model `none` and no confirmed scores.
+ */
+export function skippedAdjudication(reason: string): StrongModelAdjudication {
+  const placeholder: StrongModelAdjudication = {
+    model: 'none',
+    adjudicationStatus: 'skipped',
+    confirmedScores: null,
+    confirmedMvpMet: null,
+    rationale: reason,
+    nextAction: 'Configure and run strong-model adjudication for final quality verdict',
+  };
+  return placeholder;
+}
+/**
+ * Picks the final quality label for an episode.
+ *
+ * A strong-model adjudication that actually ran (present and not `skipped`)
+ * wins outright. Otherwise the label is derived from the local evaluation:
+ * `local-pass` when the MVP threshold is met and the total is at least 12,
+ * `local-fail` when the total is 6 or less, and `needs-review` in between.
+ *
+ * @param localEval - Local evaluation providing `mvpMet` and `totalScore`.
+ * @param adjudication - Strong-model result, or null when none exists.
+ * @returns The adjudication status to report.
+ */
+export function determineFinalLabel(
+  localEval: LocalEvaluation,
+  adjudication: StrongModelAdjudication | null
+): AdjudicationStatus {
+  if (adjudication && adjudication.adjudicationStatus !== 'skipped') {
+    return adjudication.adjudicationStatus;
+  }
+  const { mvpMet, totalScore } = localEval;
+  if (mvpMet && totalScore >= 12) {
+    return 'local-pass';
+  }
+  return totalScore <= 6 ? 'local-fail' : 'needs-review';
+}

package/tests/commands/diagnose.test.ts CHANGED Viewed

@@ -122,7 +122,7 @@ vi.mock('../../src/services/pd-config-loader.js', () => ({
   computeFlagsFromLoadResult: vi.fn().mockReturnValue({}),
 }));
-import { handleDiagnoseRun, type DiagnoseRunOptions } from '../../src/commands/diagnose.js';
+import { handleDiagnoseRun, handleDiagnoseStatus, type DiagnoseRunOptions } from '../../src/commands/diagnose.js';
 const SUCCEEDED_RESULT = {
   status: 'succeeded' as const,
@@ -811,3 +811,71 @@ describe('Commander wiring for --no-intake', () => {
     ).rejects.toThrow();
   });
 });
+// Exercises the `stalledThreshold` option of handleDiagnoseStatus. Each test
+// replaces process.exit and a console method with spies and restores them in
+// `finally`, so a failing assertion cannot leave them mocked for later tests.
+describe('pd status stalled-threshold validation', () => {
+  it('accepts valid positive integers', async () => {
+    const exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => undefined) as () => never);
+    const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
+    try {
+      const runtimeV2 = await import('@principles/core/runtime-v2');
+      vi.mocked(runtimeV2.status).mockResolvedValueOnce({
+        taskId: 'test-task-1',
+        status: 'pending',
+        attemptCount: 0,
+        maxAttempts: 3,
+        lastError: null,
+        commitId: null,
+        artifactId: null,
+        candidateCount: null,
+      });
+      await handleDiagnoseStatus({
+        taskId: 'test-task-1',
+        stalledThreshold: '123',
+      });
+      expect(exitSpy).not.toHaveBeenCalled();
+    } finally {
+      exitSpy.mockRestore();
+      consoleSpy.mockRestore();
+    }
+  });
+  it('rejects invalid inputs (0, negative, decimals, NaN, empty)', async () => {
+    const invalidInputs = ['0', '-10', '1.5', 'abc', 'NaN', ''];
+    for (const input of invalidInputs) {
+      // Fresh spies per input so call history from one value cannot satisfy the next.
+      const exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => undefined) as () => never);
+      const consoleErrorSpy = vi.spyOn(console, 'error').mockImplementation(() => {});
+      try {
+        await handleDiagnoseStatus({
+          taskId: 'test-task-1',
+          stalledThreshold: input,
+        });
+        expect(exitSpy).toHaveBeenCalledWith(1);
+        expect(consoleErrorSpy).toHaveBeenCalledWith(expect.stringContaining('positive integer'));
+      } finally {
+        exitSpy.mockRestore();
+        consoleErrorSpy.mockRestore();
+      }
+    }
+  });
+  it('rejects invalid inputs in JSON mode', async () => {
+    const exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => undefined) as () => never);
+    const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
+    try {
+      await handleDiagnoseStatus({
+        taskId: 'test-task-1',
+        stalledThreshold: '0',
+        json: true,
+      });
+      expect(exitSpy).toHaveBeenCalledWith(1);
+      expect(consoleSpy).toHaveBeenCalled();
+      // The first console.log call is parsed as the JSON error payload.
+      const output = JSON.parse((consoleSpy.mock.calls[0] as string[])[0]);
+      expect(output.ok).toBe(false);
+      expect(output.reason).toBe('invalid_stalled_threshold');
+      expect(output.nextAction).toContain('positive integer');
+    } finally {
+      exitSpy.mockRestore();
+      consoleSpy.mockRestore();
+    }
+  });
+});

package/tests/commands/pain-record-async.test.ts CHANGED Viewed

@@ -137,6 +137,10 @@ describe('pd pain record async mode (PRI-369)', () => {
     expect(jsonOutput.ledgerEntryIds).toEqual([]);
     expect(jsonOutput.latencyMs).toBe(120);
     expect(jsonOutput.message).toContain('pd task show');
+    expect(jsonOutput.reason).toContain('pd task show');
+    expect(jsonOutput.nextAction).toContain('pd diagnose run');
+    expect(jsonOutput.nextAction).toContain('--runtime pi-ai');
+    expect(jsonOutput.nextAction).toContain('--json');
     // submitted should NOT cause exit(1)
     expect(exitSpy).not.toHaveBeenCalledWith(1);

package/tests/commands/product-path-regression.test.ts ADDED Viewed

@@ -0,0 +1,81 @@
+import { describe, it, expect } from 'vitest';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { execSync } from 'child_process';
+import { fileURLToPath } from 'url';
+// ESM provides no __dirname/__filename globals; derive them from import.meta.url so the CLI build can be resolved relative to this file
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+// End-to-end check: runs the built CLI (dist/index.js) in a child process
+// against a throwaway workspace and inspects what it prints on stdout.
+describe('Real CLI JSON product-path regression test (PRI-376)', () => {
+  it('outputs exactly one parseable JSON object on async pain record', () => {
+    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pd-regression-'));
+    let stdoutStr: string;
+    // Setup runs inside the try so the temp workspace is removed even when
+    // writing the config or launching the CLI throws.
+    try {
+      const pdDir = path.join(tmpDir, '.pd');
+      fs.mkdirSync(pdDir, { recursive: true });
+      // Write a config to enable async mode
+      const configContent = `
+version: 1
+features:
+  diagnostician_async_cli:
+    category: quiet
+    enabled: true
+runtimeProfiles:
+  openclaw.default:
+    type: openclaw
+    source: default
+internalAgents:
+  defaultRuntime: openclaw.default
+  agents:
+    diagnostician:
+      enabled: true
+      runtimeProfile: openclaw.default
+ui:
+  diagnostics:
+    mode: simple
+`;
+      fs.writeFileSync(path.join(pdDir, 'config.yaml'), configContent.trim(), 'utf8');
+      // Resolve CLI binary path relative to this file to be workspace-independent
+      const cliBin = path.resolve(__dirname, '../../dist/index.js');
+      const cmd = `node "${cliBin}" pain record --reason "Regression test frustration" --json --workspace "${tmpDir}"`;
+      // execSync blocks the event loop, so the 20000 ms test timeout below
+      // cannot interrupt a hung CLI; the child-process timeout bounds it.
+      stdoutStr = execSync(cmd, {
+        encoding: 'utf8',
+        stdio: ['pipe', 'pipe', 'inherit'],
+        timeout: 15000,
+      });
+    } finally {
+      fs.rmSync(tmpDir, { recursive: true, force: true });
+    }
+    // Ensure output is non-empty
+    expect(stdoutStr.trim()).not.toBe('');
+    // Ensure it's exactly one parseable JSON object
+    let parsed: Record<string, unknown>;
+    try {
+      parsed = JSON.parse(stdoutStr.trim()) as Record<string, unknown>;
+    } catch (err) {
+      // Surface the raw stdout so a stray log line is visible in the failure.
+      throw new Error(`Stdout was not a single parseable JSON object:\n${stdoutStr}`, {
+        cause: err,
+      });
+    }
+    // Ensure required fields
+    expect(parsed).toBeDefined();
+    expect(parsed.status).toBe('submitted');
+    expect(parsed.taskId).toMatch(/^diagnosis_/);
+    expect(parsed.message).toBeDefined();
+    expect(parsed.reason).toBeTypeOf('string');
+    expect(parsed.nextAction).toBeTypeOf('string');
+    // Verify nextAction structure: pd diagnose run --task-id ... --runtime pi-ai --json
+    const nextAction = parsed.nextAction as string;
+    expect(nextAction).toContain('pd diagnose run');
+    expect(nextAction).toContain(`--task-id ${parsed.taskId}`);
+    expect(nextAction).toContain('--runtime pi-ai');
+    expect(nextAction).toContain('--json');
+  }, 20000);
+});
+});