@sienklogic/plan-build-run 2.22.1 → 2.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +34 -0
- package/dashboard/package.json +2 -1
- package/dashboard/src/middleware/errorHandler.js +12 -2
- package/dashboard/src/repositories/planning.repository.js +23 -1
- package/dashboard/src/routes/pages.routes.js +65 -2
- package/dashboard/src/services/local-llm-metrics.service.js +81 -0
- package/dashboard/src/services/quick.service.js +62 -0
- package/dashboard/src/views/partials/analytics-content.ejs +61 -0
- package/dashboard/src/views/partials/quick-content.ejs +40 -0
- package/dashboard/src/views/partials/quick-detail-content.ejs +29 -0
- package/dashboard/src/views/partials/sidebar.ejs +8 -0
- package/dashboard/src/views/quick-detail.ejs +5 -0
- package/dashboard/src/views/quick.ejs +5 -0
- package/package.json +1 -1
- package/plugins/copilot-pbr/agents/debugger.agent.md +15 -0
- package/plugins/copilot-pbr/agents/researcher.agent.md +20 -0
- package/plugins/copilot-pbr/agents/synthesizer.agent.md +12 -0
- package/plugins/copilot-pbr/plugin.json +1 -1
- package/plugins/copilot-pbr/references/config-reference.md +89 -0
- package/plugins/copilot-pbr/skills/continue/SKILL.md +1 -1
- package/plugins/copilot-pbr/skills/health/SKILL.md +8 -1
- package/plugins/copilot-pbr/skills/help/SKILL.md +17 -4
- package/plugins/copilot-pbr/skills/milestone/SKILL.md +13 -13
- package/plugins/copilot-pbr/skills/quick/SKILL.md +4 -1
- package/plugins/copilot-pbr/skills/setup/SKILL.md +17 -6
- package/plugins/copilot-pbr/skills/status/SKILL.md +37 -1
- package/plugins/cursor-pbr/.cursor-plugin/plugin.json +1 -1
- package/plugins/cursor-pbr/agents/debugger.md +15 -0
- package/plugins/cursor-pbr/agents/researcher.md +20 -0
- package/plugins/cursor-pbr/agents/synthesizer.md +12 -0
- package/plugins/cursor-pbr/references/config-reference.md +89 -0
- package/plugins/cursor-pbr/skills/continue/SKILL.md +1 -1
- package/plugins/cursor-pbr/skills/health/SKILL.md +8 -1
- package/plugins/cursor-pbr/skills/help/SKILL.md +17 -4
- package/plugins/cursor-pbr/skills/milestone/SKILL.md +13 -13
- package/plugins/cursor-pbr/skills/quick/SKILL.md +4 -1
- package/plugins/cursor-pbr/skills/setup/SKILL.md +17 -6
- package/plugins/cursor-pbr/skills/status/SKILL.md +37 -1
- package/plugins/pbr/.claude-plugin/plugin.json +1 -1
- package/plugins/pbr/agents/debugger.md +15 -0
- package/plugins/pbr/agents/researcher.md +20 -0
- package/plugins/pbr/agents/synthesizer.md +12 -0
- package/plugins/pbr/references/config-reference.md +89 -0
- package/plugins/pbr/scripts/check-config-change.js +33 -0
- package/plugins/pbr/scripts/check-plan-format.js +52 -4
- package/plugins/pbr/scripts/check-subagent-output.js +43 -3
- package/plugins/pbr/scripts/config-schema.json +48 -0
- package/plugins/pbr/scripts/local-llm/client.js +214 -0
- package/plugins/pbr/scripts/local-llm/health.js +217 -0
- package/plugins/pbr/scripts/local-llm/metrics.js +252 -0
- package/plugins/pbr/scripts/local-llm/operations/classify-artifact.js +76 -0
- package/plugins/pbr/scripts/local-llm/operations/classify-error.js +75 -0
- package/plugins/pbr/scripts/local-llm/operations/score-source.js +72 -0
- package/plugins/pbr/scripts/local-llm/operations/summarize-context.js +62 -0
- package/plugins/pbr/scripts/local-llm/operations/validate-task.js +59 -0
- package/plugins/pbr/scripts/local-llm/router.js +101 -0
- package/plugins/pbr/scripts/local-llm/shadow.js +60 -0
- package/plugins/pbr/scripts/local-llm/threshold-tuner.js +118 -0
- package/plugins/pbr/scripts/pbr-tools.js +120 -3
- package/plugins/pbr/scripts/post-write-dispatch.js +2 -2
- package/plugins/pbr/scripts/progress-tracker.js +29 -3
- package/plugins/pbr/scripts/session-cleanup.js +36 -1
- package/plugins/pbr/scripts/validate-task.js +30 -1
- package/plugins/pbr/skills/continue/SKILL.md +1 -1
- package/plugins/pbr/skills/health/SKILL.md +8 -1
- package/plugins/pbr/skills/help/SKILL.md +17 -4
- package/plugins/pbr/skills/milestone/SKILL.md +13 -13
- package/plugins/pbr/skills/quick/SKILL.md +4 -1
- package/plugins/pbr/skills/setup/SKILL.md +17 -6
- package/plugins/pbr/skills/status/SKILL.md +38 -2
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { complete, tryParseJSON, isDisabled } = require('../client');
|
|
4
|
+
const { logMetric } = require('../metrics');
|
|
5
|
+
const { route } = require('../router');
|
|
6
|
+
|
|
7
|
+
/**
 * Classifies a PLAN.md or SUMMARY.md artifact using the local LLM.
 *
 * Returns null — meaning "no local classification, caller keeps its normal
 * behavior" — whenever the feature is off, the config block is incomplete,
 * the file type is unknown, the local call fails, or the model's answer is
 * not one of the allowed labels. Never throws.
 *
 * @param {object} config - resolved local_llm config block
 * @param {string} planningDir - path to the .planning directory
 * @param {string} content - file content to classify
 * @param {string} fileType - 'PLAN' or 'SUMMARY'
 * @param {string} [sessionId] - optional session identifier for metrics
 * @returns {Promise<{ classification: string, confidence: number, reason: string, latency_ms: number, fallback_used: boolean }|null>}
 */
async function classifyArtifact(config, planningDir, content, fileType, sessionId) {
  // Guard config.features / config.advanced like score-source.js does:
  // a partially populated config must disable the feature, not throw.
  if (!config.enabled) return null;
  if (!config.features || !config.features.artifact_classification) return null;
  const advanced = config.advanced || {};
  if (isDisabled('artifact-classification', advanced.disable_after_failures)) return null;

  // Rough 4-chars-per-token budget to keep the prompt inside the local model's window.
  const maxChars = (advanced.max_input_tokens || 1024) * 4;
  const truncatedContent = content.length > maxChars ? content.slice(0, maxChars) : content;

  let prompt;
  if (fileType === 'PLAN') {
    prompt =
      'Classify this PLAN.md as stub, partial, or complete. A stub has placeholder tasks or missing required XML elements. A partial has some tasks filled but action/verify/done are vague. A complete has all tasks with specific steps, executable verify commands, and observable done conditions. Respond with JSON: {"classification": "stub"|"partial"|"complete", "confidence": 0.0-1.0, "reason": "one sentence"}\n\nContent:\n' +
      truncatedContent;
  } else if (fileType === 'SUMMARY') {
    prompt =
      'Classify this SUMMARY.md as substantive or thin. Substantive means it has specific artifact paths, commit hashes, and observable outcomes. Thin means vague or placeholder content. Respond with JSON: {"classification": "substantive"|"thin", "confidence": 0.0-1.0, "reason": "one sentence"}\n\nContent:\n' +
      truncatedContent;
  } else {
    // Unknown artifact type — nothing sensible to classify.
    return null;
  }

  try {
    const result = await route(config, prompt, 'artifact-classification', (logprobs) =>
      complete(config, prompt, 'artifact-classification', { logprobs })
    );
    if (result === null) return null;
    const parsed = tryParseJSON(result.content);
    if (!parsed.ok) return null;

    // Reject labels outside the closed set for this file type.
    const validPlanClassifications = ['stub', 'partial', 'complete'];
    const validSummaryClassifications = ['substantive', 'thin'];
    const validValues = fileType === 'PLAN' ? validPlanClassifications : validSummaryClassifications;
    if (!parsed.data.classification || !validValues.includes(parsed.data.classification)) return null;

    // Preserve an explicit numeric confidence (including 0); default only when absent.
    const confidence =
      typeof parsed.data.confidence === 'number' ? parsed.data.confidence : 0.9;

    const metricEntry = {
      session_id: sessionId || 'unknown',
      timestamp: new Date().toISOString(),
      operation: 'artifact-classification',
      model: config.model,
      latency_ms: result.latency_ms,
      tokens_used_local: result.tokens,
      tokens_saved_frontier: 420,
      result: parsed.data.classification,
      fallback_used: false,
      confidence
    };
    logMetric(planningDir, metricEntry);

    return {
      classification: parsed.data.classification,
      confidence,
      reason: parsed.data.reason || '',
      latency_ms: result.latency_ms,
      fallback_used: false
    };
  } catch (_) {
    // Local LLM is best-effort: any failure means "no local answer".
    return null;
  }
}
|
|
75
|
+
|
|
76
|
+
module.exports = { classifyArtifact };
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { complete, tryParseJSON, isDisabled } = require('../client');
|
|
4
|
+
const { logMetric } = require('../metrics');
|
|
5
|
+
const { route } = require('../router');
|
|
6
|
+
|
|
7
|
+
/** Closed set of categories an agent error may be mapped to. */
const ERROR_CATEGORIES = [
  'connection_refused',
  'timeout',
  'missing_output',
  'wrong_output_format',
  'permission_error',
  'unknown'
];

/**
 * Classifies an agent error into one of 6 categories using the local LLM.
 *
 * @param {object} config - resolved local_llm config block
 * @param {string} planningDir - path to the .planning directory
 * @param {string} errorText - the error message or stack trace
 * @param {string} [agentType] - the agent type that produced the error
 * @param {string} [sessionId] - optional session identifier for metrics
 * @returns {Promise<{ category: string, confidence: number, latency_ms: number, fallback_used: boolean }|null>}
 */
async function classifyError(config, planningDir, errorText, agentType, sessionId) {
  if (!config.enabled) return null;
  if (isDisabled('error-classification', config.advanced.disable_after_failures)) return null;

  // Keep the prompt small: only the leading 500 characters of the error matter.
  const clippedError = errorText.length > 500 ? errorText.slice(0, 500) : errorText;

  const prompt = `Classify this agent error into one category. Categories: connection_refused (network/ECONNREFUSED), timeout (operation timed out), missing_output (expected file/artifact not found), wrong_output_format (output exists but malformed), permission_error (filesystem/permission issue), unknown (none of the above). Respond with JSON: {"category": "<one of the 6>", "confidence": 0.0-1.0}\n\nAgent: ${agentType || 'unknown'}\nError: ${clippedError}`;

  try {
    const response = await route(config, prompt, 'error-classification', (logprobs) =>
      complete(config, prompt, 'error-classification', { logprobs })
    );
    if (response === null) return null;

    const decoded = tryParseJSON(response.content);
    if (!decoded.ok) return null;

    // Anything outside the known set collapses to 'unknown'.
    let category = 'unknown';
    if (ERROR_CATEGORIES.includes(decoded.data.category)) {
      category = decoded.data.category;
    }

    const confidence = decoded.data.confidence || 0.9;

    logMetric(planningDir, {
      session_id: sessionId || 'unknown',
      timestamp: new Date().toISOString(),
      operation: 'error-classification',
      model: config.model,
      latency_ms: response.latency_ms,
      tokens_used_local: response.tokens,
      tokens_saved_frontier: 120,
      result: category,
      fallback_used: false,
      confidence
    });

    return {
      category,
      confidence,
      latency_ms: response.latency_ms,
      fallback_used: false
    };
  } catch (_) {
    // Best-effort classification: any failure means "no local answer".
    return null;
  }
}
|
|
74
|
+
|
|
75
|
+
module.exports = { classifyError, ERROR_CATEGORIES };
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { complete, tryParseJSON, isDisabled } = require('../client');
|
|
4
|
+
const { logMetric } = require('../metrics');
|
|
5
|
+
const { route } = require('../router');
|
|
6
|
+
|
|
7
|
+
/** Ordered credibility levels, most credible (S0) to least (S6). */
const SOURCE_LEVELS = ['S0', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6'];

/**
 * Scores a research source on the S0-S6 credibility scale using the local LLM.
 *
 * S0=local prior research, S1=live MCP docs, S2=official docs, S3=official GitHub,
 * S4=verified WebSearch (2+ sources), S5=unverified WebSearch, S6=training knowledge.
 *
 * @param {object} config - resolved local_llm config block
 * @param {string} planningDir - path to the .planning directory
 * @param {string} sourceText - text content from the source
 * @param {string} sourceUrl - URL or identifier for the source
 * @param {string} [sessionId] - optional session identifier for metrics
 * @returns {Promise<{ level: string, confidence: number, reason: string, latency_ms: number, fallback_used: boolean }|null>}
 */
async function scoreSource(config, planningDir, sourceText, sourceUrl, sessionId) {
  if (!config.enabled) return null;
  if (!config.features || !config.features.source_scoring) return null;
  if (isDisabled('source-scoring', config.advanced.disable_after_failures)) return null;

  // Rough 4-chars-per-token budget for the excerpt sent to the local model.
  const charBudget = (config.advanced.max_input_tokens || 1024) * 4;
  const excerpt = sourceText.length > charBudget ? sourceText.slice(0, charBudget) : sourceText;

  const prompt = `Score this research source on the S0-S6 credibility scale. S0=local prior research, S1=live MCP docs, S2=official docs, S3=official GitHub, S4=verified WebSearch (2+ sources), S5=unverified WebSearch, S6=training knowledge. Respond with JSON: {"level": "S0"-"S6", "confidence": 0.0-1.0, "reason": "one sentence"}\n\nURL: ${sourceUrl}\nContent excerpt:\n${excerpt}`;

  try {
    const response = await route(config, prompt, 'source-scoring', (logprobs) =>
      complete(config, prompt, 'source-scoring', { logprobs })
    );
    if (response === null) return null;

    const decoded = tryParseJSON(response.content);
    if (!decoded.ok) return null;

    // Unknown levels collapse to S6 (least credible).
    let level = 'S6';
    if (SOURCE_LEVELS.includes(decoded.data.level)) {
      level = decoded.data.level;
    }

    const confidence = decoded.data.confidence || 0.9;

    logMetric(planningDir, {
      session_id: sessionId || 'unknown',
      timestamp: new Date().toISOString(),
      operation: 'source-scoring',
      model: config.model,
      latency_ms: response.latency_ms,
      tokens_used_local: response.tokens,
      tokens_saved_frontier: 80,
      result: level,
      fallback_used: false,
      confidence
    });

    return {
      level,
      confidence,
      reason: decoded.data.reason || '',
      latency_ms: response.latency_ms,
      fallback_used: false
    };
  } catch (_) {
    // Best-effort scoring: any failure means "no local answer".
    return null;
  }
}
|
|
71
|
+
|
|
72
|
+
module.exports = { scoreSource, SOURCE_LEVELS };
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { complete, isDisabled } = require('../client');
|
|
4
|
+
const { logMetric } = require('../metrics');
|
|
5
|
+
const { route } = require('../router');
|
|
6
|
+
|
|
7
|
+
/**
 * Produces a concise plain-text summary of project context using the local LLM.
 *
 * Returns null — meaning "no local summary, caller keeps its normal behavior" —
 * whenever the feature is off, the config block is incomplete, or the local
 * call fails. Never throws.
 *
 * @param {object} config - resolved local_llm config block
 * @param {string} planningDir - path to the .planning directory
 * @param {string} contextText - project context text to summarize
 * @param {number} [maxWords] - target word count for the summary (default 150)
 * @param {string} [sessionId] - optional session identifier for metrics
 * @returns {Promise<{ summary: string, latency_ms: number, fallback_used: boolean }|null>}
 */
async function summarizeContext(config, planningDir, contextText, maxWords, sessionId) {
  // Guard config.features / config.advanced like score-source.js does:
  // a partially populated config must disable the feature, not throw.
  if (!config.enabled) return null;
  if (!config.features || !config.features.context_summarization) return null;
  const advanced = config.advanced || {};
  if (isDisabled('context-summarization', advanced.disable_after_failures)) return null;

  // Rough 4-chars-per-token budget to keep the prompt inside the local model's window.
  const maxChars = (advanced.max_input_tokens || 1024) * 4;
  const truncated = contextText.length > maxChars ? contextText.slice(0, maxChars) : contextText;
  const targetWords = maxWords || 150;

  const prompt =
    'Summarize the following project context in under ' +
    targetWords +
    ' words. Focus on: current phase goal, key decisions made, what is built so far, and what is still needed. Output plain text only — no JSON, no headings, no bullet points.\n\nContext:\n' +
    truncated;

  try {
    const result = await route(config, prompt, 'context-summarization', (logprobs) =>
      complete(config, prompt, 'context-summarization', { logprobs })
    );
    if (result === null) return null;

    const metricEntry = {
      session_id: sessionId || 'unknown',
      timestamp: new Date().toISOString(),
      operation: 'context-summarization',
      model: config.model,
      latency_ms: result.latency_ms,
      tokens_used_local: result.tokens,
      tokens_saved_frontier: 350,
      result: 'summary',
      fallback_used: false,
      confidence: 0.9
    };
    logMetric(planningDir, metricEntry);

    return {
      summary: result.content.trim(),
      latency_ms: result.latency_ms,
      fallback_used: false
    };
  } catch (_) {
    // Local LLM is best-effort: any failure means "no local answer".
    return null;
  }
}
|
|
61
|
+
|
|
62
|
+
module.exports = { summarizeContext };
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { complete, tryParseJSON, isDisabled } = require('../client');
|
|
4
|
+
const { logMetric } = require('../metrics');
|
|
5
|
+
const { route } = require('../router');
|
|
6
|
+
|
|
7
|
+
/**
 * Validates a Task() call for coherence using the local LLM.
 *
 * Returns null — meaning "no local verdict, caller keeps its normal behavior" —
 * whenever the feature is off, the config block is incomplete, or the local
 * call fails or produces malformed output. Never throws.
 *
 * @param {object} config - resolved local_llm config block
 * @param {string} planningDir - path to the .planning directory
 * @param {{ description: string, subagent_type: string }} taskInput - the Task() call to validate
 * @param {string} [sessionId] - optional session identifier for metrics
 * @returns {Promise<{ coherent: boolean, confidence: number, issue: string|null, latency_ms: number, fallback_used: boolean }|null>}
 */
async function validateTask(config, planningDir, taskInput, sessionId) {
  // Guard config.features / config.advanced like score-source.js does:
  // a partially populated config must disable the feature, not throw.
  if (!config.enabled) return null;
  if (!config.features || !config.features.task_validation) return null;
  const advanced = config.advanced || {};
  if (isDisabled('task-validation', advanced.disable_after_failures)) return null;

  const prompt =
    'Assess this Task() call for coherence. Check: (1) description is meaningful and not empty, (2) if subagent_type starts with \'pbr:\' the agent name is valid, (3) description matches the intended operation. Known pbr agents: researcher, planner, plan-checker, executor, verifier, integration-checker, debugger, codebase-mapper, synthesizer, general. Respond with JSON: {"coherent": true|false, "confidence": 0.0-1.0, "issue": "null or one sentence describing the problem"}\n\nTask input: ' +
    JSON.stringify({ description: taskInput.description, subagent_type: taskInput.subagent_type });

  try {
    const result = await route(config, prompt, 'task-validation', (logprobs) =>
      complete(config, prompt, 'task-validation', { logprobs })
    );
    if (result === null) return null;
    const parsed = tryParseJSON(result.content);
    if (!parsed.ok) return null;
    // The verdict must be a real boolean, not a truthy string.
    if (typeof parsed.data.coherent !== 'boolean') return null;

    // Preserve an explicit numeric confidence (including 0); default only when absent.
    const confidence =
      typeof parsed.data.confidence === 'number' ? parsed.data.confidence : 0.9;

    const metricEntry = {
      session_id: sessionId || 'unknown',
      timestamp: new Date().toISOString(),
      operation: 'task-validation',
      model: config.model,
      latency_ms: result.latency_ms,
      tokens_used_local: result.tokens,
      tokens_saved_frontier: 180,
      result: parsed.data.coherent ? 'coherent' : 'incoherent',
      fallback_used: false,
      confidence
    };
    logMetric(planningDir, metricEntry);

    return {
      coherent: parsed.data.coherent,
      confidence,
      issue: parsed.data.issue || null,
      latency_ms: result.latency_ms,
      fallback_used: false
    };
  } catch (_) {
    // Local LLM is best-effort: any failure means "no local answer".
    return null;
  }
}
|
|
58
|
+
|
|
59
|
+
module.exports = { validateTask };
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { runShadow } = require('./shadow');
|
|
4
|
+
|
|
5
|
+
/** Complexity score above which the local_first strategy defers to the frontier model. */
const COMPLEXITY_HIGH_THRESHOLD = 0.65;

/**
 * Scores the complexity of a prompt using a weighted surface heuristic.
 * Returns a value in [0, 1] where higher means more complex.
 *
 * Signals: word count, fenced code blocks, constraint words, reasoning
 * words, and whether structured output (JSON/schema/YAML) is requested.
 * Each signal is capped at 1; the weights sum to 1.0.
 *
 * @param {string} prompt
 * @returns {number}
 */
function scoreComplexity(prompt) {
  const words = prompt.split(/\s+/).length;
  const codeBlocks = (prompt.match(/```/g) || []).length / 2;
  const constraints = (prompt.match(/\b(must|should|exactly|only|never|always)\b/gi) || []).length;
  const reasoning = (prompt.match(/\b(why|explain|compare|analyze|reason|evaluate)\b/gi) || []).length;
  const structuredOut = /\b(json|schema|yaml|frontmatter)\b/i.test(prompt) ? 1 : 0;
  return Math.min(words / 500, 1.0) * 0.25 +
    Math.min(codeBlocks / 3, 1.0) * 0.20 +
    Math.min(constraints / 5, 1.0) * 0.20 +
    Math.min(reasoning / 3, 1.0) * 0.20 +
    structuredOut * 0.15;
}

/**
 * Extracts a confidence score from logprobs data returned by the local LLM.
 * Returns a value in [0, 1] or null if no logprobs data is available.
 *
 * exp(mean logprob) is the geometric-mean token probability, clamped to [0, 1].
 *
 * @param {Array<{token: string, logprob: number}>|null|undefined} logprobsData
 * @returns {number|null}
 */
function extractConfidence(logprobsData) {
  if (!logprobsData || logprobsData.length === 0) return null;
  const sum = logprobsData.reduce((acc, t) => acc + t.logprob, 0);
  const avgLogprob = sum / logprobsData.length;
  return Math.min(1, Math.max(0, Math.exp(avgLogprob)));
}

/**
 * Fires the shadow comparison when a local result was produced and the caller
 * supplied both a planningDir and a frontier function. Fire-and-forget.
 *
 * @param {object} config
 * @param {string|undefined} planningDir
 * @param {string} operationType
 * @param {Function|undefined} frontierResultFn
 * @param {{content: string}|null} result - the committed local result
 */
function triggerShadow(config, planningDir, operationType, frontierResultFn, result) {
  if (result !== null && planningDir && frontierResultFn) {
    runShadow(config, planningDir, operationType, frontierResultFn, result.content);
  }
}

/**
 * Routes a prompt through local LLM or signals caller to use frontier model.
 * Returns the local LLM result if local is suitable, or null if caller should
 * fall back to frontier. Never throws — all errors return null.
 *
 * @param {object} config - local_llm config block with routing_strategy and advanced settings
 * @param {string} prompt - the prompt being routed
 * @param {string} operationType - operation identifier
 * @param {function(boolean): Promise<{content: string, logprobsData: Array|null}>} callLocalFn
 *   Async function accepting a logprobs boolean, returns the local LLM result object.
 * @param {string} [planningDir] - path to .planning directory; when provided enables shadow mode
 * @param {Function} [frontierResultFn] - async function that calls the frontier model;
 *   NOTE: parameter inversion vs shadow.js — here LOCAL has already run (it's the primary result)
 *   and FRONTIER is the shadow. We pass frontierResultFn as shadow.js arg 4 (localResultFn slot)
 *   so shadow.js calls it, and result.content as arg 5 (frontierResult slot, the committed result).
 * @returns {Promise<{content: string, logprobsData: Array|null}|null>}
 */
async function route(config, prompt, operationType, callLocalFn, planningDir, frontierResultFn) {
  try {
    const routingStrategy = (config && config.routing_strategy) || 'local_first';
    const advanced = (config && config.advanced) || {};
    // A configured threshold of 0 is valid ("accept any confidence"), so default
    // only when the value is absent or non-numeric — `|| 0.9` would clobber 0.
    const confidenceThreshold =
      typeof advanced.confidence_threshold === 'number' ? advanced.confidence_threshold : 0.9;

    if (routingStrategy === 'quality_first') {
      // Only very simple prompts go local; no confidence gate on the result.
      if (scoreComplexity(prompt) >= 0.3) return null;
      const result = await callLocalFn(false);
      triggerShadow(config, planningDir, operationType, frontierResultFn, result);
      return result;
    }

    if (routingStrategy === 'balanced') {
      // Mid complexity cutoff plus a fixed 0.75 confidence gate.
      if (scoreComplexity(prompt) > 0.45) return null;
      const result = await callLocalFn(true);
      const confidence = extractConfidence(result && result.logprobsData);
      if (confidence === null || confidence < 0.75) return null;
      triggerShadow(config, planningDir, operationType, frontierResultFn, result);
      return result;
    }

    // Default: local_first — highest complexity cutoff, configurable confidence gate.
    if (scoreComplexity(prompt) > COMPLEXITY_HIGH_THRESHOLD) return null;
    const result = await callLocalFn(true);
    const confidence = extractConfidence(result && result.logprobsData);
    if (confidence === null || confidence < confidenceThreshold) return null;
    triggerShadow(config, planningDir, operationType, frontierResultFn, result);
    return result;
  } catch (_) {
    // Routing is best-effort: any failure means "use the frontier model".
    return null;
  }
}
|
|
99
|
+
|
|
100
|
+
module.exports = { route, scoreComplexity, extractConfidence };
|
|
101
|
+
module.exports.COMPLEXITY_HIGH_THRESHOLD = COMPLEXITY_HIGH_THRESHOLD;
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { logAgreement } = require('./metrics');
|
|
4
|
+
|
|
5
|
+
/**
 * Fire-and-forget shadow comparison.
 *
 * When shadow_mode is on (and the local LLM is enabled), kicks off
 * localResultFn() in the background, compares its output with frontierResult,
 * and records agreement via logAgreement(). The caller's result is never
 * altered: frontierResult is always returned synchronously.
 *
 * @param {object} config - resolved config from resolveConfig()
 * @param {string} planningDir - path to the .planning directory
 * @param {string} operationType - e.g. 'artifact_classification'
 * @param {Function} localResultFn - async function that returns the local LLM result
 * @param {*} frontierResult - the result already returned to the caller (never changed)
 * @param {string} [sessionId] - current session identifier
 * @returns {*} frontierResult — unchanged
 */
function runShadow(config, planningDir, operationType, localResultFn, frontierResult, sessionId) {
  // Shadow only runs when explicitly enabled AND the local LLM itself is on.
  const shadowActive = Boolean(config.advanced && config.advanced.shadow_mode && config.enabled);
  if (!shadowActive) return frontierResult;

  // Background comparison; every error path is swallowed so the shadow can
  // never throw into the caller or change the committed result.
  (async () => {
    let localText = null;
    try {
      const raw = await localResultFn();
      localText = typeof raw === 'string' ? raw : JSON.stringify(raw);
    } catch (_) {
      // Local call failed — recorded below as a disagreement.
    }

    const frontierText =
      typeof frontierResult === 'string' ? frontierResult : JSON.stringify(frontierResult);
    const trimmedLocal = localText != null ? localText.trim() : null;

    logAgreement(planningDir, {
      timestamp: new Date().toISOString(),
      operation: operationType,
      session_id: sessionId || 'unknown',
      agrees: trimmedLocal !== null && trimmedLocal === frontierText.trim(),
      local_result: trimmedLocal,
      frontier_result: frontierText
    });
  })().catch(() => {
    // Swallow all errors — shadow must never throw.
  });

  return frontierResult;
}
|
|
59
|
+
|
|
60
|
+
module.exports = { runShadow };
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const fs = require('fs');
|
|
4
|
+
const path = require('path');
|
|
5
|
+
|
|
6
|
+
// --- Constants ---

/** Minimum shadow log entries per operation before suggesting an adjustment */
const MIN_SAMPLES = 20;

/** Step size for each threshold adjustment */
const ADJUST_STEP = 0.05;

/** Clamp floor for suggested threshold */
const THRESHOLD_MIN = 0.5;

/** Clamp ceiling for suggested threshold */
const THRESHOLD_MAX = 0.99;

/**
 * Failure rate above which local LLM is considered too unreliable.
 * Suggests raising the confidence_threshold so fewer calls are routed locally.
 */
const HIGH_FAILURE_RATE = 0.20;

/**
 * Failure rate below which local LLM is considered very reliable.
 * Suggests lowering the confidence_threshold so more calls are routed locally.
 */
const LOW_FAILURE_RATE = 0.05;

/**
 * Reads the shadow agreement log and returns advisory threshold adjustments
 * per operation type.
 *
 * Only emits a suggestion for an operation when it has >= MIN_SAMPLES entries.
 * All suggestions are ±ADJUST_STEP clamped to [THRESHOLD_MIN, THRESHOLD_MAX].
 * Never writes to config — purely advisory. Never throws.
 *
 * @param {string} planningDir - Absolute path to the .planning directory
 * @param {number} currentThreshold - Current confidence_threshold from config
 * @returns {Array<{operation: string, current: number, suggested: number, sample_count: number, agreement_rate: number}>}
 */
function computeThresholdAdjustments(planningDir, currentThreshold) {
  try {
    const logFile = path.join(planningDir, 'logs', 'local-llm-shadow.jsonl');
    if (!fs.existsSync(logFile)) return [];

    // Tally agreement per operation, skipping blank and malformed lines.
    const tallies = new Map();
    for (const line of fs.readFileSync(logFile, 'utf8').split('\n')) {
      if (line.trim().length === 0) continue;
      let record;
      try {
        record = JSON.parse(line);
      } catch (_e) {
        continue; // malformed line — ignore
      }
      if (!record || typeof record !== 'object' || !record.operation) continue;

      const tally = tallies.get(record.operation) || { count: 0, agrees: 0 };
      tally.count += 1;
      if (record.agrees === true) tally.agrees += 1;
      tallies.set(record.operation, tally);
    }

    // Emit advisory suggestions only where the sample is large enough.
    const suggestions = [];
    for (const [operation, tally] of tallies) {
      if (tally.count < MIN_SAMPLES) continue;

      const agreementRate = tally.agrees / tally.count;
      const failureRate = 1 - agreementRate;

      // Raise the threshold when local is unreliable (fewer local calls),
      // lower it when local is very reliable (more local calls).
      let suggested = currentThreshold;
      if (failureRate > HIGH_FAILURE_RATE) {
        suggested = Math.min(THRESHOLD_MAX, currentThreshold + ADJUST_STEP);
      } else if (failureRate < LOW_FAILURE_RATE) {
        suggested = Math.max(THRESHOLD_MIN, currentThreshold - ADJUST_STEP);
      }

      suggestions.push({
        operation,
        current: currentThreshold,
        suggested,
        sample_count: tally.count,
        agreement_rate: agreementRate
      });
    }

    return suggestions;
  } catch (_e) {
    // Advisory data only — never break callers over a bad log file.
    return [];
  }
}
|
|
117
|
+
|
|
118
|
+
module.exports = { computeThresholdAdjustments };
|