@sienklogic/plan-build-run 2.22.1 → 2.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. package/CHANGELOG.md +34 -0
  2. package/dashboard/package.json +2 -1
  3. package/dashboard/src/middleware/errorHandler.js +12 -2
  4. package/dashboard/src/repositories/planning.repository.js +23 -1
  5. package/dashboard/src/routes/pages.routes.js +65 -2
  6. package/dashboard/src/services/local-llm-metrics.service.js +81 -0
  7. package/dashboard/src/services/quick.service.js +62 -0
  8. package/dashboard/src/views/partials/analytics-content.ejs +61 -0
  9. package/dashboard/src/views/partials/quick-content.ejs +40 -0
  10. package/dashboard/src/views/partials/quick-detail-content.ejs +29 -0
  11. package/dashboard/src/views/partials/sidebar.ejs +8 -0
  12. package/dashboard/src/views/quick-detail.ejs +5 -0
  13. package/dashboard/src/views/quick.ejs +5 -0
  14. package/package.json +1 -1
  15. package/plugins/copilot-pbr/agents/debugger.agent.md +15 -0
  16. package/plugins/copilot-pbr/agents/researcher.agent.md +20 -0
  17. package/plugins/copilot-pbr/agents/synthesizer.agent.md +12 -0
  18. package/plugins/copilot-pbr/plugin.json +1 -1
  19. package/plugins/copilot-pbr/references/config-reference.md +89 -0
  20. package/plugins/copilot-pbr/skills/continue/SKILL.md +1 -1
  21. package/plugins/copilot-pbr/skills/health/SKILL.md +8 -1
  22. package/plugins/copilot-pbr/skills/help/SKILL.md +17 -4
  23. package/plugins/copilot-pbr/skills/milestone/SKILL.md +13 -13
  24. package/plugins/copilot-pbr/skills/quick/SKILL.md +4 -1
  25. package/plugins/copilot-pbr/skills/setup/SKILL.md +17 -6
  26. package/plugins/copilot-pbr/skills/status/SKILL.md +37 -1
  27. package/plugins/cursor-pbr/.cursor-plugin/plugin.json +1 -1
  28. package/plugins/cursor-pbr/agents/debugger.md +15 -0
  29. package/plugins/cursor-pbr/agents/researcher.md +20 -0
  30. package/plugins/cursor-pbr/agents/synthesizer.md +12 -0
  31. package/plugins/cursor-pbr/references/config-reference.md +89 -0
  32. package/plugins/cursor-pbr/skills/continue/SKILL.md +1 -1
  33. package/plugins/cursor-pbr/skills/health/SKILL.md +8 -1
  34. package/plugins/cursor-pbr/skills/help/SKILL.md +17 -4
  35. package/plugins/cursor-pbr/skills/milestone/SKILL.md +13 -13
  36. package/plugins/cursor-pbr/skills/quick/SKILL.md +4 -1
  37. package/plugins/cursor-pbr/skills/setup/SKILL.md +17 -6
  38. package/plugins/cursor-pbr/skills/status/SKILL.md +37 -1
  39. package/plugins/pbr/.claude-plugin/plugin.json +1 -1
  40. package/plugins/pbr/agents/debugger.md +15 -0
  41. package/plugins/pbr/agents/researcher.md +20 -0
  42. package/plugins/pbr/agents/synthesizer.md +12 -0
  43. package/plugins/pbr/references/config-reference.md +89 -0
  44. package/plugins/pbr/scripts/check-config-change.js +33 -0
  45. package/plugins/pbr/scripts/check-plan-format.js +52 -4
  46. package/plugins/pbr/scripts/check-subagent-output.js +43 -3
  47. package/plugins/pbr/scripts/config-schema.json +48 -0
  48. package/plugins/pbr/scripts/local-llm/client.js +214 -0
  49. package/plugins/pbr/scripts/local-llm/health.js +217 -0
  50. package/plugins/pbr/scripts/local-llm/metrics.js +252 -0
  51. package/plugins/pbr/scripts/local-llm/operations/classify-artifact.js +76 -0
  52. package/plugins/pbr/scripts/local-llm/operations/classify-error.js +75 -0
  53. package/plugins/pbr/scripts/local-llm/operations/score-source.js +72 -0
  54. package/plugins/pbr/scripts/local-llm/operations/summarize-context.js +62 -0
  55. package/plugins/pbr/scripts/local-llm/operations/validate-task.js +59 -0
  56. package/plugins/pbr/scripts/local-llm/router.js +101 -0
  57. package/plugins/pbr/scripts/local-llm/shadow.js +60 -0
  58. package/plugins/pbr/scripts/local-llm/threshold-tuner.js +118 -0
  59. package/plugins/pbr/scripts/pbr-tools.js +120 -3
  60. package/plugins/pbr/scripts/post-write-dispatch.js +2 -2
  61. package/plugins/pbr/scripts/progress-tracker.js +29 -3
  62. package/plugins/pbr/scripts/session-cleanup.js +36 -1
  63. package/plugins/pbr/scripts/validate-task.js +30 -1
  64. package/plugins/pbr/skills/continue/SKILL.md +1 -1
  65. package/plugins/pbr/skills/health/SKILL.md +8 -1
  66. package/plugins/pbr/skills/help/SKILL.md +17 -4
  67. package/plugins/pbr/skills/milestone/SKILL.md +13 -13
  68. package/plugins/pbr/skills/quick/SKILL.md +4 -1
  69. package/plugins/pbr/skills/setup/SKILL.md +17 -6
  70. package/plugins/pbr/skills/status/SKILL.md +38 -2
@@ -0,0 +1,76 @@
1
+ 'use strict';
2
+
3
+ const { complete, tryParseJSON, isDisabled } = require('../client');
4
+ const { logMetric } = require('../metrics');
5
+ const { route } = require('../router');
6
+
7
/**
 * Classifies a PLAN.md or SUMMARY.md artifact using the local LLM.
 *
 * Best-effort: returns null (never throws) on any problem — feature
 * disabled, invalid input, transport failure, or a malformed model
 * response — so callers fall back to the frontier model.
 *
 * @param {object} config - resolved local_llm config block
 * @param {string} planningDir - path to the .planning directory
 * @param {string} content - file content to classify
 * @param {string} fileType - 'PLAN' or 'SUMMARY'
 * @param {string} [sessionId] - optional session identifier for metrics
 * @returns {Promise<{ classification: string, confidence: number, reason: string, latency_ms: number, fallback_used: boolean }|null>}
 */
async function classifyArtifact(config, planningDir, content, fileType, sessionId) {
  // Guard config.features existence (consistent with scoreSource) so a
  // partially-resolved config cannot throw outside the try block below.
  if (!config.enabled || !config.features || !config.features.artifact_classification) return null;
  // Non-string content would throw on .length/.slice below; treat as "skip".
  if (typeof content !== 'string') return null;
  // Reject unknown file types up front (previously detected mid-function).
  if (fileType !== 'PLAN' && fileType !== 'SUMMARY') return null;
  if (isDisabled('artifact-classification', config.advanced.disable_after_failures)) return null;

  // ~4 chars per token heuristic caps the prompt at the configured budget.
  const maxChars = (config.advanced.max_input_tokens || 1024) * 4;
  const truncatedContent = content.length > maxChars ? content.slice(0, maxChars) : content;

  let prompt;
  if (fileType === 'PLAN') {
    prompt =
      'Classify this PLAN.md as stub, partial, or complete. A stub has placeholder tasks or missing required XML elements. A partial has some tasks filled but action/verify/done are vague. A complete has all tasks with specific steps, executable verify commands, and observable done conditions. Respond with JSON: {"classification": "stub"|"partial"|"complete", "confidence": 0.0-1.0, "reason": "one sentence"}\n\nContent:\n' +
      truncatedContent;
  } else {
    prompt =
      'Classify this SUMMARY.md as substantive or thin. Substantive means it has specific artifact paths, commit hashes, and observable outcomes. Thin means vague or placeholder content. Respond with JSON: {"classification": "substantive"|"thin", "confidence": 0.0-1.0, "reason": "one sentence"}\n\nContent:\n' +
      truncatedContent;
  }

  try {
    const result = await route(config, prompt, 'artifact-classification', (logprobs) =>
      complete(config, prompt, 'artifact-classification', { logprobs })
    );
    if (result === null) return null;
    const parsed = tryParseJSON(result.content);
    if (!parsed.ok) return null;

    const validValues =
      fileType === 'PLAN' ? ['stub', 'partial', 'complete'] : ['substantive', 'thin'];
    if (!parsed.data.classification || !validValues.includes(parsed.data.classification)) return null;

    // Only substitute the 0.9 default when confidence is absent or non-numeric;
    // `|| 0.9` would silently discard an explicit confidence of 0.
    const confidence =
      typeof parsed.data.confidence === 'number' ? parsed.data.confidence : 0.9;

    logMetric(planningDir, {
      session_id: sessionId || 'unknown',
      timestamp: new Date().toISOString(),
      operation: 'artifact-classification',
      model: config.model,
      latency_ms: result.latency_ms,
      tokens_used_local: result.tokens,
      tokens_saved_frontier: 420, // estimated frontier tokens avoided per call
      result: parsed.data.classification,
      fallback_used: false,
      confidence
    });

    return {
      classification: parsed.data.classification,
      confidence,
      reason: parsed.data.reason || '',
      latency_ms: result.latency_ms,
      fallback_used: false
    };
  } catch (_) {
    // Local LLM assistance is strictly best-effort.
    return null;
  }
}
75
+
76
+ module.exports = { classifyArtifact };
@@ -0,0 +1,75 @@
1
+ 'use strict';
2
+
3
+ const { complete, tryParseJSON, isDisabled } = require('../client');
4
+ const { logMetric } = require('../metrics');
5
+ const { route } = require('../router');
6
+
7
const ERROR_CATEGORIES = [
  'connection_refused',
  'timeout',
  'missing_output',
  'wrong_output_format',
  'permission_error',
  'unknown'
];

/**
 * Classifies an agent error into one of 6 categories using the local LLM.
 *
 * Best-effort: returns null (never throws) on any problem so callers fall
 * back to the frontier model. An unrecognized model category maps to 'unknown'.
 *
 * @param {object} config - resolved local_llm config block
 * @param {string} planningDir - path to the .planning directory
 * @param {string} errorText - the error message or stack trace
 * @param {string} [agentType] - the agent type that produced the error
 * @param {string} [sessionId] - optional session identifier for metrics
 * @returns {Promise<{ category: string, confidence: number, latency_ms: number, fallback_used: boolean }|null>}
 */
async function classifyError(config, planningDir, errorText, agentType, sessionId) {
  if (!config.enabled) return null;
  // Non-string errorText would throw on .length/.slice below; treat as "skip".
  if (typeof errorText !== 'string') return null;
  if (isDisabled('error-classification', config.advanced.disable_after_failures)) return null;

  // The head of the message is almost always enough to categorize.
  const truncatedError = errorText.length > 500 ? errorText.slice(0, 500) : errorText;

  const prompt =
    'Classify this agent error into one category. Categories: connection_refused (network/ECONNREFUSED), timeout (operation timed out), missing_output (expected file/artifact not found), wrong_output_format (output exists but malformed), permission_error (filesystem/permission issue), unknown (none of the above). Respond with JSON: {"category": "<one of the 6>", "confidence": 0.0-1.0}\n\nAgent: ' +
    (agentType || 'unknown') +
    '\nError: ' +
    truncatedError;

  try {
    const result = await route(config, prompt, 'error-classification', (logprobs) =>
      complete(config, prompt, 'error-classification', { logprobs })
    );
    if (result === null) return null;
    const parsed = tryParseJSON(result.content);
    if (!parsed.ok) return null;

    // Coerce anything outside the known taxonomy to 'unknown'.
    const category = ERROR_CATEGORIES.includes(parsed.data.category)
      ? parsed.data.category
      : 'unknown';

    // Only substitute the 0.9 default when confidence is absent or non-numeric;
    // `|| 0.9` would silently discard an explicit confidence of 0.
    const confidence =
      typeof parsed.data.confidence === 'number' ? parsed.data.confidence : 0.9;

    logMetric(planningDir, {
      session_id: sessionId || 'unknown',
      timestamp: new Date().toISOString(),
      operation: 'error-classification',
      model: config.model,
      latency_ms: result.latency_ms,
      tokens_used_local: result.tokens,
      tokens_saved_frontier: 120, // estimated frontier tokens avoided per call
      result: category,
      fallback_used: false,
      confidence
    });

    return {
      category,
      confidence,
      latency_ms: result.latency_ms,
      fallback_used: false
    };
  } catch (_) {
    // Local LLM assistance is strictly best-effort.
    return null;
  }
}
74
+
75
+ module.exports = { classifyError, ERROR_CATEGORIES };
@@ -0,0 +1,72 @@
1
+ 'use strict';
2
+
3
+ const { complete, tryParseJSON, isDisabled } = require('../client');
4
+ const { logMetric } = require('../metrics');
5
+ const { route } = require('../router');
6
+
7
const SOURCE_LEVELS = ['S0', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6'];

/**
 * Scores a research source on the S0-S6 credibility scale using the local LLM.
 *
 * S0=local prior research, S1=live MCP docs, S2=official docs, S3=official GitHub,
 * S4=verified WebSearch (2+ sources), S5=unverified WebSearch, S6=training knowledge.
 *
 * Best-effort: returns null (never throws) on any problem so callers fall
 * back to the frontier model. An unrecognized level maps to 'S6' (least trusted).
 *
 * @param {object} config - resolved local_llm config block
 * @param {string} planningDir - path to the .planning directory
 * @param {string} sourceText - text content from the source
 * @param {string} sourceUrl - URL or identifier for the source
 * @param {string} [sessionId] - optional session identifier for metrics
 * @returns {Promise<{ level: string, confidence: number, reason: string, latency_ms: number, fallback_used: boolean }|null>}
 */
async function scoreSource(config, planningDir, sourceText, sourceUrl, sessionId) {
  if (!config.enabled) return null;
  if (!config.features || !config.features.source_scoring) return null;
  // Non-string sourceText would throw on .length/.slice below; treat as "skip".
  if (typeof sourceText !== 'string') return null;
  if (isDisabled('source-scoring', config.advanced.disable_after_failures)) return null;

  // ~4 chars per token heuristic caps the prompt at the configured budget.
  const maxChars = (config.advanced.max_input_tokens || 1024) * 4;
  const truncated = sourceText.length > maxChars ? sourceText.slice(0, maxChars) : sourceText;

  const prompt =
    'Score this research source on the S0-S6 credibility scale. S0=local prior research, S1=live MCP docs, S2=official docs, S3=official GitHub, S4=verified WebSearch (2+ sources), S5=unverified WebSearch, S6=training knowledge. Respond with JSON: {"level": "S0"-"S6", "confidence": 0.0-1.0, "reason": "one sentence"}\n\nURL: ' +
    sourceUrl +
    '\nContent excerpt:\n' +
    truncated;

  try {
    const result = await route(config, prompt, 'source-scoring', (logprobs) =>
      complete(config, prompt, 'source-scoring', { logprobs })
    );
    if (result === null) return null;
    const parsed = tryParseJSON(result.content);
    if (!parsed.ok) return null;

    // Unknown levels degrade to S6, the least-trusted tier.
    const level = SOURCE_LEVELS.includes(parsed.data.level) ? parsed.data.level : 'S6';

    // Only substitute the 0.9 default when confidence is absent or non-numeric;
    // `|| 0.9` would silently discard an explicit confidence of 0.
    const confidence =
      typeof parsed.data.confidence === 'number' ? parsed.data.confidence : 0.9;

    logMetric(planningDir, {
      session_id: sessionId || 'unknown',
      timestamp: new Date().toISOString(),
      operation: 'source-scoring',
      model: config.model,
      latency_ms: result.latency_ms,
      tokens_used_local: result.tokens,
      tokens_saved_frontier: 80, // estimated frontier tokens avoided per call
      result: level,
      fallback_used: false,
      confidence
    });

    return {
      level,
      confidence,
      reason: parsed.data.reason || '',
      latency_ms: result.latency_ms,
      fallback_used: false
    };
  } catch (_) {
    // Local LLM assistance is strictly best-effort.
    return null;
  }
}
71
+
72
+ module.exports = { scoreSource, SOURCE_LEVELS };
@@ -0,0 +1,62 @@
1
+ 'use strict';
2
+
3
+ const { complete, isDisabled } = require('../client');
4
+ const { logMetric } = require('../metrics');
5
+ const { route } = require('../router');
6
+
7
/**
 * Produces a concise plain-text summary of project context using the local LLM.
 *
 * Best-effort: returns null (never throws) on any problem so callers fall
 * back to the frontier model.
 *
 * @param {object} config - resolved local_llm config block
 * @param {string} planningDir - path to the .planning directory
 * @param {string} contextText - project context text to summarize
 * @param {number} [maxWords] - target word count for the summary (default 150)
 * @param {string} [sessionId] - optional session identifier for metrics
 * @returns {Promise<{ summary: string, latency_ms: number, fallback_used: boolean }|null>}
 */
async function summarizeContext(config, planningDir, contextText, maxWords, sessionId) {
  if (!config.enabled) return null;
  // Guard config.features existence (consistent with scoreSource) so a
  // partially-resolved config cannot throw outside the try block below.
  if (!config.features || !config.features.context_summarization) return null;
  // Non-string contextText would throw on .length/.slice below; treat as "skip".
  if (typeof contextText !== 'string') return null;
  if (isDisabled('context-summarization', config.advanced.disable_after_failures)) return null;

  // ~4 chars per token heuristic caps the prompt at the configured budget.
  const maxChars = (config.advanced.max_input_tokens || 1024) * 4;
  const truncated = contextText.length > maxChars ? contextText.slice(0, maxChars) : contextText;
  const targetWords = maxWords || 150;

  const prompt =
    'Summarize the following project context in under ' +
    targetWords +
    ' words. Focus on: current phase goal, key decisions made, what is built so far, and what is still needed. Output plain text only — no JSON, no headings, no bullet points.\n\nContext:\n' +
    truncated;

  try {
    const result = await route(config, prompt, 'context-summarization', (logprobs) =>
      complete(config, prompt, 'context-summarization', { logprobs })
    );
    if (result === null) return null;

    logMetric(planningDir, {
      session_id: sessionId || 'unknown',
      timestamp: new Date().toISOString(),
      operation: 'context-summarization',
      model: config.model,
      latency_ms: result.latency_ms,
      tokens_used_local: result.tokens,
      tokens_saved_frontier: 350, // estimated frontier tokens avoided per call
      result: 'summary',
      fallback_used: false,
      // Free-text output carries no parseable confidence; log a fixed prior.
      confidence: 0.9
    });

    return {
      summary: result.content.trim(),
      latency_ms: result.latency_ms,
      fallback_used: false
    };
  } catch (_) {
    // Local LLM assistance is strictly best-effort.
    return null;
  }
}
61
+
62
+ module.exports = { summarizeContext };
@@ -0,0 +1,59 @@
1
+ 'use strict';
2
+
3
+ const { complete, tryParseJSON, isDisabled } = require('../client');
4
+ const { logMetric } = require('../metrics');
5
+ const { route } = require('../router');
6
+
7
/**
 * Validates a Task() call for coherence using the local LLM.
 *
 * Best-effort: returns null (never throws) on any problem so callers fall
 * back to the frontier model rather than blocking the Task() call.
 *
 * @param {object} config - resolved local_llm config block
 * @param {string} planningDir - path to the .planning directory
 * @param {{ description: string, subagent_type: string }} taskInput - the Task() call to validate
 * @param {string} [sessionId] - optional session identifier for metrics
 * @returns {Promise<{ coherent: boolean, confidence: number, issue: string|null, latency_ms: number, fallback_used: boolean }|null>}
 */
async function validateTask(config, planningDir, taskInput, sessionId) {
  // Guard config.features existence (consistent with scoreSource) so a
  // partially-resolved config cannot throw outside the try block below.
  if (!config.enabled || !config.features || !config.features.task_validation) return null;
  // A nullish taskInput would throw on property access below; treat as "skip".
  if (!taskInput) return null;
  if (isDisabled('task-validation', config.advanced.disable_after_failures)) return null;

  const prompt =
    'Assess this Task() call for coherence. Check: (1) description is meaningful and not empty, (2) if subagent_type starts with \'pbr:\' the agent name is valid, (3) description matches the intended operation. Known pbr agents: researcher, planner, plan-checker, executor, verifier, integration-checker, debugger, codebase-mapper, synthesizer, general. Respond with JSON: {"coherent": true|false, "confidence": 0.0-1.0, "issue": "null or one sentence describing the problem"}\n\nTask input: ' +
    JSON.stringify({ description: taskInput.description, subagent_type: taskInput.subagent_type });

  try {
    const result = await route(config, prompt, 'task-validation', (logprobs) =>
      complete(config, prompt, 'task-validation', { logprobs })
    );
    if (result === null) return null;
    const parsed = tryParseJSON(result.content);
    if (!parsed.ok) return null;
    // The verdict must be a real boolean, not a truthy string.
    if (typeof parsed.data.coherent !== 'boolean') return null;

    // Only substitute the 0.9 default when confidence is absent or non-numeric;
    // `|| 0.9` would silently discard an explicit confidence of 0.
    const confidence =
      typeof parsed.data.confidence === 'number' ? parsed.data.confidence : 0.9;

    logMetric(planningDir, {
      session_id: sessionId || 'unknown',
      timestamp: new Date().toISOString(),
      operation: 'task-validation',
      model: config.model,
      latency_ms: result.latency_ms,
      tokens_used_local: result.tokens,
      tokens_saved_frontier: 180, // estimated frontier tokens avoided per call
      result: parsed.data.coherent ? 'coherent' : 'incoherent',
      fallback_used: false,
      confidence
    });

    return {
      coherent: parsed.data.coherent,
      confidence,
      issue: parsed.data.issue || null,
      latency_ms: result.latency_ms,
      fallback_used: false
    };
  } catch (_) {
    // Local LLM assistance is strictly best-effort.
    return null;
  }
}
58
+
59
+ module.exports = { validateTask };
@@ -0,0 +1,101 @@
1
+ 'use strict';
2
+
3
+ const { runShadow } = require('./shadow');
4
+
5
const COMPLEXITY_HIGH_THRESHOLD = 0.65;

/** Complexity ceiling for the 'balanced' routing strategy. */
const COMPLEXITY_BALANCED_THRESHOLD = 0.45;

/** Complexity ceiling for the 'quality_first' routing strategy (inclusive). */
const COMPLEXITY_QUALITY_FIRST_THRESHOLD = 0.3;

/** Minimum logprob-derived confidence accepted by the 'balanced' strategy. */
const BALANCED_MIN_CONFIDENCE = 0.75;

/** Fallback confidence threshold when config supplies none. */
const DEFAULT_CONFIDENCE_THRESHOLD = 0.9;

/**
 * Scores the complexity of a prompt using a weighted surface heuristic.
 * Returns a value in [0, 1] where higher means more complex.
 *
 * Signals: word count, fenced code blocks, constraint keywords, reasoning
 * keywords, and a structured-output request — each capped and weighted.
 *
 * @param {string} prompt
 * @returns {number}
 */
function scoreComplexity(prompt) {
  const words = prompt.split(/\s+/).length;
  const codeBlocks = (prompt.match(/```/g) || []).length / 2; // fences come in pairs
  const constraints = (prompt.match(/\b(must|should|exactly|only|never|always)\b/gi) || []).length;
  const reasoning = (prompt.match(/\b(why|explain|compare|analyze|reason|evaluate)\b/gi) || []).length;
  const structuredOut = /\b(json|schema|yaml|frontmatter)\b/i.test(prompt) ? 1 : 0;
  return Math.min(words / 500, 1.0) * 0.25 +
    Math.min(codeBlocks / 3, 1.0) * 0.20 +
    Math.min(constraints / 5, 1.0) * 0.20 +
    Math.min(reasoning / 3, 1.0) * 0.20 +
    structuredOut * 0.15;
}

/**
 * Extracts a confidence score from logprobs data returned by the local LLM.
 * Returns a value in [0, 1] or null if no logprobs data is available.
 *
 * Uses exp(mean logprob) — the geometric mean of per-token probabilities.
 *
 * @param {Array<{token: string, logprob: number}>|null|undefined} logprobsData
 * @returns {number|null}
 */
function extractConfidence(logprobsData) {
  if (!logprobsData || logprobsData.length === 0) return null;
  const sum = logprobsData.reduce((acc, t) => acc + t.logprob, 0);
  const avgLogprob = sum / logprobsData.length;
  return Math.min(1, Math.max(0, Math.exp(avgLogprob)));
}

/**
 * Launches the fire-and-forget frontier shadow comparison when a committed
 * local result exists and shadow inputs were supplied. Note the parameter
 * inversion vs shadow.js: here LOCAL is the committed result, so the
 * frontier call goes in shadow.js's localResultFn slot and result.content
 * in its frontierResult slot.
 *
 * @param {object} config - local_llm config block
 * @param {string|undefined} planningDir - .planning directory, if shadowing is enabled
 * @param {string} operationType - operation identifier
 * @param {Function|undefined} frontierResultFn - async frontier call, if supplied
 * @param {{content: string}|null} result - the committed local result
 */
function maybeRunShadow(config, planningDir, operationType, frontierResultFn, result) {
  if (result !== null && planningDir && frontierResultFn) {
    runShadow(config, planningDir, operationType, frontierResultFn, result.content);
  }
}

/**
 * Routes a prompt through local LLM or signals caller to use frontier model.
 * Returns the local LLM result if local is suitable, or null if caller should
 * fall back to frontier. Never throws — all errors return null.
 *
 * Strategies:
 *  - 'quality_first': local only for very simple prompts; no confidence gate.
 *  - 'balanced': moderate complexity ceiling plus a fixed confidence gate.
 *  - 'local_first' (default): high complexity ceiling plus the configured
 *    confidence gate.
 *
 * @param {object} config - local_llm config block with routing_strategy and advanced settings
 * @param {string} prompt - the prompt being routed
 * @param {string} operationType - operation identifier
 * @param {function(boolean): Promise<{content: string, logprobsData: Array|null}>} callLocalFn
 *   Async function accepting a logprobs boolean, returns the local LLM result object.
 * @param {string} [planningDir] - path to .planning directory; when provided enables shadow mode
 * @param {Function} [frontierResultFn] - async function that calls the frontier model
 * @returns {Promise<{content: string, logprobsData: Array|null}|null>}
 */
async function route(config, prompt, operationType, callLocalFn, planningDir, frontierResultFn) {
  try {
    const routingStrategy = (config && config.routing_strategy) || 'local_first';
    const score = scoreComplexity(prompt); // pure; safe to compute once for all strategies

    if (routingStrategy === 'quality_first') {
      if (score >= COMPLEXITY_QUALITY_FIRST_THRESHOLD) return null;
      const result = await callLocalFn(false);
      maybeRunShadow(config, planningDir, operationType, frontierResultFn, result);
      return result;
    }

    if (routingStrategy === 'balanced') {
      if (score > COMPLEXITY_BALANCED_THRESHOLD) return null;
      const result = await callLocalFn(true);
      const confidence = extractConfidence(result && result.logprobsData);
      if (confidence === null || confidence < BALANCED_MIN_CONFIDENCE) return null;
      maybeRunShadow(config, planningDir, operationType, frontierResultFn, result);
      return result;
    }

    // Default: local_first
    if (score > COMPLEXITY_HIGH_THRESHOLD) return null;
    const result = await callLocalFn(true);
    const confidence = extractConfidence(result && result.logprobsData);
    // typeof check so an explicit threshold of 0 is honored rather than
    // silently replaced by the default (as `|| 0.9` would do).
    let confidenceThreshold = DEFAULT_CONFIDENCE_THRESHOLD;
    if (config && config.advanced && typeof config.advanced.confidence_threshold === 'number') {
      confidenceThreshold = config.advanced.confidence_threshold;
    }
    if (confidence === null || confidence < confidenceThreshold) return null;
    maybeRunShadow(config, planningDir, operationType, frontierResultFn, result);
    return result;
  } catch (_) {
    // Routing is strictly best-effort; any failure means "use the frontier".
    return null;
  }
}
99
+
100
+ module.exports = { route, scoreComplexity, extractConfidence };
101
+ module.exports.COMPLEXITY_HIGH_THRESHOLD = COMPLEXITY_HIGH_THRESHOLD;
@@ -0,0 +1,60 @@
1
+ 'use strict';
2
+
3
+ const { logAgreement } = require('./metrics');
4
+
5
/**
 * Fire-and-forget shadow comparison.
 * When shadow_mode is enabled, runs localResultFn() in the background and
 * logs agreement/disagreement with frontierResult — but ALWAYS returns
 * frontierResult unchanged, and never lets a shadow failure escape.
 *
 * @param {object} config - resolved config from resolveConfig()
 * @param {string} planningDir - path to the .planning directory
 * @param {string} operationType - e.g. 'artifact_classification'
 * @param {Function} localResultFn - async function that returns the local LLM result
 * @param {*} frontierResult - the result already returned to the caller (never changed)
 * @param {string} [sessionId] - current session identifier
 * @returns {*} frontierResult — unchanged
 */
function runShadow(config, planningDir, operationType, localResultFn, frontierResult, sessionId) {
  // Skip entirely unless shadow mode AND the local LLM are both switched on.
  const shadowActive = Boolean(config.advanced && config.advanced.shadow_mode && config.enabled);
  if (!shadowActive) return frontierResult;

  // Deferred to a microtask so the caller is never blocked; every failure
  // path is swallowed so the shadow can never affect the committed result.
  Promise.resolve()
    .then(async () => {
      let localText;
      try {
        const produced = await localResultFn();
        localText = typeof produced === 'string' ? produced : JSON.stringify(produced);
      } catch (_) {
        // A failed local call is recorded as a disagreement below.
        localText = null;
      }

      const frontierText =
        typeof frontierResult === 'string' ? frontierResult : JSON.stringify(frontierResult);
      const normalizedLocal = localText != null ? localText.trim() : null;

      logAgreement(planningDir, {
        timestamp: new Date().toISOString(),
        operation: operationType,
        session_id: sessionId || 'unknown',
        agrees: normalizedLocal !== null && normalizedLocal === frontierText.trim(),
        local_result: normalizedLocal,
        frontier_result: frontierText
      });
    })
    .catch(() => {
      // Swallow all errors — shadow must never throw
    });

  return frontierResult;
}
59
+
60
+ module.exports = { runShadow };
@@ -0,0 +1,118 @@
1
+ 'use strict';
2
+
3
+ const fs = require('fs');
4
+ const path = require('path');
5
+
6
// --- Constants ---

/** Minimum shadow log entries per operation before suggesting an adjustment */
const MIN_SAMPLES = 20;

/** Step size for each threshold adjustment */
const ADJUST_STEP = 0.05;

/** Clamp floor for suggested threshold */
const THRESHOLD_MIN = 0.5;

/** Clamp ceiling for suggested threshold */
const THRESHOLD_MAX = 0.99;

/**
 * Failure rate above which local LLM is considered too unreliable.
 * Suggests raising the confidence_threshold so fewer calls are routed locally.
 */
const HIGH_FAILURE_RATE = 0.20;

/**
 * Failure rate below which local LLM is considered very reliable.
 * Suggests lowering the confidence_threshold so more calls are routed locally.
 */
const LOW_FAILURE_RATE = 0.05;

/**
 * Reads the shadow agreement log and returns advisory threshold adjustments
 * per operation type.
 *
 * Only operations with >= MIN_SAMPLES entries produce a suggestion. Every
 * suggestion moves by at most ADJUST_STEP and is clamped to
 * [THRESHOLD_MIN, THRESHOLD_MAX]. Purely advisory — config is never written.
 * Never throws; any I/O or parse problem yields [].
 *
 * @param {string} planningDir - Absolute path to the .planning directory
 * @param {number} currentThreshold - Current confidence_threshold from config
 * @returns {Array<{operation: string, current: number, suggested: number, sample_count: number, agreement_rate: number}>}
 */
function computeThresholdAdjustments(planningDir, currentThreshold) {
  try {
    const logFile = path.join(planningDir, 'logs', 'local-llm-shadow.jsonl');
    if (!fs.existsSync(logFile)) return [];

    // Tally agreement counts per operation, skipping blank or malformed lines.
    const tally = new Map();
    for (const line of fs.readFileSync(logFile, 'utf8').split('\n')) {
      if (line.trim().length === 0) continue;
      let record;
      try {
        record = JSON.parse(line);
      } catch (_e) {
        continue; // malformed line
      }
      if (!record || typeof record !== 'object' || !record.operation) continue;
      const bucket = tally.get(record.operation) || { count: 0, agrees: 0 };
      bucket.count += 1;
      if (record.agrees === true) bucket.agrees += 1;
      tally.set(record.operation, bucket);
    }

    // Translate each sufficiently-sampled bucket into an advisory suggestion.
    const advisories = [];
    for (const [operation, bucket] of tally) {
      if (bucket.count < MIN_SAMPLES) continue;

      const agreementRate = bucket.agrees / bucket.count;
      const failureRate = 1 - agreementRate;

      let suggested = currentThreshold; // default: within acceptable range
      if (failureRate > HIGH_FAILURE_RATE) {
        // Local is too unreliable — raise threshold (fewer local calls)
        suggested = Math.min(THRESHOLD_MAX, currentThreshold + ADJUST_STEP);
      } else if (failureRate < LOW_FAILURE_RATE) {
        // Local is very reliable — lower threshold (more local calls)
        suggested = Math.max(THRESHOLD_MIN, currentThreshold - ADJUST_STEP);
      }

      advisories.push({
        operation,
        current: currentThreshold,
        suggested,
        sample_count: bucket.count,
        agreement_rate: agreementRate
      });
    }

    return advisories;
  } catch (_e) {
    // Never throws
    return [];
  }
}
117
+
118
+ module.exports = { computeThresholdAdjustments };