npm - cognitive-core - Versions diffs - 0.2.0 → 0.2.1 - Mend

cognitive-core 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

package/.claude/settings.json +111 -2
package/.sessionlog/settings.json +4 -0
package/dist/index.d.ts +1 -1
package/dist/index.d.ts.map +1 -1
package/dist/index.js +5 -1
package/dist/index.js.map +1 -1
package/dist/learning/index.d.ts +1 -1
package/dist/learning/index.d.ts.map +1 -1
package/dist/learning/index.js.map +1 -1
package/dist/learning/unified-pipeline.d.ts +30 -0
package/dist/learning/unified-pipeline.d.ts.map +1 -1
package/dist/learning/unified-pipeline.js +207 -0
package/dist/learning/unified-pipeline.js.map +1 -1
package/dist/memory/candidate-retrieval.d.ts.map +1 -1
package/dist/memory/candidate-retrieval.js +3 -1
package/dist/memory/candidate-retrieval.js.map +1 -1
package/dist/utils/error-classifier.js +8 -8
package/dist/utils/error-classifier.js.map +1 -1
package/dist/workspace/efficacy-toolkit.d.ts +164 -0
package/dist/workspace/efficacy-toolkit.d.ts.map +1 -0
package/dist/workspace/efficacy-toolkit.js +281 -0
package/dist/workspace/efficacy-toolkit.js.map +1 -0
package/dist/workspace/index.d.ts +2 -1
package/dist/workspace/index.d.ts.map +1 -1
package/dist/workspace/index.js +3 -1
package/dist/workspace/index.js.map +1 -1
package/dist/workspace/templates/index.d.ts +3 -0
package/dist/workspace/templates/index.d.ts.map +1 -1
package/dist/workspace/templates/index.js +6 -0
package/dist/workspace/templates/index.js.map +1 -1
package/dist/workspace/templates/playbook-decay-detection.d.ts +46 -0
package/dist/workspace/templates/playbook-decay-detection.d.ts.map +1 -0
package/dist/workspace/templates/playbook-decay-detection.js +197 -0
package/dist/workspace/templates/playbook-decay-detection.js.map +1 -0
package/dist/workspace/templates/playbook-efficacy-audit.d.ts +46 -0
package/dist/workspace/templates/playbook-efficacy-audit.d.ts.map +1 -0
package/dist/workspace/templates/playbook-efficacy-audit.js +160 -0
package/dist/workspace/templates/playbook-efficacy-audit.js.map +1 -0
package/dist/workspace/templates/playbook-lifecycle-review.d.ts +51 -0
package/dist/workspace/templates/playbook-lifecycle-review.d.ts.map +1 -0
package/dist/workspace/templates/playbook-lifecycle-review.js +187 -0
package/dist/workspace/templates/playbook-lifecycle-review.js.map +1 -0
package/package.json +7 -1
package/src/index.ts +27 -0
package/src/learning/index.ts +1 -0
package/src/learning/unified-pipeline.ts +271 -1
package/src/memory/candidate-retrieval.ts +2 -1
package/src/utils/error-classifier.ts +8 -8
package/src/workspace/efficacy-toolkit.ts +496 -0
package/src/workspace/index.ts +29 -0
package/src/workspace/templates/index.ts +24 -0
package/src/workspace/templates/playbook-decay-detection.ts +272 -0
package/src/workspace/templates/playbook-efficacy-audit.ts +246 -0
package/src/workspace/templates/playbook-lifecycle-review.ts +274 -0
package/tests/fixtures/behavioral-trajectories.ts +210 -0
package/tests/integration/pipeline-data-correctness.test.ts +794 -0
package/tests/learning/meta-learner.test.ts +418 -0
package/tests/learning/pipeline-memory-updates.test.ts +721 -0
package/tests/learning/unified-pipeline-efficacy.test.ts +232 -0
package/tests/memory/candidate-retrieval.test.ts +167 -0
package/tests/memory/meta.test.ts +399 -0
package/tests/search/evaluator.test.ts +257 -0
package/tests/search/verification-runner.test.ts +357 -0
package/tests/utils/error-classifier.test.ts +149 -0
package/tests/utils/trajectory-helpers.test.ts +163 -0
package/tests/workspace/efficacy-toolkit.test.ts +404 -0
package/tests/workspace/templates/playbook-efficacy.test.ts +377 -0

package/src/workspace/templates/playbook-decay-detection.ts ADDED Viewed

@@ -0,0 +1,272 @@
+/**
+ * Playbook Decay Detection Template
+ *
+ * Portfolio-wide scan for playbooks showing declining efficacy.
+ * Uses temporal trend analysis to identify decay early, before
+ * playbooks accumulate enough failures to trigger confidence drops.
+ *
+ * The agent reviews pre-computed trends and cross-references with
+ * failure patterns to determine whether decay is real (environment
+ * changed) or noise (small sample size).
+ */
+import type { WorkspaceHandle } from 'agent-workspace';
+import type { Playbook } from '../../types/index.js';
+import type { TaskAnnotation, PlaybookEffectivenessEntry } from '../../learning/effectiveness.js';
+import type {
+  AgenticTaskTemplate,
+  AnalysisComplexity,
+} from '../types.js';
+import {
+  computeTemporalTrend,
+  computeNormalizedGain,
+  type TemporalTrend,
+} from '../efficacy-toolkit.js';
+import { getPlaybookSuccessRate } from '../../types/index.js';
+// ============================================================
+// Input / Output Types
+// ============================================================
+/** Input for the portfolio-wide decay scan. */
+export interface PlaybookDecayDetectionInput {
+  /** Playbooks to scan; a temporal trend is computed for each one */
+  playbooks: Playbook[];
+  /** Task annotations handed to computeTemporalTrend for every playbook */
+  annotations: TaskAnnotation[];
+  /** Per-playbook effectiveness entries; looked up by playbookId when the workspace is prepared */
+  playbookEffectiveness: PlaybookEffectivenessEntry[];
+  /** Baseline passed to computeNormalizedGain (presumably a 0–1 success fraction — confirm) */
+  unguidedSuccessRate: number;
+}
+/** Result of a decay scan, produced either by the agent or by the heuristic fallback. */
+export interface PlaybookDecayDetectionOutput {
+  /** Playbooks with detected decay signals */
+  decaying: DecaySignal[];
+  /** Playbooks at risk of decay (early warning) */
+  atRisk: DecaySignal[];
+  /**
+   * Ids of playbooks considered stable or improving.
+   * Note: the heuristic fallback also lists playbooks with fewer than
+   * 4 trend data points here, since it cannot judge them.
+   */
+  healthy: string[];
+  /** Overall assessment (agent narrative, or a one-line count summary from the heuristic fallback) */
+  portfolioAssessment: string;
+}
+/** One flagged playbook together with its trend, diagnosis and suggested action. */
+export interface DecaySignal {
+  /** Id of the flagged playbook */
+  playbookId: string;
+  /** Display name of the flagged playbook */
+  playbookName: string;
+  /** Temporal trend computed for this playbook by the efficacy toolkit */
+  trend: TemporalTrend;
+  /** Explanation of why this playbook is flagged (written by the agent or the heuristic fallback) */
+  diagnosis: string;
+  /** Recommended action (the heuristic fallback never emits 'deprecate') */
+  recommendation: 'investigate' | 'refine' | 'deprecate' | 'monitor';
+  /** Severity: how urgently this needs attention (the heuristic fallback never emits 'critical') */
+  severity: 'critical' | 'warning' | 'info';
+  /** Supporting evidence */
+  evidence: string[];
+}
+// ============================================================
+// Template Implementation
+// ============================================================
+/**
+ * Agentic task template that scans the whole playbook portfolio for decay.
+ *
+ * `prepareWorkspace` writes pre-computed trends to `input/`, `buildTaskPrompt`
+ * tells the agent to write `output/decay-report.json`, and `collectOutput`
+ * parses that report back into typed signals. `heuristicFallback` produces the
+ * same output shape without an agent (presumably used when `assessComplexity`
+ * returns 'heuristic' — confirm against the AgenticTaskTemplate runner).
+ */
+export const playbookDecayDetectionTemplate: AgenticTaskTemplate<
+  PlaybookDecayDetectionInput,
+  PlaybookDecayDetectionOutput
+> = {
+  taskType: 'playbook-decay-detection',
+  domain: 'meta-learning',
+  description: 'Detect declining efficacy across the playbook portfolio',
+  /**
+   * Pick the analysis tier: 'heuristic' for an empty portfolio or fewer than
+   * 10 annotations, 'standard' for more than 20 playbooks, else 'lightweight'.
+   */
+  assessComplexity(input: PlaybookDecayDetectionInput): AnalysisComplexity {
+    if (input.playbooks.length === 0) return 'heuristic';
+    if (input.annotations.length < 10) return 'heuristic';
+    if (input.playbooks.length > 20) return 'standard';
+    return 'lightweight';
+  },
+  /**
+   * Classify every playbook from its temporal trend alone, without an agent.
+   *
+   * @param input - Playbooks and annotations to analyse
+   * @returns Decaying / at-risk signals, healthy ids and a count summary
+   */
+  async heuristicFallback(input: PlaybookDecayDetectionInput): Promise<PlaybookDecayDetectionOutput> {
+    const decaying: DecaySignal[] = [];
+    const atRisk: DecaySignal[] = [];
+    const healthy: string[] = [];
+    for (const pb of input.playbooks) {
+      const trend = computeTemporalTrend(input.annotations, pb.id);
+      // Too few data points to judge: filed as healthy, and the staleness check below is skipped
+      if (trend.dataPoints < 4) {
+        healthy.push(pb.id);
+        continue;
+      }
+      if (trend.direction === 'decaying') {
+        // A slope below -0.15 escalates from monitor/info to refine/warning
+        decaying.push({
+          playbookId: pb.id,
+          playbookName: pb.name,
+          trend,
+          diagnosis: `Success rate declining from ${(trend.oldestSuccessRate * 100).toFixed(0)}% to ${(trend.recentSuccessRate * 100).toFixed(0)}%`,
+          recommendation: trend.slope < -0.15 ? 'refine' : 'monitor',
+          severity: trend.slope < -0.15 ? 'warning' : 'info',
+          evidence: [`Trend slope: ${trend.slope.toFixed(3)}`],
+        });
+      } else if (trend.daysSinceLastUse !== null && trend.daysSinceLastUse > 30) {
+        // Not decaying, but unused for more than 30 days
+        atRisk.push({
+          playbookId: pb.id,
+          playbookName: pb.name,
+          trend,
+          diagnosis: `No usage in ${Math.round(trend.daysSinceLastUse)} days — may be going stale`,
+          recommendation: 'investigate',
+          severity: 'info',
+          evidence: [`Last used ${Math.round(trend.daysSinceLastUse)} days ago`],
+        });
+      } else {
+        healthy.push(pb.id);
+      }
+    }
+    return {
+      decaying,
+      atRisk,
+      healthy,
+      portfolioAssessment: `${decaying.length} decaying, ${atRisk.length} at risk, ${healthy.length} healthy.`,
+    };
+  },
+  /**
+   * Write the agent's input files: `playbook-trends.json` (one entry per
+   * playbook) and `summary.json` (portfolio-level counts).
+   *
+   * @param input - Playbooks, annotations and effectiveness entries
+   * @param handle - Workspace the files are written into
+   */
+  async prepareWorkspace(
+    input: PlaybookDecayDetectionInput,
+    handle: WorkspaceHandle
+  ): Promise<void> {
+    const effectivenessMap = new Map(
+      input.playbookEffectiveness.map(e => [e.playbookId, e])
+    );
+    // Pre-compute trends for all playbooks
+    const trendData = input.playbooks.map(pb => {
+      const trend = computeTemporalTrend(input.annotations, pb.id);
+      const eff = effectivenessMap.get(pb.id);
+      const successRate = getPlaybookSuccessRate(pb);
+      // Prefer the measured applied success rate; fall back to the playbook's own rate
+      const appliedRate = eff?.appliedSuccessRate ?? successRate;
+      const normalizedGain = computeNormalizedGain(appliedRate, input.unguidedSuccessRate);
+      return {
+        playbookId: pb.id,
+        playbookName: pb.name,
+        confidence: pb.confidence,
+        successRate,
+        normalizedGain,
+        totalUses: pb.evolution.successCount + pb.evolution.failureCount,
+        domains: pb.applicability.domains,
+        trend,
+        // Only the three most recent failures are exposed to the agent
+        recentFailureModes: pb.evolution.failures.slice(-3).map(f => f.failureMode),
+        // null when there is no effectiveness entry or the playbook was never surfaced
+        adoptionRate: eff && eff.surfacedCount > 0
+          ? eff.appliedCount / eff.surfacedCount
+          : null,
+      };
+    });
+    await handle.writeJson('input', 'playbook-trends.json', trendData);
+    // Summary stats
+    await handle.writeJson('input', 'summary.json', {
+      totalPlaybooks: input.playbooks.length,
+      totalAnnotations: input.annotations.length,
+      unguidedSuccessRate: input.unguidedSuccessRate,
+      decayingCount: trendData.filter(t => t.trend.direction === 'decaying').length,
+      improvingCount: trendData.filter(t => t.trend.direction === 'improving').length,
+      stableCount: trendData.filter(t => t.trend.direction === 'stable').length,
+    });
+  },
+  /**
+   * Build the agent prompt: which input files to read, what to analyse,
+   * and the JSON shape expected in `output/decay-report.json`.
+   */
+  buildTaskPrompt(input: PlaybookDecayDetectionInput): string {
+    return [
+      `Analyze ${input.playbooks.length} playbooks for signs of declining efficacy.`,
+      '',
+      'Read:',
+      '- input/playbook-trends.json — Pre-computed temporal trends, normalized gains, and failure modes for each playbook',
+      '- input/summary.json — Portfolio summary statistics',
+      '',
+      'For each playbook showing negative trends or staleness:',
+      '1. Is the decay real or just noise from small sample size?',
+      '2. What might be causing the decline? (environment changes, scope mismatch, guidance outdated)',
+      '3. What action should be taken?',
+      '',
+      'Also flag playbooks that aren\'t decaying yet but show early warning signs:',
+      '- Declining adoption rate (agents stop choosing to use it)',
+      '- Increasing failure modes diversity (suggests scope drift)',
+      '- High normalized gain but low confidence (fragile effectiveness)',
+      '',
+      'Write results to output/decay-report.json:',
+      '```json',
+      '{',
+      '  "decaying": [',
+      '    {',
+      '      "playbookId": "id",',
+      '      "playbookName": "name",',
+      '      "diagnosis": "why this is decaying",',
+      '      "recommendation": "investigate" | "refine" | "deprecate" | "monitor",',
+      '      "severity": "critical" | "warning" | "info",',
+      '      "evidence": ["supporting facts"]',
+      '    }',
+      '  ],',
+      '  "atRisk": [same structure],',
+      '  "healthy": ["playbookId1", "playbookId2"],',
+      '  "portfolioAssessment": "2-3 sentence overall assessment"',
+      '}',
+      '```',
+    ].join('\n');
+  },
+  getSkills() { return []; },
+  getResources() { return []; },
+  outputConfig: {
+    files: [
+      {
+        path: 'decay-report.json',
+        format: 'json' as const,
+        required: true,
+        description: 'Playbook decay detection results',
+      },
+    ],
+  },
+  /**
+   * Parse the agent's `output/decay-report.json` into typed decay signals.
+   *
+   * The report is agent-written and therefore untrusted: entries that are not
+   * objects are skipped, and `recommendation` / `severity` values outside the
+   * allowed literals fall back to 'monitor' / 'info'.
+   *
+   * @param handle - Workspace holding the agent output and the prepared inputs
+   * @returns The report with each signal re-joined to its pre-computed trend
+   */
+  async collectOutput(handle: WorkspaceHandle): Promise<PlaybookDecayDetectionOutput> {
+    const raw = await handle.readJson('output', 'decay-report.json') as Record<string, unknown>;
+    // The agent's report carries no trend; re-read the trends written by prepareWorkspace
+    const trendData = await handle.readJson('input', 'playbook-trends.json') as Array<{
+      playbookId: string;
+      trend: TemporalTrend;
+    }>;
+    const trendMap = new Map(trendData.map(t => [t.playbookId, t.trend]));
+    const recommendations: readonly DecaySignal['recommendation'][] =
+      ['investigate', 'refine', 'deprecate', 'monitor'];
+    const severities: readonly DecaySignal['severity'][] = ['critical', 'warning', 'info'];
+    // Return `value` only when it is exactly one of the allowed literals
+    const pickLiteral = <T extends string>(value: unknown, allowed: readonly T[], fallback: T): T =>
+      allowed.find(candidate => candidate === value) ?? fallback;
+    const parseSignals = (items: unknown): DecaySignal[] => {
+      if (!Array.isArray(items)) return [];
+      return items
+        // A null or primitive entry would otherwise throw on property access
+        .filter((item): item is Record<string, unknown> => typeof item === 'object' && item !== null)
+        .map(item => {
+          const playbookId = String(item.playbookId ?? '');
+          return {
+            playbookId,
+            playbookName: String(item.playbookName ?? ''),
+            // Unknown ids get a neutral placeholder trend
+            trend: trendMap.get(playbookId) ?? {
+              slope: 0, direction: 'stable' as const, dataPoints: 0,
+              recentSuccessRate: 0, oldestSuccessRate: 0, daysSinceLastUse: null,
+            },
+            diagnosis: String(item.diagnosis ?? ''),
+            recommendation: pickLiteral(item.recommendation, recommendations, 'monitor'),
+            severity: pickLiteral(item.severity, severities, 'info'),
+            evidence: Array.isArray(item.evidence) ? item.evidence.map(String) : [],
+          };
+        });
+    };
+    return {
+      decaying: parseSignals(raw.decaying),
+      atRisk: parseSignals(raw.atRisk),
+      healthy: Array.isArray(raw.healthy) ? raw.healthy.map(String) : [],
+      portfolioAssessment: String(raw.portfolioAssessment ?? ''),
+    };
+  },
+  async processOutput(): Promise<void> {
+    // Caller decides how to act on decay signals
+  },
+  computeRequirements: {
+    mode: 'local',
+    complexity: 'lightweight',
+  },
+  agentType: 'claude-code',
+  timeout: 120_000,
+  captureToolCalls: true,
+};

package/src/workspace/templates/playbook-efficacy-audit.ts ADDED Viewed

@@ -0,0 +1,246 @@
+/**
+ * Playbook Efficacy Audit Template
+ *
+ * Deep-dive analysis of a single playbook's effectiveness.
+ * Uses the efficacy toolkit to pre-compute metrics, then asks an agent
+ * to synthesize findings, identify root causes, and recommend actions.
+ *
+ * Heuristic fallback: for playbooks with very few data points, return
+ * a summary without agent analysis.
+ */
+import type { WorkspaceHandle } from 'agent-workspace';
+import type { Playbook } from '../../types/index.js';
+import type { TaskAnnotation, PlaybookEffectivenessEntry } from '../../learning/effectiveness.js';
+import type {
+  AgenticTaskTemplate,
+  AnalysisComplexity,
+} from '../types.js';
+import {
+  buildEfficacyProfile,
+  type PlaybookEfficacyProfile,
+} from '../efficacy-toolkit.js';
+// ============================================================
+// Input / Output Types
+// ============================================================
+/** Input for a single-playbook efficacy audit. */
+export interface PlaybookEfficacyAuditInput {
+  /** The playbook under audit */
+  playbook: Playbook;
+  /** Task annotations; passed to buildEfficacyProfile and filtered to those that surfaced this playbook */
+  annotations: TaskAnnotation[];
+  /** Effectiveness entry for this playbook, or undefined when none is available */
+  playbookEffectiveness: PlaybookEffectivenessEntry | undefined;
+  /** Baseline passed to buildEfficacyProfile (presumably a 0–1 success fraction — confirm) */
+  unguidedSuccessRate: number;
+  /** Map of trajectoryId → domain for per-domain breakdown */
+  trajectoryDomainMap: Map<string, string>;
+}
+/** Result of an efficacy audit: the computed profile plus a qualitative assessment. */
+export interface PlaybookEfficacyAuditOutput {
+  /** Pre-computed profile (from toolkit) */
+  profile: PlaybookEfficacyProfile;
+  /** Synthesis and recommendations (from the agent, or from the heuristic fallback) */
+  assessment: {
+    /** Overall health rating */
+    health: 'healthy' | 'at-risk' | 'underperforming' | 'insufficient-data';
+    /** Root cause analysis for any issues (always empty in the heuristic fallback) */
+    rootCauses: string[];
+    /** Specific actionable recommendations */
+    recommendations: PlaybookRecommendation[];
+    /** Brief narrative summary */
+    summary: string;
+  };
+}
+/** One actionable recommendation produced by an efficacy audit. */
+export interface PlaybookRecommendation {
+  /** Kind of change being recommended; 'no-change' means leave the playbook as is */
+  action: 'refine-guidance' | 'add-anti-pattern' | 'narrow-scope' | 'broaden-scope'
+    | 'deprecate' | 'merge-with' | 'split' | 'no-change';
+  /** Free-text description of the recommended change */
+  description: string;
+  /** Priority of this recommendation */
+  priority: 'high' | 'medium' | 'low';
+  /** For merge-with: target playbook name */
+  targetPlaybook?: string;
+}
+// ============================================================
+// Template Implementation
+// ============================================================
+/**
+ * Agentic task template for a deep-dive efficacy audit of one playbook.
+ *
+ * `prepareWorkspace` writes the efficacy profile, a trimmed playbook
+ * definition and recent annotations to `input/`; `buildTaskPrompt` tells the
+ * agent to write `output/audit.json`; `collectOutput` parses that file.
+ * `heuristicFallback` produces the same output shape without an agent.
+ */
+export const playbookEfficacyAuditTemplate: AgenticTaskTemplate<
+  PlaybookEfficacyAuditInput,
+  PlaybookEfficacyAuditOutput
+> = {
+  taskType: 'playbook-efficacy-audit',
+  domain: 'meta-learning',
+  description: 'Deep-dive efficacy analysis of a single playbook with recommendations',
+  /**
+   * Pick the analysis tier from usage volume and recorded failures:
+   * 'heuristic' below 3 uses, or below 10 uses with no recorded failures;
+   * 'standard' above 20 uses or with more than 3 recorded failures;
+   * otherwise 'lightweight'.
+   */
+  assessComplexity(input: PlaybookEfficacyAuditInput): AnalysisComplexity {
+    const totalUses = input.playbook.evolution.successCount + input.playbook.evolution.failureCount;
+    if (totalUses < 3) return 'heuristic';
+    if (totalUses < 10 && input.playbook.evolution.failures.length === 0) return 'heuristic';
+    if (totalUses > 20 || input.playbook.evolution.failures.length > 3) return 'standard';
+    return 'lightweight';
+  },
+  /**
+   * Build an assessment from the computed profile alone, without an agent.
+   *
+   * Health is 'insufficient-data' below 3 uses; otherwise it follows the
+   * success rate: at least 0.7 is 'healthy', at least 0.4 is 'at-risk',
+   * anything lower is 'underperforming'.
+   *
+   * @param input - Playbook and supporting data to profile
+   * @returns The profile plus a rule-based assessment (no root causes)
+   */
+  async heuristicFallback(input: PlaybookEfficacyAuditInput): Promise<PlaybookEfficacyAuditOutput> {
+    const profile = buildEfficacyProfile(
+      input.playbook,
+      input.annotations,
+      input.playbookEffectiveness,
+      input.unguidedSuccessRate,
+      input.trajectoryDomainMap,
+    );
+    // Evaluated once; drives health, recommendations and summary alike
+    const insufficientData = profile.totalUses < 3;
+    return {
+      profile,
+      assessment: {
+        health: insufficientData ? 'insufficient-data' : (
+          profile.successRate >= 0.7 ? 'healthy' :
+          profile.successRate >= 0.4 ? 'at-risk' : 'underperforming'
+        ),
+        rootCauses: [],
+        recommendations: insufficientData
+          ? [{ action: 'no-change', description: 'Insufficient data for assessment', priority: 'low' }]
+          : [],
+        summary: insufficientData
+          ? `${profile.playbookName} has only ${profile.totalUses} uses — not enough for reliable assessment.`
+          : `${profile.playbookName}: ${(profile.successRate * 100).toFixed(0)}% success rate, normalized gain ${profile.normalizedGain.toFixed(2)}.`,
+      },
+    };
+  },
+  /**
+   * Write the agent's input files: `efficacy-profile.json`, `playbook.json`
+   * and `recent-annotations.json`.
+   *
+   * @param input - Playbook and supporting data
+   * @param handle - Workspace the files are written into
+   */
+  async prepareWorkspace(
+    input: PlaybookEfficacyAuditInput,
+    handle: WorkspaceHandle
+  ): Promise<void> {
+    const profile = buildEfficacyProfile(
+      input.playbook,
+      input.annotations,
+      input.playbookEffectiveness,
+      input.unguidedSuccessRate,
+      input.trajectoryDomainMap,
+    );
+    // Pre-computed efficacy profile — the core data the agent analyzes
+    await handle.writeJson('input', 'efficacy-profile.json', profile);
+    // Playbook definition for context; only the last 5 failures are included
+    await handle.writeJson('input', 'playbook.json', {
+      id: input.playbook.id,
+      name: input.playbook.name,
+      applicability: input.playbook.applicability,
+      guidance: input.playbook.guidance,
+      verification: input.playbook.verification,
+      evolution: {
+        version: input.playbook.evolution.version,
+        failures: input.playbook.evolution.failures.slice(-5),
+        refinements: input.playbook.evolution.refinements,
+        successCount: input.playbook.evolution.successCount,
+        failureCount: input.playbook.evolution.failureCount,
+      },
+      confidence: input.playbook.confidence,
+      complexity: input.playbook.complexity,
+    });
+    // Last 20 annotations in which this playbook was surfaced
+    // (assumes input.annotations is in chronological order — TODO confirm)
+    const relevantAnnotations = input.annotations
+      .filter(a =>
+        a.knowledgeSurfaced.playbookIds.includes(input.playbook.id)
+      )
+      .slice(-20)
+      .map(a => ({
+        trajectoryId: a.trajectoryId,
+        success: a.outcome.success,
+        stepCount: a.outcome.stepCount,
+        errorRecoveries: a.outcome.errorRecoveries,
+        wasApplied: a.knowledgeApplied.playbookIdsUsed.includes(input.playbook.id),
+        reflection: a.reflection,
+        timestamp: a.timestamp,
+      }));
+    await handle.writeJson('input', 'recent-annotations.json', relevantAnnotations);
+  },
+  /**
+   * Build the agent prompt: which input files to read, the questions to
+   * answer, and the JSON shape expected in `output/audit.json`.
+   */
+  buildTaskPrompt(input: PlaybookEfficacyAuditInput): string {
+    return [
+      `Evaluate the efficacy of playbook "${input.playbook.name}".`,
+      '',
+      'Read the following input files:',
+      '- input/efficacy-profile.json — Pre-computed metrics (normalized gain, temporal trend, domain breakdown, usage profile)',
+      '- input/playbook.json — The playbook definition (guidance, verification, evolution history)',
+      '- input/recent-annotations.json — Recent trajectory outcomes when this playbook was surfaced',
+      '',
+      'Analyze:',
+      '1. Is this playbook providing marginal value above the unguided baseline? (check normalizedGain)',
+      '2. Is efficacy trending up, stable, or decaying? (check temporalTrend)',
+      '3. Does it work equally well across all domains, or is it domain-specific?',
+      '4. Are agents actually adopting it when surfaced? (check usage.adoptionRate)',
+      '5. Are there recurring failure modes that suggest guidance needs updating?',
+      '',
+      'Write your assessment to output/audit.json:',
+      '```json',
+      '{',
+      '  "health": "healthy" | "at-risk" | "underperforming" | "insufficient-data",',
+      '  "rootCauses": ["reason1", "reason2"],',
+      '  "recommendations": [',
+      '    {',
+      '      "action": "refine-guidance" | "add-anti-pattern" | "narrow-scope" | "broaden-scope" | "deprecate" | "merge-with" | "split" | "no-change",',
+      '      "description": "specific actionable recommendation",',
+      '      "priority": "high" | "medium" | "low",',
+      '      "targetPlaybook": "optional: for merge-with actions"',
+      '    }',
+      '  ],',
+      '  "summary": "2-3 sentence narrative summary"',
+      '}',
+      '```',
+    ].join('\n');
+  },
+  getSkills() { return []; },
+  getResources() { return []; },
+  outputConfig: {
+    files: [
+      {
+        path: 'audit.json',
+        format: 'json' as const,
+        required: true,
+        description: 'Playbook efficacy audit results',
+      },
+    ],
+  },
+  /**
+   * Parse the agent's `output/audit.json` and pair it with the profile
+   * written by `prepareWorkspace`.
+   *
+   * The audit file is agent-written and therefore untrusted: recommendation
+   * entries that are not objects are skipped, and `health`, `action` and
+   * `priority` values outside the allowed literals fall back to
+   * 'insufficient-data', 'no-change' and 'medium' respectively.
+   *
+   * @param handle - Workspace holding the agent output and the prepared inputs
+   * @returns The stored profile plus the validated assessment
+   */
+  async collectOutput(handle: WorkspaceHandle): Promise<PlaybookEfficacyAuditOutput> {
+    const raw = await handle.readJson('output', 'audit.json') as Record<string, unknown>;
+    const profile = await handle.readJson('input', 'efficacy-profile.json') as PlaybookEfficacyProfile;
+    const healthValues: readonly PlaybookEfficacyAuditOutput['assessment']['health'][] =
+      ['healthy', 'at-risk', 'underperforming', 'insufficient-data'];
+    const actionValues: readonly PlaybookRecommendation['action'][] = [
+      'refine-guidance', 'add-anti-pattern', 'narrow-scope', 'broaden-scope',
+      'deprecate', 'merge-with', 'split', 'no-change',
+    ];
+    const priorityValues: readonly PlaybookRecommendation['priority'][] = ['high', 'medium', 'low'];
+    // Return `value` only when it is exactly one of the allowed literals
+    const pickLiteral = <T extends string>(value: unknown, allowed: readonly T[], fallback: T): T =>
+      allowed.find(candidate => candidate === value) ?? fallback;
+    const recommendations: PlaybookRecommendation[] = Array.isArray(raw.recommendations)
+      ? raw.recommendations
+          // A null or primitive entry would otherwise throw on property access
+          .filter((r): r is Record<string, unknown> => typeof r === 'object' && r !== null)
+          .map(r => ({
+            action: pickLiteral(r.action, actionValues, 'no-change'),
+            description: String(r.description ?? ''),
+            priority: pickLiteral(r.priority, priorityValues, 'medium'),
+            targetPlaybook: r.targetPlaybook ? String(r.targetPlaybook) : undefined,
+          }))
+      : [];
+    return {
+      profile,
+      assessment: {
+        health: pickLiteral(raw.health, healthValues, 'insufficient-data'),
+        rootCauses: Array.isArray(raw.rootCauses) ? raw.rootCauses.map(String) : [],
+        recommendations,
+        summary: String(raw.summary ?? ''),
+      },
+    };
+  },
+  async processOutput(): Promise<void> {
+    // Caller decides how to act on recommendations
+  },
+  computeRequirements: {
+    mode: 'local',
+    complexity: 'lightweight',
+  },
+  agentType: 'claude-code',
+  timeout: 120_000,
+  captureToolCalls: true,
+};