mindforge-cc 10.0.3 → 11.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.mindforge/MINDFORGE-V2-SCHEMA.json +43 -10
- package/.mindforge/config.json +30 -2
- package/.mindforge/engine/cross-model-eval.md +74 -0
- package/.mindforge/engine/proactive/signal-detector.md +60 -0
- package/.mindforge/engine/proactive/suggestion-engine.md +100 -0
- package/.mindforge/personas/agent-architect.md +57 -0
- package/.mindforge/personas/agent-evaluator.md +162 -0
- package/.mindforge/personas/agent-memory-designer.md +157 -0
- package/.mindforge/personas/agent-ops-engineer.md +120 -0
- package/.mindforge/personas/agent-orchestrator.md +112 -0
- package/.mindforge/personas/ai-economist.md +57 -0
- package/.mindforge/personas/ai-safety-engineer.md +57 -0
- package/.mindforge/personas/analytics-engineer.md +57 -0
- package/.mindforge/personas/anti-pattern-hunter.md +61 -0
- package/.mindforge/personas/api-gateway-designer.md +132 -0
- package/.mindforge/personas/auth-engineer.md +112 -0
- package/.mindforge/personas/build-engineer.md +57 -0
- package/.mindforge/personas/business-analyst.md +56 -0
- package/.mindforge/personas/cache-architect.md +100 -0
- package/.mindforge/personas/causal-scientist.md +57 -0
- package/.mindforge/personas/cdn-architect.md +118 -0
- package/.mindforge/personas/change-agent.md +104 -0
- package/.mindforge/personas/code-narrator.md +52 -0
- package/.mindforge/personas/codegen-specialist.md +68 -0
- package/.mindforge/personas/communication-architect.md +102 -0
- package/.mindforge/personas/compliance-engineer.md +96 -0
- package/.mindforge/personas/consensus-engineer.md +116 -0
- package/.mindforge/personas/contract-tester.md +60 -192
- package/.mindforge/personas/data-architect.md +108 -0
- package/.mindforge/personas/data-mesh-architect.md +57 -0
- package/.mindforge/personas/data-pipeline-architect.md +120 -0
- package/.mindforge/personas/de-sloppifier.md +60 -0
- package/.mindforge/personas/debt-manager.md +66 -0
- package/.mindforge/personas/decision-architect.md +82 -51
- package/.mindforge/personas/deployment-captain.md +74 -0
- package/.mindforge/personas/design-system-lead.md +112 -0
- package/.mindforge/personas/dmux-orchestrator.md +75 -0
- package/.mindforge/personas/dx-engineer.md +96 -0
- package/.mindforge/personas/ecommerce-engineer.md +57 -0
- package/.mindforge/personas/edge-engineer.md +94 -0
- package/.mindforge/personas/edtech-architect.md +106 -0
- package/.mindforge/personas/embedding-architect.md +57 -0
- package/.mindforge/personas/environment-engineer.md +57 -0
- package/.mindforge/personas/eval-judge.md +55 -0
- package/.mindforge/personas/event-architect.md +102 -0
- package/.mindforge/personas/experiment-designer.md +138 -0
- package/.mindforge/personas/feature-store-engineer.md +57 -0
- package/.mindforge/personas/finops-analyst.md +66 -0
- package/.mindforge/personas/fintech-architect.md +57 -0
- package/.mindforge/personas/flutter-engineer.md +104 -0
- package/.mindforge/personas/gaming-engineer.md +57 -0
- package/.mindforge/personas/graphql-designer.md +73 -0
- package/.mindforge/personas/healthcare-engineer.md +57 -0
- package/.mindforge/personas/hiring-strategist.md +105 -0
- package/.mindforge/personas/hitl-architect.md +165 -0
- package/.mindforge/personas/i18n-architect.md +69 -0
- package/.mindforge/personas/iot-architect.md +105 -0
- package/.mindforge/personas/knowledge-curator.md +139 -0
- package/.mindforge/personas/knowledge-engineer.md +57 -0
- package/.mindforge/personas/lakehouse-architect.md +57 -0
- package/.mindforge/personas/llm-orchestrator.md +57 -0
- package/.mindforge/personas/logistics-architect.md +106 -0
- package/.mindforge/personas/market-analyst.md +53 -0
- package/.mindforge/personas/marketplace-engineer.md +105 -0
- package/.mindforge/personas/mcp-designer.md +54 -0
- package/.mindforge/personas/meeting-designer.md +104 -0
- package/.mindforge/personas/mentorship-lead.md +106 -0
- package/.mindforge/personas/migration-architect.md +57 -0
- package/.mindforge/personas/ml-ops-engineer.md +101 -0
- package/.mindforge/personas/mobile-architect.md +105 -0
- package/.mindforge/personas/mobile-security-engineer.md +106 -0
- package/.mindforge/personas/multi-tenancy-architect.md +71 -0
- package/.mindforge/personas/multimodal-engineer.md +57 -0
- package/.mindforge/personas/offline-specialist.md +105 -0
- package/.mindforge/personas/onboarding-navigator.md +63 -0
- package/.mindforge/personas/payments-engineer.md +135 -0
- package/.mindforge/personas/pipeline-engineer.md +115 -0
- package/.mindforge/personas/platform-engineer.md +97 -0
- package/.mindforge/personas/platform-lead.md +57 -0
- package/.mindforge/personas/privacy-engineer.md +57 -0
- package/.mindforge/personas/product-owner.md +56 -0
- package/.mindforge/personas/productivity-analyst.md +57 -0
- package/.mindforge/personas/prompt-architect.md +101 -0
- package/.mindforge/personas/proofreader.md +53 -0
- package/.mindforge/personas/pwa-architect.md +105 -0
- package/.mindforge/personas/quality-scorer.md +63 -0
- package/.mindforge/personas/react-native-engineer.md +106 -0
- package/.mindforge/personas/resilience-engineer.md +69 -0
- package/.mindforge/personas/rfc-architect.md +64 -0
- package/.mindforge/personas/saga-orchestrator.md +80 -0
- package/.mindforge/personas/secrets-engineer.md +57 -0
- package/.mindforge/personas/skill-smith.md +79 -0
- package/.mindforge/personas/sre-lead.md +107 -0
- package/.mindforge/personas/stream-engineer.md +57 -0
- package/.mindforge/personas/streaming-engineer.md +64 -0
- package/.mindforge/personas/swarm-templates.json +674 -44
- package/.mindforge/personas/system-designer.md +57 -0
- package/.mindforge/personas/team-coach.md +120 -0
- package/.mindforge/personas/tech-lead-coach.md +103 -0
- package/.mindforge/personas/technical-writer-lead.md +111 -0
- package/.mindforge/personas/vibe-checker.md +75 -0
- package/.mindforge/personas/worktree-manager.md +56 -0
- package/.mindforge/personas/zero-trust-engineer.md +113 -0
- package/.mindforge/skills/a11y-testing/SKILL.md +143 -0
- package/.mindforge/skills/agent-evaluation-framework/SKILL.md +227 -0
- package/.mindforge/skills/agent-memory-design/SKILL.md +199 -0
- package/.mindforge/skills/agent-orchestration-patterns/SKILL.md +129 -0
- package/.mindforge/skills/agent-tool-selection/SKILL.md +204 -0
- package/.mindforge/skills/ai-agent-deployment/SKILL.md +176 -0
- package/.mindforge/skills/ai-cost-management/SKILL.md +57 -0
- package/.mindforge/skills/ai-safety-alignment/SKILL.md +53 -0
- package/.mindforge/skills/analytics-instrumentation/SKILL.md +172 -0
- package/.mindforge/skills/api-gateway-patterns/SKILL.md +177 -0
- package/.mindforge/skills/api-marketplace/SKILL.md +56 -0
- package/.mindforge/skills/api-versioning/SKILL.md +100 -0
- package/.mindforge/skills/app-store-deployment/SKILL.md +44 -0
- package/.mindforge/skills/architecture-tradeoff-analysis/SKILL.md +97 -0
- package/.mindforge/skills/audit-logging/SKILL.md +140 -0
- package/.mindforge/skills/auth-patterns/SKILL.md +148 -0
- package/.mindforge/skills/autonomous-agent-harness/SKILL.md +218 -0
- package/.mindforge/skills/autonomous-agents/SKILL.md +59 -0
- package/.mindforge/skills/build-system-optimization/SKILL.md +54 -0
- package/.mindforge/skills/build-vs-buy/SKILL.md +80 -0
- package/.mindforge/skills/bundle-optimization/SKILL.md +174 -0
- package/.mindforge/skills/business-analyst/SKILL.md +82 -0
- package/.mindforge/skills/caching-strategies/SKILL.md +132 -0
- package/.mindforge/skills/capacity-planning/SKILL.md +96 -0
- package/.mindforge/skills/causal-inference/SKILL.md +42 -0
- package/.mindforge/skills/cdn-optimization/SKILL.md +212 -0
- package/.mindforge/skills/change-management/SKILL.md +106 -0
- package/.mindforge/skills/chaos-engineering/SKILL.md +99 -0
- package/.mindforge/skills/ci-cd-pipeline/SKILL.md +118 -0
- package/.mindforge/skills/cli-design/SKILL.md +118 -0
- package/.mindforge/skills/code-generation-patterns/SKILL.md +92 -0
- package/.mindforge/skills/code-review-methodology/SKILL.md +180 -0
- package/.mindforge/skills/code-tour/SKILL.md +145 -0
- package/.mindforge/skills/codebase-onboarding/SKILL.md +95 -0
- package/.mindforge/skills/compliance-as-code/SKILL.md +195 -0
- package/.mindforge/skills/conflict-resolution/SKILL.md +87 -0
- package/.mindforge/skills/connection-pooling/SKILL.md +151 -0
- package/.mindforge/skills/container-security/SKILL.md +151 -0
- package/.mindforge/skills/context-engineering/SKILL.md +114 -0
- package/.mindforge/skills/contract-testing/SKILL.md +85 -0
- package/.mindforge/skills/cost-estimation/SKILL.md +82 -0
- package/.mindforge/skills/cqrs-event-sourcing/SKILL.md +95 -0
- package/.mindforge/skills/cross-platform-testing/SKILL.md +43 -0
- package/.mindforge/skills/data-governance/SKILL.md +42 -0
- package/.mindforge/skills/data-lakehouse/SKILL.md +42 -0
- package/.mindforge/skills/data-mesh/SKILL.md +42 -0
- package/.mindforge/skills/data-modeling/SKILL.md +107 -0
- package/.mindforge/skills/data-pipeline-design/SKILL.md +171 -0
- package/.mindforge/skills/data-privacy-engineering/SKILL.md +42 -0
- package/.mindforge/skills/database-performance/SKILL.md +174 -0
- package/.mindforge/skills/database-sharding-advanced/SKILL.md +206 -0
- package/.mindforge/skills/de-sloppify/SKILL.md +120 -0
- package/.mindforge/skills/defense-in-depth/SKILL.md +84 -0
- package/.mindforge/skills/delegation-patterns/SKILL.md +123 -0
- package/.mindforge/skills/dependency-management/SKILL.md +94 -0
- package/.mindforge/skills/deployment-workflow/SKILL.md +135 -0
- package/.mindforge/skills/design-system/SKILL.md +113 -0
- package/.mindforge/skills/developer-onboarding/SKILL.md +99 -0
- package/.mindforge/skills/developer-productivity-metrics/SKILL.md +59 -0
- package/.mindforge/skills/distributed-consensus/SKILL.md +141 -0
- package/.mindforge/skills/dmux-workflows/SKILL.md +141 -0
- package/.mindforge/skills/dns-architecture/SKILL.md +167 -0
- package/.mindforge/skills/ecommerce-architecture/SKILL.md +41 -0
- package/.mindforge/skills/edge-computing/SKILL.md +91 -0
- package/.mindforge/skills/edtech-platform/SKILL.md +41 -0
- package/.mindforge/skills/email-deliverability/SKILL.md +177 -0
- package/.mindforge/skills/embedding-systems/SKILL.md +55 -0
- package/.mindforge/skills/environment-management/SKILL.md +54 -0
- package/.mindforge/skills/error-handling-architecture/SKILL.md +118 -0
- package/.mindforge/skills/estimation-techniques/SKILL.md +113 -0
- package/.mindforge/skills/eval-harness/SKILL.md +180 -0
- package/.mindforge/skills/event-driven-architecture/SKILL.md +162 -0
- package/.mindforge/skills/experiment-design/SKILL.md +139 -0
- package/.mindforge/skills/experiment-platform/SKILL.md +43 -0
- package/.mindforge/skills/feature-engineering/SKILL.md +42 -0
- package/.mindforge/skills/feature-flag-management/SKILL.md +183 -0
- package/.mindforge/skills/fine-tuning-workflow/SKILL.md +189 -0
- package/.mindforge/skills/fintech-patterns/SKILL.md +41 -0
- package/.mindforge/skills/flutter-architecture/SKILL.md +42 -0
- package/.mindforge/skills/gaming-backend/SKILL.md +41 -0
- package/.mindforge/skills/git-workflow-design/SKILL.md +129 -0
- package/.mindforge/skills/graceful-degradation/SKILL.md +95 -0
- package/.mindforge/skills/graphql-patterns/SKILL.md +243 -0
- package/.mindforge/skills/guardrails-and-safety/SKILL.md +137 -0
- package/.mindforge/skills/healthcare-systems/SKILL.md +40 -0
- package/.mindforge/skills/hiring-engineering/SKILL.md +119 -0
- package/.mindforge/skills/human-in-the-loop-design/SKILL.md +234 -0
- package/.mindforge/skills/i18n-architecture/SKILL.md +147 -0
- package/.mindforge/skills/idempotency-patterns/SKILL.md +84 -0
- package/.mindforge/skills/incident-communication/SKILL.md +96 -0
- package/.mindforge/skills/incident-management/SKILL.md +97 -0
- package/.mindforge/skills/infrastructure-as-code/SKILL.md +98 -0
- package/.mindforge/skills/instinct-clustering/SKILL.md +190 -0
- package/.mindforge/skills/internal-developer-platform/SKILL.md +51 -0
- package/.mindforge/skills/iot-platform/SKILL.md +41 -0
- package/.mindforge/skills/k8s-deployment/SKILL.md +358 -0
- package/.mindforge/skills/knowledge-graphs/SKILL.md +56 -0
- package/.mindforge/skills/knowledge-sharing-systems/SKILL.md +112 -0
- package/.mindforge/skills/llm-cost-optimization/SKILL.md +198 -0
- package/.mindforge/skills/llm-orchestration/SKILL.md +56 -0
- package/.mindforge/skills/load-testing/SKILL.md +84 -0
- package/.mindforge/skills/logistics-optimization/SKILL.md +40 -0
- package/.mindforge/skills/market-researcher/SKILL.md +99 -0
- package/.mindforge/skills/marketplace-trust/SKILL.md +40 -0
- package/.mindforge/skills/mcp-server-patterns/SKILL.md +264 -0
- package/.mindforge/skills/media-streaming/SKILL.md +41 -0
- package/.mindforge/skills/meeting-architecture/SKILL.md +146 -0
- package/.mindforge/skills/mentoring-patterns/SKILL.md +77 -0
- package/.mindforge/skills/microservices-patterns/SKILL.md +83 -0
- package/.mindforge/skills/migration-platform/SKILL.md +61 -0
- package/.mindforge/skills/migration-strategies/SKILL.md +129 -0
- package/.mindforge/skills/ml-feature-store/SKILL.md +56 -0
- package/.mindforge/skills/ml-monitoring/SKILL.md +42 -0
- package/.mindforge/skills/mobile-performance/SKILL.md +44 -0
- package/.mindforge/skills/mobile-security/SKILL.md +45 -0
- package/.mindforge/skills/model-evaluation/SKILL.md +53 -0
- package/.mindforge/skills/monorepo-management/SKILL.md +100 -0
- package/.mindforge/skills/multi-tenancy-patterns/SKILL.md +145 -0
- package/.mindforge/skills/multi-turn-conversation-design/SKILL.md +206 -0
- package/.mindforge/skills/multimodal-ai/SKILL.md +51 -0
- package/.mindforge/skills/mutation-testing/SKILL.md +97 -0
- package/.mindforge/skills/notification-system-design/SKILL.md +168 -0
- package/.mindforge/skills/observability-stack/SKILL.md +136 -0
- package/.mindforge/skills/offline-first-design/SKILL.md +43 -0
- package/.mindforge/skills/on-call-design/SKILL.md +111 -0
- package/.mindforge/skills/pagination-patterns/SKILL.md +230 -0
- package/.mindforge/skills/payment-integration/SKILL.md +176 -0
- package/.mindforge/skills/performance-reviews/SKILL.md +140 -0
- package/.mindforge/skills/platform-observability/SKILL.md +58 -0
- package/.mindforge/skills/platform-reliability/SKILL.md +52 -0
- package/.mindforge/skills/post-incident-learning/SKILL.md +96 -0
- package/.mindforge/skills/product-manager/SKILL.md +104 -0
- package/.mindforge/skills/progressive-web-app/SKILL.md +44 -0
- package/.mindforge/skills/prompt-engineering/SKILL.md +94 -0
- package/.mindforge/skills/proofreader/SKILL.md +158 -0
- package/.mindforge/skills/push-notification-architecture/SKILL.md +45 -0
- package/.mindforge/skills/python-performance/SKILL.md +183 -0
- package/.mindforge/skills/quality-audit/SKILL.md +171 -0
- package/.mindforge/skills/queue-design/SKILL.md +85 -0
- package/.mindforge/skills/rag-architecture/SKILL.md +176 -0
- package/.mindforge/skills/rate-limiting-design/SKILL.md +94 -0
- package/.mindforge/skills/react-native-patterns/SKILL.md +42 -0
- package/.mindforge/skills/react-performance/SKILL.md +229 -0
- package/.mindforge/skills/real-time-analytics/SKILL.md +42 -0
- package/.mindforge/skills/real-time-sync/SKILL.md +83 -0
- package/.mindforge/skills/responsive-native/SKILL.md +44 -0
- package/.mindforge/skills/responsive-patterns/SKILL.md +141 -0
- package/.mindforge/skills/rfc-pipeline/SKILL.md +114 -0
- package/.mindforge/skills/saas-multi-tenant/SKILL.md +41 -0
- package/.mindforge/skills/santa-method/SKILL.md +134 -0
- package/.mindforge/skills/search-implementation/SKILL.md +98 -0
- package/.mindforge/skills/secrets-platform/SKILL.md +56 -0
- package/.mindforge/skills/secrets-rotation/SKILL.md +173 -0
- package/.mindforge/skills/self-serve-infrastructure/SKILL.md +51 -0
- package/.mindforge/skills/serverless-patterns/SKILL.md +119 -0
- package/.mindforge/skills/skill-creator-meta/SKILL.md +146 -0
- package/.mindforge/skills/sprint-retrospective-facilitation/SKILL.md +112 -0
- package/.mindforge/skills/stakeholder-communication/SKILL.md +85 -0
- package/.mindforge/skills/state-management/SKILL.md +104 -0
- package/.mindforge/skills/stream-processing/SKILL.md +43 -0
- package/.mindforge/skills/streaming-architecture/SKILL.md +81 -0
- package/.mindforge/skills/supply-chain-security/SKILL.md +145 -0
- package/.mindforge/skills/synthetic-data-generation/SKILL.md +52 -0
- package/.mindforge/skills/system-design/SKILL.md +88 -0
- package/.mindforge/skills/team-topology-design/SKILL.md +107 -0
- package/.mindforge/skills/technical-debt-management/SKILL.md +86 -0
- package/.mindforge/skills/technical-interview-design/SKILL.md +98 -0
- package/.mindforge/skills/technical-leadership/SKILL.md +75 -0
- package/.mindforge/skills/technical-writing/SKILL.md +237 -0
- package/.mindforge/skills/technology-radar/SKILL.md +88 -0
- package/.mindforge/skills/testing-anti-patterns/SKILL.md +288 -0
- package/.mindforge/skills/tool-design/SKILL.md +138 -0
- package/.mindforge/skills/typescript-advanced/SKILL.md +198 -0
- package/.mindforge/skills/using-git-worktrees/SKILL.md +139 -0
- package/.mindforge/skills/verification-loop/SKILL.md +13 -1
- package/.mindforge/skills/vibe-security/SKILL.md +165 -0
- package/.mindforge/skills/visual-regression-testing/SKILL.md +97 -0
- package/.mindforge/skills/websocket-patterns/SKILL.md +203 -0
- package/.mindforge/skills/writing-plans/SKILL.md +170 -0
- package/.mindforge/skills/writing-skills/SKILL.md +216 -0
- package/.mindforge/skills/zero-trust-architecture/SKILL.md +166 -0
- package/CHANGELOG.md +240 -0
- package/MINDFORGE.md +4 -4
- package/README.md +49 -4
- package/RELEASENOTES.md +80 -0
- package/SECURITY.md +20 -8
- package/bin/autonomous/audit-writer.js +13 -0
- package/bin/autonomous/auto-runner.js +74 -16
- package/bin/autonomous/context-refactorer.js +26 -11
- package/bin/autonomous/state-manager.js +62 -6
- package/bin/autonomous/stuck-monitor.js +46 -7
- package/bin/autonomous/wave-executor.js +66 -25
- package/bin/dashboard/api-router.js +43 -0
- package/bin/dashboard/metrics-aggregator.js +28 -1
- package/bin/dashboard/server.js +67 -4
- package/bin/dashboard/sse-bridge.js +4 -4
- package/bin/engine/feedback-loop.js +8 -0
- package/bin/engine/intelligence-interlock.js +32 -15
- package/bin/engine/logic-drift-detector.js +2 -1
- package/bin/engine/nexus-tracer.js +3 -2
- package/bin/engine/remediation-engine.js +155 -32
- package/bin/engine/self-corrective-synthesizer.js +84 -10
- package/bin/engine/sre-manager.js +12 -4
- package/bin/engine/temporal-hub.js +131 -34
- package/bin/governance/approve.js +41 -5
- package/bin/governance/impact-analyzer.js +28 -0
- package/bin/governance/policy-engine.js +10 -3
- package/bin/governance/quantum-crypto.js +32 -19
- package/bin/governance/rbac-manager.js +74 -2
- package/bin/governance/ztai-manager.js +49 -7
- package/bin/hindsight-injector.js +3 -3
- package/bin/memory/eis-client.js +71 -34
- package/bin/memory/embedding-engine.js +61 -0
- package/bin/memory/knowledge-graph.js +58 -5
- package/bin/memory/knowledge-indexer.js +53 -6
- package/bin/memory/knowledge-store.js +22 -0
- package/bin/migrations/10.7.0-to-11.0.0.js +110 -0
- package/bin/migrations/schema-versions.js +13 -0
- package/bin/models/anthropic-provider.js +45 -0
- package/bin/models/cloud-broker.js +68 -20
- package/bin/models/gemini-provider.js +51 -0
- package/bin/models/model-client.js +20 -0
- package/bin/models/model-router.js +28 -8
- package/bin/models/openai-provider.js +44 -0
- package/bin/utils/file-io.js +63 -1
- package/bin/utils/index.js +58 -0
- package/docs/getting-started.md +1 -1
- package/docs/user-guide.md +2 -2
- package/package.json +2 -2
- package/.mindforge/personas/data-privacy-engineer.md +0 -187
|
@@ -1,14 +1,51 @@
|
|
|
1
1
|
{
|
|
2
2
|
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
3
|
"title": "MindForge v2 Autonomous Engine Schema",
|
|
4
|
-
"description": "Schema for HANDOFF.json and auto-state.json in v2.0.0-alpha.1",
|
|
4
|
+
"description": "Schema for HANDOFF.json and auto-state.json in v2.0.0-alpha.1+",
|
|
5
5
|
"type": "object",
|
|
6
6
|
"properties": {
|
|
7
|
-
"schema_version": { "type": "string"
|
|
7
|
+
"schema_version": { "type": "string" },
|
|
8
|
+
"schema_type": { "type": "string", "enum": ["HANDOFF", "AUTO_STATE"] },
|
|
8
9
|
"auto_mode_active": { "type": "boolean" },
|
|
9
|
-
"phase": { "type": "integer" },
|
|
10
|
-
"wave_current": { "type": "integer" },
|
|
11
|
-
"tasks_completed": { "type": "integer" },
|
|
10
|
+
"phase": { "oneOf": [{ "type": "integer" }, { "type": "string" }] },
|
|
11
|
+
"wave_current": { "type": "integer", "minimum": 0 },
|
|
12
|
+
"tasks_completed": { "type": "integer", "minimum": 0 },
|
|
13
|
+
"status": {
|
|
14
|
+
"type": "string",
|
|
15
|
+
"enum": ["idle", "running", "paused", "completed", "escalated", "timeout"]
|
|
16
|
+
},
|
|
17
|
+
"handoffs": {
|
|
18
|
+
"type": "array",
|
|
19
|
+
"items": {
|
|
20
|
+
"type": "object",
|
|
21
|
+
"properties": {
|
|
22
|
+
"id": { "type": "string" },
|
|
23
|
+
"name": { "type": "string" },
|
|
24
|
+
"plan": { "type": "string" },
|
|
25
|
+
"depends_on": {
|
|
26
|
+
"type": "array",
|
|
27
|
+
"items": { "type": "string" }
|
|
28
|
+
},
|
|
29
|
+
"wave": { "type": "integer", "minimum": 0 }
|
|
30
|
+
},
|
|
31
|
+
"required": ["id", "name"]
|
|
32
|
+
}
|
|
33
|
+
},
|
|
34
|
+
"timestamps": {
|
|
35
|
+
"type": "object",
|
|
36
|
+
"properties": {
|
|
37
|
+
"started_at": { "type": "string", "format": "date-time" },
|
|
38
|
+
"updated_at": { "type": "string", "format": "date-time" }
|
|
39
|
+
}
|
|
40
|
+
},
|
|
41
|
+
"next_task": { "type": "string" },
|
|
42
|
+
"context_refs": { "type": "array", "items": { "type": "string" } },
|
|
43
|
+
"blockers": { "type": "array", "items": { "type": "string" } },
|
|
44
|
+
"decisions_needed": { "type": "array", "items": { "type": "string" } },
|
|
45
|
+
"recent_commits": { "type": "array", "items": { "type": "string" } },
|
|
46
|
+
"recent_files": { "type": "array", "items": { "type": "string" } },
|
|
47
|
+
"current_context": { "type": "string" },
|
|
48
|
+
"last_updated": { "type": "string", "format": "date-time" },
|
|
12
49
|
"PLANNER_MODEL": { "type": "string" },
|
|
13
50
|
"EXECUTOR_MODEL": { "type": "string" },
|
|
14
51
|
"REVIEWER_MODEL": { "type": "string" },
|
|
@@ -21,10 +58,6 @@
|
|
|
21
58
|
"MODEL_COST_HARD_LIMIT_USD": { "type": "number" },
|
|
22
59
|
"MODEL_PREFER_CHEAP_BELOW_DIFFICULTY": { "type": "number" },
|
|
23
60
|
"REQUIRE_CROSS_REVIEW": { "type": "boolean" },
|
|
24
|
-
"status": {
|
|
25
|
-
"type": "string",
|
|
26
|
-
"enum": ["idle", "running", "paused", "completed", "escalated", "timeout"]
|
|
27
|
-
},
|
|
28
61
|
"governance": {
|
|
29
62
|
"type": "object",
|
|
30
63
|
"properties": {
|
|
@@ -43,5 +76,5 @@
|
|
|
43
76
|
}
|
|
44
77
|
}
|
|
45
78
|
},
|
|
46
|
-
"required": ["schema_version"
|
|
79
|
+
"required": ["schema_version"]
|
|
47
80
|
}
|
package/.mindforge/config.json
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "10.0
|
|
2
|
+
"version": "10.7.0",
|
|
3
3
|
"environment": "development",
|
|
4
4
|
"governance": {
|
|
5
5
|
"drift_threshold": 0.75,
|
|
6
6
|
"critical_drift_threshold": 0.5,
|
|
7
7
|
"res_threshold": 0.8,
|
|
8
|
-
"active_did": "did:mindforge:
|
|
8
|
+
"active_did": "did:mindforge:da5daf83-c478-490f-b528-bd907ad4eee3"
|
|
9
9
|
},
|
|
10
10
|
"revops": {
|
|
11
11
|
"market_registry": {
|
|
@@ -69,6 +69,11 @@
|
|
|
69
69
|
"max_drift_threshold": 0.1,
|
|
70
70
|
"auto_verify": false
|
|
71
71
|
},
|
|
72
|
+
"wave_concurrency": 3,
|
|
73
|
+
"temporal": {
|
|
74
|
+
"max_snapshots": 50,
|
|
75
|
+
"max_age_days": 7
|
|
76
|
+
},
|
|
72
77
|
"instincts": {
|
|
73
78
|
"mode": "auto-capture",
|
|
74
79
|
"max_active_per_project": 100,
|
|
@@ -110,5 +115,28 @@
|
|
|
110
115
|
"project_weekly_hard_limit_usd": 200
|
|
111
116
|
},
|
|
112
117
|
"ledger_path": ".mindforge/metrics/token-ledger.jsonl"
|
|
118
|
+
},
|
|
119
|
+
"proactive_suggestions": {
|
|
120
|
+
"enabled": true,
|
|
121
|
+
"confidence_threshold": 0.7,
|
|
122
|
+
"cooldown_seconds": 300,
|
|
123
|
+
"debounce_seconds": 30,
|
|
124
|
+
"max_recent": 50,
|
|
125
|
+
"store_path": ".mindforge/engine/proactive/recent-suggestions.json"
|
|
126
|
+
},
|
|
127
|
+
"eval": {
|
|
128
|
+
"default_k": 5,
|
|
129
|
+
"evals_path": ".mindforge/evals/",
|
|
130
|
+
"default_grader": "code"
|
|
131
|
+
},
|
|
132
|
+
"quality_audit": {
|
|
133
|
+
"passing_threshold": 3,
|
|
134
|
+
"accuracy_blocking_gate": 3,
|
|
135
|
+
"weights": {
|
|
136
|
+
"clarity": 0.25,
|
|
137
|
+
"completeness": 0.25,
|
|
138
|
+
"accuracy": 0.3,
|
|
139
|
+
"usefulness": 0.2
|
|
140
|
+
}
|
|
113
141
|
}
|
|
114
142
|
}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Cross-Model Eval — Multi-Model Comparison Protocol
|
|
2
|
+
|
|
3
|
+
## Purpose
|
|
4
|
+
Route the same task to two different models and compare outputs. Divergence
|
|
5
|
+
between models is a quality signal; agreement is a confidence booster.
|
|
6
|
+
|
|
7
|
+
## When to Trigger
|
|
8
|
+
- Architecture decisions (high stakes, hard to reverse)
|
|
9
|
+
- Security-critical code (auth, payment, PII handling)
|
|
10
|
+
- Agent confidence < 0.7 on current approach
|
|
11
|
+
- User explicitly requests second opinion (via /mindforge:consult)
|
|
12
|
+
- Eval-harness model-grader needs calibration
|
|
13
|
+
|
|
14
|
+
## Model Selection Logic
|
|
15
|
+
|
|
16
|
+
| Primary Model | Comparison Model | Rationale |
|
|
17
|
+
|--------------|-----------------|-----------|
|
|
18
|
+
| claude-sonnet-4-6 | gemini-2.5-pro | Different training, different strengths |
|
|
19
|
+
| claude-opus-4-7 | gpt-4o | Independent validation of complex reasoning |
|
|
20
|
+
| gemini-2.5-pro | claude-sonnet-4-6 | Verify research findings independently |
|
|
21
|
+
|
|
22
|
+
Selection follows the cost-routing tier: comparison model is always from a DIFFERENT provider than the primary.
|
|
23
|
+
|
|
24
|
+
## Comparison Method
|
|
25
|
+
|
|
26
|
+
### Step 1 — Sanitize Context
|
|
27
|
+
Same sanitization as multi-llm-consult skill:
|
|
28
|
+
- Remove internal file paths, variable names, proprietary logic
|
|
29
|
+
- Keep abstract question and public references
|
|
30
|
+
|
|
31
|
+
### Step 2 — Parallel Dispatch
|
|
32
|
+
Send identical sanitized prompt to both models simultaneously.
|
|
33
|
+
|
|
34
|
+
### Step 3 — Structural Comparison
|
|
35
|
+
Compare responses structurally (not token-by-token):
|
|
36
|
+
- Do they recommend the same approach/pattern?
|
|
37
|
+
- Do they identify the same risks?
|
|
38
|
+
- Do they agree on the key trade-offs?
|
|
39
|
+
|
|
40
|
+
### Step 4 — Divergence Classification
|
|
41
|
+
|
|
42
|
+
| Agreement Level | Meaning | Action |
|
|
43
|
+
|----------------|---------|--------|
|
|
44
|
+
| Full agreement | Both recommend same approach with same reasoning | High confidence — proceed |
|
|
45
|
+
| Partial agreement | Same recommendation, different reasoning | Moderate confidence — note alternate reasoning |
|
|
46
|
+
| Approach divergence | Different recommendations, shared concerns | Flag for human review with both perspectives |
|
|
47
|
+
| Full divergence | Different recommendations, different concerns | STOP — present both to user, defer decision |
|
|
48
|
+
|
|
49
|
+
### Step 5 — Output
|
|
50
|
+
Log the result as an AUDIT entry:
|
|
51
|
+
```json
|
|
52
|
+
{
|
|
53
|
+
"event": "cross_model_eval",
|
|
54
|
+
"primary_model": "claude-sonnet-4-6",
|
|
55
|
+
"comparison_model": "gemini-2.5-pro",
|
|
56
|
+
"agreement_level": "partial",
|
|
57
|
+
"primary_recommendation": "...",
|
|
58
|
+
"comparison_recommendation": "...",
|
|
59
|
+
"divergence_points": ["..."],
|
|
60
|
+
"action_taken": "proceed_with_note"
|
|
61
|
+
}
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Budget Guard
|
|
65
|
+
- Maximum 2 automatically triggered cross-model evals per session (expensive operation)
|
|
66
|
+
- Each eval costs ~2x a normal model call
|
|
67
|
+
- Only trigger automatically on high-stakes decisions (not routine tasks)
|
|
68
|
+
- User can always override via /mindforge:consult (manual, no limit)
|
|
69
|
+
|
|
70
|
+
## Integration Points
|
|
71
|
+
- Cost-routing module determines which comparison model to use
|
|
72
|
+
- Multi-LLM consult skill handles the actual external dispatch
|
|
73
|
+
- Token-ledger records both model calls
|
|
74
|
+
- Council framework may trigger cross-model eval when consensus < 0.5
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Proactive Skill Suggestion — Signal Detector
|
|
2
|
+
|
|
3
|
+
## Purpose
|
|
4
|
+
Detect contextual signals that indicate a skill should be suggested to the agent,
|
|
5
|
+
even if the user hasn't explicitly mentioned trigger keywords.
|
|
6
|
+
|
|
7
|
+
## Signal Categories
|
|
8
|
+
|
|
9
|
+
### 1. File Signals
|
|
10
|
+
Detect skills based on files being opened, modified, or referenced:
|
|
11
|
+
|
|
12
|
+
| File Pattern | Suggested Skill | Confidence |
|
|
13
|
+
|-------------|----------------|-----------|
|
|
14
|
+
| `*.test.*`, `*.spec.*`, `__tests__/` | testing-anti-patterns | 0.75 |
|
|
15
|
+
| `ONBOARDING*`, new git clone detected | codebase-onboarding | 0.9 |
|
|
16
|
+
| `CLEANUP-REPORT*`, post-merge diff | de-sloppify | 0.8 |
|
|
17
|
+
| `.mindforge/evals/` | eval-harness | 0.85 |
|
|
18
|
+
| `RFC-*.md`, `SPEC-*.md` | rfc-pipeline | 0.8 |
|
|
19
|
+
| `auth*`, `login*`, `payment*` | defense-in-depth | 0.75 |
|
|
20
|
+
| `THREAT-MODEL-*` | threat-modeling | 0.85 |
|
|
21
|
+
| `COUNCIL-*` in decisions/ | council | 0.8 |
|
|
22
|
+
|
|
23
|
+
### 2. Error Signals
|
|
24
|
+
Detect skills based on error patterns in build/test output:
|
|
25
|
+
|
|
26
|
+
| Error Pattern | Suggested Skill | Confidence |
|
|
27
|
+
|--------------|----------------|-----------|
|
|
28
|
+
| Mock-related test failures (3+) | testing-anti-patterns | 0.8 |
|
|
29
|
+
| Type errors in test files | testing-anti-patterns | 0.7 |
|
|
30
|
+
| Security scan findings (medium+) | defense-in-depth | 0.85 |
|
|
31
|
+
| Build failures after merge | verification-loop | 0.9 |
|
|
32
|
+
| Token budget warnings | cost-aware-routing | 0.8 |
|
|
33
|
+
|
|
34
|
+
### 3. Task Signals
|
|
35
|
+
Detect skills based on task description or conversation patterns:
|
|
36
|
+
|
|
37
|
+
| Task Pattern | Suggested Skill | Confidence |
|
|
38
|
+
|-------------|----------------|-----------|
|
|
39
|
+
| "review", "check", "verify" + completed work | santa-method | 0.75 |
|
|
40
|
+
| "score", "grade", "evaluate" | eval-harness | 0.8 |
|
|
41
|
+
| "cleanup", "polish", "finalize" | de-sloppify | 0.85 |
|
|
42
|
+
| "new project", "unfamiliar", "first time" | codebase-onboarding | 0.9 |
|
|
43
|
+
| "plan", "decompose", "break down spec" | rfc-pipeline | 0.8 |
|
|
44
|
+
| "quality", "how good", "assess" | quality-audit | 0.8 |
|
|
45
|
+
|
|
46
|
+
## Signal Processing Rules
|
|
47
|
+
|
|
48
|
+
1. **Single signal sufficiency** — One signal at or above the confidence threshold is enough to trigger a suggestion
|
|
49
|
+
2. **Signal stacking** — Multiple signals for the same skill boost confidence: `combined = 1 - ((1 - s1) * (1 - s2))`
|
|
50
|
+
3. **No interruption** — Suggestions queue silently; presented only at natural breakpoints
|
|
51
|
+
4. **Context freshness** — File signals expire after 5 minutes of inactivity on that file
|
|
52
|
+
5. **Session memory** — Track which skills were already loaded this session; don't re-suggest
|
|
53
|
+
|
|
54
|
+
## Integration with Loader
|
|
55
|
+
|
|
56
|
+
The signal detector works ALONGSIDE the trigger-based loader, not replacing it:
|
|
57
|
+
- **Loader** = reactive (matches on explicit trigger keywords in task description)
|
|
58
|
+
- **Signal detector** = proactive (observes context and suggests before explicit mention)
|
|
59
|
+
|
|
60
|
+
If the loader has already loaded a skill, the signal detector suppresses its suggestion for that skill.
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# Proactive Skill Suggestion — Suggestion Engine
|
|
2
|
+
|
|
3
|
+
## Purpose
|
|
4
|
+
Manage the lifecycle of skill suggestions: confidence gating, cooldown enforcement,
|
|
5
|
+
deduplication, debounce, and user feedback integration.
|
|
6
|
+
|
|
7
|
+
## Configuration (from config.json)
|
|
8
|
+
|
|
9
|
+
```json
|
|
10
|
+
{
|
|
11
|
+
"proactive_suggestions": {
|
|
12
|
+
"enabled": true,
|
|
13
|
+
"confidence_threshold": 0.7,
|
|
14
|
+
"cooldown_seconds": 300,
|
|
15
|
+
"debounce_seconds": 30,
|
|
16
|
+
"max_recent": 50,
|
|
17
|
+
"store_path": ".mindforge/engine/proactive/recent-suggestions.json"
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Suggestion Lifecycle
|
|
23
|
+
|
|
24
|
+
### Step 1 — Signal Received
|
|
25
|
+
Signal detector emits: `{ skill: string, confidence: number, reason: string, signal_type: string }`
|
|
26
|
+
|
|
27
|
+
### Step 2 — Confidence Gate
|
|
28
|
+
- If `confidence < threshold` (0.7): discard silently
|
|
29
|
+
- If `confidence >= threshold`: proceed to Step 3
|
|
30
|
+
|
|
31
|
+
### Step 3 — Cooldown Check
|
|
32
|
+
- Read dismissals from `.mindforge/engine/proactive/dismissals.json`
|
|
33
|
+
- If this `skill:signal_type` pair was dismissed within `cooldown_seconds` (300s): suppress
|
|
34
|
+
- Cooldown format: `{ "skill:signal_type": timestamp_ms }`
|
|
35
|
+
|
|
36
|
+
### Step 4 — Debounce
|
|
37
|
+
- If ANY suggestion was presented within `debounce_seconds` (30s): queue, don't present
|
|
38
|
+
- Queue is FIFO; oldest suggestion presented first after debounce expires
|
|
39
|
+
|
|
40
|
+
### Step 5 — Deduplication
|
|
41
|
+
- Check if skill is already loaded in current session (from loader)
|
|
42
|
+
- Check if same suggestion was already presented this session
|
|
43
|
+
- If either: discard
|
|
44
|
+
|
|
45
|
+
### Step 6 — Present Suggestion
|
|
46
|
+
Format for agent context:
|
|
47
|
+
```
|
|
48
|
+
💡 Proactive suggestion: Load **[skill-name]** skill
|
|
49
|
+
Reason: [reason from signal]
|
|
50
|
+
Confidence: [0.XX]
|
|
51
|
+
Action: Load this skill? [accept/dismiss]
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Step 7 — User Response
|
|
55
|
+
- **Accept**: Load the skill via standard loader pipeline
|
|
56
|
+
- **Dismiss**: Record in `dismissals.json` with timestamp, start cooldown
|
|
57
|
+
|
|
58
|
+
## Storage
|
|
59
|
+
|
|
60
|
+
### recent-suggestions.json (circular buffer, max 50)
|
|
61
|
+
```json
|
|
62
|
+
[
|
|
63
|
+
{
|
|
64
|
+
"skill": "testing-anti-patterns",
|
|
65
|
+
"confidence": 0.8,
|
|
66
|
+
"signal_type": "error",
|
|
67
|
+
"reason": "3+ mock-related test failures detected",
|
|
68
|
+
"timestamp": "2026-05-26T10:30:00Z",
|
|
69
|
+
"outcome": "accepted"
|
|
70
|
+
}
|
|
71
|
+
]
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### dismissals.json
|
|
75
|
+
```json
|
|
76
|
+
{
|
|
77
|
+
"testing-anti-patterns:error": 1748262600000,
|
|
78
|
+
"de-sloppify:task": 1748262300000
|
|
79
|
+
}
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Metrics
|
|
83
|
+
|
|
84
|
+
Track suggestion effectiveness:
|
|
85
|
+
- **Acceptance rate**: accepted / (accepted + dismissed) — target > 60%
|
|
86
|
+
- **Relevance rate**: accepted suggestions that led to skill activation / total accepted
|
|
87
|
+
- **False positive rate**: dismissed / total presented — target < 40%
|
|
88
|
+
|
|
89
|
+
Report in `/mindforge:status` output:
|
|
90
|
+
```
|
|
91
|
+
Proactive suggestions: 12 presented | 8 accepted (67%) | 4 dismissed
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Disable Conditions
|
|
95
|
+
|
|
96
|
+
Suggestions are automatically disabled when:
|
|
97
|
+
- `config.json` has `proactive_suggestions.enabled: false`
|
|
98
|
+
- Session is in autonomous mode (too noisy)
|
|
99
|
+
- Agent is in a time-critical path (shipping, hotfix)
|
|
100
|
+
- Budget is in economy mode (avoid context overhead)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: mindforge-agent-architect
|
|
3
|
+
description: Designs autonomous agent loops, planning systems, and tool orchestration for agentic AI systems.
|
|
4
|
+
tools: Read, Write, Bash, Grep, Glob
|
|
5
|
+
color: autonomous-violet
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
<role>
|
|
9
|
+
You are the MindForge Agent Architect. You design autonomous AI agents that plan multi-step tasks, use tools intelligently, and adapt to failures. Your systems bridge the gap between language model capabilities and real-world task execution through robust planning, execution monitoring, and error recovery.
|
|
10
|
+
</role>
|
|
11
|
+
|
|
12
|
+
<why_this_matters>
|
|
13
|
+
- Agents unlock AI capabilities beyond single-shot responses (complex tasks require planning, tool use, and iteration)
|
|
14
|
+
- Poorly designed agents create runaway costs (infinite loops, redundant tool calls) and safety risks (uncontrolled actions)
|
|
15
|
+
- You depend on `llm-orchestrator` for model selection across planning vs execution phases
|
|
16
|
+
- The `ai-safety-engineer` must approve all production agent tool access to prevent harm
|
|
17
|
+
- Your planning algorithms determine whether `ai-economist` sees controlled costs or exponential budget burn
|
|
18
|
+
</why_this_matters>
|
|
19
|
+
|
|
20
|
+
<philosophy>
|
|
21
|
+
**Planning Is Cheap, Execution Is Expensive:**
|
|
22
|
+
Spend 10 LLM calls on careful planning to save 100 tool executions. Front-load reasoning: decompose tasks into subtasks, validate approach feasibility, identify required tools, and estimate steps before executing anything. A bad plan costs more to fix than the time it would have taken to plan properly upfront.
|
|
23
|
+
|
|
24
|
+
**Agents Must Explain Themselves:**
|
|
25
|
+
Every agent decision should be explainable and auditable. Log: planned approach (why these subtasks?), tool call reasoning (why this tool now?), execution observations (what happened?), and adaptation decisions (why change plan?). Enable humans to interrupt, steer, and learn from agents. Opacity breeds distrust.
|
|
26
|
+
|
|
27
|
+
**Fail-Fast With Circuit Breakers:**
|
|
28
|
+
Agents can spiral: stuck in loops, making redundant calls, ignoring failures. Implement hard limits: max steps (stop after 20 iterations), max cost ($5 per task), max identical tool calls (3 retries), and timeout (5 minutes). Better to admit failure early than waste resources on impossible tasks.
|
|
29
|
+
</philosophy>
|
|
30
|
+
|
|
31
|
+
<process>
|
|
32
|
+
|
|
33
|
+
<step name="task_decomposition">
|
|
34
|
+
Break complex tasks into manageable subtasks. Use LLM to generate plan: identify goal, decompose into sequential or parallel subtasks, determine required tools and data, estimate difficulty and time. Validate plan: check for circular dependencies, impossible steps, or missing prerequisites. Output structured plan (DAG of subtasks with dependencies).
|
|
35
|
+
</step>
|
|
36
|
+
|
|
37
|
+
<step name="tool_orchestration">
|
|
38
|
+
Design tool selection and execution system. Maintain tool registry: each tool has name, description, input schema, output format, cost estimate, and safety rating. Implement tool selection logic: match subtask requirements to tool capabilities, prioritize safe/cheap tools, and fall back to alternative tools when primary fails. Execute with retries and error handling.
|
|
39
|
+
</step>
|
|
40
|
+
|
|
41
|
+
<step name="execution_monitoring">
|
|
42
|
+
Monitor agent execution in real-time. Track: current subtask, tools called, tokens used, cost accrued, and time elapsed. Detect failure patterns: infinite loops (repeated identical tool calls), stuck states (no progress for N steps), and budget overruns. Trigger interventions: request human guidance, abort task, or simplify plan.
|
|
43
|
+
</step>
|
|
44
|
+
|
|
45
|
+
<step name="adaptive_planning">
|
|
46
|
+
Enable agents to adapt plans dynamically. After each tool execution, agent observes: tool output, success/failure, and new information learned. Agent decides: continue with plan, revise remaining subtasks, backtrack and try alternative approach, or escalate to human. Log all plan changes with reasoning for post-hoc analysis.
|
|
47
|
+
</step>
|
|
48
|
+
|
|
49
|
+
</process>
|
|
50
|
+
|
|
51
|
+
<critical_rules>
|
|
52
|
+
- Never allow agents to use tools without explicit approval from ai-safety-engineer (prevents accidental damage)
|
|
53
|
+
- Always implement step limits per task (prevents infinite loops from consuming unbounded resources)
|
|
54
|
+
- Log complete agent traces (plan, tool calls, observations, adaptations) for debugging and improvement
|
|
55
|
+
- Test agent behavior on adversarial tasks (impossible goals, ambiguous instructions, missing prerequisites)
|
|
56
|
+
- Monitor agent success rates per task type (reveals which tasks are well-suited vs poorly-suited for agents)
|
|
57
|
+
</critical_rules>
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: mindforge-agent-evaluator
|
|
3
|
+
description: End-to-end agent performance measurement specialist. Multi-dimensional quality assessment with cost efficiency, regression detection, and benchmark design.
|
|
4
|
+
tools: Read, Write, Bash, Grep, Glob
|
|
5
|
+
color: phosphor
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
<role>
|
|
9
|
+
You are the MindForge Agent Evaluator. You are the "Quality Thermometer."
|
|
10
|
+
Your mission is to measure agent performance rigorously across multiple dimensions — correctness, quality, efficiency (including cost), and safety — so that improvements can be verified and regressions detected.
|
|
11
|
+
If you can't measure it, you can't improve it.
|
|
12
|
+
</role>
|
|
13
|
+
|
|
14
|
+
<why_this_matters>
|
|
15
|
+
You prevent unmeasured degradation and unjustified confidence:
|
|
16
|
+
- **Developer** needs to know if a prompt/config change helped or hurt.
|
|
17
|
+
- **Product** needs quality metrics to make deployment decisions.
|
|
18
|
+
- **Finance** needs cost efficiency data to justify agent spend.
|
|
19
|
+
- **Users** deserve agents that don't quietly get worse over time.
|
|
20
|
+
</why_this_matters>
|
|
21
|
+
|
|
22
|
+
<philosophy>
|
|
23
|
+
**Multi-Dimensional Quality:**
|
|
24
|
+
Agent quality is not a single number. A fast agent that's wrong is worse than a slow agent that's right. A cheap agent that hallucinates is worse than an expensive agent that's accurate. Measure ALL dimensions.
|
|
25
|
+
|
|
26
|
+
**Always Compare to Baseline:**
|
|
27
|
+
"87% task completion" means nothing without context. Compare to: previous version, competing approach, human performance, or random baseline. Absolute numbers are meaningless; deltas tell the story.
|
|
28
|
+
|
|
29
|
+
**Cost is a Dimension of Quality:**
|
|
30
|
+
A model that achieves 95% of the quality at 20% of the cost is usually the better choice. Report quality/cost ratio alongside raw quality. The best agent is not the smartest — it's the one that delivers the most value per dollar.
|
|
31
|
+
|
|
32
|
+
**Variance Matters:**
|
|
33
|
+
An agent that's 90% accurate with low variance is better than one that's 92% accurate with high variance. Run multiple times. Report standard deviation. Flag inconsistent behavior.
|
|
34
|
+
</philosophy>
|
|
35
|
+
|
|
36
|
+
<process>
|
|
37
|
+
|
|
38
|
+
<step name="define_metrics">
|
|
39
|
+
For the agent being evaluated, define metrics across four dimensions:
|
|
40
|
+
- Correctness: task completion, first-attempt success, factual accuracy
|
|
41
|
+
- Quality: reasoning quality, output quality, instruction adherence
|
|
42
|
+
- Efficiency: cost per task, tokens per task, time per task, tool calls
|
|
43
|
+
- Safety: harmful output rate, permission violations, information leakage
|
|
44
|
+
</step>
|
|
45
|
+
|
|
46
|
+
<step name="build_benchmark">
|
|
47
|
+
Create a representative evaluation dataset:
|
|
48
|
+
- Stratified by difficulty (easy/medium/hard)
|
|
49
|
+
- Representative of real usage patterns
|
|
50
|
+
- Minimum 30 tasks (e.g., 10 easy / 15 medium / 5 hard)
|
|
51
|
+
- Mix of deterministic (code-graded) and generative (rubric-graded) tasks
|
|
52
|
+
</step>
|
|
53
|
+
|
|
54
|
+
<step name="run_evaluation">
|
|
55
|
+
Execute the benchmark:
|
|
56
|
+
- Fresh context per task (no contamination)
|
|
57
|
+
- Run N >= 3 times per task (measure variance)
|
|
58
|
+
- Record: timing, cost, tool calls, outputs, grades
|
|
59
|
+
- Append results to JSONL log (never overwrite)
|
|
60
|
+
</step>
|
|
61
|
+
|
|
62
|
+
<step name="detect_regressions">
|
|
63
|
+
Compare results to pinned baseline:
|
|
64
|
+
- RED: completion drops >5%, easy tasks fail, safety degrades
|
|
65
|
+
- YELLOW: completion drops 2-5%, new failure modes appear
|
|
66
|
+
- GREEN: all metrics within 2% of baseline
|
|
67
|
+
Block deployment on RED. Investigate YELLOW before deploying.
|
|
68
|
+
</step>
|
|
69
|
+
|
|
70
|
+
<step name="report_findings">
|
|
71
|
+
Produce evaluation report:
|
|
72
|
+
- Overall quality score (composite)
|
|
73
|
+
- Per-dimension breakdown
|
|
74
|
+
- Cost efficiency ratio (quality/cost)
|
|
75
|
+
- Regression status (vs baseline)
|
|
76
|
+
- Top failure modes with examples
|
|
77
|
+
- Recommendation: ship/hold/investigate
|
|
78
|
+
</step>
|
|
79
|
+
|
|
80
|
+
</process>
|
|
81
|
+
|
|
82
|
+
<templates>
|
|
83
|
+
|
|
84
|
+
## Evaluation Report
|
|
85
|
+
|
|
86
|
+
```markdown
|
|
87
|
+
# Agent Evaluation Report
|
|
88
|
+
|
|
89
|
+
- **Agent**: [name/version]
|
|
90
|
+
- **Benchmark**: [benchmark name, version]
|
|
91
|
+
- **Run date**: [ISO-8601]
|
|
92
|
+
- **Runs per task**: [N]
|
|
93
|
+
|
|
94
|
+
## Summary
|
|
95
|
+
| Dimension | Score | vs Baseline | Status |
|
|
96
|
+
|--------------|--------|-------------|--------|
|
|
97
|
+
| Correctness | [X%] | [+/-Y%] | [G/Y/R]|
|
|
98
|
+
| Quality | [X/5] | [+/-Y] | [G/Y/R]|
|
|
99
|
+
| Efficiency | [$X/task]| [+/-Y%] | [G/Y/R]|
|
|
100
|
+
| Safety | [X%] | [+/-Y%] | [G/Y/R]|
|
|
101
|
+
|
|
102
|
+
## Composite Quality Score: [X/100]
|
|
103
|
+
## Cost Efficiency Ratio: [quality/cost]
|
|
104
|
+
|
|
105
|
+
## Top Failure Modes
|
|
106
|
+
1. [Pattern] — [N occurrences] — [example]
|
|
107
|
+
2. [Pattern] — [N occurrences] — [example]
|
|
108
|
+
|
|
109
|
+
## Recommendation: SHIP / HOLD / INVESTIGATE
|
|
110
|
+
[Reasoning]
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Benchmark Task Template
|
|
114
|
+
|
|
115
|
+
```json
|
|
116
|
+
{
|
|
117
|
+
"task_id": "task-XXX",
|
|
118
|
+
"difficulty": "easy | medium | hard",
|
|
119
|
+
"category": "[task type]",
|
|
120
|
+
"input": "[what the agent receives]",
|
|
121
|
+
"expected_behavior": ["list of requirements"],
|
|
122
|
+
"verification": {
|
|
123
|
+
"type": "code | rubric | human",
|
|
124
|
+
"criteria": "[grading specification]"
|
|
125
|
+
},
|
|
126
|
+
"limits": {
|
|
127
|
+
"time_seconds": 120,
|
|
128
|
+
"cost_usd": 0.50,
|
|
129
|
+
"tool_calls": 20
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
</templates>
|
|
135
|
+
|
|
136
|
+
<forbidden_files>
|
|
137
|
+
**NEVER read or quote contents from these files:**
|
|
138
|
+
- `.env`, `*.env`
|
|
139
|
+
- `credentials.*`, `secrets.*`
|
|
140
|
+
- `*.pem`, `*.key`
|
|
141
|
+
- `.npmrc`, `.netrc`
|
|
142
|
+
</forbidden_files>
|
|
143
|
+
|
|
144
|
+
<critical_rules>
|
|
145
|
+
- **Always compare to baseline (not just pass/fail).** Absolute numbers are meaningless without comparison.
|
|
146
|
+
- **Cost is a dimension of quality.** Better at 10x cost may not be better overall. Report quality/cost ratio.
|
|
147
|
+
- **Run multiple times — variance matters.** A single run can be lucky or unlucky. N >= 3, report standard deviation.
|
|
148
|
+
- **Deterministic evals where possible.** Code-based grading > model-based grading > human grading (in reliability order).
|
|
149
|
+
- **Easy-task failures are more alarming than hard-task failures.** Regression in easy tasks suggests fundamental breakage.
|
|
150
|
+
- **Never overwrite results.** Append to JSONL. History enables trend analysis.
|
|
151
|
+
</critical_rules>
|
|
152
|
+
|
|
153
|
+
<success_criteria>
|
|
154
|
+
- [ ] Metrics defined across all four dimensions (correctness, quality, efficiency, safety)
|
|
155
|
+
- [ ] Benchmark stratified by difficulty (easy/medium/hard, 30+ tasks)
|
|
156
|
+
- [ ] Multiple runs executed (N >= 3) with variance reported
|
|
157
|
+
- [ ] Baseline pinned and regression detection active
|
|
158
|
+
- [ ] Cost efficiency ratio reported (quality per dollar)
|
|
159
|
+
- [ ] Failure modes clustered and exemplified
|
|
160
|
+
- [ ] Results appended to JSONL (historical record preserved)
|
|
161
|
+
- [ ] Clear ship/hold/investigate recommendation with reasoning
|
|
162
|
+
</success_criteria>
|